[cgi-wiki-dev] [PATCH] Abstract searching to base class
Simon Cozens
cgi-wiki-dev@earth.li
Mon, 23 Feb 2004 11:50:38 +0000
--LQksG6bCIzRHxTLp
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
This is a pretty hefty patch which abstracts out the two searching classes
into a Search::Base class, in preparation for adding a third search engine
backend. (Plucene)
While constructing this I found a particularly confusing thing about the
full text search backend. index_node is called as follows:
$search->index_node($node, $content);
However, DBIxFTS says this:
my ($self, $node) = @_;
(No content?)
my $dbh = $self->{_dbh};
my $fts_all = DBIx::FullTextSearch->open($dbh, "_content_and_title_fts");
my $fts_titles = DBIx::FullTextSearch->open($dbh, "_title_fts");
$fts_all->index_document($node);
$fts_titles->index_document($node);
(No difference between purely-title and content-and-title indexing?)
Where does it get the content from?
--
"There is no statute of limitations on stupidity."
-- Randomly produced by a computer program called Markov3.
--LQksG6bCIzRHxTLp
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="base.patch"
diff -ruN CGI-Wiki-0.50/MANIFEST CGI-Wiki-0.50-search/MANIFEST
--- CGI-Wiki-0.50/MANIFEST Thu Nov 20 11:12:58 2003
+++ CGI-Wiki-0.50-search/MANIFEST Mon Feb 23 11:25:10 2004
@@ -9,6 +9,7 @@
lib/CGI/Wiki/Extending.pod
lib/CGI/Wiki/Formatter/Default.pm
lib/CGI/Wiki/Plugin.pm
+lib/CGI/Wiki/Search/Base.pm
lib/CGI/Wiki/Search/DBIxFTS.pm
lib/CGI/Wiki/Search/SII.pm
lib/CGI/Wiki/Setup/DBIxFTSMySQL.pm
diff -ruN CGI-Wiki-0.50/lib/CGI/Wiki/Search/Base.pm CGI-Wiki-0.50-search/lib/CGI/Wiki/Search/Base.pm
--- CGI-Wiki-0.50/lib/CGI/Wiki/Search/Base.pm Thu Jan 1 01:00:00 1970
+++ CGI-Wiki-0.50-search/lib/CGI/Wiki/Search/Base.pm Mon Feb 23 11:43:40 2004
@@ -0,0 +1,229 @@
+package CGI::Wiki::Search::Base;
+
+use strict;
+use Search::InvertedIndex;
+use Carp "croak";
+
+use vars qw( @ISA $VERSION );
+
+sub _abstract {
+ my $who = (caller(1))[3];
+ croak "$who is an abstract method which the ".(ref shift).
+ " class has not provided";
+}
+
+$VERSION = 0.01;
+
+=head1 NAME
+
+CGI::Wiki::Search::Base - Base class for CGI::Wiki search plugins
+
+=head1 SYNOPSIS
+
+ my $search = CGI::Wiki::Search::XXX->new( @args );
+ my %wombat_nodes = $search->search_nodes("wombat");
+
+This class details the methods that need to be overridden by search plugins.
+
+=cut
+
+=head1 METHODS
+
+=over 4
+
+=item B<new>
+
+ my $search = CGI::Wiki::Search::XXX->new( @args );
+
+Creates a new searcher. By default the arguments are just passed to
+C<_init>, so you may wish to override that instead.
+
+=cut
+
+sub new {
+ my ($class, @args) = @_;
+ my $self = {};
+ bless $self, $class;
+ return $self->_init(@args);
+}
+
+sub _init {
+ my ($self, %args) = @_;
+ @{$self}{keys %args} = values %args;
+ return $self;
+}
+
+=item B<search_nodes>
+
+ # Find all the nodes which contain the word 'expert'.
+ my %results = $search->search_nodes('expert');
+
+Returns a (possibly empty) hash whose keys are the node names and
+whose values are the scores in some kind of relevance-scoring system I
+haven't entirely come up with yet. For OR searches, this could
+initially be the number of terms that appear in the node, perhaps.
+
+Defaults to AND searches (if $and_or is not supplied, or is anything
+other than C<OR> or C<or>).
+
+Searches are case-insensitive.
+
+=cut
+
+sub search_nodes {
+ my ($self, $termstr, $and_or) = @_;
+
+ $and_or = lc($and_or);
+ unless ( defined $and_or and $and_or eq "or" ) {
+ $and_or = "and";
+ }
+
+ # Extract individual search terms.
+ my @terms = $self->analyze($termstr);
+
+ return $self->_do_search($and_or, \@terms);
+}
+
+sub _do_search { shift->_abstract };
+
+=item B<analyze>
+
+ @terms = $self->analyze($string)
+
+Splits a string into a set of terms for indexing and searching. Typically
+this is done case-insensitively, splitting at word boundaries, and extracting
+words of more than one character that contain at least one word character.
+
+=cut
+
+sub analyze {
+ my ($self, $string) = @_;
+ return grep { length > 1 # ignore single characters
+ and ! /^\W*$/ } # and things composed entirely
+ # of non-word characters
+ split( /\b/, # split at word boundaries
+ lc($string) # be case-insensitive
+ );
+}
+
+=item B<fuzzy_title_match>
+
+ $wiki->write_node( "King's Cross St Pancras", "A station." );
+ my %matches = $search->fuzzy_title_match( "Kings Cross St. Pancras" );
+
+Returns a (possibly empty) hash whose keys are the node names and
+whose values are the scores in some kind of relevance-scoring system I
+haven't entirely come up with yet.
+
+Note that even if an exact match is found, any other similar enough
+matches will also be returned. However, any exact match is guaranteed
+to have the highest relevance score.
+
+The matching is done against "canonicalised" forms of the search
+string and the node titles in the database: stripping vowels, repeated
+letters and non-word characters, and lowercasing.
+
+=cut
+
+sub fuzzy_title_match {
+ my ($self, $string) = @_;
+ my $canonical = $self->canonicalise_title( $string );
+ $self->_fuzzy_match($string, $canonical);
+}
+
+sub _fuzzy_match { shift->_abstract };
+
+=item B<index_node>
+
+ $search->index_node($node, $content);
+
+Indexes or reindexes the given node in the search engine indexes.
+You must supply both the node name and its content.
+
+=cut
+
+sub index_node {
+ my ($self, $node, $content) = @_;
+ croak "Must supply a node name" unless $node;
+ croak "Must supply node content" unless defined $content;
+
+ # Index the individual words in the node content and title.
+ my @keys = $self->analyze("$content $node");
+ $self->_index_node($node, $content, \@keys);
+ $self->_index_fuzzy($node, $self->canonicalise_title( $node ));
+}
+
+sub _index_node { shift->_abstract };
+sub _index_fuzzy { shift->_abstract };
+
+=item B<canonicalise_title>
+
+ $fuzzy = $self->canonicalise_title( $node );
+
+Returns the node title as suitable for fuzzy searching: with punctuation
+and spaces removed, vowels removed, and double letters squashed.
+
+=cut
+
+sub canonicalise_title {
+ my ($self, $title) = @_;
+ return "" unless $title;
+ my $canonical = lc($title);
+ $canonical =~ s/\W//g; # remove non-word characters
+ $canonical =~ s/[aeiouy]//g; # remove vowels and 'y'
+ $canonical =~ tr/a-z//s; # collapse doubled (or tripled, etc) letters
+ return $canonical;
+}
+
+=item B<delete_node>
+
+ $search->delete_node($node);
+
+Removes the given node from the search indexes. NOTE: It's up to you to
+make sure the node is removed from the backend store. Croaks on error.
+
+=cut
+
+sub delete_node {
+ my ($self, $node) = @_;
+ croak "Must supply a node name" unless $node;
+ $self->_delete_node($node);
+}
+
+sub _delete_node { shift->_abstract };
+
+=item B<supports_phrase_searches>
+
+ if ( $search->supports_phrase_searches ) {
+ return $search->search_nodes( '"fox in socks"' );
+ }
+
+Returns true if this search backend supports phrase searching, and
+false otherwise.
+
+=cut
+
+sub supports_phrase_searches { shift->_abstract };
+
+=item B<supports_fuzzy_searches>
+
+ if ( $search->supports_fuzzy_searches ) {
+ return $search->fuzzy_title_match("Kings Cross St Pancreas");
+ }
+
+Returns true if this search backend supports fuzzy title matching, and
+false otherwise.
+
+=cut
+
+sub supports_fuzzy_searches { shift->_abstract };
+
+=back
+
+=head1 SEE ALSO
+
+L<CGI::Wiki>
+
+=cut
+
+1;
diff -ruN CGI-Wiki-0.50/lib/CGI/Wiki/Search/DBIxFTS.pm CGI-Wiki-0.50-search/lib/CGI/Wiki/Search/DBIxFTS.pm
--- CGI-Wiki-0.50/lib/CGI/Wiki/Search/DBIxFTS.pm Thu Nov 20 11:13:01 2003
+++ CGI-Wiki-0.50-search/lib/CGI/Wiki/Search/DBIxFTS.pm Mon Feb 23 11:24:42 2004
@@ -3,6 +3,7 @@
use strict;
use DBIx::FullTextSearch;
use Carp "croak";
+use base 'CGI::Wiki::Search::Base';
use vars qw( @ISA $VERSION );
@@ -42,13 +43,6 @@
=cut
-sub new {
- my ($class, @args) = @_;
- my $self = {};
- bless $self, $class;
- return $self->_init(@args);
-}
-
sub _init {
my ($self, %args) = @_;
croak "Must supply a database handle" unless $args{dbh};
@@ -56,28 +50,8 @@
return $self;
}
-=item B<search_nodes>
-
- # Find all the nodes which contain both the word 'expert' and the
- # phrase 'wombat defenestration'.
- %results = $search->search_nodes('expert "wombat defenestration"');
-
- # Find all the nodes which contain at least one of the words
- # 'buffy', 'pony', and 'pie'.
- %results = $search->search_nodes('buffy pony pie', 'OR');
-
-Returns a (possibly empty) hash whose keys are the node names and
-whose values are the scores in some kind of relevance-scoring system I
-haven't entirely come up with yet. For OR searches, this could
-initially be the number of terms that appear in the node, perhaps.
-
-Defaults to AND searches (if $and_or is not supplied, or is anything
-other than C<OR> or C<or>).
-
-Searches are case-insensitive.
-
-=cut
-
+# We can't use the base version, since we're doing the analysis
+# differently between searching and indexing
sub search_nodes {
my ($self, $termstr, $and_or) = @_;
@@ -86,10 +60,6 @@
$and_or = "AND";
}
- # Note: Not sure yet whether the term extraction below is going to be
- # common between backends. Move it back into CGI::Wiki if it turns
- # out to be.
-
# Extract individual search terms - first phrases (between double quotes).
my @terms = ($termstr =~ m/"([^"]+)"/g);
$termstr =~ s/"[^"]*"//g;
@@ -108,17 +78,8 @@
return map { $_ => 1 } @finds;
}
-=item B<index_node>
-
- $search->index_node($node);
-
-Indexes or reindexes the given node in the FTS indexes in the backend
-storage.
-
-=cut
-
sub index_node {
- my ($self, $node) = @_;
+ my ($self, $node, $content) = @_;
my $dbh = $self->{_dbh};
my $fts_all = DBIx::FullTextSearch->open($dbh, "_content_and_title_fts");
@@ -131,16 +92,6 @@
delete $fts_titles->{db_backend}; # ditto
}
-=item B<delete_node>
-
- $search->delete_node($node);
-
-Removes the given node from the search indexes. NOTE: It's up to you to
-make sure the node is removed from the backend store. Croaks on error,
-returns true on success.
-
-=cut
-
sub delete_node {
my ($self, $node) = @_;
my $dbh = $self->{_dbh};
@@ -155,20 +106,8 @@
return 1;
}
-=item B<supports_phrase_searches>
-
- if ( $search->supports_phrase_searches ) {
- return $search->search_nodes( '"fox in socks"' );
- }
-
-Returns true if this search backend supports phrase searching, and
-false otherwise.
-
-=cut
-
-sub supports_phrase_searches {
- return 1;
-}
+sub supports_phrase_searches { return 1; }
+sub supports_fuzzy_searches { return 0; }
=back
diff -ruN CGI-Wiki-0.50/lib/CGI/Wiki/Search/SII.pm CGI-Wiki-0.50-search/lib/CGI/Wiki/Search/SII.pm
--- CGI-Wiki-0.50/lib/CGI/Wiki/Search/SII.pm Sun Dec 21 11:31:16 2003
+++ CGI-Wiki-0.50-search/lib/CGI/Wiki/Search/SII.pm Mon Feb 23 11:34:58 2004
@@ -7,6 +7,7 @@
use vars qw( @ISA $VERSION );
$VERSION = 0.08;
+use base 'CGI::Wiki::Search::Base';
=head1 NAME
@@ -20,6 +21,8 @@
Provides search-related methods for CGI::Wiki
+See also L<CGI::Wiki::Search::Base>
+
=cut
=head1 METHODS
@@ -53,13 +56,6 @@
=cut
-sub new {
- my ($class, @args) = @_;
- my $self = {};
- bless $self, $class;
- return $self->_init(@args);
-}
-
sub _init {
my ($self, %args) = @_;
my $indexdb = $args{indexdb};
@@ -74,42 +70,10 @@
return $self;
}
-=item B<search_nodes>
-
- # Find all the nodes which contain the word 'expert'.
- my %results = $search->search_nodes('expert');
-
-Returns a (possibly empty) hash whose keys are the node names and
-whose values are the scores in some kind of relevance-scoring system I
-haven't entirely come up with yet. For OR searches, this could
-initially be the number of terms that appear in the node, perhaps.
-
-Defaults to AND searches (if $and_or is not supplied, or is anything
-other than C<OR> or C<or>).
-
-Searches are case-insensitive.
-
-=cut
-
-sub search_nodes {
- my ($self, $termstr, $and_or) = @_;
-
- $and_or = lc($and_or);
- unless ( defined $and_or and $and_or eq "or" ) {
- $and_or = "and";
- }
-
- # Extract individual search terms.
- my @terms = grep { length > 1 # ignore single characters
- and ! /^\W*$/ } # and things composed entirely
- # of non-word characters
- split( /\b/, # split at word boundaries
- lc($termstr) # be case-insensitive
- );
-
- # Create a leaf for each search term.
+sub _do_search {
+ my ($self, $and_or, $terms) = @_;
my @leaves;
- foreach my $term ( @terms ) {
+ foreach my $term ( @$terms ) {
my $leaf = Search::InvertedIndex::Query::Leaf->new(-key => $term,
-group => "nodes" );
push @leaves, $leaf;
@@ -131,29 +95,8 @@
return %results;
}
-=item B<fuzzy_title_match>
-
- $wiki->write_node( "King's Cross St Pancras", "A station." );
- my %matches = $search->fuzzy_title_match( "Kings Cross St. Pancras" );
-
-Returns a (possibly empty) hash whose keys are the node names and
-whose values are the scores in some kind of relevance-scoring system I
-haven't entirely come up with yet.
-
-Note that even if an exact match is found, any other similar enough
-matches will also be returned. However, any exact match is guaranteed
-to have the highest relevance score.
-
-The matching is done against "canonicalised" forms of the search
-string and the node titles in the database: stripping vowels, repeated
-letters and non-word characters, and lowercasing.
-
-=cut
-
-sub fuzzy_title_match {
- my ($self, $string) = @_;
- my $canonical = $self->_canonicalise_title( $string );
-
+sub _fuzzy_match {
+ my ($self, $string, $canonical) = @_;
my $leaf = Search::InvertedIndex::Query::Leaf->new(
-key => $canonical,
-group => "fuzzy_titles" );
@@ -171,42 +114,21 @@
return %results;
}
-=item B<index_node>
-
- $search->index_node($node, $content);
-
-Indexes or reindexes the given node in the L<Search::InvertedIndex>
-indexes. You must supply both the node name and its content.
-
-=cut
-
-sub index_node {
- my ($self, $node, $content) = @_;
- croak "Must supply a node name" unless $node;
- croak "Must supply node content" unless defined $content;
-
- # Index the individual words in the node content and title.
- my @keys = grep { length > 1 # ignore single characters
- and ! /^\W*$/ } # and things composed entirely
- # of non-word characters
- split( /\b/, # split at word boundaries
- lc( # be case-insensitive
- "$content $node" # index content and title
- )
- );
+sub _index_node {
+ my ($self, $node, $content, $keys) = @_;
my $update = Search::InvertedIndex::Update->new(
-group => "nodes",
-index => $node,
-data => $content,
- -keys => { map { $_ => 1 } @keys }
+ -keys => { map { $_ => 1 } @$keys }
);
$self->{_map}->update( -update => $update );
+}
- # Index a canonicalised form of the title for fuzzy searches.
- my $canonical = $self->_canonicalise_title( $node );
-
- $update = Search::InvertedIndex::Update->new(
+sub _index_fuzzy {
+ my ($self, $node, $canonical) = @_;
+ my $update = Search::InvertedIndex::Update->new(
-group => "fuzzy_titles",
-index => $node . "_fuzzy_title",
-data => $node,
@@ -215,51 +137,19 @@
$self->{_map}->update( -update => $update );
}
-sub _canonicalise_title {
- my ($self, $title) = @_;
- return "" unless $title;
- my $canonical = lc($title);
- $canonical =~ s/\W//g; # remove non-word characters
- $canonical =~ s/[aeiouy]//g; # remove vowels and 'y'
- $canonical =~ s/(\w)\1+/$1/eg; # collapse doubled (or tripled, etc) letters
- return $canonical;
-}
-
-=item B<delete_node>
-
- $search->delete_node($node);
-
-Removes the given node from the search indexes. NOTE: It's up to you to
-make sure the node is removed from the backend store. Croaks on error.
-
-=cut
-
-sub delete_node {
+sub _delete_node {
my ($self, $node) = @_;
- croak "Must supply a node name" unless $node;
$self->{_map}->remove_index_from_all({ -index => $node });
}
-=item B<supports_phrase_searches>
-
- if ( $search->supports_phrase_searches ) {
- return $search->search_nodes( '"fox in socks"' );
- }
-
-Returns true if this search backend supports phrase searching, and
-false otherwise.
-
-=cut
-
-sub supports_phrase_searches {
- return 0;
-}
+sub supports_phrase_searches { return 0; }
+sub supports_fuzzy_searches { return 1; }
=back
=head1 SEE ALSO
-L<CGI::Wiki>
+L<CGI::Wiki>, L<CGI::Wiki::Search::Base>
=cut
diff -ruN CGI-Wiki-0.50/lib/CGI/Wiki.pm CGI-Wiki-0.50-search/lib/CGI/Wiki.pm
--- CGI-Wiki-0.50/lib/CGI/Wiki.pm Sun Dec 21 11:57:04 2003
+++ CGI-Wiki-0.50-search/lib/CGI/Wiki.pm Mon Feb 23 10:53:14 2004
@@ -468,11 +468,28 @@
$self->search_obj->supports_phrase_searches( @args ) if $self->search_obj;
}
+=item B<supports_fuzzy_searches>
+
+ if ( $wiki->supports_fuzzy_searches ) {
+ return $wiki->fuzzy_title_match( 'Kings Cross, St Pancreas' );
+ }
+
+Returns true if your chosen search backend supports fuzzy title searching,
+and false otherwise.
+
+=cut
+
+sub supports_fuzzy_searches {
+ my ($self, @args) = @_;
+ $self->search_obj->supports_fuzzy_searches( @args ) if $self->search_obj;
+}
+
+
=item B<fuzzy_title_match>
-B<NOTE:> This section of the documentation assumes you are using the
-L<CGI::Wiki::Search::SII> backend; this feature has not yet been
-implemented for the L<CGI::Wiki::Search::DBIxFTS> backend.
+B<NOTE:> This section of the documentation assumes you are using a
+search engine which supports fuzzy matching. (See above.) The
+L<CGI::Wiki::Search::DBIxFTS> backend in particular does not.
$wiki->write_node( "King's Cross St Pancras", "A station." );
my %matches = $wiki->fuzzy_title_match( "Kings Cross St. Pancras" );
@@ -496,7 +513,11 @@
sub fuzzy_title_match {
my ($self, @args) = @_;
if ( $self->search_obj ) {
- $self->search_obj->fuzzy_title_match( @args );
+ if ($self->search_obj->supports_fuzzy_searches) {
+ $self->search_obj->fuzzy_title_match( @args );
+ } else {
+ croak "Search backend doesn't support fuzzy searches";
+ }
} else {
croak "No search backend defined.";
}
diff -ruN CGI-Wiki-0.50/t/013_fuzzy_title_match.t CGI-Wiki-0.50-search/t/013_fuzzy_title_match.t
--- CGI-Wiki-0.50/t/013_fuzzy_title_match.t Fri Nov 21 22:40:22 2003
+++ CGI-Wiki-0.50-search/t/013_fuzzy_title_match.t Mon Feb 23 11:44:21 2004
@@ -15,7 +15,7 @@
my $search = $wiki->search_obj;
skip "No search backend in this combination", 5 unless $search;
skip "Search backend doesn't support fuzzy searching", 5
- unless $search->can("fuzzy_title_match");
+ unless $search->supports_fuzzy_searches;
# Fuzzy match with differing punctuation.
$wiki->write_node( "King's Cross St Pancras", "station" )
--LQksG6bCIzRHxTLp--