diff -ruN CGI-Wiki-0.50/MANIFEST CGI-Wiki-0.50-search/MANIFEST --- CGI-Wiki-0.50/MANIFEST Thu Nov 20 11:12:58 2003 +++ CGI-Wiki-0.50-search/MANIFEST Mon Feb 23 11:25:10 2004 @@ -9,6 +9,7 @@ lib/CGI/Wiki/Extending.pod lib/CGI/Wiki/Formatter/Default.pm lib/CGI/Wiki/Plugin.pm +lib/CGI/Wiki/Search/Base.pm lib/CGI/Wiki/Search/DBIxFTS.pm lib/CGI/Wiki/Search/SII.pm lib/CGI/Wiki/Setup/DBIxFTSMySQL.pm diff -ruN CGI-Wiki-0.50/lib/CGI/Wiki/Search/Base.pm CGI-Wiki-0.50-search/lib/CGI/Wiki/Search/Base.pm --- CGI-Wiki-0.50/lib/CGI/Wiki/Search/Base.pm Thu Jan 1 01:00:00 1970 +++ CGI-Wiki-0.50-search/lib/CGI/Wiki/Search/Base.pm Mon Feb 23 11:43:40 2004 @@ -0,0 +1,229 @@ +package CGI::Wiki::Search::Base; + +use strict; +use Search::InvertedIndex; +use Carp "croak"; + +use vars qw( @ISA $VERSION ); + +sub _abstract { + my $who = (caller(1))[3]; + croak "$who is an abstract method which the ".(ref shift). + " class has not provided"; +} + +$VERSION = 0.01; + +=head1 NAME + +CGI::Wiki::Search::Base - Base class for CGI::Wiki search plugins + +=head1 SYNOPSIS + + my $search = CGI::Wiki::Search::XXX->new( @args ); + my %wombat_nodes = $search->search_nodes("wombat"); + +This class details the methods that need to be overridden by search plugins. + +=cut + +=head1 METHODS + +=over 4 + +=item B + + my $search = CGI::Wiki::Search::XXX->new( @args ); + +Creates a new searcher. By default the arguments are just passed to +C<_init>, so you may wish to override that instead. + +=cut + +sub new { + my ($class, @args) = @_; + my $self = {}; + bless $self, $class; + return $self->_init(@args); +} + +sub _init { + my ($self, %args) = @_; + @{$self}{keys %args} = values %args; + return $self; +} + +=item B + + # Find all the nodes which contain the word 'expert'. + my %results = $search->search_nodes('expert'); + +Returns a (possibly empty) hash whose keys are the node names and +whose values are the scores in some kind of relevance-scoring system I +haven't entirely come up with yet.
For OR searches, this could +initially be the number of terms that appear in the node, perhaps. + +Defaults to AND searches (if $and_or is not supplied, or is anything +other than C or C). + +Searches are case-insensitive. + +=cut + +sub search_nodes { + my ($self, $termstr, $and_or) = @_; + + $and_or = lc($and_or); + unless ( defined $and_or and $and_or eq "or" ) { + $and_or = "and"; + } + + # Extract individual search terms. + my @terms = $self->analyze($termstr); + + return $self->_do_search($and_or, \@terms); +} + +sub _do_search { shift->_abstract }; + +=item B + + @terms = $self->analyze($string) + +Splits a string into a set of terms for indexing and searching. Typically +this is done case-insensitively, splitting at word boundaries, and extracting +words that contain at least 2 word characters. + +=cut + +sub analyze { + my ($self, $string) = @_; + return grep { length > 1 # ignore single characters + and ! /^\W*$/ } # and things composed entirely + # of non-word characters + split( /\b/, # split at word boundaries + lc($string) # be case-insensitive + ); +} + +=item B + + $wiki->write_node( "King's Cross St Pancras", "A station." ); + my %matches = $search->fuzzy_title_match( "Kings Cross St. Pancras" ); + +Returns a (possibly empty) hash whose keys are the node names and +whose values are the scores in some kind of relevance-scoring system I +haven't entirely come up with yet. + +Note that even if an exact match is found, any other similar enough +matches will also be returned. However, any exact match is guaranteed +to have the highest relevance score. + +The matching is done against "canonicalised" forms of the search +string and the node titles in the database: stripping vowels, repeated +letters and non-word characters, and lowercasing.
+ +=cut + +sub fuzzy_title_match { + my ($self, $string) = @_; + my $canonical = $self->canonicalise_title( $string ); + $self->_fuzzy_match($string, $canonical); +} + +sub _fuzzy_match { shift->_abstract }; + +=item B + + $search->index_node($node, $content); + +Indexes or reindexes the given node in the search engine indexes. +You must supply both the node name and its content. + +=cut + +sub index_node { + my ($self, $node, $content) = @_; + croak "Must supply a node name" unless $node; + croak "Must supply node content" unless defined $content; + + # Index the individual words in the node content and title. + my @keys = $self->analyze("$content $node"); + $self->_index_node($node, $content, \@keys); + $self->_index_fuzzy($node, $self->canonicalise_title( $node )); +} + +sub _index_node { shift->_abstract }; +sub _index_fuzzy { shift->_abstract }; + +=head2 C + + $fuzzy = $self->canonicalise_title( $node ); + +Returns the node title as suitable for fuzzy searching: with punctuation +and spaces removed, vowels removed, and double letters squashed. + +=cut + +sub canonicalise_title { + my ($self, $title) = @_; + return "" unless $title; + my $canonical = lc($title); + $canonical =~ s/\W//g; # remove non-word characters + $canonical =~ s/[aeiouy]//g; # remove vowels and 'y' + $canonical =~ tr/a-z//s; # collapse doubled (or tripled, etc) letters + return $canonical; +} + +=item B + + $search->delete_node($node); + +Removes the given node from the search indexes. NOTE: It's up to you to +make sure the node is removed from the backend store. Croaks on error. + +=cut + +sub delete_node { + my ($self, $node) = @_; + croak "Must supply a node name" unless $node; + $self->_delete_node($node); +} + +sub _delete_node { shift->_abstract }; + +=item B + + if ( $search->supports_phrase_searches ) { + return $search->search_nodes( '"fox in socks"' ); + } + +Returns true if this search backend supports phrase searching, and +false otherwise.
+ +=cut + +sub supports_phrase_searches { shift->_abstract }; + +=item B + + if ( $search->supports_fuzzy_searches ) { + return $search->fuzzy_title_match("Kings Cross St Pancreas"); + } + +Returns true if this search backend supports fuzzy title matching, and +false otherwise. + +=cut + +sub supports_fuzzy_searches { shift->_abstract }; + +=back + +=head1 SEE ALSO + +L + +=cut + +1; diff -ruN CGI-Wiki-0.50/lib/CGI/Wiki/Search/DBIxFTS.pm CGI-Wiki-0.50-search/lib/CGI/Wiki/Search/DBIxFTS.pm --- CGI-Wiki-0.50/lib/CGI/Wiki/Search/DBIxFTS.pm Thu Nov 20 11:13:01 2003 +++ CGI-Wiki-0.50-search/lib/CGI/Wiki/Search/DBIxFTS.pm Mon Feb 23 11:24:42 2004 @@ -3,6 +3,7 @@ use strict; use DBIx::FullTextSearch; use Carp "croak"; +use base 'CGI::Wiki::Search::Base'; use vars qw( @ISA $VERSION ); @@ -42,13 +43,6 @@ =cut -sub new { - my ($class, @args) = @_; - my $self = {}; - bless $self, $class; - return $self->_init(@args); -} - sub _init { my ($self, %args) = @_; croak "Must supply a database handle" unless $args{dbh}; @@ -56,28 +50,8 @@ return $self; } -=item B - - # Find all the nodes which contain both the word 'expert' and the - # phrase 'wombat defenestration'. - %results = $search->search_nodes('expert "wombat defenestration"'); - - # Find all the nodes which contain at least one of the words - # 'buffy', 'pony', and 'pie'. - %results = $search->search_nodes('buffy pony pie', 'OR'); - -Returns a (possibly empty) hash whose keys are the node names and -whose values are the scores in some kind of relevance-scoring system I -haven't entirely come up with yet. For OR searches, this could -initially be the number of terms that appear in the node, perhaps. - -Defaults to AND searches (if $and_or is not supplied, or is anything -other than C or C). - -Searches are case-insensitive. 
- -=cut - +# We can't use the base version, since we're doing the analysis +# differently between searching and indexing sub search_nodes { my ($self, $termstr, $and_or) = @_; @@ -86,10 +60,6 @@ $and_or = "AND"; } - # Note: Not sure yet whether the term extraction below is going to be - # common between backends. Move it back into CGI::Wiki if it turns - # out to be. - # Extract individual search terms - first phrases (between double quotes). my @terms = ($termstr =~ m/"([^"]+)"/g); $termstr =~ s/"[^"]*"//g; @@ -108,17 +78,8 @@ return map { $_ => 1 } @finds; } -=item B - - $search->index_node($node); - -Indexes or reindexes the given node in the FTS indexes in the backend -storage. - -=cut - sub index_node { - my ($self, $node) = @_; + my ($self, $node, $content) = @_; my $dbh = $self->{_dbh}; my $fts_all = DBIx::FullTextSearch->open($dbh, "_content_and_title_fts"); @@ -131,16 +92,6 @@ delete $fts_titles->{db_backend}; # ditto } -=item B - - $search->delete_node($node); - -Removes the given node from the search indexes. NOTE: It's up to you to -make sure the node is removed from the backend store. Croaks on error, -returns true on success. - -=cut - sub delete_node { my ($self, $node) = @_; my $dbh = $self->{_dbh}; @@ -155,20 +106,8 @@ return 1; } -=item B - - if ( $search->supports_phrase_searches ) { - return $search->search_nodes( '"fox in socks"' ); - } - -Returns true if this search backend supports phrase searching, and -false otherwise. 
- -=cut - -sub supports_phrase_searches { - return 1; -} +sub supports_phrase_searches { return 1; } +sub supports_fuzzy_searches { return 0; } =back diff -ruN CGI-Wiki-0.50/lib/CGI/Wiki/Search/SII.pm CGI-Wiki-0.50-search/lib/CGI/Wiki/Search/SII.pm --- CGI-Wiki-0.50/lib/CGI/Wiki/Search/SII.pm Sun Dec 21 11:31:16 2003 +++ CGI-Wiki-0.50-search/lib/CGI/Wiki/Search/SII.pm Mon Feb 23 11:34:58 2004 @@ -7,6 +7,7 @@ use vars qw( @ISA $VERSION ); $VERSION = 0.08; +use base 'CGI::Wiki::Search::Base'; =head1 NAME @@ -20,6 +21,8 @@ Provides search-related methods for CGI::Wiki +See also L + =cut =head1 METHODS @@ -53,13 +56,6 @@ =cut -sub new { - my ($class, @args) = @_; - my $self = {}; - bless $self, $class; - return $self->_init(@args); -} - sub _init { my ($self, %args) = @_; my $indexdb = $args{indexdb}; @@ -74,42 +70,10 @@ return $self; } -=item B - - # Find all the nodes which contain the word 'expert'. - my %results = $search->search_nodes('expert'); - -Returns a (possibly empty) hash whose keys are the node names and -whose values are the scores in some kind of relevance-scoring system I -haven't entirely come up with yet. For OR searches, this could -initially be the number of terms that appear in the node, perhaps. - -Defaults to AND searches (if $and_or is not supplied, or is anything -other than C or C). - -Searches are case-insensitive. - -=cut - -sub search_nodes { - my ($self, $termstr, $and_or) = @_; - - $and_or = lc($and_or); - unless ( defined $and_or and $and_or eq "or" ) { - $and_or = "and"; - } - - # Extract individual search terms. - my @terms = grep { length > 1 # ignore single characters - and ! /^\W*$/ } # and things composed entirely - # of non-word characters - split( /\b/, # split at word boundaries - lc($termstr) # be case-insensitive - ); - - # Create a leaf for each search term. 
+sub _do_search { + my ($self, $and_or, $terms) = @_; my @leaves; - foreach my $term ( @terms ) { + foreach my $term ( @$terms ) { my $leaf = Search::InvertedIndex::Query::Leaf->new(-key => $term, -group => "nodes" ); push @leaves, $leaf; @@ -131,29 +95,8 @@ return %results; } -=item B - - $wiki->write_node( "King's Cross St Pancras", "A station." ); - my %matches = $search->fuzzy_title_match( "Kings Cross St. Pancras" ); - -Returns a (possibly empty) hash whose keys are the node names and -whose values are the scores in some kind of relevance-scoring system I -haven't entirely come up with yet. - -Note that even if an exact match is found, any other similar enough -matches will also be returned. However, any exact match is guaranteed -to have the highest relevance score. - -The matching is done against "canonicalised" forms of the search -string and the node titles in the database: stripping vowels, repeated -letters and non-word characters, and lowercasing. - -=cut - -sub fuzzy_title_match { - my ($self, $string) = @_; - my $canonical = $self->_canonicalise_title( $string ); - +sub _fuzzy_match { + my ($self, $string, $canonical) = @_; my $leaf = Search::InvertedIndex::Query::Leaf->new( -key => $canonical, -group => "fuzzy_titles" ); @@ -171,42 +114,21 @@ return %results; } -=item B - - $search->index_node($node, $content); - -Indexes or reindexes the given node in the L -indexes. You must supply both the node name and its content. - -=cut - -sub index_node { - my ($self, $node, $content) = @_; - croak "Must supply a node name" unless $node; - croak "Must supply node content" unless defined $content; - - # Index the individual words in the node content and title. - my @keys = grep { length > 1 # ignore single characters - and ! 
/^\W*$/ } # and things composed entirely - # of non-word characters - split( /\b/, # split at word boundaries - lc( # be case-insensitive - "$content $node" # index content and title - ) - ); +sub _index_node { + my ($self, $node, $content, $keys) = @_; my $update = Search::InvertedIndex::Update->new( -group => "nodes", -index => $node, -data => $content, - -keys => { map { $_ => 1 } @keys } + -keys => { map { $_ => 1 } @$keys } ); $self->{_map}->update( -update => $update ); +} - # Index a canonicalised form of the title for fuzzy searches. - my $canonical = $self->_canonicalise_title( $node ); - - $update = Search::InvertedIndex::Update->new( +sub _index_fuzzy { + my ($self, $node, $canonical) = @_; + my $update = Search::InvertedIndex::Update->new( -group => "fuzzy_titles", -index => $node . "_fuzzy_title", -data => $node, @@ -215,51 +137,19 @@ $self->{_map}->update( -update => $update ); } -sub _canonicalise_title { - my ($self, $title) = @_; - return "" unless $title; - my $canonical = lc($title); - $canonical =~ s/\W//g; # remove non-word characters - $canonical =~ s/[aeiouy]//g; # remove vowels and 'y' - $canonical =~ s/(\w)\1+/$1/eg; # collapse doubled (or tripled, etc) letters - return $canonical; -} - -=item B - - $search->delete_node($node); - -Removes the given node from the search indexes. NOTE: It's up to you to -make sure the node is removed from the backend store. Croaks on error. - -=cut - -sub delete_node { +sub _delete_node { my ($self, $node) = @_; - croak "Must supply a node name" unless $node; $self->{_map}->remove_index_from_all({ -index => $node }); } -=item B - - if ( $search->supports_phrase_searches ) { - return $search->search_nodes( '"fox in socks"' ); - } - -Returns true if this search backend supports phrase searching, and -false otherwise. 
- -=cut - -sub supports_phrase_searches { - return 0; -} +sub supports_phrase_searches { return 0; } +sub supports_fuzzy_searches { return 1; } =back =head1 SEE ALSO -L +L, L =cut diff -ruN CGI-Wiki-0.50/lib/CGI/Wiki.pm CGI-Wiki-0.50-search/lib/CGI/Wiki.pm --- CGI-Wiki-0.50/lib/CGI/Wiki.pm Sun Dec 21 11:57:04 2003 +++ CGI-Wiki-0.50-search/lib/CGI/Wiki.pm Mon Feb 23 10:53:14 2004 @@ -468,11 +468,28 @@ $self->search_obj->supports_phrase_searches( @args ) if $self->search_obj; } +=item B + + if ( $wiki->supports_fuzzy_searches ) { + return $wiki->fuzzy_title_match( 'Kings Cross, St Pancreas' ); + } + +Returns true if your chosen search backend supports fuzzy title searching, +and false otherwise. + +=cut + +sub supports_fuzzy_searches { + my ($self, @args) = @_; + $self->search_obj->supports_fuzzy_searches( @args ) if $self->search_obj; +} + + =item B -B This section of the documentation assumes you are using the -L backend; this feature has not yet been -implemented for the L backend. +B This section of the documentation assumes you are using a +search engine which supports fuzzy matching. (See above.) The +L backend in particular does not. $wiki->write_node( "King's Cross St Pancras", "A station." ); my %matches = $wiki->fuzzy_title_match( "Kings Cross St. 
Pancras" ); @@ -496,7 +513,11 @@ sub fuzzy_title_match { my ($self, @args) = @_; if ( $self->search_obj ) { - $self->search_obj->fuzzy_title_match( @args ); + if ($self->search_obj->supports_fuzzy_searches) { + $self->search_obj->fuzzy_title_match( @args ); + } else { + croak "Search backend doesn't support fuzzy searches"; + } } else { croak "No search backend defined."; } diff -ruN CGI-Wiki-0.50/t/013_fuzzy_title_match.t CGI-Wiki-0.50-search/t/013_fuzzy_title_match.t --- CGI-Wiki-0.50/t/013_fuzzy_title_match.t Fri Nov 21 22:40:22 2003 +++ CGI-Wiki-0.50-search/t/013_fuzzy_title_match.t Mon Feb 23 11:44:21 2004 @@ -15,7 +15,7 @@ my $search = $wiki->search_obj; skip "No search backend in this combination", 5 unless $search; skip "Search backend doesn't support fuzzy searching", 5 - unless $search->can("fuzzy_title_match"); + unless $search->supports_fuzzy_searches; # Fuzzy match with differing punctuation. $wiki->write_node( "King's Cross St Pancras", "station" )