[cgi-wiki-dev] [PATCH] Abstract searching to base class

Simon Cozens cgi-wiki-dev@earth.li
Mon, 23 Feb 2004 11:50:38 +0000


--LQksG6bCIzRHxTLp
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

This is a pretty hefty patch which abstracts out the two searching classes
into a Search::Base class, in preparation for adding a third search engine
backend. (Plucene)

While constructing this I found a particularly confusing thing about the
full text search backend. index_node is called as follows:
    $search->index_node($node, $content);

However, DBIxFTS says this:
    my ($self, $node) = @_;

(No content?)

    my $dbh = $self->{_dbh};
    my $fts_all = DBIx::FullTextSearch->open($dbh, "_content_and_title_fts");
    my $fts_titles = DBIx::FullTextSearch->open($dbh, "_title_fts");

    $fts_all->index_document($node);
    $fts_titles->index_document($node);

(No difference between purely-title and content-and-title indexing?)

Where does it get the content from?
-- 
"There is no statute of limitations on stupidity."
-- Randomly produced by a computer program called Markov3.

--LQksG6bCIzRHxTLp
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="base.patch"

diff -ruN CGI-Wiki-0.50/MANIFEST CGI-Wiki-0.50-search/MANIFEST
--- CGI-Wiki-0.50/MANIFEST	Thu Nov 20 11:12:58 2003
+++ CGI-Wiki-0.50-search/MANIFEST	Mon Feb 23 11:25:10 2004
@@ -9,6 +9,7 @@
 lib/CGI/Wiki/Extending.pod
 lib/CGI/Wiki/Formatter/Default.pm
 lib/CGI/Wiki/Plugin.pm
+lib/CGI/Wiki/Search/Base.pm
 lib/CGI/Wiki/Search/DBIxFTS.pm
 lib/CGI/Wiki/Search/SII.pm
 lib/CGI/Wiki/Setup/DBIxFTSMySQL.pm
diff -ruN CGI-Wiki-0.50/lib/CGI/Wiki/Search/Base.pm CGI-Wiki-0.50-search/lib/CGI/Wiki/Search/Base.pm
--- CGI-Wiki-0.50/lib/CGI/Wiki/Search/Base.pm	Thu Jan  1 01:00:00 1970
+++ CGI-Wiki-0.50-search/lib/CGI/Wiki/Search/Base.pm	Mon Feb 23 11:43:40 2004
@@ -0,0 +1,229 @@
+package CGI::Wiki::Search::Base;
+
+use strict;
+use Search::InvertedIndex;
+use Carp "croak";
+
+use vars qw( @ISA $VERSION );
+
+sub _abstract {
+    my $who = (caller(1))[3];
+    croak "$who is an abstract method which the ".(ref shift).
+    " class has not provided";
+}
+
+$VERSION = 0.01;
+
+=head1 NAME
+
+CGI::Wiki::Search::Base - Base class for CGI::Wiki search plugins
+
+=head1 SYNOPSIS
+
+  my $search = CGI::Wiki::Search::XXX->new( @args );
+  my %wombat_nodes = $search->search_nodes("wombat");
+
+This class details the methods that need to be overridden by search plugins.
+
+=cut
+
+=head1 METHODS
+
+=over 4
+
+=item B<new>
+
+  my $search = CGI::Wiki::Search::XXX->new( @args );
+
+Creates a new searcher. By default the arguments are just passed to
+C<_init>, so you may wish to override that instead.
+
+=cut
+
+sub new {
+    my ($class, @args) = @_;
+    my $self = {};
+    bless $self, $class;
+    return $self->_init(@args);
+}
+
+sub _init {
+    my ($self, %args) = @_;
+    @{$self}{keys %args} = values %args;
+    return $self;
+}
+
+=item B<search_nodes>
+
+  # Find all the nodes which contain the word 'expert'.
+  my %results = $search->search_nodes('expert');
+
+Returns a (possibly empty) hash whose keys are the node names and
+whose values are the scores in some kind of relevance-scoring system I
+haven't entirely come up with yet. For OR searches, this could
+initially be the number of terms that appear in the node, perhaps.
+
+Defaults to AND searches (if $and_or is not supplied, or is anything
+other than C<OR> or C<or>).
+
+Searches are case-insensitive.
+
+=cut
+
+sub search_nodes {
+    my ($self, $termstr, $and_or) = @_;
+
+    $and_or = lc($and_or);
+    unless ( defined $and_or and $and_or eq "or" ) {
+        $and_or = "and";
+    }
+
+    # Extract individual search terms.
+    my @terms = $self->analyze($termstr);
+
+    return $self->_do_search($and_or, \@terms);
+}
+
+sub _do_search { shift->_abstract };
+
+=item B<analyze>
+
+    @terms = $self->analyze($string)
+
+Splits a string into a set of terms for indexing and searching. Typically
+this is done case-insensitively, splitting at word boundaries, and extracting
+words that contain at least 2 word characters.
+
+=cut
+
+sub analyze {
+    my ($self, $string) = @_;
+    return grep { length > 1            # ignore single characters
+                 and ! /^\W*$/ }        # and things composed entirely
+                                        #   of non-word characters
+          split( /\b/,                  # split at word boundaries
+                       lc($string)      # be case-insensitive
+                   );
+}
+
+=item B<fuzzy_title_match>
+
+  $wiki->write_node( "King's Cross St Pancras", "A station." );
+  my %matches = $search->fuzzy_title_match( "Kings Cross St. Pancras" );
+
+Returns a (possibly empty) hash whose keys are the node names and
+whose values are the scores in some kind of relevance-scoring system I
+haven't entirely come up with yet.
+
+Note that even if an exact match is found, any other similar enough
+matches will also be returned. However, any exact match is guaranteed
+to have the highest relevance score.
+
+The matching is done against "canonicalised" forms of the search
+string and the node titles in the database: stripping vowels, repeated
+letters and non-word characters, and lowercasing.
+
+=cut
+
+sub fuzzy_title_match {
+    my ($self, $string) = @_;
+    my $canonical = $self->canonicalise_title( $string );
+    $self->_fuzzy_match($string, $canonical);
+}
+
+sub _fuzzy_match { shift->_abstract };
+
+=item B<index_node>
+
+  $search->index_node($node, $content);
+
+Indexes or reindexes the given node in the search engine indexes. 
+You must supply both the node name and its content.
+
+=cut
+
+sub index_node {
+    my ($self, $node, $content) = @_;
+    croak "Must supply a node name" unless $node;
+    croak "Must supply node content" unless defined $content;
+
+    # Index the individual words in the node content and title.
+    my @keys = $self->analyze("$content $node");
+    $self->_index_node($node, $content, \@keys);
+    $self->_index_fuzzy($node, $self->canonicalise_title( $node ));
+}
+
+sub _index_node  { shift->_abstract };
+sub _index_fuzzy { shift->_abstract };
+
+=item B<canonicalise_title>
+
+    $fuzzy = $self->canonicalise_title( $node );
+
+Returns the node title in a form suitable for fuzzy searching: lowercased,
+with punctuation, spaces and vowels removed, and doubled letters squashed.
+
+=cut
+
+sub canonicalise_title {
+    my ($self, $title) = @_;
+    return "" unless $title;
+    my $canonical = lc($title);
+    $canonical =~ s/\W//g;         # remove non-word characters
+    $canonical =~ s/[aeiouy]//g;   # remove vowels and 'y'
+    $canonical =~ tr/a-z//s;       # collapse doubled (or tripled, etc) letters
+    return $canonical;
+}
+
+=item B<delete_node>
+
+  $search->delete_node($node);
+
+Removes the given node from the search indexes.  NOTE: It's up to you to
+make sure the node is removed from the backend store.  Croaks on error.
+
+=cut
+
+sub delete_node {
+    my ($self, $node) = @_;
+    croak "Must supply a node name" unless $node;
+    $self->_delete_node($node);
+}
+
+sub _delete_node { shift->_abstract };
+
+=item B<supports_phrase_searches>
+
+  if ( $search->supports_phrase_searches ) {
+      return $search->search_nodes( '"fox in socks"' );
+  }
+
+Returns true if this search backend supports phrase searching, and
+false otherwise.
+
+=cut
+
+sub supports_phrase_searches { shift->_abstract };
+
+=item B<supports_fuzzy_searches>
+
+  if ( $search->supports_fuzzy_searches ) {
+      return $search->fuzzy_title_match("Kings Cross St Pancreas");
+  }
+
+Returns true if this search backend supports fuzzy title matching, and
+false otherwise.
+
+=cut
+
+sub supports_fuzzy_searches { shift->_abstract };
+
+=back
+
+=head1 SEE ALSO
+
+L<CGI::Wiki>
+
+=cut
+
+1;
diff -ruN CGI-Wiki-0.50/lib/CGI/Wiki/Search/DBIxFTS.pm CGI-Wiki-0.50-search/lib/CGI/Wiki/Search/DBIxFTS.pm
--- CGI-Wiki-0.50/lib/CGI/Wiki/Search/DBIxFTS.pm	Thu Nov 20 11:13:01 2003
+++ CGI-Wiki-0.50-search/lib/CGI/Wiki/Search/DBIxFTS.pm	Mon Feb 23 11:24:42 2004
@@ -3,6 +3,7 @@
 use strict;
 use DBIx::FullTextSearch;
 use Carp "croak";
+use base 'CGI::Wiki::Search::Base';
 
 use vars qw( @ISA $VERSION );
 
@@ -42,13 +43,6 @@
 
 =cut
 
-sub new {
-    my ($class, @args) = @_;
-    my $self = {};
-    bless $self, $class;
-    return $self->_init(@args);
-}
-
 sub _init {
     my ($self, %args) = @_;
     croak "Must supply a database handle" unless $args{dbh};
@@ -56,28 +50,8 @@
     return $self;
 }
 
-=item B<search_nodes>
-
-  # Find all the nodes which contain both the word 'expert' and the
-  # phrase 'wombat defenestration'.
-  %results = $search->search_nodes('expert "wombat defenestration"');
-
-  # Find all the nodes which contain at least one of the words
-  # 'buffy', 'pony', and 'pie'.
-  %results = $search->search_nodes('buffy pony pie', 'OR');
-
-Returns a (possibly empty) hash whose keys are the node names and
-whose values are the scores in some kind of relevance-scoring system I
-haven't entirely come up with yet. For OR searches, this could
-initially be the number of terms that appear in the node, perhaps.
-
-Defaults to AND searches (if $and_or is not supplied, or is anything
-other than C<OR> or C<or>).
-
-Searches are case-insensitive.
-
-=cut
-
+# We can't use the base version, since we're doing the analysis
+# differently between searching and indexing
 sub search_nodes {
     my ($self, $termstr, $and_or) = @_;
 
@@ -86,10 +60,6 @@
         $and_or = "AND";
     }
 
-    # Note: Not sure yet whether the term extraction below is going to be
-    # common between backends.  Move it back into CGI::Wiki if it turns
-    # out to be.
-
     # Extract individual search terms - first phrases (between double quotes).
     my @terms = ($termstr =~ m/"([^"]+)"/g);
     $termstr =~ s/"[^"]*"//g;
@@ -108,17 +78,8 @@
     return map { $_ => 1 } @finds;
 }
 
-=item B<index_node>
-
-  $search->index_node($node);
-
-Indexes or reindexes the given node in the FTS indexes in the backend
-storage.
-
-=cut
-
 sub index_node {
-    my ($self, $node) = @_;
+    my ($self, $node, $content) = @_;
 
     my $dbh = $self->{_dbh};
     my $fts_all = DBIx::FullTextSearch->open($dbh, "_content_and_title_fts");
@@ -131,16 +92,6 @@
     delete $fts_titles->{db_backend}; # ditto
 }
 
-=item B<delete_node>
-
-  $search->delete_node($node);
-
-Removes the given node from the search indexes.  NOTE: It's up to you to
-make sure the node is removed from the backend store.  Croaks on error,
-returns true on success.
-
-=cut
-
 sub delete_node {
     my ($self, $node) = @_;
     my $dbh = $self->{_dbh};
@@ -155,20 +106,8 @@
     return 1;
 }
 
-=item B<supports_phrase_searches>
-
-  if ( $search->supports_phrase_searches ) {
-      return $search->search_nodes( '"fox in socks"' );
-  }
-
-Returns true if this search backend supports phrase searching, and
-false otherwise.
-
-=cut
-
-sub supports_phrase_searches {
-    return 1;
-}
+sub supports_phrase_searches { return 1; }
+sub supports_fuzzy_searches  { return 0; }
 
 =back
 
diff -ruN CGI-Wiki-0.50/lib/CGI/Wiki/Search/SII.pm CGI-Wiki-0.50-search/lib/CGI/Wiki/Search/SII.pm
--- CGI-Wiki-0.50/lib/CGI/Wiki/Search/SII.pm	Sun Dec 21 11:31:16 2003
+++ CGI-Wiki-0.50-search/lib/CGI/Wiki/Search/SII.pm	Mon Feb 23 11:34:58 2004
@@ -7,6 +7,7 @@
 use vars qw( @ISA $VERSION );
 
 $VERSION = 0.08;
+use base 'CGI::Wiki::Search::Base';
 
 =head1 NAME
 
@@ -20,6 +21,8 @@
 
 Provides search-related methods for CGI::Wiki
 
+See also L<CGI::Wiki::Search::Base>
+
 =cut
 
 =head1 METHODS
@@ -53,13 +56,6 @@
 
 =cut
 
-sub new {
-    my ($class, @args) = @_;
-    my $self = {};
-    bless $self, $class;
-    return $self->_init(@args);
-}
-
 sub _init {
     my ($self, %args) = @_;
     my $indexdb = $args{indexdb};
@@ -74,42 +70,10 @@
     return $self;
 }
 
-=item B<search_nodes>
-
-  # Find all the nodes which contain the word 'expert'.
-  my %results = $search->search_nodes('expert');
-
-Returns a (possibly empty) hash whose keys are the node names and
-whose values are the scores in some kind of relevance-scoring system I
-haven't entirely come up with yet. For OR searches, this could
-initially be the number of terms that appear in the node, perhaps.
-
-Defaults to AND searches (if $and_or is not supplied, or is anything
-other than C<OR> or C<or>).
-
-Searches are case-insensitive.
-
-=cut
-
-sub search_nodes {
-    my ($self, $termstr, $and_or) = @_;
-
-    $and_or = lc($and_or);
-    unless ( defined $and_or and $and_or eq "or" ) {
-        $and_or = "and";
-    }
-
-    # Extract individual search terms.
-    my @terms = grep { length > 1            # ignore single characters
-                      and ! /^\W*$/ }        # and things composed entirely
-                                             #   of non-word characters
-               split( /\b/,                  # split at word boundaries
-                            lc($termstr)     # be case-insensitive
-                    );
-
-    # Create a leaf for each search term.
+sub _do_search {
+    my ($self, $and_or, $terms) = @_;
     my @leaves;
-    foreach my $term ( @terms ) {
+    foreach my $term ( @$terms ) {
         my $leaf = Search::InvertedIndex::Query::Leaf->new(-key   => $term,
                                                            -group => "nodes" );
         push @leaves, $leaf;
@@ -131,29 +95,8 @@
     return %results;
 }
 
-=item B<fuzzy_title_match>
-
-  $wiki->write_node( "King's Cross St Pancras", "A station." );
-  my %matches = $search->fuzzy_title_match( "Kings Cross St. Pancras" );
-
-Returns a (possibly empty) hash whose keys are the node names and
-whose values are the scores in some kind of relevance-scoring system I
-haven't entirely come up with yet.
-
-Note that even if an exact match is found, any other similar enough
-matches will also be returned. However, any exact match is guaranteed
-to have the highest relevance score.
-
-The matching is done against "canonicalised" forms of the search
-string and the node titles in the database: stripping vowels, repeated
-letters and non-word characters, and lowercasing.
-
-=cut
-
-sub fuzzy_title_match {
-    my ($self, $string) = @_;
-    my $canonical = $self->_canonicalise_title( $string );
-
+sub _fuzzy_match {
+    my ($self, $string, $canonical) = @_;
     my $leaf = Search::InvertedIndex::Query::Leaf->new(
         -key   => $canonical,
         -group => "fuzzy_titles" );
@@ -171,42 +114,21 @@
     return %results;
 }
 
-=item B<index_node>
-
-  $search->index_node($node, $content);
-
-Indexes or reindexes the given node in the L<Search::InvertedIndex>
-indexes.  You must supply both the node name and its content.
-
-=cut
-
-sub index_node {
-    my ($self, $node, $content) = @_;
-    croak "Must supply a node name" unless $node;
-    croak "Must supply node content" unless defined $content;
-
-    # Index the individual words in the node content and title.
-    my @keys = grep { length > 1                 # ignore single characters
-                      and ! /^\W*$/ }            # and things composed entirely
-                                                 #   of non-word characters
-               split( /\b/,                      # split at word boundaries
-                            lc(                  # be case-insensitive
-                                "$content $node" # index content and title
-                              )
-                    );
 
+sub _index_node {
+    my ($self, $node, $content, $keys) = @_;
     my $update = Search::InvertedIndex::Update->new(
         -group => "nodes",
         -index => $node,
         -data  => $content,
-        -keys => { map { $_ => 1 } @keys }
+        -keys => { map { $_ => 1 } @$keys }
     );
     $self->{_map}->update( -update => $update );
+}
 
-    # Index a canonicalised form of the title for fuzzy searches.
-    my $canonical = $self->_canonicalise_title( $node );
-
-    $update = Search::InvertedIndex::Update->new(
+sub _index_fuzzy {
+    my ($self, $node, $canonical) = @_;
+    my $update = Search::InvertedIndex::Update->new(
         -group => "fuzzy_titles",
         -index => $node . "_fuzzy_title",
         -data  => $node,
@@ -215,51 +137,19 @@
     $self->{_map}->update( -update => $update );
 }
 
-sub _canonicalise_title {
-    my ($self, $title) = @_;
-    return "" unless $title;
-    my $canonical = lc($title);
-    $canonical =~ s/\W//g;         # remove non-word characters
-    $canonical =~ s/[aeiouy]//g;   # remove vowels and 'y'
-    $canonical =~ s/(\w)\1+/$1/eg; # collapse doubled (or tripled, etc) letters
-    return $canonical;
-}
-
-=item B<delete_node>
-
-  $search->delete_node($node);
-
-Removes the given node from the search indexes.  NOTE: It's up to you to
-make sure the node is removed from the backend store.  Croaks on error.
-
-=cut
-
-sub delete_node {
+sub _delete_node {
     my ($self, $node) = @_;
-    croak "Must supply a node name" unless $node;
     $self->{_map}->remove_index_from_all({ -index => $node });
 }
 
-=item B<supports_phrase_searches>
-
-  if ( $search->supports_phrase_searches ) {
-      return $search->search_nodes( '"fox in socks"' );
-  }
-
-Returns true if this search backend supports phrase searching, and
-false otherwise.
-
-=cut
-
-sub supports_phrase_searches {
-    return 0;
-}
+sub supports_phrase_searches { return 0; }
+sub supports_fuzzy_searches  { return 1; }
 
 =back
 
 =head1 SEE ALSO
 
-L<CGI::Wiki>
+L<CGI::Wiki>, L<CGI::Wiki::Search::Base>
 
 =cut
 
diff -ruN CGI-Wiki-0.50/lib/CGI/Wiki.pm CGI-Wiki-0.50-search/lib/CGI/Wiki.pm
--- CGI-Wiki-0.50/lib/CGI/Wiki.pm	Sun Dec 21 11:57:04 2003
+++ CGI-Wiki-0.50-search/lib/CGI/Wiki.pm	Mon Feb 23 10:53:14 2004
@@ -468,11 +468,28 @@
     $self->search_obj->supports_phrase_searches( @args ) if $self->search_obj;
 }
 
+=item B<supports_fuzzy_searches>
+
+  if ( $wiki->supports_fuzzy_searches ) {
+      return $wiki->fuzzy_title_match( 'Kings Cross, St Pancreas' );
+  }
+
+Returns true if your chosen search backend supports fuzzy title searching,
+and false otherwise.
+
+=cut
+
+sub supports_fuzzy_searches {
+    my ($self, @args) = @_;
+    $self->search_obj->supports_fuzzy_searches( @args ) if $self->search_obj;
+}
+
+
 =item B<fuzzy_title_match>
 
-B<NOTE:> This section of the documentation assumes you are using the
-L<CGI::Wiki::Search::SII> backend; this feature has not yet been
-implemented for the L<CGI::Wiki::Search::DBIxFTS> backend.
+B<NOTE:> This section of the documentation assumes you are using a
+search engine which supports fuzzy matching. (See above.) The 
+L<CGI::Wiki::Search::DBIxFTS> backend in particular does not.
 
   $wiki->write_node( "King's Cross St Pancras", "A station." );
   my %matches = $wiki->fuzzy_title_match( "Kings Cross St. Pancras" );
@@ -496,7 +513,11 @@
 sub fuzzy_title_match {
     my ($self, @args) = @_;
     if ( $self->search_obj ) {
-        $self->search_obj->fuzzy_title_match( @args );
+        if ($self->search_obj->supports_fuzzy_searches) {
+            $self->search_obj->fuzzy_title_match( @args );
+        } else {
+            croak "Search backend doesn't support fuzzy searches";
+        }
     } else {
         croak "No search backend defined.";
     }
diff -ruN CGI-Wiki-0.50/t/013_fuzzy_title_match.t CGI-Wiki-0.50-search/t/013_fuzzy_title_match.t
--- CGI-Wiki-0.50/t/013_fuzzy_title_match.t	Fri Nov 21 22:40:22 2003
+++ CGI-Wiki-0.50-search/t/013_fuzzy_title_match.t	Mon Feb 23 11:44:21 2004
@@ -15,7 +15,7 @@
         my $search = $wiki->search_obj;
         skip "No search backend in this combination", 5 unless $search;
         skip "Search backend doesn't support fuzzy searching", 5
-            unless $search->can("fuzzy_title_match");
+            unless $search->supports_fuzzy_searches;
 
         # Fuzzy match with differing punctuation.
         $wiki->write_node( "King's Cross St Pancras", "station" )

--LQksG6bCIzRHxTLp--