[cgi-wiki-dev] patch to CGI::Wiki

Tom Insam cgi-wiki-dev@earth.li
Fri, 3 Sep 2004 16:34:38 +0100


--Apple-Mail-6--45638562
Content-Transfer-Encoding: 7bit
Content-Type: text/plain;
	charset=US-ASCII;
	format=flowed

This is a nasty first-cut patch to add proper character set support to 
CGI::Wiki, as per my anguished rambling in #openguides today. It's 
nasty. But then, frankly, CGI::Wiki could do with a bit of database 
abstraction...

good things about it: I don't think it'll change anything if you don't 
use it. The default is iso-8859-1, which is the effective default if 
you don't say anything else. Search::InvertedIndex is broken and won't 
index stuff with the utf-8 flag set, so I index the raw bytes. Again, 
this means that current live data shouldn't be affected. Not that I've 
tried this.

bad things about it: apart from the ugliness? It requires perl 5.8. 
This is a very bad thing, but there are ways of getting around it. New 
version soon. There's a work-around in the patch for HTML::Parser's 
uselessness as well; this can go away once the fix for 
http://rt.cpan.org/NoAuth/Bug.html?id=7014 has landed.

I have an example living at http://movieos.org/wiki/wiki.cgi - this is 
a patched CGI::Wiki and a CGI::Wiki::Kwiki patched to use it. You'll 
need a decent font. :-)

tom

--Apple-Mail-6--45638562
Content-Transfer-Encoding: 7bit
Content-Type: application/octet-stream;
	x-unix-mode=0664;
	name="CGI-Wiki_UTF8_patch"
Content-Disposition: attachment;
	filename=CGI-Wiki_UTF8_patch

Only in .: .DS_Store
Only in .: Makefile
Only in .: blib
Only in ./lib: .DS_Store
Only in ./lib/CGI: .DS_Store
Only in ./lib/CGI/Wiki: .DS_Store
diff -ur /tmp/CGI-Wiki-0.54/lib/CGI/Wiki/Store/Database.pm ./lib/CGI/Wiki/Store/Database.pm
--- /tmp/CGI-Wiki-0.54/lib/CGI/Wiki/Store/Database.pm	Fri Jun 25 20:26:14 2004
+++ ./lib/CGI/Wiki/Store/Database.pm	Fri Sep  3 14:39:56 2004
@@ -10,6 +10,7 @@
 use Time::Seconds;
 use Carp qw( carp croak );
 use Digest::MD5 qw( md5_hex );
+use Encode;
 
 $VERSION = '0.22';
 
@@ -86,6 +87,7 @@
         $self->{_dbuser} = $args{dbuser} || "";
         $self->{_dbpass} = $args{dbpass} || "";
         $self->{_dbhost} = $args{dbhost} || "";
+        $self->{_charset} = $args{charset} || "iso-8859-1";
 
         # Connect to database and store the database handle.
         my ($dbname, $dbuser, $dbpass, $dbhost) =
@@ -167,12 +169,12 @@
     # specified in the call.
     my $dbh = $self->dbh;
     my $sql = "SELECT metadata_type, metadata_value FROM metadata WHERE "
-         . "node=" . $dbh->quote($args{name}) . " AND "
-         . "version=" . $dbh->quote($data{version});
+         . "node=" . $dbh->quote($self->charset_encode($args{name})) . " AND "
+         . "version=" . $dbh->quote($self->charset_encode($data{version}));
     my $sth = $dbh->prepare($sql);
     $sth->execute or croak $dbh->errstr;
     my %metadata;
-    while ( my ($type, $val) = $sth->fetchrow_array ) {
+    while ( my ($type, $val) = $self->charset_decode( $sth->fetchrow_array ) ) {
         if ( defined $metadata{$type} ) {
 	    push @{$metadata{$type}}, $val;
 	} else {
@@ -194,13 +196,13 @@
     my $sql;
     if ( $args{version} ) {
         $sql = "SELECT text, version, modified FROM content"
-             . " WHERE  name=" . $dbh->quote($args{name})
-             . " AND version=" . $dbh->quote($args{version});
+             . " WHERE  name=" . $dbh->quote($self->charset_encode($args{name}))
+             . " AND version=" . $dbh->quote($self->charset_encode($args{version}));
     } else {
         $sql = "SELECT text, version, modified FROM node
-                WHERE name=" . $dbh->quote($args{name});
+                WHERE name=" . $dbh->quote($self->charset_encode($args{name}));
     }
-    my @results = $dbh->selectrow_array($sql);
+    my @results = $self->charset_decode( $dbh->selectrow_array($sql) );
     @results = ("", 0, "") unless scalar @results;
     my %data;
     @data{ qw( content version last_modified ) } = @results;
@@ -216,7 +218,7 @@
         $string .= "\0\0\0" . $key . "\0\0"
                  . join("\0", sort @{$metadata{$key}} );
     }
-    return md5_hex($string);
+    return md5_hex($self->charset_encode($string));
 }
 
 # Expects an array of hashes whose keys and values are scalars.
@@ -293,7 +295,7 @@
     my $sth = $dbh->prepare($sql);
     $sth->execute or croak $dbh->errstr;
     my @backlinks;
-    while ( my $backlink = $sth->fetchrow_array ) {
+    while ( my ($backlink) = $self->charset_decode( $sth->fetchrow_array ) ) {
         push @backlinks, $backlink;
     }
     return @backlinks;
@@ -320,7 +322,7 @@
     my $sth = $dbh->prepare($sql);
     $sth->execute or croak $dbh->errstr;
     my @links;
-    while ( my $link = $sth->fetchrow_array ) {
+    while ( my ($link) = $self->charset_decode( $sth->fetchrow_array ) ) {
         push @links, $link;
     }
     return @links;
@@ -395,15 +397,15 @@
         croak "Can't get version number" unless $version;
         $version++;
         $sql = "UPDATE node SET version=" . $dbh->quote($version)
-	     . ", text=" . $dbh->quote($content)
+	     . ", text=" . $dbh->quote($self->charset_encode($content))
 	     . ", modified=" . $dbh->quote($timestamp)
-	     . " WHERE name=" . $dbh->quote($node);
+	     . " WHERE name=" . $dbh->quote($self->charset_encode($node));
 	$dbh->do($sql) or croak "Error updating database: " . DBI->errstr;
     } else {
         $version = 1;
         $sql = "INSERT INTO node (name, version, text, modified)
                 VALUES ("
-             . join(", ", map { $dbh->quote($_) }
+             . join(", ", map { $dbh->quote($self->charset_encode($_)) }
 		              ($node, $version, $content, $timestamp)
                    )
              . ")";
@@ -413,7 +415,7 @@
     # In either case we need to add to the history.
     $sql = "INSERT INTO content (name, version, text, modified)
             VALUES ("
-         . join(", ", map { $dbh->quote($_) }
+         . join(", ", map { $dbh->quote($self->charset_encode($_)) }
 		          ($node, $version, $content, $timestamp)
                )
          . ")";
@@ -421,10 +423,10 @@
 
     # And to the backlinks.
     $dbh->do("DELETE FROM internal_links WHERE link_from="
-             . $dbh->quote($node) ) or croak $dbh->errstr;
+             . $dbh->quote($self->charset_encode($node)) ) or croak $dbh->errstr;
     foreach my $links_to ( @links_to ) {
         $sql = "INSERT INTO internal_links (link_from, link_to) VALUES ("
-             . join(", ", map { $dbh->quote($_) } ( $node, $links_to ) ) . ")";
+             . join(", ", map { $dbh->quote($self->charset_encode($_)) } ( $node, $links_to ) ) . ")";
         # Better to drop a backlink or two than to lose the whole update.
         # Shevek wants a case-sensitive wiki, Jerakeen wants a case-insensitive
         # one, MySQL compares case-sensitively on varchars unless you add
@@ -457,7 +459,7 @@
             foreach my $value ( @values ) {
                 my $sql = "INSERT INTO metadata "
                     . "(node, version, metadata_type, metadata_value) VALUES ("
-                    . join(", ", map { $dbh->quote($_) }
+                    . join(", ", map { $dbh->quote($self->charset_encode($_)) }
                                  ( $node, $version, $type, $value )
                           )
                     . ")";
@@ -469,7 +471,7 @@
             my $value_to_store = $self->_checksum_hashes( @values );
             my $sql = "INSERT INTO metadata "
                     . "(node, version, metadata_type, metadata_value) VALUES ("
-                    . join(", ", map { $dbh->quote($_) }
+                    . join(", ", map { $dbh->quote($self->charset_encode($_)) }
                            ( $node, $version, $type_to_store, $value_to_store )
                           )
                     . ")";
@@ -843,7 +845,7 @@
         my $sth = $dbh->prepare( "SELECT metadata_type, metadata_value
                                   FROM metadata WHERE node=? AND version=?" );
         $sth->execute( $find->{name}, $find->{version} );
-        while ( my ($type, $value) = $sth->fetchrow_array ) {
+        while ( my ($type, $value) = $self->charset_decode( $sth->fetchrow_array ) ) {
 	    if ( defined $metadata{$type} ) {
                 push @{$metadata{$type}}, $value;
 	    } else {
@@ -869,7 +871,7 @@
     my $dbh = $self->dbh;
     my $sql = "SELECT name FROM node;";
     my $nodes = $dbh->selectall_arrayref($sql); 
-    return ( map { $_->[0] } (@$nodes) );
+    return ( map { $self->charset_decode( $_->[0] ) } (@$nodes) );
 }
 
 =item B<list_nodes_by_metadata>
@@ -1011,6 +1013,30 @@
     return if $self->{_external_dbh};
     my $dbh = $self->dbh;
     $dbh->disconnect if $dbh;
+}
+
+# decode a string of octets into perl's internal encoding, based on the
+# charset parameter we were passed. Takes a list, returns a list.
+sub charset_decode {
+  my $self = shift;
+  my @input = @_;
+  my @output;
+  for (@input) {
+    push( @output, Encode::decode( $self->{_charset}, $_ ) );
+  }
+  return @output;
+}
+
+# convert a perl string into a series of octets we can put into the database
+# takes a list, returns a list
+sub charset_encode {
+  my $self = shift;
+  my @input = @_;
+  my @output;
+  for (@input) {
+    push( @output, Encode::encode( $self->{_charset}, $_ ) );
+  }
+  return @output;
 }
 
 1;
Only in ./lib/CGI/Wiki: TestConfig.pm
Only in ./lib/CGI/Wiki: TestConfig.pm~
diff -ur /tmp/CGI-Wiki-0.54/lib/CGI/Wiki.pm ./lib/CGI/Wiki.pm
--- /tmp/CGI-Wiki-0.54/lib/CGI/Wiki.pm	Fri Jun 25 20:29:34 2004
+++ ./lib/CGI/Wiki.pm	Fri Sep  3 15:37:27 2004
@@ -474,8 +474,9 @@
 
 sub search_nodes {
     my ($self, @args) = @_;
+    my @terms = map { $self->store->charset_encode($_) } @args;
     if ( $self->search_obj ) {
-        $self->search_obj->search_nodes( @args );
+        $self->search_obj->search_nodes( @terms );
     } else {
         croak "No search backend defined.";
     }
@@ -679,7 +680,7 @@
 
     my $search = $self->{_search};
     if ($search and $content) {
-        $search->index_node($node, $content);
+        $search->index_node($node, $store->charset_encode($content) );
     }
     return 1;
 }
@@ -700,7 +701,15 @@
     my $formatter = $self->{_formatter};
     # Add on $self to the call so the formatter can access things like whether
     # a linked-to node exists, etc.
-    return $formatter->format( $raw, $self, $metadata );
+    my $result = $formatter->format( $raw, $self, $metadata );
+    
+    # Nasty hack to work around an HTML::Parser deficiency
+    use Encode;
+    if (Encode::is_utf8($raw)) {
+      Encode::_utf8_on( $result );
+    }
+    
+    return $result;
 }
 
 =item B<store>
Only in .: pm_to_blib
Only in ./t: sii-db-file-test.db
Only in ./t: sqlite-test.db

--Apple-Mail-6--45638562--