[PATCH] postgres support for DBIx::TextIndex

Jim Blomo Fri, 19 Mar 2004 02:03:49 -0800

Hello,

I wrote a Pg.pm module for DBIx::TextIndex, which previously only
supported mysql.  I've done testing with around ~150k documents and
everything seems to be working well.  In order to get this working, I
had to change TextIndex.pm in a few ways:


- Pass primary key twice to $sth->execute for updates.  This is because
  I had to break down mysql's nonstandard "replace into" to 
  "delete from ...; insert into", so I needed an extra placeholder for
  the delete statement.
  **NOTE** This breaks the mysql (default) DB plugin.

- Use $sth->bind_param to do binding. This is because postgres' bytea
  type is not quoted properly by default.  This should not break mysql.

- Deprecate use of Text::Unaccent.  It looks like Text::Unaccent doesn't
  work correctly with Perl 5.8.2, probably because it uses some internal
  encoding for strings now.  It doesn't pass `make test` either.

If someone who has mysql wants to look at this patch, I think mysql.pm
can be slightly changed to accommodate it.  Please CC me on replies, as I
am not subscribed to this list.

Jim Blomo

diff -urN DBIx.orig/TextIndex/Pg.pm DBIx/TextIndex/Pg.pm
--- DBIx.orig/TextIndex/Pg.pm   1969-12-31 16:00:00.000000000 -0800
+++ DBIx/TextIndex/Pg.pm        2004-03-17 12:47:19.000000000 -0800
@@ -0,0 +1,515 @@
+#!/usr/bin/perl
+
+# PostgreSQL module for DBIx::TextIndex
+
+use strict;
+
+sub db_add_mask {
+    my $self = shift;
+    return <<END;
+DELETE FROM $self->{MASK_TABLE} where mask = ?;
+INSERT into $self->{MASK_TABLE} (mask, docs_vector) values (?, ?)
+END
+
+}
+
+sub db_delete_mask {
+    my $self = shift;
+    return <<END;
+delete from $self->{MASK_TABLE}
+where mask = ?
+END
+
+}
+
+sub db_drop_table {
+    my $self = shift;
+    my $table = shift;
+
+       if( $self->{INDEX_DBH}->selectrow_array("SELECT tablename FROM pg_tables WHERE 
tablename = '$table'") ) {
+               $self->{INDEX_DBH}->do("DROP TABLE $table");
+       }
+}
+
+sub db_table_exists {
+    my $self = shift;
+    my $table = shift;
+
+       return 1 if $self->{INDEX_DBH}->selectrow_array("SELECT tablename FROM 
pg_tables WHERE tablename = '$table'");
+    return 0;
+}
+
+sub db_create_collection_table {
+    my $self = shift;
+    return <<END;
+CREATE TABLE collection (
+  collection varchar(30) PRIMARY KEY default '',
+  version numeric(10,2) NOT NULL default 0.00,
+  max_indexed_id int NOT NULL default 0,
+  doc_table varchar(30) NOT NULL default '',
+  doc_id_field varchar(30) NOT NULL default '',
+  doc_fields varchar(250) NOT NULL default '',
+  charset varchar(50) NOT NULL default '',
+  stoplist varchar(255) NOT NULL default '',
+  proximity_index varchar(1) NOT NULL default '0',
+  error_empty_query varchar(255) NOT NULL default '',
+  error_quote_count varchar(255) NOT NULL default '',
+  error_no_results varchar(255) NOT NULL default '',
+  error_no_results_stop varchar(255) NOT NULL default '',
+  max_word_length int NOT NULL default 0,
+  result_threshold int NOT NULL default 0,
+  phrase_threshold int NOT NULL default 0,
+  min_wildcard_length int NOT NULL default 0,
+  decode_html_entities varchar(1) NOT NULL default '0',
+  scoring_method varchar(20) NOT NULL default '',
+  update_commit_interval int NOT NULL default 0
+)
+END
+
+}
+
+sub db_insert_collection_table_row {
+    my $self = shift;
+    my $row = shift;
+    my @fields;
+    my @values;
+    while (my ($field, $value) = each %$row) {
+       push @fields, $field;
+       push @values, $value;
+    }
+    my $collection_fields = join ', ', @fields;
+    my $place_holders = join ', ', (('?') x ($#fields + 1)); 
+    my $sql = <<END;
+insert into $self->{COLLECTION_TABLE}
+($collection_fields)
+values ($place_holders)
+END
+    $self->{INDEX_DBH}->do($sql, undef, @values);
+
+}
+
+sub db_fetch_max_indexed_id {
+    my $self = shift;
+
+    return <<END;
+SELECT max_indexed_id
+FROM $self->{COLLECTION_TABLE}
+WHERE collection = ?
+END
+
+}
+
+sub db_fetch_collection_version {
+    my $self = shift;
+
+    return <<END;
+select max(version) from $self->{COLLECTION_TABLE}
+END
+
+}
+
+sub db_collection_count {
+    my $self = shift;
+
+    return <<END;
+select count(*) from $self->{COLLECTION_TABLE}
+END
+
+}
+
+sub db_update_collection_info {
+    my $self = shift;
+    my $field = shift;
+
+    return <<END;
+update $self->{COLLECTION_TABLE}
+set $field = ?
+where collection = ?
+END
+
+}
+
+sub db_delete_collection_info {
+    my $self = shift;
+
+    return <<END;
+delete from $self->{COLLECTION_TABLE}
+where collection = ?
+END
+
+}
+
+sub db_store_collection_info {
+    my $self = shift;
+
+    my @collection_fields = @{$self->{COLLECTION_FIELDS}};
+    my $collection_fields = join ', ', @collection_fields;
+    my $place_holders = join ', ', (('?') x ($#collection_fields + 1)); 
+    return <<END;
+insert into $self->{COLLECTION_TABLE}
+($collection_fields)
+values
+($place_holders)
+END
+
+}
+
+sub db_fetch_collection_info {
+    my $self = shift;
+
+    my $collection_fields = join ', ', @{$self->{COLLECTION_FIELDS}};
+
+    return <<END;
+select
+$collection_fields
+from $self->{COLLECTION_TABLE}
+where collection = ?
+END
+
+}
+
+sub db_fetch_all_collection_rows {
+    my $self = shift;
+
+    return <<END;
+select * from $self->{COLLECTION_TABLE}
+END
+
+}
+
+sub db_phrase_scan_cz {
+    my $self = shift;
+    my $result_docs = shift;
+    my $fno = shift;
+
+    return <<END;
+select $self->{DOC_ID_FIELD}, $self->{DOC_FIELDS}->[$fno]
+from   $self->{DOC_TABLE}
+where  $self->{DOC_ID_FIELD} in ($result_docs)
+END
+
+}
+
+sub db_phrase_scan {
+    my $self = shift;
+    my $result_docs = shift;
+    my $fno = shift;
+
+    return <<END;
+select $self->{DOC_ID_FIELD}
+from   $self->{DOC_TABLE}
+where  $self->{DOC_ID_FIELD} IN ($result_docs)
+       and $self->{DOC_FIELDS}->[$fno] like ?
+END
+
+}
+
+sub db_fetch_maxtf {
+    my $self = shift;
+    my $fields = shift;
+
+    return <<END;
+select field_no, maxtf
+from $self->{MAXTF_TABLE}
+where field_no in ($fields)
+END
+
+}
+
+sub db_fetch_docweights {
+    my $self = shift;
+    my $fields = shift;
+
+    return <<END;
+select field_no, avg_docweight, docweights
+from $self->{DOCWEIGHTS_TABLE}
+where field_no in ($fields)
+END
+
+}
+
+
+sub db_fetch_all_docs_vector {
+    my $self = shift;
+    return <<END;
+SELECT all_docs_vector
+from $self->{ALL_DOCS_VECTOR_TABLE}
+END
+
+}
+
+sub db_update_all_docs_vector {
+    my $self = shift;
+    return <<END;
+DELETE FROM $self->{ALL_DOCS_VECTOR_TABLE} WHERE id = 1;
+INSERT INTO $self->{ALL_DOCS_VECTOR_TABLE}
+(id, all_docs_vector)
+VALUES (1, ?)
+END
+}
+
+sub db_docfreq_t {
+    my $self = shift;
+    my $table = shift;
+
+    return <<END;
+select docfreq_t from $table
+where word = ?
+END
+
+}
+
+sub db_fetch_mask {
+    my $self = shift;
+
+    return <<END;
+select docs_vector
+from $self->{MASK_TABLE}
+where mask = ?
+END
+
+}
+
+sub db_fetch_term_docs {
+    my $self = shift;
+    my $table = shift;
+
+    return <<END;
+select term_docs
+from $table
+where word = ?
+END
+
+}
+
+sub db_fetch_term_freq_and_docs {
+    my $self = shift;
+    my $table = shift;
+    return <<END;
+select docfreq_t, term_docs
+from $table
+where word = ?
+END
+
+}
+
+sub db_fetch_words {
+    my $self = shift;
+    my $table = shift;
+
+    return <<END;
+select word
+from $table
+where word like ?
+END
+
+}
+
+sub db_ping_doc {
+    my $self = shift;
+
+    return <<END;
+select 1
+from $self->{DOC_TABLE}
+where $self->{DOC_ID_FIELD} = ?
+END
+
+}
+
+sub db_fetch_doc {
+    my $self = shift;
+    my $field = shift;
+
+    return <<END;
+select $field
+from $self->{DOC_TABLE}
+where $self->{DOC_ID_FIELD} = ?
+END
+
+}
+
+sub db_update_maxtf {
+    my $self = shift;
+
+    return <<END;
+DELETE FROM $self->{MAXTF_TABLE} WHERE field_no = ?;
+INSERT into $self->{MAXTF_TABLE} (field_no, maxtf) values (?, ?)
+END
+
+}
+
+sub db_update_docweights {
+    my $self = shift;
+
+    return <<END;
+DELETE FROM $self->{DOCWEIGHTS_TABLE} WHERE field_no = ?;
+INSERT into $self->{DOCWEIGHTS_TABLE} (field_no, avg_docweight, docweights) values 
(?, ?, ?)
+END
+
+}
+
+sub db_inverted_replace {
+    my $self = shift;
+    my $table = shift;
+
+    return <<END;
+DELETE FROM $table WHERE word = ?;
+INSERT into $table
+(word, docfreq_t, term_docs)
+values (?, ?, ?)
+END
+
+}
+
+sub db_inverted_remove {
+    my $self = shift;
+    my $table = shift;
+    
+    return <<END;
+delete from $table
+where word = ?
+END
+
+}
+
+sub db_inverted_select {
+    my $self = shift;
+    my $table = shift;
+
+    return <<END;
+select docfreq_t, term_docs
+from $table
+where word = ?
+END
+
+}
+
+sub db_create_mask_table {
+    my $self = shift;
+
+    return <<END;
+create table $self->{MASK_TABLE} (
+  mask             varchar(100)            primary key,
+  docs_vector text                not null
+)
+END
+
+}
+
+sub db_create_docweights_table {
+    my $self = shift;
+    return <<END;
+create table $self->{DOCWEIGHTS_TABLE} (
+  field_no        integer         primary key,
+  avg_docweight    float                   not null,
+  docweights      bytea                   not null
+)
+END
+}
+
+sub db_create_maxterm_table {
+    my $self = shift;
+
+    return <<END;
+create table $self->{MAXTF_TABLE} (
+  field_no        integer         primary key,
+  maxtf           bytea                   not null
+)
+END
+
+}
+
+sub db_create_all_docs_vector_table {
+    my $self = shift;
+
+    return <<END;
+CREATE TABLE $self->{ALL_DOCS_VECTOR_TABLE} (
+  id               INT PRIMARY KEY,
+  all_docs_vector  text              NOT NULL
+)
+END
+}
+
+sub db_create_inverted_table {
+    my $self = shift;
+    my $table = shift;
+    my $max_word = $self->{MAX_WORD_LENGTH};
+
+    return <<END;
+create table $table (
+  word             varchar($max_word)      primary key,
+  docfreq_t       int not null,
+  term_docs       bytea                   not null
+)
+END
+
+}
+
+sub db_pindex_search {
+    my $self = shift;
+    my $fno = shift;
+    my $words = shift;
+    my $docs = shift;
+
+    return <<END;
+select word, doc, pos
+from $self->{PINDEX_TABLES}->[$fno]
+where doc in ($docs) and word in ($words)
+order by doc
+END
+
+}
+
+sub db_pindex_create {
+    my $self = shift;
+    my $table = shift;
+    my $max_word = $self->{MAX_WORD_LENGTH};
+
+    return <<END;
+create table $table (
+  word            varchar($max_word)      not null,
+  doc             integer                 not null,
+  pos             integer                 not null,
+  primary key             (doc, word)
+)
+END
+
+}
+
+sub db_pindex_add {
+    my $self = shift;
+    my $table = shift;
+
+    return <<END;
+insert into $table (word, doc, pos)
+values (?, ?, ?)
+END
+
+}
+
+sub db_pindex_remove {
+    my $self = shift;
+    my $table = shift;
+    my $docs = shift;
+
+    return <<END;
+delete from $table
+where doc in ($docs)
+END
+
+}
+
+
+sub db_total_words {
+    my $self = shift;
+    my $table = shift;
+
+    return <<END;
+select SUM(docfreq_t)
+from $table
+END
+
+}
+
+1;
+
diff -urN DBIx.orig/TextIndex.pm DBIx/TextIndex.pm
--- DBIx.orig/TextIndex.pm      2003-10-01 13:30:51.000000000 -0700
+++ DBIx/TextIndex.pm   2004-03-17 16:39:29.000000000 -0800
@@ -13,7 +13,9 @@
 use DBIx::TextIndex::QueryParser;
 use DBIx::TextIndex::TermDocsCache;
 use HTML::Entities ();
-use Text::Unaccent qw(unac_string);
+#use Text::Unaccent qw(unac_string);
+
+use DBD::Pg;
 
 my $GEN = 'DBIx::TextIndex::Exception::General';
 my $DA  = 'DBIx::TextIndex::Exception::DataAccess';
@@ -239,7 +241,7 @@
     $vector->Index_List_Store(@$ids);
 
     print "Adding mask ($mask) to table $self->{MASK_TABLE}\n" if $PA > 1;
-    $self->{INDEX_DBH}->do($self->db_add_mask, undef, $mask, $vector->to_Enum);
+    $self->{INDEX_DBH}->do($self->db_add_mask, undef, $mask, $mask, $vector->to_Enum);
     return 1;
 }
 
@@ -429,7 +431,11 @@
            $maxtf[$doc_id] = 0;
        }
        my $packed_maxtf = pack 'w' x ($#maxtf + 1), @maxtf;
-       $sth->execute($fno, $packed_maxtf);
+       #$sth->execute($fno, $packed_maxtf);
+    $sth->bind_param( 1, $fno );
+    $sth->bind_param( 2, $fno );
+    $sth->bind_param( 3, $packed_maxtf, { pg_type => DBD::Pg::PG_BYTEA } );
+    $sth->execute();
     }
 }
 
@@ -450,7 +456,12 @@
        }
        my $packed_w_d = pack 'f*', @w_d;
        # FIXME: we should update the average, leave it alone for now
-       $sth->execute($fno, $self->{AVG_W_D}->[$fno], $packed_w_d);
+       #$sth->execute($fno, $self->{AVG_W_D}->[$fno], $packed_w_d);
+       $sth->bind_param( 1, $fno );
+       $sth->bind_param( 2, $fno );
+       $sth->bind_param( 3, $self->{AVG_W_D}->[$fno] );
+       $sth->bind_param( 4, $packed_w_d, { pg_type => DBD::Pg::PG_BYTEA } );
+       $sth->execute();
     }
 }
 
@@ -504,8 +515,14 @@
                push @new_term_docs, ($term_docs->[$i], $term_docs->[$i + 1]);
            }
 
-           $sth_replace->execute($word, $docfreq_t,
-                                 pack_term_docs([EMAIL PROTECTED]));
+               #$sth_replace->execute($word, $docfreq_t,
+               #         pack_term_docs([EMAIL PROTECTED]));
+           $sth_replace->bind_param( 1, $word );
+           $sth_replace->bind_param( 2, $word );
+           $sth_replace->bind_param( 3, $docfreq_t );
+           $sth_replace->bind_param( 4, pack_term_docs( [EMAIL PROTECTED] ),
+               { pg_type => DBD::Pg::PG_BYTEA } );
+           $sth_replace->execute();
        }
     }
 }
@@ -1706,7 +1723,7 @@
 sub _lc_and_unac {
     my $self = shift;
     my $s = shift;
-    $s = unac_string($self->{CHARSET}, $s);
+       #$s = unac_string($self->{CHARSET}, $s);
     $s = lc($s);
     return $s;
 }
@@ -1749,7 +1766,11 @@
        }
        $maxtf[0] = 0 unless defined $maxtf[0];
        my $packed_maxtf = pack 'w' x ($#maxtf + 1), @maxtf;
-       $sth->execute($fno, $packed_maxtf);
+       #$sth->execute($fno, $packed_maxtf);
+       $sth->bind_param( 1, $fno );
+       $sth->bind_param( 2, $fno );
+       $sth->bind_param( 3, $packed_maxtf, { pg_type => DBD::Pg::PG_BYTEA } );
+       $sth->execute();
     }
     # Delete temporary in-memory structure
     delete($self->{NEW_MAXTF});
@@ -1780,7 +1801,12 @@
        $w_d[0] = 0 unless defined $w_d[0];
        # FIXME: this takes too much space
        my $packed_w_d = pack 'f*', @w_d;
-       $sth->execute($fno, $avg_w_d, $packed_w_d);
+       #$sth->execute($fno, $avg_w_d, $packed_w_d);
+       $sth->bind_param( 1, $fno );
+       $sth->bind_param( 2, $fno );
+       $sth->bind_param( 3, $avg_w_d );
+       $sth->bind_param( 4, $packed_w_d, { pg_type => DBD::Pg::PG_BYTEA } );
+       $sth->execute();
     }
     # Delete temporary in-memory structure
     delete($self->{NEW_W_D});
@@ -1812,11 +1838,16 @@
 
            my $term_docs = pack_term_docs_append_vint($o_term_docs, $term_docs_vint);
 
-           $i_sth->execute(
-               $word,
-               $self->{DOCFREQ_T}->[$fno]->{$word} + $o_docfreq_t,
-               $term_docs,
-           ) or warn $self->{INDEX_DBH}->err;
+               # $i_sth->execute(
+               #     $word,
+               # $self->{DOCFREQ_T}->[$fno]->{$word} + $o_docfreq_t,
+               # $term_docs,
+               # ) or warn $self->{INDEX_DBH}->err;
+               $i_sth->bind_param( 1, $word );
+               $i_sth->bind_param( 2, $word );
+               $i_sth->bind_param( 3, $self->{DOCFREQ_T}->[$fno]->{$word} + 
$o_docfreq_t );
+               $i_sth->bind_param( 4, $term_docs, { pg_type => DBD::Pg::PG_BYTEA } );
+               $i_sth->execute() or warn $self->{INDEX_DBH}->err;
 
            delete($self->{TERM_DOCS_VINT}->[$fno]->{$word});
            $wc++;

[PATCH] postgres support for DBIx::TextIndex

Reply via email to