Hello,
I wrote a Pg.pm module for DBIx::TextIndex, which previously only
supported mysql. I've done testing with around ~150k documents and
everything seems to be working well. In order to get this working, I
had to change TextIndex.pm in a few ways:
- Pass primary key twice to $sth->execute for updates. This is because
I had to break down mysql's nonstandard "replace into" to
"delete from ...; insert into", so I needed an extra placeholder for
the delete statement.
**NOTE** This breaks the mysql (default) DB plugin.
- Use $sth->bind_param to do binding. This is because postgres' bytea
type is not quoted properly by default. This should not break mysql.
- Deprecate use of Text::Unaccent. It looks like Text::Unaccent doesn't
work correctly with Perl 5.8.2, probably because it uses some internal
encoding for strings now. It doesn't pass `make test` either.
If someone who has mysql wants to look at this patch, I think mysql.pm
can be slightly changed to accommodate it. Please CC me on replies, as I
am not subscribed to this list.
Jim Blomo
diff -urN DBIx.orig/TextIndex/Pg.pm DBIx/TextIndex/Pg.pm
--- DBIx.orig/TextIndex/Pg.pm 1969-12-31 16:00:00.000000000 -0800
+++ DBIx/TextIndex/Pg.pm 2004-03-17 12:47:19.000000000 -0800
@@ -0,0 +1,515 @@
+#!/usr/bin/perl
+
+# PostgreSQL module for DBIx::TextIndex
+
+use strict;
+
+sub db_add_mask {
+ my $self = shift;
+ return <<END;
+DELETE FROM $self->{MASK_TABLE} where mask = ?;
+INSERT into $self->{MASK_TABLE} (mask, docs_vector) values (?, ?)
+END
+
+}
+
+sub db_delete_mask {
+ my $self = shift;
+ return <<END;
+delete from $self->{MASK_TABLE}
+where mask = ?
+END
+
+}
+
+sub db_drop_table {
+ my $self = shift;
+ my $table = shift;
+
+ if( $self->{INDEX_DBH}->selectrow_array("SELECT tablename FROM pg_tables WHERE
tablename = '$table'") ) {
+ $self->{INDEX_DBH}->do("DROP TABLE $table");
+ }
+}
+
+sub db_table_exists {
+ my $self = shift;
+ my $table = shift;
+
+ return 1 if $self->{INDEX_DBH}->selectrow_array("SELECT tablename FROM
pg_tables WHERE tablename = '$table'");
+ return 0;
+}
+
+sub db_create_collection_table {
+ my $self = shift;
+ return <<END;
+CREATE TABLE collection (
+ collection varchar(30) PRIMARY KEY default '',
+ version numeric(10,2) NOT NULL default 0.00,
+ max_indexed_id int NOT NULL default 0,
+ doc_table varchar(30) NOT NULL default '',
+ doc_id_field varchar(30) NOT NULL default '',
+ doc_fields varchar(250) NOT NULL default '',
+ charset varchar(50) NOT NULL default '',
+ stoplist varchar(255) NOT NULL default '',
+ proximity_index varchar(1) NOT NULL default '0',
+ error_empty_query varchar(255) NOT NULL default '',
+ error_quote_count varchar(255) NOT NULL default '',
+ error_no_results varchar(255) NOT NULL default '',
+ error_no_results_stop varchar(255) NOT NULL default '',
+ max_word_length int NOT NULL default 0,
+ result_threshold int NOT NULL default 0,
+ phrase_threshold int NOT NULL default 0,
+ min_wildcard_length int NOT NULL default 0,
+ decode_html_entities varchar(1) NOT NULL default '0',
+ scoring_method varchar(20) NOT NULL default '',
+ update_commit_interval int NOT NULL default 0
+)
+END
+
+}
+
+sub db_insert_collection_table_row {
+ my $self = shift;
+ my $row = shift;
+ my @fields;
+ my @values;
+ while (my ($field, $value) = each %$row) {
+ push @fields, $field;
+ push @values, $value;
+ }
+ my $collection_fields = join ', ', @fields;
+ my $place_holders = join ', ', (('?') x ($#fields + 1));
+ my $sql = <<END;
+insert into $self->{COLLECTION_TABLE}
+($collection_fields)
+values ($place_holders)
+END
+ $self->{INDEX_DBH}->do($sql, undef, @values);
+
+}
+
+sub db_fetch_max_indexed_id {
+ my $self = shift;
+
+ return <<END;
+SELECT max_indexed_id
+FROM $self->{COLLECTION_TABLE}
+WHERE collection = ?
+END
+
+}
+
+sub db_fetch_collection_version {
+ my $self = shift;
+
+ return <<END;
+select max(version) from $self->{COLLECTION_TABLE}
+END
+
+}
+
+sub db_collection_count {
+ my $self = shift;
+
+ return <<END;
+select count(*) from $self->{COLLECTION_TABLE}
+END
+
+}
+
+sub db_update_collection_info {
+ my $self = shift;
+ my $field = shift;
+
+ return <<END;
+update $self->{COLLECTION_TABLE}
+set $field = ?
+where collection = ?
+END
+
+}
+
+sub db_delete_collection_info {
+ my $self = shift;
+
+ return <<END;
+delete from $self->{COLLECTION_TABLE}
+where collection = ?
+END
+
+}
+
+sub db_store_collection_info {
+ my $self = shift;
+
+ my @collection_fields = @{$self->{COLLECTION_FIELDS}};
+ my $collection_fields = join ', ', @collection_fields;
+ my $place_holders = join ', ', (('?') x ($#collection_fields + 1));
+ return <<END;
+insert into $self->{COLLECTION_TABLE}
+($collection_fields)
+values
+($place_holders)
+END
+
+}
+
+sub db_fetch_collection_info {
+ my $self = shift;
+
+ my $collection_fields = join ', ', @{$self->{COLLECTION_FIELDS}};
+
+ return <<END;
+select
+$collection_fields
+from $self->{COLLECTION_TABLE}
+where collection = ?
+END
+
+}
+
+sub db_fetch_all_collection_rows {
+ my $self = shift;
+
+ return <<END;
+select * from $self->{COLLECTION_TABLE}
+END
+
+}
+
+sub db_phrase_scan_cz {
+ my $self = shift;
+ my $result_docs = shift;
+ my $fno = shift;
+
+ return <<END;
+select $self->{DOC_ID_FIELD}, $self->{DOC_FIELDS}->[$fno]
+from $self->{DOC_TABLE}
+where $self->{DOC_ID_FIELD} in ($result_docs)
+END
+
+}
+
+sub db_phrase_scan {
+ my $self = shift;
+ my $result_docs = shift;
+ my $fno = shift;
+
+ return <<END;
+select $self->{DOC_ID_FIELD}
+from $self->{DOC_TABLE}
+where $self->{DOC_ID_FIELD} IN ($result_docs)
+ and $self->{DOC_FIELDS}->[$fno] like ?
+END
+
+}
+
+sub db_fetch_maxtf {
+ my $self = shift;
+ my $fields = shift;
+
+ return <<END;
+select field_no, maxtf
+from $self->{MAXTF_TABLE}
+where field_no in ($fields)
+END
+
+}
+
+sub db_fetch_docweights {
+ my $self = shift;
+ my $fields = shift;
+
+ return <<END;
+select field_no, avg_docweight, docweights
+from $self->{DOCWEIGHTS_TABLE}
+where field_no in ($fields)
+END
+
+}
+
+
+sub db_fetch_all_docs_vector {
+ my $self = shift;
+ return <<END;
+SELECT all_docs_vector
+from $self->{ALL_DOCS_VECTOR_TABLE}
+END
+
+}
+
+sub db_update_all_docs_vector {
+ my $self = shift;
+ return <<END;
+DELETE FROM $self->{ALL_DOCS_VECTOR_TABLE} WHERE id = 1;
+INSERT INTO $self->{ALL_DOCS_VECTOR_TABLE}
+(id, all_docs_vector)
+VALUES (1, ?)
+END
+}
+
+sub db_docfreq_t {
+ my $self = shift;
+ my $table = shift;
+
+ return <<END;
+select docfreq_t from $table
+where word = ?
+END
+
+}
+
+sub db_fetch_mask {
+ my $self = shift;
+
+ return <<END;
+select docs_vector
+from $self->{MASK_TABLE}
+where mask = ?
+END
+
+}
+
+sub db_fetch_term_docs {
+ my $self = shift;
+ my $table = shift;
+
+ return <<END;
+select term_docs
+from $table
+where word = ?
+END
+
+}
+
+sub db_fetch_term_freq_and_docs {
+ my $self = shift;
+ my $table = shift;
+ return <<END;
+select docfreq_t, term_docs
+from $table
+where word = ?
+END
+
+}
+
+sub db_fetch_words {
+ my $self = shift;
+ my $table = shift;
+
+ return <<END;
+select word
+from $table
+where word like ?
+END
+
+}
+
+sub db_ping_doc {
+ my $self = shift;
+
+ return <<END;
+select 1
+from $self->{DOC_TABLE}
+where $self->{DOC_ID_FIELD} = ?
+END
+
+}
+
+sub db_fetch_doc {
+ my $self = shift;
+ my $field = shift;
+
+ return <<END;
+select $field
+from $self->{DOC_TABLE}
+where $self->{DOC_ID_FIELD} = ?
+END
+
+}
+
+sub db_update_maxtf {
+ my $self = shift;
+
+ return <<END;
+DELETE FROM $self->{MAXTF_TABLE} WHERE field_no = ?;
+INSERT into $self->{MAXTF_TABLE} (field_no, maxtf) values (?, ?)
+END
+
+}
+
+sub db_update_docweights {
+ my $self = shift;
+
+ return <<END;
+DELETE FROM $self->{DOCWEIGHTS_TABLE} WHERE field_no = ?;
+INSERT into $self->{DOCWEIGHTS_TABLE} (field_no, avg_docweight, docweights) values
(?, ?, ?)
+END
+
+}
+
+sub db_inverted_replace {
+ my $self = shift;
+ my $table = shift;
+
+ return <<END;
+DELETE FROM $table WHERE word = ?;
+INSERT into $table
+(word, docfreq_t, term_docs)
+values (?, ?, ?)
+END
+
+}
+
+sub db_inverted_remove {
+ my $self = shift;
+ my $table = shift;
+
+ return <<END;
+delete from $table
+where word = ?
+END
+
+}
+
+sub db_inverted_select {
+ my $self = shift;
+ my $table = shift;
+
+ return <<END;
+select docfreq_t, term_docs
+from $table
+where word = ?
+END
+
+}
+
+sub db_create_mask_table {
+ my $self = shift;
+
+ return <<END;
+create table $self->{MASK_TABLE} (
+ mask varchar(100) primary key,
+ docs_vector text not null
+)
+END
+
+}
+
+sub db_create_docweights_table {
+ my $self = shift;
+ return <<END;
+create table $self->{DOCWEIGHTS_TABLE} (
+ field_no integer primary key,
+ avg_docweight float not null,
+ docweights bytea not null
+)
+END
+}
+
+sub db_create_maxterm_table {
+ my $self = shift;
+
+ return <<END;
+create table $self->{MAXTF_TABLE} (
+ field_no integer primary key,
+ maxtf bytea not null
+)
+END
+
+}
+
+sub db_create_all_docs_vector_table {
+ my $self = shift;
+
+ return <<END;
+CREATE TABLE $self->{ALL_DOCS_VECTOR_TABLE} (
+ id INT PRIMARY KEY,
+ all_docs_vector text NOT NULL
+)
+END
+}
+
+sub db_create_inverted_table {
+ my $self = shift;
+ my $table = shift;
+ my $max_word = $self->{MAX_WORD_LENGTH};
+
+ return <<END;
+create table $table (
+ word varchar($max_word) primary key,
+ docfreq_t int not null,
+ term_docs bytea not null
+)
+END
+
+}
+
+sub db_pindex_search {
+ my $self = shift;
+ my $fno = shift;
+ my $words = shift;
+ my $docs = shift;
+
+ return <<END;
+select word, doc, pos
+from $self->{PINDEX_TABLES}->[$fno]
+where doc in ($docs) and word in ($words)
+order by doc
+END
+
+}
+
+sub db_pindex_create {
+ my $self = shift;
+ my $table = shift;
+ my $max_word = $self->{MAX_WORD_LENGTH};
+
+ return <<END;
+create table $table (
+ word varchar($max_word) not null,
+ doc integer not null,
+ pos integer not null,
+ primary key (doc, word)
+)
+END
+
+}
+
+sub db_pindex_add {
+ my $self = shift;
+ my $table = shift;
+
+ return <<END;
+insert into $table (word, doc, pos)
+values (?, ?, ?)
+END
+
+}
+
+sub db_pindex_remove {
+ my $self = shift;
+ my $table = shift;
+ my $docs = shift;
+
+ return <<END;
+delete from $table
+where doc in ($docs)
+END
+
+}
+
+
+sub db_total_words {
+ my $self = shift;
+ my $table = shift;
+
+ return <<END;
+select SUM(docfreq_t)
+from $table
+END
+
+}
+
+1;
+
diff -urN DBIx.orig/TextIndex.pm DBIx/TextIndex.pm
--- DBIx.orig/TextIndex.pm 2003-10-01 13:30:51.000000000 -0700
+++ DBIx/TextIndex.pm 2004-03-17 16:39:29.000000000 -0800
@@ -13,7 +13,9 @@
use DBIx::TextIndex::QueryParser;
use DBIx::TextIndex::TermDocsCache;
use HTML::Entities ();
-use Text::Unaccent qw(unac_string);
+#use Text::Unaccent qw(unac_string);
+
+use DBD::Pg;
my $GEN = 'DBIx::TextIndex::Exception::General';
my $DA = 'DBIx::TextIndex::Exception::DataAccess';
@@ -239,7 +241,7 @@
$vector->Index_List_Store(@$ids);
print "Adding mask ($mask) to table $self->{MASK_TABLE}\n" if $PA > 1;
- $self->{INDEX_DBH}->do($self->db_add_mask, undef, $mask, $vector->to_Enum);
+ $self->{INDEX_DBH}->do($self->db_add_mask, undef, $mask, $mask, $vector->to_Enum);
return 1;
}
@@ -429,7 +431,11 @@
$maxtf[$doc_id] = 0;
}
my $packed_maxtf = pack 'w' x ($#maxtf + 1), @maxtf;
- $sth->execute($fno, $packed_maxtf);
+ #$sth->execute($fno, $packed_maxtf);
+ $sth->bind_param( 1, $fno );
+ $sth->bind_param( 2, $fno );
+ $sth->bind_param( 3, $packed_maxtf, { pg_type => DBD::Pg::PG_BYTEA } );
+ $sth->execute();
}
}
@@ -450,7 +456,12 @@
}
my $packed_w_d = pack 'f*', @w_d;
# FIXME: we should update the average, leave it alone for now
- $sth->execute($fno, $self->{AVG_W_D}->[$fno], $packed_w_d);
+ #$sth->execute($fno, $self->{AVG_W_D}->[$fno], $packed_w_d);
+ $sth->bind_param( 1, $fno );
+ $sth->bind_param( 2, $fno );
+ $sth->bind_param( 3, $self->{AVG_W_D}->[$fno] );
+ $sth->bind_param( 4, $packed_w_d, { pg_type => DBD::Pg::PG_BYTEA } );
+ $sth->execute();
}
}
@@ -504,8 +515,14 @@
push @new_term_docs, ($term_docs->[$i], $term_docs->[$i + 1]);
}
- $sth_replace->execute($word, $docfreq_t,
- pack_term_docs([EMAIL PROTECTED]));
+ #$sth_replace->execute($word, $docfreq_t,
+ # pack_term_docs([EMAIL PROTECTED]));
+ $sth_replace->bind_param( 1, $word );
+ $sth_replace->bind_param( 2, $word );
+ $sth_replace->bind_param( 3, $docfreq_t );
+ $sth_replace->bind_param( 4, pack_term_docs( [EMAIL PROTECTED] ),
+ { pg_type => DBD::Pg::PG_BYTEA } );
+ $sth_replace->execute();
}
}
}
@@ -1706,7 +1723,7 @@
sub _lc_and_unac {
my $self = shift;
my $s = shift;
- $s = unac_string($self->{CHARSET}, $s);
+ #$s = unac_string($self->{CHARSET}, $s);
$s = lc($s);
return $s;
}
@@ -1749,7 +1766,11 @@
}
$maxtf[0] = 0 unless defined $maxtf[0];
my $packed_maxtf = pack 'w' x ($#maxtf + 1), @maxtf;
- $sth->execute($fno, $packed_maxtf);
+ #$sth->execute($fno, $packed_maxtf);
+ $sth->bind_param( 1, $fno );
+ $sth->bind_param( 2, $fno );
+ $sth->bind_param( 3, $packed_maxtf, { pg_type => DBD::Pg::PG_BYTEA } );
+ $sth->execute();
}
# Delete temporary in-memory structure
delete($self->{NEW_MAXTF});
@@ -1780,7 +1801,12 @@
$w_d[0] = 0 unless defined $w_d[0];
# FIXME: this takes too much space
my $packed_w_d = pack 'f*', @w_d;
- $sth->execute($fno, $avg_w_d, $packed_w_d);
+ #$sth->execute($fno, $avg_w_d, $packed_w_d);
+ $sth->bind_param( 1, $fno );
+ $sth->bind_param( 2, $fno );
+ $sth->bind_param( 3, $avg_w_d );
+ $sth->bind_param( 4, $packed_w_d, { pg_type => DBD::Pg::PG_BYTEA } );
+ $sth->execute();
}
# Delete temporary in-memory structure
delete($self->{NEW_W_D});
@@ -1812,11 +1838,16 @@
my $term_docs = pack_term_docs_append_vint($o_term_docs, $term_docs_vint);
- $i_sth->execute(
- $word,
- $self->{DOCFREQ_T}->[$fno]->{$word} + $o_docfreq_t,
- $term_docs,
- ) or warn $self->{INDEX_DBH}->err;
+ # $i_sth->execute(
+ # $word,
+ # $self->{DOCFREQ_T}->[$fno]->{$word} + $o_docfreq_t,
+ # $term_docs,
+ # ) or warn $self->{INDEX_DBH}->err;
+ $i_sth->bind_param( 1, $word );
+ $i_sth->bind_param( 2, $word );
+ $i_sth->bind_param( 3, $self->{DOCFREQ_T}->[$fno]->{$word} +
$o_docfreq_t );
+ $i_sth->bind_param( 4, $term_docs, { pg_type => DBD::Pg::PG_BYTEA } );
+ $i_sth->execute() or warn $self->{INDEX_DBH}->err;
delete($self->{TERM_DOCS_VINT}->[$fno]->{$word});
$wc++;