On Fri, Sep 21, 2012 at 10:22:44AM +0100, Nicholas Clark said: > I'm not a search engineer (recovering or otherwise), so this represents > rather more work than I wanted to do.
I'll try and knock something together but really it's a fairly simple algorithm. Warning untested:

# Build an inverted index: token -> { doc => term frequency }.
# Assumes @corpus, @queries, %STOP_WORDS, and slurp() are defined elsewhere.
my %index;
foreach my $doc (@corpus) {
    my $text   = slurp($doc);
    my @tokens = tokenize($text);
    foreach my $token (@tokens) {
        $index{$token}->{$doc}++;
    }
}

my $D = scalar @corpus;    # total number of documents

foreach my $query (@queries) {
    my %results;
    my @tokens = tokenize($query);
    foreach my $token (@tokens) {
        # %index is a plain hash, not a hashref — was $index->{$token}
        my $docs = $index{$token};
        # skip tokens that appear in no document (avoids log($D/0) and
        # dereferencing undef)
        next unless $docs;
        # "size" is not a Perl builtin — scalar keys gives the doc count
        my $d = scalar keys %$docs;
        foreach my $doc (keys %$docs) {    # was "keys %docs" (undeclared hash)
            # http://en.wikipedia.org/wiki/Tf*idf
            my $tf  = $docs->{$doc};       # term frequency in this doc
            my $idf = log($D / $d);        # inverse document frequency
            $results{$doc} += $tf * $idf;
        }
    }
    my $count = 1;
    # "sort { } %results" would iterate keys AND values — need keys %results
    foreach my $doc (sort { $results{$b} <=> $results{$a} } keys %results) {
        # %results is a plain hash — was $results->{$doc}
        print "$count) $doc (score $results{$doc})\n";
        $count++;
    }
}

# Split text on whitespace, drop stop words, and stem what remains.
sub tokenize {
    my $text  = shift;
    my @words = split ' ', $text;
    return map { stem($_) } grep { !$STOP_WORDS{$_} } @words;
}

# world's most useless stemmer
# here for munging performance checking only
sub stem {
    my $word = shift;
    # original "s!(ing|s|ed|ly$);" was syntactically broken (unclosed
    # delimiter, stray $ inside the alternation) — strip one common
    # suffix from the end of the word
    $word =~ s/(?:ing|s|ed|ly)\z//;
    return $word;
}