What translation do you see when you run:
bin/nutch net.nutch.searcher.Query
If you're using RawFieldQueryFilter as your base, then you're probably seeing something like:
Translated: +(() () ()) +url:store/view^0.0
If so, then the problem is in part that RawFieldQueryFilter specifies a boost of 0.0, and also that RawFieldQueryFilter only works correctly when the field value is not tokenized, since it always constructs a Lucene TermQuery.
Probably we also need to fix BasicQueryFilter.java so that, when no clauses are present in the default field, nothing is added to the query. That would remove the "+(() () ())". I've added a patch which does this.
What you want to see is something more like:
Translated: +url:"store view"^2.0
To implement this you'll need some of the logic in BasicQueryFilter.java, to check whether the clause is a Term or a Phrase and translate it to either a Lucene TermQuery or a PhraseQuery. I've attached a file called QueryFilterBase.java that should implement this correctly. It compiles, but I have not yet tested it. Tell me if it works for you.
Thanks,
Doug
Luke Baker wrote:
Hey,
I tried setting up a query filter for the entire url. I used a fields attribute in the plugin.xml for this query filter. It worked mostly, except when I ran searches like:
url:store/view http
url:example.com http
When I queried for those, I would just get a completely blank page. The same is true when using a '.' or '/' in the query using the cc: query filter.
Any idea what is causing this?
Thanks,
Luke Baker
------------------------------------------------------- This SF.Net email is sponsored by BEA Weblogic Workshop FREE Java Enterprise J2EE developer tools! Get your free copy of BEA WebLogic Workshop 8.1 today. http://ads.osdn.com/?ad_id=5047&alloc_id=10808&op=click _______________________________________________ Nutch-developers mailing list [EMAIL PROTECTED] https://lists.sourceforge.net/lists/listinfo/nutch-developers
--- BasicQueryFilter.java.~1.1.~ 2004-07-09 13:27:42.000000000 -0700
+++ BasicQueryFilter.java 2004-09-03 13:38:24.000000000 -0700
@@ -44,6 +44,9 @@
public static void setSlop(int slop) { SLOP = slop; }
public BooleanQuery filter(Query input, BooleanQuery output) {
+ if (!hasDefaultFieldClauses(input))
+ return output; // nothing to do
+
BooleanQuery basic = new BooleanQuery();
addClauses(input, "url", basic, URL_BOOST, SLOP);
addClauses(input, "anchor", basic, ANCHOR_BOOST,
@@ -54,6 +57,20 @@
return output;
}
+ private boolean hasDefaultFieldClauses(Query input) {
+ boolean hasDefaultFieldClauses = false;
+ Clause[] clauses = input.getClauses();
+ for (int i = 0; i < clauses.length; i++) {
+ Clause c = clauses[i];
+ if (c.getField().equals(Clause.DEFAULT_FIELD)) {
+ hasDefaultFieldClauses = true;
+ break;
+ }
+ }
+ return hasDefaultFieldClauses;
+ }
+
+
/** Add all terms from a Nutch query to a Lucene query, searching the named
* field as a sloppy phrase and as individual terms.. */
private static void addClauses(Query input, String field,
/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */ /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.searcher;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.index.Term;
import net.nutch.analysis.CommonGrams;
import net.nutch.searcher.Query.Clause;
import net.nutch.searcher.Query.Phrase;
/** Translate query fields to search the same-named field, as indexed by an
* IndexingFilter. */
public abstract class QueryFilterBase implements QueryFilter {

  /** Name of the field this filter handles; used for both the Nutch clause
   * field and the Lucene field. */
  private final String field;

  /** Boost applied to every Lucene clause this filter generates. */
  private final float boost;

  /** Construct for the named field, with a boost of 1.0. */
  protected QueryFilterBase(String field) {
    this(field, 1.0f);
  }

  /** Construct for the named field, boosting as specified. */
  protected QueryFilterBase(String field, float boost) {
    this.field = field;
    this.boost = boost;
  }

  /** Adds a Lucene clause to <code>output</code> for each clause of
   * <code>input</code> whose field equals this filter's field.  Phrase clauses
   * are first passed through {@link CommonGrams#optimizePhrase} and then become
   * a Lucene PhraseQuery, or a TermQuery when the optimized phrase has a
   * single element; term clauses become a TermQuery.  Each generated clause
   * gets this filter's boost and the required/prohibited flags of the Nutch
   * clause it came from.
   *
   * @param input the Nutch query to examine
   * @param output the Lucene query to add clauses to
   * @return <code>output</code>, after any clauses have been added
   * @throws QueryException declared in the signature; this implementation
   * contains no explicit throw
   */
  public BooleanQuery filter(Query input, BooleanQuery output)
    throws QueryException {

    // examine each clause in the Nutch query
    Clause[] clauses = input.getClauses();
    for (int i = 0; i < clauses.length; i++) {
      Clause c = clauses[i];

      // skip clauses that are for other fields
      if (!c.getField().equals(field))
        continue;

      // optimize phrase clause; the result may collapse to a single term
      if (c.isPhrase()) {
        String[] opt = CommonGrams.optimizePhrase(c.getPhrase(), field);
        if (opt.length == 1) {
          c = new Clause(new Query.Term(opt[0]),
                         c.isRequired(), c.isProhibited());
        } else {
          c = new Clause(new Phrase(opt), c.isRequired(), c.isProhibited());
        }
      }

      // construct appropriate Lucene clause
      org.apache.lucene.search.Query luceneClause;
      if (c.isPhrase()) {
        Query.Term[] terms = c.getPhrase().getTerms();
        PhraseQuery lucenePhrase = new PhraseQuery();
        for (int j = 0; j < terms.length; j++) {
          lucenePhrase.add(new Term(field, terms[j].toString()));
        }
        luceneClause = lucenePhrase;
      } else {
        // Use the text of the term itself, as the phrase branch does, not the
        // string form of the whole clause.
        // NOTE(review): Clause.toString() is expected to include the field
        // prefix and prohibited marker, which would never match an indexed
        // term -- confirm against Query.Clause.
        luceneClause = new TermQuery(new Term(field, c.getTerm().toString()));
      }

      // set boost
      luceneClause.setBoost(boost);

      // add it as specified in query
      output.add(luceneClause, c.isRequired(), c.isProhibited());
    }

    // return the modified Lucene query
    return output;
  }
}
