[Lucene-dev] New QueryParser

Brian Goetz Wed, 13 Jun 2001 02:49:25 -0700
I think I've got the query parser overhauled.  It addresses all the
concerns that have been raised so far, plus a few others that people
haven't yet raised (like what happens if a field name is also a stop
word.)  

It accepts the same language as before (plus and minus, parens), plus
  AND, &&  -> both terms required
  OR, ||   -> default combination
  NOT, !   -> next term is prohibited

You can also put a boost factor after a term, written as
  ^n.n (digits are required both before and after the decimal point)

It also copes with the analyzer turning one term into several terms
(or into none at all), and it applies the analyzer only to the term text,
not to the entire query.  Examples:

  a AND NOT b
  +a -b
  a b^2.0
  field1:a field2:b
  a -(c || d || e)
  a "b c d"
  a +"b c d"

Here's the QueryParser.jj (total rewrite).  I've also included my 
JUnit test case for it afterwards, as an example of some of the
cases I've tested.  If people would please try it out, and get me 
some feedback before I check it in, that would be helpful.  


----- BEGIN QueryParser.jj

// QueryParser.jj
// Copyright (c) 1997-2001 Douglass R. Cutting.  
// Author: Brian Goetz

// Parser generator options.  STATIC is turned off because the QueryParser
// class below is instantiated (one parser object per caller) and keeps
// per-instance state: analyzer, field and phraseSlop.
options {
  STATIC= false;
}

PARSER_BEGIN(QueryParser)

package com.lucene.queryParser;

import java.util.Vector;
import java.io.*;
import com.lucene.index.Term;
import com.lucene.analysis.*;
import com.lucene.search.*;

/**
 * This class is generated by JavaCC.  The only method that clients should need
 * to call is <a href="#parse">parse()</a>.
 *
 * The syntax for query strings is as follows:
 * A Query is a series of clauses, optionally joined by a conjunction
 * (<code>AND</code> / <code>&amp;&amp;</code>, or <code>OR</code> /
 * <code>||</code>).
 * A clause may be prefixed by:
 * <ul>
 * <li> a plus (<code>+</code>) sign, indicating that the clause is required;
 * <li> a minus (<code>-</code>) sign, <code>NOT</code> or <code>!</code>,
 * indicating that the clause is prohibited; or
 * <li> a term followed by a colon, indicating the field to be searched.
 * This enables one to construct queries which search multiple fields.
 * </ul>
 *
 * A clause may be either a:
 * <ul>
 * <li> a term, a number or a double-quoted phrase, indicating all the
 * documents that contain it; or
 * <li> a nested query, enclosed in parentheses.  Note that this may be used
 * with a <code>+</code>/<code>-</code> prefix to require any of a set of
 * terms.
 * </ul>
 *
 * A term may be followed by a boost factor written as <code>^n.n</code>.
 *
 * Thus, in BNF (boost suffixes omitted), the query grammar is:
 * <pre>
 *   Query       ::= Modifiers Clause ( Conjunction Modifiers Clause )*
 *   Conjunction ::= [ "AND" | "&amp;&amp;" | "OR" | "||" ]
 *   Modifiers   ::= [ "+" | "-" | "NOT" | "!" ]
 *   Clause      ::= [ &lt;TERM&gt; ":" ] ( Term | "(" Query ")" )
 *   Term        ::= &lt;TERM&gt; | &lt;NUMBER&gt; | &lt;QUOTED&gt;
 * </pre>
 */

public class QueryParser {
  /** Parses a query string, returning a
   * <a href="lucene.search.Query.html">Query</a>.
   *  @param query      the query string to be parsed.
   *  @param field      the default field for query terms.
   *  @param analyzer   used to find terms in the query text.
   *  @throws ParseException if the query string does not match the grammar.
   */
  static public Query parse(String query, String field, Analyzer analyzer)
       throws ParseException {
    // Go through the (field, analyzer) constructor so the analyzer is really
    // installed on the parser; a parser built directly from a Reader has a
    // null analyzer and fails as soon as the first term is analyzed.
    QueryParser parser = new QueryParser(field, analyzer);
    return parser.parse(query);
  }

  /** Analyzer applied to the text of each term (never to the whole query). */
  Analyzer analyzer;
  /** Default field for terms that carry no explicit <code>field:</code>. */
  String field;
  /** Slop given to every PhraseQuery built by this parser. */
  int phraseSlop = 0;

  /** Constructs a query parser.
   *  @param f  the default field for query terms.
   *  @param a  used to find terms in the query text.
   */
  public QueryParser(String f, Analyzer a) {
    // The generated constructor needs a Reader; parse(String) re-initializes
    // the parser with the real query text later.
    this(new StringReader(""));
    analyzer = a;
    field = f;
  }

  /** Parses a query string, returning a
   * <a href="lucene.search.Query.html">Query</a>.
   *  @param query      the query string to be parsed.
   *  @throws ParseException if the query string does not match the grammar.
   */
  public Query parse(String query) throws ParseException {
    ReInit(new StringReader(query));
    return Query(field);
  }

  /** Sets the default slop for phrases.  If zero, then exact phrase matches
    are required.  Zero by default. */
  public void setPhraseSlop(int s) { phraseSlop = s; }
  /** Gets the default slop for phrases. */
  public int getPhraseSlop() { return phraseSlop; }

  /**
   * Appends <code>q</code> to <code>clauses</code> as a BooleanClause whose
   * required/prohibited flags are derived from the conjunction and modifier
   * that introduced it.
   *
   * @param clauses  the BooleanClause objects collected so far.
   * @param conj     one of the CONJ_ constants.
   * @param mods     one of the MOD_ constants.
   * @param q        the clause's query, or null if the analyzer removed
   *                 every token of the term.
   */
  private void addClause(Vector clauses, int conj, int mods,
                         Query q) {
    // If this term is introduced by AND, make the preceding term required,
    // unless it's already prohibited.  There may be no preceding clause at
    // all when every earlier term was filtered away by the analyzer, so
    // check before indexing into the vector.
    if (conj == CONJ_AND && clauses.size() > 0) {
      BooleanClause previous =
        (BooleanClause) clauses.elementAt(clauses.size() - 1);
      if (!previous.prohibited) {
        previous.required = true;
      }
    }

    // We might have been passed a null query; the term might have been
    // filtered away by the analyzer.
    if (q == null) {
      return;
    }

    // We set REQUIRED if we're introduced by AND or +; PROHIBITED if
    // introduced by NOT or -; make sure not to set both.
    boolean prohibited = (mods == MOD_NOT);
    boolean required = (mods == MOD_REQ);
    if (conj == CONJ_AND && !prohibited) {
      required = true;
    }
    clauses.addElement(new BooleanClause(q, required, prohibited));
  }

  /**
   * Runs <code>queryText</code> through the analyzer and builds a query from
   * the tokens it produces.
   *
   * @param field      the field the resulting terms belong to.
   * @param analyzer   the analyzer used to tokenize the text.
   * @param queryText  the raw text of a single term or quoted phrase.
   * @return null if the analyzer produced no tokens, a TermQuery for exactly
   *         one token, otherwise a PhraseQuery over all tokens using the
   *         current phrase slop.
   */
  private Query getFieldQuery(String field, Analyzer analyzer,
                              String queryText) {
    TokenStream source = analyzer.tokenStream(new StringReader(queryText));
    Vector v = new Vector();
    com.lucene.analysis.Token t;

    while (true) {
      try {
        t = source.next();
      }
      catch (IOException e) {
        // Deliberate best effort: an I/O failure is treated as the end of
        // the token stream and whatever was collected so far is used.
        t = null;
      }
      if (t == null) {
        break;
      }
      v.addElement(t.termText());
    }

    if (v.size() == 0) {
      return null;
    }
    if (v.size() == 1) {
      return new TermQuery(new Term(field, (String) v.elementAt(0)));
    }

    PhraseQuery q = new PhraseQuery();
    q.setSlop(phraseSlop);
    for (int i = 0; i < v.size(); i++) {
      q.add(new Term(field, (String) v.elementAt(i)));
    }
    return q;
  }

  /**
   * Command-line driver: parses the first argument against the default field
   * "field" with a SimpleAnalyzer and prints the resulting query.
   */
  public static void main(String[] args) throws Exception {
    if (args.length < 1) {
      System.err.println(
        "Usage: java com.lucene.queryParser.QueryParser <query>");
      return;
    }
    QueryParser qp = new QueryParser("field",
                                     new com.lucene.analysis.SimpleAnalyzer());
    Query q = qp.parse(args[0]);
    System.out.println(q.toString("field"));
  }

  // How a clause was joined to the one before it (see Conjunction()).
  private static final int CONJ_NONE   = 0;
  private static final int CONJ_AND    = 1;
  private static final int CONJ_OR     = 2;

  // Prefix found in front of a clause (see Modifiers()).
  private static final int MOD_NONE    = 0;
  private static final int MOD_NOT     = 10;
  private static final int MOD_REQ     = 11;
}

PARSER_END(QueryParser)

/* ***************** */
/* Token Definitions */
/* ***************** */

// Helper regular expressions, declared for every lexical state (<*>).
// The leading '#' marks them as private building blocks for other token
// definitions rather than tokens of their own.
// Of these, the grammar shown here references _NUM_CHAR (in NUMBER),
// _IDENTIFIER_CHAR (in TERM), _WHITESPACE (in SKIP) and _NEWLINE (in _QCHAR);
// the remaining helpers are not referenced anywhere in this file.
<*> TOKEN : {
  <#_ALPHA_CHAR: ["a"-"z", "A"-"Z"] >
| <#_NUM_CHAR:   ["0"-"9"] >
| <#_ALPHANUM_CHAR: [ "a"-"z", "A"-"Z", "0"-"9" ] >
| <#_IDENTIFIER_CHAR: [ "a"-"z", "A"-"Z", "0"-"9", "_" ] >
| <#_IDENTIFIER: <_ALPHA_CHAR> (<_IDENTIFIER_CHAR>)* >
| <#_NEWLINE:    ( "\r\n" | "\r" | "\n" ) >
| <#_WHITESPACE: ( " " | "\t" ) >
// A backslash followed by a newline or by any character that is not an
// ASCII letter or digit.
| <#_QCHAR:      ( "\\" (<_NEWLINE> | ~["a"-"z", "A"-"Z", "0"-"9"] ) ) >
| <#_RESTOFLINE: (~["\r", "\n"])* >
}

// Tokens of the query language proper.
// NOTE(review): the order of these definitions looks significant. Text such
// as "2.0" matches both NUMBER and TERM with the same length, and "AND"
// matches both AND and TERM; this presumably relies on JavaCC preferring the
// longest match and, on a tie, the definition that appears first. Confirm
// before reordering.
<DEFAULT> TOKEN : {
  <AND:       ("AND" | "&&") >
| <OR:        ("OR" | "||") >
| <NOT:       ("NOT" | "!") >
| <PLUS:      "+" >
| <MINUS:     "-" >
| <LPAREN:    "(" >
| <RPAREN:    ")" >
| <COLON:     ":" >
| <CARAT:     "^" >
// A double quote, one or more characters that are not a double quote, and a
// closing double quote.  There is no way to escape a quote inside a phrase,
// and an empty phrase ("") does not match.
| <QUOTED:     "\"" (~["\""])+ "\"">
// Digits are required on both sides of the decimal point.
| <NUMBER:    (<_NUM_CHAR>)+ "." (<_NUM_CHAR>)+ >
// Starts with a letter, digit or underscore and runs until a double quote,
// space, tab, parenthesis, colon, '&', '|' or '^'.  Characters such as
// '+', '-' and '!' are not in the excluded set, so they stay part of the
// term when they appear after its first character.
| <TERM:      <_IDENTIFIER_CHAR> 
              ( ~["\"", " ", "\t", "(", ")", ":", "&", "|", "^" ] )* >
}

// Spaces and tabs between tokens are discarded.  _WHITESPACE does not
// include newline characters, so those are not skipped here.
<DEFAULT> SKIP : {
  <<_WHITESPACE>>
}

// *   Query  ::= ( Clause )*
// *   Clause ::= ["+", "-"] [<TERM> ":"] ( <TERM> | "(" Query ")" )

// Reads an optional conjunction and reports which one was found:
// CONJ_AND for <AND>, CONJ_OR for <OR>, CONJ_NONE when neither is present.
int Conjunction() : {
  int kind = CONJ_NONE;
}
{
  (
    <AND> { kind = CONJ_AND; }
  |
    <OR>  { kind = CONJ_OR; }
  )?
  { return kind; }
}

// Reads an optional clause prefix and reports which one was found:
// MOD_REQ for <PLUS>, MOD_NOT for <MINUS> or <NOT>, MOD_NONE when the
// clause has no prefix.
int Modifiers() : {
  int kind = MOD_NONE;
}
{
  (
    <PLUS>  { kind = MOD_REQ; }
  |
    <MINUS> { kind = MOD_NOT; }
  |
    <NOT>   { kind = MOD_NOT; }
  )?
  { return kind; }
}

// Parses one or more clauses and combines them into a single BooleanQuery.
// The first clause has no conjunction in front of it; every later clause may
// be introduced by one.  The result is always a BooleanQuery, even when only
// one clause (or none, after analysis) was collected.
Query Query(String field) :
{
  Vector collected = new Vector();
  Query clause;
  int conj, mods;
}
{
  // Leading clause: modifiers only.
  mods=Modifiers() clause=Clause(field)
    { addClause(collected, CONJ_NONE, mods, clause); }

  // Any number of further clauses, each optionally joined by AND / OR.
  (
    conj=Conjunction() mods=Modifiers() clause=Clause(field)
      { addClause(collected, conj, mods, clause); }
  )*

  {
    BooleanQuery result = new BooleanQuery();
    int idx = 0;
    while (idx < collected.size()) {
      result.add((BooleanClause) collected.elementAt(idx));
      idx++;
    }
    return result;
  }
}

// Parses a single clause: an optional "name:" field prefix followed by
// either a term or a parenthesized sub-query.  When a prefix is present its
// raw token text replaces the field for everything inside this clause.
Query Clause(String field) : {
  Token fieldName = null;
  Query result;
}
{
  (
    // Two tokens of lookahead tell "name:" apart from a plain term.
    LOOKAHEAD(2)
    fieldName=<TERM> <COLON> { field = fieldName.image; }
  )?

  (
    result=Term(field)
  |
    <LPAREN> result=Query(field) <RPAREN>
  )

  { return result; }
}
    

// Parses a bare term, a number or a double-quoted phrase, each optionally
// followed by a boost ("^" <NUMBER>), and turns it into a query through
// getFieldQuery().  The result is null when the analyzer removes every token
// of the text.
//
// The boost suffix is accepted after quoted phrases as well as after bare
// terms; the action below already knew how to boost a PhraseQuery.
Query Term(String field) : {
  Token term, boost=null;
  Query q;
}
{
  (
    (term=<TERM>|term=<NUMBER>)
      { q = getFieldQuery(field, analyzer, term.image); }
    | term=<QUOTED>
      // Strip the surrounding double quotes before analysis.
      { q = getFieldQuery(field, analyzer,
                          term.image.substring(1, term.image.length()-1)); }
  )
  [ <CARAT> boost=<NUMBER> ]
  {
    if (boost != null) {
      float f = 1.0f;
      try {
        f = Float.parseFloat(boost.image);
      }
      catch (NumberFormatException ignored) {
        // <NUMBER> is digits '.' digits, so this is not expected to happen;
        // fall back to the neutral boost of 1.0 if it ever does.
      }

      // A null q (term filtered away by the analyzer) matches neither
      // branch, so the boost is simply dropped.
      if (q instanceof TermQuery)
        ((TermQuery) q).setBoost(f);
      else if (q instanceof PhraseQuery)
        ((PhraseQuery) q).setBoost(f);
    }
    return q;
  }
}



----- END 


----- BEGIN TestQueryParser.java

package com.lucene.queryParser;

import java.io.*;
import junit.framework.*;

import com.lucene.*;
import com.lucene.queryParser.*;
import com.lucene.search.*;
import com.lucene.analysis.*;
import com.lucene.analysis.Token;

/**
 * JUnit tests for QueryParser: each case parses a query string and compares
 * the printed form of the resulting query with an expected string.
 */
public class TestQueryParser extends TestCase {

  public TestQueryParser(String name) {
    super(name);
  }

  /** Analyzer that drops the word 'stop' and expands the word 'phrase'. */
  public static Analyzer qpAnalyzer = new QPTestAnalyzer();

  /**
   * Filter which discards the token 'stop' and which expands the
   * token 'phrase' into 'phrase1 phrase2'.
   */
  public static class QPTestFilter extends TokenFilter {

    public QPTestFilter(TokenStream in) {
      input = in;
    }

    /** True when 'phrase1' has been returned and 'phrase2' is still owed. */
    boolean inPhrase = false;
    /** Offsets of the original 'phrase' token, reused for both halves. */
    int savedStart=0, savedEnd=0;

    public Token next() throws IOException {
      if (inPhrase) {
        inPhrase = false;
        return new Token("phrase2", savedStart, savedEnd);
      }

      for (Token token = input.next(); token != null; token = input.next()) {
        if (token.termText().equals("phrase")) {
          inPhrase = true;
          savedStart = token.startOffset();
          savedEnd = token.endOffset();
          return new Token("phrase1", savedStart, savedEnd);
        }
        if (!token.termText().equals("stop")) {
          return token;
        }
        // 'stop' tokens fall through and are skipped.
      }
      return null;
    }
  }

  public static class QPTestAnalyzer extends Analyzer {

    public QPTestAnalyzer() {
    }

    /** Filters LowerCaseTokenizer with QPTestFilter. */
    public final TokenStream tokenStream(Reader reader) {
      return new QPTestFilter(new LowerCaseTokenizer(reader));
    }
  }

  /**
   * Does nothing; these tests need no fixture.
   */
  public void init () throws Exception
  {
  }

  /**
   * Parses <code>query</code> against the default field "field" and fails
   * the test unless the printed form of the result equals
   * <code>result</code>.
   *
   * @param query   the query string to parse.
   * @param a       the analyzer to use, or null for a SimpleAnalyzer.
   * @param result  the expected output of Query.toString("field").
   */
  public void assertQueryEquals(String query, Analyzer a, String result)
  throws Exception {
    if (a == null) {
      a = new SimpleAnalyzer();
    }
    QueryParser qp = new QueryParser("field", a);
    Query q = qp.parse(query);
    String s = q.toString("field");
    if (!s.equals(result)) {
      // Report through JUnit so the details travel with the failure; the
      // identifier 'assert' is also a reserved word from Java 1.4 onwards.
      fail("Query /" + query + "/ yielded /" + s
           + "/, expecting /" + result + "/");
    }
  }

  /** Cases run with the default SimpleAnalyzer. */
  public void testSimple() throws Exception {
    assertQueryEquals("term term term", null, "(term term term)");
    assertQueryEquals("term term1 term2", null, "(term term term)");
    assertQueryEquals("term 1.0 1 2", null, "(term)");

    assertQueryEquals("a AND b", null, "(+a +b)");
    assertQueryEquals("a AND NOT b", null, "(+a -b)");
    assertQueryEquals("a AND -b", null, "(+a -b)");
    assertQueryEquals("a AND !b", null, "(+a -b)");
    assertQueryEquals("a && b", null, "(+a +b)");
    assertQueryEquals("a&&b", null, "(+a +b)");
    assertQueryEquals("a && ! b", null, "(+a -b)");

    assertQueryEquals("a OR b", null, "(a b)");
    assertQueryEquals("a || b", null, "(a b)");
    assertQueryEquals("a OR !b", null, "(a -b)");
    assertQueryEquals("a OR ! b", null, "(a -b)");
    assertQueryEquals("a OR -b", null, "(a -b)");

    assertQueryEquals("+term -term term", null, "(+term -term term)");
    assertQueryEquals("foo:term AND field:anotherTerm", null,
                      "(+foo:term +anotherterm)");
    assertQueryEquals("term AND \"phrase phrase\"", null,
                      "(+term +\"phrase phrase\")");

    assertQueryEquals("germ term^2.0", null, "(germ term^2.0)");
    assertQueryEquals("term^2.0", null, "(term^2.0)");
  }

  /**
   * Cases run with qpAnalyzer, covering terms that the analyzer removes
   * ('stop') or turns into several tokens ('phrase').
   */
  public void testQPA() throws Exception {
    assertQueryEquals("term term term", qpAnalyzer, "(term term term)");
    assertQueryEquals("term +stop term", qpAnalyzer, "(term term)");
    assertQueryEquals("term -stop term", qpAnalyzer, "(term term)");
    assertQueryEquals("drop AND stop AND roll", qpAnalyzer, "(+drop +roll)");
    assertQueryEquals("term phrase term", qpAnalyzer,
                      "(term \"phrase1 phrase2\" term)");
    assertQueryEquals("term AND NOT phrase term", qpAnalyzer,
                      "(+term -\"phrase1 phrase2\" term)");
    assertQueryEquals("stop", qpAnalyzer, "()");
  }
}

----- END 

_______________________________________________
Lucene-dev mailing list
[EMAIL PROTECTED]
http://lists.sourceforge.net/lists/listinfo/lucene-dev
[Lucene-dev] New QueryParser

Reply via email to