Hi,
I have written my own weighted synonym filter and tried to integrate it
into an analyzer.
The field type, as defined in schema.xml, is:

<fieldType name="Company_Name" class="solr.TextField"
           positionIncrementGap="100">
  <analyzer type="index">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="DTSynonymFactory"
            FreskoFunction="SimilarityProbManual.txt"
            ignoreCase="true" expand="false"/>
    <!--<filter class="solr.EnglishPorterFilterFactory"
            protected="protwords.txt"/>-->
    <!--<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>-->
  </analyzer>
  <analyzer type="query">
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true"
            words="stopwords.txt"/>
    <!--<filter class="solr.EnglishPorterFilterFactory"
            protected="protwords.txt"/>-->
    <!--<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>-->
  </analyzer>
</fieldType>
The problem is that, in DTSynonymFilter's Token next(Token reusableToken)
method, I always receive a token whose termBuffer contains 10 empty chars.

I debugged and stepped into the Solr code and found that in class
DocInverterPerField, at line 134:

    Token token = stream.next(localToken);

localToken contains a termBuffer with 10 empty chars ('').
What am I doing wrong?

The Java code:
import com.google.common.collect.**ArrayListMultimap;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import org.apache.lucene.analysis.**Token;
import org.apache.lucene.analysis.**TokenFilter;
import org.apache.lucene.analysis.**TokenStream;
import org.apache.lucene.analysis.**payloads.PayloadHelper;
import org.apache.lucene.index.**Payload;
/**
*
* @author david
*/
public class DTSynonymFilter extends TokenFilter {
public DTSynonymFilter(TokenStream input, ArrayListMultimap<String,
Synonym> syns) {
super(input);
this.synsMap = syns;
System.out.println("in DTSynonymFilter synsMap ");
}
public static final String SYNONYM = "<SYNONYM>";
TokenFilter tf;
private LinkedList<Token> synonymTokenQueue = new LinkedList<Token>();
private ArrayListMultimap<String, Synonym> synsMap = null;
private LinkedList<Token> buffer;
private Token nextTok(Token target) throws IOException {
if (buffer != null && !buffer.isEmpty()) {
return buffer.removeFirst();
} else {
return input.next(target);
}
}
private void pushTok(Token t) {
if (buffer == null) {
buffer = new LinkedList<Token>();
}
buffer.addFirst(t);
}
@Override
public Token next(Token reusableToken) throws IOException {
if (synonymTokenQueue.size() > 0) {
return synonymTokenQueue.removeFirst(* *);
}
if (reusableToken == null) {
return null;
}
reusableToken.setPayload(new Payload(new byte[]{(byte) 1}));
// System.out.println("trying to get synonyms for "+reusableToken);
// System.out.println(synsMap.* *get(reusableToken.term()));
List<Synonym> syns = synsMap.get(reusableToken.**term());
for (Synonym synonym : synsMap.get(reusableToken.**term())) {
System.out.println(synonym);
}
Payload boostPayload;
for (Synonym synonym : syns) {
//Token(char[] startTermBuffer, int termBufferOffset, int
termBufferLength, int start, int end)
// Token synToken = new Token(synonym.getToken().**toCharArray(),
reusableToken.startOffset(), reusableToken.endOffset(),
synonym.getToken().length(), 0);//, t.startOffset(), t.endOffset(),
SYNONYM);
Token newTok = new Token(reusableToken.**startOffset(),
reusableToken.endOffset(), SYNONYM);
newTok.setTermBuffer(synonym.**getToken().toCharArray(), 0,
synonym.getToken().length());
// set the position increment to zero
// this tells lucene the synonym is
// in the exact same location as the originating word
newTok.setPositionIncrement(0)**;
boostPayload = new Payload(PayloadHelper.**
encodeFloat(synonym.getWieght(**)));
newTok.setPayload(**boostPayload);
synonymTokenQueue.add(newTok);
}
return reusableToken;
}
}
import DTSynonymFilter;

import com.google.common.collect.ArrayListMultimap;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.analysis.BaseTokenFilterFactory;
import org.apache.solr.analysis.TokenizerFactory;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware;
/**
*
* @author david
*/
public class DTSynonymFactory extends BaseTokenFilterFactory implements
ResourceLoaderAware {
boolean informed=false;
String synonyms=null;
public DTSynonymFactory(){
// this.syns= ArrayListMultimap.create();
}
final static Logger log = Logger.getLogger(**DTSynonymFactory.class.**
getName());
private static TokenizerFactory loadTokenizerFactory(* *ResourceLoader
loader, String cname, Map<String, String> args) {
TokenizerFactory tokFactory = (TokenizerFactory)
loader.newInstance(cname);
tokFactory.init(args);
return tokFactory;
}
private ArrayListMultimap<String, Synonym> syns = null;
public DTSynonymFilter create(TokenStream input) {
Thread.dumpStack();
try {
Thread.sleep(5000);
} catch (InterruptedException ex) {
Logger.getLogger(**DTSynonymFactory.class.**getName()).log(Level.SEVERE,
null, ex);
}
if(syns!=null){
System.out.println("in create() syns is "+syns+" syns size is
"+" " );
return new DTSynonymFilter(input,syns);
}
else{
System.out.println("in create() syns is "+syns+" and informed is
"+informed);
return new DTSynonymFilter(input,null);
}
}
@Override
public void inform(ResourceLoader loader) {
synonyms = args.get("FreskoFunction");
System.out.println("in DTSynonymFilter.inform() synonyms file is
"+synonyms);
boolean ignoreCase = getBoolean("ignoreCase", false);
System.out.println("in DTSynonymFilter.inform() ignoreCase is
"+ignoreCase);
boolean expand = getBoolean("expand", true);
System.out.println("in DTSynonymFilter.inform() expand is "+expand);
//String seperator =
String tf = args.get("tokenizerFactory");
TokenizerFactory tokFactory = null;
if (tf != null) {
tokFactory = loadTokenizerFactory(loader, tf, args);
}
if (tf != null) {
System.out.println("**TokenizerFactory loaded ");
}
if (synonyms != null) {
List<String> wlist = null;
try {
File synonymFile = new File(synonyms);
if (synonymFile.exists()) {
wlist = loader.getLines(synonyms);
} else {
List<String> files = StrUtils.splitFileNames(**
synonyms);
for (String file : files) {
wlist = loader.getLines(file.trim());
}
}
} catch (Exception e) {
e.printStackTrace();
throw new RuntimeException(e);
}
syns = ArrayListMultimap.create();
populateSynMap("\\|", wlist);
if(syns==null){
System.out.println("sysns after create and populate is
null!!!!!!");
Thread.sleep(5000);
}
else{
System.out.println("after crete the size of syns is
"+syns.size());
informed=true;
}
// synMap = new SynonymMap(ignoreCase);
// parseRules(wlist, synMap, "=>", ",", expand,tokFactory);
}
else{
throw new RuntimeException("Could not find synonyms");
}
}catch(Exception e){
e.printStackTrace();
throw new RuntimeException(e);
}
}
}
}
}
Thanks in advance.