OK, here is what I have so far: a new class called
TokenNameFinderEvaluatorMultiple (what a mouthful!).
The class is attached.
Basically, it maintains 2 name finders (I only need 2 at the moment)
and merges their results just before they are sent to the evaluator. I
wasn't sure, however, whether I'm supposed to keep the duplicates when
merging the Span arrays. Also, does the order matter, or can I use a set
to remove duplicates if needed? That will mess up the order, though!
Jim
P.S. I'm not sure how to quickly test it without potentially
messing up my project. Any chance you could test it? I don't see any
reason why it shouldn't work (assuming duplicates are to be kept).
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.namefind;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.LinkedHashSet;
import java.util.Set;
import opennlp.tools.cmdline.PerformanceMonitor;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.Span;
import opennlp.tools.util.eval.Evaluator;
import opennlp.tools.util.eval.FMeasure;
/**
* The {@link TokenNameFinderEvaluator} measures the performance
* of the given {@link TokenNameFinder} with the provided
* reference {@link NameSample}s.
*
* @see Evaluator
* @see TokenNameFinder
* @see NameSample
*/
public class TokenNameFinderEvaluatorMultiple extends Evaluator<NameSample> {
private FMeasure fmeasure = new FMeasure();
/**
* The {@link TokenNameFinder} used to create the predicted
* {@link NameSample} objects.
*/
private TokenNameFinder nameFinder1; //e.g. maxent
private TokenNameFinder nameFinder2; //e.g. dictionary
/**
* Initializes the current instance with the given
* {@link TokenNameFinder}.
*
* @param nameFinder the {@link TokenNameFinder} to evaluate.
* @param listeners evaluation sample listeners
*/
public TokenNameFinderEvaluator(TokenNameFinder ... nameFinder, TokenNameFinderEvaluationMonitor ... listeners) {
super(listeners);
this.nameFinder1 = nameFinder[0];
this.nameFinder2 = nameFinder[1];//we can have more nameFinders
}
/**
* Evaluates the given reference {@link NameSample} object.
*
* This is done by finding the names with the
* {@link TokenNameFinder} in the sentence from the reference
* {@link NameSample}. The found names are then used to
* calculate and update the scores.
*
* @param reference the reference {@link NameSample}.
*
* @return the predicted {@link NameSample}.
*/
@Override
protected NameSample processSample(NameSample reference) {
if (reference.isClearAdaptiveDataSet()) {
nameFinder1.clearAdaptiveData();//clearAdaptiveData for all namefinders
nameFinder1.clearAdaptiveData();
}
Span[] predictedNames1 = nameFinder1.find(reference.getSentence()); // predict 1st
Span[] predictedNames2 = nameFinder2.find(reference.getSentence()); // predict 2nd
Span[] predictedTotal = mergeSpans(predictedNames1, predictedNames2); //merge predictions - not sure whether to keep or remove duplicates
Span[] references = reference.getNames();
//helper function for array concat
private Span[] mergeSpans(Span[] x, Span[] y){
Span[] temp= new Span[x.length + y.length];
System.arraycopy(x, 0, temp, 0, x.length);
System.arraycopy(B, 0, temp, x.length, y.length);
return temp;
}
// OPENNLP-396 When evaluating with a file in the old format
// the type of the span is null, but must be set to default to match
// the output of the name finder.
for (int i = 0; i < references.length; i++) {
if (references[i].getType() == null) {
references[i] = new Span(references[i].getStart(), references[i].getEnd(), "default");
}
}
fmeasure.updateScores(references, predictedTotal);//use predicted total here
return new NameSample(reference.getSentence(), predictedTotal, reference.isClearAdaptiveDataSet());//and here
}
public FMeasure getFMeasure() {
return fmeasure;
}
// all changes are above this line -----------------------------------------------------------------------------------
@Deprecated
public static void main(String[] args) throws IOException,
InvalidFormatException {
if (args.length == 4) {
System.out.println("Loading name finder model ...");
InputStream modelIn = new FileInputStream(args[3]);
TokenNameFinderModel model = new TokenNameFinderModel(modelIn);
TokenNameFinder nameFinder = new NameFinderME(model);
System.out.println("Performing evaluation ...");
TokenNameFinderEvaluator evaluator = new TokenNameFinderEvaluator(nameFinder);
final NameSampleDataStream sampleStream = new NameSampleDataStream(
new PlainTextByLineStream(new InputStreamReader(new FileInputStream(args[2]), args[1])));
final PerformanceMonitor monitor = new PerformanceMonitor("sent");
monitor.startAndPrintThroughput();
ObjectStream<NameSample> iterator = new ObjectStream<NameSample>() {
public NameSample read() throws IOException {
monitor.incrementCounter();
return sampleStream.read();
}
public void reset() throws IOException {
sampleStream.reset();
}
public void close() throws IOException {
sampleStream.close();
}
};
evaluator.evaluate(iterator);
monitor.stopAndPrintFinalResult();
System.out.println();
System.out.println("F-Measure: " + evaluator.getFMeasure().getFMeasure());
System.out.println("Recall: " + evaluator.getFMeasure().getRecallScore());
System.out.println("Precision: " + evaluator.getFMeasure().getPrecisionScore());
}
else {
// usage: -encoding code test.file model.file
}
}
}