Please always mention the issue number in the commit message. Thanks, Jörn
On Fri, 2016-03-11 at 17:37 +0000, [email protected] wrote: > Author: beylerian > Date: Fri Mar 11 17:37:07 2016 > New Revision: 1734600 > > URL: http://svn.apache.org/viewvc?rev=1734600&view=rev > Log: > added unit tests, corrected some mistakes, need more unit tests > > Added: > opennlp/sandbox/opennlp- > wsd/src/test/java/opennlp/tools/disambiguator/IMSMETester.java > - copied, changed from r1733577, opennlp/sandbox/opennlp- > wsd/src/test/java/opennlp/tools/disambiguator/IMSTester.java > opennlp/sandbox/opennlp- > wsd/src/test/java/opennlp/tools/disambiguator/OSCCMETester.java > - copied, changed from r1733577, opennlp/sandbox/opennlp- > wsd/src/test/java/opennlp/tools/disambiguator/OSCCTester.java > Removed: > opennlp/sandbox/opennlp- > wsd/src/test/java/opennlp/tools/disambiguator/IMSTester.java > opennlp/sandbox/opennlp- > wsd/src/test/java/opennlp/tools/disambiguator/OSCCTester.java > Modified: > opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java > opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java > opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/ims/DefaultIMSContextGe > nerator.java > opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSME.java > opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSParameters.java > opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFS.java > opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/oscc/DefaultOSCCContext > Generator.java > opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCContextGenerat > or.java > opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCME.java > opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCModel.java > opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCParameters.jav > a > opennlp/sandbox/opennlp- > wsd/src/test/java/opennlp/tools/disambiguator/LeskTester.java > opennlp/sandbox/opennlp- > wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java > opennlp/sandbox/opennlp- > wsd/src/test/java/opennlp/tools/disambiguator/Tester.java > > Modified: opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java > URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/mai > n/java/opennlp/tools/disambiguator/WSDParameters.java?rev=1734600&r1= > 1734599&r2=1734600&view=diff > ===================================================================== > ========= > --- opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java > (original) > +++ opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java Fri > Mar 11 17:37:07 2016 > @@ -27,6 +27,7 @@ public abstract class WSDParameters { > > protected boolean isCoarseSense; > public static boolean isStemCompare; > + protected boolean returnMultiple; > > public static enum SenseSource { > WORDNET, WSDHELPER, OTHER; > @@ -61,8 +62,17 @@ public abstract class WSDParameters { > this.senseSource = senseSource; > } > > + public boolean isReturnMultiple() { > + return returnMultiple; > + } > + > + public void setReturnMultiple(boolean returnMultiple) { > + this.returnMultiple = returnMultiple; > + } > + > public WSDParameters() { > this.isCoarseSense = false; > + this.returnMultiple = false; > } > > /** > > Modified: opennlp/sandbox/opennlp- > 
wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java > URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/mai > n/java/opennlp/tools/disambiguator/WSDisambiguator.java?rev=1734600&r > 1=1734599&r2=1734600&view=diff > ===================================================================== > ========= > --- opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java > (original) > +++ opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java > Fri Mar 11 17:37:07 2016 > @@ -150,7 +150,7 @@ public abstract class WSDisambiguator { > > if (WSDHelper.getNonRelevWordsDef(tokenTags[i]) != null) { > String s = IMSParameters.SenseSource.WSDHELPER.name() + " > " > - + tokenTags[i]; > + + WSDHelper.getNonRelevWordsDef(tokenTags[i]); > String[] sense = { s }; > > senses.add(sense); > > Modified: opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/ims/DefaultIMSContextGe > nerator.java > URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/mai > n/java/opennlp/tools/disambiguator/ims/DefaultIMSContextGenerator.jav > a?rev=1734600&r1=1734599&r2=1734600&view=diff > ===================================================================== > ========= > --- opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/ims/DefaultIMSContextGe > nerator.java (original) > +++ opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/ims/DefaultIMSContextGe > nerator.java Fri Mar 11 17:37:07 2016 > @@ -55,8 +55,9 @@ public class DefaultIMSContextGenerator > } > > public String[] extractSurroundingWords(int index, String[] toks, > - String[] lemmas) { > + String[] lemmas, int windowSize) { > > + // TODO consider the windowSize > ArrayList<String> contextWords = new ArrayList<String>(); > > for (int i = 0; i < toks.length; i++) { > @@ -123,7 +124,7 @@ public class DefaultIMSContextGenerator > > HashSet<String> surroundingWords = new HashSet<>(); > surroundingWords.addAll(Arrays.asList(extractSurroundingWords(in > dex, toks, > - lemmas))); > + lemmas, windowSize))); > > String[] localCollocations = extractLocalCollocations(index, > toks, ngram); > > > Modified: opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSME.java > URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/mai > n/java/opennlp/tools/disambiguator/ims/IMSME.java?rev=1734600&r1=1734 > 599&r2=1734600&view=diff > ===================================================================== > ========= > --- opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSME.java > (original) > +++ opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSME.java Fri Mar > 11 17:37:07 2016 > @@ -41,16 +41,17 @@ public class IMSME extends WSDisambiguat > > protected static IMSContextGenerator cg = new > DefaultIMSContextGenerator(); > > - public IMSME(IMSParameters params){ > + public IMSME(IMSParameters params) { > this.params = params; > } > - > + > public IMSME(IMSModel model, IMSParameters params) { > this.imsModel = model; > this.params = params; > - > -// Assert.assertEquals(model.getWindowSize(),params.getWindowSize > ()); > -// Assert.assertEquals(model.getNgram(),params.getNgram()); > + } > + > + public IMSModel getModel() { > + return imsModel; > } > > public void setModel(IMSModel model) { > @@ -65,7 +66,7 @@ public class IMSME extends WSDisambiguat > TrainingParameters mlParams, IMSParameters imsParams, > IMSFactory 
imsfactory) throws IOException { > > - ArrayList<String> surroundingWordModel = > buildSurroundingWords(samples); > + ArrayList<String> surroundingWordModel = > buildSurroundingWords(samples, imsParams.getWindowSize()); > > HashMap<String, String> manifestInfoEntries = new > HashMap<String, String>(); > > @@ -88,13 +89,13 @@ public class IMSME extends WSDisambiguat > > events.add(ev); > > - es = ObjectStreamUtils.createObjectStream(events); > - > } while ((sample = samples.read()) != null); > } > > - EventTrainer trainer = TrainerFactory.getEventTrainer( > - mlParams.getSettings(), manifestInfoEntries); > + es = ObjectStreamUtils.createObjectStream(events); > + > + EventTrainer trainer = TrainerFactory > + .getEventTrainer(mlParams.getSettings(), > manifestInfoEntries); > imsModel = trainer.train(es); > > return new IMSModel(lang, wordTag, imsParams.windowSize, > imsParams.ngram, > @@ -102,13 +103,13 @@ public class IMSME extends WSDisambiguat > } > > public static ArrayList<String> buildSurroundingWords( > - ObjectStream<WSDSample> samples) throws IOException { > + ObjectStream<WSDSample> samples, int windowSize) throws > IOException { > DefaultIMSContextGenerator imsCG = new > DefaultIMSContextGenerator(); > ArrayList<String> surroundingWordsModel = new > ArrayList<String>(); > WSDSample sample; > while ((sample = samples.read()) != null) { > - String[] words = imsCG.extractSurroundingWords( > - sample.getTargetPosition(), sample.getSentence(), > sample.getLemmas()); > + String[] words = > imsCG.extractSurroundingWords(sample.getTargetPosition(), > + sample.getSentence(), sample.getLemmas(), windowSize); > > if (words.length > 0) { > for (String word : words) { > @@ -125,10 +126,11 @@ public class IMSME extends WSDisambiguat > if (WSDHelper.isRelevantPOSTag(sample.getTargetTag())) { > String wordTag = sample.getTargetWordTag(); > > - String trainingFile = ((IMSParameters) this.getParams()) > - .getTrainingDataDirectory() + sample.getTargetWordTag(); > + if (imsModel == null > + || > !imsModel.getWordTag().equals(sample.getTargetWordTag())) { > > - if (imsModel==null || > !imsModel.getWordTag().equals(sample.getTargetWordTag())) { > + String trainingFile = ((IMSParameters) this.getParams()) > + .getTrainingDataDirectory() + sample.getTargetWordTag(); > > File file = new File(trainingFile + ".ims.model"); > if (file.exists() && !file.isDirectory()) { > @@ -167,11 +169,11 @@ public class IMSME extends WSDisambiguat > } > > } else { > - > MFS mfs = new MFS(); > return mfs.disambiguate(wordTag); > } > } else { > + > String outcome = ""; > > String[] context = cg.getContext(sample, > @@ -226,8 +228,8 @@ public class IMSME extends WSDisambiguat > */ > public String[] disambiguate(String[] tokenizedContext, String[] > tokenTags, > String[] lemmas, int index) { > - return disambiguate(new WSDSample(tokenizedContext, tokenTags, > lemmas, > - index)); > + return disambiguate( > + new WSDSample(tokenizedContext, tokenTags, lemmas, index)); > } > > } > > Modified: opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSParameters.java > URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/mai > n/java/opennlp/tools/disambiguator/ims/IMSParameters.java?rev=1734600 > &r1=1734599&r2=1734600&view=diff > ===================================================================== > ========= > --- opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSParameters.java > (original) > +++ opennlp/sandbox/opennlp- > 
wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSParameters.java > Fri Mar 11 17:37:07 2016 > @@ -20,6 +20,7 @@ > package opennlp.tools.disambiguator.ims; > > import java.io.File; > +import java.security.InvalidParameterException; > > import opennlp.tools.disambiguator.WSDParameters; > > @@ -34,6 +35,11 @@ public class IMSParameters extends WSDPa > protected int ngram; > > protected String trainingDataDirectory; > + > + protected static final int DFLT_WIN_SIZE = 3; > + protected static final int DFLT_NGRAM = 2; > + protected static final String DFLT_LANG_CODE = "En"; > + protected static final SenseSource DFLT_SOURCE = > SenseSource.WORDNET; > > /** > * This constructor takes only two parameters. The default > language used is > @@ -49,8 +55,9 @@ public class IMSParameters extends WSDPa > * the source of the training data > */ > public IMSParameters(int windowSize, int ngram, SenseSource > senseSource, > - String trainingDataDirectory) { > - this.languageCode = "En"; > + String trainingDataDirectory){ > + > + this.languageCode = DFLT_LANG_CODE; > this.windowSize = windowSize; > this.ngram = ngram; > this.senseSource = senseSource; > @@ -63,19 +70,7 @@ public class IMSParameters extends WSDPa > } > > public IMSParameters(String trainingDataDirectory) { > - this(3, 2, SenseSource.WORDNET, trainingDataDirectory); > - > - File folder = new File(trainingDataDirectory); > - if (!folder.exists()) > - folder.mkdirs(); > - } > - > - public IMSParameters() { > - this(3, 2, SenseSource.WORDNET, null); > - } > - > - public IMSParameters(int windowSize, int ngram) { > - this(windowSize, ngram, SenseSource.WORDNET, null); > + this(DFLT_WIN_SIZE, DFLT_NGRAM, DFLT_SOURCE, > trainingDataDirectory); > } > > public String getLanguageCode() { > @@ -109,7 +104,6 @@ public class IMSParameters extends WSDPa > * Creates the context generator of IMS > */ > public IMSContextGenerator createContextGenerator() { > - > return new DefaultIMSContextGenerator(); > } > > @@ -123,7 +117,7 @@ public class IMSParameters extends WSDPa > > @Override > public boolean isValid() { > - // TODO Auto-generated method stub > + // TODO recheck this pattern switch to maps > return true; > } > > > Modified: opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFS.java > URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/mai > n/java/opennlp/tools/disambiguator/mfs/MFS.java?rev=1734600&r1=173459 > 9&r2=1734600&view=diff > ===================================================================== > ========= > --- opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFS.java (original) > +++ opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFS.java Fri Mar 11 > 17:37:07 2016 > @@ -194,9 +194,12 @@ public class MFS extends WSDisambiguator > WordPOS wordPOS = new WordPOS(word, pos); > > ArrayList<Synset> synsets = wordPOS.getSynsets(); > - > - int size = synsets.size(); > - > + int size; > + if (this.parameters.isReturnMultiple()) { > + size = synsets.size(); > + } else { > + size = 1; > + } > String[] senses = new String[size]; > > for (int i = 0; i < size; i++) { > > Modified: opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/oscc/DefaultOSCCContext > Generator.java > URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/mai > n/java/opennlp/tools/disambiguator/oscc/DefaultOSCCContextGenerator.j > ava?rev=1734600&r1=1734599&r2=1734600&view=diff > ===================================================================== 
> ========= > --- opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/oscc/DefaultOSCCContext > Generator.java (original) > +++ opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/oscc/DefaultOSCCContext > Generator.java Fri Mar 11 17:37:07 2016 > @@ -39,6 +39,7 @@ public class DefaultOSCCContextGenerator > public String[] extractSurroundingContextClusters(int index, > String[] toks, > String[] tags, String[] lemmas, int windowSize) { > > + // TODO consider windowSize > ArrayList<String> contextClusters = new ArrayList<String>(); > > for (int i = 0; i < toks.length; i++) { > @@ -49,19 +50,19 @@ public class DefaultOSCCContextGenerator > > String lemma = lemmas[i].toLowerCase().replaceAll("[^a- > z_]", "") > .trim(); > - > - WordPOS word = new WordPOS(lemma, tags[i]); > > - // TODO check fix for "_" and null pointers > - if (lemma.length() > 1 && !lemma.contains("_")) { > - try{ > - ArrayList<Synset> synsets = word.getSynsets(); > - if (synsets!=null && synsets.size() > 0 ){ > - contextClusters.add(synsets.get(0).getOffset() + ""); > - } > - }catch(NullPointerException ex) > - { > - //TODO tagger mistake add proper exception > + WordPOS word = new WordPOS(lemma, tags[i]); > + > + if (lemma.length() > 1) { > + try { > + ArrayList<Synset> synsets = word.getSynsets(); > + if (synsets != null && synsets.size() > 0) { > + for (Synset syn : synsets){ > + contextClusters.add(syn.getOffset() + ""); > + } > + } > + } catch (NullPointerException ex) { > + // TODO tagger mistake add proper exception > } > } > > @@ -80,30 +81,32 @@ public class DefaultOSCCContextGenerator > */ > @Override > public String[] getContext(int index, String[] toks, String[] > tags, > - String[] lemmas, int windowSize) { > + String[] lemmas, int windowSize, ArrayList<String> model) { > > HashSet<String> surroundingContextClusters = new HashSet<>(); > - surroundingContextClusters.addAll(Arrays > - .asList(extractSurroundingContextClusters(index, toks, tags, > lemmas, > - windowSize))); > + surroundingContextClusters > + .addAll(Arrays.asList(extractSurroundingContextClusters(inde > x, toks, > + tags, lemmas, windowSize))); > > - String[] serializedFeatures = new > String[surroundingContextClusters.size()]; > + String[] serializedFeatures = new String[model.size()]; > > int i = 0; > - > - for (String feature : surroundingContextClusters) { > - serializedFeatures[i] = "F" + i + "=" + feature; > + for (String word : model) { > + if (surroundingContextClusters.contains(word.toString())) { > + serializedFeatures[i] = "F" + i + "=1"; > + } else { > + serializedFeatures[i] = "F" + i + "=0"; > + } > i++; > } > > return serializedFeatures; > - > } > > - public String[] getContext(WSDSample sample, int windowSize) { > + public String[] getContext(WSDSample sample, int windowSize, > ArrayList<String> model) { > > return getContext(sample.getTargetPosition(), > sample.getSentence(), > - sample.getTags(), sample.getLemmas(), windowSize); > + sample.getTags(), sample.getLemmas(), windowSize, model); > } > > } > > Modified: opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCContextGenerat > or.java > URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/mai > n/java/opennlp/tools/disambiguator/oscc/OSCCContextGenerator.java?rev > =1734600&r1=1734599&r2=1734600&view=diff > ===================================================================== > ========= > --- opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCContextGenerat > 
or.java (original) > +++ opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCContextGenerat > or.java Fri Mar 11 17:37:07 2016 > @@ -19,6 +19,8 @@ > > package opennlp.tools.disambiguator.oscc; > > +import java.util.ArrayList; > + > import opennlp.tools.disambiguator.WSDSample; > > /** > @@ -27,7 +29,7 @@ import opennlp.tools.disambiguator.WSDSa > public interface OSCCContextGenerator { > > String[] getContext(int index, String[] toks, String[] tags, > String[] lemmas, > - int windowSize); > + int windowSize, ArrayList<String> model); > > - String[] getContext(WSDSample sample, int windowSize); > + String[] getContext(WSDSample sample, int windowSize, > ArrayList<String> model); > } > > Modified: opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCME.java > URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/mai > n/java/opennlp/tools/disambiguator/oscc/OSCCME.java?rev=1734600&r1=17 > 34599&r2=1734600&view=diff > ===================================================================== > ========= > --- opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCME.java > (original) > +++ opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCME.java Fri > Mar 11 17:37:07 2016 > @@ -22,7 +22,6 @@ import java.io.IOException; > import java.util.ArrayList; > import java.util.HashMap; > > -import junit.framework.Assert; > import opennlp.tools.disambiguator.WSDHelper; > import opennlp.tools.disambiguator.WSDSample; > import opennlp.tools.disambiguator.WSDisambiguator; > @@ -53,11 +52,11 @@ import opennlp.tools.util.TrainingParame > * Please see {@link DefaultOSCCContextGenerator} > * > * The approach finds the context clusters surrounding the target > and uses a > - * classifier to judge on the best case. > + * classifier to judge on the best case. > * > * Here an ME classifier is used. 
> * > -*/ > + */ > public class OSCCME extends WSDisambiguator { > > protected OSCCModel osccModel; > @@ -69,10 +68,12 @@ public class OSCCME extends WSDisambigua > } > > public OSCCME(OSCCModel model, OSCCParameters params) { > - this.osccModel = osccModel; > + this.osccModel = model; > this.params = params; > + } > > - Assert.assertEquals(model.getWindowSize(), > params.getWindowSize()); > + public OSCCModel getModel() { > + return osccModel; > } > > public void setModel(OSCCModel model) { > @@ -85,7 +86,10 @@ public class OSCCME extends WSDisambigua > > public static OSCCModel train(String lang, ObjectStream<WSDSample> > samples, > TrainingParameters mlParams, OSCCParameters osccParams, > - OSCCFactory imsfactory) throws IOException { > + OSCCFactory osccFactory) throws IOException { > + > + ArrayList<String> surroundingClusterModel = > buildSurroundingClusters( > + samples, osccParams.getWindowSize()); > > HashMap<String, String> manifestInfoEntries = new > HashMap<String, String>(); > > @@ -99,39 +103,57 @@ public class OSCCME extends WSDisambigua > if (sample != null) { > wordTag = sample.getTargetWordTag(); > do { > - > String sense = sample.getSenseIDs().get(0); > - > - String[] context = cg.getContext(sample, > osccParams.windowSize); > + String[] context = cg.getContext(sample, > osccParams.windowSize, > + surroundingClusterModel); > Event ev = new Event(sense + "", context); > - > events.add(ev); > - > - es = ObjectStreamUtils.createObjectStream(events); > - > } while ((sample = samples.read()) != null); > } > > - EventTrainer trainer = TrainerFactory.getEventTrainer( > - mlParams.getSettings(), manifestInfoEntries); > + es = ObjectStreamUtils.createObjectStream(events); > + EventTrainer trainer = TrainerFactory > + .getEventTrainer(mlParams.getSettings(), > manifestInfoEntries); > + > osccModel = trainer.train(es); > > - return new OSCCModel(lang, wordTag, osccParams.windowSize, > osccModel, manifestInfoEntries, imsfactory); > + return new OSCCModel(lang, wordTag, osccParams.windowSize, > osccModel, > + surroundingClusterModel, manifestInfoEntries, osccFactory); > } > > + public static ArrayList<String> buildSurroundingClusters( > + ObjectStream<WSDSample> samples, int windowSize) throws > IOException { > + // TODO modify to clusters > + DefaultOSCCContextGenerator osccCG = new > DefaultOSCCContextGenerator(); > + ArrayList<String> surroundingWordsModel = new > ArrayList<String>(); > + WSDSample sample; > + while ((sample = samples.read()) != null) { > + String[] words = osccCG.extractSurroundingContextClusters( > + sample.getTargetPosition(), sample.getSentence(), > sample.getTags(), > + sample.getLemmas(), windowSize); > + > + if (words.length > 0) { > + for (String word : words) { > + surroundingWordsModel.add(word); > + } > + } > + } > + samples.reset(); > + return surroundingWordsModel; > + } > > @Override > public String[] disambiguate(WSDSample sample) { > if (WSDHelper.isRelevantPOSTag(sample.getTargetTag())) { > String wordTag = sample.getTargetWordTag(); > > - String trainingFile = ((OSCCParameters) this.getParams()) > - .getTrainingDataDirectory() + sample.getTargetWordTag(); > - > if (osccModel == null > || > !osccModel.getWordTag().equals(sample.getTargetWordTag())) { > > - File file = new File(trainingFile + ".ims.model"); > + String trainingFile = ((OSCCParameters) this.getParams()) > + .getTrainingDataDirectory() + sample.getTargetWordTag(); > + > + File file = new File(trainingFile + ".oscc.model"); > if (file.exists() && !file.isDirectory()) { > try { > 
setModel(new OSCCModel(file)); > @@ -147,7 +169,8 @@ public class OSCCME extends WSDisambigua > String outcome = ""; > > String[] context = cg.getContext(sample, > - ((OSCCParameters) this.params).windowSize); > + ((OSCCParameters) this.params).windowSize, > + osccModel.getContextClusters()); > > double[] outcomeProbs = > osccModel.getOSCCMaxentModel().eval(context); > outcome = > osccModel.getOSCCMaxentModel().getBestOutcome(outcomeProbs); > @@ -174,7 +197,8 @@ public class OSCCME extends WSDisambigua > String outcome = ""; > > String[] context = cg.getContext(sample, > - ((OSCCParameters) this.params).windowSize); > + ((OSCCParameters) this.params).windowSize, > + osccModel.getContextClusters()); > > double[] outcomeProbs = > osccModel.getOSCCMaxentModel().eval(context); > outcome = > osccModel.getOSCCMaxentModel().getBestOutcome(outcomeProbs); > @@ -223,8 +247,8 @@ public class OSCCME extends WSDisambigua > */ > public String[] disambiguate(String[] tokenizedContext, String[] > tokenTags, > String[] lemmas, int index) { > - return disambiguate(new WSDSample(tokenizedContext, tokenTags, > lemmas, > - index)); > + return disambiguate( > + new WSDSample(tokenizedContext, tokenTags, lemmas, index)); > } > > } > > Modified: opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCModel.java > URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/mai > n/java/opennlp/tools/disambiguator/oscc/OSCCModel.java?rev=1734600&r1 > =1734599&r2=1734600&view=diff > ===================================================================== > ========= > --- opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCModel.java > (original) > +++ opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCModel.java Fri > Mar 11 17:37:07 2016 > @@ -21,6 +21,7 @@ import java.io.File; > import java.io.IOException; > import java.io.InputStream; > import java.util.ArrayList; > +import java.util.Arrays; > import java.util.Map; > import java.util.Properties; > import java.net.URL; > @@ -43,13 +44,13 @@ public class OSCCModel extends BaseModel > private static final String WINSIZE = "winsize"; > private static final String CONTEXTCLUSTERS = "contextclusters"; > > - //private ArrayList<String> contextClusters = new > ArrayList<String>(); > + private ArrayList<String> contextClusters = new > ArrayList<String>(); > private String wordTag; > private int windowSize; > > - /*public ArrayList<String> getContextClusters() { > + public ArrayList<String> getContextClusters() { > return contextClusters; > - }*/ > + } > > public int getWindowSize() { > return windowSize; > @@ -59,9 +60,9 @@ public class OSCCModel extends BaseModel > this.windowSize = windowSize; > } > > - /* public void setContextClusters(ArrayList<String> > contextClusters) { > + public void setContextClusters(ArrayList<String> contextClusters) > { > this.contextClusters = contextClusters; > - }*/ > + } > > public String getWordTag() { > return wordTag; > @@ -72,7 +73,7 @@ public class OSCCModel extends BaseModel > } > > public OSCCModel(String languageCode, String wordTag, int > windowSize, > - MaxentModel osccModel, > + MaxentModel osccModel, ArrayList<String> contextClusters, > Map<String, String> manifestInfoEntries, OSCCFactory factory) > { > super(COMPONENT_NAME, languageCode, manifestInfoEntries, > factory); > > @@ -80,17 +81,17 @@ public class OSCCModel extends BaseModel > this.setManifestProperty(WORDTAG, wordTag); > this.setManifestProperty(WINSIZE, windowSize + ""); > > 
-// this.setManifestProperty(CONTEXTCLUSTERS, > -// StringUtils.join(contextClusters, ",")); > + this.setManifestProperty(CONTEXTCLUSTERS, > + StringUtils.join(contextClusters, ",")); > > - //this.contextClusters = contextClusters; > + this.contextClusters = contextClusters; > checkArtifactMap(); > } > > public OSCCModel(String languageCode, String wordTag, int > windowSize, > - int ngram, MaxentModel osccModel, > + int ngram, MaxentModel osccModel, ArrayList<String> > contextClusters, > OSCCFactory factory) { > - this(languageCode, wordTag, windowSize, osccModel, > + this(languageCode, wordTag, windowSize, osccModel, > contextClusters, > null, factory); > } > > @@ -135,10 +136,10 @@ public class OSCCModel extends BaseModel > > public void updateAttributes() { > Properties manifest = (Properties) > artifactMap.get(MANIFEST_ENTRY); > - //String contextClusters = (String) > manifest.get(CONTEXTCLUSTERS); > + String contextClusters = (String) manifest.get(CONTEXTCLUSTERS); > > - /* this.contextClusters = new ArrayList( > - Arrays.asList(contextClusters.split(",")));*/ > + this.contextClusters = new ArrayList( > + Arrays.asList(contextClusters.split(","))); > this.wordTag = (String) manifest.get(WORDTAG); > this.windowSize = Integer.parseInt((String) > manifest.get(WINSIZE)); > } > > Modified: opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCParameters.jav > a > URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/mai > n/java/opennlp/tools/disambiguator/oscc/OSCCParameters.java?rev=17346 > 00&r1=1734599&r2=1734600&view=diff > ===================================================================== > ========= > --- opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCParameters.jav > a (original) > +++ opennlp/sandbox/opennlp- > wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCParameters.jav > a Fri Mar 11 17:37:07 2016 > @@ -70,13 +70,11 @@ public class OSCCParameters extends WSDP > } > > public OSCCParameters() { > - // TODO change the "" into null ?? > - this(DFLT_WIN_SIZE, DFLT_SOURCE, ""); > + this(DFLT_WIN_SIZE, DFLT_SOURCE, null); > } > > public OSCCParameters(int windowSize) { > - // TODO change the "" into null ?? 
> - this(windowSize, DFLT_SOURCE, ""); > + this(windowSize, DFLT_SOURCE, null); > } > > public String getLanguageCode() { > > Copied: opennlp/sandbox/opennlp- > wsd/src/test/java/opennlp/tools/disambiguator/IMSMETester.java (from > r1733577, opennlp/sandbox/opennlp- > wsd/src/test/java/opennlp/tools/disambiguator/IMSTester.java) > URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/tes > t/java/opennlp/tools/disambiguator/IMSMETester.java?p2=opennlp/sandbo > x/opennlp- > wsd/src/test/java/opennlp/tools/disambiguator/IMSMETester.java&p1=ope > nnlp/sandbox/opennlp- > wsd/src/test/java/opennlp/tools/disambiguator/IMSTester.java&r1=17335 > 77&r2=1734600&rev=1734600&view=diff > ===================================================================== > ========= > --- opennlp/sandbox/opennlp- > wsd/src/test/java/opennlp/tools/disambiguator/IMSTester.java > (original) > +++ opennlp/sandbox/opennlp- > wsd/src/test/java/opennlp/tools/disambiguator/IMSMETester.java Fri > Mar 11 17:37:07 2016 > @@ -19,107 +19,173 @@ > > package opennlp.tools.disambiguator; > > +import static org.junit.Assert.*; > + > +import java.io.File; > +import java.io.IOException; > import java.util.ArrayList; > import java.util.List; > > +import org.junit.BeforeClass; > +import org.junit.Test; > + > +import opennlp.tools.disambiguator.datareader.SemcorReaderExtended; > +import opennlp.tools.disambiguator.ims.IMSFactory; > import opennlp.tools.disambiguator.ims.IMSME; > +import opennlp.tools.disambiguator.ims.IMSModel; > import opennlp.tools.disambiguator.ims.IMSParameters; > +import opennlp.tools.util.ObjectStream; > import opennlp.tools.util.Span; > +import opennlp.tools.util.TrainingParameters; > > /** > - * This is a typical example of how to call the disambiguation > function in the > - * IMS class. > - * <ul> > - * <li>In the 2 first examples, the training data exist, therefore > the IMS > - * approach is used.</li> > - * <li>In the 3rd example, the training data for the word to > disambiguate are > - * absent, therefore the Most Frequent Sents (MFS) is returend</li> > - * </ul> > + * This is the test class for {@link IMSME}. > + * > + * The scope of this test is to make sure that the IMS disambiguator > code can be > + * executed. This test can not detect mistakes which lead to > incorrect feature > + * generation or other mistakes which decrease the disambiguation > performance of the > + * disambiguator. > + * > + * In this test the {@link IMSME} is trained with Semcor and then > the computed > + * model is used to predict sentences from the training sentences. 
> */ > -public class IMSTester { > - > - public static void main(String[] args) { > - > - // TODO write unit test > - > - String modelsDir = "src\\test\\resources\\models\\"; > +public class IMSMETester { > + // TODO write more tests > + // TODO modify when we fix the parameter model > + > + static String modelsDir = "src\\test\\resources\\models\\"; > + static String trainingDataDirectory = > "src\\test\\resources\\supervised\\models\\"; > + > + static IMSParameters IMSParams; > + static IMSME ims; > + static IMSFactory IMSFactory; > + static IMSModel model; > + > + static String test = "please.v"; > + static File outFile; > + > + static String test1 = "We need to discuss an important topic, > please write to me soon."; > + static String test2 = "The component was highly radioactive to the > point that" > + + " it has been activated the second it touched water"; > + static String test3 = "The summer is almost over and I did not go > to the beach even once"; > + > + static String[] sentence1; > + static String[] sentence2; > + static String[] sentence3; > + > + static String[] tags1; > + static String[] tags2; > + static String[] tags3; > + > + static String[] lemmas1; > + static String[] lemmas2; > + static String[] lemmas3; > + > + /* > + * Setup the testing variables > + */ > + @BeforeClass > + public static void setUpAndTraining() { > WSDHelper.loadTokenizer(modelsDir + "en-token.bin"); > WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict"); > WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin"); > > - IMSParameters params = new IMSParameters(""); > - > - WSDHelper.print(params.getTrainingDataDirectory()); > + sentence1 = WSDHelper.getTokenizer().tokenize(test1); > + sentence2 = WSDHelper.getTokenizer().tokenize(test2); > + sentence3 = WSDHelper.getTokenizer().tokenize(test3); > + > + tags1 = WSDHelper.getTagger().tag(sentence1); > + tags2 = WSDHelper.getTagger().tag(sentence2); > + tags3 = WSDHelper.getTagger().tag(sentence3); > > - IMSME ims = new IMSME(params); > - > - > - // This is how to make the context for one-word-disambiguation > using IMS > - > - String test1 = "We need to discuss important topic, please write > to me soon."; > - String[] sentence1 = WSDHelper.getTokenizer().tokenize(test1); > - String[] tags1 = WSDHelper.getTagger().tag(sentence1); > List<String> tempLemmas1 = new ArrayList<String>(); > for (int i = 0; i < sentence1.length; i++) { > - String lemma = WSDHelper.getLemmatizer() > - .lemmatize(sentence1[i], tags1[i]); > - tempLemmas1.add(lemma); > + tempLemmas1 > + .add(WSDHelper.getLemmatizer().lemmatize(sentence1[i], > tags1[i])); > } > - String[] lemmas1 = tempLemmas1.toArray(new > String[tempLemmas1.size()]); > + lemmas1 = tempLemmas1.toArray(new String[tempLemmas1.size()]); > > - // output > - String[] senses1 = ims.disambiguate(sentence1, tags1, lemmas1, > 8); > - System.out.print(lemmas1[8] + " :\t"); > - WSDHelper.print(senses1); > - WSDHelper.print("*****************************"); > - > - // This is how to make the context for disambiguation of span of > words > - > - String test2 = "The component was highly radioactive to the > point that" > - + " it has been activated the second it touched water"; > - String[] sentence2 = WSDHelper.getTokenizer().tokenize(test2); > - String[] tags2 = WSDHelper.getTagger().tag(sentence2); > List<String> tempLemmas2 = new ArrayList<String>(); > for (int i = 0; i < sentence2.length; i++) { > - String lemma = WSDHelper.getLemmatizer() > - .lemmatize(sentence2[i], tags2[i]); > - tempLemmas2.add(lemma); > + 
tempLemmas2 > + .add(WSDHelper.getLemmatizer().lemmatize(sentence2[i], > tags2[i])); > } > - String[] lemmas2 = tempLemmas2.toArray(new > String[tempLemmas2.size()]); > - Span span = new Span(3, 7); > - > - // output > - List<String[]> senses2 = ims.disambiguate(sentence2, tags2, > lemmas2, span); > - for (int i = span.getStart(); i < span.getEnd() + 1; i++) { > - String[] senses = senses2.get(i - span.getStart()); > - System.out.print(lemmas2[i] + " :\t"); > - WSDHelper.print(senses); > - WSDHelper.print("----------"); > - } > - > - WSDHelper.print("*****************************"); > + lemmas2 = tempLemmas2.toArray(new String[tempLemmas2.size()]); > > - // This is how to make the context for all-words-disambiguation > - > - String test3 = "The summer almost over and I not to the beach > even once"; > - String[] sentence3 = WSDHelper.getTokenizer().tokenize(test3); > - String[] tags3 = WSDHelper.getTagger().tag(sentence3); > List<String> tempLemmas3 = new ArrayList<String>(); > for (int i = 0; i < sentence3.length; i++) { > - String lemma = WSDHelper.getLemmatizer() > - .lemmatize(sentence3[i], tags3[i]); > - tempLemmas3.add(lemma); > + tempLemmas3 > + .add(WSDHelper.getLemmatizer().lemmatize(sentence3[i], > tags3[i])); > } > - String[] lemmas3 = tempLemmas3.toArray(new > String[tempLemmas3.size()]); > + lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]); > > - // output > - List<String[]> senses3 = ims.disambiguate(sentence3, tags3, > lemmas3); > - for (int i = 0; i < sentence3.length; i++) { > - String[] senses = senses3.get(i); > - System.out.print(lemmas3[i] + " :\t"); > - WSDHelper.print(senses); > - WSDHelper.print("----------"); > + IMSParams = new IMSParameters(""); > + IMSParams.setTrainingDataDirectory(trainingDataDirectory); > + IMSFactory = new IMSFactory(); > + TrainingParameters trainingParams = new TrainingParameters(); > + SemcorReaderExtended sr = new SemcorReaderExtended(); > + ObjectStream<WSDSample> sampleStream = > sr.getSemcorDataStream(test); > + > + IMSModel writeModel = null; > + /* > + * Tests training the disambiguator We test both writing and > reading a model > + * file trained by semcor > + */ > + > + try { > + writeModel = IMSME.train("en", sampleStream, trainingParams, > IMSParams, > + IMSFactory); > + assertNotNull("Checking the model to be written", writeModel); > + writeModel.writeModel(IMSParams.getTrainingDataDirectory() + > test); > + outFile = new File( > + IMSParams.getTrainingDataDirectory() + test + > ".ims.model"); > + model = new IMSModel(outFile); > + assertNotNull("Checking the read model", model); > + ims = new IMSME(model, IMSParams); > + assertNotNull("Checking the disambiguator", ims); > + } catch (IOException e1) { > + e1.printStackTrace(); > + fail("Exception in training"); > } > } > > + /* > + * Tests disambiguating only one word : The ambiguous word > "please" > + */ > + @Test > + public void testOneWordDisambiguation() { > + String[] senses = ims.disambiguate(sentence1, tags1, lemmas1, > 8); > + > + assertEquals("Check number of senses", 1, senses.length); > + } > + > + /* > + * Tests disambiguating a word Span In this case we test a mix of > monosemous > + * and polysemous words as well as words that do not need > disambiguation such > + * as determiners > + */ > + @Test > + public void testWordSpanDisambiguation() { > + Span span = new Span(3, 7); > + List<String[]> senses = ims.disambiguate(sentence2, tags2, > lemmas2, span); > + > + assertEquals("Check number of returned words", 5, > senses.size()); > + 
assertEquals("Check number of senses", 1, senses.get(0).length); > + assertEquals("Check monosemous word", 1, senses.get(1).length); > + assertEquals("Check preposition", "WSDHELPER to", > senses.get(2)[0]); > + assertEquals("Check determiner", "WSDHELPER determiner", > senses.get(3)[0]); > + } > + > + /* > + * Tests disambiguating all the words > + */ > + @Test > + public void testAllWordsDisambiguation() { > + List<String[]> senses = ims.disambiguate(sentence3, tags3, > lemmas3); > + > + assertEquals("Check number of returned words", 15, > senses.size()); > + assertEquals("Check preposition", "WSDHELPER personal pronoun", > + senses.get(6)[0]); > + } > + > } > > Modified: opennlp/sandbox/opennlp- > wsd/src/test/java/opennlp/tools/disambiguator/LeskTester.java > URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/tes > t/java/opennlp/tools/disambiguator/LeskTester.java?rev=1734600&r1=173 > 4599&r2=1734600&view=diff > ===================================================================== > ========= > --- opennlp/sandbox/opennlp- > wsd/src/test/java/opennlp/tools/disambiguator/LeskTester.java > (original) > +++ opennlp/sandbox/opennlp- > wsd/src/test/java/opennlp/tools/disambiguator/LeskTester.java Fri Mar > 11 17:37:07 2016 > @@ -19,81 +19,137 @@ > > package opennlp.tools.disambiguator; > > +import static org.junit.Assert.assertEquals; > + > import java.util.ArrayList; > import java.util.List; > > import opennlp.tools.disambiguator.lesk.Lesk; > import opennlp.tools.disambiguator.lesk.LeskParameters; > import opennlp.tools.disambiguator.lesk.LeskParameters.LESK_TYPE; > +import opennlp.tools.util.Span; > > +import org.junit.BeforeClass; > import org.junit.Test; > > +/** > + * This is the test class for {@link Lesk}. > + * > + * The scope of this test is to make sure that the Lesk > disambiguator code can be > + * executed. This test can not detect mistakes which lead to > incorrect feature > + * generation or other mistakes which decrease the disambiguation > performance of the > + * disambiguator. 
> + */ > public class LeskTester { > - @Test > - public static void main(String[] args) { > + // TODO write more tests > + > + static String modelsDir = "src\\test\\resources\\models\\"; > + > + static Lesk lesk; > + > + static String test1 = "We need to discuss an important topic, > please write to me soon."; > + static String test2 = "The component was highly radioactive to the > point that" > + + " it has been activated the second it touched water"; > + static String test3 = "The summer is almost over and I did not go > to the beach even once"; > + > + static String[] sentence1; > + static String[] sentence2; > + static String[] sentence3; > + > + static String[] tags1; > + static String[] tags2; > + static String[] tags3; > + > + static String[] lemmas1; > + static String[] lemmas2; > + static String[] lemmas3; > + > + /* > + * Setup the testing variables > + */ > + @BeforeClass > + public static void setUp() { > > - Lesk lesk = new Lesk(); > - LeskParameters params = new LeskParameters(); > - params.setLeskType(LESK_TYPE.LESK_EXT); > - boolean a[] = { true, true, true, true, true, true, true, true, > true, true }; > - params.setFeatures(a); > - lesk.setParams(params); > - String modelsDir = "src\\test\\resources\\models\\"; > WSDHelper.loadTokenizer(modelsDir + "en-token.bin"); > WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict"); > WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin"); > > - String test1 = "I went to the bank to deposit money."; > - String[] sentence1 = WSDHelper.getTokenizer().tokenize(test1); > - int targetWordIndex1 = 5; > - String[] tags1 = WSDHelper.getTagger().tag(sentence1); > + sentence1 = WSDHelper.getTokenizer().tokenize(test1); > + sentence2 = WSDHelper.getTokenizer().tokenize(test2); > + sentence3 = WSDHelper.getTokenizer().tokenize(test3); > + > + tags1 = WSDHelper.getTagger().tag(sentence1); > + tags2 = WSDHelper.getTagger().tag(sentence2); > + tags3 = WSDHelper.getTagger().tag(sentence3); > + > List<String> tempLemmas1 = new ArrayList<String>(); > for (int i = 0; i < sentence1.length; i++) { > - String lemma = WSDHelper.getLemmatizer() > - .lemmatize(sentence1[i], tags1[i]); > - tempLemmas1.add(lemma); > + tempLemmas1 > + .add(WSDHelper.getLemmatizer().lemmatize(sentence1[i], > tags1[i])); > } > - String[] lemmas1 = tempLemmas1.toArray(new > String[tempLemmas1.size()]); > - String[] results1 = lesk.disambiguate(sentence1, tags1, lemmas1, > - targetWordIndex1); > - WSDHelper.print(results1); > - WSDHelper.printResults(lesk, results1); > - > - WSDHelper.print("----------------------------------------"); > - > - String test2 = "it was a strong argument that his hypothesis was > true"; > - String[] sentence2 = WSDHelper.getTokenizer().tokenize(test2); > - int targetWordIndex2 = 4; > - String[] tags2 = WSDHelper.getTagger().tag(sentence2); > + lemmas1 = tempLemmas1.toArray(new String[tempLemmas1.size()]); > + > List<String> tempLemmas2 = new ArrayList<String>(); > - for (int i = 0; i < sentence1.length; i++) { > - String lemma = WSDHelper.getLemmatizer() > - .lemmatize(sentence2[i], tags2[i]); > - tempLemmas2.add(lemma); > + for (int i = 0; i < sentence2.length; i++) { > + tempLemmas2 > + .add(WSDHelper.getLemmatizer().lemmatize(sentence2[i], > tags2[i])); > } > - String[] lemmas2 = tempLemmas2.toArray(new > String[tempLemmas2.size()]); > - String[] results2 = lesk.disambiguate(sentence2, tags2, lemmas2, > - targetWordIndex2); > - WSDHelper.print(results2); > - WSDHelper.printResults(lesk, results2); > - 
WSDHelper.print("----------------------------------------"); > - > - String test3 = "the component was highly radioactive to the > point that it has been activated the second it touched water"; > - String[] sentence3 = WSDHelper.getTokenizer().tokenize(test3); > - int targetWordIndex3 = 3; > - String[] tags3 = WSDHelper.getTagger().tag(sentence3); > + lemmas2 = tempLemmas2.toArray(new String[tempLemmas2.size()]); > + > List<String> tempLemmas3 = new ArrayList<String>(); > for (int i = 0; i < sentence3.length; i++) { > - String lemma = WSDHelper.getLemmatizer() > - .lemmatize(sentence3[i], tags3[i]); > - tempLemmas3.add(lemma); > + tempLemmas3 > + .add(WSDHelper.getLemmatizer().lemmatize(sentence3[i], > tags3[i])); > } > - String[] lemmas3 = tempLemmas3.toArray(new > String[tempLemmas3.size()]); > - String[] results3 = lesk.disambiguate(sentence3, tags3, lemmas3, > - targetWordIndex3); > - WSDHelper.print(results3); > - WSDHelper.printResults(lesk, results3); > - WSDHelper.print("----------------------------------------"); > + lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]); > + > + lesk = new Lesk(); > + > + LeskParameters params = new LeskParameters(); > + params.setLeskType(LESK_TYPE.LESK_EXT); > + boolean a[] = { true, true, true, true, true, true, true, true, > true, > + true }; > + params.setFeatures(a); > + lesk.setParams(params); > + } > + > + /* > + * Tests disambiguating only one word : The ambiguous word > "please" > + */ > + @Test > + public void testOneWordDisambiguation() { > + String[] senses = lesk.disambiguate(sentence1, tags1, lemmas1, > 8); > + > + assertEquals("Check number of senses", 1, senses.length); > + } > + > + /* > + * Tests disambiguating a word Span In this case we test a mix of > monosemous > + * and polysemous words as well as words that do not need > disambiguation such > + * as determiners > + */ > + @Test > + public void testWordSpanDisambiguation() { > + Span span = new Span(3, 7); > + List<String[]> senses = lesk.disambiguate(sentence2, tags2, > lemmas2, span); > + > + assertEquals("Check number of returned words", 5, > senses.size()); > + assertEquals("Check number of senses", 3, senses.get(0).length); > + assertEquals("Check monosemous word", 1, senses.get(1).length); > + assertEquals("Check preposition", "WSDHELPER to", > senses.get(2)[0]); > + assertEquals("Check determiner", "WSDHELPER determiner", > senses.get(3)[0]); > + } > + > + /* > + * Tests disambiguating all the words > + */ > + @Test > + public void testAllWordsDisambiguation() { > + List<String[]> senses = lesk.disambiguate(sentence3, tags3, > lemmas3); > + > + assertEquals("Check number of returned words", 15, > senses.size()); > + assertEquals("Check preposition", "WSDHELPER personal pronoun", > + senses.get(6)[0]); > } > > } > \ No newline at end of file > > Modified: opennlp/sandbox/opennlp- > wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java > URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/tes > t/java/opennlp/tools/disambiguator/MFSTester.java?rev=1734600&r1=1734 > 599&r2=1734600&view=diff > ===================================================================== > ========= > --- opennlp/sandbox/opennlp- > wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java > (original) > +++ opennlp/sandbox/opennlp- > wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java Fri Mar > 11 17:37:07 2016 > @@ -19,96 +19,128 @@ > > package opennlp.tools.disambiguator; > > +import static org.junit.Assert.assertEquals; > + > import java.util.ArrayList; 
> import java.util.List; > +import org.junit.BeforeClass; > +import org.junit.Test; > > import opennlp.tools.disambiguator.mfs.MFS; > import opennlp.tools.util.Span; > > /** > - * This is a typical example of how to call the disambiguation > function in the > - * MFS class. > + * This is the test class for {@link MFS}. > + * > + * The scope of this test is to make sure that the MFS disambiguator > code can be > + * executed. This test can not detect mistakes which lead to > incorrect feature > + * generation or other mistakes which decrease the disambiguation > performance of the > + * disambiguator. > */ > public class MFSTester { > + // TODO write more tests > + // TODO modify when we fix the parameter model > + > + static String modelsDir = "src\\test\\resources\\models\\"; > + > + static MFS mfs; > + > + static String test1 = "We need to discuss an important topic, > please write to me soon."; > + static String test2 = "The component was highly radioactive to the > point that" > + + " it has been activated the second it touched water"; > + static String test3 = "The summer is almost over and I did not go > to the beach even once"; > + > + static String[] sentence1; > + static String[] sentence2; > + static String[] sentence3; > + > + static String[] tags1; > + static String[] tags2; > + static String[] tags3; > + > + static String[] lemmas1; > + static String[] lemmas2; > + static String[] lemmas3; > + > + /* > + * Setup the testing variables and the training files > + */ > + @BeforeClass > + public static void setUpAndTraining() { > > - public static void main(String[] args) { > - String modelsDir = "src\\test\\resources\\models\\"; > WSDHelper.loadTokenizer(modelsDir + "en-token.bin"); > WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict"); > WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin"); > > - MFS mfs = new MFS(); > + sentence1 = WSDHelper.getTokenizer().tokenize(test1); > + sentence2 = WSDHelper.getTokenizer().tokenize(test2); > + sentence3 = WSDHelper.getTokenizer().tokenize(test3); > + > + tags1 = WSDHelper.getTagger().tag(sentence1); > + tags2 = WSDHelper.getTagger().tag(sentence2); > + tags3 = WSDHelper.getTagger().tag(sentence3); > > - /** > - * This is how to make the context for one-word-disambiguation > using IMS > - */ > - String test1 = "We need to discuss important topic, please write > to me soon."; > - String[] sentence1 = WSDHelper.getTokenizer().tokenize(test1); > - String[] tags1 = WSDHelper.getTagger().tag(sentence1); > List<String> tempLemmas1 = new ArrayList<String>(); > for (int i = 0; i < sentence1.length; i++) { > - String lemma = WSDHelper.getLemmatizer() > - .lemmatize(sentence1[i], tags1[i]); > - tempLemmas1.add(lemma); > + tempLemmas1 > + .add(WSDHelper.getLemmatizer().lemmatize(sentence1[i], > tags1[i])); > } > - String[] lemmas1 = tempLemmas1.toArray(new > String[tempLemmas1.size()]); > + lemmas1 = tempLemmas1.toArray(new String[tempLemmas1.size()]); > > - // output > - String[] senses1 = mfs.disambiguate(sentence1, tags1, lemmas1, > 8); > - System.out.print(lemmas1[8] + " :\t"); > - WSDHelper.print(senses1); > - WSDHelper.print("*****************************"); > - > - /** > - * This is how to make the context for disambiguation of span of > words > - */ > - String test2 = "The component was highly radioactive to the > point that" > - + " it has been activated the second it touched water"; > - String[] sentence2 = WSDHelper.getTokenizer().tokenize(test2); > - String[] tags2 = WSDHelper.getTagger().tag(sentence2); > List<String> tempLemmas2 
= new ArrayList<String>(); > for (int i = 0; i < sentence2.length; i++) { > - String lemma = WSDHelper.getLemmatizer() > - .lemmatize(sentence2[i], tags2[i]); > - tempLemmas2.add(lemma); > + tempLemmas2 > + .add(WSDHelper.getLemmatizer().lemmatize(sentence2[i], > tags2[i])); > } > - String[] lemmas2 = tempLemmas2.toArray(new > String[tempLemmas2.size()]); > - Span span = new Span(3, 7); > + lemmas2 = tempLemmas2.toArray(new String[tempLemmas2.size()]); > > - // output > - List<String[]> senses2 = mfs.disambiguate(sentence2, tags2, > lemmas2, span); > - for (int i = span.getStart(); i < span.getEnd() + 1; i++) { > - String[] senses = senses2.get(i - span.getStart()); > - System.out.print(lemmas2[i] + " :\t"); > - WSDHelper.print(senses); > - WSDHelper.print("----------"); > - } > - > - WSDHelper.print("*****************************"); > - > - /** > - * This is how to make the context for all-words-disambiguation > - */ > - String test3 = "The summer is almost over and I have not been to > the beach even once"; > - String[] sentence3 = WSDHelper.getTokenizer().tokenize(test3); > - String[] tags3 = WSDHelper.getTagger().tag(sentence3); > List<String> tempLemmas3 = new ArrayList<String>(); > for (int i = 0; i < sentence3.length; i++) { > - String lemma = WSDHelper.getLemmatizer() > - .lemmatize(sentence3[i], tags3[i]); > - tempLemmas3.add(lemma); > + tempLemmas3 > + .add(WSDHelper.getLemmatizer().lemmatize(sentence3[i], > tags3[i])); > } > - String[] lemmas3 = tempLemmas3.toArray(new > String[tempLemmas3.size()]); > + lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]); > > - // output > - List<String[]> senses3 = mfs.disambiguate(sentence3, tags3, > lemmas3); > - for (int i = 0; i < sentence3.length; i++) { > - String[] senses = senses3.get(i); > - System.out.print(lemmas3[i] + " :\t"); > - WSDHelper.print(senses); > - WSDHelper.print("----------"); > - } > + mfs = new MFS(); > > } > > + /* > + * Tests disambiguating only one word : The ambiguous word > "please" > + */ > + @Test > + public void testOneWordDisambiguation() { > + String[] senses = mfs.disambiguate(sentence1, tags1, lemmas1, > 8); > + > + assertEquals("Check number of senses", 1, senses.length); > + } > + > + /* > + * Tests disambiguating a word Span In this case we test a mix of > monosemous > + * and polysemous words as well as words that do not need > disambiguation such > + * as determiners > + */ > + @Test > + public void testWordSpanDisambiguation() { > + Span span = new Span(3, 7); > + List<String[]> senses = mfs.disambiguate(sentence2, tags2, > lemmas2, span); > + > + assertEquals("Check number of returned words", 5, > senses.size()); > + assertEquals("Check number of senses", 1, senses.get(0).length); > + assertEquals("Check monosemous word", 1, senses.get(1).length); > + assertEquals("Check preposition", "WSDHELPER to", > senses.get(2)[0]); > + assertEquals("Check determiner", "WSDHELPER determiner", > senses.get(3)[0]); > + } > + > + /* > + * Tests disambiguating all the words > + */ > + @Test > + public void testAllWordsDisambiguation() { > + List<String[]> senses = mfs.disambiguate(sentence3, tags3, > lemmas3); > + > + assertEquals("Check number of returned words", 15, > senses.size()); > + assertEquals("Check preposition", "WSDHELPER personal pronoun", > + senses.get(6)[0]); > + } > } > \ No newline at end of file > > Copied: opennlp/sandbox/opennlp- > wsd/src/test/java/opennlp/tools/disambiguator/OSCCMETester.java (from > r1733577, opennlp/sandbox/opennlp- > 
wsd/src/test/java/opennlp/tools/disambiguator/OSCCTester.java) > URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/tes > t/java/opennlp/tools/disambiguator/OSCCMETester.java?p2=opennlp/sandb > ox/opennlp- > wsd/src/test/java/opennlp/tools/disambiguator/OSCCMETester.java&p1=op > ennlp/sandbox/opennlp- > wsd/src/test/java/opennlp/tools/disambiguator/OSCCTester.java&r1=1733 > 577&r2=1734600&rev=1734600&view=diff > ===================================================================== > ========= > --- opennlp/sandbox/opennlp- > wsd/src/test/java/opennlp/tools/disambiguator/OSCCTester.java > (original) > +++ opennlp/sandbox/opennlp- > wsd/src/test/java/opennlp/tools/disambiguator/OSCCMETester.java Fri > Mar 11 17:37:07 2016 > @@ -19,11 +19,18 @@ > > package opennlp.tools.disambiguator; > > +import static org.junit.Assert.assertEquals; > +import static org.junit.Assert.assertNotNull; > +import static org.junit.Assert.fail; > + > import java.io.File; > import java.io.IOException; > import java.util.ArrayList; > import java.util.List; > > +import org.junit.BeforeClass; > +import org.junit.Test; > + > import opennlp.tools.disambiguator.datareader.SemcorReaderExtended; > import opennlp.tools.disambiguator.oscc.OSCCFactory; > import opennlp.tools.disambiguator.oscc.OSCCME; > @@ -33,84 +40,154 @@ import opennlp.tools.util.ObjectStream; > import opennlp.tools.util.Span; > import opennlp.tools.util.TrainingParameters; > > -public class OSCCTester { > - > - public static void main(String[] args) { > - > - SemcorReaderExtended sr = new SemcorReaderExtended(); > - > - String modelsDir = "src\\test\\resources\\models\\"; > +/** > + * This is the test class for {@link OSCCME}. > + * > + * The scope of this test is to make sure that the OSCC > disambiguator code can > + * be executed. This test can not detect mistakes which lead to > incorrect > + * feature generation or other mistakes which decrease the > disambiguation > + * performance of the disambiguator. > + * > + * In this test the {@link OSCCME} is trained with Semcor and then > the computed > + * model is used to predict sentences from the training sentences. 
> + */ > +public class OSCCMETester { > + // TODO write more tests > + // TODO modify when we fix the parameter model > + > + static String modelsDir = "src\\test\\resources\\models\\"; > + static String trainingDataDirectory = > "src\\test\\resources\\supervised\\models\\"; > + > + static OSCCParameters OSCCParams; > + static OSCCME oscc; > + static OSCCFactory osccFactory; > + static OSCCModel model; > + > + static String test = "please.v"; > + static File outFile; > + > + static String test1 = "We need to discuss an important topic, > please write to me soon."; > + static String test2 = "The component was highly radioactive to the > point that" > + + " it has been activated the second it touched water"; > + static String test3 = "The summer is almost over and I did not go > to the beach even once"; > + > + static String[] sentence1; > + static String[] sentence2; > + static String[] sentence3; > + > + static String[] tags1; > + static String[] tags2; > + static String[] tags3; > + > + static String[] lemmas1; > + static String[] lemmas2; > + static String[] lemmas3; > + > + /* > + * Setup the testing variables > + */ > + @BeforeClass > + public static void setUpAndTraining() { > WSDHelper.loadTokenizer(modelsDir + "en-token.bin"); > WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict"); > WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin"); > > - String test = "write.v"; > - TrainingParameters trainingParams = new TrainingParameters(); > - OSCCParameters OSCCParams = new OSCCParameters(""); > - OSCCFactory OSCCFactory = new OSCCFactory(); > + sentence1 = WSDHelper.getTokenizer().tokenize(test1); > + sentence2 = WSDHelper.getTokenizer().tokenize(test2); > + sentence3 = WSDHelper.getTokenizer().tokenize(test3); > + > + tags1 = WSDHelper.getTagger().tag(sentence1); > + tags2 = WSDHelper.getTagger().tag(sentence2); > + tags3 = WSDHelper.getTagger().tag(sentence3); > + > + List<String> tempLemmas1 = new ArrayList<String>(); > + for (int i = 0; i < sentence1.length; i++) { > + tempLemmas1 > + .add(WSDHelper.getLemmatizer().lemmatize(sentence1[i], > tags1[i])); > + } > + lemmas1 = tempLemmas1.toArray(new String[tempLemmas1.size()]); > > + List<String> tempLemmas2 = new ArrayList<String>(); > + for (int i = 0; i < sentence2.length; i++) { > + tempLemmas2 > + .add(WSDHelper.getLemmatizer().lemmatize(sentence2[i], > tags2[i])); > + } > + lemmas2 = tempLemmas2.toArray(new String[tempLemmas2.size()]); > + > + List<String> tempLemmas3 = new ArrayList<String>(); > + for (int i = 0; i < sentence3.length; i++) { > + tempLemmas3 > + .add(WSDHelper.getLemmatizer().lemmatize(sentence3[i], > tags3[i])); > + } > + lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]); > + > + OSCCParams = new OSCCParameters(""); > + OSCCParams.setTrainingDataDirectory(trainingDataDirectory); > + osccFactory = new OSCCFactory(); > + TrainingParameters trainingParams = new TrainingParameters(); > + SemcorReaderExtended sr = new SemcorReaderExtended(); > ObjectStream<WSDSample> sampleStream = > sr.getSemcorDataStream(test); > > - OSCCModel model = null; > - OSCCModel readModel = null; > - try { > - model = OSCCME.train("en", sampleStream, trainingParams, > OSCCParams, > - OSCCFactory); > - model.writeModel(test); > - File outFile = new File(test + ".OSCC.model"); > - readModel = new OSCCModel(outFile); > + OSCCModel writeModel = null; > + /* > + * Tests training the disambiguator We test both writing and > reading a model > + * file trained by semcor > + */ > > + try { > + writeModel = OSCCME.train("en", 
sampleStream, trainingParams, > OSCCParams, > + osccFactory); > + assertNotNull("Checking the model to be written", writeModel); > + writeModel.writeModel(OSCCParams.getTrainingDataDirectory() + > test); > + outFile = new File( > + OSCCParams.getTrainingDataDirectory() + test + > ".oscc.model"); > + model = new OSCCModel(outFile); > + assertNotNull("Checking the read model", model); > + oscc = new OSCCME(model, OSCCParams); > + assertNotNull("Checking the disambiguator", oscc); > } catch (IOException e1) { > - // TODO Auto-generated catch block > e1.printStackTrace(); > + fail("Exception in training"); > } > - OSCCME OSCC = new OSCCME(readModel, OSCCParams); > + } > > - /** > - * This is how to make the context for one-word-disambiguation > using OSCC > - */ > - String test1 = "We need to discuss important topic, please write > to me soon."; > - String[] sentence1 = WSDHelper.getTokenizer().tokenize(test1); > - String[] tags1 = WSDHelper.getTagger().tag(sentence1); > - List<String> tempLemmas1 = new ArrayList<String>(); > - for (int i = 0; i < sentence1.length; i++) { > - String lemma = WSDHelper.getLemmatizer() > - .lemmatize(sentence1[i], tags1[i]); > - tempLemmas1.add(lemma); > - } > - String[] lemmas1 = tempLemmas1.toArray(new > String[tempLemmas1.size()]); > + /* > + * Tests disambiguating only one word : The ambiguous word > "please" > + */ > + @Test > + public void testOneWordDisambiguation() { > + String[] senses = oscc.disambiguate(sentence1, tags1, lemmas1, > 8); > > - // output > - String[] senses1 = OSCC.disambiguate(sentence1, tags1, lemmas1, > 8); > - System.out.print(lemmas1[8] + " :\t"); > - WSDHelper.print(senses1); > - WSDHelper.print("*****************************"); > + assertEquals("Check number of senses", 1, senses.length); > + } > > - /** > - * This is how to make the context for disambiguation of span of > words > - */ > - String test2 = "The component was highly radioactive to the > point that" > - + " it has been activated the second it touched water"; > - String[] sentence2 = WSDHelper.getTokenizer().tokenize(test2); > - String[] tags2 = WSDHelper.getTagger().tag(sentence2); > - List<String> tempLemmas2 = new ArrayList<String>(); > - for (int i = 0; i < sentence2.length; i++) { > - String lemma = WSDHelper.getLemmatizer() > - .lemmatize(sentence2[i], tags2[i]); > - tempLemmas2.add(lemma); > - } > - String[] lemmas2 = tempLemmas2.toArray(new > String[tempLemmas2.size()]); > + /* > + * Tests disambiguating a word Span In this case we test a mix of > monosemous > + * and polysemous words as well as words that do not need > disambiguation such > + * as determiners > + */ > + @Test > + public void testWordSpanDisambiguation() { > Span span = new Span(3, 7); > + List<String[]> senses = oscc.disambiguate(sentence2, tags2, > lemmas2, span); > > - // output > - List<String[]> senses2 = OSCC.disambiguate(sentence2, tags2, > lemmas2, span); > - for (int i = span.getStart(); i < span.getEnd() + 1; i++) { > - String[] senses = senses2.get(i - span.getStart()); > - System.out.print(lemmas2[i] + " :\t"); > - WSDHelper.print(senses); > - WSDHelper.print("----------"); > - } > + assertEquals("Check number of returned words", 5, > senses.size()); > + assertEquals("Check number of senses", 1, senses.get(0).length); > + assertEquals("Check monosemous word", 1, senses.get(1).length); > + assertEquals("Check preposition", "WSDHELPER to", > senses.get(2)[0]); > + assertEquals("Check determiner", "WSDHELPER determiner", > senses.get(3)[0]); > + } > > - 
WSDHelper.print("*****************************"); > + /* > + * Tests disambiguating all the words > + */ > + @Test > + public void testAllWordsDisambiguation() { > + List<String[]> senses = oscc.disambiguate(sentence3, tags3, > lemmas3); > + > + assertEquals("Check number of returned words", 15, > senses.size()); > + assertEquals("Check preposition", "WSDHELPER personal pronoun", > + senses.get(6)[0]); > } > + > } > \ No newline at end of file > > Modified: opennlp/sandbox/opennlp- > wsd/src/test/java/opennlp/tools/disambiguator/Tester.java > URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/tes > t/java/opennlp/tools/disambiguator/Tester.java?rev=1734600&r1=1734599 > &r2=1734600&view=diff > ===================================================================== > ========= > --- opennlp/sandbox/opennlp- > wsd/src/test/java/opennlp/tools/disambiguator/Tester.java (original) > +++ opennlp/sandbox/opennlp- > wsd/src/test/java/opennlp/tools/disambiguator/Tester.java Fri Mar 11 > 17:37:07 2016 > @@ -1,36 +1,40 @@ > package opennlp.tools.disambiguator; > > +import java.util.ArrayList; > +import java.util.List; > > +import opennlp.tools.disambiguator.ims.IMSME; > +import opennlp.tools.disambiguator.ims.IMSParameters; > > public class Tester { > > public static void main(String[] args) { > -// > -// String modelsDir = "src\\test\\resources\\models\\"; > -// WSDHelper.loadTokenizer(modelsDir + "en-token.bin"); > -// WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict"); > -// WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin"); > -// > -// IMSME ims = new IMSME(); > -// > -// String test3 = "The summer is almost over and I haven't been > to the beach even once"; > -// String[] sentence3 = WSDHelper.getTokenizer().tokenize(test3); > -// String[] tags3 = WSDHelper.getTagger().tag(sentence3); > -// List<String> tempLemmas3 = new ArrayList<String>(); > -// for (int i = 0; i < sentence3.length; i++) { > -// String lemma = WSDHelper.getLemmatizer() > -// .lemmatize(sentence3[i], tags3[i]); > -// tempLemmas3.add(lemma); > -// } > -// String[] lemmas3 = tempLemmas3.toArray(new > String[tempLemmas3.size()]); > -// > -// // output > -// List<String[]> senses3 = ims.disambiguate(sentence3, tags3, > lemmas3); > -// for (int i = 0; i < sentence3.length; i++) { > -// System.out.print(sentence3[i] + " : "); > -// WSDHelper.printResults(ims, senses3.get(i)); > -// WSDHelper.print("----------"); > -// } > + > + String modelsDir = "src\\test\\resources\\models\\"; > + WSDHelper.loadTokenizer(modelsDir + "en-token.bin"); > + WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict"); > + WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin"); > + > + IMSME ims = new IMSME(new IMSParameters("\\")); > + > + String test3 = "The summer is almost over and I haven't been to > the beach even once"; > + String[] sentence3 = WSDHelper.getTokenizer().tokenize(test3); > + String[] tags3 = WSDHelper.getTagger().tag(sentence3); > + List<String> tempLemmas3 = new ArrayList<String>(); > + for (int i = 0; i < sentence3.length; i++) { > + String lemma = > WSDHelper.getLemmatizer().lemmatize(sentence3[i], > + tags3[i]); > + tempLemmas3.add(lemma); > + } > + String[] lemmas3 = tempLemmas3.toArray(new > String[tempLemmas3.size()]); > + > + // output > + List<String[]> senses3 = ims.disambiguate(sentence3, tags3, > lemmas3); > + for (int i = 0; i < sentence3.length; i++) { > + System.out.print(sentence3[i] + " : "); > + WSDHelper.printResults(ims, senses3.get(i)); > + WSDHelper.print("----------"); > + } > > } > } > 
\ No newline at end of file > >
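A few comments on the new testers below the quote.

The MFS, OSCCME and IMS testers (and the revived Tester main class) all repeat the same tokenize / tag / lemmatize block before calling disambiguate(). A small shared helper would cut that duplication; this is only a sketch built on the WSDHelper accessors that appear in the diff, the class name, the preprocess() method and the Context holder are made up and not existing API:

package opennlp.tools.disambiguator;

import java.util.ArrayList;
import java.util.List;

// Sketch of a shared pre-processing step for the testers.
// Relies only on the WSDHelper accessors used in the diff; the
// tokenizer, tagger and lemmatizer must already have been loaded
// via WSDHelper.loadTokenizer(...) etc.
public class TestPreprocessor {

  // Hypothetical holder for the three parallel arrays the
  // disambiguators expect.
  public static class Context {
    public final String[] tokens;
    public final String[] tags;
    public final String[] lemmas;

    Context(String[] tokens, String[] tags, String[] lemmas) {
      this.tokens = tokens;
      this.tags = tags;
      this.lemmas = lemmas;
    }
  }

  public static Context preprocess(String text) {
    String[] tokens = WSDHelper.getTokenizer().tokenize(text);
    String[] tags = WSDHelper.getTagger().tag(tokens);
    List<String> lemmas = new ArrayList<String>(tokens.length);
    for (int i = 0; i < tokens.length; i++) {
      lemmas.add(WSDHelper.getLemmatizer().lemmatize(tokens[i], tags[i]));
    }
    return new Context(tokens, tags,
        lemmas.toArray(new String[lemmas.size()]));
  }
}

With something like this, each @BeforeClass becomes three calls to preprocess(testN) instead of three copies of the same loop.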
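The train / write / re-read cycle done in OSCCMETester#setUpAndTraining can also be shown outside of JUnit. The snippet below is only a sketch that mirrors the tester: the class and variable names are made up, the calls (OSCCME.train, writeModel, the OSCCModel file constructor, the parameter setters) and the paths are the ones visible in the diff, and whether writeModel() appends the ".oscc.model" suffix itself is an assumption based on how the tester re-reads the file:

package opennlp.tools.disambiguator;

import java.io.File;
import java.io.IOException;

import opennlp.tools.disambiguator.datareader.SemcorReaderExtended;
import opennlp.tools.disambiguator.oscc.OSCCFactory;
import opennlp.tools.disambiguator.oscc.OSCCME;
import opennlp.tools.disambiguator.oscc.OSCCModel;
import opennlp.tools.disambiguator.oscc.OSCCParameters;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.TrainingParameters;

// Sketch: train an OSCC model for one target word, persist it and
// load it back, mirroring setUpAndTraining(). Placed in the same
// package as the testers so WSDHelper and WSDSample resolve without
// further imports.
public class OSCCTrainingSketch {

  public static void main(String[] args) throws IOException {
    String modelsDir = "src/test/resources/models/";
    WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
    WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
    WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");

    String target = "please.v";

    OSCCParameters params = new OSCCParameters("");
    params.setTrainingDataDirectory("src/test/resources/supervised/models/");

    ObjectStream<WSDSample> samples =
        new SemcorReaderExtended().getSemcorDataStream(target);

    OSCCModel trained = OSCCME.train("en", samples,
        new TrainingParameters(), params, new OSCCFactory());

    // The tester re-reads the file under "<target>.oscc.model", so
    // writeModel() apparently appends that suffix itself.
    trained.writeModel(params.getTrainingDataDirectory() + target);

    OSCCModel reloaded = new OSCCModel(new File(
        params.getTrainingDataDirectory() + target + ".oscc.model"));

    // Ready for disambiguate() calls.
    OSCCME oscc = new OSCCME(reloaded, params);
  }
}

Doing the training plus the assertNotNull checks inside @BeforeClass means a training problem surfaces as a setup error rather than a test failure, which seems acceptable here but is worth keeping in mind when more tests are added.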
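The new test methods exercise the three disambiguate() variants: a single token index, a Span, and the whole sentence. For readers of this thread, here is a compact usage sketch using only calls that appear in the diff; it uses the model-free MFS baseline so no trained model is needed. The sentence, the index 8 ("please") and the Span(3, 7) are taken from the tests; everything else (class name, variables, printing) is illustrative:

package opennlp.tools.disambiguator;

import java.util.List;

// MFS is assumed to live in the mfs sub-package, analogous to ims
// and oscc; the import is not shown in the quoted diff.
import opennlp.tools.disambiguator.mfs.MFS;
import opennlp.tools.util.Span;

// Sketch: the three disambiguate() variants the new tests cover.
public class DisambiguateUsageSketch {

  public static void main(String[] args) {
    String modelsDir = "src/test/resources/models/";
    WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
    WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
    WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");

    String text = "We need to discuss an important topic, please write to me soon.";
    String[] tokens = WSDHelper.getTokenizer().tokenize(text);
    String[] tags = WSDHelper.getTagger().tag(tokens);
    String[] lemmas = new String[tokens.length];
    for (int i = 0; i < tokens.length; i++) {
      lemmas[i] = WSDHelper.getLemmatizer().lemmatize(tokens[i], tags[i]);
    }

    MFS mfs = new MFS();

    // 1) a single target word addressed by its token index
    String[] senses = mfs.disambiguate(tokens, tags, lemmas, 8);
    WSDHelper.print(senses);

    // 2) a span of words
    List<String[]> spanSenses =
        mfs.disambiguate(tokens, tags, lemmas, new Span(3, 7));
    WSDHelper.print("span results: " + spanSenses.size());

    // 3) all-words disambiguation of the whole sentence
    List<String[]> allSenses = mfs.disambiguate(tokens, tags, lemmas);
    for (int i = 0; i < allSenses.size(); i++) {
      // words that are not looked up in WordNet (prepositions,
      // determiners, pronouns, ...) come back with a single
      // "WSDHELPER ..." marker instead of a sense key
      System.out.print(lemmas[i] + " :\t");
      WSDHelper.print(allSenses.get(i));
      WSDHelper.print("----------");
    }
  }
}

That "WSDHELPER ..." marker behaviour is exactly what the new assertions check; one small nit there: the assertion on the personal pronoun in testAllWordsDisambiguation still carries the message "Check preposition".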
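One more nit: the testers and the revived Tester main class hard-code Windows separators ("src\\test\\resources\\models\\", and Tester passes "\\" to IMSParameters), which will not resolve on a Linux build machine. A portable alternative could look like the sketch below; the class and method names are just an illustration:

package opennlp.tools.disambiguator;

import java.io.File;

// Sketch: portable alternatives to the hard-coded
// "src\\test\\resources\\models\\" path used in the testers.
public class TestPaths {

  // java.io.File accepts forward slashes on Windows as well, so a
  // plain relative path is already portable:
  public static final String MODELS_DIR = "src/test/resources/models/";

  // or, built explicitly from the platform separator:
  public static String modelsDir() {
    return "src" + File.separator + "test" + File.separator
        + "resources" + File.separator + "models" + File.separator;
  }
}

The "\\" argument to IMSParameters in Tester looks like the same kind of hard-coded separator, assuming that constructor argument is meant to be a directory.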
