Jörn, can you please review my change to the ExtensionLoader? I modified it to support singletons (classes with a private constructor and a static INSTANCE field).
Thank you, William 2014-04-16 12:26 GMT-03:00 <co...@apache.org>: > Author: colen > Date: Wed Apr 16 15:26:24 2014 > New Revision: 1587944 > > URL: http://svn.apache.org/r1587944 > Log: > OPENNLP-674 Added factory to Doccat > > Added: > > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java > (with props) > > opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/doccat/DoccatFactoryTest.java > (with props) > opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/doccat/ > > opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/doccat/DoccatSample.txt > (with props) > Modified: > > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java > > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java > > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/TrainingParams.java > > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatCrossValidator.java > > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatModel.java > > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java > > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorFactory.java > > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/ext/ExtensionLoader.java > > Modified: > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java > URL: > http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java?rev=1587944&r1=1587943&r2=1587944&view=diff > > ============================================================================== > --- > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java > (original) > +++ > 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java > Wed Apr 16 15:26:24 2014 > @@ -34,8 +34,10 @@ import opennlp.tools.cmdline.doccat.Docc > import opennlp.tools.cmdline.params.CVParams; > import opennlp.tools.doccat.DoccatCrossValidator; > import opennlp.tools.doccat.DoccatEvaluationMonitor; > +import opennlp.tools.doccat.DoccatFactory; > import opennlp.tools.doccat.DocumentSample; > import opennlp.tools.doccat.FeatureGenerator; > +import opennlp.tools.tokenize.Tokenizer; > import opennlp.tools.util.eval.EvaluationMonitor; > import opennlp.tools.util.model.ModelUtil; > > @@ -88,13 +90,18 @@ public final class DoccatCrossValidatorT > FeatureGenerator[] featureGenerators = DoccatTrainerTool > .createFeatureGenerators(params.getFeatureGenerators()); > > + Tokenizer tokenizer = DoccatTrainerTool.createTokenizer(params > + .getTokenizer()); > + > DoccatEvaluationMonitor[] listenersArr = listeners > .toArray(new DoccatEvaluationMonitor[listeners.size()]); > > DoccatCrossValidator validator; > try { > + DoccatFactory factory = DoccatFactory.create(params.getFactory(), > + tokenizer, featureGenerators); > validator = new DoccatCrossValidator(params.getLang(), mlParams, > - featureGenerators, listenersArr); > + factory, listenersArr); > > validator.evaluate(sampleStream, params.getFolds()); > } catch (IOException e) { > > Modified: > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java > URL: > http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java?rev=1587944&r1=1587943&r2=1587944&view=diff > > ============================================================================== > --- > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java > (original) > +++ > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java > Wed Apr 16 15:26:24 2014 > 
@@ -26,16 +26,19 @@ import opennlp.tools.cmdline.TerminateTo > import opennlp.tools.cmdline.doccat.DoccatTrainerTool.TrainerToolParams; > import opennlp.tools.cmdline.params.TrainingToolParams; > import opennlp.tools.doccat.BagOfWordsFeatureGenerator; > +import opennlp.tools.doccat.DoccatFactory; > import opennlp.tools.doccat.DoccatModel; > import opennlp.tools.doccat.DocumentCategorizerME; > import opennlp.tools.doccat.DocumentSample; > import opennlp.tools.doccat.FeatureGenerator; > +import opennlp.tools.tokenize.Tokenizer; > +import opennlp.tools.tokenize.WhitespaceTokenizer; > import opennlp.tools.util.ext.ExtensionLoader; > import opennlp.tools.util.model.ModelUtil; > > public class DoccatTrainerTool > extends AbstractTrainerTool<DocumentSample, TrainerToolParams> { > - > + > interface TrainerToolParams extends TrainingParams, TrainingToolParams { > } > > @@ -47,7 +50,7 @@ public class DoccatTrainerTool > public String getShortDescription() { > return "trainer for the learnable document categorizer"; > } > - > + > @Override > public void run(String format, String[] args) { > super.run(format, args); > @@ -64,10 +67,14 @@ public class DoccatTrainerTool > FeatureGenerator[] featureGenerators = createFeatureGenerators(params > .getFeatureGenerators()); > > + Tokenizer tokenizer = createTokenizer(params.getTokenizer()); > + > DoccatModel model; > try { > + DoccatFactory factory = DoccatFactory.create(params.getFactory(), > + tokenizer, featureGenerators); > model = DocumentCategorizerME.train(params.getLang(), sampleStream, > - mlParams, featureGenerators); > + mlParams, factory); > } catch (IOException e) { > throw new TerminateToolException(-1, "IO error while reading > training data or indexing data: " + > e.getMessage(), e); > @@ -79,10 +86,17 @@ public class DoccatTrainerTool > // sorry that this can fail > } > } > - > + > CmdLineUtil.writeModel("document categorizer", modelOutFile, model); > } > > + static Tokenizer createTokenizer(String tokenizer) { > + 
if(tokenizer != null) { > + return ExtensionLoader.instantiateExtension(Tokenizer.class, > tokenizer); > + } > + return WhitespaceTokenizer.INSTANCE; > + } > + > static FeatureGenerator[] createFeatureGenerators(String > featureGeneratorsNames) { > if(featureGeneratorsNames == null) { > FeatureGenerator[] def = {new BagOfWordsFeatureGenerator()}; > > Modified: > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/TrainingParams.java > URL: > http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/TrainingParams.java?rev=1587944&r1=1587943&r2=1587944&view=diff > > ============================================================================== > --- > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/TrainingParams.java > (original) > +++ > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/TrainingParams.java > Wed Apr 16 15:26:24 2014 > @@ -32,4 +32,12 @@ interface TrainingParams extends BasicTr > @OptionalParameter > String getFeatureGenerators(); > > + @ParameterDescription(valueName = "tokenizer", description = "Tokenizer > implementation. 
WhitespaceTokenizer is used if not specified.") > + @OptionalParameter > + String getTokenizer(); > + > + @ParameterDescription(valueName = "factoryName", description = "A > sub-class of DoccatFactory where to get implementation and resources.") > + @OptionalParameter > + String getFactory(); > + > } > > Modified: > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatCrossValidator.java > URL: > http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatCrossValidator.java?rev=1587944&r1=1587943&r2=1587944&view=diff > > ============================================================================== > --- > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatCrossValidator.java > (original) > +++ > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatCrossValidator.java > Wed Apr 16 15:26:24 2014 > @@ -34,18 +34,19 @@ public class DoccatCrossValidator { > > private DoccatEvaluationMonitor[] listeners; > > - private FeatureGenerator[] featureGenarators; > + private DoccatFactory factory; > + > > /** > * Creates a {@link DoccatCrossValidator} with the given > * {@link FeatureGenerator}s. > */ > public DoccatCrossValidator(String languageCode, TrainingParameters > mlParams, > - FeatureGenerator[] featureGenerators, DoccatEvaluationMonitor[] > listeners) { > + DoccatFactory factory, DoccatEvaluationMonitor ... 
listeners) { > this.languageCode = languageCode; > this.params = mlParams; > this.listeners = listeners; > - this.featureGenarators = featureGenerators; > + this.factory = factory; > } > > /** > @@ -70,7 +71,7 @@ public class DoccatCrossValidator { > .next(); > > DoccatModel model = DocumentCategorizerME.train(languageCode, > - trainingSampleStream, params, featureGenarators); > + trainingSampleStream, params, factory); > > DocumentCategorizerEvaluator evaluator = new > DocumentCategorizerEvaluator( > new DocumentCategorizerME(model), listeners); > > Added: > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java > URL: > http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java?rev=1587944&view=auto > > ============================================================================== > --- > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java > (added) > +++ > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java > Wed Apr 16 15:26:24 2014 > @@ -0,0 +1,174 @@ > +/* > + * Licensed to the Apache Software Foundation (ASF) under one or more > + * contributor license agreements. See the NOTICE file distributed with > + * this work for additional information regarding copyright ownership. > + * The ASF licenses this file to You under the Apache License, Version 2.0 > + * (the "License"); you may not use this file except in compliance with > + * the License. You may obtain a copy of the License at > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or > implied. > + * See the License for the specific language governing permissions and > + * limitations under the License. 
> + */ > + > +package opennlp.tools.doccat; > + > +import java.util.Arrays; > +import java.util.Iterator; > +import java.util.List; > +import java.util.Map; > + > +import opennlp.tools.tokenize.Tokenizer; > +import opennlp.tools.tokenize.WhitespaceTokenizer; > +import opennlp.tools.util.BaseToolFactory; > +import opennlp.tools.util.InvalidFormatException; > +import opennlp.tools.util.ext.ExtensionLoader; > + > +/** > + * The factory that provides Doccat default implementations and resources > + */ > +public class DoccatFactory extends BaseToolFactory { > + > + private static final String FEATURE_GENERATORS = > "doccat.featureGenerators"; > + private static final String TOKENIZER_NAME = "doccat.tokenizer"; > + > + private FeatureGenerator[] featureGenerators; > + private Tokenizer tokenizer; > + > + /** > + * Creates a {@link DoccatFactory} that provides the default > implementation of > + * the resources. > + */ > + public DoccatFactory() { > + } > + > + /** > + * Creates a {@link DoccatFactory}. Use this constructor to > programmatically > + * create a factory. 
> + * > + * @param tokenizer > + * @param featureGenerators > + */ > + public DoccatFactory(Tokenizer tokenizer, FeatureGenerator[] > featureGenerators) { > + this.init(tokenizer, featureGenerators); > + } > + > + protected void init(Tokenizer tokenizer, FeatureGenerator[] > featureGenerators) { > + > + this.featureGenerators = featureGenerators; > + this.tokenizer = tokenizer; > + } > + > + @Override > + public Map<String, String> createManifestEntries() { > + Map<String, String> manifestEntries = super.createManifestEntries(); > + > + if (getTokenizer() != null) { > + manifestEntries.put(TOKENIZER_NAME, getTokenizer().getClass() > + .getCanonicalName()); > + } > + > + if (getFeatureGenerators() != null) { > + manifestEntries.put(FEATURE_GENERATORS, > featureGeneratorsAsString()); > + } > + > + return manifestEntries; > + } > + > + private String featureGeneratorsAsString() { > + List<FeatureGenerator> fgs = Arrays.asList(getFeatureGenerators()); > + Iterator<FeatureGenerator> iter = fgs.iterator(); > + StringBuilder sb = new StringBuilder(); > + if (iter.hasNext()) { > + sb.append(iter.next().getClass().getCanonicalName()); > + while (iter.hasNext()) { > + sb.append(',').append(iter.next().getClass().getCanonicalName()); > + } > + } > + return sb.toString(); > + } > + > + @Override > + public void validateArtifactMap() throws InvalidFormatException { > + // nothing to validate > + } > + > + public static DoccatFactory create(String subclassName, Tokenizer > tokenizer, > + FeatureGenerator[] featureGenerators) throws InvalidFormatException > { > + if (subclassName == null) { > + // will create the default factory > + return new DoccatFactory(tokenizer, featureGenerators); > + } > + try { > + DoccatFactory theFactory = ExtensionLoader.instantiateExtension( > + DoccatFactory.class, subclassName); > + theFactory.init(tokenizer, featureGenerators); > + return theFactory; > + } catch (Exception e) { > + String msg = "Could not instantiate the " + subclassName > + + ". 
The initialization throw an exception."; > + System.err.println(msg); > + e.printStackTrace(); > + throw new InvalidFormatException(msg, e); > + } > + > + } > + > + private FeatureGenerator[] loadFeatureGenerators(String classNames) { > + String[] classes = classNames.split(","); > + FeatureGenerator[] fgs = new FeatureGenerator[classes.length]; > + > + for (int i = 0; i < classes.length; i++) { > + fgs[i] = > ExtensionLoader.instantiateExtension(FeatureGenerator.class, > + classes[i]); > + } > + return fgs; > + } > + > + public FeatureGenerator[] getFeatureGenerators() { > + if (featureGenerators == null) { > + if (artifactProvider != null) { > + String classNames = artifactProvider > + .getManifestProperty(FEATURE_GENERATORS); > + if (classNames != null) { > + this.featureGenerators = loadFeatureGenerators(classNames); > + } > + } > + if (featureGenerators == null) { // could not load using artifact > provider > + // load bag of words as default > + FeatureGenerator[] bow = { new BagOfWordsFeatureGenerator() }; > + this.featureGenerators = bow; > + } > + } > + return featureGenerators; > + } > + > + public void setFeatureGenerators(FeatureGenerator[] featureGenerators) { > + this.featureGenerators = featureGenerators; > + } > + > + public Tokenizer getTokenizer() { > + if (this.tokenizer == null) { > + if (artifactProvider != null) { > + String className = > artifactProvider.getManifestProperty(TOKENIZER_NAME); > + if (className != null) { > + this.tokenizer = ExtensionLoader.instantiateExtension( > + Tokenizer.class, className); > + } > + } > + if (this.tokenizer == null) { // could not load using artifact > provider > + this.tokenizer = WhitespaceTokenizer.INSTANCE; > + } > + } > + return tokenizer; > + } > + > + public void setTokenizer(Tokenizer tokenizer) { > + this.tokenizer = tokenizer; > + } > + > +} > > Propchange: > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java > > 
------------------------------------------------------------------------------ > svn:mime-type = text/plain > > Modified: > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatModel.java > URL: > http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatModel.java?rev=1587944&r1=1587943&r2=1587944&view=diff > > ============================================================================== > --- > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatModel.java > (original) > +++ > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatModel.java > Wed Apr 16 15:26:24 2014 > @@ -25,34 +25,50 @@ import java.util.Map; > > import opennlp.tools.ml.model.AbstractModel; > import opennlp.tools.ml.model.MaxentModel; > +import opennlp.tools.util.BaseToolFactory; > import opennlp.tools.util.InvalidFormatException; > import opennlp.tools.util.model.BaseModel; > > public class DoccatModel extends BaseModel { > - > + > private static final String COMPONENT_NAME = "DocumentCategorizerME"; > private static final String DOCCAT_MODEL_ENTRY_NAME = "doccat.model"; > - > - protected DoccatModel(String languageCode, MaxentModel doccatModel, > - Map<String, String> manifestInfoEntries) { > - super(COMPONENT_NAME, languageCode, manifestInfoEntries); > - > + > + public DoccatModel(String languageCode, MaxentModel doccatModel, > + Map<String, String> manifestInfoEntries, DoccatFactory factory) { > + super(COMPONENT_NAME, languageCode, manifestInfoEntries, factory); > + > artifactMap.put(DOCCAT_MODEL_ENTRY_NAME, doccatModel); > checkArtifactMap(); > } > - > + > + /** > + * @deprecated Use > + * {@link #DoccatModel(String, MaxentModel, Map, > DoccatFactory)} > + * instead and pass in a {@link DoccatFactory} > + */ > + protected DoccatModel(String languageCode, MaxentModel doccatModel, > + Map<String, String> manifestInfoEntries) { > + this(languageCode, doccatModel, manifestInfoEntries, new > 
DoccatFactory()); > + } > + > + /** > + * @deprecated Use > + * {@link #DoccatModel(String, MaxentModel, Map, > DoccatFactory)} > + * instead and pass in a {@link DoccatFactory} > + */ > public DoccatModel(String languageCode, MaxentModel doccatModel) { > this(languageCode, doccatModel, null); > } > - > + > public DoccatModel(InputStream in) throws IOException, > InvalidFormatException { > super(COMPONENT_NAME, in); > } > - > + > public DoccatModel(File modelFile) throws IOException, > InvalidFormatException { > super(COMPONENT_NAME, modelFile); > } > - > + > public DoccatModel(URL modelURL) throws IOException, > InvalidFormatException { > super(COMPONENT_NAME, modelURL); > } > @@ -66,7 +82,23 @@ public class DoccatModel extends BaseMod > } > } > > + public DoccatFactory getFactory() { > + return (DoccatFactory) this.toolFactory; > + } > + > + @Override > + protected Class<? extends BaseToolFactory> getDefaultFactory() { > + return DoccatFactory.class; > + } > + > + /** > + * @deprecated Use {@link #getMaxentModel()} instead. 
> + */ > public MaxentModel getChunkerModel() { > return (MaxentModel) artifactMap.get(DOCCAT_MODEL_ENTRY_NAME); > } > + > + public MaxentModel getMaxentModel() { > + return (MaxentModel) artifactMap.get(DOCCAT_MODEL_ENTRY_NAME); > + } > } > > Modified: > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java > URL: > http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java?rev=1587944&r1=1587943&r2=1587944&view=diff > > ============================================================================== > --- > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java > (original) > +++ > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java > Wed Apr 16 15:26:24 2014 > @@ -25,7 +25,6 @@ import java.util.Map; > > import opennlp.tools.ml.model.MaxentModel; > import opennlp.tools.ml.model.TrainUtil; > -import opennlp.tools.tokenize.SimpleTokenizer; > import opennlp.tools.tokenize.Tokenizer; > import opennlp.tools.util.ObjectStream; > import opennlp.tools.util.TrainingParameters; > @@ -40,29 +39,35 @@ public class DocumentCategorizerME imple > * Shared default thread safe feature generator. > */ > private static FeatureGenerator defaultFeatureGenerator = new > BagOfWordsFeatureGenerator(); > - > - private MaxentModel model; > + > + private DoccatModel model; > private DocumentCategorizerContextGenerator mContextGenerator; > > /** > - * Initializes a the current instance with a doccat model and custom > feature generation. > - * The feature generation must be identical to the configuration at > training time. > - * > + * Initializes a the current instance with a doccat model and custom > feature > + * generation. The feature generation must be identical to the > configuration > + * at training time. 
> + * > * @param model > * @param featureGenerators > + * > + * @deprecated train a {@link DoccatModel} with a specific > + * {@link DoccatFactory} to customize the {@link > FeatureGenerator}s > */ > public DocumentCategorizerME(DoccatModel model, FeatureGenerator... > featureGenerators) { > - this.model = model.getChunkerModel(); > + this.model = model; > this.mContextGenerator = new > DocumentCategorizerContextGenerator(featureGenerators); > } > - > + > /** > * Initializes the current instance with a doccat model. Default > feature generation is used. > - * > + * > * @param model > */ > public DocumentCategorizerME(DoccatModel model) { > - this(model, defaultFeatureGenerator); > + this.model = model; > + this.mContextGenerator = new > DocumentCategorizerContextGenerator(this.model > + .getFactory().getFeatureGenerators()); > } > > /** > @@ -71,7 +76,7 @@ public class DocumentCategorizerME imple > * @param text > */ > public double[] categorize(String text[]) { > - return model.eval(mContextGenerator.getContext(text)); > + return > model.getMaxentModel().eval(mContextGenerator.getContext(text)); > } > > /** > @@ -79,57 +84,79 @@ public class DocumentCategorizerME imple > * is passed to the feature generation. 
> */ > public double[] categorize(String documentText) { > - Tokenizer tokenizer = SimpleTokenizer.INSTANCE; > + Tokenizer tokenizer = model.getFactory().getTokenizer(); > return categorize(tokenizer.tokenize(documentText)); > } > > public String getBestCategory(double[] outcome) { > - return model.getBestOutcome(outcome); > + return model.getMaxentModel().getBestOutcome(outcome); > } > > public int getIndex(String category) { > - return model.getIndex(category); > + return model.getMaxentModel().getIndex(category); > } > > public String getCategory(int index) { > - return model.getOutcome(index); > + return model.getMaxentModel().getOutcome(index); > } > > public int getNumberOfCategories() { > - return model.getNumOutcomes(); > + return model.getMaxentModel().getNumOutcomes(); > } > > public String getAllResults(double results[]) { > - return model.getAllOutcomes(results); > + return model.getMaxentModel().getAllOutcomes(results); > } > > + /** > + * @deprecated Use > + * {@link #train(String, ObjectStream, TrainingParameters, > DoccatFactory)} > + * instead. > + */ > public static DoccatModel train(String languageCode, > ObjectStream<DocumentSample> samples, > TrainingParameters mlParams, FeatureGenerator... 
featureGenerators) > throws IOException { > - > + > if (featureGenerators.length == 0) { > featureGenerators = new > FeatureGenerator[]{defaultFeatureGenerator}; > } > - > + > Map<String, String> manifestInfoEntries = new HashMap<String, > String>(); > - > + > MaxentModel model = TrainUtil.train( > new DocumentCategorizerEventStream(samples, featureGenerators), > mlParams.getSettings(), manifestInfoEntries); > - > + > return new DoccatModel(languageCode, model, manifestInfoEntries); > } > - > + > + public static DoccatModel train(String languageCode, > ObjectStream<DocumentSample> samples, > + TrainingParameters mlParams, DoccatFactory factory) > + throws IOException { > + > + Map<String, String> manifestInfoEntries = new HashMap<String, > String>(); > + > + MaxentModel model = TrainUtil.train( > + new DocumentCategorizerEventStream(samples, > factory.getFeatureGenerators()), > + mlParams.getSettings(), manifestInfoEntries); > + > + return new DoccatModel(languageCode, model, manifestInfoEntries, > factory); > + } > + > /** > * Trains a doccat model with default feature generation. > - * > + * > * @param languageCode > * @param samples > - * > + * > * @return the trained doccat model > - * > + * > * @throws IOException > - * @throws ObjectStreamException > + * @throws ObjectStreamException > + * > + * @deprecated Use > + * {@link #train(String, ObjectStream, TrainingParameters, > DoccatFactory)} > + * instead. 
> */ > public static DoccatModel train(String languageCode, > ObjectStream<DocumentSample> samples) throws IOException { > return train(languageCode, samples, > ModelUtil.createDefaultTrainingParameters(), defaultFeatureGenerator); > > Modified: > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorFactory.java > URL: > http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorFactory.java?rev=1587944&r1=1587943&r2=1587944&view=diff > > ============================================================================== > --- > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorFactory.java > (original) > +++ > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorFactory.java > Wed Apr 16 15:26:24 2014 > @@ -52,7 +52,7 @@ public class SentenceDetectorFactory ext > /** > * Creates a {@link SentenceDetectorFactory}. Use this constructor to > * programmatically create a factory. 
> - * > + * > * @param languageCode > * @param abbreviationDictionary > * @param eosCharacters > @@ -61,7 +61,7 @@ public class SentenceDetectorFactory ext > Dictionary abbreviationDictionary, char[] eosCharacters) { > this.init(languageCode, useTokenEnd, abbreviationDictionary, > eosCharacters); > } > - > + > protected void init(String languageCode, boolean useTokenEnd, > Dictionary abbreviationDictionary, char[] eosCharacters) { > this.languageCode = languageCode; > > Modified: > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/ext/ExtensionLoader.java > URL: > http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/ext/ExtensionLoader.java?rev=1587944&r1=1587943&r2=1587944&view=diff > > ============================================================================== > --- > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/ext/ExtensionLoader.java > (original) > +++ > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/ext/ExtensionLoader.java > Wed Apr 16 15:26:24 2014 > @@ -17,6 +17,8 @@ > > package opennlp.tools.util.ext; > > +import java.lang.reflect.Field; > + > /** > * The {@link ExtensionLoader} is responsible to load extensions to the > OpenNLP library. > * <p> > @@ -64,6 +66,24 @@ public class ExtensionLoader { > } catch (InstantiationException e) { > throw new ExtensionNotLoadedException(e); > } catch (IllegalAccessException e) { > + // constructor is private. 
Try to load using INSTANCE > + Field instanceField; > + try { > + instanceField = extClazz.getDeclaredField("INSTANCE"); > + } catch (NoSuchFieldException e1) { > + throw new ExtensionNotLoadedException(e1); > + } catch (SecurityException e1) { > + throw new ExtensionNotLoadedException(e1); > + } > + if(instanceField != null) { > + try { > + return (T) instanceField.get(null); > + } catch (IllegalArgumentException e1) { > + throw new ExtensionNotLoadedException(e1); > + } catch (IllegalAccessException e1) { > + throw new ExtensionNotLoadedException(e1); > + } > + } > throw new ExtensionNotLoadedException(e); > } > } > > Added: > opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/doccat/DoccatFactoryTest.java > URL: > http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/doccat/DoccatFactoryTest.java?rev=1587944&view=auto > > ============================================================================== > --- > opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/doccat/DoccatFactoryTest.java > (added) > +++ > opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/doccat/DoccatFactoryTest.java > Wed Apr 16 15:26:24 2014 > @@ -0,0 +1,100 @@ > +package opennlp.tools.doccat; > + > +import static org.junit.Assert.assertEquals; > +import static org.junit.Assert.assertNotNull; > + > +import java.io.ByteArrayInputStream; > +import java.io.ByteArrayOutputStream; > +import java.io.IOException; > + > +import opennlp.tools.formats.ResourceAsStreamFactory; > +import opennlp.tools.tokenize.SimpleTokenizer; > +import opennlp.tools.tokenize.WhitespaceTokenizer; > +import opennlp.tools.util.InputStreamFactory; > +import opennlp.tools.util.ObjectStream; > +import opennlp.tools.util.PlainTextByLineStream; > +import opennlp.tools.util.TrainingParameters; > + > +import org.junit.Test; > + > +/** > + * Tests for the {@link DoccatFactory} class. 
> + */ > +public class DoccatFactoryTest { > + > + private static ObjectStream<DocumentSample> createSampleStream() > + throws IOException { > + > + InputStreamFactory isf = new ResourceAsStreamFactory( > + DoccatFactoryTest.class, > "/opennlp/tools/doccat/DoccatSample.txt"); > + > + return new DocumentSampleStream(new PlainTextByLineStream(isf, > "UTF-8")); > + } > + > + private static DoccatModel train() throws IOException { > + return DocumentCategorizerME.train("x-unspecified", > createSampleStream(), > + TrainingParameters.defaultParams()); > + } > + > + private static DoccatModel train(DoccatFactory factory) throws > IOException { > + return DocumentCategorizerME.train("x-unspecified", > createSampleStream(), > + TrainingParameters.defaultParams(), factory); > + } > + > + @Test > + public void testDefault() throws IOException { > + DoccatModel model = train(); > + > + assertNotNull(model); > + > + ByteArrayOutputStream out = new ByteArrayOutputStream(); > + model.serialize(out); > + ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray()); > + > + DoccatModel fromSerialized = new DoccatModel(in); > + > + DoccatFactory factory = fromSerialized.getFactory(); > + > + assertNotNull(factory); > + > + assertEquals(1, factory.getFeatureGenerators().length); > + assertEquals(BagOfWordsFeatureGenerator.class, > + factory.getFeatureGenerators()[0].getClass()); > + > + assertEquals(WhitespaceTokenizer.INSTANCE, factory.getTokenizer()); > + > + } > + > + @Test > + public void testCustom() throws IOException { > + FeatureGenerator[] featureGenerators = { new > BagOfWordsFeatureGenerator(), > + new NGramFeatureGenerator() }; > + DoccatFactory factory = new DoccatFactory(SimpleTokenizer.INSTANCE, > + featureGenerators); > + > + DoccatModel model = train(factory); > + > + assertNotNull(model); > + > + ByteArrayOutputStream out = new ByteArrayOutputStream(); > + model.serialize(out); > + ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray()); > + > 
+ DoccatModel fromSerialized = new DoccatModel(in); > + > + factory = fromSerialized.getFactory(); > + > + assertNotNull(factory); > + > + assertEquals(2, factory.getFeatureGenerators().length); > + assertEquals(BagOfWordsFeatureGenerator.class, > + factory.getFeatureGenerators()[0].getClass()); > + assertEquals(NGramFeatureGenerator.class, > + factory.getFeatureGenerators()[1].getClass()); > + > + assertEquals(SimpleTokenizer.INSTANCE.getClass(), > factory.getTokenizer() > + .getClass()); > + > + } > + > +} > > Propchange: > opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/doccat/DoccatFactoryTest.java > > ------------------------------------------------------------------------------ > svn:mime-type = text/plain > > >