resourc...

Jörn Kottmann Tue, 22 Apr 2014 04:49:40 -0700

Hello,

Sorry for my late response. I reviewed your change and it
looks good to me. Maybe you could add a comment about it
to the Javadoc of that method.


Jörn 

On Wed, 2014-04-16 at 12:37 -0300, William Colen wrote:
> Jörn,
> 
> Can you please review my change to the ExtensionLoader? I modified it to
> accept singletons (private constructor and the field INSTANCE).
> 
> Thank you,
> William
> 
> 
> 2014-04-16 12:26 GMT-03:00 <co...@apache.org>:
> 
> > Author: colen
> > Date: Wed Apr 16 15:26:24 2014
> > New Revision: 1587944
> >
> > URL: http://svn.apache.org/r1587944
> > Log:
> > OPENNLP-674 Added factory to Doccat
> >
> > Added:
> >
> > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java
> >   (with props)
> >
> > opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/doccat/DoccatFactoryTest.java
> >   (with props)
> >     opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/doccat/
> >
> > opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/doccat/DoccatSample.txt
> >   (with props)
> > Modified:
> >
> > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java
> >
> > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java
> >
> > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/TrainingParams.java
> >
> > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatCrossValidator.java
> >
> > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatModel.java
> >
> > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
> >
> > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorFactory.java
> >
> > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/ext/ExtensionLoader.java
> >
> > Modified:
> > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java
> > URL:
> > http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java?rev=1587944&r1=1587943&r2=1587944&view=diff
> >
> > ==============================================================================
> > ---
> > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java
> > (original)
> > +++
> > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java
> > Wed Apr 16 15:26:24 2014
> > @@ -34,8 +34,10 @@ import opennlp.tools.cmdline.doccat.Docc
> >  import opennlp.tools.cmdline.params.CVParams;
> >  import opennlp.tools.doccat.DoccatCrossValidator;
> >  import opennlp.tools.doccat.DoccatEvaluationMonitor;
> > +import opennlp.tools.doccat.DoccatFactory;
> >  import opennlp.tools.doccat.DocumentSample;
> >  import opennlp.tools.doccat.FeatureGenerator;
> > +import opennlp.tools.tokenize.Tokenizer;
> >  import opennlp.tools.util.eval.EvaluationMonitor;
> >  import opennlp.tools.util.model.ModelUtil;
> >
> > @@ -88,13 +90,18 @@ public final class DoccatCrossValidatorT
> >      FeatureGenerator[] featureGenerators = DoccatTrainerTool
> >          .createFeatureGenerators(params.getFeatureGenerators());
> >
> > +    Tokenizer tokenizer = DoccatTrainerTool.createTokenizer(params
> > +        .getTokenizer());
> > +
> >      DoccatEvaluationMonitor[] listenersArr = listeners
> >          .toArray(new DoccatEvaluationMonitor[listeners.size()]);
> >
> >      DoccatCrossValidator validator;
> >      try {
> > +      DoccatFactory factory = DoccatFactory.create(params.getFactory(),
> > +          tokenizer, featureGenerators);
> >        validator = new DoccatCrossValidator(params.getLang(), mlParams,
> > -          featureGenerators, listenersArr);
> > +          factory, listenersArr);
> >
> >        validator.evaluate(sampleStream, params.getFolds());
> >      } catch (IOException e) {
> >
> > Modified:
> > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java
> > URL:
> > http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java?rev=1587944&r1=1587943&r2=1587944&view=diff
> >
> > ==============================================================================
> > ---
> > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java
> > (original)
> > +++
> > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java
> > Wed Apr 16 15:26:24 2014
> > @@ -26,16 +26,19 @@ import opennlp.tools.cmdline.TerminateTo
> >  import opennlp.tools.cmdline.doccat.DoccatTrainerTool.TrainerToolParams;
> >  import opennlp.tools.cmdline.params.TrainingToolParams;
> >  import opennlp.tools.doccat.BagOfWordsFeatureGenerator;
> > +import opennlp.tools.doccat.DoccatFactory;
> >  import opennlp.tools.doccat.DoccatModel;
> >  import opennlp.tools.doccat.DocumentCategorizerME;
> >  import opennlp.tools.doccat.DocumentSample;
> >  import opennlp.tools.doccat.FeatureGenerator;
> > +import opennlp.tools.tokenize.Tokenizer;
> > +import opennlp.tools.tokenize.WhitespaceTokenizer;
> >  import opennlp.tools.util.ext.ExtensionLoader;
> >  import opennlp.tools.util.model.ModelUtil;
> >
> >  public class DoccatTrainerTool
> >      extends AbstractTrainerTool<DocumentSample, TrainerToolParams> {
> > -
> > +
> >    interface TrainerToolParams extends TrainingParams, TrainingToolParams {
> >    }
> >
> > @@ -47,7 +50,7 @@ public class DoccatTrainerTool
> >    public String getShortDescription() {
> >      return "trainer for the learnable document categorizer";
> >    }
> > -
> > +
> >    @Override
> >    public void run(String format, String[] args) {
> >      super.run(format, args);
> > @@ -64,10 +67,14 @@ public class DoccatTrainerTool
> >      FeatureGenerator[] featureGenerators = createFeatureGenerators(params
> >          .getFeatureGenerators());
> >
> > +    Tokenizer tokenizer = createTokenizer(params.getTokenizer());
> > +
> >      DoccatModel model;
> >      try {
> > +      DoccatFactory factory = DoccatFactory.create(params.getFactory(),
> > +          tokenizer, featureGenerators);
> >        model = DocumentCategorizerME.train(params.getLang(), sampleStream,
> > -          mlParams, featureGenerators);
> > +          mlParams, factory);
> >      } catch (IOException e) {
> >        throw new TerminateToolException(-1, "IO error while reading
> > training data or indexing data: " +
> >            e.getMessage(), e);
> > @@ -79,10 +86,17 @@ public class DoccatTrainerTool
> >          // sorry that this can fail
> >        }
> >      }
> > -
> > +
> >      CmdLineUtil.writeModel("document categorizer", modelOutFile, model);
> >    }
> >
> > +  static Tokenizer createTokenizer(String tokenizer) {
> > +    if(tokenizer != null) {
> > +      return ExtensionLoader.instantiateExtension(Tokenizer.class,
> > tokenizer);
> > +    }
> > +    return WhitespaceTokenizer.INSTANCE;
> > +  }
> > +
> >    static FeatureGenerator[] createFeatureGenerators(String
> > featureGeneratorsNames) {
> >      if(featureGeneratorsNames == null) {
> >        FeatureGenerator[] def = {new BagOfWordsFeatureGenerator()};
> >
> > Modified:
> > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/TrainingParams.java
> > URL:
> > http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/TrainingParams.java?rev=1587944&r1=1587943&r2=1587944&view=diff
> >
> > ==============================================================================
> > ---
> > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/TrainingParams.java
> > (original)
> > +++
> > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/TrainingParams.java
> > Wed Apr 16 15:26:24 2014
> > @@ -32,4 +32,12 @@ interface TrainingParams extends BasicTr
> >    @OptionalParameter
> >    String getFeatureGenerators();
> >
> > +  @ParameterDescription(valueName = "tokenizer", description = "Tokenizer
> > implementation. WhitespaceTokenizer is used if not specified.")
> > +  @OptionalParameter
> > +  String getTokenizer();
> > +
> > +  @ParameterDescription(valueName = "factoryName", description = "A
> > sub-class of DoccatFactory where to get implementation and resources.")
> > +  @OptionalParameter
> > +  String getFactory();
> > +
> >  }
> >
> > Modified:
> > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatCrossValidator.java
> > URL:
> > http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatCrossValidator.java?rev=1587944&r1=1587943&r2=1587944&view=diff
> >
> > ==============================================================================
> > ---
> > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatCrossValidator.java
> > (original)
> > +++
> > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatCrossValidator.java
> > Wed Apr 16 15:26:24 2014
> > @@ -34,18 +34,19 @@ public class DoccatCrossValidator {
> >
> >    private DoccatEvaluationMonitor[] listeners;
> >
> > -  private FeatureGenerator[] featureGenarators;
> > +  private DoccatFactory factory;
> > +
> >
> >    /**
> >     * Creates a {@link DoccatCrossValidator} with the given
> >     * {@link FeatureGenerator}s.
> >     */
> >    public DoccatCrossValidator(String languageCode, TrainingParameters
> > mlParams,
> > -      FeatureGenerator[] featureGenerators, DoccatEvaluationMonitor[]
> > listeners) {
> > +      DoccatFactory factory, DoccatEvaluationMonitor ... listeners) {
> >      this.languageCode = languageCode;
> >      this.params = mlParams;
> >      this.listeners = listeners;
> > -    this.featureGenarators = featureGenerators;
> > +    this.factory = factory;
> >    }
> >
> >    /**
> > @@ -70,7 +71,7 @@ public class DoccatCrossValidator {
> >            .next();
> >
> >        DoccatModel model = DocumentCategorizerME.train(languageCode,
> > -          trainingSampleStream, params, featureGenarators);
> > +          trainingSampleStream, params, factory);
> >
> >        DocumentCategorizerEvaluator evaluator = new
> > DocumentCategorizerEvaluator(
> >            new DocumentCategorizerME(model), listeners);
> >
> > Added:
> > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java
> > URL:
> > http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java?rev=1587944&view=auto
> >
> > ==============================================================================
> > ---
> > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java
> > (added)
> > +++
> > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java
> > Wed Apr 16 15:26:24 2014
> > @@ -0,0 +1,174 @@
> > +/*
> > + * Licensed to the Apache Software Foundation (ASF) under one or more
> > + * contributor license agreements.  See the NOTICE file distributed with
> > + * this work for additional information regarding copyright ownership.
> > + * The ASF licenses this file to You under the Apache License, Version 2.0
> > + * (the "License"); you may not use this file except in compliance with
> > + * the License. You may obtain a copy of the License at
> > + *
> > + *     http://www.apache.org/licenses/LICENSE-2.0
> > + *
> > + * Unless required by applicable law or agreed to in writing, software
> > + * distributed under the License is distributed on an "AS IS" BASIS,
> > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> > implied.
> > + * See the License for the specific language governing permissions and
> > + * limitations under the License.
> > + */
> > +
> > +package opennlp.tools.doccat;
> > +
> > +import java.util.Arrays;
> > +import java.util.Iterator;
> > +import java.util.List;
> > +import java.util.Map;
> > +
> > +import opennlp.tools.tokenize.Tokenizer;
> > +import opennlp.tools.tokenize.WhitespaceTokenizer;
> > +import opennlp.tools.util.BaseToolFactory;
> > +import opennlp.tools.util.InvalidFormatException;
> > +import opennlp.tools.util.ext.ExtensionLoader;
> > +
> > +/**
> > + * The factory that provides Doccat default implementations and resources
> > + */
> > +public class DoccatFactory extends BaseToolFactory {
> > +
> > +  private static final String FEATURE_GENERATORS =
> > "doccat.featureGenerators";
> > +  private static final String TOKENIZER_NAME = "doccat.tokenizer";
> > +
> > +  private FeatureGenerator[] featureGenerators;
> > +  private Tokenizer tokenizer;
> > +
> > +  /**
> > +   * Creates a {@link DoccatFactory} that provides the default
> > implementation of
> > +   * the resources.
> > +   */
> > +  public DoccatFactory() {
> > +  }
> > +
> > +  /**
> > +   * Creates a {@link DoccatFactory}. Use this constructor to
> > programmatically
> > +   * create a factory.
> > +   *
> > +   * @param tokenizer
> > +   * @param featureGenerators
> > +   */
> > +  public DoccatFactory(Tokenizer tokenizer, FeatureGenerator[]
> > featureGenerators) {
> > +    this.init(tokenizer, featureGenerators);
> > +  }
> > +
> > +  protected void init(Tokenizer tokenizer, FeatureGenerator[]
> > featureGenerators) {
> > +
> > +    this.featureGenerators = featureGenerators;
> > +    this.tokenizer = tokenizer;
> > +  }
> > +
> > +  @Override
> > +  public Map<String, String> createManifestEntries() {
> > +    Map<String, String> manifestEntries = super.createManifestEntries();
> > +
> > +    if (getTokenizer() != null) {
> > +      manifestEntries.put(TOKENIZER_NAME, getTokenizer().getClass()
> > +          .getCanonicalName());
> > +    }
> > +
> > +    if (getFeatureGenerators() != null) {
> > +      manifestEntries.put(FEATURE_GENERATORS,
> > featureGeneratorsAsString());
> > +    }
> > +
> > +    return manifestEntries;
> > +  }
> > +
> > +  private String featureGeneratorsAsString() {
> > +    List<FeatureGenerator> fgs = Arrays.asList(getFeatureGenerators());
> > +    Iterator<FeatureGenerator> iter = fgs.iterator();
> > +    StringBuilder sb = new StringBuilder();
> > +    if (iter.hasNext()) {
> > +      sb.append(iter.next().getClass().getCanonicalName());
> > +      while (iter.hasNext()) {
> > +        sb.append(',').append(iter.next().getClass().getCanonicalName());
> > +      }
> > +    }
> > +    return sb.toString();
> > +  }
> > +
> > +  @Override
> > +  public void validateArtifactMap() throws InvalidFormatException {
> > +    // nothing to validate
> > +  }
> > +
> > +  public static DoccatFactory create(String subclassName, Tokenizer
> > tokenizer,
> > +      FeatureGenerator[] featureGenerators) throws InvalidFormatException
> > {
> > +    if (subclassName == null) {
> > +      // will create the default factory
> > +      return new DoccatFactory(tokenizer, featureGenerators);
> > +    }
> > +    try {
> > +      DoccatFactory theFactory = ExtensionLoader.instantiateExtension(
> > +          DoccatFactory.class, subclassName);
> > +      theFactory.init(tokenizer, featureGenerators);
> > +      return theFactory;
> > +    } catch (Exception e) {
> > +      String msg = "Could not instantiate the " + subclassName
> > +          + ". The initialization throw an exception.";
> > +      System.err.println(msg);
> > +      e.printStackTrace();
> > +      throw new InvalidFormatException(msg, e);
> > +    }
> > +
> > +  }
> > +
> > +  private FeatureGenerator[] loadFeatureGenerators(String classNames) {
> > +    String[] classes = classNames.split(",");
> > +    FeatureGenerator[] fgs = new FeatureGenerator[classes.length];
> > +
> > +    for (int i = 0; i < classes.length; i++) {
> > +      fgs[i] =
> > ExtensionLoader.instantiateExtension(FeatureGenerator.class,
> > +          classes[i]);
> > +    }
> > +    return fgs;
> > +  }
> > +
> > +  public FeatureGenerator[] getFeatureGenerators() {
> > +    if (featureGenerators == null) {
> > +      if (artifactProvider != null) {
> > +        String classNames = artifactProvider
> > +            .getManifestProperty(FEATURE_GENERATORS);
> > +        if (classNames != null) {
> > +          this.featureGenerators = loadFeatureGenerators(classNames);
> > +        }
> > +      }
> > +      if (featureGenerators == null) { // could not load using artifact
> > provider
> > +        // load bag of words as default
> > +        FeatureGenerator[] bow = { new BagOfWordsFeatureGenerator() };
> > +        this.featureGenerators = bow;
> > +      }
> > +    }
> > +    return featureGenerators;
> > +  }
> > +
> > +  public void setFeatureGenerators(FeatureGenerator[] featureGenerators) {
> > +    this.featureGenerators = featureGenerators;
> > +  }
> > +
> > +  public Tokenizer getTokenizer() {
> > +    if (this.tokenizer == null) {
> > +      if (artifactProvider != null) {
> > +        String className =
> > artifactProvider.getManifestProperty(TOKENIZER_NAME);
> > +        if (className != null) {
> > +          this.tokenizer = ExtensionLoader.instantiateExtension(
> > +              Tokenizer.class, className);
> > +        }
> > +      }
> > +      if (this.tokenizer == null) { // could not load using artifact
> > provider
> > +        this.tokenizer = WhitespaceTokenizer.INSTANCE;
> > +      }
> > +    }
> > +    return tokenizer;
> > +  }
> > +
> > +  public void setTokenizer(Tokenizer tokenizer) {
> > +    this.tokenizer = tokenizer;
> > +  }
> > +
> > +}
> >
> > Propchange:
> > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java
> >
> > ------------------------------------------------------------------------------
> >     svn:mime-type = text/plain
> >
> > Modified:
> > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatModel.java
> > URL:
> > http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatModel.java?rev=1587944&r1=1587943&r2=1587944&view=diff
> >
> > ==============================================================================
> > ---
> > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatModel.java
> > (original)
> > +++
> > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatModel.java
> > Wed Apr 16 15:26:24 2014
> > @@ -25,34 +25,50 @@ import java.util.Map;
> >
> >  import opennlp.tools.ml.model.AbstractModel;
> >  import opennlp.tools.ml.model.MaxentModel;
> > +import opennlp.tools.util.BaseToolFactory;
> >  import opennlp.tools.util.InvalidFormatException;
> >  import opennlp.tools.util.model.BaseModel;
> >
> >  public class DoccatModel extends BaseModel {
> > -
> > +
> >    private static final String COMPONENT_NAME = "DocumentCategorizerME";
> >    private static final String DOCCAT_MODEL_ENTRY_NAME = "doccat.model";
> > -
> > -  protected DoccatModel(String languageCode, MaxentModel doccatModel,
> > -      Map<String, String> manifestInfoEntries) {
> > -    super(COMPONENT_NAME, languageCode, manifestInfoEntries);
> > -
> > +
> > +  public DoccatModel(String languageCode, MaxentModel doccatModel,
> > +      Map<String, String> manifestInfoEntries, DoccatFactory factory) {
> > +    super(COMPONENT_NAME, languageCode, manifestInfoEntries, factory);
> > +
> >      artifactMap.put(DOCCAT_MODEL_ENTRY_NAME, doccatModel);
> >      checkArtifactMap();
> >    }
> > -
> > +
> > +  /**
> > +   * @deprecated Use
> > +   *             {@link #DoccatModel(String, MaxentModel, Map,
> > DoccatFactory)}
> > +   *             instead and pass in a {@link DoccatFactory}
> > +   */
> > +  protected DoccatModel(String languageCode, MaxentModel doccatModel,
> > +      Map<String, String> manifestInfoEntries) {
> > +    this(languageCode, doccatModel, manifestInfoEntries, new
> > DoccatFactory());
> > +  }
> > +
> > +  /**
> > +   * @deprecated Use
> > +   *             {@link #DoccatModel(String, MaxentModel, Map,
> > DoccatFactory)}
> > +   *             instead and pass in a {@link DoccatFactory}
> > +   */
> >    public DoccatModel(String languageCode, MaxentModel doccatModel) {
> >      this(languageCode, doccatModel, null);
> >    }
> > -
> > +
> >    public DoccatModel(InputStream in) throws IOException,
> > InvalidFormatException {
> >      super(COMPONENT_NAME, in);
> >    }
> > -
> > +
> >    public DoccatModel(File modelFile) throws IOException,
> > InvalidFormatException {
> >      super(COMPONENT_NAME, modelFile);
> >    }
> > -
> > +
> >    public DoccatModel(URL modelURL) throws IOException,
> > InvalidFormatException {
> >      super(COMPONENT_NAME, modelURL);
> >    }
> > @@ -66,7 +82,23 @@ public class DoccatModel extends BaseMod
> >      }
> >    }
> >
> > +  public DoccatFactory getFactory() {
> > +    return (DoccatFactory) this.toolFactory;
> > +  }
> > +
> > +  @Override
> > +  protected Class<? extends BaseToolFactory> getDefaultFactory() {
> > +    return DoccatFactory.class;
> > +  }
> > +
> > +  /**
> > +   * @deprecated Use {@link #getMaxentModel()} instead.
> > +   */
> >    public MaxentModel getChunkerModel() {
> >      return (MaxentModel) artifactMap.get(DOCCAT_MODEL_ENTRY_NAME);
> >    }
> > +
> > +  public MaxentModel getMaxentModel() {
> > +    return (MaxentModel) artifactMap.get(DOCCAT_MODEL_ENTRY_NAME);
> > +  }
> >  }
> >
> > Modified:
> > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
> > URL:
> > http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java?rev=1587944&r1=1587943&r2=1587944&view=diff
> >
> > ==============================================================================
> > ---
> > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
> > (original)
> > +++
> > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
> > Wed Apr 16 15:26:24 2014
> > @@ -25,7 +25,6 @@ import java.util.Map;
> >
> >  import opennlp.tools.ml.model.MaxentModel;
> >  import opennlp.tools.ml.model.TrainUtil;
> > -import opennlp.tools.tokenize.SimpleTokenizer;
> >  import opennlp.tools.tokenize.Tokenizer;
> >  import opennlp.tools.util.ObjectStream;
> >  import opennlp.tools.util.TrainingParameters;
> > @@ -40,29 +39,35 @@ public class DocumentCategorizerME imple
> >     * Shared default thread safe feature generator.
> >     */
> >    private static FeatureGenerator defaultFeatureGenerator = new
> > BagOfWordsFeatureGenerator();
> > -
> > -  private MaxentModel model;
> > +
> > +  private DoccatModel model;
> >    private DocumentCategorizerContextGenerator mContextGenerator;
> >
> >    /**
> > -   * Initializes a the current instance with a doccat model and custom
> > feature generation.
> > -   * The feature generation must be identical to the configuration at
> > training time.
> > -   *
> > +   * Initializes a the current instance with a doccat model and custom
> > feature
> > +   * generation. The feature generation must be identical to the
> > configuration
> > +   * at training time.
> > +   *
> >     * @param model
> >     * @param featureGenerators
> > +   *
> > +   * @deprecated train a {@link DoccatModel} with a specific
> > +   *             {@link DoccatFactory} to customize the {@link
> > FeatureGenerator}s
> >     */
> >    public DocumentCategorizerME(DoccatModel model, FeatureGenerator...
> > featureGenerators) {
> > -    this.model = model.getChunkerModel();
> > +    this.model = model;
> >      this.mContextGenerator = new
> > DocumentCategorizerContextGenerator(featureGenerators);
> >    }
> > -
> > +
> >    /**
> >     * Initializes the current instance with a doccat model. Default
> > feature generation is used.
> > -   *
> > +   *
> >     * @param model
> >     */
> >    public DocumentCategorizerME(DoccatModel model) {
> > -    this(model, defaultFeatureGenerator);
> > +    this.model = model;
> > +    this.mContextGenerator = new
> > DocumentCategorizerContextGenerator(this.model
> > +        .getFactory().getFeatureGenerators());
> >    }
> >
> >    /**
> > @@ -71,7 +76,7 @@ public class DocumentCategorizerME imple
> >     * @param text
> >     */
> >    public double[] categorize(String text[]) {
> > -    return model.eval(mContextGenerator.getContext(text));
> > +    return
> > model.getMaxentModel().eval(mContextGenerator.getContext(text));
> >    }
> >
> >    /**
> > @@ -79,57 +84,79 @@ public class DocumentCategorizerME imple
> >     * is passed to the feature generation.
> >     */
> >    public double[] categorize(String documentText) {
> > -    Tokenizer tokenizer = SimpleTokenizer.INSTANCE;
> > +    Tokenizer tokenizer = model.getFactory().getTokenizer();
> >      return categorize(tokenizer.tokenize(documentText));
> >    }
> >
> >    public String getBestCategory(double[] outcome) {
> > -    return model.getBestOutcome(outcome);
> > +    return model.getMaxentModel().getBestOutcome(outcome);
> >    }
> >
> >    public int getIndex(String category) {
> > -    return model.getIndex(category);
> > +    return model.getMaxentModel().getIndex(category);
> >    }
> >
> >    public String getCategory(int index) {
> > -    return model.getOutcome(index);
> > +    return model.getMaxentModel().getOutcome(index);
> >    }
> >
> >    public int getNumberOfCategories() {
> > -    return model.getNumOutcomes();
> > +    return model.getMaxentModel().getNumOutcomes();
> >    }
> >
> >    public String getAllResults(double results[]) {
> > -    return model.getAllOutcomes(results);
> > +    return model.getMaxentModel().getAllOutcomes(results);
> >    }
> >
> > +   /**
> > +   * @deprecated Use
> > +   *             {@link #train(String, ObjectStream, TrainingParameters,
> > DoccatFactory)}
> > +   *             instead.
> > +   */
> >     public static DoccatModel train(String languageCode,
> > ObjectStream<DocumentSample> samples,
> >         TrainingParameters mlParams, FeatureGenerator... featureGenerators)
> >     throws IOException {
> > -
> > +
> >       if (featureGenerators.length == 0) {
> >         featureGenerators = new
> > FeatureGenerator[]{defaultFeatureGenerator};
> >       }
> > -
> > +
> >       Map<String, String> manifestInfoEntries = new HashMap<String,
> > String>();
> > -
> > +
> >       MaxentModel model = TrainUtil.train(
> >           new DocumentCategorizerEventStream(samples, featureGenerators),
> >           mlParams.getSettings(), manifestInfoEntries);
> > -
> > +
> >       return new DoccatModel(languageCode, model, manifestInfoEntries);
> >     }
> > -
> > +
> > +   public static DoccatModel train(String languageCode,
> > ObjectStream<DocumentSample> samples,
> > +       TrainingParameters mlParams, DoccatFactory factory)
> > +   throws IOException {
> > +
> > +     Map<String, String> manifestInfoEntries = new HashMap<String,
> > String>();
> > +
> > +     MaxentModel model = TrainUtil.train(
> > +         new DocumentCategorizerEventStream(samples,
> > factory.getFeatureGenerators()),
> > +         mlParams.getSettings(), manifestInfoEntries);
> > +
> > +     return new DoccatModel(languageCode, model, manifestInfoEntries,
> > factory);
> > +   }
> > +
> >    /**
> >     * Trains a doccat model with default feature generation.
> > -   *
> > +   *
> >     * @param languageCode
> >     * @param samples
> > -   *
> > +   *
> >     * @return the trained doccat model
> > -   *
> > +   *
> >     * @throws IOException
> > -   * @throws ObjectStreamException
> > +   * @throws ObjectStreamException
> > +   *
> > +   * @deprecated Use
> > +   *             {@link #train(String, ObjectStream, TrainingParameters,
> > DoccatFactory)}
> > +   *             instead.
> >     */
> >    public static DoccatModel train(String languageCode,
> > ObjectStream<DocumentSample> samples) throws IOException {
> >      return train(languageCode, samples,
> > ModelUtil.createDefaultTrainingParameters(), defaultFeatureGenerator);
> >
> > Modified:
> > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorFactory.java
> > URL:
> > http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorFactory.java?rev=1587944&r1=1587943&r2=1587944&view=diff
> >
> > ==============================================================================
> > ---
> > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorFactory.java
> > (original)
> > +++
> > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorFactory.java
> > Wed Apr 16 15:26:24 2014
> > @@ -52,7 +52,7 @@ public class SentenceDetectorFactory ext
> >    /**
> >     * Creates a {@link SentenceDetectorFactory}. Use this constructor to
> >     * programmatically create a factory.
> > -   *
> > +   *
> >     * @param languageCode
> >     * @param abbreviationDictionary
> >     * @param eosCharacters
> > @@ -61,7 +61,7 @@ public class SentenceDetectorFactory ext
> >        Dictionary abbreviationDictionary, char[] eosCharacters) {
> >      this.init(languageCode, useTokenEnd, abbreviationDictionary,
> > eosCharacters);
> >    }
> > -
> > +
> >    protected void init(String languageCode, boolean useTokenEnd,
> >        Dictionary abbreviationDictionary, char[] eosCharacters) {
> >      this.languageCode = languageCode;
> >
> > Modified:
> > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/ext/ExtensionLoader.java
> > URL:
> > http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/ext/ExtensionLoader.java?rev=1587944&r1=1587943&r2=1587944&view=diff
> >
> > ==============================================================================
> > ---
> > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/ext/ExtensionLoader.java
> > (original)
> > +++
> > opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/ext/ExtensionLoader.java
> > Wed Apr 16 15:26:24 2014
> > @@ -17,6 +17,8 @@
> >
> >  package opennlp.tools.util.ext;
> >
> > +import java.lang.reflect.Field;
> > +
> >  /**
> >   * The {@link ExtensionLoader} is responsible to load extensions to the
> > OpenNLP library.
> >   * <p>
> > @@ -64,6 +66,24 @@ public class ExtensionLoader {
> >          } catch (InstantiationException e) {
> >            throw new ExtensionNotLoadedException(e);
> >          } catch (IllegalAccessException e) {
> > +          // constructor is private. Try to load using INSTANCE
> > +          Field instanceField;
> > +          try {
> > +            instanceField = extClazz.getDeclaredField("INSTANCE");
> > +          } catch (NoSuchFieldException e1) {
> > +            throw new ExtensionNotLoadedException(e1);
> > +          } catch (SecurityException e1) {
> > +            throw new ExtensionNotLoadedException(e1);
> > +          }
> > +          if(instanceField != null) {
> > +            try {
> > +              return (T) instanceField.get(null);
> > +            } catch (IllegalArgumentException e1) {
> > +              throw new ExtensionNotLoadedException(e1);
> > +            } catch (IllegalAccessException e1) {
> > +              throw new ExtensionNotLoadedException(e1);
> > +            }
> > +          }
> >            throw new ExtensionNotLoadedException(e);
> >          }
> >        }
> >
> > Added:
> > opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/doccat/DoccatFactoryTest.java
> > URL:
> > http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/doccat/DoccatFactoryTest.java?rev=1587944&view=auto
> >
> > ==============================================================================
> > ---
> > opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/doccat/DoccatFactoryTest.java
> > (added)
> > +++
> > opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/doccat/DoccatFactoryTest.java
> > Wed Apr 16 15:26:24 2014
> > @@ -0,0 +1,100 @@
> > +package opennlp.tools.doccat;
> > +
> > +import static org.junit.Assert.assertEquals;
> > +import static org.junit.Assert.assertNotNull;
> > +
> > +import java.io.ByteArrayInputStream;
> > +import java.io.ByteArrayOutputStream;
> > +import java.io.IOException;
> > +
> > +import opennlp.tools.formats.ResourceAsStreamFactory;
> > +import opennlp.tools.tokenize.SimpleTokenizer;
> > +import opennlp.tools.tokenize.WhitespaceTokenizer;
> > +import opennlp.tools.util.InputStreamFactory;
> > +import opennlp.tools.util.ObjectStream;
> > +import opennlp.tools.util.PlainTextByLineStream;
> > +import opennlp.tools.util.TrainingParameters;
> > +
> > +import org.junit.Test;
> > +
> > +/**
> > + * Tests for the {@link DoccatFactory} class.
> > + */
> > +public class DoccatFactoryTest {
> > +
> > +  private static ObjectStream<DocumentSample> createSampleStream()
> > +      throws IOException {
> > +
> > +    InputStreamFactory isf = new ResourceAsStreamFactory(
> > +        DoccatFactoryTest.class, "/opennlp/tools/doccat/DoccatSample.txt");
> > +
> > +    return new DocumentSampleStream(new PlainTextByLineStream(isf, "UTF-8"));
> > +  }
> > +
> > +  private static DoccatModel train() throws IOException {
> > +    return DocumentCategorizerME.train("x-unspecified", createSampleStream(),
> > +        TrainingParameters.defaultParams());
> > +  }
> > +
> > +  private static DoccatModel train(DoccatFactory factory) throws IOException {
> > +    return DocumentCategorizerME.train("x-unspecified", createSampleStream(),
> > +        TrainingParameters.defaultParams(), factory);
> > +  }
> > +
> > +  @Test
> > +  public void testDefault() throws IOException {
> > +    DoccatModel model = train();
> > +
> > +    assertNotNull(model);
> > +
> > +    ByteArrayOutputStream out = new ByteArrayOutputStream();
> > +    model.serialize(out);
> > +    ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
> > +
> > +    DoccatModel fromSerialized = new DoccatModel(in);
> > +
> > +    DoccatFactory factory = fromSerialized.getFactory();
> > +
> > +    assertNotNull(factory);
> > +
> > +    assertEquals(1, factory.getFeatureGenerators().length);
> > +    assertEquals(BagOfWordsFeatureGenerator.class,
> > +        factory.getFeatureGenerators()[0].getClass());
> > +
> > +    assertEquals(WhitespaceTokenizer.INSTANCE, factory.getTokenizer());
> > +
> > +  }
> > +
> > +  @Test
> > +  public void testCustom() throws IOException {
> > +    FeatureGenerator[] featureGenerators = { new BagOfWordsFeatureGenerator(),
> > +        new NGramFeatureGenerator() };
> > +    DoccatFactory factory = new DoccatFactory(SimpleTokenizer.INSTANCE,
> > +        featureGenerators);
> > +
> > +    DoccatModel model = train(factory);
> > +
> > +    assertNotNull(model);
> > +
> > +    ByteArrayOutputStream out = new ByteArrayOutputStream();
> > +    model.serialize(out);
> > +    ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
> > +
> > +    DoccatModel fromSerialized = new DoccatModel(in);
> > +
> > +    factory = fromSerialized.getFactory();
> > +
> > +    assertNotNull(factory);
> > +
> > +    assertEquals(2, factory.getFeatureGenerators().length);
> > +    assertEquals(BagOfWordsFeatureGenerator.class,
> > +        factory.getFeatureGenerators()[0].getClass());
> > +    assertEquals(NGramFeatureGenerator.class,
> > +        factory.getFeatureGenerators()[1].getClass());
> > +
> > +    assertEquals(SimpleTokenizer.INSTANCE.getClass(), factory.getTokenizer()
> > +        .getClass());
> > +
> > +  }
> > +
> > +}
> >
> > Propchange:
> > opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/doccat/DoccatFactoryTest.java
> >
> > ------------------------------------------------------------------------------
> >     svn:mime-type = text/plain
> >
> >
> >

Re: svn commit: r1587944 [1/2] - in /opennlp/trunk/opennlp-tools/src: main/java/opennlp/tools/cmdline/doccat/ main/java/opennlp/tools/doccat/ main/java/opennlp/tools/sentdetect/ main/java/opennlp/tools/util/ext/ test/java/opennlp/tools/doccat/ test/resourc...

Reply via email to