resourc...

William Colen Wed, 16 Apr 2014 08:38:40 -0700

Jörn,

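Can you please review my change to the ExtensionLoader? I modified it to
also accept singletons: when a class cannot be instantiated because its
constructor is private, the loader now falls back to returning the value
of its static INSTANCE field.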


Thank you,
William


2014-04-16 12:26 GMT-03:00 <co...@apache.org>:

> Author: colen
> Date: Wed Apr 16 15:26:24 2014
> New Revision: 1587944
>
> URL: http://svn.apache.org/r1587944
> Log:
> OPENNLP-674 Added factory to Doccat
>
> Added:
>
> opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java
>   (with props)
>
> opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/doccat/DoccatFactoryTest.java
>   (with props)
>     opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/doccat/
>
> opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/doccat/DoccatSample.txt
>   (with props)
> Modified:
>
> opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java
>
> opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java
>
> opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/TrainingParams.java
>
> opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatCrossValidator.java
>
> opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatModel.java
>
> opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
>
> opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorFactory.java
>
> opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/ext/ExtensionLoader.java
>
> Modified:
> opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java
> URL:
> http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java?rev=1587944&r1=1587943&r2=1587944&view=diff
>
> ==============================================================================
> ---
> opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java
> (original)
> +++
> opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java
> Wed Apr 16 15:26:24 2014
> @@ -34,8 +34,10 @@ import opennlp.tools.cmdline.doccat.Docc
>  import opennlp.tools.cmdline.params.CVParams;
>  import opennlp.tools.doccat.DoccatCrossValidator;
>  import opennlp.tools.doccat.DoccatEvaluationMonitor;
> +import opennlp.tools.doccat.DoccatFactory;
>  import opennlp.tools.doccat.DocumentSample;
>  import opennlp.tools.doccat.FeatureGenerator;
> +import opennlp.tools.tokenize.Tokenizer;
>  import opennlp.tools.util.eval.EvaluationMonitor;
>  import opennlp.tools.util.model.ModelUtil;
>
> @@ -88,13 +90,18 @@ public final class DoccatCrossValidatorT
>      FeatureGenerator[] featureGenerators = DoccatTrainerTool
>          .createFeatureGenerators(params.getFeatureGenerators());
>
> +    Tokenizer tokenizer = DoccatTrainerTool.createTokenizer(params
> +        .getTokenizer());
> +
>      DoccatEvaluationMonitor[] listenersArr = listeners
>          .toArray(new DoccatEvaluationMonitor[listeners.size()]);
>
>      DoccatCrossValidator validator;
>      try {
> +      DoccatFactory factory = DoccatFactory.create(params.getFactory(),
> +          tokenizer, featureGenerators);
>        validator = new DoccatCrossValidator(params.getLang(), mlParams,
> -          featureGenerators, listenersArr);
> +          factory, listenersArr);
>
>        validator.evaluate(sampleStream, params.getFolds());
>      } catch (IOException e) {
>
> Modified:
> opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java
> URL:
> http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java?rev=1587944&r1=1587943&r2=1587944&view=diff
>
> ==============================================================================
> ---
> opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java
> (original)
> +++
> opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java
> Wed Apr 16 15:26:24 2014
> @@ -26,16 +26,19 @@ import opennlp.tools.cmdline.TerminateTo
>  import opennlp.tools.cmdline.doccat.DoccatTrainerTool.TrainerToolParams;
>  import opennlp.tools.cmdline.params.TrainingToolParams;
>  import opennlp.tools.doccat.BagOfWordsFeatureGenerator;
> +import opennlp.tools.doccat.DoccatFactory;
>  import opennlp.tools.doccat.DoccatModel;
>  import opennlp.tools.doccat.DocumentCategorizerME;
>  import opennlp.tools.doccat.DocumentSample;
>  import opennlp.tools.doccat.FeatureGenerator;
> +import opennlp.tools.tokenize.Tokenizer;
> +import opennlp.tools.tokenize.WhitespaceTokenizer;
>  import opennlp.tools.util.ext.ExtensionLoader;
>  import opennlp.tools.util.model.ModelUtil;
>
>  public class DoccatTrainerTool
>      extends AbstractTrainerTool<DocumentSample, TrainerToolParams> {
> -
> +
>    interface TrainerToolParams extends TrainingParams, TrainingToolParams {
>    }
>
> @@ -47,7 +50,7 @@ public class DoccatTrainerTool
>    public String getShortDescription() {
>      return "trainer for the learnable document categorizer";
>    }
> -
> +
>    @Override
>    public void run(String format, String[] args) {
>      super.run(format, args);
> @@ -64,10 +67,14 @@ public class DoccatTrainerTool
>      FeatureGenerator[] featureGenerators = createFeatureGenerators(params
>          .getFeatureGenerators());
>
> +    Tokenizer tokenizer = createTokenizer(params.getTokenizer());
> +
>      DoccatModel model;
>      try {
> +      DoccatFactory factory = DoccatFactory.create(params.getFactory(),
> +          tokenizer, featureGenerators);
>        model = DocumentCategorizerME.train(params.getLang(), sampleStream,
> -          mlParams, featureGenerators);
> +          mlParams, factory);
>      } catch (IOException e) {
>        throw new TerminateToolException(-1, "IO error while reading
> training data or indexing data: " +
>            e.getMessage(), e);
> @@ -79,10 +86,17 @@ public class DoccatTrainerTool
>          // sorry that this can fail
>        }
>      }
> -
> +
>      CmdLineUtil.writeModel("document categorizer", modelOutFile, model);
>    }
>
> +  static Tokenizer createTokenizer(String tokenizer) {
> +    if(tokenizer != null) {
> +      return ExtensionLoader.instantiateExtension(Tokenizer.class,
> tokenizer);
> +    }
> +    return WhitespaceTokenizer.INSTANCE;
> +  }
> +
>    static FeatureGenerator[] createFeatureGenerators(String
> featureGeneratorsNames) {
>      if(featureGeneratorsNames == null) {
>        FeatureGenerator[] def = {new BagOfWordsFeatureGenerator()};
>
> Modified:
> opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/TrainingParams.java
> URL:
> http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/TrainingParams.java?rev=1587944&r1=1587943&r2=1587944&view=diff
>
> ==============================================================================
> ---
> opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/TrainingParams.java
> (original)
> +++
> opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/TrainingParams.java
> Wed Apr 16 15:26:24 2014
> @@ -32,4 +32,12 @@ interface TrainingParams extends BasicTr
>    @OptionalParameter
>    String getFeatureGenerators();
>
> +  @ParameterDescription(valueName = "tokenizer", description = "Tokenizer
> implementation. WhitespaceTokenizer is used if not specified.")
> +  @OptionalParameter
> +  String getTokenizer();
> +
> +  @ParameterDescription(valueName = "factoryName", description = "A
> sub-class of DoccatFactory where to get implementation and resources.")
> +  @OptionalParameter
> +  String getFactory();
> +
>  }
>
> Modified:
> opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatCrossValidator.java
> URL:
> http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatCrossValidator.java?rev=1587944&r1=1587943&r2=1587944&view=diff
>
> ==============================================================================
> ---
> opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatCrossValidator.java
> (original)
> +++
> opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatCrossValidator.java
> Wed Apr 16 15:26:24 2014
> @@ -34,18 +34,19 @@ public class DoccatCrossValidator {
>
>    private DoccatEvaluationMonitor[] listeners;
>
> -  private FeatureGenerator[] featureGenarators;
> +  private DoccatFactory factory;
> +
>
>    /**
>     * Creates a {@link DoccatCrossValidator} with the given
>     * {@link FeatureGenerator}s.
>     */
>    public DoccatCrossValidator(String languageCode, TrainingParameters
> mlParams,
> -      FeatureGenerator[] featureGenerators, DoccatEvaluationMonitor[]
> listeners) {
> +      DoccatFactory factory, DoccatEvaluationMonitor ... listeners) {
>      this.languageCode = languageCode;
>      this.params = mlParams;
>      this.listeners = listeners;
> -    this.featureGenarators = featureGenerators;
> +    this.factory = factory;
>    }
>
>    /**
> @@ -70,7 +71,7 @@ public class DoccatCrossValidator {
>            .next();
>
>        DoccatModel model = DocumentCategorizerME.train(languageCode,
> -          trainingSampleStream, params, featureGenarators);
> +          trainingSampleStream, params, factory);
>
>        DocumentCategorizerEvaluator evaluator = new
> DocumentCategorizerEvaluator(
>            new DocumentCategorizerME(model), listeners);
>
> Added:
> opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java
> URL:
> http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java?rev=1587944&view=auto
>
> ==============================================================================
> ---
> opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java
> (added)
> +++
> opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java
> Wed Apr 16 15:26:24 2014
> @@ -0,0 +1,174 @@
> +/*
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements.  See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License. You may obtain a copy of the License at
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +package opennlp.tools.doccat;
> +
> +import java.util.Arrays;
> +import java.util.Iterator;
> +import java.util.List;
> +import java.util.Map;
> +
> +import opennlp.tools.tokenize.Tokenizer;
> +import opennlp.tools.tokenize.WhitespaceTokenizer;
> +import opennlp.tools.util.BaseToolFactory;
> +import opennlp.tools.util.InvalidFormatException;
> +import opennlp.tools.util.ext.ExtensionLoader;
> +
> +/**
> + * The factory that provides Doccat default implementations and resources
> + */
> +public class DoccatFactory extends BaseToolFactory {
> +
> +  private static final String FEATURE_GENERATORS =
> "doccat.featureGenerators";
> +  private static final String TOKENIZER_NAME = "doccat.tokenizer";
> +
> +  private FeatureGenerator[] featureGenerators;
> +  private Tokenizer tokenizer;
> +
> +  /**
> +   * Creates a {@link DoccatFactory} that provides the default
> implementation of
> +   * the resources.
> +   */
> +  public DoccatFactory() {
> +  }
> +
> +  /**
> +   * Creates a {@link DoccatFactory}. Use this constructor to
> programmatically
> +   * create a factory.
> +   *
> +   * @param tokenizer
> +   * @param featureGenerators
> +   */
> +  public DoccatFactory(Tokenizer tokenizer, FeatureGenerator[]
> featureGenerators) {
> +    this.init(tokenizer, featureGenerators);
> +  }
> +
> +  protected void init(Tokenizer tokenizer, FeatureGenerator[]
> featureGenerators) {
> +
> +    this.featureGenerators = featureGenerators;
> +    this.tokenizer = tokenizer;
> +  }
> +
> +  @Override
> +  public Map<String, String> createManifestEntries() {
> +    Map<String, String> manifestEntries = super.createManifestEntries();
> +
> +    if (getTokenizer() != null) {
> +      manifestEntries.put(TOKENIZER_NAME, getTokenizer().getClass()
> +          .getCanonicalName());
> +    }
> +
> +    if (getFeatureGenerators() != null) {
> +      manifestEntries.put(FEATURE_GENERATORS,
> featureGeneratorsAsString());
> +    }
> +
> +    return manifestEntries;
> +  }
> +
> +  private String featureGeneratorsAsString() {
> +    List<FeatureGenerator> fgs = Arrays.asList(getFeatureGenerators());
> +    Iterator<FeatureGenerator> iter = fgs.iterator();
> +    StringBuilder sb = new StringBuilder();
> +    if (iter.hasNext()) {
> +      sb.append(iter.next().getClass().getCanonicalName());
> +      while (iter.hasNext()) {
> +        sb.append(',').append(iter.next().getClass().getCanonicalName());
> +      }
> +    }
> +    return sb.toString();
> +  }
> +
> +  @Override
> +  public void validateArtifactMap() throws InvalidFormatException {
> +    // nothing to validate
> +  }
> +
> +  public static DoccatFactory create(String subclassName, Tokenizer
> tokenizer,
> +      FeatureGenerator[] featureGenerators) throws InvalidFormatException
> {
> +    if (subclassName == null) {
> +      // will create the default factory
> +      return new DoccatFactory(tokenizer, featureGenerators);
> +    }
> +    try {
> +      DoccatFactory theFactory = ExtensionLoader.instantiateExtension(
> +          DoccatFactory.class, subclassName);
> +      theFactory.init(tokenizer, featureGenerators);
> +      return theFactory;
> +    } catch (Exception e) {
> +      String msg = "Could not instantiate the " + subclassName
> +          + ". The initialization throw an exception.";
> +      System.err.println(msg);
> +      e.printStackTrace();
> +      throw new InvalidFormatException(msg, e);
> +    }
> +
> +  }
> +
> +  private FeatureGenerator[] loadFeatureGenerators(String classNames) {
> +    String[] classes = classNames.split(",");
> +    FeatureGenerator[] fgs = new FeatureGenerator[classes.length];
> +
> +    for (int i = 0; i < classes.length; i++) {
> +      fgs[i] =
> ExtensionLoader.instantiateExtension(FeatureGenerator.class,
> +          classes[i]);
> +    }
> +    return fgs;
> +  }
> +
> +  public FeatureGenerator[] getFeatureGenerators() {
> +    if (featureGenerators == null) {
> +      if (artifactProvider != null) {
> +        String classNames = artifactProvider
> +            .getManifestProperty(FEATURE_GENERATORS);
> +        if (classNames != null) {
> +          this.featureGenerators = loadFeatureGenerators(classNames);
> +        }
> +      }
> +      if (featureGenerators == null) { // could not load using artifact
> provider
> +        // load bag of words as default
> +        FeatureGenerator[] bow = { new BagOfWordsFeatureGenerator() };
> +        this.featureGenerators = bow;
> +      }
> +    }
> +    return featureGenerators;
> +  }
> +
> +  public void setFeatureGenerators(FeatureGenerator[] featureGenerators) {
> +    this.featureGenerators = featureGenerators;
> +  }
> +
> +  public Tokenizer getTokenizer() {
> +    if (this.tokenizer == null) {
> +      if (artifactProvider != null) {
> +        String className =
> artifactProvider.getManifestProperty(TOKENIZER_NAME);
> +        if (className != null) {
> +          this.tokenizer = ExtensionLoader.instantiateExtension(
> +              Tokenizer.class, className);
> +        }
> +      }
> +      if (this.tokenizer == null) { // could not load using artifact
> provider
> +        this.tokenizer = WhitespaceTokenizer.INSTANCE;
> +      }
> +    }
> +    return tokenizer;
> +  }
> +
> +  public void setTokenizer(Tokenizer tokenizer) {
> +    this.tokenizer = tokenizer;
> +  }
> +
> +}
>
> Propchange:
> opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java
>
> ------------------------------------------------------------------------------
>     svn:mime-type = text/plain
>
> Modified:
> opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatModel.java
> URL:
> http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatModel.java?rev=1587944&r1=1587943&r2=1587944&view=diff
>
> ==============================================================================
> ---
> opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatModel.java
> (original)
> +++
> opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatModel.java
> Wed Apr 16 15:26:24 2014
> @@ -25,34 +25,50 @@ import java.util.Map;
>
>  import opennlp.tools.ml.model.AbstractModel;
>  import opennlp.tools.ml.model.MaxentModel;
> +import opennlp.tools.util.BaseToolFactory;
>  import opennlp.tools.util.InvalidFormatException;
>  import opennlp.tools.util.model.BaseModel;
>
>  public class DoccatModel extends BaseModel {
> -
> +
>    private static final String COMPONENT_NAME = "DocumentCategorizerME";
>    private static final String DOCCAT_MODEL_ENTRY_NAME = "doccat.model";
> -
> -  protected DoccatModel(String languageCode, MaxentModel doccatModel,
> -      Map<String, String> manifestInfoEntries) {
> -    super(COMPONENT_NAME, languageCode, manifestInfoEntries);
> -
> +
> +  public DoccatModel(String languageCode, MaxentModel doccatModel,
> +      Map<String, String> manifestInfoEntries, DoccatFactory factory) {
> +    super(COMPONENT_NAME, languageCode, manifestInfoEntries, factory);
> +
>      artifactMap.put(DOCCAT_MODEL_ENTRY_NAME, doccatModel);
>      checkArtifactMap();
>    }
> -
> +
> +  /**
> +   * @deprecated Use
> +   *             {@link #DoccatModel(String, MaxentModel, Map,
> DoccatFactory)}
> +   *             instead and pass in a {@link DoccatFactory}
> +   */
> +  protected DoccatModel(String languageCode, MaxentModel doccatModel,
> +      Map<String, String> manifestInfoEntries) {
> +    this(languageCode, doccatModel, manifestInfoEntries, new
> DoccatFactory());
> +  }
> +
> +  /**
> +   * @deprecated Use
> +   *             {@link #DoccatModel(String, MaxentModel, Map,
> DoccatFactory)}
> +   *             instead and pass in a {@link DoccatFactory}
> +   */
>    public DoccatModel(String languageCode, MaxentModel doccatModel) {
>      this(languageCode, doccatModel, null);
>    }
> -
> +
>    public DoccatModel(InputStream in) throws IOException,
> InvalidFormatException {
>      super(COMPONENT_NAME, in);
>    }
> -
> +
>    public DoccatModel(File modelFile) throws IOException,
> InvalidFormatException {
>      super(COMPONENT_NAME, modelFile);
>    }
> -
> +
>    public DoccatModel(URL modelURL) throws IOException,
> InvalidFormatException {
>      super(COMPONENT_NAME, modelURL);
>    }
> @@ -66,7 +82,23 @@ public class DoccatModel extends BaseMod
>      }
>    }
>
> +  public DoccatFactory getFactory() {
> +    return (DoccatFactory) this.toolFactory;
> +  }
> +
> +  @Override
> +  protected Class<? extends BaseToolFactory> getDefaultFactory() {
> +    return DoccatFactory.class;
> +  }
> +
> +  /**
> +   * @deprecated Use {@link #getMaxentModel()} instead.
> +   */
>    public MaxentModel getChunkerModel() {
>      return (MaxentModel) artifactMap.get(DOCCAT_MODEL_ENTRY_NAME);
>    }
> +
> +  public MaxentModel getMaxentModel() {
> +    return (MaxentModel) artifactMap.get(DOCCAT_MODEL_ENTRY_NAME);
> +  }
>  }
>
> Modified:
> opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
> URL:
> http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java?rev=1587944&r1=1587943&r2=1587944&view=diff
>
> ==============================================================================
> ---
> opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
> (original)
> +++
> opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
> Wed Apr 16 15:26:24 2014
> @@ -25,7 +25,6 @@ import java.util.Map;
>
>  import opennlp.tools.ml.model.MaxentModel;
>  import opennlp.tools.ml.model.TrainUtil;
> -import opennlp.tools.tokenize.SimpleTokenizer;
>  import opennlp.tools.tokenize.Tokenizer;
>  import opennlp.tools.util.ObjectStream;
>  import opennlp.tools.util.TrainingParameters;
> @@ -40,29 +39,35 @@ public class DocumentCategorizerME imple
>     * Shared default thread safe feature generator.
>     */
>    private static FeatureGenerator defaultFeatureGenerator = new
> BagOfWordsFeatureGenerator();
> -
> -  private MaxentModel model;
> +
> +  private DoccatModel model;
>    private DocumentCategorizerContextGenerator mContextGenerator;
>
>    /**
> -   * Initializes a the current instance with a doccat model and custom
> feature generation.
> -   * The feature generation must be identical to the configuration at
> training time.
> -   *
> +   * Initializes a the current instance with a doccat model and custom
> feature
> +   * generation. The feature generation must be identical to the
> configuration
> +   * at training time.
> +   *
>     * @param model
>     * @param featureGenerators
> +   *
> +   * @deprecated train a {@link DoccatModel} with a specific
> +   *             {@link DoccatFactory} to customize the {@link
> FeatureGenerator}s
>     */
>    public DocumentCategorizerME(DoccatModel model, FeatureGenerator...
> featureGenerators) {
> -    this.model = model.getChunkerModel();
> +    this.model = model;
>      this.mContextGenerator = new
> DocumentCategorizerContextGenerator(featureGenerators);
>    }
> -
> +
>    /**
>     * Initializes the current instance with a doccat model. Default
> feature generation is used.
> -   *
> +   *
>     * @param model
>     */
>    public DocumentCategorizerME(DoccatModel model) {
> -    this(model, defaultFeatureGenerator);
> +    this.model = model;
> +    this.mContextGenerator = new
> DocumentCategorizerContextGenerator(this.model
> +        .getFactory().getFeatureGenerators());
>    }
>
>    /**
> @@ -71,7 +76,7 @@ public class DocumentCategorizerME imple
>     * @param text
>     */
>    public double[] categorize(String text[]) {
> -    return model.eval(mContextGenerator.getContext(text));
> +    return
> model.getMaxentModel().eval(mContextGenerator.getContext(text));
>    }
>
>    /**
> @@ -79,57 +84,79 @@ public class DocumentCategorizerME imple
>     * is passed to the feature generation.
>     */
>    public double[] categorize(String documentText) {
> -    Tokenizer tokenizer = SimpleTokenizer.INSTANCE;
> +    Tokenizer tokenizer = model.getFactory().getTokenizer();
>      return categorize(tokenizer.tokenize(documentText));
>    }
>
>    public String getBestCategory(double[] outcome) {
> -    return model.getBestOutcome(outcome);
> +    return model.getMaxentModel().getBestOutcome(outcome);
>    }
>
>    public int getIndex(String category) {
> -    return model.getIndex(category);
> +    return model.getMaxentModel().getIndex(category);
>    }
>
>    public String getCategory(int index) {
> -    return model.getOutcome(index);
> +    return model.getMaxentModel().getOutcome(index);
>    }
>
>    public int getNumberOfCategories() {
> -    return model.getNumOutcomes();
> +    return model.getMaxentModel().getNumOutcomes();
>    }
>
>    public String getAllResults(double results[]) {
> -    return model.getAllOutcomes(results);
> +    return model.getMaxentModel().getAllOutcomes(results);
>    }
>
> +   /**
> +   * @deprecated Use
> +   *             {@link #train(String, ObjectStream, TrainingParameters,
> DoccatFactory)}
> +   *             instead.
> +   */
>     public static DoccatModel train(String languageCode,
> ObjectStream<DocumentSample> samples,
>         TrainingParameters mlParams, FeatureGenerator... featureGenerators)
>     throws IOException {
> -
> +
>       if (featureGenerators.length == 0) {
>         featureGenerators = new
> FeatureGenerator[]{defaultFeatureGenerator};
>       }
> -
> +
>       Map<String, String> manifestInfoEntries = new HashMap<String,
> String>();
> -
> +
>       MaxentModel model = TrainUtil.train(
>           new DocumentCategorizerEventStream(samples, featureGenerators),
>           mlParams.getSettings(), manifestInfoEntries);
> -
> +
>       return new DoccatModel(languageCode, model, manifestInfoEntries);
>     }
> -
> +
> +   public static DoccatModel train(String languageCode,
> ObjectStream<DocumentSample> samples,
> +       TrainingParameters mlParams, DoccatFactory factory)
> +   throws IOException {
> +
> +     Map<String, String> manifestInfoEntries = new HashMap<String,
> String>();
> +
> +     MaxentModel model = TrainUtil.train(
> +         new DocumentCategorizerEventStream(samples,
> factory.getFeatureGenerators()),
> +         mlParams.getSettings(), manifestInfoEntries);
> +
> +     return new DoccatModel(languageCode, model, manifestInfoEntries,
> factory);
> +   }
> +
>    /**
>     * Trains a doccat model with default feature generation.
> -   *
> +   *
>     * @param languageCode
>     * @param samples
> -   *
> +   *
>     * @return the trained doccat model
> -   *
> +   *
>     * @throws IOException
> -   * @throws ObjectStreamException
> +   * @throws ObjectStreamException
> +   *
> +   * @deprecated Use
> +   *             {@link #train(String, ObjectStream, TrainingParameters,
> DoccatFactory)}
> +   *             instead.
>     */
>    public static DoccatModel train(String languageCode,
> ObjectStream<DocumentSample> samples) throws IOException {
>      return train(languageCode, samples,
> ModelUtil.createDefaultTrainingParameters(), defaultFeatureGenerator);
>
> Modified:
> opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorFactory.java
> URL:
> http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorFactory.java?rev=1587944&r1=1587943&r2=1587944&view=diff
>
> ==============================================================================
> ---
> opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorFactory.java
> (original)
> +++
> opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorFactory.java
> Wed Apr 16 15:26:24 2014
> @@ -52,7 +52,7 @@ public class SentenceDetectorFactory ext
>    /**
>     * Creates a {@link SentenceDetectorFactory}. Use this constructor to
>     * programmatically create a factory.
> -   *
> +   *
>     * @param languageCode
>     * @param abbreviationDictionary
>     * @param eosCharacters
> @@ -61,7 +61,7 @@ public class SentenceDetectorFactory ext
>        Dictionary abbreviationDictionary, char[] eosCharacters) {
>      this.init(languageCode, useTokenEnd, abbreviationDictionary,
> eosCharacters);
>    }
> -
> +
>    protected void init(String languageCode, boolean useTokenEnd,
>        Dictionary abbreviationDictionary, char[] eosCharacters) {
>      this.languageCode = languageCode;
>
> Modified:
> opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/ext/ExtensionLoader.java
> URL:
> http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/ext/ExtensionLoader.java?rev=1587944&r1=1587943&r2=1587944&view=diff
>
> ==============================================================================
> ---
> opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/ext/ExtensionLoader.java
> (original)
> +++
> opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/ext/ExtensionLoader.java
> Wed Apr 16 15:26:24 2014
> @@ -17,6 +17,8 @@
>
>  package opennlp.tools.util.ext;
>
> +import java.lang.reflect.Field;
> +
>  /**
>   * The {@link ExtensionLoader} is responsible to load extensions to the
> OpenNLP library.
>   * <p>
> @@ -64,6 +66,24 @@ public class ExtensionLoader {
>          } catch (InstantiationException e) {
>            throw new ExtensionNotLoadedException(e);
>          } catch (IllegalAccessException e) {
> +          // constructor is private. Try to load using INSTANCE
> +          Field instanceField;
> +          try {
> +            instanceField = extClazz.getDeclaredField("INSTANCE");
> +          } catch (NoSuchFieldException e1) {
> +            throw new ExtensionNotLoadedException(e1);
> +          } catch (SecurityException e1) {
> +            throw new ExtensionNotLoadedException(e1);
> +          }
> +          if(instanceField != null) {
> +            try {
> +              return (T) instanceField.get(null);
> +            } catch (IllegalArgumentException e1) {
> +              throw new ExtensionNotLoadedException(e1);
> +            } catch (IllegalAccessException e1) {
> +              throw new ExtensionNotLoadedException(e1);
> +            }
> +          }
>            throw new ExtensionNotLoadedException(e);
>          }
>        }
>
> Added:
> opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/doccat/DoccatFactoryTest.java
> URL:
> http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/doccat/DoccatFactoryTest.java?rev=1587944&view=auto
>
> ==============================================================================
> ---
> opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/doccat/DoccatFactoryTest.java
> (added)
> +++
> opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/doccat/DoccatFactoryTest.java
> Wed Apr 16 15:26:24 2014
> @@ -0,0 +1,100 @@
> +package opennlp.tools.doccat;
> +
> +import static org.junit.Assert.assertEquals;
> +import static org.junit.Assert.assertNotNull;
> +
> +import java.io.ByteArrayInputStream;
> +import java.io.ByteArrayOutputStream;
> +import java.io.IOException;
> +
> +import opennlp.tools.formats.ResourceAsStreamFactory;
> +import opennlp.tools.tokenize.SimpleTokenizer;
> +import opennlp.tools.tokenize.WhitespaceTokenizer;
> +import opennlp.tools.util.InputStreamFactory;
> +import opennlp.tools.util.ObjectStream;
> +import opennlp.tools.util.PlainTextByLineStream;
> +import opennlp.tools.util.TrainingParameters;
> +
> +import org.junit.Test;
> +
> +/**
> + * Tests for the {@link DoccatFactory} class.
> + */
> +public class DoccatFactoryTest {
> +
> +  private static ObjectStream<DocumentSample> createSampleStream()
> +      throws IOException {
> +
> +    InputStreamFactory isf = new ResourceAsStreamFactory(
> +        DoccatFactoryTest.class,
> "/opennlp/tools/doccat/DoccatSample.txt");
> +
> +    return new DocumentSampleStream(new PlainTextByLineStream(isf,
> "UTF-8"));
> +  }
> +
> +  private static DoccatModel train() throws IOException {
> +    return DocumentCategorizerME.train("x-unspecified",
> createSampleStream(),
> +        TrainingParameters.defaultParams());
> +  }
> +
> +  private static DoccatModel train(DoccatFactory factory) throws
> IOException {
> +    return DocumentCategorizerME.train("x-unspecified",
> createSampleStream(),
> +        TrainingParameters.defaultParams(), factory);
> +  }
> +
> +  @Test
> +  public void testDefault() throws IOException {
> +    DoccatModel model = train();
> +
> +    assertNotNull(model);
> +
> +    ByteArrayOutputStream out = new ByteArrayOutputStream();
> +    model.serialize(out);
> +    ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
> +
> +    DoccatModel fromSerialized = new DoccatModel(in);
> +
> +    DoccatFactory factory = fromSerialized.getFactory();
> +
> +    assertNotNull(factory);
> +
> +    assertEquals(1, factory.getFeatureGenerators().length);
> +    assertEquals(BagOfWordsFeatureGenerator.class,
> +        factory.getFeatureGenerators()[0].getClass());
> +
> +    assertEquals(WhitespaceTokenizer.INSTANCE, factory.getTokenizer());
> +
> +  }
> +
> +  @Test
> +  public void testCustom() throws IOException {
> +    FeatureGenerator[] featureGenerators = { new
> BagOfWordsFeatureGenerator(),
> +        new NGramFeatureGenerator() };
> +    DoccatFactory factory = new DoccatFactory(SimpleTokenizer.INSTANCE,
> +        featureGenerators);
> +
> +    DoccatModel model = train(factory);
> +
> +    assertNotNull(model);
> +
> +    ByteArrayOutputStream out = new ByteArrayOutputStream();
> +    model.serialize(out);
> +    ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
> +
> +    DoccatModel fromSerialized = new DoccatModel(in);
> +
> +    factory = fromSerialized.getFactory();
> +
> +    assertNotNull(factory);
> +
> +    assertEquals(2, factory.getFeatureGenerators().length);
> +    assertEquals(BagOfWordsFeatureGenerator.class,
> +        factory.getFeatureGenerators()[0].getClass());
> +    assertEquals(NGramFeatureGenerator.class,
> +        factory.getFeatureGenerators()[1].getClass());
> +
> +    assertEquals(SimpleTokenizer.INSTANCE.getClass(),
> factory.getTokenizer()
> +        .getClass());
> +
> +  }
> +
> +}
>
> Propchange:
> opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/doccat/DoccatFactoryTest.java
>
> ------------------------------------------------------------------------------
>     svn:mime-type = text/plain
>
>
>

Re: svn commit: r1587944 [1/2] - in /opennlp/trunk/opennlp-tools/src: main/java/opennlp/tools/cmdline/doccat/ main/java/opennlp/tools/doccat/ main/java/opennlp/tools/sentdetect/ main/java/opennlp/tools/util/ext/ test/java/opennlp/tools/doccat/ test/resourc...

Reply via email to