Repository: opennlp Updated Branches: refs/heads/trunk c4c4fd3f4 -> 4a4b591f0
Remove references to PlainTextByLineStream constructor that takes InputStream The PlainTextByLineStream that takes InputStream was deprecated for a while. We can safely remove it after reviewing internal code that was still using it. Left the deprecated code that was using the constructor for later work. See issue OPENNLP-882 Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/4a4b591f Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/4a4b591f Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/4a4b591f Branch: refs/heads/trunk Commit: 4a4b591f0944add15a6c718980d863f147f6dad4 Parents: c4c4fd3 Author: William Colen <[email protected]> Authored: Thu Nov 10 11:40:41 2016 -0200 Committer: William Colen <[email protected]> Committed: Thu Nov 10 11:40:41 2016 -0200 ---------------------------------------------------------------------- .../namefind/CensusDictionaryCreatorTool.java | 16 ++--- .../formats/LeipzigDoccatSampleStream.java | 9 +-- .../LeipzigDocumentSampleStreamFactory.java | 5 +- .../formats/NameFinderCensus90NameStream.java | 19 ++++++ .../parser/chunking/ParserEventStream.java | 7 +- .../parser/treeinsert/ParserEventStream.java | 20 ++++-- .../tools/postag/WordTagSampleStream.java | 7 +- .../formats/LeipzigDoccatSampleStreamTest.java | 10 +-- .../NameFinderCensus90NameStreamTest.java | 17 +++- .../tools/postag/POSTaggerFactoryTest.java | 17 +++- .../opennlp/tools/postag/POSTaggerMETest.java | 16 +++- .../uima/namefind/NameFinderTrainer.java | 42 ++++++------ .../opennlp/uima/tokenize/TokenizerTrainer.java | 70 ++++++++++---------- 13 files changed, 143 insertions(+), 112 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/4a4b591f/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/CensusDictionaryCreatorTool.java 
---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/CensusDictionaryCreatorTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/CensusDictionaryCreatorTool.java index 6798938..8159ef0 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/CensusDictionaryCreatorTool.java +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/CensusDictionaryCreatorTool.java @@ -18,7 +18,6 @@ package opennlp.tools.cmdline.namefind; import java.io.File; -import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; @@ -31,6 +30,7 @@ import opennlp.tools.cmdline.CmdLineUtil; import opennlp.tools.cmdline.TerminateToolException; import opennlp.tools.dictionary.Dictionary; import opennlp.tools.formats.NameFinderCensus90NameStream; +import opennlp.tools.util.InputStreamFactory; import opennlp.tools.util.ObjectStream; import opennlp.tools.util.StringList; @@ -106,23 +106,17 @@ public class CensusDictionaryCreatorTool extends BasicCmdLineTool { CmdLineUtil.checkInputFile("Name data", testData); CmdLineUtil.checkOutputFile("Dictionary file", dictOutFile); - FileInputStream sampleDataIn = CmdLineUtil.openInFile(testData); - ObjectStream<StringList> sampleStream = new NameFinderCensus90NameStream(sampleDataIn, - Charset.forName(params.getEncoding())); + InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(testData); Dictionary mDictionary; - try { + try ( + ObjectStream<StringList> sampleStream = new NameFinderCensus90NameStream( + sampleDataIn, Charset.forName(params.getEncoding()))) { System.out.println("Creating Dictionary..."); mDictionary = createDictionary(sampleStream); } catch (IOException e) { throw new TerminateToolException(-1, "IO error while reading training data or indexing data: " + e.getMessage(), e); - } finally { - try { - sampleStream.close(); - } catch(IOException 
e) { - // sorry this can fail.. - } } System.out.println("Saving Dictionary..."); http://git-wip-us.apache.org/repos/asf/opennlp/blob/4a4b591f/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java index d28beb7..0af66ae 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java @@ -18,12 +18,13 @@ package opennlp.tools.formats; import java.io.IOException; -import java.io.InputStream; import java.io.PrintStream; +import java.nio.charset.StandardCharsets; import opennlp.tools.doccat.DocumentSample; import opennlp.tools.tokenize.SimpleTokenizer; import opennlp.tools.util.FilterObjectStream; +import opennlp.tools.util.InputStreamFactory; import opennlp.tools.util.PlainTextByLineStream; /** @@ -51,13 +52,13 @@ public class LeipzigDoccatSampleStream extends * @throws IOException IOException */ LeipzigDoccatSampleStream(String language, int sentencesPerDocument, - InputStream in) throws IOException { - super(new PlainTextByLineStream(in, "UTF-8")); + InputStreamFactory in) throws IOException { + super(new PlainTextByLineStream(in, StandardCharsets.UTF_8)); System.setOut(new PrintStream(System.out, true, "UTF-8")); this.language = language; this.sentencesPerDocument = sentencesPerDocument; } - + public DocumentSample read() throws IOException { int count = 0; http://git-wip-us.apache.org/repos/asf/opennlp/blob/4a4b591f/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java 
b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java index 37dac7e..c5e5c26 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java @@ -70,8 +70,9 @@ public class LeipzigDocumentSampleStreamFactory for (int i = 0; i < sentencesFiles.length; i++) { try { - sampleStreams[i] = new LeipzigDoccatSampleStream(sentencesFiles[i].getName().substring(0, 3), 20, - CmdLineUtil.openInFile(sentencesFiles[i])); + sampleStreams[i] = new LeipzigDoccatSampleStream( + sentencesFiles[i].getName().substring(0, 3), 20, + CmdLineUtil.createInputStreamFactory(sentencesFiles[i])); } catch (IOException e) { throw new TerminateToolException(-1, "IO error while opening sample data: " + e.getMessage(), e); } http://git-wip-us.apache.org/repos/asf/opennlp/blob/4a4b591f/opennlp-tools/src/main/java/opennlp/tools/formats/NameFinderCensus90NameStream.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/NameFinderCensus90NameStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/NameFinderCensus90NameStream.java index ee1d15a..ee3f933 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/NameFinderCensus90NameStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/NameFinderCensus90NameStream.java @@ -20,6 +20,7 @@ import java.io.InputStream; import java.nio.charset.Charset; import java.util.Locale; +import opennlp.tools.util.InputStreamFactory; import opennlp.tools.util.ObjectStream; import opennlp.tools.util.PlainTextByLineStream; import opennlp.tools.util.StringList; @@ -64,8 +65,26 @@ public class NameFinderCensus90NameStream implements ObjectStream<StringList> { * This constructor takes an <code>InputStream</code> and a <code>Charset</code> * and opens an associated stream object with the specified 
encoding specified. * + * @param in an <code>InputStreamFactory</code> for the input file. + * @param encoding the <code>Charset</code> to apply to the input stream. + * @throws IOException + */ + public NameFinderCensus90NameStream(InputStreamFactory in, Charset encoding) + throws IOException { + this.locale = new Locale("en"); // locale is English + this.encoding = encoding; + this.lineStream = new PlainTextByLineStream(in, this.encoding); + } + + + /** + * This constructor takes an <code>InputStream</code> and a <code>Charset</code> + * and opens an associated stream object with the specified encoding specified. + * * @param in an <code>InputStream</code> for the input file. * @param encoding the <code>Charset</code> to apply to the input stream. + * + * @deprecated use {@link NameFinderCensus90NameStream#NameFinderCensus90NameStream(InputStreamFactory, Charset)} */ public NameFinderCensus90NameStream(InputStream in, Charset encoding) { this.locale = new Locale("en"); // locale is English http://git-wip-us.apache.org/repos/asf/opennlp/blob/4a4b591f/opennlp-tools/src/main/java/opennlp/tools/parser/chunking/ParserEventStream.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/parser/chunking/ParserEventStream.java b/opennlp-tools/src/main/java/opennlp/tools/parser/chunking/ParserEventStream.java index af202af..88c4e24 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/parser/chunking/ParserEventStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/parser/chunking/ParserEventStream.java @@ -18,8 +18,10 @@ package opennlp.tools.parser.chunking; import java.io.FileInputStream; +import java.nio.charset.Charset; import java.util.List; +import opennlp.tools.cmdline.SystemInputStreamFactory; import opennlp.tools.dictionary.Dictionary; import opennlp.tools.ml.model.Event; import opennlp.tools.parser.AbstractBottomUpParser; @@ -204,7 +206,10 @@ public class ParserEventStream extends 
AbstractParserEventStream { if (fun) { Parse.useFunctionTags(true); } - ObjectStream<Event> es = new ParserEventStream(new ParseSampleStream(new PlainTextByLineStream(new java.io.InputStreamReader(System.in))), rules, etype, dict); + ObjectStream<Event> es = new ParserEventStream( + new ParseSampleStream(new PlainTextByLineStream( + new SystemInputStreamFactory(), Charset.defaultCharset())), + rules, etype, dict); Event event; while ((event = es.read()) != null) { System.out.println(event); http://git-wip-us.apache.org/repos/asf/opennlp/blob/4a4b591f/opennlp-tools/src/main/java/opennlp/tools/parser/treeinsert/ParserEventStream.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/parser/treeinsert/ParserEventStream.java b/opennlp-tools/src/main/java/opennlp/tools/parser/treeinsert/ParserEventStream.java index 4087db8..6f6c85d 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/parser/treeinsert/ParserEventStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/parser/treeinsert/ParserEventStream.java @@ -20,11 +20,13 @@ package opennlp.tools.parser.treeinsert; import java.io.File; import java.io.FileInputStream; +import java.nio.charset.Charset; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; +import opennlp.tools.cmdline.SystemInputStreamFactory; import opennlp.tools.dictionary.Dictionary; import opennlp.tools.ml.maxent.io.SuffixSensitiveGISModelReader; import opennlp.tools.ml.model.AbstractModel; @@ -379,13 +381,19 @@ public class ParserEventStream extends AbstractParserEventStream { if (fun) { Parse.useFunctionTags(true); } - ObjectStream<Event> es = new ParserEventStream(new ParseSampleStream(new PlainTextByLineStream(new java.io.InputStreamReader(System.in))), rules, etype, dict); - Event e; - while ((e = es.read()) != null) { - if (model != null) { - System.out.print(model.eval(e.getContext())[model.getIndex(e.getOutcome())]+" 
"); + + try (ObjectStream<Event> es = new ParserEventStream( + new ParseSampleStream(new PlainTextByLineStream( + new SystemInputStreamFactory(), Charset.defaultCharset())), + rules, etype, dict)) { + Event e; + while ((e = es.read()) != null) { + if (model != null) { + System.out.print( + model.eval(e.getContext())[model.getIndex(e.getOutcome())] + " "); + } + System.out.println(e); } - System.out.println(e); } } } \ No newline at end of file http://git-wip-us.apache.org/repos/asf/opennlp/blob/4a4b591f/opennlp-tools/src/main/java/opennlp/tools/postag/WordTagSampleStream.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/postag/WordTagSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/postag/WordTagSampleStream.java index 724a9c4..644b566 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/postag/WordTagSampleStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/postag/WordTagSampleStream.java @@ -39,13 +39,8 @@ public class WordTagSampleStream extends FilterObjectStream<String, POSSample> { /** * Initializes the current instance. 
* - * @param sentences reader with sentences - * @throws IOException IOException + * @param sentences the sentences */ - public WordTagSampleStream(Reader sentences) throws IOException { - super(new PlainTextByLineStream(sentences)); - } - public WordTagSampleStream(ObjectStream<String> sentences) { super(sentences); } http://git-wip-us.apache.org/repos/asf/opennlp/blob/4a4b591f/opennlp-tools/src/test/java/opennlp/tools/formats/LeipzigDoccatSampleStreamTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/LeipzigDoccatSampleStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/LeipzigDoccatSampleStreamTest.java index 409991e..5797ab4 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/formats/LeipzigDoccatSampleStreamTest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/LeipzigDoccatSampleStreamTest.java @@ -21,18 +21,18 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNull; import java.io.IOException; -import java.io.InputStream; + +import org.junit.Test; import opennlp.tools.doccat.DocumentSample; +import opennlp.tools.util.InputStreamFactory; import opennlp.tools.util.ObjectStream; -import org.junit.Test; - public class LeipzigDoccatSampleStreamTest { @Test public void testParsingSample() throws IOException { - InputStream in = LeipzigDoccatSampleStreamTest.class.getResourceAsStream( + InputStreamFactory in = new ResourceAsStreamFactory(getClass(), "/opennlp/tools/formats/leipzig-en.sample"); ObjectStream<DocumentSample> sampleStream = @@ -51,5 +51,7 @@ public class LeipzigDoccatSampleStreamTest { assertEquals("en", doc4.getCategory()); assertNull(sampleStream.read()); + + sampleStream.close(); } } http://git-wip-us.apache.org/repos/asf/opennlp/blob/4a4b591f/opennlp-tools/src/test/java/opennlp/tools/formats/NameFinderCensus90NameStreamTest.java 
---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/NameFinderCensus90NameStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/NameFinderCensus90NameStreamTest.java index d48f188..84fc28d 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/formats/NameFinderCensus90NameStreamTest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/formats/NameFinderCensus90NameStreamTest.java @@ -15,25 +15,28 @@ package opennlp.tools.formats; +import static java.nio.charset.StandardCharsets.UTF_8; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNull; import java.io.IOException; -import java.io.InputStream; -import java.nio.charset.Charset; +import org.junit.Test; + +import opennlp.tools.util.InputStreamFactory; import opennlp.tools.util.ObjectStream; import opennlp.tools.util.StringList; -import org.junit.Test; - public class NameFinderCensus90NameStreamTest { - private static ObjectStream<StringList> openData(String name) throws IOException { - InputStream in = NameFinderCensus90NameStreamTest.class.getResourceAsStream("/opennlp/tools/formats/" + name); + private static ObjectStream<StringList> openData(String name) + throws IOException { + InputStreamFactory in = new ResourceAsStreamFactory( + NameFinderCensus90NameStreamTest.class, + "/opennlp/tools/formats/" + name); - return new NameFinderCensus90NameStream(in, Charset.forName("utf-8")); + return new NameFinderCensus90NameStream(in, UTF_8); } @Test http://git-wip-us.apache.org/repos/asf/opennlp/blob/4a4b591f/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerFactoryTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerFactoryTest.java index 20c12d9..fbab448 100644 --- 
a/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerFactoryTest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerFactoryTest.java @@ -17,26 +17,28 @@ package opennlp.tools.postag; +import static java.nio.charset.StandardCharsets.UTF_8; import static org.junit.Assert.assertTrue; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; + +import org.junit.Test; import opennlp.tools.dictionary.Dictionary; +import opennlp.tools.formats.ResourceAsStreamFactory; import opennlp.tools.postag.DummyPOSTaggerFactory.DummyPOSContextGenerator; import opennlp.tools.postag.DummyPOSTaggerFactory.DummyPOSDictionary; import opennlp.tools.postag.DummyPOSTaggerFactory.DummyPOSSequenceValidator; import opennlp.tools.util.BaseToolFactory; +import opennlp.tools.util.InputStreamFactory; import opennlp.tools.util.InvalidFormatException; import opennlp.tools.util.ObjectStream; +import opennlp.tools.util.PlainTextByLineStream; import opennlp.tools.util.TrainingParameters; import opennlp.tools.util.model.ModelType; -import org.junit.Test; - /** * Tests for the {@link POSTaggerFactory} class. 
*/ @@ -44,10 +46,11 @@ public class POSTaggerFactoryTest { private static ObjectStream<POSSample> createSampleStream() throws IOException { - InputStream in = POSTaggerFactoryTest.class.getClassLoader() - .getResourceAsStream("opennlp/tools/postag/AnnotatedSentences.txt"); + InputStreamFactory in = new ResourceAsStreamFactory( + POSTaggerFactoryTest.class, + "/opennlp/tools/postag/AnnotatedSentences.txt"); - return new WordTagSampleStream((new InputStreamReader(in))); + return new WordTagSampleStream(new PlainTextByLineStream(in, UTF_8)); } static POSModel trainPOSModel(ModelType type, POSTaggerFactory factory) http://git-wip-us.apache.org/repos/asf/opennlp/blob/4a4b591f/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java index 6001de6..1d99687 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java @@ -18,27 +18,29 @@ package opennlp.tools.postag; +import static java.nio.charset.StandardCharsets.UTF_8; import static org.junit.Assert.assertEquals; import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; +import org.junit.Test; + +import opennlp.tools.formats.ResourceAsStreamFactory; +import opennlp.tools.util.InputStreamFactory; import opennlp.tools.util.ObjectStream; +import opennlp.tools.util.PlainTextByLineStream; import opennlp.tools.util.model.ModelType; -import org.junit.Test; - /** * Tests for the {@link POSTaggerME} class. 
*/ public class POSTaggerMETest { private static ObjectStream<POSSample> createSampleStream() throws IOException { - InputStream in = POSTaggerMETest.class.getClassLoader().getResourceAsStream( - "opennlp/tools/postag/AnnotatedSentences.txt"); + InputStreamFactory in = new ResourceAsStreamFactory(POSTaggerMETest.class, + "/opennlp/tools/postag/AnnotatedSentences.txt"); - return new WordTagSampleStream((new InputStreamReader(in))); + return new WordTagSampleStream(new PlainTextByLineStream(in, UTF_8)); } /** http://git-wip-us.apache.org/repos/asf/opennlp/blob/4a4b591f/opennlp-uima/src/main/java/opennlp/uima/namefind/NameFinderTrainer.java ---------------------------------------------------------------------- diff --git a/opennlp-uima/src/main/java/opennlp/uima/namefind/NameFinderTrainer.java b/opennlp-uima/src/main/java/opennlp/uima/namefind/NameFinderTrainer.java index d637c68..dcc0ddc 100644 --- a/opennlp-uima/src/main/java/opennlp/uima/namefind/NameFinderTrainer.java +++ b/opennlp-uima/src/main/java/opennlp/uima/namefind/NameFinderTrainer.java @@ -18,13 +18,11 @@ package opennlp.uima.namefind; import java.io.File; -import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.io.Writer; +import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Collections; import java.util.Iterator; @@ -32,6 +30,18 @@ import java.util.LinkedList; import java.util.List; import java.util.Map; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.FSIndex; +import org.apache.uima.cas.Type; +import org.apache.uima.cas.TypeSystem; +import org.apache.uima.cas.text.AnnotationFS; +import org.apache.uima.collection.CasConsumer_ImplBase; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.resource.ResourceProcessException; +import org.apache.uima.util.Level; +import 
org.apache.uima.util.Logger; +import org.apache.uima.util.ProcessTrace; + import opennlp.tools.cmdline.namefind.TokenNameFinderTrainerTool; import opennlp.tools.ml.maxent.GIS; import opennlp.tools.namefind.BioCodec; @@ -40,6 +50,8 @@ import opennlp.tools.namefind.NameSample; import opennlp.tools.namefind.NameSampleDataStream; import opennlp.tools.namefind.TokenNameFinderFactory; import opennlp.tools.namefind.TokenNameFinderModel; +import opennlp.tools.util.InputStreamFactory; +import opennlp.tools.util.MarkableFileInputStreamFactory; import opennlp.tools.util.ObjectStream; import opennlp.tools.util.ObjectStreamUtils; import opennlp.tools.util.PlainTextByLineStream; @@ -51,18 +63,6 @@ import opennlp.uima.util.OpennlpUtil; import opennlp.uima.util.SampleTraceStream; import opennlp.uima.util.UimaUtil; -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.FSIndex; -import org.apache.uima.cas.Type; -import org.apache.uima.cas.TypeSystem; -import org.apache.uima.cas.text.AnnotationFS; -import org.apache.uima.collection.CasConsumer_ImplBase; -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.resource.ResourceProcessException; -import org.apache.uima.util.Level; -import org.apache.uima.util.Logger; -import org.apache.uima.util.ProcessTrace; - /** * OpenNLP NameFinder trainer. * <p> @@ -369,7 +369,6 @@ public final class NameFinderTrainer extends CasConsumer_ImplBase { // create training stream ... 
ObjectStream<NameSample> samples = ObjectStreamUtils.createObjectStream(nameFinderSamples); - InputStream additionalTrainingDataIn = null; Writer samplesOut = null; TokenNameFinderModel nameModel; try { @@ -379,10 +378,14 @@ public final class NameFinderTrainer extends CasConsumer_ImplBase { logger.log(Level.INFO, "Using additional training data file: " + additionalTrainingDataFile); } - additionalTrainingDataIn = new FileInputStream(additionalTrainingDataFile); + InputStreamFactory additionalTrainingDataIn = new MarkableFileInputStreamFactory( + new File(additionalTrainingDataFile)); + Charset additionalTrainingDataCharset = Charset + .forName(additionalTrainingDataEncoding); ObjectStream<NameSample> additionalSamples = new NameSampleDataStream( - new PlainTextByLineStream(new InputStreamReader(additionalTrainingDataIn, additionalTrainingDataEncoding))); + new PlainTextByLineStream(additionalTrainingDataIn, + additionalTrainingDataCharset)); samples = ObjectStreamUtils.createObjectStream(samples, additionalSamples); } @@ -405,9 +408,6 @@ public final class NameFinderTrainer extends CasConsumer_ImplBase { new TokenNameFinderFactory(featureGeneratorDefinition, resourceMap, new BioCodec())); } finally { - if (additionalTrainingDataIn != null) { - additionalTrainingDataIn.close(); - } if (samplesOut != null) { samplesOut.close(); http://git-wip-us.apache.org/repos/asf/opennlp/blob/4a4b591f/opennlp-uima/src/main/java/opennlp/uima/tokenize/TokenizerTrainer.java ---------------------------------------------------------------------- diff --git a/opennlp-uima/src/main/java/opennlp/uima/tokenize/TokenizerTrainer.java b/opennlp-uima/src/main/java/opennlp/uima/tokenize/TokenizerTrainer.java index d6309dd..ece9eca 100644 --- a/opennlp-uima/src/main/java/opennlp/uima/tokenize/TokenizerTrainer.java +++ b/opennlp-uima/src/main/java/opennlp/uima/tokenize/TokenizerTrainer.java @@ -18,24 +18,37 @@ package opennlp.uima.tokenize; import java.io.File; -import java.io.FileInputStream; 
import java.io.FileOutputStream; import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.io.Writer; +import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Arrays; import java.util.Iterator; import java.util.LinkedList; import java.util.List; +import org.apache.uima.UimaContext; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.FSIndex; +import org.apache.uima.cas.Type; +import org.apache.uima.cas.TypeSystem; +import org.apache.uima.cas.text.AnnotationFS; +import org.apache.uima.collection.CasConsumer_ImplBase; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.resource.ResourceProcessException; +import org.apache.uima.util.Level; +import org.apache.uima.util.Logger; +import org.apache.uima.util.ProcessTrace; + import opennlp.tools.ml.maxent.GIS; import opennlp.tools.tokenize.TokenSample; import opennlp.tools.tokenize.TokenSampleStream; import opennlp.tools.tokenize.TokenizerME; import opennlp.tools.tokenize.TokenizerModel; +import opennlp.tools.util.InputStreamFactory; +import opennlp.tools.util.MarkableFileInputStreamFactory; import opennlp.tools.util.ObjectStream; import opennlp.tools.util.ObjectStreamUtils; import opennlp.tools.util.PlainTextByLineStream; @@ -46,19 +59,6 @@ import opennlp.uima.util.OpennlpUtil; import opennlp.uima.util.SampleTraceStream; import opennlp.uima.util.UimaUtil; -import org.apache.uima.UimaContext; -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.FSIndex; -import org.apache.uima.cas.Type; -import org.apache.uima.cas.TypeSystem; -import org.apache.uima.cas.text.AnnotationFS; -import org.apache.uima.collection.CasConsumer_ImplBase; -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.resource.ResourceProcessException; -import org.apache.uima.util.Level; -import org.apache.uima.util.Logger; -import org.apache.uima.util.ProcessTrace; - 
/** * OpenNLP Tokenizer trainer. * <p> @@ -231,37 +231,35 @@ public final class TokenizerTrainer extends CasConsumer_ImplBase { // if trace file // serialize events ... - InputStream additionalTrainingDataIn = null; Writer samplesOut = null; TokenizerModel tokenModel; - try { - if (additionalTrainingDataFile != null) { - - if (mLogger.isLoggable(Level.INFO)) { - mLogger.log(Level.INFO, "Using addional training data file: " + additionalTrainingDataFile); - } + if (additionalTrainingDataFile != null) { - additionalTrainingDataIn = new FileInputStream(additionalTrainingDataFile); + if (mLogger.isLoggable(Level.INFO)) { + mLogger.log(Level.INFO, "Using addional training data file: " + additionalTrainingDataFile); + } - ObjectStream<TokenSample> additionalSamples = new TokenSampleStream( - new PlainTextByLineStream(new InputStreamReader(additionalTrainingDataIn, additionalTrainingDataEncoding))); + InputStreamFactory additionalTrainingDataIn = new MarkableFileInputStreamFactory( + new File(additionalTrainingDataFile)); - samples = ObjectStreamUtils.createObjectStream(samples, additionalSamples); - } + Charset additionalTrainingDataCharset = Charset + .forName(additionalTrainingDataEncoding); - if (sampleTraceFile != null) { - samplesOut = new OutputStreamWriter(new FileOutputStream(sampleTraceFile), sampleTraceFileEncoding); - samples = new SampleTraceStream<TokenSample>(samples, samplesOut); - } + ObjectStream<TokenSample> additionalSamples = new TokenSampleStream( + new PlainTextByLineStream(additionalTrainingDataIn, + additionalTrainingDataCharset)); - tokenModel = TokenizerME.train(language, samples, isSkipAlphaNumerics); + samples = ObjectStreamUtils.createObjectStream(samples, additionalSamples); } - finally { - if (additionalTrainingDataIn != null) - additionalTrainingDataIn.close(); + + if (sampleTraceFile != null) { + samplesOut = new OutputStreamWriter(new FileOutputStream(sampleTraceFile), sampleTraceFileEncoding); + samples = new 
SampleTraceStream<TokenSample>(samples, samplesOut); } + tokenModel = TokenizerME.train(language, samples, isSkipAlphaNumerics); + // dereference to allow garbage collection tokenSamples = null;
