This is an automated email from the ASF dual-hosted git repository.
jzemerick pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/master by this push:
new 2b51a25d OPENNLP-1366: Fix Training of MaxEnt Model with large corpora
fails with java.io.UTFDataFormatException (#427)
2b51a25d is described below
commit 2b51a25d0f825d52799d3ea82a044ae872f585dc
Author: Martin Wiesner <[email protected]>
AuthorDate: Fri Oct 28 14:11:29 2022 +0200
OPENNLP-1366: Fix Training of MaxEnt Model with large corpora fails with
java.io.UTFDataFormatException (#427)
* OPENNLP-1366: Fix Training of MaxEnt Model with large corpora fails with
java.io.UTFDataFormatException
---
.../tools/ml/maxent/io/BinaryGISModelWriter.java | 3 +-
.../tools/ml/maxent/io/BinaryQNModelWriter.java | 3 +-
.../tools/ml/model/BinaryFileDataReader.java | 2 +-
.../tools/ml/model/ModelParameterChunker.java | 142 +++++++++++++++++++++
.../ml/naivebayes/BinaryNaiveBayesModelWriter.java | 3 +-
.../ml/perceptron/BinaryPerceptronModelWriter.java | 3 +-
.../tools/ml/model/ModelParameterChunkerTest.java | 114 +++++++++++++++++
7 files changed, 265 insertions(+), 5 deletions(-)
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/ml/maxent/io/BinaryGISModelWriter.java
b/opennlp-tools/src/main/java/opennlp/tools/ml/maxent/io/BinaryGISModelWriter.java
index 37e6ba4f..47defd65 100644
---
a/opennlp-tools/src/main/java/opennlp/tools/ml/maxent/io/BinaryGISModelWriter.java
+++
b/opennlp-tools/src/main/java/opennlp/tools/ml/maxent/io/BinaryGISModelWriter.java
@@ -24,6 +24,7 @@ import java.io.IOException;
import java.util.zip.GZIPOutputStream;
import opennlp.tools.ml.model.AbstractModel;
+import opennlp.tools.ml.model.ModelParameterChunker;
/**
* Model writer that saves models in binary format.
@@ -68,7 +69,7 @@ public class BinaryGISModelWriter extends GISModelWriter {
}
public void writeUTF(String s) throws java.io.IOException {
- output.writeUTF(s);
+ ModelParameterChunker.writeUTF(output, s);
}
public void writeInt(int i) throws java.io.IOException {
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/ml/maxent/io/BinaryQNModelWriter.java
b/opennlp-tools/src/main/java/opennlp/tools/ml/maxent/io/BinaryQNModelWriter.java
index 1da4d1ee..95007562 100644
---
a/opennlp-tools/src/main/java/opennlp/tools/ml/maxent/io/BinaryQNModelWriter.java
+++
b/opennlp-tools/src/main/java/opennlp/tools/ml/maxent/io/BinaryQNModelWriter.java
@@ -24,6 +24,7 @@ import java.io.IOException;
import java.util.zip.GZIPOutputStream;
import opennlp.tools.ml.model.AbstractModel;
+import opennlp.tools.ml.model.ModelParameterChunker;
public class BinaryQNModelWriter extends QNModelWriter {
protected DataOutputStream output;
@@ -64,7 +65,7 @@ public class BinaryQNModelWriter extends QNModelWriter {
}
public void writeUTF(String s) throws IOException {
- output.writeUTF(s);
+ ModelParameterChunker.writeUTF(output, s);
}
public void writeInt(int i) throws IOException {
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/ml/model/BinaryFileDataReader.java
b/opennlp-tools/src/main/java/opennlp/tools/ml/model/BinaryFileDataReader.java
index b7eaf2b6..009d9b2e 100644
---
a/opennlp-tools/src/main/java/opennlp/tools/ml/model/BinaryFileDataReader.java
+++
b/opennlp-tools/src/main/java/opennlp/tools/ml/model/BinaryFileDataReader.java
@@ -56,7 +56,7 @@ public class BinaryFileDataReader implements DataReader {
}
public String readUTF() throws IOException {
- return input.readUTF();
+ return ModelParameterChunker.readUTF(input);
}
}
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/ml/model/ModelParameterChunker.java
b/opennlp-tools/src/main/java/opennlp/tools/ml/model/ModelParameterChunker.java
new file mode 100644
index 00000000..7e77c707
--- /dev/null
+++
b/opennlp-tools/src/main/java/opennlp/tools/ml/model/ModelParameterChunker.java
@@ -0,0 +1,142 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.ml.model;
+
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.io.UTFDataFormatException;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.CoderResult;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+
/**
 * A helper class that handles Strings with more than 64k (65535 bytes) in length.
 * This is achieved via the signature {@link #SIGNATURE_CHUNKED_PARAMS} at the beginning of
 * the String instance to be written to a {@link DataOutputStream}.
 * <p>
 * Background: In OpenNLP, for large(r) corpora, we train models whose (UTF String) parameters will exceed
 * the {@link #MAX_CHUNK_SIZE_BYTES} bytes limit set in {@link DataOutputStream}.
 * For writing and reading those models, we have to chunk up those string instances in 64kB blocks and
 * recombine them correctly upon reading a (binary) model file.
 * <p>
 * The problem was raised in <a href="https://issues.apache.org/jira/browse/OPENNLP-1366">ticket OPENNLP-1366</a>.
 * <p>
 * Solution strategy:
 * <ul>
 * <li>If writing parameters to a {@link DataOutputStream} blows up with a {@link UTFDataFormatException} a
 * large String instance is chunked up and written as appropriate blocks.</li>
 * <li>To indicate that chunking was conducted, we start with the {@link #SIGNATURE_CHUNKED_PARAMS} indicator,
 * directly followed by the number of chunks used. This way, when reading in chunked model parameters,
 * recombination is achieved transparently.</li>
 * </ul>
 * <p>
 * Note: Both, existing (binary) model files and newly trained models which don't require the chunking
 * technique, will be supported like in previous OpenNLP versions.
 * <p>
 * NOTE(review): a non-chunked parameter that itself begins with the literal
 * {@link #SIGNATURE_CHUNKED_PARAMS} text would be misinterpreted on read; the signature is not escaped
 * on write. Considered acceptable for model parameter strings — confirm if parameters can be arbitrary.
 *
 * @author <a href="mailto:[email protected]">Martin Wiesner</a>
 * @author <a href="mailto:[email protected]">Mark Struberg</a>
 */
public final class ModelParameterChunker {

  /*
   * A signature that denotes the start of a String that required chunking.
   *
   * Semantics:
   * If a model parameter (String) carries the below signature at the very beginning, this indicates
   * that 'n > 1' chunks must be processed to obtain the whole model parameters. Otherwise, those would not be
   * written to the binary model files (as reported in OPENNLP-1366) if the training occurs on large corpora
   * as used, for instance, in the context of (very large) German NLP models.
   */
  public static final String SIGNATURE_CHUNKED_PARAMS = "CHUNKED-MODEL-PARAMS:"; // followed by no of chunks!

  private static final int MAX_CHUNK_SIZE_BYTES = 65535; // the maximum 'utflen' DataOutputStream can handle

  private ModelParameterChunker() {
    // private utility class ctor
  }

  /**
   * Reads model parameters from {@code dis}. In case the stream starts with
   * {@link #SIGNATURE_CHUNKED_PARAMS}, the number of chunks is detected and the original large
   * parameter string is reconstructed from several chunks.
   *
   * @param dis The stream which will be used to read the model parameter from.
   * @return The (possibly recombined) model parameter string.
   * @throws IOException Thrown if reading from the stream fails.
   */
  public static String readUTF(DataInputStream dis) throws IOException {
    String data = dis.readUTF();
    if (data.startsWith(SIGNATURE_CHUNKED_PARAMS)) {
      // strip only the leading signature; 'replace' would also mangle any later occurrence
      int chunkCount = Integer.parseInt(data.substring(SIGNATURE_CHUNKED_PARAMS.length()));
      StringBuilder sb = new StringBuilder();
      for (int i = 0; i < chunkCount; i++) {
        sb.append(dis.readUTF());
      }
      return sb.toString(); // the reconstructed model parameter string
    } else { // default case: no chunked data -> just return the read data / parameter information
      return data;
    }
  }

  /**
   * Writes the model parameter {@code s} to {@code dos}. In case {@code s} does exceed
   * {@link #MAX_CHUNK_SIZE_BYTES} in (modified UTF-8) length, the chunking mechanism is used;
   * otherwise the parameter is written 'as is'.
   *
   * @param dos The {@link DataOutputStream} stream which will be used to persist the model.
   * @param s   The input string that is chunked if its encoded form exceeds
   *            {@link #MAX_CHUNK_SIZE_BYTES} bytes.
   * @throws IOException Thrown if writing to the stream fails.
   */
  public static void writeUTF(DataOutputStream dos, String s) throws IOException {
    try {
      dos.writeUTF(s);
    } catch (UTFDataFormatException dfe) {
      // we definitely have to chunk the given model parameter 's' as it exceeds the bytes allowed for 1 chunk
      final String[] chunks = splitByByteLength(s);
      // write the signature string with the amount of chunks for reading the model file correctly
      dos.writeUTF(SIGNATURE_CHUNKED_PARAMS + chunks.length); // add number of required chunks
      for (String c : chunks) {
        dos.writeUTF(c);
      }
    }
  }

  /*
   * Splits 'input' into substrings whose *modified* UTF-8 encoding — the format
   * DataOutputStream#writeUTF actually emits — is at most MAX_CHUNK_SIZE_BYTES each.
   *
   * Sizing chunks by standard UTF-8 would under-count: modified UTF-8 encodes U+0000 as
   * 2 bytes (standard: 1) and supplementary characters as two 3-byte surrogates (standard: 4),
   * so a chunk sized via a UTF-8 CharsetEncoder could still blow up writeUTF with a
   * UTFDataFormatException. Per-char byte counts below mirror writeUTF's own accounting:
   * U+0001..U+007F -> 1 byte, U+0000 and U+0080..U+07FF -> 2 bytes, everything else
   * (including each surrogate char) -> 3 bytes.
   */
  private static String[] splitByByteLength(String input) {
    List<String> chunks = new ArrayList<>();
    final int len = input.length();
    int start = 0;      // begin of the current chunk (char index)
    int byteCount = 0;  // modified UTF-8 size of the current chunk so far
    int i = 0;
    while (i < len) {
      char c = input.charAt(i);
      int bytes = (c >= 0x0001 && c <= 0x007F) ? 1 : (c <= 0x07FF ? 2 : 3);
      int step = 1;
      // keep surrogate pairs inside one chunk so every chunk remains well-formed text
      if (Character.isHighSurrogate(c) && i + 1 < len && Character.isLowSurrogate(input.charAt(i + 1))) {
        step = 2;
        bytes = 6; // two surrogate chars, 3 modified UTF-8 bytes each
      }
      if (byteCount + bytes > MAX_CHUNK_SIZE_BYTES) {
        chunks.add(input.substring(start, i)); // close the current chunk before the limit is exceeded
        start = i;
        byteCount = 0;
      }
      byteCount += bytes;
      i += step;
    }
    if (start < len) {
      chunks.add(input.substring(start, len)); // trailing remainder
    }
    return chunks.toArray(new String[0]);
  }
}
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/ml/naivebayes/BinaryNaiveBayesModelWriter.java
b/opennlp-tools/src/main/java/opennlp/tools/ml/naivebayes/BinaryNaiveBayesModelWriter.java
index 15e997f9..59b90946 100644
---
a/opennlp-tools/src/main/java/opennlp/tools/ml/naivebayes/BinaryNaiveBayesModelWriter.java
+++
b/opennlp-tools/src/main/java/opennlp/tools/ml/naivebayes/BinaryNaiveBayesModelWriter.java
@@ -24,6 +24,7 @@ import java.io.IOException;
import java.util.zip.GZIPOutputStream;
import opennlp.tools.ml.model.AbstractModel;
+import opennlp.tools.ml.model.ModelParameterChunker;
/**
* Model writer that saves models in binary format.
@@ -64,7 +65,7 @@ public class BinaryNaiveBayesModelWriter extends
NaiveBayesModelWriter {
}
public void writeUTF(String s) throws java.io.IOException {
- output.writeUTF(s);
+ ModelParameterChunker.writeUTF(output, s);
}
public void writeInt(int i) throws java.io.IOException {
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/ml/perceptron/BinaryPerceptronModelWriter.java
b/opennlp-tools/src/main/java/opennlp/tools/ml/perceptron/BinaryPerceptronModelWriter.java
index 21eaa3fc..2a56b1d8 100644
---
a/opennlp-tools/src/main/java/opennlp/tools/ml/perceptron/BinaryPerceptronModelWriter.java
+++
b/opennlp-tools/src/main/java/opennlp/tools/ml/perceptron/BinaryPerceptronModelWriter.java
@@ -24,6 +24,7 @@ import java.io.IOException;
import java.util.zip.GZIPOutputStream;
import opennlp.tools.ml.model.AbstractModel;
+import opennlp.tools.ml.model.ModelParameterChunker;
/**
* Model writer that saves models in binary format.
@@ -65,7 +66,7 @@ public class BinaryPerceptronModelWriter extends
PerceptronModelWriter {
}
public void writeUTF(String s) throws java.io.IOException {
- output.writeUTF(s);
+ ModelParameterChunker.writeUTF(output, s);
}
public void writeInt(int i) throws java.io.IOException {
diff --git
a/opennlp-tools/src/test/java/opennlp/tools/ml/model/ModelParameterChunkerTest.java
b/opennlp-tools/src/test/java/opennlp/tools/ml/model/ModelParameterChunkerTest.java
new file mode 100644
index 00000000..8d497091
--- /dev/null
+++
b/opennlp-tools/src/test/java/opennlp/tools/ml/model/ModelParameterChunkerTest.java
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.ml.model;
+
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Test cases for {@link ModelParameterChunker}.
+ *
+ * @author <a href="mailto:[email protected]">Martin Wiesner</a>
+ */
+public class ModelParameterChunkerTest {
+
+ private File tmp;
+
+ @Before
+ public void setup() throws IOException {
+ tmp = File.createTempFile("chunker-test", ".dat");
+ tmp.deleteOnExit();
+ }
+
+ @After
+ public void tearDown() {
+ tmp = null;
+ }
+
+ /*
+ * Note: 8k Integer elements will be concatenated into a flat String. The
size of the resulting character
+ * sequence won't hit the critical 64K limit (see:
DataOutputStream#writeUTF).
+ *
+ * No chunking is therefore required.
+ */
+ @Test
+ public void testWriteReadUTFWithoutChunking() throws FileNotFoundException {
+ // 8k ints -> 48042 bytes for a flat String
+ testAndCheck(8192, 48042);
+ }
+
+ /*
+ * Note: 16k Integer elements will be concatenated into a flat String. The
size of the resulting character
+ * sequence will exceed the critical 64K limit (see:
DataOutputStream#writeUTF).
+ *
+ * Chunking is therefore required and used internally to avoid the blow up
of the serialization procedure.
+ *
+ * When restoring the chunked String, the signature string
(#SIGNATURE_CHUNKED_PARAMS) will be escaped.
+ * Thus, we can assume the restored string must be equal to the artificially
created original input.
+ */
+ @Test
+ public void testWriteReadUTFWithChunking() throws FileNotFoundException {
+ // 16k ints -> 103578 bytes for a flat String
+ testAndCheck(16384, 103578);
+ }
+
+ private void testAndCheck(int elementCount, int expectedByteLength) {
+ String p = getParameter(elementCount);
+ Assert.assertNotNull(p);
+ Assert.assertFalse(p.trim().isEmpty());
+ Assert.assertEquals(expectedByteLength,
p.getBytes(StandardCharsets.UTF_8).length);
+
+ // TEST
+ try (DataOutputStream dos = new
DataOutputStream(Files.newOutputStream(tmp.toPath()))) {
+ ModelParameterChunker.writeUTF(dos, p);
+ } catch (IOException e) {
+ Assert.fail(e.getLocalizedMessage());
+ }
+ // VERIFY
+ try (DataInputStream dis = new
DataInputStream(Files.newInputStream(tmp.toPath()))) {
+ String restoredBelow64K = ModelParameterChunker.readUTF(dis);
+ // assumptions
+ Assert.assertNotNull(restoredBelow64K);
+ Assert.assertFalse(restoredBelow64K.trim().isEmpty());
+ Assert.assertEquals(p, restoredBelow64K);
+ Assert.assertEquals(expectedByteLength,
p.getBytes(StandardCharsets.UTF_8).length);
+ } catch (IOException e) {
+ Assert.fail(e.getLocalizedMessage());
+ }
+ }
+
+ private String getParameter(int elementCount) {
+ List<Integer> someParameters = new ArrayList<>(elementCount);
+ for (int i = 0; i < elementCount; i++) {
+ someParameters.add(i);
+ }
+ return someParameters.toString();
+ }
+}