subbudvk commented on code in PR #1029:
URL: https://github.com/apache/opennlp/pull/1029#discussion_r3176449862
##########
opennlp-core/opennlp-ml/opennlp-ml-libsvm/src/test/java/opennlp/tools/ml/libsvm/doccat/SvmDoccatModelTest.java:
##########
@@ -19,8 +19,12 @@
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
+import java.io.File;
import java.io.IOException;
+import java.io.InvalidClassException;
+import java.io.ObjectOutputStream;
import java.util.ArrayList;
+import java.util.HashMap;
Review Comment:
I think having a null InputStream path test (either deserialize(null) or
deserialize(null, limits)) could be useful
##########
opennlp-core/opennlp-ml/opennlp-ml-libsvm/src/main/java/opennlp/tools/ml/libsvm/doccat/SvmDoccatModel.java:
##########
@@ -184,16 +201,172 @@ public void serialize(OutputStream out) throws
IOException {
}
/**
- * Deserializes a {@link SvmDoccatModel} from the given {@link InputStream}.
+ * Deserializes a {@link SvmDoccatModel} from the given {@link InputStream}
+ * using {@link DeserializationLimits#DEFAULT default} resource limits.
+ * <p>
+ * The stream is filtered via an {@link ObjectInputFilter} that allow-lists
+ * only the classes required to reconstruct an {@link SvmDoccatModel}, plus
+ * resource limits on graph depth, references, and array length. Foreign
+ * payloads are rejected with {@link java.io.InvalidClassException} before
+ * {@link ObjectInputStream#readObject()} returns.
+ * <p>
+ * Callers should still treat this method as defense-in-depth: only invoke
+ * it on streams from trusted sources.
+ * <p>
+ * If the default limits are too tight for an unusually large model — for
+ * example, a model with more than {@value #MAX_ARRAY_DEFAULT} entries in a
+ * single map — use {@link #deserialize(InputStream, DeserializationLimits)}
+ * to supply higher limits. The class allow-list is intentionally not
+ * configurable; loosening it would defeat the purpose of the filter.
*
* @param in The {@link InputStream} to read from. Must not be {@code null}.
* @return A valid {@link SvmDoccatModel} instance.
- * @throws IOException Thrown if IO errors occurred during deserialization.
+ * @throws IOException Thrown if IO errors occurred during deserialization,
+ * including {@link java.io.InvalidClassException} when
+ * the stream contains a class outside the allow-list or
+ * exceeds a resource limit.
* @throws ClassNotFoundException Thrown if required classes are not found.
*/
public static SvmDoccatModel deserialize(InputStream in) throws IOException,
ClassNotFoundException {
+ return deserialize(in, DeserializationLimits.DEFAULT);
+ }
+
+ /**
+ * Deserializes a {@link SvmDoccatModel} from the given {@link InputStream}
+ * using the supplied {@link DeserializationLimits resource limits}.
+ * <p>
+ * Use this overload when the {@link DeserializationLimits#DEFAULT default
+ * limits} reject a legitimate model — for example, models trained with very
+ * large feature vocabularies. The class allow-list applied to the stream is
+ * the same as for {@link #deserialize(InputStream)}; only the numeric
+ * limits change.
+ *
+ * @param in The {@link InputStream} to read from. Must not be {@code
null}.
+ * @param limits The {@link DeserializationLimits} to apply. Must not be
+ * {@code null}.
+ * @return A valid {@link SvmDoccatModel} instance.
+ * @throws IOException Thrown if IO errors occurred during deserialization,
+ * including {@link java.io.InvalidClassException} when
+ * the stream contains a class outside the allow-list or
+ * exceeds one of the supplied limits.
+ * @throws ClassNotFoundException Thrown if required classes are not found.
+ * @throws NullPointerException if {@code limits} is {@code null}.
+ */
+ public static SvmDoccatModel deserialize(InputStream in,
DeserializationLimits limits)
+ throws IOException, ClassNotFoundException {
+ Objects.requireNonNull(limits, "limits must not be null");
try (ObjectInputStream ois = new ObjectInputStream(in)) {
+ ois.setObjectInputFilter(buildFilter(limits));
return (SvmDoccatModel) ois.readObject();
}
}
+
+ /**
+ * Resource limits applied to the {@link ObjectInputFilter} used by
+ * {@link SvmDoccatModel#deserialize(InputStream, DeserializationLimits)}.
+ * <p>
+ * The limits bound graph traversal regardless of the class allow-list and
+ * provide defense-in-depth against pathological streams. The
+ * {@linkplain #DEFAULT default values} are generous enough for typical
+ * production models; raise them only if a legitimate model is rejected.
+ *
+ * @param maxDepth Maximum object-graph nesting depth. Must be {@code
> 0}.
+ * @param maxRefs Maximum number of internal references the stream may
+ * create. Must be {@code > 0}.
+ * @param maxArrayLength Maximum length of any single array allocation
+ * requested by the stream. Must be {@code > 0}.
+ * @throws IllegalArgumentException if any of {@code maxDepth},
+ * {@code maxRefs}, or {@code
maxArrayLength}
+ * is {@code <= 0}.
+ */
+ public record DeserializationLimits(long maxDepth, long maxRefs, long
maxArrayLength) {
+
+ /**
+ * Default limits. Sized to allow typical production-scale models to
+ * round-trip while still bounding pathological streams.
+ */
+ public static final DeserializationLimits DEFAULT =
+ new DeserializationLimits(MAX_DEPTH_DEFAULT, MAX_REFS_DEFAULT,
MAX_ARRAY_DEFAULT);
+
+ public DeserializationLimits {
+ if (maxDepth <= 0) {
+ throw new IllegalArgumentException("maxDepth must be > 0");
+ }
+ if (maxRefs <= 0) {
+ throw new IllegalArgumentException("maxRefs must be > 0");
+ }
+ if (maxArrayLength <= 0) {
+ throw new IllegalArgumentException("maxArrayLength must be > 0");
+ }
+ }
+ }
+
+ // Default resource limits applied during deserialization. These are
+ // intentionally generous so that legitimate models — which may contain
+ // millions of support vectors — round-trip without issue, while still
+ // bounding pathological inputs.
+ private static final long MAX_DEPTH_DEFAULT = 64;
+ private static final long MAX_REFS_DEFAULT = 5_000_000;
+ private static final long MAX_ARRAY_DEFAULT = 10_000_000;
+
+ // Allow-list of fully qualified class names that may appear in the
+ // serialized graph of a SvmDoccatModel. Anything else is rejected.
+ private static final Set<String> ALLOWED_CLASSES = Set.of(
+ // OpenNLP types persisted by this model
+ "opennlp.tools.ml.libsvm.doccat.SvmDoccatModel",
+ "opennlp.tools.ml.libsvm.doccat.SvmDoccatConfiguration",
+ "opennlp.tools.ml.libsvm.doccat.TermWeightingStrategy",
+ "opennlp.tools.ml.libsvm.doccat.FeatureSelectionStrategy",
+ // zlibsvm domain + configuration types reachable from SvmModel
+ "de.hhn.mi.configuration.SvmConfigurationImpl",
+ "de.hhn.mi.configuration.SvmType",
+ "de.hhn.mi.configuration.KernelType",
+ "de.hhn.mi.domain.SvmModelImpl",
+ "de.hhn.mi.domain.SvmMetaInformationImpl",
+ "de.hhn.mi.domain.SvmFeatureImpl",
+ "de.hhn.mi.domain.SvmClassLabelImpl",
+ // Native libsvm structures embedded in SvmModelImpl
+ "libsvm.svm_model",
+ "libsvm.svm_node",
+ "libsvm.svm_parameter",
+ // JDK types used in field declarations
+ "java.lang.String",
+ "java.lang.Number",
+ "java.lang.Integer",
+ "java.lang.Double",
+ "java.lang.Boolean",
+ "java.lang.Enum",
Review Comment:
I think enum serialization presents the concrete enum class name (e.g.,
TermWeightingStrategy) to the filter, not java.lang.Enum
##########
opennlp-core/opennlp-ml/opennlp-ml-libsvm/src/main/java/opennlp/tools/ml/libsvm/doccat/SvmDoccatModel.java:
##########
@@ -184,16 +201,172 @@ public void serialize(OutputStream out) throws
IOException {
}
/**
- * Deserializes a {@link SvmDoccatModel} from the given {@link InputStream}.
+ * Deserializes a {@link SvmDoccatModel} from the given {@link InputStream}
+ * using {@link DeserializationLimits#DEFAULT default} resource limits.
+ * <p>
+ * The stream is filtered via an {@link ObjectInputFilter} that allow-lists
+ * only the classes required to reconstruct an {@link SvmDoccatModel}, plus
+ * resource limits on graph depth, references, and array length. Foreign
+ * payloads are rejected with {@link java.io.InvalidClassException} before
+ * {@link ObjectInputStream#readObject()} returns.
+ * <p>
+ * Callers should still treat this method as defense-in-depth: only invoke
+ * it on streams from trusted sources.
+ * <p>
+ * If the default limits are too tight for an unusually large model — for
+ * example, a model with more than {@value #MAX_ARRAY_DEFAULT} entries in a
+ * single map — use {@link #deserialize(InputStream, DeserializationLimits)}
+ * to supply higher limits. The class allow-list is intentionally not
+ * configurable; loosening it would defeat the purpose of the filter.
*
* @param in The {@link InputStream} to read from. Must not be {@code null}.
* @return A valid {@link SvmDoccatModel} instance.
- * @throws IOException Thrown if IO errors occurred during deserialization.
+ * @throws IOException Thrown if IO errors occurred during deserialization,
+ * including {@link java.io.InvalidClassException} when
+ * the stream contains a class outside the allow-list or
+ * exceeds a resource limit.
* @throws ClassNotFoundException Thrown if required classes are not found.
*/
public static SvmDoccatModel deserialize(InputStream in) throws IOException,
ClassNotFoundException {
+ return deserialize(in, DeserializationLimits.DEFAULT);
+ }
+
+ /**
+ * Deserializes a {@link SvmDoccatModel} from the given {@link InputStream}
+ * using the supplied {@link DeserializationLimits resource limits}.
+ * <p>
+ * Use this overload when the {@link DeserializationLimits#DEFAULT default
+ * limits} reject a legitimate model — for example, models trained with very
+ * large feature vocabularies. The class allow-list applied to the stream is
+ * the same as for {@link #deserialize(InputStream)}; only the numeric
+ * limits change.
+ *
+ * @param in The {@link InputStream} to read from. Must not be {@code
null}.
+ * @param limits The {@link DeserializationLimits} to apply. Must not be
+ * {@code null}.
+ * @return A valid {@link SvmDoccatModel} instance.
+ * @throws IOException Thrown if IO errors occurred during deserialization,
+ * including {@link java.io.InvalidClassException} when
+ * the stream contains a class outside the allow-list or
+ * exceeds one of the supplied limits.
+ * @throws ClassNotFoundException Thrown if required classes are not found.
+ * @throws NullPointerException if {@code limits} is {@code null}.
+ */
+ public static SvmDoccatModel deserialize(InputStream in,
DeserializationLimits limits)
+ throws IOException, ClassNotFoundException {
+ Objects.requireNonNull(limits, "limits must not be null");
try (ObjectInputStream ois = new ObjectInputStream(in)) {
+ ois.setObjectInputFilter(buildFilter(limits));
return (SvmDoccatModel) ois.readObject();
}
}
+
+ /**
+ * Resource limits applied to the {@link ObjectInputFilter} used by
+ * {@link SvmDoccatModel#deserialize(InputStream, DeserializationLimits)}.
+ * <p>
+ * The limits bound graph traversal regardless of the class allow-list and
+ * provide defense-in-depth against pathological streams. The
+ * {@linkplain #DEFAULT default values} are generous enough for typical
+ * production models; raise them only if a legitimate model is rejected.
+ *
+ * @param maxDepth Maximum object-graph nesting depth. Must be {@code
> 0}.
+ * @param maxRefs Maximum number of internal references the stream may
+ * create. Must be {@code > 0}.
+ * @param maxArrayLength Maximum length of any single array allocation
+ * requested by the stream. Must be {@code > 0}.
+ * @throws IllegalArgumentException if any of {@code maxDepth},
+ * {@code maxRefs}, or {@code
maxArrayLength}
+ * is {@code <= 0}.
+ */
+ public record DeserializationLimits(long maxDepth, long maxRefs, long
maxArrayLength) {
+
+ /**
+ * Default limits. Sized to allow typical production-scale models to
+ * round-trip while still bounding pathological streams.
+ */
+ public static final DeserializationLimits DEFAULT =
+ new DeserializationLimits(MAX_DEPTH_DEFAULT, MAX_REFS_DEFAULT,
MAX_ARRAY_DEFAULT);
+
+ public DeserializationLimits {
+ if (maxDepth <= 0) {
+ throw new IllegalArgumentException("maxDepth must be > 0");
+ }
+ if (maxRefs <= 0) {
+ throw new IllegalArgumentException("maxRefs must be > 0");
+ }
+ if (maxArrayLength <= 0) {
+ throw new IllegalArgumentException("maxArrayLength must be > 0");
+ }
+ }
+ }
+
+ // Default resource limits applied during deserialization. These are
+ // intentionally generous so that legitimate models — which may contain
+ // millions of support vectors — round-trip without issue, while still
+ // bounding pathological inputs.
+ private static final long MAX_DEPTH_DEFAULT = 64;
+ private static final long MAX_REFS_DEFAULT = 5_000_000;
+ private static final long MAX_ARRAY_DEFAULT = 10_000_000;
+
+ // Allow-list of fully qualified class names that may appear in the
+ // serialized graph of a SvmDoccatModel. Anything else is rejected.
+ private static final Set<String> ALLOWED_CLASSES = Set.of(
+ // OpenNLP types persisted by this model
+ "opennlp.tools.ml.libsvm.doccat.SvmDoccatModel",
+ "opennlp.tools.ml.libsvm.doccat.SvmDoccatConfiguration",
+ "opennlp.tools.ml.libsvm.doccat.TermWeightingStrategy",
+ "opennlp.tools.ml.libsvm.doccat.FeatureSelectionStrategy",
+ // zlibsvm domain + configuration types reachable from SvmModel
+ "de.hhn.mi.configuration.SvmConfigurationImpl",
+ "de.hhn.mi.configuration.SvmType",
+ "de.hhn.mi.configuration.KernelType",
+ "de.hhn.mi.domain.SvmModelImpl",
+ "de.hhn.mi.domain.SvmMetaInformationImpl",
+ "de.hhn.mi.domain.SvmFeatureImpl",
+ "de.hhn.mi.domain.SvmClassLabelImpl",
+ // Native libsvm structures embedded in SvmModelImpl
+ "libsvm.svm_model",
+ "libsvm.svm_node",
+ "libsvm.svm_parameter",
+ // JDK types used in field declarations
+ "java.lang.String",
+ "java.lang.Number",
Review Comment:
We do have Integer and Double listed seperately. Any reason to still include
abstract java.lang.Number?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]