Copilot commented on code in PR #4336:
URL: https://github.com/apache/solr/pull/4336#discussion_r3142616733
##########
solr/core/src/java/org/apache/solr/util/FileTypeMagicUtil.java:
##########
@@ -196,14 +201,110 @@ public static boolean isFileForbiddenInConfigset(byte[]
bytes) {
Arrays.asList(
System.getProperty(
"solr.configset.upload.mimetypes.forbidden",
-
"application/x-java-applet,application/zip,application/x-tar,text/x-shellscript")
+
"application/x-java-applet,application/zip,application/x-tar,"
+ +
"application/gzip,application/x-bzip2,application/x-xz,"
+ +
"text/x-shellscript,application/x-dosexec,application/x-executable,"
+ +
"application/x-java-serialized-object,application/x-mach-binary")
.split(",")));
Review Comment:
`forbiddenTypes` is built by splitting the system property on commas without
trimming whitespace. If an operator sets
`solr.configset.upload.mimetypes.forbidden` with spaces after commas (a common
pattern), entries like " application/zip" won't match and forbidden types could
be unintentionally allowed. Consider trimming each token (and filtering
empties) when building the set.
##########
solr/core/src/java/org/apache/solr/util/FileTypeMagicUtil.java:
##########
@@ -196,14 +201,110 @@ public static boolean isFileForbiddenInConfigset(byte[]
bytes) {
Arrays.asList(
System.getProperty(
"solr.configset.upload.mimetypes.forbidden",
-
"application/x-java-applet,application/zip,application/x-tar,text/x-shellscript")
+
"application/x-java-applet,application/zip,application/x-tar,"
+ +
"application/gzip,application/x-bzip2,application/x-xz,"
+ +
"text/x-shellscript,application/x-dosexec,application/x-executable,"
+ +
"application/x-java-serialized-object,application/x-mach-binary")
.split(",")));
- private String guessTypeFallbackToOctetStream(ContentInfo contentInfo) {
- if (contentInfo == null) {
- return ContentType.OTHER.getMimeType();
- } else {
- return contentInfo.getContentType().getMimeType();
- }
+ /**
+ * Detects JVM class files by the 0xCAFEBABE magic. Kotlin, Scala and Groovy
use the same bytes.
+ */
+ private static boolean isJavaClass(byte[] b) {
+ return b.length >= 4
+ && (b[0] & 0xFF) == 0xCA
+ && (b[1] & 0xFF) == 0xFE
+ && (b[2] & 0xFF) == 0xBA
+ && (b[3] & 0xFF) == 0xBE;
+ }
+
+ /**
+ * Detects ZIP/JAR archives by the PK magic at offset 0. Handles three
signatures: PK\x03\x04
+ * (local file header), PK\x05\x06 (empty archive), PK\x07\x08 (data
descriptor).
+ *
+ * <p>Polyglot files (e.g. JPEG+ZIP) are not detected: their ZIP content is
appended after the
+ * outer format's end marker and does not appear at offset 0.
+ */
+ private static boolean isZip(byte[] b) {
+ return b.length >= 4
+ && b[0] == 'P'
+ && b[1] == 'K'
+ && ((b[2] == 0x03 && b[3] == 0x04)
+ || (b[2] == 0x05 && b[3] == 0x06)
+ || (b[2] == 0x07 && b[3] == 0x08));
+ }
+
+ /**
+ * Detects POSIX/GNU ustar TAR archives via the magic at offset 257.
V7-format tars have no magic
+ * and are not detected, but they are essentially extinct.
+ */
+ private static boolean isTar(byte[] b) {
+ return b.length >= 262
+ && b[257] == 'u'
+ && b[258] == 's'
+ && b[259] == 't'
+ && b[260] == 'a'
+ && b[261] == 'r';
+ }
+
+ /** Detects gzip compressed files (magic: 1F 8B). */
+ private static boolean isGzip(byte[] b) {
+ return b.length >= 2 && (b[0] & 0xFF) == 0x1F && (b[1] & 0xFF) == 0x8B;
+ }
+
+ /** Detects bzip2 compressed files (magic: "BZh"). */
+ private static boolean isBzip2(byte[] b) {
+ return b.length >= 3 && b[0] == 'B' && b[1] == 'Z' && b[2] == 'h';
+ }
+
+ /** Detects xz compressed files (magic: FD 37 7A 58 5A 00). */
+ private static boolean isXz(byte[] b) {
+ return b.length >= 6
+ && (b[0] & 0xFF) == 0xFD
+ && b[1] == '7'
+ && b[2] == 'z'
+ && b[3] == 'X'
+ && b[4] == 'Z'
+ && b[5] == 0x00;
+ }
+
+ /** Detects shell scripts by the shebang (#!) at the start of the file. */
Review Comment:
`isShellScript` currently classifies any file starting with a shebang (`#!`)
as `text/x-shellscript`, but the Javadoc says “shell or bash script”. Since
this will also match e.g. Python/Ruby/Perl scripts (as your tests demonstrate),
either tighten detection to shell-specific interpreters or update the
documentation/type description to reflect “shebang script” rather than “shell
script”.
```suggestion
/** Detects shebang-based scripts by the {@code #!} marker at the start of
the file. */
```
##########
solr/core/src/test/org/apache/solr/util/FileTypeMagicUtilTest.java:
##########
@@ -29,27 +30,119 @@ public void testGuessMimeType() throws IOException {
assertResourceMimeType("text/x-shellscript", "/magic/shell.sh.txt");
}
+ public void testGuessMimeTypeBytes() {
+ // Empty / null
+ assertEquals("application/octet-stream",
FileTypeMagicUtil.INSTANCE.guessMimeType(new byte[0]));
+ assertFalse(FileTypeMagicUtil.isFileForbiddenInConfigset(new byte[0]));
+
+ // Java class: 0xCAFEBABE + version 52 (Java 8)
+ byte[] javaClass = {(byte) 0xCA, (byte) 0xFE, (byte) 0xBA, (byte) 0xBE, 0,
0, 0, 52};
+ assertEquals("application/x-java-applet",
FileTypeMagicUtil.INSTANCE.guessMimeType(javaClass));
+
+ // Java class: preview-compiled (minor=0xFFFF, major=61 / Java 17).
+ // A previous version had a signed-integer overflow that allowed these
through.
+ byte[] previewClass = {
+ (byte) 0xCA, (byte) 0xFE, (byte) 0xBA, (byte) 0xBE, (byte) 0xFF, (byte)
0xFF, 0, 61
+ };
+ assertEquals(
+ "application/x-java-applet",
FileTypeMagicUtil.INSTANCE.guessMimeType(previewClass));
+
+ // ZIP: PK\x03\x04
+ byte[] zip = {'P', 'K', 0x03, 0x04, 0, 0, 0, 0};
+ assertEquals("application/zip",
FileTypeMagicUtil.INSTANCE.guessMimeType(zip));
+
+ // ZIP: PK\x05\x06 (empty archive)
+ byte[] emptyZip = {'P', 'K', 0x05, 0x06, 0, 0, 0, 0};
+ assertEquals("application/zip",
FileTypeMagicUtil.INSTANCE.guessMimeType(emptyZip));
+
+ // ZIP: PK\x07\x08 (data-descriptor signature)
+ byte[] ddZip = {'P', 'K', 0x07, 0x08, 0, 0, 0, 0};
+ assertEquals("application/zip",
FileTypeMagicUtil.INSTANCE.guessMimeType(ddZip));
+
+ // gzip compressed file
+ byte[] gzip = {(byte) 0x1F, (byte) 0x8B, 0x08, 0x00};
+ assertEquals("application/gzip",
FileTypeMagicUtil.INSTANCE.guessMimeType(gzip));
+
+ // bzip2 compressed file
+ byte[] bzip2 = {'B', 'Z', 'h', '9'};
+ assertEquals("application/x-bzip2",
FileTypeMagicUtil.INSTANCE.guessMimeType(bzip2));
+
+ // xz compressed file
+ byte[] xz = {(byte) 0xFD, '7', 'z', 'X', 'Z', 0x00};
+ assertEquals("application/x-xz",
FileTypeMagicUtil.INSTANCE.guessMimeType(xz));
+
+ // Shell scripts — various interpreter paths
+ assertShellScript("#!/bin/sh\necho hello\n");
+ assertShellScript("#!/usr/bin/env python3\nprint('hi')\n");
+ assertShellScript("#!/opt/homebrew/bin/python3\nprint('hi')\n");
+ assertShellScript("#! /bin/bash\necho hi\n");
+ assertShellScript("#!/nix/store/xxx-bash/bin/bash\necho hi\n");
+
+ // MZ: Windows EXE / self-extracting ZIP
+ byte[] mz = {'M', 'Z', 0, 0};
+ assertEquals("application/x-dosexec",
FileTypeMagicUtil.INSTANCE.guessMimeType(mz));
+
+ // ELF: Linux native binary
+ byte[] elf = {0x7F, 'E', 'L', 'F', 0x02, 0x01};
+ assertEquals("application/x-executable",
FileTypeMagicUtil.INSTANCE.guessMimeType(elf));
+
Review Comment:
The new detector adds many additional forbidden MIME types (gzip/bzip2/xz,
MZ/ELF/Mach-O, Java serialized, etc.), but the tests here only assert
`guessMimeType(...)` for them. Consider also asserting
`FileTypeMagicUtil.isFileForbiddenInConfigset(bytes)` is `true` for each of
these signatures so the default forbidden list and the detector logic can’t
drift apart unnoticed.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]