janhoy commented on code in PR #4336:
URL: https://github.com/apache/solr/pull/4336#discussion_r3142577325


##########
solr/core/src/java/org/apache/solr/util/FileTypeMagicUtil.java:
##########
@@ -199,11 +202,63 @@ public static boolean isFileForbiddenInConfigset(byte[] 
bytes) {
                       
"application/x-java-applet,application/zip,application/x-tar,text/x-shellscript")
                   .split(",")));
 
-  private String guessTypeFallbackToOctetStream(ContentInfo contentInfo) {
-    if (contentInfo == null) {
-      return ContentType.OTHER.getMimeType();
-    } else {
-      return contentInfo.getContentType().getMimeType();
+  /**
+   * Detects JVM class files by the 0xCAFEBABE magic. Kotlin, Scala and Groovy 
use the same bytes.
+   */
+  private static boolean isJavaClass(byte[] b) {
+    return b.length >= 4
+        && (b[0] & 0xFF) == 0xCA
+        && (b[1] & 0xFF) == 0xFE
+        && (b[2] & 0xFF) == 0xBA
+        && (b[3] & 0xFF) == 0xBE;
+  }
+
+  /**
+   * Detects ZIP/JAR archives by the PK magic at offset 0. Handles three 
signatures: PK\x03\x04
+   * (local file header), PK\x05\x06 (empty archive), PK\x07\x08 (data 
descriptor).
+   *
+   * <p>Polyglot files (e.g. JPEG+ZIP) are not detected: their ZIP content is 
appended after the
+   * outer format's end marker and does not appear at offset 0.
+   */
+  private static boolean isZip(byte[] b) {
+    return b.length >= 4
+        && b[0] == 'P'
+        && b[1] == 'K'
+        && ((b[2] == 0x03 && b[3] == 0x04)
+            || (b[2] == 0x05 && b[3] == 0x06)
+            || (b[2] == 0x07 && b[3] == 0x08));
+  }
+
+  /**
+   * Detects TAR archives via the POSIX/GNU ustar magic at offset 257, and 
compressed archives
+   * (gzip, bzip2, xz) which commonly wrap tarballs. V7-format tars have no 
magic and are not
+   * detected, but they are essentially extinct. Plain compressed files 
without a tar payload are
+   * also blocked — they have no legitimate use in a Solr configset.
+   */
+  private static boolean isTar(byte[] b) {
+    if (b.length >= 262
+        && b[257] == 'u'
+        && b[258] == 's'
+        && b[259] == 't'
+        && b[260] == 'a'
+        && b[261] == 'r') {
+      return true;
     }
+    if (b.length >= 2 && (b[0] & 0xFF) == 0x1F && (b[1] & 0xFF) == 0x8B) 
return true; // gzip
+    if (b.length >= 3 && b[0] == 'B' && b[1] == 'Z' && b[2] == 'h') return 
true; // bzip2
+    return b.length >= 6 // xz: FD 37 7A 58 5A 00
+        && (b[0] & 0xFF) == 0xFD
+        && b[1] == '7'
+        && b[2] == 'z'
+        && b[3] == 'X'
+        && b[4] == 'Z'
+        && b[5] == 0x00;

Review Comment:
   I think it would be more correct to detect gzip, bzip2 and xz as individual 
types and add each of them to the forbidden list, rather than folding them into `isTar`.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to