janhoy commented on code in PR #4336:
URL: https://github.com/apache/solr/pull/4336#discussion_r3142577325
##########
solr/core/src/java/org/apache/solr/util/FileTypeMagicUtil.java:
##########
@@ -199,11 +202,63 @@ public static boolean isFileForbiddenInConfigset(byte[] bytes) {
"application/x-java-applet,application/zip,application/x-tar,text/x-shellscript")
.split(",")));
- private String guessTypeFallbackToOctetStream(ContentInfo contentInfo) {
- if (contentInfo == null) {
- return ContentType.OTHER.getMimeType();
- } else {
- return contentInfo.getContentType().getMimeType();
+ /**
+ * Detects JVM class files by the 0xCAFEBABE magic. Kotlin, Scala and Groovy use the same bytes.
+ */
+ private static boolean isJavaClass(byte[] b) {
+ return b.length >= 4
+ && (b[0] & 0xFF) == 0xCA
+ && (b[1] & 0xFF) == 0xFE
+ && (b[2] & 0xFF) == 0xBA
+ && (b[3] & 0xFF) == 0xBE;
+ }
+
+ /**
+ * Detects ZIP/JAR archives by the PK magic at offset 0. Handles three signatures: PK\x03\x04
+ * (local file header), PK\x05\x06 (empty archive), PK\x07\x08 (data descriptor).
+ *
+ * <p>Polyglot files (e.g. JPEG+ZIP) are not detected: their ZIP content is appended after the
+ * outer format's end marker and does not appear at offset 0.
+ */
+ private static boolean isZip(byte[] b) {
+ return b.length >= 4
+ && b[0] == 'P'
+ && b[1] == 'K'
+ && ((b[2] == 0x03 && b[3] == 0x04)
+ || (b[2] == 0x05 && b[3] == 0x06)
+ || (b[2] == 0x07 && b[3] == 0x08));
+ }
+
+ /**
+ * Detects TAR archives via the POSIX/GNU ustar magic at offset 257, and compressed archives
+ * (gzip, bzip2, xz) which commonly wrap tarballs. V7-format tars have no magic and are not
+ * detected, but they are essentially extinct. Plain compressed files without a tar payload are
+ * also blocked — they have no legitimate use in a Solr configset.
+ */
+ private static boolean isTar(byte[] b) {
+ if (b.length >= 262
+ && b[257] == 'u'
+ && b[258] == 's'
+ && b[259] == 't'
+ && b[260] == 'a'
+ && b[261] == 'r') {
+ return true;
}
+ if (b.length >= 2 && (b[0] & 0xFF) == 0x1F && (b[1] & 0xFF) == 0x8B)
return true; // gzip
+ if (b.length >= 3 && b[0] == 'B' && b[1] == 'Z' && b[2] == 'h') return
true; // bzip2
+ return b.length >= 6 // xz: FD 37 7A 58 5A 00
+ && (b[0] & 0xFF) == 0xFD
+ && b[1] == '7'
+ && b[2] == 'z'
+ && b[3] == 'X'
+ && b[4] == 'Z'
+ && b[5] == 0x00;
Review Comment:
I think it would be more correct to detect gzip, bzip2 and xz as individual
types and add them to the forbidden list.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]