janhoy commented on code in PR #4336:
URL: https://github.com/apache/solr/pull/4336#discussion_r3215363266
##########
solr/core/src/test/org/apache/solr/util/FileTypeMagicUtilTest.java:
##########
@@ -17,40 +17,251 @@
package org.apache.solr.util;
+import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.common.SolrException;
public class FileTypeMagicUtilTest extends SolrTestCaseJ4 {
public void testGuessMimeType() throws IOException {
- assertResourceMimeType("application/x-java-applet",
"/magic/HelloWorldJavaClass.class.bin");
- assertResourceMimeType("application/zip",
"/runtimecode/containerplugin.v.1.jar.bin");
- assertResourceMimeType("application/x-tar", "/magic/hello.tar.bin");
+ // Tests the InputStream code path for each format using inline magic
bytes, avoiding binary
+ // blobs in the repository. Text files are still tested via classpath
resources.
+ byte[] javaClass = {(byte) 0xCA, (byte) 0xFE, (byte) 0xBA, (byte) 0xBE, 0,
0, 0, 52};
+ assertStreamMimeType("application/x-java-applet", javaClass);
+
+ byte[] jar = {'P', 'K', 0x03, 0x04, 0, 0, 0, 0};
+ assertStreamMimeType("application/zip", jar);
+
+ byte[] tar = new byte[512];
+ tar[257] = 'u';
+ tar[258] = 's';
+ tar[259] = 't';
+ tar[260] = 'a';
+ tar[261] = 'r';
+ assertStreamMimeType("application/x-tar", tar);
+
+ // Shell scripts are plain text — safe to keep as a classpath resource.
assertResourceMimeType("text/x-shellscript", "/magic/shell.sh.txt");
}
+ public void testGuessMimeTypeBytes() {
+ // Empty / null
+ assertEquals("application/octet-stream",
FileTypeMagicUtil.INSTANCE.guessMimeType(new byte[0]));
+ assertFalse(FileTypeMagicUtil.isFileForbiddenInConfigset(new byte[0]));
+ assertFalse(FileTypeMagicUtil.isFileForbiddenInConfigset((byte[]) null));
+
+ // Java class: 0xCAFEBABE + version 52 (Java 8)
+ byte[] javaClass = {(byte) 0xCA, (byte) 0xFE, (byte) 0xBA, (byte) 0xBE, 0,
0, 0, 52};
+ assertEquals("application/x-java-applet",
FileTypeMagicUtil.INSTANCE.guessMimeType(javaClass));
+
+ // Java class: preview-compiled (minor=0xFFFF, major=61 / Java 17).
+ // A previous version had a signed-integer overflow that allowed these
through.
+ byte[] previewClass = {
+ (byte) 0xCA, (byte) 0xFE, (byte) 0xBA, (byte) 0xBE, (byte) 0xFF, (byte)
0xFF, 0, 61
+ };
+ assertEquals(
+ "application/x-java-applet",
FileTypeMagicUtil.INSTANCE.guessMimeType(previewClass));
+
+ // ZIP: PK\x03\x04
+ byte[] zip = {'P', 'K', 0x03, 0x04, 0, 0, 0, 0};
+ assertEquals("application/zip",
FileTypeMagicUtil.INSTANCE.guessMimeType(zip));
+
+ // ZIP: PK\x05\x06 (empty archive)
+ byte[] emptyZip = {'P', 'K', 0x05, 0x06, 0, 0, 0, 0};
+ assertEquals("application/zip",
FileTypeMagicUtil.INSTANCE.guessMimeType(emptyZip));
+
+ // ZIP: PK\x07\x08 (data-descriptor signature)
+ byte[] ddZip = {'P', 'K', 0x07, 0x08, 0, 0, 0, 0};
+ assertEquals("application/zip",
FileTypeMagicUtil.INSTANCE.guessMimeType(ddZip));
+
+ // gzip compressed file
+ byte[] gzip = {(byte) 0x1F, (byte) 0x8B, 0x08, 0x00};
+ assertEquals("application/gzip",
FileTypeMagicUtil.INSTANCE.guessMimeType(gzip));
+ assertTrue(FileTypeMagicUtil.isFileForbiddenInConfigset(gzip));
+
+ // bzip2 compressed file
+ byte[] bzip2 = {'B', 'Z', 'h', '9'};
+ assertEquals("application/x-bzip2",
FileTypeMagicUtil.INSTANCE.guessMimeType(bzip2));
+ assertTrue(FileTypeMagicUtil.isFileForbiddenInConfigset(bzip2));
+
+ // xz compressed file
+ byte[] xz = {(byte) 0xFD, '7', 'z', 'X', 'Z', 0x00};
+ assertEquals("application/x-xz",
FileTypeMagicUtil.INSTANCE.guessMimeType(xz));
+ assertTrue(FileTypeMagicUtil.isFileForbiddenInConfigset(xz));
+
+ // Shell scripts — various interpreter paths
+ assertShellScript("#!/bin/sh\necho hello\n");
+ assertShellScript("#!/usr/bin/env python3\nprint('hi')\n");
+ assertShellScript("#!/opt/homebrew/bin/python3\nprint('hi')\n");
+ assertShellScript("#! /bin/bash\necho hi\n");
+ assertShellScript("#!/nix/store/xxx-bash/bin/bash\necho hi\n");
+
+ // MZ: Windows EXE / self-extracting ZIP
+ byte[] mz = {'M', 'Z', 0, 0};
+ assertEquals("application/x-dosexec",
FileTypeMagicUtil.INSTANCE.guessMimeType(mz));
+ assertTrue(FileTypeMagicUtil.isFileForbiddenInConfigset(mz));
+
+ // ELF: Linux native binary
+ byte[] elf = {0x7F, 'E', 'L', 'F', 0x02, 0x01};
+ assertEquals("application/x-executable",
FileTypeMagicUtil.INSTANCE.guessMimeType(elf));
+ assertTrue(FileTypeMagicUtil.isFileForbiddenInConfigset(elf));
+
+ // Java serialized object
+ byte[] ser = {(byte) 0xAC, (byte) 0xED, 0x00, 0x05};
+ assertEquals(
+ "application/x-java-serialized-object",
FileTypeMagicUtil.INSTANCE.guessMimeType(ser));
+ assertTrue(FileTypeMagicUtil.isFileForbiddenInConfigset(ser));
+
+ // Mach-O: all four variants (32/64-bit, big/little-endian)
+ byte[] macho32be = {(byte) 0xFE, (byte) 0xED, (byte) 0xFA, (byte) 0xCE};
+ byte[] macho64be = {(byte) 0xFE, (byte) 0xED, (byte) 0xFA, (byte) 0xCF};
+ byte[] macho32le = {(byte) 0xCE, (byte) 0xFA, (byte) 0xED, (byte) 0xFE};
+ byte[] macho64le = {(byte) 0xCF, (byte) 0xFA, (byte) 0xED, (byte) 0xFE};
+ for (byte[] m : new byte[][] {macho32be, macho64be, macho32le, macho64le})
{
+ assertEquals("application/x-mach-binary",
FileTypeMagicUtil.INSTANCE.guessMimeType(m));
+ assertTrue(FileTypeMagicUtil.isFileForbiddenInConfigset(m));
+ }
+
+ // Plain text: not forbidden
+ assertEquals(
+ "application/octet-stream",
+ FileTypeMagicUtil.INSTANCE.guessMimeType("hello
world".getBytes(StandardCharsets.UTF_8)));
+ }
+
public void testIsFileForbiddenInConfigset() throws IOException {
- assertResourceForbiddenInConfigset("/magic/HelloWorldJavaClass.class.bin");
+ byte[] javaClass = {(byte) 0xCA, (byte) 0xFE, (byte) 0xBA, (byte) 0xBE, 0,
0, 0, 52};
+ assertTrue(FileTypeMagicUtil.isFileForbiddenInConfigset(new
ByteArrayInputStream(javaClass)));
+
+ // Text files are safe to keep as classpath resources.
assertResourceForbiddenInConfigset("/magic/shell.sh.txt");
assertResourceAllowedInConfigset("/magic/plain.txt");
}
+ public void testPolyglotZipNotDetectedFromHeaderOnly() {
+ // When only a leading chunk of bytes is available (no EOCD in the
window), a JPEG+ZIP polyglot
+ // is not detected — the JPEG magic at offset 0 wins and the result is
octet-stream.
+ // This is an accepted limitation for callers that supply only a file
header excerpt.
+ byte[] jpegMagic = {(byte) 0xFF, (byte) 0xD8, (byte) 0xFF, (byte) 0xE0};
+ assertEquals("application/octet-stream",
FileTypeMagicUtil.INSTANCE.guessMimeType(jpegMagic));
+ assertFalse(FileTypeMagicUtil.isFileForbiddenInConfigset(jpegMagic));
+ }
+
+ public void testPolyglotZipDetectedByTailScan() throws IOException {
+ // Build a JPEG+ZIP polyglot: JPEG magic at offset 0, minimal ZIP EOCD
appended at the tail.
+ // ZIP readers locate the archive by scanning backwards for the EOCD
signature (PK\x05\x06),
+ // regardless of what appears at offset 0, making such files genuine ZIP
archives.
+ byte[] jpegMagic = {(byte) 0xFF, (byte) 0xD8, (byte) 0xFF, (byte) 0xE0, 0,
0x10};
+ // Minimal valid EOCD: PK\x05\x06 + 18 zero bytes (no entries, no comment).
+ byte[] eocd = {'P', 'K', 0x05, 0x06, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0};
+ byte[] polyglot = concat(jpegMagic, eocd);
+
+ // Complete byte[] contains the EOCD — polyglot is detected and blocked in
both paths.
+ assertEquals("application/zip",
FileTypeMagicUtil.INSTANCE.guessMimeType(polyglot));
+ assertTrue(FileTypeMagicUtil.isFileForbiddenInConfigset(polyglot));
Review Comment:
I also added a test for a polyglot ZIP file, i.e. a valid JPEG file that is
also a valid ZIP (or JAR) file at the same time.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]