To be sure, I've added tests with byte-order marks, and this doesn't
influence the result:

    @Test
    public void testDetect() throws IOException {
        final Detector detector = new Tika().getDetector();
        final Metadata metadata = new Metadata();
        try (final InputStream in = new BufferedInputStream(new
ByteArrayInputStream("<data>42</data>".getBytes(StandardCharsets.US_ASCII))))
{
            assertEquals(MediaType.TEXT_PLAIN, detector.detect(in,
metadata).getBaseType());
        }
        try (final InputStream in = new BufferedInputStream(new
ByteArrayInputStream("<?xml?><data>42</data>".getBytes(StandardCharsets.US_ASCII))))
{
            assertEquals(MediaType.TEXT_PLAIN, detector.detect(in,
metadata).getBaseType());
        }
        try (final InputStream in = new BufferedInputStream(new
ByteArrayInputStream("<?xml
version='1.0'?><data>42</data>".getBytes(StandardCharsets.US_ASCII)))) {
            assertEquals(MediaType.APPLICATION_XML, detector.detect(in,
metadata).getBaseType());
        }
        try (final InputStream in = new BufferedInputStream(new
ByteArrayInputStream("\uFEFF<?xml
version='1.0'?><data>42</data>".getBytes(StandardCharsets.UTF_8)))) {
            // UTF-8 BOM, follweod by the '<' char:
            assertEquals(0xEF, in.read());
            assertEquals(0xBB, in.read());
            assertEquals(0xBF, in.read());
            assertEquals('<', in.read());
        }
        try (final InputStream in = new BufferedInputStream(new
ByteArrayInputStream("\uFEFF<data>42</data>".getBytes(StandardCharsets.UTF_8))))
{
            assertEquals(MediaType.TEXT_PLAIN, detector.detect(in,
metadata).getBaseType());
        }
        try (final InputStream in = new BufferedInputStream(new
ByteArrayInputStream("\uFEFF<?xml?><data>42</data>".getBytes(StandardCharsets.UTF_8))))
{
            assertEquals(MediaType.TEXT_PLAIN, detector.detect(in,
metadata).getBaseType());
        }
        try (final InputStream in = new BufferedInputStream(new
ByteArrayInputStream("\uFEFF<?xml
version='1.0'?><data>42</data>".getBytes(StandardCharsets.UTF_8)))) {
            assertEquals(MediaType.APPLICATION_XML, detector.detect(in,
metadata).getBaseType());
        }
    }

–John

Reply via email to