Author: jukka Date: Tue Nov 6 03:00:38 2007 New Revision: 592371 URL: http://svn.apache.org/viewvc?rev=592371&view=rev Log: TIKA-87 - MimeTypes should allow modification of MIME types - Streamlined pattern handling
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeType.java incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java incubator/tika/trunk/src/main/java/org/apache/tika/mime/Patterns.java incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java Modified: incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeType.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeType.java?rev=592371&r1=592370&r2=592371&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeType.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeType.java Tue Nov 6 03:00:38 2007 @@ -92,9 +92,6 @@ */ private final SortedSet<MimeType> subTypes = new TreeSet<MimeType>(); - /** The Mime-Type associated recognition patterns */ - private final Patterns patterns = new Patterns(); - /** The magics associated to this Mime-Type */ private final ArrayList<Magic> magics = new ArrayList<Magic>(); @@ -203,25 +200,6 @@ } /** - * Adds a file name pattern for this media type. - * - * @param pattern file name pattern - */ - public void addPattern(String pattern) { - registry.addPattern(this, pattern); - patterns.add(pattern, this); - } - - /** - * Returns the file name patterns for this media type. - * - * @return file name patterns - */ - public String[] getPatterns() { - return patterns.getPatterns(); - } - - /** * Returns the aliases of this media type. The returned set is * newly allocated and can be freely modified by the client. 
* @@ -297,10 +275,6 @@ public boolean hasMagic() { return (magics.size() > 0); - } - - public boolean matches(String url) { - return (patterns.matches(url) == this); } public boolean matchesMagic(byte[] data) { Modified: incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java?rev=592371&r1=592370&r2=592371&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java Tue Nov 6 03:00:38 2007 @@ -94,15 +94,19 @@ * * @param name * of the document to analyze. - * @return the Mime Content Type of the specified document name, or - * <code>null</code> if none is found. + * @return the Mime Content Type of the specified document name */ public MimeType getMimeType(String name) { - MimeType type = patterns.matches(name.toLowerCase()); - if (type != null) + MimeType type = patterns.matches(name); + if (type != null) { + return type; + } + type = patterns.matches(name.toLowerCase()); + if (type != null) { return type; - // if it's null here, then return the default type - return root; + } else { + return root; + } } /** @@ -307,13 +311,14 @@ } /** - * Adds a file name pattern for the given media type. This method should - * only be called from {@link MimeType#addPattern(String)}. + * Adds a file name pattern for the given media type. 
* * @param type media type * @param pattern file name pattern + * @throws MimeTypeException if the pattern conflicts with existing ones */ - void addPattern(MimeType type, String pattern) { + public void addPattern(MimeType type, String pattern) + throws MimeTypeException { patterns.add(pattern, type); } @@ -328,21 +333,6 @@ public int getMinLength() { return 1024; // return minLength; - } - - /** - * Add the specified mime-types in the repository. - * - * @param types - * are the mime-types to add. - */ - void add(MimeType[] types) { - if (types == null) { - return; - } - for (int i = 0; i < types.length; i++) { - add(types[i]); - } } /** Modified: incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java?rev=592371&r1=592370&r2=592371&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java Tue Nov 6 03:00:38 2007 @@ -159,7 +159,7 @@ type.setDescription( nodeElement.getFirstChild().getNodeValue()); } else if (nodeElement.getTagName().equals("glob")) { - type.addPattern(nodeElement.getAttribute("pattern")); + types.addPattern(type, nodeElement.getAttribute("pattern")); } else if (nodeElement.getTagName().equals("magic")) { readMagic(nodeElement, type); } else if (nodeElement.getTagName().equals("alias")) { Modified: incubator/tika/trunk/src/main/java/org/apache/tika/mime/Patterns.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/Patterns.java?rev=592371&r1=592370&r2=592371&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/mime/Patterns.java (original) +++ 
incubator/tika/trunk/src/main/java/org/apache/tika/mime/Patterns.java Tue Nov 6 03:00:38 2007 @@ -17,80 +17,100 @@ package org.apache.tika.mime; // JDK imports -import java.util.ArrayList; +import java.util.Comparator; import java.util.HashMap; import java.util.Map; +import java.util.SortedMap; +import java.util.TreeMap; /** * Defines a MimeType pattern. */ class Patterns { - private static Map<Character, String> escapeMap = - new HashMap<Character, String>(); + /** + * Index of exact name patterns. + */ + private final Map<String, MimeType> names = new HashMap<String, MimeType>(); - static { - escapeMap.put('\\', "\\\\"); - escapeMap.put('?', "\\?"); - escapeMap.put('[', "\\["); - escapeMap.put(']', "\\]"); - escapeMap.put('^', "\\^"); - escapeMap.put('.', "\\."); - escapeMap.put('-', "\\-"); - escapeMap.put('$', "\\$"); - escapeMap.put('+', "\\+"); - escapeMap.put('(', "\\("); - escapeMap.put(')', "\\)"); - escapeMap.put('{', "\\{"); - escapeMap.put('}', "\\}"); - escapeMap.put('|', "\\|"); - escapeMap.put('*', ".*"); - } + /** + * Index of extension patterns of the form "*extension". + */ + private final Map<String, MimeType> extensions = + new HashMap<String, MimeType>(); - /** Gathers all the patterns */ - private ArrayList<String> patterns = new ArrayList<String>(); + private int minExtensionLength = Integer.MAX_VALUE; - /** An index of exact matching patterns */ - private Map<String, MimeType> exactIdx = new HashMap<String, MimeType>(); + private int maxExtensionLength = 0; - /** An index of the patterns of the form "*.ext" */ - private Map<String, MimeType> extIdx = new HashMap<String, MimeType>(); + /** + * Index of generic glob patterns, sorted by length. 
+ */ + private final SortedMap<String, MimeType> globs = + new TreeMap<String, MimeType>(new Comparator<String>() { + public int compare(String a, String b) { + int diff = b.length() - a.length(); + if (diff == 0) { + diff = a.compareTo(b); + } + return diff; + } + }); + + public void add(String pattern, MimeType type) throws MimeTypeException { + assert pattern != null && type != null; - /** A list of other patterns */ - private Map<String, MimeType> others = new HashMap<String, MimeType>(); - - void add(String[] patterns, MimeType type) { - // Some preliminary checks - if ((patterns == null) || (type == null)) { - return; - } - // All is ok, so add the patterns - for (String pattern : patterns) { - add(pattern, type); + if (pattern.indexOf('*') == -1 + && pattern.indexOf('?') == -1 + && pattern.indexOf('[') == -1) { + addName(pattern, type); + } else if (pattern.startsWith("*") + && pattern.indexOf('*', 1) == -1 + && pattern.indexOf('?') == -1 + && pattern.indexOf('[') == -1) { + addExtension(pattern.substring(1), type); + } else { + addGlob(compile(pattern), type); } } - void add(String pattern, MimeType type) { - // Some preliminary checks - if ((pattern == null) || (type == null)) { - return; + private void addName(String name, MimeType type) throws MimeTypeException { + MimeType previous = names.get(name); + if (previous == null || previous.isDescendantOf(type)) { + names.put(name, type); + } else if (previous == type || type.isDescendantOf(previous)) { + // do nothing + } else { + throw new MimeTypeException("Conflicting name pattern: " + name); } + } - // Add the pattern in the good index - if ((pattern.indexOf('*') == -1) && (pattern.indexOf('?') == -1) - && (pattern.indexOf('[') == -1)) { - exactIdx.put(pattern, type); - } else if (pattern.startsWith("*.")) { - extIdx.put(pattern.substring(2), type); + private void addExtension(String extension, MimeType type) + throws MimeTypeException { + MimeType previous = extensions.get(extension); + if (previous == 
null || previous.isDescendantOf(type)) { + extensions.put(extension, type); + int length = extension.length(); + minExtensionLength = Math.min(minExtensionLength, length); + maxExtensionLength = Math.max(maxExtensionLength, length); + } else if (previous == type || type.isDescendantOf(previous)) { + // do nothing } else { - others.put(escape(pattern), type); + throw new MimeTypeException( + "Conflicting extension pattern: " + extension); } - // Add the pattern in the list of patterns - patterns.add(pattern); } - String[] getPatterns() { - return patterns.toArray(new String[patterns.size()]); + private void addGlob(String glob, MimeType type) + throws MimeTypeException { + MimeType previous = globs.get(glob); + if (previous == null || previous.isDescendantOf(type)) { + extensions.put(glob, type); + } else if (previous == type || type.isDescendantOf(previous)) { + // do nothing + } else { + throw new MimeTypeException("Conflicting glob pattern: " + glob); + } } /** @@ -107,78 +127,51 @@ * special characters (`*?[') are matched before other wildcarded patterns * (since this covers the majority of the patterns). */ - MimeType matches(String resourceName) { - - // Preliminary check... 
- if (resourceName == null) { - return null; - } + public MimeType matches(String name) { + assert name != null; // First, try exact match of the provided resource name - MimeType type = exactIdx.get(resourceName); - if (type != null) { - return type; - } - - // Then try exact match with only the resource name - String str = last(resourceName, '/'); - if (str != null) { - type = exactIdx.get(str); - if (type != null) { - return type; - } - } - str = last(resourceName, '\\'); - if (str != null) { - type = exactIdx.get(str); - if (type != null) { - return type; - } + if (names.containsKey(name)) { + return names.get(name); } // Then try "extension" (*.xxx) matching - int idx = resourceName.indexOf('.', 0); - while (idx != -1) { - type = extIdx.get(resourceName.substring(idx + 1)); - if (type != null) { - return type; + int maxLength = Math.min(maxExtensionLength, name.length()); + for (int n = maxLength; n >= minExtensionLength; n--) { + String extension = name.substring(name.length() - n); + if (extensions.containsKey(extension)) { + return extensions.get(extension); } - idx = resourceName.indexOf('.', idx + 1); } // And finally, try complex regexp matching - String longest = null; - for (String pattern : others.keySet()) { - if ((resourceName.matches(pattern)) - && (pattern.length() > longest.length())) { - longest = pattern; + for (Map.Entry<String, MimeType> entry : globs.entrySet()) { + if (name.matches(entry.getKey())) { + return entry.getValue(); } } - if (longest != null) { - type = others.get(longest); - } - return type; - } - private final static String last(String str, char c) { - if (str == null) { - return null; - } - int idx = str.lastIndexOf(c); - if ((idx < 0) || (idx >= (str.length() - 1))) { - return null; - } - return str.substring(idx + 1); + return null; } - private final static String escape(String str) { - StringBuffer result = new StringBuffer(str.length()); - for (int i = 0; i < str.length(); i++) { - String charAt = 
String.valueOf(str.charAt(i)); - String replace = escapeMap.get(charAt); - result.append((replace != null) ? replace : charAt); + private String compile(String glob) { + StringBuilder pattern = new StringBuilder(); + pattern.append("\\A"); + for (int i = 0; i < glob.length(); i++) { + char ch = glob.charAt(i); + if (ch == '?') { + pattern.append('.'); + } else if (ch == '*') { + pattern.append(".*"); + } else if ("\\[]^.-$+(){}|".indexOf(ch) != -1) { + pattern.append('\\'); + pattern.append(ch); + } else { + pattern.append(ch); + } } - return result.toString(); + pattern.append("\\z"); + return pattern.toString(); } } Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java?rev=592371&r1=592370&r2=592371&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java Tue Nov 6 03:00:38 2007 @@ -104,49 +104,45 @@ private MimeType getMimeType(InputStream stream, Metadata metadata) throws IOException { MimeTypes types = config.getMimeRepository(); - MimeType type = null; - // Get type based on metadata hint (if available) - String typename = metadata.get(Metadata.CONTENT_TYPE); - if (typename != null) { - try { - type = types.forName(typename); - } catch (MimeTypeException e) { - // Malformed type name, ignore - } - } - - // Get (or verify) type based on resourceName hint (if available) - String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY); - if (resourceName != null) { - MimeType match = types.getMimeType(resourceName); - if (match != null && (type == null || !type.matches(resourceName))) { - type = match; - } - } - - // Get (or verify) type based on magic prefix + // Get type based on magic prefix 
stream.mark(types.getMinLength()); try { byte[] prefix = getPrefix(stream, types.getMinLength()); - MimeType match = types.getMimeType(prefix); - if (match != null && (type == null || !type.matches(prefix))) { - type = match; + MimeType type = types.getMimeType(prefix); + if (type != null) { + return type; } } finally { stream.reset(); } - // Finally, use the default type if no matches found - if (type == null) { + // Get type based on resourceName hint (if available) + String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY); + if (resourceName != null) { + MimeType type = types.getMimeType(resourceName); + if (type != null) { + return type; + } + } + + // Get type based on metadata hint (if available) + String typename = metadata.get(Metadata.CONTENT_TYPE); + if (typename != null) { try { - type = types.forName(MimeTypes.DEFAULT); + return types.forName(typename); } catch (MimeTypeException e) { - // Should never happen + // Malformed type name, ignore } } - return type; + // Finally, use the default type if no matches found + try { + return types.forName(MimeTypes.DEFAULT); + } catch (MimeTypeException e) { + // Should never happen + return null; + } } /** Modified: incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml?rev=592371&r1=592370&r2=592371&view=diff ============================================================================== --- incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml (original) +++ incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml Tue Nov 6 03:00:38 2007 @@ -92,7 +92,6 @@ <glob pattern="*.rng" /> <glob pattern="*.rnx" /> <glob pattern="*.roles" /> - <glob pattern="*.rss" /> <glob pattern="*.sh" /> <glob pattern="*.sql" /> <glob pattern="*.svg" /> Modified: incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java?rev=592371&r1=592370&r2=592371&view=diff ============================================================================== --- incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java (original) +++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java Tue Nov 6 03:00:38 2007 @@ -49,7 +49,7 @@ * @param tp the parameters encapsulated in a TestParams instance * @throws IOException */ - private void assertAutoDetect(TestParams tp) throws IOException { + private void assertAutoDetect(TestParams tp) throws Exception { InputStream input = AutoDetectParserTest.class.getResourceAsStream(tp.resourceRealName); @@ -72,9 +72,6 @@ assertTrue("Expected content not found: " + tp, writer.toString().contains(tp.expectedContentFragment)); - } catch (Throwable t) { - fail("Test error asserting auto detect for parameters: " + t - + "\nParameters: " + tp); } finally { input.close(); }