Author: jukka Date: Mon May 5 11:15:10 2008 New Revision: 653548 URL: http://svn.apache.org/viewvc?rev=653548&view=rev Log: TIKA-87: MimeTypes should allow modification of MIME types TIKA-89: Rename MimeType and MimeTypes - Trying to decouple the MIME type registry from Tika configuration - Work in progress
Added: incubator/tika/trunk/src/main/java/org/apache/tika/mime/MediaType.java incubator/tika/trunk/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java incubator/tika/trunk/src/test/java/org/apache/tika/mime/MediaTypeTest.java Added: incubator/tika/trunk/src/main/java/org/apache/tika/mime/MediaType.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/MediaType.java?rev=653548&view=auto ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/mime/MediaType.java (added) +++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/MediaType.java Mon May 5 11:15:10 2008 @@ -0,0 +1,196 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.mime; + +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.SortedMap; +import java.util.TreeMap; +import java.util.regex.Pattern; + +/** + * Internet media type. 
/**
 * Internet media type.
 * <p>
 * A media type consists of a type, a subtype and an optional set of
 * parameters. The type, the subtype and all parameter names are trimmed
 * and converted to lower case when an instance is created; parameter
 * values are stored exactly as given. Parameters are kept sorted by name.
 */
public final class MediaType {

    private static final Map<String, String> NO_PARAMETERS =
        Collections.emptyMap();

    /** Characters that are backslash-escaped inside a quoted parameter value. */
    private static final Pattern SPECIAL =
        Pattern.compile("[\\(\\)<>@,;:\\\\\"/\\[\\]\\?=]");

    /** Characters whose presence forces a parameter value to be quoted. */
    private static final Pattern SPECIAL_OR_WHITESPACE =
        Pattern.compile("[\\(\\)<>@,;:\\\\\"/\\[\\]\\?=\\s]");

    public static final MediaType OCTET_STREAM =
        new MediaType("application", "octet-stream", NO_PARAMETERS);

    public static final MediaType TEXT_PLAIN =
        new MediaType("text", "plain", NO_PARAMETERS);

    public static final MediaType APPLICATION_XML =
        new MediaType("application", "xml", NO_PARAMETERS);

    /**
     * Parses the given string to a media type. The string is expected
     * to be of the form "type/subtype(; parameter=...)*" as defined
     * in RFC 2045.
     * <p>
     * Note that currently this method only parses the "type/subtype" part
     * of the string. Any parameters are simply discarded.
     *
     * @param string media type string to be parsed
     * @return parsed media type
     * @throws IllegalArgumentException if the string is <code>null</code>,
     *         has no slash before the first semicolon, or has an empty
     *         type or subtype
     */
    public static MediaType parse(String string) {
        if (string == null) {
            throw new IllegalArgumentException("Invalid media type: null");
        }

        // Cut the parameter section off first, so that a slash inside a
        // parameter value is never mistaken for the type/subtype separator.
        int semicolon = string.indexOf(';');
        String base = string;
        if (semicolon != -1) {
            base = string.substring(0, semicolon);
        }

        int slash = base.indexOf('/');
        if (slash == -1) {
            throw new IllegalArgumentException("Invalid media type: " + string);
        }

        String type = base.substring(0, slash).trim();
        String subtype = base.substring(slash + 1).trim();
        if (type.length() == 0 || subtype.length() == 0) {
            throw new IllegalArgumentException("Invalid media type: " + string);
        }

        return new MediaType(type, subtype);
    }

    /**
     * Trims the given token and converts it to lower case. A fixed locale
     * is used so that the result does not depend on the default locale of
     * the JVM (under a Turkish default locale an upper case "I" would
     * otherwise be turned into a dotless "i").
     */
    private static String normalize(String token) {
        return token.trim().toLowerCase(java.util.Locale.ENGLISH);
    }

    private final String type;

    private final String subtype;

    private final SortedMap<String, String> parameters;

    /**
     * Creates a media type with the given type, subtype and parameters.
     * The given parameter map is copied.
     *
     * @param type top-level type, for example "text"
     * @param subtype subtype, for example "plain"
     * @param parameters parameter names mapped to parameter values
     */
    public MediaType(
            String type, String subtype, Map<String, String> parameters) {
        this.type = normalize(type);
        this.subtype = normalize(subtype);
        this.parameters = new TreeMap<String, String>();
        for (Map.Entry<String, String> entry : parameters.entrySet()) {
            this.parameters.put(normalize(entry.getKey()), entry.getValue());
        }
    }

    /**
     * Creates a media type without parameters.
     *
     * @param type top-level type, for example "text"
     * @param subtype subtype, for example "plain"
     */
    public MediaType(String type, String subtype) {
        this(type, subtype, NO_PARAMETERS);
    }

    /**
     * Returns a map containing the entries of both given maps. Entries
     * of the second map take precedence. One of the arguments is returned
     * as-is when the other one is empty.
     */
    private static Map<String, String> union(
            Map<String, String> a, Map<String, String> b) {
        if (a.isEmpty()) {
            return b;
        } else if (b.isEmpty()) {
            return a;
        } else {
            Map<String, String> union = new HashMap<String, String>();
            union.putAll(a);
            union.putAll(b);
            return union;
        }
    }

    /**
     * Creates a media type with the type, subtype and parameters of the
     * given media type, extended with the given extra parameters. An extra
     * parameter replaces a parameter of the same name in the given type.
     *
     * @param type media type to extend
     * @param parameters additional parameters
     */
    public MediaType(MediaType type, Map<String, String> parameters) {
        this(type.type, type.subtype, union(type.parameters, parameters));
    }

    /**
     * Returns this media type without any parameters.
     *
     * @return this instance if it has no parameters, otherwise a new
     *         parameterless instance with the same type and subtype
     */
    public MediaType getBaseType() {
        if (parameters.isEmpty()) {
            return this;
        } else {
            return new MediaType(type, subtype);
        }
    }

    /** Returns the lower case top-level type, for example "text". */
    public String getType() {
        return type;
    }

    /** Returns the lower case subtype, for example "plain". */
    public String getSubtype() {
        return subtype;
    }

    /** Returns an unmodifiable view of the parameters, sorted by name. */
    public Map<String, String> getParameters() {
        return Collections.unmodifiableMap(parameters);
    }

    /**
     * Checks whether this media type is a specialization of the given one.
     * The rules are evaluated in this order:
     * <ol>
     *   <li>everything specializes application/octet-stream</li>
     *   <li>types with a different top-level type never match</li>
     *   <li>this type must carry all parameters of the given type</li>
     *   <li>any remaining type specializes text/plain</li>
     *   <li>any remaining "+xml" subtype specializes application/xml</li>
     *   <li>otherwise the subtypes must be equal</li>
     * </ol>
     *
     * @param that the potentially more generic media type
     * @return <code>true</code> if this type specializes the given type
     */
    public boolean isSpecializationOf(MediaType that) {
        if (OCTET_STREAM.equals(that)) {
            return true;
        } else if (!type.equals(that.type)) {
            return false;
        } else if (!parameters.entrySet().containsAll(that.parameters.entrySet())) {
            return false;
        } else if (TEXT_PLAIN.equals(that.getBaseType())) {
            return true;
        } else if (APPLICATION_XML.equals(that.getBaseType())
                && subtype.endsWith("+xml")) {
            return true;
        } else {
            return subtype.equals(that.subtype);
        }
    }

    /**
     * Returns the string form "type/subtype; name=value; ..." with the
     * parameters in name order. A value that contains whitespace or a
     * special character is enclosed in double quotes, and every special
     * character inside it is preceded by a backslash.
     */
    @Override
    public String toString() {
        StringBuilder builder = new StringBuilder();
        builder.append(type);
        builder.append('/');
        builder.append(subtype);
        for (Map.Entry<String, String> entry : parameters.entrySet()) {
            builder.append("; ");
            builder.append(entry.getKey());
            builder.append("=");
            String value = entry.getValue();
            if (SPECIAL_OR_WHITESPACE.matcher(value).find()) {
                builder.append('"');
                // "\\\\$0" is the replacement "\$0": a literal backslash
                // followed by the matched special character.
                builder.append(SPECIAL.matcher(value).replaceAll("\\\\$0"));
                builder.append('"');
            } else {
                builder.append(value);
            }
        }
        return builder.toString();
    }

    /** Two media types are equal when type, subtype and parameters match. */
    @Override
    public boolean equals(Object object) {
        if (object instanceof MediaType) {
            MediaType that = (MediaType) object;
            return type.equals(that.type)
                && subtype.equals(that.subtype)
                && parameters.equals(that.parameters);
        } else {
            return false;
        }
    }

    @Override
    public int hashCode() {
        int hash = 17;
        hash = hash * 31 + type.hashCode();
        hash = hash * 31 + subtype.hashCode();
        hash = hash * 31 + parameters.hashCode();
        return hash;
    }

}
+ */ +public final class MediaTypeRegistry { + + private final Map<MediaType, MediaType> aliases = + new HashMap<MediaType, MediaType>(); + + public void addAlias(MediaType canonical, MediaType alias) { + aliases.put(alias, canonical); + } + + public MediaType unalias(MediaType type) { + MediaType canonical = aliases.get(type.getBaseType()); + if (canonical != null) { + return new MediaType(canonical, type.getParameters()); + } else { + return type; + } + } + +} Added: incubator/tika/trunk/src/test/java/org/apache/tika/mime/MediaTypeTest.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/mime/MediaTypeTest.java?rev=653548&view=auto ============================================================================== --- incubator/tika/trunk/src/test/java/org/apache/tika/mime/MediaTypeTest.java (added) +++ incubator/tika/trunk/src/test/java/org/apache/tika/mime/MediaTypeTest.java Mon May 5 11:15:10 2008 @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.mime; + +import java.util.HashMap; +import java.util.Map; + +import junit.framework.TestCase; + +public class MediaTypeTest extends TestCase { + + public void testBasics() { + assertEquals( + "application/octet-stream", + new MediaType("application", "octet-stream").toString()); + + assertEquals( + "text/plain", + new MediaType("text", "plain").toString()); + + Map<String, String> parameters = new HashMap<String, String>(); + assertEquals( + "text/plain", + new MediaType("text", "plain", parameters).toString()); + + parameters.put("charset", "UTF-8"); + assertEquals( + "text/plain; charset=UTF-8", + new MediaType("text", "plain", parameters).toString()); + + parameters.put("x-eol-style", "crlf"); + assertEquals( + "text/plain; charset=UTF-8; x-eol-style=crlf", + new MediaType("text", "plain", parameters).toString()); + } + + public void testLowerCase() { + assertEquals( + "text/plain", + new MediaType("TEXT", "PLAIN").toString()); + assertEquals( + "text/plain", + new MediaType("Text", "Plain").toString()); + + Map<String, String> parameters = new HashMap<String, String>(); + assertEquals( + "text/plain", + new MediaType("text", "PLAIN", parameters).toString()); + + parameters.put("CHARSET", "UTF-8"); + assertEquals( + "text/plain; charset=UTF-8", + new MediaType("TEXT", "plain", parameters).toString()); + + parameters.put("X-Eol-Style", "crlf"); + assertEquals( + "text/plain; charset=UTF-8; x-eol-style=crlf", + new MediaType("TeXt", "PlAiN", parameters).toString()); + } + + public void testTrim() { + assertEquals( + "text/plain", + new MediaType(" text ", " plain ").toString()); + assertEquals( + "text/plain", + new MediaType("\ttext", "plain\t").toString()); + + Map<String, String> parameters = new HashMap<String, String>(); + assertEquals( + "text/plain", + new MediaType("text\r\n", " \tplain", parameters).toString()); + + parameters.put(" charset", "UTF-8"); + assertEquals( + "text/plain; charset=UTF-8", + new MediaType("\n\ntext", 
"plain \r", parameters).toString()); + + parameters.put("\r\n\tx-eol-style \t", "crlf"); + assertEquals( + "text/plain; charset=UTF-8; x-eol-style=crlf", + new MediaType(" text", "\tplain ", parameters).toString()); + } + + public void testQuote() { + Map<String, String> parameters = new HashMap<String, String>(); + parameters.put("a", " value with spaces "); + parameters.put("b", "text/plain"); + parameters.put("c", "()<>@,;:\\\"/[]?="); + assertEquals( + "text/plain; a=\" value with spaces \"; b=\"text\\/plain\"" + + "; c=\"\\(\\)\\<\\>[EMAIL PROTECTED],\\;\\:\\\\\\\"\\/\\[\\]\\?\\=\"", + new MediaType("text", "plain", parameters).toString()); + } + + +}