Author: nick
Date: Thu Nov 27 13:42:49 2014
New Revision: 1642152
URL: http://svn.apache.org/r1642152
Log:
TIKA-1487 Based on the file format docs from OpenOffice, add detection and MIME
types for the older Excel 2, 3 and 4 pre-OLE2 formats
Modified:
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
Modified:
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1642152&r1=1642151&r2=1642152&view=diff
==============================================================================
---
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
(original)
+++
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
Thu Nov 27 13:42:49 2014
@@ -1395,7 +1395,6 @@
<match value="Foglio\ di\ lavoro\ Microsoft\ Exce" type="string"
offset="2080"/>
<match value="Biff5" type="string" offset="2114"/>
<match value="Biff5" type="string" offset="2121"/>
- <match value="\x09\x04\x06\x00\x00\x00\x10\x00" type="string" offset="0"/>
<match value="0xd0cf11e0a1b11ae1" type="string" offset="0:8">
<match value="W\x00o\x00r\x00k\x00b\x00o\x00o\x00k" type="string"
offset="1152:4096" />
</match>
@@ -1429,6 +1428,60 @@
<sub-class-of type="application/x-tika-ooxml"/>
</mime-type>
+ <mime-type type="application/vnd.ms-excel.sheet.4">
+ <_comment>Microsoft Excel 4 Worksheet</_comment>
+ <magic priority="60">
+ <match value="0x09040600" type="string" offset="0">
+ <match value="0x00001000" type="string" offset="4"/> <!-- Sheet -->
+ <match value="0x00002000" type="string" offset="4"/> <!-- Chart -->
+ <match value="0x00004000" type="string" offset="4"/> <!-- Macro -->
+ </match>
+ </magic>
+ <sub-class-of type="application/x-tika-old-excel"/>
+ </mime-type>
+ <mime-type type="application/vnd.ms-excel.workspace.4">
+ <_comment>Microsoft Excel 4 Workspace</_comment>
+ <magic priority="60">
+ <match value="0x09040600" type="string" offset="0">
+ <match value="0x00000001" type="string" offset="4"/>
+ </match>
+ </magic>
+ <sub-class-of type="application/x-tika-old-excel"/>
+ </mime-type>
+
+ <mime-type type="application/vnd.ms-excel.sheet.3">
+ <_comment>Microsoft Excel 3 Worksheet</_comment>
+ <magic priority="60">
+ <match value="0x09020600" type="string" offset="0">
+ <match value="0x00001000" type="string" offset="4"/> <!-- Sheet -->
+ <match value="0x00002000" type="string" offset="4"/> <!-- Chart -->
+ <match value="0x00004000" type="string" offset="4"/> <!-- Macro -->
+ </match>
+ </magic>
+ <sub-class-of type="application/x-tika-old-excel"/>
+ </mime-type>
+ <mime-type type="application/vnd.ms-excel.workspace.3">
+ <_comment>Microsoft Excel 3 Workspace</_comment>
+ <magic priority="60">
+ <match value="0x09020600" type="string" offset="0">
+ <match value="0x00000001" type="string" offset="4"/>
+ </match>
+ </magic>
+ <sub-class-of type="application/x-tika-old-excel"/>
+ </mime-type>
+
+ <mime-type type="application/vnd.ms-excel.sheet.2">
+ <_comment>Microsoft Excel 2 Worksheet</_comment>
+ <magic priority="60">
+ <match value="0x09000400" type="string" offset="0">
+ <match value="0x00001000" type="string" offset="4"/> <!-- Sheet -->
+ <match value="0x00002000" type="string" offset="4"/> <!-- Chart -->
+ <match value="0x00004000" type="string" offset="4"/> <!-- Macro -->
+ </match>
+ </magic>
+ <sub-class-of type="application/x-tika-old-excel"/>
+ </mime-type>
+
<mime-type type="application/vnd.ms-fontobject">
<glob pattern="*.eot"/>
</mime-type>
@@ -3399,6 +3452,10 @@
</magic>
</mime-type>
+ <mime-type type="application/x-tika-old-excel">
+ <_comment>Pre-OLE2 (Old) Microsoft Excel Worksheets</_comment>
+ </mime-type>
+
<!-- =================================================================== -->
<!-- Office Open XML file formats -->
<!-- http://www.ecma-international.org/publications/standards/Ecma-376.htm
-->
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=1642152&r1=1642151&r2=1642152&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
(original)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
Thu Nov 27 13:42:49 2014
@@ -232,6 +232,28 @@ public class TestMimeTypes {
}
/**
+ * Files from Excel 2 through 4 are based on the BIFF record
+ * structure, but without a wrapping OLE2 structure.
+ * Excel 5 and Excel 95+ work on OLE2
+ */
+ @Test
+ public void testOldExcel() throws Exception {
+ // With just a name, we'll think everything's a new Excel file
+ assertTypeByName("application/vnd.ms-excel","testEXCEL_4.xls");
+ assertTypeByName("application/vnd.ms-excel","testEXCEL_5.xls");
+ assertTypeByName("application/vnd.ms-excel","testEXCEL_95.xls");
+
+ // With data, we can work out if it's old or new style
+ assertTypeByData("application/vnd.ms-excel.sheet.4","testEXCEL_4.xls");
+ assertTypeByData("application/x-tika-msoffice","testEXCEL_5.xls");
+ assertTypeByData("application/x-tika-msoffice","testEXCEL_95.xls");
+
+ assertTypeByNameAndData("application/vnd.ms-excel.sheet.4","testEXCEL_4.xls");
+ assertTypeByNameAndData("application/vnd.ms-excel","testEXCEL_5.xls");
+ assertTypeByNameAndData("application/vnd.ms-excel","testEXCEL_95.xls");
+ }
+
+ /**
* Note - detecting container formats by mime magic is very very
* iffy, as we can't be sure where things will end up.
* People really ought to use the container aware detection...