Repository: tika Updated Branches: refs/heads/master bfd1d9139 -> 8e819c3ca
TIKA-2122 : add all headers from MSG and RFC822 files Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/8e819c3c Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/8e819c3c Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/8e819c3c Branch: refs/heads/master Commit: 8e819c3caf3ff3b0492f600b4193d1b3ee74f51b Parents: bfd1d91 Author: tballison <[email protected]> Authored: Mon Oct 17 14:10:46 2016 -0400 Committer: tballison <[email protected]> Committed: Mon Oct 17 14:10:46 2016 -0400 ---------------------------------------------------------------------- .../java/org/apache/tika/metadata/Message.java | 6 ++ .../src/test/java/org/apache/tika/TikaTest.java | 8 ++ .../tika/parser/mail/MailContentHandler.java | 5 ++ .../tika/parser/microsoft/OutlookExtractor.java | 87 +++++++++++++++++++- .../tika/parser/mail/RFC822ParserTest.java | 1 + .../parser/microsoft/OutlookParserTest.java | 15 ++++ 6 files changed, 121 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/8e819c3c/tika-core/src/main/java/org/apache/tika/metadata/Message.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Message.java b/tika-core/src/main/java/org/apache/tika/metadata/Message.java index ffb9413..dad3952 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/Message.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/Message.java @@ -16,10 +16,16 @@ */ package org.apache.tika.metadata; +import org.apache.tika.Tika; + /** * A collection of Message related property names. */ public interface Message { + String MESSAGE_PREFIX = "Message"+ Metadata.NAMESPACE_PREFIX_DELIMITER; + + String MESSAGE_RAW_HEADER_PREFIX = MESSAGE_PREFIX+"Raw-Header"+Metadata.NAMESPACE_PREFIX_DELIMITER; + String MESSAGE_RECIPIENT_ADDRESS = "Message-Recipient-Address"; String MESSAGE_FROM = "Message-From"; http://git-wip-us.apache.org/repos/asf/tika/blob/8e819c3c/tika-core/src/test/java/org/apache/tika/TikaTest.java ---------------------------------------------------------------------- diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java index 690db33..0bc5a83 100644 --- a/tika-core/src/test/java/org/apache/tika/TikaTest.java +++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java @@ -296,4 +296,12 @@ public abstract class TikaTest { i++; } } + + public static void debug(Metadata metadata) { + for (String n : metadata.names()) { + for (String v : metadata.getValues(n)) { + System.out.println(n + " : "+v); + } + } + } } http://git-wip-us.apache.org/repos/asf/tika/blob/8e819c3c/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java index 6a9bc1b..60170e6 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java @@ -51,6 +51,7 @@ import org.apache.tika.config.TikaConfig; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Message; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.AutoDetectParser; @@ -238,6 +239,7 @@ class MailContentHandler implements ContentHandler { try { String fieldname = field.getName(); + ParsedField parsedField = LenientFieldParser.getParser().parse( field, DecodeMonitor.SILENT); if (fieldname.equalsIgnoreCase("From")) { @@ -276,6 +278,9 @@ class MailContentHandler implements ContentHandler { date = tryOtherDateFormats(field.getBody()); } metadata.set(TikaCoreProperties.CREATED, date); + } else { + metadata.add(Metadata.MESSAGE_RAW_HEADER_PREFIX+parsedField.getName(), + field.getBody()); } } catch (RuntimeException me) { if (strictParsing) { http://git-wip-us.apache.org/repos/asf/tika/blob/8e819c3c/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java index c1db274..76ac17f 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java @@ -25,13 +25,19 @@ import java.nio.charset.Charset; import java.nio.charset.IllegalCharsetNameException; import java.nio.charset.UnsupportedCharsetException; import java.text.ParseException; +import java.util.ArrayList; import java.util.Date; +import java.util.LinkedHashMap; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.apache.james.mime4j.codec.DecodeMonitor; +import org.apache.james.mime4j.codec.DecoderUtil; +import org.apache.james.mime4j.dom.field.ParsedField; +import org.apache.james.mime4j.field.LenientFieldParser; import org.apache.poi.hmef.attribute.MAPIRtfAttribute; import org.apache.poi.hsmf.MAPIMessage; import org.apache.poi.hsmf.datatypes.AttachmentChunks; @@ -66,6 +72,18 @@ import org.xml.sax.SAXException; * Outlook Message Parser. */ public class OutlookExtractor extends AbstractPOIFSExtractor { + + + private static Pattern HEADER_KEY_PAT = + Pattern.compile("\\A([\\x21-\\x39\\x3B-\\x7E]+):(.*?)\\Z"); + //this according to the spec; in practice, it is probably more likely + //that a "split field" fails to start with a space character than + //that a real header contains anything but [-_A-Za-z0-9]. + //e.g. + //header: this header goes onto the next line + //<mailto:[email protected]... + + private static final Metadata EMPTY_METADATA = new Metadata(); HtmlEncodingDetector detector = new HtmlEncodingDetector(); @@ -118,8 +136,19 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { } } catch (ChunkNotFoundException he) { } // Will be fixed in POI 3.7 Final + try { + Map<String, String[]> headers = normalizeHeaders(msg.getHeaders()); + for (Map.Entry<String, String[]> e : headers.entrySet()) { + String headerKey = e.getKey(); + for (String headerValue : e.getValue()) { + metadata.add(Metadata.MESSAGE_RAW_HEADER_PREFIX+headerKey, headerValue); + } + } + } catch (ChunkNotFoundException e) { + + } - // Date - try two ways to find it + // Date - try two ways to find it // First try via the proper chunk if (msg.getMessageDate() != null) { metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime()); @@ -264,6 +293,62 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { } } + //As of 3.15, POI currently returns header[] by splitting on /\r?\n/ + //this rebuilds headers that are broken up over several lines + //this also decodes encoded headers. + private Map<String, String[]> normalizeHeaders(String[] rows) { + Map<String, String[]> ret = new LinkedHashMap<>(); + if (rows == null) { + return ret; + } + StringBuilder sb = new StringBuilder(); + Map<String, List<String>> headers = new LinkedHashMap(); + Matcher headerKeyMatcher = HEADER_KEY_PAT.matcher(""); + String lastKey = null; + int consec = 0; + for (String row : rows) { + headerKeyMatcher.reset(row); + if (headerKeyMatcher.find()) { + if (lastKey != null) { + List<String> vals = headers.get(lastKey); + vals = (vals == null) ? new ArrayList<String>() : vals; + vals.add(decodeHeader(sb.toString())); + headers.put(lastKey, vals); + } + //reset sb + sb.setLength(0); + lastKey = headerKeyMatcher.group(1).trim(); + sb.append(headerKeyMatcher.group(2).trim()); + consec = 0; + } else { + if (consec > 0) { + sb.append("\n"); + } + sb.append(row); + } + consec++; + } + + //make sure to add the last value + if (sb.length() > 0 && lastKey != null) { + List<String> vals = headers.get(lastKey); + vals = (vals == null) ? new ArrayList<String>() : vals; + vals.add(decodeHeader(sb.toString())); + headers.put(lastKey, vals); + } + + //convert to array + for (Map.Entry<String, List<String>> e : headers.entrySet()) { + ret.put(e.getKey(), e.getValue().toArray(new String[e.getValue().size()])); + } + return ret; + + } + + private String decodeHeader(String header) { + return DecoderUtil.decodeEncodedWords(header, DecodeMonitor.SILENT); + } + private void header(XHTMLContentHandler xhtml, String key, String value) throws SAXException { if (value != null && value.length() > 0) { http://git-wip-us.apache.org/repos/asf/tika/blob/8e819c3c/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java index c7fcbfb..035b1c2 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java @@ -367,6 +367,7 @@ public class RFC822ParserTest extends TikaTest { assertContains("TEST DATA FOR TIKA.", handler.toString()); assertContains("This is text inside an unencrypted zip file", handler.toString()); assertContains("TIKA-1028", handler.toString()); + assertEquals("<[email protected]>", metadata.get("Message:Raw-Header:Return-Path")); } /** http://git-wip-us.apache.org/repos/asf/tika/blob/8e819c3c/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java index 8662e65..c15308f 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java @@ -26,6 +26,7 @@ import javax.xml.transform.sax.TransformerHandler; import javax.xml.transform.stream.StreamResult; import java.io.InputStream; import java.io.StringWriter; +import java.util.Arrays; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -71,6 +72,12 @@ public class OutlookParserTest extends TikaTest { "L'\u00C9quipe Microsoft Outlook Express", metadata.get(Metadata.AUTHOR)); + //ensure that "raw" header is correctly decoded + assertEquals( + "L'\u00C9quipe Microsoft Outlook Express <[email protected]>", + metadata.get(Metadata.MESSAGE_RAW_HEADER_PREFIX+"From")); + + // Stored as Thu, 5 Apr 2007 09:26:06 -0700 assertEquals( "2007-04-05T16:26:06Z", @@ -108,6 +115,14 @@ public class OutlookParserTest extends TikaTest { Matcher matcher = pattern.matcher(content); assertTrue(matcher.find()); assertFalse(matcher.find()); + + //test that last header is added + assertContains("29 Jan 2009 19:17:10.0163 (UTC) FILETIME=[2ED25E30:01C98246]", + Arrays.asList(metadata.getValues("Message:Raw-Header:X-OriginalArrivalTime"))); + //confirm next line is added correctly + assertContains("from athena.apache.org (HELO athena.apache.org) (140.211.11.136)\n" + + " by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 29 Jan 2009 11:17:08 -0800", + Arrays.asList(metadata.getValues("Message:Raw-Header:Received"))); } /**
