Author: btellier
Date: Mon Jun 29 08:39:30 2015
New Revision: 1688139

URL: http://svn.apache.org/r1688139
Log:
MAILBOX-234 Dates extraction from headers

Modified:
    james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollection.java
    james/mailbox/trunk/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollectionTest.java
    james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail1.eml
    james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail2.eml
    james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail3.eml
    james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail4.eml
    james/mailbox/trunk/elasticsearch/src/test/resources/documents/spamMail.eml
    james/mailbox/trunk/elasticsearch/src/test/resources/documents/spamMail.json

Modified: 
james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollection.java
URL: 
http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollection.java?rev=1688139&r1=1688138&r2=1688139&view=diff
==============================================================================
--- 
james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollection.java
 (original)
+++ 
james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollection.java
 Mon Jun 29 08:39:30 2015
@@ -19,6 +19,7 @@
 
 package org.apache.james.mailbox.elasticsearch.json;
 
+import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
 import com.google.common.collect.ArrayListMultimap;
 import com.google.common.collect.ImmutableMultimap;
@@ -39,6 +40,8 @@ import java.time.format.DateTimeFormatte
 import java.util.HashSet;
 import java.util.Optional;
 import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 
@@ -46,6 +49,14 @@ public class HeaderCollection {
 
     public static class Builder {
 
+        // Some sent e-mail have this form : Wed,  3 Jun 2015 09:05:46 +0000 (UTC)
+        // Java 8 Time library RFC_1123_DATE_TIME corresponds to Wed,  3 Jun 2015 09:05:46 +0000 only
+        // This REGEXP is here to match ( in order to remove ) the possible invalid end of a header date
+        // Example of matching patterns :
+        //  (UTC)
+        //  (CEST)
+        private static final Pattern DATE_SANITIZING_PATTERN = Pattern.compile(" *\\(.*\\) *");
+
         private final Set<EMailer> toAddressSet;
         private final Set<EMailer> fromAddressSet;
         private final Set<EMailer> ccAddressSet;
@@ -135,13 +146,26 @@ public class HeaderCollection {
 
         private Optional<ZonedDateTime> toISODate(String value) {
             try {
-                return Optional.of(ZonedDateTime.parse(value, DateTimeFormatter.RFC_1123_DATE_TIME));
+                return Optional.of(ZonedDateTime.parse(
+                    sanitizeDateStringHeaderValue(value),
+                    DateTimeFormatter.RFC_1123_DATE_TIME));
             } catch (Exception e) {
                 LOGGER.info("Can not parse receive date " + value);
                 return Optional.empty();
             }
         }
 
+        @VisibleForTesting String sanitizeDateStringHeaderValue(String value) {
+            // Some sent e-mail have this form : Wed,  3 Jun 2015 09:05:46 +0000 (UTC)
+            // Java 8 Time library RFC_1123_DATE_TIME corresponds to Wed,  3 Jun 2015 09:05:46 +0000 only
+            // This method is here to convert the first date into something parsable by RFC_1123_DATE_TIME DateTimeFormatter
+            Matcher sanitizerMatcher = DATE_SANITIZING_PATTERN.matcher(value);
+            if (sanitizerMatcher.find()) {
+                return value.substring(0 , sanitizerMatcher.start());
+            }
+            return value;
+        }
+
     }
 
     public static final String TO = "to";

Modified: 
james/mailbox/trunk/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollectionTest.java
URL: 
http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollectionTest.java?rev=1688139&r1=1688138&r2=1688139&view=diff
==============================================================================
--- 
james/mailbox/trunk/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollectionTest.java
 (original)
+++ 
james/mailbox/trunk/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollectionTest.java
 Mon Jun 29 08:39:30 2015
@@ -130,6 +130,13 @@ public class HeaderCollectionTest {
     }
 
     @Test
+    public void nonStandardDatesShouldBeRetreived() {
+        HeaderCollection headerCollection = HeaderCollection.builder().add(new FieldImpl("Date", "Thu, 4 Jun 2015 06:08:41 +0200 (UTC)")).build();
+        assertThat(DATE_TIME_FORMATTER.format(headerCollection.getSentDate().get()))
+            .isEqualTo("2015/06/04 06:08:41");
+    }
+
+    @Test
     public void dateShouldBeAbsentOnInvalidHeader() {
         HeaderCollection headerCollection = HeaderCollection.builder().add(new FieldImpl("Date", "Not a date")).build();
         assertThat(headerCollection.getSentDate().isPresent())
@@ -148,4 +155,32 @@ public class HeaderCollectionTest {
         HeaderCollection.builder().add(null).build();
     }
 
+    @Test
+    public void sanitizeDateStringHeaderValueShouldRemoveCESTPart() {
+        assertThat(HeaderCollection.builder()
+            .sanitizeDateStringHeaderValue("Thu, 18 Jun 2015 04:09:35 +0200 (CEST)"))
+            .isEqualTo("Thu, 18 Jun 2015 04:09:35 +0200");
+    }
+
+    @Test
+    public void sanitizeDateStringHeaderValueShouldRemoveUTCPart() {
+        assertThat(HeaderCollection.builder()
+            .sanitizeDateStringHeaderValue("Thu, 18 Jun 2015 04:09:35 +0200  (UTC)  "))
+            .isEqualTo("Thu, 18 Jun 2015 04:09:35 +0200");
+    }
+
+    @Test
+    public void sanitizeDateStringHeaderValueShouldNotChangeAcceptableString() {
+        assertThat(HeaderCollection.builder()
+            .sanitizeDateStringHeaderValue("Thu, 18 Jun 2015 04:09:35 +0200"))
+            .isEqualTo("Thu, 18 Jun 2015 04:09:35 +0200");
+    }
+
+    @Test
+    public void sanitizeDateStringHeaderValueShouldNotChangeEmptyString() {
+        assertThat(HeaderCollection.builder()
+            .sanitizeDateStringHeaderValue(""))
+            .isEqualTo("");
+    }
+
 }

Modified: 
james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail1.eml
URL: 
http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail1.eml?rev=1688139&r1=1688138&r2=1688139&view=diff
==============================================================================
--- james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail1.eml 
(original)
+++ james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail1.eml 
Mon Jun 29 08:39:30 2015
@@ -32,7 +32,7 @@ Delivered-To: mailing list server-dev@ja
 Received: (qmail 37236 invoked by uid 99); 4 Jun 2015 09:23:38 -0000
 Received: from arcas.apache.org (HELO arcas.apache.org) (140.211.11.28)
     by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 04 Jun 2015 09:23:38 +0000
-Date: Thu, 4 Jun 2015 09:23:37 +0000
+Date: Thu, 4 Jun 2015 09:23:37 +0000 (UTC)
 From: "Tellier Benoit (JIRA)" <j...@apache.org>
 To: server-dev@james.apache.org
 Message-ID: <jira.12835341.1433409792000.9340.1433409817...@atlassian.jira>

Modified: 
james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail2.eml
URL: 
http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail2.eml?rev=1688139&r1=1688138&r2=1688139&view=diff
==============================================================================
--- james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail2.eml 
(original)
+++ james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail2.eml 
Mon Jun 29 08:39:30 2015
@@ -32,7 +32,7 @@ Delivered-To: mailing list server-dev@ja
 Received: (qmail 43130 invoked by uid 99); 4 Jun 2015 09:27:38 -0000
 Received: from arcas.apache.org (HELO arcas.apache.org) (140.211.11.28)
     by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 04 Jun 2015 09:27:38 +0000
-Date: Thu, 4 Jun 2015 09:27:37 +0000
+Date: Thu, 4 Jun 2015 09:27:37 +0000 (UTC)
 From: "Tellier Benoit (JIRA)" <j...@apache.org>
 To: server-dev@james.apache.org
 Message-ID: <jira.12781874.1426269127000.9353.1433410057...@atlassian.jira>

Modified: 
james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail3.eml
URL: 
http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail3.eml?rev=1688139&r1=1688138&r2=1688139&view=diff
==============================================================================
--- james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail3.eml 
(original)
+++ james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail3.eml 
Mon Jun 29 08:39:30 2015
@@ -34,7 +34,7 @@ Delivered-To: mailing list server-dev@ja
 Received: (qmail 1132 invoked by uid 99); 2 Jun 2015 08:16:20 -0000
 Received: from arcas.apache.org (HELO arcas.apache.org) (140.211.11.28)
     by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 02 Jun 2015 08:16:20 +0000
-Date: Tue, 2 Jun 2015 08:16:19 +0000
+Date: Tue, 2 Jun 2015 08:16:19 +0000 (UTC)
 From: "Eric Charles (JIRA)" <j...@apache.org>
 To: server-dev@james.apache.org
 Message-ID: <jira.12473940.1284322083000.91735.1433232979...@atlassian.jira>

Modified: 
james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail4.eml
URL: 
http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail4.eml?rev=1688139&r1=1688138&r2=1688139&view=diff
==============================================================================
--- james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail4.eml 
(original)
+++ james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail4.eml 
Mon Jun 29 08:39:30 2015
@@ -34,7 +34,7 @@ Delivered-To: mailing list mailet-api@ja
 Received: (qmail 81730 invoked by uid 99); 15 May 2015 06:36:00 -0000
 Received: from arcas.apache.org (HELO arcas.apache.org) (140.211.11.28)
     by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 15 May 2015 06:36:00 +0000
-Date: Fri, 15 May 2015 06:35:59 +0000
+Date: Fri, 15 May 2015 06:35:59 +0000 (UTC)
 From: "Eric Charles (JIRA)" <mailet-...@james.apache.org>
 To: mailet-...@james.apache.org
 Message-ID: <jira.12825882.1430301328000.124152.1431671759...@atlassian.jira>

Modified: 
james/mailbox/trunk/elasticsearch/src/test/resources/documents/spamMail.eml
URL: 
http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/test/resources/documents/spamMail.eml?rev=1688139&r1=1688138&r2=1688139&view=diff
==============================================================================
--- james/mailbox/trunk/elasticsearch/src/test/resources/documents/spamMail.eml 
(original)
+++ james/mailbox/trunk/elasticsearch/src/test/resources/documents/spamMail.eml 
Mon Jun 29 08:39:30 2015
@@ -31,7 +31,7 @@ Content-Type: multipart/mixed; boundary=
 Content-Transfer-Encoding: 7bit
 MIME-Version: 1.0
 From: "Content-filter at spam.minet.net" <postmas...@minet.net>
-Date: Wed, 3 Jun 2015 09:05:46 +0000
+Date: Wed, 3 Jun 2015 09:05:46 +0000 (UTC)
 To: <r...@listes.minet.net>
 Message-ID: <vass-izaxqm...@spam.minet.net>
 Subject: [root] UNCHECKED contents in mail FROM <quenti...@riseup.net>

Modified: 
james/mailbox/trunk/elasticsearch/src/test/resources/documents/spamMail.json
URL: 
http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/test/resources/documents/spamMail.json?rev=1688139&r1=1688138&r2=1688139&view=diff
==============================================================================
--- 
james/mailbox/trunk/elasticsearch/src/test/resources/documents/spamMail.json 
(original)
+++ 
james/mailbox/trunk/elasticsearch/src/test/resources/documents/spamMail.json 
Mon Jun 29 08:39:30 2015
@@ -12,7 +12,7 @@
          "1.0"
        ],
        "date": [
-         "Wed, 3 Jun 2015 09:05:46 +0000"
+         "Wed, 3 Jun 2015 09:05:46 +0000 (UTC)"
        ],
        "x-beenthere": [
          "r...@listes.minet.net"
@@ -103,7 +103,7 @@
   "subject": [
        "[root] UNCHECKED contents in mail FROM <quenti...@riseup.net>"
   ],
-  "sentDate": "2015-06-07T00:00:00+0200",
+  "sentDate": "2015-06-03T09:05:46+0000",
   "properties": [
        {
          "namespace": "http://james.apache.org/rfc2045/Content-Type",



---------------------------------------------------------------------
To unsubscribe, e-mail: server-dev-unsubscr...@james.apache.org
For additional commands, e-mail: server-dev-h...@james.apache.org

Reply via email to