JAMES-2046 Implement robust SentDate parsing for Memory search

Reuse ElasticSearch code.


Project: http://git-wip-us.apache.org/repos/asf/james-project/repo
Commit: http://git-wip-us.apache.org/repos/asf/james-project/commit/21ce9e82
Tree: http://git-wip-us.apache.org/repos/asf/james-project/tree/21ce9e82
Diff: http://git-wip-us.apache.org/repos/asf/james-project/diff/21ce9e82

Branch: refs/heads/master
Commit: 21ce9e82a5effac912841eede6fc54a09d7d0090
Parents: eaedcd2
Author: benwa <btell...@linagora.com>
Authored: Thu Aug 24 13:40:16 2017 +0700
Committer: benwa <btell...@linagora.com>
Committed: Fri Aug 25 15:31:42 2017 +0700

----------------------------------------------------------------------
 .../elasticsearch/json/HeaderCollection.java    | 42 +----------
 .../json/HeaderCollectionTest.java              | 28 --------
 .../search/comparator/CombinedComparator.java   | 13 ++--
 .../search/comparator/SentDateComparator.java   | 73 +++++++++++++-------
 .../comparator/SentDateComparatorTest.java      | 68 ++++++++++++++++++
 5 files changed, 124 insertions(+), 100 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/james-project/blob/21ce9e82/mailbox/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollection.java
----------------------------------------------------------------------
diff --git 
a/mailbox/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollection.java
 
b/mailbox/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollection.java
index 2b26a66..85782ad 100644
--- 
a/mailbox/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollection.java
+++ 
b/mailbox/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollection.java
@@ -24,23 +24,18 @@ import java.util.HashSet;
 import java.util.Locale;
 import java.util.Optional;
 import java.util.Set;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 
 import org.apache.james.mailbox.store.search.SearchUtil;
+import org.apache.james.mailbox.store.search.comparator.SentDateComparator;
 import org.apache.james.mime4j.dom.address.Address;
 import org.apache.james.mime4j.dom.address.Group;
 import org.apache.james.mime4j.dom.address.Mailbox;
 import org.apache.james.mime4j.field.address.LenientAddressParser;
 import org.apache.james.mime4j.stream.Field;
 import org.apache.james.mime4j.util.MimeUtil;
-import org.apache.james.util.date.ImapDateTimeFormatter;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 
-import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
 import com.google.common.collect.ArrayListMultimap;
 import com.google.common.collect.ImmutableMultimap;
@@ -51,14 +46,6 @@ public class HeaderCollection {
 
     public static class Builder {
 
-        // Some sent e-mail have this form : Wed,  3 Jun 2015 09:05:46 +0000 
(UTC)
-        // Java 8 Time library RFC_1123_DATE_TIME corresponds to Wed,  3 Jun 
2015 09:05:46 +0000 only
-        // This REGEXP is here to match ( in order to remove ) the possible 
invalid end of a header date
-        // Example of matching patterns :
-        //  (UTC)
-        //  (CEST)
-        private static final Pattern DATE_SANITIZING_PATTERN = 
Pattern.compile(" *\\(.*\\) *");
-
         private final Set<EMailer> toAddressSet;
         private final Set<EMailer> fromAddressSet;
         private final Set<EMailer> ccAddressSet;
@@ -114,7 +101,7 @@ public class HeaderCollection {
                     subjectSet.add(headerValue);
                     break;
                 case DATE:
-                    sentDate = toISODate(headerValue);
+                    sentDate = SentDateComparator.toISODate(headerValue);
                     break;
             }
         }
@@ -152,29 +139,6 @@ public class HeaderCollection {
             }
             throw new RuntimeException(headerName + " is not a address header 
name");
         }
-
-        private Optional<ZonedDateTime> toISODate(String value) {
-            try {
-                return Optional.of(ZonedDateTime.parse(
-                    sanitizeDateStringHeaderValue(value),
-                    ImapDateTimeFormatter.rfc5322()));
-            } catch (Exception e) {
-                LOGGER.info("Can not parse receive date " + value);
-                return Optional.empty();
-            }
-        }
-
-        @VisibleForTesting String sanitizeDateStringHeaderValue(String value) {
-            // Some sent e-mail have this form : Wed,  3 Jun 2015 09:05:46 
+0000 (UTC)
-            // Java 8 Time library RFC_1123_DATE_TIME corresponds to Wed,  3 
Jun 2015 09:05:46 +0000 only
-            // This method is here to convert the first date into something 
parsable by RFC_1123_DATE_TIME DateTimeFormatter
-            Matcher sanitizerMatcher = DATE_SANITIZING_PATTERN.matcher(value);
-            if (sanitizerMatcher.find()) {
-                return value.substring(0 , sanitizerMatcher.start());
-            }
-            return value;
-        }
-
     }
 
     public static final String TO = "to";
@@ -185,8 +149,6 @@ public class HeaderCollection {
     public static final String SUBJECT = "subject";
     public static final String DATE = "date";
 
-    private static final Logger LOGGER = 
LoggerFactory.getLogger(HeaderCollection.class);
-
     public static Builder builder() {
         return new Builder();
     }

http://git-wip-us.apache.org/repos/asf/james-project/blob/21ce9e82/mailbox/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollectionTest.java
----------------------------------------------------------------------
diff --git 
a/mailbox/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollectionTest.java
 
b/mailbox/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollectionTest.java
index b1f8fae..93a7b02 100644
--- 
a/mailbox/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollectionTest.java
+++ 
b/mailbox/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollectionTest.java
@@ -267,32 +267,4 @@ public class HeaderCollectionTest {
         HeaderCollection.builder().add(null).build();
     }
 
-    @Test
-    public void sanitizeDateStringHeaderValueShouldRemoveCESTPart() {
-        assertThat(HeaderCollection.builder()
-            .sanitizeDateStringHeaderValue("Thu, 18 Jun 2015 04:09:35 +0200 
(CEST)"))
-            .isEqualTo("Thu, 18 Jun 2015 04:09:35 +0200");
-    }
-
-    @Test
-    public void sanitizeDateStringHeaderValueShouldRemoveUTCPart() {
-        assertThat(HeaderCollection.builder()
-            .sanitizeDateStringHeaderValue("Thu, 18 Jun 2015 04:09:35 +0200  
(UTC)  "))
-            .isEqualTo("Thu, 18 Jun 2015 04:09:35 +0200");
-    }
-
-    @Test
-    public void sanitizeDateStringHeaderValueShouldNotChangeAcceptableString() 
{
-        assertThat(HeaderCollection.builder()
-            .sanitizeDateStringHeaderValue("Thu, 18 Jun 2015 04:09:35 +0200"))
-            .isEqualTo("Thu, 18 Jun 2015 04:09:35 +0200");
-    }
-
-    @Test
-    public void sanitizeDateStringHeaderValueShouldNotChangeEmptyString() {
-        assertThat(HeaderCollection.builder()
-            .sanitizeDateStringHeaderValue(""))
-            .isEqualTo("");
-    }
-
 }

http://git-wip-us.apache.org/repos/asf/james-project/blob/21ce9e82/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/comparator/CombinedComparator.java
----------------------------------------------------------------------
diff --git 
a/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/comparator/CombinedComparator.java
 
b/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/comparator/CombinedComparator.java
index 38bae1d..53d270f 100644
--- 
a/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/comparator/CombinedComparator.java
+++ 
b/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/comparator/CombinedComparator.java
@@ -95,14 +95,11 @@ public class CombinedComparator implements 
Comparator<MailboxMessage>{
 
     @Override
     public int compare(MailboxMessage o1, MailboxMessage o2) {
-        int i = 0;
-        for (Comparator<MailboxMessage> comparator : comparators) {
-            i = comparator.compare(o1, o2);
-            if (i != 0) {
-                break;
-            }
-        }
-        return i;
+        return comparators.stream()
+            .map(comparator -> comparator.compare(o1, o2))
+            .filter(result -> result != 0)
+            .findFirst()
+            .orElse(0);
     }
 
 }

http://git-wip-us.apache.org/repos/asf/james-project/blob/21ce9e82/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/comparator/SentDateComparator.java
----------------------------------------------------------------------
diff --git 
a/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/comparator/SentDateComparator.java
 
b/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/comparator/SentDateComparator.java
index 8a1373e..3a897f7 100644
--- 
a/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/comparator/SentDateComparator.java
+++ 
b/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/comparator/SentDateComparator.java
@@ -18,44 +18,69 @@
  ****************************************************************/
 package org.apache.james.mailbox.store.search.comparator;
 
-import java.io.StringReader;
+import java.time.Instant;
+import java.time.ZonedDateTime;
 import java.util.Comparator;
-import java.util.Date;
+import java.util.Optional;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import org.apache.james.mailbox.store.mail.model.MailboxMessage;
-import org.apache.james.mime4j.dom.datetime.DateTime;
-import org.apache.james.mime4j.field.datetime.parser.DateTimeParser;
-import org.apache.james.mime4j.field.datetime.parser.ParseException;
+import org.apache.james.util.date.ImapDateTimeFormatter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.annotations.VisibleForTesting;
 
 /**
  * {@link Comparator} which works like stated in RFC5256 2.2 Sent Date
- *
  */
 public class SentDateComparator extends AbstractHeaderComparator {
+
     public final static Comparator<MailboxMessage> SENTDATE = new 
SentDateComparator();
+    private static final Logger LOGGER = 
LoggerFactory.getLogger(SentDateComparator.class);
+    // Some sent e-mail have this form : Wed,  3 Jun 2015 09:05:46 +0000 (UTC)
+    // Java 8 Time library RFC_1123_DATE_TIME corresponds to Wed,  3 Jun 2015 
09:05:46 +0000 only
+    // This REGEXP is here to match ( in order to remove ) the possible 
invalid end of a header date
+    // Example of matching patterns :
+    //  (UTC)
+    //  (CEST)
+    private static final Pattern DATE_SANITIZING_PATTERN = Pattern.compile(" 
*\\(.*\\) *");
+
+    public static Optional<ZonedDateTime> toISODate(String value) {
+        try {
+            return Optional.of(ZonedDateTime.parse(
+                sanitizeDateStringHeaderValue(value),
+                ImapDateTimeFormatter.rfc5322()));
+        } catch (Exception e) {
+            LOGGER.info("Can not parse receive date " + value);
+            return Optional.empty();
+        }
+    }
+
+     @VisibleForTesting
+     static String sanitizeDateStringHeaderValue(String value) {
+        // Some sent e-mail have this form : Wed,  3 Jun 2015 09:05:46 +0000 
(UTC)
+        // Java 8 Time library RFC_1123_DATE_TIME corresponds to Wed,  3 Jun 
2015 09:05:46 +0000 only
+        // This method is here to convert the first date into something 
parsable by RFC_1123_DATE_TIME DateTimeFormatter
+        Matcher sanitizerMatcher = DATE_SANITIZING_PATTERN.matcher(value);
+        if (sanitizerMatcher.find()) {
+            return value.substring(0 , sanitizerMatcher.start());
+        }
+        return value;
+    }
 
     @Override
     public int compare(MailboxMessage o1, MailboxMessage o2) {
-        Date date1 = getSentDate(o1);
-        Date date2 = getSentDate(o2);
-        int i = date1.compareTo(date2);
-        
-        // sent date was the same so use the uid as tie-breaker
-        if (i == 0) {
-            return UidComparator.UID.compare(o1, o2);
-        }
-        return 0;
+        Instant date1 = getSentDate(o1);
+        Instant date2 = getSentDate(o2);
+        return date1.compareTo(date2);
     }
     
-    private Date getSentDate(MailboxMessage message) {
+    private Instant getSentDate(MailboxMessage message) {
         final String value = getHeaderValue("Date", message);
-        final StringReader reader = new StringReader(value);
-        try {
-            DateTime dateTime = new DateTimeParser(reader).parseAll();
-            return dateTime.getDate();
-        } catch (ParseException e) {
-            // if we can not parse the date header we should use the 
internaldate as fallback
-            return message.getInternalDate();
-        }
+        return toISODate(value)
+            .map(ZonedDateTime::toInstant)
+            .orElse(message.getInternalDate().toInstant());
     }
 }

http://git-wip-us.apache.org/repos/asf/james-project/blob/21ce9e82/mailbox/store/src/test/java/org/apache/james/mailbox/store/search/comparator/SentDateComparatorTest.java
----------------------------------------------------------------------
diff --git 
a/mailbox/store/src/test/java/org/apache/james/mailbox/store/search/comparator/SentDateComparatorTest.java
 
b/mailbox/store/src/test/java/org/apache/james/mailbox/store/search/comparator/SentDateComparatorTest.java
new file mode 100644
index 0000000..ddd5a63
--- /dev/null
+++ 
b/mailbox/store/src/test/java/org/apache/james/mailbox/store/search/comparator/SentDateComparatorTest.java
@@ -0,0 +1,68 @@
+/****************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one   *
+ * or more contributor license agreements.  See the NOTICE file *
+ * distributed with this work for additional information        *
+ * regarding copyright ownership.  The ASF licenses this file   *
+ * to you under the Apache License, Version 2.0 (the            *
+ * "License"); you may not use this file except in compliance   *
+ * with the License.  You may obtain a copy of the License at   *
+ *                                                              *
+ *   http://www.apache.org/licenses/LICENSE-2.0                 *
+ *                                                              *
+ * Unless required by applicable law or agreed to in writing,   *
+ * software distributed under the License is distributed on an  *
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
+ * KIND, either express or implied.  See the License for the    *
+ * specific language governing permissions and limitations      *
+ * under the License.                                           *
+ ****************************************************************/
+
+package org.apache.james.mailbox.store.search.comparator;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import org.junit.Test;
+
+public class SentDateComparatorTest {
+    @Test
+    public void sanitizeDateStringHeaderValueShouldRemoveCESTPart() {
+        assertThat(
+            SentDateComparator.sanitizeDateStringHeaderValue("Thu, 18 Jun 2015 
04:09:35 +0200 (CEST)"))
+            .isEqualTo("Thu, 18 Jun 2015 04:09:35 +0200");
+    }
+
+    @Test
+    public void sanitizeDateStringHeaderValueShouldRemoveUTCPart() {
+        assertThat(
+            SentDateComparator.sanitizeDateStringHeaderValue("Thu, 18 Jun 2015 
04:09:35 +0200  (UTC)  "))
+            .isEqualTo("Thu, 18 Jun 2015 04:09:35 +0200");
+    }
+
+    @Test
+    public void sanitizeDateStringHeaderValueShouldNotChangeAcceptableString() 
{
+        assertThat(
+            SentDateComparator.sanitizeDateStringHeaderValue("Thu, 18 Jun 2015 
04:09:35 +0200"))
+            .isEqualTo("Thu, 18 Jun 2015 04:09:35 +0200");
+    }
+
+    @Test
+    public void sanitizeDateStringHeaderValueShouldRemoveBrackets() {
+        assertThat(
+            SentDateComparator.sanitizeDateStringHeaderValue("invalid 
(removeMe)"))
+            .isEqualTo("invalid");
+    }
+
+    @Test
+    public void sanitizeDateStringHeaderValueShouldKeepUnclosedBrackets() {
+        assertThat(
+            SentDateComparator.sanitizeDateStringHeaderValue("invalid 
(removeMe"))
+            .isEqualTo("invalid (removeMe");
+    }
+
+    @Test
+    public void sanitizeDateStringHeaderValueShouldNotChangeEmptyString() {
+        assertThat(
+            SentDateComparator.sanitizeDateStringHeaderValue(""))
+            .isEqualTo("");
+    }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: server-dev-unsubscr...@james.apache.org
For additional commands, e-mail: server-dev-h...@james.apache.org

Reply via email to