JAMES-2046 Implement robust SentDate parsing for Memory search Reuse ElasticSearch code.
Project: http://git-wip-us.apache.org/repos/asf/james-project/repo Commit: http://git-wip-us.apache.org/repos/asf/james-project/commit/21ce9e82 Tree: http://git-wip-us.apache.org/repos/asf/james-project/tree/21ce9e82 Diff: http://git-wip-us.apache.org/repos/asf/james-project/diff/21ce9e82 Branch: refs/heads/master Commit: 21ce9e82a5effac912841eede6fc54a09d7d0090 Parents: eaedcd2 Author: benwa <btell...@linagora.com> Authored: Thu Aug 24 13:40:16 2017 +0700 Committer: benwa <btell...@linagora.com> Committed: Fri Aug 25 15:31:42 2017 +0700 ---------------------------------------------------------------------- .../elasticsearch/json/HeaderCollection.java | 42 +---------- .../json/HeaderCollectionTest.java | 28 -------- .../search/comparator/CombinedComparator.java | 13 ++-- .../search/comparator/SentDateComparator.java | 73 +++++++++++++------- .../comparator/SentDateComparatorTest.java | 68 ++++++++++++++++++ 5 files changed, 124 insertions(+), 100 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/james-project/blob/21ce9e82/mailbox/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollection.java ---------------------------------------------------------------------- diff --git a/mailbox/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollection.java b/mailbox/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollection.java index 2b26a66..85782ad 100644 --- a/mailbox/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollection.java +++ b/mailbox/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollection.java @@ -24,23 +24,18 @@ import java.util.HashSet; import java.util.Locale; import java.util.Optional; import java.util.Set; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import java.util.stream.Collectors; import java.util.stream.Stream; import org.apache.james.mailbox.store.search.SearchUtil; +import org.apache.james.mailbox.store.search.comparator.SentDateComparator; import org.apache.james.mime4j.dom.address.Address; import org.apache.james.mime4j.dom.address.Group; import org.apache.james.mime4j.dom.address.Mailbox; import org.apache.james.mime4j.field.address.LenientAddressParser; import org.apache.james.mime4j.stream.Field; import org.apache.james.mime4j.util.MimeUtil; -import org.apache.james.util.date.ImapDateTimeFormatter; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; import com.google.common.collect.ArrayListMultimap; import com.google.common.collect.ImmutableMultimap; @@ -51,14 +46,6 @@ public class HeaderCollection { public static class Builder { - // Some sent e-mail have this form : Wed, 3 Jun 2015 09:05:46 +0000 (UTC) - // Java 8 Time library RFC_1123_DATE_TIME corresponds to Wed, 3 Jun 2015 09:05:46 +0000 only - // This REGEXP is here to match ( in order to remove ) the possible invalid end of a header date - // Example of matching patterns : - // (UTC) - // (CEST) - private static final Pattern DATE_SANITIZING_PATTERN = Pattern.compile(" *\\(.*\\) *"); - private final Set<EMailer> toAddressSet; private final Set<EMailer> fromAddressSet; private final Set<EMailer> ccAddressSet; @@ -114,7 +101,7 @@ public class HeaderCollection { subjectSet.add(headerValue); break; case DATE: - sentDate = toISODate(headerValue); + sentDate = SentDateComparator.toISODate(headerValue); break; } } @@ -152,29 +139,6 @@ public class HeaderCollection { } throw new RuntimeException(headerName + " is not a address header name"); } - - private Optional<ZonedDateTime> toISODate(String value) { - try { - return Optional.of(ZonedDateTime.parse( - sanitizeDateStringHeaderValue(value), - ImapDateTimeFormatter.rfc5322())); - } catch (Exception e) { - LOGGER.info("Can not parse receive date " + value); - return Optional.empty(); - } - } - - @VisibleForTesting String sanitizeDateStringHeaderValue(String value) { - // Some sent e-mail have this form : Wed, 3 Jun 2015 09:05:46 +0000 (UTC) - // Java 8 Time library RFC_1123_DATE_TIME corresponds to Wed, 3 Jun 2015 09:05:46 +0000 only - // This method is here to convert the first date into something parsable by RFC_1123_DATE_TIME DateTimeFormatter - Matcher sanitizerMatcher = DATE_SANITIZING_PATTERN.matcher(value); - if (sanitizerMatcher.find()) { - return value.substring(0 , sanitizerMatcher.start()); - } - return value; - } - } public static final String TO = "to"; @@ -185,8 +149,6 @@ public class HeaderCollection { public static final String SUBJECT = "subject"; public static final String DATE = "date"; - private static final Logger LOGGER = LoggerFactory.getLogger(HeaderCollection.class); - public static Builder builder() { return new Builder(); } http://git-wip-us.apache.org/repos/asf/james-project/blob/21ce9e82/mailbox/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollectionTest.java ---------------------------------------------------------------------- diff --git a/mailbox/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollectionTest.java b/mailbox/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollectionTest.java index b1f8fae..93a7b02 100644 --- a/mailbox/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollectionTest.java +++ b/mailbox/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollectionTest.java @@ -267,32 +267,4 @@ public class HeaderCollectionTest { HeaderCollection.builder().add(null).build(); } - @Test - public void sanitizeDateStringHeaderValueShouldRemoveCESTPart() { - assertThat(HeaderCollection.builder() - .sanitizeDateStringHeaderValue("Thu, 18 Jun 2015 04:09:35 +0200 (CEST)")) - .isEqualTo("Thu, 18 Jun 2015 04:09:35 +0200"); - } - - @Test - public void sanitizeDateStringHeaderValueShouldRemoveUTCPart() { - assertThat(HeaderCollection.builder() - .sanitizeDateStringHeaderValue("Thu, 18 Jun 2015 04:09:35 +0200 (UTC) ")) - .isEqualTo("Thu, 18 Jun 2015 04:09:35 +0200"); - } - - @Test - public void sanitizeDateStringHeaderValueShouldNotChangeAcceptableString() { - assertThat(HeaderCollection.builder() - .sanitizeDateStringHeaderValue("Thu, 18 Jun 2015 04:09:35 +0200")) - .isEqualTo("Thu, 18 Jun 2015 04:09:35 +0200"); - } - - @Test - public void sanitizeDateStringHeaderValueShouldNotChangeEmptyString() { - assertThat(HeaderCollection.builder() - .sanitizeDateStringHeaderValue("")) - .isEqualTo(""); - } - } http://git-wip-us.apache.org/repos/asf/james-project/blob/21ce9e82/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/comparator/CombinedComparator.java ---------------------------------------------------------------------- diff --git a/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/comparator/CombinedComparator.java b/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/comparator/CombinedComparator.java index 38bae1d..53d270f 100644 --- a/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/comparator/CombinedComparator.java +++ b/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/comparator/CombinedComparator.java @@ -95,14 +95,11 @@ public class CombinedComparator implements Comparator<MailboxMessage>{ @Override public int compare(MailboxMessage o1, MailboxMessage o2) { - int i = 0; - for (Comparator<MailboxMessage> comparator : comparators) { - i = comparator.compare(o1, o2); - if (i != 0) { - break; - } - } - return i; + return comparators.stream() + .map(comparator -> comparator.compare(o1, o2)) + .filter(result -> result != 0) + .findFirst() + .orElse(0); } } http://git-wip-us.apache.org/repos/asf/james-project/blob/21ce9e82/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/comparator/SentDateComparator.java ---------------------------------------------------------------------- diff --git a/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/comparator/SentDateComparator.java b/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/comparator/SentDateComparator.java index 8a1373e..3a897f7 100644 --- a/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/comparator/SentDateComparator.java +++ b/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/comparator/SentDateComparator.java @@ -18,44 +18,69 @@ ****************************************************************/ package org.apache.james.mailbox.store.search.comparator; -import java.io.StringReader; +import java.time.Instant; +import java.time.ZonedDateTime; import java.util.Comparator; -import java.util.Date; +import java.util.Optional; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.apache.james.mailbox.store.mail.model.MailboxMessage; -import org.apache.james.mime4j.dom.datetime.DateTime; -import org.apache.james.mime4j.field.datetime.parser.DateTimeParser; -import org.apache.james.mime4j.field.datetime.parser.ParseException; +import org.apache.james.util.date.ImapDateTimeFormatter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.annotations.VisibleForTesting; /** * {@link Comparator} which works like stated in RFC5256 2.2 Sent Date - * */ public class SentDateComparator extends AbstractHeaderComparator { + public final static Comparator<MailboxMessage> SENTDATE = new SentDateComparator(); + private static final Logger LOGGER = LoggerFactory.getLogger(SentDateComparator.class); + // Some sent e-mail have this form : Wed, 3 Jun 2015 09:05:46 +0000 (UTC) + // Java 8 Time library RFC_1123_DATE_TIME corresponds to Wed, 3 Jun 2015 09:05:46 +0000 only + // This REGEXP is here to match ( in order to remove ) the possible invalid end of a header date + // Example of matching patterns : + // (UTC) + // (CEST) + private static final Pattern DATE_SANITIZING_PATTERN = Pattern.compile(" *\\(.*\\) *"); + + public static Optional<ZonedDateTime> toISODate(String value) { + try { + return Optional.of(ZonedDateTime.parse( + sanitizeDateStringHeaderValue(value), + ImapDateTimeFormatter.rfc5322())); + } catch (Exception e) { + LOGGER.info("Can not parse receive date " + value); + return Optional.empty(); + } + } + + @VisibleForTesting + static String sanitizeDateStringHeaderValue(String value) { + // Some sent e-mail have this form : Wed, 3 Jun 2015 09:05:46 +0000 (UTC) + // Java 8 Time library RFC_1123_DATE_TIME corresponds to Wed, 3 Jun 2015 09:05:46 +0000 only + // This method is here to convert the first date into something parsable by RFC_1123_DATE_TIME DateTimeFormatter + Matcher sanitizerMatcher = DATE_SANITIZING_PATTERN.matcher(value); + if (sanitizerMatcher.find()) { + return value.substring(0 , sanitizerMatcher.start()); + } + return value; + } @Override public int compare(MailboxMessage o1, MailboxMessage o2) { - Date date1 = getSentDate(o1); - Date date2 = getSentDate(o2); - int i = date1.compareTo(date2); - - // sent date was the same so use the uid as tie-breaker - if (i == 0) { - return UidComparator.UID.compare(o1, o2); - } - return 0; + Instant date1 = getSentDate(o1); + Instant date2 = getSentDate(o2); + return date1.compareTo(date2); } - private Date getSentDate(MailboxMessage message) { + private Instant getSentDate(MailboxMessage message) { final String value = getHeaderValue("Date", message); - final StringReader reader = new StringReader(value); - try { - DateTime dateTime = new DateTimeParser(reader).parseAll(); - return dateTime.getDate(); - } catch (ParseException e) { - // if we can not parse the date header we should use the internaldate as fallback - return message.getInternalDate(); - } + return toISODate(value) + .map(ZonedDateTime::toInstant) + .orElse(message.getInternalDate().toInstant()); } } http://git-wip-us.apache.org/repos/asf/james-project/blob/21ce9e82/mailbox/store/src/test/java/org/apache/james/mailbox/store/search/comparator/SentDateComparatorTest.java ---------------------------------------------------------------------- diff --git a/mailbox/store/src/test/java/org/apache/james/mailbox/store/search/comparator/SentDateComparatorTest.java b/mailbox/store/src/test/java/org/apache/james/mailbox/store/search/comparator/SentDateComparatorTest.java new file mode 100644 index 0000000..ddd5a63 --- /dev/null +++ b/mailbox/store/src/test/java/org/apache/james/mailbox/store/search/comparator/SentDateComparatorTest.java @@ -0,0 +1,68 @@ +/**************************************************************** + * Licensed to the Apache Software Foundation (ASF) under one * + * or more contributor license agreements. See the NOTICE file * + * distributed with this work for additional information * + * regarding copyright ownership. The ASF licenses this file * + * to you under the Apache License, Version 2.0 (the * + * "License"); you may not use this file except in compliance * + * with the License. You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, * + * software distributed under the License is distributed on an * + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * + * KIND, either express or implied. See the License for the * + * specific language governing permissions and limitations * + * under the License. * + ****************************************************************/ + +package org.apache.james.mailbox.store.search.comparator; + +import static org.assertj.core.api.Assertions.assertThat; + +import org.junit.Test; + +public class SentDateComparatorTest { + @Test + public void sanitizeDateStringHeaderValueShouldRemoveCESTPart() { + assertThat( + SentDateComparator.sanitizeDateStringHeaderValue("Thu, 18 Jun 2015 04:09:35 +0200 (CEST)")) + .isEqualTo("Thu, 18 Jun 2015 04:09:35 +0200"); + } + + @Test + public void sanitizeDateStringHeaderValueShouldRemoveUTCPart() { + assertThat( + SentDateComparator.sanitizeDateStringHeaderValue("Thu, 18 Jun 2015 04:09:35 +0200 (UTC) ")) + .isEqualTo("Thu, 18 Jun 2015 04:09:35 +0200"); + } + + @Test + public void sanitizeDateStringHeaderValueShouldNotChangeAcceptableString() { + assertThat( + SentDateComparator.sanitizeDateStringHeaderValue("Thu, 18 Jun 2015 04:09:35 +0200")) + .isEqualTo("Thu, 18 Jun 2015 04:09:35 +0200"); + } + + @Test + public void sanitizeDateStringHeaderValueShouldRemoveBrackets() { + assertThat( + SentDateComparator.sanitizeDateStringHeaderValue("invalid (removeMe)")) + .isEqualTo("invalid"); + } + + @Test + public void sanitizeDateStringHeaderValueShouldKeepUnclosedBrackets() { + assertThat( + SentDateComparator.sanitizeDateStringHeaderValue("invalid (removeMe")) + .isEqualTo("invalid (removeMe"); + } + + @Test + public void sanitizeDateStringHeaderValueShouldNotChangeEmptyString() { + assertThat( + SentDateComparator.sanitizeDateStringHeaderValue("")) + .isEqualTo(""); + } +} --------------------------------------------------------------------- To unsubscribe, e-mail: server-dev-unsubscr...@james.apache.org For additional commands, e-mail: server-dev-h...@james.apache.org