On Sat, 5 Sep 2020 at 00:44, <[email protected]> wrote:
>
> This is an automated email from the ASF dual-hosted git repository.
>
> humbedooh pushed a commit to branch master
> in repository https://gitbox.apache.org/repos/asf/incubator-ponymail-foal.git
>
> commit f314d5250999e2afb2ab5063d35afe7d1c1114fa
> Author: Daniel Gruno <[email protected]>
> AuthorDate: Sat Sep 5 01:41:57 2020 +0200
>
> re-align with old pony for cluster generator and unit tests
>
> The general idea here is, if we find an email without a charset at all,
> and we detect non-ascii characters in it, we assume it must be UTF-8 and
> grab the raw bytes. We also convert it internally to a string for the
> Body class, but we don't set the Body class' character set to anything.
> This way, we keep the cluster generator happy by passing it bytes, while
> keeping the rest happy by having a string representation that can be
> unflowed. As DKIM does not use the msgbody itself, it won't be affected
> by this change.
This explanation belongs in the code (as a comment), not only in the commit message.
> ---
> tools/archiver.py | 53 ++++++++++++++++++++++++++++++---------------
> tools/plugins/generators.py | 7 +++---
> 2 files changed, 40 insertions(+), 20 deletions(-)
>
> diff --git a/tools/archiver.py b/tools/archiver.py
> index cfa3c3a..82ad32c 100755
> --- a/tools/archiver.py
> +++ b/tools/archiver.py
> @@ -143,9 +143,7 @@ def normalize_lid(lid: str) -> str: # N.B. Also used by
> import-mbox.py
> # Belt-and-braces: remove possible extraneous chars
> lid = "<%s>" % lid.strip(" <>").replace("@", ".")
> # Replace invalid characters with underscores so as to not invalidate
> doc IDs.
> - lid = re.sub(
> - r"[^-+~_<>.a-zA-Z0-9@]", "_", lid
> - )
> + lid = re.sub(r"[^-+~_<>.a-zA-Z0-9@]", "_", lid)
> # Finally, ensure we have a loosely valid list ID value
> if not re.match(r"^<.+\..+>$", lid):
> print("Invalid list-id %s" % lid)
> @@ -172,24 +170,39 @@ def message_attachments(msg: email.message.Message) ->
> typing.Tuple[list, dict]:
> class Body:
> def __init__(self, part: email.message.Message):
> self.content_type = part.get_content_type()
> - self.charsets = set([part.get_content_charset()]) # Part's charset
> - self.charsets.update(
> - [part.get_charsets()[0]]
> - ) # Parent charset as fallback if any/different
> - self.character_set = "us-ascii"
> + self.charsets = [part.get_content_charset()] # Part's charset
> + parent_charset = part.get_charsets()[0]
> + if parent_charset and parent_charset != self.charsets[0]:
> + self.charsets.append(
> + parent_charset
> + ) # Parent charset as fallback if any/different
> + self.character_set = None
> + self.has_charset = False
> self.string: typing.Optional[str] = None
> self.flowed = "format=flowed" in part.get("content-type", "")
> - contents = part.get_payload(decode=True)
> - if contents is not None:
> - for cs in self.charsets:
> - if cs:
> + self.bytes = part.get_payload(decode=True)
> + if self.bytes is not None:
> + valid_encodings = [x for x in self.charsets if x]
> + if valid_encodings:
> + for cs in valid_encodings:
> try:
> - self.string = contents.decode(cs)
> + self.string = self.bytes.decode(cs)
> self.character_set = str(cs)
> + self.has_charset = True
> + break
> except UnicodeDecodeError:
> pass
> if not self.string:
> - self.string = contents.decode("us-ascii", errors="replace")
> + self.string = self.bytes.decode("us-ascii", errors="replace")
> + if valid_encodings:
> + self.character_set = "us-ascii"
> + # If no character encoding, but we find non-ASCII chars,
> assume bytes were UTF-8
> + elif len(self.bytes) != len(self.bytes.decode("us-ascii",
> "ignore")):
> + part.set_charset("utf-8")
> + self.bytes = part.get_payload(decode=True)
> + # Set the .string, but not a character set, as we don't
> know it for sure.
> + # This is mainly so the older generators won't barf.
> + self.string = self.bytes.decode("utf-8", "replace")
>
> def __repr__(self):
> return self.string
> @@ -200,8 +213,8 @@ class Body:
> def assign(self, new_string):
> self.string = new_string
>
> - def encode(self, charset="utf-8", errors="strict"):
> - return self.string.encode(charset, errors=errors)
> + def encode(self, encoding="utf-8", errors="strict"):
> + return self.string.encode(encoding=encoding, errors=errors)
>
> def unflow(self, convert_lf=False):
> """Unflows text of type format=flowed.
> @@ -405,7 +418,12 @@ class Archiver(object): # N.B. Also used by
> import-mbox.py
> if generator:
> try:
> mid = plugins.generators.generate(
> - generator, msg, body, lid, attachments, raw_msg
> + generator,
> + msg,
> + body if body.character_set else body.bytes,
> + lid,
> + attachments,
> + raw_msg,
> )
> except Exception as err:
> if logger:
> @@ -431,6 +449,7 @@ class Archiver(object): # N.B. Also used by
> import-mbox.py
> irt = ""
> all_mids = list(id_set) # Convert to list
> document_id = all_mids[0]
> +
> output_json = {
> "from_raw": msg_metadata["from"],
> "from": msg_metadata["from"],
> diff --git a/tools/plugins/generators.py b/tools/plugins/generators.py
> index 122633d..79ae9c9 100644
> --- a/tools/plugins/generators.py
> +++ b/tools/plugins/generators.py
> @@ -234,6 +234,8 @@ def medium(msg, body, lid, _attachments, _raw_msg):
> # as the archived-at may change from node to node (and will change if not in
> the raw mbox file)
> # Also the lid is not included in the hash, so the hash does not change if
> the lid is overridden
> #
> +
> +
> def cluster(msg, body, lid, attachments, _raw_msg):
> """
> Use data that is guaranteed to be the same across cluster setups
> @@ -268,16 +270,15 @@ def cluster(msg, body, lid, attachments, _raw_msg):
> # Use text body
> if not body: # Make sure body is not None, which will fail.
> body = ""
> - xbody = body.encode('utf-8', 'ignore')
> + xbody = body if type(body) is bytes else body.encode('utf-8',
> errors='ignore')
>
> # Crop out any trailing whitespace in body
> xbody = re.sub(b"\s+$", b"", xbody)
>
> # Use Message-Id (or '' if missing)
> - xbody += bytes(msg.get('Message-Id', ''), encoding='ascii')
> + xbody += bytes(msg.get('message-id', ''), encoding='ascii')
>
> # Use Date header. Don't use archived-at, as the archiver sets this if
> not present.
> - mdate = None
> mdatestring = "(null)" # Default to null, ONLY changed if replicable
> across imports
> try:
> mdate = email.utils.parsedate_tz(msg.get('date'))
>