On Sat, 5 Sep 2020 at 08:54, <[email protected]> wrote:
>
> This is an automated email from the ASF dual-hosted git repository.
>
> humbedooh pushed a commit to branch master
> in repository https://gitbox.apache.org/repos/asf/incubator-ponymail-foal.git
>
>
> The following commit(s) were added to refs/heads/master by this push:
> new fafc765 Refactor, drop the double decode attempt.
> fafc765 is described below
>
> commit fafc7651d9d02dfde727bd1f0da13722de8b3c38
> Author: Daniel Gruno <[email protected]>
> AuthorDate: Sat Sep 5 09:54:03 2020 +0200
>
> Refactor, drop the double decode attempt.
>
> We should assume US-ASCII, but if it's not, it's quicker,
> processing-wise, to immediately fall back to utf-8 instead of trying to
> first determine if it is indeed UTF-8-worthy. Either it'll work as
> US-ASCII, or it will work with the UTF-8 with 'replace'.
This rationale belongs in the code as a comment, not only in the commit message.
> ---
> tools/archiver.py | 13 ++++++++-----
> 1 file changed, 8 insertions(+), 5 deletions(-)
>
> diff --git a/tools/archiver.py b/tools/archiver.py
> index f875cc3..c52207b 100755
> --- a/tools/archiver.py
> +++ b/tools/archiver.py
> @@ -192,12 +192,15 @@ class Body:
> break
> except UnicodeDecodeError:
> pass
> + # If no character set was defined, the email MUST be US-ASCII by RFC822 defaults
> + # This isn't always the case, as we're about to discover.
> if not self.string:
> - self.string = self.bytes.decode("us-ascii", errors="replace")
> - if valid_encodings:
> - self.character_set = "us-ascii"
> - # If no character encoding, but we find non-ASCII chars, assume bytes were UTF-8
> - elif len(self.bytes) != len(self.bytes.decode("us-ascii", "ignore")):
> + try:
> + self.string = self.bytes.decode("us-ascii", errors="strict")
> + if valid_encodings:
> + self.character_set = "us-ascii"
> + except UnicodeDecodeError:
> + # If us-ascii strict fails, it's probably undeclared UTF-8.
> # Set the .string, but not a character set, as we don't know it for sure.
> # This is mainly so the older generators won't barf.
> self.string = self.bytes.decode("utf-8", "replace")
>