On Sat, 15 Aug 2020 at 14:14, Daniel Gruno <humbed...@apache.org> wrote:
>
> On 15/08/2020 14.50, sebb wrote:
> > There was another incompatible change:
> >
> > The Archiver __init__ method keyword parameter parse_html was renamed
> > to parseHtml.
> > This breaks compatibility with previous versions, making testing of
> > older versions much harder.
> > It's probably OK to add new keyword parameters, but please don't make
> > arbitrary changes to existing parameters.
>
> I think it's the other way around - parseHTML was renamed to parse_html.

Oops, yes it was the other way round.
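
FWIW, a rename does not have to be a hard break: the old spelling can
be accepted for a release or two and mapped to the new one. A minimal
sketch of such a shim (hypothetical; not what the commit does):

    import warnings

    class Archiver(object):
        def __init__(self, generator='full', parse_html=False,
                     dump_dir=None, **kwargs):
            # Accept the pre-rename spelling so older callers keep working
            if 'parseHTML' in kwargs:
                warnings.warn("parseHTML is deprecated; use parse_html",
                              DeprecationWarning)
                parse_html = kwargs.pop('parseHTML')
            self.html = parse_html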

> It wasn't completely arbitrary; it was done to conform with PEP8
> guidelines and to generally harmonize the way we type arguments in PM.
> I can get the test suite to work around this, not a problem.
> I'd much rather have a version-aware unit tester than having to stick to

OK, but it will need ongoing maintenance.
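
Something along these lines would do as the version shim (a sketch; it
assumes the tests import Archiver from tools/archiver.py and that the
only difference between versions is the keyword spelling):

    import inspect
    from archiver import Archiver

    def make_archiver(html=False):
        """Instantiate Archiver under either parameter spelling."""
        params = inspect.signature(Archiver.__init__).parameters
        if 'parse_html' in params:   # current master
            return Archiver(parse_html=html)
        return Archiver(parseHTML=html)  # pre-rename versions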

> the same parameters for all eternity, as that may hinder future
> development. One example being the DKIM PR where the raw message body
> now needs to get passed through (which arguably is much better than
> relying on .as_bytes()).
>
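On the raw body point: parsing and re-serialising a message is not
byte-identical, so a DKIM body hash computed over .as_bytes() need not
match what actually arrived. A quick stdlib illustration (standalone
example, not Ponymail code):

    import email

    raw = b"Subject: x\r\n\r\nbody\r\n"   # wire format uses CRLF
    msg = email.message_from_bytes(raw)
    print(msg.as_bytes() == raw)  # False here: the generator rewrites
                                  # the header line endings
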
> >
> > On Fri, 14 Aug 2020 at 18:47, Daniel Gruno <humbed...@apache.org> wrote:
> >>
> >> Apologies, should be in working condition again now.
> >>
> >> On 14/08/2020 16.55, sebb wrote:
> >>> This commit broke the functioning of the --generator option for the 
> >>> archiver.
> >>>
> >>> Please fix.
> >>>
> >>> On Mon, 10 Aug 2020 at 17:13, <humbed...@apache.org> wrote:
> >>>>
> >>>> This is an automated email from the ASF dual-hosted git repository.
> >>>>
> >>>> humbedooh pushed a commit to branch master
> >>>> in repository https://gitbox.apache.org/repos/asf/incubator-ponymail.git
> >>>>
> >>>>
> >>>> The following commit(s) were added to refs/heads/master by this push:
> >>>>        new 95beb51  Pull main operation into main(), linting/PEP conformity fixes
> >>>> 95beb51 is described below
> >>>>
> >>>> commit 95beb51158b58bcb9fdb1371af7699b72598ac34
> >>>> Author: Daniel Gruno <humbed...@apache.org>
> >>>> AuthorDate: Mon Aug 10 18:13:05 2020 +0200
> >>>>
> >>>>       Pull main operation into main(), linting/PEP conformity fixes
> >>>>
> >>>>       Also prepping for adding ES 7.x support.
> >>>>       import-mbox will need to be updated shortly
> >>>> ---
> >>>>    tools/archiver.py | 123 +++++++++++++++++++++++++-----------------------------
> >>>>    1 file changed, 58 insertions(+), 65 deletions(-)
> >>>>
> >>>> diff --git a/tools/archiver.py b/tools/archiver.py
> >>>> index b97db76..2c50f19 100755
> >>>> --- a/tools/archiver.py
> >>>> +++ b/tools/archiver.py
> >>>> @@ -62,23 +62,19 @@ import uuid
> >>>>    import json
> >>>>    import certifi
> >>>>    import urllib.parse
> >>>> +import argparse
> >>>> +import netaddr
> >>>>
> >>>>    # Fetch config
> >>>>    config = PonymailConfig()
> >>>>    auth = None
> >>>> -parseHTML = False
> >>>> -iBody = None  # N.B. Also used by import-mbox.py
> >>>> -args=None
> >>>> -dumpDir = None
> >>>> -aURL = None
> >>>>
> >>>>    if config.has_section('mailman') and config.has_option('mailman', 'plugin'):
> >>>>        from zope.interface import implementer
> >>>>        from mailman.interfaces.archiver import IArchiver
> >>>>        from mailman.interfaces.archiver import ArchivePolicy
> >>>>        logger = logging.getLogger("mailman.archiver")
> >>>> -elif __name__ == '__main__':
> >>>> -    import argparse
> >>>> +
> >>>>
> >>>>    if config.has_option('archiver', 'baseurl'):
> >>>>        aURL = config.get('archiver', 'baseurl')
> >>>> @@ -102,8 +98,7 @@ def parse_attachment(part):
> >>>>            if cdtype == "attachment" or cdtype == 'inline':
> >>>>                fd = part.get_payload(decode=True)
> >>>>                # Allow for empty string
> >>>> -            if fd is None:
> >>>> -                return None, None
> >>>> +            if fd is None: return None, None
> >>>>                filename = part.get_filename()
> >>>>                if filename:
> >>>>                    print("Found attachment: %s" % filename)
> >>>> @@ -160,7 +155,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>>>
> >>>>        """ Intercept index calls and fix up consistency argument """
> >>>>        def index(self, **kwargs):
> >>>> -        if ES_MAJOR in [5,6]:
> >>>> +        if ES_MAJOR in [5,6,7]:
> >>>>            if kwargs.pop('consistency', None): # drop the key if present
> >>>>                if self.wait_for_active_shards: # replace with wait if defined
> >>>>                    kwargs['wait_for_active_shards'] = self.wait_for_active_shards
> >>>> @@ -168,10 +163,11 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>>>                **kwargs
> >>>>            )
> >>>>
> >>>> -    def __init__(self, parseHTML=False):
> >>>> +    def __init__(self, generator='full', parse_html=False, dump_dir=None):
> >>>>            """ Just initialize ES. """
> >>>> -        self.html = parseHTML
> >>>> -        if parseHTML:
> >>>> +        self.html = parse_html
> >>>> +        self.generator = generator
> >>>> +        if parse_html:
> >>>>                import html2text
> >>>>                self.html2text = html2text.html2text
> >>>>            self.dbname = config.get("elasticsearch", "dbname")
> >>>> @@ -180,7 +176,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>>>            self.consistency = config.get('elasticsearch', 'write', fallback='quorum')
> >>>>            if ES_MAJOR == 2:
> >>>>                pass
> >>>> -        elif ES_MAJOR in [5,6]:
> >>>> +        elif ES_MAJOR in [5,6,7]:
> >>>>            self.wait_for_active_shards = config.get('elasticsearch', 'wait', fallback=1)
> >>>>        else:
> >>>>            raise Exception("Unexpected elasticsearch version ", ES_VERSION)
> >>>> @@ -209,14 +205,14 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>>>                }
> >>>>                )
> >>>>            # If we have a dump dir, we can risk failing the connection.
> >>>> -        if dumpDir:
> >>>> +        if dump_dir:
> >>>>                try:
> >>>>                    self.es = Elasticsearch(dbs,
> >>>>                        max_retries=5,
> >>>>                        retry_on_timeout=True
> >>>>                        )
> >>>>                except:
> >>>> -                print("ES connection failed, but dumponfail specified, dumping to %s" % dumpDir)
> >>>> +                print("ES connection failed, but dumponfail specified, dumping to %s" % dump_dir)
> >>>>            else:
> >>>>                self.es = Elasticsearch(dbs,
> >>>>                    max_retries=5,
> >>>> @@ -233,13 +229,12 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>>>                    contents[part_meta['hash']] = part_file
> >>>>            return attachments, contents
> >>>>
> >>>> -
> >>>> -    def msgbody(self, msg):
> >>>> +    def msgbody(self, msg, verbose=False, ignore_body=None):
> >>>>            body = None
> >>>>            firstHTML = None
> >>>>            for part in msg.walk():
> >>>>                # can be called from importer
> >>>> -            if args and args.verbose:
> >>>> +            if verbose:
> >>>>                    print("Content-Type: %s" % part.get_content_type())
> >>>>                """
> >>>>                    Find the first body part and the first HTML part
> >>>> @@ -251,12 +246,12 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>>>                if not body and part.get_content_type() == 'text/enriched':
> >>>>                    body = part.get_payload(decode=True)
> >>>>                elif self.html and not firstHTML and part.get_content_type() == 'text/html':
> >>>> -                    firstHTML = part.get_payload(decode=True)
> >>>> +                    first_html = part.get_payload(decode=True)
> >>>>                except Exception as err:
> >>>>                    print(err)
> >>>>
> >>>>        # this requires a GPL lib, user will have to install it themselves
> >>>> -        if firstHTML and (not body or len(body) <= 1 or (iBody and str(body).find(str(iBody)) != -1)):
> >>>> +        if firstHTML and (not body or len(body) <= 1 or (ignore_body and str(body).find(str(ignore_body)) != -1)):
> >>>>            body = self.html2text(firstHTML.decode("utf-8", 'ignore') if type(firstHTML) is bytes else firstHTML)
> >>>>
> >>>>            # See issue#463
> >>>> @@ -273,7 +268,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>>>            return body
> >>>>
> >>>>        # N.B. this is also called by import-mbox.py
> >>>> -    def compute_updates(self, lid, private, msg):
> >>>> +    def compute_updates(self, args, lid, private, msg):
> >>>>            """Determine what needs to be sent to the archiver.
> >>>>
> >>>>            :param lid: The list id
> >>>> @@ -324,7 +319,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>>>            # mdate calculations are all done, prepare the index entry
> >>>>            epoch = email.utils.mktime_tz(mdate)
> >>>>        mdatestring = time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(epoch))
> >>>> -        body = self.msgbody(msg)
> >>>> +        body = self.msgbody(msg, verbose=args.verbose, ignore_body=args.ibody)
> >>>>            try:
> >>>>            if 'content-type' in msg_metadata and msg_metadata['content-type'].find("flowed") != -1:
> >>>>                    body = convertToWrapped(body, character_set="utf-8")
> >>>> @@ -352,8 +347,9 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>>>                except Exception as err:
> >>>>                    if logger:
> >>>>                    # N.B. use .get just in case there is no message-id
> >>>> -                    logger.warning("Could not generate MID: %s. MSGID: %s", err, msg_metadata.get('message-id','?'))
> >>>> +                    logger.warning("Could not generate MID: %s. MSGID: %s", err, msg_metadata.get('message-id', '?'))
> >>>>                    mid = pmid
> >>>> +
> >>>>                if 'in-reply-to' in msg_metadata:
> >>>>                    try:
> >>>>                        try:
> >>>> @@ -383,7 +379,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>>>
> >>>>            return  ojson, contents, msg_metadata, irt
> >>>>
> >>>> -    def archive_message(self, mlist, msg, raw_msg):
> >>>> +    def archive_message(self, args, mlist, msg, raw_message):
> >>>>            """Send the message to the archiver.
> >>>>
> >>>>            :param mlist: The IMailingList object.
> >>>> @@ -402,7 +398,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>>>        elif hasattr(mlist, 'archive_policy') and mlist.archive_policy is not ArchivePolicy.public:
> >>>>                private = True
> >>>>
> >>>> -        ojson, contents, msg_metadata, irt = self.compute_updates(lid, private, msg)
> >>>> +        ojson, contents, msg_metadata, irt = self.compute_updates(args, lid, private, msg)
> >>>>            if not ojson:
> >>>>            _id = msg.get('message-id') or msg.get('Subject') or msg.get("Date")
> >>>>            raise Exception("Could not parse message %s for %s" % (_id,lid))
> >>>> @@ -438,23 +434,23 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>>>                    consistency = self.consistency,
> >>>>                    body = {
> >>>>                        "message-id": msg_metadata['message-id'],
> >>>> -                    "source": self.mbox_source(raw_msg)
> >>>> +                    "source": self.mbox_source(raw_message)
> >>>>                    }
> >>>>                )
> >>>>        # If we have a dump dir and ES failed, push to dump dir instead as a JSON object
> >>>>            # We'll leave it to another process to pick up the slack.
> >>>>            except Exception as err:
> >>>> -            if dumpDir:
> >>>> +            if args.dump:
> >>>>                print("Pushing to ES failed, but dumponfail specified, dumping JSON docs")
> >>>>                    uid = uuid.uuid4()
> >>>> -                mboxPath = os.path.join(dumpDir, "%s.json" % uid)
> >>>> +                mboxPath = os.path.join(args.dump, "%s.json" % uid)
> >>>>                    with open(mboxPath, "w") as f:
> >>>>                        json.dump({
> >>>>                            'id': ojson['mid'],
> >>>>                            'mbox': ojson,
> >>>>                            'mbox_source': {
> >>>>                                "message-id": msg_metadata['message-id'],
> >>>> -                            "source": self.mbox_source(raw_msg)
> >>>> +                            "source": self.mbox_source(raw_message)
> >>>>                            },
> >>>>                            'attachments': contents
> >>>>                        },f , indent = 2)
> >>>> @@ -488,8 +484,8 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>>>                if dm:
> >>>>                    cid = dm.group(1)
> >>>>                    mid = dm.group(2)
> >>>> -                if self.es.exists(index = self.dbname, doc_type = 'account', id = cid):
> >>>> -                    doc = self.es.get(index = self.dbname, doc_type = 'account', id = cid)
> >>>> +                if self.es.exists(index=self.dbname, doc_type='account', id=cid):
> >>>> +                    doc = self.es.get(index=self.dbname, doc_type='account', id=cid)
> >>>>                        if doc:
> >>>>                            oldrefs.append(cid)
> >>>>                        # N.B. no index is supplied, so ES will generate one
> >>>> @@ -571,7 +567,8 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>>>            """
> >>>>            return None
> >>>>
> >>>> -if __name__ == '__main__':
> >>>> +
> >>>> +def main():
> >>>>        parser = argparse.ArgumentParser(description='Command line options.')
> >>>>        parser.add_argument('--lid', dest='lid', type=str, nargs=1,
> >>>>                           help='Alternate specific list ID')
> >>>> @@ -593,16 +590,15 @@ if __name__ == '__main__':
> >>>>                           help='Try to convert HTML to text if no text/plain message is found')
> >>>>        parser.add_argument('--dry', dest='dry', action='store_true',
> >>>>                           help='Do not save emails to elasticsearch, only test parsing')
> >>>> +    parser.add_argument('--ignorebody', dest='ibody', type=str, nargs=1,
> >>>> +                        help='Optional email bodies to treat as empty (in conjunction with --html2text)')
> >>>>        parser.add_argument('--dumponfail', dest='dump',
> >>>> -                       help='If pushing to ElasticSearch fails, dump documents in JSON format to this directory and fail silently.')
> >>>> +                       help='If pushing to ElasticSearch fails, dump documents in JSON format to this directory and '
> >>>> +                            'fail silently.')
> >>>>        parser.add_argument('--generator', dest='generator',
> >>>>                           help='Override the generator.')
> >>>>        args = parser.parse_args()
> >>>>
> >>>> -    if args.html2text:
> >>>> -        parseHTML = True
> >>>> -    if args.dump:
> >>>> -        dumpDir = args.dump
> >>>>        if args.verbose:
> >>>>            logging.basicConfig(stream=sys.stdout, level=logging.INFO)
> >>>>        else:
> >>>> @@ -610,39 +606,34 @@ if __name__ == '__main__':
> >>>>            # Also eliminates: 'Undecodable raw error response from server:' warning message
> >>>>            logging.getLogger("elasticsearch").setLevel(logging.ERROR)
> >>>>
> >>>> -    if args.generator:
> >>>> -        archiver_generator = args.generator
> >>>> -
> >>>> -    archie = Archiver(parseHTML = parseHTML)
> >>>> +    archie = Archiver(generator=args.generator or archiver_generator, parse_html=args.html2text)
> >>>>        # use binary input so parser can use appropriate charset
> >>>>        input_stream = sys.stdin.buffer
> >>>>
> >>>>        try:
> >>>> -        msgstring = input_stream.read()
> >>>> +        raw_message = input_stream.read()
> >>>>            try:
> >>>> -            msg = email.message_from_bytes(msgstring)
> >>>> +            msg = email.message_from_bytes(raw_message)
> >>>>            except Exception as err:
> >>>>                print("STDIN parser exception: %s" % err)
> >>>>
> >>>>            # We're reading from STDIN, so let's fake an MM3 call
> >>>>            ispublic = True
> >>>> -        ignorefrom = None
> >>>> -        allowfrom = None
> >>>>
> >>>>            if args.altheader:
> >>>> -            altheader = args.altheader[0]
> >>>> -            if altheader in msg:
> >>>> +            alt_header = args.altheader[0]
> >>>> +            if alt_header in msg:
> >>>>                    try:
> >>>> -                    msg.replace_header('List-ID', msg.get(altheader))
> >>>> +                    msg.replace_header('List-ID', msg.get(alt_header))
> >>>>                    except:
> >>>> -                    msg.add_header('list-id', msg.get(altheader))
> >>>> +                    msg.add_header('list-id', msg.get(alt_header))
> >>>>            elif 'altheader' in sys.argv:
> >>>> -            altheader = sys.argv[len(sys.argv)-1]
> >>>> -            if altheader in msg:
> >>>> +            alt_header = sys.argv[len(sys.argv)-1]
> >>>> +            if alt_header in msg:
> >>>>                    try:
> >>>> -                    msg.replace_header('List-ID', msg.get(altheader))
> >>>> +                    msg.replace_header('List-ID', msg.get(alt_header))
> >>>>                    except:
> >>>> -                    msg.add_header('list-id', msg.get(altheader))
> >>>> +                    msg.add_header('list-id', msg.get(alt_header))
> >>>>
> >>>>            # Set specific LID?
> >>>>            if args.lid and len(args.lid[0]) > 3:
> >>>> @@ -654,21 +645,21 @@ if __name__ == '__main__':
> >>>>
> >>>>            #Ignore based on --ignore flag?
> >>>>            if args.ignorefrom:
> >>>> -            ignorefrom = args.ignorefrom[0]
> >>>> -            if fnmatch.fnmatch(msg.get("from"), ignorefrom) or (msg.get("list-id") and fnmatch.fnmatch(msg.get("list-id"), ignorefrom)):
> >>>> +            ignore_from = args.ignorefrom[0]
> >>>> +            if fnmatch.fnmatch(msg.get("from"), ignore_from) or (msg.get("list-id") and fnmatch.fnmatch(msg.get("list-id"), ignore_from)):
> >>>>                print("Ignoring message as instructed by --ignore flag")
> >>>>                    sys.exit(0)
> >>>>
> >>>>            # Check CIDR if need be
> >>>>            if args.allowfrom:
> >>>> -            from netaddr import IPNetwork, IPAddress
> >>>> -            c = IPNetwork(args.allowfrom[0])
> >>>> +
> >>>> +            c = netaddr.IPNetwork(args.allowfrom[0])
> >>>>                good = False
> >>>>                for line in msg.get_all('received') or []:
> >>>>                    m = re.search(r"from .+\[(.+)\]", line)
> >>>>                    if m:
> >>>>                        try:
> >>>> -                        ip = IPAddress(m.group(1))
> >>>> +                        ip = netaddr.IPAddress(m.group(1))
> >>>>                            if ip in c:
> >>>>                                good = True
> >>>>                                msg.add_header("ip-whitelisted", "yes")
> >>>> @@ -681,19 +672,18 @@ class Archiver(object): # N.B. Also used by import-mbox.py
> >>>>            # Replace date header with $now?
> >>>>            if args.makedate:
> >>>>                msg.replace_header('date', email.utils.formatdate())
> >>>> -
> >>>> +        is_public = True
> >>>>            if args.private:
> >>>> -            ispublic = False
> >>>> +            is_public = False
> >>>>            if 'list-id' in msg:
> >>>>                if not msg.get('archived-at'):
> >>>>                msg.add_header('archived-at', email.utils.formatdate())
> >>>> -            list_data = namedtuple('importmsg', ['list_id', 'archive_public'])(list_id = msg.get('list-id'), archive_public=ispublic)
> >>>> +            list_data = namedtuple('importmsg', ['list_id', 'archive_public'])(list_id=msg.get('list-id'),
> >>>> +                                                                               archive_public=is_public)
> >>>>
> >>>>                try:
> >>>> -                lid, mid = archie.archive_message(list_data, msg, msgstring)
> >>>> +                lid, mid = archie.archive_message(args, list_data, msg, raw_message)
> >>>>                print("%s: Done archiving to %s as %s!" % (email.utils.formatdate(), lid, mid))
> >>>> -                if aURL:
> >>>> -                    print("Published as: %s/thread.html/%s" % (aURL, urllib.parse.quote(mid)))
> >>>>                except Exception as err:
> >>>>                    if args.verbose:
> >>>>                        traceback.print_exc()
> >>>> @@ -711,3 +701,6 @@ if __name__ == '__main__':
> >>>>                print("Could not parse email: %s (@ %s)" % (err, line))
> >>>>                sys.exit(-1)
> >>>>
> >>>> +
> >>>> +if __name__ == '__main__':
> >>>> +    main()
> >>>>
> >>
>
