Apologies, it should be in working condition again now.

On 14/08/2020 16.55, sebb wrote:
This commit broke the functioning of the --generator option for the archiver.

Please fix.

On Mon, 10 Aug 2020 at 17:13, <humbed...@apache.org> wrote:

This is an automated email from the ASF dual-hosted git repository.

humbedooh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-ponymail.git


The following commit(s) were added to refs/heads/master by this push:
      new 95beb51  Pull main operation into main(), linting/PEP conformity fixes
95beb51 is described below

commit 95beb51158b58bcb9fdb1371af7699b72598ac34
Author: Daniel Gruno <humbed...@apache.org>
AuthorDate: Mon Aug 10 18:13:05 2020 +0200

     Pull main operation into main(), linting/PEP conformity fixes

     Also prepping for adding ES 7.x support.
     import-mbox will need to be updated shortly
---
  tools/archiver.py | 123 +++++++++++++++++++++++++-----------------------------
  1 file changed, 58 insertions(+), 65 deletions(-)

diff --git a/tools/archiver.py b/tools/archiver.py
index b97db76..2c50f19 100755
--- a/tools/archiver.py
+++ b/tools/archiver.py
@@ -62,23 +62,19 @@ import uuid
  import json
  import certifi
  import urllib.parse
+import argparse
+import netaddr

  # Fetch config
  config = PonymailConfig()
  auth = None
-parseHTML = False
-iBody = None  # N.B. Also used by import-mbox.py
-args=None
-dumpDir = None
-aURL = None

  if config.has_section('mailman') and config.has_option('mailman', 'plugin'):
      from zope.interface import implementer
      from mailman.interfaces.archiver import IArchiver
      from mailman.interfaces.archiver import ArchivePolicy
      logger = logging.getLogger("mailman.archiver")
-elif __name__ == '__main__':
-    import argparse
+

  if config.has_option('archiver', 'baseurl'):
      aURL = config.get('archiver', 'baseurl')
@@ -102,8 +98,7 @@ def parse_attachment(part):
          if cdtype == "attachment" or cdtype == 'inline':
              fd = part.get_payload(decode=True)
              # Allow for empty string
-            if fd is None:
-                return None, None
+            if fd is None: return None, None
              filename = part.get_filename()
              if filename:
                  print("Found attachment: %s" % filename)
@@ -160,7 +155,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py

      """ Intercept index calls and fix up consistency argument """
      def index(self, **kwargs):
-        if ES_MAJOR in [5,6]:
+        if ES_MAJOR in [5,6,7]:
              if kwargs.pop('consistency', None): # drop the key if present
                  if self.wait_for_active_shards: # replace with wait if defined
                      kwargs['wait_for_active_shards'] = 
self.wait_for_active_shards
@@ -168,10 +163,11 @@ class Archiver(object): # N.B. Also used by import-mbox.py
              **kwargs
          )

-    def __init__(self, parseHTML=False):
+    def __init__(self, generator='full', parse_html=False, dump_dir=None):
          """ Just initialize ES. """
-        self.html = parseHTML
-        if parseHTML:
+        self.html = parse_html
+        self.generator = generator
+        if parse_html:
              import html2text
              self.html2text = html2text.html2text
          self.dbname = config.get("elasticsearch", "dbname")
@@ -180,7 +176,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
          self.consistency = config.get('elasticsearch', 'write', 
fallback='quorum')
          if ES_MAJOR == 2:
              pass
-        elif ES_MAJOR in [5,6]:
+        elif ES_MAJOR in [5,6,7]:
              self.wait_for_active_shards = config.get('elasticsearch', 'wait', 
fallback=1)
          else:
              raise Exception("Unexpected elasticsearch version ", ES_VERSION)
@@ -209,14 +205,14 @@ class Archiver(object): # N.B. Also used by import-mbox.py
              }
              )
          # If we have a dump dir, we can risk failing the connection.
-        if dumpDir:
+        if dump_dir:
              try:
                  self.es = Elasticsearch(dbs,
                      max_retries=5,
                      retry_on_timeout=True
                      )
              except:
-                print("ES connection failed, but dumponfail specified, dumping to 
%s" % dumpDir)
+                print("ES connection failed, but dumponfail specified, dumping to 
%s" % dump_dir)
          else:
              self.es = Elasticsearch(dbs,
                  max_retries=5,
@@ -233,13 +229,12 @@ class Archiver(object): # N.B. Also used by import-mbox.py
                  contents[part_meta['hash']] = part_file
          return attachments, contents

-
-    def msgbody(self, msg):
+    def msgbody(self, msg, verbose=False, ignore_body=None):
          body = None
          firstHTML = None
          for part in msg.walk():
              # can be called from importer
-            if args and args.verbose:
+            if verbose:
                  print("Content-Type: %s" % part.get_content_type())
              """
                  Find the first body part and the first HTML part
@@ -251,12 +246,12 @@ class Archiver(object): # N.B. Also used by import-mbox.py
                  if not body and part.get_content_type() == 'text/enriched':
                      body = part.get_payload(decode=True)
                  elif self.html and not firstHTML and part.get_content_type() 
== 'text/html':
-                    firstHTML = part.get_payload(decode=True)
+                    first_html = part.get_payload(decode=True)
              except Exception as err:
                  print(err)

          # this requires a GPL lib, user will have to install it themselves
-        if firstHTML and (not body or len(body) <= 1 or (iBody and 
str(body).find(str(iBody)) != -1)):
+        if firstHTML and (not body or len(body) <= 1 or (ignore_body and 
str(body).find(str(ignore_body)) != -1)):
              body = self.html2text(firstHTML.decode("utf-8", 'ignore') if 
type(firstHTML) is bytes else firstHTML)

          # See issue#463
@@ -273,7 +268,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
          return body

      # N.B. this is also called by import-mbox.py
-    def compute_updates(self, lid, private, msg):
+    def compute_updates(self, args, lid, private, msg):
          """Determine what needs to be sent to the archiver.

          :param lid: The list id
@@ -324,7 +319,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
          # mdate calculations are all done, prepare the index entry
          epoch = email.utils.mktime_tz(mdate)
          mdatestring = time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(epoch))
-        body = self.msgbody(msg)
+        body = self.msgbody(msg, verbose=args.verbose, ignore_body=args.ibody)
          try:
              if 'content-type' in msg_metadata and 
msg_metadata['content-type'].find("flowed") != -1:
                  body = convertToWrapped(body, character_set="utf-8")
@@ -352,8 +347,9 @@ class Archiver(object): # N.B. Also used by import-mbox.py
              except Exception as err:
                  if logger:
                      # N.B. use .get just in case there is no message-id
-                    logger.warning("Could not generate MID: %s. MSGID: %s", 
err, msg_metadata.get('message-id','?'))
+                    logger.warning("Could not generate MID: %s. MSGID: %s", 
err, msg_metadata.get('message-id', '?'))
                  mid = pmid
+
              if 'in-reply-to' in msg_metadata:
                  try:
                      try:
@@ -383,7 +379,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py

          return  ojson, contents, msg_metadata, irt

-    def archive_message(self, mlist, msg, raw_msg):
+    def archive_message(self, args, mlist, msg, raw_message):
          """Send the message to the archiver.

          :param mlist: The IMailingList object.
@@ -402,7 +398,7 @@ class Archiver(object): # N.B. Also used by import-mbox.py
          elif hasattr(mlist, 'archive_policy') and mlist.archive_policy is not 
ArchivePolicy.public:
              private = True

-        ojson, contents, msg_metadata, irt = self.compute_updates(lid, 
private, msg)
+        ojson, contents, msg_metadata, irt = self.compute_updates(args, lid, 
private, msg)
          if not ojson:
              _id = msg.get('message-id') or msg.get('Subject') or 
msg.get("Date")
              raise Exception("Could not parse message %s for %s" % (_id,lid))
@@ -438,23 +434,23 @@ class Archiver(object): # N.B. Also used by import-mbox.py
                  consistency = self.consistency,
                  body = {
                      "message-id": msg_metadata['message-id'],
-                    "source": self.mbox_source(raw_msg)
+                    "source": self.mbox_source(raw_message)
                  }
              )
          # If we have a dump dir and ES failed, push to dump dir instead as a 
JSON object
          # We'll leave it to another process to pick up the slack.
          except Exception as err:
-            if dumpDir:
+            if args.dump:
                  print("Pushing to ES failed, but dumponfail specified, dumping 
JSON docs")
                  uid = uuid.uuid4()
-                mboxPath = os.path.join(dumpDir, "%s.json" % uid)
+                mboxPath = os.path.join(args.dump, "%s.json" % uid)
                  with open(mboxPath, "w") as f:
                      json.dump({
                          'id': ojson['mid'],
                          'mbox': ojson,
                          'mbox_source': {
                              "message-id": msg_metadata['message-id'],
-                            "source": self.mbox_source(raw_msg)
+                            "source": self.mbox_source(raw_message)
                          },
                          'attachments': contents
                      },f , indent = 2)
@@ -488,8 +484,8 @@ class Archiver(object): # N.B. Also used by import-mbox.py
              if dm:
                  cid = dm.group(1)
                  mid = dm.group(2)
-                if self.es.exists(index = self.dbname, doc_type = 'account', 
id = cid):
-                    doc = self.es.get(index = self.dbname, doc_type = 
'account', id = cid)
+                if self.es.exists(index=self.dbname, doc_type='account', 
id=cid):
+                    doc = self.es.get(index=self.dbname, doc_type='account', 
id=cid)
                      if doc:
                          oldrefs.append(cid)
                          # N.B. no index is supplied, so ES will generate one
@@ -571,7 +567,8 @@ class Archiver(object): # N.B. Also used by import-mbox.py
          """
          return None

-if __name__ == '__main__':
+
+def main():
      parser = argparse.ArgumentParser(description='Command line options.')
      parser.add_argument('--lid', dest='lid', type=str, nargs=1,
                         help='Alternate specific list ID')
@@ -593,16 +590,15 @@ if __name__ == '__main__':
                         help='Try to convert HTML to text if no text/plain 
message is found')
      parser.add_argument('--dry', dest='dry', action='store_true',
                         help='Do not save emails to elasticsearch, only test 
parsing')
+    parser.add_argument('--ignorebody', dest='ibody', type=str, nargs=1,
+                        help='Optional email bodies to treat as empty (in 
conjunction with --html2text)')
      parser.add_argument('--dumponfail', dest='dump',
-                       help='If pushing to ElasticSearch fails, dump documents 
in JSON format to this directory and fail silently.')
+                       help='If pushing to ElasticSearch fails, dump documents 
in JSON format to this directory and '
+                            'fail silently.')
      parser.add_argument('--generator', dest='generator',
                         help='Override the generator.')
      args = parser.parse_args()

-    if args.html2text:
-        parseHTML = True
-    if args.dump:
-        dumpDir = args.dump
      if args.verbose:
          logging.basicConfig(stream=sys.stdout, level=logging.INFO)
      else:
@@ -610,39 +606,34 @@ if __name__ == '__main__':
          # Also eliminates: 'Undecodable raw error response from server:' 
warning message
          logging.getLogger("elasticsearch").setLevel(logging.ERROR)

-    if args.generator:
-        archiver_generator = args.generator
-
-    archie = Archiver(parseHTML = parseHTML)
+    archie = Archiver(generator=args.generator or archiver_generator, 
parse_html=args.html2text)
      # use binary input so parser can use appropriate charset
      input_stream = sys.stdin.buffer

      try:
-        msgstring = input_stream.read()
+        raw_message = input_stream.read()
          try:
-            msg = email.message_from_bytes(msgstring)
+            msg = email.message_from_bytes(raw_message)
          except Exception as err:
              print("STDIN parser exception: %s" % err)

          # We're reading from STDIN, so let's fake an MM3 call
          ispublic = True
-        ignorefrom = None
-        allowfrom = None

          if args.altheader:
-            altheader = args.altheader[0]
-            if altheader in msg:
+            alt_header = args.altheader[0]
+            if alt_header in msg:
                  try:
-                    msg.replace_header('List-ID', msg.get(altheader))
+                    msg.replace_header('List-ID', msg.get(alt_header))
                  except:
-                    msg.add_header('list-id', msg.get(altheader))
+                    msg.add_header('list-id', msg.get(alt_header))
          elif 'altheader' in sys.argv:
-            altheader = sys.argv[len(sys.argv)-1]
-            if altheader in msg:
+            alt_header = sys.argv[len(sys.argv)-1]
+            if alt_header in msg:
                  try:
-                    msg.replace_header('List-ID', msg.get(altheader))
+                    msg.replace_header('List-ID', msg.get(alt_header))
                  except:
-                    msg.add_header('list-id', msg.get(altheader))
+                    msg.add_header('list-id', msg.get(alt_header))

          # Set specific LID?
          if args.lid and len(args.lid[0]) > 3:
@@ -654,21 +645,21 @@ if __name__ == '__main__':

          #Ignore based on --ignore flag?
          if args.ignorefrom:
-            ignorefrom = args.ignorefrom[0]
-            if fnmatch.fnmatch(msg.get("from"), ignorefrom) or (msg.get("list-id") and 
fnmatch.fnmatch(msg.get("list-id"), ignorefrom)):
+            ignore_from = args.ignorefrom[0]
+            if fnmatch.fnmatch(msg.get("from"), ignore_from) or (msg.get("list-id") and 
fnmatch.fnmatch(msg.get("list-id"), ignore_from)):
                  print("Ignoring message as instructed by --ignore flag")
                  sys.exit(0)

          # Check CIDR if need be
          if args.allowfrom:
-            from netaddr import IPNetwork, IPAddress
-            c = IPNetwork(args.allowfrom[0])
+
+            c = netaddr.IPNetwork(args.allowfrom[0])
              good = False
              for line in msg.get_all('received') or []:
                  m = re.search(r"from .+\[(.+)\]", line)
                  if m:
                      try:
-                        ip = IPAddress(m.group(1))
+                        ip = netaddr.IPAddress(m.group(1))
                          if ip in c:
                              good = True
                              msg.add_header("ip-whitelisted", "yes")
@@ -681,19 +672,18 @@ if __name__ == '__main__':
          # Replace date header with $now?
          if args.makedate:
              msg.replace_header('date', email.utils.formatdate())
-
+        is_public = True
          if args.private:
-            ispublic = False
+            is_public = False
          if 'list-id' in msg:
              if not msg.get('archived-at'):
                  msg.add_header('archived-at', email.utils.formatdate())
-            list_data = namedtuple('importmsg', ['list_id', 
'archive_public'])(list_id = msg.get('list-id'), archive_public=ispublic)
+            list_data = namedtuple('importmsg', ['list_id', 
'archive_public'])(list_id=msg.get('list-id'),
+                                                                               
archive_public=is_public)

              try:
-                lid, mid = archie.archive_message(list_data, msg, msgstring)
+                lid, mid = archie.archive_message(args, list_data, msg, 
raw_message)
                  print("%s: Done archiving to %s as %s!" % 
(email.utils.formatdate(), lid, mid))
-                if aURL:
-                    print("Published as: %s/thread.html/%s" % (aURL, 
urllib.parse.quote(mid)))
              except Exception as err:
                  if args.verbose:
                      traceback.print_exc()
@@ -711,3 +701,6 @@ if __name__ == '__main__':
              print("Could not parse email: %s (@ %s)" % (err, line))
              sys.exit(-1)

+
+if __name__ == '__main__':
+    main()


Reply via email to