On 28/06/17 17:48, Daniel Axtens wrote:
It turns out that the attempts in clean_header() to convert headers to strings are not guaranteed to work: you can end up with, for example, a base64 decoding error which makes it impossible to determine any header content. In this case, sanitise_header() should return None, and thus clean_header() should return None. We then need to plumb that through. Signed-off-by: Daniel Axtens <d...@axtens.net>
All looks good Reviewed-by: Andrew Donnellan <andrew.donnel...@au1.ibm.com>
--- patchwork/parser.py | 70 +++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 55 insertions(+), 15 deletions(-) diff --git a/patchwork/parser.py b/patchwork/parser.py index 032af8a7be7c..203e11584504 100644 --- a/patchwork/parser.py +++ b/patchwork/parser.py @@ -23,6 +23,7 @@ from email.header import decode_header from email.header import make_header from email.utils import mktime_tz from email.utils import parsedate_tz +from email.errors import HeaderParseError from fnmatch import fnmatch import logging import re @@ -67,6 +68,13 @@ def sanitise_header(header_contents, header_name=None): then encode the result with make_header() """ + try: + value = decode_header(header_contents) + except HeaderParseError: + # something has gone really wrong with header parsing. + # (e.g. base64 decoding) We probably can't recover, so: + return None + # We have some Py2/Py3 issues here. # # Firstly, the email parser (before we get here) @@ -85,7 +93,6 @@ def sanitise_header(header_contents, header_name=None): # We solve this by catching any Unicode errors, and then manually # handling any interesting headers. - value = decode_header(header_contents) try: header = make_header(value, header_name=header_name, @@ -130,6 +137,9 @@ def clean_header(header): sane_header = sanitise_header(header) + if sane_header is None: + return None + # on Py2, we want to do unicode(), on Py3, str(). # That gets us the decoded, un-wrapped header. if six.PY2: @@ -157,9 +167,12 @@ def find_project_by_header(mail): for header in list_id_headers: if header in mail: + h = clean_header(mail.get(header)) + if not h: + continue for listid_re in listid_res: - match = listid_re.match(clean_header(mail.get(header))) + match = listid_re.match(h) if match: break @@ -205,7 +218,11 @@ def _find_series_by_references(project, mail): Returns: The matching ``Series`` instance, if any """ - for ref in [clean_header(mail.get('Message-Id'))] + find_references(mail): + refs = find_references(mail) + h = clean_header(mail.get('Message-Id')) + if h: + refs = [h] + refs + for ref in refs: try: return SeriesReference.objects.get( msgid=ref, series__project=project).series @@ -274,6 +291,10 @@ def find_series(project, mail): def find_author(mail): from_header = clean_header(mail.get('From')) + + if not from_header: + raise ValueError("Invalid 'From' header") + name, email = (None, None) # tuple of (regex, fn) @@ -320,7 +341,10 @@ def find_author(mail): def find_date(mail): - t = parsedate_tz(clean_header(mail.get('Date', ''))) + h = clean_header(mail.get('Date', '')) + if not h: + return datetime.datetime.utcnow() + t = parsedate_tz(h) if not t: return datetime.datetime.utcnow() return datetime.datetime.utcfromtimestamp(mktime_tz(t)) @@ -331,7 +355,7 @@ def find_headers(mail): for key, value in mail.items()] strings = [('%s: %s' % (key, header.encode())) - for (key, header) in headers] + for (key, header) in headers if header is not None] return '\n'.join(strings) @@ -347,11 +371,16 @@ def find_references(mail): if 'In-Reply-To' in mail: for in_reply_to in mail.get_all('In-Reply-To'): - refs.append(clean_header(in_reply_to).strip()) + r = clean_header(in_reply_to) + if r: + refs.append(r.strip()) if 'References' in mail: for references_header in mail.get_all('References'): - references = clean_header(references_header).split() + h = clean_header(references_header) + if not h: + continue + references = h.split() references.reverse() for ref in references: ref = ref.strip() @@ -573,6 +602,9 @@ def clean_subject(subject, drop_prefixes=None): prefix_re = re.compile(r'^\[([^\]]*)\]\s*(.*)$') subject = clean_header(subject) + if not subject: + raise ValueError("Invalid 'Subject' header") + if drop_prefixes is None: drop_prefixes = [] else: @@ -610,7 +642,11 @@ def subject_check(subject): """Determine if a mail is a reply.""" comment_re = re.compile(r'^(re)[:\s]\s*', re.I) - return comment_re.match(clean_header(subject)) + h = clean_header(subject) + if not h: + return False + + return comment_re.match(h) def clean_content(content): @@ -790,10 +826,10 @@ def parse_pull_request(content): def find_state(mail): """Return the state with the given name or the default.""" - state_name = clean_header(mail.get('X-Patchwork-State', '')).strip() + state_name = clean_header(mail.get('X-Patchwork-State', '')) if state_name: try: - return State.objects.get(name__iexact=state_name) + return State.objects.get(name__iexact=state_name.strip()) except State.DoesNotExist: pass return get_default_initial_patch_state() @@ -827,10 +863,10 @@ def find_delegate_by_filename(project, filenames): def find_delegate_by_header(mail): """Return the delegate with the given email or None.""" - delegate_email = clean_header(mail.get('X-Patchwork-Delegate', '')).strip() + delegate_email = clean_header(mail.get('X-Patchwork-Delegate', '')) if delegate_email: try: - return User.objects.get(email__iexact=delegate_email) + return User.objects.get(email__iexact=delegate_email.strip()) except User.DoesNotExist: pass return None @@ -856,8 +892,8 @@ def parse_mail(mail, list_id=None): if 'Message-Id' not in mail: raise ValueError("Missing 'Message-Id' header") - hint = clean_header(mail.get('X-Patchwork-Hint', '')).lower() - if hint == 'ignore': + hint = clean_header(mail.get('X-Patchwork-Hint', '')) + if hint and hint.lower() == 'ignore': logger.debug("Ignoring email due to 'ignore' hint") return @@ -872,7 +908,11 @@ def parse_mail(mail, list_id=None): # parse metadata - msgid = clean_header(mail.get('Message-Id')).strip() + msgid = clean_header(mail.get('Message-Id')) + if not msgid: + raise ValueError("Broken 'Message-Id' header") + msgid = msgid.strip() + author = find_author(mail) subject = mail.get('Subject') name, prefixes = clean_subject(subject, [project.linkname])
-- Andrew Donnellan OzLabs, ADL Canberra andrew.donnel...@au1.ibm.com IBM Australia Limited _______________________________________________ Patchwork mailing list Patchwork@lists.ozlabs.org https://lists.ozlabs.org/listinfo/patchwork