Package: release.debian.org Severity: normal User: release.debian....@packages.debian.org Usertags: unblock
Dear release team, Please unblock mat2 0.12.1-1. It ships improved support for EPUB and Microsoft Office files. It's a key package, as doxygen build-depends on it, but no regressions have been reported so far, and the autopkgtest results look good as well. The diff is quite small: ~ debdiff mat2_0.12.0-1.dsc mat2_0.12.1-1.dsc | diffstat CHANGELOG.md | 5 +++++ debian/changelog | 7 +++++++ doc/mat2.1 | 2 +- libmat2/epub.py | 49 +++++++++++++++++++++++++++++++++++++++++++------ libmat2/office.py | 2 ++ mat2 | 2 +- setup.py | 2 +- 7 files changed, 60 insertions(+), 9 deletions(-) Please find the full debdiff attached. unblock mat2/0.12.1-1 Thanks for your work, cheers, Georg
diff -Nru mat2-0.12.0/CHANGELOG.md mat2-0.12.1/CHANGELOG.md --- mat2-0.12.0/CHANGELOG.md 2020-12-18 16:55:41.000000000 +0000 +++ mat2-0.12.1/CHANGELOG.md 2021-03-19 16:54:21.000000000 +0000 @@ -1,3 +1,8 @@ +# 0.12.1 - 2021-03-19 + +- Improve epub support +- Improve MS Office support + # 0.12.0 - 2020-12-18 - Improve significantly MS Office formats support diff -Nru mat2-0.12.0/debian/changelog mat2-0.12.1/debian/changelog --- mat2-0.12.0/debian/changelog 2020-12-26 19:52:55.000000000 +0000 +++ mat2-0.12.1/debian/changelog 2021-03-20 19:11:38.000000000 +0000 @@ -1,3 +1,10 @@ +mat2 (0.12.1-1) unstable; urgency=medium + + * New upstream version 0.12.1: + - Ships improved support of EPUB and Microsoft Office files. + + -- Georg Faerber <ge...@debian.org> Sat, 20 Mar 2021 19:11:38 +0000 + mat2 (0.12.0-1) unstable; urgency=medium * Team upload. diff -Nru mat2-0.12.0/doc/mat2.1 mat2-0.12.1/doc/mat2.1 --- mat2-0.12.0/doc/mat2.1 2020-12-18 16:55:41.000000000 +0000 +++ mat2-0.12.1/doc/mat2.1 2021-03-19 16:54:21.000000000 +0000 @@ -1,4 +1,4 @@ -.TH mat2 "1" "December 2020" "mat2 0.12.0" "User Commands" +.TH mat2 "1" "March 2021" "mat2 0.12.1" "User Commands" .SH NAME mat2 \- the metadata anonymisation toolkit 2 diff -Nru mat2-0.12.0/libmat2/epub.py mat2-0.12.1/libmat2/epub.py --- mat2-0.12.0/libmat2/epub.py 2020-12-18 16:55:41.000000000 +0000 +++ mat2-0.12.1/libmat2/epub.py 2021-03-19 16:54:21.000000000 +0000 @@ -1,7 +1,9 @@ import logging import re import uuid +import zipfile import xml.etree.ElementTree as ET # type: ignore +from typing import Dict, Any from . 
import archive, office @@ -15,11 +17,28 @@ 'META-INF/container.xml', 'mimetype', 'OEBPS/content.opf', + 'content.opf', + 'hmh.opf', + 'OPS/.+.xml' })) + self.files_to_omit = set(map(re.compile, { # type: ignore + 'iTunesMetadata.plist', + 'META-INF/calibre_bookmarks.txt', + 'OEBPS/package.opf', + })) self.uniqid = uuid.uuid4() - def _specific_get_meta(self, full_path, file_path): - if file_path != 'OEBPS/content.opf': + + def is_archive_valid(self): + super().is_archive_valid() + with zipfile.ZipFile(self.filename) as zin: + for item in self._get_all_members(zin): + member_name = self._get_member_name(item) + if member_name.endswith('META-INF/encryption.xml'): + raise ValueError('the file contains encrypted fonts') + + def _specific_get_meta(self, full_path, file_path) -> Dict[str, Any]: + if not file_path.endswith('.opf'): return {} with open(full_path, encoding='utf-8') as f: @@ -30,14 +49,32 @@ except (TypeError, UnicodeDecodeError): return {file_path: 'harmful content', } - def _specific_cleanup(self, full_path: str): - if full_path.endswith('OEBPS/content.opf'): + def _specific_cleanup(self, full_path: str) -> bool: + if full_path.endswith('hmh.opf') or full_path.endswith('content.opf'): return self.__handle_contentopf(full_path) elif full_path.endswith('OEBPS/toc.ncx'): return self.__handle_tocncx(full_path) + elif re.search('/OPS/[^/]+.xml$', full_path): + return self.__handle_ops_xml(full_path) return True - def __handle_tocncx(self, full_path: str): + def __handle_ops_xml(self, full_path: str) -> bool: + try: + tree, namespace = office._parse_xml(full_path) + except ET.ParseError: # pragma: nocover + logging.error("Unable to parse %s in %s.", full_path, self.filename) + return False + + for item in tree.iterfind('.//', namespace): # pragma: nocover + if item.tag.strip().lower().endswith('head'): + item.clear() + break + tree.write(full_path, xml_declaration=True, encoding='utf-8', + short_empty_elements=False) + return True + + + def __handle_tocncx(self, 
full_path: str) -> bool: try: tree, namespace = office._parse_xml(full_path) except ET.ParseError: # pragma: nocover @@ -53,7 +90,7 @@ short_empty_elements=False) return True - def __handle_contentopf(self, full_path: str): + def __handle_contentopf(self, full_path: str) -> bool: try: tree, namespace = office._parse_xml(full_path) except ET.ParseError: diff -Nru mat2-0.12.0/libmat2/office.py mat2-0.12.1/libmat2/office.py --- mat2-0.12.0/libmat2/office.py 2020-12-18 16:55:41.000000000 +0000 +++ mat2-0.12.1/libmat2/office.py 2021-03-19 16:54:21.000000000 +0000 @@ -87,6 +87,7 @@ self.files_to_keep = set(map(re.compile, { # type: ignore r'^\[Content_Types\]\.xml$', r'^_rels/\.rels$', + r'^xl/sharedStrings\.xml$', # https://docs.microsoft.com/en-us/office/open-xml/working-with-the-shared-string-table r'^(?:word|ppt|xl)/_rels/document\.xml\.rels$', r'^(?:word|ppt|xl)/_rels/footer[0-9]*\.xml\.rels$', r'^(?:word|ppt|xl)/_rels/header[0-9]*\.xml\.rels$', @@ -108,6 +109,7 @@ r'^ppt/slideMasters/_rels/slideMaster[0-9]+\.xml\.rels', })) self.files_to_omit = set(map(re.compile, { # type: ignore + r'^\[trash\]/', r'^customXml/', r'webSettings\.xml$', r'^docProps/custom\.xml$', diff -Nru mat2-0.12.0/mat2 mat2-0.12.1/mat2 --- mat2-0.12.0/mat2 2020-12-18 16:55:41.000000000 +0000 +++ mat2-0.12.1/mat2 2021-03-19 16:54:21.000000000 +0000 @@ -17,7 +17,7 @@ print(e) sys.exit(1) -__version__ = '0.12.0' +__version__ = '0.12.1' # Make pyflakes happy assert Set diff -Nru mat2-0.12.0/setup.py mat2-0.12.1/setup.py --- mat2-0.12.0/setup.py 2020-12-18 16:55:41.000000000 +0000 +++ mat2-0.12.1/setup.py 2021-03-19 16:54:21.000000000 +0000 @@ -5,7 +5,7 @@ setuptools.setup( name="mat2", - version='0.12.0', + version='0.12.1', author="Julien (jvoisin) Voisin", author_email="julien.voisin+m...@dustri.org", description="A handy tool to trash your metadata",