Package: release.debian.org
Severity: normal
User: release.debian....@packages.debian.org
Usertags: unblock

Dear release team,

Please unblock mat2 0.12.1-1.

It ships improved support for EPUB and Microsoft Office files. It's a key
package, as doxygen build-depends on it, but no regressions have been reported
so far, and the autopkgtest results look good as well. The diff is quite small:

~ debdiff mat2_0.12.0-1.dsc mat2_0.12.1-1.dsc | diffstat

 CHANGELOG.md      |    5 +++++
 debian/changelog  |    7 +++++++
 doc/mat2.1        |    2 +-
 libmat2/epub.py   |   49 +++++++++++++++++++++++++++++++++++++++++++------
 libmat2/office.py |    2 ++
 mat2              |    2 +-
 setup.py          |    2 +-
 7 files changed, 60 insertions(+), 9 deletions(-)

Please find the full debdiff attached.

unblock mat2/0.12.1-1

Thanks for your work,
cheers,
Georg
diff -Nru mat2-0.12.0/CHANGELOG.md mat2-0.12.1/CHANGELOG.md
--- mat2-0.12.0/CHANGELOG.md	2020-12-18 16:55:41.000000000 +0000
+++ mat2-0.12.1/CHANGELOG.md	2021-03-19 16:54:21.000000000 +0000
@@ -1,3 +1,8 @@
+# 0.12.1 - 2021-03-19
+
+- Improve epub support
+- Improve MS Office support
+
 # 0.12.0 - 2020-12-18
 
 - Improve significantly MS Office formats support
diff -Nru mat2-0.12.0/debian/changelog mat2-0.12.1/debian/changelog
--- mat2-0.12.0/debian/changelog	2020-12-26 19:52:55.000000000 +0000
+++ mat2-0.12.1/debian/changelog	2021-03-20 19:11:38.000000000 +0000
@@ -1,3 +1,10 @@
+mat2 (0.12.1-1) unstable; urgency=medium
+
+  * New upstream version 0.12.1:
+    - Ships improved support of EPUB and Microsoft Office files.
+
+ -- Georg Faerber <ge...@debian.org>  Sat, 20 Mar 2021 19:11:38 +0000
+
 mat2 (0.12.0-1) unstable; urgency=medium
 
   * Team upload.
diff -Nru mat2-0.12.0/doc/mat2.1 mat2-0.12.1/doc/mat2.1
--- mat2-0.12.0/doc/mat2.1	2020-12-18 16:55:41.000000000 +0000
+++ mat2-0.12.1/doc/mat2.1	2021-03-19 16:54:21.000000000 +0000
@@ -1,4 +1,4 @@
-.TH mat2 "1" "December 2020" "mat2 0.12.0" "User Commands"
+.TH mat2 "1" "March 2021" "mat2 0.12.1" "User Commands"
 
 .SH NAME
 mat2 \- the metadata anonymisation toolkit 2
diff -Nru mat2-0.12.0/libmat2/epub.py mat2-0.12.1/libmat2/epub.py
--- mat2-0.12.0/libmat2/epub.py	2020-12-18 16:55:41.000000000 +0000
+++ mat2-0.12.1/libmat2/epub.py	2021-03-19 16:54:21.000000000 +0000
@@ -1,7 +1,9 @@
 import logging
 import re
 import uuid
+import zipfile
 import xml.etree.ElementTree as ET  # type: ignore
+from typing import Dict, Any
 
 from . import archive, office
 
@@ -15,11 +17,28 @@
             'META-INF/container.xml',
             'mimetype',
             'OEBPS/content.opf',
+            'content.opf',
+            'hmh.opf',
+            'OPS/.+.xml'
             }))
+        self.files_to_omit = set(map(re.compile, {  # type: ignore
+            'iTunesMetadata.plist',
+            'META-INF/calibre_bookmarks.txt',
+            'OEBPS/package.opf',
+             }))
         self.uniqid = uuid.uuid4()
 
-    def _specific_get_meta(self, full_path, file_path):
-        if file_path != 'OEBPS/content.opf':
+
+    def is_archive_valid(self):
+        super().is_archive_valid()
+        with zipfile.ZipFile(self.filename) as zin:
+            for item in self._get_all_members(zin):
+                member_name = self._get_member_name(item)
+                if member_name.endswith('META-INF/encryption.xml'):
+                    raise ValueError('the file contains encrypted fonts')
+
+    def _specific_get_meta(self, full_path, file_path) -> Dict[str, Any]:
+        if not file_path.endswith('.opf'):
             return {}
 
         with open(full_path, encoding='utf-8') as f:
@@ -30,14 +49,32 @@
             except (TypeError, UnicodeDecodeError):
                 return {file_path: 'harmful content', }
 
-    def _specific_cleanup(self, full_path: str):
-        if full_path.endswith('OEBPS/content.opf'):
+    def _specific_cleanup(self, full_path: str) -> bool:
+        if full_path.endswith('hmh.opf') or full_path.endswith('content.opf'):
             return self.__handle_contentopf(full_path)
         elif full_path.endswith('OEBPS/toc.ncx'):
             return self.__handle_tocncx(full_path)
+        elif re.search('/OPS/[^/]+.xml$', full_path):
+            return self.__handle_ops_xml(full_path)
         return True
 
-    def __handle_tocncx(self, full_path: str):
+    def __handle_ops_xml(self, full_path: str) -> bool:
+        try:
+            tree, namespace = office._parse_xml(full_path)
+        except ET.ParseError:  # pragma: nocover
+            logging.error("Unable to parse %s in %s.", full_path, self.filename)
+            return False
+
+        for item in tree.iterfind('.//', namespace):  # pragma: nocover
+            if item.tag.strip().lower().endswith('head'):
+                item.clear()
+                break
+        tree.write(full_path, xml_declaration=True, encoding='utf-8',
+                   short_empty_elements=False)
+        return True
+
+
+    def __handle_tocncx(self, full_path: str) -> bool:
         try:
             tree, namespace = office._parse_xml(full_path)
         except ET.ParseError:  # pragma: nocover
@@ -53,7 +90,7 @@
                    short_empty_elements=False)
         return True
 
-    def __handle_contentopf(self, full_path: str):
+    def __handle_contentopf(self, full_path: str) -> bool:
         try:
             tree, namespace = office._parse_xml(full_path)
         except ET.ParseError:
diff -Nru mat2-0.12.0/libmat2/office.py mat2-0.12.1/libmat2/office.py
--- mat2-0.12.0/libmat2/office.py	2020-12-18 16:55:41.000000000 +0000
+++ mat2-0.12.1/libmat2/office.py	2021-03-19 16:54:21.000000000 +0000
@@ -87,6 +87,7 @@
         self.files_to_keep = set(map(re.compile, {  # type: ignore
             r'^\[Content_Types\]\.xml$',
             r'^_rels/\.rels$',
+            r'^xl/sharedStrings\.xml$',  # https://docs.microsoft.com/en-us/office/open-xml/working-with-the-shared-string-table
             r'^(?:word|ppt|xl)/_rels/document\.xml\.rels$',
             r'^(?:word|ppt|xl)/_rels/footer[0-9]*\.xml\.rels$',
             r'^(?:word|ppt|xl)/_rels/header[0-9]*\.xml\.rels$',
@@ -108,6 +109,7 @@
             r'^ppt/slideMasters/_rels/slideMaster[0-9]+\.xml\.rels',
         }))
         self.files_to_omit = set(map(re.compile, {  # type: ignore
+            r'^\[trash\]/',
             r'^customXml/',
             r'webSettings\.xml$',
             r'^docProps/custom\.xml$',
diff -Nru mat2-0.12.0/mat2 mat2-0.12.1/mat2
--- mat2-0.12.0/mat2	2020-12-18 16:55:41.000000000 +0000
+++ mat2-0.12.1/mat2	2021-03-19 16:54:21.000000000 +0000
@@ -17,7 +17,7 @@
     print(e)
     sys.exit(1)
 
-__version__ = '0.12.0'
+__version__ = '0.12.1'
 
 # Make pyflakes happy
 assert Set
diff -Nru mat2-0.12.0/setup.py mat2-0.12.1/setup.py
--- mat2-0.12.0/setup.py	2020-12-18 16:55:41.000000000 +0000
+++ mat2-0.12.1/setup.py	2021-03-19 16:54:21.000000000 +0000
@@ -5,7 +5,7 @@
 
 setuptools.setup(
     name="mat2",
-    version='0.12.0',
+    version='0.12.1',
     author="Julien (jvoisin) Voisin",
     author_email="julien.voisin+m...@dustri.org",
     description="A handy tool to trash your metadata",

Reply via email to