Script 'mail_helper' called by obssrc
Hello community,

here is the log from the commit of package python-pyocr for openSUSE:Factory 
checked in at 2021-08-28 22:29:33
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/python-pyocr (Old)
 and      /work/SRC/openSUSE:Factory/.python-pyocr.new.1899 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Package is "python-pyocr"

Sat Aug 28 22:29:33 2021 rev:5 rq:914761 version:0.8

Changes:
--------
--- /work/SRC/openSUSE:Factory/python-pyocr/python-pyocr.changes        2019-09-11 10:36:43.503273026 +0200
+++ /work/SRC/openSUSE:Factory/.python-pyocr.new.1899/python-pyocr.changes      2021-08-28 22:29:55.734026009 +0200
@@ -1,0 +2,9 @@
+Thu Aug 26 10:41:00 UTC 2021 - John Paul Adrian Glaubitz <[email protected]>
+
+- Update to 0.8:
+  * Replaced libtesseract.image_to_pdf() by an object-oriented API that allows
+    creating PDF with more than 1 page (thanks to Matthias Kraus).
+  * Tesseract 4 + sys.frozen=True: Fix TESSDATA_PREFIX: starting with
+    Tesseract 4, the path must include tessdata/
+
+-------------------------------------------------------------------

Old:
----
  python-pyocr-0.7.2.tar.gz

New:
----
  python-pyocr-0.8.tar.gz

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Other differences:
------------------
++++++ python-pyocr.spec ++++++
--- /var/tmp/diff_new_pack.ZVIiQh/_old  2021-08-28 22:29:56.114026431 +0200
+++ /var/tmp/diff_new_pack.ZVIiQh/_new  2021-08-28 22:29:56.118026436 +0200
@@ -1,7 +1,7 @@
 #
 # spec file for package python-pyocr
 #
-# Copyright (c) 2019 SUSE LINUX GmbH, Nuernberg, Germany.
+# Copyright (c) 2021 SUSE LLC
 #
 # All modifications and additions to the file contributed by third parties
 # remain the property of their copyright owners, unless otherwise agreed
@@ -16,11 +16,11 @@
 #
 
 
-%define sha f4b068cdf359186bfbed36959c53e9e52e2eda84
+%define sha f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc
 %define skip_python2 1
 %{?!python_module:%define python_module() python-%{**} python3-%{**}}
 Name:           python-pyocr
-Version:        0.7.2
+Version:        0.8
 Release:        0
 Summary:        Python wrapper for OCR engines
 License:        GPL-3.0-or-later

++++++ python-pyocr-0.7.2.tar.gz -> python-pyocr-0.8.tar.gz ++++++
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' 
old/pyocr-0.7.2-f4b068cdf359186bfbed36959c53e9e52e2eda84/.git_archival.txt 
new/pyocr-0.8-f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc/.git_archival.txt
--- old/pyocr-0.7.2-f4b068cdf359186bfbed36959c53e9e52e2eda84/.git_archival.txt  
2019-06-22 20:10:54.000000000 +0200
+++ new/pyocr-0.8-f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc/.git_archival.txt    
2021-01-01 16:59:54.000000000 +0100
@@ -1 +1 @@
-ref-names: tag: 0.7.2, refs/keep-around/f4b068cdf359186bfbed36959c53e9e52e2eda84
+ref-names: tag: 0.8, refs/keep-around/f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' 
old/pyocr-0.7.2-f4b068cdf359186bfbed36959c53e9e52e2eda84/.gitlab-ci.yml 
new/pyocr-0.8-f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc/.gitlab-ci.yml
--- old/pyocr-0.7.2-f4b068cdf359186bfbed36959c53e9e52e2eda84/.gitlab-ci.yml     
2019-06-22 20:10:54.000000000 +0200
+++ new/pyocr-0.8-f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc/.gitlab-ci.yml       
2021-01-01 16:59:54.000000000 +0100
@@ -13,6 +13,9 @@
 
 
 check:
+  only:
+    - branches@World/OpenPaperwork/pyocr
+    - tags@World/OpenPaperwork/pyocr
   tags:
     - linux
     - volatile
@@ -23,12 +26,15 @@
 
 
 test:
+  only:
+    - branches@World/OpenPaperwork/pyocr
+    - tags@World/OpenPaperwork/pyocr
   tags:
     - linux
     - volatile
   <<: *apt
   script:
-    - apt-get install -y -qq python-tox
+    - apt-get install -y -qq tox
     # required for Pillow
     - apt-get install -y -qq zlib1g-dev
     - apt-get install -y -qq libjpeg-dev
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' 
old/pyocr-0.7.2-f4b068cdf359186bfbed36959c53e9e52e2eda84/ChangeLog 
new/pyocr-0.8-f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc/ChangeLog
--- old/pyocr-0.7.2-f4b068cdf359186bfbed36959c53e9e52e2eda84/ChangeLog  
2019-06-22 20:10:54.000000000 +0200
+++ new/pyocr-0.8-f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc/ChangeLog    
2021-01-01 16:59:54.000000000 +0100
@@ -1,3 +1,9 @@
+01/01/2020 - 0.8.0:
+- Replaced libtesseract.image_to_pdf() by an object-oriented API that allows
+  creating PDF with more than 1 page (thanks to Matthias Kraus).
+- Tesseract 4 + sys.frozen=True: Fix TESSDATA_PREFIX: starting with
+  Tesseract 4, the path must include tessdata/
+
 22/06/2019 - 0.7.2:
 - Fix setup.py on Windows
 
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' 
old/pyocr-0.7.2-f4b068cdf359186bfbed36959c53e9e52e2eda84/README.md 
new/pyocr-0.8-f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc/README.md
--- old/pyocr-0.7.2-f4b068cdf359186bfbed36959c53e9e52e2eda84/README.md  
2019-06-22 20:10:54.000000000 +0200
+++ new/pyocr-0.8-f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc/README.md    
2021-01-01 16:59:54.000000000 +0100
@@ -249,11 +249,28 @@
 import PIL.Image
 import pyocr
 
-pyocr.libtesseract.image_to_pdf(
-    PIL.Image.open("image.jpg"),
-    "output_filename"  # .pdf will be appended
-)
+image = PIL.Image.open("image.jpg")
 
+builder = pyocr.libtesseract.LibtesseractPdfBuilder()
+builder.add_image(image)    # multiple images are added as separate pages
+builder.set_lang("deu")     # optional
+builder.set_output_file("output_filename") # .pdf will be appended
+builder.build()
+```
+
+#### Add text layer to PDF
+
+```Python
+import pyocr
+import pdf2image
+
+images = pdf2image.convert_from_path("file.pdf", dpi=200, fmt='jpg')
+
+builder = pyocr.libtesseract.LibtesseractPdfBuilder()
+for image in images:
+    builder.add_image(image)
+builder.set_output_file("output") # .pdf will be appended
+builder.build()
 ```
 
 Beware this code hasn't been adapted to libtesseract 3 yet.
@@ -296,7 +313,7 @@
 
 ## Contact
 
-* [Mailing-list](https://gitlab.gnome.org/World/OpenPaperwork/paperwork/wikis/Contact#mailing-list)
+* [Forum](https://forum.openpaper.work/)
 * [Bug tracker](https://gitlab.gnome.org/World/OpenPaperwork/pyocr/issues)
 
 
@@ -307,7 +324,7 @@
 * [Paperwork](https://gitlab.gnome.org/World/OpenPaperwork/paperwork#readme)
 
 If you know of any other applications that use Pyocr, please
-[tell us](https://gitlab.gnome.org/World/OpenPaperwork/paperwork/wikis/Contact#mailing-list) :-)
+[tell us](https://forum.openpaper.work/) :-)
 
 ## Copyright
 
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' 
old/pyocr-0.7.2-f4b068cdf359186bfbed36959c53e9e52e2eda84/setup.py 
new/pyocr-0.8-f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc/setup.py
--- old/pyocr-0.7.2-f4b068cdf359186bfbed36959c53e9e52e2eda84/setup.py   
2019-06-22 20:10:54.000000000 +0200
+++ new/pyocr-0.8-f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc/setup.py     
2021-01-01 16:59:54.000000000 +0100
@@ -54,8 +54,12 @@
 
 setup(
     name="pyocr",
-    description=("A Python wrapper for OCR engines (Tesseract, Cuneiform,"
-                 " etc)"),
+    description=(
+        "A Python wrapper for OCR engines (Tesseract, Cuneiform, etc)"
+    ),
+    long_description=(
+        "A Python wrapper for OCR engines (Tesseract, Cuneiform, etc)"
+    ),
     keywords="tesseract cuneiform ocr",
     version=version,
     url="https://gitlab.gnome.org/World/OpenPaperwork/pyocr",
@@ -87,7 +91,7 @@
     },
     data_files=[],
     scripts=[],
-    zip_safe=True,
+    zip_safe=(os.name != 'nt'),
     python_requires='>=3.4',
     install_requires=[
         "Pillow",
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' 
old/pyocr-0.7.2-f4b068cdf359186bfbed36959c53e9e52e2eda84/src/pyocr/libtesseract/__init__.py
 
new/pyocr-0.8-f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc/src/pyocr/libtesseract/__init__.py
--- 
old/pyocr-0.7.2-f4b068cdf359186bfbed36959c53e9e52e2eda84/src/pyocr/libtesseract/__init__.py
 2019-06-22 20:10:54.000000000 +0200
+++ 
new/pyocr-0.8-f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc/src/pyocr/libtesseract/__init__.py
   2021-01-01 16:59:54.000000000 +0100
@@ -195,29 +195,96 @@
         textonly: create pdf with only one invisible text layer. Defaults to
             False.
     '''
-    handle = tesseract_raw.init(lang=lang)
-    renderer = None
-    try:
-        tesseract_raw.set_image(handle, image)
-        tesseract_raw.set_page_seg_mode(
-            handle, tesseract_raw.PageSegMode.AUTO_OSD
-        )
-
-        tesseract_raw.set_input_name(handle, input_file)
-        tesseract_raw.recognize(handle)
-
-        renderer = tesseract_raw.init_pdf_renderer(
-            handle, output_file, textonly
-        )
-        assert(renderer)
-
-        tesseract_raw.begin_document(renderer, "")
-        tesseract_raw.add_renderer_image(handle, renderer)
-        tesseract_raw.end_document(renderer)
-    finally:
-        tesseract_raw.cleanup(handle)
-        if renderer:
-            tesseract_raw.cleanup(renderer)
+    LibtesseractPdfBuilder()\
+        .set_lang(lang)\
+        .set_output_file(output_file)\
+        .set_text_only(textonly)\
+        .add_image(image)\
+        .build()
+
+
+class LibtesseractPdfBuilder(object):
+    '''
+    Creates a pdf file with embeded text based on OCR from one or more images.
+    '''
+
+    def __init__(self):
+        self.images = []
+        self.output_file = None
+        self.lang = None
+        self.text_only = False
+
+    def set_lang(self, lang):
+        '''
+        Language to be used for ocr.
+        :param lang: three letter language code. For available languages see
+            https://github.com/tesseract-ocr/tesseract/blob/master/doc/tesseract.1.asc#languages.
+            Defaults to None.
+        '''
+        self.lang = lang
+        return self
+
+    def set_output_file(self, output_file):
+        self.output_file = output_file
+        return self
+
+    def set_text_only(self, text_only):
+        '''
+        :param text_only: create pdf with only one invisible text layer.
+        Defaults to False.
+        '''
+        self.text_only = text_only
+        return self
+
+    def add_image(self, img):
+        '''
+        Add an image to be converted to a page in the pdf
+        :param img: image to convert
+        '''
+        self.images.append(img)  # or something else
+        return self
+
+    def __validate(self):
+        if len(self.images) < 1:
+            raise ValueError(
+                "At least one image is required to build the pdf!"
+            )
+
+        if self.output_file is None:
+            raise ValueError("An output-file is required to build the pdf!")
+
+    def build(self):
+        '''
+        Create and write PDF file.
+        '''
+        self.__validate()
+
+        handle = tesseract_raw.init(lang=self.lang)
+        renderer = None
+        try:
+            tesseract_raw.set_page_seg_mode(
+                handle, tesseract_raw.PageSegMode.AUTO_OSD
+            )
+
+            renderer = tesseract_raw.init_pdf_renderer(
+                handle, self.output_file, self.text_only
+            )
+            assert renderer
+
+            tesseract_raw.begin_document(renderer, "")
+
+            for image in self.images:
+                tesseract_raw.set_image(handle, image)
+
+                # tesseract_raw.set_input_name(handle, input_file)
+                tesseract_raw.recognize(handle)
+
+                tesseract_raw.add_renderer_image(handle, renderer)
+            tesseract_raw.end_document(renderer)
+        finally:
+            tesseract_raw.cleanup(handle)
+            if renderer:
+                tesseract_raw.cleanup(renderer)
 
 
 def is_available():
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' 
old/pyocr-0.7.2-f4b068cdf359186bfbed36959c53e9e52e2eda84/src/pyocr/libtesseract/tesseract_raw.py
 
new/pyocr-0.8-f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc/src/pyocr/libtesseract/tesseract_raw.py
--- 
old/pyocr-0.7.2-f4b068cdf359186bfbed36959c53e9e52e2eda84/src/pyocr/libtesseract/tesseract_raw.py
    2019-06-22 20:10:54.000000000 +0200
+++ 
new/pyocr-0.8-f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc/src/pyocr/libtesseract/tesseract_raw.py
      2021-01-01 16:59:54.000000000 +0100
@@ -28,7 +28,7 @@
             )
         )
     else:
-        TESSDATA_PREFIX = tessdata
+        TESSDATA_PREFIX = os.path.join(tessdata, "tessdata")
 
 
 if sys.platform[:3] == "win":  # pragma: no cover
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' 
old/pyocr-0.7.2-f4b068cdf359186bfbed36959c53e9e52e2eda84/src/pyocr/tesseract.py 
new/pyocr-0.8-f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc/src/pyocr/tesseract.py
--- 
old/pyocr-0.7.2-f4b068cdf359186bfbed36959c53e9e52e2eda84/src/pyocr/tesseract.py 
    2019-06-22 20:10:54.000000000 +0200
+++ 
new/pyocr-0.8-f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc/src/pyocr/tesseract.py   
    2021-01-01 16:59:54.000000000 +0100
@@ -119,8 +119,7 @@
 
     if getattr(sys, 'frozen', False):  # pragma: no cover
         # Pyinstaller support
-        path = os.environ["PATH"]
-        if sys._MEIPASS in path:
+        if 'TESSDATA_PREFIX' in os.environ:
             # already changed
             return
 
@@ -128,15 +127,6 @@
         tessprefix = os.path.join(sys._MEIPASS, "data")
         logger.info("Running in packaged environment")
 
-        if not os.path.exists(os.path.join(tessprefix, "tessdata")):
-            logger.warning(
-                "Running from container, but no tessdata ({}) found !".format(
-                    tessprefix
-                )
-            )
-        else:
-            logger.info("TESSDATA_PREFIX set to [{}]".format(tessprefix))
-            os.environ['TESSDATA_PREFIX'] = tessprefix
         if not os.path.exists(tesspath):
             logger.warning(
                 "Running from container, but no tesseract ({}) found !".format(
@@ -149,6 +139,19 @@
                 tesspath + os.pathsep + os.environ['PATH']
             )
 
+        if not os.path.exists(os.path.join(tessprefix, "tessdata")):
+            logger.warning(
+                "Running from container, but no tessdata ({}) found !".format(
+                    tessprefix
+                )
+            )
+        else:
+            version = get_version(set_env=False)
+            if version[0] > 3:
+                tessprefix = os.path.join(tessprefix, "tessdata")
+            logger.info("TESSDATA_PREFIX set to [{}]".format(tessprefix))
+            os.environ['TESSDATA_PREFIX'] = tessprefix
+
 
 def can_detect_orientation():
     version = get_version()
@@ -413,7 +416,7 @@
     return [lang for lang in langs if lang and lang[-1] != ':']
 
 
-def get_version():
+def get_version(set_env=True):
     """
     Returns Tesseract version.
 
@@ -428,15 +431,15 @@
     if g_version is not None:
         return g_version
 
-    _set_environment()
+    if set_env:
+        _set_environment()
 
     command = [TESSERACT_CMD, "-v"]
 
     proc = subprocess.Popen(command,
                             startupinfo=g_subprocess_startup_info,
                             creationflags=g_creation_flags,
-                            stdout=subprocess.PIPE,
-                            stderr=subprocess.STDOUT)
+                            stdout=subprocess.PIPE)
     ver_string = proc.stdout.read()
     ver_string = ver_string.decode('utf-8')
     ret = proc.wait()
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' 
old/pyocr-0.7.2-f4b068cdf359186bfbed36959c53e9e52e2eda84/tests/tests_libtesseract.py
 
new/pyocr-0.8-f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc/tests/tests_libtesseract.py
--- 
old/pyocr-0.7.2-f4b068cdf359186bfbed36959c53e9e52e2eda84/tests/tests_libtesseract.py
        2019-06-22 20:10:54.000000000 +0200
+++ 
new/pyocr-0.8-f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc/tests/tests_libtesseract.py
  2021-01-01 16:59:54.000000000 +0100
@@ -1379,11 +1379,11 @@
 
     def setUp(self):
         self.image = Image.new(mode="RGB", size=(1, 1))
-        self.handle = randint(0, 2**32-1)
+        self.handle = 1234567
 
     @patch("pyocr.libtesseract.tesseract_raw")
     def test_pdf(self, raw):
-        renderer = randint(0, 2**32-1)
+        renderer = 2345671
         raw.init.return_value = self.handle
         raw.init_pdf_renderer.return_value = renderer
         libtesseract.image_to_pdf(self.image, "output")
@@ -1393,7 +1393,6 @@
         raw.set_page_seg_mode.assert_called_once_with(
             self.handle, raw.PageSegMode.AUTO_OSD
         )
-        raw.set_input_name.assert_called_once_with(self.handle, "stdin")
         raw.recognize.assert_called_once_with(self.handle)
         raw.init_pdf_renderer.assert_called_once_with(
             self.handle, "output", False
@@ -1408,6 +1407,41 @@
         )
 
     @patch("pyocr.libtesseract.tesseract_raw")
+    def test_multipage_pdf(self, raw):
+        renderer = 2345671
+        raw.init.return_value = self.handle
+        raw.init_pdf_renderer.return_value = renderer
+        libtesseract.LibtesseractPdfBuilder() \
+            .set_output_file("output")\
+            .add_image(self.image)\
+            .add_image(self.image)\
+            .build()
+
+        raw.init.assert_called_once_with(lang=None)
+        raw.set_image.assert_called_with(self.handle, self.image)
+        raw.set_image.assert_called_with(self.handle, self.image)
+        raw.set_page_seg_mode.assert_called_once_with(
+            self.handle, raw.PageSegMode.AUTO_OSD
+        )
+        raw.recognize.assert_called_with(self.handle)
+        raw.recognize.assert_called_with(self.handle)
+        raw.init_pdf_renderer.assert_called_once_with(
+            self.handle, "output", False
+        )
+        raw.begin_document.assert_called_once_with(renderer, "")
+        raw.add_renderer_image.assert_called_with(
+            self.handle, renderer
+        )
+        raw.add_renderer_image.assert_called_with(
+            self.handle, renderer
+        )
+        raw.end_document.assert_called_once_with(renderer)
+        self.assertListEqual(
+            raw.cleanup.call_args_list,
+            [call(self.handle), call(renderer)]
+        )
+
+    @patch("pyocr.libtesseract.tesseract_raw")
     def test_pdf_renderer_error(self, raw):
         renderer = None
         raw.init.return_value = self.handle
@@ -1417,15 +1451,15 @@
             libtesseract.image_to_pdf(self.image, "output")
 
         raw.init.assert_called_once_with(lang=None)
-        raw.set_image.assert_called_once_with(self.handle, self.image)
         raw.set_page_seg_mode.assert_called_once_with(
             self.handle, raw.PageSegMode.AUTO_OSD
         )
-        raw.set_input_name.assert_called_once_with(self.handle, "stdin")
-        raw.recognize.assert_called_once_with(self.handle)
         raw.init_pdf_renderer.assert_called_once_with(
             self.handle, "output", False
         )
+        self.assertFalse(raw.set_image.called)
+        self.assertFalse(raw.set_input_name.called)
+        self.assertFalse(raw.recognize.called)
         self.assertFalse(raw.begin_document.called)
         self.assertFalse(raw.add_renderer_image.called)
         self.assertFalse(raw.end_document.called)
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' 
old/pyocr-0.7.2-f4b068cdf359186bfbed36959c53e9e52e2eda84/tests/tests_tesseract.py
 new/pyocr-0.8-f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc/tests/tests_tesseract.py
--- 
old/pyocr-0.7.2-f4b068cdf359186bfbed36959c53e9e52e2eda84/tests/tests_tesseract.py
   2019-06-22 20:10:54.000000000 +0200
+++ 
new/pyocr-0.8-f9f93bbd83ab5814b1c56c3ffe41addc9ec622dc/tests/tests_tesseract.py 
    2021-01-01 16:59:54.000000000 +0100
@@ -54,6 +54,12 @@
         popen.return_value = self.stdout
         self.assertSequenceEqual(tesseract.get_version(), (4, 0, 0))
 
+        # stderr must be explicitely ignored when calling 'tesseract -v'.
+        # See https://gitlab.gnome.org/World/OpenPaperwork/pyocr/-/issues/118
+        popen.assert_called_once()
+        (args, kwargs) = popen.call_args
+        self.assertNotIn('stderr', kwargs)
+
     @patch("subprocess.Popen")
     def test_version_tesseract4dev(self, popen):
         tesseract.g_version = None  # drop cached version

Reply via email to