Script 'mail_helper' called by obssrc Hello community, here is the log from the commit of package python-tesserocr for openSUSE:Factory checked in at 2021-06-24 18:22:30 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Comparing /work/SRC/openSUSE:Factory/python-tesserocr (Old) and /work/SRC/openSUSE:Factory/.python-tesserocr.new.2625 (New) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "python-tesserocr" Thu Jun 24 18:22:30 2021 rev:11 rq:901578 version:2.5.2 Changes: -------- --- /work/SRC/openSUSE:Factory/python-tesserocr/python-tesserocr.changes 2020-03-26 23:35:05.210814312 +0100 +++ /work/SRC/openSUSE:Factory/.python-tesserocr.new.2625/python-tesserocr.changes 2021-06-24 18:22:47.696930910 +0200 @@ -1,0 +2,9 @@ +Wed Jun 23 17:43:23 UTC 2021 - Mia Herkt <m...@0x0.st> + +- Update to 2.5.2 + * Support new Tesseract 5 API (gh#sirfz/tesserocr#242) + * GetBestLSTMSymbolChoices crash fix (gh#sirfz/tesserocr#241) + * Fallback to BMP instead of PNG + * Create pix from a BMP image bytes (gh#sirfz/tesserocr#156) + +------------------------------------------------------------------- Old: ---- tesserocr-2.5.1.tar.gz New: ---- tesserocr-2.5.2.tar.gz ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ python-tesserocr.spec ++++++ --- /var/tmp/diff_new_pack.LSisZN/_old 2021-06-24 18:22:48.228931500 +0200 +++ /var/tmp/diff_new_pack.LSisZN/_new 2021-06-24 18:22:48.232931505 +0200 @@ -1,7 +1,7 @@ # # spec file for package python-tesserocr # -# Copyright (c) 2020 SUSE LLC +# Copyright (c) 2021 SUSE LLC # # All modifications and additions to the file contributed by third parties # remain the property of their copyright owners, unless otherwise agreed @@ -18,7 +18,7 @@ %{?!python_module:%define python_module() python-%{**} python3-%{**}} Name: python-tesserocr -Version: 2.5.1 +Version: 2.5.2 Release: 0 Summary: A Python wrapper around tesseract-ocr License: MIT ++++++ tesserocr-2.5.1.tar.gz -> tesserocr-2.5.2.tar.gz ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tesserocr-2.5.1/MANIFEST.in new/tesserocr-2.5.2/MANIFEST.in --- old/tesserocr-2.5.1/MANIFEST.in 2019-11-08 23:49:38.000000000 +0100 +++ new/tesserocr-2.5.2/MANIFEST.in 2021-06-19 22:02:07.000000000 +0200 @@ -1,5 +1,5 @@ include README.rst include LICENSE include *.pyx *.pxd -include tests/*.py tests/*.tif +include tests/*.py tests/*.png exclude *.cpp *.so diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tesserocr-2.5.1/PKG-INFO new/tesserocr-2.5.2/PKG-INFO --- old/tesserocr-2.5.1/PKG-INFO 2020-03-17 18:41:39.000000000 +0100 +++ new/tesserocr-2.5.2/PKG-INFO 2021-06-19 23:08:30.000000000 +0200 @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: tesserocr -Version: 2.5.1 +Version: 2.5.2 Summary: A simple, Pillow-friendly, Python wrapper around tesseract-ocr API using Cython Home-page: https://github.com/sirfz/tesserocr Author: Fayez Zouheiry @@ -108,6 +108,29 @@ > pip install <package_name>.whl + Build from source + ````````````````` + + If you need Windows tessocr package and your Python version is not supported by above mentioned project, + you can try to follow `step by step instructions for Windows 64bit` in `Windows.build.md`_. + + .. _Windows.build.md: Windows.build.md + + tessdata + ======== + + You may need to point to the tessdata path if it cannot be detected automatically. This can be done by setting the ``TESSDATA_PREFIX`` environment variable or by passing the path to ``PyTessBaseAPI`` (e.g.: ``PyTessBaseAPI(path='/usr/share/tessdata')``). The path should contain ``.traineddata`` files which can be found at https://github.com/tesseract-ocr/tessdata. + + Make sure you have the correct version of traineddata for your ``tesseract --version``. + + You can list the current supported languages on your system using the ``get_languages`` function: + + .. code:: python + + from tesserocr import get_languages + + print(get_languages('/usr/share/tessdata')) # or any other path that applies to your system + Usage ===== @@ -268,6 +291,8 @@ Classifier: Programming Language :: Python :: 3.5 Classifier: Programming Language :: Python :: 3.6 Classifier: Programming Language :: Python :: 3.7 +Classifier: Programming Language :: Python :: 3.8 +Classifier: Programming Language :: Python :: 3.9 Classifier: Programming Language :: Python :: Implementation :: CPython Classifier: Programming Language :: Python :: Implementation :: PyPy Classifier: Programming Language :: Cython diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tesserocr-2.5.1/README.rst new/tesserocr-2.5.2/README.rst --- old/tesserocr-2.5.1/README.rst 2019-11-09 00:11:39.000000000 +0100 +++ new/tesserocr-2.5.2/README.rst 2021-06-19 23:05:59.000000000 +0200 @@ -100,6 +100,29 @@ > pip install <package_name>.whl +Build from source +````````````````` + +If you need Windows tessocr package and your Python version is not supported by above mentioned project, +you can try to follow `step by step instructions for Windows 64bit` in `Windows.build.md`_. + +.. _Windows.build.md: Windows.build.md + +tessdata +======== + +You may need to point to the tessdata path if it cannot be detected automatically. This can be done by setting the ``TESSDATA_PREFIX`` environment variable or by passing the path to ``PyTessBaseAPI`` (e.g.: ``PyTessBaseAPI(path='/usr/share/tessdata')``). The path should contain ``.traineddata`` files which can be found at https://github.com/tesseract-ocr/tessdata. + +Make sure you have the correct version of traineddata for your ``tesseract --version``. + +You can list the current supported languages on your system using the ``get_languages`` function: + +.. code:: python + + from tesserocr import get_languages + + print(get_languages('/usr/share/tessdata')) # or any other path that applies to your system + Usage ===== diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tesserocr-2.5.1/setup.py new/tesserocr-2.5.2/setup.py --- old/tesserocr-2.5.1/setup.py 2020-03-17 18:39:50.000000000 +0100 +++ new/tesserocr-2.5.2/setup.py 2021-06-19 22:02:07.000000000 +0200 @@ -1,17 +1,20 @@ +import codecs +import errno +import glob +import itertools import logging import os -import sys -import codecs import re import subprocess -import errno -from os.path import dirname, abspath -from os.path import split as psplit, join as pjoin +import sys +from os.path import abspath, dirname +from os.path import join as pjoin +from os.path import split as psplit + from setuptools import setup from setuptools.command.build_ext import build_ext from setuptools.extension import Extension - _LOGGER = logging.getLogger() if os.environ.get('DEBUG'): _LOGGER.setLevel(logging.DEBUG) @@ -25,6 +28,11 @@ # find_version from pip https://github.com/pypa/pip/blob/1.5.6/setup.py#L33 here = abspath(dirname(__file__)) +EXTRA_COMPILE_ARGS = { + 'msvc': ['/std:c11', '-DUSE_STD_NAMESPACE'], + 'gcc': ['-std=c++11', '-DUSE_STD_NAMESPACE'], +} + def read(*parts): return codecs.open(pjoin(here, *parts), 'r').read() @@ -32,11 +40,10 @@ def find_version(*file_paths): version_file = read(*file_paths) - version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", - version_file, re.M) + version_match = re.search('^__version__ = [\'"]([^\'"]*)[\'"]', version_file, re.M) if version_match: return version_match.group(1) - raise RuntimeError("Unable to find version string.") + raise RuntimeError('Unable to find version string.') if sys.version_info >= (3, 0): @@ -47,97 +54,186 @@ return s +def major_version(version): + versions = version.split('.') + major = int(versions[0]) + _LOGGER.info('Tesseract major version %s', major) + return major + + def version_to_int(version): subversion = None subtrahend = 0 - # Subtracts a certain amount from the version number to differentiate between - # alpha, beta and release versions. - if "alpha" in version: - version_split = version.split("alpha") + # Subtracts a certain amount from the version number to differentiate + # between alpha, beta and release versions. + if 'alpha' in version: + version_split = version.split('alpha') subversion = version_split[1] subtrahend = 2 - elif "beta" in version: - version_split = version.split("beta") + elif 'beta' in version: + version_split = version.split('beta') subversion = version_split[1] subtrahend = 1 + version = re.search(r'((?:\d+\.)+\d+)', version).group() - # Split the groups on ".", take only the first one, and print each group with leading 0 if needed - # To be safe, also handle cases where an extra group is added to the version string, or if one or two groups - # are dropped. + # Split the groups on ".", take only the first one, and print each + # group with leading 0 if needed. To be safe, also handle cases where + # an extra group is added to the version string, or if one or two + # groups are dropped. version_groups = (version.split('.') + [0, 0])[:3] - version_str = "{:02}{:02}{:02}".format(*map(int, version_groups)) - version_str = str((int(version_str, 10)-subtrahend)) + version_str = '{:02}{:02}{:02}'.format(*map(int, version_groups)) + version_str = str((int(version_str, 10) - subtrahend)) # Adds a 2 digit subversion number for the subversionrelease. - subversion_str="00" - if subversion is not None and subversion is not "": + subversion_str = '00' + if subversion is not None and subversion != '': subversion = re.search(r'(?:\d+)', subversion).group() subversion_groups = (subversion.split('-') + [0, 0])[:1] - subversion_str = "{:02}".format(*map(int, subversion_groups)) - version_str+=subversion_str + subversion_str = '{:02}'.format(*map(int, subversion_groups)) + + version_str += subversion_str return int(version_str, 16) def package_config(): """Use pkg-config to get library build parameters and tesseract version.""" - p = subprocess.Popen(['pkg-config', '--exists', '--atleast-version={}'.format(_TESSERACT_MIN_VERSION), - '--print-errors', 'tesseract'], - stderr=subprocess.PIPE) + p = subprocess.Popen( + [ + 'pkg-config', + '--exists', + '--atleast-version={}'.format(_TESSERACT_MIN_VERSION), + '--print-errors', + 'tesseract', + ], + stderr=subprocess.PIPE, + ) _, error = p.communicate() if p.returncode != 0: + if isinstance(error, bytes): + error = error.decode() + raise Exception(error) - p = subprocess.Popen(['pkg-config', '--libs', '--cflags', 'tesseract'], stdout=subprocess.PIPE) + + p = subprocess.Popen( + ['pkg-config', '--libs', '--cflags', 'tesseract'], stdout=subprocess.PIPE + ) output, _ = p.communicate() flags = _read_string(output).strip().split() - p = subprocess.Popen(['pkg-config', '--libs', '--cflags', 'lept'], stdout=subprocess.PIPE) + p = subprocess.Popen( + ['pkg-config', '--libs', '--cflags', 'lept'], stdout=subprocess.PIPE + ) output, _ = p.communicate() flags2 = _read_string(output).strip().split() - options = {'-L': 'library_dirs', - '-I': 'include_dirs', - '-l': 'libraries'} - config = {'library_dirs': [], - 'include_dirs': [], - 'libraries': []} - import itertools + options = {'-L': 'library_dirs', '-I': 'include_dirs', '-l': 'libraries'} + config = {'library_dirs': [], 'include_dirs': [], 'libraries': []} + for f in itertools.chain(flags, flags2): try: opt = options[f[:2]] except KeyError: continue val = f[2:] - if opt == 'include_dirs' and psplit(val)[1].strip(os.sep) in ('leptonica', 'tesseract'): + if opt == 'include_dirs' and psplit(val)[1].strip(os.sep) in ( + 'leptonica', + 'tesseract', + ): val = dirname(val) config[opt] += [val] - p = subprocess.Popen(['pkg-config', '--modversion', 'tesseract'], stdout=subprocess.PIPE) + + p = subprocess.Popen( + ['pkg-config', '--modversion', 'tesseract'], stdout=subprocess.PIPE + ) version, _ = p.communicate() version = _read_string(version).strip() - _LOGGER.info("Supporting tesseract v{}".format(version)) - config['cython_compile_time_env'] = {'TESSERACT_VERSION': version_to_int(version)} - _LOGGER.info("Configs from pkg-config: {}".format(config)) + _LOGGER.info('Supporting tesseract v%s', version) + config['compile_time_env'] = { + 'TESSERACT_MAJOR_VERSION': major_version(version), + 'TESSERACT_VERSION': version_to_int(version) + } + _LOGGER.info('Configs from pkg-config: %s', config) return config +def find_library(pattern, path_list, version=''): + """Help routine to find library.""" + result = [] + for path in path_list: + filepattern = os.path.join(path, pattern) + result += glob.glob(filepattern) + # ignore debug library + result = [i for i in result if not i.endswith('d.lib')] + if version: + result = [i for i in result if version in i] + return result + + def get_tesseract_version(): """Try to extract version from tesseract otherwise default min version.""" config = {'libraries': ['tesseract', 'lept']} try: - p = subprocess.Popen(['tesseract', '-v'], stderr=subprocess.PIPE, stdout=subprocess.PIPE) + p = subprocess.Popen( + ['tesseract', '-v'], stderr=subprocess.PIPE, stdout=subprocess.PIPE + ) stdout_version, version = p.communicate() version = _read_string(version).strip() if version == '': version = _read_string(stdout_version).strip() + version_match = re.search(r'^tesseract ((?:\d+\.)+\d+).*', version, re.M) if version_match: version = version_match.group(1) else: - _LOGGER.warn('Failed to extract tesseract version number from: {}'.format(version)) + _LOGGER.warning( + 'Failed to extract tesseract version number from: %s', version + ) version = _TESSERACT_MIN_VERSION except OSError as e: - _LOGGER.warn('Failed to extract tesseract version from executable: {}'.format(e)) + _LOGGER.warning('Failed to extract tesseract version from executable: %s', e) version = _TESSERACT_MIN_VERSION - _LOGGER.info("Supporting tesseract v{}".format(version)) - version = version_to_int(version) - config['cython_compile_time_env'] = {'TESSERACT_VERSION': version} - _LOGGER.info("Building with configs: {}".format(config)) + + _LOGGER.info('Supporting tesseract v%s', version) + config['compile_time_env'] = { + 'TESSERACT_MAJOR_VERSION': major_version(version), + 'TESSERACT_VERSION': version_to_int(version) + } + if sys.platform == 'win32': + libpaths = os.getenv('LIBPATH', None) + if libpaths: + libpaths = list(filter(None, libpaths.split(';'))) + else: + libpaths = [] + + if version: + lib_version = ''.join(version.split('.')[:2]) + else: + lib_version = None + + tess_lib = find_library('tesseract*.lib', libpaths, lib_version) + if len(tess_lib) >= 1: + base = os.path.basename(sorted(tess_lib, reverse=True)[0]) + tess_lib = os.path.splitext(base)[0] + else: + error = 'Tesseract library not found in LIBPATH: {}'.format(libpaths) + raise RuntimeError(error) + + lept_lib = find_library('lept*.lib', libpaths) + if len(lept_lib) >= 1: + base = os.path.basename(sorted(lept_lib, reverse=True)[0]) + lept_lib = os.path.splitext(base)[0] + else: + error = 'Leptonica library not found in LIBPATH: {}'.format(libpaths) + raise RuntimeError(error) + + includepaths = os.getenv('INCLUDE', None) + if includepaths: + includepaths = list(filter(None, includepaths.split(';'))) + else: + includepaths = [] + + config['libraries'] = [tess_lib, lept_lib] + config['library_dirs'] = libpaths + config['include_dirs'] = includepaths + + _LOGGER.info('Building with configs: %s', config) return config @@ -148,64 +244,87 @@ except Exception as e: if isinstance(e, OSError): if e.errno != errno.ENOENT: - _LOGGER.warn('Failed to run pkg-config: {}'.format(e)) + _LOGGER.warning('Failed to run pkg-config: %s', e) else: - _LOGGER.warn('pkg-config failed to find tesseract/lept libraries: {}'.format(e)) + _LOGGER.warning( + 'pkg-config failed to find tesseract/leptonica libraries: %s', e + ) build_args = get_tesseract_version() - if build_args['cython_compile_time_env']['TESSERACT_VERSION'] >= 0x3050200: - _LOGGER.debug('tesseract >= 03.05.02 requires c++11 compiler support') - build_args['extra_compile_args'] = ['-std=c++11', '-DUSE_STD_NAMESPACE'] - - _LOGGER.debug('build parameters: {}'.format(build_args)) + _LOGGER.debug('build parameters: %s', build_args) return build_args def make_extension(): global _CYTHON_COMPILE_TIME_ENV build_args = get_build_args() - _CYTHON_COMPILE_TIME_ENV = build_args.pop('cython_compile_time_env') - return Extension("tesserocr", sources=["tesserocr.pyx"], language="c++", **build_args) + _CYTHON_COMPILE_TIME_ENV = build_args.pop('compile_time_env') + return Extension( + 'tesserocr', sources=['tesserocr.pyx'], language='c++', **build_args + ) class my_build_ext(build_ext, object): + def build_extensions(self): + compiler = self.compiler.compiler_type + _LOGGER.info('Detected compiler: %s', compiler) + extra_args = EXTRA_COMPILE_ARGS.get(compiler, EXTRA_COMPILE_ARGS['gcc']) + if isinstance(_CYTHON_COMPILE_TIME_ENV, dict): + version = _CYTHON_COMPILE_TIME_ENV.get('TESSERACT_VERSION', 0) + else: + version = 0 + + for extension in self.extensions: + if version >= 0x3050200: + _LOGGER.debug('tesseract >= 03.05.02 requires c++11 compiler support') + extension.extra_compile_args = extra_args + + build_ext.build_extensions(self) + def finalize_options(self): from Cython.Build.Dependencies import cythonize + self.distribution.ext_modules[:] = cythonize( - self.distribution.ext_modules, compile_time_env=_CYTHON_COMPILE_TIME_ENV) + self.distribution.ext_modules, compile_time_env=_CYTHON_COMPILE_TIME_ENV + ) super(my_build_ext, self).finalize_options() -setup(name='tesserocr', - version=find_version('tesserocr.pyx'), - description='A simple, Pillow-friendly, Python wrapper around tesseract-ocr API using Cython', - long_description=read('README.rst'), - long_description_content_type='text/x-rst', - url='https://github.com/sirfz/tesserocr', - author='Fayez Zouheiry', - author_email='iamfa...@gmail.com', - license='MIT', - classifiers=[ - 'Development Status :: 5 - Production/Stable', - 'Intended Audience :: Developers', - 'Topic :: Multimedia :: Graphics :: Capture :: Scanners', - 'Topic :: Multimedia :: Graphics :: Graphics Conversion', - 'Topic :: Scientific/Engineering :: Image Recognition', - 'License :: OSI Approved :: MIT License', - 'Operating System :: POSIX', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.4', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: Implementation :: CPython', - 'Programming Language :: Python :: Implementation :: PyPy', - 'Programming Language :: Cython' - ], - keywords='Tesseract,tesseract-ocr,OCR,optical character recognition,PIL,Pillow,Cython', - cmdclass={'build_ext': my_build_ext}, - ext_modules=[make_extension()], - test_suite='tests', - setup_requires=['Cython>=0.23'], - ) +setup( + name='tesserocr', + version=find_version('tesserocr.pyx'), + description='A simple, Pillow-friendly, Python wrapper around ' + 'tesseract-ocr API using Cython', + long_description=read('README.rst'), + long_description_content_type='text/x-rst', + url='https://github.com/sirfz/tesserocr', + author='Fayez Zouheiry', + author_email='iamfa...@gmail.com', + license='MIT', + classifiers=[ + 'Development Status :: 5 - Production/Stable', + 'Intended Audience :: Developers', + 'Topic :: Multimedia :: Graphics :: Capture :: Scanners', + 'Topic :: Multimedia :: Graphics :: Graphics Conversion', + 'Topic :: Scientific/Engineering :: Image Recognition', + 'License :: OSI Approved :: MIT License', + 'Operating System :: POSIX', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: Implementation :: CPython', + 'Programming Language :: Python :: Implementation :: PyPy', + 'Programming Language :: Cython', + ], + keywords='Tesseract,tesseract-ocr,OCR,optical character recognition,' + 'PIL,Pillow,Cython', + cmdclass={'build_ext': my_build_ext}, + ext_modules=[make_extension()], + test_suite='tests', + setup_requires=['Cython>=0.23'], +) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tesserocr-2.5.1/tesseract.pxd new/tesserocr-2.5.2/tesseract.pxd --- old/tesserocr-2.5.1/tesseract.pxd 2019-11-08 23:49:38.000000000 +0100 +++ new/tesserocr-2.5.2/tesseract.pxd 2021-06-19 22:09:33.000000000 +0200 @@ -33,6 +33,7 @@ char *getLeptonicaVersion() Pix *pixRead(cchar_t *) Pix *pixReadMem(cuchar_t *, size_t) + Pix *pixReadMemBmp(cuchar_t *, size_t) int pixWriteMemJpeg(unsigned char **, size_t *, Pix *, int, int) int pixWriteMem(unsigned char **, size_t *, Pix *, int) void pixDestroy(Pix **) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tesserocr-2.5.1/tesseract5.pxd new/tesserocr-2.5.2/tesseract5.pxd --- old/tesserocr-2.5.1/tesseract5.pxd 1970-01-01 01:00:00.000000000 +0100 +++ new/tesserocr-2.5.2/tesseract5.pxd 2021-06-19 22:53:48.000000000 +0200 @@ -0,0 +1,314 @@ +from libcpp cimport bool +from libcpp.pair cimport pair +from libcpp.string cimport string +from libcpp.vector cimport vector +ctypedef const char cchar_t +ctypedef const char * cchar_tp +ctypedef const unsigned char cuchar_t + +cdef extern from "leptonica/allheaders.h" nogil: + struct Pix: + int informat + + struct Box: + int x + int y + int w + int h + + struct Boxa: + int n # number of box in ptr array + Box **box # box ptr array + + struct Pixa: + int n # number of Pix in ptr array + Pix **pix # the array of ptrs to pix + Boxa *boxa # array of boxes + + struct Pta: + int n # actual number of pts + float *x + float *y # arrays of floats + + char *getImagelibVersions() + char *getLeptonicaVersion() + Pix *pixRead(cchar_t *) + Pix *pixReadMem(cuchar_t *, size_t) + Pix *pixReadMemBmp(cuchar_t *, size_t) + int pixWriteMemJpeg(unsigned char **, size_t *, Pix *, int, int) + int pixWriteMem(unsigned char **, size_t *, Pix *, int) + void pixDestroy(Pix **) + void ptaDestroy(Pta **) + int setMsgSeverity(int) + void pixaDestroy(Pixa **) + void boxaDestroy(Boxa **) + + cdef enum: + L_SEVERITY_EXTERNAL = 0 # Get the severity from the environment + L_SEVERITY_ALL = 1 # Lowest severity: print all messages + L_SEVERITY_DEBUG = 2 # Print debugging and higher messages + L_SEVERITY_INFO = 3 # Print informational and higher messages + L_SEVERITY_WARNING = 4 # Print warning and higher messages + L_SEVERITY_ERROR = 5 # Print error and higher messages + L_SEVERITY_NONE = 6 # Highest severity: print no messages + +cdef extern from "tesseract/publictypes.h" namespace "tesseract" nogil: + cdef enum PolyBlockType: + PT_UNKNOWN # Type is not yet known. Keep as the first element. + PT_FLOWING_TEXT # Text that lives inside a column. + PT_HEADING_TEXT # Text that spans more than one column. + PT_PULLOUT_TEXT # Text that is in a cross-column pull-out region. + PT_EQUATION # Partition belonging to an equation region. + PT_INLINE_EQUATION # Partition has inline equation. + PT_TABLE # Partition belonging to a table region. + PT_VERTICAL_TEXT # Text-line runs vertically. + PT_CAPTION_TEXT # Text that belongs to an image. + PT_FLOWING_IMAGE # Image that lives inside a column. + PT_HEADING_IMAGE # Image that spans more than one column. + PT_PULLOUT_IMAGE # Image that is in a cross-column pull-out region. + PT_HORZ_LINE # Horizontal Line. + PT_VERT_LINE # Vertical Line. + PT_NOISE # Lies outside of any column. + PT_COUNT + +cdef extern from "tesseract/publictypes.h" namespace "tesseract" nogil: + + cdef enum TessOrientation "tesseract::Orientation": + ORIENTATION_PAGE_UP + ORIENTATION_PAGE_RIGHT + ORIENTATION_PAGE_DOWN + ORIENTATION_PAGE_LEFT + + cdef enum TessWritingDirection "tesseract::WritingDirection": + WRITING_DIRECTION_LEFT_TO_RIGHT + WRITING_DIRECTION_RIGHT_TO_LEFT + WRITING_DIRECTION_TOP_TO_BOTTOM + + cdef enum TessTextlineOrder "tesseract::TextlineOrder": + TEXTLINE_ORDER_LEFT_TO_RIGHT + TEXTLINE_ORDER_RIGHT_TO_LEFT + TEXTLINE_ORDER_TOP_TO_BOTTOM + + cdef enum TessParagraphJustification "tesseract::ParagraphJustification": + JUSTIFICATION_UNKNOWN + JUSTIFICATION_LEFT + JUSTIFICATION_CENTER + JUSTIFICATION_RIGHT + +cdef extern from "tesseract/unichar.h" namespace "tesseract" nogil: + cdef enum StrongScriptDirection: + DIR_NEUTRAL # Text contains only neutral characters. + DIR_LEFT_TO_RIGHT # Text contains no Right-to-Left characters. + DIR_RIGHT_TO_LEFT # Text contains no Left-to-Right characters. + DIR_MIX # Text contains a mixture of left-to-right + # and right-to-left characters. + +cdef extern from "tesseract/ocrclass.h" namespace "tesseract" nogil: + ctypedef bool (*CANCEL_FUNC)(void *, int) + cdef cppclass ETEXT_DESC: + ETEXT_DESC() except + + CANCEL_FUNC cancel # returns true to cancel + void *cancel_this # this or other data for cancel + void set_deadline_msecs(int) + +cdef extern from "tesseract/pageiterator.h" namespace "tesseract" nogil: + cdef cppclass PageIterator: + void Begin() + void RestartParagraph() + bool IsWithinFirstTextlineOfParagraph() const + void RestartRow() + bool Next(PageIteratorLevel) + bool IsAtBeginningOf(PageIteratorLevel) const + bool IsAtFinalElement(PageIteratorLevel, PageIteratorLevel) const + void SetBoundingBoxComponents(bool, bool) + bool BoundingBox(PageIteratorLevel, const int, int *, int *, int *, int *) const + bool BoundingBoxInternal(PageIteratorLevel, int *, int *, int *, int *) const + bool Empty(PageIteratorLevel) const + PolyBlockType BlockType() const + Pta *BlockPolygon() const + Pix *GetBinaryImage(PageIteratorLevel) const + Pix *GetImage(PageIteratorLevel, int, Pix *, int *, int *) const + bool Baseline(PageIteratorLevel, int *, int *, int *, int *) const + void Orientation(TessOrientation *, TessWritingDirection *, TessTextlineOrder *, float *) const + void ParagraphInfo(TessParagraphJustification *, bool *, bool *, int *) const + +cdef extern from "tesseract/ltrresultiterator.h" namespace "tesseract" nogil: + cdef cppclass LTRResultIterator(PageIterator): + char *GetUTF8Text(PageIteratorLevel) const + void SetLineSeparator(cchar_t *) + void SetParagraphSeparator(cchar_t *) + float Confidence(PageIteratorLevel) const + void RowAttributes(float *, float *, float *) const + cchar_t *WordFontAttributes(bool *, bool *, bool *, bool *, bool *, bool *, int *, int *) const + cchar_t *WordRecognitionLanguage() const + StrongScriptDirection WordDirection() const + bool WordIsFromDictionary() const + int BlanksBeforeWord() const + bool WordIsNumeric() const + bool HasBlamerInfo() const + cchar_t *GetBlamerDebug() const + cchar_t *GetBlamerMisadaptionDebug() const + bool HasTruthString() const + bool EquivalentToTruth(cchar_t *) const + char *WordTruthUTF8Text() const + char *WordNormedUTF8Text() const + cchar_t *WordLattice(int *) const + bool SymbolIsSuperscript() const + bool SymbolIsSubscript() const + bool SymbolIsDropcap() const + + cdef cppclass ChoiceIterator: + ChoiceIterator(const LTRResultIterator &) except + + bool Next() + cchar_t *GetUTF8Text() const + float Confidence() const + +cdef extern from "tesseract/resultiterator.h" namespace "tesseract" nogil: + cdef cppclass ResultIterator(LTRResultIterator): + bool ParagraphIsLtr() const + vector[vector[pair[cchar_tp, float]]] *GetBestLSTMSymbolChoices() const + +cdef extern from "tesseract/renderer.h" namespace "tesseract" nogil: + cdef cppclass TessResultRenderer: + void insert(TessResultRenderer *) + + cdef cppclass TessTextRenderer(TessResultRenderer): + TessTextRenderer(cchar_t *) except + + + cdef cppclass TessHOcrRenderer(TessResultRenderer): + TessHOcrRenderer(cchar_t *, bool) except + + + cdef cppclass TessPDFRenderer(TessResultRenderer): + TessPDFRenderer(cchar_t *, cchar_t *, bool) except + + + cdef cppclass TessUnlvRenderer(TessResultRenderer): + TessUnlvRenderer(cchar_t *) except + + + cdef cppclass TessBoxTextRenderer(TessResultRenderer): + TessBoxTextRenderer(cchar_t *) except + + + cdef cppclass TessOsdRenderer(TessResultRenderer): + TessOsdRenderer(cchar_t *) except + + +cdef extern from "tesseract/osdetect.h" namespace "tesseract" nogil: + struct OSBestResult: + int orientation_id + int script_id + float sconfidence + float oconfidence + + ctypedef int (*get_best_script)(int) + + struct OSResults: + get_best_script get_best_script + OSBestResult best_result + +cdef extern from "tesseract/baseapi.h" namespace "tesseract" nogil: + + cdef enum OcrEngineMode: + OEM_TESSERACT_ONLY + OEM_LSTM_ONLY + OEM_TESSERACT_LSTM_COMBINED + OEM_DEFAULT + + cdef enum PageSegMode: + PSM_OSD_ONLY, # Orientation and script detection only. + PSM_AUTO_OSD, # Automatic page segmentation with orientation and + # script detection. (OSD) + PSM_AUTO_ONLY, # Automatic page segmentation, but no OSD, or OCR. + PSM_AUTO, # Fully automatic page segmentation, but no OSD. + PSM_SINGLE_COLUMN, # Assume a single column of text of variable sizes. + PSM_SINGLE_BLOCK_VERT_TEXT, # Assume a single uniform block of vertically + # aligned text. + PSM_SINGLE_BLOCK, # Assume a single uniform block of text. (Default.) + PSM_SINGLE_LINE, # Treat the image as a single text line. + PSM_SINGLE_WORD, # Treat the image as a single word. + PSM_CIRCLE_WORD, # Treat the image as a single word in a circle. + PSM_SINGLE_CHAR, # Treat the image as a single character. + PSM_SPARSE_TEXT, # Find as much text as possible in no particular order. + PSM_SPARSE_TEXT_OSD, # Sparse text with orientation and script det. + PSM_RAW_LINE, # Treat the image as a single text line, bypassing + # hacks that are Tesseract-specific. + PSM_COUNT # Number of enum entries. + + cdef enum PageIteratorLevel: + RIL_BLOCK, # of text/image/separator line. + RIL_PARA, # within a block. + RIL_TEXTLINE, # within a paragraph. + RIL_WORD, # within a textline. + RIL_SYMBOL # character within a word. + + cdef cppclass TessBaseAPI: + TessBaseAPI() except + + @staticmethod + cchar_t *Version() + @staticmethod + void ClearPersistentCache() + void SetInputName(cchar_t *) + cchar_t *GetInputName() + void SetInputImage(Pix *) + Pix *GetInputImage() + int GetSourceYResolution() + cchar_t *GetDatapath() + void SetOutputName(cchar_t *) + bool SetVariable(cchar_t *, cchar_t *) + bool SetDebugVariable(cchar_t *, cchar_t *) + bool GetIntVariable(cchar_t *, int *) const + bool GetBoolVariable(cchar_t *, bool *) const + bool GetDoubleVariable(cchar_t *, double *) const + cchar_t *GetStringVariable(cchar_t *) const + bool GetVariableAsString(cchar_t *, string *) + int Init(cchar_t *, cchar_t *, OcrEngineMode mode, + char **, int, + const vector[string] *, + const vector[string] *, + bool) + int Init(cchar_t *, cchar_t *, OcrEngineMode) + int Init(cchar_t *, cchar_t *) + cchar_t *GetInitLanguagesAsString() const + void GetLoadedLanguagesAsVector(vector[string] *) const + void GetAvailableLanguagesAsVector(vector[string] *) const + void InitForAnalysePage() + void ReadConfigFile(cchar_t *) + void SetPageSegMode(PageSegMode) + PageSegMode GetPageSegMode() const + char *TesseractRect(cuchar_t *, int, int, int, int, int, int) + void ClearAdaptiveClassifier() + void SetImage(cuchar_t *, int, int, int, int) + void SetImage(Pix *) + void SetSourceResolution(int) + void SetRectangle(int, int, int, int) + Pix *GetThresholdedImage() + Boxa *GetRegions(Pixa **) + Boxa *GetTextlines(const bool, const int, Pixa **, int **, int **) + Boxa *GetStrips(Pixa **, int **) + Boxa *GetWords(Pixa **) + Boxa *GetConnectedComponents(Pixa **) + Boxa *GetComponentImages(const PageIteratorLevel, + const bool, const bool, + const int, + Pixa **, int **, int **) + int GetThresholdedImageScaleFactor() const + PageIterator *AnalyseLayout(bool) + int Recognize(ETEXT_DESC *) + bool ProcessPages(cchar_t *, cchar_t *, int, TessResultRenderer *) + bool ProcessPage(Pix *, int, cchar_t *, cchar_t *, int, TessResultRenderer *) + ResultIterator *GetIterator() + char *GetUTF8Text() + char *GetHOCRText(int) + char *GetTSVText(int) + char *GetBoxText(int) + char *GetUNLVText() + bool DetectOrientationScript(int *, float *, cchar_t **, float *) + int MeanTextConf() + int *AllWordConfidences() + bool AdaptToWordStr(PageSegMode, cchar_t *) + void Clear() + void End() + int IsValidWord(cchar_t *) + bool IsValidCharacter(cchar_t *) + bool GetTextDirection(int *, float *) + bool DetectOS(OSResults *); + cchar_t *GetUnichar(int) + const OcrEngineMode oem() const + void set_min_orientation_margin(double) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tesserocr-2.5.1/tesserocr.egg-info/PKG-INFO new/tesserocr-2.5.2/tesserocr.egg-info/PKG-INFO --- old/tesserocr-2.5.1/tesserocr.egg-info/PKG-INFO 2020-03-17 18:41:35.000000000 +0100 +++ new/tesserocr-2.5.2/tesserocr.egg-info/PKG-INFO 2021-06-19 23:08:29.000000000 +0200 @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: tesserocr -Version: 2.5.1 +Version: 2.5.2 Summary: A simple, Pillow-friendly, Python wrapper around tesseract-ocr API using Cython Home-page: https://github.com/sirfz/tesserocr Author: Fayez Zouheiry @@ -108,6 +108,29 @@ > pip install <package_name>.whl + Build from source + ````````````````` + + If you need Windows tessocr package and your Python version is not supported by above mentioned project, + you can try to follow `step by step instructions for Windows 64bit` in `Windows.build.md`_. + + .. _Windows.build.md: Windows.build.md + + tessdata + ======== + + You may need to point to the tessdata path if it cannot be detected automatically. This can be done by setting the ``TESSDATA_PREFIX`` environment variable or by passing the path to ``PyTessBaseAPI`` (e.g.: ``PyTessBaseAPI(path='/usr/share/tessdata')``). The path should contain ``.traineddata`` files which can be found at https://github.com/tesseract-ocr/tessdata. + + Make sure you have the correct version of traineddata for your ``tesseract --version``. + + You can list the current supported languages on your system using the ``get_languages`` function: + + .. code:: python + + from tesserocr import get_languages + + print(get_languages('/usr/share/tessdata')) # or any other path that applies to your system + Usage ===== @@ -268,6 +291,8 @@ Classifier: Programming Language :: Python :: 3.5 Classifier: Programming Language :: Python :: 3.6 Classifier: Programming Language :: Python :: 3.7 +Classifier: Programming Language :: Python :: 3.8 +Classifier: Programming Language :: Python :: 3.9 Classifier: Programming Language :: Python :: Implementation :: CPython Classifier: Programming Language :: Python :: Implementation :: PyPy Classifier: Programming Language :: Cython diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tesserocr-2.5.1/tesserocr.egg-info/SOURCES.txt new/tesserocr-2.5.2/tesserocr.egg-info/SOURCES.txt --- old/tesserocr-2.5.1/tesserocr.egg-info/SOURCES.txt 2020-03-17 18:41:38.000000000 +0100 +++ new/tesserocr-2.5.2/tesserocr.egg-info/SOURCES.txt 2021-06-19 23:08:29.000000000 +0200 @@ -3,6 +3,7 @@ README.rst setup.py tesseract.pxd +tesseract5.pxd tesserocr.pyx tesserocr_experiment.pyx tesserocr.egg-info/PKG-INFO @@ -10,5 +11,5 @@ tesserocr.egg-info/dependency_links.txt tesserocr.egg-info/top_level.txt tests/__init__.py -tests/eurotext.tif +tests/eurotext.png tests/test_api.py \ No newline at end of file diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tesserocr-2.5.1/tesserocr.pyx new/tesserocr-2.5.2/tesserocr.pyx --- old/tesserocr-2.5.1/tesserocr.pyx 2020-03-17 18:40:03.000000000 +0100 +++ new/tesserocr-2.5.2/tesserocr.pyx 2021-06-19 22:09:33.000000000 +0200 @@ -1,5 +1,5 @@ #!python -#cython: c_string_type=unicode, c_string_encoding=utf-8 +#cython: c_string_type=unicode, c_string_encoding=utf-8, language_level=3 """Python wrapper around the Tesseract-OCR C++ API This module provides a wrapper class :class:`PyTessBaseAPI` to call @@ -18,7 +18,7 @@ ['eng', 'osd', 'equ']) """ -__version__ = '2.5.1' +__version__ = '2.5.2' import os from io import BytesIO @@ -29,8 +29,14 @@ # PIL.Image won't be supported pass -from tesseract cimport * +IF TESSERACT_MAJOR_VERSION < 5: + from tesseract cimport * +ELSE: + from tesseract5 cimport * from libc.stdlib cimport malloc, free +from libcpp.pair cimport pair +from libcpp.vector cimport vector +from cython.operator cimport preincrement as inc, dereference as deref from cpython.version cimport PY_MAJOR_VERSION @@ -188,7 +194,7 @@ cdef class PT(_Enum): - """An enum the defines avaialbe Poly Block types. + """An enum that defines available Poly Block types. Attributes: UNKNOWN: Type is not yet known. Keep as the first element. @@ -323,7 +329,7 @@ cdef bytes _image_buffer(image): """Return raw bytes of a PIL Image""" with BytesIO() as f: - image.save(f, image.format or 'PNG') + image.save(f, image.format or 'BMP') return f.getvalue() @@ -337,8 +343,8 @@ if fmt > 0: result = pixWriteMem(&buff, &size, pix, fmt) else: - # write as JPEG if format is unknown - result = pixWriteMemJpeg(&buff, &size, pix, 0, 0) + # write as IFF_BMP if format is unknown + result = pixWriteMem(&buff, &size, pix, 1) try: if result == 1: @@ -544,7 +550,7 @@ See comment on coordinate system above. Args: - level (int): Page Iteration Level. See :class:`RIL` for avaialbe levels. + level (int): Page Iteration Level. See :class:`RIL` for available levels. Kwargs: padding (int): The padding argument to :meth:`GetImage` can be used to expand @@ -568,7 +574,7 @@ respect to the original image and is scaled by a factor scale_. Args: - level (int): Page Iteration Level. See :class:`RIL` for avaialbe levels. + level (int): Page Iteration Level. See :class:`RIL` for available levels. Returns: tuple or None if there is no such object at the current position. @@ -1046,13 +1052,31 @@ IF TESSERACT_VERSION >= 0x4000000: def GetBestLSTMSymbolChoices(self): + """Returns the LSTM choices for every LSTM timestep for the current word.""" + cdef: + vector[vector[pair[cchar_tp, float]]] *output = self._riter.GetBestLSTMSymbolChoices() + vector[vector[pair[cchar_tp, float]]].iterator it + vector[pair[cchar_tp, float]].iterator cit + vector[pair[cchar_tp, float]] configpairs + pair[cchar_tp, float] configpair + LSTMSymbolChoices = [] - output = self._riter.GetBestLSTMSymbolChoices()[0] - for tstep in output: + if output == NULL: + return LSTMSymbolChoices + + it = output.begin() + while it != output.end(): timestep = [] - for confpair in tstep: - timestep.append((confpair.first, confpair.second)) + configpairs = deref(it) + cit = configpairs.begin() + while cit != configpairs.end(): + configpair = deref(cit) + timestep.append((configpair.first, configpair.second)) + inc(cit) + LSTMSymbolChoices.append(timestep) + inc(it) + return LSTMSymbolChoices @@ -1191,7 +1215,20 @@ def __dealloc__(self): self._end_api() - cdef int _init_api(self, cchar_t *path, cchar_t *lang, + IF TESSERACT_MAJOR_VERSION >= 5: + cdef int _init_api(self, cchar_t *path, cchar_t *lang, + OcrEngineMode oem, char **configs, int configs_size, + const vector[string] *vars_vec, const vector[string] *vars_vals, + bool set_only_non_debug_params, PageSegMode psm) nogil except -1: + cdef int ret = self._baseapi.Init(path, lang, oem, configs, configs_size, vars_vec, vars_vals, + set_only_non_debug_params) + if ret == -1: + with gil: + raise RuntimeError('Failed to init API, possibly an invalid tessdata path: {}'.format(path)) + self._baseapi.SetPageSegMode(psm) + return ret + ELSE: + cdef int _init_api(self, cchar_t *path, cchar_t *lang, OcrEngineMode oem, char **configs, int configs_size, const GenericVector[STRING] *vars_vec, const GenericVector[STRING] *vars_vals, bool set_only_non_debug_params, PageSegMode psm) nogil except -1: @@ -1323,9 +1360,14 @@ Returns ``None`` if parameter was not found. """ - cdef: - bytes py_name = _b(name) - STRING val + IF TESSERACT_MAJOR_VERSION >= 5: + cdef: + bytes py_name = _b(name) + string val + ELSE: + cdef: + bytes py_name = _b(name) + STRING val if self._baseapi.GetVariableAsString(py_name, &val): return val.c_str() return None @@ -1356,7 +1398,7 @@ applicable language, and there is more chance of hallucinating incorrect words. oem (int): OCR engine mode. Defaults to :attr:`OEM.DEFAULT`. - See :class:`OEM` for all avaialbe options. + See :class:`OEM` for all available options. configs (list): List of config files to load variables from. variables (dict): Extra variables to be set. set_only_non_debug_params (bool): If ``True``, only params that do not contain @@ -1365,7 +1407,20 @@ Raises: :exc:`RuntimeError`: If API initialization fails. """ - cdef: + IF TESSERACT_MAJOR_VERSION >= 5: + cdef: + bytes py_path = _b(path) + bytes py_lang = _b(lang) + cchar_t *cpath = py_path + cchar_t *clang = py_lang + int configs_size = len(configs) + char **configs_ = <char **>malloc(configs_size * sizeof(char *)) + vector[string] vars_vec + vector[string] vars_vals + cchar_t *val + string sval + ELSE: + cdef: bytes py_path = _b(path) bytes py_lang = _b(lang) cchar_t *cpath = py_path @@ -1410,7 +1465,7 @@ lang (str): An ISO 639-3 language string. Defaults to 'eng'. See :meth:`InitFull` for full description of this parameter. oem (int): OCR engine mode. Defaults to :attr:`OEM.DEFAULT`. - See :class:`OEM` for all avaialbe options. + See :class:`OEM` for all available options. Raises: :exc:`RuntimeError`: If API initialization fails. @@ -1439,15 +1494,23 @@ Includes all languages loaded by the last Init, including those loaded as dependencies of other loaded languages. """ - cdef GenericVector[STRING] langs + IF TESSERACT_MAJOR_VERSION >= 5: + cdef vector[string] langs + ELSE: + cdef GenericVector[STRING] langs self._baseapi.GetLoadedLanguagesAsVector(&langs) return [langs[i].c_str() for i in xrange(langs.size())] def GetAvailableLanguages(self): """Return list of available languages in the init data path""" - cdef: - GenericVector[STRING] v - int i + IF TESSERACT_MAJOR_VERSION >= 5: + cdef: + vector[string] v + int i + ELSE: + cdef: + GenericVector[STRING] v + int i langs = [] self._baseapi.GetAvailableLanguagesAsVector(&v) langs = [v[i].c_str() for i in xrange(v.size())] @@ -1568,6 +1631,28 @@ self._destroy_pix() self._baseapi.SetImage(cimagedata, width, height, bytes_per_pixel, bytes_per_line) + def SetImageBytesBmp(self, imagedata): + """Provide an image for Tesseract to recognize. + + Args: + imagedata (:bytes): Raw bytes of a BMP image. + + Raises: + :exc:`RuntimeError`: If for any reason the api failed + to load the given image. + """ + cdef: + bytes py_imagedata = _b(imagedata) + size_t size = len(py_imagedata) + cuchar_t *cimagedata = py_imagedata + with nogil: + self._destroy_pix() + self._pix = pixReadMemBmp(cimagedata, size) + if self._pix == NULL: + with gil: + raise RuntimeError('Error reading image') + self._baseapi.SetImage(self._pix) + def SetImage(self, image): """Provide an image for Tesseract to recognize. @@ -1598,7 +1683,7 @@ self._baseapi.SetImage(self._pix) def SetImageFile(self, filename): - """Set image from file for Tesserac to recognize. + """Set image from file for Tesseract to recognize. Args: filename (str): Image file relative or absolute path. @@ -1615,7 +1700,10 @@ self._pix = pixRead(fname) if self._pix == NULL: with gil: - raise RuntimeError('Error reading image') + # missing leptonica support? Try PIL + image = Image.open(fname) + self.SetImage(image) + self._baseapi.SetImage(self._pix) def SetSourceResolution(self, int ppi): @@ -1633,7 +1721,7 @@ can be recognized with the same image. Args: - left (int): poisition from left + left (int): position from left top (int): position from top width (int): width height (int): height @@ -1951,20 +2039,21 @@ """Methods to retrieve information after :meth:`SetImage`, :meth:`Recognize` or :meth:`TesseractRect`. (:meth:`Recognize` is called implicitly if needed.)""" - cpdef bool RecognizeForChopTest(self, int timeout=0): - """Variant on :meth:`Recognize` used for testing chopper.""" - cdef: - ETEXT_DESC monitor - int res - with nogil: - if timeout > 0: - monitor.cancel = NULL - monitor.cancel_this = NULL - monitor.set_deadline_msecs(timeout) - res = self._baseapi.RecognizeForChopTest(&monitor) - else: - res = self._baseapi.RecognizeForChopTest(NULL) - return res == 0 + IF TESSERACT_MAJOR_VERSION < 5: + cpdef bool RecognizeForChopTest(self, int timeout=0): + """Variant on :meth:`Recognize` used for testing chopper.""" + cdef: + ETEXT_DESC monitor + int res + with nogil: + if timeout > 0: + monitor.cancel = NULL + monitor.cancel_this = NULL + monitor.set_deadline_msecs(timeout) + res = self._baseapi.RecognizeForChopTest(&monitor) + else: + res = self._baseapi.RecognizeForChopTest(NULL) + return res == 0 cdef TessResultRenderer *_get_renderer(self, cchar_t *outputbase): cdef: @@ -2084,7 +2173,7 @@ retry_config=None, int timeout=0): """Turn a single image into symbolic text. - See :meth:`ProcessPages` for desciptions of the keyword arguments + See :meth:`ProcessPages` for descriptions of the keyword arguments and all other details. Args: @@ -2258,7 +2347,6 @@ 'script_conf': script_conf} return None - def MeanTextConf(self): """Return the (average) confidence value between 0 and 100.""" return self._baseapi.MeanTextConf() @@ -2536,11 +2624,18 @@ - path (str): tessdata parent directory path - languages (list): list of available languages as ISO 639-3 strings. """ - cdef: - bytes py_path = _b(path) - TessBaseAPI baseapi - GenericVector[STRING] v - int i + IF TESSERACT_MAJOR_VERSION >= 5: + cdef: + bytes py_path = _b(path) + TessBaseAPI baseapi + vector[string] v + int i + ELSE: + cdef: + bytes py_path = _b(path) + TessBaseAPI baseapi + GenericVector[STRING] v + int i baseapi.Init(py_path, NULL) path = baseapi.GetDatapath() baseapi.GetAvailableLanguagesAsVector(&v) Binary files old/tesserocr-2.5.1/tests/eurotext.png and new/tesserocr-2.5.2/tests/eurotext.png differ Binary files old/tesserocr-2.5.1/tests/eurotext.tif and new/tesserocr-2.5.2/tests/eurotext.tif differ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/tesserocr-2.5.1/tests/test_api.py new/tesserocr-2.5.2/tests/test_api.py --- old/tesserocr-2.5.1/tests/test_api.py 2019-11-08 23:49:38.000000000 +0100 +++ new/tesserocr-2.5.2/tests/test_api.py 2021-06-19 22:02:07.000000000 +0200 @@ -2,8 +2,10 @@ import re import os.path import tesserocr + try: from PIL import Image + pil_installed = True except ImportError: pil_installed = False @@ -12,8 +14,8 @@ def version_to_int(version): subversion = None subtrahend = 0 - # Subtracts a certain amount from the version number to differentiate between - # alpha, beta and release versions. + # Subtracts a certain amount from the version number to differentiate + # between alpha, beta and release versions. if "alpha" in version: version_split = version.split("alpha") subversion = version_split[1] @@ -22,18 +24,19 @@ version_split = version.split("beta") subversion = version_split[1] subtrahend = 1 - version = re.search(r'((?:\d+\.)+\d+)', version).group() - # Split the groups on ".", take only the first one, and print each group with leading 0 if needed - # To be safe, also handle cases where an extra group is added to the version string, or if one or two groups - # are dropped. - version_groups = (version.split('.') + [0, 0])[:3] + version = re.search(r"((?:\d+\.)+\d+)", version).group() + # Split the groups on ".", take only the first one, and print each + # group with leading 0 if needed. To be safe, also handle cases where + # an extra group is added to the version string, or if one or two + # groups are dropped. + version_groups = (version.split(".") + [0, 0])[:3] version_str = "{:02}{:02}{:02}".format(*map(int, version_groups)) version_str = str((int(version_str, 10) - subtrahend)) # Adds a 2 digit subversion number for the subversionrelease. subversion_str = "00" - if subversion is not None and subversion is not "": - subversion = re.search(r'(?:\d+)', subversion).group() - subversion_groups = (subversion.split('-') + [0, 0])[:1] + if subversion is not None and subversion != "": + subversion = re.search(r"(?:\d+)", subversion).group() + subversion_groups = (subversion.split("-") + [0, 0])[:1] subversion_str = "{:02}".format(*map(int, subversion_groups)) version_str += subversion_str return int(version_str, 16) @@ -45,11 +48,11 @@ class TestTessBaseApi(unittest.TestCase): _test_dir = os.path.abspath(os.path.dirname(__file__)) - _image_file = os.path.join(_test_dir, 'eurotext.tif') + _image_file = os.path.join(_test_dir, "eurotext.png") def setUp(self): if pil_installed: - with open(self._image_file, 'rb') as f: + with open(self._image_file, "rb") as f: self._image = Image.open(f) self._image.load() self._api = tesserocr.PyTessBaseAPI(init=True) @@ -71,27 +74,27 @@ def test_init_full(self): """Test InitFull.""" # check default settings - self.assertEqual(self._api.GetVariableAsString('file_type'), '.tif') - self.assertEqual(self._api.GetVariableAsString('edges_childarea'), '0.5') + self.assertEqual(self._api.GetVariableAsString("file_type"), ".tif") + self.assertEqual(self._api.GetVariableAsString("edges_childarea"), "0.5") # use box.train config variables - configs = ['box.train'] + configs = ["box.train"] # change edges_childarea - vars_ = {'edges_childarea': '0.7'} + vars_ = {"edges_childarea": "0.7"} self._api.End() self._api.InitFull(configs=configs, variables=vars_) # assert file_type from box.train and custom edges_childarea - self.assertEqual(self._api.GetVariableAsString('file_type'), '.bl') - self.assertEqual(self._api.GetVariableAsString('edges_childarea'), '0.7') + self.assertEqual(self._api.GetVariableAsString("file_type"), ".bl") + self.assertEqual(self._api.GetVariableAsString("edges_childarea"), "0.7") # reset back to default self._api.End() self._api.Init() def test_init(self): """Test Init calls with different lang and oem.""" - self._api.Init(lang='eng+osd') - self.assertEqual(self._api.GetInitLanguagesAsString(), 'eng+osd') - self._api.Init(lang='eng') - self.assertEqual(self._api.GetInitLanguagesAsString(), 'eng') + self._api.Init(lang="eng+osd") + self.assertEqual(self._api.GetInitLanguagesAsString(), "eng+osd") + self._api.Init(lang="eng") + self.assertEqual(self._api.GetInitLanguagesAsString(), "eng") self._api.Init(oem=tesserocr.OEM.TESSERACT_ONLY) self.assertEqual(self._api.oem(), tesserocr.OEM.TESSERACT_ONLY) @@ -100,7 +103,7 @@ """Test SetImage and GetUTF8Text.""" self._api.SetImage(self._image) text = self._api.GetUTF8Text() - self.assertIn('quick', text) + self.assertIn("quick", text) text2 = tesserocr.image_to_text(self._image) self.assertEqual(text, text2) @@ -108,7 +111,7 @@ """Test SetImageFile and GetUTF8Text.""" self._api.SetImageFile(self._image_file) text = self._api.GetUTF8Text() - self.assertIn('quick', text) + self.assertIn("quick", text) text2 = tesserocr.file_to_text(self._image_file) self.assertEqual(text, text2) @@ -134,7 +137,9 @@ """Test GetDatapath and Init with an invalid data path.""" path = self._api.GetDatapath() self._api.End() - self.assertRaises(RuntimeError, self._api.Init, path=(self._test_dir + os.path.sep)) # no tessdata + self.assertRaises( + RuntimeError, self._api.Init, path=(self._test_dir + os.path.sep) + ) # no tessdata if _TESSERACT_VERSION >= 0x3999800: new_path = path else: @@ -145,17 +150,17 @@ def test_langs(self): """Test get langs methods.""" - self._api.Init(lang='eng') + self._api.Init(lang="eng") lang = self._api.GetInitLanguagesAsString() - self.assertEqual(lang, 'eng') + self.assertEqual(lang, "eng") langs = self._api.GetLoadedLanguages() - self.assertEqual(langs, ['eng']) - self.assertIn('eng', self._api.GetAvailableLanguages()) + self.assertEqual(langs, ["eng"]) + self.assertIn("eng", self._api.GetAvailableLanguages()) def test_variables(self): """Test SetVariable and GetVariableAsString.""" - self._api.SetVariable('debug_file', '/dev/null') - self.assertEqual(self._api.GetVariableAsString('debug_file'), '/dev/null') + self._api.SetVariable("debug_file", "/dev/null") + self.assertEqual(self._api.GetVariableAsString("debug_file"), "/dev/null") @unittest.skipIf(not pil_installed, "Pillow not installed") def test_rectangle(self): @@ -223,17 +228,25 @@ self._api.SetPageSegMode(tesserocr.PSM.OSD_ONLY) self._api.SetImageFile(self._image_file) orientation = self._api.DetectOS() - all(self.assertIn(k, orientation) for k in ['sconfidence', 'oconfidence', 'script', 'orientation']) - self.assertEqual(orientation['orientation'], 0) - languages = tesserocr.get_languages()[1] # this is sorted alphabetically! - self.assertLess(orientation['script'], len(languages)) - script_name = languages[orientation['script']] # therefore does not work - #self.assertEqual(script_name, 'Latin') # cannot test: not reliable + all( + self.assertIn(k, orientation) + for k in ["sconfidence", "oconfidence", "script", "orientation"] + ) + self.assertEqual(orientation["orientation"], 0) + # this is sorted alphabetically! + languages = tesserocr.get_languages()[1] + self.assertLess(orientation["script"], len(languages)) + # therefore does not work + # script_name = languages[orientation["script"]] + # self.assertEqual(script_name, 'Latin') # cannot test: not reliable if _TESSERACT_VERSION >= 0x3999800: orientation = self._api.DetectOrientationScript() - all(self.assertIn(k, orientation) for k in ['orient_deg', 'orient_conf', 'script_name', 'script_conf']) - self.assertEqual(orientation['orient_deg'], 0) - self.assertEqual(orientation['script_name'], 'Latin') + all( + self.assertIn(k, orientation) + for k in ["orient_deg", "orient_conf", "script_name", "script_conf"] + ) + self.assertEqual(orientation["orient_deg"], 0) + self.assertEqual(orientation["script_name"], "Latin") def test_clear(self): """Test Clear.""" @@ -272,10 +285,10 @@ result = self._api.GetComponentImages(tesserocr.RIL.BLOCK, True) # Test if not empty self.assertTrue(result) - _, xywh, _, _ = result[0] # bbox of largest - self.assertIn('w', xywh) - self.assertIn('h', xywh) - area = xywh['w'] * xywh['h'] + _, xywh, _, _ = result[0] # bbox of largest + self.assertIn("w", xywh) + self.assertIn("h", xywh) + area = xywh["w"] * xywh["h"] # Test if the largest block is quite large self.assertGreater(area, 400000) @@ -286,7 +299,7 @@ # Test if not empty self.assertTrue(layout) self.assertFalse(layout.Empty(tesserocr.RIL.BLOCK)) - result = layout.BoundingBox(tesserocr.RIL.BLOCK) # bbox of largest + result = layout.BoundingBox(tesserocr.RIL.BLOCK) # bbox of largest self.assertIsNot(result, None) x0, y0, x1, y1 = result area = (x1 - x0) * (y1 - y0) @@ -300,7 +313,7 @@ # Test if not empty self.assertTrue(layout) self.assertFalse(layout.Empty(tesserocr.RIL.BLOCK)) - result = layout.BlockPolygon() # polygon of largest + result = layout.BlockPolygon() # polygon of largest # Test if not empty self.assertIsNot(result, None) # Test there are at least 4 contour points @@ -318,7 +331,7 @@ res = self._api.Recognize(1) self.assertFalse(res) self._api.SetImageFile(self._image_file) - # timeout after 10 seocnds (unlikely) + # timeout after 10 seconds (unlikely) res = self._api.Recognize(10000) self.assertTrue(res) self._api.SetImageFile(self._image_file) @@ -332,10 +345,10 @@ self._api.Recognize() it = self._api.GetIterator() attrs = it.RowAttributes() - self.assertIsInstance(attrs['row_height'], float) - self.assertIsInstance(attrs['ascenders'], float) - self.assertIsInstance(attrs['descenders'], float) + self.assertIsInstance(attrs["row_height"], float) + self.assertIsInstance(attrs["ascenders"], float) + self.assertIsInstance(attrs["descenders"], float) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main()