[MediaWiki-commits] [Gerrit] Added DjVU class and djvutext.py in core - change (pywikibot/core)

Mpaa (Code Review) Wed, 13 May 2015 13:48:16 -0700

Mpaa has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/210808


Change subject: Added DjVU class and djvutext.py in core
......................................................................

Added DjVU class and djvutext.py in core

Added:
- DjVu class: wrapper to access djvu file text and properties
- added tests for DjVu class
- ported djvutext.py functionality from compat (basing it on Bot class)

Change-Id: I88ba445fd49046430dfcb78d5b8a0ab46e2343fb
---
A djvusub.py
A pywikibot/tools/djvu.py
A scripts/djvutext.py
A tests/data/djvu/myfile.djvu
A tests/data/djvu/myfile_wo_text.djvu
A tests/data/myfile_wo_text.djvu
A tests/djvu_tests.py
7 files changed, 391 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core 
refs/changes/08/210808/1

diff --git a/djvusub.py b/djvusub.py
new file mode 100644
index 0000000..1f39af2
--- /dev/null
+++ b/djvusub.py
@@ -0,0 +1,53 @@
+#!/usr/bin/python
+# -*- coding: utf-8  -*-
+
+import os
+import sys
+import codecs
+import subprocess
+import pywikibot
+
+filename = 'myfile.djvu'
+
+def number_of_images():
+    """Return the page count that ``djvused -e n`` reports for `filename`.
+
+    A non-zero exit status of djvused is reported through pywikibot.error().
+    The captured output is parsed regardless, so int() raises ValueError
+    when djvused printed nothing that can be read as a number.
+    """
+    dp = subprocess.Popen(['djvused', '-e', 'n', filename],
+                          stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    stdoutdata, stderrdata = dp.communicate()
+    if dp.returncode != 0:
+        pywikibot.error(stderrdata)
+    count = int(stdoutdata)
+    pywikibot.output("page count = %d" % count)
+    return count
+
+def has_text():
+    """Return True if the ``djvudump`` output for `filename` contains 'TXTz'.
+
+    A non-zero exit status of djvudump is reported through pywikibot.error();
+    whatever was captured on stdout is still decoded, printed and searched.
+    """
+    dp = subprocess.Popen(['djvudump', filename],
+                          stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    stdoutdata, stderrdata = dp.communicate()
+    if dp.returncode != 0:
+        pywikibot.error(stderrdata)
+    # communicate() has already read dp.stdout to EOF and closed it, so the
+    # captured bytes must be used; reading the pipe again raises ValueError.
+    txt = stdoutdata.decode('utf-8')
+    pywikibot.output(txt)
+    return 'TXTz' in txt
+
+def get_page():
+    """Return the text ``djvutxt --page=1`` extracts from `filename`.
+
+    A non-zero exit status of djvutxt is reported through pywikibot.error();
+    whatever was captured on stdout is still decoded, printed and returned.
+    """
+    dp = subprocess.Popen(['djvutxt', '--page=1', filename],
+                          stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    stdoutdata, stderrdata = dp.communicate()
+    if dp.returncode != 0:
+        pywikibot.error(stderrdata)
+    # communicate() has already read dp.stdout to EOF and closed it, so the
+    # captured bytes must be used; reading the pipe again raises ValueError.
+    txt = stdoutdata.decode('utf-8')
+    pywikibot.output(txt)
+    return txt
+
+# Call each helper once against `filename`; the page text is kept in `x`
+# but not used any further.
+print(number_of_images())
+print(has_text())
+x = get_page()
diff --git a/pywikibot/tools/djvu.py b/pywikibot/tools/djvu.py
new file mode 100644
index 0000000..b7145bc
--- /dev/null
+++ b/pywikibot/tools/djvu.py
@@ -0,0 +1,95 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+"""Wrapper around djvulibre to access djvu files properties and content."""
+#
+# (C) Pywikibot team, 2015
+#
+# Distributed under the terms of the MIT license.
+#
+from __future__ import unicode_literals
+
+__version__ = '$Id$'
+
+import os.path
+import subprocess
+
+import pywikibot
+
+
+class DjVu(object):
+
+    """Wrapper around djvulibre to access djvu files properties and content.
+
+    Perform file existence checks.
+
+    Control characters in djvu text-layer are converted for convenience
+    (see http://djvu.sourceforge.net/doc/man/djvused.html for control chars
+    details).
+
+    """
+
+    def __init__(self, file_djvu):
+        """
+        Constructor.
+
+        @param file_djvu: filename (including path) to djvu file
+        @type  file_djvu: string/unicode
+        @raise AttributeError: file_djvu is not a path string (e.g. None)
+        @raise IOError: the file does not exist or cannot be read
+        """
+        try:
+            file_djvu = os.path.expanduser(file_djvu)
+        except (AttributeError, TypeError):
+            # Depending on the Python version, expanduser() rejects a
+            # non-string either with AttributeError or with TypeError;
+            # callers are promised AttributeError.
+            raise AttributeError('No djvu filename defined.')
+
+        # Opening the file is the existence/readability check; an IOError
+        # propagates to the caller unchanged.
+        with open(file_djvu):
+            self.file_djvu = file_djvu
+
+    @staticmethod
+    def _run(args):
+        """Run a djvulibre command line and return its raw stdout bytes.
+
+        A non-zero exit status is reported through pywikibot.error() together
+        with the decoded stderr; the captured stdout is returned regardless.
+        """
+        dp = subprocess.Popen(args,
+                              stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        stdoutdata, stderrdata = dp.communicate()
+        if dp.returncode != 0:
+            pywikibot.error('djvulibre library error!\n%s'
+                            % stderrdata.decode('utf-8', 'replace'))
+        return stdoutdata
+
+    def number_of_images(self):
+        """Return the (cached) number of images in the djvu file."""
+        if not hasattr(self, '_n'):
+            stdoutdata = self._run(['djvused', '-e', 'n', self.file_djvu])
+            self._n = int(stdoutdata)
+        return self._n
+
+    def has_text(self):
+        """Test if the djvu file has a text-layer (result is cached)."""
+        if not hasattr(self, '_has_text'):
+            stdoutdata = self._run(['djvudump', self.file_djvu])
+            # The check is a plain substring search for 'TXTz' in the dump.
+            self._has_text = 'TXTz' in stdoutdata.decode('utf-8')
+        return self._has_text
+
+    def get_page(self, n):
+        """Get the text of page n of the djvu file.
+
+        @param n: page number, counted from 1
+        @type  n: int
+        @return: page text with djvu control characters removed or replaced
+            by newlines, stripped of surrounding form feeds, newlines, spaces
+        @raise ValueError: the file has no text layer, or n is outside
+            1..number_of_images()
+        """
+        if not self.has_text():
+            raise ValueError('Djvu file %s has no text layer.'
+                             % self.file_djvu)
+        if n > self.number_of_images():
+            raise ValueError('Number of pages [%d] in %s is less than %d'
+                             % (self.number_of_images(), self.file_djvu, n))
+        if n < 1:
+            raise ValueError('Page number %d in %s is less than 1'
+                             % (n, self.file_djvu))
+        stdoutdata = self._run(['djvutxt', '--page=%d' % n, self.file_djvu])
+
+        # http://djvu.sourceforge.net/doc/man/djvused.html for control chars.
+        txt = stdoutdata.decode('utf-8')
+        # vertical tab (\013=\x0b): remove
+        txt = txt.replace('\x0b', '')
+        # group separator (\035=\x1d): replace with \n
+        txt = txt.replace('\x1d', '\n')
+        # unit separator (\037=\x1f): replace with \n
+        txt = txt.replace('\x1f', '\n')
+        # feed char (\f=\x0c), \n and surrounding spaces: strip
+        txt = txt.strip('\x0c\n ')
+        pywikibot.output(txt)
+        return txt
diff --git a/scripts/djvutext.py b/scripts/djvutext.py
new file mode 100644
index 0000000..92ab638
--- /dev/null
+++ b/scripts/djvutext.py
@@ -0,0 +1,179 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+"""
+This bot uploads text from djvu files onto pages in the "Page" namespace.
+
+It is intended to be used for Wikisource.
+
+The following parameters are supported:
+
+    -djvu:...      filename of the djvu file
+    -index:...     name of the index page
+                   optional, default: the djvu filename).
+    -pages:<start>-<end> Page range to upload;
+                   optional, start=1, end=djvu file number of images
+    -force         overwrites existing text
+                   optional, default False
+    -summary:      custom edit summary.
+                   Use quotes if edit summary contains spaces.
+    -always        don't bother asking to confirm any of the changes.
+
+python scripts/djvutext.py -djvu:Nietzsche_the_thinker.djvu -pages:2-5
+"""
+#
+# (C) Pywikibot team, 2008-2015
+#
+# Distributed under the terms of the MIT license.
+#
+from __future__ import unicode_literals
+
+__version__ = '$Id$'
+#
+
+import os.path
+
+import pywikibot
+from pywikibot import i18n, Bot
+from pywikibot.tools.djvu import DjVu
+
+# TODO: port to ProofreadPage, once ProofreadPage is merged.
+
+
+class DjVuTextBot(Bot):
+
+    """
+    A bot that uploads text-layer from djvu files to Page:namespace.
+
+    Works only on wikisource.
+    """
+
+    def __init__(self, djvu, index, pages=None, **kwargs):
+        """
+        Constructor.
+
+        @param djvu: djvu from where to fetch the text layer
+        @type  djvu: DjVu object
+        @param index: index page in the Index: namespace
+        @type  index: Page object
+        @param pages: page interval to upload (start, end); when omitted or
+            empty, pages 1 to djvu.number_of_images() are used
+        @type  pages: tuple
+        """
+        self.availableOptions.update({
+            'force': False,
+            'summary': None
+        })
+        super(DjVuTextBot, self).__init__(**kwargs)
+        self.djvu = djvu
+        self.index = index
+        self.prefix = self.index.title(withNamespace=False)
+
+        if not pages:
+            self.pages = (1, self.djvu.number_of_images())
+        else:
+            self.pages = pages
+
+        self.generator = self.gen()
+
+        # Fall back to the translated default edit summary if none was given.
+        if not self.getOption('summary'):
+            params = {}
+            self.options['summary'] = i18n.twntranslate(
+                self.index.site, 'djvutext-creating', params)
+
+    def gen(self):
+        """Generate pages from specified page interval (both ends included)."""
+        start, end = self.pages
+        for n in range(start, end + 1):
+            title = '{prefix}/{number}'.format(prefix=self.prefix, number=n)
+            page = pywikibot.Page(self.index.site, title,
+                                  ns=self.index.site.proofread_page_ns)
+            page.n = n  # remember page number in djvu file
+            yield page
+
+    def treat(self, page):
+        """Process one page: fetch its djvu text and save it via userPut."""
+        new_text = self.djvu.get_page(page.n)
+
+        # A page that does not exist yet is treated as having empty text.
+        if page.exists():
+            old_text = page.text
+        else:
+            old_text = ''
+
+        summary = self.getOption('summary')
+        force = self.getOption('force')
+
+        self.userPut(page, old_text, new_text,
+                     summary=summary, minor=True, botflag=True, force=force)
+
+
+def main(*args):
+    """
+    Process command line arguments and invoke bot.
+
+    If args is an empty list, sys.argv is used.
+
+    The -djvu argument is mandatory; without it the help text is shown and
+    nothing else is done. -index defaults to the basename of the djvu file.
+
+    @param args: command line arguments
+    @type args: list of unicode
+    """
+    file_djvu = None
+    index = None
+    pages = '1'
+    options = {}
+
+    # Parse command line arguments
+    local_args = pywikibot.handle_args(args)
+    for arg in local_args:
+        if arg == '-force':
+            options['force'] = True
+        elif arg.startswith('-djvu:'):
+            file_djvu = arg[len('-djvu:'):]
+        elif arg.startswith('-index:'):
+            index = arg[len('-index:'):]
+        elif arg.startswith('-pages:'):
+            pages = arg[len('-pages:'):]
+        elif arg == '-always':
+            options['always'] = True
+        elif arg.startswith('-summary:'):
+            options['summary'] = arg[len('-summary:'):]
+        else:
+            pywikibot.output('Unknown argument %s' % arg)
+
+    # The djvu file cannot be derived from anything else, so it is required
+    # even when -index is given.
+    if not file_djvu:
+        pywikibot.showHelp()
+        return
+
+    # Check the djvu file exists and, if so, create the DjVu wrapper.
+    djvu = DjVu(file_djvu)
+
+    if not djvu.has_text():
+        raise ValueError('No text layer in djvu file %s' % djvu.file_djvu)
+
+    # Parse pages param: "<start>-<end>"; a missing start means 1, a missing
+    # end means the last page of the djvu file.
+    start, sep, end = pages.partition('-')
+    pages = (int(start) if start else 1,
+             int(end) if (sep and end) else djvu.number_of_images())
+
+    if not index:
+        index = os.path.basename(file_djvu)
+
+    site = pywikibot.Site()
+
+    if site.family != 'wikisource':
+        raise pywikibot.PageNotFound('Found family %s; Wikisource required.'
+                                     % site.family)
+
+    index_page = pywikibot.Page(site, index, ns=site.proofread_index_ns)
+
+    if not index_page.exists():
+        raise pywikibot.NoPage('Page %s does not exist' % index)
+
+    pywikibot.output('uploading text from %s to %s'
+                     % (djvu.file_djvu, index_page.title(asLink=True)))
+
+    bot = DjVuTextBot(djvu, index_page, pages, **options)
+    bot.run()
+
+# Script entry point: run main() and log any exception, with its traceback,
+# through pywikibot.error() instead of letting it propagate.
+if __name__ == "__main__":
+    try:
+        main()
+    except Exception:
+        pywikibot.error('Fatal error:', exc_info=True)
diff --git a/tests/data/djvu/myfile.djvu b/tests/data/djvu/myfile.djvu
new file mode 100755
index 0000000..eedbbcb
--- /dev/null
+++ b/tests/data/djvu/myfile.djvu
Binary files differ
diff --git a/tests/data/djvu/myfile_wo_text.djvu 
b/tests/data/djvu/myfile_wo_text.djvu
new file mode 100644
index 0000000..a40d16b
--- /dev/null
+++ b/tests/data/djvu/myfile_wo_text.djvu
Binary files differ
diff --git a/tests/data/myfile_wo_text.djvu b/tests/data/myfile_wo_text.djvu
new file mode 100644
index 0000000..a40d16b
--- /dev/null
+++ b/tests/data/myfile_wo_text.djvu
Binary files differ
diff --git a/tests/djvu_tests.py b/tests/djvu_tests.py
new file mode 100644
index 0000000..365d2b0
--- /dev/null
+++ b/tests/djvu_tests.py
@@ -0,0 +1,64 @@
+#!/usr/bin/python
+# -*- coding: utf-8  -*-
+"""Unit tests for the DjVu wrapper class in pywikibot.tools.djvu."""
+from __future__ import unicode_literals
+
+import os
+from tests import _data_dir
+from tests.aspects import unittest
+from pywikibot.tools.djvu import DjVu
+
+# python -m unittest tests.djvu_tests.TestDjVu
+
+_djvu_dir = 'djvu'
+
+
+class TestDjVu(unittest.TestCase):
+
+    """Test DjVu class."""
+
+    # Paths of the fixture files, all below <_data_dir>/<_djvu_dir>.
+    file_djvu_not_existing = os.path.join(_data_dir, _djvu_dir,
+                                          'not_existing.djvu')
+    file_djvu = os.path.join(_data_dir, _djvu_dir, 'myfile.djvu')
+    file_djvu_wo_text = os.path.join(_data_dir, _djvu_dir,
+                                     'myfile_wo_text.djvu')
+    # Text expected from get_page(1) on file_djvu.
+    test_txt = 'A file with non-ASCII characters, \nlike é or ç'
+
+    def test_file_existance(self):
+        """Test file existence checks."""
+        djvu = DjVu(self.file_djvu)
+        self.assertEqual(self.file_djvu, djvu.file_djvu)
+        self.assertRaises(IOError, DjVu, self.file_djvu_not_existing)
+        self.assertRaises(AttributeError, DjVu, None)
+
+    def test_number_of_images(self):
+        """Test the page count reported for the djvu file."""
+        djvu = DjVu(self.file_djvu)
+        self.assertEqual(djvu.number_of_images(), 4)
+
+    def test_has_text(self):
+        """Test if djvu file contains text."""
+        djvu = DjVu(self.file_djvu)
+        self.assertTrue(djvu.has_text())
+        djvu = DjVu(self.file_djvu_wo_text)
+        self.assertFalse(djvu.has_text())
+
+    def test_get_existing_page_number(self):
+        """Test text retrieval for a page number present in the file."""
+        djvu = DjVu(self.file_djvu)
+        self.assertTrue(djvu.has_text())
+        txt = djvu.get_page(1)
+        self.assertEqual(txt, self.test_txt)
+
+    def test_get_not_existing_page_number(self):
+        """Test that a page number beyond the last page raises ValueError."""
+        djvu = DjVu(self.file_djvu)
+        self.assertTrue(djvu.has_text())
+        self.assertRaises(ValueError, djvu.get_page, 100)
+
+    def test_get_not_existing_page(self):
+        """Test that a file without text layer raises ValueError."""
+        djvu = DjVu(self.file_djvu_wo_text)
+        self.assertFalse(djvu.has_text())
+        self.assertRaises(ValueError, djvu.get_page, 100)
+
+# Run the tests of this module when the file is executed as a script.
+if __name__ == "__main__":
+    unittest.main()

-- 
To view, visit https://gerrit.wikimedia.org/r/210808
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I88ba445fd49046430dfcb78d5b8a0ab46e2343fb
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Mpaa <mpaa.w...@gmail.com>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] Added DjVU class and djvutext.py in core - change (pywikibot/core)

Reply via email to