Mpaa has uploaded a new change for review.

Change subject: Added DjVu class and djvutext script in core

Added DjVu class and djvutext script in core

- DjVu class: wrapper to access djvu file text and properties
- added tests for DjVu class
- ported functionality from compat (basing it on Bot class)

Change-Id: I88ba445fd49046430dfcb78d5b8a0ab46e2343fb
A pywikibot/tools/
A scripts/
A tests/data/djvu/myfile.djvu
A tests/data/djvu/myfile_wo_text.djvu
A tests/data/myfile_wo_text.djvu
A tests/
7 files changed, 391 insertions(+), 0 deletions(-)

  git pull ssh:// 

diff --git a/ b/
new file mode 100644
index 0000000..1f39af2
--- /dev/null
+++ b/
@@ -0,0 +1,53 @@
+# -*- coding: utf-8  -*-
+import os
+import sys
+import codecs
+import subprocess
+import pywikibot
+filename = 'myfile.djvu'
+def number_of_images():
+    dp = subprocess.Popen(['djvused', '-e', "n", filename], 
stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    (stdoutdata, stderrdata) = dp.communicate()
+    if dp.returncode != 0:
+        pywikibot.error(stderrdata)
+    count = int(stdoutdata)
+    pywikibot.output("page count = %d" % count)
+    return count
+def has_text():
+    #cmd = u"djvudump \"%s\" > \"%s\".out" % (self.djvu, self.djvu)
+    dp = subprocess.Popen(['djvudump', filename], stdout=subprocess.PIPE, 
+    (stdoutdata, stderrdata) = dp.communicate()
+    if dp.returncode != 0:
+        pywikibot.error(stderrdata)
+    txt =
+    txt = txt.decode('utf-8')
+    pywikibot.output(txt)
+    dp.stdout.close()
+    return 'TXTz' in txt
+def get_page():
+    dp = subprocess.Popen(['djvutxt', '--page=1', filename], 
stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    (stdoutdata, stderrdata) = dp.communicate()
+    if dp.returncode != 0:
+        pywikibot.error(stderrdata)
+    #import pdb; pdb.set_trace()
+    txt =
+    txt = txt.decode('utf-8')
+    pywikibot.output(txt)
+    dp.stdout.close()
+    #cmd = u"djvutxt --page=%d \"%s\" \"%s.out\"" % (pageno, self.djvu, 
+    #os.system(cmd.encode(sys.stdout.encoding))
+    #f ="%s.out" % self.djvu, 'r',
+                    #config.textfile_encoding, 'replace')
+    #djvu_text =
+    #f.close()
+    return txt
+x = get_page()
diff --git a/pywikibot/tools/ b/pywikibot/tools/
new file mode 100644
index 0000000..b7145bc
--- /dev/null
+++ b/pywikibot/tools/
@@ -0,0 +1,95 @@
+# -*- coding: utf-8 -*-
+"""Wrapper around djvulibre to access djvu files properties and content."""
+# (C) Pywikibot team, 2015
+# Distributed under the terms of the MIT license.
+from __future__ import unicode_literals
+__version__ = '$Id$'
+import os.path
+import subprocess
+import pywikibot
+class DjVu(object):
+    """Wrapper around djvulibre to access djvu files properties and content.
+    Perform file existence checks.
+    Control characters in djvu text-layer are converted for convenience
+    (see # for control chars
+    details).
+    """
+    def __init__(self, file_djvu):
+        """
+        Constructor.
+        @param file_djvu: filename (including path) to djvu file
+        @type  file_djvu: string/unicode
+        """
+        try:
+            file_djvu = os.path.expanduser(file_djvu)
+        except AttributeError:
+            raise AttributeError('No djvu filename defined.')
+        try:
+            with open(file_djvu):
+                self.file_djvu = file_djvu
+        except IOError:
+            # Does not exist OR no read permissions.
+            raise
+    def number_of_images(self):
+        """Return the (cached) number of images in the djvu file."""
+        if not hasattr(self, '_n'):
+            dp = subprocess.Popen(['djvused', '-e', "n", self.file_djvu],
+                                  stdout=subprocess.PIPE, 
+            (stdoutdata, stderrdata) = dp.communicate()
+            if dp.returncode != 0:
+                pywikibot.error('djvulibre library error!\n%s' % stderrdata)
+            self._n = int(stdoutdata)
+        return self._n
+    def has_text(self):
+        """Test if the djvu file has a text-layer."""
+        if not hasattr(self, '_has_text'):
+            dp = subprocess.Popen(['djvudump', self.file_djvu],
+                                  stdout=subprocess.PIPE, 
+            (stdoutdata, stderrdata) = dp.communicate()
+            #import pdb; pdb.set_trace()
+            if dp.returncode != 0:
+                pywikibot.error('djvulibre library error!\n%s' % stderrdata)
+            txt = stdoutdata.decode('utf-8')
+            self._has_text = 'TXTz' in txt
+        return self._has_text
+    def get_page(self, n):
+        """Get page n for djvu file."""
+        if not self.has_text():
+            raise ValueError('Djvu file %s has no text layer.' % 
+        if n > self.number_of_images():
+            raise ValueError('Number of pages [%d] in %s is less than %d'
+                              % (self._n, self.file_djvu, n))
+        dp = subprocess.Popen(['djvutxt', '--page=%d' % n, self.file_djvu],
+                              stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        (stdoutdata, stderrdata) = dp.communicate()
+        if dp.returncode != 0:
+            pywikibot.error('djvulibre library error!\n%s' % stderrdata)
+        #import pdb; pdb.set_trace()
+        # for control chars.
+        txt = stdoutdata.decode('utf-8')
+        txt = txt.replace('\x0b', '')  # vertical tab (\013=\x0b): remove
+        txt = txt.replace('\x1d', '\n')  # group (\035=\x1d) separator: 
replace with \n
+        txt = txt.replace('\x1f', '\n')  # unit separator (\037=\x1f): replace 
with \n
+        txt = txt.strip('\x0c\n ')  # feed char (\f=\x0c), \n and trailing 
spaces: strip
+        pywikibot.output(txt)
+        return txt
diff --git a/scripts/ b/scripts/
new file mode 100644
index 0000000..92ab638
--- /dev/null
+++ b/scripts/
@@ -0,0 +1,179 @@
+# -*- coding: utf-8 -*-
+This bot uploads text from djvu files onto pages in the "Page" namespace.
+It is intended to be used for Wikisource.
+The following parameters are supported:
+    -djvu:...      filename of the djvu file
+    -index:...     name of the index page
+                   optional, default: the djvu filename.
+    -pages:<start>-<end> Page range to upload;
+                   optional, start=1, end=djvu file number of images
+    -force         overwrites existing text
+                   optional, default False
+    -summary:      custom edit summary.
+                   Use quotes if edit summary contains spaces.
+    -always        don't bother asking to confirm any of the changes.
+python scripts/ -djvu:Nietzsche_the_thinker.djvu -pages:2-5
+# (C) Pywikibot team, 2008-2015
+# Distributed under the terms of the MIT license.
+from __future__ import unicode_literals
+__version__ = '$Id$'
+import os.path
+import pywikibot
+from pywikibot import i18n, Bot
+from import DjVu
+#TODO: port to ProofreadPage, once ProofreadPage is merged.
+class DjVuTextBot(Bot):
+    """
+    A bot that uploads text-layer from djvu files to Page:namespace.
+    Works only on wikisource.
+    """
+    def __init__(self, djvu, index, pages=None, **kwargs):
+        """
+        Constructor.
+        @param djvu: djvu from where to fetch the text layer
+        @type  djvu: DjVu object
+        @param index: index page in the Index: namespace
+        @type  index: Page object
+        @param pages: page interval to upload (start, end)
+        @type  pages: tuple
+        """
+        self.availableOptions.update({
+            'force': False,
+            'summary': None
+        })
+        super(DjVuTextBot, self).__init__(**kwargs)
+        self.djvu = djvu
+        self.index = index
+        self.prefix = self.index.title(withNamespace=False)
+        if not pages:
+            self.pages = (1, self.djvu.number_of_images())
+        else:
+            self.pages = pages
+        self.generator = self.gen()
+        # get edit summary message if it's empty
+        if not self.getOption('summary'):
+            params = {}
+            self.options['summary'] = i18n.twntranslate(
+      , 'djvutext-creating', params)
+    def gen(self):
+        """Generate pages from specified page interval."""
+        start, end = self.pages
+        for n in range(start, end + 1):
+            title = '{prefix}/{number}'.format(prefix=self.prefix, number=n)
+            page = pywikibot.Page(, title,
+            page.n = n  # remember page number in djvu file
+            yield page
+    def treat(self, page):
+        """Process one page."""
+        new_text = self.djvu.get_page(page.n)
+        if page.exists():
+            old_text = page.text
+        else:
+            old_text = ''
+        summary = self.getOption('summary')
+        force = self.getOption('force')
+        self.userPut(page, old_text, new_text,
+                         summary=summary, minor=True, botflag=True, 
+def main(*args):
+    """
+    Process command line arguments and invoke bot.
+    If args is an empty list, sys.argv is used.
+    @param args: command line arguments
+    @type args: list of unicode
+    """
+    file_djvu = None
+    index = None
+    pages = '1'
+    options = {}
+    # Parse command line arguments
+    local_args = pywikibot.handle_args(args)
+    for arg in local_args:
+        if arg == '-force':
+            options['force'] = True
+        elif arg.startswith('-djvu:'):
+            file_djvu = arg[6:]
+        elif arg.startswith('-index:'):
+            index = arg[7:]
+        elif arg.startswith('-pages:'):
+            pages = arg[7:]
+        elif arg == '-always':
+            options['always'] = True
+        elif arg.startswith('-summary:'):
+            options['summary'] = arg[len('-summary:'):]
+        else:
+            pywikibot.output('Unknown argument %s' % arg)
+    if not (file_djvu or index):
+        pywikibot.showHelp()
+        return
+    # Check the djvu file exists and, if so, create the DjVu wrapper.
+    djvu = DjVu(file_djvu)
+    if not djvu.has_text():
+        raise ValueError('No text layer in djvu file %s' % djvu.file_djvu)
+    # parse pages param
+    start, sep, end = pages.partition('-')
+    pages = (int(start),
+             int(end) if (sep and end) else djvu.number_of_images())
+    if not index:
+        index = os.path.basename(file_djvu)
+    site = pywikibot.Site()
+    if != 'wikisource':
+        raise pywikibot.PageNotFound('Found family %s; Wikisource required.'
+                                      %
+    index_page = pywikibot.Page(site, index, ns=site.proofread_index_ns)
+    if not index_page.exists():
+        raise pywikibot.NoPage('Page %s does not exist' % index)
+    pywikibot.output('uploading text from %s to %s'
+                     % (djvu.file_djvu, index_page.title(asLink=True)))
+    bot = DjVuTextBot(djvu, index_page, pages, **options)
+if __name__ == "__main__":
+    try:
+        main()
+    except Exception:
+        pywikibot.error('Fatal error:', exc_info=True)
diff --git a/tests/data/djvu/myfile.djvu b/tests/data/djvu/myfile.djvu
new file mode 100755
index 0000000..eedbbcb
--- /dev/null
+++ b/tests/data/djvu/myfile.djvu
Binary files differ
diff --git a/tests/data/djvu/myfile_wo_text.djvu 
new file mode 100644
index 0000000..a40d16b
--- /dev/null
+++ b/tests/data/djvu/myfile_wo_text.djvu
Binary files differ
diff --git a/tests/data/myfile_wo_text.djvu b/tests/data/myfile_wo_text.djvu
new file mode 100644
index 0000000..a40d16b
--- /dev/null
+++ b/tests/data/myfile_wo_text.djvu
Binary files differ
diff --git a/tests/ b/tests/
new file mode 100644
index 0000000..365d2b0
--- /dev/null
+++ b/tests/
@@ -0,0 +1,64 @@
+# -*- coding: utf-8  -*-
+"""Unit tests for the DjVu wrapper class."""
+from __future__ import unicode_literals
+import os
+from tests import _data_dir
+from tests.aspects import unittest
+from import DjVu
+# python -m unittest tests.djvu_tests.TestDjVu
+_djvu_dir = 'djvu'
+class TestDjVu(unittest.TestCase):
+    """Test DjVu class."""
+    file_djvu_not_existing = os.path.join(_data_dir, _djvu_dir, 
+    file_djvu = os.path.join(_data_dir, _djvu_dir, 'myfile.djvu')
+    file_djvu_wo_text = os.path.join(_data_dir, _djvu_dir, 
+    test_txt = 'A file with non-ASCII characters, \nlike é or ç'
+    def test_file_existance(self):
+        """Test file existence checks."""
+        djvu = DjVu(self.file_djvu)
+        self.assertEqual(self.file_djvu, djvu.file_djvu)
+        self.assertRaises(IOError, lambda: DjVu(self.file_djvu_not_existing))
+        self.assertRaises(AttributeError, lambda: DjVu(None))
+    def test_number_of_images(self):
+        """Test page number generator."""
+        djvu = DjVu(self.file_djvu)
+        self.assertEqual(djvu.number_of_images(), 4)
+    def test_has_text(self):
+        """Test if djvu file contains text."""
+        djvu = DjVu(self.file_djvu)
+        self.assertTrue(djvu.has_text())
+        djvu = DjVu(self.file_djvu_wo_text)
+        self.assertFalse(djvu.has_text())
+    def test_get_existing_page_number(self):
+        """Test that the text of an existing page is returned."""
+        djvu = DjVu(self.file_djvu)
+        self.assertTrue(djvu.has_text())
+        txt = djvu.get_page(1)
+        self.assertEqual(txt, self.test_txt)
+    def test_get_not_existing_page_number(self):
+        """Test that a page number beyond the last page raises ValueError."""
+        djvu = DjVu(self.file_djvu)
+        self.assertTrue(djvu.has_text())
+        self.assertRaises(ValueError, lambda: djvu.get_page(100))
+    def test_get_not_existing_page(self):
+        """Test that get_page raises ValueError when there is no text layer."""
+        djvu = DjVu(self.file_djvu_wo_text)
+        self.assertFalse(djvu.has_text())
+        self.assertRaises(ValueError, lambda: djvu.get_page(100))
+if __name__ == "__main__":
+    unittest.main()

To view, visit
To unsubscribe, visit

Gerrit-MessageType: newchange
Gerrit-Change-Id: I88ba445fd49046430dfcb78d5b8a0ab46e2343fb
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Mpaa <>

MediaWiki-commits mailing list

Reply via email to