Mpaa has uploaded a new change for review. https://gerrit.wikimedia.org/r/210808
Change subject: Added DjVU class and djvutext.py in core ...................................................................... Added DjVU class and djvutext.py in core Added: - DjVu class: wrapper to access djvu file text and properties - added tests for DjVu class - ported djvutext.py functionality from compat (basing it on Bot class) Change-Id: I88ba445fd49046430dfcb78d5b8a0ab46e2343fb --- A djvusub.py A pywikibot/tools/djvu.py A scripts/djvutext.py A tests/data/djvu/myfile.djvu A tests/data/djvu/myfile_wo_text.djvu A tests/data/myfile_wo_text.djvu A tests/djvu_tests.py 7 files changed, 391 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core refs/changes/08/210808/1 diff --git a/djvusub.py b/djvusub.py new file mode 100644 index 0000000..1f39af2 --- /dev/null +++ b/djvusub.py @@ -0,0 +1,53 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +import os +import sys +import codecs +import subprocess +import pywikibot + +filename = 'myfile.djvu' + +def number_of_images(): + dp = subprocess.Popen(['djvused', '-e', "n", filename], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + (stdoutdata, stderrdata) = dp.communicate() + if dp.returncode != 0: + pywikibot.error(stderrdata) + count = int(stdoutdata) + pywikibot.output("page count = %d" % count) + return count + +def has_text(): + #cmd = u"djvudump \"%s\" > \"%s\".out" % (self.djvu, self.djvu) + dp = subprocess.Popen(['djvudump', filename], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + (stdoutdata, stderrdata) = dp.communicate() + if dp.returncode != 0: + pywikibot.error(stderrdata) + txt = dp.stdout.read() + txt = txt.decode('utf-8') + pywikibot.output(txt) + dp.stdout.close() + return 'TXTz' in txt + +def get_page(): + dp = subprocess.Popen(['djvutxt', '--page=1', filename], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + (stdoutdata, stderrdata) = dp.communicate() + if dp.returncode != 0: + pywikibot.error(stderrdata) + #import pdb; pdb.set_trace() + txt = dp.stdout.read() + txt = txt.decode('utf-8') + pywikibot.output(txt) + dp.stdout.close() + #cmd = u"djvutxt --page=%d \"%s\" \"%s.out\"" % (pageno, self.djvu, self.djvu) + #os.system(cmd.encode(sys.stdout.encoding)) + #f = codecs.open(u"%s.out" % self.djvu, 'r', + #config.textfile_encoding, 'replace') + #djvu_text = f.read() + #f.close() + return txt + +print(number_of_images()) +print(has_text()) +x = get_page() diff --git a/pywikibot/tools/djvu.py b/pywikibot/tools/djvu.py new file mode 100644 index 0000000..b7145bc --- /dev/null +++ b/pywikibot/tools/djvu.py @@ -0,0 +1,95 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +"""Wrapper around djvulibre to access djvu files properties and content.""" +# +# (C) Pywikibot team, 2015 +# +# Distributed under the terms of the MIT license. +# +from __future__ import unicode_literals + +__version__ = '$Id$' + +import os.path +import subprocess + +import pywikibot + + +class DjVu(object): + + """Wrapper around djvulibre to access djvu files properties and content. + + Perform file existance checks. + + Control characters in djvu text-layer are converted for convenience + (see # http://djvu.sourceforge.net/doc/man/djvused.html for control chars + details). + + """ + + def __init__(self, file_djvu): + """ + Constructor. + + @param djvu: filename (including path) to djvu file + @type djvu: string/unicode + """ + try: + file_djvu = os.path.expanduser(file_djvu) + except AttributeError: + raise AttributeError('No djvu filename defined.') + + try: + with open(file_djvu): + self.file_djvu = file_djvu + except IOError: + # Does not exist OR no read permissions. + raise + + def number_of_images(self): + """Return the (cached) number of images in the djvu file.""" + if not hasattr(self, '_n'): + dp = subprocess.Popen(['djvused', '-e', "n", self.file_djvu], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + (stdoutdata, stderrdata) = dp.communicate() + if dp.returncode != 0: + pywikibot.error('djvulibre library error!\n%s' % stderrdata) + self._n = int(stdoutdata) + return self._n + + def has_text(self): + """Test if the djvu file has a text-layer.""" + if not hasattr(self, '_has_text'): + dp = subprocess.Popen(['djvudump', self.file_djvu], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + (stdoutdata, stderrdata) = dp.communicate() + #import pdb; pdb.set_trace() + if dp.returncode != 0: + pywikibot.error('djvulibre library error!\n%s' % stderrdata) + txt = stdoutdata.decode('utf-8') + self._has_text = 'TXTz' in txt + return self._has_text + + def get_page(self, n): + """Get page n for djvu file.""" + if not self.has_text(): + raise ValueError('Djvu file %s has no text layer.' % self.file_djvu) + if n > self.number_of_images(): + raise ValueError('Number of pages [%d] in %s is less than %d' + % (self._n, self.file_djvu, n)) + dp = subprocess.Popen(['djvutxt', '--page=%d' % n, self.file_djvu], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + (stdoutdata, stderrdata) = dp.communicate() + if dp.returncode != 0: + pywikibot.error('djvulibre library error!\n%s' % stderrdata) + #import pdb; pdb.set_trace() + + # http://djvu.sourceforge.net/doc/man/djvused.html for control chars. + txt = stdoutdata.decode('utf-8') + txt = txt.replace('\x0b', '') # vertical tab (\013=\x0b): remove + txt = txt.replace('\x1d', '\n') # group (\035=\x1d) separator: replace with \n + txt = txt.replace('\x1f', '\n') # unit separator (\037=\x1f): replace with \n + txt = txt.strip('\x0c\n ') # feed char (\f=\x0c), \n and trailing spaces: strip + pywikibot.output(txt) + return txt diff --git a/scripts/djvutext.py b/scripts/djvutext.py new file mode 100644 index 0000000..92ab638 --- /dev/null +++ b/scripts/djvutext.py @@ -0,0 +1,179 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +""" +This bot uploads text from djvu files onto pages in the "Page" namespace. + +It is intended to be used for Wikisource. + +The following parameters are supported: + + -djvu:... filename of the djvu file + -index:... name of the index page + optional, default: the djvu filename). + -pages:<start>-<end> Page range to upload; + optional, start=1, end=djvu file number of images + -force overwrites existing text + optional, default False + -summary: custom edit summary. + Use quotes if edit summary contains spaces. + -always don't bother asking to confirm any of the changes. + +python scripts/djvutext.py -djvu:Nietzsche_the_thinker.djvu -pages:2-5 +""" +# +# (C) Pywikibot team, 2008-2015 +# +# Distributed under the terms of the MIT license. +# +from __future__ import unicode_literals + +__version__ = '$Id$' +# + +import os.path + +import pywikibot +from pywikibot import i18n, Bot +from pywikibot.tools.djvu import DjVu + +#TODO: port to ProofeadPage, once ProofeadPage is merged. + + +class DjVuTextBot(Bot): + + """ + A bot that uploads text-layer from djvu files to Page:namespace. + + Works only on wikisource. + """ + + def __init__(self, djvu, index, pages=None, **kwargs): + """ + Constructor. + + @param djvu: djvu from where to fetch the text layer + @type djvu: DjVu object + @param index: index page in the Index: namespace + @type index: Page object + @param pages: page interval to upload (start, end) + @type index: tuple + """ + self.availableOptions.update({ + 'force': False, + 'summary': None + }) + super(DjVuTextBot, self).__init__(**kwargs) + self.djvu = djvu + self.index = index + self.prefix = self.index.title(withNamespace=False) + + if not pages: + self.pages = (1, self.djvu.number_of_images()) + else: + self.pages = pages + + self.generator = self.gen() + + # get edit summary message if it's empty + if not self.getOption('summary'): + params = {} + self.options['summary'] = i18n.twntranslate( + self.index.site, 'djvutext-creating', params) + + def gen(self): + """Generate pages from specified page interval.""" + start, end = self.pages + for n in range(start, end + 1): + title = '{prefix}/{number}'.format(prefix=self.prefix, number=n) + page = pywikibot.Page(self.index.site, title, ns=self.index.site.proofread_page_ns) + page.n = n # remeber page number in djvu file + yield page + + def treat(self, page): + """Process one page.""" + new_text = self.djvu.get_page(page.n) + + if page.exists(): + old_text = page.text + else: + old_text = '' + + summary = self.getOption('summary') + force = self.getOption('force') + + self.userPut(page, old_text, new_text, + summary=summary, minor=True, botflag=True, force=force) + + +def main(*args): + """ + Process command line arguments and invoke bot. + + If args is an empty list, sys.argv is used. + + @param args: command line arguments + @type args: list of unicode + """ + file_djvu = None + index = None + pages = '1' + options = {} + + # Parse command line arguments + local_args = pywikibot.handle_args(args) + for arg in local_args: + if arg == '-force': + options['force'] = True + elif arg.startswith('-djvu:'): + file_djvu = arg[6:] + elif arg.startswith('-index:'): + index = arg[7:] + elif arg.startswith('-pages:'): + pages = arg[7:] + elif arg == '-always': + options['always'] = True + elif arg.startswith('-summary:'): + options['summary'] = arg[len('-summary:'):] + else: + pywikibot.output('Unknown argument %s' % arg) + + if not (file_djvu or index): + pywikibot.showHelp() + return + + # Check the djvu file exists and, if so, create the DjVu wrapper. + djvu = DjVu(file_djvu) + + if not djvu.has_text(): + raise ValueError('No text layer in djvu file %s' % djvu.file_djvu) + + # parse pages param + start, sep, end = pages.partition('-') + pages = (int(start), + int(end) if (sep and end) else djvu.number_of_images()) + + if not index: + index = os.path.basename(file_djvu) + + site = pywikibot.Site() + + if site.family != 'wikisource': + raise pywikibot.PageNotFound('Found family %s; Wikisource required.' + % site.family) + + index_page = pywikibot.Page(site, index, ns=site.proofread_index_ns) + + if not index_page.exists(): + raise pywikibot.NoPage('Page %s does not exist' % index) + + pywikibot.output('uploading text from %s to %s' + % (djvu.file_djvu, index_page.title(asLink=True))) + + bot = DjVuTextBot(djvu, index_page, pages, **options) + bot.run() + +if __name__ == "__main__": + try: + main() + except Exception: + pywikibot.error('Fatal error:', exc_info=True) diff --git a/tests/data/djvu/myfile.djvu b/tests/data/djvu/myfile.djvu new file mode 100755 index 0000000..eedbbcb --- /dev/null +++ b/tests/data/djvu/myfile.djvu Binary files differ diff --git a/tests/data/djvu/myfile_wo_text.djvu b/tests/data/djvu/myfile_wo_text.djvu new file mode 100644 index 0000000..a40d16b --- /dev/null +++ b/tests/data/djvu/myfile_wo_text.djvu Binary files differ diff --git a/tests/data/myfile_wo_text.djvu b/tests/data/myfile_wo_text.djvu new file mode 100644 index 0000000..a40d16b --- /dev/null +++ b/tests/data/myfile_wo_text.djvu Binary files differ diff --git a/tests/djvu_tests.py b/tests/djvu_tests.py new file mode 100644 index 0000000..365d2b0 --- /dev/null +++ b/tests/djvu_tests.py @@ -0,0 +1,64 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +"""Unit tests for djvutext.py script.""" +from __future__ import unicode_literals + +import os +from tests import _data_dir +from tests.aspects import unittest +from pywikibot.tools.djvu import DjVu + +# python -m unittest tests.djvu_tests.TestDjVu + +_djvu_dir = 'djvu' + + +class TestDjVu(unittest.TestCase): + + """Test DjVu class.""" + + file_djvu_not_existing = os.path.join(_data_dir, _djvu_dir, 'not_existing.djvu') + file_djvu = os.path.join(_data_dir, _djvu_dir, 'myfile.djvu') + file_djvu_wo_text = os.path.join(_data_dir, _djvu_dir, 'myfile_wo_text.djvu') + test_txt = 'A file with non-ASCII characters, \nlike é or ç' + + def test_file_existance(self): + """Test file existance checks.""" + djvu = DjVu(self.file_djvu) + self.assertEqual(self.file_djvu, djvu.file_djvu) + self.assertRaises(IOError, lambda: DjVu(self.file_djvu_not_existing)) + self.assertRaises(AttributeError, lambda: DjVu(None)) + + def test_number_of_images(self): + """Test page number generator.""" + djvu = DjVu(self.file_djvu) + self.assertEqual(djvu.number_of_images(), 4) + + def test_has_text(self): + """Test if djvu file contains text.""" + djvu = DjVu(self.file_djvu) + self.assertTrue(djvu.has_text()) + djvu = DjVu(self.file_djvu_wo_text) + self.assertFalse(djvu.has_text()) + + def test_get_existing_page_number(self): + """Test if djvu file contains text.""" + djvu = DjVu(self.file_djvu) + self.assertTrue(djvu.has_text()) + txt = djvu.get_page(1) + self.assertEqual(txt, self.test_txt) + + def test_get_not_existing_page_number(self): + """Test if djvu file contains text.""" + djvu = DjVu(self.file_djvu) + self.assertTrue(djvu.has_text()) + self.assertRaises(ValueError, lambda: djvu.get_page(100)) + + def test_get_not_existing_page(self): + """Test if djvu file contains text.""" + djvu = DjVu(self.file_djvu_wo_text) + self.assertFalse(djvu.has_text()) + self.assertRaises(ValueError, lambda: djvu.get_page(100)) + +if __name__ == "__main__": + unittest.main() -- To view, visit https://gerrit.wikimedia.org/r/210808 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I88ba445fd49046430dfcb78d5b8a0ab46e2343fb Gerrit-PatchSet: 1 Gerrit-Project: pywikibot/core Gerrit-Branch: master Gerrit-Owner: Mpaa <mpaa.w...@gmail.com> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits