Bug#479178: python-chardet: Add command-line program for detecting character encoding of files

Ben Finney Sat, 03 May 2008 18:41:58 -0700

On 03-May-2008, Piotr Ożarowski wrote:
> [Ben Finney, 2008-05-03 15:22]
> > I wanted a command-line tool using 'python-chardet' to report on the 
> > detected character encoding of arbitrary specified files. So I wrote 
> > one.
> 
> I didn't add such tool to python-chardet package, because /usr/bin/enca
> from enca package does all this and even more.


Perhaps, but it doesn't use 'python-chardet' to do so, leaving Debian 
currently without a command that uses this library.

> > The patch 'chardet-1.0.1_unit-test-scaffold.patch' adds unit test 
> > support modules that I used to develop the program, and may be 
> > useful for adding more unit tests in future.
> 
> you forgot to attach the patch :-)

Argh. I didn't forget; the patches were eaten by bug #211808 of 
'reportbug' <URL:http://bugs.debian.org/211808>. Thanks for letting me 
know.

> please attach it and we'll consider adding it to the package

Both patches discussed in the initial bug report are now attached to 
this message.

-- 
 \       "I don't care to belong to a club that accepts people like me |
  `\                                     as members."  -- Groucho Marx |
_o__)                                                                  |
Ben Finney <[EMAIL PROTECTED]>

=== added directory 'test'
=== added file 'test/minimock.py'
--- test/minimock.py	1970-01-01 00:00:00 +0000
+++ test/minimock.py	2008-04-30 03:30:45 +0000
@@ -0,0 +1,279 @@
+# (c) 2006 Ian Bicking, Mike Beachy, and contributors
+# Licensed under the MIT license: http://www.opensource.org/licenses/mit-license.php
+r"""
+minimock is a simple library for doing Mock objects with doctest.
+When using doctest, mock objects can be very simple.
+
+Here's an example of something we might test, a simple email sender::
+
+    >>> import smtplib
+    >>> def send_email(from_addr, to_addr, subject, body):
+    ...     conn = smtplib.SMTP('localhost')
+    ...     msg = 'To: %s\nFrom: %s\nSubject: %s\n\n%s' % (
+    ...         to_addr, from_addr, subject, body)
+    ...     conn.sendmail(from_addr, [to_addr], msg)
+    ...     conn.quit()
+
+Now we want to make a mock ``smtplib.SMTP`` object.  We'll have to
+inject our mock into the ``smtplib`` module::
+
+    >>> smtplib.SMTP = Mock('smtplib.SMTP')
+    >>> smtplib.SMTP.mock_returns = Mock('smtp_connection')
+
+Now we do the test::
+
+    >>> send_email('[EMAIL PROTECTED]', '[EMAIL PROTECTED]',
+    ...            'Hi there!', 'How is it going?')
+    Called smtplib.SMTP('localhost')
+    Called smtp_connection.sendmail(
+        '[EMAIL PROTECTED]',
+        ['[EMAIL PROTECTED]'],
+        'To: [EMAIL PROTECTED]\nFrom: [EMAIL PROTECTED]\nSubject: Hi there!\n\nHow is it going?')
+    Called smtp_connection.quit()
+
+Voila!  We've tested implicitly that no unexpected methods were called
+on the object.  We've also tested the arguments that the mock object
+got.  We've provided fake return calls (for the ``smtplib.SMTP()``
+constructor).  These are all the core parts of a mock library.  The
+implementation is simple because most of the work is done by doctest.
+"""
+
+__all__ = ["mock", "restore", "Mock"]
+
+import sys
+import inspect
+
+# A list of mocked objects. Each item is a tuple of (original object,
+# namespace dict, object name, and a list of object attributes).
+#
+mocked = []
+
+def lookup_by_name(name, nsdicts):
+    """
+    Look up an object by name from a sequence of namespace dictionaries.
+    Returns a tuple of (nsdict, name, attributes); nsdict is the
+    dictionary the name was found in, name is the key the base object is
+    bound to in that dictionary, and the attributes list is the chain of
+    attributes of that object that complete the name.
+
+        >>> import os
+        >>> nsdict, name, attributes = lookup_by_name("os.path.isdir",
+        ...     (locals(),))
+        >>> name, attributes
+        ('os', ['path', 'isdir'])
+        >>> nsdict, name, attributes = lookup_by_name("os.monkey", (locals(),))
+        Traceback (most recent call last):
+          ...
+        NameError: name 'os.monkey' is not defined
+
+    """
+    for nsdict in nsdicts:
+        attrs = name.split(".")
+        names = []
+
+        while attrs:
+            names.append(attrs.pop(0))
+            obj_name = ".".join(names)
+
+            if obj_name in nsdict:
+                attr_copy = attrs[:]
+                tmp = nsdict[obj_name]
+                try:
+                    while attr_copy:
+                        tmp = getattr(tmp, attr_copy.pop(0))
+                except AttributeError:
+                    pass
+                else:
+                    return nsdict, obj_name, attrs
+
+    raise NameError("name '%s' is not defined" % name)
+
+def mock(name, nsdicts=None, mock_obj=None, **kw):
+    """
+    Mock the named object, placing a Mock instance in the correct namespace
+    dictionary. If no iterable of namespace dicts is provided, use
+    introspection to get the locals and globals of the caller of this
+    function.
+
+    All additional keyword args are passed on to the Mock object
+    initializer.
+
+    An example of how os.path.isfile is replaced:
+
+        >>> import os
+        >>> os.path.isfile
+        <function isfile at ...>
+        >>> isfile_id = id(os.path.isfile)
+        >>> mock("os.path.isfile", returns=True)
+        >>> os.path.isfile
+        <Mock ... os.path.isfile>
+        >>> os.path.isfile("/foo/bar/baz")
+        Called os.path.isfile('/foo/bar/baz')
+        True
+        >>> mock_id = id(os.path.isfile)
+        >>> mock_id != isfile_id
+        True
+
+    A second mock object will replace the first, but the original object
+    will be the one restored by the restore() function.
+
+        >>> mock("os.path.isfile", returns=False)
+        >>> mock_id != id(os.path.isfile)
+        True
+        >>> restore()
+        >>> os.path.isfile
+        <function isfile at ...>
+        >>> isfile_id == id(os.path.isfile)
+        True
+
+    """
+    if nsdicts is None:
+        stack = inspect.stack()
+        try:
+            # stack[1][0] is the frame object of the caller to this function
+            globals_ = stack[1][0].f_globals
+            locals_ = stack[1][0].f_locals
+            nsdicts = (locals_, globals_)
+        finally:
+            del(stack)
+
+    if mock_obj is None:
+        mock_obj = Mock(name, **kw)
+
+    nsdict, obj_name, attrs = lookup_by_name(name, nsdicts)
+
+    # Get the original object and replace it with the mock object.
+    tmp = nsdict[obj_name]
+    if not attrs:
+        original = tmp
+        nsdict[obj_name] = mock_obj
+    else:
+        for attr in attrs[:-1]:
+            tmp = getattr(tmp, attr)
+        original = getattr(tmp, attrs[-1])
+        setattr(tmp, attrs[-1], mock_obj)
+
+    mocked.append((original, nsdict, obj_name, attrs))
+
+def restore():
+    """
+    Restore all mocked objects.
+
+    """
+    global mocked
+
+    # Restore the objects in the reverse order of their mocking to ensure
+    # the original state is recovered.
+    while mocked:
+        original, nsdict, name, attrs = mocked.pop()
+        if not attrs:
+            nsdict[name] = original
+        else:
+            tmp = nsdict[name]
+            for attr in attrs[:-1]:
+                tmp = getattr(tmp, attr)
+            setattr(tmp, attrs[-1], original)
+    return
+
+class Mock(object):
+
+    def __init__(self, name, returns=None, returns_iter=None,
+                returns_func=None, raises=None,
+                outfile=None):
+        self.mock_name = name
+        self.mock_returns = returns
+        if returns_iter is not None:
+            returns_iter = iter(returns_iter)
+        self.mock_returns_iter = returns_iter
+        self.mock_returns_func = returns_func
+        self.mock_raises = raises
+        if outfile is None:
+            outfile = sys.stdout
+        self.mock_outfile = outfile
+        self.mock_attrs = {}
+
+    def __repr__(self):
+        return '<Mock %s %s>' % (hex(id(self)), self.mock_name)
+
+    def __call__(self, *args, **kw):
+        parts = [repr(a) for a in args]
+        parts.extend(
+            '%s=%r' % (items) for items in sorted(kw.items()))
+        msg = 'Called %s(%s)' % (self.mock_name, ', '.join(parts))
+        if len(msg) > 80:
+            msg = 'Called %s(\n    %s)' % (
+                self.mock_name, ',\n    '.join(parts))
+        self.mock_outfile.write("%s\n" % msg)
+        return self._mock_return(*args, **kw)
+
+    def _mock_return(self, *args, **kw):
+        if self.mock_raises is not None:
+            raise self.mock_raises
+        elif self.mock_returns is not None:
+            return self.mock_returns
+        elif self.mock_returns_iter is not None:
+            try:
+                return self.mock_returns_iter.next()
+            except StopIteration:
+                raise Exception("No more mock return values are present.")
+        elif self.mock_returns_func is not None:
+            return self.mock_returns_func(*args, **kw)
+        else:
+            return None
+
+    def __getattr__(self, attr):
+        if attr not in self.mock_attrs:
+            if self.mock_name:
+                new_name = self.mock_name + '.' + attr
+            else:
+                new_name = attr
+            self.mock_attrs[attr] = Mock(new_name,
+                outfile=self.mock_outfile)
+        return self.mock_attrs[attr]
+
+__test__ = {
+    "mock" :
+    r"""
+    An additional test for mocking a function accessed directly (i.e.
+    not via object attributes).
+
+    >>> import os
+    >>> rename = os.rename
+    >>> orig_id = id(rename)
+    >>> mock("rename")
+    >>> mock_id = id(rename)
+    >>> mock("rename")
+    >>> mock_id != id(rename)
+    True
+    >>> restore()
+    >>> orig_id == id(rename) == id(os.rename)
+    True
+
+    The example from the module docstring, done with the mock/restore
+    functions.
+
+    >>> import smtplib
+    >>> def send_email(from_addr, to_addr, subject, body):
+    ...     conn = smtplib.SMTP('localhost')
+    ...     msg = 'To: %s\nFrom: %s\nSubject: %s\n\n%s' % (
+    ...         to_addr, from_addr, subject, body)
+    ...     conn.sendmail(from_addr, [to_addr], msg)
+    ...     conn.quit()
+
+    >>> mock("smtplib.SMTP", returns=Mock('smtp_connection'))
+    >>> send_email('[EMAIL PROTECTED]', '[EMAIL PROTECTED]',
+    ...            'Hi there!', 'How is it going?')
+    Called smtplib.SMTP('localhost')
+    Called smtp_connection.sendmail(
+        '[EMAIL PROTECTED]',
+        ['[EMAIL PROTECTED]'],
+        'To: [EMAIL PROTECTED]\nFrom: [EMAIL PROTECTED]\nSubject: Hi there!\n\nHow is it going?')
+    Called smtp_connection.quit()
+    >>> restore()
+
+    """,
+}
+
+if __name__ == '__main__':
+    import doctest
+    doctest.testmod(optionflags=doctest.ELLIPSIS)

=== added file 'test/scaffold.py'
--- test/scaffold.py	1970-01-01 00:00:00 +0000
+++ test/scaffold.py	2008-04-30 03:47:46 +0000
@@ -0,0 +1,368 @@
+# -*- coding: utf-8 -*-
+
+# scaffold.py
+#
+# Copyright © 2007-2008 Ben Finney <[EMAIL PROTECTED]>
+# This is free software; you may copy, modify and/or distribute this work
+# under the terms of the GNU General Public License, version 2 or later.
+# No warranty expressed or implied. See the file LICENSE for details.
+
+""" Scaffolding for unit test modules
+"""
+
+import unittest
+import doctest
+import logging
+import os
+import sys
+import textwrap
+from StringIO import StringIO
+from minimock import (
+    Mock,
+    mock,
+    restore as mock_restore,
+    )
+
+test_dir = os.path.dirname(os.path.abspath(__file__))
+parent_dir = os.path.dirname(test_dir)
+if not test_dir in sys.path:
+    sys.path.insert(1, test_dir)
+if not parent_dir in sys.path:
+    sys.path.insert(1, parent_dir)
+bin_dir = os.path.join(parent_dir, "bin")
+
+# Disable all but the most critical logging messages
+logging.disable(logging.CRITICAL)
+
+
+def suite(module_name):
+    """ Create the test suite for named module """
+    from sys import modules
+    loader = unittest.TestLoader()
+    suite = loader.loadTestsFromModule(modules[module_name])
+    return suite
+
+def unittest_main(argv=None):
+    """ Mainline function for each unit test module """
+
+    from sys import argv as sys_argv
+    if not argv:
+        argv = sys_argv
+
+    exitcode = None
+    try:
+        unittest.main(argv=argv, defaultTest='suite')
+    except SystemExit, e:
+        exitcode = e.code
+
+    return exitcode
+
+
+def make_module_from_file(module_name, file_name):
+    """ Make a new module object from the code in specified file """
+
+    from types import ModuleType
+    module = ModuleType(module_name)
+
+    module_file = open(file_name, 'r')
+    exec module_file in module.__dict__
+
+    return module
+
+
+class TestCase(unittest.TestCase):
+    """ Test case behaviour """
+
+    def failUnlessRaises(self, exc_class, func, *args, **kwargs):
+        """ Fail unless the function call raises the expected exception
+
+            Fail the test if an instance of the exception class
+            ``exc_class`` is not raised when calling ``func`` with the
+            arguments ``*args`` and ``**kwargs``.
+
+            """
+
+        try:
+            super(TestCase, self).failUnlessRaises(
+                exc_class, func, *args, **kwargs)
+        except self.failureException:
+            exc_class_name = exc_class.__name__
+            msg = (
+                "Exception %(exc_class_name)s not raised"
+                " for function call:"
+                " func=%(func)r args=%(args)r kwargs=%(kwargs)r"
+                ) % vars()
+            raise self.failureException(msg)
+
+
+    def failIfIs(self, first, second, msg=None):
+        """ Fail if the two objects are identical
+
+            Fail the test if ``first`` and ``second`` are identical,
+            as determined by the ``is`` operator.
+
+            """
+
+        if first is second:
+            if msg is None:
+                msg = "%(first)r is %(second)r" % vars()
+            raise self.failureException(msg)
+
+    def failUnlessIs(self, first, second, msg=None):
+        """ Fail unless the two objects are identical
+
+            Fail the test unless ``first`` and ``second`` are
+            identical, as determined by the ``is`` operator.
+
+            """
+
+        if first is not second:
+            if msg is None:
+                msg = "%(first)r is not %(second)r" % vars()
+            raise self.failureException(msg)
+
+    assertIs = failUnlessIs
+    assertNotIs = failIfIs
+
+    def failIfIn(self, first, second, msg=None):
+        """ Fail if the second object is in the first
+
+            Fail the test if ``first`` contains ``second``, as
+            determined by the ``in`` operator.
+
+            """
+
+        if second in first:
+            if msg is None:
+                msg = "%(second)r is in %(first)r" % vars()
+            raise self.failureException(msg)
+
+    def failUnlessIn(self, first, second, msg=None):
+        """ Fail unless the second object is in the first
+
+            Fail the test unless ``first`` contains ``second``, as
+            determined by the ``in`` operator.
+
+            """
+
+        if second not in first:
+            if msg is None:
+                msg = "%(second)r is not in %(first)r" % vars()
+            raise self.failureException(msg)
+
+    assertIn = failUnlessIn
+    assertNotIn = failIfIn
+
+    def failUnlessOutputCheckerMatch(self, want, got, msg=None):
+        """ Fail unless the specified string matches the expected
+
+            Fail the test unless ``want`` matches ``got``, as
+            determined by a ``doctest.OutputChecker`` instance. This
+            is not an equality check, but a pattern match according to
+            the OutputChecker rules.
+
+            """
+
+        checker = doctest.OutputChecker()
+        want = textwrap.dedent(want)
+        got = textwrap.dedent(got)
+        if not checker.check_output(want, got, doctest.ELLIPSIS):
+            if msg is None:
+                msg = ("Expected %(want)r, got %(got)r:"
+                       "\n--- want: ---\n%(want)s"
+                       "\n--- got: ---\n%(got)s") % vars()
+            raise self.failureException(msg)
+
+    assertOutputCheckerMatch = failUnlessOutputCheckerMatch
+
+    def failIfIsInstance(self, obj, classes):
+        """ Fail if the object is an instance of the specified classes
+
+            Fail the test if the object ``obj`` is an instance of any
+            of ``classes``.
+
+            """
+
+        if isinstance(obj, classes):
+            msg = "%(obj)r is an instance of one of %(classes)r" % vars()
+            raise self.failureException(msg)
+
+    def failUnlessIsInstance(self, obj, classes):
+        """ Fail unless the object is an instance of the specified classes
+
+            Fail the test unless the object ``obj`` is an instance of
+            any of ``classes``.
+
+            """
+
+        if not isinstance(obj, classes):
+            msg = "%(obj)r is not an instance of any of %(classes)r" % vars()
+            raise self.failureException(msg)
+
+    assertIsInstance = failUnlessIsInstance
+    assertNotIsInstance = failIfIsInstance
+
+    def failUnlessFunctionInTraceback(self, traceback, function):
+        """ Fail if the function is not in the traceback
+
+            Fail the test if the function ``function`` is not at any
+            of the levels in the traceback object ``traceback``.
+
+            """
+
+        func_in_traceback = False
+        expect_code = function.func_code
+        current_traceback = traceback
+        while current_traceback is not None:
+            if expect_code is current_traceback.tb_frame.f_code:
+                func_in_traceback = True
+                break
+            current_traceback = current_traceback.tb_next
+
+        if not func_in_traceback:
+            msg = ("Traceback did not lead to original function"
+                " %(function)s"
+                ) % vars()
+            raise self.failureException(msg)
+
+    assertFunctionInTraceback = failUnlessFunctionInTraceback
+
+
+class Test_Exception(TestCase):
+    """ Test cases for exception classes """
+
+    def __init__(self, *args, **kwargs):
+        """ Set up a new instance """
+        self.valid_exceptions = NotImplemented
+        super(Test_Exception, self).__init__(*args, **kwargs)
+
+    def setUp(self):
+        """ Set up test fixtures """
+        for exc_type, params in self.valid_exceptions.items():
+            args = (None,) * params['min_args']
+            params['args'] = args
+            instance = exc_type(*args)
+            params['instance'] = instance
+
+        self.iterate_params = make_params_iterator(
+            default_params_dict = self.valid_exceptions
+            )
+
+        super(Test_Exception, self).setUp()
+
+    def test_exception_instance(self):
+        """ Exception instance should be created """
+        for key, params in self.iterate_params():
+            instance = params['instance']
+            self.failIfIs(None, instance)
+
+    def test_exception_types(self):
+        """ Exception instances should match expected types """
+        for key, params in self.iterate_params():
+            instance = params['instance']
+            for match_type in params['types']:
+                match_type_name = match_type.__name__
+                fail_msg = (
+                    "%(instance)r is not an instance of"
+                    " %(match_type_name)s"
+                    ) % vars()
+                self.failUnless(
+                    isinstance(instance, match_type),
+                    msg=fail_msg)
+
+
+class Test_ProgramMain(TestCase):
+    """ Test cases for program __main__ function
+
+        Tests a module-level function named __main__ with behaviour
+        inspired by Guido van Rossum's post "Python main() functions"
+        <URL:http://www.artima.com/weblogs/viewpost.jsp?thread=4829>.
+
+        It expects:
+          * the program module has a __main__ function, that:
+              * accepts an 'argv' argument, defaulting to sys.argv
+              * instantiates a program application class
+              * calls the application's main() method, passing argv
+              * catches SystemExit and returns the error code
+          * the application behaviour is defined in a class, that:
+              * has an __init__() method accepting an 'argv' argument as
+                the commandline argument list to parse
+              * has a main() method responsible for running the program,
+                and returning on successful program completion
+              * raises SystemExit when an abnormal exit is required
+        """
+
+    def __init__(self, *args, **kwargs):
+        """ Set up a new instance """
+        self.program_module = NotImplemented
+        self.application_class = NotImplemented
+        super(Test_ProgramMain, self).__init__(*args, **kwargs)
+
+    def setUp(self):
+        """ Set up test fixtures """
+        self.mock_outfile = StringIO()
+
+        self.app_class_name = self.application_class.__name__
+        self.mock_app = Mock("test_app", outfile=self.mock_outfile)
+        self.mock_app_class = Mock(self.app_class_name,
+            outfile=self.mock_outfile)
+        self.mock_app_class.mock_returns = self.mock_app
+        mock(self.app_class_name, mock_obj=self.mock_app_class,
+            nsdicts=[self.program_module.__dict__])
+
+        super(Test_ProgramMain, self).setUp()
+
+    def tearDown(self):
+        """ Tear down test fixtures """
+        mock_restore()
+        super(Test_ProgramMain, self).tearDown()
+
+    def test_main_should_instantiate_app(self):
+        """ __main__() should instantiate application class """
+        app_class_name = self.app_class_name
+        argv = ["foo", "bar"]
+        expect_mock_output = """\
+            Called %(app_class_name)s(%(argv)r)...
+            """ % vars()
+        self.program_module.__main__(argv)
+        self.failUnlessOutputCheckerMatch(
+            expect_mock_output, self.mock_outfile.getvalue())
+
+    def test_main_should_call_app_main(self):
+        """ __main__() should call the application main method """
+        argv = ["foo", "bar"]
+        app_class_name = self.app_class_name
+        expect_mock_output = """\
+            Called %(app_class_name)s(%(argv)r)
+            Called test_app.main()
+            """ % vars()
+        self.program_module.__main__(argv)
+        self.failUnlessOutputCheckerMatch(
+            expect_mock_output, self.mock_outfile.getvalue())
+
+    def test_main_no_argv_should_supply_sys_argv(self):
+        """ __main__() with no argv should supply sys.argv to application """
+        sys_argv_test = ["foo", "bar"]
+        mock("sys.argv", mock_obj=sys_argv_test)
+        app_class_name = self.app_class_name
+        expect_mock_output = """\
+            Called %(app_class_name)s(%(sys_argv_test)r)
+            Called test_app.main()
+            """ % vars()
+        self.program_module.__main__()
+        self.failUnlessOutputCheckerMatch(
+            expect_mock_output, self.mock_outfile.getvalue())
+
+    def test_main_should_return_none_on_success(self):
+        """ __main__() should return None when no SystemExit raised """
+        expect_exit_code = None
+        exit_code = self.program_module.__main__()
+        self.failUnlessEqual(expect_exit_code, exit_code)
+
+    def test_main_should_return_exit_code_on_system_exit(self):
+        """ __main__() should return application SystemExit code """
+        expect_exit_code = object()
+        self.mock_app.main.mock_raises = SystemExit(expect_exit_code)
+        exit_code = self.program_module.__main__()
+        self.failUnlessEqual(expect_exit_code, exit_code)
+

=== added file 'test/suite.py'
--- test/suite.py	1970-01-01 00:00:00 +0000
+++ test/suite.py	2008-04-30 03:08:08 +0000
@@ -0,0 +1,59 @@
+# -*- coding:utf-8; -*-
+
+# test/suite.py
+# Part of chardet, the Universal Encoding Detector.
+#
+# Copyright © 2008 Ben Finney <[EMAIL PROTECTED]>
+# This is free software; you may copy, modify and/or distribute this work
+# under the terms of the GNU General Public License, version 2 or later.
+# No warranty expressed or implied. See the file COPYING for details.
+
+""" Unit test suite for chardet
+"""
+
+import unittest
+import sys
+import os
+
+
+def get_python_modules(file_list, file_suffix = '.py'):
+    """ Return a list of module names from a filename list """
+    python_modules = [m[:m.rfind(file_suffix)] for m in file_list
+        if m.endswith(file_suffix)]
+    return python_modules
+
+def get_test_modules(module_list, module_prefix = 'test_'):
+    """ Return the list of modules that are named as test modules """
+    test_modules = [m for m in module_list
+        if m.startswith(module_prefix)]
+    return test_modules
+
+
+def suite():
+    """ Create the test suite for this module """
+    loader = unittest.TestLoader()
+    test_dir = os.path.dirname(__file__)
+    python_modules = get_python_modules(os.listdir(test_dir))
+    module_list = get_test_modules(python_modules)
+    suite = loader.loadTestsFromNames(module_list)
+
+    return suite
+
+
+def __main__(argv=None):
+    """ Mainline function for this module """
+    from sys import argv as sys_argv
+    if not argv:
+        argv = sys_argv
+
+    exitcode = None
+    try:
+        unittest.main(argv=argv, defaultTest='suite')
+    except SystemExit, e:
+        exitcode = e.code
+
+    return exitcode
+
+if __name__ == '__main__':
+    exitcode = __main__(sys.argv)
+    sys.exit(exitcode)

=== added directory 'bin'
=== added file 'bin/detect-encoding'
--- bin/detect-encoding	1970-01-01 00:00:00 +0000
+++ bin/detect-encoding	2008-05-03 05:13:33 +0000
@@ -0,0 +1,159 @@
+#! /usr/bin/python
+# -*- coding: utf-8 -*-
+
+# bin/detect-encoding
+# Part of chardet, the Universal Encoding Detector.
+#
+# Copyright © 2008 Ben Finney <[EMAIL PROTECTED]>
+# This is free software; you may copy, modify and/or distribute this work
+# under the terms of the GNU General Public License, version 2 or later.
+# No warranty expressed or implied. See the file COPYING for details.
+
+""" %prog [options] [file ...]
+
+Report heuristically-detected character encoding for each file.
+
+For every specified file (defaulting to stdin if no files are
+specified), reads and determines the character encoding of the file
+content. Reports the name and confidence level for each file's
+detected character encoding.
+"""
+
+import sys
+import optparse
+import chardet
+
+
+class OptionParser(optparse.OptionParser, object):
+    """ Command-line parser for this program """
+
+    def __init__(self, *args, **kwargs):
+        """ Set up a new instance """
+        super(OptionParser, self).__init__(*args, **kwargs)
+
+        global __doc__
+        self.usage = __doc__.strip()
+
+
+def detect_encoding(in_file):
+    """ Detect encoding of text in `in_file`
+
+        Parameters
+          in_file
+            Opened file object to read and examine.
+
+        Return value
+            The mapping as returned by `chardet.detect`.
+
+        """
+    in_data = in_file.read()
+    params = chardet.detect(in_data)
+    return params
+
+
+def report_file_encoding(in_file, encoding_params):
+    """ Return a report of the file's encoding
+
+        Parameters
+          in_file
+            File object being reported. Should have an appropriate
+            `name` attribute.
+
+          encoding_params
+            Mapping as returned by `detect_encoding` on the file's
+            data.
+
+        Return value
+            The report is a single line of text showing filename,
+            detected encoding, and detection confidence.
+
+        """
+    file_name = in_file.name
+    encoding_name = encoding_params['encoding']
+    confidence = encoding_params['confidence']
+    report = (
+        "%(file_name)s: %(encoding_name)s"
+        " (confidence: %(confidence)0.2f)"
+        ) % vars()
+    return report
+
+
+def process_file(in_file):
+    """ Process a single file
+
+        Parameters
+          in_file
+            Opened file object to read and examine.
+
+        Return value
+            None.
+
+        Reads the file contents, detects the encoding, and writes a
+        report line to stdout.
+        """
+    encoding_params = detect_encoding(in_file)
+    encoding_report = report_file_encoding(in_file, encoding_params)
+    message = "%(encoding_report)s\n" % vars()
+    sys.stdout.write(message)
+
+
+class DetectEncodingApp(object):
+    """ Application behaviour for 'detect-encoding' program """
+
+    def __init__(self, argv):
+        """ Set up a new instance """
+        self._parse_commandline(argv)
+
+    def _parse_commandline(self, argv):
+        """ Parse command-line arguments """
+        option_parser = OptionParser()
+        (options, args) = option_parser.parse_args(argv[1:])
+        self.file_names = args
+
+    def _emit_file_error(self, file_name, error):
+        """ Emit an error message regarding file processing """
+        error_name = error.__class__.__name__
+        message = (
+            "%(file_name)s: %(error_name)s: %(error)s\n"
+            ) % vars()
+        sys.stderr.write(message)
+
+    def _process_all_files(self, file_names):
+        """ Process all files in list """
+        if not len(file_names):
+            file_names = [None]
+        for file_name in file_names:
+            try:
+                if file_name is None:
+                    file_name = sys.stdin.name
+                    in_file = sys.stdin
+                else:
+                    in_file = open(file_name)
+                process_file(in_file)
+            except IOError, exc:
+                self._emit_file_error(file_name, exc)
+
+    def main(self):
+        """ Main entry point for application """
+        self._process_all_files(self.file_names)
+
+
+def __main__(argv=None):
+    """ Mainline code for this program """
+
+    from sys import argv as sys_argv
+    if argv is None:
+        argv = sys_argv
+
+    app = DetectEncodingApp(argv)
+    exitcode = None
+    try:
+        app.main()
+    except SystemExit, e:
+        exitcode = e.code
+
+    return exitcode
+
+if __name__ == "__main__":
+    exitcode = __main__(argv=sys.argv)
+    sys.exit(exitcode)

=== added file 'test/test_detect_encoding.py'
--- test/test_detect_encoding.py	1970-01-01 00:00:00 +0000
+++ test/test_detect_encoding.py	2008-05-03 05:09:39 +0000
@@ -0,0 +1,558 @@
+# -*- coding: utf-8; -*-
+
+# test/test_detect_encoding.py
+# Part of chardet, the Universal Encoding Detector.
+#
+# Copyright © 2008 Ben Finney <[EMAIL PROTECTED]>
+# This is free software; you may copy, modify and/or distribute this work
+# under the terms of the GNU General Public License, version 2 or later.
+# No warranty expressed or implied. See the file COPYING for details.
+
+""" Unit test suite for the 'detect-encoding' command-line program
+"""
+
+import __builtin__
+import sys
+import os
+import stat
+from StringIO import StringIO
+
+import scaffold
+from scaffold import TestCase
+from minimock import (
+    Mock,
+    mock,
+    restore as mock_restore,
+    )
+import chardet
+
+
+module_name = 'detect_encoding'
+module_file_under_test = os.path.join(scaffold.bin_dir, 'detect-encoding')
+detect_encoding = scaffold.make_module_from_file(
+    module_name, module_file_under_test
+    )  # NOTE(review): presumably loads the program file as a module
+
+class Test_ProgramFile(TestCase):
+    """ Test cases for executable program """
+
+    def test_program_is_executable(self):
+        """ Program file should be executable """
+        file_mode = os.stat(module_file_under_test).st_mode
+        exec_bit = stat.S_IEXEC
+        self.failUnlessEqual(exec_bit, (file_mode & exec_bit))
+
+
+class Test_OptionParser(TestCase):
+    """ Test cases for OptionParser class """
+
+    def setUp(self):
+        """ Set up test fixtures """
+        self.instance = detect_encoding.OptionParser()
+
+    def test_usage_contains_module_docstring(self):
+        """ Usage message should contain the module docstring
+
+            To reduce unnecessary duplication, the module docstring
+            for the program should be used for the program usage
+            message.
+
+            """
+        instance = self.instance
+        module_docstring_stripped = detect_encoding.__doc__.strip()
+        self.failUnlessIn(instance.usage, module_docstring_stripped)
+
+    def test_usage_contains_program_name_placeholder(self):
+        """ Usage message should contain program name placeholder
+
+            The OptionParser usage message generation will replace the
+            '%prog' placeholder with the name of the running program,
+            so this placeholder should be in the usage message string.
+
+            """
+        instance = self.instance
+        progname_placeholder = "%prog"
+        self.failUnlessIn(instance.usage, progname_placeholder)
+
+
+def setup_file_encoding_fixture(testcase):
+    """ Set up test parameters for files with encodings """
+    testcase.valid_file_params = {
+        "foo.txt": dict(
+            encoding = "spam.eggs",
+            confidence = 1.0,
+            ),
+        "bar": dict(
+            encoding = "beans.tofu",
+            confidence = 0.333333333,
+            ),
+        "baz.dat": dict(
+            encoding = "bacon.muffin",
+            confidence = 0.666666667,
+            ),
+        }
+
+    for (file_name, params) in testcase.valid_file_params.items():
+        params['file_name'] = file_name
+        instance = StringIO()
+        instance.name = file_name
+        params['instance'] = instance
+        encoding_params = dict(
+            encoding=params['encoding'],
+            confidence=params['confidence'],
+            )
+        params['encoding_params'] = encoding_params
+
+
+class Test_detect_encoding(TestCase):
+    """ Test cases for ``detect_encoding`` function """
+
+    def setUp(self):
+        """ Set up test fixtures """
+        self.mock_outfile = StringIO()
+
+        mock(
+            'chardet.detect',
+            outfile=self.mock_outfile)
+
+        self.in_data = object()
+        self.in_file = Mock("file", outfile=self.mock_outfile)
+        self.in_file.read.mock_returns = self.in_data
+
+    def test_reads_file_contents(self):
+        """ detect_encoding should read contents of file """
+        expect_mock_output = """\
+            Called file.read()
+            ...
+            """ % vars()
+        dummy = detect_encoding.detect_encoding(self.in_file)
+        self.failUnlessOutputCheckerMatch(
+            expect_mock_output, self.mock_outfile.getvalue()
+            )
+
+    def test_uses_chardet_detect(self):
+        """ detect_encoding should use chardet.detect() """
+        expect_mock_output = """\
+            ...
+            Called chardet.detect(%(in_data)r)
+            """ % vars(self)
+        dummy = detect_encoding.detect_encoding(self.in_file)
+        self.failUnlessOutputCheckerMatch(
+            expect_mock_output, self.mock_outfile.getvalue()
+            )
+
+    def test_returns_expected_parameters(self):
+        """ detect_encoding should return expected encoding parameters """
+        expect_encoding_params = object()
+        chardet.detect.mock_returns = expect_encoding_params
+        encoding_params = detect_encoding.detect_encoding(self.in_file)
+        self.failUnlessIs(expect_encoding_params, encoding_params)
+
+
+class Test_report_file_encoding(TestCase):
+    """ Test cases for ``report_file_encoding`` function """
+
+    def setUp(self):
+        """ Set up test fixtures """
+        self.mock_outfile = StringIO()
+
+        setup_file_encoding_fixture(self)
+
+    def test_report_contains_filename(self):
+        """ report_file_encoding() result should contain filename
+
+            The report text returned from report_file_encoding() should
+            contain the name of the supplied file.
+
+            """
+        for params in self.valid_file_params.values():
+            file_name = params['file_name']
+            in_file = params['instance']
+            encoding_params = params['encoding_params']
+            args = dict(
+                in_file=in_file,
+                encoding_params=encoding_params,
+                )
+            report = detect_encoding.report_file_encoding(**args)
+            self.failUnlessIn(report, in_file.name)
+
+    def test_report_contains_encoding(self):
+        """ report_file_encoding() result should contain encoding
+
+            The report text returned from report_file_encoding()
+            should contain the name of the detected encoding.
+
+            """
+        for params in self.valid_file_params.values():
+            in_file = params['instance']
+            encoding_params = params['encoding_params']
+            encoding_name = encoding_params['encoding']
+            args = dict(
+                in_file=in_file,
+                encoding_params=encoding_params,
+                )
+            report = detect_encoding.report_file_encoding(**args)
+            self.failUnlessIn(report, encoding_name)
+
+    def test_report_contains_confidence_score(self):
+        """ report_file_encoding() result should contain confidence score
+
+            The report text returned from report_file_encoding()
+            should contain the confidence score of the detection.
+
+            """
+        for params in self.valid_file_params.values():
+            in_file = params['instance']
+            encoding_params = params['encoding_params']
+            confidence = encoding_params['confidence']
+            confidence_text = "%(confidence)0.2f" % vars()
+            args = dict(
+                in_file=in_file,
+                encoding_params=encoding_params,
+                )
+            report = detect_encoding.report_file_encoding(**args)
+            self.failUnlessIn(report, confidence_text)
+
+
+class Test_process_file(TestCase):
+    """ Test cases for process_file function """
+
+    def setUp(self):
+        """ Set up test fixtures: mocked detection, report and stdout """
+
+        self.mock_outfile = StringIO()  # collects the mocks' call log
+
+        self.app_class = detect_encoding.DetectEncodingApp
+        setup_DetectEncodingApp_fixture(self)
+        app_params = self.valid_app_params['no files']
+        self.app_instance = app_params['instance']
+
+        mock(
+            'detect_encoding.detect_encoding',
+            outfile=self.mock_outfile)
+        detect_encoding.detect_encoding.mock_returns_iter = (
+            f['encoding_params']
+                for f in self.valid_file_params.values()
+            )  # NOTE(review): presumably one item per mocked call
+        mock(
+            'detect_encoding.report_file_encoding',
+            outfile=self.mock_outfile)
+        fake_report_text = str(object())
+        detect_encoding.report_file_encoding.mock_returns = (
+            fake_report_text)
+        mock(
+            'sys.stdout',
+            outfile=self.mock_outfile)
+
+    def tearDown(self):
+        """ Tear down test fixtures """
+        mock_restore()
+
+    def test_uses_detect_encoding_on_file(self):
+        """ process_file should pass its file to detect_encoding """
+        for file_params in self.valid_file_params.values():
+            in_file = file_params['instance']
+            encoding_params = file_params['encoding_params']
+            expect_mock_output = """\
+                Called detect_encoding.detect_encoding(
+                    %(instance)r)
+                ...
+                """ % file_params
+            detect_encoding.process_file(in_file)
+            self.failUnlessOutputCheckerMatch(
+                expect_mock_output, self.mock_outfile.getvalue()
+                )
+            self.mock_outfile.truncate(0)  # empty the log for the next file
+
+    def test_uses_report_file_encoding_on_parameters(self):
+        """ Should call report_file_encoding with report parameters """
+        for file_params in self.valid_file_params.values():
+            in_file = file_params['instance']
+            expect_mock_output = """\
+                ...
+                Called detect_encoding.report_file_encoding(
+                    %(instance)r,
+                    %(encoding_params)r)
+                ...
+                """ % file_params
+            detect_encoding.process_file(in_file)
+            self.failUnlessOutputCheckerMatch(
+                expect_mock_output, self.mock_outfile.getvalue()
+                )
+            self.mock_outfile.truncate(0)  # empty the log for the next file
+
+    def test_writes_report_to_stdout(self):
+        """ Should write report for file to stdout """
+        for file_params in self.valid_file_params.values():
+            fake_report_text = str(object())  # fresh report text per file
+            detect_encoding.report_file_encoding.mock_returns = (
+                fake_report_text)
+            in_file = file_params['instance']
+            expect_mock_output = """\
+                ...
+                Called sys.stdout.write('...%(fake_report_text)s...')
+                """ % vars()
+            detect_encoding.process_file(in_file)
+            self.failUnlessOutputCheckerMatch(
+                expect_mock_output, self.mock_outfile.getvalue()
+                )
+            self.mock_outfile.truncate(0)  # empty the log for the next file
+
+
+def setup_DetectEncodingApp_fixture(testcase):
+    """ Set up a DetectEncodingApp test fixture on the test case
+
+        `testcase`
+            The unit test case to which the fixture should be applied.
+            Must have an attribute `app_class` which is the type to
+            instantiate for the application instances.
+
+        The fixture is applied as the `valid_apps` attribute, and is a
+        dict of parameter dicts to use in the test case.
+
+        """
+
+    setup_file_encoding_fixture(testcase)
+
+    testcase.valid_app_params = {
+        'no files': dict(
+            file_names = [],
+            ),
+        'one file': dict(
+            file_names = [
+                "foo.txt",
+                ],
+            ),
+        'three files': dict(
+            file_names = [
+                "foo.txt", "bar", "baz.dat",
+                ],
+            ),
+        }
+
+    for params in testcase.valid_app_params.values():
+        argv = []
+        cmd_args = params.get('cmd_args', ["foo_prog"])
+        file_names = params['file_names']
+        in_files = []
+        for file_name in file_names:
+            in_file = StringIO()
+            in_file.name = file_name
+            in_files.append(in_file)
+        else:
+            in_file = StringIO()
+            in_file.name = "<stdin>"
+            in_files = [in_file]
+        params['in_files'] = in_files
+        cmd_args.extend(file_names)
+        argv.extend(cmd_args)
+        params['argv'] = argv
+        args = dict(
+            argv=argv
+            )
+        params['args'] = args
+        instance = testcase.app_class(**args)
+        params['instance'] = instance
+
+
+class Test_DetectEncodingApp_init(TestCase):
+    """ Test cases for DetectEncodingApp class initialisation """
+
+    def setUp(self):
+        """ Set up test fixtures """
+
+        self.mock_outfile = StringIO()  # collects the mocks' call log
+
+        self.app_class = detect_encoding.DetectEncodingApp
+        setup_DetectEncodingApp_fixture(self)
+
+    def tearDown(self):
+        """ Tear down test fixtures """
+        mock_restore()
+
+    def test_requires_argv(self):
+        """ Should require argv parameter """
+        args = dict()
+        self.failUnlessRaises(TypeError, self.app_class, **args)
+
+    def test_parses_args(self):
+        """ Should parse command-line arguments """
+        for params in self.valid_app_params.values():
+            args = params['args']
+            argv = args['argv']
+
+            argv_to_parse = argv[1:]  # everything after the first item
+            args_return = argv_to_parse
+            mock_option_parser = Mock(
+                "OptionParser",
+                outfile=self.mock_outfile)
+            stub_parse_args_return = (object(), args_return)
+            mock_option_parser.parse_args.mock_returns = (
+                stub_parse_args_return)
+            mock(
+                'detect_encoding.OptionParser',
+                returns=mock_option_parser,
+                outfile=self.mock_outfile)
+
+            expect_mock_output = """\
+                Called detect_encoding.OptionParser()
+                Called OptionParser.parse_args(%(argv_to_parse)r)
+                """ % vars()
+            instance = self.app_class(**args)
+            self.failUnlessOutputCheckerMatch(
+                expect_mock_output, self.mock_outfile.getvalue())
+            self.mock_outfile.truncate(0)  # empty the log for the next case
+
+    def test_stores_specified_args(self):
+        """ Should store the non-option arguments as file_names """
+        for params in self.valid_app_params.values():
+            instance = params['instance']
+            app_args = params['args']
+            expect_file_names = params['file_names']
+            self.failUnlessEqual(expect_file_names, instance.file_names)
+
+
+class Test_DetectEncodingApp_main(TestCase):
+    """ Test cases for DetectEncodingApp.main method """
+
+    def setUp(self):
+        """ Set up test fixtures: mock process_file and open() """
+
+        self.mock_outfile = StringIO()  # collects the mocks' call log
+
+        self.app_class = detect_encoding.DetectEncodingApp
+        setup_DetectEncodingApp_fixture(self)
+
+        mock(
+            'detect_encoding.process_file',
+            outfile=self.mock_outfile)
+        fake_file = self.valid_file_params['foo.txt']['instance']
+        mock(
+            '__builtin__.open',
+            returns=fake_file,
+            outfile=self.mock_outfile)
+
+    def tearDown(self):
+        """ Tear down test fixtures """
+        mock_restore()
+
+    def test_with_no_files_processes_stdin(self):
+        """ With no files specified, should process sys.stdin """
+        params = self.valid_app_params['no files']
+        instance = params['instance']
+        in_file = Mock("sys.stdin", outfile=self.mock_outfile)
+        mock(
+            'sys.stdin', mock_obj=in_file)
+        expect_mock_output = """\
+            Called detect_encoding.process_file(%(in_file)r)
+            """ % vars()
+        instance.main()
+        mock_restore()  # undo mocks (incl. sys.*) before asserting
+        self.failUnlessOutputCheckerMatch(
+            expect_mock_output, self.mock_outfile.getvalue()
+            )
+
+    def test_processes_specified_files(self):
+        """ With filenames, should process each file in turn """
+        params = self.valid_app_params['three files']
+        file_names = params['file_names']
+        instance = params['instance']
+        in_files = []
+        expect_mock_output_segments = []
+        for file_name in file_names:
+            in_file = self.valid_file_params[file_name]['instance']
+            in_files.append(in_file)
+            expect_mock_output_segments.append(
+                ("""\
+                    Called __builtin__.open(%(file_name)r)
+                    Called detect_encoding.process_file(%(in_file)r)"""
+                    ) % vars()
+                )
+        mock(
+            '__builtin__.open', returns_iter=in_files,
+            outfile=self.mock_outfile)  # presumably one file per open()
+        expect_mock_output = "\n".join(expect_mock_output_segments) + "\n"
+        instance.main()
+        mock_restore()
+        self.failUnlessOutputCheckerMatch(
+            expect_mock_output, self.mock_outfile.getvalue()
+            )
+
+    def test_emits_message_on_open_ioerror(self):
+        """ IOError from open should cause error message """
+        params = self.valid_app_params['one file']
+        instance = params['instance']
+        error_instance = IOError("Badness!")
+        error_name = error_instance.__class__.__name__
+        __builtin__.open.mock_raises = error_instance
+        mock(
+            'sys.stderr',
+            outfile=self.mock_outfile)
+        expect_mock_output = """\
+            ...
+            Called sys.stderr.write('...%(error_name)s...%(error_instance)s...')
+            """ % vars()
+        instance.main()
+        mock_restore()  # undo mocks (incl. sys.*) before asserting
+        self.failUnlessOutputCheckerMatch(
+            expect_mock_output, self.mock_outfile.getvalue()
+            )
+
+    def test_continues_to_next_file_after_ioerror(self):
+        """ Should proceed to next file after IOError """
+        params = self.valid_app_params['three files']
+        instance = params['instance']
+        file_names = params['file_names']
+        error_instance = IOError("Badness!")
+        mock(
+            'sys.stderr',
+            outfile=self.mock_outfile)
+        error_file_name = file_names[1]  # the middle of the three files
+        def stub_open(file_name, *args, **kwargs):
+            if file_name == error_file_name:
+                raise error_instance
+            else:
+                return Mock("file", outfile=self.mock_outfile)
+        mock(
+            '__builtin__.open', mock_obj=stub_open,
+            outfile=self.mock_outfile)
+        expect_mock_output = """\
+            Called detect_encoding.process_file(...)
+            Called sys.stderr.write(...)
+            Called detect_encoding.process_file(...)
+            """ % vars()
+        instance.main()
+        mock_restore()  # undo mocks (incl. sys.*) before asserting
+        self.failUnlessOutputCheckerMatch(
+            expect_mock_output, self.mock_outfile.getvalue()
+            )
+
+    def test_emits_message_on_detect_encoding_ioerror(self):
+        """ IOError from process_file should cause error message """
+        params = self.valid_app_params['one file']
+        instance = params['instance']
+        error_instance = IOError("Badness!")
+        error_name = error_instance.__class__.__name__
+        detect_encoding.process_file.mock_raises = error_instance
+        mock(
+            'sys.stderr',
+            outfile=self.mock_outfile)
+        expect_mock_output = """\
+            ...
+            Called sys.stderr.write('...%(error_name)s...%(error_instance)s...')
+            """ % vars()
+        instance.main()
+        mock_restore()  # undo mocks (incl. sys.*) before asserting
+        self.failUnlessOutputCheckerMatch(
+            expect_mock_output, self.mock_outfile.getvalue()
+            )
+
+
+class Test_ProgramMain(scaffold.Test_ProgramMain):
+    """ Test cases for program __main__ function """
+
+    def setUp(self):
+        """ Set up a new instance """
+        self.program_module = detect_encoding
+        self.application_class = detect_encoding.DetectEncodingApp
+        super(Test_ProgramMain, self).setUp()

signature.asc
Description: Digital signature

Bug#479178: python-chardet: Add command-line program for detecting character encoding of files

Reply via email to