On Wed, Jan 18, 2012 at 12:14:02PM +0100, Julian Andres Klode wrote: > You'd also need to take care of TagSection if that is done, which should > then work in bytes mode when passed a bytes string. > > Basically you'd need to modify TagSection and TagFile to both store whether > to use bytes or unicode and pass the value of that flag from the TagFile > to the TagSection. Then create a function > > PyObject *TagFile_ToString(char *s, size_t n) > > or similar that uses PyString_* functions or PyBytes_ functions depending > on the context (where PyString is mapped to unicode in Python 3, and > str in Python 2). Then use that function everywhere we currently > create strings in the TagFile.
OK. How about something like this? I added both an explicit bytes= parameter and a fallback which tries to detect the encoding from the file object. === modified file 'python/tag.cc' --- python/tag.cc 2011-11-10 16:20:58 +0000 +++ python/tag.cc 2012-01-20 14:47:56 +0000 @@ -38,6 +38,10 @@ using namespace std; struct TagSecData : public CppPyObject<pkgTagSection> { char *Data; + bool Bytes; +#if PY_MAJOR_VERSION >= 3 + PyObject *Encoding; +#endif }; // The owner of the TagFile is a Python file object. @@ -45,6 +49,10 @@ struct TagFileData : public CppPyObject< { TagSecData *Section; FileFd Fd; + bool Bytes; +#if PY_MAJOR_VERSION >= 3 + PyObject *Encoding; +#endif }; // Traversal and Clean for owned objects @@ -60,6 +68,35 @@ int TagFileClear(PyObject *self) { return 0; } +// Helpers to return Unicode or bytes as appropriate. +#if PY_MAJOR_VERSION < 3 +#define TagSecString_FromStringAndSize(self, v, len) \ + PyString_FromStringAndSize((v), (len)) +#define TagSecString_FromString(self, v) PyString_FromString(v) +#else +PyObject *TagSecString_FromStringAndSize(PyObject *self, const char *v, + Py_ssize_t len) { + TagSecData *Self = (TagSecData *)self; + if (Self->Bytes) + return PyBytes_FromStringAndSize(v, len); + else if (Self->Encoding) + return PyUnicode_Decode(v, len, PyUnicode_AsString(Self->Encoding), 0); + else + return PyUnicode_FromStringAndSize(v, len); +} + +PyObject *TagSecString_FromString(PyObject *self, const char *v) { + TagSecData *Self = (TagSecData *)self; + if (Self->Bytes) + return PyBytes_FromString(v); + else if (Self->Encoding) + return PyUnicode_Decode(v, strlen(v), + PyUnicode_AsString(Self->Encoding), 0); + else + return PyUnicode_FromString(v); +} +#endif + /*}}}*/ // TagSecFree - Free a Tag Section /*{{{*/ @@ -107,9 +144,9 @@ static PyObject *TagSecFind(PyObject *Se { if (Default == 0) Py_RETURN_NONE; - return PyString_FromString(Default); + return TagSecString_FromString(Self,Default); } - return PyString_FromStringAndSize(Start,Stop-Start); + return TagSecString_FromStringAndSize(Self,Start,Stop-Start); } static char *doc_FindRaw = @@ -128,14 +165,14 @@ static PyObject *TagSecFindRaw(PyObject { if (Default == 0) Py_RETURN_NONE; - return PyString_FromString(Default); + return TagSecString_FromString(Self,Default); } const char *Start; const char *Stop; GetCpp<pkgTagSection>(Self).Get(Start,Stop,Pos); - return PyString_FromStringAndSize(Start,Stop-Start); + return TagSecString_FromStringAndSize(Self,Start,Stop-Start); } static char *doc_FindFlag = @@ -161,21 +198,18 @@ static PyObject *TagSecFindFlag(PyObject // Map access, operator [] static PyObject *TagSecMap(PyObject *Self,PyObject *Arg) { - if (PyString_Check(Arg) == 0) - { - PyErr_SetNone(PyExc_TypeError); + const char *Name = PyObject_AsString(Arg); + if (Name == 0) return 0; - } - const char *Start; const char *Stop; - if (GetCpp<pkgTagSection>(Self).Find(PyString_AsString(Arg),Start,Stop) == false) + if (GetCpp<pkgTagSection>(Self).Find(Name,Start,Stop) == false) { - PyErr_SetString(PyExc_KeyError,PyString_AsString(Arg)); + PyErr_SetString(PyExc_KeyError,Name); return 0; } - return PyString_FromStringAndSize(Start,Stop-Start); + return TagSecString_FromStringAndSize(Self,Start,Stop-Start); } // len() operation @@ -230,9 +264,9 @@ static PyObject *TagSecExists(PyObject * static int TagSecContains(PyObject *Self,PyObject *Arg) { - if (PyString_Check(Arg) == 0) - return 0; - const char *Name = PyString_AsString(Arg); + const char *Name = PyObject_AsString(Arg); + if (Name == 0) + return 0; const char *Start; const char *Stop; if (GetCpp<pkgTagSection>(Self).Find(Name,Start,Stop) == false) @@ -256,7 +290,7 @@ static PyObject *TagSecStr(PyObject *Sel const char *Start; const char *Stop; GetCpp<pkgTagSection>(Self).GetSection(Start,Stop); - return PyString_FromStringAndSize(Start,Stop-Start); + return TagSecString_FromStringAndSize(Self,Start,Stop-Start); } /*}}}*/ // TagFile Wrappers /*{{{*/ @@ -286,6 +320,12 @@ static PyObject *TagFileNext(PyObject *S Obj.Section->Owner = Self; Py_INCREF(Obj.Section->Owner); Obj.Section->Data = 0; + Obj.Section->Bytes = Obj.Bytes; +#if PY_MAJOR_VERSION >= 3 + // We don't need to incref Encoding as the previous Section object already + // held a reference to it. + Obj.Section->Encoding = Obj.Encoding; +#endif if (Obj.Object.Step(Obj.Section->Object) == false) return HandleErrors(NULL); @@ -347,11 +387,12 @@ static PyObject *TagFileJump(PyObject *S static PyObject *TagSecNew(PyTypeObject *type,PyObject *Args,PyObject *kwds) { char *Data; int Len; - char *kwlist[] = {"text", 0}; + char Bytes = 0; + char *kwlist[] = {"text", "bytes", 0}; // this allows reading "byte" types from python3 - but we don't // make (much) use of it yet - if (PyArg_ParseTupleAndKeywords(Args,kwds,"s#",kwlist,&Data,&Len) == 0) + if (PyArg_ParseTupleAndKeywords(Args,kwds,"s#|b",kwlist,&Data,&Len,&Bytes) == 0) return 0; // Create the object.. @@ -359,6 +400,10 @@ static PyObject *TagSecNew(PyTypeObject new (&New->Object) pkgTagSection(); New->Data = new char[strlen(Data)+2]; snprintf(New->Data,strlen(Data)+2,"%s\n",Data); + New->Bytes = Bytes; +#if PY_MAJOR_VERSION >= 3 + New->Encoding = 0; +#endif if (New->Object.Scan(New->Data,strlen(New->Data)) == false) { @@ -390,9 +435,10 @@ PyObject *ParseSection(PyObject *self,Py static PyObject *TagFileNew(PyTypeObject *type,PyObject *Args,PyObject *kwds) { - PyObject *File; - char *kwlist[] = {"file", 0}; - if (PyArg_ParseTupleAndKeywords(Args,kwds,"O",kwlist,&File) == 0) + PyObject *File = 0; + char Bytes = 0; + char *kwlist[] = {"file", "bytes", 0}; + if (PyArg_ParseTupleAndKeywords(Args,kwds,"O|b",kwlist,&File,&Bytes) == 0) return 0; int fileno = PyObject_AsFileDescriptor(File); if (fileno == -1) @@ -405,8 +451,15 @@ static PyObject *TagFileNew(PyTypeObject #else new (&New->Fd) FileFd(fileno,false); #endif + New->Bytes = Bytes; New->Owner = File; Py_INCREF(New->Owner); +#if PY_MAJOR_VERSION >= 3 + New->Encoding = PyObject_GetAttr(File, PyUnicode_FromString("encoding")); + if (!PyUnicode_Check(New->Encoding)) + New->Encoding = 0; + Py_XINCREF(New->Encoding); +#endif new (&New->Object) pkgTagFile(&New->Fd); // Create the section @@ -415,6 +468,11 @@ static PyObject *TagFileNew(PyTypeObject New->Section->Owner = New; Py_INCREF(New->Section->Owner); New->Section->Data = 0; + New->Section->Bytes = Bytes; +#if PY_MAJOR_VERSION >= 3 + New->Section->Encoding = New->Encoding; + Py_XINCREF(New->Section->Encoding); +#endif return HandleErrors(New); } @@ -492,7 +550,7 @@ PyObject *RewriteSection(PyObject *self, } // Return the string - PyObject *ResObj = PyString_FromStringAndSize(bp,size); + PyObject *ResObj = TagSecString_FromStringAndSize(Section,bp,size); free(bp); return HandleErrors(ResObj); } @@ -521,11 +579,15 @@ PySequenceMethods TagSecSeqMeth = {0,0,0 PyMappingMethods TagSecMapMeth = {TagSecLength,TagSecMap,0}; -static char *doc_TagSec = "TagSection(text: str)\n\n" +static char *doc_TagSec = "TagSection(text: str, [bytes: bool = False])\n\n" "Provide methods to access RFC822-style header sections, like those\n" "found in debian/control or Packages files.\n\n" "TagSection() behave like read-only dictionaries and also provide access\n" - "to the functions provided by the C++ class (e.g. find)"; + "to the functions provided by the C++ class (e.g. find).\n\n" + "By default, text read from files is treated as strings (binary data in\n" + "Python 2, Unicode strings in Python 3). Use bytes=True to cause all\n" + "header values read from this TagSection to be bytes even in Python 3.\n" + "Header names are always treated as Unicode."; PyTypeObject PyTagSection_Type = { PyVarObject_HEAD_INIT(&PyType_Type, 0) @@ -596,7 +658,7 @@ static PyGetSetDef TagFileGetSet[] = { }; -static char *doc_TagFile = "TagFile(file)\n\n" +static char *doc_TagFile = "TagFile(file, [bytes: bool = False])\n\n" "TagFile() objects provide access to debian control files, which consist\n" "of multiple RFC822-style sections.\n\n" "To provide access to those sections, TagFile objects provide an iterator\n" @@ -608,7 +670,11 @@ static char *doc_TagFile = "TagFile(file "It is important to not mix the use of both APIs, because this can have\n" "unwanted effects.\n\n" "The parameter 'file' refers to an object providing a fileno() method or\n" - "a file descriptor (an integer)"; + "a file descriptor (an integer).\n\n" + "By default, text read from files is treated as strings (binary data in\n" + "Python 2, Unicode strings in Python 3). Use bytes=True to cause all\n" + "header values read from this TagFile to be bytes even in Python 3.\n" + "Header names are always treated as Unicode."; // Type for a Tag File PyTypeObject PyTagFile_Type = === added file 'tests/test_tagfile.py' --- tests/test_tagfile.py 1970-01-01 00:00:00 +0000 +++ tests/test_tagfile.py 2012-01-20 14:52:28 +0000 @@ -0,0 +1,106 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +# +# Copyright (C) 2012 Canonical Ltd. +# Author: Colin Watson <cjwat...@ubuntu.com> +# +# Copying and distribution of this file, with or without modification, +# are permitted in any medium without royalty provided the copyright +# notice and this notice are preserved. +"""Unit tests for verifying the correctness of apt_pkg.TagFile.""" +from __future__ import print_function, unicode_literals +import io +import os +import shutil +import sys +import tempfile +import unittest + +import apt_pkg + + +class TestTagFile(unittest.TestCase): + """Test apt_pkg.TagFile.""" + + def setUp(self): + apt_pkg.init() + self.temp_dir = tempfile.mkdtemp() + + def tearDown(self): + shutil.rmtree(self.temp_dir) + + def test_utf8(self): + value = "Tést Persön <t...@example.org>" + packages = os.path.join(self.temp_dir, "Packages") + with io.open(packages, "w", encoding="UTF-8") as packages_file: + print("Maintainer: %s" % value, file=packages_file) + print("", file=packages_file) + if sys.version < '3': + # In Python 2, test the traditional file interface. + with open(packages) as packages_file: + tagfile = apt_pkg.TagFile(packages_file) + tagfile.step() + self.assertEqual( + value.encode("UTF-8"), tagfile.section["Maintainer"]) + with io.open(packages, encoding="UTF-8") as packages_file: + tagfile = apt_pkg.TagFile(packages_file) + tagfile.step() + if sys.version < '3': + self.assertEqual( + value.encode("UTF-8"), tagfile.section["Maintainer"]) + else: + self.assertEqual(value, tagfile.section["Maintainer"]) + + def test_latin1(self): + value = "Tést Persön <t...@example.org>" + packages = os.path.join(self.temp_dir, "Packages") + with io.open(packages, "w", encoding="ISO-8859-1") as packages_file: + print("Maintainer: %s" % value, file=packages_file) + print("", file=packages_file) + if sys.version < '3': + # In Python 2, test the traditional file interface. + with open(packages) as packages_file: + tagfile = apt_pkg.TagFile(packages_file) + tagfile.step() + self.assertEqual( + value.encode("ISO-8859-1"), tagfile.section["Maintainer"]) + with io.open(packages) as packages_file: + tagfile = apt_pkg.TagFile(packages_file, bytes=True) + tagfile.step() + self.assertEqual( + value.encode("ISO-8859-1"), tagfile.section["Maintainer"]) + if sys.version >= '3': + # In Python 3, TagFile can pick up the encoding of the file + # object. + with io.open(packages, encoding="ISO-8859-1") as packages_file: + tagfile = apt_pkg.TagFile(packages_file) + tagfile.step() + self.assertEqual(value, tagfile.section["Maintainer"]) + + def test_mixed(self): + value = "Tést Persön <t...@example.org>" + packages = os.path.join(self.temp_dir, "Packages") + with io.open(packages, "w", encoding="UTF-8") as packages_file: + print("Maintainer: %s" % value, file=packages_file) + print("", file=packages_file) + with io.open(packages, "a", encoding="ISO-8859-1") as packages_file: + print("Maintainer: %s" % value, file=packages_file) + print("", file=packages_file) + if sys.version < '3': + # In Python 2, test the traditional file interface. + with open(packages) as packages_file: + tagfile = apt_pkg.TagFile(packages_file) + tagfile.step() + self.assertEqual( + value.encode("UTF-8"), tagfile.section["Maintainer"]) + tagfile.step() + self.assertEqual( + value.encode("ISO-8859-1"), tagfile.section["Maintainer"]) + with io.open(packages) as packages_file: + tagfile = apt_pkg.TagFile(packages_file, bytes=True) + tagfile.step() + self.assertEqual( + value.encode("UTF-8"), tagfile.section["Maintainer"]) + tagfile.step() + self.assertEqual( + value.encode("ISO-8859-1"), tagfile.section["Maintainer"]) -- Colin Watson [cjwat...@ubuntu.com] -- To UNSUBSCRIBE, email to debian-bugs-dist-requ...@lists.debian.org with a subject of "unsubscribe". Trouble? Contact listmas...@lists.debian.org