[Numpy-discussion] Text array dtype for numpy

Oscar Benjamin Thu, 23 Jan 2014 12:35:46 -0800

There have been a few threads discussing the problems of how to do
text with numpy arrays in Python 3.


To make a slightly more concrete proposal, I've implemented a pure
Python ndarray subclass that I believe can consistently handle
text/bytes in Python 3. It is intended to be an illustration since I
think that the real solution is a new dtype rather than an array
subclass (so that it can be used in e.g. record arrays).

The idea is that the array has an encoding. It stores strings as
bytes. The bytes are encoded/decoded on insertion/access. Methods
accessing the binary content of the array will see the encoded bytes.
Methods accessing the elements of the array will see unicode strings.

I believe it would not be as hard to implement as the proposals for
variable length string arrays. The one caveat is that it will strip
null characters from the end of any string. I'm not 100% sure that the
byte-stripping decode function will always work, but it does for all
the encodings I know, and it seems to work with all the encodings that
Python has.

The code is inline below and attached (in case there are encoding
problems with this message!):

Oscar

#!/usr/bin/env python3

from numpy import ndarray, array

class textarray(ndarray):
    '''ndarray for holding encoded text.

    This is for demonstration purposes only. The real proposal
    is to specify the encoding as a dtype rather than a subclass.

    Elements are stored as fixed-width encoded bytes (an ``S`` dtype).
    Strings are encoded when assigned and decoded when a single element
    is read. Trailing null characters cannot be told apart from padding,
    so they are stripped whenever an element is read.

    Only works as a 1-d array.

    >>> a = textarray(['qwert', 'zxcvb'], encoding='ascii')
    >>> a  # doctest: +NORMALIZE_WHITESPACE
    textarray(['qwert', 'zxcvb'],
          dtype='|S5:ascii')
    >>> a[0]
    'qwert'
    >>> a.tobytes()
    b'qwertzxcvb'

    >>> a[0] = 'qwe'  # shorter string
    >>> a[0]
    'qwe'
    >>> a.tobytes()
    b'qwe\\x00\\x00zxcvb'

    >>> a[0] = 'qwertyuiop'  # longer string
    Traceback (most recent call last):
        ...
    ValueError: Encoded bytes don't fit

    >>> b = textarray(['Õscar', 'qwe'], encoding='utf-8')
    >>> b  # doctest: +NORMALIZE_WHITESPACE
    textarray(['Õscar', 'qwe'],
          dtype='|S6:utf-8')
    >>> b[0]
    'Õscar'
    >>> b[0].encode('utf-8')
    b'\\xc3\\x95scar'
    >>> b.tobytes()
    b'\\xc3\\x95scarqwe\\x00\\x00\\x00'

    >>> c = textarray(['qwe'], encoding='utf-32-le')
    >>> c  # doctest: +NORMALIZE_WHITESPACE
    textarray(['qwe'],
          dtype='|S12:utf-32-le')

    Empty strings round-trip, and slices/copies keep the encoding:

    >>> d = textarray(['', 'ab'])
    >>> d[0]
    ''
    >>> d[1:]  # doctest: +NORMALIZE_WHITESPACE
    textarray(['ab'],
          dtype='|S2:utf-8')
    >>> d.copy().encoding
    'utf-8'

    '''
    def __new__(cls, strings, encoding='utf-8'):
        '''Encode each item of `strings` and store the bytes in an S array.

        The item size is chosen by numpy as the longest encoded string.
        '''
        bytestrings = [s.encode(encoding) for s in strings]
        # View as `cls` (not the hard-coded base) so subclasses keep their type.
        a = array(bytestrings, dtype='S').view(cls)
        a.encoding = encoding
        return a

    def __array_finalize__(self, obj):
        '''Carry the encoding over to views, slices and copies.

        numpy calls this for every new instance; `obj` is the array the new
        one was derived from (or None / a plain ndarray, in which case the
        constructor default of 'utf-8' is used until __new__ overrides it).
        '''
        self.encoding = getattr(obj, 'encoding', 'utf-8')

    def __repr__(self):
        '''Show the decoded elements plus the byte width and encoding.'''
        # Index explicitly so that every element goes through __getitem__
        # and is therefore decoded.
        slist = ', '.join(repr(self[n]) for n in range(len(self)))
        return "textarray([%s], \n      dtype='|S%d:%s')"\
                   % (slist, self.itemsize, self.encoding)

    def __getitem__(self, index):
        '''Return the decoded str at `index`, or a textarray for a slice.'''
        item = ndarray.__getitem__(self, index)
        if isinstance(item, ndarray):
            # Slices and fancy indexing give back an array, not a bytes
            # scalar; it already carries the encoding, so return it as is.
            return item
        return self._decode(item)

    def __setitem__(self, index, string):
        '''Encode `string` and store it at `index`.

        Raises ValueError if the encoded bytes are longer than the item size.
        '''
        bstring = string.encode(self.encoding)
        if len(bstring) > self.itemsize:
            raise ValueError("Encoded bytes don't fit")
        ndarray.__setitem__(self, index, bstring)

    def _decode(self, b):
        '''Decode one stored element, dropping trailing null characters.

        numpy strips trailing null bytes from S items, which can cut a
        multi-byte code unit (e.g. utf-16/utf-32) in half. Padding back up
        to a multiple of 4 bytes restores whole code units before decoding.
        '''
        raw = bytes(b)
        raw += b'\0' * (-len(raw) % 4)
        # rstrip also handles the all-null case, so an empty element
        # decodes to '' rather than to a run of null characters.
        return raw.decode(self.encoding).rstrip('\0')

if __name__ == "__main__":
    # Executed as a script: run the doctests embedded in this module.
    from doctest import testmod
    testmod()

#!/usr/bin/env python3

from numpy import ndarray, array

class textarray(ndarray):
    '''ndarray for holding encoded text.

    This is for demonstration purposes only. The real proposal
    is to specify the encoding as a dtype rather than a subclass.

    Elements are stored as fixed-width encoded bytes (an ``S`` dtype).
    Strings are encoded when assigned and decoded when a single element
    is read. Trailing null characters cannot be told apart from padding,
    so they are stripped whenever an element is read.

    Only works as a 1-d array.

    >>> a = textarray(['qwert', 'zxcvb'], encoding='ascii')
    >>> a  # doctest: +NORMALIZE_WHITESPACE
    textarray(['qwert', 'zxcvb'],
          dtype='|S5:ascii')
    >>> a[0]
    'qwert'
    >>> a.tobytes()
    b'qwertzxcvb'

    >>> a[0] = 'qwe'  # shorter string
    >>> a[0]
    'qwe'
    >>> a.tobytes()
    b'qwe\\x00\\x00zxcvb'

    >>> a[0] = 'qwertyuiop'  # longer string
    Traceback (most recent call last):
        ...
    ValueError: Encoded bytes don't fit

    >>> b = textarray(['Õscar', 'qwe'], encoding='utf-8')
    >>> b  # doctest: +NORMALIZE_WHITESPACE
    textarray(['Õscar', 'qwe'],
          dtype='|S6:utf-8')
    >>> b[0]
    'Õscar'
    >>> b[0].encode('utf-8')
    b'\\xc3\\x95scar'
    >>> b.tobytes()
    b'\\xc3\\x95scarqwe\\x00\\x00\\x00'

    >>> c = textarray(['qwe'], encoding='utf-32-le')
    >>> c  # doctest: +NORMALIZE_WHITESPACE
    textarray(['qwe'],
          dtype='|S12:utf-32-le')

    Empty strings round-trip, and slices/copies keep the encoding:

    >>> d = textarray(['', 'ab'])
    >>> d[0]
    ''
    >>> d[1:]  # doctest: +NORMALIZE_WHITESPACE
    textarray(['ab'],
          dtype='|S2:utf-8')
    >>> d.copy().encoding
    'utf-8'

    '''
    def __new__(cls, strings, encoding='utf-8'):
        '''Encode each item of `strings` and store the bytes in an S array.

        The item size is chosen by numpy as the longest encoded string.
        '''
        bytestrings = [s.encode(encoding) for s in strings]
        # View as `cls` (not the hard-coded base) so subclasses keep their type.
        a = array(bytestrings, dtype='S').view(cls)
        a.encoding = encoding
        return a

    def __array_finalize__(self, obj):
        '''Carry the encoding over to views, slices and copies.

        numpy calls this for every new instance; `obj` is the array the new
        one was derived from (or None / a plain ndarray, in which case the
        constructor default of 'utf-8' is used until __new__ overrides it).
        '''
        self.encoding = getattr(obj, 'encoding', 'utf-8')

    def __repr__(self):
        '''Show the decoded elements plus the byte width and encoding.'''
        # Index explicitly so that every element goes through __getitem__
        # and is therefore decoded.
        slist = ', '.join(repr(self[n]) for n in range(len(self)))
        return "textarray([%s], \n      dtype='|S%d:%s')"\
                   % (slist, self.itemsize, self.encoding)

    def __getitem__(self, index):
        '''Return the decoded str at `index`, or a textarray for a slice.'''
        item = ndarray.__getitem__(self, index)
        if isinstance(item, ndarray):
            # Slices and fancy indexing give back an array, not a bytes
            # scalar; it already carries the encoding, so return it as is.
            return item
        return self._decode(item)

    def __setitem__(self, index, string):
        '''Encode `string` and store it at `index`.

        Raises ValueError if the encoded bytes are longer than the item size.
        '''
        bstring = string.encode(self.encoding)
        if len(bstring) > self.itemsize:
            raise ValueError("Encoded bytes don't fit")
        ndarray.__setitem__(self, index, bstring)

    def _decode(self, b):
        '''Decode one stored element, dropping trailing null characters.

        numpy strips trailing null bytes from S items, which can cut a
        multi-byte code unit (e.g. utf-16/utf-32) in half. Padding back up
        to a multiple of 4 bytes restores whole code units before decoding.
        '''
        raw = bytes(b)
        raw += b'\0' * (-len(raw) % 4)
        # rstrip also handles the all-null case, so an empty element
        # decodes to '' rather than to a run of null characters.
        return raw.decode(self.encoding).rstrip('\0')

if __name__ == "__main__":
    # Executed as a script: run the doctests embedded in this module.
    from doctest import testmod
    testmod()

_______________________________________________
NumPy-Discussion mailing list
NumPy-Discussion@scipy.org
http://mail.scipy.org/mailman/listinfo/numpy-discussion

[Numpy-discussion] Text array dtype for numpy

Reply via email to