There have been a few threads discussing the problems of how to do text with numpy arrays in Python 3.
To make a slightly more concrete proposal, I've implemented a pure Python ndarray subclass that I believe can consistently handle text/bytes in Python 3. It is intended to be an illustration since I think that the real solution is a new dtype rather than an array subclass (so that it can be used in e.g. record arrays). The idea is that the array has an encoding. It stores strings as bytes. The bytes are encoded/decoded on insertion/access. Methods accessing the binary content of the array will see the encoded bytes. Methods accessing the elements of the array will see unicode strings. I believe it would not be as hard to implement as the proposals for variable length string arrays. The one caveat is that it will strip null characters from the end of any string. I'm not 100% sure that the byte stripping encoding function will always work but it will for all the encodings I know and it seems to work with all the encodings that Python has. The code is inline below and attached (in case there are encoding problems with this message!): Oscar #!/usr/bin/env python3 from numpy import ndarray, array class textarray(ndarray): '''ndarray for holding encoded text. This is for demonstration purposes only. The real proposal is to specify the encoding as a dtype rather than a subclass. Only works as a 1-d array. >>> a = textarray(['qwert', 'zxcvb'], encoding='ascii') >>> a textarray(['qwert', 'zxcvb'], dtype='|S5:ascii') >>> a[0] 'qwert' >>> a.tostring() b'qwertzxcvb' >>> a[0] = 'qwe' # shorter string >>> a[0] 'qwe' >>> a.tostring() b'qwe\\x00\\x00zxcvb' >>> a[0] = 'qwertyuiop' # longer string Traceback (most recent call last): ... 
ValueError: Encoded bytes don't fit >>> b = textarray(['Õscar', 'qwe'], encoding='utf-8') >>> b textarray(['Õscar', 'qwe'], dtype='|S6:utf-8') >>> b[0] 'Õscar' >>> b[0].encode('utf-8') b'\\xc3\\x95scar' >>> b.tostring() b'\\xc3\\x95scarqwe\\x00\\x00\\x00' >>> c = textarray(['qwe'], encoding='utf-32-le') >>> c textarray(['qwe'], dtype='|S12:utf-32-le') ''' def __new__(cls, strings, encoding='utf-8'): bytestrings = [s.encode(encoding) for s in strings] a = array(bytestrings, dtype='S').view(textarray) a.encoding = encoding return a def __repr__(self): slist = ', '.join(repr(self[n]) for n in range(len(self))) return "textarray([%s], \n dtype='|S%d:%s')"\ % (slist, self.itemsize, self.encoding) def __getitem__(self, index): bstring = ndarray.__getitem__(self, index) return self._decode(bstring) def __setitem__(self, index, string): bstring = string.encode(self.encoding) if len(bstring) > self.itemsize: raise ValueError("Encoded bytes don't fit") ndarray.__setitem__(self, index, bstring) def _decode(self, b): b = b + b'\0' * (4 - len(b) % 4) s = b.decode(self.encoding) for n, c in enumerate(reversed(s)): if c != '\0': return s[:len(s)-n] return s if __name__ == "__main__": import doctest doctest.testmod()
#!/usr/bin/env python3
"""Demonstration of an ndarray subclass that stores text as encoded bytes."""

from numpy import ndarray, array


class textarray(ndarray):
    '''ndarray for holding encoded text.

    This is for demonstration purposes only. The real proposal
    is to specify the encoding as a dtype rather than a subclass.

    Strings are encoded to bytes on insertion and decoded on element
    access, so methods that look at the raw buffer see encoded bytes
    while indexing yields ``str``. Trailing null characters of a stored
    string are not preserved.

    Only works as a 1-d array.

    >>> a = textarray(['qwert', 'zxcvb'], encoding='ascii')
    >>> a  # doctest: +NORMALIZE_WHITESPACE
    textarray(['qwert', 'zxcvb'], dtype='|S5:ascii')
    >>> a[0]
    'qwert'
    >>> a.tobytes()
    b'qwertzxcvb'

    >>> a[0] = 'qwe'  # shorter string
    >>> a[0]
    'qwe'
    >>> a.tobytes()
    b'qwe\\x00\\x00zxcvb'

    >>> a[0] = 'qwertyuiop'  # longer string
    Traceback (most recent call last):
        ...
    ValueError: Encoded bytes don't fit

    >>> a[0] = ''  # the empty string round-trips
    >>> a[0]
    ''

    >>> b = textarray(['Õscar', 'qwe'], encoding='utf-8')
    >>> b  # doctest: +NORMALIZE_WHITESPACE
    textarray(['Õscar', 'qwe'], dtype='|S6:utf-8')
    >>> b[0]
    'Õscar'
    >>> b[0].encode('utf-8')
    b'\\xc3\\x95scar'
    >>> b.tobytes()
    b'\\xc3\\x95scarqwe\\x00\\x00\\x00'
    >>> b[1:]  # slices keep the encoding
    ... # doctest: +NORMALIZE_WHITESPACE
    textarray(['qwe'], dtype='|S6:utf-8')

    >>> c = textarray(['qwe'], encoding='utf-32-le')
    >>> c  # doctest: +NORMALIZE_WHITESPACE
    textarray(['qwe'], dtype='|S12:utf-32-le')

    '''

    def __new__(cls, strings, encoding='utf-8'):
        """Encode every item of *strings* and store the bytes in an 'S' array.

        The item size is chosen by numpy from the longest encoded string.
        """
        bytestrings = [s.encode(encoding) for s in strings]
        # View as ``cls`` (not a hard-coded textarray) so subclasses work.
        a = array(bytestrings, dtype='S').view(cls)
        a.encoding = encoding
        return a

    def __array_finalize__(self, obj):
        """Carry ``encoding`` over to views, slices and copies.

        numpy calls this for every array it derives from another one;
        *obj* is the source array (or None). Sources without an
        ``encoding`` attribute fall back to the constructor default.
        """
        self.encoding = getattr(obj, 'encoding', 'utf-8')

    def __repr__(self):
        """Return a repr showing the decoded items, item size and encoding."""
        # Index explicitly so every element goes through __getitem__ and
        # is therefore decoded.
        slist = ', '.join(repr(self[n]) for n in range(len(self)))
        return "textarray([%s], \n dtype='|S%d:%s')"\
            % (slist, self.itemsize, self.encoding)

    def __getitem__(self, index):
        """Return the decoded ``str`` for a scalar index.

        Slices and other array-valued indices return a ``textarray``
        sharing the same encoding instead of being passed to the decoder.
        """
        item = ndarray.__getitem__(self, index)
        if isinstance(item, bytes):
            return self._decode(item)
        return item

    def __setitem__(self, index, string):
        """Encode *string* and store it at *index*.

        Raises:
            ValueError: if the encoded bytes are longer than the item size.
        """
        bstring = string.encode(self.encoding)
        if len(bstring) > self.itemsize:
            raise ValueError("Encoded bytes don't fit")
        ndarray.__setitem__(self, index, bstring)

    def _decode(self, b):
        """Decode the bytes of one element and drop trailing null characters."""
        raw = bytes(b)
        # Reading an 'S' element drops trailing NUL bytes, which can cut a
        # multi-byte code unit (UTF-16/UTF-32) in half. Pad back up to a
        # multiple of 4 bytes so the decoder always sees whole code units.
        raw += b'\0' * (-len(raw) % 4)
        # rstrip also handles the all-NUL case, so '' round-trips as ''.
        return raw.decode(self.encoding).rstrip('\0')


if __name__ == "__main__":
    import doctest
    doctest.testmod()
_______________________________________________ NumPy-Discussion mailing list NumPy-Discussion@scipy.org http://mail.scipy.org/mailman/listinfo/numpy-discussion