memory-mapped Numerical Python arrays

Kragen Sitaker Sat, 26 Jan 2002 00:22:46 -0800

Python has this package called Numeric (or Numerical Python, or
Numpy), which provides APLish (or PV-WAVE IDL-ish, or PDLish)
multidimensional homogeneous arrays of numeric data, of the sort one
often encounters in scientific computing.  It provides aggregate
operations on these, such as elementwise addition, whose inner loops
are written in C, so they can be much faster than if you wrote the
loops in Python.


Numpy's file I/O capabilities are nonexistent; the best it has are the
fromstring() and tostring() facilities, which copy the data between a
Python string and newly-allocated storage (for the Numpy array or for
the string, respectively).  This has several major drawbacks for large
chunks of data:

- it requires copying the entire contents of the array, which
  sometimes takes more time than the rest of the program put together
- it requires having two copies of the contents of the array in
  virtual memory at a time, which is a problem for large arrays on
  32-bit machines
- it's inconvenient; you have to go through several steps to read the data
  and several more to write it back to disk
- you can't tell what changed, so you generally need to write the whole
  array back to disk, even if you only changed a few bytes.

For years (since at least 1999), Numpy users have occasionally wanted
the ability to mmap() files as Numeric arrays, which avoids all four
of the above disadvantages.  So here's the code that makes it
possible, which is also available from
http://pobox.com/~kragen/sw/arrayfrombuffer-1.tar.gz.  First, a Python
module with the slightly more complicated logic and the test suite:

#!/usr/bin/env python
import Numeric, arrayfrombuffer, mmap, os, stat, unittest

def maparray(file, typecode='l', shape=None):
    """mmap a file as an array, either given a file object or a filename.

    Arguments:
    file -- an object with a fileno() method (e.g. an open file), or
            otherwise a filename, which is opened in 'rb+' mode
    typecode -- typecode passed through to arrayfrombuffer; default 'l'
    shape -- if not None, assigned to the resulting array's shape

    Returns the array, the mmap object (so you can call flush
    explicitly to ensure your changes have gotten copied out to disk),
    and the open file.

    If anything fails after the file has been opened here, that file
    is closed again before the exception propagates.  A file object
    supplied by the caller is never closed by this function.

    """

    opened_here = not hasattr(file, 'fileno')
    if opened_here: file = open(file, 'rb+')
    try:
        fn = file.fileno()
        # Map the entire file; fstat gives its current size in bytes.
        m = mmap.mmap(fn, os.fstat(fn)[stat.ST_SIZE])
        a = arrayfrombuffer.arrayfrombuffer(m, typecode)
        if shape is not None: a.shape = shape
    except:
        # Deliberately bare: this only cleans up and re-raises, so it
        # must run for every kind of exception.  Only the file is
        # closed; the mmap is left to be reclaimed normally, since an
        # array may already be referring to its memory.
        if opened_here: file.close()
        raise
    return a, m, file

class maparraytest(unittest.TestCase):
    """Tests for maparray, using a scratch file 'tmp.foo' in the cwd.

    setUp writes self.x (the values 0, 0.5, ..., 4.5) to the scratch
    file before each test; tearDown removes the file afterwards.

    """
    def _writefile(self, data):
        """Write the string 'data' to tmp.foo and close the file.

        Closing explicitly guarantees the bytes are on disk before a
        test maps the file, rather than relying on the file object
        being finalized as soon as it becomes unreferenced.

        """
        f = open('tmp.foo', 'wb')
        try:
            f.write(data)
        finally:
            f.close()
    def setUp(self):
        self.x = Numeric.arange(10) / 2.0
        self._writefile(self.x.tostring())
    def tearDown(self):
        os.unlink('tmp.foo')
    def testx(self):
        # Sanity check on the fixture itself.
        self.failUnless(Numeric.alltrue(self.x ==
                                        Numeric.array([0, 0.5, 1, 1.5, 2,
                                                       2.5, 3, 3.5, 4, 4.5])))
    def testread(self):
        # A mapped array has the same contents as the array written out.
        y, _, _ = maparray('tmp.foo', 'd')
        self.failUnless(Numeric.alltrue(self.x == y))
    def testwrite(self):
        # A change made through one mapping, once flushed, is visible
        # through a second mapping of the same file.
        y, mapobj, _ = maparray('tmp.foo', 'd')
        y[1] = 37
        mapobj.flush()
        z, _, _ = maparray('tmp.foo', 'd')
        self.failUnless(Numeric.alltrue(z ==
                                        Numeric.array([0, 37, 1, 1.5, 2,
                                                       2.5, 3, 3.5, 4, 4.5])))
        self.failUnless(Numeric.alltrue(z == y))
    def testimplicitflush(self):
        # Same as testwrite, but without calling flush() explicitly.
        y, _, _ = maparray('tmp.foo', 'd')
        y[1] = 37
        del y, _  # trigger implicit flush as refcounts go to 0
        z, _, _ = maparray('tmp.foo', 'd')
        self.failUnless(Numeric.alltrue(z ==
                                        Numeric.array([0, 37, 1, 1.5, 2,
                                                       2.5, 3, 3.5, 4, 4.5])))
    def testnewlines(self):
        # If we forgot to open the file in binary mode, some bytes would
        # get mangled on some systems.
        x = Numeric.arange(258).astype('b')  # bytes
        x[-2:] = (13, 10)  # CRLF
        self._writefile(x.tostring())
        y, _, _ = maparray('tmp.foo', 'b')
        self.failUnless(Numeric.alltrue(x == y))
    def testshape(self):
        # The shape argument is applied to the mapped array.
        xx = [[1, 3], [5, 7]]
        self._writefile(Numeric.array(xx).tostring())
        self.assertEqual(maparray('tmp.foo', shape=(2, 2))[0].tolist(), xx)
    # There are a bunch of features that are sort of hard to test, because
    # they're performance-oriented and require big data files or because
    # they involve transcending system limitations that may not be present.
    # For example:
    # - mapping an array should return instantly, even for large files
    # - reading the contents of a mapped array shouldn't take much more time
    #   than reading in the corresponding file with ordinary read() calls
    # - it should be possible to have arrays much larger than RAM
    #   (this is obviously hard to test if you have more than a moby of RAM)
    #   and operate on them reasonably efficiently.
    # - reading or writing a small part of the array should be fast

# Run the test suite when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()



Here's the C module the above module tries to import as
"arrayfrombuffer":

#include "Python.h"
#include "Numeric/arrayobject.h"

/* Docstring for the arrayfrombuffer() function; it is attached to the
 * function through the module's method table. */
static char doc_arrayfrombuffer[] = 
"arrayfrombuffer(buf, typecode='l') makes an array whose data is in buf\n"
"\n"
"This function makes a new Numeric array whose data is stored in\n"
"buf, a read-write buffer object; for example, a read-write mmap\n"
"object.  Any changes to the array will change the data in buf,\n"
"and any changes to the data in buf will change the array.\n"
"\n"
"The typecode is optional; it defaults to 'l', like fromstring's.\n"
"\n"
"This is useful, for example, for storing an array in a\n"
"memory-mapped file or a shared-memory segment, or just for\n"
"avoiding memory-to-memory copying.\n"
"\n"
"I considered supporting read-only buffers, but Numeric doesn't\n"
"support read-only arrays at the moment, and I don't need them for\n"
"my application.\n"
; 
/* arrayfrombuffer(buf, typecode='l'): wrap a writable buffer in an array.
 *
 * 'args' carries the buffer object and an optional typecode string, of
 * which only the first character is used.  On success, returns a new
 * one-dimensional Numeric array whose data pointer is the buffer's own
 * memory (nothing is copied) and which holds a reference to the buffer
 * object.  Returns NULL on failure: when the arguments cannot be parsed,
 * when the object does not provide a writable buffer, when no descriptor
 * is found for the typecode, when the array cannot be created, or (with
 * ValueError set here) when the buffer length is not a whole number of
 * elements.
 */
static PyObject *
arrayfrombuffer(PyObject *self, PyObject *args)
{
  PyArrayObject *arrayobj;
  PyObject *bufobj;
  char *type = "l";  /* long int is default */
  void *data;
  int datalen;
  int nitems;
  PyArray_Descr *descr;

  if (!PyArg_ParseTuple(args, "O|s:arrayfrombuffer", &bufobj, &type)) {
    return NULL;
  }
  if (-1 == PyObject_AsWriteBuffer(bufobj, &data, &datalen)) {
    return NULL;
  }
  /* NOTE(review): typecode 'O' would reinterpret raw buffer bytes as
   * object pointers, which is presumably unsafe; consider rejecting it. */
  if (!(descr = PyArray_DescrFromType(*type))) {
    return NULL;
  }
  /* A trailing partial element could not be addressed as an array item. */
  if (datalen % descr->elsize != 0) {
    PyErr_SetString(PyExc_ValueError,
                    "buffer length must be a multiple of element size");
    return NULL;
  }
  nitems = datalen / descr->elsize;
  /* The Numeric docs say not to do this unless 'data' is eternal,
   * but they're wrong.  See longer comment below. */
  if (!(arrayobj = (PyArrayObject*)PyArray_FromDimsAndData(1, 
                                                           &nitems, 
                                                           *type, 
                                                           data))) {
    return NULL;
  }
  /* This prevents the bufobj from being inadvertently
   * deallocated before the array is, and arranges for the bufobj
   * to be deallocated when the array is if it's not otherwise
   * referenced.  Of course, you can still cause a core dump or
   * incorrect results by invalidating the bufobj's memory before
   * deleting the array --- for example, by calling close() on an
   * mmap object.  Them's the breaks.*/
  arrayobj->base = bufobj;
  Py_INCREF(bufobj);
  /* No extra incref of arrayobj: the reference obtained from
   * PyArray_FromDimsAndData is the one handed to the caller. */
  return (PyObject*)arrayobj;
}

/* Method table handed to Py_InitModule: one exported function, followed
 * by the all-zero entry that terminates the table. */
static PyMethodDef arrayfrombufferfuncs[] = {
  {
    "arrayfrombuffer",      /* name of the function in the module */
    arrayfrombuffer,        /* C function implementing it */
    METH_VARARGS,           /* calling-convention flags */
    doc_arrayfrombuffer     /* docstring */
  },
  {NULL, NULL, 0, NULL}
};

/* Module initialization function: registers the "arrayfrombuffer" module
 * with its method table, then runs Numeric's import_array() so the
 * PyArray_* calls used in this file are usable. */
void initarrayfrombuffer(void)
{
  /* The returned module object is not needed here. */
  Py_InitModule("arrayfrombuffer", arrayfrombufferfuncs);
  import_array();
}

memory-mapped Numerical Python arrays

Reply via email to