Python has this package called Numeric (or Numerical Python, or Numpy), which provides APLish (or PV-WAVE IDL-ish, or PDLish) multidimensional homogeneous arrays of numeric data, of the sort one often encounters in scientific computing. It provides aggregate operations on these, such as elementwise addition, whose inner loops are written in C, so they can be much faster than if you wrote the loops in Python.
Numpy's file I/O capabilities are nonexistent; the best it has are fromstring() and tostring() facilities, which copy the data from a Python string into newly-allocated data for the Numpy array. This has a couple of major drawbacks for large chunks of data: - it requires copying the entire contents of the array, which sometimes takes more time than the rest of the program put together - it requires having two copies of the contents of the array in virtual memory at a time, which is a problem for large arrays on 32-bit machines - it's inconvenient; you have to go through several steps to read the data and several more to write it back to disk - you can't tell what changed, so you generally need to write the whole array back to disk, even if you only changed a few bytes. For years (since at least 1999), Numpy users have occasionally wanted the ability to mmap() files as Numeric arrays, which avoids all four of the above disadvantages. So here's the code that makes it possible, which is also available from http://pobox.com/~kragen/sw/arrayfrombuffer-1.tar.gz. First, a Python module with the slightly more complicated logic and the test suite: #!/usr/bin/env python import Numeric, arrayfrombuffer, mmap, os, stat, unittest def maparray(file, typecode='l', shape=None): """mmap a file as an array, either given a file object or a filename. Returns the array, the mmap object (so you can call flush explicitly to ensure your changes have gotten copied out to disk), and the open file. """ if not hasattr(file, 'fileno'): file = open(file, 'rb+') fn = file.fileno() m = mmap.mmap(fn, os.fstat(fn)[stat.ST_SIZE]) a = arrayfrombuffer.arrayfrombuffer(m, typecode) if shape is not None: a.shape = shape return a, m, file class maparraytest(unittest.TestCase): def setUp(self): self.x = Numeric.arange(10) / 2.0 open('tmp.foo', 'wb').write(self.x.tostring()) def tearDown(self): os.unlink('tmp.foo') def testx(self): self.failUnless(Numeric.alltrue(self.x == Numeric.array([0, 0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5]))) def testread(self): y, _, _ = maparray('tmp.foo', 'd') self.failUnless(Numeric.alltrue(self.x == y)) def testwrite(self): y, mapobj, _ = maparray('tmp.foo', 'd') y[1] = 37 mapobj.flush() z, _, _ = maparray('tmp.foo', 'd') self.failUnless(Numeric.alltrue(z == Numeric.array([0, 37, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5]))) self.failUnless(Numeric.alltrue(z == y)) def testimplicitflush(self): y, _, _ = maparray('tmp.foo', 'd') y[1] = 37 del y, _ # trigger implicit flush as refcounts go to 0 z, _, _ = maparray('tmp.foo', 'd') self.failUnless(Numeric.alltrue(z == Numeric.array([0, 37, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5]))) def testnewlines(self): # If we forgot to open the file in binary mode, some bytes would # get mangled on some systems. x = Numeric.arange(258).astype('b') # bytes x[-2:] = (13, 10) # CRLF open('tmp.foo', 'wb').write(x.tostring()) y, _, _ = maparray('tmp.foo', 'b') self.failUnless(Numeric.alltrue(x == y)) def testshape(self): xx = [[1, 3], [5, 7]] open('tmp.foo', 'wb').write(Numeric.array(xx).tostring()) self.assertEqual(maparray('tmp.foo', shape=(2, 2))[0].tolist(), xx) # There are a bunch of features that are sort of hard to test, because # they're performance-oriented and require big data files or because # they involve transcending system limitations that may not be present. # For example: # - mapping an array should return instantly, even for large files # - reading the contents of a mapped array shouldn't take much more time # than reading in the corresponding file with ordinary read() calls # - it should be possible to have arrays much larger than RAM # (this is obviously hard to test if you have more than a moby of RAM) # and operate on them reasonably efficiently. # - reading or writing a small part of the array should be fast if __name__ == "__main__": unittest.main() Here's the C module the above module tries to import as "arrayfrombuffer": #include "Python.h" #include "Numeric/arrayobject.h" static char doc_arrayfrombuffer[] = "arrayfrombuffer(buf, typecode='l') makes an array whose data is in buf\n" "\n" "This function makes a new Numeric array whose data is stored in\n" "buf, a read-write buffer object; for example, a read-write mmap\n" "object. Any changes to the array will change the data in buf,\n" "and any changes to the data in buf will change the array.\n" "\n" "The typecode is optional; it defaults to 'l', like fromstring's.\n" "\n" "This is useful, for example, for storing an array in a\n" "memory-mapped file or a shared-memory segment, or just for\n" "avoiding memory-to-memory copying.\n" "\n" "I considered supporting read-only buffers, but Numeric doesn't\n" "support read-only arrays at the moment, and I don't need them for\n" "my application.\n" ; static PyObject * arrayfrombuffer(PyObject *self, PyObject *args) { PyArrayObject *arrayobj; PyObject *bufobj; char *type = "l"; /* long int is default */ void *data; int datalen; int nitems; PyArray_Descr *descr; if (!PyArg_ParseTuple(args, "O|s:arrayfrombuffer", &bufobj, &type)) { return NULL; } if (-1 == PyObject_AsWriteBuffer(bufobj, &data, &datalen)) { return NULL; } if (!(descr = PyArray_DescrFromType(*type))) { return NULL; } if (datalen % descr->elsize != 0) { PyErr_SetString(PyExc_ValueError, "buffer length must be a multiple of element size"); return NULL; } nitems = datalen / descr->elsize; /* The Numeric docs say not to do this unless 'data' is eternal, * but they're wrong. See longer comment below. */ if (!(arrayobj = (PyArrayObject*)PyArray_FromDimsAndData(1, &nitems, *type, data))) { return NULL; } /* This prevents the bufobj from being inadvertently * deallocated before the array is, and arranges for the bufobj * to be deallocated when the array is if it's not otherwise * referenced. Of course, you can still cause a core dump or * incorrect results by invalidating the bufobj's memory before * deleting the array --- for example, by calling close() on an * mmap object. Them's the breaks.*/ arrayobj->base = bufobj; Py_INCREF(bufobj); /* do I need to incref arrayobj?! fromstring does... */ return (PyObject*)arrayobj; } static PyMethodDef arrayfrombufferfuncs[] = { {"arrayfrombuffer", arrayfrombuffer, METH_VARARGS, doc_arrayfrombuffer}, {NULL, NULL} }; void initarrayfrombuffer(void) { (void)Py_InitModule("arrayfrombuffer", arrayfrombufferfuncs); import_array(); }