>
> Will memmap be fixed to use offsets correctly before 1.3?
I posted this to scipy-dev (possibly wrong list) on March 9, so I'll
repeat it here: In Python 2.6, mmap has a offset keyword. NumPy's memmap
should use this to allow big files to be memory mapped on 32 bit systems.
Only a minor change is required:
if float(sys.version[:3]) > 2.5:
bytes = bytes - offset
mm = mmap.mmap(fid.fileno(), bytes, access=acc, offset=offset)
self = ndarray.__new__(subtype, shape, dtype=descr, buffer=mm,
offset=0, order=order)
else:
mm = mmap.mmap(fid.fileno(), bytes, access=acc)
self = ndarray.__new__(subtype, shape, dtype=descr, buffer=mm,
offset=offset, order=order)
Instead of just:
mm = mmap.mmap(fid.fileno(), bytes, access=acc)
self = ndarray.__new__(subtype, shape, dtype=descr, buffer=mm,
offset=offset, order=order)
Reagards,
Sturla Molden
__all__ = ['memmap']
import warnings
from numeric import uint8, ndarray, dtype
import sys
dtypedescr = dtype
valid_filemodes = ["r", "c", "r+", "w+"]
writeable_filemodes = ["r+","w+"]
mode_equivalents = {
"readonly":"r",
"copyonwrite":"c",
"readwrite":"r+",
"write":"w+"
}
class memmap(ndarray):
"""
Create a memory-map to an array stored in a file on disk.
Memory-mapped files are used for accessing small segments of large files
on disk, without reading the entire file into memory. Numpy's
memmap's are array-like objects. This differs from Python's ``mmap``
module, which uses file-like objects.
Parameters
----------
filename : string or file-like object
The file name or file object to be used as the array data
buffer.
dtype : data-type, optional
The data-type used to interpret the file contents.
Default is `uint8`
mode : {'r+', 'r', 'w+', 'c'}, optional
The file is opened in this mode:
+------+-------------------------------------------------------------+
| 'r' | Open existing file for reading only. |
+------+-------------------------------------------------------------+
| 'r+' | Open existing file for reading and writing. |
+------+-------------------------------------------------------------+
| 'w+' | Create or overwrite existing file for reading and writing. |
+------+-------------------------------------------------------------+
| 'c' | Copy-on-write: assignments affect data in memory, but |
| | changes are not saved to disk. The file on disk is |
| | read-only. |
+------+-------------------------------------------------------------+
Default is 'r+'.
offset : integer, optional
In the file, array data starts at this offset. `offset` should be
a multiple of the byte-size of `dtype`. Requires `shape=None`.
The default is 0.
shape : tuple, optional
The desired shape of the array. By default, the returned array will be
1-D with the number of elements determined by file size and data-type.
order : {'C', 'F'}, optional
Specify the order of the ndarray memory layout: C (row-major) or
Fortran (column-major). This only has an effect if the shape is
greater than 1-D. The defaullt order is 'C'.
Methods
-------
close
Close the memmap file.
flush
Flush any changes in memory to file on disk.
When you delete a memmap object, flush is called first to write
changes to disk before removing the object.
Notes
-----
The memmap object can be used anywhere an ndarray is accepted.
Given a memmap ``fp``, ``isinstance(fp, numpy.ndarray)`` returns
``True``.
Notes
-----
Memory-mapped arrays use the the Python memory-map object which
(prior to Python 2.5) does not allow files to be larger than a
certain size depending on the platform. This size is always < 2GB
even on 64-bit systems.
Examples
--------
>>> data = np.arange(12, dtype='float32')
>>> data.resize((3,4))
This example uses a temporary file so that doctest doesn't write
files to your directory. You would use a 'normal' filename.
>>> from tempfile import mkdtemp
>>> import os.path as path
>>> filename = path.join(mkdtemp(), 'newfile.dat')
Create a memmap with dtype and shape that matches our data:
>>> fp = np.memmap(filename, dtype='float32', mode='w+', shape=(3,4))
>>> fp
memmap([[ 0., 0., 0., 0.],
[ 0., 0., 0., 0.],
[ 0., 0., 0., 0.]], dtype=float32)
Write data to memmap array:
>>> fp[:] = data[:]
>>> fp
memmap([[ 0., 1., 2., 3.],
[ 4., 5., 6., 7.],
[ 8., 9., 10., 11.]], dtype=float32)
Deletion flushes memory changes to disk before removing the object:
>>> del fp
Load the memmap and verify data was stored:
>>> newfp = np.memmap(filename, dtype='float32', mode='r', shape=(3,4))
>>> newfp
memmap([[ 0., 1., 2., 3.],
[ 4., 5., 6., 7.],
[ 8., 9., 10., 11.]], dtype=float32)
Read-only memmap:
>>> fpr = np.memmap(filename, dtype='float32', mode='r', shape=(3,4))
>>> fpr.flags.writeable
False
Cannot assign to read-only, obviously:
>>> fpr[0, 3] = 56
Traceback (most recent call last):
...
RuntimeError: array is not writeable
Copy-on-write memmap:
>>> fpc = np.memmap(filename, dtype='float32', mode='c', shape=(3,4))
>>> fpc.flags.writeable
True
It's possible to assign to copy-on-write array, but values are only
written into the memory copy of the array, and not written to disk:
>>> fpc
memmap([[ 0., 1., 2., 3.],
[ 4., 5., 6., 7.],
[ 8., 9., 10., 11.]], dtype=float32)
>>> fpc[0,:] = 0
>>> fpc
memmap([[ 0., 0., 0., 0.],
[ 4., 5., 6., 7.],
[ 8., 9., 10., 11.]], dtype=float32)
File on disk is unchanged:
>>> fpr
memmap([[ 0., 1., 2., 3.],
[ 4., 5., 6., 7.],
[ 8., 9., 10., 11.]], dtype=float32)
Offset into a memmap:
>>> fpo = np.memmap(filename, dtype='float32', mode='r', offset=16)
>>> fpo
memmap([ 4., 5., 6., 7., 8., 9., 10., 11.], dtype=float32)
"""
__array_priority__ = -100.0
def __new__(subtype, filename, dtype=uint8, mode='r+', offset=0,
shape=None, order='C'):
# Import here to minimize 'import numpy' overhead
import mmap
try:
mode = mode_equivalents[mode]
except KeyError:
if mode not in valid_filemodes:
raise ValueError("mode must be one of %s" % \
(valid_filemodes + mode_equivalents.keys()))
if hasattr(filename,'read'):
fid = filename
else:
fid = file(filename, (mode == 'c' and 'r' or mode)+'b')
if (mode == 'w+') and shape is None:
raise ValueError, "shape must be given"
fid.seek(0,2)
flen = fid.tell()
descr = dtypedescr(dtype)
_dbytes = descr.itemsize
if shape is None:
bytes = flen-offset
if (bytes % _dbytes):
fid.close()
raise ValueError, "Size of available data is not a "\
"multiple of data-type size."
size = bytes // _dbytes
shape = (size,)
else:
if not isinstance(shape, tuple):
shape = (shape,)
size = 1
for k in shape:
size *= k
bytes = long(offset + size*_dbytes)
if mode == 'w+' or (mode == 'r+' and flen < bytes):
fid.seek(bytes-1,0)
fid.write(chr(0))
fid.flush()
if mode == 'c':
acc = mmap.ACCESS_COPY
elif mode == 'r':
acc = mmap.ACCESS_READ
else:
acc = mmap.ACCESS_WRITE
if float(sys.version[:3]) > 2.5:
bytes = bytes - offset
mm = mmap.mmap(fid.fileno(), bytes, access=acc, offset=offset)
self = ndarray.__new__(subtype, shape, dtype=descr, buffer=mm,
offset=0, order=order)
else:
mm = mmap.mmap(fid.fileno(), bytes, access=acc)
self = ndarray.__new__(subtype, shape, dtype=descr, buffer=mm,
offset=offset, order=order)
self._mmap = mm
return self
def __array_finalize__(self, obj):
if hasattr(obj, '_mmap'):
self._mmap = obj._mmap
else:
self._mmap = None
def flush(self):
"""Flush any changes in the array to the file on disk."""
if self._mmap is not None:
self._mmap.flush()
def sync(self):
"""Flush any changes in the array to the file on disk."""
warnings.warn("Use ``flush``.", DeprecationWarning)
self.flush()
def _close(self):
"""Close the memmap file. Only do this when deleting the object."""
if self.base is self._mmap:
# The python mmap probably causes flush on close, but
# we put this here for safety
self._mmap.flush()
self._mmap.close()
self._mmap = None
def close(self):
"""Close the memmap file. Does nothing."""
warnings.warn("``close`` is deprecated on memmap arrays. Use del",
DeprecationWarning)
def __del__(self):
# We first check if we are the owner of the mmap, rather than
# a view, so deleting a view does not call _close
# on the parent mmap
if self._mmap is self.base:
try:
# First run tell() to see whether file is open
self._mmap.tell()
except ValueError:
pass
else:
_______________________________________________
Numpy-discussion mailing list
Numpy-discussion@scipy.org
http://mail.scipy.org/mailman/listinfo/numpy-discussion