Robin Becker wrote:
# NOTE(review): the mail client collapsed sscan1.py onto one line. It mmaps
# the input file read-only and iterates over re.split("XXXXX", ...) applied to
# the whole mapped buffer; the loop body follows further down in the message.
#sscan1.py thanks to Skip import sys, time, mmap, os, re fn = sys.argv[1] fh=os.open(fn,os.O_BINARY|os.O_RDONLY) s=mmap.mmap(fh,0,access=mmap.ACCESS_READ) l=n=0 t0 = time.time() for mat in re.split("XXXXX", s):
re.split() returns a list, not a generator, so for a large input this list may consume a great deal of memory all at once.
# NOTE(review): continuation of the collapsed sscan1.py script — the loop body
# counts pieces (n) and sums their lengths (l), then reports elapsed time.
# The print is Python 2 statement syntax.
n += 1 l += len(mat) t1 = time.time()
print "fn=%s n=%d l=%d time=%.2f" % (fn, n, l, (t1-t0))
I wrote a generator replacement for re.split(), but as you might expect, its performance is nowhere near that of re.split(). For your large data it might still help somewhat because of its smaller memory footprint.
def splititer(regex, data):
    """Lazily split *data* on *regex*.

    Behaves like re.split() except that it is a generator and the
    separator matches themselves are never yielded.  *regex* may be a
    pattern string or an already-compiled pattern object.
    """
    if not hasattr(regex, "finditer"):
        regex = re.compile(regex)
    pos = 0
    for m in regex.finditer(data):
        sep_begin, sep_end = m.span()
        # Emit the text between the previous separator and this one.
        yield data[pos:sep_begin]
        pos = sep_end
    # Whatever follows the final separator (possibly empty).
    yield data[pos:]
Peter
OK, now the split-scan times are much more comparable for 200MB (which is what I have freely available according to Task Manager), but things start getting bad at 300MB.
C:\code\reportlab\demos\gadflypaper>\tmp\sscan0.py xxx_200mb.dat fn=xxx_200mb.dat n=3797470 l=181012689 time=23.05
C:\code\reportlab\demos\gadflypaper>\tmp\sscan2.py xxx_200mb.dat fn=xxx_200mb.dat n=3797470 l=181012689 time=27.63
C:\code\reportlab\demos\gadflypaper>\tmp\sscan2.py xxx_200mb.dat fn=xxx_200mb.dat n=3797470 l=181012689 time=28.13
C:\code\reportlab\demos\gadflypaper>\tmp\sscan0.py xxx_200mb.dat fn=xxx_200mb.dat n=3797470 l=181012689 time=22.66
C:\code\reportlab\demos\gadflypaper>\tmp\sscan2.py xxx_300mb.dat fn=xxx_300mb.dat n=5696206 l=271519105 time=45.45
C:\code\reportlab\demos\gadflypaper>\tmp\sscan0.py xxx_300mb.dat fn=xxx_300mb.dat n=5696206 l=271519105 time=32.14
C:\code\reportlab\demos\gadflypaper>\tmp\sscan0.py xxx_300mb.dat fn=xxx_300mb.dat n=5696206 l=271519105 time=33.17
C:\code\reportlab\demos\gadflypaper>\tmp\sscan2.py xxx_300mb.dat fn=xxx_300mb.dat n=5696206 l=271519105 time=45.27
here sscan0.py is Bengt's adaptive buffer splitter and sscan2.py is Peter's generator splitter.
C:\code\reportlab\demos\gadflypaper>cat \tmp\sscan0.py
import sys, time, re
fn = sys.argv[1]
rxo = re.compile('XXXXX')
def frxsplit(path, rxo, chunksize=4096):
    """Yield the pieces of the file at *path* split on compiled regex *rxo*.

    Reads the file in *chunksize*-byte chunks so memory use stays bounded,
    unlike re.split() on the whole file.  The unfinished tail of each chunk
    is carried into the next read, since a separator may straddle a
    chunk boundary.

    NOTE(review): for fixed-text separators like b"XXXXX" this yields
    exactly what re.split() would; a variable-length regex that can match
    greedily across a chunk boundary may split differently — confirm
    before using such patterns.
    """
    # b'' literals make this work unchanged on Python 2.6+ (where bytes is
    # str) and Python 3 (where 'rb' reads return bytes, so the old ''
    # sentinel would never match and the loop would never terminate).
    buffer = b''
    # The original opened the file inside a lambda default argument and
    # never closed it; the with-block fixes that leak.
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunksize), b''):
            buffer += chunk
            pieces = rxo.split(buffer)
            # All but the last piece are complete; the last may be cut off
            # mid-separator, so keep it as the carry-over buffer.
            for piece in pieces[:-1]:
                yield piece
            buffer = pieces[-1]
    yield buffer

# Guarded so importing this module (e.g. for testing) does not run the scan.
if __name__ == '__main__':
    l = n = 0
    t0 = time.time()
    for mat in frxsplit(fn, rxo):
        n += 1
        l += len(mat)
    t1 = time.time()
    # Single parenthesized argument: valid in both Python 2 and 3.
    print("fn=%s n=%d l=%d time=%.2f" % (fn, n, l, (t1 - t0)))
# C:\code\reportlab\demos\gadflypaper>cat \tmp\sscan2.py
import sys, time, mmap, os, re

def splititer(regex, data):
    """Like re.split(), but a generator, and never yields the separators.

    *regex* may be a pattern string/bytes or a compiled pattern object.
    """
    if not hasattr(regex, "finditer"):
        regex = re.compile(regex)
    start = 0
    for match in regex.finditer(data):
        end, new_start = match.span()
        yield data[start:end]
        start = new_start
    yield data[start:]

# Guarded so importing this module does not attempt the scan.
if __name__ == '__main__':
    fn = sys.argv[1]
    # os.O_BINARY exists only on Windows; default to 0 elsewhere so the
    # script is portable (the original crashed with AttributeError on POSIX).
    fh = os.open(fn, getattr(os, 'O_BINARY', 0) | os.O_RDONLY)
    try:
        s = mmap.mmap(fh, 0, access=mmap.ACCESS_READ)
        try:
            l = n = 0
            t0 = time.time()
            # bytes pattern: mmap slices are bytes on Python 3 (str on 2,
            # where b"..." is the same thing).
            for mat in splititer(b"XXXXX", s):
                n += 1
                l += len(mat)
            t1 = time.time()
        finally:
            s.close()   # original leaked the mapping
    finally:
        os.close(fh)    # ... and the file descriptor
    # Single parenthesized argument: valid in both Python 2 and 3.
    print("fn=%s n=%d l=%d time=%.2f" % (fn, n, l, (t1 - t0)))
-- Robin Becker
-- http://mail.python.org/mailman/listinfo/python-list