import sys, time, mmap, os, re
# Benchmark: time how long re.split() takes to cut a memory-mapped file into
# the pieces separated by "XXXXX", counting the pieces and their total length.
fn = sys.argv[1]
# Open read-only in binary mode and map the whole file (length 0 == entire
# file) so the regex scans it without first reading it into a string.
fh = os.open(fn, os.O_BINARY|os.O_RDONLY)
s = mmap.mmap(fh, 0, access=mmap.ACCESS_READ)
l = n = 0  # l: summed length of the pieces, n: number of pieces
t0 = time.time()
for mat in re.split("XXXXX", s):
    # re.split() returns a list, not a generator, and this list may consume
    # a lot of memory.
    n += 1
    l += len(mat)
t1 = time.time()

# Parenthesised single-argument form prints identically on Python 2 and 3.
print("fn=%s n=%d l=%d time=%.2f" % (fn, n, l, (t1-t0)))

I wrote a generator replacement for re.split(), but as you might expect, the
performance is nowhere near re.split(). For your large data it might help
somewhat because of its smaller memory footprint.

def splititer(regex, data):
    """Lazily yield the parts of *data* that lie between matches of *regex*.

    Similar to re.split(), but the pieces are produced one at a time and the
    separators themselves are never yielded.

    regex -- a pattern string, or any object that already provides a
             ``finditer`` method (e.g. a compiled pattern).
    data  -- the sliceable object to scan.
    """
    # Only compile when we were not handed something pattern-like already.
    pattern = regex if hasattr(regex, "finditer") else re.compile(regex)
    piece_start = 0
    for separator in pattern.finditer(data):
        yield data[piece_start:separator.start()]
        # The next piece begins right after this separator.
        piece_start = separator.end()
    # Whatever follows the last separator (all of data if nothing matched).
    yield data[piece_start:]


OK, now the split scan times are much more comparable for 200 MB (which is what I have freely available according to Task Manager), but things start getting bad for 300 MB.

C:\code\reportlab\demos\gadflypaper>\tmp\ xxx_200mb.dat
fn=xxx_200mb.dat n=3797470 l=181012689 time=23.05

C:\code\reportlab\demos\gadflypaper>\tmp\ xxx_200mb.dat
fn=xxx_200mb.dat n=3797470 l=181012689 time=27.63

C:\code\reportlab\demos\gadflypaper>\tmp\ xxx_200mb.dat
fn=xxx_200mb.dat n=3797470 l=181012689 time=28.13

C:\code\reportlab\demos\gadflypaper>\tmp\ xxx_200mb.dat
fn=xxx_200mb.dat n=3797470 l=181012689 time=22.66

C:\code\reportlab\demos\gadflypaper>\tmp\ xxx_300mb.dat
fn=xxx_300mb.dat n=5696206 l=271519105 time=45.45

C:\code\reportlab\demos\gadflypaper>\tmp\ xxx_300mb.dat
fn=xxx_300mb.dat n=5696206 l=271519105 time=32.14

C:\code\reportlab\demos\gadflypaper>\tmp\ xxx_300mb.dat
fn=xxx_300mb.dat n=5696206 l=271519105 time=33.17

C:\code\reportlab\demos\gadflypaper>\tmp\ xxx_300mb.dat
fn=xxx_300mb.dat n=5696206 l=271519105 time=45.27

Here is Bengt's adaptive buffer splitter, followed by Peter's generator splitter.
C:\code\reportlab\demos\gadflypaper>cat \tmp\
import sys, time, re
# Path of the data file to scan, taken from the first command-line argument.
fn = sys.argv[1]
# Separator pattern, compiled once and passed to frxsplit() below.
rxo = re.compile('XXXXX')

def frxsplit(path, rxo, chunksize=4096):
    """Yield the pieces of the file at *path* separated by matches of *rxo*.

    The file is read in binary mode, *chunksize* bytes at a time, so only the
    current chunk plus the not-yet-terminated tail of the previous ones is
    held in memory.

    path      -- name of the file to scan.
    rxo       -- compiled pattern object; its split() is applied to the
                 buffered data, so it must accept what a binary read returns.
    chunksize -- number of bytes requested per read.

    The final piece (empty for an empty file, or when the data ends with a
    separator) is always yielded, mirroring rxo.split() on the whole content.

    NOTE(review): a separator is only recognised once it lies completely in
    the buffer; patterns whose match could grow with more data (e.g. greedy
    repeats) may be split at a chunk boundary -- confirm for non-literal
    patterns.
    """
    with open(path, 'rb') as f:
        # Empty value of whatever type read() returns ('' on Python 2,
        # b'' on Python 3), so concatenation below is always type-correct.
        pending = f.read(0)
        while True:
            chunk = f.read(chunksize)
            if not chunk:  # EOF
                break
            pieces = rxo.split(pending + chunk)
            # Everything but the last piece is terminated by a separator.
            for piece in pieces[:-1]:
                yield piece
            # The last piece may continue (or hold a partial separator that
            # completes) in the next chunk, so carry it over.
            pending = pieces[-1]
    yield pending
# Benchmark driver: count the pieces produced by frxsplit() and sum their
# lengths, timing the whole scan.
l = n = 0  # l: summed length of the pieces, n: number of pieces
t0 = time.time()
for mat in frxsplit(fn,rxo):
    n += 1
    l += len(mat)
t1 = time.time()

# Parenthesised single-argument form prints identically on Python 2 and 3.
print("fn=%s n=%d l=%d time=%.2f" % (fn, n, l, (t1-t0)))

C:\code\reportlab\demos\gadflypaper>cat \tmp\ import sys, time, mmap, os, re def splititer(regex, data): # like re.split(), but never yields the separators. if not hasattr(regex, "finditer"): regex = re.compile(regex) start = 0 for match in regex.finditer(data): end, new_start = match.span() yield data[start:end] start = new_start yield data[start:] fn = sys.argv[1],os.O_BINARY|os.O_RDONLY) s=mmap.mmap(fh,0,access=mmap.ACCESS_READ) l=n=0 t0 = time.time() for mat in splititer("XXXXX", s): n += 1 l += len(mat) t1 = time.time()

print "fn=%s n=%d l=%d time=%.2f" % (fn, n, l, (t1-t0))

Robin Becker


