Author: Carl Friedrich Bolz <[email protected]>
Branch:
Changeset: r82778:a4e20be9d315
Date: 2016-03-04 10:27 +0100
http://bitbucket.org/pypy/pypy/changeset/a4e20be9d315/
Log: A somewhat messy improvement of re.sub; it makes simple things 3x
faster (though still not quite beating CPython).
diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -7,6 +7,7 @@
from pypy.interpreter.error import OperationError
from rpython.rlib.rarithmetic import intmask
from rpython.rlib import jit
+from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
# ____________________________________________________________
#
@@ -226,6 +227,11 @@
def subx(self, w_ptemplate, w_string, count):
space = self.space
+ # use a (much faster) string/unicode builder if w_ptemplate and
+ # w_string are both string or both unicode objects, and if w_ptemplate
+ # is a literal
+ use_builder = False
+ filter_as_unicode = filter_as_string = None
if space.is_true(space.callable(w_ptemplate)):
w_filter = w_ptemplate
filter_is_callable = True
@@ -233,6 +239,8 @@
if space.isinstance_w(w_ptemplate, space.w_unicode):
filter_as_unicode = space.unicode_w(w_ptemplate)
literal = u'\\' not in filter_as_unicode
+ use_builder = (
+ space.isinstance_w(w_string, space.w_unicode) and literal)
else:
try:
filter_as_string = space.str_w(w_ptemplate)
@@ -242,6 +250,8 @@
literal = False
else:
literal = '\\' not in filter_as_string
+ use_builder = (
+ space.isinstance_w(w_string, space.w_str) and literal)
if literal:
w_filter = w_ptemplate
filter_is_callable = False
@@ -252,19 +262,28 @@
space.wrap(self), w_ptemplate)
filter_is_callable = space.is_true(space.callable(w_filter))
#
+ # XXX this is a bit of a mess, but it improves performance a lot
ctx = self.make_ctx(w_string)
- sublist_w = []
+ sublist_w = strbuilder = unicodebuilder = None
+ if use_builder:
+ if filter_as_unicode is not None:
+ unicodebuilder = UnicodeBuilder(ctx.end)
+ else:
+ assert filter_as_string is not None
+ strbuilder = StringBuilder(ctx.end)
+ else:
+ sublist_w = []
n = last_pos = 0
while not count or n < count:
if not searchcontext(space, ctx):
break
if last_pos < ctx.match_start:
- sublist_w.append(slice_w(space, ctx, last_pos,
- ctx.match_start, space.w_None))
+ _sub_append_slice(
+ ctx, space, use_builder, sublist_w,
+ strbuilder, unicodebuilder, last_pos, ctx.match_start)
start = ctx.match_end
if start == ctx.match_start:
start += 1
- nextctx = ctx.fresh_copy(start)
if not (last_pos == ctx.match_start
== ctx.match_end and n > 0):
# the above ignores empty matches on latest position
@@ -272,27 +291,60 @@
w_match = self.getmatch(ctx, True)
w_piece = space.call_function(w_filter, w_match)
if not space.is_w(w_piece, space.w_None):
+ assert strbuilder is None and unicodebuilder is None
+ assert not use_builder
sublist_w.append(w_piece)
else:
- sublist_w.append(w_filter)
+ if use_builder:
+ if strbuilder is not None:
+ assert filter_as_string is not None
+ strbuilder.append(filter_as_string)
+ else:
+ assert unicodebuilder is not None
+ assert filter_as_unicode is not None
+ unicodebuilder.append(filter_as_unicode)
+ else:
+ sublist_w.append(w_filter)
last_pos = ctx.match_end
n += 1
elif last_pos >= ctx.end:
break # empty match at the end: finished
- ctx = nextctx
+ ctx.reset(start)
if last_pos < ctx.end:
- sublist_w.append(slice_w(space, ctx, last_pos, ctx.end,
- space.w_None))
+ _sub_append_slice(ctx, space, use_builder, sublist_w,
+ strbuilder, unicodebuilder, last_pos, ctx.end)
+ if use_builder:
+ if strbuilder is not None:
+ return space.wrap(strbuilder.build()), n
+ else:
+ assert unicodebuilder is not None
+ return space.wrap(unicodebuilder.build()), n
+ else:
+ if space.isinstance_w(w_string, space.w_unicode):
+ w_emptystr = space.wrap(u'')
+ else:
+ w_emptystr = space.wrap('')
+ w_item = space.call_method(w_emptystr, 'join',
+ space.newlist(sublist_w))
+ return w_item, n
- if space.isinstance_w(w_string, space.w_unicode):
- w_emptystr = space.wrap(u'')
- else:
- w_emptystr = space.wrap('')
- w_item = space.call_method(w_emptystr, 'join',
- space.newlist(sublist_w))
- return w_item, n
+def _sub_append_slice(ctx, space, use_builder, sublist_w,
+ strbuilder, unicodebuilder, start, end):
+ if use_builder:
+ if isinstance(ctx, rsre_core.BufMatchContext):
+ assert strbuilder is not None
+ return strbuilder.append(ctx._buffer.getslice(start, end, 1,
+ end-start))
+ if isinstance(ctx, rsre_core.StrMatchContext):
+ assert strbuilder is not None
+ return strbuilder.append_slice(ctx._string, start, end)
+ elif isinstance(ctx, rsre_core.UnicodeMatchContext):
+ assert unicodebuilder is not None
+ return unicodebuilder.append_slice(ctx._unicodestr, start, end)
+ assert 0, "unreachable"
+ else:
+ sublist_w.append(slice_w(space, ctx, start, end, space.w_None))
@unwrap_spec(flags=int, groups=int, w_groupindex=WrappedDefault(None),
w_indexgroup=WrappedDefault(None))
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit