[pypy-commit] pypy default: hg merge shadowstack-perf-2

arigo Wed, 08 Mar 2017 05:51:01 -0800

Author: Armin Rigo <ar...@tunes.org>
Branch: 
Changeset: r90588:c05892e069b0
Date: 2017-03-08 14:48 +0100
http://bitbucket.org/pypy/pypy/changeset/c05892e069b0/


Log:    hg merge shadowstack-perf-2

        Two changes that together bring the performance of shadowstack close
        to asmgcc---close enough that we can now make shadowstack the
        default even on Linux. Yay!

        The changes are:

        * Compile with "gcc -flto" or "clang -flto". Gives a speed-up, and
        cannot be used with asmgcc.

        * Add complicated logic in shadowcolor.py to optimize the placement
        of the roots in the shadow stack. Long code, but should be ok
        because it is (hopefully) well-tested and independent.

diff too long, truncating to 2000 out of 2343 lines

diff --git a/rpython/config/translationoption.py 
b/rpython/config/translationoption.py
--- a/rpython/config/translationoption.py
+++ b/rpython/config/translationoption.py
@@ -17,10 +17,10 @@
 DEFL_GC = "incminimark"   # XXX
 
 DEFL_ROOTFINDER_WITHJIT = "shadowstack"
-if sys.platform.startswith("linux"):
-    _mach = os.popen('uname -m', 'r').read().strip()
-    if _mach.startswith('x86') or _mach in ['i386', 'i486', 'i586', 'i686']:
-        DEFL_ROOTFINDER_WITHJIT = "asmgcc"   # only for Linux on x86 / x86-64
+## if sys.platform.startswith("linux"):
+##     _mach = os.popen('uname -m', 'r').read().strip()
+##     if _mach.startswith('x86') or _mach in ['i386', 'i486', 'i586', 'i686']:
+##         DEFL_ROOTFINDER_WITHJIT = "asmgcc"   # only for Linux on x86 / 
x86-64
 
 IS_64_BITS = sys.maxint > 2147483647
 
diff --git a/rpython/flowspace/model.py b/rpython/flowspace/model.py
--- a/rpython/flowspace/model.py
+++ b/rpython/flowspace/model.py
@@ -96,6 +96,13 @@
         from rpython.translator.tool.graphpage import FlowGraphPage
         FlowGraphPage(t, [self]).display()
 
+    def showbg(self, t=None):
+        import os
+        self.show(t)
+        if os.fork() == 0:
+            self.show(t)
+            os._exit(0)
+
     view = show
 
 
@@ -191,6 +198,11 @@
                 txt = "raise block"
             else:
                 txt = "codeless block"
+        if len(self.inputargs) > 0:
+            if len(self.inputargs) > 1:
+                txt += '[%s...]' % (self.inputargs[0],)
+            else:
+                txt += '[%s]' % (self.inputargs[0],)
         return txt
 
     def __repr__(self):
diff --git a/rpython/memory/gctransform/asmgcroot.py 
b/rpython/memory/gctransform/asmgcroot.py
--- a/rpython/memory/gctransform/asmgcroot.py
+++ b/rpython/memory/gctransform/asmgcroot.py
@@ -341,6 +341,9 @@
         # called first, to initialize self.belongs_to_current_thread.
         assert not hasattr(self, 'gc_detach_callback_pieces_ptr')
 
+    def postprocess_graph(self, gct, graph, any_inlining):
+        pass
+
     def walk_stack_roots(self, collect_stack_root, is_minor=False):
         gcdata = self.gcdata
         gcdata._gc_collect_stack_root = collect_stack_root
diff --git a/rpython/memory/gctransform/framework.py 
b/rpython/memory/gctransform/framework.py
--- a/rpython/memory/gctransform/framework.py
+++ b/rpython/memory/gctransform/framework.py
@@ -205,6 +205,9 @@
             if minimal_transform:
                 self.need_minimal_transform(graph)
             if inline:
+                assert minimal_transform, (
+                    "%r has both inline=True and minimal_transform=False"
+                    % (graph,))
                 self.graphs_to_inline[graph] = True
             return annhelper.graph2const(graph)
 
@@ -410,7 +413,7 @@
         self.identityhash_ptr = getfn(GCClass.identityhash.im_func,
                                       [s_gc, s_gcref],
                                       annmodel.SomeInteger(),
-                                      minimal_transform=False, inline=True)
+                                      minimal_transform=False)
         if getattr(GCClass, 'obtain_free_space', False):
             self.obtainfreespace_ptr = getfn(GCClass.obtain_free_space.im_func,
                                              [s_gc, annmodel.SomeInteger()],
@@ -419,7 +422,6 @@
         if GCClass.moving_gc:
             self.id_ptr = getfn(GCClass.id.im_func,
                                 [s_gc, s_gcref], annmodel.SomeInteger(),
-                                inline = True,
                                 minimal_transform = False)
         else:
             self.id_ptr = None
@@ -600,6 +602,9 @@
                     "the custom trace hook %r for %r can cause "
                     "the GC to be called!" % (func, TP))
 
+    def postprocess_graph(self, graph, any_inlining):
+        self.root_walker.postprocess_graph(self, graph, any_inlining)
+
     def consider_constant(self, TYPE, value):
         self.layoutbuilder.consider_constant(TYPE, value, self.gcdata.gc)
 
diff --git a/rpython/memory/gctransform/shadowcolor.py 
b/rpython/memory/gctransform/shadowcolor.py
new file mode 100644
--- /dev/null
+++ b/rpython/memory/gctransform/shadowcolor.py
@@ -0,0 +1,803 @@
+from rpython.rtyper.lltypesystem import lltype, llmemory
+from rpython.flowspace.model import mkentrymap, checkgraph, Block, Link
+from rpython.flowspace.model import Variable, Constant, SpaceOperation
+from rpython.tool.algo.regalloc import perform_register_allocation
+from rpython.tool.algo.unionfind import UnionFind
+from rpython.translator.unsimplify import varoftype, insert_empty_block
+from rpython.translator.unsimplify import insert_empty_startblock, split_block
+from rpython.translator.simplify import join_blocks
+from rpython.rlib.rarithmetic import intmask
+from collections import defaultdict
+
+
+def is_trivial_rewrite(op):
+    return (op.opname in ('same_as', 'cast_pointer', 'cast_opaque_ptr')
+                and isinstance(op.args[0], Variable))
+
+
+def find_predecessors(graph, pending_pred):
+    """Return the set of variables whose content can end up inside one
+    of the 'pending_pred', which is a list of (block, var) tuples.
+    """
+    entrymap = mkentrymap(graph)
+    if len(entrymap[graph.startblock]) != 1:
+        insert_empty_startblock(graph)
+        entrymap = mkentrymap(graph)
+
+    pred = set([v for block, v in pending_pred])
+
+    def add(block, v):
+        if isinstance(v, Variable):
+            if v not in pred:
+                pending_pred.append((block, v))
+                pred.add(v)
+
+    while pending_pred:
+        block, v = pending_pred.pop()
+        if v in block.inputargs:
+            var_index = block.inputargs.index(v)
+            for link in entrymap[block]:
+                prevblock = link.prevblock
+                if prevblock is not None:
+                    add(prevblock, link.args[var_index])
+        else:
+            for op in block.operations:
+                if op.result is v:
+                    if is_trivial_rewrite(op):
+                        add(block, op.args[0])
+                    break
+    return pred
+
+
+def find_successors(graph, pending_succ):
+    """Return the set of variables where one of the 'pending_succ' can
+    end up.  'block_succ' is a list of (block, var) tuples.
+    """
+    succ = set([v for block, v in pending_succ])
+
+    def add(block, v):
+        if isinstance(v, Variable):
+            if v not in succ:
+                pending_succ.append((block, v))
+                succ.add(v)
+
+    while pending_succ:
+        block, v = pending_succ.pop()
+        for op in block.operations:
+            if op.args and v is op.args[0] and is_trivial_rewrite(op):
+                add(block, op.result)
+        for link in block.exits:
+            for i, v1 in enumerate(link.args):
+                if v1 is v:
+                    add(link.target, link.target.inputargs[i])
+    return succ
+
+
+def find_interesting_variables(graph):
+    # Decide which variables are "interesting" or not.  Interesting
+    # variables contain at least the ones that appear in gc_push_roots
+    # and gc_pop_roots.
+    pending_pred = []
+    pending_succ = []
+    interesting_vars = set()
+    for block in graph.iterblocks():
+        for op in block.operations:
+            if op.opname == 'gc_push_roots':
+                for v in op.args:
+                    if not isinstance(v, Variable):
+                        continue
+                    interesting_vars.add(v)
+                    pending_pred.append((block, v))
+            elif op.opname == 'gc_pop_roots':
+                for v in op.args:
+                    if not isinstance(v, Variable):
+                        continue
+                    assert v in interesting_vars   # must be pushed just above
+                    pending_succ.append((block, v))
+    if not interesting_vars:
+        return None
+
+    # If there is a path from a gc_pop_roots(v) to a subsequent
+    # gc_push_roots(w) where w contains the same value as v along that
+    # path, then we consider all intermediate blocks along that path
+    # which contain a copy of the same value, and add these variables
+    # as "interesting", too.  Formally, a variable in a block is
+    # "interesting" if it is both a "predecessor" and a "successor",
+    # where predecessors are variables which (sometimes) end in a
+    # gc_push_roots, and successors are variables which (sometimes)
+    # come from a gc_pop_roots.
+    pred = find_predecessors(graph, pending_pred)
+    succ = find_successors(graph, pending_succ)
+    interesting_vars |= (pred & succ)
+
+    return interesting_vars
+
+
+def allocate_registers(graph):
+    interesting_vars = find_interesting_variables(graph)
+    if not interesting_vars:
+        return None
+    regalloc = perform_register_allocation(graph, 
interesting_vars.__contains__)
+    assert regalloc.graph is graph
+    regalloc.find_num_colors()
+    return regalloc
+
+
+def _gc_save_root(index, var):
+    c_index = Constant(index, lltype.Signed)
+    return SpaceOperation('gc_save_root', [c_index, var],
+                          varoftype(lltype.Void))
+
+def _gc_restore_root(index, var):
+    c_index = Constant(index, lltype.Signed)
+    return SpaceOperation('gc_restore_root', [c_index, var],
+                          varoftype(lltype.Void))
+
+def make_bitmask(filled, graph='?'):
+    n = filled.count(False)
+    if n == 0:
+        return (None, None)
+    bitmask = 0
+    last_index = 0
+    for i in range(len(filled)):
+        if not filled[i]:
+            bitmask <<= (i - last_index)
+            last_index = i
+            bitmask |= 1
+    assert bitmask & 1
+    if bitmask != intmask(bitmask):
+        raise GCBitmaskTooLong("the graph %r is too complex: cannot create "
+                               "a bitmask telling than more than 31/63 "
+                               "shadowstack entries are unused" % (graph,))
+    # the mask is always a positive value, but it is replaced by a
+    # negative value during a minor collection root walking.  Then,
+    # if the next minor collection finds an already-negative value,
+    # we know we can stop.  So that's why we don't include here an
+    # optimization to not re-write a same-valued mask: it is important
+    # to re-write the value, to turn it from potentially negative back
+    # to positive, in order to mark this shadow frame as modified.
+    assert bitmask > 0
+    return (last_index, bitmask)
+
+
+def expand_one_push_roots(regalloc, args):
+    if regalloc is None:
+        assert len(args) == 0
+    else:
+        filled = [False] * regalloc.numcolors
+        for v in args:
+            index = regalloc.getcolor(v)
+            assert not filled[index]
+            filled[index] = True
+            yield _gc_save_root(index, v)
+        bitmask_index, bitmask = make_bitmask(filled, regalloc.graph)
+        if bitmask_index is not None:
+            # xxx we might in some cases avoid this gc_save_root
+            # entirely, if we know we're after another gc_push/gc_pop
+            # that wrote exactly the same mask at the same index
+            bitmask_c = Constant(bitmask, lltype.Signed)
+            yield _gc_save_root(bitmask_index, bitmask_c)
+
+def expand_one_pop_roots(regalloc, args):
+    if regalloc is None:
+        assert len(args) == 0
+    else:
+        for v in args:
+            index = regalloc.getcolor(v)
+            yield _gc_restore_root(index, v)
+
+
+def expand_push_roots(graph, regalloc):
+    """Expand gc_push_roots into a series of gc_save_root, including
+    writing a bitmask tag to mark some entries as not-in-use.
+    (If regalloc is None, it will still remove empty gc_push_roots.)
+    """
+    for block in graph.iterblocks():
+        any_change = False
+        newops = []
+        for op in block.operations:
+            if op.opname == 'gc_push_roots':
+                args = [v for v in op.args if isinstance(v, Variable)]
+                newops += expand_one_push_roots(regalloc, args)
+                any_change = True
+            else:
+                newops.append(op)
+        if any_change:
+            block.operations = newops
+
+
+def move_pushes_earlier(graph, regalloc):
+    """gc_push_roots and gc_pop_roots are pushes/pops to the shadowstack,
+    immediately enclosing the operation that needs them (typically a call).
+    Here, we try to move individual pushes earlier.
+
+    Should run after expand_push_roots(), but before expand_pop_roots(),
+    so that it sees individual 'gc_save_root' operations but bulk
+    'gc_pop_roots' operations.
+    """
+    # Concrete example (assembler tested on x86-64 gcc 5.3 and clang 3.7):
+    #
+    # ----original----           ----move_pushes_earlier----
+    #
+    # while (a > 10) {           *foo = b;
+    #     *foo = b;              while (a > 10) {
+    #     a = g(a);                  a = g(a);
+    #     b = *foo;                  b = *foo;
+    #                                // *foo = b;
+    # }                          }
+    # return b;                  return b;
+    #
+    # => the store and the       => the store is before, and gcc/clang
+    # load are in the loop,      moves the load after the loop
+    # even in the assembler      (the commented-out '*foo=b' is removed
+    #                            here, but gcc/clang would also remove it)
+
+    # Draft of the algorithm: see shadowcolor.txt
+
+    if not regalloc:
+        return
+
+    entrymap = mkentrymap(graph)
+    assert len(entrymap[graph.startblock]) == 1
+
+    inputvars = {}    # {inputvar: (its block, its index in inputargs)}
+    for block in graph.iterblocks():
+        for i, v in enumerate(block.inputargs):
+            inputvars[v] = (block, i)
+
+    Plist = []
+
+    for index in range(regalloc.numcolors):
+        U = UnionFind()
+
+        S = set()
+        for block in graph.iterblocks():
+            for op in reversed(block.operations):
+                if op.opname == 'gc_pop_roots':
+                    break
+            else:
+                continue   # no gc_pop_roots in this block
+            for v in op.args:
+                if isinstance(v, Variable) and regalloc.checkcolor(v, index):
+                    break
+            else:
+                continue   # no variable goes into index i
+
+            succ = set()
+            pending_succ = [(block, v)]
+            while pending_succ:
+                block1, v1 = pending_succ.pop()
+                assert regalloc.checkcolor(v1, index)
+                for op1 in block1.operations:
+                    if is_trivial_rewrite(op1) and op1.args[0] is v1:
+                        if regalloc.checkcolor(op1.result, index):
+                            pending_succ.append((block1, op1.result))
+                for link1 in block1.exits:
+                    for i2, v2 in enumerate(link1.args):
+                        if v2 is not v1:
+                            continue
+                        block2 = link1.target
+                        w2 = block2.inputargs[i2]
+                        if w2 in succ or not regalloc.checkcolor(w2, index):
+                            continue
+                        succ.add(w2)
+                        for op2 in block2.operations:
+                            if op2.opname in ('gc_save_root', 'gc_pop_roots'):
+                                break
+                        else:
+                            pending_succ.append((block2, w2))
+            U.union_list(list(succ))
+            S.update(succ)
+
+        G = defaultdict(set)
+        for block in graph.iterblocks():
+            found = False
+            for opindex, op in enumerate(block.operations):
+                if op.opname == 'gc_save_root':
+                    if (isinstance(op.args[1], Constant) and
+                        op.args[1].concretetype == lltype.Signed):
+                        break
+                    elif op.args[0].value == index:
+                        found = True
+                        break
+            if not found or not isinstance(op.args[1], Variable):
+                continue   # no matching gc_save_root in this block
+
+            key = (block, op)
+            pred = set()
+            pending_pred = [(block, op.args[1], opindex)]
+            while pending_pred:
+                block1, v1, opindex1 = pending_pred.pop()
+                assert regalloc.getcolor(v1) == index
+                for i in range(opindex1-1, -1, -1):
+                    op1 = block1.operations[i]
+                    if op1.opname == 'gc_pop_roots':
+                        break    # stop
+                    if op1.result is v1:
+                        if not is_trivial_rewrite(op1):
+                            break   # stop
+                        if not regalloc.checkcolor(op1.args[0], index):
+                            break   # stop
+                        v1 = op1.args[0]
+                else:
+                    varindex = block1.inputargs.index(v1)
+                    if v1 in pred:
+                        continue    # already done
+                    pred.add(v1)
+                    for link1 in entrymap[block1]:
+                        prevblock1 = link1.prevblock
+                        if prevblock1 is not None:
+                            w1 = link1.args[varindex]
+                            if isinstance(w1, Variable) and w1 not in pred:
+                                if regalloc.checkcolor(w1, index):
+                                    pending_pred.append((prevblock1, w1,
+                                                len(prevblock1.operations)))
+            U.union_list(list(pred))
+            for v1 in pred:
+                G[v1].add(key)
+
+        M = S.intersection(G)
+
+        parts_target = {}
+        for v in M:
+            vp = U.find_rep(v)
+            if vp not in parts_target:
+                new_part = (index, set(), set())
+                # (index,
+                #  subset P of variables,
+                #  set of (block, gc_save_root))
+                Plist.append(new_part)
+                parts_target[vp] = new_part
+            part = parts_target[vp]
+            part[1].add(v)
+            part[2].update(G[v])
+
+    # Sort P so that it prefers places that would avoid multiple
+    # gcsaveroots (smaller 'heuristic' result, so first in sorted
+    # order); but also prefers smaller overall pieces, because it
+    # might be possible to remove several small-scale pieces instead
+    # of one big-scale one.
+    def heuristic((index, P, gcsaveroots)):
+        return float(len(P)) / len(gcsaveroots)
+    Plist.sort(key=heuristic)
+
+    variables_along_changes = {}
+    live_at_start_of_block = set()   # set of (block, index)
+    insert_gc_push_root = defaultdict(list)
+
+    for index, P, gcsaveroots in Plist:
+        # if this Plist entry is not valid any more because of changes
+        # done by the previous entries, drop it
+        if any((inputvars[v][0], index) in live_at_start_of_block for v in P):
+            continue
+        if any(op not in block.operations for block, op in gcsaveroots):
+            continue
+        for v in P:
+            assert regalloc.getcolor(v) == index
+            assert v not in variables_along_changes
+
+        success_count = 0
+        mark = []
+
+        for v in P:
+            block, varindex = inputvars[v]
+            for link in entrymap[block]:
+                w = link.args[varindex]
+                if link.prevblock is not None:
+                    prevoperations = link.prevblock.operations
+                else:
+                    prevoperations = []
+                for op in reversed(prevoperations):
+                    if op.opname == 'gc_pop_roots':
+                        # it is possible to have gc_pop_roots() without
+                        # w in the args, if w is the result of the call
+                        # that comes just before.
+                        if (isinstance(w, Variable) and
+                                w in op.args and
+                                regalloc.checkcolor(w, index)):
+                            success_count += 1
+                        else:
+                            mark.append((index, link, varindex))
+                        break
+                    if op.result is w:
+                        if is_trivial_rewrite(op) and (
+                                regalloc.checkcolor(op.args[0], index)):
+                            w = op.args[0]
+                        else:
+                            mark.append((index, link, varindex))
+                            break
+                else:
+                    if not isinstance(w, Variable) or w not in P:
+                        mark.append((index, link, varindex))
+
+        if success_count > 0:
+            for block, op in gcsaveroots:
+                newops = list(block.operations)
+                newops.remove(op)
+                block.operations = newops
+            for index, link, varindex in mark:
+                insert_gc_push_root[link].append((index, link.args[varindex]))
+            for v in P:
+                block, varindex = inputvars[v]
+                variables_along_changes[v] = block, index
+                live_at_start_of_block.add((block, index))
+
+    for link in insert_gc_push_root:
+        newops = [_gc_save_root(index, v)
+                  for index, v in sorted(insert_gc_push_root[link])]
+        insert_empty_block(link, newops=newops)
+
+
+def expand_pop_roots(graph, regalloc):
+    """gc_pop_roots => series of gc_restore_root; this is done after
+    move_pushes_earlier() because that one doesn't work correctly if
+    a completely-empty gc_pop_roots is removed.
+
+    Also notice in-block code sequences like gc_pop_roots(v) followed
+    by a gc_save_root(v), and drop the gc_save_root.
+    """
+    drop = {}
+    for block in graph.iterblocks():
+        any_change = False
+        newops = []
+        for op in block.operations:
+            if op.opname == 'gc_pop_roots':
+                args = [v for v in op.args if isinstance(v, Variable)]
+                expanded = list(expand_one_pop_roots(regalloc, args))
+                drop = {}
+                for op1 in expanded:
+                    if isinstance(op1.args[1], Variable):
+                        drop[op1.args[1]] = op1.args[0].value
+                newops += expanded
+                any_change = True
+            elif (op.opname == 'gc_save_root' and
+                      drop.get(op.args[1]) == op.args[0].value):
+                any_change = True    # kill the operation
+            else:
+                newops.append(op)
+        if any_change:
+            block.operations = newops
+
+
+def add_enter_leave_roots_frame(graph, regalloc, c_gcdata):
+    # put 'gc_enter_roots_frame' as late as possible, but before the
+    # first 'gc_save_root' is reached.
+    #
+    # put the 'gc_leave_roots_frame' operations as early as possible,
+    # that is, just after the last 'gc_restore_root' reached.  This is
+    # done by putting it along a link, such that the previous block
+    # contains a 'gc_restore_root' and from the next block it is not
+    # possible to reach any extra 'gc_restore_root'; then, as doing
+    # this is not as precise as we'd like, we first break every block
+    # just after their last 'gc_restore_root'.
+    if regalloc is None:
+        return
+
+    # break blocks after their last 'gc_restore_root', unless they
+    # are already at the last position
+    for block in graph.iterblocks():
+        ops = block.operations
+        for i in range(len(ops)-1, -1, -1):
+            if ops[i].opname == 'gc_restore_root':
+                if i < len(ops) - 1:
+                    split_block(block, i + 1)
+                break
+    # done
+
+    insert_empty_startblock(graph)
+    entrymap = mkentrymap(graph)
+
+    # helpers
+
+    def is_interesting_op(op):
+        if op.opname == 'gc_restore_root':
+            return True
+        if op.opname == 'gc_save_root':
+            # ignore saves that say "everything is free"
+            return not (isinstance(op.args[1], Constant) and
+                        isinstance(op.args[1].value, int) and
+                        op.args[1].value == bitmask_all_free)
+        return False
+    bitmask_all_free = (1 << regalloc.numcolors) - 1
+
+    def insert_along_link(link, opname, args, cache):
+        b2 = link.target
+        if b2 not in cache:
+            newblock = Block([v.copy() for v in b2.inputargs])
+            newblock.operations.append(
+                SpaceOperation(opname, args, varoftype(lltype.Void)))
+            newblock.closeblock(Link(list(newblock.inputargs), b2))
+            cache[b2] = newblock
+        link.target = cache[b2]
+
+    # make a list of blocks with gc_save_root/gc_restore_root in them
+    interesting_blocks = []
+    for block in graph.iterblocks():
+        for op in block.operations:
+            if is_interesting_op(op):
+                assert block is not graph.startblock
+                assert block is not graph.returnblock
+                interesting_blocks.append(block)
+                break    # interrupt this block, go to the next one
+
+    # compute the blocks such that 'gc_save_root/gc_restore_root'
+    # exist anywhere before the start of this block
+    before_blocks = set()
+    pending = list(interesting_blocks)
+    seen = set(pending)
+    while pending:
+        block = pending.pop()
+        for link in block.exits:
+            before_blocks.add(link.target)
+            if link.target not in seen:
+                seen.add(link.target)
+                pending.append(link.target)
+    assert graph.startblock not in before_blocks
+
+    # compute the blocks such that 'gc_save_root/gc_restore_root'
+    # exist anywhere after the start of this block
+    after_blocks = set(interesting_blocks)
+    pending = list(interesting_blocks)
+    while pending:
+        block = pending.pop()
+        for link in entrymap[block]:
+            if link.prevblock is not None:
+                if link.prevblock not in after_blocks:
+                    after_blocks.add(link.prevblock)
+                    pending.append(link.prevblock)
+    assert graph.returnblock not in after_blocks
+
+    # this is the set of blocks such that, at the start of the block,
+    # we're "in frame", i.e. there are 'gc_save_root/gc_restore_root'
+    # both before and after the start of the block.
+    inside_blocks = before_blocks & after_blocks
+    inside_or_interesting_blocks = set(interesting_blocks) | inside_blocks
+
+    # if a block contains gc_save_root/gc_restore_root but is not
+    # an "inside_block", then add gc_enter_roots_frame where needed
+    c_num = Constant(regalloc.numcolors, lltype.Signed)
+    for block in interesting_blocks:
+        if block not in inside_blocks:
+            i = 0
+            while not is_interesting_op(block.operations[i]):
+                i += 1
+            block.operations.insert(i,
+                SpaceOperation('gc_enter_roots_frame', [c_gcdata, c_num],
+                               varoftype(lltype.Void)))
+
+    # If a link goes from a "non-inside, non-interesting block"
+    # straight to an "inside_block", insert a gc_enter_roots_frame
+    # along the link.  Similarly, if a block is a "inside-or-
+    # interesting_block" and exits with a link going to a
+    # "non-inside_block", then insert a gc_leave_roots_frame along the
+    # link.
+    cache1 = {}
+    cache2 = {}
+    for block in list(graph.iterblocks()):
+        if block not in inside_or_interesting_blocks:
+            for link in block.exits:
+                if link.target in inside_blocks:
+                    insert_along_link(link, 'gc_enter_roots_frame',
+                                      [c_gcdata, c_num], cache1)
+        else:
+            for link in block.exits:
+                if link.target not in inside_blocks:
+                    insert_along_link(link, 'gc_leave_roots_frame',
+                                      [], cache2)
+
+    # check all blocks not in "inside_block": they might contain a
+    # gc_save_root() that writes the bitmask meaning "everything is
+    # free".  Look only before gc_enter_roots_frame, if there is one
+    # in that block.  Remove these out-of-frame gc_save_root().
+    for block in graph.iterblocks():
+        if block not in inside_blocks:
+            newops = []
+            for i, op in enumerate(block.operations):
+                if op.opname == 'gc_enter_roots_frame':
+                    newops.extend(block.operations[i:])
+                    break
+                if op.opname == 'gc_save_root' and not is_interesting_op(op):
+                    pass   # don't add in newops
+                else:
+                    newops.append(op)
+            if len(newops) < len(block.operations):
+                block.operations = newops
+
+    join_blocks(graph)  # for the extra new blocks made in this function
+
+
+class GCBitmaskTooLong(Exception):
+    pass
+
+class PostProcessCheckError(Exception):
+    pass
+
+def postprocess_double_check(graph):
+    # Debugging only: double-check that the placement is correct.
+    # Assumes that every gc_restore_root() indicates that the variable
+    # must be saved at the given position in the shadowstack frame (in
+    # practice it may have moved because of the GC, but in theory it
+    # is still the "same" object).  So we build the set of all known
+    # valid-in-all-paths saved locations, and check that.
+
+    saved = {}  # {var-from-inputargs: location} where location is:
+                #    <unset>: we haven't seen this variable so far
+                #    set-of-indexes: says where the variable is always
+                #                    saved at the start of this block
+                #    empty-set: same as above, so: "saved nowhere"
+
+    in_frame = {}   # {block: bool}, tells if, at the start of this block,
+                    # we're in status "frame entered" or not
+
+    in_frame[graph.startblock] = False
+    pending = set([graph.startblock])
+    while pending:
+        block = pending.pop()
+        locsaved = {}
+        currently_in_frame = in_frame[block]
+        if currently_in_frame:
+            for v in block.inputargs:
+                locsaved[v] = saved[v]
+        for op in block.operations:
+            if op.opname == 'gc_restore_root':
+                if not currently_in_frame:
+                    raise PostProcessCheckError(graph, block, op, 'no frame!')
+                if isinstance(op.args[1], Constant):
+                    continue
+                num = op.args[0].value
+                if num not in locsaved[op.args[1]]:
+                    raise PostProcessCheckError(graph, block, op, num, 
locsaved)
+            elif op.opname == 'gc_save_root':
+                if not currently_in_frame:
+                    raise PostProcessCheckError(graph, block, op, 'no frame!')
+                num = op.args[0].value
+                # first, cancel any other variable that would be saved in 'num'
+                for v in locsaved:
+                    locsaved[v] = locsaved[v].difference([num])
+                #
+                v = op.args[1]
+                if isinstance(v, Variable):
+                    locsaved[v] = locsaved[v].union([num])
+                else:
+                    if v.concretetype != lltype.Signed:
+                        locsaved[v] = locsaved.get(v, frozenset()).union([num])
+                        continue
+                    bitmask = v.value
+                    if bitmask != 1:
+                        # cancel any variable that would be saved in any
+                        # position shown by the bitmask, not just 'num'
+                        assert bitmask & 1
+                        assert 1 < bitmask < (2<<num)
+                        nummask = [i for i in range(num+1)
+                                     if bitmask & (1<<(num-i))]
+                        assert nummask[-1] == num
+                        for v in locsaved:
+                            locsaved[v] = locsaved[v].difference(nummask)
+            elif op.opname == 'gc_enter_roots_frame':
+                if currently_in_frame:
+                    raise PostProcessCheckError(graph, block, op,'double 
enter')
+                currently_in_frame = True
+                # initialize all local variables so far with "not seen 
anywhere"
+                # (already done, apart from block.inputargs)
+                for v in block.inputargs:
+                    locsaved[v] = frozenset()
+            elif op.opname == 'gc_leave_roots_frame':
+                if not currently_in_frame:
+                    raise PostProcessCheckError(graph, block, op, 'not 
entered')
+                currently_in_frame = False
+            elif is_trivial_rewrite(op) and currently_in_frame:
+                locsaved[op.result] = locsaved[op.args[0]]
+            else:
+                locsaved[op.result] = frozenset()
+        for link in block.exits:
+            changed = False
+            if link.target not in in_frame:
+                in_frame[link.target] = currently_in_frame
+                changed = True
+            else:
+                if in_frame[link.target] != currently_in_frame:
+                    raise PostProcessCheckError(graph, link.target,
+                                                'inconsistent in_frame')
+            if currently_in_frame:
+                for i, v in enumerate(link.args):
+                    try:
+                        loc = locsaved[v]
+                    except KeyError:
+                        assert isinstance(v, Constant)
+                        loc = frozenset()
+                    w = link.target.inputargs[i]
+                    if w in saved:
+                        if loc == saved[w]:
+                            continue      # already up-to-date
+                        loc = loc.intersection(saved[w])
+                    saved[w] = loc
+                    changed = True
+            if changed:
+                pending.add(link.target)
+
+    if in_frame.get(graph.returnblock, False):
+        raise PostProcessCheckError(graph, 'missing gc_leave_roots_frame')
+    assert graph.getreturnvar() not in saved   # missing gc_leave_roots_frame?
+
+
+def postprocess_graph(graph, c_gcdata):
+    """Collect information about the gc_push_roots and gc_pop_roots
+    added in this complete graph, and replace them with real operations.
+    """
+    regalloc = allocate_registers(graph)
+    expand_push_roots(graph, regalloc)
+    move_pushes_earlier(graph, regalloc)
+    expand_pop_roots(graph, regalloc)
+    add_enter_leave_roots_frame(graph, regalloc, c_gcdata)
+    checkgraph(graph)
+    postprocess_double_check(graph)
+    return (regalloc is not None)
+
+
+def postprocess_inlining(graph):
+    """We first write calls to GC functions with gc_push_roots(...) and
+    gc_pop_roots(...) around.  Then we inline some of these functions.
+    As a result, the gc_push_roots and gc_pop_roots are no longer in
+    the same block.  Fix that by moving the gc_push_roots/gc_pop_roots
+    inside the inlined portion of the graph, around every call.
+
+    We could also get a correct result by doing things in a different
+    order, e.g. first postprocess_graph() and then inlining.  However,
+    this order brings an important benefit: if the inlined graph has a
+    fast-path, like malloc_fixedsize(), then there are no gc_push_roots
+    and gc_pop_roots left along the fast-path.
+    """
+    for block in graph.iterblocks():
+        for i in range(len(block.operations)-1, -1, -1):
+            op = block.operations[i]
+            if op.opname == 'gc_pop_roots':
+                break
+            if op.opname == 'gc_push_roots':
+                _fix_graph_after_inlining(graph, block, i)
+                break
+    checkgraph(graph)
+
+def _fix_graph_after_inlining(graph, initial_block, initial_index):
+    op = initial_block.operations.pop(initial_index)
+    assert op.opname == 'gc_push_roots'
+    seen = set()
+    pending = [(initial_block, initial_index, op.args)]
+    while pending:
+        block, start_index, track_args = pending.pop()
+        if block in seen:
+            continue
+        seen.add(block)
+        assert block.operations != ()     # did not find the gc_pop_roots?
+        new_operations = block.operations[:start_index]
+        stop = False
+        for i in range(start_index, len(block.operations)):
+            op = block.operations[i]
+            if op.opname == 'gc_push_roots':
+                raise Exception("%r: seems to have inlined inside it another "
+                                "graph which also uses GC roots" % (graph,))
+            if op.opname == 'gc_pop_roots':
+                # end of the inlined graph, drop gc_pop_roots, keep the tail
+                new_operations += block.operations[i + 1:]
+                stop = True
+                break
+            if op.opname in ('direct_call', 'indirect_call'):
+                new_operations.append(SpaceOperation('gc_push_roots',
+                                                     track_args[:],
+                                                     varoftype(lltype.Void)))
+                new_operations.append(op)
+                new_operations.append(SpaceOperation('gc_pop_roots',
+                                                     track_args[:],
+                                                     varoftype(lltype.Void)))
+            else:
+                new_operations.append(op)
+        block.operations = new_operations
+        if not stop:
+            for link in block.exits:
+                track_next = []
+                for v in track_args:
+                    if not isinstance(v, Variable):
+                        continue
+                    i = link.args.index(v)   # should really be here
+                    w = link.target.inputargs[i]
+                    track_next.append(w)
+                pending.append((link.target, 0, track_next))
diff --git a/rpython/memory/gctransform/shadowcolor.txt 
b/rpython/memory/gctransform/shadowcolor.txt
new file mode 100644
--- /dev/null
+++ b/rpython/memory/gctransform/shadowcolor.txt
@@ -0,0 +1,46 @@
+
+
+for every frame index i:
+
+  S_i = { variable: non-empty-set-of-sources }
+
+    source = a variable popped by gc_pop_roots from frame index 'i',
+             only looking at the last gc_pop_roots in each block
+    keys = variables that appear in inputargs, where a value from a
+           'source' can come from
+
+  G_i = { variable: non-empty-set-of-targets }
+
+    target = a variable pushed by gc_push_roots into frame index 'i',
+             only looking at the first gc_push_roots in each block
+    keys = variables that appear in inputargs, where a value can
+           end up in a 'target'
+
+  M_i = S_i intersection G_i
+
+  "variable in M_i"  <==>  at the start of this block, this variable's
+  value is already saved in the frame index i (provided "success" below)
+
+  split M_i into a partition of independent parts; add (i, P), (i, P'), ...
+  to the global list
+
+
+for every (i, P), ideally in some suitable order:
+
+   for every variable in P, for every link entering this block:
+
+      if prevblock's corresponding variable is from the last gc_pop_roots
+      of that block, at index i:
+
+        *success = True*
+
+      elif prevblock's corresponding variable is not in P:
+
+        mark the link
+
+   if success:
+
+      insert a new gc_save_root() along all links marked above;
+      remove the original gc_save_root
+
+      for any P' after P that has any variables in common, kill that P'
diff --git a/rpython/memory/gctransform/shadowstack.py 
b/rpython/memory/gctransform/shadowstack.py
--- a/rpython/memory/gctransform/shadowstack.py
+++ b/rpython/memory/gctransform/shadowstack.py
@@ -3,6 +3,7 @@
 from rpython.rlib.debug import ll_assert
 from rpython.rlib.nonconst import NonConstant
 from rpython.rlib import rgc
+from rpython.rlib.objectmodel import specialize
 from rpython.rtyper import rmodel
 from rpython.rtyper.annlowlevel import llhelper
 from rpython.rtyper.lltypesystem import lltype, llmemory
@@ -10,6 +11,7 @@
 from rpython.memory.gctransform.framework import (
      BaseFrameworkGCTransformer, BaseRootWalker, sizeofaddr)
 from rpython.rtyper.rbuiltin import gen_cast
+from rpython.memory.gctransform.log import log
 
 
 class ShadowStackFrameworkGCTransformer(BaseFrameworkGCTransformer):
@@ -29,30 +31,43 @@
     def push_roots(self, hop, keep_current_args=False):
         livevars = self.get_livevars_for_roots(hop, keep_current_args)
         self.num_pushs += len(livevars)
-        if not livevars:
-            return []
-        c_len = rmodel.inputconst(lltype.Signed, len(livevars) )
-        base_addr = hop.genop("direct_call", [self.incr_stack_ptr, c_len ],
-                              resulttype=llmemory.Address)
-        for k,var in enumerate(livevars):
-            c_k = rmodel.inputconst(lltype.Signed, k * sizeofaddr)
-            v_adr = gen_cast(hop.llops, llmemory.Address, var)
-            hop.genop("raw_store", [base_addr, c_k, v_adr])
+        hop.genop("gc_push_roots", livevars)
         return livevars
 
     def pop_roots(self, hop, livevars):
-        if not livevars:
-            return
-        c_len = rmodel.inputconst(lltype.Signed, len(livevars) )
-        base_addr = hop.genop("direct_call", [self.decr_stack_ptr, c_len ],
-                              resulttype=llmemory.Address)
-        if self.gcdata.gc.moving_gc:
-            # for moving collectors, reload the roots into the local variables
-            for k,var in enumerate(livevars):
-                c_k = rmodel.inputconst(lltype.Signed, k * sizeofaddr)
-                v_newaddr = hop.genop("raw_load", [base_addr, c_k],
-                                      resulttype=llmemory.Address)
-                hop.genop("gc_reload_possibly_moved", [v_newaddr, var])
+        hop.genop("gc_pop_roots", livevars)
+        # NB. we emit it even if len(livevars) == 0; this is needed for
+        # shadowcolor.move_pushes_earlier()
+
+
+@specialize.call_location()
+def walk_stack_root(invoke, arg0, arg1, start, addr, is_minor):
+    skip = 0
+    while addr != start:
+        addr -= sizeofaddr
+        #XXX reintroduce support for tagged values?
+        #if gc.points_to_valid_gc_object(addr):
+        #    callback(gc, addr)
+
+        if skip & 1 == 0:
+            content = addr.address[0]
+            n = llmemory.cast_adr_to_int(content)
+            if n & 1 == 0:
+                if content:   # non-0, non-odd: a regular ptr
+                    invoke(arg0, arg1, addr)
+            else:
+                # odd number: a skip bitmask
+                if n > 0:       # initially, an unmarked value
+                    if is_minor:
+                        newcontent = llmemory.cast_int_to_adr(-n)
+                        addr.address[0] = newcontent   # mark
+                    skip = n
+                else:
+                    # a marked value
+                    if is_minor:
+                        return
+                    skip = -n
+        skip >>= 1
 
 
 class ShadowStackRootWalker(BaseRootWalker):
@@ -73,14 +88,8 @@
             return top
         self.decr_stack = decr_stack
 
-        def walk_stack_root(callback, start, end):
-            gc = self.gc
-            addr = end
-            while addr != start:
-                addr -= sizeofaddr
-                if gc.points_to_valid_gc_object(addr):
-                    callback(gc, addr)
-        self.rootstackhook = walk_stack_root
+        self.invoke_collect_stack_root = specialize.call_location()(
+            lambda arg0, arg1, addr: arg0(self.gc, addr))
 
         self.shadow_stack_pool = ShadowStackPool(gcdata)
         rsd = gctransformer.root_stack_depth
@@ -101,8 +110,9 @@
 
     def walk_stack_roots(self, collect_stack_root, is_minor=False):
         gcdata = self.gcdata
-        self.rootstackhook(collect_stack_root,
-                           gcdata.root_stack_base, gcdata.root_stack_top)
+        walk_stack_root(self.invoke_collect_stack_root, collect_stack_root,
+                        None, gcdata.root_stack_base, gcdata.root_stack_top,
+                        is_minor=is_minor)
 
     def need_thread_support(self, gctransformer, getfn):
         from rpython.rlib import rthread    # xxx fish
@@ -208,7 +218,7 @@
 
         self.thread_setup = thread_setup
         self.thread_run_ptr = getfn(thread_run, [], annmodel.s_None,
-                                    inline=True, minimal_transform=False)
+                                    minimal_transform=False)
         self.thread_die_ptr = getfn(thread_die, [], annmodel.s_None,
                                     minimal_transform=False)
         # no thread_before_fork_ptr here
@@ -222,6 +232,16 @@
         from rpython.rlib import _stacklet_shadowstack
         _stacklet_shadowstack.complete_destrptr(gctransformer)
 
+    def postprocess_graph(self, gct, graph, any_inlining):
+        from rpython.memory.gctransform import shadowcolor
+        if any_inlining:
+            shadowcolor.postprocess_inlining(graph)
+        use_push_pop = shadowcolor.postprocess_graph(graph, gct.c_const_gcdata)
+        if use_push_pop and graph in gct.graphs_to_inline:
+            log.WARNING("%r is marked for later inlining, "
+                        "but is using push/pop roots.  Disabled" % (graph,))
+            del gct.graphs_to_inline[graph]
+
 # ____________________________________________________________
 
 class ShadowStackPool(object):
@@ -310,11 +330,8 @@
 
     def customtrace(gc, obj, callback, arg):
         obj = llmemory.cast_adr_to_ptr(obj, SHADOWSTACKREFPTR)
-        addr = obj.top
-        start = obj.base
-        while addr != start:
-            addr -= sizeofaddr
-            gc._trace_callback(callback, arg, addr)
+        walk_stack_root(gc._trace_callback, callback, arg, obj.base, obj.top,
+                        is_minor=False)   # xxx optimize?
 
     gc = gctransformer.gcdata.gc
     assert not hasattr(gc, 'custom_trace_dispatcher')
diff --git a/rpython/memory/gctransform/test/test_shadowcolor.py 
b/rpython/memory/gctransform/test/test_shadowcolor.py
new file mode 100644
--- /dev/null
+++ b/rpython/memory/gctransform/test/test_shadowcolor.py
@@ -0,0 +1,791 @@
+from rpython.rtyper.lltypesystem import lltype, llmemory
+from rpython.rtyper.lltypesystem.lloperation import llop
+from rpython.rtyper.test.test_llinterp import gengraph
+from rpython.conftest import option
+from rpython.memory.gctransform.shadowcolor import *
+from rpython.flowspace import model as graphmodel
+from rpython.translator.simplify import join_blocks, cleanup_graph
+from hypothesis import given, strategies
+
+
+def make_graph(f, argtypes):
+    t, rtyper, graph = gengraph(f, argtypes, viewbefore=False)
+    if getattr(option, 'view', False):
+        graph.show()
+    return graph
+
+def nameof(v):
+    return v._name.rstrip('_')
+
+def summary(interesting_vars):
+    result = {}
+    for v in interesting_vars:
+        name = nameof(v)
+        result[name] = result.get(name, 0) + 1
+    return result
+
+def summary_regalloc(regalloc):
+    result = []
+    for block in regalloc.graph.iterblocks():
+        print block.inputargs
+        for op in block.operations:
+            print '\t', op
+        blockvars = block.inputargs + [op.result for op in block.operations]
+        for v in blockvars:
+            if regalloc.consider_var(v):
+                result.append((nameof(v), regalloc.getcolor(v)))
+                print '\t\t%s: %s' % (v, regalloc.getcolor(v))
+    result.sort()
+    return result
+
+
+def test_find_predecessors_1():
+    def f(a, b):
+        c = a + b
+        return c
+    graph = make_graph(f, [int, int])
+    pred = find_predecessors(graph, [(graph.returnblock, 
graph.getreturnvar())])
+    assert summary(pred) == {'c': 1, 'v': 1}
+
+def test_find_predecessors_2():
+    def f(a, b):
+        c = a + b
+        while a > 0:
+            a -= 2
+        return c
+    graph = make_graph(f, [int, int])
+    pred = find_predecessors(graph, [(graph.returnblock, 
graph.getreturnvar())])
+    assert summary(pred) == {'c': 3, 'v': 1}
+
+def test_find_predecessors_3():
+    def f(a, b):
+        while b > 100:
+            b -= 2
+        if b > 10:
+            c = a + b      # 'c' created in this block
+        else:
+            c = a - b      # 'c' created in this block
+        return c           # 'v' is the return var
+    graph = make_graph(f, [int, int])
+    pred = find_predecessors(graph, [(graph.returnblock, 
graph.getreturnvar())])
+    assert summary(pred) == {'c': 2, 'v': 1}
+
+def test_find_predecessors_4():
+    def f(a, b):           # 'a' in the input block
+        while b > 100:     # 'a' in the loop header block
+            b -= 2         # 'a' in the loop body block
+        if b > 10:         # 'a' in the condition block
+            while b > 5:   # nothing
+                b -= 2     # nothing
+            c = a + b      # 'c' created in this block
+        else:
+            c = a
+        return c           # 'v' is the return var
+    graph = make_graph(f, [int, int])
+    pred = find_predecessors(graph, [(graph.returnblock, 
graph.getreturnvar())])
+    assert summary(pred) == {'a': 4, 'c': 1, 'v': 1}
+
+def test_find_predecessors_trivial_rewrite():
+    def f(a, b):                              # 'b' in empty startblock
+        while a > 100:                        # 'b'
+            a -= 2                            # 'b'
+        c = llop.same_as(lltype.Signed, b)    # 'c', 'b'
+        while b > 10:                         # 'c'
+            b -= 2                            # 'c'
+        d = llop.same_as(lltype.Signed, c)    # 'd', 'c'
+        return d           # 'v' is the return var
+    graph = make_graph(f, [int, int])
+    pred = find_predecessors(graph, [(graph.returnblock, 
graph.getreturnvar())])
+    assert summary(pred) == {'b': 4, 'c': 4, 'd': 1, 'v': 1}
+
+def test_find_successors_1():
+    def f(a, b):
+        return a + b
+    graph = make_graph(f, [int, int])
+    succ = find_successors(graph, [(graph.startblock, graph.getargs()[0])])
+    assert summary(succ) == {'a': 1}
+
+def test_find_successors_2():
+    def f(a, b):
+        if b > 10:
+            return a + b
+        else:
+            return a - b
+    graph = make_graph(f, [int, int])
+    succ = find_successors(graph, [(graph.startblock, graph.getargs()[0])])
+    assert summary(succ) == {'a': 3}
+
+def test_find_successors_3():
+    def f(a, b):
+        if b > 10:      # 'a' condition block
+            a = a + b   # 'a' input
+            while b > 100:
+                b -= 2
+        while b > 5:    # 'a' in loop header
+            b -= 2      # 'a' in loop body
+        return a * b    # 'a' in product
+    graph = make_graph(f, [int, int])
+    succ = find_successors(graph, [(graph.startblock, graph.getargs()[0])])
+    assert summary(succ) == {'a': 5}
+
+def test_find_successors_trivial_rewrite():
+    def f(a, b):                              # 'b' in empty startblock
+        while a > 100:                        # 'b'
+            a -= 2                            # 'b'
+        c = llop.same_as(lltype.Signed, b)    # 'c', 'b'
+        while b > 10:                         # 'c', 'b'
+            b -= 2                            # 'c', 'b'
+        d = llop.same_as(lltype.Signed, c)    # 'd', 'c'
+        return d           # 'v' is the return var
+    graph = make_graph(f, [int, int])
+    pred = find_successors(graph, [(graph.startblock, graph.getargs()[1])])
+    assert summary(pred) == {'b': 6, 'c': 4, 'd': 1, 'v': 1}
+
+
+def test_interesting_vars_0():
+    def f(a, b):
+        pass
+    graph = make_graph(f, [llmemory.GCREF, int])
+    assert not find_interesting_variables(graph)
+
+def test_interesting_vars_1():
+    def f(a, b):
+        llop.gc_push_roots(lltype.Void, a)
+        llop.gc_pop_roots(lltype.Void, a)
+    graph = make_graph(f, [llmemory.GCREF, int])
+    assert summary(find_interesting_variables(graph)) == {'a': 1}
+
+def test_interesting_vars_2():
+    def f(a, b, c):
+        llop.gc_push_roots(lltype.Void, a)
+        llop.gc_pop_roots(lltype.Void, a)
+        while b > 0:
+            b -= 5
+        llop.gc_push_roots(lltype.Void, c)
+        llop.gc_pop_roots(lltype.Void, c)
+    graph = make_graph(f, [llmemory.GCREF, int, llmemory.GCREF])
+    assert summary(find_interesting_variables(graph)) == {'a': 1, 'c': 1}
+
+def test_interesting_vars_3():
+    def f(a, b):
+        llop.gc_push_roots(lltype.Void, a)
+        llop.gc_pop_roots(lltype.Void, a)
+        while b > 0:   # 'a' remains interesting across the blocks of this loop
+            b -= 5
+        llop.gc_push_roots(lltype.Void, a)
+        llop.gc_pop_roots(lltype.Void, a)
+    graph = make_graph(f, [llmemory.GCREF, int])
+    assert summary(find_interesting_variables(graph)) == {'a': 4}
+
+def test_allocate_registers_1():
+    def f(a, b):
+        llop.gc_push_roots(lltype.Void, a)
+        llop.gc_pop_roots(lltype.Void, a)
+        while b > 0:   # 'a' remains interesting across the blocks of this loop
+            b -= 5
+        llop.gc_push_roots(lltype.Void, a)
+        llop.gc_pop_roots(lltype.Void, a)
+    graph = make_graph(f, [llmemory.GCREF, int])
+    regalloc = allocate_registers(graph)
+    assert summary_regalloc(regalloc) == [('a', 0)] * 4
+
+def test_allocate_registers_2():
+    def f(a, b, c):
+        llop.gc_push_roots(lltype.Void, a)
+        llop.gc_pop_roots(lltype.Void, a)
+        while b > 0:
+            b -= 5
+        llop.gc_push_roots(lltype.Void, c)
+        llop.gc_pop_roots(lltype.Void, c)
+    graph = make_graph(f, [llmemory.GCREF, int, llmemory.GCREF])
+    regalloc = allocate_registers(graph)
+    assert summary_regalloc(regalloc) == [('a', 0), ('c', 0)]
+
+def test_allocate_registers_3():
+    def f(a, b, c):
+        llop.gc_push_roots(lltype.Void, c, a)
+        llop.gc_pop_roots(lltype.Void, c, a)
+        while b > 0:
+            b -= 5
+        llop.gc_push_roots(lltype.Void, a)
+        llop.gc_pop_roots(lltype.Void, a)
+    graph = make_graph(f, [llmemory.GCREF, int, llmemory.GCREF])
+    regalloc = allocate_registers(graph)
+    assert summary_regalloc(regalloc) == [('a', 1)] * 4 + [('c', 0)]
+
+def test_allocate_registers_4():
+    def g(a, x):
+        return x   # (or something different)
+    def f(a, b, c):
+        llop.gc_push_roots(lltype.Void, a, c) # 'a', 'c'
+        llop.gc_pop_roots(lltype.Void, a, c)
+        while b > 0:                          # 'a' only; 'c' not in push_roots
+            b -= 5
+            llop.gc_push_roots(lltype.Void, a)# 'a'
+            d = g(a, c)
+            llop.gc_pop_roots(lltype.Void, a)
+            c = d
+        return c
+    graph = make_graph(f, [llmemory.GCREF, int, llmemory.GCREF])
+    regalloc = allocate_registers(graph)
+    assert summary_regalloc(regalloc) == [('a', 1)] * 3 + [('c', 0)]
+
+def test_allocate_registers_5():
+    def g(a, x):
+        return x   # (or something different)
+    def f(a, b, c):
+        while b > 0:                          # 'a', 'c'
+            b -= 5
+            llop.gc_push_roots(lltype.Void, a, c)  # 'a', 'c'
+            g(a, c)
+            llop.gc_pop_roots(lltype.Void, a, c)
+        while b < 10:
+            b += 2
+        return c
+    graph = make_graph(f, [llmemory.GCREF, int, llmemory.GCREF])
+    regalloc = allocate_registers(graph)
+    assert summary_regalloc(regalloc) == [('a', 1)] * 2 + [('c', 0)] * 2
+
+@given(strategies.lists(strategies.booleans()))
+def test_make_bitmask(boollist):
+    index, bitmask = make_bitmask(boollist)
+    if index is None:
+        assert bitmask is None
+    else:
+        assert 0 <= index < len(boollist)
+        assert boollist[index] == False
+        assert bitmask >= 1
+        while bitmask:
+            if bitmask & 1:
+                assert index >= 0
+                assert boollist[index] == False
+                boollist[index] = True
+            bitmask >>= 1
+            index -= 1
+    assert boollist == [True] * len(boollist)
+
+
+class FakeRegAlloc:
+    graph = '?'
+
+    def __init__(self, expected_op, **colors):
+        self.expected_op = expected_op
+        self.numcolors = len(colors)
+        self.getcolor = colors.__getitem__
+
+    def check(self, got):
+        got = list(got)
+        result = []
+        for spaceop in got:
+            assert spaceop.opname == self.expected_op
+            result.append((spaceop.args[0].value, spaceop.args[1]))
+        return result
+
+def test_expand_one_push_roots():
+    regalloc = FakeRegAlloc('gc_save_root', a=0, b=1, c=2)
+    assert regalloc.check(expand_one_push_roots(regalloc, ['a', 'b', 'c'])) == 
[
+        (0, 'a'), (1, 'b'), (2, 'c')]
+    assert regalloc.check(expand_one_push_roots(regalloc, ['a', 'c'])) == [
+        (0, 'a'), (2, 'c'), (1, Constant(0x1, lltype.Signed))]
+    assert regalloc.check(expand_one_push_roots(regalloc, ['b'])) == [
+        (1, 'b'), (2, Constant(0x5, lltype.Signed))]
+    assert regalloc.check(expand_one_push_roots(regalloc, ['a'])) == [
+        (0, 'a'), (2, Constant(0x3, lltype.Signed))]
+    assert regalloc.check(expand_one_push_roots(regalloc, [])) == [
+        (2, Constant(0x7, lltype.Signed))]
+
+    assert list(expand_one_push_roots(None, [])) == []
+
+def test_expand_one_pop_roots():
+    regalloc = FakeRegAlloc('gc_restore_root', a=0, b=1, c=2)
+    assert regalloc.check(expand_one_pop_roots(regalloc, ['a', 'b', 'c'])) == [
+        (0, 'a'), (1, 'b'), (2, 'c')]
+    assert regalloc.check(expand_one_pop_roots(regalloc, ['a', 'c'])) == [
+        (0, 'a'), (2, 'c')]
+    assert regalloc.check(expand_one_pop_roots(regalloc, ['b'])) == [
+        (1, 'b')]
+    assert regalloc.check(expand_one_pop_roots(regalloc, ['a'])) == [
+        (0, 'a')]
+    assert regalloc.check(expand_one_pop_roots(regalloc, [])) == []
+
+    assert list(expand_one_pop_roots(None, [])) == []
+
+def test_move_pushes_earlier_1():
+    def g(a):
+        return a - 1
+    def f(a, b):
+        a *= 2
+        while a > 10:
+            llop.gc_push_roots(lltype.Void, b)
+            a = g(a)
+            llop.gc_pop_roots(lltype.Void, b)
+        return b
+
+    graph = make_graph(f, [int, llmemory.GCREF])
+    regalloc = allocate_registers(graph)
+    expand_push_roots(graph, regalloc)
+    move_pushes_earlier(graph, regalloc)
+    expand_pop_roots(graph, regalloc)
+    add_enter_leave_roots_frame(graph, regalloc, Constant('fake gcdata'))
+    assert graphmodel.summary(graph) == {
+        'int_mul': 1,
+        'gc_enter_roots_frame': 1,
+        'gc_save_root': 1,
+        'gc_restore_root': 1,
+        'int_gt': 1,
+        'direct_call': 1,
+        'gc_leave_roots_frame': 1,
+        }
+    assert len(graph.startblock.operations) == 3
+    assert graph.startblock.operations[0].opname == 'int_mul'
+    assert graph.startblock.operations[1].opname == 'gc_enter_roots_frame'
+    assert graph.startblock.operations[2].opname == 'gc_save_root'
+    assert graph.startblock.operations[2].args[0].value == 0
+    postprocess_double_check(graph)
+
+def test_move_pushes_earlier_2():
+    def g(a):
+        pass
+    def f(a, b):
+        llop.gc_push_roots(lltype.Void, b)
+        g(a)
+        llop.gc_pop_roots(lltype.Void, b)
+        while a > 10:
+            a -= 2
+        llop.gc_push_roots(lltype.Void, b)
+        g(a)
+        llop.gc_pop_roots(lltype.Void, b)
+        return b
+
+    graph = make_graph(f, [int, llmemory.GCREF])
+    regalloc = allocate_registers(graph)
+    expand_push_roots(graph, regalloc)
+    move_pushes_earlier(graph, regalloc)
+    expand_pop_roots(graph, regalloc)
+    assert graphmodel.summary(graph) == {
+        'gc_save_root': 1,
+        'gc_restore_root': 2,
+        'int_gt': 1,
+        'int_sub': 1,
+        'direct_call': 2,
+        }
+    add_enter_leave_roots_frame(graph, regalloc, Constant('fake gcdata'))
+    postprocess_double_check(graph)
+
+def test_remove_intrablock_push_roots():
+    def g(a):
+        pass
+    def f(a, b):
+        llop.gc_push_roots(lltype.Void, b)
+        g(a)
+        llop.gc_pop_roots(lltype.Void, b)
+        llop.gc_push_roots(lltype.Void, b)
+        g(a)
+        llop.gc_pop_roots(lltype.Void, b)
+        return b
+
+    graph = make_graph(f, [int, llmemory.GCREF])
+    regalloc = allocate_registers(graph)
+    expand_push_roots(graph, regalloc)
+    expand_pop_roots(graph, regalloc)
+    assert graphmodel.summary(graph) == {
+        'gc_save_root': 1,
+        'gc_restore_root': 2,
+        'direct_call': 2,
+        }
+
+PSTRUCT = lltype.Ptr(lltype.GcStruct('S'))
+
+def test_move_pushes_earlier_rename_1():
+    def g(a):
+        pass
+    def f(a, b):
+        llop.gc_push_roots(lltype.Void, b)
+        g(a)
+        llop.gc_pop_roots(lltype.Void, b)
+        c = lltype.cast_opaque_ptr(PSTRUCT, b)
+        while a > 10:
+            a -= 2
+        llop.gc_push_roots(lltype.Void, c)
+        g(a)
+        llop.gc_pop_roots(lltype.Void, c)
+        return c
+
+    graph = make_graph(f, [int, llmemory.GCREF])
+    regalloc = allocate_registers(graph)
+    expand_push_roots(graph, regalloc)
+    move_pushes_earlier(graph, regalloc)
+    expand_pop_roots(graph, regalloc)
+    assert graphmodel.summary(graph) == {
+        'gc_save_root': 1,
+        'gc_restore_root': 2,
+        'cast_opaque_ptr': 1,
+        'int_gt': 1,
+        'int_sub': 1,
+        'direct_call': 2,
+        }
+    add_enter_leave_roots_frame(graph, regalloc, Constant('fake gcdata'))
+    postprocess_double_check(graph)
+
+def test_move_pushes_earlier_rename_2():
+    def g(a):
+        pass
+    def f(a, b):
+        llop.gc_push_roots(lltype.Void, b)
+        g(a)
+        llop.gc_pop_roots(lltype.Void, b)
+        while a > 10:
+            a -= 2
+        c = lltype.cast_opaque_ptr(PSTRUCT, b)
+        llop.gc_push_roots(lltype.Void, c)
+        g(a)
+        llop.gc_pop_roots(lltype.Void, c)
+        return c
+
+    graph = make_graph(f, [int, llmemory.GCREF])
+    regalloc = allocate_registers(graph)
+    expand_push_roots(graph, regalloc)
+    move_pushes_earlier(graph, regalloc)
+    expand_pop_roots(graph, regalloc)
+    assert graphmodel.summary(graph) == {
+        'gc_save_root': 1,
+        'gc_restore_root': 2,
+        'cast_opaque_ptr': 1,
+        'int_gt': 1,
+        'int_sub': 1,
+        'direct_call': 2,
+        }
+    add_enter_leave_roots_frame(graph, regalloc, Constant('fake gcdata'))
+    postprocess_double_check(graph)
+
+def test_move_pushes_earlier_rename_3():
+    def g(a):
+        pass
+    def f(a, b):
+        llop.gc_push_roots(lltype.Void, b)
+        g(a)
+        llop.gc_pop_roots(lltype.Void, b)
+        while a > 10:
+            a -= 2
+        c = lltype.cast_opaque_ptr(PSTRUCT, b)
+        while a > 10:
+            a -= 2
+        llop.gc_push_roots(lltype.Void, c)
+        g(a)
+        llop.gc_pop_roots(lltype.Void, c)
+        return c
+
+    graph = make_graph(f, [int, llmemory.GCREF])
+    regalloc = allocate_registers(graph)
+    expand_push_roots(graph, regalloc)
+    move_pushes_earlier(graph, regalloc)
+    expand_pop_roots(graph, regalloc)
+    assert graphmodel.summary(graph) == {
+        'gc_save_root': 1,
+        'gc_restore_root': 2,
+        'cast_opaque_ptr': 1,
+        'int_gt': 2,
+        'int_sub': 2,
+        'direct_call': 2,
+        }
+    add_enter_leave_roots_frame(graph, regalloc, Constant('fake gcdata'))
+    postprocess_double_check(graph)
+
+def test_move_pushes_earlier_rename_4():
+    def g(a):
+        return a - 2
+    def f(a, b):
+        while a > 10:
+            b1 = lltype.cast_opaque_ptr(PSTRUCT, b)
+            while a > 100:
+                a -= 3
+            b2 = lltype.cast_opaque_ptr(llmemory.GCREF, b1)
+            llop.gc_push_roots(lltype.Void, b2)
+            a = g(a)
+            llop.gc_pop_roots(lltype.Void, b2)
+            b3 = lltype.cast_opaque_ptr(PSTRUCT, b2)
+            while a > 100:
+                a -= 4
+            b4 = lltype.cast_opaque_ptr(llmemory.GCREF, b3)
+            llop.gc_push_roots(lltype.Void, b4)
+            a = g(a)
+            llop.gc_pop_roots(lltype.Void, b4)
+            b5 = lltype.cast_opaque_ptr(PSTRUCT, b4)
+            while a > 100:
+                a -= 5
+            b = lltype.cast_opaque_ptr(llmemory.GCREF, b5)
+        return b
+
+    graph = make_graph(f, [int, llmemory.GCREF])
+    regalloc = allocate_registers(graph)
+    expand_push_roots(graph, regalloc)
+    move_pushes_earlier(graph, regalloc)
+    expand_pop_roots(graph, regalloc)
+    assert graphmodel.summary(graph) == {
+        'gc_save_root': 1,
+        'gc_restore_root': 2,
+        'cast_opaque_ptr': 6,
+        'int_gt': 4,
+        'int_sub': 3,
+        'direct_call': 2,
+        }
+    add_enter_leave_roots_frame(graph, regalloc, Constant('fake gcdata'))
+    postprocess_double_check(graph)
+
+def test_add_leave_roots_frame_1():
+    def g(b):
+        pass
+    def f(a, b):
+        if a & 1:
+            llop.gc_push_roots(lltype.Void, b)
+            g(b)
+            llop.gc_pop_roots(lltype.Void, b)
+            a += 5
+        else:
+            llop.gc_push_roots(lltype.Void, b)
+            g(b)
+            llop.gc_pop_roots(lltype.Void, b)
+            a += 6
+        #...b forgotten here, even though it is pushed/popped above
+        while a > 100:
+            a -= 3
+        return a
+
+    graph = make_graph(f, [int, llmemory.GCREF])
+    regalloc = allocate_registers(graph)
+    expand_push_roots(graph, regalloc)
+    move_pushes_earlier(graph, regalloc)
+    expand_pop_roots(graph, regalloc)
+    add_enter_leave_roots_frame(graph, regalloc, Constant('fake gcdata'))
+    assert len(graph.startblock.exits) == 2
+    for link in graph.startblock.exits:
+        assert [op.opname for op in link.target.operations] == [
+            'gc_enter_roots_frame',
+            'gc_save_root',
+            'direct_call',
+            'gc_restore_root',
+            'gc_leave_roots_frame',
+            'int_add']
+    postprocess_double_check(graph)
+
+def test_add_leave_roots_frame_2():
+    def g(b):
+        pass
+    def f(a, b):
+        llop.gc_push_roots(lltype.Void, b)
+        g(b)
+        llop.gc_pop_roots(lltype.Void, b)
+        #...b forgotten here; the next push/pop is empty
+        llop.gc_push_roots(lltype.Void)
+        g(b)
+        llop.gc_pop_roots(lltype.Void)
+        while a > 100:
+            a -= 3
+        return a
+
+    graph = make_graph(f, [int, llmemory.GCREF])
+    regalloc = allocate_registers(graph)
+    expand_push_roots(graph, regalloc)
+    move_pushes_earlier(graph, regalloc)
+    expand_pop_roots(graph, regalloc)
+    add_enter_leave_roots_frame(graph, regalloc, Constant('fake gcdata'))
+    assert [op.opname for op in graph.startblock.operations] == [
+        'gc_enter_roots_frame',
+        'gc_save_root',
+        'direct_call',
+        'gc_restore_root',
+        'gc_leave_roots_frame',
+        'direct_call']
+    postprocess_double_check(graph)
+
+def test_bug_1():
+    class W:
+        pass
+    def foo(a):
+        if a < 10:
+            return W()
+        else:
+            return None
+    def compare(w_a, w_b):
+        return W()
+    def fetch_compare(w_a, w_b):
+        return W()
+    def is_true(a, w_b):
+        return not a
+    def call_next(w_a):
+        return W()
+
+    def f(a, w_tup):
+        llop.gc_push_roots(lltype.Void, w_tup)
+        w_key = foo(a)
+        llop.gc_pop_roots(lltype.Void, w_tup)
+
+        llop.gc_push_roots(lltype.Void, w_key)
+        w_iter = foo(a)
+        llop.gc_pop_roots(lltype.Void, w_key)
+
+        has_key = w_key is not None
+        hasit = False
+        w_maxit = None
+        w_max_val = None
+
+        while True:
+            llop.gc_push_roots(lltype.Void, w_iter, w_key, w_maxit, w_max_val)
+            w_item = call_next(w_iter)
+            llop.gc_pop_roots(lltype.Void, w_iter, w_key, w_maxit, w_max_val)
+
+            if has_key:
+                llop.gc_push_roots(lltype.Void, w_iter, w_key,
+                                       w_maxit, w_max_val, w_item)
+                w_compare_with = fetch_compare(w_key, w_item)
+                llop.gc_pop_roots(lltype.Void, w_iter, w_key,
+                                       w_maxit, w_max_val, w_item)
+            else:
+                w_compare_with = w_item
+
+            if hasit:
+                llop.gc_push_roots(lltype.Void, w_iter, w_key,
+                                w_maxit, w_max_val, w_item, w_compare_with)
+                w_bool = compare(w_compare_with, w_max_val)
+                llop.gc_pop_roots(lltype.Void, w_iter, w_key,
+                                w_maxit, w_max_val, w_item, w_compare_with)
+
+                llop.gc_push_roots(lltype.Void, w_iter, w_key,
+                                w_maxit, w_max_val, w_item, w_compare_with)
+                condition = is_true(a, w_bool)
+                llop.gc_pop_roots(lltype.Void, w_iter, w_key,
+                                w_maxit, w_max_val, w_item, w_compare_with)
+            else:
+                condition = True
+
+            if condition:
+                hasit = True
+                w_maxit = w_item
+                w_max_val = w_compare_with
+
+    graph = make_graph(f, [int, llmemory.GCREF])
+    regalloc = allocate_registers(graph)
+    expand_push_roots(graph, regalloc)
+    move_pushes_earlier(graph, regalloc)
+    expand_pop_roots(graph, regalloc)
+    add_enter_leave_roots_frame(graph, regalloc, Constant('fake gcdata'))
+    postprocess_double_check(graph)
+
+def test_bug_2():
+    def f(w_tup):
+        while True:
+            llop.gc_push_roots(lltype.Void, w_tup)
+            llop.gc_pop_roots(lltype.Void, w_tup)
+
+    graph = make_graph(f, [llmemory.GCREF])
+    assert not graph.startblock.operations
+    # this test is about what occurs if the startblock of the graph
+    # is also reached from another block.  None of the 'simplify'
+    # functions actually remove that, but the JIT transformation can...
+    graph.startblock = graph.startblock.exits[0].target
+
+    regalloc = allocate_registers(graph)
+    expand_push_roots(graph, regalloc)
+    move_pushes_earlier(graph, regalloc)
+    expand_pop_roots(graph, regalloc)
+    add_enter_leave_roots_frame(graph, regalloc, Constant('fake gcdata'))
+    postprocess_double_check(graph)
+
+def test_add_enter_roots_frame_remove_empty():
+    class W:
+        pass
+    def g():
+        return W()
+    def h(x):
+        pass
+    def k():
+        pass
+    def f():
+        llop.gc_push_roots(lltype.Void)
+        x = g()
+        llop.gc_pop_roots(lltype.Void)
+        llop.gc_push_roots(lltype.Void, x)
+        h(x)
+        llop.gc_pop_roots(lltype.Void, x)
+        llop.gc_push_roots(lltype.Void)
+        h(x)
+        llop.gc_pop_roots(lltype.Void)
+        llop.gc_push_roots(lltype.Void)
+        k()
+        llop.gc_pop_roots(lltype.Void)
+
+    graph = make_graph(f, [])
+    regalloc = allocate_registers(graph)
+    expand_push_roots(graph, regalloc)
+    move_pushes_earlier(graph, regalloc)
+    expand_pop_roots(graph, regalloc)
+    add_enter_leave_roots_frame(graph, regalloc, Constant('fake gcdata'))
+    assert [op.opname for op in graph.startblock.operations] == [
+        "direct_call",
+        "gc_enter_roots_frame",
+        "gc_save_root",
+        "direct_call",
+        "gc_restore_root",
+        "gc_leave_roots_frame",
+        "direct_call",
+        "direct_call",
+        ]
+    postprocess_double_check(graph)
+
+def test_add_enter_roots_frame_avoided():
+    def g(x):
+        return x
+    def f(x, n):
+        if n > 100:
+            llop.gc_push_roots(lltype.Void, x)
+            g(x)
+            llop.gc_pop_roots(lltype.Void, x)
+        return x
+
+    graph = make_graph(f, [llmemory.GCREF, int])
+    regalloc = allocate_registers(graph)
+    expand_push_roots(graph, regalloc)
+    move_pushes_earlier(graph, regalloc)
+    expand_pop_roots(graph, regalloc)
+    add_enter_leave_roots_frame(graph, regalloc, Constant('fake gcdata'))
+    assert [op.opname for op in graph.startblock.operations] == [
+        'int_gt', 'same_as']
+    [fastpath, slowpath] = graph.startblock.exits
+    assert fastpath.target is graph.returnblock
+    block2 = slowpath.target
+    assert [op.opname for op in block2.operations] == [
+        'gc_enter_roots_frame',
+        'gc_save_root',
+        'direct_call',
+        'gc_restore_root',
+        'gc_leave_roots_frame']
+    postprocess_double_check(graph)
+
+def test_fix_graph_after_inlining():
+    # the graph of f looks like it inlined another graph, which itself
+    # would be "if x > 100: foobar()".  The foobar() function is supposed
+    # to be the big slow-path.
+    def foobar():
+        print 42
+    def f(x):
+        llop.gc_push_roots(lltype.Void, x)
+        if x > 100:  # slow-path
+            foobar()
+        llop.gc_pop_roots(lltype.Void, x)
+        return x
+    graph = make_graph(f, [int])
+    postprocess_inlining(graph)
+    cleanup_graph(graph)
+    assert [op.opname for op in graph.startblock.operations] == [
+        'int_gt', 'same_as']
+    [fastpath, slowpath] = graph.startblock.exits
+    assert fastpath.target is graph.returnblock
+    block2 = slowpath.target
+    [v] = block2.inputargs
+    assert block2.operations[0].opname == 'gc_push_roots'
+    assert block2.operations[0].args == [v]
+    assert block2.operations[1].opname == 'direct_call'   # -> foobar
+    assert block2.operations[2].opname == 'gc_pop_roots'
+    assert block2.operations[2].args == [v]
+    assert len(block2.exits) == 1
+    assert block2.exits[0].target is graph.returnblock
diff --git a/rpython/memory/gctransform/transform.py 
b/rpython/memory/gctransform/transform.py
--- a/rpython/memory/gctransform/transform.py
+++ b/rpython/memory/gctransform/transform.py
@@ -97,6 +97,7 @@
         self.inline = inline
         if translator and inline:
             self.lltype_to_classdef = 
translator.rtyper.lltype_to_classdef_mapping()
+            self.raise_analyzer = RaiseAnalyzer(translator)
         self.graphs_to_inline = {}
         self.graph_dependencies = {}
         self.ll_finalizers_ptrs = []
@@ -113,28 +114,36 @@
         self.seen_graphs.add(graph)
         self.minimal_transform.add(graph)
 
-    def inline_helpers(self, graphs):
+    def inline_helpers_into(self, graph):
         from rpython.translator.backendopt.inline import iter_callsites
-        raise_analyzer = RaiseAnalyzer(self.translator)
+        to_enum = []
+        for called, block, i in iter_callsites(graph, None):
+            if called in self.graphs_to_inline:
+                to_enum.append(called)
+        any_inlining = False
+        for inline_graph in to_enum:
+            try:
+                inline.inline_function(self.translator, inline_graph, graph,
+                                       self.lltype_to_classdef,
+                                       self.raise_analyzer,
+                                       cleanup=False)
+                any_inlining = True
+            except inline.CannotInline as e:
+                print 'CANNOT INLINE:', e
+                print '\t%s into %s' % (inline_graph, graph)
+                raise      # for now, make it a fatal error
+        cleanup_graph(graph)
+        if any_inlining:
+            constant_fold_graph(graph)
+        return any_inlining
+
+    def inline_helpers_and_postprocess(self, graphs):
         for graph in graphs:
-            to_enum = []
-            for called, block, i in iter_callsites(graph, None):
-                if called in self.graphs_to_inline:
-                    to_enum.append(called)
-            must_constfold = False
-            for inline_graph in to_enum:
-                try:
-                    inline.inline_function(self.translator, inline_graph, 
graph,
-                                           self.lltype_to_classdef,
-                                           raise_analyzer,
-                                           cleanup=False)
-                    must_constfold = True
-                except inline.CannotInline as e:
-                    print 'CANNOT INLINE:', e
-                    print '\t%s into %s' % (inline_graph, graph)
-            cleanup_graph(graph)
-            if must_constfold:
-                constant_fold_graph(graph)
+            any_inlining = self.inline and self.inline_helpers_into(graph)
+            self.postprocess_graph(graph, any_inlining)
+
+    def postprocess_graph(self, graph, any_inlining):
+        pass
 
     def compute_borrowed_vars(self, graph):
         # the input args are borrowed, and stay borrowed for as long as they
diff --git a/rpython/rlib/_stacklet_shadowstack.py 
b/rpython/rlib/_stacklet_shadowstack.py
--- a/rpython/rlib/_stacklet_shadowstack.py
+++ b/rpython/rlib/_stacklet_shadowstack.py
@@ -47,14 +47,15 @@
 SIZEADDR = llmemory.sizeof(llmemory.Address)
 
 def customtrace(gc, obj, callback, arg):
+    from rpython.memory.gctransform.shadowstack import walk_stack_root
+
     stacklet = llmemory.cast_adr_to_ptr(obj, STACKLET_PTR)
     sscopy = stacklet.s_sscopy
     if sscopy:
         length_bytes = sscopy.signed[0]
-        while length_bytes > 0:
-            addr = sscopy + length_bytes
-            gc._trace_callback(callback, arg, addr)
-            length_bytes -= SIZEADDR
+        walk_stack_root(gc._trace_callback, callback, arg,
+                        sscopy + SIZEADDR, sscopy + SIZEADDR + length_bytes,
+                        is_minor=False)
 lambda_customtrace = lambda: customtrace
 
 def sscopy_detach_shadow_stack():
diff --git a/rpython/rtyper/llinterp.py b/rpython/rtyper/llinterp.py
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy default: hg merge shadowstack-perf-2

Reply via email to