The implementation is overcomplicated - finding "root" garbage is actually
not needed. Finding cycles is good enough.

I'll send a simplified V2 with some common test cases.
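Roughly what I have in mind, as an untested sketch (the names are
placeholders, not the actual V2): skip the SCC bookkeeping and the
root-finding step entirely, and just report every garbage object that can
reach itself again through gc.get_referents():

    import gc

    def findcycleobjects(garbage):
        """yield objects in `garbage` that sit on a reference cycle

        Quadratic in the worst case, but fine for a debugging aid.
        """
        idset = set(id(o) for o in garbage)
        objmap = dict((id(o), o) for o in garbage)
        for obj in garbage:
            # DFS from obj's referents, restricted to the garbage set;
            # getting back to obj means obj is part of a cycle
            seen = set()
            stack = [id(r) for r in gc.get_referents(obj) if id(r) in idset]
            while stack:
                i = stack.pop()
                if i == id(obj):
                    yield obj
                    break
                if i in seen:
                    continue
                seen.add(i)
                stack.extend(id(r) for r in gc.get_referents(objmap[i])
                             if id(r) in idset)

    # usage: collect unreachable objects into gc.garbage, then list cycles
    gc.set_debug(gc.get_debug() | gc.DEBUG_SAVEALL)
    gc.collect()
    for obj in findcycleobjects(gc.garbage):
        print('in a cycle: %s' % type(obj).__name__)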
Excerpts from Jun Wu's message of 2017-05-24 12:36:20 -0700:
> # HG changeset patch
> # User Jun Wu <qu...@fb.com>
> # Date 1495653050 25200
> #      Wed May 24 12:10:50 2017 -0700
> # Node ID ee6c852d00cb7453981545127bbe27cc1516d2d8
> # Parent  57d6c0c74b1bbc83e9a511a4a1fa8b57e2457046
> # Available At https://bitbucket.org/quark-zju/hg-draft
> #              hg pull https://bitbucket.org/quark-zju/hg-draft -r ee6c852d00cb
> contrib: add a tool to detect cyclic references
>
> The motivation is that when we have too many objects (like 700k+),
> gc.collect() becomes so expensive that even calling it once takes hundreds
> of milliseconds.
>
> Even if we have other solutions to reduce the object count and want to keep
> gc enabled at all times, it still seems nice to remove cycles as a best
> effort.
>
> This patch adds leakdetect.py to contrib as an attempt to provide a useful
> tool for fighting cycles.
>
> It's relatively easy to get the garbage objects, but that information may
> not be very useful because there may be too many of them. For example, if
> objects A and B form a cycle and are not referred to by any living object,
> then the objects referred to by A or B are garbage too, but are not that
> interesting.
>
>   A <-----> B        ("root" garbage)
>  /|\       /|\
> .....     .....      (referred to by "root" garbage, many, less interesting)
>
> The script detects cycles (strongly connected components) and does one step
> of topological sorting so it only outputs "root" garbage.
>
> An example analysis of "hg id" gives output like the following:
>
> 771089 objects not freed by reference counting
> 9 objects in a cycle:
>   <cell <function _parse_pl...@config.py:190>>
>   <function _parse_qu...@config.py:208>
>   <tuple (<cell <function _parse_pl...@config.py:190>>)>
>   <tuple (<cell <function _parse_pl...@config.py:190>>, <cell <function
>     _parse_qu...@config.py:208>>)>
>   <cell <function _configl...@config.py:251>>
>   <function _parse_pl...@config.py:190>
>   <tuple (<cell <function _configl...@config.py:251>>, <cell <function
>     _parse_pl...@config.py:190>>)>
>   <cell <function _parse_qu...@config.py:208>>
>   <function _configl...@config.py:251>
> 5 objects in a cycle:
>   <type tagscache>
>   <tuple (<type tagscache>, <type object>)>
>   <attribute '__dict__' of 'tagscache' objects>
>   <attribute '__weakref__' of 'tagscache' objects>
>   <dict>
> 2 objects in a cycle:
>   <type filteredrepo>
>   <tuple (<type filteredrepo>, <type repoview>, <type localrepository>,
>     <type object>)>
> 33 objects in a cycle:
>   <localrepository>
>   <filecacheentry>
>   <changelog>
>   <filecacheentry>
>   <dict>
>   <filecacheentry>
>   <bound method fncachestore.join of <mercurial.store.fncachestore object
>     at 0x7fd4c584be90>>
>   <dict>
>   <obsstore>
>   <dict>
>   <filecacheentry>
>   <dict>
>   <bmstore>
>   <dict>
>   <bound method localrepository._checknested of
>     <mercurial.localrepo.localrepository object at 0x7fd4c584b390>>
>   <dict>
>   <pathauditor>
>   <dict>
>   <dict>
>   <dict>
>   <bound method localrepository._checknested of
>     <mercurial.localrepo.localrepository object at 0x7fd4c584b390>>
>   <pathauditor>
>   <dict>
>   <dict>
>   <fncachestore>
>   <bound method localrepository._dirstatevalidate of
>     <mercurial.localrepo.localrepository object at 0x7fd4c584b390>>
>   <phasecache>
>   <dict>
>   <dict>
>   <dict>
>   <filecacheentry>
>   <dirstate>
>   <dict>
>
> Although there are 700k+ garbage objects, the "root" garbage is just a few
> objects that can probably be investigated manually.
>
> A simple test was added to demonstrate the feature.
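To make the "root" vs. referred garbage distinction concrete, a tiny,
hypothetical example (not from the patch): a and b form the "root" garbage
cycle, and the nested payload is garbage too, but only because the cycle
keeps it alive.

    import gc

    def makegarbage():
        a = {}
        b = {'a': a}              # a <-> b is the cycle ("root" garbage)
        a['b'] = b
        a['payload'] = [[i] for i in range(1000)]  # held only by the cycle

    gc.set_debug(gc.DEBUG_SAVEALL)
    gc.disable()
    makegarbage()
    gc.collect()
    print(len(gc.garbage))        # ~1000+ container objects, 2 "root" ones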
>
> diff --git a/contrib/leakdetect.py b/contrib/leakdetect.py
> new file mode 100644
> --- /dev/null
> +++ b/contrib/leakdetect.py
> @@ -0,0 +1,174 @@
> +# leakdetect.py - CPython object cyclic references analysis
> +#
> +# Copyright 2017 Facebook, Inc.
> +#
> +# This software may be used and distributed according to the terms of the
> +# GNU General Public License version 2 or any later version.
> +
> +from __future__ import absolute_import
> +
> +import contextlib
> +import gc
> +import os
> +import platform
> +import sys
> +
> +def stronglyconnectedcomponents(vertices, edges):
> +    """yield strongly connected components in a directed graph"""
> +    ignored = set()  # vertices outputted already
> +    stack = []       # vertex DFS stack
> +    index = {}       # stack[index[n]] == n; later visit: bigger index
> +    lowlink = {}     # see Tarjan's strongly connected components algorithm
> +    execstack = []   # emulated execution stack to bypass Python stack limit
> +
> +    def visit(v):
> +        index[v] = lowlink[v] = len(stack)
> +        stack.append(v)
> +        execstack.append((checkroot, (v,)))
> +        for w in edges.get(v, []):
> +            execstack.append((updatelowlink, (v, w)))
> +            if w not in index:
> +                execstack.append((visit, (w,)))
> +
> +    def updatelowlink(v, w):
> +        lowlink[v] = min(lowlink[v], lowlink[w])
> +
> +    def checkroot(v):
> +        i = index[v]
> +        if lowlink[v] == i:
> +            component = set(stack[i:])
> +            del stack[i:]
> +            ignored.update(component)
> +            return component
> +
> +    for v in vertices:
> +        if v not in ignored:
> +            index.clear()
> +            lowlink.clear()
> +            execstack.append((visit, (v,)))
> +            while execstack:
> +                func, args = execstack.pop()
> +                result = func(*args)
> +                if result is not None:
> +                    yield result
> +
> +def simplifygraph(vertices, edges):
> +    """make each strongly connected component a single vertex and remove cycles
> +
> +    return a DAG (newvertices, newedges, vertexmapping)
> +    vertexmapping: {newvertex: [oldvertex]}
> +    """
> +    components = stronglyconnectedcomponents(vertices, edges)
> +    old2new = {}  # {oldvertex: newvertex}
> +    new2old = {}  # {newvertex: [oldvertex]}
> +    for c in components:
> +        root = next(iter(c))
> +        for v in c:
> +            old2new[v] = root
> +        new2old[root] = c
> +    newedges = {}
> +    for v, ws in edges.items():
> +        v = old2new[v]
> +        ws = set(old2new[w] for w in ws if old2new[w] != v)
> +        newedges.setdefault(v, set()).update(ws)
> +    return new2old.keys(), newedges, new2old
> +
> +def notreferred(vertices, edges):
> +    """find vertices that are not referred to (in-degree is 0)"""
> +    referred = set()
> +    for ws in edges.values():
> +        referred.update(ws)
> +    return set(vertices) - referred
> +
> +def objectreferencegraph(objects):
> +    """return (vertices, edges), the reference relationships among objects"""
> +    idset = set(id(o) for o in objects)
> +    edges = {}
> +    for o in objects:
> +        edges[id(o)] = [id(r) for r in gc.get_referents(o) if id(r) in idset]
> +    return idset, edges
> +
> +def describeobject(obj):
> +    """short and easy to understand text to help identify an object
> +
> +    similar to repr(), but avoids huge outputs in all cases.
> + """ > + class _dummy(object): > + def _dummy(self): > + pass > + > + if isinstance(obj, tuple): > + return '<tuple (%s)>' % ', '.join(describeobject(v) for v in obj) > + elif isinstance(obj, type): > + return '<type %s>' % obj.__name__ > + elif isinstance(obj, type(describeobject)): # function > + code = obj.func_code > + path = os.path.basename(code.co_filename) > + return ('<function %s@%s:%s>' % (obj.func_name, path, > + code.co_firstlineno)) > + elif isinstance(obj, type(_dummy()._dummy)): # instancemethod > + return repr(obj) > + else: > + if type(obj) in (list, set, dict): > + tryrepr = repr(obj) > + if len(tryrepr) < 70: > + return tryrepr > + cellcontent = getattr(obj, 'cell_contents', None) > + if cellcontent is not None: > + return '<cell %s>' % describeobject(cellcontent) > + typename = type(obj).__name__ > + if typename == 'getset_descriptor': > + return repr(obj) > + return '<%s>' % typename > + > +def analysegarbage(garbage, out): > + """analyse root garbage objects and output them for investigation > + > + root garbage objects are strongly connected components in the object > + reference graph where they are not referred by other strongly connected > + components. > + """ > + out.write('%d objects not freed by reference counting\n' % len(garbage)) > + vertices, edges, mapping = simplifygraph(*objectreferencegraph(garbage)) > + roots = [mapping[v] for v in notreferred(vertices, edges)] > + objmap = {id(o): o for o in garbage} > + groups = [[objmap[o] for o in root] for root in roots] > + outputs = [] > + for g in groups: > + output = '' > + if len(g) == 1: > + # it may or may not be in a cycle due to gc.get_referents is not > + # guaranteed to provide complete information > + output += '1 object cannot be freed:\n' > + else: > + output += '%d objects in a cycle:\n' % len(g) > + output += ''.join(sorted(' %s\n' % describeobject(o) for o in g)) > + outputs.append(output) > + out.write(''.join(sorted(outputs))) > + > +if platform.python_implementation() == 'CPython': > + # only verified to work in CPython > + @contextlib.contextmanager > + def leakdetect(out=sys.stderr): > + # make gc.collect put objects to gc.garbage instead of releasing them > + gc.set_debug(gc.get_debug() | gc.DEBUG_SAVEALL) > + gc.disable() > + try: > + yield > + finally: > + # populate gc.garbage > + gc.collect() > + garbage = gc.garbage > + # restore the default behavior > + gc.set_debug(gc.get_debug() ^ gc.DEBUG_SAVEALL) > + if garbage: > + analysegarbage(garbage, out) > + > + disabled = False > +else: > + # fallback to a no-op gracefully > + @contextlib.contextmanager > + def leakdetect(out=sys.stderr): > + yield > + > + disabled = True > diff --git a/tests/test-leakdetect.py b/tests/test-leakdetect.py > new file mode 100644 > --- /dev/null > +++ b/tests/test-leakdetect.py > @@ -0,0 +1,24 @@ > +from __future__ import absolute_import > + > +import os > +import sys > + > +dirname = os.path.dirname > +sys.path.insert(0, os.path.join(dirname(dirname(__file__)), 'contrib')) > + > +import leakdetect > + > +if leakdetect.disabled: > + sys.exit(80, 'skipped: missing feature: cpython') > + > +def leak1(): > + a = [] > + b = {} > + b['a'] = a > + a.append(b) > + a.append([[[[['65537']]]]]) # referenced by root garbage, uninteresting > + def func(x): > + return func(x + 1) > + > +with leakdetect.leakdetect(): > + leak1() > diff --git a/tests/test-leakdetect.py.out b/tests/test-leakdetect.py.out > new file mode 100644 > --- /dev/null > +++ b/tests/test-leakdetect.py.out > @@ -0,0 +1,8 @@ > +10 objects not freed 
> +2 objects in a cycle:
> +  [{'a': [...]}, [[[[['65537']]]]]]
> +  {'a': [{...}, [[[[['65537']]]]]]}
> +3 objects in a cycle:
> +  <cell <function f...@test-leakdetect.py:20>>
> +  <function f...@test-leakdetect.py:20>
> +  <tuple (<cell <function f...@test-leakdetect.py:20>>)>

_______________________________________________
Mercurial-devel mailing list
Mercurial-devel@mercurial-scm.org
https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel