# HG changeset patch
# User Jun Wu <qu...@fb.com>
# Date 1495653050 25200
#      Wed May 24 12:10:50 2017 -0700
# Node ID ee6c852d00cb7453981545127bbe27cc1516d2d8
# Parent  57d6c0c74b1bbc83e9a511a4a1fa8b57e2457046
# Available At https://bitbucket.org/quark-zju/hg-draft
#              hg pull https://bitbucket.org/quark-zju/hg-draft -r ee6c852d00cb
contrib: add a tool to detect cyclic references
The motivation is that when we have too many objects (like 700k+),
gc.collect() becomes so expensive that even calling it once takes hundreds
of milliseconds. Even if we have other solutions to reduce the object count
and want to keep gc enabled at all times, it still seems nice to remove
cycles as a best effort.

This patch adds leakdetect.py to contrib as an attempt to provide a useful
tool for fighting cycles.

It's relatively easy to get the garbage objects, but that information may
not be very useful because there may be too many of them. For example, if
objects A and B form a cycle and are not referred to by living objects,
then the objects referred to by A or B are also garbage, but are not that
interesting:

    A <-----> B          ("root" garbage)
   /|\       /|\
  .....     .....        (referred to by "root" garbage, many, less interesting)

The script detects cycles (strongly connected components) and does one step
of topological sorting so it only outputs the "root" garbage.

For example, analyzing "hg id" gives output like:

  771089 objects not freed by reference counting
  9 objects in a cycle:
    <cell <function _parse_pl...@config.py:190>>
    <function _parse_qu...@config.py:208>
    <tuple (<cell <function _parse_pl...@config.py:190>>)>
    <tuple (<cell <function _parse_pl...@config.py:190>>, <cell <function _parse_qu...@config.py:208>>)>
    <cell <function _configl...@config.py:251>>
    <function _parse_pl...@config.py:190>
    <tuple (<cell <function _configl...@config.py:251>>, <cell <function _parse_pl...@config.py:190>>)>
    <cell <function _parse_qu...@config.py:208>>
    <function _configl...@config.py:251>
  5 objects in a cycle:
    <type tagscache>
    <tuple (<type tagscache>, <type object>)>
    <attribute '__dict__' of 'tagscache' objects>
    <attribute '__weakref__' of 'tagscache' objects>
    <dict>
  2 objects in a cycle:
    <type filteredrepo>
    <tuple (<type filteredrepo>, <type repoview>, <type localrepository>, <type object>)>
  33 objects in a cycle:
    <localrepository>
    <filecacheentry>
    <changelog>
    <filecacheentry>
    <dict>
    <filecacheentry>
    <bound method fncachestore.join of <mercurial.store.fncachestore object at 0x7fd4c584be90>>
    <dict>
    <obsstore>
    <dict>
    <filecacheentry>
    <dict>
    <bmstore>
    <dict>
    <bound method localrepository._checknested of <mercurial.localrepo.localrepository object at 0x7fd4c584b390>>
    <dict>
    <pathauditor>
    <dict>
    <dict>
    <dict>
    <bound method localrepository._checknested of <mercurial.localrepo.localrepository object at 0x7fd4c584b390>>
    <pathauditor>
    <dict>
    <dict>
    <fncachestore>
    <bound method localrepository._dirstatevalidate of <mercurial.localrepo.localrepository object at 0x7fd4c584b390>>
    <phasecache>
    <dict>
    <dict>
    <dict>
    <filecacheentry>
    <dirstate>
    <dict>

Although there are 700k+ garbage objects, the "root" garbage objects are
just a few and can probably be investigated manually.

A simple test was added to demonstrate the feature.

diff --git a/contrib/leakdetect.py b/contrib/leakdetect.py
new file mode 100644
--- /dev/null
+++ b/contrib/leakdetect.py
@@ -0,0 +1,174 @@
+# leakdetect.py - CPython object cyclic references analysis
+#
+# Copyright 2017 Facebook, Inc.
+#
+# This software may be used and distributed according to the terms of the
+# GNU General Public License version 2 or any later version.
+
+from __future__ import absolute_import
+
+import contextlib
+import gc
+import os
+import platform
+import sys
+
+def stronglyconnectedcomponents(vertices, edges):
+    """yield strongly connected components in a directed graph"""
+    ignored = set()  # vertices already output
+    stack = []  # vertex DFS stack
+    index = {}  # stack[index[n]] == n; later visit, bigger index
+    lowlink = {}  # see Tarjan's strongly connected components algorithm
+    execstack = []  # emulated execution stack to bypass Python's stack limit
+
+    def visit(v):
+        index[v] = lowlink[v] = len(stack)
+        stack.append(v)
+        execstack.append((checkroot, (v,)))
+        for w in edges.get(v, []):
+            execstack.append((updatelowlink, (v, w)))
+            if w not in index:
+                execstack.append((visit, (w,)))
+
+    def updatelowlink(v, w):
+        lowlink[v] = min(lowlink[v], lowlink[w])
+
+    def checkroot(v):
+        i = index[v]
+        if lowlink[v] == i:
+            component = set(stack[i:])
+            del stack[i:]
+            ignored.update(component)
+            return component
+
+    for v in vertices:
+        if v not in ignored:
+            index.clear()
+            lowlink.clear()
+            execstack.append((visit, (v,)))
+            while execstack:
+                func, args = execstack.pop()
+                result = func(*args)
+                if result is not None:
+                    yield result
+
+def simplifygraph(vertices, edges):
+    """collapse each strongly connected component into a single vertex
+
+    This removes cycles. Return a DAG (newvertices, newedges, vertexmapping).
+    vertexmapping: {newvertex: set of old vertices}
+    """
+    components = stronglyconnectedcomponents(vertices, edges)
+    old2new = {}  # {oldvertex: newvertex}
+    new2old = {}  # {newvertex: set of old vertices}
+    for c in components:
+        root = next(iter(c))
+        for v in c:
+            old2new[v] = root
+        new2old[root] = c
+    newedges = {}
+    for v, ws in edges.items():
+        v = old2new[v]
+        ws = set(old2new[w] for w in ws if old2new[w] != v)
+        newedges.setdefault(v, set()).update(ws)
+    return new2old.keys(), newedges, new2old
+
+def notreferred(vertices, edges):
+    """find vertices that are not referred to (in-degree is 0)"""
+    referred = set()
+    for ws in edges.values():
+        referred.update(ws)
+    return set(vertices) - referred
+
+def objectreferencegraph(objects):
+    """return (vertices, edges), the reference relationship among objects"""
+    idset = set(id(o) for o in objects)
+    edges = {}
+    for o in objects:
+        edges[id(o)] = [id(r) for r in gc.get_referents(o) if id(r) in idset]
+    return idset, edges
+
+def describeobject(obj):
+    """return short, easy-to-understand text to help identify an object
+
+    Similar to repr(), but avoids huge output in all cases.
+ """ + class _dummy(object): + def _dummy(self): + pass + + if isinstance(obj, tuple): + return '<tuple (%s)>' % ', '.join(describeobject(v) for v in obj) + elif isinstance(obj, type): + return '<type %s>' % obj.__name__ + elif isinstance(obj, type(describeobject)): # function + code = obj.func_code + path = os.path.basename(code.co_filename) + return ('<function %s@%s:%s>' % (obj.func_name, path, + code.co_firstlineno)) + elif isinstance(obj, type(_dummy()._dummy)): # instancemethod + return repr(obj) + else: + if type(obj) in (list, set, dict): + tryrepr = repr(obj) + if len(tryrepr) < 70: + return tryrepr + cellcontent = getattr(obj, 'cell_contents', None) + if cellcontent is not None: + return '<cell %s>' % describeobject(cellcontent) + typename = type(obj).__name__ + if typename == 'getset_descriptor': + return repr(obj) + return '<%s>' % typename + +def analysegarbage(garbage, out): + """analyse root garbage objects and output them for investigation + + root garbage objects are strongly connected components in the object + reference graph where they are not referred by other strongly connected + components. + """ + out.write('%d objects not freed by reference counting\n' % len(garbage)) + vertices, edges, mapping = simplifygraph(*objectreferencegraph(garbage)) + roots = [mapping[v] for v in notreferred(vertices, edges)] + objmap = {id(o): o for o in garbage} + groups = [[objmap[o] for o in root] for root in roots] + outputs = [] + for g in groups: + output = '' + if len(g) == 1: + # it may or may not be in a cycle due to gc.get_referents is not + # guaranteed to provide complete information + output += '1 object cannot be freed:\n' + else: + output += '%d objects in a cycle:\n' % len(g) + output += ''.join(sorted(' %s\n' % describeobject(o) for o in g)) + outputs.append(output) + out.write(''.join(sorted(outputs))) + +if platform.python_implementation() == 'CPython': + # only verified to work in CPython + @contextlib.contextmanager + def leakdetect(out=sys.stderr): + # make gc.collect put objects to gc.garbage instead of releasing them + gc.set_debug(gc.get_debug() | gc.DEBUG_SAVEALL) + gc.disable() + try: + yield + finally: + # populate gc.garbage + gc.collect() + garbage = gc.garbage + # restore the default behavior + gc.set_debug(gc.get_debug() ^ gc.DEBUG_SAVEALL) + if garbage: + analysegarbage(garbage, out) + + disabled = False +else: + # fallback to a no-op gracefully + @contextlib.contextmanager + def leakdetect(out=sys.stderr): + yield + + disabled = True diff --git a/tests/test-leakdetect.py b/tests/test-leakdetect.py new file mode 100644 --- /dev/null +++ b/tests/test-leakdetect.py @@ -0,0 +1,24 @@ +from __future__ import absolute_import + +import os +import sys + +dirname = os.path.dirname +sys.path.insert(0, os.path.join(dirname(dirname(__file__)), 'contrib')) + +import leakdetect + +if leakdetect.disabled: + sys.exit(80, 'skipped: missing feature: cpython') + +def leak1(): + a = [] + b = {} + b['a'] = a + a.append(b) + a.append([[[[['65537']]]]]) # referenced by root garbage, uninteresting + def func(x): + return func(x + 1) + +with leakdetect.leakdetect(): + leak1() diff --git a/tests/test-leakdetect.py.out b/tests/test-leakdetect.py.out new file mode 100644 --- /dev/null +++ b/tests/test-leakdetect.py.out @@ -0,0 +1,8 @@ +10 objects not freed by reference counting +2 objects in a cycle: + [{'a': [...]}, [[[[['65537']]]]]] + {'a': [{...}, [[[[['65537']]]]]]} +3 objects in a cycle: + <cell <function f...@test-leakdetect.py:20>> + <function f...@test-leakdetect.py:20> + 
+  <tuple (<cell <function f...@test-leakdetect.py:20>>)>
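To make the "root" garbage idea above concrete, here is a rough,
illustrative sketch (not part of the patch) of how the graph helpers added
by leakdetect.py could be exercised on a toy graph; it assumes contrib/ is
on sys.path, and the vertex names and edges are invented for the example:

  import leakdetect

  # A <--> B form a cycle ("root" garbage); C is only referred to by that cycle
  vertices = ['A', 'B', 'C']
  edges = {'A': ['B', 'C'], 'B': ['A'], 'C': []}

  # collapse each strongly connected component into a single vertex (a DAG)
  newvertices, newedges, mapping = leakdetect.simplifygraph(vertices, edges)

  # vertices with in-degree 0 in the DAG correspond to the "root" components
  roots = [mapping[v] for v in leakdetect.notreferred(newvertices, newedges)]
  print(roots)  # e.g. [set(['A', 'B'])]; C is left out as less interesting

analysegarbage() does the same thing, except that its graph comes from
gc.garbage and gc.get_referents() instead of being written by hand.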