On 01/09/2017 08:49 PM, Sean Farley wrote:
# HG changeset patch
# User Sean Farley
# Date 1483850877 28800
# Sat Jan 07 20:47:57 2017 -0800
# Node ID 8561e240f25e851476abde452b850fc09a7fdf06
# Parent 9af4ef2f80a8c5cb69ad4ff5d291ce81a35e4f39
similar: move score function to module level
Future patches will use this to report the similarity of a rename / copy
in the patch output.
Martin spotted an interesting issue. This moves a @cachefunc decorator to
the module level, so all data accessed to compute similarity will end
up cached forever :-/. There is also a second cache added on the
score function that would keep filectx objects alive forever (and that
data was not cached at all before this patch).
I played around a bit and made the following change to allow the top
level function to stay cache-free while not regressing on the
caching in the loop itself. Can you send a V2 (with my change or another
one, as long as the caching issue is solved)?
--- a/mercurial/similar.py
+++ b/mercurial/similar.py
@@ -43,16 +43,17 @@ def _findexactmatches(repo, added, remov
# Done
repo.ui.progress(_('searching for exact renames'), None)
-@util.cachefunc
def _ctxdata(fctx):
# lazily load text
orig = fctx.data()
return orig, mdiff.splitnewlines(orig)
-@util.cachefunc
def score(fctx1, fctx2):
-text = fctx1.data()
-orig, lines = _ctxdata(fctx2)
+return _score(fctx1, _ctxdata(fctx2))
+
+def _score(fctx, otherdata):
+orig, lines = otherdata
+text = fctx.data()
# bdiff.blocks() returns blocks of matching lines
# count the number of bytes in each
equal = 0
@@ -74,10 +75,12 @@ def _findsimilarmatches(repo, added, rem
for i, r in enumerate(removed):
repo.ui.progress(_('searching for similar files'), i,
total=len(removed), unit=_('files'))
-
+data = None
for a in added:
bestscore = copies.get(a, (None, threshold))[1]
-myscore = score(a, r)
+if data is None:
+data = _ctxdata(r)
+myscore = _score(a, data)
if myscore >= bestscore:
copies[a] = (r, myscore)
repo.ui.progress(_('searching'), None)
diff --git a/mercurial/similar.py b/mercurial/similar.py
--- a/mercurial/similar.py
+++ b/mercurial/similar.py
@@ -41,10 +41,31 @@ def _findexactmatches(repo, added, remov
yield (hashes[h], fctx)
# Done
repo.ui.progress(_('searching for exact renames'), None)
+@util.cachefunc
+def _ctxdata(fctx):
+# lazily load text
+orig = fctx.data()
+return orig, mdiff.splitnewlines(orig)
+
+@util.cachefunc
+def score(fctx1, fctx2):
+text = fctx1.data()
+orig, lines = _ctxdata(fctx2)
+# bdiff.blocks() returns blocks of matching lines
+# count the number of bytes in each
+equal = 0
+matches = bdiff.blocks(text, orig)
+for x1, x2, y1, y2 in matches:
+for line in lines[y1:y2]:
+equal += len(line)
+
+lengths = len(text) + len(orig)
+return equal * 2.0 / lengths
+
def _findsimilarmatches(repo, added, removed, threshold):
'''find potentially renamed files based on similar file content
Takes a list of new filectxs and a list of removed filectxs, and yields
(before, after, score) tuples of partial matches.
@@ -52,32 +73,13 @@ def _findsimilarmatches(repo, added, rem
copies = {}
for i, r in enumerate(removed):
repo.ui.progress(_('searching for similar files'), i,
total=len(removed), unit=_('files'))
-# lazily load text
-@util.cachefunc
-def data():
-orig = r.data()
-return orig, mdiff.splitnewlines(orig)
-
-def score(text):
-orig, lines = data()
-# bdiff.blocks() returns blocks of matching lines
-# count the number of bytes in each
-equal = 0
-matches = bdiff.blocks(text, orig)
-for x1, x2, y1, y2 in matches:
-for line in lines[y1:y2]:
-equal += len(line)
-
-lengths = len(text) + len(orig)
-return equal * 2.0 / lengths
-
for a in added:
bestscore = copies.get(a, (None, threshold))[1]
-myscore = score(a.data())
+myscore = score(a, r)
if myscore >= bestscore:
copies[a] = (r, myscore)
repo.ui.progress(_('searching'), None)
for dest, v in copies.iteritems():
Cheers,
--
Pierre-Yves David
___
Mercurial-devel mailing list
Mercurial-devel@mercurial-scm.org
https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel