Re: [PATCH 5 of 8 git-diff] similar: move score function to module level

2017-01-13 Thread Pierre-Yves David



On 01/09/2017 08:49 PM, Sean Farley wrote:

# HG changeset patch
# User Sean Farley 
# Date 1483850877 28800
#  Sat Jan 07 20:47:57 2017 -0800
# Node ID 8561e240f25e851476abde452b850fc09a7fdf06
# Parent  9af4ef2f80a8c5cb69ad4ff5d291ce81a35e4f39
similar: move score function to module level

Future patches will use this to report the similarity of a rename / copy
in the patch output.


Martin spotted an interesting issue. This move a @cachefunc decorator to 
the module level, so every data accessed to compute similarity will end 
up cached for ever :-/. There is also a second caching added on the 
score function that would keep filectx alive for ever (and was not 
initially cached at all before this patch)


I played around a bit and made the following change to allow the top 
level function to state cache free while not regressing about the 
caching in the loop itself. Can you send a V2 (with my change or another 
one as long as the caching issue is solved)


--- a/mercurial/similar.py
+++ b/mercurial/similar.py
@@ -43,16 +43,17 @@ def _findexactmatches(repo, added, remov
 # Done
 repo.ui.progress(_('searching for exact renames'), None)

-@util.cachefunc
 def _ctxdata(fctx):
 # lazily load text
 orig = fctx.data()
 return orig, mdiff.splitnewlines(orig)

-@util.cachefunc
 def score(fctx1, fctx2):
-text = fctx1.data()
-orig, lines = _ctxdata(fctx2)
+return _score(fcx1, _ctxdata(fctx2))
+
+def _score(fctx, otherdata):
+orig, lines = otherdata
+text = fctx.data()
 # bdiff.blocks() returns blocks of matching lines
 # count the number of bytes in each
 equal = 0
@@ -74,10 +75,12 @@ def _findsimilarmatches(repo, added, rem
 for i, r in enumerate(removed):
 repo.ui.progress(_('searching for similar files'), i,
  total=len(removed), unit=_('files'))
-
+data = None
 for a in added:
 bestscore = copies.get(a, (None, threshold))[1]
-myscore = score(a, r)
+if data is None:
+data = _ctxdata(r)
+myscore = _score(a, data)
 if myscore >= bestscore:
 copies[a] = (r, myscore)
 repo.ui.progress(_('searching'), None)





diff --git a/mercurial/similar.py b/mercurial/similar.py
--- a/mercurial/similar.py
+++ b/mercurial/similar.py
@@ -41,10 +41,31 @@ def _findexactmatches(repo, added, remov
 yield (hashes[h], fctx)

 # Done
 repo.ui.progress(_('searching for exact renames'), None)

+@util.cachefunc
+def _ctxdata(fctx):
+# lazily load text
+orig = fctx.data()
+return orig, mdiff.splitnewlines(orig)
+
+@util.cachefunc
+def score(fctx1, fctx2):
+text = fctx1.data()
+orig, lines = _ctxdata(fctx2)
+# bdiff.blocks() returns blocks of matching lines
+# count the number of bytes in each
+equal = 0
+matches = bdiff.blocks(text, orig)
+for x1, x2, y1, y2 in matches:
+for line in lines[y1:y2]:
+equal += len(line)
+
+lengths = len(text) + len(orig)
+return equal * 2.0 / lengths
+
 def _findsimilarmatches(repo, added, removed, threshold):
 '''find potentially renamed files based on similar file content

 Takes a list of new filectxs and a list of removed filectxs, and yields
 (before, after, score) tuples of partial matches.
@@ -52,32 +73,13 @@ def _findsimilarmatches(repo, added, rem
 copies = {}
 for i, r in enumerate(removed):
 repo.ui.progress(_('searching for similar files'), i,
  total=len(removed), unit=_('files'))

-# lazily load text
-@util.cachefunc
-def data():
-orig = r.data()
-return orig, mdiff.splitnewlines(orig)
-
-def score(text):
-orig, lines = data()
-# bdiff.blocks() returns blocks of matching lines
-# count the number of bytes in each
-equal = 0
-matches = bdiff.blocks(text, orig)
-for x1, x2, y1, y2 in matches:
-for line in lines[y1:y2]:
-equal += len(line)
-
-lengths = len(text) + len(orig)
-return equal * 2.0 / lengths
-
 for a in added:
 bestscore = copies.get(a, (None, threshold))[1]
-myscore = score(a.data())
+myscore = score(a, r)
 if myscore >= bestscore:
 copies[a] = (r, myscore)
 repo.ui.progress(_('searching'), None)

 for dest, v in copies.iteritems():


Cheers,

--
Pierre-Yves David
___
Mercurial-devel mailing list
Mercurial-devel@mercurial-scm.org
https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel


[PATCH 5 of 8 git-diff] similar: move score function to module level

2017-01-09 Thread Sean Farley
# HG changeset patch
# User Sean Farley 
# Date 1483850877 28800
#  Sat Jan 07 20:47:57 2017 -0800
# Node ID 8561e240f25e851476abde452b850fc09a7fdf06
# Parent  9af4ef2f80a8c5cb69ad4ff5d291ce81a35e4f39
similar: move score function to module level

Future patches will use this to report the similarity of a rename / copy
in the patch output.

diff --git a/mercurial/similar.py b/mercurial/similar.py
--- a/mercurial/similar.py
+++ b/mercurial/similar.py
@@ -41,10 +41,31 @@ def _findexactmatches(repo, added, remov
 yield (hashes[h], fctx)
 
 # Done
 repo.ui.progress(_('searching for exact renames'), None)
 
+@util.cachefunc
+def _ctxdata(fctx):
+# lazily load text
+orig = fctx.data()
+return orig, mdiff.splitnewlines(orig)
+
+@util.cachefunc
+def score(fctx1, fctx2):
+text = fctx1.data()
+orig, lines = _ctxdata(fctx2)
+# bdiff.blocks() returns blocks of matching lines
+# count the number of bytes in each
+equal = 0
+matches = bdiff.blocks(text, orig)
+for x1, x2, y1, y2 in matches:
+for line in lines[y1:y2]:
+equal += len(line)
+
+lengths = len(text) + len(orig)
+return equal * 2.0 / lengths
+
 def _findsimilarmatches(repo, added, removed, threshold):
 '''find potentially renamed files based on similar file content
 
 Takes a list of new filectxs and a list of removed filectxs, and yields
 (before, after, score) tuples of partial matches.
@@ -52,32 +73,13 @@ def _findsimilarmatches(repo, added, rem
 copies = {}
 for i, r in enumerate(removed):
 repo.ui.progress(_('searching for similar files'), i,
  total=len(removed), unit=_('files'))
 
-# lazily load text
-@util.cachefunc
-def data():
-orig = r.data()
-return orig, mdiff.splitnewlines(orig)
-
-def score(text):
-orig, lines = data()
-# bdiff.blocks() returns blocks of matching lines
-# count the number of bytes in each
-equal = 0
-matches = bdiff.blocks(text, orig)
-for x1, x2, y1, y2 in matches:
-for line in lines[y1:y2]:
-equal += len(line)
-
-lengths = len(text) + len(orig)
-return equal * 2.0 / lengths
-
 for a in added:
 bestscore = copies.get(a, (None, threshold))[1]
-myscore = score(a.data())
+myscore = score(a, r)
 if myscore >= bestscore:
 copies[a] = (r, myscore)
 repo.ui.progress(_('searching'), None)
 
 for dest, v in copies.iteritems():
___
Mercurial-devel mailing list
Mercurial-devel@mercurial-scm.org
https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel