D2601: xdiff: reduce indent heuristic overhead

2018-03-03 Thread quark (Jun Wu)
This revision was automatically updated to reflect the committed changes.
Closed by commit rHG9f2597a6620e: xdiff: reduce indent heuristic overhead (authored by quark, committed by ).

CHANGED PRIOR TO COMMIT
  https://phab.mercurial-scm.org/D2601?vs=6475&id=6515#toc

REPOSITORY
  rHG Mercurial

CHANGES SINCE LAST UPDATE
  https://phab.mercurial-scm.org/D2601?vs=6475&id=6515

REVISION DETAIL
  https://phab.mercurial-scm.org/D2601

AFFECTED FILES
  mercurial/thirdparty/xdiff/xdiffi.c

CHANGE DETAILS

diff --git a/mercurial/thirdparty/xdiff/xdiffi.c b/mercurial/thirdparty/xdiff/xdiffi.c
--- a/mercurial/thirdparty/xdiff/xdiffi.c
+++ b/mercurial/thirdparty/xdiff/xdiffi.c
@@ -802,6 +802,14 @@
 }
 
 /*
+ * For indentation heuristic, skip searching for better slide position after
+ * checking MAX_BORING lines without finding an improvement. This defends the
+ * indentation heuristic logic against pathological cases. The value is not
+ * picked scientifically but should be good enough.
+ */
+#define MAX_BORING 100
+
+/*
  * Move back and forward change groups for a consistent and pretty diff output.
  * This also helps in finding joinable change groups and reducing the diff
  * size.
@@ -897,19 +905,43 @@
 		long shift, best_shift = -1;
 		struct split_score best_score;
 
-		for (shift = earliest_end; shift <= g.end; shift++) {
+		/*
+		 * This is O(N * MAX_BLANKS) (N = shift-able lines).
+		 * Even with MAX_BLANKS bounded to a small value, a
+		 * large N could still make this loop take several
+		 * times longer than the main diff algorithm. The
+		 * "boring" value is to help cut down N to something
+		 * like (MAX_BORING + groupsize).
+		 *
+		 * Scan from bottom to top. So we can exit the loop
+		 * without compromising the assumption "for a same best
+		 * score, pick the bottommost shift".
+		 */
+		int boring = 0;
+		for (shift = g.end; shift >= earliest_end; shift--) {
 			struct split_measurement m;
 			struct split_score score = {0, 0};
+			int cmp;
 
 			measure_split(xdf, shift, &m);
 			score_add_split(&m, &score);
 			measure_split(xdf, shift - groupsize, &m);
 			score_add_split(&m, &score);
-			if (best_shift == -1 ||
-			    score_cmp(&score, &best_score) <= 0) {
+
+			if (best_shift == -1) {
+				cmp = -1;
+			} else {
+				cmp = score_cmp(&score, &best_score);
+			}
+			if (cmp < 0) {
+				boring = 0;
 				best_score.effective_indent = score.effective_indent;
 				best_score.penalty = score.penalty;
 				best_shift = shift;
+			} else {
+				boring += 1;
+				if (boring >= MAX_BORING)
+					break;
 			}
 		}
 
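The new loop is essentially a bounded best-candidate search: walk the candidate
shifts from the bottom of the group upward, remember the first strictly better
score, and stop once MAX_BORING candidates in a row fail to improve on it.
Because a tie never replaces the current best, the bottommost of equally good
shifts is kept, which is why the old "<= 0" comparison can become "< 0" without
changing the tie-breaking rule. A minimal standalone sketch of that pattern
(illustration only, not the vendored xdiff code; eval_pos(), find_best_pos()
and the demo data are invented for this example):

/* Illustrative sketch only, not part of the patch or the vendored xdiff. */
#include <stdio.h>

#define MAX_BORING 100

/* Stand-in for the real scoring (score_cmp over split_score): lower wins. */
static int eval_pos(const int *penalty, long pos)
{
	return penalty[pos];
}

/*
 * Scan candidates from the bottom (hi) towards the top (lo). A strictly
 * better score becomes the new best and resets the "boring" counter; after
 * MAX_BORING candidates in a row without an improvement, stop early. Ties
 * never replace the current best, so the bottommost of equally good
 * candidates wins.
 */
static long find_best_pos(const int *penalty, long lo, long hi)
{
	long pos, best_pos = -1;
	int best_score = 0, boring = 0;

	for (pos = hi; pos >= lo; pos--) {
		int s = eval_pos(penalty, pos);

		if (best_pos == -1 || s < best_score) {
			best_score = s;
			best_pos = pos;
			boring = 0;
		} else if (++boring >= MAX_BORING) {
			break;
		}
	}
	return best_pos;
}

int main(void)
{
	int penalty[1000];
	long i;

	for (i = 0; i < 1000; i++)
		penalty[i] = (int)(i % 7);	/* arbitrary demo scores */
	/* Prints 994: the bottommost position with the minimal score 0. */
	printf("best position: %ld\n", find_best_pos(penalty, 0, 999));
	return 0;
}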



To: quark, #hg-reviewers, indygreg, durin42
Cc: mercurial-devel


D2601: xdiff: reduce indent heuristic overhead

2018-03-03 Thread quark (Jun Wu)
quark updated this revision to Diff 6475.

REPOSITORY
  rHG Mercurial

CHANGES SINCE LAST UPDATE
  https://phab.mercurial-scm.org/D2601?vs=6458&id=6475

REVISION DETAIL
  https://phab.mercurial-scm.org/D2601

AFFECTED FILES
  mercurial/thirdparty/xdiff/xdiffi.c

CHANGE DETAILS

diff --git a/mercurial/thirdparty/xdiff/xdiffi.c b/mercurial/thirdparty/xdiff/xdiffi.c
--- a/mercurial/thirdparty/xdiff/xdiffi.c
+++ b/mercurial/thirdparty/xdiff/xdiffi.c
@@ -801,6 +801,14 @@
exit(1);
 }
 
+/*
+ * For indentation heuristic, skip searching for better slide position after
+ * checking MAX_BORING lines without finding an improvement. This defends the
+ * indentation heuristic logic against pathological cases. The value is not
+ * picked scientifically but should be good enough.
+ */
+#define MAX_BORING 100
+
 /*
  * Move back and forward change groups for a consistent and pretty diff output.
  * This also helps in finding joinable change groups and reducing the diff
@@ -897,19 +905,43 @@
 		long shift, best_shift = -1;
 		struct split_score best_score;
 
-		for (shift = earliest_end; shift <= g.end; shift++) {
+		/*
+		 * This is O(N * MAX_BLANKS) (N = shift-able lines).
+		 * Even with MAX_BLANKS bounded to a small value, a
+		 * large N could still make this loop take several
+		 * times longer than the main diff algorithm. The
+		 * "boring" value is to help cut down N to something
+		 * like (MAX_BORING + groupsize).
+		 *
+		 * Scan from bottom to top. So we can exit the loop
+		 * without compromising the assumption "for a same best
+		 * score, pick the bottommost shift".
+		 */
+		int boring = 0;
+		for (shift = g.end; shift >= earliest_end; shift--) {
 			struct split_measurement m;
 			struct split_score score = {0, 0};
+			int cmp;
 
 			measure_split(xdf, shift, &m);
 			score_add_split(&m, &score);
 			measure_split(xdf, shift - groupsize, &m);
 			score_add_split(&m, &score);
-			if (best_shift == -1 ||
-			    score_cmp(&score, &best_score) <= 0) {
+
+			if (best_shift == -1) {
+				cmp = -1;
+			} else {
+				cmp = score_cmp(&score, &best_score);
+			}
+			if (cmp < 0) {
+				boring = 0;
 				best_score.effective_indent = score.effective_indent;
 				best_score.penalty = score.penalty;
 				best_shift = shift;
+			} else {
+				boring += 1;
+				if (boring >= MAX_BORING)
+					break;
 			}
 		}
 



To: quark, #hg-reviewers
Cc: mercurial-devel


D2601: xdiff: reduce indent heuristic overhead

2018-03-03 Thread quark (Jun Wu)
quark created this revision.
Herald added a subscriber: mercurial-devel.
Herald added a reviewer: hg-reviewers.

REVISION SUMMARY
  Adds some threshold to avoid expensive cases, like:
  
#!python
    open('a', 'w').write(" \n" * 1000000)
    open('b', 'w').write(" \n" * 1000001)
  
  The indent heuristic is O(N * 20) (N = 1000000) for the above case, and
  makes diff much slower.
  
  Before this patch (system git 2.14.2):
  
    git diff --no-indent-heuristic a b  0.21s user 0.03s system 100% cpu 0.239 total
    git diff --indent-heuristic a b 0.77s user 0.02s system 99% cpu 0.785 total
  
  After this patch (git 2fc74f41, with xdiffi.c patched):
  
# with the changed xdiffi.c
    git diff --indent-heuristic a b  0.16s user 0.01s system 90% cpu 0.188 total
    git diff --no-indent-heuristic a b   0.18s user 0.01s system 99% cpu 0.192 total
  
  Now turning on indent-heuristic has no visible impact on performance.
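
  As a rough back-of-the-envelope check (derived from the figures above, not an
  additional measurement): the per-shift work is bounded by MAX_BLANKS (20, per
  the O(N * 20) note above), and the change group here is a single added line
  (groupsize = 1), so the slide loop does roughly

    before:  ~1000000 candidate shifts * 20 = ~20000000 line measurements
    after:   ~(MAX_BORING + groupsize) * 20 = (100 + 1) * 20 = ~2020 line measurements

  which is why enabling the heuristic no longer registers next to the main diff
  computation.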

REPOSITORY
  rHG Mercurial

REVISION DETAIL
  https://phab.mercurial-scm.org/D2601

AFFECTED FILES
  mercurial/third-party/xdiff/xdiffi.c

CHANGE DETAILS

diff --git a/mercurial/third-party/xdiff/xdiffi.c b/mercurial/third-party/xdiff/xdiffi.c
--- a/mercurial/third-party/xdiff/xdiffi.c
+++ b/mercurial/third-party/xdiff/xdiffi.c
@@ -801,6 +801,14 @@
exit(1);
 }
 
+/*
+ * For indentation heuristic, skip searching for better slide position after
+ * checking MAX_BORING lines without finding an improvement. This defends the
+ * indentation heuristic logic against pathological cases. The value is not
+ * picked scientifically but should be good enough.
+ */
+#define MAX_BORING 100
+
 /*
  * Move back and forward change groups for a consistent and pretty diff output.
  * This also helps in finding joinable change groups and reducing the diff
@@ -897,19 +905,43 @@
 		long shift, best_shift = -1;
 		struct split_score best_score;
 
-		for (shift = earliest_end; shift <= g.end; shift++) {
+		/*
+		 * This is O(N * MAX_BLANKS) (N = shift-able lines).
+		 * Even with MAX_BLANKS bounded to a small value, a
+		 * large N could still make this loop take several
+		 * times longer than the main diff algorithm. The
+		 * "boring" value is to help cut down N to something
+		 * like (MAX_BORING + groupsize).
+		 *
+		 * Scan from bottom to top. So we can exit the loop
+		 * without compromising the assumption "for a same best
+		 * score, pick the bottommost shift".
+		 */
+		int boring = 0;
+		for (shift = g.end; shift >= earliest_end; shift--) {
 			struct split_measurement m;
 			struct split_score score = {0, 0};
+			int cmp;
 
 			measure_split(xdf, shift, &m);
 			score_add_split(&m, &score);
 			measure_split(xdf, shift - groupsize, &m);
 			score_add_split(&m, &score);
-			if (best_shift == -1 ||
-			    score_cmp(&score, &best_score) <= 0) {
+
+			if (best_shift == -1) {
+				cmp = -1;
+			} else {
+				cmp = score_cmp(&score, &best_score);
+			}
+			if (cmp < 0) {
+				boring = 0;
 				best_score.effective_indent = score.effective_indent;
 				best_score.penalty = score.penalty;
 				best_shift = shift;
+			} else {
+				boring += 1;
+				if (boring >= MAX_BORING)
+					break;
 			}
 		}
 



To: quark, #hg-reviewers
Cc: mercurial-devel