D2631: [RFC] xdiff: skip trimmed lines when preparing the hashtable

quark (Jun Wu) Sat, 03 Mar 2018 18:52:32 -0800

quark created this revision.
Herald added a subscriber: mercurial-devel.
Herald added a reviewer: hg-reviewers.


REVISION SUMMARY
  NOTE: I'm still reading xdiffi.c to understand whether this is a safe
  optimization or not.
  
  xdiff has a "xdl_trim_ends" function that removes common prefix and suffix.
  
  Previously, xdiff will build a hashtable for all lines. That is a waste of
  time for trimmed lines. This diff changes the logic so trimmed lines will be
  ignored when building the hashtable. Note: the hashtable is still needed for
  shifting purpose, so it does not blindly take whatever `xdl_trim_ends` says,
  but also looks around.
  
  For the following test case:
  
    #!python
    open('a','w').write(''.join('%s\n' % (i % 100000) for i in 
xrange(10000000)))
    open('b','w').write(''.join('%s\n' % (i % 100000) for i in 
xrange(10000001)))
  
  This series reduces xdiff's time for the above case from 1.1 seconds 
(https://phab.mercurial-scm.org/D2604)
  to 0.6 seconds.
  
  However, GNU diffutils can perform even better (<0.1 seconds), there are
  still things to catch up.

REPOSITORY
  rHG Mercurial

REVISION DETAIL
  https://phab.mercurial-scm.org/D2631

AFFECTED FILES
  mercurial/thirdparty/xdiff/xprepare.c

CHANGE DETAILS

diff --git a/mercurial/thirdparty/xdiff/xprepare.c 
b/mercurial/thirdparty/xdiff/xprepare.c
--- a/mercurial/thirdparty/xdiff/xprepare.c
+++ b/mercurial/thirdparty/xdiff/xprepare.c
@@ -62,6 +62,7 @@
 static int xdl_clean_mmatch(char const *dis, long i, long s, long e);
 static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t 
*xdf2);
 static int xdl_trim_ends(xdfile_t *xdf1, xdfile_t *xdf2);
+static long xdl_trim_reserved_lines(xdfile_t *xdf1, xdfile_t *xdf2);
 
 
 
@@ -234,21 +235,32 @@
  * unique. This makes future calculation faster - they can just compare "ha"
  * instead of comparing line content.
  */
-static int xdl_prepare_hashtable(unsigned int pass, mmfile_t *mf,
-               xpparam_t const *xpp, xdlclassifier_t *cf, xdfile_t *xdf) {
+static int xdl_prepare_hashtable(unsigned int pass, long reserved, mmfile_t
+               *mf, xpparam_t const *xpp, xdlclassifier_t *cf, xdfile_t *xdf)
+{
        xrecord_t **rhash = NULL;
-       long nrec = xdf->nrec;
+       long nrec;
+
        unsigned int hbits;
        long hsize;
        long i;
+       long start = xdf->dstart - reserved;
+       long end = xdf->dend + reserved;
+
+       if (start < 0)
+               start = 0;
+       if (end >= xdf->nrec)
+               end = xdf->nrec - 1;
+
+       nrec = end - start;
 
        hbits = xdl_hashbits((unsigned int) nrec);
        hsize = 1 << hbits;
        if (!(rhash = (xrecord_t **) xdl_malloc(hsize * sizeof(xrecord_t *))))
                goto abort;
        memset(rhash, 0, hsize * sizeof(xrecord_t *));
 
-       for (i = 0; i < nrec; ++i) {
+       for (i = start; i <= end; i++) {
                if (xdl_classify_record(pass, cf, rhash, hbits, xdf->recs[i]) < 
0)
                        goto abort;
        }
@@ -275,7 +287,7 @@
 
 int xdl_prepare_env(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
                    xdfenv_t *xe) {
-       long enl1, enl2, sample;
+       long enl1, enl2, sample, reserved;
        xdlclassifier_t cf;
 
        memset(&cf, 0, sizeof(cf));
@@ -307,13 +319,14 @@
                return -1;
        }
 
-       if (xdl_prepare_hashtable(1, mf1, xpp, &cf, &xe->xdf1) < 0) {
+       reserved = xdl_trim_reserved_lines(&xe->xdf1, &xe->xdf2);
+       if (xdl_prepare_hashtable(1, reserved, mf1, xpp, &cf, &xe->xdf1) < 0) {
                xdl_free_ctx(&xe->xdf1);
                xdl_free_ctx(&xe->xdf2);
                xdl_free_classifier(&cf);
                return -1;
        }
-       if (xdl_prepare_hashtable(2, mf2, xpp, &cf, &xe->xdf2) < 0) {
+       if (xdl_prepare_hashtable(2, reserved, mf2, xpp, &cf, &xe->xdf2) < 0) {
                xdl_free_ctx(&xe->xdf1);
                xdl_free_ctx(&xe->xdf2);
                xdl_free_classifier(&cf);
@@ -462,7 +475,6 @@
        return 0;
 }
 
-
 /*
  * Early trim initial and terminal matching records.
  */
@@ -500,3 +512,22 @@
 
        return 0;
 }
+
+
+/*
+ * Return "reserved lines" for possible hunk shifting. Normally, only look at
+ * lines in dstart..dend range. But hunk shifting also needs accurate line
+ * hashes. Estimated hunk size and reserve lines for shifting purpose.
+ *
+ * This would be used by xdl_prepare_hashtable, to build accurate hash values.
+ */
+static long xdl_trim_reserved_lines(xdfile_t *xdf1, xdfile_t *xdf2) {
+       long lines = 0;
+       if (xdf1->dend > xdf1->dstart)
+               lines += xdf1->dend - xdf1->dstart;
+       if (xdf2->dend > xdf2->dstart)
+               lines += xdf2->dend - xdf2->dstart;
+       return lines;
+}
+
+



To: quark, #hg-reviewers
Cc: mercurial-devel
_______________________________________________
Mercurial-devel mailing list
Mercurial-devel@mercurial-scm.org
https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel

D2631: [RFC] xdiff: skip trimmed lines when preparing the hashtable

Reply via email to