On Sun, May 29, 2011 at 6:17 PM, Morten Kloster <mor...@gmail.com> wrote:
> On Sun, May 29, 2011 at 6:00 PM, Bert Huijben <b...@qqmail.nl> wrote:
>>
>>
>>> -----Original Message-----
>>> From: Morten Kloster [mailto:mor...@gmail.com]
>>> Sent: zondag 29 mei 2011 17:35
>>> To: Julian Foad
>>> Cc: Mark Phippard; dev@subversion.apache.org
>>> Subject: Re: [PATCH] Speed-up of libsvn_diff using token counts
>>>
>>> On Fri, May 27, 2011 at 7:57 PM, Julian Foad <julian.f...@wandisco.com>
>>> wrote:
>>> > Morten Kloster wrote:
>>> >> On Fri, May 27, 2011 at 4:55 PM, Julian Foad <julian.f...@wandisco.com>
>>> wrote:
>>> >> > Morten Kloster wrote:
>>> >> >> I haven't changed the index/count types yet. What's the right type
>>> >> >> to use to get signed 32 bit on 32-bit machines and signed 64 bit
>>> >> >> on 64-bit machines?
>>> >> >
>>> >> > "int"?
>>> >>
>>> >> Is int guaranteed to correspond to (or be larger than) single-process
>>> >> addressable space?
>>> >
>>> > No.  Some 64-bit platforms use 32-bit ints by default and 64-bit
>>> > pointers.
>>> >
>>> > But do you really need to guarantee that svn's text-diff will cope with
>>> > more than 2 billion lines on such a system?  Personally I don't think
>>> > so.  If you think that is important, you can use "long int".
>>
>> long int isn't guaranteed to be 64 bit either. (E.g. on Windows 64 long int
>> is just 32 bit)
>>
>> You need something like intptr_t if you want an integer type of pointer
>> size.
>>
>> But threating more than 4 billion lines of text as just every other textfile
>> doesn't seem necessary to me.
>>
>>        Bert
>>
>>
>
> And I forgot the attachment, as well... Ok, trying again, using intptr_t this
> time.
>
>
> Morten
>

Bert informed me that intptr_t might not be work on all systems, so I've
made a new typedef svn_diff__token_index_t and set it to long int for now.


[[[
Speed-up of libsvn_diff using token counts
By using indices, not node pointers, to refer to tokens, and counting
the number of each token, the longest common subsequence (lcs)
algorithm at the heart of libsvn_diff becomes much faster in many
situations by skipping tokens that are unique to one source or the other.

* subversion/libsvn_diff/token.c
  (svn_diff__node_t): Added 'index' member.
  (svn_diff__tree_t): Added 'node_count' member.
  (svn_diff__get_node_count): New method.
  (tree_insert_token): Assigns indices to nodes.
  (svn_diff__get_tokens): Uses token_index (int), not node pointer,
   to identify node in position struct.

* subversion/libsvn_diff/lcs.c
  (svn_diff__snake): Takes token counts as additional argument.
   Skips tokens that are unique to one or the other file.
  (svn_diff__lcs): Takes token counts and number of different tokens
   as additional arguments. Calculates number of tokens unique to
   each source and uses this to compute effective length difference.

* subversion/libsvn_diff/diff.h
  (svn_diff__token_index_t): New type for token indices/counts
  (svn_diff__position_t): Replaced node pointer member
   by an integer token_index.
  Updated and added new method declarations.

* subversion/libsvn_diff/diff.c
* subversion/libsvn_diff/diff3.c
* subversion/libsvn_diff/diff4.c
  (svn_diff_diff_2, svn_diff_diff_3, svn_diff_diff_4): Count the
   number of times each token appears in each source.
  (svn_diff__resolve_conflict): Takes number of different tokens
   as additional argument. Counts the number of times
   each token appears in each (partial) source.
  (svn_diff__get_token_counts): New method.
]]]
Index: subversion/libsvn_diff/diff.c
===================================================================
--- subversion/libsvn_diff/diff.c       (revision 1128885)
+++ subversion/libsvn_diff/diff.c       (working copy)
@@ -33,7 +33,39 @@
 
 #include "diff.h"
 
+/* Returns an array with the counts for the tokens in
+ * the looped linked list given in loop_start.
+ * num_tokens equals the highest possible token index +1.
+ */
 
+svn_diff__token_index_t*
+svn_diff__get_token_counts(svn_diff__position_t *loop_start,
+                           svn_diff__token_index_t num_tokens,
+                           apr_pool_t *pool)
+{
+  svn_diff__token_index_t *token_counts;
+  svn_diff__token_index_t token_index;
+  svn_diff__position_t *current;
+
+  token_counts = apr_palloc(pool, num_tokens * sizeof(*token_counts));
+  for (token_index = 0; token_index < num_tokens; token_index++)
+    token_counts[token_index] = 0;
+
+  current = loop_start;
+  if (current != NULL)
+    {
+      do
+        {
+          token_counts[current->token_index]++;
+          current = current->next;
+        }
+      while (current != loop_start);
+    }
+
+  return token_counts;
+}
+
+
 svn_diff_t *
 svn_diff__diff(svn_diff__lcs_t *lcs,
                apr_off_t original_start, apr_off_t modified_start,
@@ -105,6 +137,8 @@
 {
   svn_diff__tree_t *tree;
   svn_diff__position_t *position_list[2];
+  svn_diff__token_index_t num_tokens;
+  svn_diff__token_index_t *token_counts[2];
   svn_diff_datasource_e datasource[] = {svn_diff_datasource_original,
                                         svn_diff_datasource_modified};
   svn_diff__lcs_t *lcs;
@@ -138,6 +172,8 @@
                                prefix_lines,
                                subpool));
 
+  num_tokens = svn_diff__get_node_count(tree);
+
   /* The cool part is that we don't need the tokens anymore.
    * Allow the app to clean them up if it wants to.
    */
@@ -147,8 +183,12 @@
   /* We don't need the nodes in the tree either anymore, nor the tree itself */
   svn_pool_destroy(treepool);
 
+  token_counts[0] = svn_diff__get_token_counts(position_list[0], num_tokens, 
subpool);
+  token_counts[1] = svn_diff__get_token_counts(position_list[1], num_tokens, 
subpool);
+
   /* Get the lcs */
-  lcs = svn_diff__lcs(position_list[0], position_list[1], prefix_lines,
+  lcs = svn_diff__lcs(position_list[0], position_list[1], token_counts[0],
+                      token_counts[1], num_tokens, prefix_lines,
                       suffix_lines, subpool);
 
   /* Produce the diff */
Index: subversion/libsvn_diff/diff.h
===================================================================
--- subversion/libsvn_diff/diff.h       (revision 1128885)
+++ subversion/libsvn_diff/diff.h       (working copy)
@@ -59,10 +59,13 @@
   svn_diff_t *resolved_diff;
 };
 
+/* Type used for token indices and counts of tokens. Must be signed. */
+typedef long int svn_diff__token_index_t;
+
 struct svn_diff__position_t
 {
   svn_diff__position_t *next;
-  svn_diff__node_t     *node;
+  svn_diff__token_index_t token_index;
   apr_off_t             offset;
 };
 
@@ -99,12 +102,21 @@
 svn_diff__lcs_t *
 svn_diff__lcs(svn_diff__position_t *position_list1, /* pointer to tail (ring) 
*/
               svn_diff__position_t *position_list2, /* pointer to tail (ring) 
*/
+              svn_diff__token_index_t *token_counts_list1, /* array of counts 
*/
+              svn_diff__token_index_t *token_counts_list2, /* array of counts 
*/
+              svn_diff__token_index_t num_tokens, /* length of count arrays */
               apr_off_t prefix_lines,
               apr_off_t suffix_lines,
               apr_pool_t *pool);
 
 
 /*
+ * Returns number of tokens in a tree
+ */
+svn_diff__token_index_t
+svn_diff__get_node_count(svn_diff__tree_t *tree);
+
+/*
  * Support functions to build a tree of token positions
  */
 void
@@ -124,6 +136,11 @@
                      apr_off_t prefix_lines,
                      apr_pool_t *pool);
 
+/* Count the number of each token in a looped linked list */
+svn_diff__token_index_t*
+svn_diff__get_token_counts(svn_diff__position_t *loop_start,
+                           svn_diff__token_index_t num_tokens,
+                           apr_pool_t *pool);
 
 /* Morph a svn_lcs_t into a svn_diff_t. */
 svn_diff_t *
@@ -136,6 +153,7 @@
 svn_diff__resolve_conflict(svn_diff_t *hunk,
                            svn_diff__position_t **position_list1,
                            svn_diff__position_t **position_list2,
+                           svn_diff__token_index_t num_tokens,
                            apr_pool_t *pool);
 
 
Index: subversion/libsvn_diff/diff3.c
===================================================================
--- subversion/libsvn_diff/diff3.c      (revision 1128885)
+++ subversion/libsvn_diff/diff3.c      (working copy)
@@ -38,6 +38,7 @@
 svn_diff__resolve_conflict(svn_diff_t *hunk,
                            svn_diff__position_t **position_list1,
                            svn_diff__position_t **position_list2,
+                           svn_diff__token_index_t num_tokens,
                            apr_pool_t *pool)
 {
     apr_off_t modified_start = hunk->modified_start + 1;
@@ -47,6 +48,7 @@
     apr_off_t latest_length = hunk->latest_length;
     svn_diff__position_t *start_position[2];
     svn_diff__position_t *position[2];
+    svn_diff__token_index_t *token_counts[2];
     svn_diff__lcs_t *lcs = NULL;
     svn_diff__lcs_t **lcs_ref = &lcs;
     svn_diff_t **diff_ref = &hunk->resolved_diff;
@@ -72,7 +74,7 @@
                   ? modified_length : latest_length;
 
     while (common_length > 0
-           && position[0]->node == position[1]->node)
+           && position[0]->token_index == position[1]->token_index)
       {
         position[0] = position[0]->next;
         position[1] = position[1]->next;
@@ -173,9 +175,12 @@
         position[1]->next = start_position[1];
       }
 
-    *lcs_ref = svn_diff__lcs(position[0], position[1], 0, 0,
-                             subpool);
+    token_counts[0] = svn_diff__get_token_counts(position[0], num_tokens, 
subpool);
+    token_counts[1] = svn_diff__get_token_counts(position[1], num_tokens, 
subpool);
 
+    *lcs_ref = svn_diff__lcs(position[0], position[1], token_counts[0],
+                             token_counts[1], num_tokens, 0, 0, subpool);
+
     /* Fix up the EOF lcs element in case one of
      * the two sequences was NULL.
      */
@@ -251,6 +256,8 @@
 {
   svn_diff__tree_t *tree;
   svn_diff__position_t *position_list[3];
+  svn_diff__token_index_t num_tokens;
+  svn_diff__token_index_t *token_counts[3];
   svn_diff_datasource_e datasource[] = {svn_diff_datasource_original,
                                         svn_diff_datasource_modified,
                                         svn_diff_datasource_latest};
@@ -292,6 +299,8 @@
                                prefix_lines,
                                subpool));
 
+  num_tokens = svn_diff__get_node_count(tree);
+
   /* Get rid of the tokens, we don't need them to calc the diff */
   if (vtable->token_discard_all != NULL)
     vtable->token_discard_all(diff_baton);
@@ -299,10 +308,16 @@
   /* We don't need the nodes in the tree either anymore, nor the tree itself */
   svn_pool_destroy(treepool);
 
+  token_counts[0] = svn_diff__get_token_counts(position_list[0], num_tokens, 
subpool);
+  token_counts[1] = svn_diff__get_token_counts(position_list[1], num_tokens, 
subpool);
+  token_counts[2] = svn_diff__get_token_counts(position_list[2], num_tokens, 
subpool);
+
   /* Get the lcs for original-modified and original-latest */
-  lcs_om = svn_diff__lcs(position_list[0], position_list[1], prefix_lines,
+  lcs_om = svn_diff__lcs(position_list[0], position_list[1], token_counts[0],
+                         token_counts[1], num_tokens, prefix_lines,
                          suffix_lines, subpool);
-  lcs_ol = svn_diff__lcs(position_list[0], position_list[2], prefix_lines,
+  lcs_ol = svn_diff__lcs(position_list[0], position_list[2], token_counts[0],
+                         token_counts[2], num_tokens, prefix_lines,
                          suffix_lines, subpool);
 
   /* Produce a merged diff */
@@ -435,6 +450,7 @@
                 svn_diff__resolve_conflict(*diff_ref,
                                            &position_list[1],
                                            &position_list[2],
+                                           num_tokens,
                                            pool);
               }
             else if (is_modified)
Index: subversion/libsvn_diff/diff4.c
===================================================================
--- subversion/libsvn_diff/diff4.c      (revision 1128885)
+++ subversion/libsvn_diff/diff4.c      (working copy)
@@ -173,6 +173,8 @@
 {
   svn_diff__tree_t *tree;
   svn_diff__position_t *position_list[4];
+  svn_diff__token_index_t num_tokens;
+  svn_diff__token_index_t *token_counts[4];
   svn_diff_datasource_e datasource[] = {svn_diff_datasource_original,
                                         svn_diff_datasource_modified,
                                         svn_diff_datasource_latest,
@@ -227,6 +229,8 @@
                                prefix_lines,
                                subpool2));
 
+  num_tokens = svn_diff__get_node_count(tree);
+
   /* Get rid of the tokens, we don't need them to calc the diff */
   if (vtable->token_discard_all != NULL)
     vtable->token_discard_all(diff_baton);
@@ -234,8 +238,15 @@
   /* We don't need the nodes in the tree either anymore, nor the tree itself */
   svn_pool_clear(subpool3);
 
+  token_counts[0] = svn_diff__get_token_counts(position_list[0], num_tokens, 
subpool);
+  token_counts[1] = svn_diff__get_token_counts(position_list[1], num_tokens, 
subpool);
+  token_counts[2] = svn_diff__get_token_counts(position_list[2], num_tokens, 
subpool);
+  token_counts[3] = svn_diff__get_token_counts(position_list[3], num_tokens, 
subpool);
+
   /* Get the lcs for original - latest */
-  lcs_ol = svn_diff__lcs(position_list[0], position_list[2], prefix_lines,
+  lcs_ol = svn_diff__lcs(position_list[0], position_list[2],
+                         token_counts[0], token_counts[2],
+                         num_tokens, prefix_lines,
                          suffix_lines, subpool3);
   diff_ol = svn_diff__diff(lcs_ol, 1, 1, TRUE, pool);
 
@@ -257,7 +268,9 @@
   /* Get the lcs for common ancestor - original
    * Do reverse adjustements
    */
-  lcs_adjust = svn_diff__lcs(position_list[3], position_list[2], prefix_lines,
+  lcs_adjust = svn_diff__lcs(position_list[3], position_list[2],
+                             token_counts[3], token_counts[2],
+                             num_tokens, prefix_lines,
                              suffix_lines, subpool3);
   diff_adjust = svn_diff__diff(lcs_adjust, 1, 1, FALSE, subpool3);
   adjust_diff(diff_ol, diff_adjust);
@@ -267,7 +280,9 @@
   /* Get the lcs for modified - common ancestor
    * Do forward adjustments
    */
-  lcs_adjust = svn_diff__lcs(position_list[1], position_list[3], prefix_lines,
+  lcs_adjust = svn_diff__lcs(position_list[1], position_list[3],
+                             token_counts[1], token_counts[3],
+                             num_tokens, prefix_lines,
                              suffix_lines, subpool3);
   diff_adjust = svn_diff__diff(lcs_adjust, 1, 1, FALSE, subpool3);
   adjust_diff(diff_ol, diff_adjust);
@@ -283,7 +298,7 @@
       if (hunk->type == svn_diff__type_conflict)
         {
           svn_diff__resolve_conflict(hunk, &position_list[1],
-                                     &position_list[2], pool);
+                                     &position_list[2], num_tokens, pool);
         }
     }
 
Index: subversion/libsvn_diff/lcs.c
===================================================================
--- subversion/libsvn_diff/lcs.c        (revision 1128885)
+++ subversion/libsvn_diff/lcs.c        (working copy)
@@ -82,6 +82,7 @@
 static APR_INLINE void
 svn_diff__snake(apr_off_t k,
                 svn_diff__snake_t *fp,
+                svn_diff__token_index_t *token_counts[2],
                 svn_diff__lcs_t **freelist,
                 apr_pool_t *pool)
 {
@@ -123,6 +124,11 @@
     }
 
 
+  if (previous_lcs)
+    {
+      previous_lcs->refcount++;
+    }
+
   /* ### Optimization, skip all positions that don't have matchpoints
    * ### anyway. Beware of the sentinel, don't skip it!
    */
@@ -130,8 +136,10 @@
   position[0] = start_position[0];
   position[1] = start_position[1];
 
-  while (position[0]->node == position[1]->node)
+  while (1)
     {
+  while (position[0]->token_index == position[1]->token_index)
+    {
       position[0] = position[0]->next;
       position[1] = position[1]->next;
     }
@@ -153,18 +161,20 @@
       lcs->length = position[1]->offset - start_position[1]->offset;
       lcs->next = previous_lcs;
       lcs->refcount = 1;
-      fp[k].lcs = lcs;
+      previous_lcs = lcs;
+      start_position[0] = position[0];
+      start_position[1] = position[1];
     }
-  else
-    {
-      fp[k].lcs = previous_lcs;
+      /* Skip any and all tokens that only occur in one of the files */
+      if (position[0]->token_index >= 0 && 
token_counts[1][position[0]->token_index] == 0)
+        start_position[0] = position[0] = position[0]->next;
+      else if (position[1]->token_index >= 0 && 
token_counts[0][position[1]->token_index] == 0)
+        start_position[1] = position[1] = position[1]->next;
+      else
+        break;
     }
 
-  if (previous_lcs)
-    {
-      previous_lcs->refcount++;
-    }
-
+  fp[k].lcs = previous_lcs;
   fp[k].position[0] = position[0];
   fp[k].position[1] = position[1];
 
@@ -219,11 +229,17 @@
 svn_diff__lcs_t *
 svn_diff__lcs(svn_diff__position_t *position_list1, /* pointer to tail (ring) 
*/
               svn_diff__position_t *position_list2, /* pointer to tail (ring) 
*/
+              svn_diff__token_index_t *token_counts_list1, /* array of counts 
*/
+              svn_diff__token_index_t *token_counts_list2, /* array of counts 
*/
+              svn_diff__token_index_t num_tokens,
               apr_off_t prefix_lines,
               apr_off_t suffix_lines,
               apr_pool_t *pool)
 {
   apr_off_t length[2];
+  svn_diff__token_index_t *token_counts[2];
+  svn_diff__token_index_t unique_count[2];
+  svn_diff__token_index_t token_index;
   svn_diff__snake_t *fp;
   apr_off_t d;
   apr_off_t k;
@@ -261,10 +277,23 @@
       return lcs;
     }
 
-  /* Calculate lengths M and N of the sequences to be compared */
-  length[0] = position_list1->offset - position_list1->next->offset + 1;
-  length[1] = position_list2->offset - position_list2->next->offset + 1;
+  unique_count[1] = unique_count[0] = 0;
+  for (token_index = 0; token_index < num_tokens; token_index++)
+    {
+      if (token_counts_list1[token_index] == 0)
+        unique_count[1] += token_counts_list2[token_index];
+      if (token_counts_list2[token_index] == 0)
+        unique_count[0] += token_counts_list1[token_index];
+    }
 
+  /* Calculate lengths M and N of the sequences to be compared. Do not
+   * count tokens unique to one file, as those are ignored in __snake.
+   */
+  length[0] = position_list1->offset - position_list1->next->offset + 1
+              - unique_count[0];
+  length[1] = position_list2->offset - position_list2->next->offset + 1
+              - unique_count[1];
+
   /* strikerXXX: here we allocate the furthest point array, which is
    * strikerXXX: sized M + N + 3 (!)
    */
@@ -282,16 +311,17 @@
   sentinel_position[0].next = position_list1->next;
   position_list1->next = &sentinel_position[0];
   sentinel_position[0].offset = position_list1->offset + 1;
+  token_counts[0] = token_counts_list1;
 
   sentinel_position[1].next = position_list2->next;
   position_list2->next = &sentinel_position[1];
   sentinel_position[1].offset = position_list2->offset + 1;
+  token_counts[1] = token_counts_list2;
 
-  /* These are never dereferenced, only compared by value, so we
-   * can safely fake these up and the void* cast is OK.
+  /* Negative indices will not be used elsewhere
    */
-  sentinel_position[0].node = (void*)&sentinel_position[0];
-  sentinel_position[1].node = (void*)&sentinel_position[1];
+  sentinel_position[0].token_index = -1;
+  sentinel_position[1].token_index = -2;
 
   /* position d = M - N corresponds to the initial state, where
    * we are at the beginning of both files.
@@ -311,12 +341,12 @@
       /* For k < 0, insertions are free */
       for (k = (d < 0 ? d : 0) - p; k < 0; k++)
         {
-          svn_diff__snake(k, fp, &lcs_freelist, pool);
+          svn_diff__snake(k, fp, token_counts, &lcs_freelist, pool);
         }
          /* for k > 0, deletions are free */
       for (k = (d > 0 ? d : 0) + p; k >= 0; k--)
         {
-          svn_diff__snake(k, fp, &lcs_freelist, pool);
+          svn_diff__snake(k, fp, token_counts, &lcs_freelist, pool);
         }
 
       p++;
Index: subversion/libsvn_diff/token.c
===================================================================
--- subversion/libsvn_diff/token.c      (revision 1128885)
+++ subversion/libsvn_diff/token.c      (working copy)
@@ -46,6 +46,7 @@
   svn_diff__node_t     *right;
 
   apr_uint32_t          hash;
+  svn_diff__token_index_t index;
   void                 *token;
 };
 
@@ -53,10 +54,20 @@
 {
   svn_diff__node_t     *root[SVN_DIFF__HASH_SIZE];
   apr_pool_t           *pool;
+  svn_diff__token_index_t node_count;
 };
 
 
 /*
+ * Returns number of tokens in a tree
+ */
+svn_diff__token_index_t
+svn_diff__get_node_count(svn_diff__tree_t *tree)
+{
+  return tree->node_count;
+}
+
+/*
  * Support functions to build a tree of token positions
  */
 
@@ -65,6 +76,7 @@
 {
   *tree = apr_pcalloc(pool, sizeof(**tree));
   (*tree)->pool = pool;
+  (*tree)->node_count = 0;
 }
 
 
@@ -122,6 +134,7 @@
   new_node->right = NULL;
   new_node->hash = hash;
   new_node->token = token;
+  new_node->index = tree->node_count++;
 
   *node = *node_ref = new_node;
 
@@ -168,7 +181,7 @@
       /* Create a new position */
       position = apr_palloc(pool, sizeof(*position));
       position->next = NULL;
-      position->node = node;
+      position->token_index = node->index;
       position->offset = offset;
 
       *position_ref = position;

Reply via email to