On Sun, May 29, 2011 at 6:00 PM, Bert Huijben <b...@qqmail.nl> wrote:
>
>
>> -----Original Message-----
>> From: Morten Kloster [mailto:mor...@gmail.com]
>> Sent: zondag 29 mei 2011 17:35
>> To: Julian Foad
>> Cc: Mark Phippard; dev@subversion.apache.org
>> Subject: Re: [PATCH] Speed-up of libsvn_diff using token counts
>>
>> On Fri, May 27, 2011 at 7:57 PM, Julian Foad <julian.f...@wandisco.com>
>> wrote:
>> > Morten Kloster wrote:
>> >> On Fri, May 27, 2011 at 4:55 PM, Julian Foad <julian.f...@wandisco.com>
>> wrote:
>> >> > Morten Kloster wrote:
>> >> >> I haven't changed the index/count types yet. What's the right type
>> >> >> to use to get signed 32 bit on 32-bit machines and signed 64 bit
>> >> >> on 64-bit machines?
>> >> >
>> >> > "int"?
>> >>
>> >> Is int guaranteed to correspond to (or be larger than) single-process
>> >> addressable space?
>> >
>> > No.  Some 64-bit platforms use 32-bit ints by default and 64-bit
>> > pointers.
>> >
>> > But do you really need to guarantee that svn's text-diff will cope with
>> > more than 2 billion lines on such a system?  Personally I don't think
>> > so.  If you think that is important, you can use "long int".
>
> long int isn't guaranteed to be 64 bit either. (E.g. on Windows 64 long int
> is just 32 bit)
>
> You need something like intptr_t if you want an integer type of pointer
> size.
>
> But threating more than 4 billion lines of text as just every other textfile
> doesn't seem necessary to me.
>
>        Bert
>
>

And I forgot the attachment, as well... Ok, trying again, using intptr_t this
time.


Morten
Index: subversion/libsvn_diff/diff.c
===================================================================
--- subversion/libsvn_diff/diff.c       (revision 1128885)
+++ subversion/libsvn_diff/diff.c       (working copy)
@@ -33,7 +33,39 @@
 
 #include "diff.h"
 
+/* Returns an array with the counts for the tokens in
+ * the looped linked list given in loop_start.
+ * num_tokens equals the highest possible token index +1.
+ */
 
+intptr_t*
+svn_diff__get_token_counts(svn_diff__position_t *loop_start,
+                           intptr_t num_tokens,
+                           apr_pool_t *pool)
+{
+  intptr_t *token_counts;
+  intptr_t token_index;
+  svn_diff__position_t *current;
+
+  token_counts = apr_palloc(pool, num_tokens * sizeof(*token_counts));
+  for (token_index = 0; token_index < num_tokens; token_index++)
+    token_counts[token_index] = 0;
+
+  current = loop_start;
+  if (current != NULL)
+    {
+      do
+        {
+          token_counts[current->token_index]++;
+          current = current->next;
+        }
+      while (current != loop_start);
+    }
+
+  return token_counts;
+}
+
+
 svn_diff_t *
 svn_diff__diff(svn_diff__lcs_t *lcs,
                apr_off_t original_start, apr_off_t modified_start,
@@ -105,6 +137,8 @@
 {
   svn_diff__tree_t *tree;
   svn_diff__position_t *position_list[2];
+  intptr_t num_tokens;
+  intptr_t *token_counts[2];
   svn_diff_datasource_e datasource[] = {svn_diff_datasource_original,
                                         svn_diff_datasource_modified};
   svn_diff__lcs_t *lcs;
@@ -138,6 +172,8 @@
                                prefix_lines,
                                subpool));
 
+  num_tokens = svn_diff__get_node_count(tree);
+
   /* The cool part is that we don't need the tokens anymore.
    * Allow the app to clean them up if it wants to.
    */
@@ -147,8 +183,12 @@
   /* We don't need the nodes in the tree either anymore, nor the tree itself */
   svn_pool_destroy(treepool);
 
+  token_counts[0] = svn_diff__get_token_counts(position_list[0], num_tokens, 
subpool);
+  token_counts[1] = svn_diff__get_token_counts(position_list[1], num_tokens, 
subpool);
+
   /* Get the lcs */
-  lcs = svn_diff__lcs(position_list[0], position_list[1], prefix_lines,
+  lcs = svn_diff__lcs(position_list[0], position_list[1], token_counts[0],
+                      token_counts[1], num_tokens, prefix_lines,
                       suffix_lines, subpool);
 
   /* Produce the diff */
Index: subversion/libsvn_diff/diff.h
===================================================================
--- subversion/libsvn_diff/diff.h       (revision 1128885)
+++ subversion/libsvn_diff/diff.h       (working copy)
@@ -62,7 +62,7 @@
 struct svn_diff__position_t
 {
   svn_diff__position_t *next;
-  svn_diff__node_t     *node;
+  intptr_t              token_index;
   apr_off_t             offset;
 };
 
@@ -99,12 +99,21 @@
 svn_diff__lcs_t *
 svn_diff__lcs(svn_diff__position_t *position_list1, /* pointer to tail (ring) 
*/
               svn_diff__position_t *position_list2, /* pointer to tail (ring) 
*/
+              intptr_t *token_counts_list1, /* array of counts */
+              intptr_t *token_counts_list2, /* array of counts */
+              intptr_t num_tokens, /* length of count arrays */
               apr_off_t prefix_lines,
               apr_off_t suffix_lines,
               apr_pool_t *pool);
 
 
 /*
+ * Returns number of tokens in a tree
+ */
+intptr_t
+svn_diff__get_node_count(svn_diff__tree_t *tree);
+
+/*
  * Support functions to build a tree of token positions
  */
 void
@@ -124,6 +133,11 @@
                      apr_off_t prefix_lines,
                      apr_pool_t *pool);
 
+/* Count the number of each token in a looped linked list */
+intptr_t*
+svn_diff__get_token_counts(svn_diff__position_t *loop_start,
+                           intptr_t num_tokens,
+                           apr_pool_t *pool);
 
 /* Morph a svn_lcs_t into a svn_diff_t. */
 svn_diff_t *
@@ -136,6 +150,7 @@
 svn_diff__resolve_conflict(svn_diff_t *hunk,
                            svn_diff__position_t **position_list1,
                            svn_diff__position_t **position_list2,
+                           intptr_t num_tokens,
                            apr_pool_t *pool);
 
 
Index: subversion/libsvn_diff/diff3.c
===================================================================
--- subversion/libsvn_diff/diff3.c      (revision 1128885)
+++ subversion/libsvn_diff/diff3.c      (working copy)
@@ -38,6 +38,7 @@
 svn_diff__resolve_conflict(svn_diff_t *hunk,
                            svn_diff__position_t **position_list1,
                            svn_diff__position_t **position_list2,
+                           intptr_t num_tokens,
                            apr_pool_t *pool)
 {
     apr_off_t modified_start = hunk->modified_start + 1;
@@ -47,6 +48,7 @@
     apr_off_t latest_length = hunk->latest_length;
     svn_diff__position_t *start_position[2];
     svn_diff__position_t *position[2];
+    intptr_t *token_counts[2];
     svn_diff__lcs_t *lcs = NULL;
     svn_diff__lcs_t **lcs_ref = &lcs;
     svn_diff_t **diff_ref = &hunk->resolved_diff;
@@ -72,7 +74,7 @@
                   ? modified_length : latest_length;
 
     while (common_length > 0
-           && position[0]->node == position[1]->node)
+           && position[0]->token_index == position[1]->token_index)
       {
         position[0] = position[0]->next;
         position[1] = position[1]->next;
@@ -173,9 +175,12 @@
         position[1]->next = start_position[1];
       }
 
-    *lcs_ref = svn_diff__lcs(position[0], position[1], 0, 0,
-                             subpool);
+    token_counts[0] = svn_diff__get_token_counts(position[0], num_tokens, 
subpool);
+    token_counts[1] = svn_diff__get_token_counts(position[1], num_tokens, 
subpool);
 
+    *lcs_ref = svn_diff__lcs(position[0], position[1], token_counts[0],
+                             token_counts[1], num_tokens, 0, 0, subpool);
+
     /* Fix up the EOF lcs element in case one of
      * the two sequences was NULL.
      */
@@ -251,6 +256,8 @@
 {
   svn_diff__tree_t *tree;
   svn_diff__position_t *position_list[3];
+  intptr_t num_tokens;
+  intptr_t *token_counts[3];
   svn_diff_datasource_e datasource[] = {svn_diff_datasource_original,
                                         svn_diff_datasource_modified,
                                         svn_diff_datasource_latest};
@@ -292,6 +299,8 @@
                                prefix_lines,
                                subpool));
 
+  num_tokens = svn_diff__get_node_count(tree);
+
   /* Get rid of the tokens, we don't need them to calc the diff */
   if (vtable->token_discard_all != NULL)
     vtable->token_discard_all(diff_baton);
@@ -299,10 +308,16 @@
   /* We don't need the nodes in the tree either anymore, nor the tree itself */
   svn_pool_destroy(treepool);
 
+  token_counts[0] = svn_diff__get_token_counts(position_list[0], num_tokens, 
subpool);
+  token_counts[1] = svn_diff__get_token_counts(position_list[1], num_tokens, 
subpool);
+  token_counts[2] = svn_diff__get_token_counts(position_list[2], num_tokens, 
subpool);
+
   /* Get the lcs for original-modified and original-latest */
-  lcs_om = svn_diff__lcs(position_list[0], position_list[1], prefix_lines,
+  lcs_om = svn_diff__lcs(position_list[0], position_list[1], token_counts[0],
+                         token_counts[1], num_tokens, prefix_lines,
                          suffix_lines, subpool);
-  lcs_ol = svn_diff__lcs(position_list[0], position_list[2], prefix_lines,
+  lcs_ol = svn_diff__lcs(position_list[0], position_list[2], token_counts[0],
+                         token_counts[2], num_tokens, prefix_lines,
                          suffix_lines, subpool);
 
   /* Produce a merged diff */
@@ -435,6 +450,7 @@
                 svn_diff__resolve_conflict(*diff_ref,
                                            &position_list[1],
                                            &position_list[2],
+                                           num_tokens,
                                            pool);
               }
             else if (is_modified)
Index: subversion/libsvn_diff/diff4.c
===================================================================
--- subversion/libsvn_diff/diff4.c      (revision 1128885)
+++ subversion/libsvn_diff/diff4.c      (working copy)
@@ -173,6 +173,8 @@
 {
   svn_diff__tree_t *tree;
   svn_diff__position_t *position_list[4];
+  intptr_t num_tokens;
+  intptr_t *token_counts[4];
   svn_diff_datasource_e datasource[] = {svn_diff_datasource_original,
                                         svn_diff_datasource_modified,
                                         svn_diff_datasource_latest,
@@ -227,6 +229,8 @@
                                prefix_lines,
                                subpool2));
 
+  num_tokens = svn_diff__get_node_count(tree);
+
   /* Get rid of the tokens, we don't need them to calc the diff */
   if (vtable->token_discard_all != NULL)
     vtable->token_discard_all(diff_baton);
@@ -234,8 +238,15 @@
   /* We don't need the nodes in the tree either anymore, nor the tree itself */
   svn_pool_clear(subpool3);
 
+  token_counts[0] = svn_diff__get_token_counts(position_list[0], num_tokens, 
subpool);
+  token_counts[1] = svn_diff__get_token_counts(position_list[1], num_tokens, 
subpool);
+  token_counts[2] = svn_diff__get_token_counts(position_list[2], num_tokens, 
subpool);
+  token_counts[3] = svn_diff__get_token_counts(position_list[3], num_tokens, 
subpool);
+
   /* Get the lcs for original - latest */
-  lcs_ol = svn_diff__lcs(position_list[0], position_list[2], prefix_lines,
+  lcs_ol = svn_diff__lcs(position_list[0], position_list[2],
+                         token_counts[0], token_counts[2],
+                         num_tokens, prefix_lines,
                          suffix_lines, subpool3);
   diff_ol = svn_diff__diff(lcs_ol, 1, 1, TRUE, pool);
 
@@ -257,7 +268,9 @@
   /* Get the lcs for common ancestor - original
    * Do reverse adjustements
    */
-  lcs_adjust = svn_diff__lcs(position_list[3], position_list[2], prefix_lines,
+  lcs_adjust = svn_diff__lcs(position_list[3], position_list[2],
+                             token_counts[3], token_counts[2],
+                             num_tokens, prefix_lines,
                              suffix_lines, subpool3);
   diff_adjust = svn_diff__diff(lcs_adjust, 1, 1, FALSE, subpool3);
   adjust_diff(diff_ol, diff_adjust);
@@ -267,7 +280,9 @@
   /* Get the lcs for modified - common ancestor
    * Do forward adjustments
    */
-  lcs_adjust = svn_diff__lcs(position_list[1], position_list[3], prefix_lines,
+  lcs_adjust = svn_diff__lcs(position_list[1], position_list[3],
+                             token_counts[1], token_counts[3],
+                             num_tokens, prefix_lines,
                              suffix_lines, subpool3);
   diff_adjust = svn_diff__diff(lcs_adjust, 1, 1, FALSE, subpool3);
   adjust_diff(diff_ol, diff_adjust);
@@ -283,7 +298,7 @@
       if (hunk->type == svn_diff__type_conflict)
         {
           svn_diff__resolve_conflict(hunk, &position_list[1],
-                                     &position_list[2], pool);
+                                     &position_list[2], num_tokens, pool);
         }
     }
 
Index: subversion/libsvn_diff/lcs.c
===================================================================
--- subversion/libsvn_diff/lcs.c        (revision 1128885)
+++ subversion/libsvn_diff/lcs.c        (working copy)
@@ -82,6 +82,7 @@
 static APR_INLINE void
 svn_diff__snake(apr_off_t k,
                 svn_diff__snake_t *fp,
+                intptr_t *token_counts[2],
                 svn_diff__lcs_t **freelist,
                 apr_pool_t *pool)
 {
@@ -123,6 +124,11 @@
     }
 
 
+  if (previous_lcs)
+    {
+      previous_lcs->refcount++;
+    }
+
   /* ### Optimization, skip all positions that don't have matchpoints
    * ### anyway. Beware of the sentinel, don't skip it!
    */
@@ -130,8 +136,10 @@
   position[0] = start_position[0];
   position[1] = start_position[1];
 
-  while (position[0]->node == position[1]->node)
+  while (1)
     {
+  while (position[0]->token_index == position[1]->token_index)
+    {
       position[0] = position[0]->next;
       position[1] = position[1]->next;
     }
@@ -153,18 +161,20 @@
       lcs->length = position[1]->offset - start_position[1]->offset;
       lcs->next = previous_lcs;
       lcs->refcount = 1;
-      fp[k].lcs = lcs;
+      previous_lcs = lcs;
+      start_position[0] = position[0];
+      start_position[1] = position[1];
     }
-  else
-    {
-      fp[k].lcs = previous_lcs;
+      /* Skip any and all tokens that only occur in one of the files */
+      if (position[0]->token_index >= 0 && 
token_counts[1][position[0]->token_index] == 0)
+        start_position[0] = position[0] = position[0]->next;
+      else if (position[1]->token_index >= 0 && 
token_counts[0][position[1]->token_index] == 0)
+        start_position[1] = position[1] = position[1]->next;
+      else
+        break;
     }
 
-  if (previous_lcs)
-    {
-      previous_lcs->refcount++;
-    }
-
+  fp[k].lcs = previous_lcs;
   fp[k].position[0] = position[0];
   fp[k].position[1] = position[1];
 
@@ -219,11 +229,17 @@
 svn_diff__lcs_t *
 svn_diff__lcs(svn_diff__position_t *position_list1, /* pointer to tail (ring) 
*/
               svn_diff__position_t *position_list2, /* pointer to tail (ring) 
*/
+              intptr_t *token_counts_list1, /* array of counts */
+              intptr_t *token_counts_list2, /* array of counts */
+              intptr_t num_tokens,
               apr_off_t prefix_lines,
               apr_off_t suffix_lines,
               apr_pool_t *pool)
 {
   apr_off_t length[2];
+  intptr_t *token_counts[2];
+  intptr_t unique_count[2];
+  intptr_t token_index;
   svn_diff__snake_t *fp;
   apr_off_t d;
   apr_off_t k;
@@ -261,10 +277,23 @@
       return lcs;
     }
 
-  /* Calculate lengths M and N of the sequences to be compared */
-  length[0] = position_list1->offset - position_list1->next->offset + 1;
-  length[1] = position_list2->offset - position_list2->next->offset + 1;
+  unique_count[1] = unique_count[0] = 0;
+  for (token_index = 0; token_index < num_tokens; token_index++)
+    {
+      if (token_counts_list1[token_index] == 0)
+        unique_count[1] += token_counts_list2[token_index];
+      if (token_counts_list2[token_index] == 0)
+        unique_count[0] += token_counts_list1[token_index];
+    }
 
+  /* Calculate lengths M and N of the sequences to be compared. Do not
+   * count tokens unique to one file, as those are ignored in __snake.
+   */
+  length[0] = position_list1->offset - position_list1->next->offset + 1
+              - unique_count[0];
+  length[1] = position_list2->offset - position_list2->next->offset + 1
+              - unique_count[1];
+
   /* strikerXXX: here we allocate the furthest point array, which is
    * strikerXXX: sized M + N + 3 (!)
    */
@@ -282,16 +311,17 @@
   sentinel_position[0].next = position_list1->next;
   position_list1->next = &sentinel_position[0];
   sentinel_position[0].offset = position_list1->offset + 1;
+  token_counts[0] = token_counts_list1;
 
   sentinel_position[1].next = position_list2->next;
   position_list2->next = &sentinel_position[1];
   sentinel_position[1].offset = position_list2->offset + 1;
+  token_counts[1] = token_counts_list2;
 
-  /* These are never dereferenced, only compared by value, so we
-   * can safely fake these up and the void* cast is OK.
+  /* Negative indices will not be used elsewhere
    */
-  sentinel_position[0].node = (void*)&sentinel_position[0];
-  sentinel_position[1].node = (void*)&sentinel_position[1];
+  sentinel_position[0].token_index = -1;
+  sentinel_position[1].token_index = -2;
 
   /* position d = M - N corresponds to the initial state, where
    * we are at the beginning of both files.
@@ -311,12 +341,12 @@
       /* For k < 0, insertions are free */
       for (k = (d < 0 ? d : 0) - p; k < 0; k++)
         {
-          svn_diff__snake(k, fp, &lcs_freelist, pool);
+          svn_diff__snake(k, fp, token_counts, &lcs_freelist, pool);
         }
          /* for k > 0, deletions are free */
       for (k = (d > 0 ? d : 0) + p; k >= 0; k--)
         {
-          svn_diff__snake(k, fp, &lcs_freelist, pool);
+          svn_diff__snake(k, fp, token_counts, &lcs_freelist, pool);
         }
 
       p++;
Index: subversion/libsvn_diff/token.c
===================================================================
--- subversion/libsvn_diff/token.c      (revision 1128885)
+++ subversion/libsvn_diff/token.c      (working copy)
@@ -46,6 +46,7 @@
   svn_diff__node_t     *right;
 
   apr_uint32_t          hash;
+  intptr_t              index;
   void                 *token;
 };
 
@@ -53,10 +54,20 @@
 {
   svn_diff__node_t     *root[SVN_DIFF__HASH_SIZE];
   apr_pool_t           *pool;
+  intptr_t              node_count;
 };
 
 
 /*
+ * Returns number of tokens in a tree
+ */
+intptr_t
+svn_diff__get_node_count(svn_diff__tree_t *tree)
+{
+  return tree->node_count;
+}
+
+/*
  * Support functions to build a tree of token positions
  */
 
@@ -65,6 +76,7 @@
 {
   *tree = apr_pcalloc(pool, sizeof(**tree));
   (*tree)->pool = pool;
+  (*tree)->node_count = 0;
 }
 
 
@@ -122,6 +134,7 @@
   new_node->right = NULL;
   new_node->hash = hash;
   new_node->token = token;
+  new_node->index = tree->node_count++;
 
   *node = *node_ref = new_node;
 
@@ -168,7 +181,7 @@
       /* Create a new position */
       position = apr_palloc(pool, sizeof(*position));
       position->next = NULL;
-      position->node = node;
+      position->token_index = node->index;
       position->offset = offset;
 
       *position_ref = position;

Reply via email to