On Mon, Dec 20, 2010 at 11:19 AM, Philip Martin
<philip.mar...@wandisco.com> wrote:
> Johan Corveleyn <jcor...@gmail.com> writes:
>
>> This makes the diff algorithm another 10% - 15%
>> faster (granted, this was measured with my "extreme" testcase of a 1,5
>> Mb file (60000 lines), of which most lines are identical
>> prefix/suffix).
>
> Can you provide a test script?  Or decribe the test more fully, please.

Hmm, it's not easy to come up with a test script to test this "from
scratch" (unless with testing diff directly, see below). I test it
with a repository (a dump/load of an old version of our production
repository) which contains this 60000 line xml file (1,5 Mb) with 2272
revisions.

I run blame on this file, over svnserve protocol on localhost (server
running on same machine), with an svnserve built from Stefan^2's
performance branch (with membuffer caching of full-texts, so server
I/O is not the bottleneck). This gives me an easy way to call 2272
times diff on this file, and measure it (with the help of some
instrumentation code in blame.c, see attachment). And it's
incidentally the actual use case I first started out wanting to
optimize (blame for large files with many revisions).

This is the actual command I use, and the output generated by the
instrumentation in blame.c:
[[[
$ time svn blame -x-b svn://localhost/trunk/path/to/settings.xml >/dev/null
### blame took 117546875 usec (117 s)
### file_rev_handler: 3203125 (3 s) - window_handler: 110781250 (110 s)
### wrapped_handler: 37859375 (37 s) - diff: 70921875 (70 s) -
blame_process: 1015625 (1 s)


real    1m58.008s
user    0m0.015s
sys     0m0.031s
]]]

(note: I use -x-b option in this case, because for some reason this
speeds it up tremendously. This probably has something to do with my
test data, which contains in its history some "all tabs to spaces" and
"all spaces to tabs" revisions.).

Some background info on this instrumentation output:
- "blame took ...": timing before and after doing all the stuff
(around the call to svn_ra_get_file_revs2, which includes all the
callbacks).
- "file_rev_handler" and "window_handler": timing of all the useful
work that's done at the client side (so this kind of excludes the time
that the client is simply waiting for the server).
- Last line contains parts of the window_handler time:
- "wrapped_handler": time taken to build all the full-texts at the client side.
- "diff": time spent in calls to svn_diff_file_diff_2 (this is the one
I'm trying to optimize with the diff-optimizations stuff).
- "blame_process": the time taken to create and insert the blame
chunks (linked list with blame information). As you can see this is
quite negligible.

So, I'm mainly looking at the time reported after "diff:".

An alternative way to test this, which may be scriptable: testing diff
directly, by "svn diffing" a large file. I can notice small
differences (in the area of 10 or 20 milliseconds) when simply
executing a single "svn diff" of settings.xml, with one line modified.
But it's too small to make any definite conclusions (inaccuracy,
overhead of program startup, ...). Maybe a simple test in c, with a
for loop with many iterations calling svn_diff_file_diff_2, would be
better.

I guess it would be easy to script the creation of a new repository,
commit a file in it with 100000 lines, modify one line, and diff that
while measuring it.

(the best example I found in subversion's own repository on
svn.apache.org, was subversion/tests/cmdline/merge_tests.py. This has
~16500 lines, and has about 660 changes)

Cheers,
-- 
Johan
Index: subversion/libsvn_client/blame.c
===================================================================
--- subversion/libsvn_client/blame.c    (revision 1041582)
+++ subversion/libsvn_client/blame.c    (working copy)
@@ -22,6 +22,7 @@
  */
 
 #include <apr_pools.h>
+#include <apr_time.h>
 
 #include "client.h"
 
@@ -42,6 +43,12 @@
 
 #include <assert.h>
 
+static long diff_time = 0L;
+static long blame_process_time = 0L;
+static long delta_process_time = 0L;
+static long file_rev_time = 0L;
+static long window_time = 0L;
+
 /* The metadata associated with a particular revision. */
 struct rev
 {
@@ -260,7 +267,9 @@ output_diff_modified(void *baton,
   struct diff_baton *db = baton;
 
   if (original_length)
-    SVN_ERR(blame_delete_range(db->chain, modified_start, original_length));
+    {
+      SVN_ERR(blame_delete_range(db->chain, modified_start, original_length));
+    }
 
   if (modified_length)
     SVN_ERR(blame_insert_range(db->chain, db->rev, modified_start,
@@ -285,10 +294,16 @@ add_file_blame(const char *last_file,
                const svn_diff_file_options_t *diff_options,
                apr_pool_t *pool)
 {
+  apr_time_t start_time;
+  long result_time;
+
   if (!last_file)
     {
       SVN_ERR_ASSERT(chain->blame == NULL);
+      start_time = apr_time_now();
       chain->blame = blame_create(chain, rev, 0);
+      result_time = apr_time_now() - start_time;
+      blame_process_time += result_time;
     }
   else
     {
@@ -299,9 +314,15 @@ add_file_blame(const char *last_file,
       diff_baton.rev = rev;
 
       /* We have a previous file.  Get the diff and adjust blame info. */
+      start_time = apr_time_now();
       SVN_ERR(svn_diff_file_diff_2(&diff, last_file, cur_file,
                                    diff_options, pool));
+      result_time = apr_time_now() - start_time;
+      diff_time += result_time;
+      start_time = apr_time_now();
       SVN_ERR(svn_diff_output(diff, &diff_baton, &output_fns));
+      result_time = apr_time_now() - start_time;
+      blame_process_time += result_time;
     }
 
   return SVN_NO_ERROR;
@@ -313,13 +334,23 @@ window_handler(svn_txdelta_window_t *window, void
   struct delta_baton *dbaton = baton;
   struct file_rev_baton *frb = dbaton->file_rev_baton;
   struct blame_chain *chain;
+  apr_time_t wrapped_start_time, start_time;
+  long wrapped_result_time, result_time;
 
+  start_time = apr_time_now();
   /* Call the wrapped handler first. */
+  wrapped_start_time = apr_time_now();
   SVN_ERR(dbaton->wrapped_handler(window, dbaton->wrapped_baton));
+  wrapped_result_time = apr_time_now() - wrapped_start_time;
+  delta_process_time += wrapped_result_time;
 
   /* We patiently wait for the NULL window marking the end. */
   if (window)
+  {
+    result_time = apr_time_now() - start_time;
+    window_time += result_time;
     return SVN_NO_ERROR;
+  }
 
   /* Close the files used for the delta.
      It is important to do this early, since otherwise, they will be deleted
@@ -375,6 +406,8 @@ window_handler(svn_txdelta_window_t *window, void
     frb->currpool = tmp_pool;
   }
 
+  result_time = apr_time_now() - start_time;
+  window_time += result_time;
   return SVN_NO_ERROR;
 }
 
@@ -415,7 +448,10 @@ file_rev_handler(void *baton, const char *path, sv
   svn_stream_t *cur_stream;
   struct delta_baton *delta_baton;
   apr_pool_t *filepool;
+  apr_time_t start_time;
+  long result_time;
 
+  start_time = apr_time_now();
   /* Clear the current pool. */
   svn_pool_clear(frb->currpool);
 
@@ -446,7 +482,11 @@ file_rev_handler(void *baton, const char *path, sv
      since the tempfile will be removed by the pool and we need the tempfile
      from the last revision with content changes. */
   if (!content_delta_handler)
+  {
+    result_time = apr_time_now() - start_time;
+    file_rev_time += result_time;
     return SVN_NO_ERROR;
+  }
 
   frb->merged_revision = merged_revision;
 
@@ -511,6 +551,8 @@ file_rev_handler(void *baton, const char *path, sv
   if (frb->include_merged_revisions)
     frb->rev->path = apr_pstrdup(frb->mainpool, path);
 
+  result_time = apr_time_now() - start_time;
+  file_rev_time += result_time;
   return SVN_NO_ERROR;
 }
 
@@ -600,6 +642,8 @@ svn_client_blame5(const char *target,
   svn_stream_t *last_stream;
   svn_stream_t *stream;
   const char *target_abspath_or_url;
+  apr_time_t starttime;
+  long result_time;
 
   if (start->kind == svn_opt_revision_unspecified
       || end->kind == svn_opt_revision_unspecified)
@@ -666,10 +710,18 @@ svn_client_blame5(const char *target,
      We need to ensure that we get one revision before the start_rev,
      if available so that we can know what was actually changed in the start
      revision. */
+  starttime = apr_time_now();
   SVN_ERR(svn_ra_get_file_revs2(ra_session, "",
                                 start_revnum - (start_revnum > 0 ? 1 : 0),
                                 end_revnum, include_merged_revisions,
                                 file_rev_handler, &frb, pool));
+  result_time = apr_time_now() - starttime;
+  fprintf(stderr, "### blame took %d usec (%d s)\n", result_time, 
result_time/1000000L);
+  fprintf(stderr, "### file_rev_handler: %d (%d s) - window_handler: %d (%d 
s)\n",
+    file_rev_time, file_rev_time/1000000L, window_time, window_time/1000000L);
+  fprintf(stderr, "### wrapped_handler: %d (%d s) - diff: %d (%d s) - 
blame_process: %d (%d s)\n\n", 
+    delta_process_time, delta_process_time/1000000L, diff_time, 
diff_time/1000000L, 
+    blame_process_time, blame_process_time/1000000L);
 
   if (end->kind == svn_opt_revision_working)
     {

Reply via email to