Re: [PATCH] limit diff effort to fix performance issue

Nathan Hartman Mon, 07 Jun 2021 22:45:22 -0700

On Wed, Jun 2, 2021 at 8:20 AM Stefan Sperling <s...@elego.de> wrote:
>
> I've heard of several instances where users complain that 'svn update'
> takes an extraordinary amount of time to run (in terms of hours).
>
> At elego we have received files which reproduce such behaviour.
> These files are XML files that are almost 100MB in size. While versioning
> such data is not the primary use case for Subversion, this should not get
> in the way of people getting work done. So I would like to get this fixed.
>
> The root of the performance problem is located in Subversion's diff code.
>
> The files contain in the order of 1.200.000 lines of XML text. I cannot
> share the full files, unfortunately.



In order to do some testing, I needed some test data that reproduces
the issue; since stsp can't share the customer's 100MB XML file, and
we'd probably want other inputs or sizes anyway, I wrote a program
that attempts to generate such a thing. I'm attaching that program...

To build, rename to .c extension and, e.g.,
$ gcc gen_diff_test_data.c -o gen_diff_test_data

To run it, provide two parameters:

The first is a 'seed' value like you'd provide to a pseudo random
number generator at init time.

The second is a 'length' parameter that says how long (approximately)
you want the output data to be. (The program nearly always overshoots
this by a small amount.)

Rather than using the system's pseudo random number generator, this
program includes its own implementation to ensure that users on any
system can get the same results when using the same parameters. So if
different people want to test with the same sets of input, you only
have to share 2 numbers, rather than send each other files >100MB of
useless junk.

Example: Generate two files of approx 100 MB, containing lots of
differences and diff them:

$ gen_diff_test_data 98 100m > one.txt
$ gen_diff_test_data 99 100m > two.txt
$ time diff one.txt two.txt > /dev/null

With the above parameters, it takes my system's diff about 50 seconds
to come up with something that looks reasonable at a glance; svn's
diff has been crunching away for a while now...

Cheers,
Nathan

/* Program to generate some pathological sample data for testing and
 * improving diff implementations.
 *
 * The output is deterministic but varies based on a seed value, like
 * that provided to a pseudo random number generator. The output
 * length is controlled as well. Both parameters are given at the
 * command line. The output is written to stdout.
 *
 * Presumably if two large outputs are generated by two runs with
 * different seed values, it will take a diff algorithm a long time to
 * calculate their longest common subsequence.
 *
 *
 * Usage:
 *
 * $ gen_diff_test_data <seed> <length>
 *
 *
 * Implementation notes:
 *
 * Rather than use the system-provided pseudo random number generator,
 * this program implements the hailstone sequence (see [1]) to assure
 * that users on different systems can produce same outputs when using
 * same seed and length values. That way people don't have to send
 * each other huge >100M files of useless junk. :-)
 *
 *
 * References:
 *
 * [1] Hailstone sequence: See Collatz Conjecture
 *     https://en.wikipedia.org/wiki/Collatz_conjecture
 */


#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdarg.h>
#include <string.h>
#include <limits.h>


#define PROGRAM_VERSION "0.01"


/* starting number for a hailstone sequence
 */
static uint64_t g_seed;


/* desired length of the output (approximately; actual may be longer)
 */
static uint64_t g_length;


/* current number in the hailstone sequence
 */
static uint64_t g_curr;


/* current word in words[] array
 */
static uint64_t g_word_index;


/* number of bytes written to stdout
 */
static uint64_t g_written;


/* how much to indent lines
 */
static int g_indents;


/* a bunch of random words to print in the output
 */
static const char * words[] = {
  "list", "exe", "MODULE", "EXE", "BIT", "database", "POINT", "link",
  "node", "parent", "BYTE", "enumerated", "OPTION", "managed",
  "deprecated", "point", "inheritance", "OUT", "VARIABLE", "PERL",
  "core", "else", "provider", "IMPLEMENTATION", "ENDIANNESS",
  "platform", "TYPE", "SCANNER", "libc", "lisp", "PROCESSOR", "path",
  "optimisation", "NANO", "subversion", "FORTRAN", "support", "EMPTY",
  "parser", "EXTENSION", "LOOP", "COLUMN", "resource", "end",
  "SUBCLASS", "optimal", "silicon", "row", "EXTENSIONS", "config",
  "EXCEPTION", "INHERITANCE", "BEGIN", "emacs", "VALLEY", "PROJECT",
  "EXTERNAL", "version", "subclass", "array", "ABI", "OPTIMISATION",
  "CLEAN", "ENVIRONMENT", "COL", "string", "RESOURCE", "VECTOR",
  "true", "STANDALONE", "VAR", "cobol", "DATA", "main", "TOOL",
  "ERROR", "IF", "drive", "errno", "artifact", "NO", "no", "DEVICE",
  "namespace", "name", "while", "dependencies", "IOCTL", "FLOAT",
  "SUBVERSION", "variable", "fortran", "external", "COBOL", "SILICON",
  "table", "API", "DATABASE", "ioctl", "BUILTIN", "polymorphism",
  "empty", "extensions", "OPTIMAL", "target", "optimization",
  "superclass", "INTERFACE", "interface", "PREFERENCES", "FOR", "asm",
  "var", "diagnostic", "PARALLELIZATION", "type", "xml", "linker",
  "PROVIDER", "leaf", "valley", "LINK", "TOOLCHAIN", "false",
  "DIAGNOSTIC", "RUNTIME", "CONFIGURATION", "CORE", "CONST",
  "MANAGED", "LEAF", "encoding", "switch", "CASE", "ERRNO", "DEBUG",
  "LIST", "double", "STATE", "builtin", "TARGET", "PYTHON", "SCRIPT",
  "definitions", "file", "if", "TABLE", "SETTINGS", "compiler",
  "ENUMERATED", "FALSE", "EXECUTABLE", "technical", "POLYMORPHISM",
  "vector", "STUDIO", "NAME", "float", "VERSION", "exception", "TRUE",
  "bit", "STORAGE", "INCANTATION", "endianness", "NODE", "id", "XML",
  "DONE", "INVOCATION", "environment", "PARENT", "SUPPORT", "tool",
  "ARRAY", "state", "project", "configuration", "const", "module",
  "builder", "BUILDER", "parallelization", "perl", "standalone",
  "ARTIFACT", "OPTIMIZATION", "COMPILER", "executable",
  "DEPENDENCIES", "nil", "column", "debug", "FILE", "option",
  "DEPRECATED", "COMMAND", "abi", "processor", "ENCODING", "command",
  "WHILE", "LISP", "vim", "DOUBLE", "folder", "script", "EMACS",
  "col", "DRIVE", "build", "case", "PARSER", "device", "clean", "NIL",
  "storage", "preferences", "VIM", "END", "NAMESPACE", "data",
  "toolchain", "STRING", "error", "description", "RELEASE",
  "incantation", "nano", "do", "TECHNICAL", "ROW", "scanner",
  "binary", "SUPERCLASS", "DESCRIPTION", "DO", "CONFIG", "invocation",
  "DIRECTORY", "done", "SWITCH", "NULL", "FOLDER", "LIBC", "BUILD",
  "ASM", "directory", "LINKER", "MAIN", "ID", "THEN",
  "implementation", "ELSE", "PLATFORM", "PATH", "then", "connection",
  "studio", "DEFINITIONS", "out", "null", "CONNECTION", "loop",
  "python", "runtime", "api", "BINARY"
};


/* temporary space for constructing strings
 */
static char scratchpad1[1024];
static char scratchpad2[1024];


/* something bad happened; print message and terminate execution
 */
static void
die(const char * s)
{
  if (s)
    {
      fprintf(stderr, "gen_diff_test_data: %s\n", s);
    }

  exit(1);
}


/* given a value, calculate next value in hailstone sequence
 *
 * f(n) = 3n+1 if n odd, n/2 if n even
 */
static uint64_t
hailstone(uint64_t n)
{
  return (n & 1) ? (n * 3) + 1 : n >> 1;
}


/* advance global variable to next value in hailstone sequence
 * if reached end of sequence, reseed and restart
 */
static void
advance(void)
{
  if (g_curr == 1)
    {
      g_seed++;
      g_curr = g_seed;
    }
  else
    {
      g_curr = hailstone(g_curr);
    }
}


/* get another "pseudo-random" word from words[] and advance in
 * hailstone sequence
 */
static const char *
word(void)
{
  const char * ret;

  g_word_index += g_curr;
  ret = words[g_word_index % (sizeof(words) / sizeof(words[0]))];

  advance();

  return ret;
}


/* get another "pseudo-random" number and advance in hailstone
 * sequence
 */
static int
number(void)
{
  int ret = (int) g_curr;

  advance();

  return ret;
}


/* print a hopefully helpful message and then quit
 */
static void
usage(void)
{
  fprintf(stderr, "gen_diff_test_data version %s\n\n",
          PROGRAM_VERSION);

  fprintf(stderr,
          "Usage: gen_diff_test_data <seed> <length>\n"
          "Where:\n"
          "        seed   - controls the content of the output\n"
          "        length - in bytes controls amount written\n"
          "                 approximately; actual output could be\n"
          "                 longer; can use k, m, or g suffix\n\n");

  exit(1);
}


/* parse command line arguments and validate them successfully or quit
 */
static void
parse_args(int argc, const char * argv[])
{
  char * endptr;
  long int val;

  if (argc != 3)
    {
      usage();
    }

  /* parse the seed value */

  val = strtol(argv[1], &endptr, 0);
  if ((val < 2) || (val == LONG_MAX))
    {
      die("seed must be in 1 < seed < LONG_MAX");
    }

  if ((endptr) && (*endptr))
    {
      die("unexpected stuff after seed");
    }

  g_seed = (uint64_t) val;

  /* parse the length value */

  val = strtol(argv[2], &endptr, 0);
  if ((val < 1) || (val == LONG_MAX))
    {
      die("length must be in 0 < length < LONG_MAX");
    }

  g_length = (uint64_t) val;

  if (endptr)
    {
      switch (*endptr)
        {
          case 0:
            break;

          case 'g': case 'G':
            g_length <<= 10;
          case 'm': case 'M':
            g_length <<= 10;
          case 'k': case 'K':
            g_length <<= 10;

            endptr++;
            if (*endptr)
              {
                die("unexpected stuff after length");
              }

            break;

          default: die("unknown length suffix");
        }
    }
}


/* print a string to stdout or else!
 */
static void
print_or_die(const char * s, ...)
{
  va_list args;
  int ret;

  va_start(args, s);
  ret = vfprintf(stdout, s, args);
  va_end(args);

  if (ret < 0)
    {
      die("sorry, vfprintf() failed!");
    }

  g_written += (uint64_t) ret;
}


/* print a string to a buffer or else!
 */
static void
snprintf_or_die(char * s, size_t n, const char * fmt, ...)
{
  va_list args;
  int ret;

  va_start(args, fmt);
  ret = vsnprintf(s, n, fmt, args);
  va_end(args);

  if ((ret < 0) || ((size_t) ret >= n))
    {
      die("sorry, vsnprintf() failed!");
    }
}


/* really lame function to "indent" by the current indent level by
 * repeatedly printing spaces
 */
static void
indent_or_die(void)
{
  int indents = g_indents;

  while (indents > 0)
    {
      print_or_die("  ");
      indents--;
    }
}


/* print an opening XML-looking tag and increase indent level
 */
static void
open_tag(const char * s)
{
  indent_or_die();
  print_or_die("<%s>\n", s);
  g_indents++;
}


/* safely decrease indent level and print a closing XML-looking tag
 */
static void
close_tag(const char * s)
{
  if (g_indents > 0)
    {
      g_indents--;
    }

  indent_or_die();
  print_or_die("</%s>\n", s);
}


/* on one line, print an opening XML-looking tag, possibly with
 * params, then print some contents, then print a closing tag; does
 * not change indent level
 */
static void
one_line_tag(const char * tag, const char * params,
             const char * contents)
{
  indent_or_die();

  if ((params) && (strlen(params)))
    {
      print_or_die("<%s %s>%s</%s>\n", tag, params, contents, tag);
    }
  else
    {
      print_or_die("<%s>%s</%s>\n", tag, contents, tag);
    }
}


static void
print_thing_1(void)
{
  snprintf_or_die(scratchpad1, sizeof(scratchpad1), "%s=\"%s\"",
                  word(), word());
  snprintf_or_die(scratchpad2, sizeof(scratchpad2), "%d", number());
  one_line_tag(word(), scratchpad1, scratchpad2);
}


static void
print_thing_2(void)
{
  snprintf_or_die(scratchpad2, sizeof(scratchpad2), "%d", number());
  one_line_tag(word(), NULL, scratchpad2);
}


static void
print_thing_3(void)
{
  snprintf_or_die(scratchpad2, sizeof(scratchpad2), "%s", word());
  one_line_tag(word(), NULL, scratchpad2);
}


static void
print_thing_4(void)
{
  snprintf_or_die(scratchpad1, sizeof(scratchpad1),
                  "%s=\"%s\" %s=\"%s\" %s=\"%s\"",
                  word(), word(), word(), word(), word(), word());
  snprintf_or_die(scratchpad2, sizeof(scratchpad2), "%d", number());
  one_line_tag(word(), scratchpad1, scratchpad2);
}


static void
print_thing_x(int x)
{
  if ((x + 30) >= (sizeof(words) / sizeof(words[0])))
    {
      x = 0;
    }

  snprintf_or_die(scratchpad1, sizeof(scratchpad1),
                  "%s=\"%s\" %s=\"%s\" %s=\"%s\"",
                  words[x + 5], words[x + 10], words[x + 15],
                  words[x + 20], words[x + 25], words[x + 30]);
  snprintf_or_die(scratchpad2, sizeof(scratchpad2), "%d", x);
  one_line_tag(word(), scratchpad1, scratchpad2);
}


static void
print_sequence_1(void)
{
  open_tag("level1");

  print_thing_1();
  print_thing_2();

  open_tag("level2");

  print_thing_3();
  print_thing_x(10);
  print_thing_4();

  open_tag("level3");

  print_thing_1();
  print_thing_2();
  print_thing_x(35);
  print_thing_3();

  open_tag("level4");

  print_thing_3();
  print_thing_2();
  print_thing_4();

  close_tag("level4");
  close_tag("level3");
  close_tag("level2");

  print_thing_3();

  close_tag("level1");
}


/* generate a whole bunch of output that looks like XML with pseudo
 * random contents, but lots of similar lines; in other words, stuff
 * to keep a diff algorithm busy for a while
 */
static void
generate_output(void)
{
  open_tag("level0");

  while (g_written < g_length)
    {
      print_sequence_1();
    }

  close_tag("level0");
}


int
main(int argc, const char * argv[])
{
  parse_args(argc, argv);

  g_curr = g_seed;
  g_word_index = 0;
  g_written = 0;
  g_indents = 0;

  generate_output();

  return 0;
}

Re: [PATCH] limit diff effort to fix performance issue

Reply via email to