Author: hartmannathan
Date: Tue Jun  8 13:26:23 2021
New Revision: 1890601

URL: http://svn.apache.org/viewvc?rev=1890601&view=rev
Log:
* tools/dev/gen-test-data/gen_diff_test_data.c: Create a program to generate
  sample data to test svn's diff.

Added:
    subversion/trunk/tools/dev/gen-test-data/
    subversion/trunk/tools/dev/gen-test-data/gen_diff_test_data.c   (with props)

Added: subversion/trunk/tools/dev/gen-test-data/gen_diff_test_data.c
URL: 
http://svn.apache.org/viewvc/subversion/trunk/tools/dev/gen-test-data/gen_diff_test_data.c?rev=1890601&view=auto
==============================================================================
--- subversion/trunk/tools/dev/gen-test-data/gen_diff_test_data.c (added)
+++ subversion/trunk/tools/dev/gen-test-data/gen_diff_test_data.c Tue Jun  8 
13:26:23 2021
@@ -0,0 +1,540 @@
+/* gen_diff_test_data.c -- Generate sample data to test svn's diff
+ *
+ * ====================================================================
+ *    Licensed to the Apache Software Foundation (ASF) under one
+ *    or more contributor license agreements.  See the NOTICE file
+ *    distributed with this work for additional information
+ *    regarding copyright ownership.  The ASF licenses this file
+ *    to you under the Apache License, Version 2.0 (the
+ *    "License"); you may not use this file except in compliance
+ *    with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *    Unless required by applicable law or agreed to in writing,
+ *    software distributed under the License is distributed on an
+ *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *    KIND, either express or implied.  See the License for the
+ *    specific language governing permissions and limitations
+ *    under the License.
+ * ====================================================================
+ *
+ *
+ * This is a program to generate some pathological sample data for
+ * testing and improving diff implementations.
+ *
+ * The output is deterministic but varies based on a seed value, like
+ * that provided to a pseudo random number generator. The output
+ * length is controlled as well. Both parameters are given at the
+ * command line. The output is written to stdout.
+ *
+ * Presumably if two large outputs are generated by two runs with
+ * different seed values, it will take a diff algorithm a long time to
+ * calculate their longest common subsequence.
+ *
+ *
+ * Usage:
+ *
+ * $ gen_diff_test_data <seed> <length>
+ *
+ *
+ * Implementation notes:
+ *
+ * Rather than use the system-provided pseudo random number generator,
+ * this program implements the hailstone sequence (see [1]) to assure
+ * that users on different systems can produce same outputs when using
+ * same seed and length values. That way people don't have to send
+ * each other huge >100M files of useless junk. :-)
+ *
+ *
+ * References:
+ *
+ * [1] Hailstone sequence: See Collatz Conjecture
+ *     https://en.wikipedia.org/wiki/Collatz_conjecture
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdarg.h>
+#include <string.h>
+#include <limits.h>
+
+
+#define PROGRAM_VERSION "0.01"
+
+
+/* starting number for a hailstone sequence
+ */
+static uint64_t g_seed;
+
+
+/* desired length of the output (approximately; actual may be longer)
+ */
+static uint64_t g_length;
+
+
+/* current number in the hailstone sequence
+ */
+static uint64_t g_curr;
+
+
+/* current word in words[] array
+ */
+static uint64_t g_word_index;
+
+
+/* number of bytes written to stdout
+ */
+static uint64_t g_written;
+
+
+/* how much to indent lines
+ */
+static int g_indents;
+
+
+/* a bunch of random words to print in the output
+ */
+static const char * words[] = {
+  "list", "exe", "MODULE", "EXE", "BIT", "database", "POINT", "link",
+  "node", "parent", "BYTE", "enumerated", "OPTION", "managed",
+  "deprecated", "point", "inheritance", "OUT", "VARIABLE", "PERL",
+  "core", "else", "provider", "IMPLEMENTATION", "ENDIANNESS",
+  "platform", "TYPE", "SCANNER", "libc", "lisp", "PROCESSOR", "path",
+  "optimisation", "NANO", "subversion", "FORTRAN", "support", "EMPTY",
+  "parser", "EXTENSION", "LOOP", "COLUMN", "resource", "end",
+  "SUBCLASS", "optimal", "silicon", "row", "EXTENSIONS", "config",
+  "EXCEPTION", "INHERITANCE", "BEGIN", "emacs", "VALLEY", "PROJECT",
+  "EXTERNAL", "version", "subclass", "array", "ABI", "OPTIMISATION",
+  "CLEAN", "ENVIRONMENT", "COL", "string", "RESOURCE", "VECTOR",
+  "true", "STANDALONE", "VAR", "cobol", "DATA", "main", "TOOL",
+  "ERROR", "IF", "drive", "errno", "artifact", "NO", "no", "DEVICE",
+  "namespace", "name", "while", "dependencies", "IOCTL", "FLOAT",
+  "SUBVERSION", "variable", "fortran", "external", "COBOL", "SILICON",
+  "table", "API", "DATABASE", "ioctl", "BUILTIN", "polymorphism",
+  "empty", "extensions", "OPTIMAL", "target", "optimization",
+  "superclass", "INTERFACE", "interface", "PREFERENCES", "FOR", "asm",
+  "var", "diagnostic", "PARALLELIZATION", "type", "xml", "linker",
+  "PROVIDER", "leaf", "valley", "LINK", "TOOLCHAIN", "false",
+  "DIAGNOSTIC", "RUNTIME", "CONFIGURATION", "CORE", "CONST",
+  "MANAGED", "LEAF", "encoding", "switch", "CASE", "ERRNO", "DEBUG",
+  "LIST", "double", "STATE", "builtin", "TARGET", "PYTHON", "SCRIPT",
+  "definitions", "file", "if", "TABLE", "SETTINGS", "compiler",
+  "ENUMERATED", "FALSE", "EXECUTABLE", "technical", "POLYMORPHISM",
+  "vector", "STUDIO", "NAME", "float", "VERSION", "exception", "TRUE",
+  "bit", "STORAGE", "INCANTATION", "endianness", "NODE", "id", "XML",
+  "DONE", "INVOCATION", "environment", "PARENT", "SUPPORT", "tool",
+  "ARRAY", "state", "project", "configuration", "const", "module",
+  "builder", "BUILDER", "parallelization", "perl", "standalone",
+  "ARTIFACT", "OPTIMIZATION", "COMPILER", "executable",
+  "DEPENDENCIES", "nil", "column", "debug", "FILE", "option",
+  "DEPRECATED", "COMMAND", "abi", "processor", "ENCODING", "command",
+  "WHILE", "LISP", "vim", "DOUBLE", "folder", "script", "EMACS",
+  "col", "DRIVE", "build", "case", "PARSER", "device", "clean", "NIL",
+  "storage", "preferences", "VIM", "END", "NAMESPACE", "data",
+  "toolchain", "STRING", "error", "description", "RELEASE",
+  "incantation", "nano", "do", "TECHNICAL", "ROW", "scanner",
+  "binary", "SUPERCLASS", "DESCRIPTION", "DO", "CONFIG", "invocation",
+  "DIRECTORY", "done", "SWITCH", "NULL", "FOLDER", "LIBC", "BUILD",
+  "ASM", "directory", "LINKER", "MAIN", "ID", "THEN",
+  "implementation", "ELSE", "PLATFORM", "PATH", "then", "connection",
+  "studio", "DEFINITIONS", "out", "null", "CONNECTION", "loop",
+  "python", "runtime", "api", "BINARY"
+};
+
+
+/* temporary space for constructing strings
+ */
+static char scratchpad1[1024];
+static char scratchpad2[1024];
+
+
+/* something bad happened; print message and terminate execution
+ */
+static void
+die(const char * s)
+{
+  if (s)
+    {
+      fprintf(stderr, "gen_diff_test_data: %s\n", s);
+    }
+
+  exit(1);
+}
+
+
+/* given a value, calculate next value in hailstone sequence
+ *
+ * f(n) = 3n+1 if n odd, n/2 if n even
+ */
+static uint64_t
+hailstone(uint64_t n)
+{
+  return (n & 1) ? (n * 3) + 1 : n >> 1;
+}
+
+
+/* advance global variable to next value in hailstone sequence
+ * if reached end of sequence, reseed and restart
+ */
+static void
+advance(void)
+{
+  if (g_curr == 1)
+    {
+      g_seed++;
+      g_curr = g_seed;
+    }
+  else
+    {
+      g_curr = hailstone(g_curr);
+    }
+}
+
+
+/* get another "pseudo-random" word from words[] and advance in
+ * hailstone sequence
+ */
+static const char *
+word(void)
+{
+  const char * ret;
+
+  g_word_index += g_curr;
+  ret = words[g_word_index % (sizeof(words) / sizeof(words[0]))];
+
+  advance();
+
+  return ret;
+}
+
+
+/* get another "pseudo-random" number and advance in hailstone
+ * sequence
+ */
+static int
+number(void)
+{
+  int ret = (int) g_curr;
+
+  advance();
+
+  return ret;
+}
+
+
+/* print a hopefully helpful message and then quit
+ */
+static void
+usage(void)
+{
+  fprintf(stderr, "gen_diff_test_data version %s\n\n",
+          PROGRAM_VERSION);
+
+  fprintf(stderr,
+          "Usage: gen_diff_test_data <seed> <length>\n"
+          "Where:\n"
+          "        seed   - controls the content of the output\n"
+          "        length - in bytes controls amount written\n"
+          "                 approximately; actual output could be\n"
+          "                 longer; can use k, m, or g suffix\n\n");
+
+  exit(1);
+}
+
+
+/* parse command line arguments and validate them successfully or quit
+ */
+static void
+parse_args(int argc, const char * argv[])
+{
+  char * endptr;
+  long int val;
+
+  if (argc != 3)
+    {
+      usage();
+    }
+
+  /* parse the seed value */
+
+  val = strtol(argv[1], &endptr, 0);
+  if ((val < 2) || (val == LONG_MAX))
+    {
+      die("seed must be in 1 < seed < LONG_MAX");
+    }
+
+  if ((endptr) && (*endptr))
+    {
+      die("unexpected stuff after seed");
+    }
+
+  g_seed = (uint64_t) val;
+
+  /* parse the length value */
+
+  val = strtol(argv[2], &endptr, 0);
+  if ((val < 1) || (val == LONG_MAX))
+    {
+      die("length must be in 0 < length < LONG_MAX");
+    }
+
+  g_length = (uint64_t) val;
+
+  if (endptr)
+    {
+      switch (*endptr)
+        {
+          case 0:
+            break;
+
+          case 'g': case 'G':
+            g_length <<= 10;
+          case 'm': case 'M':
+            g_length <<= 10;
+          case 'k': case 'K':
+            g_length <<= 10;
+
+            endptr++;
+            if (*endptr)
+              {
+                die("unexpected stuff after length");
+              }
+
+            break;
+
+          default: die("unknown length suffix");
+        }
+    }
+}
+
+
+/* print a string to stdout or else!
+ */
+static void
+print_or_die(const char * s, ...)
+{
+  va_list args;
+  int ret;
+
+  va_start(args, s);
+  ret = vfprintf(stdout, s, args);
+  va_end(args);
+
+  if (ret < 0)
+    {
+      die("sorry, vfprintf() failed!");
+    }
+
+  g_written += (uint64_t) ret;
+}
+
+
+/* print a string to a buffer or else!
+ */
+static void
+snprintf_or_die(char * s, size_t n, const char * fmt, ...)
+{
+  va_list args;
+  int ret;
+
+  va_start(args, fmt);
+  ret = vsnprintf(s, n, fmt, args);
+  va_end(args);
+
+  if ((ret < 0) || ((size_t) ret >= n))
+    {
+      die("sorry, vsnprintf() failed!");
+    }
+}
+
+
+/* really lame function to "indent" by the current indent level by
+ * repeatedly printing spaces
+ */
+static void
+indent_or_die(void)
+{
+  int indents = g_indents;
+
+  while (indents > 0)
+    {
+      print_or_die("  ");
+      indents--;
+    }
+}
+
+
+/* print an opening XML-looking tag and increase indent level
+ */
+static void
+open_tag(const char * s)
+{
+  indent_or_die();
+  print_or_die("<%s>\n", s);
+  g_indents++;
+}
+
+
+/* safely decrease indent level and print a closing XML-looking tag
+ */
+static void
+close_tag(const char * s)
+{
+  if (g_indents > 0)
+    {
+      g_indents--;
+    }
+
+  indent_or_die();
+  print_or_die("</%s>\n", s);
+}
+
+
+/* on one line, print an opening XML-looking tag, possibly with
+ * params, then print some contents, then print a closing tag; does
+ * not change indent level
+ */
+static void
+one_line_tag(const char * tag, const char * params,
+             const char * contents)
+{
+  indent_or_die();
+
+  if ((params) && (strlen(params)))
+    {
+      print_or_die("<%s %s>%s</%s>\n", tag, params, contents, tag);
+    }
+  else
+    {
+      print_or_die("<%s>%s</%s>\n", tag, contents, tag);
+    }
+}
+
+
+static void
+print_thing_1(void)
+{
+  snprintf_or_die(scratchpad1, sizeof(scratchpad1), "%s=\"%s\"",
+                  word(), word());
+  snprintf_or_die(scratchpad2, sizeof(scratchpad2), "%d", number());
+  one_line_tag(word(), scratchpad1, scratchpad2);
+}
+
+
+static void
+print_thing_2(void)
+{
+  snprintf_or_die(scratchpad2, sizeof(scratchpad2), "%d", number());
+  one_line_tag(word(), NULL, scratchpad2);
+}
+
+
+static void
+print_thing_3(void)
+{
+  snprintf_or_die(scratchpad2, sizeof(scratchpad2), "%s", word());
+  one_line_tag(word(), NULL, scratchpad2);
+}
+
+
+static void
+print_thing_4(void)
+{
+  snprintf_or_die(scratchpad1, sizeof(scratchpad1),
+                  "%s=\"%s\" %s=\"%s\" %s=\"%s\"",
+                  word(), word(), word(), word(), word(), word());
+  snprintf_or_die(scratchpad2, sizeof(scratchpad2), "%d", number());
+  one_line_tag(word(), scratchpad1, scratchpad2);
+}
+
+
+static void
+print_thing_x(int x)
+{
+  if ((x + 30) >= (sizeof(words) / sizeof(words[0])))
+    {
+      x = 0;
+    }
+
+  snprintf_or_die(scratchpad1, sizeof(scratchpad1),
+                  "%s=\"%s\" %s=\"%s\" %s=\"%s\"",
+                  words[x + 5], words[x + 10], words[x + 15],
+                  words[x + 20], words[x + 25], words[x + 30]);
+  snprintf_or_die(scratchpad2, sizeof(scratchpad2), "%d", x);
+  one_line_tag(word(), scratchpad1, scratchpad2);
+}
+
+
+static void
+print_sequence_1(void)
+{
+  open_tag("level1");
+
+  print_thing_1();
+  print_thing_2();
+
+  open_tag("level2");
+
+  print_thing_3();
+  print_thing_x(10);
+  print_thing_4();
+
+  open_tag("level3");
+
+  print_thing_1();
+  print_thing_2();
+  print_thing_x(35);
+  print_thing_3();
+
+  open_tag("level4");
+
+  print_thing_3();
+  print_thing_2();
+  print_thing_4();
+
+  close_tag("level4");
+  close_tag("level3");
+  close_tag("level2");
+
+  print_thing_3();
+
+  close_tag("level1");
+}
+
+
+/* generate a whole bunch of output that looks like XML with pseudo
+ * random contents, but lots of similar lines; in other words, stuff
+ * to keep a diff algorithm busy for a while
+ */
+static void
+generate_output(void)
+{
+  open_tag("level0");
+
+  while (g_written < g_length)
+    {
+      print_sequence_1();
+    }
+
+  close_tag("level0");
+}
+
+
+int
+main(int argc, const char * argv[])
+{
+  parse_args(argc, argv);
+
+  g_curr = g_seed;
+  g_word_index = 0;
+  g_written = 0;
+  g_indents = 0;
+
+  generate_output();
+
+  return 0;
+}
+

Propchange: subversion/trunk/tools/dev/gen-test-data/gen_diff_test_data.c
------------------------------------------------------------------------------
    svn:eol-style = native


Reply via email to