Here is a preliminary patch for basic shuffling functionality in
'sort', with same-keys-sort-together behavior. It adds two options:
"-R" to compare based on a hash of key, and "--seed" to specify salt
for the hash. If "--seed" is not given then the default is to read
from /dev/random or /dev/urandom.

I still need to add support to configure.ac for some of the random
device macros.

Feedback?

Frederik

-- 
http://ofb.net/~frederik/
diff -x '*~' -ur -N coreutils-5.2.1/src/sort.c 
coreutils-5.2.1-sort-rand/src/sort.c
--- coreutils-5.2.1/src/sort.c  2005-06-02 22:56:29.000000000 -0700
+++ coreutils-5.2.1-sort-rand/src/sort.c        2005-06-05 22:07:37.000000000 
-0700
@@ -37,6 +37,9 @@
 #include "stdio-safer.h"
 #include "xmemcoll.h"
 #include "xstrtol.h"
+#include "checksum.h"
+#include "randseed.h"
+#include "md5.h"
 
 #if HAVE_SYS_RESOURCE_H
 # include <sys/resource.h>
@@ -153,6 +156,7 @@
   bool numeric;                        /* Flag for numeric comparison.  Handle
                                   strings of digits with optional decimal
                                   point, but no exponential notation. */
+  bool random_hash;            /* Shuffle by sorting on random hash of key */
   bool general_numeric;                /* Flag for general, numeric comparison.
                                   Handle numbers in exponential notation. */
   bool month;                  /* Flag for comparison by month name. */
@@ -296,6 +300,7 @@
   -i, --ignore-nonprinting    consider only printable characters\n\
   -M, --month-sort            compare (unknown) < `JAN' < ... < `DEC'\n\
   -n, --numeric-sort          compare according to string numerical value\n\
+  -R, --random                sort by random hash of keys\n\
   -r, --reverse               reverse the result of comparisons\n\
 \n\
 "), stdout);
@@ -318,6 +323,7 @@
 "), DEFAULT_TMPDIR);
       fputs (_("\
   -z, --zero-terminated     end lines with 0 byte, not newline\n\
+  --seed                    use specified seed for random number generator\n\
 "), stdout);
       fputs (HELP_OPTION_DESCRIPTION, stdout);
       fputs (VERSION_OPTION_DESCRIPTION, stdout);
@@ -346,7 +352,11 @@
   exit (status);
 }
 
-#define COMMON_SHORT_OPTIONS "-bcdfgik:mMno:rsS:t:T:uz"
+#define COMMON_SHORT_OPTIONS "-bcdfgik:mMnRo:rsS:t:T:uz"
+enum
+{
+  SEED_OPTION = CHAR_MAX + 1
+};
 
 static struct option const long_options[] =
 {
@@ -360,6 +370,7 @@
   {"merge", no_argument, NULL, 'm'},
   {"month-sort", no_argument, NULL, 'M'},
   {"numeric-sort", no_argument, NULL, 'n'},
+  {"random", no_argument, NULL, 'R'},
   {"output", required_argument, NULL, 'o'},
   {"reverse", no_argument, NULL, 'r'},
   {"stable", no_argument, NULL, 's'},
@@ -368,6 +379,7 @@
   {"temporary-directory", required_argument, NULL, 'T'},
   {"unique", no_argument, NULL, 'u'},
   {"zero-terminated", no_argument, NULL, 'z'},
+  {"seed", required_argument, NULL, SEED_OPTION},
   {GETOPT_HELP_OPTION_DECL},
   {GETOPT_VERSION_OPTION_DECL},
   {0, 0, 0, 0},
@@ -1332,6 +1344,26 @@
   return result;
 }
 
+struct digest_ops digops;
+void *salt_ctx;
+
+static void
+init_salt()
+{
+  get_digest_ops(ALG_MD5, &digops);
+  salt_ctx = xmalloc(digops.ctx_size);
+  digops.init_ctx(salt_ctx);
+}
+
+static void
+get_hash(char* text, int len, void *resbuf)
+{
+  void *ctx = alloca(digops.ctx_size);
+  memcpy(ctx, salt_ctx, digops.ctx_size);
+  digops.process_bytes(text, len, ctx);
+  digops.finish_ctx(ctx,resbuf);
+}
+
 /* Compare two lines A and B trying every key in sequence until there
    are no more keys or a difference is found. */
 
@@ -1374,6 +1406,15 @@
                  (texta, textb));
          *lima = savea, *limb = saveb;
        }
+      else if (key->random_hash)
+        {
+          int dig_bytes = digops.bits/8;
+          char diga[dig_bytes];
+          char digb[dig_bytes];
+          get_hash(texta, lena, diga);
+          get_hash(textb, lenb, digb);
+          diff = memcmp(diga, digb, sizeof(diga));
+        }
       else if (key->month)
        diff = getmonth (texta, lena) - getmonth (textb, lenb);
       /* Sorting like this may become slow, so in a simple locale the user
@@ -2083,7 +2124,7 @@
 {
   error (SORT_FAILURE, 0, _("%s: invalid field specification `%s'"),
         _(msgid), spec);
-  abort ();
+  abort (); // XXX is this ever reached? need comment if it is
 }
 
 /* Parse the leading integer in STRING and store the resulting value
@@ -2189,6 +2230,9 @@
        case 'n':
          key->numeric = true;
          break;
+       case 'R':
+         key->random_hash = true;
+         break;
        case 'r':
          key->reverse = true;
          break;
@@ -2217,6 +2261,8 @@
   int c = 0;
   bool checkonly = false;
   bool mergeonly = false;
+  char *randseed = 0;
+  bool need_rand = false;
   int nfiles = 0;
   bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
   bool obsolete_usage = (posix2_version () < 200112);
@@ -2300,7 +2346,7 @@
   gkey.sword = gkey.eword = SIZE_MAX;
   gkey.ignore = NULL;
   gkey.translate = NULL;
-  gkey.numeric = gkey.general_numeric = gkey.month = gkey.reverse = false;
+  gkey.numeric = gkey.general_numeric = gkey.random_hash = gkey.month = 
gkey.reverse = false;
   gkey.skipsblanks = gkey.skipeblanks = false;
 
   files = xnmalloc (argc, sizeof *files);
@@ -2376,6 +2422,7 @@
        case 'M':
        case 'n':
        case 'r':
+       case 'R':
          {
            char str[2];
            str[0] = c;
@@ -2503,6 +2550,10 @@
          eolchar = 0;
          break;
 
+        case SEED_OPTION:
+          randseed = strdupa(optarg);
+          break;
+
        case_GETOPT_HELP_CHAR;
 
        case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
@@ -2512,26 +2563,41 @@
        }
     }
 
+  need_rand |= gkey.random_hash;
   /* Inheritance of global options to individual keys. */
-  for (key = keylist; key; key = key->next)
-    if (! (key->ignore || key->translate
-          || (key->skipsblanks | key->reverse
-              | key->skipeblanks | key->month | key->numeric
-              | key->general_numeric)))
-      {
-       key->ignore = gkey.ignore;
-       key->translate = gkey.translate;
-       key->skipsblanks = gkey.skipsblanks;
-       key->skipeblanks = gkey.skipeblanks;
-       key->month = gkey.month;
-       key->numeric = gkey.numeric;
-       key->general_numeric = gkey.general_numeric;
-       key->reverse = gkey.reverse;
-      }
+  for (key = keylist; key; key = key->next) 
+    {
+      need_rand |= key->random_hash;
+      if (! (key->ignore || key->translate
+             || (key->skipsblanks | key->reverse
+                 | key->skipeblanks | key->month | key->numeric
+                 | key->general_numeric
+                 | key->random_hash)))
+        {
+          key->ignore = gkey.ignore;
+          key->translate = gkey.translate;
+          key->skipsblanks = gkey.skipsblanks;
+          key->skipeblanks = gkey.skipeblanks;
+          key->month = gkey.month;
+          key->numeric = gkey.numeric;
+          key->general_numeric = gkey.general_numeric;
+          key->random_hash = gkey.random_hash;
+          key->reverse = gkey.reverse;
+        }
+    }
+
+  if (need_rand) {
+    init_salt();
+    if(randseed)
+      string_seed(&digops, salt_ctx, randseed);
+    else
+      default_seed(&digops, salt_ctx);
+  }
 
   if (!keylist && (gkey.ignore || gkey.translate
                   || (gkey.skipsblanks | gkey.skipeblanks | gkey.month
-                      | gkey.numeric | gkey.general_numeric)))
+                      | gkey.numeric | gkey.general_numeric
+                       | gkey.random_hash)))
     insertkey (&gkey);
   reverse = gkey.reverse;

diff -x '*~' -ur -N coreutils-5.2.1/src/checksum.h 
coreutils-5.2.1-sort-rand/src/checksum.h
--- coreutils-5.2.1/src/checksum.h      2003-04-11 02:07:21.000000000 -0700
+++ coreutils-5.2.1-sort-rand/src/checksum.h    2005-06-05 13:57:22.000000000 
-0700
@@ -1,8 +1,14 @@
+#ifndef _CHECKSUM_H
+#define _CHECKSUM_H 1
+
 #include <config.h>
 
 #include <sys/types.h>
 #include <stdio.h>
 
+#include "sha1.h"
+#include "md5.h"
+
 /* For long options that have no equivalent short option, use a
    non-character as a pseudo short option, starting with CHAR_MAX + 1.  */
 enum
@@ -13,3 +19,39 @@
 };
 
 extern int algorithm;
+
+struct digest_ops {
+  int ctx_size;
+  int bits;
+  void (*init_ctx) (void *ctx);
+  void (*process_bytes) (const void *buffer, size_t len, void *ctx);
+  void *(*finish_ctx) (void *ctx, void *resbuf);
+  void *(*read_ctx) (void *ctx, void *resbuf);
+};
+
+static inline
+void get_digest_ops(int alg, struct digest_ops *res) {
+  switch(alg) {
+  case ALG_MD5:
+    res->ctx_size = sizeof(struct md5_ctx);
+    res->bits = 128;
+    res->init_ctx = (void (*) (void *))md5_init_ctx;
+    res->process_bytes = (void (*) (const void *, size_t, void 
*))md5_process_bytes;
+    res->finish_ctx = (void *(*) (void *, void *))md5_finish_ctx;
+    res->read_ctx = (void *(*) (void *, void *))md5_read_ctx;
+    break;
+  case ALG_SHA1:
+    res->ctx_size = sizeof(struct sha_ctx);
+    res->bits = 160;
+    res->init_ctx = (void (*) (void *))sha_init_ctx;
+    res->process_bytes = (void (*) (const void *, size_t, void 
*))sha_process_bytes;
+    res->finish_ctx = (void *(*) (void *, void *))sha_finish_ctx;
+    res->read_ctx = (void *(*) (void *, void *))sha_read_ctx;
+    break;
+  default:
+    /* abort? */
+    break;
+  }
+}
+
+#endif
diff -x '*~' -ur -N coreutils-5.2.1/src/Makefile.in 
coreutils-5.2.1-sort-rand/src/Makefile.in
--- coreutils-5.2.1/src/Makefile.in     2004-03-11 00:59:23.000000000 -0800
+++ coreutils-5.2.1-sort-rand/src/Makefile.in   2005-06-06 00:14:02.000000000 
-0700
@@ -481,7 +481,8 @@
        $(am__DEPENDENCIES_1)
 sleep_DEPENDENCIES = $(am__DEPENDENCIES_3)
 sort_SOURCES = sort.c
-sort_OBJECTS = sort.$(OBJEXT)
+am_sort_OBJECTS = sort.$(OBJEXT) md5.$(OBJEXT) randseed.$(OBJEXT)
+sort_OBJECTS = $(am_sort_OBJECTS)
 sort_DEPENDENCIES = $(am__DEPENDENCIES_2) $(am__DEPENDENCIES_1)
 split_SOURCES = split.c
 split_OBJECTS = split.$(OBJEXT)
diff -x '*~' -ur -N coreutils-5.2.1/src/randseed.c 
coreutils-5.2.1-sort-rand/src/randseed.c
--- coreutils-5.2.1/src/randseed.c      1969-12-31 16:00:00.000000000 -0800
+++ coreutils-5.2.1-sort-rand/src/randseed.c    2005-06-06 01:12:07.000000000 
-0700
@@ -0,0 +1,390 @@
+/* randseed.c
+
+Code for portably reading a random seed from entropy devices. Mostly
+adapted from libgcrypt.
+
+June 2005 Frederik Eaton
+*/
+
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/times.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <time.h>
+#include <errno.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "checksum.h"
+
+// XXX put these in autoconf
+#define NAME_OF_DEV_RANDOM "/dev/random"
+#define NAME_OF_DEV_URANDOM "/dev/urandom"
+#define USE_RNDLINUX 1
+#define USE_RNDEGD 1
+
+#define SORT_FAILURE 2
+#define FAIL_CODE SORT_FAILURE
+
+static int gather_random(struct digest_ops *ops, void *ctx, int level);
+
+#if USE_RNDLINUX
+static int linux_gather_random (struct digest_ops *ops, void *ctx, int level);
+#endif
+
+#if USE_RNDEGD
+static int egd_gather_random (struct digest_ops *ops, void *ctx, int level);
+static int egd_connect_socket (int nofail);
+#endif
+
+int string_seed(struct digest_ops *ops, void *ctx, char *str)
+{
+  int len = strlen(str);
+  ops->process_bytes(str, len, ctx);
+  return (len*2) >= ops->bits;
+}
+
+int default_seed(struct digest_ops *ops, void *ctx)
+{
+  gather_random(ops,ctx,1);
+}
+
+static int
+gather_random(struct digest_ops *ops, void *ctx, int level)
+{
+#if USE_RNDLINUX
+  if ( !access (NAME_OF_DEV_RANDOM, R_OK)
+       && !access (NAME_OF_DEV_URANDOM, R_OK))
+    {
+      return linux_gather_random(ops, ctx, level);
+    }
+#endif
+
+#if USE_RNDEGD
+  if ( egd_connect_socket (1) != -1 )
+    {
+      return egd_gather_random(ops, ctx, level);
+    }
+#endif
+
+  {
+    unsigned long l;
+    l = time(NULL);
+    ops->process_bytes(&l, sizeof(l), ctx);
+    l = getpid();;
+    ops->process_bytes(&l, sizeof(l), ctx);
+    l = getuid();
+    ops->process_bytes(&l, sizeof(l), ctx);
+  }
+
+  return 0;
+}
+
+//----------------------------------------------------------------
+
+int open_dev(char *name)
+{
+  int fd;
+  fd = open( name, O_RDONLY );
+  if( fd == -1 )
+    error (FAIL_CODE, errno, "can't open %s\n", name);
+  return fd;
+}
+
+#if USE_RNDLINUX
+static int
+linux_gather_random (struct digest_ops *ops, void *ctx, int level)
+{
+  static int fd_urandom = -1;
+  static int fd_random = -1;
+  int fd;
+  int n;
+  int length = ops->bits/8;
+  int warn=0;
+  char buffer[768];
+
+  if( level >= 2 )
+    {
+      if( fd_random == -1 )
+        fd_random = open_dev( NAME_OF_DEV_RANDOM );
+      fd = fd_random;
+    }
+  else
+    {
+      if( fd_urandom == -1 )
+        fd_urandom = open_dev( NAME_OF_DEV_URANDOM );
+      fd = fd_urandom;
+    }
+
+  while (length)
+    {
+      fd_set rfds;
+      struct timeval tv;
+      int rc;
+      
+      FD_ZERO(&rfds);
+      FD_SET(fd, &rfds);
+      tv.tv_sec = 0;
+      tv.tv_usec = 20*1000;
+      if( !(rc=select(fd+1, &rfds, NULL, NULL, &tv)) )
+        {
+          if( !warn )
+            {
+              error (0,0,"need_entropy");
+             warn = 1;
+           }
+         continue;
+       }
+       else if( rc == -1 )
+          {
+           error (0, errno, "select()\n");
+           continue;
+          }
+
+       do 
+          {
+           int nbytes = length < sizeof(buffer)? length : sizeof(buffer);
+           n = read(fd, buffer, nbytes );
+           if( n >= 0 && n > nbytes ) 
+              {
+               error(0,0,"bogus read from random device (n=%d)\n", n );
+               n = nbytes;
+              }
+          } 
+        while( n == -1 && errno == EINTR );
+       if( n == -1 )
+          error(FAIL_CODE, errno, "read error on random device: %s\n");
+        ops->process_bytes (buffer, n, ctx);
+       length -= n;
+    }
+  memset(buffer, 0, sizeof(buffer));
+
+  return 0; /* success */
+}
+#endif
+
+//----------------------------------------------------------------
+
+#if USE_RNDEGD
+
+#ifndef offsetof
+#define offsetof(type, member) ((size_t) &((type *)0)->member)
+#endif
+
+static int egd_socket = -1;
+
+/* Allocate a new filename from FIRST_PART and SECOND_PART and to
+   tilde expansion for first_part.  SECOND_PART might be NULL.
+ */
+static char *
+my_make_filename (const char *first_part, const char *second_part)
+{
+  size_t n;
+  char *name, *home, *p;
+
+  n = strlen(first_part)+1;
+  if (second_part)
+    n += strlen (second_part) + 1;
+
+  home = NULL;
+  if( *first_part == '~' && first_part[1] == '/'
+      && (home = (char*)getenv("HOME")) && *home )
+    n += strlen(home);
+
+  name = (char*)xmalloc(n);
+  p = (char*)(home
+       ? stpcpy (stpcpy (name, home), first_part+1 )
+       : stpcpy (name, first_part) );
+
+  if (second_part)
+    strcpy ((char*)stpcpy(p,"/"), second_part);
+
+  return name;
+}
+
+
+static int
+do_write( int fd, void *buf, size_t nbytes )
+{
+  size_t nleft = nbytes;
+  int nwritten;
+  
+  while( nleft > 0 ) 
+    {
+      nwritten = write( fd, buf, nleft);
+      if( nwritten < 0 )
+        {
+          if( errno == EINTR )
+            continue;
+          return -1;
+       }
+      nleft -= nwritten;
+      buf = (char*)buf + nwritten;
+    }
+  return 0;
+}
+
+static int
+do_read( int fd, void *buf, size_t nbytes )
+{
+  int n, nread = 0;
+
+  do
+    {
+      do
+        {
+          n = read(fd, (char*)buf + nread, nbytes );
+        } 
+      while( n == -1 && errno == EINTR );
+      if( n == -1)
+        return nread? nread:-1;
+      if( n == 0)
+        return -1;
+      nread += n;
+      nbytes -= n;
+    } 
+  while( nread < nbytes );
+  return nread;
+}
+
+/* Connect to the EGD and return the file descriptor.  Return -1 on
+   error.  With NOFAIL set to true, silently fail and return the
+   error, otherwise print an error message and die. */
+static int
+egd_connect_socket (int nofail)
+{
+  int fd;
+  const char *bname = NULL;
+  char *name;
+  struct sockaddr_un addr;
+  int addr_len;
+
+  if (egd_socket != -1)
+    {
+      close (egd_socket);
+      egd_socket = -1;
+    }
+
+#ifdef EGD_SOCKET_NAME
+  bname = EGD_SOCKET_NAME;
+#endif
+  if ( !bname || !*bname )
+    name = my_make_filename ("~/.gnupg", "entropy");
+  else
+    name = my_make_filename (bname, NULL);
+
+  if (strlen(name)+1 >= sizeof addr.sun_path)
+    error (FAIL_CODE, 0, "EGD socketname is too long\n");
+  
+  memset( &addr, 0, sizeof addr );
+  addr.sun_family = AF_UNIX;
+  strcpy( addr.sun_path, name );         
+  addr_len = (offsetof( struct sockaddr_un, sun_path )
+              + strlen( addr.sun_path ));
+  
+  fd = socket(AF_UNIX, SOCK_STREAM, 0);
+  if (fd == -1 && !nofail)
+    error (FAIL_CODE, errno, "can't create unix domain socket\n");
+  else if (connect (fd, (struct sockaddr*)&addr, addr_len) == -1)
+    {
+      if (!nofail)
+        error (FAIL_CODE, errno, "can't connect to EGD socket `%s'\n");
+      close (fd);
+      fd = -1;
+    }
+  free(name);
+  if (fd != -1)
+    egd_socket = fd;
+  return fd;
+}
+
+/****************
+ * Note: We always use the highest level.
+ * To boost the performance we may want to add some
+ * additional code for level 1
+ *
+ * Using a level of 0 should never block and better add nothing
+ * to the pool.  So this is just a dummy for EGD.
+ */
+static int
+egd_gather_random (struct digest_ops *ops, void *ctx, int level)
+{
+  int fd = egd_socket;
+  int n;
+  char buffer[256+2];
+  int nbytes;
+  int do_restart = 0;
+  int length = ops->bits/8;
+
+  if( !length )
+    return 0;
+  if( !level )
+    return 0;
+
+ restart:
+  if (fd == -1 || do_restart)
+    fd = egd_connect_socket (0);
+
+  do_restart = 0;
+
+  nbytes = length < 255? length : 255;
+  /* First time we do it with a non blocking request */
+  buffer[0] = 1; /* non blocking */
+  buffer[1] = nbytes;
+  if( do_write( fd, buffer, 2 ) == -1 )
+    error(FAIL_CODE, errno, "can't write to the EGD: %s\n");
+  n = do_read( fd, buffer, 1 );
+  if( n == -1 )
+    {
+      error (FAIL_CODE, errno, "read error on EGD\n");
+      do_restart = 1;
+      goto restart;
+    }
+  n = buffer[0];
+  if( n )
+    {
+      n = do_read( fd, buffer, n );
+      if( n == -1 )
+        {
+          error (FAIL_CODE, errno, "read error on EGD\n");
+          do_restart = 1;
+          goto restart;
+       }
+      ops->process_bytes (buffer, n, ctx);
+      length -= n;
+    }
+
+  if( length )
+    {
+      fprintf (stderr, 
+               "Please wait, entropy is being gathered. Do some work if it 
would\n"
+               "keep you from getting bored, because it will improve the 
quality\n"
+               "of the entropy.\n");
+    }
+  while( length )
+    {
+      nbytes = length < 255? length : 255;
+
+      buffer[0] = 2; /* blocking */
+      buffer[1] = nbytes;
+      if( do_write( fd, buffer, 2 ) == -1 )
+        error (FAIL_CODE, errno, "can't write to the EGD\n");
+      n = do_read( fd, buffer, nbytes );
+      if( n == -1 )
+        {
+          error (FAIL_CODE, errno, "read error on EGD\n");
+          do_restart = 1;
+          goto restart;
+       }
+      ops->process_bytes (buffer, n, ctx);
+      length -= n;
+    }
+  memset(buffer, 0, sizeof(buffer) );
+
+  return 0; /* success */
+}
+
+#endif
diff -x '*~' -ur -N coreutils-5.2.1/src/randseed.h 
coreutils-5.2.1-sort-rand/src/randseed.h
--- coreutils-5.2.1/src/randseed.h      1969-12-31 16:00:00.000000000 -0800
+++ coreutils-5.2.1-sort-rand/src/randseed.h    2005-06-05 13:05:41.000000000 
-0700
@@ -0,0 +1,4 @@
+#include "checksum.h"
+
+int string_seed(struct digest_ops *ops, void *ctx, char *str);
+int default_seed(struct digest_ops *ops, void *ctx);
_______________________________________________
Bug-coreutils mailing list
Bug-coreutils@gnu.org
http://lists.gnu.org/mailman/listinfo/bug-coreutils

Reply via email to