On Fri, 2002-04-12 at 01:47, Martin Pool wrote:
> On 11 Apr 2002, Robert Tiberius Johnson <[EMAIL PROTECTED]> wrote:
> > Also, in section 3.15, you say, "Fetching a series of deltas which
> > update the same area is less efficient than calculating the deltas
> > directly."  This is true, but by my computations, the number of packages
> > which change more than once in a month is 300, which require about 30K
> > of wasted bandwidth. 
> 
> Sorry, I don't understand what the 30kB number relates to.  Is that
> only for the updates to the Packages file?

Yep.  Sorry that wasn't clear before.

I implemented a local version of Goswin Brederlow's proposal to use the
reverse rsync algorithm over HTTP Range requests.  The code is included
below: please look it over to make sure I got the protocol right.

Here's what I found by running the program on some Packages[.gz] files:

Using gzip -9 --rsyncable Packages files from unstable main:
From    To      blocksize       diskspace       bandwidth
---------------------------------------------------------
Apr 6   Apr 7   128             118K            882K
Apr 6   Apr 7   256             59K             841K
Apr 6   Apr 7   512             29K             839K
Apr 6   Apr 7   1024            15K             875K
Apr 6   Apr 11  128             118K            1900K
Apr 6   Apr 11  256             59K             1853K
Apr 6   Apr 11  512             29K             1839K
Apr 6   Apr 11  1024            15K             1845K

Using uncompressed Packages files from unstable main:
From    To      blocksize       diskspace       bandwidth
---------------------------------------------------------
Apr 6   Apr 7   256             212K            323K
Apr 6   Apr 7   512             106K            270K
Apr 6   Apr 7   1024            53K             292K
Apr 6   Apr 11  128             429K            898K
Apr 6   Apr 11  256             215K            860K
Apr 6   Apr 11  512             107K            985K

For comparison, 5 days of bzip2 -9 compressed diffs take about 60K of
disk space, and updating after one day uses about 13K of bandwidth. 
Updating after 5 days uses 65K of bandwidth.

Best,
Rob

/* Mock-up version of the "checksum file" file synching protocol.
 * 
 * compile with "gcc gen_csums.c -o gen_csums -lcrypto"
 * (MD5 lives in libcrypto; the library must follow the source file)
 * 
 * Usage: gen_csums <bsize> <csize> <oldfile> <newfile>
 *
 * using a checksum size (csize) of < 8 can cause collisions,
 * which will ruin file reconstruction.  This implementation only supports
 * csize of <= 8 to reduce memory usage.  So always use csize=8.
 *
 * <oldfile> is the file the client has.  <newfile> is the file
 * the server has.  The program produces the file of checksums (newfile.csums),
 * a file containing the extra data the client had to fetch from the server
 * (oldfile.data), and the result of patching oldfile using the
 * checksums and fetched data (oldfile.merged).
 *
 * It prints out the blocksize, the amount of data that had to
 * be transferred from the server, and whether reconstruction was 
 * successful.
 */


#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <assert.h>
#include <openssl/md5.h>

/* Generate the checksums file for the given file.
 *
 * Checksum-file layout (all integers in host byte order):
 *   [int file length][int bsize][int csize][whole-file MD5]
 *   followed by one csize-byte truncated MD5 per block.
 */
void gen_csums (int bsize, int csize, char* ifilename, char* cfilename)
{
        int ifile, cfile;
        struct stat s;
        int flen;

        char* buf;
        unsigned char csumbuf[MD5_DIGEST_LENGTH];

        int i;

        ifile = open (ifilename, O_RDONLY);
        cfile = open (cfilename, O_WRONLY | O_CREAT | O_TRUNC, S_IREAD |
S_IWRITE);
        assert (ifile >= 0 && cfile >= 0);
        fstat (ifile, &s);
        buf = (char*) malloc (s.st_size);
        assert (buf != NULL);
        read (ifile, buf, s.st_size);
        close (ifile);

        /* Write the length as an int: merge_csums reads a 4-byte int,
         * but st_size is an off_t, which may be 8 bytes.  Writing the
         * raw off_t would shift every later header field. */
        flen = (int) s.st_size;
        write (cfile, &flen, sizeof (flen));
        write (cfile, &bsize, sizeof (bsize));
        write (cfile, &csize, sizeof (csize));
        MD5 ((unsigned char*) buf, s.st_size, csumbuf);
        write (cfile, csumbuf, sizeof (csumbuf));

        /* One truncated checksum per block.  The final block may be
         * shorter than bsize; hash only the bytes that exist rather
         * than reading past the end of the allocation. */
        for (i = 0; i < s.st_size; i += bsize)
        {
                        int len = (s.st_size - i < bsize)
                                ? (int)(s.st_size - i) : bsize;
                        MD5 ((unsigned char*) buf + i, len, csumbuf);
                        write (cfile, csumbuf, csize);
        }

        free (buf);
        close (cfile);
}

/* Merging code */

/* One candidate block of the client's local file: its byte offset and
 * the (truncated) MD5 of the bsize bytes starting at that offset. */
struct block_info
{
        int offset;
        char csum[8];   /* at most 8 checksum bytes are kept (csize <= 8) */
};

/* Number of checksum bytes actually in use; set from the csums file
 * header in merge_csums before any comparisons happen. */
int csize = 0;

/* Comparator used for sorting the checksums and for binary search.
 * Takes const void* parameters, the signature qsort() and bsearch()
 * require (the original int(void*,void*) form is an incompatible
 * function pointer type). */
int block_info_cmp (const void* a, const void* b)
{
        const struct block_info* p1 = (const struct block_info*)a;
        const struct block_info* p2 = (const struct block_info*)b;
        return memcmp (p1->csum, p2->csum, csize);
}

/* Given:
 *              cfilename - the checksum file
 *              pfilename - the new file on the server
 *              ifilename - our old file
 *              ofilename - the merged file to be created
 *              dfilename - the file extra data is to be logged in
 *
 * use the checksums to fill in as much of the merged file as possible.
 * Then use the pfilename to fill in the rest.  At the end, the file
 * dfilename will contain a copy of all the extra data the client had
 * to pull from pfilename to fill in the holes.
 *
 * Returns 0 if the reconstructed file's MD5 matches the server's
 * whole-file checksum, non-zero otherwise.
 */
int merge_csums (char* cfilename, char* pfilename,
                                  char* ifilename, char* ofilename,
                                  char* dfilename)
{
        int bsize;

        int cfile, pfile, ifile, ofile, dfile;
        int ofilelen;
        int inblocks;
        struct stat s;

        char* buf;
        char* foundmap;
        char tc;
        struct block_info csum;
        struct block_info* csums;
        struct block_info* match;
        unsigned char totalcsum[MD5_DIGEST_LENGTH];
        unsigned char csumbuf[MD5_DIGEST_LENGTH];

        int i, j;

        cfile = open (cfilename, O_RDONLY);
        pfile = open (pfilename, O_RDONLY);
        ifile = open (ifilename, O_RDONLY);
        ofile = open (ofilename, O_WRONLY | O_CREAT | O_TRUNC, S_IREAD | S_IWRITE);
        dfile = open (dfilename, O_WRONLY | O_CREAT | O_TRUNC, S_IREAD | S_IWRITE);
        fstat (ifile, &s);

        /* Header: new-file length, block size, checksum size, and the
         * MD5 of the entire new file. */
        read (cfile, &ofilelen, sizeof (ofilelen));
        read (cfile, &bsize, sizeof (bsize));
        read (cfile, &csize, sizeof (csize));
        read (cfile, totalcsum, sizeof (totalcsum));

        /* A candidate block starts at every byte offset of our old file
         * ("reverse rsync": the client does the rolling-window work).
         * Guard against an old file shorter than one block, which would
         * make the count negative. */
        inblocks = (s.st_size >= bsize) ? (int)(s.st_size - bsize + 1) : 0;

        foundmap = (char*) malloc (ofilelen);
        memset (foundmap, 0, ofilelen);

        /* Hash all our local blocks.  Then sort them by hash */
        csums = (struct block_info*) malloc (inblocks * sizeof (struct block_info));
        buf = (char *) malloc (s.st_size);
        read (ifile, buf, s.st_size);
        for (i = 0; i < inblocks; i++)
        {
                csums[i].offset = i;
                MD5 ((unsigned char*) buf + i, bsize, csumbuf);
                memcpy (csums[i].csum, csumbuf, csize);
        }
        qsort (csums, inblocks, sizeof (struct block_info), block_info_cmp);

        /* For each csum in the cfile, look for it in our list of checksums */
        j = 0;
        while ((i = read (cfile, csum.csum, csize)) == csize)
        {
                match = (struct block_info*) bsearch (&csum, csums, inblocks,
                                                      sizeof (struct block_info),
                                                      block_info_cmp);
                if (match != NULL)
                {
                        /* If we find a match, copy the corresponding
                         * position from our old file to the proper place
                         * in our new file.  Also, mark this range of
                         * bytes as "found".  Clamp to the end of the new
                         * file: the final block may be shorter than
                         * bsize, and an unclamped memset would write
                         * past the end of foundmap (heap overflow). */
                        int len = bsize;
                        if (j * bsize + len > ofilelen)
                                len = ofilelen - j * bsize;
                        if (len > 0)
                        {
                                lseek (ofile, j * bsize, SEEK_SET);
                                write (ofile, buf + match->offset, len);
                                memset (foundmap + j * bsize, 1, len);
                        }
                }

                j++;
        }

        /* Now find any holes in our output, and use the peekfile (pfile)
         * to fill them in. These are the bytes we'd have to fetch from the
         * server using http range requests.  Store them in the data file for
         * later reference, too. */
        for (i = 0; i < ofilelen; i++)
        {
                if (foundmap[i] == 0)
                {
                        lseek (pfile, i, SEEK_SET);
                        lseek (ofile, i, SEEK_SET);
                        read (pfile, &tc, sizeof (tc));
                        write (ofile, &tc, sizeof (tc));
                        write (dfile, &tc, sizeof (tc));
                }
        }

        free (foundmap);
        free (csums);
        free (buf);

        close (cfile);
        close (pfile);
        close (ifile);
        close (ofile);
        close (dfile);

        /* Now checksum our newly reconstructed file and compare it
         * to the server's checksum */
        ofile = open (ofilename, O_RDONLY);
        buf = (char *) malloc (ofilelen);
        read (ofile, buf, ofilelen);
        close (ofile);
        MD5 ((unsigned char*) buf, ofilelen, csumbuf);
        free (buf);

        return memcmp (csumbuf, totalcsum, MD5_DIGEST_LENGTH);
}

/* Entry point.  Usage: gen_csums <bsize> <csize> <oldfile> <newfile>
 *
 * Generates the checksum file for <newfile>, reconstructs it from
 * <oldfile> plus the checksums, and prints the block size, the csums
 * file size, the estimated total bandwidth (csums + fetched data),
 * and whether the reconstruction succeeded. */
int main (int argc, char** argv)
{
        int bsize;
        int csize;
        char* oldfilename;
        char* newfilename;
        char csumsfilename[1024];
        char datafilename[1024];
        char mergedfilename[1024];

        struct stat csums;
        struct stat data;

        int failed = 0;

        /* Validate the command line before touching argv[1..4]; the
         * original dereferenced them unconditionally. */
        if (argc != 5)
        {
                fprintf (stderr,
                         "Usage: %s <bsize> <csize> <oldfile> <newfile>\n",
                         argv[0]);
                return 1;
        }

        bsize = atoi (argv[1]);
        csize = atoi (argv[2]);
        oldfilename = argv[3];
        newfilename = argv[4];

        /* struct block_info stores at most 8 checksum bytes; a csize
         * below 1 would be meaningless. */
        assert (csize >= 1 && csize <= 8);
        assert (bsize >= 1);

        /* snprintf so a long pathname cannot overflow the buffers. */
        snprintf (csumsfilename, sizeof (csumsfilename), "%s.csums", newfilename);
        snprintf (datafilename, sizeof (datafilename), "%s.data", oldfilename);
        snprintf (mergedfilename, sizeof (mergedfilename), "%s.merged", oldfilename);

        gen_csums (bsize, csize, newfilename, csumsfilename);
        failed = merge_csums (csumsfilename, newfilename,
                                                   oldfilename, mergedfilename,
                                                   datafilename);

        stat (csumsfilename, &csums);
        stat (datafilename, &data);

        /* Estimate the total bandwidth as:
         * csums file size + data file size.  st_size is an off_t, so
         * cast for the printf format (the original used %d). */
        printf ("%d\t%ld\t%ld\t %s\n", bsize,
                (long) csums.st_size,
                (long) (csums.st_size + data.st_size),
                failed ? "failed" : "succeeded");

        return failed ? 1 : 0;
}


-- 
To UNSUBSCRIBE, email to [EMAIL PROTECTED]
with a subject of "unsubscribe". Trouble? Contact [EMAIL PROTECTED]


Reply via email to