On Fri, 2002-04-12 at 01:47, Martin Pool wrote: > On 11 Apr 2002, Robert Tiberius Johnson <[EMAIL PROTECTED]> wrote: > > Also, in section 3.15, you say, "Fetching a series of deltas which > > update the same area is less efficient than calculating the deltas > > directly." This is true, but by my computations, the number of packages > > which change more than once in a month is 300, which require about 30K > > of wasted bandwidth. > > Sorry, I don't understand what the 30kB number relates to. Is that > only for the updates to the Packages file?
Yep. Sorry that wasn't clear before. I implemented a local version of Goswin Brederlow's proposal to use the reverse rsync algorithm over HTTP Range requests. The code is included below: please look it over to make sure I got the protocol right. Here's what I found by running the program on some Packages[.gz] files: Using gzip -9 --rsyncable Packages files from unstable main: From To blocksize diskspace bandwidth --------------------------------------------------------- Apr 6 Apr 7 128 118K 882K Apr 6 Apr 7 256 59K 841K Apr 6 Apr 7 512 29K 839K Apr 6 Apr 7 1024 15K 875K Apr 6 Apr 11 128 118K 1900K Apr 6 Apr 11 256 59K 1853K Apr 6 Apr 11 512 29K 1839K Apr 6 Apr 11 1024 15K 1845K Using uncompressed Packages files from unstable main: From To blocksize diskspace bandwidth --------------------------------------------------------- Apr 6 Apr 7 256 212K 323K Apr 6 Apr 7 512 106K 270K Apr 6 Apr 7 1024 53K 292K Apr 6 Apr 11 128 429K 898K Apr 6 Apr 11 256 215K 860K Apr 6 Apr 11 512 107K 985K For comparison, 5 days of bzip2 -9 compressed diffs take about 60K of disk space, and updating after one day uses about 13K of bandwidth. Updating after 5 days uses 65K of bandwidth. Best, Rob /* Mock-up version of the "checksum file" file synching protocol. * * compile with "gcc -lssl gen_csums.c -o gen_csums" * * Usage: gen_csums <bsize> <csize> <oldfile> <newfile> * * using a checksum size (csize) of < 8 can cause collisions, * which will ruin file reconstruction. This implementation only supports * csize of <= 8 to reduce memory usage. So always use csize=8. * * <oldfile> is the file the client has. <newfile> is the file * the server has. The program produces the file of checksums (newfile.csums), * a file containing the extra data the client had to fetch from the server * (oldfile.data), and the result of patching oldfile using the * checksums and fetched data (oldfile.merged). * * It prints out the blocksize, the amount of data that had to * be transferred from the server, and whether reconstruction was * successful. */ #include <stdio.h> #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> #include <unistd.h> #include <assert.h> #include <openssl/md5.h> /* Generate the checksums file for the given file. */ void gen_csums (int bsize, int csize, char* ifilename, char* cfilename) { int ifile, cfile; struct stat s; char* buf; char csumbuf[MD5_DIGEST_LENGTH]; MD5_CTX c; int i; ifile = open (ifilename, O_RDONLY); cfile = open (cfilename, O_WRONLY | O_CREAT | O_TRUNC, S_IREAD | S_IWRITE); fstat (ifile, &s); buf = (char*) malloc (s.st_size); read (ifile, buf, s.st_size); close (ifile); write (cfile, &s.st_size, sizeof (s.st_size)); write (cfile, &bsize, sizeof (bsize)); write (cfile, &csize, sizeof (csize)); MD5 (buf, s.st_size, csumbuf); write (cfile, csumbuf, sizeof (csumbuf)); for (i = 0; i < s.st_size; i += bsize) { MD5 (buf + i, bsize, csumbuf); write (cfile, csumbuf, csize); } free (buf); close (cfile); } /* Merging code */ struct block_info { int offset; char csum[8]; }; int csize = 0; /* Used for sorting the checksums, and doing binary search * on checksums */ int block_info_cmp (void* a, void* b) { struct block_info* p1 = (struct block_info*)a; struct block_info* p2 = (struct block_info*)b; int r = memcmp (p1->csum, p2->csum, csize); return r; } /* Given: * cfilename - the checksum file * pfilename - the new file on the server * ifilename - our old file * ofilename - the merged file to be created * dfilename - the file extra data is to be logged in * * use the checksums to fill in as much of the merged file as possible. * Then use the pfilename to fill in the rest. At the end, the file * dfilename will contain a copy of all the extra data the client had * to pull from pfilename to fill in the holes. */ int merge_csums (char* cfilename, char* pfilename, char* ifilename, char* ofilename, char* dfilename) { int bsize; int cfile, pfile, ifile, ofile, dfile; int ofilelen; int inblocks; struct stat s; char* buf; char* foundmap; char tc; struct block_info csum; struct block_info* csums; struct block_info* match; MD5_CTX c; char totalcsum[MD5_DIGEST_LENGTH]; char csumbuf[MD5_DIGEST_LENGTH]; int i, j; cfile = open (cfilename, O_RDONLY); pfile = open (pfilename, O_RDONLY); ifile = open (ifilename, O_RDONLY); ofile = open (ofilename, O_WRONLY | O_CREAT | O_TRUNC, S_IREAD | S_IWRITE); dfile = open (dfilename, O_WRONLY | O_CREAT | O_TRUNC, S_IREAD | S_IWRITE); fstat (ifile, &s); read (cfile, &ofilelen, sizeof (ofilelen)); read (cfile, &bsize, sizeof (bsize)); read (cfile, &csize, sizeof (csize)); read (cfile, totalcsum, sizeof (totalcsum)); inblocks = s.st_size - bsize + 1; foundmap = (char*) malloc (ofilelen); memset (foundmap, 0, ofilelen); /* Hash all our local blocks. Then sort them by hash */ csums = (struct block_info*) malloc (inblocks * sizeof (struct block_info)); buf = (char *) malloc (s.st_size); read (ifile, buf, s.st_size); for (i = 0; i < inblocks; i++) { csums[i].offset = i; MD5 (buf + i, bsize, csumbuf); memcpy (csums[i].csum, csumbuf, csize); } qsort (csums, inblocks, sizeof (struct block_info), block_info_cmp); /* For each csum in the cfile, look for it in our list of checksums */ j = 0; while ((i = read (cfile, csum.csum, csize)) == csize) { match = (struct block_info*) bsearch (&csum, csums, inblocks, sizeof (struct block_info), block_info_cmp); if (match != NULL) { /* If we find a match, copy the corresponding * position from our old file to the proper place * in our new file. Also, mark this range of * bytes as "found". */ lseek (ofile, j * bsize, SEEK_SET); write (ofile, buf + match->offset, bsize); memset (foundmap + j * bsize, 1, bsize); } j++; } /* Now find any holes in our output, and use the peekfile (pfile) * to fill them in. These are the bytes we'd have to fetch from the * server using http range requests. Store them in the data file for * later reference, too. */ for (i = 0; i < ofilelen; i++) { if (foundmap[i] == 0) { lseek (pfile, i, SEEK_SET); lseek (ofile, i, SEEK_SET); read (pfile, &tc, sizeof (tc)); write (ofile, &tc, sizeof (tc)); write (dfile, &tc, sizeof (tc)); } } free (foundmap); free (csums); free (buf); close (cfile); close (pfile); close (ifile); close (ofile); close (dfile); /* Now checksum our newly reconstructed file and compare it * to the server's checksum */ ofile = open (ofilename, O_RDONLY); buf = (char *) malloc (ofilelen); read (ofile, buf, ofilelen); close (ofile); MD5 (buf, ofilelen, csumbuf); free (buf); return memcmp (csumbuf, totalcsum, MD5_DIGEST_LENGTH); } int main (int argc, char** argv) { int bsize = atoi (argv[1]); int csize = atoi (argv[2]); char* oldfilename = argv[3]; char* newfilename = argv[4]; char csumsfilename[1024]; char datafilename[1024]; char mergedfilename[1024]; struct stat csums; struct stat data; int failed = 0; assert (csize <= 8); sprintf (csumsfilename, "%s.csums", newfilename); sprintf (datafilename, "%s.data", oldfilename); sprintf (mergedfilename, "%s.merged", oldfilename); gen_csums (bsize, csize, newfilename, csumsfilename); failed = merge_csums (csumsfilename, newfilename, oldfilename, mergedfilename, datafilename); stat (csumsfilename, &csums); stat (datafilename, &data); /* Estimate the total bandwidth as: * csums file size + data file size */ printf ("%d\t%d\t%d\t %s\n", bsize, csums.st_size, csums.st_size + data.st_size, failed ? "failed" : "succeeded"); } -- To UNSUBSCRIBE, email to [EMAIL PROTECTED] with a subject of "unsubscribe". Trouble? Contact [EMAIL PROTECTED]