Back in the spring, we started using rsync for a disk-to-disk backup system maintaining close to 10PB of data. I am not here to debate which tool is the right one, but only to discuss a problem we found with rsync when doing so.

We traced the various processes, hoping to find the culprit that was slowing things down so much, and determined pretty easily that it was the checksum components in rsync. Once we found that and tested against the --checksum option, it was glaringly obvious that this was what was slowing us down.

We next tested the MD5 and MD4 checksums against each other and found little difference in speed. So we went in search of a better checksum algorithm and found xxhash; we used the version packaged in the CentOS release.

Thanks to the way the source is written, it was a fairly easy patch to get this into the src RPM. We have been using this in production for a while now and see about a 3x speedup over the MD5/MD4 checksum algorithms, which brings it pretty close to the --checksum speed.

Attached is the patch we applied. Since xxhash is packaged in the distro, this RPM would need to declare a dependency on it. If nothing else, perhaps the developers could take a look, as this could benefit many users.

Thanks,
Bill
diff -up rsync-3.1.3/checksum.c.xxhash rsync-3.1.3/checksum.c
--- rsync-3.1.3/checksum.c.xxhash	2018-01-14 22:55:07.000000000 -0500
+++ rsync-3.1.3/checksum.c	2019-09-24 15:59:19.410068707 -0400
@@ -19,6 +19,7 @@
  * with this program; if not, visit the http://fsf.org website.
  */
 
+#include "xxhash.h"
 #include "rsync.h"
 
 extern int checksum_seed;
@@ -32,6 +33,7 @@ extern char *checksum_choice;
 #define CSUM_MD4_OLD 3
 #define CSUM_MD4 4
 #define CSUM_MD5 5
+#define CSUM_XXHASH 6
 
 int xfersum_type = 0; /* used for the file transfer checksums */
 int checksum_type = 0; /* used for the pre-transfer (--checksum) checksums */
@@ -68,6 +70,8 @@ int parse_csum_name(const char *name, in
 		return CSUM_MD5;
 	if (len == 4 && strncasecmp(name, "none", 4) == 0)
 		return CSUM_NONE;
+	if (len == 6 && strncasecmp(name, "xxhash", 6) == 0)
+		return CSUM_XXHASH;
 
 	rprintf(FERROR, "unknown checksum name: %s\n", name);
 	exit_cleanup(RERR_UNSUPPORTED);
@@ -88,6 +92,8 @@ int csum_len_for_type(int cst, BOOL flis
 		return MD4_DIGEST_LEN;
 	  case CSUM_MD5:
 		return MD5_DIGEST_LEN;
+	  case CSUM_XXHASH:
+		return sizeof(XXH64_hash_t);
 	  default: /* paranoia to prevent missing case values */
 		exit_cleanup(RERR_UNSUPPORTED);
 	}
@@ -126,6 +132,10 @@ void get_checksum2(char *buf, int32 len,
 	md_context m;
 
 	switch (xfersum_type) {
+          case CSUM_XXHASH: 
+		SIVAL64(sum, 0, XXH64(buf, len, checksum_seed));
+                break;
+
 	  case CSUM_MD5: {
 		uchar seedbuf[4];
 		md5_begin(&m);
@@ -197,6 +207,7 @@ void file_checksum(const char *fname, co
 	md_context m;
 	int32 remainder;
 	int fd;
+	XXH64_state_t* state;
 
 	memset(sum, 0, MAX_DIGEST_LEN);
 
@@ -207,6 +218,31 @@ void file_checksum(const char *fname, co
 	buf = map_file(fd, len, MAX_MAP_SIZE, CSUM_CHUNK);
 
 	switch (checksum_type) {
+          case CSUM_XXHASH:
+		state = XXH64_createState();
+		if (state==NULL) out_of_memory("file_checksum xx64");
+
+		unsigned long long const seed = 0;
+		if (XXH64_reset(state, seed) == XXH_ERROR) {
+		    rprintf(FERROR, "error resetting XXH64 seed");
+		    exit_cleanup(RERR_STREAMIO);
+		}
+
+		for (i = 0; i + CSUM_CHUNK <= len; i += CSUM_CHUNK) {
+		   XXH_errorcode const updateResult = XXH64_update(state, (uchar *)map_ptr(buf, i, CSUM_CHUNK), CSUM_CHUNK);
+		   if (updateResult == XXH_ERROR) {
+			rprintf(FERROR, "error computing XX64 hash");
+			exit_cleanup(RERR_STREAMIO);
+		   }
+		}
+		remainder = (int32)(len - i);
+                if (remainder > 0)
+			XXH64_update(state, (uchar *)map_ptr(buf, i, CSUM_CHUNK), remainder);
+		SIVAL64(sum, 0, XXH64_digest(state));
+
+		XXH64_freeState(state);
+
+		break;
 	  case CSUM_MD5:
 		md5_begin(&m);
 
@@ -254,6 +290,7 @@ void file_checksum(const char *fname, co
 static int32 sumresidue;
 static md_context md;
 static int cursum_type;
+XXH64_state_t* xxh64_state = NULL;
 
 void sum_init(int csum_type, int seed)
 {
@@ -264,6 +301,16 @@ void sum_init(int csum_type, int seed)
 	cursum_type = csum_type;
 
 	switch (csum_type) {
+	  case CSUM_XXHASH:
+		if(xxh64_state == NULL) {
+		    xxh64_state = XXH64_createState();
+		    if (xxh64_state == NULL) out_of_memory("sum_init xxh64");
+		}
+		if (XXH64_reset(xxh64_state, 0) == XXH_ERROR) {
+		    rprintf(FERROR, "error resetting XXH64 state");
+		    exit_cleanup(RERR_STREAMIO);
+		}
+		break;
 	  case CSUM_MD5:
 		md5_begin(&md);
 		break;
@@ -297,6 +344,12 @@ void sum_init(int csum_type, int seed)
 void sum_update(const char *p, int32 len)
 {
 	switch (cursum_type) {
+	  case CSUM_XXHASH:
+		if (XXH64_update(xxh64_state, p, len) == XXH_ERROR) {
+		    rprintf(FERROR, "error computing XX64 hash");
+		    exit_cleanup(RERR_STREAMIO);
+		}
+		break;
 	  case CSUM_MD5:
 		md5_update(&md, (uchar *)p, len);
 		break;
@@ -342,6 +395,9 @@ void sum_update(const char *p, int32 len
 int sum_end(char *sum)
 {
 	switch (cursum_type) {
+	  case CSUM_XXHASH:
+		SIVAL64(sum, 0, XXH64_digest(xxh64_state));
+		break;
 	  case CSUM_MD5:
 		md5_result(&md, (uchar *)sum);
 		break;
-- 
Please use reply-all for most replies to avoid omitting the mailing list.
To unsubscribe or change options: https://lists.samba.org/mailman/listinfo/rsync
Before posting, read: http://www.catb.org/~esr/faqs/smart-questions.html

Reply via email to