Author: jcorvel Date: Wed Nov 24 00:03:07 2010 New Revision: 1038390 URL: http://svn.apache.org/viewvc?rev=1038390&view=rev Log: On the diff-optimizations-tokens branch:
Make svn_diff skip identical prefix to make diff and blame faster. * subversion/include/svn_diff.h (svn_diff_fns_t): Add new function type token_pushback_prefix to the vtable. This function is needed to push back the last token that has been read during prefix scanning (the first non-matching line), so it can be read again (and checksummed) during the get_next_token phase. * subversion/libsvn_diff/diff_memory.c (datasource_get_next_token): Make the checksum calculation optional (only calculate checksum if HASH is not NULL). (token_pushback_prefix): New function. (svn_diff__mem_vtable): Add new function token_pushback_prefix. * subversion/libsvn_diff/diff_file.c (svn_diff__file_baton_t): Add member PREFIX_TOKENS, a linked list of pushed back prefix tokens. (datasource_get_next_token): Make the checksum calculation optional (only calculate checksum if HASH is not NULL). If applicable, use a pushed back prefix token to adjust the raw_length of a token that is being reread. (token_pushback_prefix): New function. (svn_diff__mem_vtable): Add new function token_pushback_prefix. * subversion/libsvn_diff/diff.h (svn_diff__lcs): Add argument "prefix_lines". (svn_diff__get_all_tokens): New function declaration. * subversion/libsvn_diff/token.c (find_identical_prefix): New function. (svn_diff__get_all_tokens): New function. The purpose of this function is to read the tokens from all datasources at once, so it can first scan for identical prefix and suffix. * subversion/libsvn_diff/lcs.c (svn_diff__lcs): Added argument "prefix_lines". Use this to correctly set the offset of the sentinel position for EOF, even if one of the files became empty after eliminating the identical prefix. * subversion/libsvn_diff/diff.c (svn_diff__diff): Add a chunk of "common" diff for identical prefix. (svn_diff_diff): Use new function svn_diff__get_all_tokens to get the tokens from original and modified at once (while scanning for identical prefix). Pass prefix_lines to svn_diff__lcs and svn_diff__diff. * subversion/libsvn_diff/diff3.c (svn_diff_diff3): For now, don't use svn_diff__get_all_tokens yet. Pass prefix_lines = 0 to svn_diff__lcs. * subversion/libsvn_diff/diff4.c (svn_diff_diff4): For now, don't use svn_diff__get_all_tokens yet. Pass prefix_lines = 0 to svn_diff__lcs. Modified: subversion/branches/diff-optimizations-tokens/subversion/include/svn_diff.h subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff.c subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff.h subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff3.c subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff4.c subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff_file.c subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff_memory.c subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/lcs.c subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/token.c Modified: subversion/branches/diff-optimizations-tokens/subversion/include/svn_diff.h URL: http://svn.apache.org/viewvc/subversion/branches/diff-optimizations-tokens/subversion/include/svn_diff.h?rev=1038390&r1=1038389&r2=1038390&view=diff ============================================================================== --- subversion/branches/diff-optimizations-tokens/subversion/include/svn_diff.h (original) +++ subversion/branches/diff-optimizations-tokens/subversion/include/svn_diff.h Wed Nov 24 00:03:07 2010 @@ -135,6 +135,10 @@ typedef struct svn_diff_fns_t void *rtoken, int *compare); + svn_error_t *(*token_pushback_prefix)(void *diff_baton, + void *token, + svn_diff_datasource_e datasource); + /** Free @a token from memory, the diff algorithm is done with it. */ void (*token_discard)(void *diff_baton, void *token); Modified: subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff.c URL: http://svn.apache.org/viewvc/subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff.c?rev=1038390&r1=1038389&r2=1038390&view=diff ============================================================================== --- subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff.c (original) +++ subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff.c Wed Nov 24 00:03:07 2010 @@ -43,6 +43,22 @@ svn_diff__diff(svn_diff__lcs_t *lcs, svn_diff_t *diff; svn_diff_t **diff_ref = &diff; + if (want_common && (original_start > 1)) + { + /* we have a prefix to skip */ + (*diff_ref) = apr_palloc(pool, sizeof(**diff_ref)); + + (*diff_ref)->type = svn_diff__type_common; + (*diff_ref)->original_start = 0; + (*diff_ref)->original_length = original_start - 1; + (*diff_ref)->modified_start = 0; + (*diff_ref)->modified_length = modified_start - 1; + (*diff_ref)->latest_start = 0; + (*diff_ref)->latest_length = 0; + + diff_ref = &(*diff_ref)->next; + } + while (1) { if (original_start < lcs->position[0]->offset @@ -105,11 +121,17 @@ svn_diff_diff(svn_diff_t **diff, { svn_diff__tree_t *tree; svn_diff__position_t *position_list[2]; + svn_diff__position_t **position_list_ref[2]; + svn_diff_datasource_e datasource[] = {svn_diff_datasource_original, + svn_diff_datasource_modified}; svn_diff__lcs_t *lcs; apr_pool_t *subpool; apr_pool_t *treepool; + apr_off_t prefix_lines = 0; *diff = NULL; + position_list_ref[0] = &position_list[0]; + position_list_ref[1] = &position_list[1]; subpool = svn_pool_create(pool); treepool = svn_pool_create(pool); @@ -117,17 +139,12 @@ svn_diff_diff(svn_diff_t **diff, svn_diff__tree_create(&tree, treepool); /* Insert the data into the tree */ - SVN_ERR(svn_diff__get_tokens(&position_list[0], - tree, - diff_baton, vtable, - svn_diff_datasource_original, - subpool)); - - SVN_ERR(svn_diff__get_tokens(&position_list[1], - tree, - diff_baton, vtable, - svn_diff_datasource_modified, - subpool)); + SVN_ERR(svn_diff__get_all_tokens(position_list_ref, + &prefix_lines, + tree, + diff_baton, vtable, + datasource, 2, + subpool)); /* The cool part is that we don't need the tokens anymore. * Allow the app to clean them up if it wants to. @@ -139,10 +156,10 @@ svn_diff_diff(svn_diff_t **diff, svn_pool_destroy(treepool); /* Get the lcs */ - lcs = svn_diff__lcs(position_list[0], position_list[1], subpool); + lcs = svn_diff__lcs(position_list[0], position_list[1], prefix_lines, subpool); /* Produce the diff */ - *diff = svn_diff__diff(lcs, 1, 1, TRUE, pool); + *diff = svn_diff__diff(lcs, prefix_lines + 1, prefix_lines + 1, TRUE, pool); /* Get rid of all the data we don't have a use for anymore */ svn_pool_destroy(subpool); Modified: subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff.h URL: http://svn.apache.org/viewvc/subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff.h?rev=1038390&r1=1038389&r2=1038390&view=diff ============================================================================== --- subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff.h (original) +++ subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff.h Wed Nov 24 00:03:07 2010 @@ -91,6 +91,7 @@ typedef enum svn_diff__normalize_state_t svn_diff__lcs_t * svn_diff__lcs(svn_diff__position_t *position_list1, /* pointer to tail (ring) */ svn_diff__position_t *position_list2, /* pointer to tail (ring) */ + apr_off_t prefix_lines, apr_pool_t *pool); @@ -113,6 +114,20 @@ svn_diff__get_tokens(svn_diff__position_ svn_diff_datasource_e datasource, apr_pool_t *pool); +/* + * Get all tokens from all datasources (skipping identical prefix and suffix). + * Return the last item in the (circular) list. + */ +svn_error_t * +svn_diff__get_all_tokens(svn_diff__position_t **position_list[], + apr_off_t *prefix_lines, + svn_diff__tree_t *tree, + void *diff_baton, + const svn_diff_fns_t *vtable, + svn_diff_datasource_e datasource[], + int datasource_len, + apr_pool_t *pool); + /* Morph a svn_lcs_t into a svn_diff_t. */ svn_diff_t * Modified: subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff3.c URL: http://svn.apache.org/viewvc/subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff3.c?rev=1038390&r1=1038389&r2=1038390&view=diff ============================================================================== --- subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff3.c (original) +++ subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff3.c Wed Nov 24 00:03:07 2010 @@ -173,7 +173,7 @@ svn_diff__resolve_conflict(svn_diff_t *h position[1]->next = start_position[1]; } - *lcs_ref = svn_diff__lcs(position[0], position[1], + *lcs_ref = svn_diff__lcs(position[0], position[1], 0, subpool); /* Fix up the EOF lcs element in case one of @@ -289,9 +289,9 @@ svn_diff_diff3(svn_diff_t **diff, svn_pool_destroy(treepool); /* Get the lcs for original-modified and original-latest */ - lcs_om = svn_diff__lcs(position_list[0], position_list[1], + lcs_om = svn_diff__lcs(position_list[0], position_list[1], 0, subpool); - lcs_ol = svn_diff__lcs(position_list[0], position_list[2], + lcs_ol = svn_diff__lcs(position_list[0], position_list[2], 0, subpool); /* Produce a merged diff */ Modified: subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff4.c URL: http://svn.apache.org/viewvc/subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff4.c?rev=1038390&r1=1038389&r2=1038390&view=diff ============================================================================== --- subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff4.c (original) +++ subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff4.c Wed Nov 24 00:03:07 2010 @@ -222,7 +222,7 @@ svn_diff_diff4(svn_diff_t **diff, svn_pool_clear(subpool3); /* Get the lcs for original - latest */ - lcs_ol = svn_diff__lcs(position_list[0], position_list[2], subpool3); + lcs_ol = svn_diff__lcs(position_list[0], position_list[2], 0, subpool3); diff_ol = svn_diff__diff(lcs_ol, 1, 1, TRUE, pool); svn_pool_clear(subpool3); @@ -243,7 +243,7 @@ svn_diff_diff4(svn_diff_t **diff, /* Get the lcs for common ancestor - original * Do reverse adjustements */ - lcs_adjust = svn_diff__lcs(position_list[3], position_list[2], subpool3); + lcs_adjust = svn_diff__lcs(position_list[3], position_list[2], 0, subpool3); diff_adjust = svn_diff__diff(lcs_adjust, 1, 1, FALSE, subpool3); adjust_diff(diff_ol, diff_adjust); @@ -252,7 +252,7 @@ svn_diff_diff4(svn_diff_t **diff, /* Get the lcs for modified - common ancestor * Do forward adjustments */ - lcs_adjust = svn_diff__lcs(position_list[1], position_list[3], subpool3); + lcs_adjust = svn_diff__lcs(position_list[1], position_list[3], 0, subpool3); diff_adjust = svn_diff__diff(lcs_adjust, 1, 1, FALSE, subpool3); adjust_diff(diff_ol, diff_adjust); Modified: subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff_file.c URL: http://svn.apache.org/viewvc/subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff_file.c?rev=1038390&r1=1038389&r2=1038390&view=diff ============================================================================== --- subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff_file.c (original) +++ subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff_file.c Wed Nov 24 00:03:07 2010 @@ -82,6 +82,9 @@ typedef struct svn_diff__file_baton_t char *endp; /* next memory address after the current chunk */ svn_diff__normalize_state_t normalize_state; + + /* Linked list of pushed back prefix tokens */ + svn_diff__file_token_t *prefix_tokens; } files[4]; /* List of free tokens that may be reused. */ @@ -330,7 +333,8 @@ datasource_get_next_token(apr_uint32_t * &file->normalize_state, curp, file_baton->options); file_token->length += length; - h = svn_diff__adler32(h, curp, length); + if (hash) + h = svn_diff__adler32(h, curp, length); curp = endp = file->buffer; file->chunk++; @@ -377,10 +381,48 @@ datasource_get_next_token(apr_uint32_t * file_token->length += length; - *hash = svn_diff__adler32(h, c, length); + if (hash) + *hash = svn_diff__adler32(h, c, length); *token = file_token; } + /* If there is a pushed back prefix_token available, that means we have + read (and normalized in-place) this token before. Adjust our new token + with the original raw_length of the prefix_token. */ + if (file->prefix_tokens) + { + svn_diff__file_token_t *prefix_token = file->prefix_tokens; + file->prefix_tokens = file->prefix_tokens->next; + + if (file_token->raw_length < prefix_token->raw_length) + { + int length_difference = prefix_token->raw_length + - file_token->raw_length; + if (length_difference < file->endp - file->curp) + { + file->curp += length_difference; + } + else + { + /* ### TODO: rework this so it can handle crossing multiple + chunks, instead of only one. */ + length_difference -= file->endp - file->curp; + file->curp = file->buffer; + file->chunk++; + length = file->chunk == last_chunk ? + offset_in_chunk(file->size) : CHUNK_SIZE; + file->endp = file->curp + length; + + SVN_ERR(read_chunk(file->file, file->path, + file->curp, length, + chunk_to_offset(file->chunk), + file_baton->pool)); + file->curp += length_difference; + } + file_token->raw_length = prefix_token->raw_length; + } + } + return SVN_NO_ERROR; } @@ -506,6 +548,37 @@ token_compare(void *baton, void *token1, return SVN_NO_ERROR; } +static svn_error_t * +token_pushback_prefix(void *baton, + void *token, + svn_diff_datasource_e datasource) +{ + svn_diff__file_baton_t *file_baton = baton; + svn_diff__file_token_t *file_token = token; + struct file_info *file = + &file_baton->files[datasource_to_index(datasource)]; + + file_token->next = file->prefix_tokens; + file->prefix_tokens = file_token; + + if (offset_to_chunk(file_token->norm_offset) == file->chunk) + { + file->curp = file->buffer + offset_in_chunk(file_token->norm_offset); + } + else + { + /* ### TODO: rework this so it can handle crossing multiple + chunks, instead of only one. */ + file->chunk--; + SVN_ERR(read_chunk(file->file, file->path, file->buffer, + CHUNK_SIZE, chunk_to_offset(file->chunk), + file_baton->pool)); + file->endp = file->buffer + CHUNK_SIZE; + file->curp = file->buffer + offset_in_chunk(file_token->norm_offset); + } + + return SVN_NO_ERROR; +} /* Implements svn_diff_fns_t::token_discard */ static void @@ -536,6 +609,7 @@ static const svn_diff_fns_t svn_diff__fi datasource_close, datasource_get_next_token, token_compare, + token_pushback_prefix, token_discard, token_discard_all }; Modified: subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff_memory.c URL: http://svn.apache.org/viewvc/subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff_memory.c?rev=1038390&r1=1038389&r2=1038390&view=diff ============================================================================== --- subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff_memory.c (original) +++ subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/diff_memory.c Wed Nov 24 00:03:07 2010 @@ -128,7 +128,8 @@ datasource_get_next_token(apr_uint32_t * svn_diff__normalize_buffer(&buf, &len, &state, tok->data, mem_baton->normalization_options); - *hash = svn_diff__adler32(0, buf, len); + if (hash) + *hash = svn_diff__adler32(0, buf, len); src->next_token++; } else @@ -166,6 +167,19 @@ token_compare(void *baton, void *token1, return SVN_NO_ERROR; } +static svn_error_t * +token_pushback_prefix(void *baton, + void *token, + svn_diff_datasource_e datasource) +{ + diff_mem_baton_t *mem_baton = baton; + source_tokens_t *src = &(mem_baton->sources[datasource_to_index(datasource)]); + + src->next_token--; + + return SVN_NO_ERROR; +} + /* Implements svn_diff_fns_t::token_discard */ static void token_discard(void *baton, void *token) @@ -192,6 +206,7 @@ static const svn_diff_fns_t svn_diff__me datasource_close, datasource_get_next_token, token_compare, + token_pushback_prefix, token_discard, token_discard_all }; Modified: subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/lcs.c URL: http://svn.apache.org/viewvc/subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/lcs.c?rev=1038390&r1=1038389&r2=1038390&view=diff ============================================================================== --- subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/lcs.c (original) +++ subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/lcs.c Wed Nov 24 00:03:07 2010 @@ -163,6 +163,7 @@ svn_diff__lcs_reverse(svn_diff__lcs_t *l svn_diff__lcs_t * svn_diff__lcs(svn_diff__position_t *position_list1, /* pointer to tail (ring) */ svn_diff__position_t *position_list2, /* pointer to tail (ring) */ + apr_off_t prefix_lines, apr_pool_t *pool) { int idx; @@ -180,9 +181,11 @@ svn_diff__lcs(svn_diff__position_t *posi */ lcs = apr_palloc(pool, sizeof(*lcs)); lcs->position[0] = apr_pcalloc(pool, sizeof(*lcs->position[0])); - lcs->position[0]->offset = position_list1 ? position_list1->offset + 1 : 1; + lcs->position[0]->offset = position_list1 ? + position_list1->offset + 1 : prefix_lines + 1; lcs->position[1] = apr_pcalloc(pool, sizeof(*lcs->position[1])); - lcs->position[1]->offset = position_list2 ? position_list2->offset + 1 : 1; + lcs->position[1]->offset = position_list2 ? + position_list2->offset + 1 : prefix_lines + 1; lcs->length = 0; lcs->refcount = 1; lcs->next = NULL; Modified: subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/token.c URL: http://svn.apache.org/viewvc/subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/token.c?rev=1038390&r1=1038389&r2=1038390&view=diff ============================================================================== --- subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/token.c (original) +++ subversion/branches/diff-optimizations-tokens/subversion/libsvn_diff/token.c Wed Nov 24 00:03:07 2010 @@ -187,3 +187,148 @@ svn_diff__get_tokens(svn_diff__position_ return SVN_NO_ERROR; } + +/* Find identical prefix between all datasources + */ +static svn_error_t * +find_identical_prefix(svn_boolean_t *reached_one_eof, + apr_off_t *prefix_lines, + void *diff_baton, + const svn_diff_fns_t *vtable, + svn_diff_datasource_e datasource[], + int datasource_len) +{ + void *token[4]; + svn_boolean_t is_match, reached_all_eof; + int i, rv; + + *prefix_lines = 0; + *reached_one_eof = FALSE; + + /* Read first token from every datasource, and check if it matches. */ + for (i = 0; i < datasource_len; i++) + { + SVN_ERR(vtable->datasource_get_next_token(NULL, &token[i], + diff_baton, datasource[i])); + *reached_one_eof = *reached_one_eof || token[i] == NULL; + } + if (*reached_one_eof) + { + /* We're done. Push back tokens that we may have read too much. */ + for (i = 0; i < datasource_len; i++) + if (token[i] != NULL) + SVN_ERR(vtable->token_pushback_prefix(diff_baton, token[i], datasource[i])); + return SVN_NO_ERROR; + } + for (i = 1, is_match = TRUE; is_match && i < datasource_len; i++) + { + SVN_ERR(vtable->token_compare(diff_baton, token[0], token[i], &rv)); + is_match = is_match && rv == 0; + } + + /* While tokens match up, continue getting tokens and matching. */ + while (is_match) + { + (*prefix_lines)++; + for (i = 0; i < datasource_len; i++) + { + SVN_ERR(vtable->datasource_get_next_token(NULL, &token[i], + diff_baton, datasource[i])); + *reached_one_eof = *reached_one_eof || token[i] == NULL; + } + if (*reached_one_eof) + break; + else + for (i = 1, is_match = TRUE; is_match && i < datasource_len; i++) + { + SVN_ERR(vtable->token_compare(diff_baton, token[0], token[i], &rv)); + is_match = is_match && rv == 0; + } + } + + /* If all files reached their end (i.e. are fully identical), we're done. */ + for (i = 0, reached_all_eof = TRUE; i < datasource_len; i++) + reached_all_eof = reached_all_eof && token[i] == NULL; + if (reached_all_eof) + return SVN_NO_ERROR; + + /* Push back the non-matching token we read. */ + for (i = 0; i < datasource_len; i++) + if (token[i] != NULL) + SVN_ERR(vtable->token_pushback_prefix(diff_baton, token[i], datasource[i])); + + return SVN_NO_ERROR; +} + +/* + * Get all tokens from the datasources. For each datasource, return the + * last item in the (circular) list. + */ +svn_error_t * +svn_diff__get_all_tokens(svn_diff__position_t **position_list[], + apr_off_t *prefix_lines, + svn_diff__tree_t *tree, + void *diff_baton, + const svn_diff_fns_t *vtable, + svn_diff_datasource_e datasource[], + int datasource_len, + apr_pool_t *pool) +{ + svn_diff__position_t *start_position; + svn_diff__position_t *position = NULL; + svn_diff__position_t **position_ref; + svn_diff__node_t *node; + void *token; + apr_off_t offset; + apr_uint32_t hash; + svn_boolean_t reached_one_eof; + int i; + + for (i = 0; i < datasource_len; i++) + { + *position_list[i] = NULL; + SVN_ERR(vtable->datasource_open(diff_baton, datasource[i])); + } + + /* find identical prefix */ + SVN_ERR(find_identical_prefix(&reached_one_eof, prefix_lines, + diff_baton, vtable, datasource, datasource_len)); + + /* ### TODO: find identical suffix (if not eof) */ + + for (i = 0; i < datasource_len; i++) + { + position_ref = &start_position; + offset = *prefix_lines; + hash = 0; /* The callback fn doesn't need to touch it per se */ + while (1) + { + SVN_ERR(vtable->datasource_get_next_token(&hash, &token, + diff_baton, datasource[i])); + if (token == NULL) + break; + + offset++; + SVN_ERR(svn_diff__tree_insert_token(&node, tree, + diff_baton, vtable, + hash, token)); + + /* Create a new position */ + position = apr_palloc(pool, sizeof(*position)); + position->next = NULL; + position->node = node; + position->offset = offset; + + *position_ref = position; + position_ref = &position->next; + } + + *position_ref = start_position; + + SVN_ERR(vtable->datasource_close(diff_baton, datasource[i])); + + *position_list[i] = position; + } + + return SVN_NO_ERROR; +}