[PATCH v4 1/2] refs.c: optimize check_refname_component()

2014-05-31 Thread David Turner
In a repository with many refs, check_refname_component can be a major
contributor to the runtime of some git commands. One such command is

git rev-parse HEAD

Timings for one particular repo, with about 60k refs, almost all
packed, are:

Old: 35 ms
New: 29 ms

Many other commands which read refs are also sped up.

Signed-off-by: David Turner 
---
 refs.c | 68 --
 1 file changed, 41 insertions(+), 27 deletions(-)

diff --git a/refs.c b/refs.c
index 28d5eca..62e2301 100644
--- a/refs.c
+++ b/refs.c
@@ -5,9 +5,32 @@
 #include "dir.h"
 #include "string-list.h"
 
+/* How to handle various characters in refnames:
+ * 0: An acceptable character for refs
+ * 1: End-of-component
+ * 2: ., look for a following . to reject .. in refs
+ * 3: @, look for a following { to reject @{ in refs
+ * 9: A bad character, reject ref
+ *
+ * See below for the list of illegal characters, from which
+ * this table is derived.
+ */
+static unsigned char refname_disposition[] = {
+   1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+   9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+   9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 2, 1,
+   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 9,
+   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 9, 0, 9, 0,
+   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 9, 9
+};
+
 /*
- * Make sure "ref" is something reasonable to have under ".git/refs/";
- * We do not like it if:
+ * Try to read one refname component from the front of refname.
+ * Return the length of the component found, or -1 if the component is
+ * not legal.  It is legal if it is something reasonable to have under
+ * ".git/refs/"; We do not like it if:
  *
  * - any path component of it begins with ".", or
  * - it has double dots "..", or
@@ -15,24 +38,7 @@
  * - it ends with a "/".
  * - it ends with ".lock"
  * - it contains a "\" (backslash)
- */
 
-/* Return true iff ch is not allowed in reference names. */
-static inline int bad_ref_char(int ch)
-{
-   if (((unsigned) ch) <= ' ' || ch == 0x7f ||
-   ch == '~' || ch == '^' || ch == ':' || ch == '\\')
-   return 1;
-   /* 2.13 Pattern Matching Notation */
-   if (ch == '*' || ch == '?' || ch == '[') /* Unsupported */
-   return 1;
-   return 0;
-}
-
-/*
- * Try to read one refname component from the front of refname.  Return
- * the length of the component found, or -1 if the component is not
- * legal.
  */
 static int check_refname_component(const char *refname, int flags)
 {
@@ -40,17 +46,25 @@ static int check_refname_component(const char *refname, int 
flags)
char last = '\0';
 
for (cp = refname; ; cp++) {
-   char ch = *cp;
-   if (ch == '\0' || ch == '/')
+   unsigned char ch = (unsigned char) *cp;
+   char disp = refname_disposition[ch];
+   switch(disp) {
+   case 1:
+   goto out;
+   case 2:
+   if (last == '.')
+   return -1; /* Refname contains "..". */
break;
-   if (bad_ref_char(ch))
-   return -1; /* Illegal character in refname. */
-   if (last == '.' && ch == '.')
-   return -1; /* Refname contains "..". */
-   if (last == '@' && ch == '{')
-   return -1; /* Refname contains "@{". */
+   case 3:
+   if (last == '@')
+   return -1; /* Refname contains "@{". */
+   break;
+   case 9:
+   return -1;
+   }
last = ch;
}
+out:
if (cp == refname)
return 0; /* Component has zero length. */
if (refname[0] == '.') {
-- 
2.0.0.rc1.18.gf763c0f

--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v4 1/2] refs.c: optimize check_refname_component()

2014-06-01 Thread Andreas Schwab
David Turner  writes:

> +static unsigned char refname_disposition[] = {
> + 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
> + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
> + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 2, 1,
> + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 9,
> + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 9, 0, 9, 0,
> + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 9, 9
> +};

> + unsigned char ch = (unsigned char) *cp;
> + char disp = refname_disposition[ch];

Undefined behaviour.

Andreas.

-- 
Andreas Schwab, sch...@linux-m68k.org
GPG Key fingerprint = 58CA 54C7 6D53 942B 1756  01D3 44D5 214B 8276 4ED5
"And now for something completely different."
--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v4 1/2] refs.c: optimize check_refname_component()

2014-06-01 Thread Philip Oakley

From: "David Turner" 

In a repository with many refs, check_refname_component can be a major
contributor to the runtime of some git commands. One such command is

git rev-parse HEAD

Timings for one particular repo, with about 60k refs, almost all
packed, are:

Old: 35 ms
New: 29 ms

Many other commands which read refs are also sped up.

Signed-off-by: David Turner 
---
refs.c | 68 
--

1 file changed, 41 insertions(+), 27 deletions(-)

diff --git a/refs.c b/refs.c
index 28d5eca..62e2301 100644
--- a/refs.c
+++ b/refs.c
@@ -5,9 +5,32 @@
#include "dir.h"
#include "string-list.h"

+/* How to handle various characters in refnames:
+ * 0: An acceptable character for refs
+ * 1: End-of-component
+ * 2: ., look for a following . to reject .. in refs
+ * 3: @, look for a following { to reject @{ in refs
+ * 9: A bad character, reject ref
+ *
+ * See below for the list of illegal characters, from which
+ * this table is derived.


The "see below" doesn't quite scan right.  Perhaps
   The character handling rules above are used to
   derive the table below.


+ */
+static unsigned char refname_disposition[] = {
+ 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 2, 1,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 9,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 9, 0, 9, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 9, 9
+};
+
/*
- * Make sure "ref" is something reasonable to have under 
".git/refs/";

- * We do not like it if:
+ * Try to read one refname component from the front of refname.
+ * Return the length of the component found, or -1 if the component 
is
+ * not legal.  It is legal if it is something reasonable to have 
under

+ * ".git/refs/"; We do not like it if:
 *
 * - any path component of it begins with ".", or
 * - it has double dots "..", or
@@ -15,24 +38,7 @@
 * - it ends with a "/".
 * - it ends with ".lock"
 * - it contains a "\" (backslash)
- */

-/* Return true iff ch is not allowed in reference names. */
-static inline int bad_ref_char(int ch)
-{
- if (((unsigned) ch) <= ' ' || ch == 0x7f ||
- ch == '~' || ch == '^' || ch == ':' || ch == '\\')
- return 1;
- /* 2.13 Pattern Matching Notation */
- if (ch == '*' || ch == '?' || ch == '[') /* Unsupported */
- return 1;
- return 0;
-}
-
-/*
- * Try to read one refname component from the front of refname. 
Return

- * the length of the component found, or -1 if the component is not
- * legal.
 */
static int check_refname_component(const char *refname, int flags)
{
@@ -40,17 +46,25 @@ static int check_refname_component(const char 
*refname, int flags)

 char last = '\0';

 for (cp = refname; ; cp++) {
- char ch = *cp;
- if (ch == '\0' || ch == '/')
+ unsigned char ch = (unsigned char) *cp;
+ char disp = refname_disposition[ch];
+ switch(disp) {
+ case 1:
+ goto out;
+ case 2:
+ if (last == '.')
+ return -1; /* Refname contains "..". */
 break;
- if (bad_ref_char(ch))
- return -1; /* Illegal character in refname. */
- if (last == '.' && ch == '.')
- return -1; /* Refname contains "..". */
- if (last == '@' && ch == '{')
- return -1; /* Refname contains "@{". */
+ case 3:
+ if (last == '@')
+ return -1; /* Refname contains "@{". */
+ break;
+ case 9:
+ return -1;
+ }
 last = ch;
 }
+out:
 if (cp == refname)
 return 0; /* Component has zero length. */
 if (refname[0] == '.') {
--
2.0.0.rc1.18.gf763c0f

--


Philip 


--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v4 1/2] refs.c: optimize check_refname_component()

2014-06-01 Thread Michael Haggerty
Thanks for splitting this up into two patches.  See my comments below.

On 06/01/2014 07:17 AM, David Turner wrote:
> In a repository with many refs, check_refname_component can be a major
> contributor to the runtime of some git commands. One such command is
> 
> git rev-parse HEAD
> 
> Timings for one particular repo, with about 60k refs, almost all
> packed, are:
> 
> Old: 35 ms
> New: 29 ms
> 
> Many other commands which read refs are also sped up.
> 
> Signed-off-by: David Turner 
> ---
>  refs.c | 68 
> --
>  1 file changed, 41 insertions(+), 27 deletions(-)
> 
> diff --git a/refs.c b/refs.c
> index 28d5eca..62e2301 100644
> --- a/refs.c
> +++ b/refs.c
> @@ -5,9 +5,32 @@
>  #include "dir.h"
>  #include "string-list.h"
>  
> +/* How to handle various characters in refnames:

We format multiline comments with no text on the opening line:

/*
 * How to handle...
 * ... is derived.
 */

> + * 0: An acceptable character for refs
> + * 1: End-of-component
> + * 2: ., look for a following . to reject .. in refs
> + * 3: @, look for a following { to reject @{ in refs
> + * 9: A bad character, reject ref

This explanation does not agree with the code (or I'm not reading it
correctly).  For example, the character with disposition 3 is '{', not
'@', so ISTM the comment should be

 * 3: {, look for a preceding @ to reject @{ in refnames

BTW, is there a special reason that the values in your table jump from 3
to 9?  I imagine that compilers would use a jump table to implement the
"switch" statement where these values are used, and they might have a
slightly easier time if there are no "holes" between the legal values.

> + *
> + * See below for the list of illegal characters, from which
> + * this table is derived.
> + */
> +static unsigned char refname_disposition[] = {

I think you need to define the length explicitly to 256 here to cause
the entries for 0x80..0xff to be created and initialized to zeros.

The fact that you didn't notice this bug suggests that there might be no
tests involving refnames with non-ASCII characters, which would be
another nice thing to remedy while you are working in this area.

> + 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
> + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
> + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 2, 1,
> + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 9,
> + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 9, 0, 9, 0,
> + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 9, 9
> +};
> +
>  /*
> - * Make sure "ref" is something reasonable to have under ".git/refs/";
> - * We do not like it if:
> + * Try to read one refname component from the front of refname.
> + * Return the length of the component found, or -1 if the component is
> + * not legal.  It is legal if it is something reasonable to have under
> + * ".git/refs/"; We do not like it if:
>   *
>   * - any path component of it begins with ".", or
>   * - it has double dots "..", or
> @@ -15,24 +38,7 @@
>   * - it ends with a "/".
>   * - it ends with ".lock"
>   * - it contains a "\" (backslash)
> - */
>  

^^^ doesn't this leave a blank line inside the comment?

> -/* Return true iff ch is not allowed in reference names. */
> -static inline int bad_ref_char(int ch)
> -{
> - if (((unsigned) ch) <= ' ' || ch == 0x7f ||
> - ch == '~' || ch == '^' || ch == ':' || ch == '\\')
> - return 1;
> - /* 2.13 Pattern Matching Notation */
> - if (ch == '*' || ch == '?' || ch == '[') /* Unsupported */
> - return 1;
> - return 0;
> -}
> -
> -/*
> - * Try to read one refname component from the front of refname.  Return
> - * the length of the component found, or -1 if the component is not
> - * legal.
>   */
> [...]

Michael

-- 
Michael Haggerty
mhag...@alum.mit.edu

--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html