Quoth Jani Nikula on Jan 03 at 5:48 pm: > On Mon, 31 Dec 2012, Austin Clements <amdragon at MIT.EDU> wrote: > > From: Austin Clements <amdragon at MIT.EDU> > > > > This is now a generic boolean term quoting function. It performs > > minimal quoting to produce user-friendly queries. > > > > This could live in tag-util as well, but it is really nothing specific > > to tags (although the conventions are specific to Xapian). > > > > The API is changed from "caller-allocates" to "readline-like". The > > scan for max tag length is pushed down into the quoting routine. > > Furthermore, this now combines the term prefix with the quoted term; > > arguably this is just as easy to do in the caller, but this will > > nicely parallel the boolean term parsing function to be introduced > > shortly. > > > > This is an amalgamation of code written by David Bremner and myself. > > --- > > notmuch-tag.c | 48 ++++++++++++--------------------------- > > util/string-util.c | 64 > > ++++++++++++++++++++++++++++++++++++++++++++++++++++ > > util/string-util.h | 14 ++++++++++++ > > 3 files changed, 92 insertions(+), 34 deletions(-) > > > > diff --git a/notmuch-tag.c b/notmuch-tag.c > > index 88d559b..fc9d43a 100644 > > --- a/notmuch-tag.c > > +++ b/notmuch-tag.c > > @@ -19,6 +19,7 @@ > > */ > > > > #include "notmuch-client.h" > > +#include "string-util.h" > > > > static volatile sig_atomic_t interrupted; > > > > @@ -35,25 +36,6 @@ handle_sigint (unused (int sig)) > > interrupted = 1; > > } > > > > -static char * > > -_escape_tag (char *buf, const char *tag) > > -{ > > - const char *in = tag; > > - char *out = buf; > > - > > - /* Boolean terms surrounded by double quotes can contain any > > - * character. Double quotes are quoted by doubling them. */ > > - *out++ = '"'; > > - while (*in) { > > - if (*in == '"') > > - *out++ = '"'; > > - *out++ = *in++; > > - } > > - *out++ = '"'; > > - *out = 0; > > - return buf; > > -} > > - > > typedef struct { > > const char *tag; > > notmuch_bool_t remove; > > @@ -71,25 +53,16 @@ _optimize_tag_query (void *ctx, const char > > *orig_query_string, > > * parenthesize and the exclusion part of the query must not use > > * the '-' operator (though the NOT operator is fine). */ > > > > - char *escaped, *query_string; > > + char *escaped = NULL; > > + size_t escaped_len = 0; > > + char *query_string; > > const char *join = ""; > > - int i; > > - unsigned int max_tag_len = 0; > > + size_t i; > > > > /* Don't optimize if there are no tag changes. */ > > if (tag_ops[0].tag == NULL) > > return talloc_strdup (ctx, orig_query_string); > > > > - /* Allocate a buffer for escaping tags. This is large enough to > > - * hold a fully escaped tag with every character doubled plus > > - * enclosing quotes and a NUL. */ > > - for (i = 0; tag_ops[i].tag; i++) > > - if (strlen (tag_ops[i].tag) > max_tag_len) > > - max_tag_len = strlen (tag_ops[i].tag); > > - escaped = talloc_array (ctx, char, max_tag_len * 2 + 3); > > - if (! escaped) > > - return NULL; > > - > > /* Build the new query string */ > > if (strcmp (orig_query_string, "*") == 0) > > query_string = talloc_strdup (ctx, "("); > > @@ -97,10 +70,17 @@ _optimize_tag_query (void *ctx, const char > > *orig_query_string, > > query_string = talloc_asprintf (ctx, "( %s ) and (", orig_query_string); > > > > for (i = 0; tag_ops[i].tag && query_string; i++) { > > + /* XXX in case of OOM, query_string will be deallocated when > > + * ctx is, which might be at shutdown */ > > + if (make_boolean_term (ctx, > > + "tag", tag_ops[i].tag, > > + &escaped, &escaped_len)) > > + return NULL; > > + > > query_string = talloc_asprintf_append_buffer ( > > - query_string, "%s%stag:%s", join, > > + query_string, "%s%s%s", join, > > tag_ops[i].remove ? "" : "not ", > > - _escape_tag (escaped, tag_ops[i].tag)); > > + escaped); > > join = " or "; > > } > > > > diff --git a/util/string-util.c b/util/string-util.c > > index 44f8cd3..e4bea21 100644 > > --- a/util/string-util.c > > +++ b/util/string-util.c > > @@ -20,6 +20,7 @@ > > > > > > #include "string-util.h" > > +#include "talloc.h" > > > > char * > > strtok_len (char *s, const char *delim, size_t *len) > > @@ -32,3 +33,66 @@ strtok_len (char *s, const char *delim, size_t *len) > > > > return *len ? s : NULL; > > } > > + > > +int > > +make_boolean_term (void *ctx, const char *prefix, const char *term, > > + char **buf, size_t *len) > > +{ > > + const char *in; > > + char *out; > > + size_t needed = 3; > > + int need_quoting = 0; > > + > > + /* Do we need quoting? To be paranoid, we quote anything > > + * containing a quote, even though it only matters at the > > + * beginning, and anything containing non-ASCII text. */ > > + for (in = term; *in && !need_quoting; in++) > > + if (*in <= ' ' || *in == ')' || *in == '"' || (unsigned char)*in > 127) > > Should that be *in >= 127?
Nope. Character 127 is fine (and ASCII). Technically the only non-ASCII characters that require quoting are 0x201c and 0x201d, but rather than decoding UTF-8 to find those characters, it's much easier to just quote if there are any non-ASCII UTF-8 bytes. (Extra technically, we would be in real trouble if a tag contained 8-bit bytes but wasn't valid UTF-8; however, I think this would be the least of our worries.) > Otherwise LGTM. > > Jani. > > > + need_quoting = 1; > > + > > + if (need_quoting) > > + for (in = term; *in; in++) > > + needed += (*in == '"') ? 2 : 1; > > + else > > + needed = strlen (term) + 1; > > + > > + /* Reserve space for the prefix */ > > + if (prefix) > > + needed += strlen (prefix) + 1; > > + > > + if ((*buf == NULL) || (needed > *len)) { > > + *len = 2 * needed; > > + *buf = talloc_realloc (ctx, *buf, char, *len); > > + } > > + > > + if (! *buf) > > + return 1; > > + > > + out = *buf; > > + > > + /* Copy in the prefix */ > > + if (prefix) { > > + strcpy (out, prefix); > > + out += strlen (prefix); > > + *out++ = ':'; > > + } > > + > > + if (! need_quoting) { > > + strcpy (out, term); > > + return 0; > > + } > > + > > + /* Quote term by enclosing it in double quotes and doubling any > > + * internal double quotes. */ > > + *out++ = '"'; > > + in = term; > > + while (*in) { > > + if (*in == '"') > > + *out++ = '"'; > > + *out++ = *in++; > > + } > > + *out++ = '"'; > > + *out = '\0'; > > + > > + return 0; > > +} > > diff --git a/util/string-util.h b/util/string-util.h > > index ac7676c..b8844a3 100644 > > --- a/util/string-util.h > > +++ b/util/string-util.h > > @@ -19,4 +19,18 @@ > > > > char *strtok_len (char *s, const char *delim, size_t *len); > > > > +/* Construct a boolean term query with the specified prefix (e.g., > > + * "id") and search term, quoting term as necessary. Specifically, if > > + * term contains any non-printable ASCII characters, non-ASCII > > + * characters, close parenthesis or double quotes, it will be enclosed > > + * in double quotes and any internal double quotes will be doubled > > + * (e.g. a"b -> "a""b"). The result will be a valid notmuch query and > > + * can be parsed by parse_boolean_term. > > + * > > + * Output is into buf; it may be talloc_realloced. > > + * Return: 0 on success, non-zero on memory allocation failure. > > + */ > > +int make_boolean_term (void *talloc_ctx, const char *prefix, const char > > *term, > > + char **buf, size_t *len); > > + > > #endif -- Austin Clements MIT/'06/PhD/CSAIL amdragon at mit.edu http://web.mit.edu/amdragon Somewhere in the dream we call reality you will find me, searching for the reality we call dreams.