Re: [Patch v2] lib: regexp matching in 'subject' and 'from'

2017-01-19 Thread David Bremner
David Bremner  writes:

> the idea is that you can run
>
> % notmuch search re:subject:
> % notmuch search re:from:'
>
> or
>
> % notmuch search subject:"your usual phrase search"
> % notmuch search from:"usual phrase search"

I'm not sure how useful it is, but here's an interdiff.

diff --git a/lib/database-private.h b/lib/database-private.h
index e7cbed8f..92f4b72f 100644
--- a/lib/database-private.h
+++ b/lib/database-private.h
@@ -190,7 +190,8 @@ struct _notmuch_database {
 #if HAVE_XAPIAN_FIELD_PROCESSOR
 Xapian::FieldProcessor *date_field_processor;
 Xapian::FieldProcessor *query_field_processor;
-Xapian::FieldProcessor *re_field_processor;
+Xapian::FieldProcessor *re_from_field_processor;
+Xapian::FieldProcessor *re_subject_field_processor;
 #endif
 Xapian::ValueRangeProcessor *last_mod_range_processor;
 };
diff --git a/lib/database.cc b/lib/database.cc
index 851a62d1..2b2f8f5e 100644
--- a/lib/database.cc
+++ b/lib/database.cc
@@ -1043,8 +1043,10 @@ notmuch_database_open_verbose (const char *path,
notmuch->query_parser->add_boolean_prefix("date", 
notmuch->date_field_processor);
notmuch->query_field_processor = new QueryFieldProcessor 
(*notmuch->query_parser, notmuch);
notmuch->query_parser->add_boolean_prefix("query", 
notmuch->query_field_processor);
-   notmuch->re_field_processor = new RegexpFieldProcessor 
(*notmuch->query_parser, notmuch);
-   notmuch->query_parser->add_boolean_prefix("re", 
notmuch->re_field_processor);
+   notmuch->re_from_field_processor = new RegexpFieldProcessor ("from", 
*notmuch->query_parser, notmuch);
+   notmuch->re_subject_field_processor = new RegexpFieldProcessor 
("subject", *notmuch->query_parser, notmuch);
+   notmuch->query_parser->add_boolean_prefix("re_from", 
notmuch->re_from_field_processor);
+   notmuch->query_parser->add_boolean_prefix("re_subject", 
notmuch->re_subject_field_processor);
 #endif
notmuch->last_mod_range_processor = new 
Xapian::NumberValueRangeProcessor (NOTMUCH_VALUE_LAST_MOD, "lastmod:");
 
@@ -1141,8 +1143,10 @@ notmuch_database_close (notmuch_database_t *notmuch)
 notmuch->date_field_processor = NULL;
 delete notmuch->query_field_processor;
 notmuch->query_field_processor = NULL;
-delete notmuch->re_field_processor;
-notmuch->re_field_processor = NULL;
+delete notmuch->re_from_field_processor;
+notmuch->re_from_field_processor = NULL;
+delete notmuch->re_subject_field_processor;
+notmuch->re_subject_field_processor = NULL;
 #endif
 
 return status;
diff --git a/lib/regexp-fields.cc b/lib/regexp-fields.cc
index 4d3d9721..211ec02d 100644
--- a/lib/regexp-fields.cc
+++ b/lib/regexp-fields.cc
@@ -101,25 +101,10 @@ RegexpPostingSource::next (unused (double min_wt))
 }
 }
 
-static Xapian::valueno
-_find_slot (std::string prefix)
-{
-if (prefix == "from")
-   return NOTMUCH_VALUE_FROM;
-else if (prefix == "subject")
-   return NOTMUCH_VALUE_SUBJECT;
-else
-   throw Xapian::QueryParserError ("unsupported regexp field '" + prefix + 
"'");
-}
-
 Xapian::Query
 RegexpFieldProcessor::operator() (const std::string & str)
 {
-size_t pos = str.find_first_of (':');
-std::string prefix = str.substr (0, pos);
-std::string regexp = str.substr (pos + 1);
-
-postings = new RegexpPostingSource (_find_slot (prefix), regexp);
+postings = new RegexpPostingSource (slot, str);
 return Xapian::Query (postings);
 }
 #endif
diff --git a/lib/regexp-fields.h b/lib/regexp-fields.h
index 2c9c2d7e..c2c44167 100644
--- a/lib/regexp-fields.h
+++ b/lib/regexp-fields.h
@@ -61,13 +61,26 @@ class RegexpPostingSource : public Xapian::PostingSource
 
 class RegexpFieldProcessor : public Xapian::FieldProcessor {
  protected:
+Xapian::valueno slot;
 Xapian::QueryParser 
 notmuch_database_t *notmuch;
 RegexpPostingSource *postings = NULL;
 
+
+static inline Xapian::valueno _find_slot (std::string prefix)
+{
+   if (prefix == "from")
+   return NOTMUCH_VALUE_FROM;
+   else if (prefix == "subject")
+   return NOTMUCH_VALUE_SUBJECT;
+   else
+   throw Xapian::QueryParserError ("unsupported regexp field '" + 
prefix + "'");
+}
+
+
  public:
-RegexpFieldProcessor (Xapian::QueryParser _, notmuch_database_t 
*notmuch_)
-   : parser(parser_), notmuch(notmuch_) { };
+RegexpFieldProcessor (std::string prefix, Xapian::QueryParser _, 
notmuch_database_t *notmuch_)
+   : slot(_find_slot (prefix)), parser(parser_), notmuch(notmuch_) { };
 
 ~RegexpFieldProcessor () { delete postings; };
 
diff --git a/test/T630-regexp-query.sh b/test/T630-regexp-query.sh
index 3bbe47cf..1b25634d 100755
--- a/test/T630-regexp-query.sh
+++ b/test/T630-regexp-query.sh
@@ -10,15 +10,15 @@ if [ $NOTMUCH_HAVE_XAPIAN_FIELD_PROCESSOR -eq 1 ]; then
 notmuch search --output=messages from:cworth > cworth.msg-ids
 
 test_begin_subtest 

Re: [Patch v2] lib: regexp matching in 'subject' and 'from'

2017-01-18 Thread David Bremner
Jani Nikula  writes:


> I played around with this a bit, and it seemed to work. Unsurprisingly,
> getting the quoting right was the hardest part. Even though I know how
> the stuff works under the hood, it took me a while to realize that you
> have to use 're:"subject:"' to make it work. (I kept
> trying 're:subject:""'.) I don't know if there's
> anything we could really do about this.
>

I _think_ we could add distinct prefixes at the xapian level for each
regex-prefix. That opens the can of worms of naming them eg re-subject:
and re-from:. I'm not sure about the added complexity, but I think it's
a matter of adding an extra argument to the field processor constructor.

d
___
notmuch mailing list
notmuch@notmuchmail.org
https://notmuchmail.org/mailman/listinfo/notmuch


Re: [Patch v2] lib: regexp matching in 'subject' and 'from'

2017-01-18 Thread Jani Nikula
On Mon, 14 Nov 2016, David Bremner  wrote:
> the idea is that you can run
>
> % notmuch search re:subject:
> % notmuch search re:from:'
>
> or
>
> % notmuch search subject:"your usual phrase search"
> % notmuch search from:"usual phrase search"
>
> This should also work with bindings, since it extends the query parser.
>
> This is trivial to extend for other value slots, but currently the only
> value slots are date, message_id, from, subject, and last_mod. Date is
> already searchable, and message_id is not obviously useful to regex
> match.
>
> This was originally written by Austin Clements, and ported to Xapian
> field processors (from Austin's custom query parser) by yours truly.

I can't say I would have done a detailed review of all the Xapian bits
and pieces here, but I didn't spot anything obviously wrong either.

I suppose I'd prefer the documentation to be more explicit about
"re:subject:" and "re:from:" instead of having the generic "re::"
that I think is bound to confuse people.

The _ suffixes instead of prefixes in variables seemed a bit odd, but no
strong opinions on it.

I played around with this a bit, and it seemed to work. Unsurprisingly,
getting the quoting right was the hardest part. Even though I know how
the stuff works under the hood, it took me a while to realize that you
have to use 're:"subject:"' to make it work. (I kept
trying 're:subject:""'.) I don't know if there's
anything we could really do about this.

BR,
Jani.



> ---
>
> rebase of id:1467034387-16885-1-git-send-email-da...@tethera.net against 
> master
>
>  doc/man7/notmuch-search-terms.rst |  17 +-
>  lib/Makefile.local|   1 +
>  lib/database-private.h|   1 +
>  lib/database.cc   |   5 ++
>  lib/regexp-fields.cc  | 125 
> ++
>  lib/regexp-fields.h   |  77 +++
>  test/T630-regexp-query.sh |  91 +++
>  7 files changed, 316 insertions(+), 1 deletion(-)
>  create mode 100644 lib/regexp-fields.cc
>  create mode 100644 lib/regexp-fields.h
>  create mode 100755 test/T630-regexp-query.sh
>
> diff --git a/doc/man7/notmuch-search-terms.rst 
> b/doc/man7/notmuch-search-terms.rst
> index de93d73..4c7afc2 100644
> --- a/doc/man7/notmuch-search-terms.rst
> +++ b/doc/man7/notmuch-search-terms.rst
> @@ -60,6 +60,8 @@ indicate user-supplied values):
>  
>  -  property:=
>  
> +- re:{subject,from}:
> +
>  The **from:** prefix is used to match the name or address of the sender
>  of an email message.
>  
> @@ -146,6 +148,12 @@ The **property:** prefix searches for messages with a 
> particular
>  (and extensions) to add metadata to messages. A given key can be
>  present on a given message with several different values.
>  
> +The **re::** prefix can be used to restrict the results to
> +those whose  matches the given regular expression (see
> +**regex(7)**). Regular expression searches are only available if
> +notmuch is built with **Xapian Field Processors** (see below), and
> +currently only for the Subject and From fields.
> +
>  Operators
>  -
>  
> @@ -220,13 +228,19 @@ Boolean and Probabilistic Prefixes
>  --
>  
>  Xapian (and hence notmuch) prefixes are either **boolean**, supporting
> -exact matches like "tag:inbox"  or **probabilistic**, supporting a more 
> flexible **term** based searching. The prefixes currently supported by 
> notmuch are as follows.
> +exact matches like "tag:inbox" or **probabilistic**, supporting a more
> +flexible **term** based searching. Certain **special** prefixes are
> +processed by notmuch in a way not stricly fitting either of Xapian's
> +built in styles. The prefixes currently supported by notmuch are as
> +follows.
>  
>  
>  Boolean
> **tag:**, **id:**, **thread:**, **folder:**, **path:**, **property:**
>  Probabilistic
> **from:**, **to:**, **subject:**, **attachment:**, **mimetype:**
> +Special
> +   **query:**, **re:**
>  
>  Terms and phrases
>  -
> @@ -396,6 +410,7 @@ Currently the following features require field processor 
> support:
>  
>  - non-range date queries, e.g. "date:today"
>  - named queries e.g. "query:my_special_query"
> +- regular expression searches, e.g. "re:subject:^\\[SPAM\\]"
>  
>  SEE ALSO
>  
> diff --git a/lib/Makefile.local b/lib/Makefile.local
> index 3d1030a..ccd32ab 100644
> --- a/lib/Makefile.local
> +++ b/lib/Makefile.local
> @@ -53,6 +53,7 @@ libnotmuch_cxx_srcs =   \
>   $(dir)/query.cc \
>   $(dir)/query-fp.cc  \
>   $(dir)/config.cc\
> + $(dir)/regexp-fields.cc \
>   $(dir)/thread.cc
>  
>  libnotmuch_modules := $(libnotmuch_c_srcs:.c=.o) 
> $(libnotmuch_cxx_srcs:.cc=.o)
> diff --git a/lib/database-private.h b/lib/database-private.h
> index ca71a92..900a989 100644
> --- a/lib/database-private.h
> +++ b/lib/database-private.h
> @@ -186,6 +186,7 @@