On disk tag storage format

2013-02-20 Thread David Bremner
David Bremner  writes:

> Austin outlined on IRC a way of representing tags on disk as hardlinks
> to messages. In order to make the discussion more concrete, I wrote a
> prototype in python to dump the notmuch database to this format. On my
> 250k messages, this creates 40k new hardlinks, and uses about 5M of
> diskspace. The dump process takes about 20s on
> my core i7 machine.  With symbolic links, the same database takes about
> 150M of disk space; this isn't great but it isn't unbearable either.
>

I've been playing a bit with this script and it seems more or less
usable as a way of mirroring the notmuch tag database to a link farm.

It's a bit faster than my current dump/restore based approach, although
if you want to keep the results in a git repository then it takes up
more space. Of course the bonus with this approach is that it creates
"virtual" maildirs for each tag that can be browsed with the maildir
client of choice.

The current default is to use some mix of hard and symbolic links to try
to balance the space consumed in a git repo versus the inode
consumption/performance issues of using too many symlinks.

It's still a prototype, and there is not much error checking, and there
are certain issues not dealt with at all (the ones I thought about are
commented).

-- next part --
A non-text attachment was scrubbed...
Name: linksync.py
Type: text/x-python
Size: 5194 bytes
Desc: not available
URL: 
<http://notmuchmail.org/pipermail/notmuch/attachments/20130220/351bd585/attachment.py>


[Patch v2 4/4] nmbug: allow empty prefix

2013-02-20 Thread da...@tethera.net
From: David Bremner 

Current code does not distinguish between an empty string in the
NMBPREFIX environment variable and the variable being undefined. This
makes it impossible to define an empty prefix, if, e.g. somebody wants
to dump all of their tags with nmbug.
---
 devel/nmbug/nmbug |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/devel/nmbug/nmbug b/devel/nmbug/nmbug
index b9c70e4..90d98b6 100755
--- a/devel/nmbug/nmbug
+++ b/devel/nmbug/nmbug
@@ -13,7 +13,7 @@ my $NMBGIT = $ENV{NMBGIT} || $ENV{HOME}.'/.nmbug';

 $NMBGIT .= '/.git' if (-d $NMBGIT.'/.git');

-my $TAGPREFIX = $ENV{NMBPREFIX} || 'notmuch::';
+my $TAGPREFIX = defined($ENV{NMBPREFIX}) ? $ENV{NMBPREFIX} : 'notmuch::';

 # for encoding

-- 
1.7.10.4



[Patch v2 3/4] nmbug: replace hard-coded magic hash with git-hash-object

2013-02-20 Thread da...@tethera.net
From: David Bremner 

This is at least easier to understand than the magic hash. It may also
be a bit more robust, although it is hard to imagine these numbers
changing without many other changes in git.
---
 devel/nmbug/nmbug |6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/devel/nmbug/nmbug b/devel/nmbug/nmbug
index 73d64fe..b9c70e4 100755
--- a/devel/nmbug/nmbug
+++ b/devel/nmbug/nmbug
@@ -15,9 +15,6 @@ $NMBGIT .= '/.git' if (-d $NMBGIT.'/.git');

 my $TAGPREFIX = $ENV{NMBPREFIX} || 'notmuch::';

-# magic hash for git
-my $EMPTYBLOB = 'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391';
-
 # for encoding

 my $ESCAPE_CHAR =  '%';
@@ -50,6 +47,9 @@ if (!exists $command{$subcommand}) {
   usage ();
 }

+# magic hash for git
+my $EMPTYBLOB = git (qw{hash-object -t blob /dev/null});
+
 &{$command{$subcommand}}(@ARGV);

 sub git_pipe {
-- 
1.7.10.4



[Patch v2 2/4] nmbug: use 'notmuch tag --batch'

2013-02-20 Thread da...@tethera.net
From: David Bremner 

This should be more robust with respect to tags with whitespace and
other special characters. It also (hopefully) fixes a remaining
bug handling message-ids with whitespace.  It should also be
noticeably faster for large sets of changes since it does one exec per
change set as opposed to one exec per tag changed.
---
 devel/nmbug/nmbug |   27 ++-
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/devel/nmbug/nmbug b/devel/nmbug/nmbug
index befc3d9..73d64fe 100755
--- a/devel/nmbug/nmbug
+++ b/devel/nmbug/nmbug
@@ -267,6 +267,20 @@ sub do_checkout {
   do_sync (action => 'checkout');
 }

+sub quote_for_xapian {
+  my $str = shift;
+  $str =~ s/"/""/g;
+  return '"' . $str . '"';
+}
+
+sub pair_to_batch_line {
+  my ($action, $pair) = @_;
+
+  # the tag should already be suitably encoded
+
+  return $action . $ENCPREFIX . $pair->{tag} .
+' -- id:' . quote_for_xapian ($pair->{id})."\n";
+}

 sub do_sync {

@@ -283,17 +297,20 @@ sub do_sync {
 $D_action = '-';
   }

-  foreach my $pair (@{$status->{added}}) {
+  my $notmuch = spawn ({}, '|-', qw/notmuch tag --batch/)
+or die 'notmuch tag --batch';

-notmuch ('tag', $A_action.$TAGPREFIX.$pair->{tag},
-'id:'.$pair->{id});
+  foreach my $pair (@{$status->{added}}) {
+print $notmuch pair_to_batch_line ($A_action, $pair);
   }

   foreach my $pair (@{$status->{deleted}}) {
-notmuch ('tag', $D_action.$TAGPREFIX.$pair->{tag},
-'id:'.$pair->{id});
+print $notmuch pair_to_batch_line ($D_action, $pair);
   }

+  unless (close $notmuch) {
+die "'notmuch tag --batch' exited with nonzero value\n";
+  }
 }


-- 
1.7.10.4



[Patch v2 1/4] nmbug: use dump --format=batch-tag

2013-02-20 Thread da...@tethera.net
From: David Bremner 

This should make nmbug tolerate tags with whitespace and other special
characters in them.  At the moment this relies on _not_ passing calls to
notmuch tag through the shell, which is a documented feature of perl's
system function.
---
 devel/nmbug/nmbug |   27 ---
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/devel/nmbug/nmbug b/devel/nmbug/nmbug
index fe103b3..befc3d9 100755
--- a/devel/nmbug/nmbug
+++ b/devel/nmbug/nmbug
@@ -39,6 +39,11 @@ my %command = (
 status => \&do_status,
 );

+# Convert prefix into form suitable for literal matching against
+# notmuch dump --format=batch-tag output.
+my $ENCPREFIX = encode_for_fs ($TAGPREFIX);
+$ENCPREFIX =~ s/:/%3a/g;
+
 my $subcommand = shift || usage ();

 if (!exists $command{$subcommand}) {
@@ -203,9 +208,9 @@ sub index_tags {

   my $index = $NMBGIT.'/nmbug.index';

-  my $query = join ' ', map ("tag:$_", get_tags ($TAGPREFIX));
+  my $query = join ' ', map ("tag:\"$_\"", get_tags ($TAGPREFIX));

-  my $fh = spawn ('-|', qw/notmuch dump --/, $query)
+  my $fh = spawn ('-|', qw/notmuch dump --format=batch-tag --/, $query)
 or die "notmuch dump: $!";

   git ('read-tree', '--empty');
@@ -214,22 +219,30 @@ sub index_tags {
 or die 'git update-index';

   while (<$fh>) {
-m/ ( [^ ]* ) \s+ \( ([^\)]* ) \) /x || die 'syntax error in dump';
-my ($id,$rest) = ($1,$2);

-#strip prefixes before writing
-my @tags = grep { s/^$TAGPREFIX//; } split (' ', $rest);
+chomp();
+my ($rest,$id) = split(/ -- id:/);
+
+if ($id =~ s/^"(.*)"\s*$/$1/) {
+  # xapian quoted string, dequote.
+  $id =~ s/""/"/g;
+}
+
+#strip prefixes from tags before writing
+my @tags = grep { s/^[+]$ENCPREFIX//; } split (' ', $rest);
 index_tags_for_msg ($git,$id, 'A', @tags);
   }
   unless (close $git) {
 die "'git update-index --index-info' exited with nonzero value\n";
   }
   unless (close $fh) {
-die "'notmuch dump -- $query' exited with nonzero value\n";
+die "'notmuch dump --format=batch-tag -- $query' exited with nonzero value\n";
   }
   return $index;
 }

+# update the git index to either create or delete an empty file.
+# Neither argument should be encoded/escaped.
 sub index_tags_for_msg {
   my $fh = shift;
   my $msgid = shift;
-- 
1.7.10.4



Update for nmbug, round 2

2013-02-20 Thread da...@tethera.net
This obsoletes 

 id:1360374019-20988-1-git-send-email-david at tethera.net

This is less broken than the last version ;). I've used these patches for
a few days without ill effects. The first two patches use
batch-tagging, which should have some speedup. This includes fixes for
the issue about quoting that Tomi raised.

The second two patches are a style improvement and a bug fix for a bug
that probably not many people hit.



[Patch v2 1/4] nmbug: use dump --format=batch-tag

2013-02-20 Thread david
From: David Bremner brem...@debian.org

This should make nmbug tolerate tags with whitespace and other special
characters in them.  At the moment this relies on _not_ passing calls to
notmuch tag through the shell, which is a documented feature of perl's
system function.
---
 devel/nmbug/nmbug |   27 ---
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/devel/nmbug/nmbug b/devel/nmbug/nmbug
index fe103b3..befc3d9 100755
--- a/devel/nmbug/nmbug
+++ b/devel/nmbug/nmbug
@@ -39,6 +39,11 @@ my %command = (
 status => \&do_status,
 );
 
+# Convert prefix into form suitable for literal matching against
+# notmuch dump --format=batch-tag output.
+my $ENCPREFIX = encode_for_fs ($TAGPREFIX);
+$ENCPREFIX =~ s/:/%3a/g;
+
 my $subcommand = shift || usage ();
 
 if (!exists $command{$subcommand}) {
@@ -203,9 +208,9 @@ sub index_tags {
 
   my $index = $NMBGIT.'/nmbug.index';
 
-  my $query = join ' ', map ("tag:$_", get_tags ($TAGPREFIX));
+  my $query = join ' ', map ("tag:\"$_\"", get_tags ($TAGPREFIX));
 
-  my $fh = spawn ('-|', qw/notmuch dump --/, $query)
+  my $fh = spawn ('-|', qw/notmuch dump --format=batch-tag --/, $query)
 or die "notmuch dump: $!";
 
   git ('read-tree', '--empty');
@@ -214,22 +219,30 @@ sub index_tags {
 or die 'git update-index';
 
   while (<$fh>) {
-m/ ( [^ ]* ) \s+ \( ([^\)]* ) \) /x || die 'syntax error in dump';
-my ($id,$rest) = ($1,$2);
 
-#strip prefixes before writing
-my @tags = grep { s/^$TAGPREFIX//; } split (' ', $rest);
+chomp();
+my ($rest,$id) = split(/ -- id:/);
+
+if ($id =~ s/^"(.*)"\s*$/$1/) {
+  # xapian quoted string, dequote.
+  $id =~ s/""/"/g;
+}
+
+#strip prefixes from tags before writing
+my @tags = grep { s/^[+]$ENCPREFIX//; } split (' ', $rest);
 index_tags_for_msg ($git,$id, 'A', @tags);
   }
   unless (close $git) {
 die "'git update-index --index-info' exited with nonzero value\n";
   }
   unless (close $fh) {
-die "'notmuch dump -- $query' exited with nonzero value\n";
+die "'notmuch dump --format=batch-tag -- $query' exited with nonzero value\n";
   }
   return $index;
 }
 
+# update the git index to either create or delete an empty file.
+# Neither argument should be encoded/escaped.
 sub index_tags_for_msg {
   my $fh = shift;
   my $msgid = shift;
-- 
1.7.10.4

___
notmuch mailing list
notmuch@notmuchmail.org
http://notmuchmail.org/mailman/listinfo/notmuch


[Patch v2 2/4] nmbug: use 'notmuch tag --batch'

2013-02-20 Thread david
From: David Bremner brem...@debian.org

This should be more robust with respect to tags with whitespace and
other special characters. It also (hopefully) fixes a remaining
bug handling message-ids with whitespace.  It should also be
noticeably faster for large sets of changes since it does one exec per
change set as opposed to one exec per tag changed.
---
 devel/nmbug/nmbug |   27 ++-
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/devel/nmbug/nmbug b/devel/nmbug/nmbug
index befc3d9..73d64fe 100755
--- a/devel/nmbug/nmbug
+++ b/devel/nmbug/nmbug
@@ -267,6 +267,20 @@ sub do_checkout {
   do_sync (action => 'checkout');
 }
 
+sub quote_for_xapian {
+  my $str = shift;
+  $str =~ s/"/""/g;
+  return '"' . $str . '"';
+}
+
+sub pair_to_batch_line {
+  my ($action, $pair) = @_;
+
+  # the tag should already be suitably encoded
+
+  return $action . $ENCPREFIX . $pair->{tag} .
+' -- id:' . quote_for_xapian ($pair->{id})."\n";
+}
 
 sub do_sync {
 
@@ -283,17 +297,20 @@ sub do_sync {
 $D_action = '-';
   }
 
-  foreach my $pair (@{$status->{added}}) {
+  my $notmuch = spawn ({}, '|-', qw/notmuch tag --batch/)
+or die 'notmuch tag --batch';
 
-notmuch ('tag', $A_action.$TAGPREFIX.$pair->{tag},
-'id:'.$pair->{id});
+  foreach my $pair (@{$status->{added}}) {
+print $notmuch pair_to_batch_line ($A_action, $pair);
   }
 
   foreach my $pair (@{$status->{deleted}}) {
-notmuch ('tag', $D_action.$TAGPREFIX.$pair->{tag},
-'id:'.$pair->{id});
+print $notmuch pair_to_batch_line ($D_action, $pair);
   }
 
+  unless (close $notmuch) {
+die "'notmuch tag --batch' exited with nonzero value\n";
+  }
 }
 
 
-- 
1.7.10.4

___
notmuch mailing list
notmuch@notmuchmail.org
http://notmuchmail.org/mailman/listinfo/notmuch


[Patch v2 3/4] nmbug: replace hard-coded magic hash with git-hash-object

2013-02-20 Thread david
From: David Bremner brem...@debian.org

This is at least easier to understand than the magic hash. It may also
be a bit more robust, although it is hard to imagine these numbers
changing without many other changes in git.
---
 devel/nmbug/nmbug |6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/devel/nmbug/nmbug b/devel/nmbug/nmbug
index 73d64fe..b9c70e4 100755
--- a/devel/nmbug/nmbug
+++ b/devel/nmbug/nmbug
@@ -15,9 +15,6 @@ $NMBGIT .= '/.git' if (-d $NMBGIT.'/.git');
 
 my $TAGPREFIX = $ENV{NMBPREFIX} || 'notmuch::';
 
-# magic hash for git
-my $EMPTYBLOB = 'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391';
-
 # for encoding
 
 my $ESCAPE_CHAR =  '%';
@@ -50,6 +47,9 @@ if (!exists $command{$subcommand}) {
   usage ();
 }
 
+# magic hash for git
+my $EMPTYBLOB = git (qw{hash-object -t blob /dev/null});
+
 &{$command{$subcommand}}(@ARGV);
 
 sub git_pipe {
-- 
1.7.10.4

___
notmuch mailing list
notmuch@notmuchmail.org
http://notmuchmail.org/mailman/listinfo/notmuch


[Patch v2 4/4] nmbug: allow empty prefix

2013-02-20 Thread david
From: David Bremner brem...@debian.org

Current code does not distinguish between an empty string in the
NMBPREFIX environment variable and the variable being undefined. This
makes it impossible to define an empty prefix, if, e.g. somebody wants
to dump all of their tags with nmbug.
---
 devel/nmbug/nmbug |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/devel/nmbug/nmbug b/devel/nmbug/nmbug
index b9c70e4..90d98b6 100755
--- a/devel/nmbug/nmbug
+++ b/devel/nmbug/nmbug
@@ -13,7 +13,7 @@ my $NMBGIT = $ENV{NMBGIT} || $ENV{HOME}.'/.nmbug';
 
 $NMBGIT .= '/.git' if (-d $NMBGIT.'/.git');
 
-my $TAGPREFIX = $ENV{NMBPREFIX} || 'notmuch::';
+my $TAGPREFIX = defined($ENV{NMBPREFIX}) ? $ENV{NMBPREFIX} : 'notmuch::';
 
 # for encoding
 
-- 
1.7.10.4

___
notmuch mailing list
notmuch@notmuchmail.org
http://notmuchmail.org/mailman/listinfo/notmuch


Update for nmbug, round 2

2013-02-20 Thread david
This obsoletes 

 id:1360374019-20988-1-git-send-email-da...@tethera.net

This is less broken than the last version ;). I've used these patches for
a few days without ill effects. The first two patches use
batch-tagging, which should have some speedup. This includes fixes for
the issue about quoting that Tomi raised.

The second two patches are a style improvement and a bug fix for a bug
that probably not many people hit.

___
notmuch mailing list
notmuch@notmuchmail.org
http://notmuchmail.org/mailman/listinfo/notmuch


Re: On disk tag storage format

2013-02-20 Thread David Bremner
David Bremner da...@tethera.net writes:

 Austin outlined on IRC a way of representing tags on disk as hardlinks
 to messages. In order to make the discussion more concrete, I wrote a
 prototype in python to dump the notmuch database to this format. On my
 250k messages, this creates 40k new hardlinks, and uses about 5M of
 diskspace. The dump process takes about 20s on
 my core i7 machine.  With symbolic links, the same database takes about
 150M of disk space; this isn't great but it isn't unbearable either.


I've been playing a bit with this script and it seems more or less
usable as a way of mirroring the notmuch tag database to a link farm.

It's a bit faster than my current dump/restore based approach, although
if you want to keep the results in a git repository then it takes up
more space. Of course the bonus with this approach is that it creates
"virtual" maildirs for each tag that can be browsed with the maildir
client of choice.

The current default is to use some mix of hard and symbolic links to try
to balance the space consumed in a git repo versus the inode
consumption/performance issues of using too many symlinks.

It's still a prototype, and there is not much error checking, and there
are certain issues not dealt with at all (the ones I thought about are
commented).

# Copyright 2013, David Bremner da...@tethera.net

# Licensed under the same terms as notmuch.

import notmuch
import re
import os, errno
import sys
from collections import defaultdict
import argparse

# Skip automatic and maildir tags: these are never mirrored to the link farm.
# The pattern is anchored at both ends, so only exact tag names are skipped.
# (The first alternative was previously misspelled "attachement", which let
# the real "attachment" tag through.)
skiptags = re.compile(r'^(attachment|signed|encrypted|draft|flagged|passed|replied|unread)$')

# some random person on stack overflow suggests:

def mkdir_p(path):
    """Create directory `path` and any missing parents, like `mkdir -p`.

    An already existing directory is not an error.  Any other OSError is
    re-raised, including the case where `path` exists but is not a directory.
    """
    try:
        os.makedirs(path)
    except OSError as exc:
        # EEXIST is only harmless when what exists really is a directory.
        if exc.errno != errno.EEXIST or not os.path.isdir(path):
            raise

# Characters that are written to file names unchanged.  Every other character
# is replaced by '%' followed by its code point as (at least) two hex digits.
CHARSET = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+_@=.,-'

# Matches a single character outside CHARSET.  CHARSET ends with '-', so the
# dash sits last in the character class and is taken literally.
encode_re = '([^{0}])'.format(CHARSET)

# Matches a single escape sequence in the range %00 - %7f.
decode_re = '[%]([0-7][0-9A-Fa-f])'

def encode_one_char(match):
    """Return the '%xx' escape for the one character captured by `match`."""
    return '%{:02x}'.format(ord(match.group(1)))

def encode_for_fs(str):
    """Return `str` with every character outside CHARSET %-escaped."""
    return re.sub(encode_re, encode_one_char, str, 0)

def decode_one_char(match):
    """Return the character whose hex code point is captured by `match`."""
    return chr(int(match.group(1), 16))

def decode_from_fs(str):
    """Return `str` with every %00 - %7f escape replaced by its character."""
    return re.sub(decode_re, decode_one_char, str, 0)


def mk_tag_dir(tagdir):
    """Create the cur/, new/ and tmp/ subdirectories under `tagdir`.

    Missing parent directories are created as well, and directories that
    already exist are left alone (see mkdir_p).
    """
    for subdir in ('cur', 'new', 'tmp'):
        mkdir_p(os.path.join(tagdir, subdir))


# Regex fragment for the ':2,<flags>' suffix at the end of a file name.
flagpart = '(:2,[^:]*)'
flagre = re.compile(flagpart + '$')

def path_for_msg(dir, msg):
    """Return the path of the link for `msg` inside tag directory `dir`.

    The link lives in `dir`/cur and is named after the encoded message id.
    If the message's own file name ends in a ':2,<flags>' suffix, that suffix
    is carried over to the link name.
    """
    flagsmatch = flagre.search(msg.get_filename())
    if flagsmatch is None:
        flags = ''
    else:
        flags = flagsmatch.group(1)

    return os.path.join(dir, 'cur', encode_for_fs(msg.get_message_id()) + flags)


def unlink_message(dir, msg):
    """Remove every link to `msg` from the cur/ subdirectory of `dir`.

    A link is any file named after the encoded message id, optionally
    followed by a ':2,<flags>' suffix.
    """
    dir = os.path.join(dir, 'cur')

    # The encoded id can still contain '+' and '.', which are regex
    # metacharacters.  Escape it so the id only matches itself; unescaped, an
    # id containing '+' would never match its own file name.
    filepattern = re.escape(encode_for_fs(msg.get_message_id())) + flagpart + '?$'
    filere = re.compile(filepattern)

    for file in os.listdir(dir):
        if filere.match(file):
            os.unlink(os.path.join(dir, file))

def dir_for_tag(tag):
    """Return the directory under the global `tagroot` that holds `tag`.

    The tag is %-escaped with encode_for_fs to form the directory name.
    """
    enc_tag = encode_for_fs(tag)
    return os.path.join(tagroot, enc_tag)

# State of the link farm as found on disk, filled by read_tags_from_disk():
# disk_tags maps a decoded message id to the set of decoded tags it is linked
# under; disk_ids is the set of all decoded message ids seen.
disk_tags = defaultdict(set)
disk_ids = set()

def read_tags_from_disk(rootdir):
    """Walk `rootdir` and record every (message id, tag) pair found there.

    For each file, the message id is the file name up to the first ':' and
    the tag is the second-to-last '/'-separated component of the directory
    containing the file.  Both are decoded with decode_from_fs and stored in
    the module-level `disk_ids` and `disk_tags`.
    """
    for root, subFolders, files in os.walk(rootdir):
        for filename in files:
            msg_id = filename.split(':')[0]
            # NOTE(review): assumes files sit at <...>/<tag>/<subdir>/<file>;
            # a file directly in `rootdir` would break this - confirm.
            tag = root.split('/')[-2]
            decoded_id = decode_from_fs(msg_id)
            disk_ids.add(decoded_id)
            disk_tags[decoded_id].add(decode_from_fs(tag))

# Main program

# Command line: link style, sync direction, a numeric threshold, and the root
# directory of the link farm.
parser = argparse.ArgumentParser(description='Sync notmuch tag database to/from link farm')
parser.add_argument('-l', '--link-style', choices=['hard', 'symbolic', 'adaptive'],
                    default='adaptive', dest='link_style')
parser.add_argument('-d', '--destination', choices=['disk', 'notmuch'], default='disk',
                    dest='destination')
# Plain int literal rather than the Python 2-only long literal 5L.
parser.add_argument('-t', '--threshold', default=5, type=int, dest='threshold')

parser.add_argument('tagroot')

opts = parser.parse_args()

tagroot = opts.tagroot

sync_from_links = (opts.destination == 'notmuch')

read_tags_from_disk(tagroot)

# The database is opened read-write only when notmuch is the destination.
if sync_from_links:
    db = notmuch.Database(mode=notmuch.Database.MODE.READ_WRITE)
else:
    db = notmuch.Database(mode=notmuch.Database.MODE.READ_ONLY)

# Query for every message carrying at least one tag that is not skipped.
dbtags = filter(lambda tag: not skiptags.match(tag), db.get_all_tags())

querystr = ' OR '.join(map(lambda tag: 'tag:' + tag, dbtags))

q_new = notmuch.Query(db, querystr)
q_new.set_sort(notmuch.Query.SORT.UNSORTED)
for msg in q_new.search_messages():

# silently ignore empty tags
db_tags = set(filter (lambda tag: tag != '' and not skiptags.match(tag),
  msg.get_tags()))

message_id = msg.get_message_id()

disk_ids.discard(message_id)

missing_on_disk = db_tags.difference(disk_tags[message_id])
missing_in_db =