Some time ago I complained about very slow access to compressed mboxes.
Unfortunately it looks like there is very little interest in it, so I
had to investigate some things myself.

Firstly: some rationale.
Why do I prefer to use mbox/maildir over mdbox? Short answer: the "bus factor"
of mdbox support (not only dovecot).
Longer answer: if something goes wrong with maildir/mbox I can use
other tools (mutt, formail, or even a text editor), and with mdbox ...

I am not an ISP; I use dovecot as a "gateway" to my (rather huge) mail
archive. Most of these mails are rather valuable to me, so I prefer to use
something "well-known-and-tested".
(I can't do what most ISPs do: write in the "Terms of Service" that mail
can be lost or damaged and that we give no warranty :) )

So then:

Below is my patch.
It contains 2 changes:
1. When the buffer is compressed, we try to preserve the buffered data from the last marked offset.
2. Increase the temporary buffer used for decompression.

Without these changes, a 1.5 GB bzip-compressed mbox with ~20K messages
can't be opened in 1.5 days.
After applying change 1 it can be opened in ~1.5 h.
With both changes it takes a few minutes.

Maybe it would be a good idea to add a config parameter to specify the size
of the decompression buffer?


Patch is against v2.0.18


diff -x '*.o' -x '*.lo' -x '*.la' -u -r ../dovecot-2.0.18/src/lib/istream.c ./src/lib/istream.c
--- ../dovecot-2.0.18/src/lib/istream.c	2011-12-13 12:38:27.000000000 +0100
+++ ./src/lib/istream.c	2012-04-14 10:27:23.790724625 +0200
@@ -452,6 +452,22 @@
 	stream->pos -= stream->skip;
 
 	stream->skip = 0;
+
+}
+
+void i_stream_compress1(struct istream_private *stream, size_t bytes )
+{
+
+    size_t lskip ;
+
+	lskip = (stream->skip > bytes ? bytes : stream->skip );
+
+	memmove(stream->w_buffer, stream->w_buffer + lskip ,
+		stream->pos - lskip);
+	stream->pos -= lskip;
+	stream->skip -= lskip;
+
+
 }
 
 void i_stream_grow_buffer(struct istream_private *stream, size_t bytes)
diff -x '*.o' -x '*.lo' -x '*.la' -u -r ../dovecot-2.0.18/src/lib/istream-internal.h ./src/lib/istream-internal.h
--- ../dovecot-2.0.18/src/lib/istream-internal.h	2011-12-13 12:38:27.000000000 +0100
+++ ./src/lib/istream-internal.h	2012-04-13 00:06:27.700298378 +0200
@@ -51,6 +51,7 @@
 i_stream_create(struct istream_private *stream, struct istream *parent, int fd);
 
 void i_stream_compress(struct istream_private *stream);
+void i_stream_compress1(struct istream_private *stream, size_t bytes );
 void i_stream_grow_buffer(struct istream_private *stream, size_t bytes);
 bool i_stream_get_buffer_space(struct istream_private *stream,
 			       size_t wanted_size, size_t *size_r);
 
diff -x '*.o' -x '*.lo' -x '*.la' -u -r ../dovecot-2.0.18/src/plugins/zlib/istream-bzlib.c ./src/plugins/zlib/istream-bzlib.c
--- ../dovecot-2.0.18/src/plugins/zlib/istream-bzlib.c	2012-02-09 18:32:48.000000000 +0100
+++ ./src/plugins/zlib/istream-bzlib.c	2012-04-14 10:35:04.349800777 +0200
@@ -9,12 +9,14 @@
 #include <bzlib.h>
 
 #define CHUNK_SIZE (1024*64)
+#define BUFF_SIZE (1024*1024*16)
 
 struct bzlib_istream {
 	struct istream_private istream;
-
+	
 	bz_stream zs;
 	uoff_t eof_offset, stream_size;
+	uoff_t marked_offset;
 	size_t prev_size, high_pos;
 	struct stat last_parent_statbuf;
 
@@ -48,7 +50,6 @@
 	uoff_t high_offset;
 	size_t size;
 	int ret;
-
 	high_offset = stream->istream.v_offset + (stream->pos - stream->skip);
 	if (zstream->eof_offset == high_offset) {
 		i_assert(zstream->high_pos == 0 ||
@@ -87,7 +88,14 @@
 		if (stream->pos == stream->buffer_size) {
 			if (stream->skip > 0) {
 				/* lose our buffer cache */
-				i_stream_compress(stream);
+				/* try to save our buffer cache as much as possible */
+
+				if (zstream->marked && (stream-> skip - (stream->istream.v_offset - zstream->marked_offset)) >0 ){
+					
+					i_stream_compress1(stream, stream-> skip - (stream->istream.v_offset - zstream->marked_offset));
+				} else {
+					i_stream_compress(stream);
+				}
 			}
 
 			if (stream->pos == stream->buffer_size)
@@ -215,8 +223,12 @@
 	struct bzlib_istream *zstream = (struct bzlib_istream *) stream;
 	uoff_t start_offset = stream->istream.v_offset - stream->skip;
 
+	if (mark) 
+		zstream->marked_offset = v_offset;		
 	if (v_offset < start_offset) {
 		/* have to seek backwards */
+
+	
 		i_stream_bzlib_reset(zstream);
 		start_offset = 0;
 	} else if (zstream->high_pos != 0) {
@@ -243,6 +255,7 @@
 			}
 
 			i_stream_skip(&stream->istream, avail);
+
 		} while (i_stream_read(&stream->istream) >= 0);
 
 		if (stream->istream.v_offset != v_offset) {
@@ -260,8 +273,11 @@
 		}
 	}
 
-	if (mark)
+	if (mark){
 		zstream->marked = TRUE;
+		zstream->marked_offset = v_offset;
+	}
+
 }
 
 static const struct stat *
@@ -329,7 +345,9 @@
 	i_stream_bzlib_init(zstream);
 
 	zstream->istream.iostream.close = i_stream_bzlib_close;
-	zstream->istream.max_buffer_size = input->real_stream->max_buffer_size;
+	//	zstream->istream.max_buffer_size = (input->real_stream->max_buffer_size);
+	zstream->istream.max_buffer_size = BUFF_SIZE;
+
 	zstream->istream.read = i_stream_bzlib_read;
 	zstream->istream.seek = i_stream_bzlib_seek;
 	zstream->istream.stat = i_stream_bzlib_stat;
-- 
Gdyby ktoś miał zbędny Toshiba G450 - to chętnie przejmę ;)
< asuffield> a workstation is anything you can stick on somebodies desk
             and con them into using
                -- in #debian-devel

Reply via email to