Hi
I've just updated three patches against the latest release, 2.3.21,
in case they are useful to someone or might be considered for official
inclusion.
John
dovecot-2.3.21-tika-http-auth.patch
Allows a username and password to be specified in the fts_tika setting for basic
auth against the Tika server. For example:
fts_tika = https://user:password@tika_server:443/tika
dovecot-2.3.21-solr-max-size.patch
This is a simplified version of my previous patch. It sets a size limit
(the fts_max_size setting) on message bodies that are to be indexed. Message
bodies for messages larger than fts_max_size are not sent to Solr. It defaults to
zero, which means no limit. For example:
fts_max_size = 10M
dovecot-2.3.21-solr-max-rows.patch
When Dovecot sends a search to Solr, it uses the rows parameter. For a
multi-mailbox search the value used is SOLR_MAX_MULTI_ROWS, hardcoded to 100000. For
a single-mailbox search the value is uidnext. This patch introduces an upper
limit for single-mailbox searches using the same value as SOLR_MAX_MULTI_ROWS,
while keeping the existing behaviour of sending the uidnext value if it is
smaller. This is just to place a more reasonable upper bound, since uidnext can
get much larger.

--- dovecot-2.3.21/src/plugins/fts/fts-parser-tika.c	2023-09-14 15:17:47.000000000 +0200
+++ dovecot-2.3.21-new/src/plugins/fts/fts-parser-tika.c	2023-12-09 11:07:46.436259394 +0100
@@ -57,7 +57,7 @@
 	tuser = p_new(user->pool, struct fts_parser_tika_user, 1);
 	MODULE_CONTEXT_SET(user, fts_parser_tika_user_module, tuser);
 
-	if (http_url_parse(url, NULL, 0, user->pool,
+	if (http_url_parse(url, NULL, HTTP_URL_ALLOW_USERINFO_PART, user->pool,
 			   &tuser->http_url, &error) < 0) {
 		i_error("fts_tika: Failed to parse HTTP url %s: %s", url, error);
 		return -1;
@@ -159,6 +159,11 @@
 			http_url->host.name,
 			t_strconcat(http_url->path, http_url->enc_query, NULL),
 			fts_tika_parser_response, parser);
+	if (http_url->user != NULL) {
+		http_client_request_set_auth_simple(
+			http_req, http_url->user, http_url->password);
+	}
+
 	http_client_request_set_port(http_req, http_url->port);
 	http_client_request_set_ssl(http_req, http_url->have_ssl);
 	if (parser_context->content_type != NULL)
--- dovecot-2.3.21/src/plugins/fts/fts-build-mail.c	2023-09-14 15:17:47.000000000 +0200
+++ dovecot-2.3.21-new/src/plugins/fts/fts-build-mail.c	2023-12-09 11:04:02.205207091 +0100
@@ -17,6 +17,7 @@
 #include "fts-filter.h"
 #include "fts-api-private.h"
 #include "fts-build-mail.h"
+#include "settings-parser.h"
 
 /* there are other characters as well, but this doesn't have to be exact */
 #define IS_WORD_WHITESPACE(c) \
@@ -573,6 +574,18 @@
 	bool binary_body;
 	const char *error;
 	int ret;
+	uoff_t msg_size;
+	uoff_t fts_max_size = 0;
+	const char * fts_max_size_setting;
+	bool oversized_msg;
+
+	fts_max_size_setting = mail_user_plugin_getenv(update_ctx->backend->ns->user, "fts_max_size");
+	if (fts_max_size_setting != NULL) {
+		if (settings_get_size(fts_max_size_setting, &fts_max_size, &error) < 0) {
+		i_error("%s",error);
+			fts_max_size = 0;
+		}
+	}
 
 	*may_need_retry_r = FALSE;
 	if (mail_get_stream_because(mail, NULL, NULL, "fts indexing", &input) < 0) {
@@ -583,6 +596,14 @@
 		return -1;
 	}
 
+	oversized_msg = FALSE;
+	i_stream_get_size(input,TRUE,&msg_size);
+	if (fts_max_size > 0 && msg_size > fts_max_size) {
+		i_info("Skipping message body indexing because size %"PRIuUOFF_T" exceeds setting fts_max_size %s",msg_size,fts_max_size_setting);
+		oversized_msg = TRUE;
+	}
+
+
 	i_zero(&ctx);
 	ctx.update_ctx = update_ctx;
 	ctx.mail = mail;
@@ -640,7 +661,7 @@
 				message_decoder_set_return_binary(decoder, TRUE);
 			body_part = TRUE;
 		} else {
-			if (skip_body)
+			if (skip_body||oversized_msg)
 				continue;
 		}
 
@@ -675,7 +696,7 @@
 		else
 			(void)fts_parser_deinit(&ctx.body_parser, NULL);
 	}
-	if (ret == 0 && body_part && !skip_body && !body_added) {
+	if (ret == 0 && body_part && !skip_body && !oversized_msg && !body_added) {
 		/* make sure body is added even when it doesn't exist */
 		block.data = NULL; block.size = 0;
 		ret = fts_build_body_block(&ctx, &block, TRUE);
--- dovecot-2.3.21/src/plugins/fts-solr/fts-backend-solr.c	2023-09-14 15:17:47.000000000 +0200
+++ dovecot-2.3.21-new/src/plugins/fts-solr/fts-backend-solr.c	2023-12-09 10:46:52.976808250 +0100
@@ -837,7 +837,7 @@
 
 	str = t_str_new(256);
 	str_printfa(str, "wt=xml&fl=uid,score&rows=%u&sort=uid+asc&q=%%7b!lucene+q.op%%3dAND%%7d",
-		    status.uidnext);
+		   I_MIN(status.uidnext,SOLR_MAX_MULTI_ROWS)); 
 	prefix_len = str_len(str);
 
 	if (solr_add_definite_query_args(str, args, and_args)) {
_______________________________________________
dovecot mailing list -- dovecot@dovecot.org
To unsubscribe send an email to dovecot-le...@dovecot.org

Reply via email to