Hi. As mentioned in a previous thread, the solr + tika combination has caused me some issues due to attachment size. While tika seems to be able to parse large attachments, the resulting volume of text can overwhelm the solr server.
One solution would be to throw resources at the problem, but in my case such large attachments don't contain anything worth indexing. Additionally, I don't want people to be able to randomly crash my solr server by sending large compressed attachments that expand into huge volumes for solr. It's also a safety feature to have sane limits on what can be indexed. Attached is a first attempt to address the problem. I did not find a way to easily get actual attachment sizes, so I used a piece of information that is already available: the overall message size. It may not be ideal, but it at least introduces limits where none existed. I have introduced two new parameters for the plugin section, for example: plugin { fts_max_size = 2M fts_max_size_tika = 1M } They can be used separately or together. Both sizes refer to the overall message size. The meaning is: fts_max_size - do not parse message bodies if the message size exceeds this value. A value of 0 indicates no limit. If the message body is not parsed, attachments are also not parsed. fts_max_size_tika - do not parse message attachments with tika if the message size exceeds this value. A value of 0 indicates no limit. If using both settings, it makes sense to have fts_max_size > fts_max_size_tika, since with a smaller fts_max_size, bodies (including attachments) are not indexed and fts_max_size_tika will have no effect. The difference (fts_max_size - fts_max_size_tika) places an upper bound on the size of the non-attachment body text that will be indexed. However, any attachments over fts_max_size will automatically consume this limit and no body text will be indexed for those messages. I've only updated the tika parser, not the script parser, though the script parser could potentially benefit from this approach. The attached patch also includes the rolled-up patch for using basic auth with the tika server and the previously posted patch (not mine) which fixes an assertion failure when using solr and tika together. John
diff -ur dovecot-2.3.11.3-orig/src/plugins/fts/fts-build-mail.c dovecot-2.3.11.3/src/plugins/fts/fts-build-mail.c --- dovecot-2.3.11.3-orig/src/plugins/fts/fts-build-mail.c 2020-08-12 14:20:41.000000000 +0200 +++ dovecot-2.3.11.3/src/plugins/fts/fts-build-mail.c 2020-12-07 14:05:23.654217555 +0100 @@ -17,6 +17,7 @@ #include "fts-filter.h" #include "fts-api-private.h" #include "fts-build-mail.h" +#include "settings-parser.h" /* there are other characters as well, but this doesn't have to be exact */ #define IS_WORD_WHITESPACE(c) \ @@ -34,6 +35,7 @@ buffer_t *word_buf, *pending_input; struct fts_user_language *cur_user_lang; + bool oversized_tika; }; static int fts_build_data(struct fts_mail_build_context *ctx, @@ -236,7 +238,7 @@ parser_context.user = mail_storage_get_user(storage); parser_context.content_disposition = ctx->content_disposition; - + parser_context.oversized_tika = ctx->oversized_tika; if (fts_parser_init(&parser_context, &ctx->body_parser)) { /* extract text using the the returned parser */ *binary_body_r = TRUE; @@ -488,7 +490,32 @@ bool binary_body; const char *error; int ret; - + uoff_t msg_size; + uoff_t fts_max_size = 0; + uoff_t fts_max_size_tika = 0; + const char * fts_max_size_setting; + const char * fts_max_size_tika_setting; + bool oversized_msg; + bool oversized_tika; + + fts_max_size_setting = mail_user_plugin_getenv(update_ctx->backend->ns->user, "fts_max_size"); + if (fts_max_size_setting != NULL) { + i_debug("fts_max_size %s",fts_max_size_setting); + if (settings_get_size(fts_max_size_setting, &fts_max_size, &error) < 0) { + i_error("%s",error); + fts_max_size = 0; + } + i_debug("fts_max_size (value) %"PRIuUOFF_T,fts_max_size); + } + fts_max_size_tika_setting = mail_user_plugin_getenv(update_ctx->backend->ns->user, "fts_max_size_tika"); + if (fts_max_size_tika_setting != NULL) { + i_debug("fts_max_size_tika %s",fts_max_size_tika_setting); + if (settings_get_size(fts_max_size_tika_setting, &fts_max_size_tika, &error) < 0) { + 
i_error("%s",error); + fts_max_size_tika = 0; + } + i_debug("fts_max_size_tika (value) %"PRIuUOFF_T,fts_max_size_tika); + } *may_need_retry_r = FALSE; if (mail_get_stream_because(mail, NULL, NULL, "fts indexing", &input) < 0) { if (mail->expunged) @@ -498,10 +525,21 @@ mailbox_get_last_internal_error(mail->box, NULL)); return -1; } - + oversized_msg = FALSE; + oversized_tika = FALSE; + i_stream_get_size(input,TRUE,&msg_size); + if (fts_max_size > 0 && msg_size > fts_max_size) { + i_info("Skipping message body indexing because size %"PRIuUOFF_T" exceeds setting fts_max_size %s",msg_size,fts_max_size_setting); + oversized_msg = TRUE; + } + if (fts_max_size_tika > 0 && msg_size > fts_max_size_tika) { + i_info("Skipping message attachment indexing because size %"PRIuUOFF_T" exceeds setting fts_max_size_tika %s",msg_size,fts_max_size_tika_setting); + oversized_tika = TRUE; + } i_zero(&ctx); ctx.update_ctx = update_ctx; ctx.mail = mail; + ctx.oversized_tika = oversized_tika; if ((update_ctx->backend->flags & FTS_BACKEND_FLAG_TOKENIZED_INPUT) != 0) ctx.pending_input = buffer_create_dynamic(default_pool, 128); @@ -556,7 +594,7 @@ message_decoder_set_return_binary(decoder, TRUE); body_part = TRUE; } else { - if (skip_body) + if (skip_body ||oversized_msg) continue; } @@ -590,7 +628,7 @@ else (void)fts_parser_deinit(&ctx.body_parser, NULL); } - if (ret == 0 && body_part && !skip_body && !body_added) { + if (ret == 0 && body_part && !skip_body && !oversized_msg && !body_added) { /* make sure body is added even when it doesn't exist */ block.data = NULL; block.size = 0; ret = fts_build_body_block(&ctx, &block, TRUE); diff -ur dovecot-2.3.11.3-orig/src/plugins/fts/fts-parser.h dovecot-2.3.11.3/src/plugins/fts/fts-parser.h --- dovecot-2.3.11.3-orig/src/plugins/fts/fts-parser.h 2020-08-12 14:20:41.000000000 +0200 +++ dovecot-2.3.11.3/src/plugins/fts/fts-parser.h 2020-12-07 12:42:55.653635916 +0100 @@ -10,6 +10,7 @@ /* Can't be NULL */ const char *content_type; const char 
*content_disposition; + bool oversized_tika; }; struct fts_parser_vfuncs { diff -ur dovecot-2.3.11.3-orig/src/plugins/fts/fts-parser-tika.c dovecot-2.3.11.3/src/plugins/fts/fts-parser-tika.c --- dovecot-2.3.11.3-orig/src/plugins/fts/fts-parser-tika.c 2020-08-12 14:20:41.000000000 +0200 +++ dovecot-2.3.11.3/src/plugins/fts/fts-parser-tika.c 2020-12-07 13:01:33.732476038 +0100 @@ -57,7 +57,7 @@ tuser = p_new(user->pool, struct fts_parser_tika_user, 1); MODULE_CONTEXT_SET(user, fts_parser_tika_user_module, tuser); - if (http_url_parse(url, NULL, 0, user->pool, + if (http_url_parse(url, NULL, HTTP_URL_ALLOW_USERINFO_PART, user->pool, &tuser->http_url, &error) < 0) { i_error("fts_tika: Failed to parse HTTP url %s: %s", url, error); return -1; @@ -77,7 +77,8 @@ http_set.request_timeout_msecs = 60*1000; http_set.ssl = &ssl_set; http_set.debug = user->mail_debug; - tika_http_client = http_client_init(&http_set); + tika_http_client = http_client_init_private(&http_set); } *http_url_r = tuser->http_url; return 0; @@ -141,6 +142,10 @@ if (tika_get_http_client_url(parser_context->user, &http_url) < 0) return NULL; + if (parser_context->oversized_tika) { + i_info("skipping tika parser due to oversized message"); + return NULL; + } if (http_url->path == NULL) http_url->path = "/"; @@ -152,6 +157,11 @@ http_url->host.name, t_strconcat(http_url->path, http_url->enc_query, NULL), fts_tika_parser_response, parser); + if (http_url->user != NULL) { + http_client_request_set_auth_simple( + http_req, http_url->user, http_url->password); + } + http_client_request_set_port(http_req, http_url->port); http_client_request_set_ssl(http_req, http_url->have_ssl); if (parser_context->content_type != NULL) diff -ur dovecot-2.3.11.3-orig/src/plugins/fts-solr/solr-connection.c dovecot-2.3.11.3/src/plugins/fts-solr/solr-connection.c --- dovecot-2.3.11.3-orig/src/plugins/fts-solr/solr-connection.c 2020-08-12 14:20:41.000000000 +0200 +++ dovecot-2.3.11.3/src/plugins/fts-solr/solr-connection.c 
2020-11-15 18:34:13.657576104 +0100 @@ -103,7 +103,8 @@ http_set.ssl = ssl_client_set; http_set.debug = solr_set->debug; http_set.rawlog_dir = solr_set->rawlog_dir; - solr_http_client = http_client_init(&http_set); + solr_http_client = http_client_init_private(&http_set); } *conn_r = conn;