Wietse Venema: > The long-term solution is to implement an asynchronous atrace_flush() > operation, similar to the asynchronous abounce_flush() and > adefer_flush() operations (another solution, moving work from the > qmgr to the bounce/defer/flush daemons, would require a structural > redesign and that is not an option at this time). > > This long-term solution would involve: > > - Cloning a few lines of existing code in src/global/abounce.[hc]. > > - Replacing the trace_flush() call in src/qmgr/qmgr_active.c by > atrace_flush() and a call-back function, similar to the way that > the abounce_flush() and adefer_flush() operations are used. > > That is under a day of work, but I have three deadlines early May. > > However, if code can't be changed, then configuration must be > changed: announce DSN support more selectively.
Actually, it took much less time, because this was indeed a trivial job of cloning a few lines of already-existing code. There are no changes to existing protocols or APIs. In the attached patch, all work is done by less than half a dozen new lines of code: three lines in abounce.c, and four lines in each of the two identical qmgr_active_done.c files. The remainder of the patch is spent on comments and declarations. You may want to reconsider whether qmgr can/can't be changed because this change is really very localized. Even if this part of qmgr is customized (which seems unlikely) this should be easy to paste in by hand. Wietse
[This patch applies cleanly to Postfix 2.3 and later] 20110410 Performance: a high load of DSN success notification requests could stall the queue manager. Solution: make the trace client asynchronous, just like the bounce and defer clients. Problem reported by Eduardo M. Stelmaszczyk of terra.com.br, a very good Postfix customer. Files: global/abounce.[hc], *qmgr/qmgr_active.c (the qmgr_active.c files are identical). diff -cr /var/tmp/postfix-old/src/global/abounce.c ./src/global/abounce.c *** /var/tmp/postfix-old/src/global/abounce.c Tue May 15 16:12:49 2007 --- ./src/global/abounce.c Wed Apr 20 19:43:51 2011 *************** *** 67,75 **** /* int dsn_ret; /* void (*callback)(int status, char *context); /* char *context; /* DESCRIPTION /* This module implements an asynchronous interface to the ! /* bounce/defer service for submitting sender notifications /* without waiting for completion of the request. /* /* abounce_flush() bounces the specified message to --- 67,87 ---- /* int dsn_ret; /* void (*callback)(int status, char *context); /* char *context; + /* + /* void atrace_flush(flags, queue, id, encoding, sender, + /* dsn_envid, dsn_ret, callback, context) + /* int flags; + /* const char *queue; + /* const char *id; + /* const char *encoding; + /* const char *sender; + /* const char *dsn_envid; + /* int dsn_ret; + /* void (*callback)(int status, char *context); + /* char *context; /* DESCRIPTION /* This module implements an asynchronous interface to the ! /* bounce/defer/trace service for submitting sender notifications /* without waiting for completion of the request. /* /* abounce_flush() bounces the specified message to *************** *** 92,97 **** --- 104,113 ---- /* the specified sender, including the defer log that was /* built with defer_append(). /* + /* atrace_flush() returns the specified message to the specified + /* sender, including the message delivery record log that was + /* built with vtrace_append(). + /* /* Arguments: /* .IP flags /* The bitwise OR of zero or more of the following (specify *************** *** 359,361 **** --- 375,389 ---- flags, queue, id, encoding, sender, dsn_envid, dsn_ret, callback, context); } + + /* atrace_flush - asynchronous trace flush */ + + void atrace_flush(int flags, const char *queue, const char *id, + const char *encoding, const char *sender, + const char *dsn_envid, int dsn_ret, + ABOUNCE_FN callback, char *context) + { + abounce_request(MAIL_CLASS_PRIVATE, var_trace_service, BOUNCE_CMD_TRACE, + flags, queue, id, encoding, sender, dsn_envid, dsn_ret, + callback, context); + } diff -cr /var/tmp/postfix-old/src/global/abounce.h ./src/global/abounce.h *** /var/tmp/postfix-old/src/global/abounce.h Tue May 17 13:36:13 2005 --- ./src/global/abounce.h Wed Apr 20 17:41:35 2011 *************** *** 24,29 **** --- 24,30 ---- extern void abounce_flush(int, const char *, const char *, const char *, const char *, const char *, int, ABOUNCE_FN, char *); extern void adefer_flush(int, const char *, const char *, const char *, const char *, const char *, int, ABOUNCE_FN, char *); extern void adefer_warn(int, const char *, const char *, const char *, const char *, const char *, int, ABOUNCE_FN, char *); + extern void atrace_flush(int, const char *, const char *, const char *, const char *, const char *, int, ABOUNCE_FN, char *); extern void abounce_flush_verp(int, const char *, const char *, const char *, const char *, const char *, int, const char *, ABOUNCE_FN, char *); extern void adefer_flush_verp(int, const char *, const char *, const char *, const char *, const char *, int, const char *, ABOUNCE_FN, char *); diff -cr /var/tmp/postfix-old/src/oqmgr/qmgr_active.c ./src/oqmgr/qmgr_active.c *** /var/tmp/postfix-old/src/oqmgr/qmgr_active.c Wed Oct 3 20:19:21 2007 --- ./src/oqmgr/qmgr_active.c Wed Apr 20 19:39:54 2011 *************** *** 116,121 **** --- 116,123 ---- */ static void qmgr_active_done_2_bounce_flush(int, char *); static void qmgr_active_done_2_generic(QMGR_MESSAGE *); + static void qmgr_active_done_25_trace_flush(int, char *); + static void qmgr_active_done_25_generic(QMGR_MESSAGE *); static void qmgr_active_done_3_defer_flush(int, char *); static void qmgr_active_done_3_defer_warn(int, char *); static void qmgr_active_done_3_generic(QMGR_MESSAGE *); *************** *** 336,345 **** static void qmgr_active_done_2_generic(QMGR_MESSAGE *message) { - const char *myname = "qmgr_active_done_2_generic"; const char *path; struct stat st; - int status; /* * A delivery agent marks a queue file as corrupt by changing its --- 338,345 ---- *************** *** 372,381 **** } /* - * As a temporary implementation, synchronously inform the sender of - * trace information. This will block for 10 seconds when the qmgr FIFO - * is full. - * * XXX With multi-recipient mail, some recipients may have NOTIFY=SUCCESS * and others not. Depending on what subset of recipients are delivered, * a trace file may or may not be created. Even when the last partial --- 372,377 ---- *************** *** 388,406 **** */ if ((message->tflags & (DEL_REQ_FLAG_USR_VRFY | DEL_REQ_FLAG_RECORD)) || (message->rflags & QMGR_READ_FLAG_NOTIFY_SUCCESS)) { ! status = trace_flush(message->tflags, ! message->queue_name, ! message->queue_id, ! message->encoding, ! message->sender, ! message->dsn_envid, ! message->dsn_ret); ! if (status == 0 && message->tflags_offset) ! qmgr_message_kill_record(message, message->tflags_offset); ! message->flags |= status; } /* * If we get to this point we have tried all recipients for this message. * If the message is too old, try to bounce it. * --- 384,429 ---- */ if ((message->tflags & (DEL_REQ_FLAG_USR_VRFY | DEL_REQ_FLAG_RECORD)) || (message->rflags & QMGR_READ_FLAG_NOTIFY_SUCCESS)) { ! atrace_flush(message->tflags, ! message->queue_name, ! message->queue_id, ! message->encoding, ! message->sender, ! message->dsn_envid, ! message->dsn_ret, ! qmgr_active_done_25_trace_flush, ! (char *) message); ! return; } /* + * Asynchronous processing does not reach this point. + */ + qmgr_active_done_25_generic(message); + } + + /* qmgr_active_done_25_trace_flush - continue after atrace_flush() completion */ + + static void qmgr_active_done_25_trace_flush(int status, char *context) + { + QMGR_MESSAGE *message = (QMGR_MESSAGE *) context; + + /* + * Process atrace_flush() status and continue processing. + */ + if (status == 0 && message->tflags_offset) + qmgr_message_kill_record(message, message->tflags_offset); + message->flags |= status; + qmgr_active_done_25_generic(message); + } + + /* qmgr_active_done_25_generic - continue processing */ + + static void qmgr_active_done_25_generic(QMGR_MESSAGE *message) + { + const char *myname = "qmgr_active_done_25_generic"; + + /* * If we get to this point we have tried all recipients for this message. * If the message is too old, try to bounce it. * diff -cr /var/tmp/postfix-old/src/qmgr/qmgr_active.c ./src/qmgr/qmgr_active.c *** /var/tmp/postfix-old/src/qmgr/qmgr_active.c Sun Sep 23 08:24:15 2007 --- ./src/qmgr/qmgr_active.c Wed Apr 20 19:39:54 2011 *************** *** 116,121 **** --- 116,123 ---- */ static void qmgr_active_done_2_bounce_flush(int, char *); static void qmgr_active_done_2_generic(QMGR_MESSAGE *); + static void qmgr_active_done_25_trace_flush(int, char *); + static void qmgr_active_done_25_generic(QMGR_MESSAGE *); static void qmgr_active_done_3_defer_flush(int, char *); static void qmgr_active_done_3_defer_warn(int, char *); static void qmgr_active_done_3_generic(QMGR_MESSAGE *); *************** *** 336,345 **** static void qmgr_active_done_2_generic(QMGR_MESSAGE *message) { - const char *myname = "qmgr_active_done_2_generic"; const char *path; struct stat st; - int status; /* * A delivery agent marks a queue file as corrupt by changing its --- 338,345 ---- *************** *** 372,381 **** } /* - * As a temporary implementation, synchronously inform the sender of - * trace information. This will block for 10 seconds when the qmgr FIFO - * is full. - * * XXX With multi-recipient mail, some recipients may have NOTIFY=SUCCESS * and others not. Depending on what subset of recipients are delivered, * a trace file may or may not be created. Even when the last partial --- 372,377 ---- *************** *** 388,406 **** */ if ((message->tflags & (DEL_REQ_FLAG_USR_VRFY | DEL_REQ_FLAG_RECORD)) || (message->rflags & QMGR_READ_FLAG_NOTIFY_SUCCESS)) { ! status = trace_flush(message->tflags, ! message->queue_name, ! message->queue_id, ! message->encoding, ! message->sender, ! message->dsn_envid, ! message->dsn_ret); ! if (status == 0 && message->tflags_offset) ! qmgr_message_kill_record(message, message->tflags_offset); ! message->flags |= status; } /* * If we get to this point we have tried all recipients for this message. * If the message is too old, try to bounce it. * --- 384,429 ---- */ if ((message->tflags & (DEL_REQ_FLAG_USR_VRFY | DEL_REQ_FLAG_RECORD)) || (message->rflags & QMGR_READ_FLAG_NOTIFY_SUCCESS)) { ! atrace_flush(message->tflags, ! message->queue_name, ! message->queue_id, ! message->encoding, ! message->sender, ! message->dsn_envid, ! message->dsn_ret, ! qmgr_active_done_25_trace_flush, ! (char *) message); ! return; } /* + * Asynchronous processing does not reach this point. + */ + qmgr_active_done_25_generic(message); + } + + /* qmgr_active_done_25_trace_flush - continue after atrace_flush() completion */ + + static void qmgr_active_done_25_trace_flush(int status, char *context) + { + QMGR_MESSAGE *message = (QMGR_MESSAGE *) context; + + /* + * Process atrace_flush() status and continue processing. + */ + if (status == 0 && message->tflags_offset) + qmgr_message_kill_record(message, message->tflags_offset); + message->flags |= status; + qmgr_active_done_25_generic(message); + } + + /* qmgr_active_done_25_generic - continue processing */ + + static void qmgr_active_done_25_generic(QMGR_MESSAGE *message) + { + const char *myname = "qmgr_active_done_25_generic"; + + /* * If we get to this point we have tried all recipients for this message. * If the message is too old, try to bounce it. *