On Wed, May 13, 2026 at 12:51:09PM +0200, Mickaël Salaün wrote:
> matches_log_fs() and other audit_match_record() callers intermittently
> return -EAGAIN under heavy debug configs (KASAN, lockdep).  The audit
> record delivery pipeline is asynchronous: landlock_log_denial() queues
> the record to audit_queue, and kauditd_thread dequeues and delivers via
> netlink.  Under debug configs, kauditd scheduling between
> audit_log_end() and netlink_unicast() can exceed a syscall round trip
> (more than 1 usec), which was the value of the socket timeout used for
> the recvfrom() calls.
> 
> The observed failure [1] is an EAGAIN error code (-11) which means that
> the access record had not arrived within the 1 usec timeout of
> recvfrom().  The expected record does arrive, but only after
> matches_log_fs() has already returned.  It is then consumed by a later
> audit_count_records() call, making records.access == 1 instead of 0.
> 
> Switch the default socket timeout to the slow value (1 second) so all
> audit_match_record() callers wait long enough for kauditd delivery, and
> lower it to the fast value (1 usec) only on the two paths that expect no
> record: audit_count_records() and the expected_domain_id == 0 probe in
> matches_log_domain_deallocated().  audit_init() drains stale records
> with the fast timeout (terminating on -EAGAIN once the backlog is empty)
> and switches to the patient default before returning.  1 second gives
> ~10x margin over the observed maximum (~100 ms, while the happy path is
> ~23 us).
> 
> Rename the timeval constants to reflect their new roles:
> - audit_tv_dom_drop (1 second) -> audit_tv_default: default socket
>   timeout, patient enough for asynchronous kauditd delivery.
> - audit_tv_default (1 usec) -> audit_tv_fast: fast timeout for paths
>   that expect no record (drain, audit_count_records(), probes).
> 
> Invert the conditional in matches_log_domain_deallocated().  Check
> setsockopt returns on both the lower and restore paths; preserve the
> first error via !err when the restore fails after a prior error so the
> actionable return code is not masked by a bookkeeping failure.
> 
> Cc: Günther Noack <[email protected]>
> Cc: Thomas Weißschuh <[email protected]>
> Cc: [email protected]
> Depends-on: 07c2572a8757 ("selftests/landlock: Skip stale records in 
> audit_match_record()")
> Fixes: 6a500b22971c ("selftests/landlock: Add tests for audit flags and 
> domain IDs")
> Reported-by: Günther Noack <[email protected]>
> Closes: https://lore.kernel.org/r/[email protected] [1]
> Reported-by: kernel test robot <[email protected]>
> Closes: https://lore.kernel.org/oe-lkp/[email protected]
> Signed-off-by: Mickaël Salaün <[email protected]>
> ---
>  tools/testing/selftests/landlock/audit.h | 80 +++++++++++++++++++-----
>  1 file changed, 63 insertions(+), 17 deletions(-)
> 
> diff --git a/tools/testing/selftests/landlock/audit.h 
> b/tools/testing/selftests/landlock/audit.h
> index 699aed5ffab4..936fe20f020e 100644
> --- a/tools/testing/selftests/landlock/audit.h
> +++ b/tools/testing/selftests/landlock/audit.h
> @@ -45,17 +45,25 @@ struct audit_message {
>       };
>  };
>  
> -static const struct timeval audit_tv_dom_drop = {
> +static const struct timeval audit_tv_default = {
>       /*
> -      * Because domain deallocation is tied to asynchronous credential
> -      * freeing, receiving such event may take some time.  In practice,
> -      * on a small VM, it should not exceed 100k usec, but let's wait up
> -      * to 1 second to be safe.
> +      * Default socket timeout for audit_match_record() callers that expect a
> +      * record to arrive.  Asynchronous kauditd delivery can exceed 1 usec
> +      * under heavy debug configs (KASAN, lockdep), where kauditd_thread
> +      * scheduling between audit_log_end() and netlink_unicast() takes longer
> +      * than the previous 1 usec timeout. 1 second is a generous ceiling: on
> +      * the happy path, kauditd delivers within dozens of usec.
>        */
>       .tv_sec = 1,
>  };
>  
> -static const struct timeval audit_tv_default = {
> +static const struct timeval audit_tv_fast = {
> +     /*
> +      * Fast timeout for paths that expect no record (audit_init() drain,
> +      * audit_count_records(), probes).  Causes audit_recv() to return
> +      * -EAGAIN once the socket buffer is empty, naturally terminating the
> +      * read loop.
> +      */
>       .tv_usec = 1,
>  };
>  
> @@ -334,8 +342,13 @@ static int __maybe_unused 
> matches_log_domain_allocated(int audit_fd, pid_t pid,
>   * Matches a domain deallocation record.  When expected_domain_id is 
> non-zero,
>   * the pattern includes the specific domain ID so that stale deallocation
>   * records from a previous test (with a different domain ID) are skipped by
> - * audit_match_record(), and the socket timeout is temporarily increased to
> - * audit_tv_dom_drop to wait for the asynchronous kworker deallocation.
> + * audit_match_record(), waiting for the asynchronous kworker deallocation 
> with
> + * the default patient timeout.
> + *
> + * When expected_domain_id is zero, the caller is probing for any dealloc 
> record
> + * that may or may not arrive.  Temporarily lowers the socket timeout to
> + * audit_tv_fast for this probe so it returns promptly when no record is
> + * pending; restores audit_tv_default after.
>   */
>  static int __maybe_unused
>  matches_log_domain_deallocated(int audit_fd, unsigned int num_denials,
> @@ -361,16 +374,21 @@ matches_log_domain_deallocated(int audit_fd, unsigned 
> int num_denials,
>       if (log_match_len >= sizeof(log_match))
>               return -E2BIG;
>  
> -     if (expected_domain_id)
> -             setsockopt(audit_fd, SOL_SOCKET, SO_RCVTIMEO,
> -                        &audit_tv_dom_drop, sizeof(audit_tv_dom_drop));
> +     if (!expected_domain_id) {
> +             if (setsockopt(audit_fd, SOL_SOCKET, SO_RCVTIMEO,
> +                            &audit_tv_fast, sizeof(audit_tv_fast)))
> +                     return -errno;
> +     }
>  
>       err = audit_match_record(audit_fd, AUDIT_LANDLOCK_DOMAIN, log_match,
>                                domain_id);
>  
> -     if (expected_domain_id)
> -             setsockopt(audit_fd, SOL_SOCKET, SO_RCVTIMEO, &audit_tv_default,
> -                        sizeof(audit_tv_default));
> +     if (!expected_domain_id) {
> +             if (setsockopt(audit_fd, SOL_SOCKET, SO_RCVTIMEO,
> +                            &audit_tv_default, sizeof(audit_tv_default)) &&
> +                 !err)
> +                     err = -errno;
> +     }
>  
>       return err;
>  }
> @@ -387,6 +405,11 @@ struct audit_records {
>   * audit_init() and after the preceding audit_match_record() call.  
> Allocation
>   * records are emitted synchronously during landlock_log_denial() in the 
> current
>   * test's syscall context, so only those are counted in records->domain.
> + *
> + * Temporarily lowers SO_RCVTIMEO to audit_tv_fast for the read loop: this 
> is a
> + * "no record expected" path that should terminate on the first -EAGAIN.  The
> + * default patient timeout is restored on exit for subsequent
> + * audit_match_record() callers.
>   */
>  static int audit_count_records(int audit_fd, struct audit_records *records)
>  {
> @@ -403,6 +426,12 @@ static int audit_count_records(int audit_fd, struct 
> audit_records *records)
>       records->access = 0;
>       records->domain = 0;
>  
> +     if (setsockopt(audit_fd, SOL_SOCKET, SO_RCVTIMEO, &audit_tv_fast,
> +                    sizeof(audit_tv_fast))) {
> +             err = -errno;
> +             goto out;
> +     }
> +
>       do {
>               memset(&msg, 0, sizeof(msg));
>               err = audit_recv(audit_fd, &msg);
> @@ -429,6 +458,10 @@ static int audit_count_records(int audit_fd, struct 
> audit_records *records)
>       } while (true);
>  
>  out:
> +     if (setsockopt(audit_fd, SOL_SOCKET, SO_RCVTIMEO, &audit_tv_default,
> +                    sizeof(audit_tv_default)) &&
> +         !err)
> +             err = -errno;
>       regfree(&dealloc_re);
>       return err;
>  }
> @@ -449,9 +482,9 @@ static int audit_init(void)
>       if (err)
>               goto err_close;
>  
> -     /* Sets a timeout for negative tests. */
> -     err = setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &audit_tv_default,
> -                      sizeof(audit_tv_default));
> +     /* Uses the fast timeout to drain stale records below. */
> +     err = setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &audit_tv_fast,
> +                      sizeof(audit_tv_fast));
>       if (err) {
>               err = -errno;
>               goto err_close;
> @@ -467,6 +500,19 @@ static int audit_init(void)
>       while (audit_recv(fd, NULL) == 0)
>               ;
>  
> +     /*
> +      * Restores the default timeout for audit_match_record() callers that
> +      * expect a record to arrive.  Paths that expect no record restore the
> +      * fast timeout locally (audit_count_records(), the expected_domain_id
> +      * == 0 probe in matches_log_domain_deallocated()).
> +      */
> +     err = setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &audit_tv_default,
> +                      sizeof(audit_tv_default));
> +     if (err) {
> +             err = -errno;
> +             goto err_close;
> +     }
> +
>       return fd;
>  
>  err_close:
> -- 
> 2.54.0
> 

Tested-by: Günther Noack <[email protected]>

Reply via email to