For instance there is [1] in apreq that also splits a brigade on a boundary.

[1] https://github.com/apache/apreq/blob/trunk/library/parser_multipart.c#L112

Regards;
Yann.

On Wed, Oct 20, 2021 at 11:43 AM Yann Ylavic <ylavic....@gmail.com> wrote:
>
> On Tue, Oct 19, 2021 at 4:31 PM <minf...@apache.org> wrote:
> >
> > +APR_DECLARE(apr_status_t) apr_brigade_split_boundary(apr_bucket_brigade 
> > *bbOut,
> > +                                                     apr_bucket_brigade 
> > *bbIn,
> > +                                                     apr_read_type_e block,
> > +                                                     const char *boundary,
> > +                                                     apr_size_t 
> > boundary_len,
> > +                                                     apr_off_t maxbytes)
> > +{
> []
> > +        /*
> > +         * Fast path.
> > +         *
> > +         * If we have at least one boundary worth of data, do an optimised
> > +         * substring search for the boundary, and split quickly if found.
> > +         */
> > +        if (len >= boundary_len) {
> > +
> > +            apr_size_t off;
> > +            apr_size_t leftover;
> > +
> > +            pos = strnstr(str, boundary, len);
>
> apr_strnstr() maybe, strnstr() is non-standard AFAICT and possibly not
> available on all platforms (Windows)?
>
> > +
> > +            /* definitely found it, we leave */
> > +            if (pos != NULL) {
> > +
> > +                off = pos - str;
> > +
> > +                /* everything up to the boundary */
> > +                if (off) {
> > +
> > +                    apr_bucket_split(e, off);
> > +                    APR_BUCKET_REMOVE(e);
> > +                    APR_BRIGADE_INSERT_TAIL(bbOut, e);
> > +
> > +                    e = APR_BRIGADE_FIRST(bbIn);
> > +                }
> > +
> > +                /* cut out the boundary */
> > +                apr_bucket_split(e, boundary_len);
> > +                apr_bucket_delete(e);
> > +
> > +                return APR_SUCCESS;
> > +            }
> > +
> > +            /* any partial matches at the end? */
> > +            leftover = boundary_len - 1;
> > +            off = (len - leftover);
>
> Can't we fall through the below "else" slow path here? The code looks
> quite similar.
>
> > +
> > +            while (leftover) {
> > +                if (!strncmp(str + off, boundary, leftover)) {
> > +
> > +                    if (off) {
> > +
> > +                        apr_bucket_split(e, off);
> > +                        APR_BUCKET_REMOVE(e);
> > +                        APR_BRIGADE_INSERT_TAIL(bbOut, e);
> > +
> > +                        e = APR_BRIGADE_FIRST(bbIn);
> > +                    }
> > +
> > +                    outbytes += off;
> > +                    inbytes -= off;
> > +
> > +                    goto skip;
> > +                }
> > +                off++;
> > +                leftover--;
> > +            }
> > +
> > +            APR_BUCKET_REMOVE(e);
> > +            APR_BRIGADE_INSERT_TAIL(bbOut, e);
> > +
> > +            outbytes += len;
> > +
> > +            continue;
> > +
> > +        }
> > +
> > +        /*
> > +         * Slow path.
> > +         *
> > +         * We need to read ahead at least one boundary worth of data so
> > +         * we can search across the bucket edges.
> > +         */
> > +        else {
> > +
> > +            apr_size_t off = 0;
> > +
> > +            /* find all definite non matches */
> > +            while (len) {
> > +                if (!strncmp(str + off, boundary, len)) {
> > +
> > +                    if (off) {
> > +
> > +                        apr_bucket_split(e, off);
> > +                        APR_BUCKET_REMOVE(e);
> > +                        APR_BRIGADE_INSERT_TAIL(bbOut, e);
> > +
> > +                        e = APR_BRIGADE_FIRST(bbIn);
> > +                    }
> > +
> > +                    inbytes -= off;
> > +
> > +                    goto skip;
> > +                }
> > +                off++;
> > +                len--;
> > +            }
> > +
> > +            APR_BUCKET_REMOVE(e);
> > +            APR_BRIGADE_INSERT_TAIL(bbOut, e);
> > +            continue;
>
> "outbytes" not bumped?
>
> > +
> > +        }
> > +
> > +        /*
> > +         * If we reach skip, it means the bucket in e is:
> > +         *
> > +         * - shorter than the boundary
> > +         * - matches the boundary up to the bucket length
> > +         * - might match more buckets
> > +         *
> > +         * Read further buckets and check whether the boundary matches all
> > +         * the way to the end. If so, we have a match. If no match, shave 
> > off
> > +         * one byte and continue round to try again.
> > +         */
> > +skip:
>
> Below you read ahead to try to match the end of the boundary, and if
> not split one byte into bbOut before restarting the loop, which could
> result in a quite fragmented bbOut and is also what you almost did
> above already.
>
> Couldn't we instead save the partially matching bucket(s) in a
> temporary brigade (provided by the caller) and restart the loop with
> the more effective [apr_]strnstr() match (at boundary+off)?
>
> > +
> > +        for (next = APR_BUCKET_NEXT(e);
> > +                inbytes < boundary_len && next != 
> > APR_BRIGADE_SENTINEL(bbIn);
> > +                next = APR_BUCKET_NEXT(next)) {
> > +
> > +            const char *str;
> > +            apr_size_t off;
> > +            apr_size_t len;
> > +
> > +            rv = apr_bucket_read(next, &str, &len, block);
> > +
> > +            if (rv != APR_SUCCESS) {
> > +                return rv;
> > +            }
> > +
> > +            off = boundary_len - inbytes;
> > +
> > +            if (len > off) {
> > +
> > +                /* not a match, bail out */
> > +                if (strncmp(str, boundary + inbytes, off)) {
> > +                    break;
> > +                }
> > +
> > +                /* a match! remove the boundary and return */
> > +                apr_bucket_split(next, off);
> > +
> > +                e = APR_BUCKET_NEXT(next);
> > +
> > +                for (prev = APR_BRIGADE_FIRST(bbIn);
> > +                        prev != e;
> > +                        prev = APR_BRIGADE_FIRST(bbIn)) {
> > +
> > +                    apr_bucket_delete(prev);
> > +
> > +                }
> > +
> > +                return APR_SUCCESS;
> > +
> > +            }
> > +            if (len == off) {
> > +
> > +                /* not a match, bail out */
> > +                if (strncmp(str, boundary + inbytes, off)) {
> > +                    break;
> > +                }
> > +
> > +                /* a match! remove the boundary and return */
> > +                e = APR_BUCKET_NEXT(next);
> > +
> > +                for (prev = APR_BRIGADE_FIRST(bbIn);
> > +                        prev != e;
> > +                        prev = APR_BRIGADE_FIRST(bbIn)) {
> > +
> > +                    apr_bucket_delete(prev);
> > +
> > +                }
> > +
> > +                return APR_SUCCESS;
> > +
> > +            }
> > +            else if (len) {
> > +
> > +                /* not a match, bail out */
> > +                if (strncmp(str, boundary + inbytes, len)) {
> > +                    break;
> > +                }
> > +
> > +                /* still hope for a match */
> > +                inbytes += len;
> > +            }
> > +
> > +        }
> > +
> > +        /*
> > +         * If we reach this point, the bucket e did not match the boundary
> > +         * in the subsequent buckets.
> > +         *
> > +         * Bump one byte off, and loop round to search again.
> > +         */
> > +        apr_bucket_split(e, 1);
> > +        APR_BUCKET_REMOVE(e);
> > +        APR_BRIGADE_INSERT_TAIL(bbOut, e);
> > +
> > +        outbytes++;
> > +
> > +    }
> > +
> > +    return APR_INCOMPLETE;
> > +}
>
>
> Regards;
> Yann.

Reply via email to