On 07/23/2015 09:27 AM, Amos Jeffries wrote:

> in SBuf.h:
> 
> * SBuf::findLastNotOf() documentation looks wrong.
>  - s/occurrences before/occurrences after/

Fixed.


> * not sure why the "TODO: rename to camelCase" still existed.
>  - probably should not be copied to the new method(s) anyhow.

Done.


I also implemented Tokenizer::parsedSize() updates for consumed suffixes
per Amos' request, even though I do not think that was the right thing to
do. The XXX comment marking this API in the previous patch (that Kinkie
complained about) is now removed.


The patch has been ported to recent trunk and passes "make check" tests.
I added a few tests for the new suffix-parsing methods, mimicking the
existing prefix-parsing tests.


HTH,

Alex.

New SBuf and Tokenizer methods to simplify suffix parsing and skipping
(and to make suffix/reverse APIs more similar to prefix/forward ones).

Also reluctantly changed Tokenizer to update parsedSize() when parsing
suffixes, per reviewer request.

=== modified file 'src/SBuf.cc'
--- src/SBuf.cc	2015-07-29 18:12:16 +0000
+++ src/SBuf.cc	2015-08-07 21:03:48 +0000
@@ -785,40 +785,82 @@ SBuf::findFirstNotOf(const CharacterSet
     ++stats.find;
 
     if (startPos == npos)
         return npos;
 
     if (startPos >= length())
         return npos;
 
     debugs(24, 7, "first not of characterset " << set.name << " in id " << id);
     char *cur = buf()+startPos;
     const char *bufend = bufEnd();
     while (cur < bufend) {
         if (!set[*cur])
             return cur-buf();
         ++cur;
     }
     debugs(24, 7, "not found");
     return npos;
 }
 
+SBuf::size_type
+SBuf::findLastOf(const CharacterSet &set, size_type endPos) const
+{
+    ++stats.find;
+
+    if (isEmpty())
+        return npos;
+
+    if (endPos == npos || endPos >= length())
+        endPos = length() - 1;
+
+    debugs(24, 7, "last of characterset " << set.name << " in id " << id);
+    const char *start = buf();
+    for (const char *cur = start + endPos; cur >= start; --cur) {
+        if (set[*cur])
+            return cur - start;
+    }
+    debugs(24, 7, "not found");
+    return npos;
+}
+
+SBuf::size_type
+SBuf::findLastNotOf(const CharacterSet &set, size_type endPos) const
+{
+    ++stats.find;
+
+    if (isEmpty())
+        return npos;
+
+    if (endPos == npos || endPos >= length())
+        endPos = length() - 1;
+
+    debugs(24, 7, "last not of characterset " << set.name << " in id " << id);
+    const char *start = buf();
+    for (const char *cur = start + endPos; cur >= start; --cur) {
+        if (!set[*cur])
+            return cur - start;
+    }
+    debugs(24, 7, "not found");
+    return npos;
+}
+
 /*
  * TODO: borrow a sscanf implementation from Linux or similar?
  * we'd really need a vsnscanf(3)... ? As an alternative, a
  * light-regexp-like domain-specific syntax might be an idea.
  */
 int
 SBuf::scanf(const char *format, ...)
 {
     va_list arg;
     int rv;
     ++stats.scanf;
     va_start(arg, format);
     rv = vsscanf(c_str(), format, arg);
     va_end(arg);
     return rv;
 }
 
 std::ostream &
 SBufStats::dump(std::ostream& os) const
 {

=== modified file 'src/SBuf.h'
--- src/SBuf.h	2015-07-29 18:12:16 +0000
+++ src/SBuf.h	2015-08-07 21:09:12 +0000
@@ -579,50 +579,68 @@ public:
      * Returns the index in the SBuf of the last occurrence of the
      * sequence contained in the str argument.
      * \return npos if the sequence  was not found
      * \param endPos if specified, ignore any occurrences after that position
      *   if npos or greater than length(), the whole SBuf is considered
      */
     size_type rfind(const SBuf &str, size_type endPos = npos) const;
 
     /** Find first occurrence of character of set in SBuf
      *
      * Finds the first occurrence of ANY of the characters in the supplied set in
      * the SBuf.
      * \return npos if no character in the set could be found
      * \param startPos if specified, ignore any occurrences before that position
      *   if npos, then npos is always returned
      *
      * TODO: rename to camelCase
      */
     size_type findFirstOf(const CharacterSet &set, size_type startPos = 0) const;
 
+    /** Find last occurrence of character of set in SBuf
+     *
+     * Finds the last occurrence of ANY of the characters in the supplied set in
+     * the SBuf.
+     * \return npos if no character in the set could be found
+     * \param endPos if specified, ignore any occurrences after that position
+     *   if npos, the entire SBuf is searched
+     */
+    size_type findLastOf(const CharacterSet &set, size_type endPos = npos) const;
+
     /** Find first occurrence character NOT in character set
      *
      * \return npos if all characters in the SBuf are from set
      * \param startPos if specified, ignore any occurrences before that position
      *   if npos, then npos is always returned
      *
      * TODO: rename to camelCase
      */
     size_type findFirstNotOf(const CharacterSet &set, size_type startPos = 0) const;
 
+    /** Find last occurrence character NOT in character set
+     *
+     * \return npos if all characters in the SBuf are from set
+     * \param endPos if specified, ignore any occurrences after that position
+     *   if npos, then the entire SBuf is searched
+     */
+    size_type findLastNotOf(const CharacterSet &set, size_type endPos = npos) const;
+
     /** sscanf-alike
      *
      * sscanf re-implementation. Non-const, and not \0-clean.
      * \return same as sscanf
      * \see man sscanf(3)
      */
     int scanf(const char *format, ...);
 
     /// converts all characters to lower case; \see man tolower(3)
     void toLower();
 
     /// converts all characters to upper case; \see man toupper(3)
     void toUpper();
 
     /** String export function
      * converts the SBuf to a legacy String, by copy.
      * \deprecated
      */
     String toString() const;
 

=== modified file 'src/parser/Tokenizer.cc'
--- src/parser/Tokenizer.cc	2015-04-22 13:32:56 +0000
+++ src/parser/Tokenizer.cc	2015-08-07 21:30:41 +0000
@@ -18,40 +18,62 @@
 #endif
 
 /// convenience method: consumes up to n bytes, counts, and returns them
 SBuf
 Parser::Tokenizer::consume(const SBuf::size_type n)
 {
     // careful: n may be npos!
     debugs(24, 5, "consuming " << n << " bytes");
     const SBuf result = buf_.consume(n);
     parsed_ += result.length();
     return result;
 }
 
 /// convenience method: consume()s up to n bytes and returns their count
 SBuf::size_type
 Parser::Tokenizer::success(const SBuf::size_type n)
 {
     return consume(n).length();
 }
 
+/// convenience method: consumes up to n last bytes and returns them
+SBuf
+Parser::Tokenizer::consumeTrailing(const SBuf::size_type n)
+{
+    debugs(24, 5, "consuming " << n << " bytes");
+
+    // If n is npos, we consume everything from buf_ (and nothing from result).
+    const SBuf::size_type parsed = (n == SBuf::npos) ? buf_.length() : n;
+
+    SBuf result = buf_;
+    buf_ = result.consume(buf_.length() - parsed);
+    parsed_ += parsed;
+    return result;
+}
+
+/// convenience method: consumes up to n last bytes and returns their count
+SBuf::size_type
+Parser::Tokenizer::successTrailing(const SBuf::size_type n)
+{
+    return consumeTrailing(n).length();
+}
+
 bool
 Parser::Tokenizer::token(SBuf &returnedToken, const CharacterSet &delimiters)
 {
     const Tokenizer saved(*this);
     skipAll(delimiters);
     const SBuf::size_type tokenLen = buf_.findFirstOf(delimiters); // not found = npos => consume to end
     if (tokenLen == SBuf::npos) {
         debugs(24, 8, "no token found for delimiters " << delimiters.name);
         *this = saved;
         return false;
     }
     returnedToken = consume(tokenLen); // cannot be empty
     skipAll(delimiters);
     debugs(24, DBG_DATA, "token found for delimiters " << delimiters.name << ": '" <<
            returnedToken << '\'');
     return true;
 }
 
 bool
 Parser::Tokenizer::prefix(SBuf &returnedToken, const CharacterSet &tokenChars, const SBuf::size_type limit)
@@ -73,107 +95,132 @@ Parser::Tokenizer::prefix(SBuf &returned
     returnedToken = consume(prefixLen); // cannot be empty after the npos check
     return true;
 }
 
 bool
 Parser::Tokenizer::suffix(SBuf &returnedToken, const CharacterSet &tokenChars, const SBuf::size_type limit)
 {
     SBuf span = buf_;
 
     if (limit < buf_.length())
         span.consume(buf_.length() - limit); // ignore the N prefix characters
 
     auto i = span.rbegin();
     SBuf::size_type found = 0;
     while (i != span.rend() && tokenChars[*i]) {
         ++i;
         ++found;
     }
     if (!found)
         return false;
-    returnedToken = buf_;
-    buf_ = returnedToken.consume(buf_.length() - found);
+    returnedToken = consumeTrailing(found);
     return true;
 }
 
 SBuf::size_type
 Parser::Tokenizer::skipAll(const CharacterSet &tokenChars)
 {
     const SBuf::size_type prefixLen = buf_.findFirstNotOf(tokenChars);
     if (prefixLen == 0) {
         debugs(24, 8, "no match when trying to skipAll " << tokenChars.name);
         return 0;
     }
     debugs(24, 8, "skipping all in " << tokenChars.name << " len " << prefixLen);
     return success(prefixLen);
 }
 
 bool
 Parser::Tokenizer::skipOne(const CharacterSet &chars)
 {
     if (!buf_.isEmpty() && chars[buf_[0]]) {
         debugs(24, 8, "skipping one-of " << chars.name);
         return success(1);
     }
     debugs(24, 8, "no match while skipping one-of " << chars.name);
     return false;
 }
 
 bool
 Parser::Tokenizer::skipSuffix(const SBuf &tokenToSkip)
 {
     if (buf_.length() < tokenToSkip.length())
         return false;
 
     SBuf::size_type offset = 0;
     if (tokenToSkip.length() < buf_.length())
         offset = buf_.length() - tokenToSkip.length();
 
     if (buf_.substr(offset, SBuf::npos).cmp(tokenToSkip) == 0) {
-        buf_ = buf_.substr(0,offset);
-        return true;
+        debugs(24, 8, "skipping " << tokenToSkip.length());
+        return successTrailing(tokenToSkip.length());
     }
     return false;
 }
 
 bool
 Parser::Tokenizer::skip(const SBuf &tokenToSkip)
 {
     if (buf_.startsWith(tokenToSkip)) {
         debugs(24, 8, "skipping " << tokenToSkip.length());
         return success(tokenToSkip.length());
     }
     debugs(24, 8, "no match, not skipping '" << tokenToSkip << '\'');
     return false;
 }
 
 bool
 Parser::Tokenizer::skip(const char tokenChar)
 {
     if (!buf_.isEmpty() && buf_[0] == tokenChar) {
         debugs(24, 8, "skipping char '" << tokenChar << '\'');
         return success(1);
     }
     debugs(24, 8, "no match, not skipping char '" << tokenChar << '\'');
     return false;
 }
 
+bool
+Parser::Tokenizer::skipOneTrailing(const CharacterSet &skippable)
+{
+    if (!buf_.isEmpty() && skippable[buf_[buf_.length()-1]]) {
+        debugs(24, 8, "skipping one-of " << skippable.name);
+        return successTrailing(1);
+    }
+    debugs(24, 8, "no match while skipping one-of " << skippable.name);
+    return false;
+}
+
+SBuf::size_type
+Parser::Tokenizer::skipAllTrailing(const CharacterSet &skippable)
+{
+    const SBuf::size_type prefixEnd = buf_.findLastNotOf(skippable);
+    const SBuf::size_type prefixLen = prefixEnd == SBuf::npos ?
+        0 : (prefixEnd + 1);
+    const SBuf::size_type suffixLen = buf_.length() - prefixLen;
+    if (suffixLen == 0) {
+        debugs(24, 8, "no match when trying to skip " << skippable.name);
+        return 0;
+    }
+    debugs(24, 8, "skipping in " << skippable.name << " len " << suffixLen);
+    return successTrailing(suffixLen);
+}
+
 /* reworked from compat/strtoll.c */
 bool
 Parser::Tokenizer::int64(int64_t & result, int base, bool allowSign, const SBuf::size_type limit)
 {
     if (atEnd() || limit == 0)
         return false;
 
     const SBuf range(buf_.substr(0,limit));
 
     //fixme: account for buf_.size()
     bool neg = false;
     const char *s = range.rawContent();
     const char *end = range.rawContent() + range.length();
 
     if (allowSign) {
         if (*s == '-') {
             neg = true;
             ++s;
         } else if (*s == '+') {
             ++s;

=== modified file 'src/parser/Tokenizer.h'
--- src/parser/Tokenizer.h	2015-04-10 11:02:44 +0000
+++ src/parser/Tokenizer.h	2015-08-07 21:41:28 +0000
@@ -98,52 +98,66 @@ public:
     bool skip(const SBuf &tokenToSkip);
 
     /** skips a given single character
      *
      * \return whether the character was skipped
      */
     bool skip(const char tokenChar);
 
     /** Skips a single character from the set.
      *
      * \return whether a character was skipped
      */
     bool skipOne(const CharacterSet &discardables);
 
     /** Skips all sequential characters from the set, in any order.
      *
      * \returns the number of skipped characters
      */
     SBuf::size_type skipAll(const CharacterSet &discardables);
 
+    /** Removes a single trailing character from the set.
+     *
+     * \return whether a character was removed
+     */
+    bool skipOneTrailing(const CharacterSet &discardables);
+
+    /** Removes all sequential trailing characters from the set, in any order.
+     *
+     * \returns the number of characters removed
+     */
+    SBuf::size_type skipAllTrailing(const CharacterSet &discardables);
+
     /** Extracts an unsigned int64_t at the beginning of the buffer.
      *
      * strtoll(3)-alike function: tries to parse unsigned 64-bit integer
      * at the beginning of the parse buffer, in the base specified by the user
      * or guesstimated; consumes the parsed characters.
      *
      * \param result Output value. Not touched if parsing is unsuccessful.
      * \param base   Specify base to do the parsing in, with the same restrictions
      *               as strtoll. Defaults to 0 (meaning guess)
      * \param allowSign Whether to accept a '+' or '-' sign prefix.
      * \param limit  Maximum count of characters to convert.
      *
      * \return whether the parsing was successful
      */
     bool int64(int64_t &result, int base = 0, bool allowSign = true, SBuf::size_type limit = SBuf::npos);
 
 protected:
     SBuf consume(const SBuf::size_type n);
     SBuf::size_type success(const SBuf::size_type n);
+    SBuf consumeTrailing(const SBuf::size_type n);
+    SBuf::size_type successTrailing(const SBuf::size_type n);
 
     /// reset the buffer and parsed stats to a saved checkpoint
     void undoParse(const SBuf &newBuf, SBuf::size_type cParsed) { buf_ = newBuf; parsed_ = cParsed; }
 
 private:
     SBuf buf_; ///< yet unparsed input
     SBuf::size_type parsed_; ///< bytes successfully parsed, including skipped
 };
 
 } /* namespace Parser */
 
 #endif /* SQUID_PARSER_TOKENIZER_H_ */
 

=== modified file 'src/tests/testTokenizer.cc'
--- src/tests/testTokenizer.cc	2015-08-03 02:08:22 +0000
+++ src/tests/testTokenizer.cc	2015-08-07 22:44:38 +0000
@@ -112,40 +112,100 @@ testTokenizer::testTokenizerSkip()
 
 void
 testTokenizer::testTokenizerToken()
 {
     Parser::Tokenizer t(text);
     SBuf s;
 
     // first scenario: patterns match
     CPPUNIT_ASSERT(t.token(s,whitespace));
     CPPUNIT_ASSERT_EQUAL(SBuf("GET"),s);
     CPPUNIT_ASSERT(t.token(s,whitespace));
     CPPUNIT_ASSERT_EQUAL(SBuf("http://resource.com/path"),s);
     CPPUNIT_ASSERT(t.token(s,whitespace));
     CPPUNIT_ASSERT_EQUAL(SBuf("HTTP/1.1"),s);
     CPPUNIT_ASSERT(t.token(s,whitespace));
     CPPUNIT_ASSERT_EQUAL(SBuf("Host:"),s);
 
 }
 
 void
+testTokenizer::testTokenizerSuffix()
+{
+    const SBuf canary("This text should not be changed.");
+
+    Parser::Tokenizer t(text);
+    SBuf s;
+
+    CharacterSet all(whitespace);
+    all += alpha;
+    all += crlf;
+    all += numbers;
+    all.add(':').add('.').add('/');
+
+    // an empty suffix should return false (the full output buffer case)
+    s = canary;
+    const SBuf before = t.remaining();
+    CPPUNIT_ASSERT(!t.suffix(s, all, 0));
+    // ... and a false return value means no parameter changes
+    CPPUNIT_ASSERT_EQUAL(canary, s);
+    // ... and a false return value means no input buffer changes
+    CPPUNIT_ASSERT_EQUAL(before, t.remaining());
+
+    // consume suffix until the last CRLF, including that last CRLF
+    SBuf::size_type remaining = t.remaining().length();
+    while (t.remaining().findLastOf(crlf) != SBuf::npos) {
+        CPPUNIT_ASSERT(t.remaining().length() > 0);
+        CPPUNIT_ASSERT(t.skipOneTrailing(all));
+        // ensure steady progress
+        CPPUNIT_ASSERT_EQUAL(remaining, t.remaining().length() + 1);
+        --remaining;
+    }
+
+    // no match (last char is not in the suffix set)
+    CPPUNIT_ASSERT(!t.suffix(s, crlf));
+    CPPUNIT_ASSERT(!t.suffix(s, whitespace));
+
+    // successful suffix tokenization
+    CPPUNIT_ASSERT(t.suffix(s, numbers));
+    CPPUNIT_ASSERT_EQUAL(SBuf("1"), s);
+    CPPUNIT_ASSERT(t.skipSuffix(SBuf("1.")));
+    CPPUNIT_ASSERT(t.skipSuffix(SBuf("/")));
+    CPPUNIT_ASSERT(t.suffix(s, alpha));
+    CPPUNIT_ASSERT_EQUAL(SBuf("HTTP"), s);
+    CPPUNIT_ASSERT(t.suffix(s, whitespace));
+    CPPUNIT_ASSERT_EQUAL(SBuf(" "), s);
+
+    // match until the end of the sample
+    CPPUNIT_ASSERT(t.suffix(s, all));
+    CPPUNIT_ASSERT_EQUAL(SBuf(), t.remaining());
+
+    // an empty buffer does not end with a token
+    s = canary;
+    CPPUNIT_ASSERT(!t.suffix(s, all));
+    CPPUNIT_ASSERT_EQUAL(canary, s); // no parameter changes
+
+    // we cannot skip an empty suffix, even in an empty buffer
+    CPPUNIT_ASSERT(!t.skipSuffix(SBuf()));
+}
+
+void
 testTokenizer::testCharacterSet()
 {
 
 }
 
 void
 testTokenizer::testTokenizerInt64()
 {
     // successful parse in base 10
     {
         int64_t rv;
         Parser::Tokenizer t(SBuf("1234"));
         const int64_t benchmark = 1234;
         CPPUNIT_ASSERT(t.int64(rv, 10));
         CPPUNIT_ASSERT_EQUAL(benchmark,rv);
     }
 
     // successful parse, autodetect base
     {
         int64_t rv;

=== modified file 'src/tests/testTokenizer.h'
--- src/tests/testTokenizer.h	2015-08-03 02:08:22 +0000
+++ src/tests/testTokenizer.h	2015-08-07 22:21:52 +0000
@@ -1,33 +1,35 @@
 /*
  * Copyright (C) 1996-2015 The Squid Software Foundation and contributors
  *
  * Squid software is distributed under GPLv2+ license and includes
  * contributions from numerous individuals and organizations.
  * Please see the COPYING and CONTRIBUTORS files for details.
  */
 
 #ifndef SQUID_TESTTOKENIZER_H_
 #define SQUID_TESTTOKENIZER_H_
 
 #include <cppunit/extensions/HelperMacros.h>
 
 class testTokenizer : public CPPUNIT_NS::TestFixture
 {
     CPPUNIT_TEST_SUITE( testTokenizer );
     CPPUNIT_TEST ( testCharacterSet );
     CPPUNIT_TEST ( testTokenizerPrefix );
+    CPPUNIT_TEST ( testTokenizerSuffix );
     CPPUNIT_TEST ( testTokenizerSkip );
     CPPUNIT_TEST ( testTokenizerToken );
     CPPUNIT_TEST ( testTokenizerInt64 );
     CPPUNIT_TEST_SUITE_END();
 
 protected:
     void testTokenizerPrefix();
+    void testTokenizerSuffix();
     void testTokenizerSkip();
     void testTokenizerToken();
     void testCharacterSet();
     void testTokenizerInt64();
 };
 
 #endif /* SQUID_TESTTOKENIZER_H_ */
 

_______________________________________________
squid-dev mailing list
squid-dev@lists.squid-cache.org
http://lists.squid-cache.org/listinfo/squid-dev

Reply via email to