Re: [RFC] Possible PCT performance improvements.

chromatic Sun, 15 Aug 2010 16:25:52 -0700

On Saturday 14 August 2010 at 16:38, Patrick R wrote:

> I've suggested a "compare substring at position" opcode several times,
> but so far nobody (including myself) seems to have found the tuits to
> create it.
> 
> If/when the opcode is added, I'll gladly fix up NQP and the regex engine
> to take advantage of it.  Ideally I'd like something like:
> 
>     $I0 = substreq $S0, $S1, $I1
> 
> which should mean something along the lines of
> 
>     Compare the string in $S0 with an equivalent-length substring of
>     $S1 starting at position $I1; return true if matched and false 
>     otherwise.


I have a patch which adds a new API function called
Parrot_str_compare_offset().  It takes two strings and an offset within the
first string at which to start the comparison.  If the contents of the first
string at that offset position match the entire contents of the second string,
it returns a true value.

We could add an experimental opcode around this function trivially.

Here's a diff which doesn't include tests, as there's no PIR-visible support 
yet.

-- c

diff --git a/include/parrot/charset.h b/include/parrot/charset.h
index 2d2ede4..12981f5 100644
--- a/include/parrot/charset.h
+++ b/include/parrot/charset.h
@@ -40,7 +40,7 @@ typedef STRING * (*charset_titlecase_t)(PARROT_INTERP, ARGIN(const STRING *src))
 typedef STRING * (*charset_upcase_first_t)(PARROT_INTERP, ARGIN(const STRING *src));
 typedef STRING * (*charset_downcase_first_t)(PARROT_INTERP, ARGIN(const STRING *src));
 typedef STRING * (*charset_titlecase_first_t)(PARROT_INTERP, ARGIN(const STRING *src));
-typedef INTVAL   (*charset_compare_t)(PARROT_INTERP, ARGIN(const STRING *lhs), ARGIN(const STRING *rhs));
+typedef INTVAL   (*charset_compare_t)(PARROT_INTERP, ARGIN(const STRING *lhs), ARGIN(const STRING *rhs), INTVAL offset);
 typedef INTVAL   (*charset_index_t)(PARROT_INTERP, ARGIN(const STRING *src), ARGIN(const STRING *search_string), UINTVAL offset);
 typedef INTVAL   (*charset_rindex_t)(PARROT_INTERP, ARGIN(const STRING *src), ARGIN(const STRING *search_string), UINTVAL offset);
 typedef UINTVAL  (*charset_validate_t)(PARROT_INTERP, ARGIN(const STRING *src));
@@ -246,7 +246,8 @@ struct _charset {
 #define CHARSET_UPCASE_FIRST(interp, source) ((source)->charset)->upcase_first((interp), (source))
 #define CHARSET_DOWNCASE_FIRST(interp, source) ((source)->charset)->downcase_first((interp), (source))
 #define CHARSET_TITLECASE_FIRST(interp, source) ((source)->charset)->titlecase_first((interp), (source))
-#define CHARSET_COMPARE(interp, lhs, rhs) ((const CHARSET *)(lhs)->charset)->compare((interp), (lhs), (rhs))
+#define CHARSET_COMPARE(interp, lhs, rhs) ((const CHARSET *)(lhs)->charset)->compare((interp), (lhs), (rhs), 0)
+#define CHARSET_COMPARE_OFFSET(interp, lhs, rhs, offset) ((const CHARSET *)(lhs)->charset)->compare((interp), (lhs), (rhs), offset)
 #define CHARSET_INDEX(interp, source, search, offset) ((source)->charset)->index((interp), (source), (search), (offset))
 #define CHARSET_RINDEX(interp, source, search, offset) ((source)->charset)->rindex((interp), (source), (search), (offset))
 #define CHARSET_VALIDATE(interp, source) ((source)->charset)->validate((interp), (source))
diff --git a/include/parrot/string_funcs.h b/include/parrot/string_funcs.h
index 7c35265..e961c92 100644
--- a/include/parrot/string_funcs.h
+++ b/include/parrot/string_funcs.h
@@ -95,6 +95,16 @@ INTVAL Parrot_str_compare(PARROT_INTERP,
 
 PARROT_EXPORT
 PARROT_WARN_UNUSED_RESULT
+INTVAL Parrot_str_compare_offset(PARROT_INTERP,
+    ARGIN(const STRING *a),
+    INTVAL offset,
+    ARGIN(const STRING *b))
+        __attribute__nonnull__(1)
+        __attribute__nonnull__(2)
+        __attribute__nonnull__(4);
+
+PARROT_EXPORT
+PARROT_WARN_UNUSED_RESULT
 PARROT_CAN_RETURN_NULL
 STRING * Parrot_str_compose(PARROT_INTERP, ARGIN_NULLOK(const STRING *src))
         __attribute__nonnull__(1);
@@ -515,6 +525,10 @@ STRING* Parrot_str_from_uint(PARROT_INTERP,
     , PARROT_ASSERT_ARG(s))
 #define ASSERT_ARGS_Parrot_str_compare __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp))
+#define ASSERT_ARGS_Parrot_str_compare_offset __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+       PARROT_ASSERT_ARG(interp) \
+    , PARROT_ASSERT_ARG(a) \
+    , PARROT_ASSERT_ARG(b))
 #define ASSERT_ARGS_Parrot_str_compose __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp))
 #define ASSERT_ARGS_Parrot_str_concat __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
diff --git a/src/string/api.c b/src/string/api.c
index adf0346..7270566 100644
--- a/src/string/api.c
+++ b/src/string/api.c
@@ -1326,6 +1326,44 @@ Parrot_str_compare(PARROT_INTERP, ARGIN_NULLOK(const STRING *s1), ARGIN_NULLOK(c
     return CHARSET_COMPARE(interp, s1, s2);
 }
 
+/*
+
+=item C<INTVAL Parrot_str_compare_offset(PARROT_INTERP, const STRING *a, INTVAL
+offset, const STRING *b)>
+
+Compares two strings to each other.  If C<a> is less than C<b>, returns -1.  If
+the strings are equal, returns 0.  If C<a> is greater than C<b>, returns 1.
+This comparison uses the character set collation order of the strings.  Any
+given offset (a positive value) will start the comparison that many characters
+from the start of C<a>.
+
+=cut
+
+*/
+
+
+PARROT_EXPORT
+PARROT_WARN_UNUSED_RESULT
+INTVAL
+Parrot_str_compare_offset(PARROT_INTERP, ARGIN(const STRING *a), INTVAL offset,
+    ARGIN(const STRING *b))
+{
+    ASSERT_ARGS(Parrot_str_compare_offset)
+
+    /* XXX: do these NULL-string return values make sense with an offset? */
+    if (STRING_IS_NULL(b))
+        return a && (a->strlen != 0);
+
+    if (STRING_IS_NULL(a))
+        return -(b->strlen != 0);
+
+    ASSERT_STRING_SANITY(a);
+    ASSERT_STRING_SANITY(b);
+
+    /* XXX: sanitize offset; it is passed through unchecked (sign and range) */
+    return CHARSET_COMPARE_OFFSET(interp, a, b, offset);
+}
+
 
 /*
 
diff --git a/src/string/charset/ascii.c b/src/string/charset/ascii.c
index 5c8371a..5b63f9f 100644
--- a/src/string/charset/ascii.c
+++ b/src/string/charset/ascii.c
@@ -468,10 +468,11 @@ titlecase_first(PARROT_INTERP, ARGIN(const STRING *src))
 /*
 
 =item C<INTVAL ascii_compare(PARROT_INTERP, const STRING *lhs, const STRING
-*rhs)>
+*rhs, INTVAL offset)>
 
-Compares two strings as ASCII strings. If STRING C<lhs> > C<rhs>, returns
-1. If C<lhs> == C<rhs> returns 0. If STRING C<lhs> < C<rhs>, returns  -1.
+Compares two strings as ASCII strings. If STRING C<lhs> > C<rhs>, returns 1. If
+C<lhs> == C<rhs> returns 0. If STRING C<lhs> < C<rhs>, returns -1.  The offset
+represents the number of characters into lhs to start the comparison.
 
 =cut
 
@@ -479,35 +480,39 @@ Compares two strings as ASCII strings. If STRING C<lhs> > C<rhs>, returns
 
 PARROT_WARN_UNUSED_RESULT
 INTVAL
-ascii_compare(PARROT_INTERP, ARGIN(const STRING *lhs), ARGIN(const STRING *rhs))
+ascii_compare(PARROT_INTERP, ARGIN(const STRING *lhs), ARGIN(const STRING *rhs),
+    INTVAL offset)
 {
     ASSERT_ARGS(ascii_compare)
-    const UINTVAL l_len = lhs->strlen;
-    const UINTVAL r_len = rhs->strlen;
+    const UINTVAL l_len   = lhs->strlen;
+    const UINTVAL r_len   = rhs->strlen;
     const UINTVAL min_len = l_len > r_len ? r_len : l_len;
+
     String_iter iter;
 
     if (lhs->encoding == rhs->encoding) {
-        const int ret_val = memcmp(lhs->strstart, rhs->strstart, min_len);
+        const int ret_val = memcmp(lhs->strstart + offset,
+                                   rhs->strstart, min_len);
         if (ret_val)
             return ret_val < 0 ? -1 : 1;
     }
     else {
         UINTVAL offs;
         ENCODING_ITER_INIT(interp, rhs, &iter);
-        for (offs = 0; offs < min_len; ++offs) {
+        for (offs = offset; offs < min_len; ++offs) {
             const UINTVAL cl = ENCODING_GET_BYTE(interp, lhs, offs);
             const UINTVAL cr = iter.get_and_advance(interp, &iter);
             if (cl != cr)
                 return cl < cr ? -1 : 1;
         }
     }
-    if (l_len < r_len) {
+
+    if (l_len < r_len)
         return -1;
-    }
-    if (l_len > r_len) {
+
+    if (l_len > r_len)
         return 1;
-    }
+
     return 0;
 }
 
diff --git a/src/string/charset/ascii.h b/src/string/charset/ascii.h
index cc3494c..5c58efd 100644
--- a/src/string/charset/ascii.h
+++ b/src/string/charset/ascii.h
@@ -23,7 +23,8 @@
 PARROT_WARN_UNUSED_RESULT
 INTVAL ascii_compare(PARROT_INTERP,
     ARGIN(const STRING *lhs),
-    ARGIN(const STRING *rhs))
+    ARGIN(const STRING *rhs),
+    INTVAL offset)
         __attribute__nonnull__(1)
         __attribute__nonnull__(2)
         __attribute__nonnull__(3);
diff --git a/src/string/charset/binary.c b/src/string/charset/binary.c
index 676bb0f..d30d8a3 100644
--- a/src/string/charset/binary.c
+++ b/src/string/charset/binary.c
@@ -29,7 +29,8 @@ This file implements the charset functions for binary data
 
 static INTVAL compare(SHIM_INTERP,
     ARGIN(const STRING *lhs),
-    ARGIN(const STRING *rhs))
+    ARGIN(const STRING *rhs),
+    INTVAL offset)
         __attribute__nonnull__(2)
         __attribute__nonnull__(3);
 
@@ -312,24 +313,27 @@ titlecase_first(PARROT_INTERP, SHIM(const STRING *src))
 /*
 
 =item C<static INTVAL compare(PARROT_INTERP, const STRING *lhs, const STRING
-*rhs)>
+*rhs, INTVAL offset)>
 
-Compare the two buffers, first by size, then with memcmp.
+Compare the two buffers, first by size, then with memcmp.  The offset
+represents the number of bytes within lhs to start the comparison.
 
 =cut
 
 */
 
 static INTVAL
-compare(SHIM_INTERP, ARGIN(const STRING *lhs), ARGIN(const STRING *rhs))
+compare(SHIM_INTERP, ARGIN(const STRING *lhs), ARGIN(const STRING *rhs),
+        INTVAL offset)
 {
     ASSERT_ARGS(compare)
-    const UINTVAL l_len = lhs->strlen;
+    const UINTVAL l_len = lhs->strlen - offset;
     const UINTVAL r_len = rhs->strlen;
+
     if (l_len != r_len)
         return l_len - r_len;
 
-    return memcmp(lhs->strstart, rhs->strstart, l_len);
+    return memcmp(lhs->strstart + offset, rhs->strstart, l_len);
 }
 
 /*
diff --git a/src/string/charset/unicode.c b/src/string/charset/unicode.c
index e1de74c..2ca5ac5 100644
--- a/src/string/charset/unicode.c
+++ b/src/string/charset/unicode.c
@@ -28,7 +28,8 @@ This file implements the charset functions for unicode data
 
 static INTVAL compare(PARROT_INTERP,
     ARGIN(const STRING *lhs),
-    ARGIN(const STRING *rhs))
+    ARGIN(const STRING *rhs),
+    INTVAL offset)
         __attribute__nonnull__(1)
         __attribute__nonnull__(2)
         __attribute__nonnull__(3);
@@ -637,17 +638,19 @@ titlecase_first(PARROT_INTERP, SHIM(const STRING *src))
 /*
 
 =item C<static INTVAL compare(PARROT_INTERP, const STRING *lhs, const STRING
-*rhs)>
+*rhs, INTVAL offset)>
 
 Compares two STRINGs, C<lhs> and C<rhs>. Returns -1 if C<lhs> < C<rhs>. Returns
-0 if C<lhs> = C<rhs>. Returns 1 if C<lhs> > C<rhs>.
+0 if C<lhs> = C<rhs>. Returns 1 if C<lhs> > C<rhs>.  The offset represents the
+number of characters from the start of lhs from which to begin the comparison.
 
 =cut
 
 */
 
 static INTVAL
-compare(PARROT_INTERP, ARGIN(const STRING *lhs), ARGIN(const STRING *rhs))
+compare(PARROT_INTERP, ARGIN(const STRING *lhs), ARGIN(const STRING *rhs),
+        INTVAL offset)
 {
     ASSERT_ARGS(compare)
     String_iter l_iter, r_iter;
@@ -657,12 +660,12 @@ compare(PARROT_INTERP, ARGIN(const STRING *lhs), ARGIN(const STRING *rhs))
     ENCODING_ITER_INIT(interp, lhs, &l_iter);
     ENCODING_ITER_INIT(interp, rhs, &r_iter);
 
-    l_len = lhs->strlen;
-    r_len = rhs->strlen;
+    l_len   = lhs->strlen - offset;
+    r_len   = rhs->strlen;
 
     min_len = l_len > r_len ? r_len : l_len;
 
-    for (offs = 0; offs < min_len; ++offs) {
+    for (offs = offset; offs < min_len; ++offs) {
         cl = l_iter.get_and_advance(interp, &l_iter);
         cr = r_iter.get_and_advance(interp, &r_iter);

_______________________________________________
http://lists.parrot.org/mailman/listinfo/parrot-dev

Re: [RFC] Possible PCT performance improvements.

Reply via email to