Control: tags -1 + patch

Please find attached a patch; build-tested only.

I was unsure what to do with the recursion limit code in the match
method.  In the old PCRE3, PCRE_CONFIG_STACKRECURSE is 1 because of
the recursion implementation, which is stack-based.  In PCRE2, the
corresponding parameter PCRE2_CONFIG_STACKRECURSE is obsolete and
always 0.

Furthermore, according to pcre2api(3), from version 10.32 onwards the
workspace vectors are allocated on the heap once the recursion becomes
deep enough.  It also says that only local variables are allocated on
the stack, so even a small stack can support a lot of recursion.  I
therefore concluded that limiting the stack space is unnecessary with
PCRE2.  Only runtime tests can show whether this is true, but as I am
not a Qt/KDE person I am unable to perform them, I'm afraid.

In any case, it is trivial to limit both the stack and the heap via a
match context -- just let me know and I'll make the required
modifications.

P.S.  This patch applies cleanly to the latest upstream release
      (5.113.0), but I have not build-tested it against that version.
Description: Port to PCRE2.
Bug-Debian: https://bugs.debian.org/1000112
Bug: https://bugs.kde.org/show_bug.cgi?id=457338
Author: Yavor Doganov <ya...@gnu.org>
Forwarded: no
Last-Update: 2023-12-23
---

--- kjs-5.107.0.orig/cmake/FindPCRE.cmake
+++ kjs-5.107.0/cmake/FindPCRE.cmake
@@ -11,10 +11,10 @@
 # For details see the accompanying COPYING-CMAKE-SCRIPTS file.
 
 
-if (PCRE_INCLUDE_DIR AND PCRE_PCREPOSIX_LIBRARY AND PCRE_PCRE_LIBRARY)
+if (PCRE_INCLUDE_DIR AND PCRE_PCRE_LIBRARY)
   # Already in cache, be silent
   set(PCRE_FIND_QUIETLY TRUE)
-endif (PCRE_INCLUDE_DIR AND PCRE_PCREPOSIX_LIBRARY AND PCRE_PCRE_LIBRARY)
+endif (PCRE_INCLUDE_DIR AND PCRE_PCRE_LIBRARY)
 
 
 if (NOT WIN32)
@@ -22,23 +22,21 @@
   # in the FIND_PATH() and FIND_LIBRARY() calls
   find_package(PkgConfig)
 
-  pkg_check_modules(PC_PCRE QUIET libpcre)
+  pkg_check_modules(PC_PCRE QUIET libpcre2-8)
 
   set(PCRE_DEFINITIONS ${PC_PCRE_CFLAGS_OTHER})
 
 endif (NOT WIN32)
 
-find_path(PCRE_INCLUDE_DIR pcre.h 
+find_path(PCRE_INCLUDE_DIR pcre2.h
           HINTS ${PC_PCRE_INCLUDEDIR} ${PC_PCRE_INCLUDE_DIRS} 
-          PATH_SUFFIXES pcre)
+          )
 
-find_library(PCRE_PCRE_LIBRARY NAMES pcre pcred HINTS ${PC_PCRE_LIBDIR} 
${PC_PCRE_LIBRARY_DIRS})
-
-find_library(PCRE_PCREPOSIX_LIBRARY NAMES pcreposix pcreposixd HINTS 
${PC_PCRE_LIBDIR} ${PC_PCRE_LIBRARY_DIRS})
+find_library(PCRE_PCRE_LIBRARY NAMES pcre2-8 HINTS ${PC_PCRE_LIBDIR} 
${PC_PCRE_LIBRARY_DIRS})
 
 include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(PCRE DEFAULT_MSG PCRE_INCLUDE_DIR 
PCRE_PCRE_LIBRARY PCRE_PCREPOSIX_LIBRARY )
+find_package_handle_standard_args(PCRE DEFAULT_MSG PCRE_INCLUDE_DIR 
PCRE_PCRE_LIBRARY)
 
-set(PCRE_LIBRARIES ${PCRE_PCRE_LIBRARY} ${PCRE_PCREPOSIX_LIBRARY})
+set(PCRE_LIBRARIES ${PCRE_PCRE_LIBRARY})
 
-mark_as_advanced(PCRE_INCLUDE_DIR PCRE_LIBRARIES PCRE_PCREPOSIX_LIBRARY 
PCRE_PCRE_LIBRARY)
+mark_as_advanced(PCRE_INCLUDE_DIR PCRE_LIBRARIES PCRE_PCRE_LIBRARY)
--- kjs-5.107.0.orig/src/kjs/regexp.h
+++ kjs-5.107.0/src/kjs/regexp.h
@@ -26,7 +26,8 @@
 #include "global.h"
 
 #if HAVE_PCREPOSIX
-#include <pcre.h>
+#define PCRE2_CODE_UNIT_WIDTH 8
+#include <pcre2.h>
 #else  // POSIX regex - not so good...
 extern "C" { // bug with some libc5 distributions
 #include <regex.h>
@@ -115,7 +116,7 @@
 #endif
 private:
 #if HAVE_PCREPOSIX
-    pcre *_regex;
+    pcre2_code *_regex;
 #else
     regex_t _regex;
 #endif
--- kjs-5.107.0.orig/src/kjs/regexp.cpp
+++ kjs-5.107.0/src/kjs/regexp.cpp
@@ -273,8 +273,8 @@
 #if HAVE_PCREPOSIX
     // Determine whether libpcre has unicode support if need be..
     if (utf8Support == Unknown) {
-        int supported;
-        pcre_config(PCRE_CONFIG_UTF8, (void *)&supported);
+        uint32_t supported;
+        pcre2_config(PCRE2_CONFIG_UNICODE, &supported);
         utf8Support = supported ? Supported : Unsupported;
     }
 #endif
@@ -282,50 +282,49 @@
     UString intern = sanitizePattern(p);
 
 #if HAVE_PCREPOSIX
-    int options = 0;
+    uint32_t options = 0;
 
     // we are close but not 100% the same as Perl
-#ifdef PCRE_JAVASCRIPT_COMPAT // introduced in PCRE 7.7
-    options |= PCRE_JAVASCRIPT_COMPAT;
-#endif
+    options |= (PCRE2_ALT_BSUX | PCRE2_MATCH_UNSET_BACKREF);
 
     // Note: the Global flag is already handled by RegExpProtoFunc::execute.
     // FIXME: That last comment is dubious. Not all RegExps get run through 
RegExpProtoFunc::execute.
     if (flags & IgnoreCase) {
-        options |= PCRE_CASELESS;
+        options |= PCRE2_CASELESS;
     }
     if (flags & Multiline) {
-        options |= PCRE_MULTILINE;
+        options |= PCRE2_MULTILINE;
     }
 
     if (utf8Support == Supported) {
-        options |= (PCRE_UTF8 | PCRE_NO_UTF8_CHECK);
+        options |= (PCRE2_UTF | PCRE2_NO_UTF_CHECK);
     }
 
-    const char *errorMessage;
-    int errorOffset;
+    PCRE2_UCHAR errorMessage[120];
+    PCRE2_SIZE errorOffset;
+    int errCode;
     bool secondTry = false;
 
     while (1) {
         RegExpStringContext converted(intern);
 
-        _regex = pcre_compile(converted.buffer(), options, &errorMessage, 
&errorOffset, nullptr);
+        _regex = pcre2_compile((PCRE2_SPTR)converted.buffer(), 
PCRE2_ZERO_TERMINATED, options, &errCode, &errorOffset, nullptr);
 
         if (!_regex) {
-#ifdef PCRE_JAVASCRIPT_COMPAT
             // The compilation failed. It is likely the pattern contains 
non-standard extensions.
             // We may try to tolerate some of those extensions.
             bool doRecompile = !secondTry && sanitizePatternExtensions(intern);
             if (doRecompile) {
                 secondTry = true;
 #ifndef NDEBUG
-                fprintf(stderr, "KJS: pcre_compile() failed with '%s' - 
non-standard extensions detected in pattern, trying second compile after 
correction.\n", errorMessage);
+                pcre2_get_error_message(errCode, errorMessage, 
sizeof(errorMessage));
+                fprintf(stderr, "KJS: pcre2_compile() failed with '%s' - 
non-standard extensions detected in pattern, trying second compile after 
correction.\n", errorMessage);
 #endif
                 continue;
             }
-#endif
 #ifndef NDEBUG
-            fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", 
errorMessage);
+            pcre2_get_error_message(errCode, errorMessage, 
sizeof(errorMessage));
+            fprintf(stderr, "KJS: pcre2_compile() failed with '%s'\n", 
errorMessage);
 #endif
             _valid = false;
             return;
@@ -333,10 +332,8 @@
         break;
     }
 
-#ifdef PCRE_INFO_CAPTURECOUNT
     // Get number of subpatterns that will be returned.
-    pcre_fullinfo(_regex, nullptr, PCRE_INFO_CAPTURECOUNT, &_numSubPatterns);
-#endif
+    pcre2_pattern_info(_regex, PCRE2_INFO_CAPTURECOUNT, &_numSubPatterns);
 
 #else /* HAVE_PCREPOSIX */
 
@@ -370,7 +367,7 @@
 RegExp::~RegExp()
 {
 #if HAVE_PCREPOSIX
-    pcre_free(_regex);
+    pcre2_code_free(_regex);
 #else
     /* TODO: is this really okay after an error ? */
     regfree(&_regex);
@@ -490,6 +487,8 @@
 
     // Set up the offset vector for the result.
     // First 2/3 used for result, the last third used by PCRE.
+    pcre2_match_data *matchData;
+    PCRE2_SIZE *matchVector;
     int *offsetVector;
     int offsetVectorSize;
     int fixedSizeOffsetVector[3];
@@ -500,6 +499,7 @@
         offsetVectorSize = (_numSubPatterns + 1) * 3;
         offsetVector = new int [offsetVectorSize];
     }
+    matchData = pcre2_match_data_create(offsetVectorSize, nullptr);
 
     int startPos;
     if (utf8Support == Supported) {
@@ -511,44 +511,17 @@
         startPos = i;
     }
 
-    int baseFlags = utf8Support == Supported ? PCRE_NO_UTF8_CHECK : 0;
+    uint32_t baseFlags = utf8Support == Supported ? PCRE2_NO_UTF_CHECK : 0;
 
-    // See if we have to limit stack space...
     *error = false;
-    int stackGlutton = 0;
-    pcre_config(PCRE_CONFIG_STACKRECURSE, (void *)&stackGlutton);
-    pcre_extra limits;
-    if (stackGlutton) {
-#if HAVE(SYS_TIME_H)
-        if (tryGrowingMaxStackSize) {
-            rlimit l;
-            getrlimit(RLIMIT_STACK, &l);
-            availableStackSize = l.rlim_cur;
-            if (l.rlim_cur < sWantedStackSizeLimit &&
-                    (l.rlim_max > l.rlim_cur || l.rlim_max == RLIM_INFINITY)) {
-                l.rlim_cur = (l.rlim_max == RLIM_INFINITY) ?
-                             sWantedStackSizeLimit : std::min(l.rlim_max, 
sWantedStackSizeLimit);
-                if ((didIncreaseMaxStackSize = !setrlimit(RLIMIT_STACK, &l))) {
-                    availableStackSize = l.rlim_cur;
-                }
-            }
-            tryGrowingMaxStackSize = false;
-        }
-#endif
-
-        limits.flags = PCRE_EXTRA_MATCH_LIMIT_RECURSION;
-        // libPCRE docs claim that it munches about 500 bytes per recursion.
-        // The crash in #160792 actually showed pcre 7.4 using about 1300 bytes
-        // (and I've measured 800 in an another instance)
-        // We go somewhat conservative, and use about 3/4ths of that,
-        // especially since we're not exactly light on the stack, either
-        limits.match_limit_recursion = (availableStackSize / 1300) * 3 / 4;
-    }
 
-    const int numMatches = pcre_exec(_regex, stackGlutton ? &limits : nullptr, 
ctx.buffer(),
-                                     ctx.bufferSize(), startPos, baseFlags, 
offsetVector, offsetVectorSize);
+    const int numMatches = pcre2_match(_regex, (PCRE2_SPTR)ctx.buffer(),
+                                       ctx.bufferSize(), startPos, baseFlags, 
matchData, nullptr);
 
     //Now go through and patch up the offsetVector
+    matchVector = pcre2_get_ovector_pointer(matchData);
+    for (int j = 0; j < offsetVectorSize; ++j)
+        offsetVector[j] = (int)matchVector[j];
     if (utf8Support == Supported)
         for (int c = 0; c < 2 * numMatches; ++c)
             if (offsetVector[c] != -1) {
@@ -557,14 +530,15 @@
 
     if (numMatches < 0) {
 #ifndef NDEBUG
-        if (numMatches != PCRE_ERROR_NOMATCH) {
-            fprintf(stderr, "KJS: pcre_exec() failed with result %d\n", 
numMatches);
+        if (numMatches != PCRE2_ERROR_NOMATCH) {
+            fprintf(stderr, "KJS: pcre2_match() failed with result %d\n", 
numMatches);
         }
 #endif
         if (offsetVector != fixedSizeOffsetVector) {
             delete [] offsetVector;
         }
-        if (numMatches == PCRE_ERROR_MATCHLIMIT || numMatches == 
PCRE_ERROR_RECURSIONLIMIT) {
+        pcre2_match_data_free(matchData);
+        if (numMatches == PCRE2_ERROR_MATCHLIMIT || numMatches == 
PCRE2_ERROR_DEPTHLIMIT) {
             *error = true;
         }
         return UString::null();
@@ -574,6 +548,7 @@
     if (ovector) {
         *ovector = offsetVector;
     }
+    pcre2_match_data_free(matchData);
     return s.substr(offsetVector[0], offsetVector[1] - offsetVector[0]);
 
 #else
--- kjs-5.107.0.orig/src/kjs/CMakeLists.txt
+++ kjs-5.107.0/src/kjs/CMakeLists.txt
@@ -12,25 +12,7 @@
 
 # the check for pcre is in ../CMakeLists.txt
 if(PCRE_FOUND AND NOT KJS_FORCE_DISABLE_PCRE)
-   include_directories(${PCRE_INCLUDE_DIR})
-
-   # tell check_symbol_exists to -I pcre dirs.
-   include(CMakePushCheckState)
-   cmake_push_check_state()
-   set(CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES} ${PCRE_INCLUDE_DIR})
-
-   check_symbol_exists(PCRE_CONFIG_UTF8 "pcre.h" HAVE_PCRE_UTF8)
-   check_symbol_exists(PCRE_CONFIG_STACKRECURSE "pcre.h" HAVE_PCRE_STACK)
-
-   cmake_pop_check_state()
-
-   # Even though we "support" non-PCRE builds, if we build PCRE, we want a 
version
-   # recent enough, and we don't want to fallback to a completely crippled
-   # POSIX code just like that.
-   if (NOT HAVE_PCRE_UTF8  OR NOT  HAVE_PCRE_STACK)
-      message(FATAL_ERROR "Your libPCRE is too old. KJS requires at least 
PCRE4.5")
-   endif ()
-
+  message(STATUS "Using PCRE2")
 else ()
    # if we're here, either PCRE support is disabled, or it's not found...
    # it better be disabled.

Reply via email to