X-Debbugs-CC: ya...@gnu.org On Thu, 18 Nov 2021 11:49:06 +0000 Matthew Vernon <matt...@debian.org> wrote: > Source: parser > Severity: important > User: matthew-pcre...@debian.org > Usertags: obsolete-pcre3 > > Dear maintainer, > > Your package still depends on the old, obsolete PCRE3[0] libraries > (i.e. libpcre3-dev). This has been end of life for a while now, and > upstream do not intend to fix any further bugs in it. Accordingly, I > would like to remove the pcre3 libraries from Debian, preferably in > time for the release of Bookworm. > > The newer PCRE2 library was first released in 2015, and has been in > Debian since stretch. Upstream's documentation for PCRE2 is available > here: https://pcre.org/current/doc/html/ > > Many large projects that use PCRE have made the switch now (e.g. git, > php); it does involve some work, but we are now at the stage where > PCRE3 should not be used, particularly if it might ever be exposed to > untrusted input. > > This mass bug filing was discussed on debian-devel@ in > https://lists.debian.org/debian-devel/2021/11/msg00176.html > > Regards, > > Matthew [0] Historical reasons mean that old PCRE is packaged as > pcre3 in Debian
I am aware of the work at https://bugs.debian.org/1057281 , but unfortunately I am unable to review the patch at the moment. In order to prevent the loss of the proposed patch, I am including it as an email attachment here. Thanks, Boyuan Yang
Description: Port to PCRE2. Bug-Debian: https://bugs.debian.org/1000006 Author: Yavor Doganov <ya...@gnu.org> Forwarded: mailto:mail...@parser.ru Last-Update: 2023-11-29 --- --- parser-3.4.6.orig/configure.ac +++ parser-3.4.6/configure.ac @@ -184,20 +184,20 @@ PCRE_INCLUDES="-I$PCRE/include" PCRE_LIBS="$PCRE/lib/libpcre.la" - if test -f $PCRE/include/pcre.h -a -f $PCRE_LIBS; then + if test -f $PCRE/include/pcre2.h -a -f $PCRE_LIBS; then PCRE_OK="yes" else - PCRE_LIBS="-L$PCRE/lib -lpcre" + PCRE_LIBS="-L$PCRE/lib -lpcre2-8" fi if test "$PCRE" = "yes"; then PCRE="" - PCRE_LIBS="-lpcre" + PCRE_LIBS="-lpcre2-8" PCRE_INCLUDES="" AC_MSG_WARN([--with-pcre value was not specified, hoping linker would find it]) fi ],[ - PCRE_LIBS="-lpcre" + PCRE_LIBS="-lpcre2-8" PCRE_INCLUDES="" AC_MSG_WARN([--with-pcre was not specified, hoping linker would find it]) ]) @@ -206,16 +206,21 @@ AC_MSG_CHECKING(for prce) SAVE_LIBS=$LIBS LIBS="$LIBS $PCRE_LIBS $PCRE_INCLUDES" - AC_TRY_LINK([ #include <pcre.h> ],[ const char *v=pcre_version(); ], - AC_MSG_RESULT(yes) + AC_LINK_IFELSE( + [AC_LANG_PROGRAM([[#define PCRE2_CODE_UNIT_WIDTH 8 + #include <pcre2.h>]], + [[uint32_t ov=16; + pcre2_match_data *md; + md=pcre2_match_data_create(ov, NULL);]])], + [AC_MSG_RESULT([yes])] , - AC_MSG_RESULT(no) + [AC_MSG_RESULT([no]) if test -z "$PCRE"; then AC_MSG_ERROR(please specify path to PCRE: --with-pcre=DIR) else AC_MSG_ERROR($PCRE does not seem to be valid PCRE installation directory) fi - ) + ]) LIBS=$SAVE_LIBS fi --- parser-3.4.6.orig/src/include/pa_charset.h +++ parser-3.4.6/src/include/pa_charset.h @@ -16,7 +16,8 @@ #include "pa_hash.h" #include "pa_array.h" -#include "pcre.h" +#define PCRE2_CODE_UNIT_WIDTH 8 +#include <pcre2.h> // we are using some pcre_internal.h stuff as well #include "../lib/pcre/pa_pcre_internal.h" --- parser-3.4.6.orig/src/lib/pcre/pa_pcre_valid_utf8.c +++ parser-3.4.6/src/lib/pcre/pa_pcre_valid_utf8.c @@ -6,7 +6,8 @@ and semantics are as close as possible to those of the Perl 
5 language. Written by Philip Hazel - Copyright (c) 1997-2012 University of Cambridge + Original API code Copyright (c) 1997-2012 University of Cambridge + New API code Copyright (c) 2016-2020 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -38,112 +39,134 @@ */ -/* This module contains an internal function for validating UTF-8 character -strings. */ - -#include "pcre.h" +/* This module contains an internal function for validating UTF character +strings. This file is also #included by the pcre2test program, which uses +macros to change names from _pcre2_xxx to xxxx, thereby avoiding name clashes +with the library. In this case, PCRE2_PCRE2TEST is defined. */ + +#define SUPPORT_UNICODE +#define PCRE2_CODE_UNIT_WIDTH 8 +#include <pcre2.h> #include "pa_pcre_internal.h" +static const uint8_t utf8_table4[] = { + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 }; + +#ifndef SUPPORT_UNICODE +/************************************************* +* Dummy function when Unicode is not supported * +*************************************************/ + +/* This function should never be called when Unicode is not supported. */ + +int +PRIV(valid_utf)(PCRE2_SPTR string, PCRE2_SIZE length, PCRE2_SIZE *erroroffset) +{ +(void)string; +(void)length; +(void)erroroffset; +return 0; +} +#else /* UTF is supported */ + + /************************************************* -* Validate a UTF-8 string * +* Validate a UTF string * *************************************************/ /* This function is called (optionally) at the start of compile or match, to -check that a supposed UTF-8 string is actually valid. The early check means +check that a supposed UTF string is actually valid. The early check means that subsequent code can assume it is dealing with a valid string. 
The check can be turned off for maximum performance, but the consequences of supplying an invalid string are then undefined. -Originally, this function checked according to RFC 2279, allowing for values in -the range 0 to 0x7fffffff, up to 6 bytes long, but ensuring that they were in -the canonical format. Once somebody had pointed out RFC 3629 to me (it -obsoletes 2279), additional restrictions were applied. The values are now -limited to be between 0 and 0x0010ffff, no more than 4 bytes long, and the -subrange 0xd000 to 0xdfff is excluded. However, the format of 5-byte and 6-byte -characters is still checked. - -From release 8.13 more information about the details of the error are passed -back in the returned value: - -PCRE_UTF8_ERR0 No error -PCRE_UTF8_ERR1 Missing 1 byte at the end of the string -PCRE_UTF8_ERR2 Missing 2 bytes at the end of the string -PCRE_UTF8_ERR3 Missing 3 bytes at the end of the string -PCRE_UTF8_ERR4 Missing 4 bytes at the end of the string -PCRE_UTF8_ERR5 Missing 5 bytes at the end of the string -PCRE_UTF8_ERR6 2nd-byte's two top bits are not 0x80 -PCRE_UTF8_ERR7 3rd-byte's two top bits are not 0x80 -PCRE_UTF8_ERR8 4th-byte's two top bits are not 0x80 -PCRE_UTF8_ERR9 5th-byte's two top bits are not 0x80 -PCRE_UTF8_ERR10 6th-byte's two top bits are not 0x80 -PCRE_UTF8_ERR11 5-byte character is not permitted by RFC 3629 -PCRE_UTF8_ERR12 6-byte character is not permitted by RFC 3629 -PCRE_UTF8_ERR13 4-byte character with value > 0x10ffff is not permitted -PCRE_UTF8_ERR14 3-byte character with value 0xd000-0xdfff is not permitted -PCRE_UTF8_ERR15 Overlong 2-byte sequence -PCRE_UTF8_ERR16 Overlong 3-byte sequence -PCRE_UTF8_ERR17 Overlong 4-byte sequence -PCRE_UTF8_ERR18 Overlong 5-byte sequence (won't ever occur) -PCRE_UTF8_ERR19 Overlong 6-byte sequence (won't ever occur) -PCRE_UTF8_ERR20 Isolated 0x80 byte (not within UTF-8 character) -PCRE_UTF8_ERR21 Byte with the illegal value 0xfe or 0xff - Arguments: string points to the string - 
length length of string, or -1 if the string is zero-terminated + length length of string errp pointer to an error position offset variable -Returns: = 0 if the string is a valid UTF-8 string - > 0 otherwise, setting the offset of the bad character +Returns: == 0 if the string is a valid UTF string + != 0 otherwise, setting the offset of the bad character */ -typedef unsigned char *PCRE_PUCHAR; -#define SUPPORT_UTF +int +pa_pcre_valid_utf(PCRE2_SPTR string, int length, int *erroroffset) +{ +PCRE2_SPTR p; +uint32_t c; -static const unsigned char utf8_table4[] = { - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 }; +/* ----------------- Check a UTF-8 string ----------------- */ +#if PCRE2_CODE_UNIT_WIDTH == 8 -int -pa_pcre_valid_utf(PCRE_PUCHAR string, int length, int *erroroffset) -{ -#ifdef SUPPORT_UTF -register PCRE_PUCHAR p; +/* Originally, this function checked according to RFC 2279, allowing for values +in the range 0 to 0x7fffffff, up to 6 bytes long, but ensuring that they were +in the canonical format. Once somebody had pointed out RFC 3629 to me (it +obsoletes 2279), additional restrictions were applied. The values are now +limited to be between 0 and 0x0010ffff, no more than 4 bytes long, and the +subrange 0xd000 to 0xdfff is excluded. However, the format of 5-byte and 6-byte +characters is still checked. 
Error returns are as follows: -if (length < 0) - { - for (p = string; *p != 0; p++); - length = (int)(p - string); - } +PCRE2_ERROR_UTF8_ERR1 Missing 1 byte at the end of the string +PCRE2_ERROR_UTF8_ERR2 Missing 2 bytes at the end of the string +PCRE2_ERROR_UTF8_ERR3 Missing 3 bytes at the end of the string +PCRE2_ERROR_UTF8_ERR4 Missing 4 bytes at the end of the string +PCRE2_ERROR_UTF8_ERR5 Missing 5 bytes at the end of the string +PCRE2_ERROR_UTF8_ERR6 2nd-byte's two top bits are not 0x80 +PCRE2_ERROR_UTF8_ERR7 3rd-byte's two top bits are not 0x80 +PCRE2_ERROR_UTF8_ERR8 4th-byte's two top bits are not 0x80 +PCRE2_ERROR_UTF8_ERR9 5th-byte's two top bits are not 0x80 +PCRE2_ERROR_UTF8_ERR10 6th-byte's two top bits are not 0x80 +PCRE2_ERROR_UTF8_ERR11 5-byte character is not permitted by RFC 3629 +PCRE2_ERROR_UTF8_ERR12 6-byte character is not permitted by RFC 3629 +PCRE2_ERROR_UTF8_ERR13 4-byte character with value > 0x10ffff is not permitted +PCRE2_ERROR_UTF8_ERR14 3-byte character with value 0xd800-0xdfff is not permitted +PCRE2_ERROR_UTF8_ERR15 Overlong 2-byte sequence +PCRE2_ERROR_UTF8_ERR16 Overlong 3-byte sequence +PCRE2_ERROR_UTF8_ERR17 Overlong 4-byte sequence +PCRE2_ERROR_UTF8_ERR18 Overlong 5-byte sequence (won't ever occur) +PCRE2_ERROR_UTF8_ERR19 Overlong 6-byte sequence (won't ever occur) +PCRE2_ERROR_UTF8_ERR20 Isolated 0x80 byte (not within UTF-8 character) +PCRE2_ERROR_UTF8_ERR21 Byte with the illegal value 0xfe or 0xff +*/ -for (p = string; length-- > 0; p++) +for (p = string; length > 0; p++) { - register int ab, c, d; + uint32_t ab, d; c = *p; + length--; + if (c < 128) continue; /* ASCII character */ if (c < 0xc0) /* Isolated 10xx xxxx byte */ { - *erroroffset = (int)(p - string); - return PCRE_UTF8_ERR20; + *erroroffset = p - string; + return PCRE2_ERROR_UTF8_ERR20; } if (c >= 0xfe) /* Invalid 0xfe or 0xff bytes */ { - *erroroffset = (int)(p - string); - return PCRE_UTF8_ERR21; + *erroroffset = p - string; + return PCRE2_ERROR_UTF8_ERR21; } - 
ab = utf8_table4[c & 0x3f]; /* Number of additional bytes */ - if (length < ab) + ab = utf8_table4[c & 0x3f]; /* Number of additional bytes (1-5) */ + if (length < ab) /* Missing bytes */ { - *erroroffset = (int)(p - string); /* Missing bytes */ - return ab - length; /* Codes ERR1 to ERR5 */ + *erroroffset = p - string; + switch(ab - length) + { + case 1: return PCRE2_ERROR_UTF8_ERR1; + case 2: return PCRE2_ERROR_UTF8_ERR2; + case 3: return PCRE2_ERROR_UTF8_ERR3; + case 4: return PCRE2_ERROR_UTF8_ERR4; + case 5: return PCRE2_ERROR_UTF8_ERR5; + } } length -= ab; /* Length remaining */ @@ -152,7 +175,7 @@ if (((d = *(++p)) & 0xc0) != 0x80) { *erroroffset = (int)(p - string) - 1; - return PCRE_UTF8_ERR6; + return PCRE2_ERROR_UTF8_ERR6; } /* For each length, check that the remaining bytes start with the 0x80 bit @@ -167,7 +190,7 @@ case 1: if ((c & 0x3e) == 0) { *erroroffset = (int)(p - string) - 1; - return PCRE_UTF8_ERR15; + return PCRE2_ERROR_UTF8_ERR15; } break; @@ -179,17 +202,17 @@ if ((*(++p) & 0xc0) != 0x80) /* Third byte */ { *erroroffset = (int)(p - string) - 2; - return PCRE_UTF8_ERR7; + return PCRE2_ERROR_UTF8_ERR7; } if (c == 0xe0 && (d & 0x20) == 0) { *erroroffset = (int)(p - string) - 2; - return PCRE_UTF8_ERR16; + return PCRE2_ERROR_UTF8_ERR16; } if (c == 0xed && d >= 0xa0) { *erroroffset = (int)(p - string) - 2; - return PCRE_UTF8_ERR14; + return PCRE2_ERROR_UTF8_ERR14; } break; @@ -201,22 +224,22 @@ if ((*(++p) & 0xc0) != 0x80) /* Third byte */ { *erroroffset = (int)(p - string) - 2; - return PCRE_UTF8_ERR7; + return PCRE2_ERROR_UTF8_ERR7; } if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */ { *erroroffset = (int)(p - string) - 3; - return PCRE_UTF8_ERR8; + return PCRE2_ERROR_UTF8_ERR8; } if (c == 0xf0 && (d & 0x30) == 0) { *erroroffset = (int)(p - string) - 3; - return PCRE_UTF8_ERR17; + return PCRE2_ERROR_UTF8_ERR17; } if (c > 0xf4 || (c == 0xf4 && d > 0x8f)) { *erroroffset = (int)(p - string) - 3; - return PCRE_UTF8_ERR13; + return 
PCRE2_ERROR_UTF8_ERR13; } break; @@ -232,22 +255,22 @@ if ((*(++p) & 0xc0) != 0x80) /* Third byte */ { *erroroffset = (int)(p - string) - 2; - return PCRE_UTF8_ERR7; + return PCRE2_ERROR_UTF8_ERR7; } if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */ { *erroroffset = (int)(p - string) - 3; - return PCRE_UTF8_ERR8; + return PCRE2_ERROR_UTF8_ERR8; } if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */ { *erroroffset = (int)(p - string) - 4; - return PCRE_UTF8_ERR9; + return PCRE2_ERROR_UTF8_ERR9; } if (c == 0xf8 && (d & 0x38) == 0) { *erroroffset = (int)(p - string) - 4; - return PCRE_UTF8_ERR18; + return PCRE2_ERROR_UTF8_ERR18; } break; @@ -258,27 +281,27 @@ if ((*(++p) & 0xc0) != 0x80) /* Third byte */ { *erroroffset = (int)(p - string) - 2; - return PCRE_UTF8_ERR7; + return PCRE2_ERROR_UTF8_ERR7; } if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */ { *erroroffset = (int)(p - string) - 3; - return PCRE_UTF8_ERR8; + return PCRE2_ERROR_UTF8_ERR8; } if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */ { *erroroffset = (int)(p - string) - 4; - return PCRE_UTF8_ERR9; + return PCRE2_ERROR_UTF8_ERR9; } if ((*(++p) & 0xc0) != 0x80) /* Sixth byte */ { *erroroffset = (int)(p - string) - 5; - return PCRE_UTF8_ERR10; + return PCRE2_ERROR_UTF8_ERR10; } if (c == 0xfc && (d & 0x3c) == 0) { *erroroffset = (int)(p - string) - 5; - return PCRE_UTF8_ERR19; + return PCRE2_ERROR_UTF8_ERR19; } break; } @@ -290,16 +313,89 @@ if (ab > 3) { *erroroffset = (int)(p - string) - ab; - return (ab == 4)? PCRE_UTF8_ERR11 : PCRE_UTF8_ERR12; + return (ab == 4)? PCRE2_ERROR_UTF8_ERR11 : PCRE2_ERROR_UTF8_ERR12; + } + } +return 0; + + +/* ----------------- Check a UTF-16 string ----------------- */ + +#elif PCRE2_CODE_UNIT_WIDTH == 16 + +/* There's not so much work, nor so many errors, for UTF-16. 
+PCRE2_ERROR_UTF16_ERR1 Missing low surrogate at the end of the string +PCRE2_ERROR_UTF16_ERR2 Invalid low surrogate +PCRE2_ERROR_UTF16_ERR3 Isolated low surrogate +*/ + +for (p = string; length > 0; p++) + { + c = *p; + length--; + + if ((c & 0xf800) != 0xd800) + { + /* Normal UTF-16 code point. Neither high nor low surrogate. */ + } + else if ((c & 0x0400) == 0) + { + /* High surrogate. Must be a followed by a low surrogate. */ + if (length == 0) + { + *erroroffset = p - string; + return PCRE2_ERROR_UTF16_ERR1; + } + p++; + length--; + if ((*p & 0xfc00) != 0xdc00) + { + *erroroffset = p - string - 1; + return PCRE2_ERROR_UTF16_ERR2; + } + } + else + { + /* Isolated low surrogate. Always an error. */ + *erroroffset = p - string; + return PCRE2_ERROR_UTF16_ERR3; } } +return 0; + -#else /* SUPPORT_UTF */ -(void)(string); /* Keep picky compilers happy */ -(void)(length); -#endif -return PCRE_UTF8_ERR0; /* This indicates success */ +/* ----------------- Check a UTF-32 string ----------------- */ + +#else + +/* There is very little to do for a UTF-32 string. +PCRE2_ERROR_UTF32_ERR1 Surrogate character +PCRE2_ERROR_UTF32_ERR2 Character > 0x10ffff +*/ + +for (p = string; length > 0; length--, p++) + { + c = *p; + if ((c & 0xfffff800u) != 0xd800u) + { + /* Normal UTF-32 code point. Neither high nor low surrogate. 
*/ + if (c > 0x10ffffu) + { + *erroroffset = p - string; + return PCRE2_ERROR_UTF32_ERR2; + } + } + else + { + /* A surrogate */ + *erroroffset = p - string; + return PCRE2_ERROR_UTF32_ERR1; + } + } +return 0; +#endif /* CODE_UNIT_WIDTH */ } +#endif /* SUPPORT_UNICODE */ -/* End of pcre_valid_utf8.c */ +/* End of pcre2_valid_utf.c */ --- parser-3.4.6.orig/src/main/pa_common.C +++ parser-3.4.6/src/main/pa_common.C @@ -12,7 +12,8 @@ #include "pa_charsets.h" #include "pa_http.h" #include "pa_request_charsets.h" -#include "pcre.h" +#define PCRE2_CODE_UNIT_WIDTH 8 +#include <pcre2.h> #include "pa_request.h" #include "pa_idna.h" --- parser-3.4.6.orig/src/main/pa_globals.C +++ parser-3.4.6/src/main/pa_globals.C @@ -26,7 +26,6 @@ #include "pa_cache_managers.h" #include "ltdl.h" -#include "pcre.h" volatile const char * IDENT_PA_GLOBALS_C="$Id: pa_globals.C,v 1.212 2021/01/16 15:47:05 moko Exp $" IDENT_PA_GLOBALS_H IDENT_PA_SAPI_H; @@ -206,10 +205,6 @@ #endif - // pcre - pcre_malloc=pa_malloc; - pcre_free=pa_free; - // cord CORD_oom_fn=pa_CORD_oom_fn; } --- parser-3.4.6.orig/src/types/pa_vregex.h +++ parser-3.4.6/src/types/pa_vregex.h @@ -16,7 +16,8 @@ #include "pa_common.h" #include "pa_vstateless_object.h" #include "pa_charset.h" -#include "pcre.h" +#define PCRE2_CODE_UNIT_WIDTH 8 +#include <pcre2.h> // defines @@ -62,26 +63,27 @@ fpattern(0), foptions_cstr(0), fcode(0), - fextra(0), - fstudied(false) + fgen_ctxt(0), + fcmp_ctxt(0), + fmatch_ctxt(0), + fmatch_data(0) { foptions[0]=0; foptions[1]=0; } - VRegex(Charset& acharset, const String* aregex, const String* aoptions): - fextra(0), - fstudied(false) + VRegex(Charset& acharset, const String* aregex, const String* aoptions) { set(acharset, aregex, aoptions); compile(); } ~VRegex(){ - if(fextra) - pcre_free(fextra); - if(fcode) - pcre_free(fcode); + pcre2_code_free(fcode); + pcre2_match_data_free(fmatch_data); + pcre2_match_context_free(fmatch_ctxt); + pcre2_compile_context_free(fcmp_ctxt); + 
pcre2_general_context_free(fgen_ctxt); } void set(Charset& acharset, const String* aregex, const String* aoptions); @@ -89,18 +91,16 @@ void compile(); - void study(); - - int exec(const char* string, size_t string_len, int* ovector, int ovector_size, int prestart=0); + int exec(const char* string, size_t string_len, int prestart=0); // size_t info(); + size_t* get_ovector_ptr(); + size_t full_info(int type); size_t get_info_size(); - size_t get_study_size(); - size_t get_options(); bool is_pre_post_match_needed(){ @@ -120,13 +120,15 @@ private: Charset* fcharset; - const char* fpattern; + PCRE2_SPTR fpattern; const char* foptions_cstr; int foptions[2]; - pcre* fcode; - pcre_extra* fextra; - bool fstudied; + pcre2_code* fcode; + pcre2_general_context* fgen_ctxt; + pcre2_compile_context* fcmp_ctxt; + pcre2_match_context* fmatch_ctxt; + pcre2_match_data* fmatch_data; }; --- parser-3.4.6.orig/src/classes/file.C +++ parser-3.4.6/src/classes/file.C @@ -711,7 +711,6 @@ } else if(vfilter->is_string()) { if(!vfilter->get_string()->trim().is_empty()) { vregex=new VRegex(r.charsets.source(), &vfilter->as_string(), 0/*options*/); - vregex->study(); vrcleaner.vregex=vregex; } } else { @@ -726,14 +725,11 @@ Table::Action_options table_options; Table& table=*new Table(file_list_table_template, table_options); - const int ovector_size=(1/*match*/)*3; - int ovector[ovector_size]; - LOAD_DIR(absolute_path_cstr, const char* file_name_cstr=ffblk.name(); size_t file_name_size=strlen(file_name_cstr); - if(!vregex || vregex->exec(file_name_cstr, file_name_size, ovector, ovector_size)>=0) { + if(!vregex || vregex->exec(file_name_cstr, file_name_size)>=0) { Table::element_type row(new ArrayString); *row+=new String(pa_strdup(file_name_cstr, file_name_size), String::L_TAINTED); *row+=new String(String::Body::Format(ffblk.is_dir(stat) ? 
1 : 0), String::L_CLEAN); --- parser-3.4.6.orig/src/main/pa_string.C +++ parser-3.4.6/src/main/pa_string.C @@ -652,8 +652,7 @@ const char* subject=cstr(); size_t subject_length=length(); - const int ovector_size=(1/*match*/+MAX_MATCH_GROUPS)*3; - int ovector[ovector_size]; + size_t* ovector; Table::Action_options table_options; Table& table=*new Table(string_match_table_template, table_options); @@ -662,11 +661,12 @@ int poststart=0; int postfinish=length(); while(true) { - int exec_result=vregex->exec(subject, subject_length, ovector, ovector_size, prestart); + int exec_result=vregex->exec(subject, subject_length, prestart); if(exec_result<0) // only PCRE_ERROR_NOMATCH might be here, other negative results cause an exception break; + ovector=vregex->get_ovector_ptr(); int prefinish=ovector[0]; poststart=ovector[1]; --- parser-3.4.6.orig/src/types/pa_vregex.C +++ parser-3.4.6/src/types/pa_vregex.C @@ -19,8 +19,28 @@ const char* get_pcre_exec_error_text(int exec_result){ switch(exec_result){ - case PCRE_ERROR_BADUTF8: - case PCRE_ERROR_BADUTF8_OFFSET: + case PCRE2_ERROR_UTF8_ERR1: + case PCRE2_ERROR_UTF8_ERR2: + case PCRE2_ERROR_UTF8_ERR3: + case PCRE2_ERROR_UTF8_ERR4: + case PCRE2_ERROR_UTF8_ERR5: + case PCRE2_ERROR_UTF8_ERR6: + case PCRE2_ERROR_UTF8_ERR7: + case PCRE2_ERROR_UTF8_ERR8: + case PCRE2_ERROR_UTF8_ERR9: + case PCRE2_ERROR_UTF8_ERR10: + case PCRE2_ERROR_UTF8_ERR11: + case PCRE2_ERROR_UTF8_ERR12: + case PCRE2_ERROR_UTF8_ERR13: + case PCRE2_ERROR_UTF8_ERR14: + case PCRE2_ERROR_UTF8_ERR15: + case PCRE2_ERROR_UTF8_ERR16: + case PCRE2_ERROR_UTF8_ERR17: + case PCRE2_ERROR_UTF8_ERR18: + case PCRE2_ERROR_UTF8_ERR19: + case PCRE2_ERROR_UTF8_ERR20: + case PCRE2_ERROR_UTF8_ERR21: + case PCRE2_ERROR_BADUTFOFFSET: return "UTF-8 validation failed during pcre_exec (%d)."; break; default: @@ -28,6 +48,15 @@ } } +static void* +pa_pcre_malloc(size_t size, void *ptr){ + return pa_malloc(size); +} + +static void +pa_pcre_free(void *ptr, void *tag){ + pa_free(ptr); +} Value& 
VRegex::as_expr_result() { return *new VInt(as_int()); @@ -41,19 +70,18 @@ int set; int *result; } regex_option[]={ - {"i", "I", 0, PCRE_CASELESS, result}, // a=A - {"s", "S", 0, PCRE_DOTALL, result}, // ^\n\n$ [default] - {"m", "M", PCRE_DOTALL, PCRE_MULTILINE, result}, // ^aaa\n$^bbb\n$ - {"x", 0, 0, PCRE_EXTENDED, result}, // whitespace in regex ignored - {"U", 0, 0, PCRE_UNGREEDY, result}, // ungreedy patterns (greedy by default) + {"i", "I", 0, PCRE2_CASELESS, result}, // a=A + {"s", "S", 0, PCRE2_DOTALL, result}, // ^\n\n$ [default] + {"m", "M", PCRE2_DOTALL, PCRE2_MULTILINE, result}, // ^aaa\n$^bbb\n$ + {"x", 0, 0, PCRE2_EXTENDED, result}, // whitespace in regex ignored + {"U", 0, 0, PCRE2_UNGREEDY, result}, // ungreedy patterns (greedy by default) {"g", "G", 0, MF_GLOBAL_SEARCH, result+1}, // many rows {"'", 0, 0, MF_NEED_PRE_POST_MATCH, result+1}, {"n", 0, 0, MF_JUST_COUNT_MATCHES, result+1}, {0, 0, 0, 0, 0} }; - result[0]=PCRE_EXTRA /* backslash+non-special char causes error */ - | PCRE_DOTALL /* dot matches all chars including newline char */ - | PCRE_DOLLAR_ENDONLY /* dollar matches only end of string, but not newline chars */; + result[0]=PCRE2_DOTALL /* dot matches all chars including newline char */ + | PCRE2_DOLLAR_ENDONLY /* dollar matches only end of string, but not newline chars */; result[1]=0; if(options && !options->is_empty()){ @@ -79,7 +107,7 @@ fcharset=&acharset; - fpattern=aregex->untaint_cstr(String::L_REGEX); + fpattern=reinterpret_cast<const unsigned char*>(aregex->untaint_cstr(String::L_REGEX)); foptions_cstr=aoptions ? 
aoptions->cstr() : 0; @@ -99,34 +127,49 @@ void VRegex::compile(){ - const char* err_ptr; - int err_offset; + int err; + size_t err_offset; int options=foptions[0]; // @todo (for UTF-8): check string & pattern and use PCRE_NO_UTF8_CHECK option if(fcharset->isUTF8()) - options |= (PCRE_UTF8 | PCRE_UCP); + options |= (PCRE2_UTF | PCRE2_UCP); + + if(!fgen_ctxt) + fgen_ctxt=pcre2_general_context_create(pa_pcre_malloc, pa_pcre_free, NULL); + + if(!fcmp_ctxt) + fcmp_ctxt=pcre2_compile_context_create(fgen_ctxt); - fcode=pcre_compile(fpattern, options, - &err_ptr, &err_offset, - fcharset->pcre_tables); + pcre2_set_character_tables(fcmp_ctxt, fcharset->pcre_tables); + fcode=pcre2_compile(fpattern, PCRE2_ZERO_TERMINATED, options, + &err, &err_offset, + fcmp_ctxt); if(!fcode){ + PCRE2_UCHAR buffer[120]; + + pcre2_get_error_message(err, buffer, sizeof(buffer)); throw Exception(PCRE_EXCEPTION_TYPE, - new String(fpattern+err_offset, String::L_TAINTED), - "regular expression syntax error - %s", err_ptr); + new String(reinterpret_cast<const char*>(fpattern+err_offset), String::L_TAINTED), + "regular expression syntax error - %s", buffer); } } +size_t* VRegex::get_ovector_ptr(){ + return pcre2_get_ovector_pointer(fmatch_data); +} + + size_t VRegex::full_info(int type){ size_t result; - int fullinfo_result=pcre_fullinfo(fcode, fextra, type, &result); + int fullinfo_result=pcre2_pattern_info(fcode, type, &result); if(fullinfo_result<0){ throw Exception(PCRE_EXCEPTION_TYPE, - new String(fpattern, String::L_TAINTED), - "pcre_full_info error (%d)", fullinfo_result); + new String(reinterpret_cast<const char*>(fpattern), String::L_TAINTED), + "pcre2_pattern_info error (%d)", fullinfo_result); } return result; @@ -134,39 +177,24 @@ size_t VRegex::get_info_size(){ - return full_info(PCRE_INFO_SIZE); + return full_info(PCRE2_INFO_SIZE); } -size_t VRegex::get_study_size(){ - return full_info(PCRE_INFO_STUDYSIZE); -} - -void VRegex::study(){ - if(fstudied) - return; - - const char* err_ptr; - 
fextra=pcre_study(fcode, 0/*options*/, &err_ptr); - - if(err_ptr){ - throw Exception(PCRE_EXCEPTION_TYPE, - new String(fpattern, String::L_TAINTED), - "pcre_study error: %s", err_ptr); - } - - fstudied=true; -} +int VRegex::exec(const char* string, size_t string_len, int prestart){ + if(!fmatch_data) + fmatch_data=pcre2_match_data_create_from_pattern(fcode, fgen_ctxt); + if(!fmatch_ctxt) + fmatch_ctxt=pcre2_match_context_create(fgen_ctxt); -int VRegex::exec(const char* string, size_t string_len, int* ovector, int ovector_size, int prestart){ - int result=pcre_exec(fcode, fextra, - string, string_len, prestart, - prestart>0 ? PCRE_NO_UTF8_CHECK : 0, ovector, ovector_size); + int result=pcre2_match(fcode, + reinterpret_cast<const unsigned char*>(string), string_len, prestart, + prestart>0 ? PCRE2_NO_UTF_CHECK : 0, fmatch_data, fmatch_ctxt); - if(result<0 && result!=PCRE_ERROR_NOMATCH){ + if(result<0 && result!=PCRE2_ERROR_NOMATCH){ throw Exception(PCRE_EXCEPTION_TYPE, - new String(fpattern, String::L_TAINTED), + new String(reinterpret_cast<const char*>(fpattern), String::L_TAINTED), get_pcre_exec_error_text(result), result); } @@ -176,7 +204,7 @@ Value* VRegex::get_element(const String& aname) { if(aname == REGEX_PATTERN_NAME) - return new VString(*new String(fpattern, String::L_TAINTED)); + return new VString(*new String(reinterpret_cast<const char*>(fpattern), String::L_TAINTED)); if(aname == REGEX_OPTIONS_NAME) return new VString(*new String(foptions_cstr, String::L_TAINTED)); --- parser-3.4.6.orig/src/types/pa_vmail.C +++ parser-3.4.6/src/types/pa_vmail.C @@ -484,7 +484,7 @@ size_t mail_header_utf8_substring(const char *mail, size_t sub_length, size_t length){ int error_offset; if(int error_code=pa_pcre_valid_utf((unsigned char *)mail, sub_length, &error_offset)){ - if(error_code<PCRE_UTF8_ERR6){ // Missing X byte at the end of the string errors + if(error_code<PCRE2_ERROR_UTF8_ERR6){ // Missing X byte at the end of the string errors sub_length+=error_code; // 
adding X bytes return sub_length < length ? sub_length : length; } --- parser-3.4.6.orig/src/lib/pcre/pa_pcre_internal.h +++ parser-3.4.6/src/lib/pcre/pa_pcre_internal.h @@ -46,8 +46,8 @@ for this function is in the pcre_valid_utf8.c module. */ #ifdef __cplusplus - extern "C" int pa_pcre_valid_utf(unsigned char *string, int length, int *erroroffset); + extern "C" int pa_pcre_valid_utf(const unsigned char *string, int length, int *erroroffset); #else - extern int pa_pcre_valid_utf(unsigned char *string, int length, int *erroroffset); + extern int pa_pcre_valid_utf(const unsigned char *string, int length, int *erroroffset); #endif --- parser-3.4.6.orig/src/classes/regex.C +++ parser-3.4.6/src/classes/regex.C @@ -40,7 +40,6 @@ } vregex.compile(); - vregex.study(); } @@ -49,11 +48,6 @@ r.write(*new VInt(vregex.get_info_size())); } -static void _study_size(Request& r, MethodParams&) { - VRegex& vregex=GET_SELF(r, VRegex); - r.write(*new VInt(vregex.get_study_size())); -} - // constructor MRegex::MRegex(): Methoded("regex") { @@ -63,8 +57,5 @@ // ^regex.info_size[] add_native_method("size", Method::CT_DYNAMIC, _size, 0, 0); - // ^regex.study_size[] - add_native_method("study_size", Method::CT_DYNAMIC, _study_size, 0, 0); - } --- parser-3.4.6.orig/src/classes/string.C +++ parser-3.4.6/src/classes/string.C @@ -223,7 +223,6 @@ static void split_list(Value& delim_value, const String& string, ArrayString& result) { if(Value* value=delim_value.as(VREGEX_TYPE)){ VRegex *vregex=static_cast<VRegex*>(value); - vregex->study(); int matches_count=0; Split_action_info ai = { string, result }; @@ -396,7 +395,6 @@ vregex=static_cast<VRegex*>(value); } else { vregex=new VRegex(r.charsets.source(), &regexp.as_string(), (options) ? (&options->as_string()) : 0); - vregex->study(); vrcleaner.vregex=vregex; }
signature.asc
Description: This is a digitally signed message part