From 963338e33d010d4e932365b3c214c2ddffe565b5 Mon Sep 17 00:00:00 2001 From: eglwang Date: Sun, 4 Sep 2022 17:29:16 +0800 Subject: [PATCH] re V2 --- package/re/cre.c | 141 ++--- package/re/cre.h | 4 +- package/re/pcre.h | 193 ++---- package/re/pcre_chartables.c | 22 - package/re/pcre_compile.c | 4 +- package/re/pcre_exec.c | 2 +- package/re/pcre_fullinfo.c | 2 +- package/re/pcre_globals.c | 10 +- package/re/pcre_internal.h | 828 +++++++++--------------- package/re/pcre_newline.c | 10 - package/re/pcre_ord2utf8.c | 4 - package/re/pcre_tables.c | 6 - package/re/pcre_try_flipped.c | 5 - package/re/pcre_valid_utf8.c | 4 - package/re/pcre_xclass.c | 5 - package/re/re-api-adapter.c | 1126 +++++++++++++++++++++++---------- package/re/re.pyi | 49 +- package/re/readme.md | 37 +- 18 files changed, 1247 insertions(+), 1205 deletions(-) diff --git a/package/re/cre.c b/package/re/cre.c index 90c3c5bae..d1679725e 100644 --- a/package/re/cre.c +++ b/package/re/cre.c @@ -1,15 +1,16 @@ - -/* #define PCRE_STATIC */ - +/* +* +* Generally additional utility functions. +* L flag, also known as re.LOCALE in Python is not available here. +* Wrong results may be returned in re_sub likes funcitones when 'repl' contains '\', '\\\\1' for example. +* +* 4/9/2022 +*/ #include #include #include "pcre.h" #include "cre.h" -/// @brief the the number of groups in a re pattern -/// @param re: re pattern -/// @param out_groups_number : from 0,1,2,3,4... -/// @return a array pointer, free if after using int *_re_get_vec_table(pcre *re, int *out_groups_number) { int brackets_number = 0; @@ -25,18 +26,9 @@ int *_re_get_vec_table(pcre *re, int *out_groups_number) return vec; } -/************************************************************************* - -* (https?)://((\w+\.)+)(\w+) -* hihsid dii https://www.baidu.com, http://glwang.com -*************************************************************************/ - int *pcre_match(const char *_pat, const char *s, int len, int *out_vec_number, int opt) { int *vec = NULL; - // int group_n = 0; - //int rc; - // int start_offset = 0; pcre *re = re_get_match_re(_pat, opt); if (!re) return NULL; @@ -67,9 +59,9 @@ match: } if (rc <= 0) goto e_er; - if (vec[0] == vec[1]) // a empty match + if (vec[0] == vec[1]) { - start_offset++; // advace a position + start_offset++; if (start_offset >= len) goto e_er; goto match; @@ -84,9 +76,6 @@ e_er: int *pcre_fullmatch(const char *_pat, const char *s, int len, int *out_vec_number, int opt) { int *vec = NULL; - // int group_n = 0; - //int rc; - // int start_offset = 0; opt &= ~PCRE_MULTILINE; pcre *re = re_get_fullmatch_re(_pat, opt); if (!re) @@ -118,9 +107,9 @@ match: } if (rc <= 0) goto e_er; - if (vec[0] == vec[1]) // a empty match + if (vec[0] == vec[1]) { - start_offset++; // advace a position + start_offset++; if (start_offset >= len) goto e_er; goto match; @@ -252,9 +241,9 @@ match: } if (rc <= 0) goto e_er; - if (vec[0] == vec[1]) // a empty match + if (vec[0] == vec[1]) { - start_offset++; // advace a position + start_offset++; if (start_offset >= len) goto e_er; goto match; @@ -277,17 +266,10 @@ int **re_searchall(const char *pat, const char *s, int len, int *out_number, int pcre_free(re); return res; } -/// @brief find all match in a string -/// @param re: re pattern -/// @param s : string searching in -/// @param out_number : the number of matches -/// @return a vector table, vrc[n] is the nth matchs, -/// vrc[group_n][i*2] - vrc[group_n][i*2+1] is the begining-offset and ending-offset of group i. -/// Use re_free_searchall() to free the memory int **re_searchall2(pcre *re, const char *s, int len, int *out_number, int *out_vec_number, int opt) { int start_offset = 0; - int **vecs = NULL; // to store vec + int **vecs = NULL; int vec_cap = 4; int vec_n = 0; int *vec = NULL; @@ -304,7 +286,8 @@ int **re_searchall2(pcre *re, const char *s, int len, int *out_number, int *out_ *out_vec_number = group_n; group_n *= 3; } - if (!vec){ + if (!vec) + { goto e_er; } int rc; @@ -319,14 +302,13 @@ int **re_searchall2(pcre *re, const char *s, int len, int *out_number, int *out_ } if (rc <= 0) goto e_er; - if (vec[0] == vec[1]) // a empty match + if (vec[0] == vec[1]) { - start_offset++; // advace a position + start_offset++; if (start_offset >= len) goto e_er; goto match; } - //to sotre vec if (!vecs) { vecs = (int **)malloc(sizeof(int *) * vec_cap); @@ -334,32 +316,28 @@ int **re_searchall2(pcre *re, const char *s, int len, int *out_number, int *out_ goto e_er; } - if (vec_n >= vec_cap) // need to recap this list + if (vec_n >= vec_cap) { vec_cap *= 2; void *p = realloc(vecs, vec_cap * sizeof(int *)); if (!p) goto e_er; - // if (p != vecs) // move data - // { - // memmove(p, vecs, vec_n * sizeof(int*)); vecs = (int **)p; - // } } vecs[vec_n++] = vec; start_offset = vec[1]; } e_er: if (vec) - free(vec); // the latest vec table + free(vec); if (!vecs) return NULL; for (int j = 0; j < vec_n; j++) { if (vecs[j]) - free((void *)(vecs[j])); // free vec table + free((void *)(vecs[j])); } - free(vecs); // free the table list + free(vecs); return NULL; } void re_free_searchall(int **vecs, int n) @@ -369,9 +347,9 @@ void re_free_searchall(int **vecs, int n) for (int j = 0; j < n; j++) { if (vecs[j]) - free((void *)(vecs[j])); // free vec table + free((void *)(vecs[j])); } - free(vecs); // free the table list + free(vecs); } /* the following functions return (a) string in heap, which means it need to be freed after using*/ @@ -438,9 +416,9 @@ match: } if (rc <= 0) goto e_er; - if (vec[0] == vec[1]) // a empty match + if (vec[0] == vec[1]) { - start_offset++; // advace a position + start_offset++; if (start_offset >= len) goto e_er; goto match; @@ -498,9 +476,9 @@ void re_free_findall(char **ss, int n) for (int j = 0; j < n; j++) { if (ss[j]) - free((void *)(ss[j])); // free vec table + free((void *)(ss[j])); } - free(ss); // free the table list + free(ss); } char *pcre_sub(const char *pat, const char *to, const char *s, int len, int opt) @@ -514,30 +492,20 @@ char *pcre_sub(const char *pat, const char *to, const char *s, int len, int opt) pcre_free(re); return res; } -char *pcre_subn(const char *pat, const char *to, const char *s, int len, int n, int opt) +char *pcre_subn(const char *pat, const char *to, const char *s, int len, int n, int opt, int *out_repl_times) { const char *error; int erroffset; pcre *re = pcre_compile(pat, opt, &error, &erroffset, NULL); if (!re) return NULL; - char *res = re_subn2(re, to, s, len, n, opt); + char *res = re_subn2(re, to, s, len, n, opt, out_repl_times); pcre_free(re); return res; } -/// @brief substitute a string with a pattern expression, given replacement limit -/// @param re : re pattern for matching -/// @param to : re pattern to replacement -/// @param s : string searching in -/// @param len : length of -/// @param n : the replacement number -/// @return if no replacement, return s exactly, otherwise return a new string, free it after using -char *re_subn2(pcre *re, const char *to, const char *s, int len, int n, int opt) +char *re_subn2(pcre *re, const char *to, const char *s, int len, int n, int opt, int *out_repl_times) { int group_n = 0; - // int group_n2 = 0; - // int *vec = NULL; - // int *vec2 = NULL; pcre *re2 = NULL; int vcs1_n = 0, vcs2_n = 0; int **vcs1 = re_searchall2(re, s, len, &vcs1_n, &group_n, opt); @@ -545,11 +513,8 @@ char *re_subn2(pcre *re, const char *to, const char *s, int len, int n, int opt) int match_limit = 0; if (!vcs1_n) { - //no match, no replacement return (char *)s; } - //to determine '\\' and group like: '\group_n' - //3 groups, 0, 1, 2->\\, 3->\group_n, if any const char *p2 = "(\\\\\\\\|\\\\\\d{1,2})"; int erroffset; const char *error; @@ -559,76 +524,55 @@ char *re_subn2(pcre *re, const char *to, const char *s, int len, int n, int opt) re2 = pcre_compile(p2, 0, &error, &erroffset, NULL); if (!re2) goto exit_error; - //match len_to = strlen(to); vcs2 = re_searchall2(re2, to, len_to, &vcs2_n, NULL, 0); - //if (!vcs2) - //{ - // //goto exit_error; - // vcs2_n = 0; - //} - pcre_free(re2); re2 = NULL; - //note that re2 is no use after this, onece we get vcs2 - remain_length2 = len_to; // the remain length in 'to' exclude from all '\\' and all '\n' + remain_length2 = len_to; for (int i = 0; i < vcs2_n; i++) { - int *vc = vcs2[i]; // (0,1)->'\\'or'\n', (2,3)->'\\', (4,5)->'\n', (6,7,8) + int *vc = vcs2[i]; int vc0 = vc[0] + 1; if (to[vc0] == '\\') { vc[2] = 0; remain_length2 -= 2; } - else // \n, + else { int wanted_number = 0; - //vc[1]--; int l_n = vc[1] - vc0; if (l_n == 1) { wanted_number = to[vc0] - '0'; remain_length2 -= 2; } - else // if(l_n==2) + else { wanted_number = (to[vc0] - '0') * 10 + to[vc0 + 1] - '0'; remain_length2 -= 3; } if (wanted_number <= 0 || wanted_number >= group_n) goto exit_error; - //store it in vc[2] vc[2] = wanted_number; } } - //now that vcs2 stores data of which group is used in replacement - //Nx9, N is the number of groups used in every one replcaement, - //while vcs2[2] is the exact group number used in replacement - //parse 'to' - //get the remian size match_limit = n ? (n <= vcs1_n ? n : vcs1_n) : vcs1_n; remain_size = len + remain_length2 * match_limit; - //match times for (int i = 0; i < match_limit; i++) { int *vc = vcs1[i]; - //vc[1]-vc[0] is the match sequence which need to be replaced, while the following are groups remain_size -= vc[1] - vc[0]; - // the replcaements - // 'to' e.g.: \\ \1, \2, ....\x for (int j = 0; j < vcs2_n; j++) { int *v2 = vcs2[j]; if (v2[2]) { - //replaced to a group remain_size += GetGroupLen(vc, v2[2]); } else { - //replaced to a '/' remain_size++; } } @@ -655,13 +599,11 @@ char *re_subn2(pcre *re, const char *to, const char *s, int len, int n, int opt) int to_group_at = vc[to_group * 2]; int to_group_end = vc[to_group * 2 + 1]; int g_l = to_group_end - to_group_at; - //replaced to a group memcpy(new_s + pi, s + to_group_at, g_l); pi += g_l; } else { - //replaced to a '/' new_s[pi++] = '\\'; } m_start = v2[1]; @@ -669,9 +611,10 @@ char *re_subn2(pcre *re, const char *to, const char *s, int len, int n, int opt) m_len = len_to - m_start; memcpy(new_s + pi, to + m_start, m_len); pi += m_len; - // end of one match qi = vc[1]; } + if (out_repl_times) + *out_repl_times = match_limit; if (vcs1) re_free_searchall(vcs1, vcs1_n); if (vcs2) @@ -692,13 +635,7 @@ exit_error: pcre_free(re2); return NULL; } -/// @brief substitute a string with a pattern expression -/// @param re : re pattern for matching -/// @param to : re pattern to replacement -/// @param s : string searching in -/// @param len : length of -/// @return if no replacement, return s exactly, otherwise return a new string, free it after using char *re_sub2(pcre *re, const char *to, const char *s, int len, int opt) { - return re_subn2(re, to, s, len, 0, opt); + return re_subn2(re, to, s, len, 0, opt, NULL); } diff --git a/package/re/cre.h b/package/re/cre.h index 9d7f7c2ad..9aa6dfe37 100644 --- a/package/re/cre.h +++ b/package/re/cre.h @@ -42,9 +42,9 @@ void re_free_findall(char **ss, int n); char *pcre_sub(const char *pat, const char *to, const char *s, int len, int opt); -char *pcre_subn(const char *pat, const char *to, const char *s, int len, int n, int opt); +char *pcre_subn(const char *pat, const char *to, const char *s, int len, int n, int opt, int *out_repl_times); -char *re_subn2(pcre *re, const char *to, const char *s, int len, int n, int opt); +char *re_subn2(pcre *re, const char *to, const char *s, int len, int n, int opt, int *out_repl_times); char *re_sub2(pcre *re, const char *to, const char *s, int len, int opt); #endif \ No newline at end of file diff --git a/package/re/pcre.h b/package/re/pcre.h index e7252643a..710f86d32 100644 --- a/package/re/pcre.h +++ b/package/re/pcre.h @@ -2,63 +2,13 @@ #ifndef _PCRE_H #define _PCRE_H -/* The current PCRE version information. */ - -#define PCRE_MAJOR @PCRE_MAJOR@ -#define PCRE_MINOR @PCRE_MINOR@ -#define PCRE_PRERELEASE @PCRE_PRERELEASE@ -#define PCRE_DATE @PCRE_DATE@ - -/* When an application links to a PCRE DLL in Windows, the symbols that are -imported have to be identified as such. When building PCRE, the appropriate -export setting is defined in pcre_internal.h, which includes this file. So we -don't change existing definitions of PCRE_EXP_DECL and PCRECPP_EXP_DECL. */ - -#if defined(_WIN32) && !defined(PCRE_STATIC) -# ifndef PCRE_EXP_DECL -# define PCRE_EXP_DECL extern __declspec(dllimport) -# endif -# ifdef __cplusplus -# ifndef PCRECPP_EXP_DECL -# define PCRECPP_EXP_DECL extern __declspec(dllimport) -# endif -# ifndef PCRECPP_EXP_DEFN -# define PCRECPP_EXP_DEFN __declspec(dllimport) -# endif -# endif -#endif - -/* By default, we use the standard "extern" declarations. */ - -#ifndef PCRE_EXP_DECL -# ifdef __cplusplus -# define PCRE_EXP_DECL extern "C" -# else -# define PCRE_EXP_DECL extern -# endif -#endif - -#ifdef __cplusplus -# ifndef PCRECPP_EXP_DECL -# define PCRECPP_EXP_DECL extern -# endif -# ifndef PCRECPP_EXP_DEFN -# define PCRECPP_EXP_DEFN -# endif -#endif - -/* Have to include stdlib.h in order to ensure that size_t is defined; -it is needed here for malloc. */ - #include -/* Allow for C++ users */ #ifdef __cplusplus extern "C" { #endif -/* Options */ #define PCRE_CASELESS 0x00000001 #define PCRE_MULTILINE 0x00000002 @@ -87,19 +37,19 @@ extern "C" { #define PCRE_NEWLINE_ANYCRLF 0x00500000 #define PCRE_BSR_ANYCRLF 0x00800000 #define PCRE_BSR_UNICODE 0x01000000 +#define PCRE_ONLY_ASCII 0x02000000 -/* Exec-time and get/set-time error codes */ #define PCRE_ERROR_NOMATCH (-1) #define PCRE_ERROR_NULL (-2) #define PCRE_ERROR_BADOPTION (-3) #define PCRE_ERROR_BADMAGIC (-4) #define PCRE_ERROR_UNKNOWN_OPCODE (-5) -#define PCRE_ERROR_UNKNOWN_NODE (-5) /* For backward compatibility */ +#define PCRE_ERROR_UNKNOWN_NODE (-5) #define PCRE_ERROR_NOMEMORY (-6) #define PCRE_ERROR_NOSUBSTRING (-7) #define PCRE_ERROR_MATCHLIMIT (-8) -#define PCRE_ERROR_CALLOUT (-9) /* Never used by PCRE itself */ +#define PCRE_ERROR_CALLOUT (-9) #define PCRE_ERROR_BADUTF8 (-10) #define PCRE_ERROR_BADUTF8_OFFSET (-11) #define PCRE_ERROR_PARTIAL (-12) @@ -112,17 +62,16 @@ extern "C" { #define PCRE_ERROR_DFA_WSSIZE (-19) #define PCRE_ERROR_DFA_RECURSE (-20) #define PCRE_ERROR_RECURSIONLIMIT (-21) -#define PCRE_ERROR_NULLWSLIMIT (-22) /* No longer actually used */ +#define PCRE_ERROR_NULLWSLIMIT (-22) #define PCRE_ERROR_BADNEWLINE (-23) -/* Request types for pcre_fullinfo() */ #define PCRE_INFO_OPTIONS 0 #define PCRE_INFO_SIZE 1 #define PCRE_INFO_CAPTURECOUNT 2 #define PCRE_INFO_BACKREFMAX 3 #define PCRE_INFO_FIRSTBYTE 4 -#define PCRE_INFO_FIRSTCHAR 4 /* For backwards compatibility */ +#define PCRE_INFO_FIRSTCHAR 4 #define PCRE_INFO_FIRSTTABLE 5 #define PCRE_INFO_LASTLITERAL 6 #define PCRE_INFO_NAMEENTRYSIZE 7 @@ -134,8 +83,6 @@ extern "C" { #define PCRE_INFO_JCHANGED 13 #define PCRE_INFO_HASCRORLF 14 -/* Request types for pcre_config(). Do not re-arrange, in order to remain -compatible. */ #define PCRE_CONFIG_UTF8 0 #define PCRE_CONFIG_NEWLINE 1 @@ -147,8 +94,6 @@ compatible. */ #define PCRE_CONFIG_MATCH_LIMIT_RECURSION 7 #define PCRE_CONFIG_BSR 8 -/* Bit flags for the pcre_extra structure. Do not re-arrange or redefine -these bits, just add new ones on the end, in order to remain compatible. */ #define PCRE_EXTRA_STUDY_DATA 0x0001 #define PCRE_EXTRA_MATCH_LIMIT 0x0002 @@ -156,109 +101,67 @@ these bits, just add new ones on the end, in order to remain compatible. */ #define PCRE_EXTRA_TABLES 0x0008 #define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0x0010 -/* Types */ struct real_pcre; /* declaration; the definition is private */ typedef struct real_pcre pcre; -/* When PCRE is compiled as a C++ library, the subject pointer type can be -replaced with a custom type. For conventional use, the public interface is a -const char *. */ #ifndef PCRE_SPTR #define PCRE_SPTR const char * #endif -/* The structure for passing additional data to pcre_exec(). This is defined in -such as way as to be extensible. Always add new fields at the end, in order to -remain compatible. */ typedef struct pcre_extra { - unsigned long int flags; /* Bits for which fields are set */ - void *study_data; /* Opaque data from pcre_study() */ - unsigned long int match_limit; /* Maximum number of calls to match() */ - void *callout_data; /* Data passed back in callouts */ - const unsigned char *tables; /* Pointer to character tables */ - unsigned long int match_limit_recursion; /* Max recursive calls to match() */ + unsigned long int flags; + void *study_data; + unsigned long int match_limit; + void *callout_data; + const unsigned char *tables; + unsigned long int match_limit_recursion; } pcre_extra; -/* The structure for passing out data via the pcre_callout_function. We use a -structure so that new fields can be added on the end in future versions, -without changing the API of the function, thereby allowing old clients to work -without modification. */ typedef struct pcre_callout_block { - int version; /* Identifies version of block */ - /* ------------------------ Version 0 ------------------------------- */ - int callout_number; /* Number compiled into pattern */ - int *offset_vector; /* The offset vector */ - PCRE_SPTR subject; /* The subject being matched */ - int subject_length; /* The length of the subject */ - int start_match; /* Offset to start of this match attempt */ - int current_position; /* Where we currently are in the subject */ - int capture_top; /* Max current capture */ - int capture_last; /* Most recently closed capture */ - void *callout_data; /* Data passed in with the call */ - /* ------------------- Added for Version 1 -------------------------- */ - int pattern_position; /* Offset to next item in the pattern */ - int next_item_length; /* Length of next item in the pattern */ - /* ------------------------------------------------------------------ */ + int version; + int callout_number; + int *offset_vector; + PCRE_SPTR subject; + int subject_length; + int start_match; + int current_position; + int capture_top; + int capture_last; + void *callout_data; + int pattern_position; + int next_item_length; } pcre_callout_block; -/* Indirection for store get and free functions. These can be set to -alternative malloc/free functions if required. Special ones are used in the -non-recursive case for "frames". There is also an optional callout function -that is triggered by the (?) regex item. For Virtual Pascal, these definitions -have to take another form. */ #ifndef VPCOMPAT -PCRE_EXP_DECL void *(*pcre_malloc)(size_t); -PCRE_EXP_DECL void (*pcre_free)(void *); -PCRE_EXP_DECL void *(*pcre_stack_malloc)(size_t); -PCRE_EXP_DECL void (*pcre_stack_free)(void *); -PCRE_EXP_DECL int (*pcre_callout)(pcre_callout_block *); -#else /* VPCOMPAT */ -PCRE_EXP_DECL void *pcre_malloc(size_t); -PCRE_EXP_DECL void pcre_free(void *); -PCRE_EXP_DECL void *pcre_stack_malloc(size_t); -PCRE_EXP_DECL void pcre_stack_free(void *); -PCRE_EXP_DECL int pcre_callout(pcre_callout_block *); -#endif /* VPCOMPAT */ - -/* Exported PCRE functions */ - -PCRE_EXP_DECL pcre *pcre_compile(const char *, int, const char **, int *, - const unsigned char *); -PCRE_EXP_DECL pcre *pcre_compile2(const char *, int, int *, const char **, - int *, const unsigned char *); -// PCRE_EXP_DECL int pcre_copy_named_substring(const pcre *, const char *, -// int *, int, const char *, char *, int); -// PCRE_EXP_DECL int pcre_copy_substring(const char *, int *, int, int, char *, -// int); -// PCRE_EXP_DECL int pcre_dfa_exec(const pcre *, const pcre_extra *, - // const char *, int, int, int, int *, int , int *, int); -PCRE_EXP_DECL int pcre_exec(const pcre *, const pcre_extra *, PCRE_SPTR, - int, int, int, int *, int); -// PCRE_EXP_DECL void pcre_free_substring(const char *); -// PCRE_EXP_DECL void pcre_free_substring_list(const char **); -PCRE_EXP_DECL int pcre_fullinfo(const pcre *, const pcre_extra *, int, - void *); -// PCRE_EXP_DECL int pcre_get_named_substring(const pcre *, const char *, -// int *, int, const char *, const char **); -// PCRE_EXP_DECL int pcre_get_stringnumber(const pcre *, const char *); -// PCRE_EXP_DECL int pcre_get_stringtable_entries(const pcre *, const char *, -// char **, char **); -// PCRE_EXP_DECL int pcre_get_substring(const char *, int *, int, int, -// const char **); -// PCRE_EXP_DECL int pcre_get_substring_list(const char *, int *, int, -// const char ***); -// PCRE_EXP_DECL int pcre_info(const pcre *, int *, int *); -// PCRE_EXP_DECL const unsigned char *pcre_maketables(void); -// PCRE_EXP_DECL int pcre_refcount(pcre *, int); -// PCRE_EXP_DECL pcre_extra *pcre_study(const pcre *, int, const char **); - -#ifdef __cplusplus -} /* extern "C" */ +void *(*pcre_malloc)(size_t); +void (*pcre_free)(void *); +void *(*pcre_stack_malloc)(size_t); +void (*pcre_stack_free)(void *); +int (*pcre_callout)(pcre_callout_block *); +#else +void *pcre_malloc(size_t); +void pcre_free(void *); +void *pcre_stack_malloc(size_t); +void pcre_stack_free(void *); +int pcre_callout(pcre_callout_block *); #endif -#endif /* End of pcre.h */ +pcre *pcre_compile(const char *, int, const char **, int *, + const unsigned char *); +pcre *pcre_compile2(const char *, int, int *, const char **, + int *, const unsigned char *); +int pcre_exec(const pcre *, const pcre_extra *, PCRE_SPTR, + int, int, int, int *, int); +int pcre_fullinfo(const pcre *, const pcre_extra *, int, + void *); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/package/re/pcre_chartables.c b/package/re/pcre_chartables.c index 09fdc8b38..736adc518 100644 --- a/package/re/pcre_chartables.c +++ b/package/re/pcre_chartables.c @@ -1,25 +1,3 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - -/* This file contains character tables that are used when no external tables -are passed to PCRE by the application that calls it. The tables are used only -for characters whose code values are less than 256. - -This is a default version of the tables that assumes ASCII encoding. A program -called dftables (which is distributed with PCRE) can be used to build -alternative versions of this file. This is necessary if you are running in an -EBCDIC environment, or if you want to default to a different encoding, for -example ISO-8859-1. When dftables is run, it creates these tables in the -current locale. If PCRE is configured with --enable-rebuild-chartables, this -happens automatically. - -The following #includes are present because without the gcc 4.x may remove the -array definition from the final binary if PCRE is built into a static library -and dead code stripping is activated. This leads to link errors. Pulling in the -header ensures that the array gets flagged as "someone outside this compilation -unit might reference this" and so it will always be supplied to the linker. */ - #include "re_config.h" #include "pcre_internal.h" diff --git a/package/re/pcre_compile.c b/package/re/pcre_compile.c index 073fad6c8..d28546292 100644 --- a/package/re/pcre_compile.c +++ b/package/re/pcre_compile.c @@ -5744,7 +5744,7 @@ Returns: pointer to compiled data block, or NULL on error, with errorptr and erroroffset set */ -PCRE_EXP_DEFN pcre * +pcre * pcre_compile(const char *pattern, int options, const char **errorptr, int *erroroffset, const unsigned char *tables) { @@ -5752,7 +5752,7 @@ return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables); } -PCRE_EXP_DEFN pcre * +pcre * pcre_compile2(const char *pattern, int options, int *errorcodeptr, const char **errorptr, int *erroroffset, const unsigned char *tables) { diff --git a/package/re/pcre_exec.c b/package/re/pcre_exec.c index 3d082e343..e7a8143a2 100644 --- a/package/re/pcre_exec.c +++ b/package/re/pcre_exec.c @@ -4306,7 +4306,7 @@ Returns: > 0 => success; value is the number of elements filled in < -1 => some kind of unexpected problem */ -PCRE_EXP_DEFN int +int pcre_exec(const pcre *argument_re, const pcre_extra *extra_data, PCRE_SPTR subject, int length, int start_offset, int options, int *offsets, int offsetcount) diff --git a/package/re/pcre_fullinfo.c b/package/re/pcre_fullinfo.c index ad7827e99..9144f2ec8 100644 --- a/package/re/pcre_fullinfo.c +++ b/package/re/pcre_fullinfo.c @@ -23,7 +23,7 @@ Arguments: Returns: 0 if data returned, negative on error */ -PCRE_EXP_DEFN int +int pcre_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, int what, void *where) { diff --git a/package/re/pcre_globals.c b/package/re/pcre_globals.c index 03aaad1f4..02c03906b 100644 --- a/package/re/pcre_globals.c +++ b/package/re/pcre_globals.c @@ -11,11 +11,11 @@ differently, and global variables are not used (see pcre.in). */ #include "pcre_internal.h" #ifndef VPCOMPAT -PCRE_EXP_DATA_DEFN void *(*pcre_malloc)(size_t) = malloc; -PCRE_EXP_DATA_DEFN void (*pcre_free)(void *) = free; -PCRE_EXP_DATA_DEFN void *(*pcre_stack_malloc)(size_t) = malloc; -PCRE_EXP_DATA_DEFN void (*pcre_stack_free)(void *) = free; -PCRE_EXP_DATA_DEFN int (*pcre_callout)(pcre_callout_block *) = NULL; +void *(*pcre_malloc)(size_t) = malloc; +void (*pcre_free)(void *) = free; +void *(*pcre_stack_malloc)(size_t) = malloc; +void (*pcre_stack_free)(void *) = free; +int (*pcre_callout)(pcre_callout_block *) = NULL; #endif /* End of pcre_globals.c */ diff --git a/package/re/pcre_internal.h b/package/re/pcre_internal.h index 7bec3535d..f75a669b5 100644 --- a/package/re/pcre_internal.h +++ b/package/re/pcre_internal.h @@ -2,31 +2,19 @@ #ifndef PCRE_INTERNAL_H #define PCRE_INTERNAL_H -/* Define DEBUG to get debugging output on stdout. */ #if 0 #define DEBUG #endif -/* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef -inline, and there are *still* stupid compilers about that don't like indented -pre-processor statements, or at least there were when I first wrote this. After -all, it had only been about 10 years then... - -It turns out that the Mac Debugging.h header also defines the macro DPRINTF, so -be absolutely sure we get our version. */ - #undef DPRINTF #ifdef DEBUG #define DPRINTF(p) printf p #else -#define DPRINTF(p) /* Nothing */ +#define DPRINTF(p) #endif -/* Standard C headers plus the external interface definition. The only time -setjmp and stdarg are used is when NO_RECURSE is set. */ - #include #include #include @@ -36,64 +24,6 @@ setjmp and stdarg are used is when NO_RECURSE is set. */ #include #include -/* When compiling a DLL for Windows, the exported symbols have to be declared -using some MS magic. I found some useful information on this web page: -http://msdn2.microsoft.com/en-us/library/y4h7bcy6(VS.80).aspx. According to the -information there, using __declspec(dllexport) without "extern" we have a -definition; with "extern" we have a declaration. The settings here override the -setting in pcre.h (which is included below); it defines only PCRE_EXP_DECL, -which is all that is needed for applications (they just import the symbols). We -use: - - PCRE_EXP_DECL for declarations - PCRE_EXP_DEFN for definitions of exported functions - PCRE_EXP_DATA_DEFN for definitions of exported variables - -The reason for the two DEFN macros is that in non-Windows environments, one -does not want to have "extern" before variable definitions because it leads to -compiler warnings. So we distinguish between functions and variables. In -Windows, the two should always be the same. - -The reason for wrapping this in #ifndef PCRE_EXP_DECL is so that pcretest, -which is an application, but needs to import this file in order to "peek" at -internals, can #include pcre.h first to get an application's-eye view. - -In principle, people compiling for non-Windows, non-Unix-like (i.e. uncommon, -special-purpose environments) might want to stick other stuff in front of -exported symbols. That's why, in the non-Windows case, we set PCRE_EXP_DEFN and -PCRE_EXP_DATA_DEFN only if they are not already set. */ - -#ifndef PCRE_EXP_DECL -# ifdef _WIN32 -# ifndef PCRE_STATIC -# define PCRE_EXP_DECL extern __declspec(dllexport) -# define PCRE_EXP_DEFN __declspec(dllexport) -# define PCRE_EXP_DATA_DEFN __declspec(dllexport) -# else -# define PCRE_EXP_DECL extern -# define PCRE_EXP_DEFN -# define PCRE_EXP_DATA_DEFN -# endif -# else -# ifdef __cplusplus -# define PCRE_EXP_DECL extern "C" -# else -# define PCRE_EXP_DECL extern -# endif -# ifndef PCRE_EXP_DEFN -# define PCRE_EXP_DEFN PCRE_EXP_DECL -# endif -# ifndef PCRE_EXP_DATA_DEFN -# define PCRE_EXP_DATA_DEFN -# endif -# endif -#endif - -/* We need to have types that specify unsigned 16-bit and 32-bit integers. We -cannot determine these outside the compilation (e.g. by running a program as -part of "configure") because PCRE is often cross-compiled for use on other -systems. Instead we make use of the maximum sizes that are available at -preprocessor time in standard C environments. */ #if USHRT_MAX == 65535 typedef unsigned short pcre_uint16; @@ -111,32 +41,17 @@ preprocessor time in standard C environments. */ #error Cannot determine a type for 32-bit unsigned integers #endif -/* All character handling must be done as unsigned characters. Otherwise there -are problems with top-bit-set characters and functions such as isspace(). -However, we leave the interface to the outside world as char *, because that -should make things easier for callers. We define a short type for unsigned char -to save lots of typing. I tried "uchar", but it causes problems on Digital -Unix, where it is defined in sys/types, so use "uschar" instead. */ typedef unsigned char uschar; -/* This is an unsigned int value that no character can ever have. UTF-8 -characters only go up to 0x7fffffff (though Unicode doesn't go beyond -0x0010ffff). */ #define NOTACHAR 0xffffffff -/* PCRE is able to support several different kinds of newline (CR, LF, CRLF, -"any" and "anycrlf" at present). The following macros are used to package up -testing for newlines. NLBLOCK, PSSTART, and PSEND are defined in the various -modules to indicate in which datablock the parameters exist, and what the -start/end of string field names are. */ -#define NLTYPE_FIXED 0 /* Newline is a fixed length string */ -#define NLTYPE_ANY 1 /* Newline is any Unicode line ending */ -#define NLTYPE_ANYCRLF 2 /* Newline is CR, LF, or CRLF */ +#define NLTYPE_FIXED 0 +#define NLTYPE_ANY 1 +#define NLTYPE_ANYCRLF 2 -/* This macro checks for a newline at the given position */ #define IS_NEWLINE(p) \ ((NLBLOCK->nltype != NLTYPE_FIXED)? \ @@ -150,7 +65,6 @@ start/end of string field names are. */ ) \ ) -/* This macro checks for a newline immediately preceding the given position */ #define WAS_NEWLINE(p) \ ((NLBLOCK->nltype != NLTYPE_FIXED)? \ @@ -164,14 +78,6 @@ start/end of string field names are. */ ) \ ) -/* When PCRE is compiled as a C++ library, the subject pointer can be replaced -with a custom type. This makes it possible, for example, to allow pcre_exec() -to process subject strings that are discontinuous by using a smart pointer -class. It must always be possible to inspect all of the subject string in -pcre_exec() because of the way it backtracks. Two macros are required in the -normal case, for sign-unspecified and unsigned char pointers. The former is -used for the external interface and appears in pcre.h, which is why its name -must begin with PCRE_. */ #ifdef CUSTOM_SUBJECT_PTR #define PCRE_SPTR CUSTOM_SUBJECT_PTR @@ -182,16 +88,8 @@ must begin with PCRE_. */ #endif - -/* Include the public PCRE header and the definitions of UCP character property -values. */ - #include "pcre.h" -// #include "ucp.h" -/* When compiling for use with the Virtual Pascal compiler, these functions -need to have their names changed. PCRE must be compiled with the -DVPCOMPAT -option on the command line. */ #ifdef VPCOMPAT #define strlen(s) _strlen(s) @@ -200,18 +98,14 @@ option on the command line. */ #define memcpy(d,s,n) _memcpy(d,s,n) #define memmove(d,s,n) _memmove(d,s,n) #define memset(s,c,n) _memset(s,c,n) -#else /* VPCOMPAT */ +#else -/* To cope with SunOS4 and other systems that lack memmove() but have bcopy(), -define a macro for memmove() if HAVE_MEMMOVE is false, provided that HAVE_BCOPY -is set. Otherwise, include an emulating function for those systems that have -neither (there some non-Unix environments where this is the case). */ #ifndef HAVE_MEMMOVE -#undef memmove /* some systems may have a macro */ +#undef memmove #ifdef HAVE_BCOPY #define memmove(a, b, c) bcopy(b, a, c) -#else /* HAVE_BCOPY */ +#else static void * pcre_memmove(void *d, const void *s, size_t n) { @@ -232,24 +126,11 @@ else } } #define memmove(a, b, c) pcre_memmove(a, b, c) -#endif /* not HAVE_BCOPY */ -#endif /* not HAVE_MEMMOVE */ -#endif /* not VPCOMPAT */ +#endif +#endif +#endif -/* PCRE keeps offsets in its compiled code as 2-byte quantities (always stored -in big-endian order) by default. These are used, for example, to link from the -start of a subpattern to its alternatives and its end. The use of 2 bytes per -offset limits the size of the compiled regex to around 64K, which is big enough -for almost everybody. However, I received a request for an even bigger limit. -For this reason, and also to make the code easier to maintain, the storing and -loading of offsets from the byte string is now handled by the macros that are -defined here. - -The macros are controlled by the value of LINK_SIZE. This defaults to 2 in -the config.h file, but can be overridden by using -D on the command line. This -is automated on Unix systems via the "configure" command. */ - #if LINK_SIZE == 2 #define PUT(a,n,d) \ @@ -286,7 +167,7 @@ is automated on Unix systems via the "configure" command. */ #define GET(a,n) \ (((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3]) -#define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */ +#define MAX_PATTERN_SIZE (1 << 30) #else @@ -294,15 +175,9 @@ is automated on Unix systems via the "configure" command. */ #endif -/* Convenience macro defined in terms of the others */ - #define PUTINC(a,n,d) PUT(a,n,d), a += LINK_SIZE -/* PCRE uses some other 2-byte quantities that do not change when the size of -offsets changes. There are used for repeat counts and for other things such as -capturing parenthesis numbers in back references. */ - #define PUT2(a,n,d) \ a[n] = (d) >> 8; \ a[(n)+1] = (d) & 255 @@ -313,12 +188,6 @@ capturing parenthesis numbers in back references. */ #define PUT2INC(a,n,d) PUT2(a,n,d), a += 2 -/* When UTF-8 encoding is being used, a character is no longer just a single -byte. The macros for character handling generate simple sequences when used in -byte-mode, and more complicated ones for UTF-8 characters. BACKCHAR should -never be called in byte mode. To make sure it can never even appear when UTF-8 -support is omitted, we don't even define it. */ - #ifndef SUPPORT_UTF8 #define NEXTCHAR(p) p++; #define GETCHAR(c, eptr) c = *eptr; @@ -326,26 +195,22 @@ support is omitted, we don't even define it. */ #define GETCHARINC(c, eptr) c = *eptr++; #define GETCHARINCTEST(c, eptr) c = *eptr++; #define GETCHARLEN(c, eptr, len) c = *eptr; -/* #define BACKCHAR(eptr) */ -#else /* SUPPORT_UTF8 */ -/* Advance a character pointer one byte in non-UTF-8 mode and by one character -in UTF-8 mode. */ +#else + #define NEXTCHAR(p) \ p++; \ if (utf8) { while((*p & 0xc0) == 0x80) p++; } -/* Get the next UTF-8 character, not advancing the pointer. This is called when -we know we are in UTF-8 mode. */ #define GETCHAR(c, eptr) \ c = *eptr; \ if (c >= 0xc0) \ { \ int gcii; \ - int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ + int gcaa = _pcre_utf8_table4[c & 0x3f]; \ int gcss = 6*gcaa; \ c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ for (gcii = 1; gcii <= gcaa; gcii++) \ @@ -355,15 +220,13 @@ we know we are in UTF-8 mode. */ } \ } -/* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the -pointer. */ #define GETCHARTEST(c, eptr) \ c = *eptr; \ if (utf8 && c >= 0xc0) \ { \ int gcii; \ - int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ + int gcaa = _pcre_utf8_table4[c & 0x3f]; \ int gcss = 6*gcaa; \ c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ for (gcii = 1; gcii <= gcaa; gcii++) \ @@ -373,14 +236,12 @@ pointer. */ } \ } -/* Get the next UTF-8 character, advancing the pointer. This is called when we -know we are in UTF-8 mode. */ #define GETCHARINC(c, eptr) \ c = *eptr++; \ if (c >= 0xc0) \ { \ - int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ + int gcaa = _pcre_utf8_table4[c & 0x3f]; \ int gcss = 6*gcaa; \ c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ while (gcaa-- > 0) \ @@ -390,13 +251,12 @@ know we are in UTF-8 mode. */ } \ } -/* Get the next character, testing for UTF-8 mode, and advancing the pointer */ #define GETCHARINCTEST(c, eptr) \ c = *eptr++; \ if (utf8 && c >= 0xc0) \ { \ - int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ + int gcaa = _pcre_utf8_table4[c & 0x3f]; \ int gcss = 6*gcaa; \ c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ while (gcaa-- > 0) \ @@ -406,15 +266,13 @@ know we are in UTF-8 mode. */ } \ } -/* Get the next UTF-8 character, not advancing the pointer, incrementing length -if there are extra bytes. This is called when we know we are in UTF-8 mode. */ #define GETCHARLEN(c, eptr, len) \ c = *eptr; \ if (c >= 0xc0) \ { \ int gcii; \ - int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ + int gcaa = _pcre_utf8_table4[c & 0x3f]; \ int gcss = 6*gcaa; \ c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ for (gcii = 1; gcii <= gcaa; gcii++) \ @@ -425,44 +283,30 @@ if there are extra bytes. This is called when we know we are in UTF-8 mode. */ len += gcaa; \ } -/* If the pointer is not at the start of a character, move it back until -it is. This is called only in UTF-8 mode - we don't put a test within the macro -because almost all calls are already within a block of UTF-8 only code. */ #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr-- #endif -/* In case there is no definition of offsetof() provided - though any proper -Standard C system should have one. */ - #ifndef offsetof #define offsetof(p_type,field) ((size_t)&(((p_type *)0)->field)) #endif -/* These are the public options that can change during matching. */ - #define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL) -/* Private flags containing information about the compiled regex. They used to -live at the top end of the options word, but that got almost full, so now they -are in a 16-bit flags word. */ -#define PCRE_NOPARTIAL 0x0001 /* can't use partial with this regex */ -#define PCRE_FIRSTSET 0x0002 /* first_byte is set */ -#define PCRE_REQCHSET 0x0004 /* req_byte is set */ -#define PCRE_STARTLINE 0x0008 /* start after \n for multiline */ -#define PCRE_JCHANGED 0x0010 /* j option used in regex */ -#define PCRE_HASCRORLF 0x0020 /* explicit \r or \n in pattern */ +#define PCRE_NOPARTIAL 0x0001 +#define PCRE_FIRSTSET 0x0002 +#define PCRE_REQCHSET 0x0004 +#define PCRE_STARTLINE 0x0008 +#define PCRE_JCHANGED 0x0010 +#define PCRE_HASCRORLF 0x0020 -/* Options for the "extra" block produced by pcre_study(). */ -#define PCRE_STUDY_MAPPED 0x01 /* a map of starting chars exists */ +#define PCRE_STUDY_MAPPED 0x01 -/* Masks for identifying the public options that are permitted at compile -time, run time, or study time, respectively. */ #define PCRE_NEWLINE_BITS (PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|PCRE_NEWLINE_ANY| \ PCRE_NEWLINE_ANYCRLF) @@ -482,37 +326,30 @@ time, run time, or study time, respectively. */ PCRE_PARTIAL|PCRE_DFA_SHORTEST|PCRE_DFA_RESTART|PCRE_NEWLINE_BITS| \ PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE) -#define PUBLIC_STUDY_OPTIONS 0 /* None defined */ +#define PUBLIC_STUDY_OPTIONS 0 -/* Magic number to provide a small check against being handed junk. Also used -to detect whether a pattern was compiled on a host of different endianness. */ -#define MAGIC_NUMBER 0x50435245UL /* 'PCRE' */ +#define MAGIC_NUMBER 0x50435245UL -/* Negative values for the firstchar and reqchar variables */ #define REQ_UNSET (-2) #define REQ_NONE (-1) -/* The maximum remaining length of subject we are prepared to search for a -req_byte match. */ #define REQ_BYTE_MAX 1000 /* Flags added to firstbyte or reqbyte; a "non-literal" item is either a variable-length repeat, or a anything other than literal characters. */ -#define REQ_CASELESS 0x0100 /* indicates caselessness */ -#define REQ_VARY 0x0200 /* reqbyte followed non-literal item */ +#define REQ_CASELESS 0x0100 +#define REQ_VARY 0x0200 -/* Miscellaneous definitions */ typedef int BOOL; #define FALSE 0 #define TRUE 1 -/* Escape items that are just an encoding of a particular data value. */ #ifndef ESC_e #define ESC_e 27 @@ -530,32 +367,27 @@ typedef int BOOL; #define ESC_r '\r' #endif -/* We can't officially use ESC_t because it is a POSIX reserved identifier -(presumably because of all the others like size_t). */ #ifndef ESC_tee #define ESC_tee '\t' #endif -/* Codes for different types of Unicode property */ -#define PT_ANY 0 /* Any property - matches all chars */ -#define PT_LAMP 1 /* L& - the union of Lu, Ll, Lt */ -#define PT_GC 2 /* General characteristic (e.g. L) */ -#define PT_PC 3 /* Particular characteristic (e.g. Lu) */ -#define PT_SC 4 /* Script (e.g. Han) */ +#define PT_ANY 0 +#define PT_LAMP 1 +#define PT_GC 2 +#define PT_PC 3 +#define PT_SC 4 -/* Flag bits and data types for the extended class (OP_XCLASS) for classes that -contain UTF-8 characters with values greater than 255. */ -#define XCL_NOT 0x01 /* Flag: this is a negative class */ -#define XCL_MAP 0x02 /* Flag: a 32-byte map is present */ +#define XCL_NOT 0x01 +#define XCL_MAP 0x02 -#define XCL_END 0 /* Marks end of individual items */ -#define XCL_SINGLE 1 /* Single item (one multibyte char) follows */ -#define XCL_RANGE 2 /* A range (two multibyte chars) follows */ -#define XCL_PROP 3 /* Unicode property (2-byte property code follows) */ -#define XCL_NOTPROP 4 /* Unicode inverted property (ditto) */ +#define XCL_END 0 +#define XCL_SINGLE 1 +#define XCL_RANGE 2 +#define XCL_PROP 3 +#define XCL_NOTPROP 4 /* These are escaped items that aren't just an encoding of a particular data value such as \n. They must have non-zero values, as check_escape() returns @@ -581,162 +413,158 @@ that follow must also be updated to match. There is also a table called "coptable" in pcre_dfa_exec.c that must be updated. */ enum { - OP_END, /* 0 End of pattern */ + OP_END, - /* Values corresponding to backslashed metacharacters */ - OP_SOD, /* 1 Start of data: \A */ - OP_SOM, /* 2 Start of match (subject + offset): \G */ - OP_SET_SOM, /* 3 Set start of match (\K) */ - OP_NOT_WORD_BOUNDARY, /* 4 \B */ - OP_WORD_BOUNDARY, /* 5 \b */ - OP_NOT_DIGIT, /* 6 \D */ - OP_DIGIT, /* 7 \d */ - OP_NOT_WHITESPACE, /* 8 \S */ - OP_WHITESPACE, /* 9 \s */ - OP_NOT_WORDCHAR, /* 10 \W */ - OP_WORDCHAR, /* 11 \w */ - OP_ANY, /* 12 Match any character */ - OP_ANYBYTE, /* 13 Match any byte (\C); different to OP_ANY for UTF-8 */ - OP_NOTPROP, /* 14 \P (not Unicode property) */ - OP_PROP, /* 15 \p (Unicode property) */ - OP_ANYNL, /* 16 \R (any newline sequence) */ - OP_NOT_HSPACE, /* 17 \H (not horizontal whitespace) */ - OP_HSPACE, /* 18 \h (horizontal whitespace) */ - OP_NOT_VSPACE, /* 19 \V (not vertical whitespace) */ - OP_VSPACE, /* 20 \v (vertical whitespace) */ - OP_EXTUNI, /* 21 \X (extended Unicode sequence */ - OP_EODN, /* 22 End of data or \n at end of data: \Z. */ - OP_EOD, /* 23 End of data: \z */ - OP_OPT, /* 24 Set runtime options */ - OP_CIRC, /* 25 Start of line - varies with multiline switch */ - OP_DOLL, /* 26 End of line - varies with multiline switch */ - OP_CHAR, /* 27 Match one character, casefully */ - OP_CHARNC, /* 28 Match one character, caselessly */ - OP_NOT, /* 29 Match one character, not the following one */ + OP_SOD, + OP_SOM, + OP_SET_SOM, + OP_NOT_WORD_BOUNDARY, + OP_WORD_BOUNDARY, + OP_NOT_DIGIT, + OP_DIGIT, + OP_NOT_WHITESPACE, + OP_WHITESPACE, + OP_NOT_WORDCHAR, + OP_WORDCHAR, + OP_ANY, + OP_ANYBYTE, + OP_NOTPROP, + OP_PROP, + OP_ANYNL, + OP_NOT_HSPACE, + OP_HSPACE, + OP_NOT_VSPACE, + OP_VSPACE, + OP_EXTUNI, + OP_EODN, + OP_EOD, - OP_STAR, /* 30 The maximizing and minimizing versions of */ - OP_MINSTAR, /* 31 these six opcodes must come in pairs, with */ - OP_PLUS, /* 32 the minimizing one second. */ - OP_MINPLUS, /* 33 This first set applies to single characters.*/ - OP_QUERY, /* 34 */ - OP_MINQUERY, /* 35 */ + OP_OPT, + OP_CIRC, + OP_DOLL, + OP_CHAR, + OP_CHARNC, + OP_NOT, - OP_UPTO, /* 36 From 0 to n matches */ - OP_MINUPTO, /* 37 */ - OP_EXACT, /* 38 Exactly n matches */ + OP_STAR, + OP_MINSTAR, + OP_PLUS, + OP_MINPLUS, + OP_QUERY, + OP_MINQUERY, - OP_POSSTAR, /* 39 Possessified star */ - OP_POSPLUS, /* 40 Possessified plus */ - OP_POSQUERY, /* 41 Posesssified query */ - OP_POSUPTO, /* 42 Possessified upto */ + OP_UPTO, + OP_MINUPTO, + OP_EXACT, - OP_NOTSTAR, /* 43 The maximizing and minimizing versions of */ - OP_NOTMINSTAR, /* 44 these six opcodes must come in pairs, with */ - OP_NOTPLUS, /* 45 the minimizing one second. They must be in */ - OP_NOTMINPLUS, /* 46 exactly the same order as those above. */ - OP_NOTQUERY, /* 47 This set applies to "not" single characters. */ - OP_NOTMINQUERY, /* 48 */ + OP_POSSTAR, + OP_POSPLUS, + OP_POSQUERY, + OP_POSUPTO, - OP_NOTUPTO, /* 49 From 0 to n matches */ - OP_NOTMINUPTO, /* 50 */ - OP_NOTEXACT, /* 51 Exactly n matches */ + OP_NOTSTAR, + OP_NOTMINSTAR, + OP_NOTPLUS, + OP_NOTMINPLUS, + OP_NOTQUERY, + OP_NOTMINQUERY, - OP_NOTPOSSTAR, /* 52 Possessified versions */ - OP_NOTPOSPLUS, /* 53 */ - OP_NOTPOSQUERY, /* 54 */ - OP_NOTPOSUPTO, /* 55 */ + OP_NOTUPTO, + OP_NOTMINUPTO, + OP_NOTEXACT, - OP_TYPESTAR, /* 56 The maximizing and minimizing versions of */ - OP_TYPEMINSTAR, /* 57 these six opcodes must come in pairs, with */ - OP_TYPEPLUS, /* 58 the minimizing one second. These codes must */ - OP_TYPEMINPLUS, /* 59 be in exactly the same order as those above. */ - OP_TYPEQUERY, /* 60 This set applies to character types such as \d */ - OP_TYPEMINQUERY, /* 61 */ + OP_NOTPOSSTAR, + OP_NOTPOSPLUS, + OP_NOTPOSQUERY, + OP_NOTPOSUPTO, - OP_TYPEUPTO, /* 62 From 0 to n matches */ - OP_TYPEMINUPTO, /* 63 */ - OP_TYPEEXACT, /* 64 Exactly n matches */ + OP_TYPESTAR, + OP_TYPEMINSTAR, + OP_TYPEPLUS, + OP_TYPEMINPLUS, + OP_TYPEQUERY, + OP_TYPEMINQUERY, - OP_TYPEPOSSTAR, /* 65 Possessified versions */ - OP_TYPEPOSPLUS, /* 66 */ - OP_TYPEPOSQUERY, /* 67 */ - OP_TYPEPOSUPTO, /* 68 */ + OP_TYPEUPTO, + OP_TYPEMINUPTO, + OP_TYPEEXACT, - OP_CRSTAR, /* 69 The maximizing and minimizing versions of */ - OP_CRMINSTAR, /* 70 all these opcodes must come in pairs, with */ - OP_CRPLUS, /* 71 the minimizing one second. These codes must */ - OP_CRMINPLUS, /* 72 be in exactly the same order as those above. */ - OP_CRQUERY, /* 73 These are for character classes and back refs */ - OP_CRMINQUERY, /* 74 */ - OP_CRRANGE, /* 75 These are different to the three sets above. */ - OP_CRMINRANGE, /* 76 */ + OP_TYPEPOSSTAR, + OP_TYPEPOSPLUS, + OP_TYPEPOSQUERY, + OP_TYPEPOSUPTO, - OP_CLASS, /* 77 Match a character class, chars < 256 only */ + OP_CRSTAR, + OP_CRMINSTAR, + OP_CRPLUS, + OP_CRMINPLUS, + OP_CRQUERY, + OP_CRMINQUERY, + OP_CRRANGE, + OP_CRMINRANGE, + + OP_CLASS, OP_NCLASS, /* 78 Same, but the bitmap was created from a negative class - the difference is relevant only when a UTF-8 character > 255 is encountered. */ - OP_XCLASS, /* 79 Extended class for handling UTF-8 chars within the - class. This does both positive and negative. */ + OP_XCLASS, - OP_REF, /* 80 Match a back reference */ - OP_RECURSE, /* 81 Match a numbered subpattern (possibly recursive) */ - OP_CALLOUT, /* 82 Call out to external function if provided */ + OP_REF, + OP_RECURSE, + OP_CALLOUT, - OP_ALT, /* 83 Start of alternation */ - OP_KET, /* 84 End of group that doesn't have an unbounded repeat */ - OP_KETRMAX, /* 85 These two must remain together and in this */ - OP_KETRMIN, /* 86 order. They are for groups the repeat for ever. */ + OP_ALT, + OP_KET, + OP_KETRMAX, + OP_KETRMIN, - /* The assertions must come before BRA, CBRA, ONCE, and COND.*/ - OP_ASSERT, /* 87 Positive lookahead */ - OP_ASSERT_NOT, /* 88 Negative lookahead */ - OP_ASSERTBACK, /* 89 Positive lookbehind */ - OP_ASSERTBACK_NOT, /* 90 Negative lookbehind */ - OP_REVERSE, /* 91 Move pointer back - used in lookbehind assertions */ + + OP_ASSERT, + OP_ASSERT_NOT, + OP_ASSERTBACK, + OP_ASSERTBACK_NOT, + OP_REVERSE, /* ONCE, BRA, CBRA, and COND must come after the assertions, with ONCE first, as there's a test for >= ONCE for a subpattern that isn't an assertion. */ - OP_ONCE, /* 92 Atomic group */ - OP_BRA, /* 93 Start of non-capturing bracket */ - OP_CBRA, /* 94 Start of capturing bracket */ - OP_COND, /* 95 Conditional group */ + OP_ONCE, + OP_BRA, + OP_CBRA, + OP_COND, /* These three must follow the previous three, in the same order. There's a check for >= SBRA to distinguish the two sets. */ - OP_SBRA, /* 96 Start of non-capturing bracket, check empty */ - OP_SCBRA, /* 97 Start of capturing bracket, check empty */ - OP_SCOND, /* 98 Conditional group, check empty */ + OP_SBRA, + OP_SCBRA, + OP_SCOND, - OP_CREF, /* 99 Used to hold a capture number as condition */ - OP_RREF, /* 100 Used to hold a recursion number as condition */ - OP_DEF, /* 101 The DEFINE condition */ + OP_CREF, + OP_RREF, + OP_DEF, - OP_BRAZERO, /* 102 These two must remain together and in this */ - OP_BRAMINZERO, /* 103 order. */ + OP_BRAZERO, + OP_BRAMINZERO, - /* These are backtracking control verbs */ - OP_PRUNE, /* 104 */ - OP_SKIP, /* 105 */ - OP_THEN, /* 106 */ - OP_COMMIT, /* 107 */ - /* These are forced failure and success verbs */ + OP_PRUNE, + OP_SKIP, + OP_THEN, + OP_COMMIT, - OP_FAIL, /* 108 */ - OP_ACCEPT /* 109 */ + + + OP_FAIL, + OP_ACCEPT }; -/* This macro defines textual names for all the opcodes. These are used only -for debugging. The macro is referenced only in pcre_printint.c. */ - #define OP_NAME_LIST \ "End", "\\A", "\\G", "\\K", "\\B", "\\b", "\\D", "\\d", \ "\\S", "\\s", "\\W", "\\w", "Any", "Anybyte", \ @@ -758,77 +586,64 @@ for debugging. The macro is referenced only in pcre_printint.c. */ "*PRUNE", "*SKIP", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT" -/* This macro defines the length of fixed length operations in the compiled -regex. The lengths are used when searching for specific things, and also in the -debugging printing of a compiled regex. We use a macro so that it can be -defined close to the definitions of the opcodes themselves. - -As things have been extended, some of these are no longer fixed lenths, but are -minima instead. For example, the length of a single-character repeat may vary -in UTF-8 mode. The code that uses this table must know about such things. */ - #define OP_LENGTHS \ - 1, /* End */ \ - 1, 1, 1, 1, 1, /* \A, \G, \K, \B, \b */ \ - 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */ \ - 1, 1, /* Any, Anybyte */ \ - 3, 3, 1, /* NOTPROP, PROP, EXTUNI */ \ - 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */ \ - 1, 1, 2, 1, 1, /* \Z, \z, Opt, ^, $ */ \ - 2, /* Char - the minimum length */ \ - 2, /* Charnc - the minimum length */ \ - 2, /* not */ \ - /* Positive single-char repeats ** These are */ \ - 2, 2, 2, 2, 2, 2, /* *, *?, +, +?, ?, ?? ** minima in */ \ - 4, 4, 4, /* upto, minupto, exact ** UTF-8 mode */ \ - 2, 2, 2, 4, /* *+, ++, ?+, upto+ */ \ - /* Negative single-char repeats - only for chars < 256 */ \ - 2, 2, 2, 2, 2, 2, /* NOT *, *?, +, +?, ?, ?? */ \ - 4, 4, 4, /* NOT upto, minupto, exact */ \ - 2, 2, 2, 4, /* Possessive *, +, ?, upto */ \ - /* Positive type repeats */ \ - 2, 2, 2, 2, 2, 2, /* Type *, *?, +, +?, ?, ?? */ \ - 4, 4, 4, /* Type upto, minupto, exact */ \ - 2, 2, 2, 4, /* Possessive *+, ++, ?+, upto+ */ \ - /* Character class & ref repeats */ \ - 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ \ - 5, 5, /* CRRANGE, CRMINRANGE */ \ - 33, /* CLASS */ \ - 33, /* NCLASS */ \ - 0, /* XCLASS - variable length */ \ - 3, /* REF */ \ - 1+LINK_SIZE, /* RECURSE */ \ - 2+2*LINK_SIZE, /* CALLOUT */ \ - 1+LINK_SIZE, /* Alt */ \ - 1+LINK_SIZE, /* Ket */ \ - 1+LINK_SIZE, /* KetRmax */ \ - 1+LINK_SIZE, /* KetRmin */ \ - 1+LINK_SIZE, /* Assert */ \ - 1+LINK_SIZE, /* Assert not */ \ - 1+LINK_SIZE, /* Assert behind */ \ - 1+LINK_SIZE, /* Assert behind not */ \ - 1+LINK_SIZE, /* Reverse */ \ - 1+LINK_SIZE, /* ONCE */ \ - 1+LINK_SIZE, /* BRA */ \ - 3+LINK_SIZE, /* CBRA */ \ - 1+LINK_SIZE, /* COND */ \ - 1+LINK_SIZE, /* SBRA */ \ - 3+LINK_SIZE, /* SCBRA */ \ - 1+LINK_SIZE, /* SCOND */ \ - 3, /* CREF */ \ - 3, /* RREF */ \ - 1, /* DEF */ \ - 1, 1, /* BRAZERO, BRAMINZERO */ \ - 1, 1, 1, 1, /* PRUNE, SKIP, THEN, COMMIT, */ \ - 1, 1 /* FAIL, ACCEPT */ + 1, \ + 1, 1, 1, 1, 1, \ + 1, 1, 1, 1, 1, 1, \ + 1, 1, \ + 3, 3, 1, \ + 1, 1, 1, 1, 1, \ + 1, 1, 2, 1, 1, \ + 2, \ + 2, \ + 2, \ + \ + 2, 2, 2, 2, 2, 2, \ + 4, 4, 4, \ + 2, 2, 2, 4, \ + \ + 2, 2, 2, 2, 2, 2, \ + 4, 4, 4, \ + 2, 2, 2, 4, \ + \ + 2, 2, 2, 2, 2, 2, \ + 4, 4, 4, \ + 2, 2, 2, 4, \ + \ + 1, 1, 1, 1, 1, 1, \ + 5, 5, \ + 33, \ + 33, \ + 0, \ + 3, \ + 1+LINK_SIZE, \ + 2+2*LINK_SIZE, \ + 1+LINK_SIZE, \ + 1+LINK_SIZE, \ + 1+LINK_SIZE, \ + 1+LINK_SIZE, \ + 1+LINK_SIZE, \ + 1+LINK_SIZE, \ + 1+LINK_SIZE, \ + 1+LINK_SIZE, \ + 1+LINK_SIZE, \ + 1+LINK_SIZE, \ + 1+LINK_SIZE, \ + 3+LINK_SIZE, \ + 1+LINK_SIZE, \ + 1+LINK_SIZE, \ + 3+LINK_SIZE, \ + 1+LINK_SIZE, \ + 3, \ + 3, \ + 1, \ + 1, 1, \ + 1, 1, 1, 1, \ + 1, 1 -/* A magic value for OP_RREF to indicate the "any recursion" condition. */ - #define RREF_ANY 0xffff -/* Error code numbers. They are given names so that they can more easily be -tracked. */ enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, ERR10, ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, @@ -857,84 +672,73 @@ NOTE NOTE NOTE: typedef struct real_pcre { pcre_uint32 magic_number; - pcre_uint32 size; /* Total that was malloced */ - pcre_uint32 options; /* Public options */ - pcre_uint16 flags; /* Private flags */ - pcre_uint16 dummy1; /* For future use */ + pcre_uint32 size; + pcre_uint32 options; + pcre_uint16 flags; + pcre_uint16 dummy1; pcre_uint16 top_bracket; pcre_uint16 top_backref; pcre_uint16 first_byte; pcre_uint16 req_byte; - pcre_uint16 name_table_offset; /* Offset to name table that follows */ - pcre_uint16 name_entry_size; /* Size of any name items */ - pcre_uint16 name_count; /* Number of name items */ - pcre_uint16 ref_count; /* Reference count */ + pcre_uint16 name_table_offset; + pcre_uint16 name_entry_size; + pcre_uint16 name_count; + pcre_uint16 ref_count; - const unsigned char *tables; /* Pointer to tables or NULL for std */ - const unsigned char *nullpad; /* NULL padding */ + const unsigned char *tables; + const unsigned char *nullpad; } real_pcre; -/* The format of the block used to store data from pcre_study(). The same -remark (see NOTE above) about extending this structure applies. */ typedef struct pcre_study_data { - pcre_uint32 size; /* Total that was malloced */ + pcre_uint32 size; pcre_uint32 options; uschar start_bits[32]; } pcre_study_data; -/* Structure for passing "static" information around between the functions -doing the compiling, so that they are thread-safe. */ typedef struct compile_data { - const uschar *lcc; /* Points to lower casing table */ - const uschar *fcc; /* Points to case-flipping table */ - const uschar *cbits; /* Points to character type table */ - const uschar *ctypes; /* Points to table of type maps */ - const uschar *start_workspace;/* The start of working space */ - const uschar *start_code; /* The start of the compiled code */ - const uschar *start_pattern; /* The start of the pattern */ - const uschar *end_pattern; /* The end of the pattern */ - uschar *hwm; /* High watermark of workspace */ - uschar *name_table; /* The name/number table */ - int names_found; /* Number of entries so far */ - int name_entry_size; /* Size of each entry */ - int bracount; /* Count of capturing parens as we compile */ - int final_bracount; /* Saved value after first pass */ - int top_backref; /* Maximum back reference */ - unsigned int backref_map; /* Bitmap of low back refs */ - int external_options; /* External (initial) options */ - int external_flags; /* External flag bits to be set */ - int req_varyopt; /* "After variable item" flag for reqbyte */ - BOOL had_accept; /* (*ACCEPT) encountered */ - int nltype; /* Newline type */ - int nllen; /* Newline string length */ - uschar nl[4]; /* Newline string when fixed length */ + const uschar *lcc; + const uschar *fcc; + const uschar *cbits; + const uschar *ctypes; + const uschar *start_workspace; + const uschar *start_code; + const uschar *start_pattern; + const uschar *end_pattern; + uschar *hwm; + uschar *name_table; + int names_found; + int name_entry_size; + int bracount; + int final_bracount; + int top_backref; + unsigned int backref_map; + int external_options; + int external_flags; + int req_varyopt; + BOOL had_accept; + int nltype; + int nllen; + uschar nl[4]; } compile_data; -/* Structure for maintaining a chain of pointers to the currently incomplete -branches, for testing for left recursion. */ typedef struct branch_chain { struct branch_chain *outer; uschar *current; } branch_chain; -/* Structure for items in a linked list that represents an explicit recursive -call within the pattern. */ typedef struct recursion_info { - struct recursion_info *prevrec; /* Previous recursion record (or NULL) */ - int group_num; /* Number of group that was called */ - const uschar *after_call; /* "Return value": points after the call in the expr */ - USPTR save_start; /* Old value of mstart */ - int *offset_save; /* Pointer to start of saved offsets */ - int saved_max; /* Number of saved offsets */ + struct recursion_info *prevrec; + int group_num; + const uschar *after_call; + USPTR save_start; + int *offset_save; + int saved_max; } recursion_info; -/* Structure for building a chain of data for holding the values of the subject -pointer at the start of each subpattern, so as to detect when an empty string -has been matched by a subpattern - to break infinite loops. */ typedef struct eptrblock { struct eptrblock *epb_prev; @@ -942,86 +746,76 @@ typedef struct eptrblock { } eptrblock; -/* Structure for passing "static" information around between the functions -doing traditional NFA matching, so that they are thread-safe. */ - typedef struct match_data { - unsigned long int match_call_count; /* As it says */ - unsigned long int match_limit; /* As it says */ - unsigned long int match_limit_recursion; /* As it says */ - int *offset_vector; /* Offset vector */ - int offset_end; /* One past the end */ - int offset_max; /* The maximum usable for return data */ - int nltype; /* Newline type */ - int nllen; /* Newline string length */ - uschar nl[4]; /* Newline string when fixed */ - const uschar *lcc; /* Points to lower casing table */ - const uschar *ctypes; /* Points to table of type maps */ - BOOL offset_overflow; /* Set if too many extractions */ - BOOL notbol; /* NOTBOL flag */ - BOOL noteol; /* NOTEOL flag */ - BOOL utf8; /* UTF8 flag */ - BOOL endonly; /* Dollar not before final \n */ - BOOL notempty; /* Empty string match not wanted */ - BOOL partial; /* PARTIAL flag */ - BOOL hitend; /* Hit the end of the subject at some point */ - BOOL bsr_anycrlf; /* \R is just any CRLF, not full Unicode */ - const uschar *start_code; /* For use when recursing */ - USPTR start_subject; /* Start of the subject string */ - USPTR end_subject; /* End of the subject string */ - USPTR start_match_ptr; /* Start of matched string */ - USPTR end_match_ptr; /* Subject position at end match */ - int end_offset_top; /* Highwater mark at end of match */ - int capture_last; /* Most recent capture number */ - int start_offset; /* The start offset value */ - eptrblock *eptrchain; /* Chain of eptrblocks for tail recursions */ - int eptrn; /* Next free eptrblock */ - recursion_info *recursive; /* Linked list of recursion data */ - void *callout_data; /* To pass back to callouts */ + unsigned long int match_call_count; + unsigned long int match_limit; + unsigned long int match_limit_recursion; + int *offset_vector; + int offset_end; + int offset_max; + int nltype; + int nllen; + uschar nl[4]; + const uschar *lcc; + const uschar *ctypes; + BOOL offset_overflow; + BOOL notbol; + BOOL noteol; + BOOL utf8; + BOOL endonly; + BOOL notempty; + BOOL partial; + BOOL hitend; + BOOL bsr_anycrlf; + const uschar *start_code; + USPTR start_subject; + USPTR end_subject; + USPTR start_match_ptr; + USPTR end_match_ptr; + int end_offset_top; + int capture_last; + int start_offset; + eptrblock *eptrchain; + int eptrn; + recursion_info *recursive; + void *callout_data; } match_data; -/* A similar structure is used for the same purpose by the DFA matching -functions. */ typedef struct dfa_match_data { - const uschar *start_code; /* Start of the compiled pattern */ - const uschar *start_subject; /* Start of the subject string */ - const uschar *end_subject; /* End of subject string */ - const uschar *tables; /* Character tables */ - int moptions; /* Match options */ - int poptions; /* Pattern options */ - int nltype; /* Newline type */ - int nllen; /* Newline string length */ - uschar nl[4]; /* Newline string when fixed */ - void *callout_data; /* To pass back to callouts */ + const uschar *start_code; + const uschar *start_subject; + const uschar *end_subject; + const uschar *tables; + int moptions; + int poptions; + int nltype; + int nllen; + uschar nl[4]; + void *callout_data; } dfa_match_data; -/* Bit definitions for entries in the pcre_ctypes table. */ #define ctype_space 0x01 #define ctype_letter 0x02 #define ctype_digit 0x04 #define ctype_xdigit 0x08 -#define ctype_word 0x10 /* alphanumeric or '_' */ -#define ctype_meta 0x80 /* regexp meta char or zero (end pattern) */ +#define ctype_word 0x10 +#define ctype_meta 0x80 -/* Offsets for the bitmap tables in pcre_cbits. Each table contains a set -of bits for a class map. Some classes are built by combining these tables. */ -#define cbit_space 0 /* [:space:] or \s */ -#define cbit_xdigit 32 /* [:xdigit:] */ -#define cbit_digit 64 /* [:digit:] or \d */ -#define cbit_upper 96 /* [:upper:] */ -#define cbit_lower 128 /* [:lower:] */ -#define cbit_word 160 /* [:word:] or \w */ -#define cbit_graph 192 /* [:graph:] */ -#define cbit_print 224 /* [:print:] */ -#define cbit_punct 256 /* [:punct:] */ -#define cbit_cntrl 288 /* [:cntrl:] */ -#define cbit_length 320 /* Length of the cbits table */ +#define cbit_space 0 +#define cbit_xdigit 32 +#define cbit_digit 64 +#define cbit_upper 96 +#define cbit_lower 128 +#define cbit_word 160 +#define cbit_graph 192 +#define cbit_print 224 +#define cbit_punct 256 +#define cbit_cntrl 288 +#define cbit_length 320 -/* Offsets of the various tables from the base tables pointer, and -total length. */ #define lcc_offset 0 #define fcc_offset 256 @@ -1029,10 +823,6 @@ total length. */ #define ctypes_offset (cbits_offset + cbit_length) #define tables_length (ctypes_offset + 256) -/* Layout of the UCP type table that translates property names into types and -codes. Each entry used to point directly to a name, but to reduce the number of -relocations in shared libraries, it now has an offset into a single string -instead. */ typedef struct { pcre_uint16 name_offset; @@ -1041,11 +831,6 @@ typedef struct { } ucp_type_table; -/* Internal shared data tables. These are tables that are used by more than one -of the exported public functions. They have to be "external" in the C sense, -but are not part of the PCRE public API. The data for these tables is in the -pcre_tables.c module. */ - extern const int _pcre_utf8_table1[]; extern const int _pcre_utf8_table2[]; extern const int _pcre_utf8_table3[]; @@ -1053,7 +838,6 @@ extern const uschar _pcre_utf8_table4[]; extern const int _pcre_utf8_table1_size; -// extern const char _pcre_utt_names[]; extern const ucp_type_table _pcre_utt[]; extern const int _pcre_utt_size; @@ -1062,17 +846,11 @@ extern const uschar _pcre_default_tables[]; extern const uschar _pcre_OP_lengths[]; -/* Internal shared functions. These are functions that are used by more than -one of the exported public functions. They have to be "external" in the C -sense, but are not part of the PCRE public API. */ - extern BOOL _pcre_is_newline(const uschar *, int, const uschar *, int *, BOOL); extern int _pcre_ord2utf8(int, uschar *); extern real_pcre *_pcre_try_flipped(const real_pcre *, real_pcre *, const pcre_study_data *, pcre_study_data *); -// extern int _pcre_ucp_findprop(const unsigned int, int *, int *); -// extern unsigned int _pcre_ucp_othercase(const unsigned int); extern int _pcre_valid_utf8(const uschar *, int); extern BOOL _pcre_was_newline(const uschar *, int, const uschar *, int *, BOOL); @@ -1080,4 +858,4 @@ extern BOOL _pcre_xclass(int, const uschar *); #endif -/* End of pcre_internal.h */ + diff --git a/package/re/pcre_newline.c b/package/re/pcre_newline.c index 835c7fcf8..381705982 100644 --- a/package/re/pcre_newline.c +++ b/package/re/pcre_newline.c @@ -1,13 +1,3 @@ - -/* This module contains internal functions for testing newlines when more than -one kind of newline is to be recognized. When a newline is found, its length is -returned. In principle, we could implement several newline "types", each -referring to a different set of newline characters. At present, PCRE supports -only NLTYPE_FIXED, which gets handled without these functions, NLTYPE_ANYCRLF, -and NLTYPE_ANY. The full list of Unicode newline characters is taken from -http://unicode.org/unicode/reports/tr18/. */ - - #include "re_config.h" #include "pcre_internal.h" diff --git a/package/re/pcre_ord2utf8.c b/package/re/pcre_ord2utf8.c index df5d16116..ace40064a 100644 --- a/package/re/pcre_ord2utf8.c +++ b/package/re/pcre_ord2utf8.c @@ -1,7 +1,3 @@ - -/* This file contains a private PCRE function that converts an ordinal -character value into a UTF8 string. */ - #include "re_config.h" #include "pcre_internal.h" diff --git a/package/re/pcre_tables.c b/package/re/pcre_tables.c index ae101f714..b7bd9fb5a 100644 --- a/package/re/pcre_tables.c +++ b/package/re/pcre_tables.c @@ -1,10 +1,4 @@ -/* This module contains some fixed tables that are used by more than one of the -PCRE code modules. The tables are also #included by the pcretest program, which -uses macros to change their names from _pcre_xxx to xxxx, thereby avoiding name -clashes with the library. */ - - #include "re_config.h" #include "pcre_internal.h" diff --git a/package/re/pcre_try_flipped.c b/package/re/pcre_try_flipped.c index 4991650c9..a60506d29 100644 --- a/package/re/pcre_try_flipped.c +++ b/package/re/pcre_try_flipped.c @@ -1,9 +1,4 @@ -/* This module contains an internal function that tests a compiled pattern to -see if it was compiled with the opposite endianness. If so, it uses an -auxiliary local function to flip the appropriate bytes. */ - - #include "re_config.h" #include "pcre_internal.h" diff --git a/package/re/pcre_valid_utf8.c b/package/re/pcre_valid_utf8.c index f1f7f123a..8c64af525 100644 --- a/package/re/pcre_valid_utf8.c +++ b/package/re/pcre_valid_utf8.c @@ -1,8 +1,4 @@ -/* This module contains an internal function for validating UTF-8 character -strings. */ - - #include "re_config.h" #include "pcre_internal.h" diff --git a/package/re/pcre_xclass.c b/package/re/pcre_xclass.c index 5e00a279b..39184a627 100644 --- a/package/re/pcre_xclass.c +++ b/package/re/pcre_xclass.c @@ -1,9 +1,4 @@ -/* This module contains an internal function that is used to match an extended -class (one that contains characters whose values are > 255). It is used by both -pcre_exec() and pcre_def_exec(). */ - - #include "re_config.h" #include "pcre_internal.h" diff --git a/package/re/re-api-adapter.c b/package/re/re-api-adapter.c index 57548c53e..31fe7c2bb 100644 --- a/package/re/re-api-adapter.c +++ b/package/re/re-api-adapter.c @@ -18,220 +18,198 @@ #error PikaScript version 1.10.5 or later is required. #endif -void re_Match___init__args(PikaObj* self, char* sub, int* vec, int ven); +#define raise_error \ + { \ + obj_setErrorCode(self, -__LINE__); \ + } -void pre_init_re(PikaObj* self) { +#define tu_getNew(name, obj_name) \ + PikaTuple *name = New_tuple(); \ + Any obj_name = newNormalObj(New_PikaStdData_Tuple); \ + obj_setPtr(obj_name, "list", name); + +#define tu_append(tup, val, type) \ + { \ + Arg *_arg = arg_new##type(val); \ + list_append(&(tup)->super, _arg); \ + arg_deinit(_arg); \ + } +#define li_append(list, val, type) \ + { \ + Arg *_arg = arg_new##type(val); \ + PikaStdData_List_append(list, _arg); \ + arg_deinit(_arg); \ + } + +typedef PikaObj *Any; + +void re_Match___init__args(PikaObj *self, char *sub, int *vec, int ven); +int _get_flags(PikaTuple *val); +PikaObj *__split(void *pattern__or__re, + char *subject, + int max_split, + int flags, + int mode_re); +PikaObj *__findall(void *pattern__or__re, + char *subject, + int flags, + int mode_re); +PikaObj *__subn(void *pattern__or__re, + char *repl, + char *subjet, + int count, + int flags, + int mode_re); +void re___init__(PikaObj *self) +{ + obj_setInt(self, "A", PCRE_ONLY_ASCII); obj_setInt(self, "I", PCRE_CASELESS); obj_setInt(self, "M", PCRE_MULTILINE); + obj_setInt(self, "S", PCRE_DOTALL); + obj_setInt(self, "ASCII", PCRE_ONLY_ASCII); obj_setInt(self, "IGNORECASE", PCRE_CASELESS); obj_setInt(self, "MULTILINE", PCRE_MULTILINE); obj_setInt(self, "DOTALL", PCRE_DOTALL); } -void re___init__(PikaObj* self) { - pre_init_re(self); -} - -PikaObj* re_findall(PikaObj* self, - char* pattern, - char* subject, - PikaTuple* val) { +PikaObj *re_findall(PikaObj *self, + char *pattern, + char *subject, + PikaTuple *val) +{ int flags = 0; - for (size_t i = 0; i < tuple_getSize(val); i++) { - Arg* arg_i = tuple_getArg(val, i); - if (arg_getType(arg_i) != ARG_TYPE_INT) { - obj_setErrorCode(self, -__LINE__); - return NULL; - } - flags |= arg_getInt(arg_i); + flags = _get_flags(val); + if (flags < 0) + { + obj_setErrorCode(self, __LINE__); + return NULL; } - - PikaObj* list = newNormalObj(New_PikaStdData_List); - PikaStdData_List___init__(list); - PikaObj* sub_list; - int length = strlen(subject); - // int n = 0; - flags |= PCRE_UTF8; - int j2 = 0; - int m_n = -1; - int brackets = -1; - int** vcs = re_searchall(pattern, subject, length, &m_n, &brackets, flags); - char* b = NULL; - Arg* str_arg1; - Arg* sub_arg; - if (!vcs) { - if (m_n < 0) - obj_setErrorCode(self, -__LINE__); - return list; - } - if (brackets == 1) { - for (int i = 0; i < m_n; i++) { - int* v = vcs[i]; - length = v[1] - v[0]; - if (length) { - b = malloc(length + 1); - if (!b) - goto e_er; - b[length] = 0; - memcpy(b, subject + v[0], length); - } else { - b = (char*)""; - } - str_arg1 = arg_newStr(b); - PikaStdData_List_append(list, str_arg1); - arg_deinit(str_arg1); - if (length) - free(b); - } - goto e_er; - } - - for (int i = 0; i < m_n; i++) { - int* v = vcs[i]; - length = v[1] - v[0]; - b = malloc(length + 1); - if (!b) - goto e_er; - sub_list = newNormalObj(New_PikaStdData_List); - PikaStdData_List___init__(sub_list); - for (int j = 0; j < brackets; j++) { - j2 = j * 2; - length = v[j2 + 1] - v[j2]; - b[length] = 0; - memcpy(b, subject + v[j2], length); - - str_arg1 = arg_newStr(b); - PikaStdData_List_append(sub_list, str_arg1); - arg_deinit(str_arg1); - } - sub_arg = arg_newRef(sub_list); - PikaStdData_List_append(list, sub_arg); - arg_deinit(sub_arg); - free(b); - } -e_er: - if (vcs) - re_free_searchall(vcs, m_n); + Any list = __findall(pattern, subject, flags, 0); + if (!list) + raise_error; return list; - - // char **res = pcre_findall(pattern, subject, length, &n, flags); - // if (!res) - // return list; - // for (int i = 0; i < n; i++) - // { - // Arg *str_arg1 = arg_newStr(res[i]); - // PikaStdData_List_append(list, str_arg1); - // arg_deinit(str_arg1); - // } - // re_free_findall(res, n); - // return list; } -PikaObj* re_match(PikaObj* self, char* pattern, char* subject, PikaTuple* val) { +PikaObj *re_match(PikaObj *self, char *pattern, char *subject, PikaTuple *val) +{ int flags = 0; - for (size_t i = 0; i < tuple_getSize(val); i++) { - Arg* arg_i = tuple_getArg(val, i); - if (arg_getType(arg_i) != ARG_TYPE_INT) { - obj_setErrorCode(self, -__LINE__); - return NULL; - } - flags |= arg_getInt(arg_i); + flags = _get_flags(val); + if (flags < 0) + { + obj_setErrorCode(self, __LINE__); + return NULL; } - PikaObj* m = newNormalObj(New_re_Match); int ven = -1; - flags |= PCRE_UTF8; - - int* vec = pcre_match(pattern, subject, strlen(subject), &ven, flags); - if (!vec) { + int *vec = pcre_match(pattern, subject, strlen(subject), &ven, flags); + if (!vec) + { if (ven < 0) obj_setErrorCode(self, -__LINE__); return NULL; } + Any m = newNormalObj(New_re_Match); re_Match___init__args(m, subject, vec, ven); return m; } -PikaObj* re_fullmatch(PikaObj* self, - char* pattern, - char* subject, - PikaTuple* val) { +PikaObj *re_fullmatch(PikaObj *self, + char *pattern, + char *subject, + PikaTuple *val) +{ int flags = 0; - for (size_t i = 0; i < tuple_getSize(val); i++) { - Arg* arg_i = tuple_getArg(val, i); - if (arg_getType(arg_i) != ARG_TYPE_INT) { - obj_setErrorCode(self, -__LINE__); - return NULL; - } - flags |= arg_getInt(arg_i); + flags = _get_flags(val); + if (flags < 0) + { + obj_setErrorCode(self, __LINE__); + return NULL; } - PikaObj* m = newNormalObj(New_re_Match); int ven = -1; - flags |= PCRE_UTF8; - int* vec = pcre_fullmatch(pattern, subject, strlen(subject), &ven, flags); - if (!vec) { + int *vec = pcre_fullmatch(pattern, subject, strlen(subject), &ven, flags); + if (!vec) + { if (ven < 0) obj_setErrorCode(self, -__LINE__); return NULL; } + Any m = newNormalObj(New_re_Match); re_Match___init__args(m, subject, vec, ven); return m; } - -PikaObj* re_search(PikaObj* self, - char* pattern, - char* subject, - PikaTuple* val) { +PikaObj *re_search(PikaObj *self, + char *pattern, + char *subject, + PikaTuple *val) +{ int flags = 0; - for (size_t i = 0; i < tuple_getSize(val); i++) { - Arg* arg_i = tuple_getArg(val, i); - if (arg_getType(arg_i) != ARG_TYPE_INT) { - obj_setErrorCode(self, -__LINE__); - return NULL; - } - flags |= arg_getInt(arg_i); + flags = _get_flags(val); + if (flags < 0) + { + obj_setErrorCode(self, __LINE__); + return NULL; } - PikaObj* m = newNormalObj(New_re_Match); int ven = -1; - flags |= PCRE_UTF8; - - int* vec = pcre_search(pattern, subject, strlen(subject), &ven, flags); - if (!vec) { + int *vec = pcre_search(pattern, subject, strlen(subject), &ven, flags); + if (!vec) + { if (ven < 0) obj_setErrorCode(self, -__LINE__); return NULL; } + Any m = newNormalObj(New_re_Match); re_Match___init__args(m, subject, vec, ven); return m; } -char* re_sub(PikaObj* self, - char* pattern, - char* repl, - char* subjet, - PikaTuple* val) { - int flags = 0; - for (size_t i = 0; i < tuple_getSize(val); i++) { - Arg* arg_i = tuple_getArg(val, i); - if (arg_getType(arg_i) != ARG_TYPE_INT) { +char *re_sub(PikaObj *self, + char *pattern, + char *repl, + char *subjet, + PikaTuple *val) +{ + int flags = PCRE_UTF8; + int count = 0; + int argn = tuple_getSize(val); + if (argn >= 1) + { + Arg *arg_i = tuple_getArg(val, 0); + if (arg_getType(arg_i) != ARG_TYPE_INT) + { obj_setErrorCode(self, -__LINE__); return NULL; } - flags |= arg_getInt(arg_i); + count = arg_getInt(arg_i); + } + if (argn >= 2) + { + Arg *arg_i = tuple_getArg(val, 1); + if (arg_getType(arg_i) != ARG_TYPE_INT) + { + obj_setErrorCode(self, -__LINE__); + return NULL; + } + flags = arg_getInt(arg_i); + if (flags | PCRE_ONLY_ASCII) + { + flags &= ~(PCRE_ONLY_ASCII | PCRE_UTF8); + } } int length = strlen(subjet); - flags |= PCRE_UTF8; - char* s = pcre_sub(pattern, repl, subjet, length, flags); - if (!s) { + char *s = pcre_subn(pattern, repl, subjet, length, count, flags, NULL); + if (!s) + { obj_setErrorCode(self, -__LINE__); return NULL; } - if (s == subjet) { + if (s == subjet) + { obj_setStr(self, "_b", subjet); return obj_getStr(self, "_b"); } int len = strlen(s); - char* b = (char*)malloc(len + 1); - if (!b) { + char *b = (char *)malloc(len + 1); + if (!b) + { free(s); return NULL; } @@ -242,56 +220,199 @@ char* re_sub(PikaObj* self, free(s); return obj_getStr(self, "_b"); } -PikaObj* re_compile(PikaObj* self, char* pattern) { - const char* error; +PikaObj *re_subn(PikaObj *self, + char *pattern, + char *repl, + char *subjet, + PikaTuple *val) +{ + int flags = PCRE_UTF8; + int count = 0; + int argn = tuple_getSize(val); + if (argn >= 1) + { + Arg *arg_i = tuple_getArg(val, 0); + if (arg_getType(arg_i) != ARG_TYPE_INT) + { + obj_setErrorCode(self, -__LINE__); + return NULL; + } + count = arg_getInt(arg_i); + } + if (argn >= 2) + { + Arg *arg_i = tuple_getArg(val, 1); + if (arg_getType(arg_i) != ARG_TYPE_INT) + { + obj_setErrorCode(self, -__LINE__); + return NULL; + } + flags = arg_getInt(arg_i); + if (flags | PCRE_ONLY_ASCII) + { + flags &= ~(PCRE_ONLY_ASCII | PCRE_UTF8); + } + } + Any res = __subn(pattern, repl, subjet, count, flags, 0); + if (!res) + { + raise_error; + } + return res; +} +PikaObj *re_compile(PikaObj *self, char *pattern, PikaTuple *val) +{ + const char *error; int erroffset; - pcre* re = pcre_compile(pattern, 0, &error, &erroffset, NULL); - if (!re) { + int flags = _get_flags(val); + if (flags < 0) + { + raise_error; + return NULL; + } + pcre *re = pcre_compile(pattern, flags, &error, &erroffset, NULL); + if (!re) + { obj_setErrorCode(self, erroffset); return NULL; } - PikaObj* m = newNormalObj(New_re_Pattern); + Any m = newNormalObj(New_re_Pattern); obj_setPtr(m, "_re", re); return m; } +PikaObj *re_split(PikaObj *self, char *pattern, char *subject, PikaTuple *val) +{ + int flags = PCRE_UTF8; + int max_split = 0; + int argn = tuple_getSize(val); + if (argn >= 1) + { + Arg *arg_i = tuple_getArg(val, 0); + if (arg_getType(arg_i) != ARG_TYPE_INT) + { + obj_setErrorCode(self, -__LINE__); + return NULL; + } + max_split = arg_getInt(arg_i); + } + if (argn >= 2) + { + Arg *arg_i = tuple_getArg(val, 1); + if (arg_getType(arg_i) != ARG_TYPE_INT) + { + obj_setErrorCode(self, -__LINE__); + return NULL; + } + flags = arg_getInt(arg_i); + if (flags | PCRE_ONLY_ASCII) + { + flags &= ~(PCRE_ONLY_ASCII | PCRE_UTF8); + } + } + Any list = __split(pattern, subject, max_split, flags, 0); + if (!list) + raise_error; + return list; +} -void re_Match___del__(PikaObj* self) { - void* vec = obj_getPtr(self, "_vec"); +char *re_escape(PikaObj *self, char *pattern) +{ + const char *special_chars = "()[]{}?*+-|^$\\.&~# \t\n\r\v\f"; + const int special_chars_len = 25; + if (!pattern) + return NULL; + int n = strlen(pattern); + int after_size = n; + for (int i = 0; i < n; i++) + { + for (int j = 0; j < special_chars_len; j++) + { + if (pattern[i] != special_chars[j]) + continue; + after_size++; + break; + } + } + char *new_s = (char *)malloc(after_size + 1); + if (!new_s) + return NULL; + int at = 0; + while (*pattern) + { + char c = *pattern; + int j = 0; + for (; j < special_chars_len; j++) + { + if (c != special_chars[j]) + continue; + new_s[at++] = '\\'; + break; + } + new_s[at++] = c; + pattern++; + } + new_s[at++] = 0; + obj_setStr(self, "_b", new_s); + free(new_s); + return obj_getStr(self, "_b"); +} + +void re_Match___del__(PikaObj *self) +{ + void *vec = obj_getPtr(self, "_vec"); if (!vec) return; free(vec); } -void re_Match___init__(PikaObj* self) { - if (!obj_isArgExist(self, "_vec")) { +void re_Match___init__(PikaObj *self) +{ + if (!obj_isArgExist(self, "_vec")) + { obj_setPtr(self, "_vec", NULL); obj_setStr(self, "_b", ""); obj_setInt(self, "_ven", 0); obj_setStr(self, "_s", ""); } } -void re_Match___init__args(PikaObj* self, char* sub, int* vec, int ven) { +void re_Match___init__args(PikaObj *self, char *sub, int *vec, int ven) +{ obj_setPtr(self, "_vec", vec); obj_setStr(self, "_b", ""); obj_setInt(self, "_ven", ven); obj_setStr(self, "_s", sub); } -char* re_Match_group(PikaObj* self, int n) { - int* vec = obj_getPtr(self, "_vec"); +char *re_Match_group(PikaObj *self, PikaTuple *val) +{ + int n = 0; + int argn = tuple_getSize(val); + if (argn >= 1) + { + Arg *arg_i = tuple_getArg(val, 0); + if (arg_getType(arg_i) != ARG_TYPE_INT) + { + obj_setErrorCode(self, -__LINE__); + return NULL; + } + n = arg_getInt(arg_i); + } + + int *vec = obj_getPtr(self, "_vec"); if (!vec) return NULL; - char* s = obj_getStr(self, "_s"); + char *s = obj_getStr(self, "_s"); if (!s) return NULL; int ven = obj_getInt(self, "_ven"); - if (n >= ven || n < 0) { + if (n >= ven || n < 0) + { obj_setErrorCode(self, -__LINE__); return NULL; } int len = vec[n * 2 + 1] - vec[n * 2]; if (!len) return ""; - char* b = (char*)malloc(len + 1); + char *b = (char *)malloc(len + 1); if (!b) return NULL; memcpy(b, s + vec[n * 2], len); @@ -300,234 +421,173 @@ char* re_Match_group(PikaObj* self, int n) { free(b); return obj_getStr(self, "_b"); } -PikaObj* re_Match_groups(PikaObj* self) { - PikaObj* list = newNormalObj(New_PikaStdData_List); - PikaStdData_List___init__(list); - - int* vec = obj_getPtr(self, "_vec"); +PikaObj *re_Match_groups(PikaObj *self) +{ + int *vec = obj_getPtr(self, "_vec"); if (!vec) - return list; - char* s = obj_getStr(self, "_s"); + return NULL; + char *s = obj_getStr(self, "_s"); if (!s) - return list; + return NULL; int ven = obj_getInt(self, "_ven"); if (!ven) - return list; - for (int i = 0; i < ven; i++) { - Arg* str_arg1; + return NULL; + tu_getNew(tup, tup_obj); + + for (int i = 1; i < ven; i++) + { + Arg *str_arg1; int len = vec[i * 2 + 1] - vec[i * 2]; - if (len) { - char* b = (char*)malloc(len + 1); + if (len) + { + char *b = (char *)malloc(len + 1); if (!b) return NULL; memcpy(b, s + vec[i * 2], len); b[len] = 0; str_arg1 = arg_newStr(b); free(b); - } else { + } + else + { str_arg1 = arg_newStr(""); } - PikaStdData_List_append(list, str_arg1); + list_append(&(tup)->super, str_arg1); arg_deinit(str_arg1); } - return list; + return tup_obj; } -PikaObj* re_Match_span(PikaObj* self, int group_n) { - PikaObj* list = newNormalObj(New_PikaStdData_List); - PikaStdData_List___init__(list); - - int* vec = obj_getPtr(self, "_vec"); - if (!vec) - return list; - int ven = obj_getInt(self, "_ven"); - if (!ven || group_n >= ven) { - obj_setErrorCode(self, -__LINE__); - return list; +PikaObj *re_Match_span(PikaObj *self, PikaTuple *val) +{ + int group_n = 0; + int argn = tuple_getSize(val); + if (argn >= 1) + { + Arg *arg_i = tuple_getArg(val, 0); + if (arg_getType(arg_i) != ARG_TYPE_INT) + { + obj_setErrorCode(self, -__LINE__); + return NULL; + } + group_n = arg_getInt(arg_i); } - - Arg* spos = arg_newInt(vec[group_n * 2]); - Arg* epos = arg_newInt(vec[group_n * 2 + 1]); - PikaStdData_List_append(list, spos); - PikaStdData_List_append(list, epos); - - arg_deinit(spos); - arg_deinit(epos); - return list; + int *vec = obj_getPtr(self, "_vec"); + if (!vec) + raise_error; + int ven = obj_getInt(self, "_ven"); + if (!ven || group_n >= ven) + { + obj_setErrorCode(self, -__LINE__); + return NULL; + } + tu_getNew(tu, tu_obj); + tu_append(tu, vec[group_n * 2], Int); + tu_append(tu, vec[group_n * 2 + 1], Int); + return tu_obj; } -void re_Pattern___del__(PikaObj* self) { - void* _re = obj_getPtr(self, "_re"); +void re_Pattern___del__(PikaObj *self) +{ + void *_re = obj_getPtr(self, "_re"); if (!_re) return; - pcre* re = (pcre*)_re; + pcre *re = (pcre *)_re; pcre_free(re); } -void re_Pattern___init__(PikaObj* self) { - if (!obj_isArgExist(self, "_re")) { +void re_Pattern___init__(PikaObj *self) +{ + if (!obj_isArgExist(self, "_re")) + { obj_setPtr(self, "_re", NULL); obj_setStr(self, "_b", ""); + obj_setInt(self, "_n", -1); } } - -PikaObj* re_Pattern_findall(PikaObj* self, char* subject, PikaTuple* val) { +PikaObj *re_Pattern_findall(PikaObj *self, char *subject, PikaTuple *val) +{ int flags = 0; - for (size_t i = 0; i < tuple_getSize(val); i++) { - Arg* arg_i = tuple_getArg(val, i); - if (arg_getType(arg_i) != ARG_TYPE_INT) { - obj_setErrorCode(self, -__LINE__); - return NULL; - } - flags |= arg_getInt(arg_i); + flags = _get_flags(val); + if (flags < 0) + { + obj_setErrorCode(self, __LINE__); + return NULL; } if (!obj_isArgExist(self, "_re")) return NULL; - pcre* re = obj_getPtr(self, "_re"); - - PikaObj* list = newNormalObj(New_PikaStdData_List); - PikaStdData_List___init__(list); - PikaObj* sub_list; - int length = strlen(subject); - // int n = 0; - flags |= PCRE_UTF8; - int j2 = 0; - int m_n = -1; - int brackets = 0; - int** vcs = re_searchall2(re, subject, length, &m_n, &brackets, flags); - char* b = NULL; - Arg* str_arg1; - Arg* sub_arg; - - if (!vcs) { - if (m_n < 0) - obj_setErrorCode(self, -__LINE__); - - return list; - } - if (brackets == 1) { - for (int i = 0; i < m_n; i++) { - int* v = vcs[i]; - length = v[1] - v[0]; - if (length) { - b = malloc(length + 1); - if (!b) - goto e_er; - b[length] = 0; - memcpy(b, subject + v[0], length); - } else { - b = (char*)""; - } - str_arg1 = arg_newStr(b); - PikaStdData_List_append(list, str_arg1); - arg_deinit(str_arg1); - if (length) - free(b); - } - goto e_er; - } - - for (int i = 0; i < m_n; i++) { - int* v = vcs[i]; - length = v[1] - v[0]; - b = malloc(length + 1); - if (!b) - goto e_er; - sub_list = newNormalObj(New_PikaStdData_List); - PikaStdData_List___init__(sub_list); - for (int j = 0; j < brackets; j++) { - j2 = j * 2; - length = v[j2 + 1] - v[j2]; - b[length] = 0; - memcpy(b, subject + v[j2], length); - - str_arg1 = arg_newStr(b); - PikaStdData_List_append(sub_list, str_arg1); - arg_deinit(str_arg1); - } - sub_arg = arg_newPtr(ARG_TYPE_OBJECT, sub_list); - PikaStdData_List_append(list, sub_arg); - arg_deinit(sub_arg); - free(b); - } -e_er: - if (vcs) - re_free_searchall(vcs, m_n); + pcre *re = obj_getPtr(self, "_re"); + Any list = __findall(re, subject, flags, 1); + if (!list) + raise_error; return list; } - -PikaObj* re_Pattern_match(PikaObj* self, char* subject, PikaTuple* val) { +PikaObj *re_Pattern_match(PikaObj *self, char *subject, PikaTuple *val) +{ int flags = 0; - for (size_t i = 0; i < tuple_getSize(val); i++) { - Arg* arg_i = tuple_getArg(val, i); - if (arg_getType(arg_i) != ARG_TYPE_INT) { - obj_setErrorCode(self, -__LINE__); - return NULL; - } - flags |= arg_getInt(arg_i); + flags = _get_flags(val); + if (flags < 0) + { + obj_setErrorCode(self, __LINE__); + return NULL; } if (!obj_isArgExist(self, "_re")) return NULL; - pcre* re = obj_getPtr(self, "_re"); - PikaObj* m = newNormalObj(New_re_Match); + pcre *re = obj_getPtr(self, "_re"); int ven = -1; - flags |= PCRE_UTF8; - - int* vec = re_match2(re, subject, strlen(subject), &ven, flags); - if (!vec) { + int *vec = re_match2(re, subject, strlen(subject), &ven, flags); + if (!vec) + { if (ven < 0) obj_setErrorCode(self, -__LINE__); return NULL; } + Any m = newNormalObj(New_re_Match); re_Match___init__args(m, subject, vec, ven); return m; } -PikaObj* re_Pattern_fullmatch(PikaObj* self, char* subject, PikaTuple* val) { +PikaObj *re_Pattern_fullmatch(PikaObj *self, char *subject, PikaTuple *val) +{ int flags = 0; - for (size_t i = 0; i < tuple_getSize(val); i++) { - Arg* arg_i = tuple_getArg(val, i); - if (arg_getType(arg_i) != ARG_TYPE_INT) { - obj_setErrorCode(self, -__LINE__); - return NULL; - } - flags |= arg_getInt(arg_i); + flags = _get_flags(val); + if (flags < 0) + { + obj_setErrorCode(self, __LINE__); + return NULL; } if (!obj_isArgExist(self, "_re")) return NULL; - pcre* re = obj_getPtr(self, "_re"); - PikaObj* m = newNormalObj(New_re_Match); + pcre *re = obj_getPtr(self, "_re"); int ven = -1; - flags |= PCRE_UTF8; - - int* vec = re_fullmatch2(re, subject, strlen(subject), &ven, flags); - if (!vec) { + int *vec = re_fullmatch2(re, subject, strlen(subject), &ven, flags); + if (!vec) + { if (ven < 0) obj_setErrorCode(self, -__LINE__); - return NULL; } + Any m = newNormalObj(New_re_Match); re_Match___init__args(m, subject, vec, ven); return m; } -PikaObj* re_Pattern_search(PikaObj* self, char* subject, PikaTuple* val) { +PikaObj *re_Pattern_search(PikaObj *self, char *subject, PikaTuple *val) +{ int flags = 0; - for (size_t i = 0; i < tuple_getSize(val); i++) { - Arg* arg_i = tuple_getArg(val, i); - if (arg_getType(arg_i) != ARG_TYPE_INT) { - obj_setErrorCode(self, -__LINE__); - return NULL; - } - flags |= arg_getInt(arg_i); + flags = _get_flags(val); + if (flags < 0) + { + obj_setErrorCode(self, __LINE__); + return NULL; } if (!obj_isArgExist(self, "_re")) return NULL; - pcre* re = obj_getPtr(self, "_re"); - PikaObj* m = newNormalObj(New_re_Match); + pcre *re = obj_getPtr(self, "_re"); + Any m = newNormalObj(New_re_Match); int ven = -1; - flags |= PCRE_UTF8; - int* vec = re_search2(re, subject, strlen(subject), &ven, flags); - if (!vec) { + int *vec = re_search2(re, subject, strlen(subject), &ven, flags); + if (!vec) + { if (ven < 0) obj_setErrorCode(self, -__LINE__); return NULL; @@ -535,35 +595,59 @@ PikaObj* re_Pattern_search(PikaObj* self, char* subject, PikaTuple* val) { re_Match___init__args(m, subject, vec, ven); return m; } -char* re_Pattern_sub(PikaObj* self, char* repl, char* subjet, PikaTuple* val) { +char *re_Pattern_sub(PikaObj *self, char *repl, char *subjet, PikaTuple *val) +{ int flags = 0; - for (size_t i = 0; i < tuple_getSize(val); i++) { - Arg* arg_i = tuple_getArg(val, i); - if (arg_getType(arg_i) != ARG_TYPE_INT) { + int count = 0; + int argn = tuple_getSize(val); + if (argn >= 1) + { + Arg *arg_i = tuple_getArg(val, 0); + if (arg_getType(arg_i) != ARG_TYPE_INT) + { obj_setErrorCode(self, -__LINE__); return NULL; } - flags |= arg_getInt(arg_i); + count = arg_getInt(arg_i); } + if (argn >= 2) + { + Arg *arg_i = tuple_getArg(val, 1); + if (arg_getType(arg_i) != ARG_TYPE_INT) + { + obj_setErrorCode(self, -__LINE__); + return NULL; + } + flags = arg_getInt(arg_i); + if (flags | PCRE_ONLY_ASCII) + { + flags &= ~(PCRE_ONLY_ASCII | PCRE_UTF8); + } + } + if (!obj_isArgExist(self, "_re")) return NULL; - pcre* re = obj_getPtr(self, "_re"); + pcre *re = obj_getPtr(self, "_re"); int length = strlen(subjet); - flags |= PCRE_UTF8; + int matched_times = 0; + char *s = re_subn2(re, repl, subjet, length, count, flags, &matched_times); + obj_setInt(self, "_n", matched_times); - char* s = re_sub2(re, repl, subjet, length, flags); - if (!s) { + if (!s) + { obj_setErrorCode(self, -__LINE__); return NULL; } - if (s == subjet) { + if (s == subjet) + { obj_setStr(self, "_b", subjet); return obj_getStr(self, "_b"); } int len = strlen(s); - char* b = (char*)malloc(len + 1); - if (!b) { + char *b = (char *)malloc(len + 1); + if (!b) + { free(s); return NULL; } @@ -574,3 +658,351 @@ char* re_Pattern_sub(PikaObj* self, char* repl, char* subjet, PikaTuple* val) { free(s); return obj_getStr(self, "_b"); } +PikaObj *re_Pattern_subn(PikaObj *self, char *repl, char *subjet, PikaTuple *val) +{ + if (!obj_isArgExist(self, "_re")) + return NULL; + int flags = 0; + int count = 0; + int argn = tuple_getSize(val); + if (argn >= 1) + { + Arg *arg_i = tuple_getArg(val, 0); + if (arg_getType(arg_i) != ARG_TYPE_INT) + { + obj_setErrorCode(self, -__LINE__); + return NULL; + } + count = arg_getInt(arg_i); + } + if (argn >= 2) + { + Arg *arg_i = tuple_getArg(val, 1); + if (arg_getType(arg_i) != ARG_TYPE_INT) + { + obj_setErrorCode(self, -__LINE__); + return NULL; + } + flags = arg_getInt(arg_i); + if (flags | PCRE_ONLY_ASCII) + { + flags &= ~(PCRE_ONLY_ASCII | PCRE_UTF8); + } + } + pcre *re = obj_getPtr(self, "_re"); + Any res = __subn(re, repl, subjet, count, flags, 1); + if (!res) + raise_error; + return res; +} +PikaObj *re_Pattern_split(PikaObj *self, char *subject, PikaTuple *val) +{ + if (!obj_isArgExist(self, "_re")) + return NULL; + pcre *re = obj_getPtr(self, "_re"); + int flags = PCRE_UTF8; + int max_split = 0; + int argn = tuple_getSize(val); + if (argn >= 1) + { + Arg *arg_i = tuple_getArg(val, 0); + if (arg_getType(arg_i) != ARG_TYPE_INT) + { + obj_setErrorCode(self, -__LINE__); + return NULL; + } + max_split = arg_getInt(arg_i); + } + if (argn >= 2) + { + Arg *arg_i = tuple_getArg(val, 1); + if (arg_getType(arg_i) != ARG_TYPE_INT) + { + obj_setErrorCode(self, -__LINE__); + return NULL; + } + flags = arg_getInt(arg_i); + if (flags | PCRE_ONLY_ASCII) + { + flags &= ~(PCRE_ONLY_ASCII | PCRE_UTF8); + } + } + Any list = __split(re, subject, max_split, flags, 1); + if (!list) + raise_error; + return list; +} + +int _get_flags(PikaTuple *val) +{ + int flags = PCRE_UTF8; + int argn = tuple_getSize(val); + if (argn >= 1) + { + Arg *arg_i = tuple_getArg(val, 0); + if (arg_getType(arg_i) != ARG_TYPE_INT) + { + return -1; + } + flags |= arg_getInt(arg_i); + if (flags & PCRE_ONLY_ASCII) + { + flags &= ~(PCRE_ONLY_ASCII | PCRE_UTF8); + } + } + return flags; +} + +PikaObj *__split(void *pattern__or__re, + char *subject, + int max_split, + int flags, + int mode_re) + +{ + int sub_length = strlen(subject); + int j2 = 0; + int _m_n = 0, m_n = 0; + int brackets = -1; + int **vcs; + if (mode_re) + vcs = re_searchall2((pcre *)pattern__or__re, subject, sub_length, &_m_n, &brackets, flags); + else + vcs = re_searchall((char *)pattern__or__re, subject, sub_length, &_m_n, &brackets, flags); + m_n = _m_n; + char *b = NULL; + Arg *str_arg1; + Arg *sub_arg; + if (!vcs) + { + return NULL; + } + if (max_split && max_split < m_n) + m_n = max_split; + Any list = newNormalObj(New_PikaStdData_List); + PikaStdData_List___init__(list); + int start = 0; + + if (brackets == 1) + { + for (int i = 0; i < m_n; i++) + { + int *v = vcs[i]; + int length = v[0] - start; + if (length) + { + b = malloc(length + 1); + if (!b) + goto e_er; + b[length] = 0; + memcpy(b, subject + start, length); + } + else + { + b = (char *)""; + } + str_arg1 = arg_newStr(b); + PikaStdData_List_append(list, str_arg1); + arg_deinit(str_arg1); + if (length) + free(b); + start = v[1]; + } + if (start <= sub_length) + { + str_arg1 = arg_newStr(subject + start); + PikaStdData_List_append(list, str_arg1); + arg_deinit(str_arg1); + } + goto exit; + } + + for (int i = 0; i < m_n; i++) + { + int *v = vcs[i]; + int length = v[0] - start; + b = malloc(length + 1); + if (!b) + goto e_er; + memcpy(b, subject + start, length); + b[length] = 0; + str_arg1 = arg_newStr(b); + PikaStdData_List_append(list, str_arg1); + arg_deinit(str_arg1); + + for (int j = 1; j < brackets; j++) + { + j2 = j * 2; + int length2 = v[j2 + 1] - v[j2]; + if (length2 > length) + { + free(b); + length = length2; + b = malloc(length + 1); + if (!b) + goto e_er; + } + b[length2] = 0; + memcpy(b, subject + v[j2], length2); + + str_arg1 = arg_newStr(b); + PikaStdData_List_append(list, str_arg1); + arg_deinit(str_arg1); + } + start = v[1]; + free(b); + } + if (start <= sub_length) + { + str_arg1 = arg_newStr(subject + start); + PikaStdData_List_append(list, str_arg1); + arg_deinit(str_arg1); + } + goto exit; +e_er: + if (list) + { + obj_deinit(list); + list = NULL; + } +exit: + if (vcs) + re_free_searchall(vcs, _m_n); + return list; +} + +PikaObj *__findall(void *pattern__or__re, + char *subject, + int flags, + int mode_re) +{ + int length = strlen(subject); + int j2 = 0; + int m_n = -1; + int brackets = -1; + int **vcs; + if (mode_re) + vcs = re_searchall2((pcre *)pattern__or__re, subject, length, &m_n, &brackets, flags); + else + vcs = re_searchall((char *)pattern__or__re, subject, length, &m_n, &brackets, flags); + + char *b = NULL; + Arg *str_arg1; + Arg *sub_arg; + if (!vcs) + { + if (m_n < 0) + return NULL; + Any list = newNormalObj(New_PikaStdData_List); + PikaStdData_List___init__(list); + + return list; + } + Any list = newNormalObj(New_PikaStdData_List); + PikaStdData_List___init__(list); + PikaTuple *tu; + Any sub_list = NULL; + if (brackets == 1) + { + for (int i = 0; i < m_n; i++) + { + int *v = vcs[i]; + length = v[1] - v[0]; + if (length) + { + b = malloc(length + 1); + if (!b) + goto e_er; + b[length] = 0; + memcpy(b, subject + v[0], length); + } + else + { + b = (char *)""; + } + str_arg1 = arg_newStr(b); + PikaStdData_List_append(list, str_arg1); + arg_deinit(str_arg1); + if (length) + free(b); + } + goto exit; + } + + for (int i = 0; i < m_n; i++) + { + int *v = vcs[i]; + length = v[1] - v[0]; + b = malloc(length + 1); + if (!b) + goto e_er; + tu = New_tuple(); + + for (int j = 1; j < brackets; j++) + { + j2 = j * 2; + length = v[j2 + 1] - v[j2]; + b[length] = 0; + memcpy(b, subject + v[j2], length); + tu_append(tu, b, Str); + } + sub_list = newNormalObj(New_PikaStdData_Tuple); + obj_setPtr(sub_list, "list", tu); + sub_arg = arg_newRef(sub_list); + PikaStdData_List_append(list, sub_arg); + arg_deinit(sub_arg); + free(b); + } + goto exit; +e_er: + if (list) + { + obj_deinit(list); + list = NULL; + } +exit: + if (vcs) + re_free_searchall(vcs, m_n); + return list; +} + +PikaObj *__subn(void *pattern__or__re, + char *repl, + char *subjet, + int count, + int flags, + int mode_re) +{ + int length = strlen(subjet); + int matched_times = 0; + char *s; + if (mode_re) + s = re_subn2((pcre *)pattern__or__re, repl, subjet, length, count, flags, &matched_times); + else + s = pcre_subn((char *)pattern__or__re, repl, subjet, length, count, flags, &matched_times); + + if (!s) + { + return NULL; + } + if (s == subjet) + { + PikaTuple *yup = New_tuple(); + tu_append(yup, s, Str); + tu_append(yup, 0, Int); + + Any tuple_obj = newNormalObj(New_PikaStdData_Tuple); + obj_setPtr(tuple_obj, "list", yup); + return tuple_obj; + } + + PikaTuple *yup = New_tuple(); + tu_append(yup, s, Str); + free(s); + + tu_append(yup, matched_times, Int); + + Any tuple_obj = newNormalObj(New_PikaStdData_Tuple); + obj_setPtr(tuple_obj, "list", yup); + return tuple_obj; +} \ No newline at end of file diff --git a/package/re/re.pyi b/package/re/re.pyi index ab9da490c..58f6ff62a 100644 --- a/package/re/re.pyi +++ b/package/re/re.pyi @@ -1,44 +1,77 @@ from PikaObj import * +A: int +ASCII: int I: int IGNORECASE: int M: int MULTILINE: int +S: int DOTALL: int +# here, not as in python, there is no 'UNICODE' flags, +# cause this version only support UTF-8 characters + def __init__(): ... + class Pattern: def __init__(self): pass + def __del__(self): pass + def findall(self, subject: str, *flags) -> list: pass - def sub(self, repl: str, subjet: str, *flags) -> str: + + def sub(self, repl: str, subjet: str, *count__flags) -> str: pass + + def subn(self, repl: str, subjet: str, *count__flags) -> list: + pass + def match(self, subject: str, *flags) -> Match: pass + def fullmatch(self, subject: str, *flags) -> Match: pass + def search(self, subject: str, *flags) -> Match: pass + def split(self, subject: str, *maxsplit__flags) -> list: + pass + + class Match: def __init__(self): pass + def __del__(self): pass - def group(self, n: int) -> str: - pass - def groups(self) -> list: - pass - def span(self, group_n: int) -> list: + + def group(self, *n) -> str: pass + def groups(self) -> list: + pass + # ! may returns wrong offset when subject contains widechar, like Chinese + # this function returns exactly memory offset between the begin of string and the target substring + def span(self, *group_n) -> list: + pass + + def findall(pattern: str, subject: str, *flags) -> list: ... -def sub(pattern: str, repl: str, subjet: str, *flags) -> str: ... +# def sub(pattern, repl, string, count=0, flags=0) +def sub(pattern: str, repl: str, subjet: str, *count__flags) -> str: ... def match(pattern: str, subject: str, *flags) -> Match: ... def fullmatch(pattern: str, subject: str, *flags) -> Match: ... def search(pattern: str, subject: str, *flags) -> Match: ... -def compile(pattern: str) -> Pattern: ... +def compile(pattern: str, *flags) -> Pattern: ... + +def escape(pattern: str) -> str: ... +# def subn(pattern, repl, string, count=0, flags=0) +def subn(pattern: str, repl: str, subjet: str, *count__flags) -> list: ... +# def finditer(pattern: str, subject: str, *flags): +def split(pattern: str, subject: str, *maxsplit__flags) -> list: ... diff --git a/package/re/readme.md b/package/re/readme.md index be73b6be9..79e9a3bdb 100644 --- a/package/re/readme.md +++ b/package/re/readme.md @@ -93,14 +93,16 @@ This module prototype are likes this: # flags +A: int +ASCII: int I: int -IGNORECASE:int +IGNORECASE: int M: int -MULTILINE:int +MULTILINE: int +S: int DOTALL: int - -class Pattern(): +class Pattern: def __init__(self): pass @@ -110,40 +112,53 @@ class Pattern(): def findall(self, subject: str, *flags) -> list: pass - def sub(self, repl: str, subjet: str, *flags) -> str: + def sub(self, repl: str, subjet: str, *count__flags) -> str: + pass + + def subn(self, repl: str, subjet: str, *count__flags) -> list: pass def match(self, subject: str, *flags) -> Match: pass + def fullmatch(self, subject: str, *flags) -> Match: pass def search(self, subject: str, *flags) -> Match: pass + def split(self, subject: str, *maxsplit__flags) -> list: + pass -class Match(): + +class Match: def __init__(self): pass def __del__(self): pass - def group(self, n: int) -> str: + def group(self, *n) -> str: pass def groups(self) -> list: pass - def span(self, group_n: int) -> list: + def span(self, *group_n) -> list: pass -def findall(pattern: str, subject: str, *flags) -> list:... -def sub(pattern: str, repl: str, subjet: str, *flags) -> str: ... +def findall(pattern: str, subject: str, *flags) -> list: ... +# def sub(pattern, repl, string, count=0, flags=0) +def sub(pattern: str, repl: str, subjet: str, *count__flags) -> str: ... def match(pattern: str, subject: str, *flags) -> Match: ... def fullmatch(pattern: str, subject: str, *flags) -> Match: ... def search(pattern: str, subject: str, *flags) -> Match: ... -def compile(pattern: str) -> Pattern: ... +def compile(pattern: str, *flags) -> Pattern: ... +def escape(pattern: str) -> str: ... +# def subn(pattern, repl, string, count=0, flags=0) +def subn(pattern: str, repl: str, subjet: str, *count__flags) -> list: ... +# def finditer(pattern: str, subject: str, *flags): +def split(pattern: str, subject: str, *maxsplit__flags) -> list: ... ```