From 0adbb62369c0401626826eb84849b9508e82e12c Mon Sep 17 00:00:00 2001 From: Lyon Date: Sat, 15 Jul 2023 14:59:56 +0800 Subject: [PATCH] format and push pkgs, fix SOFT_IIC --- package/PikaStdDevice/PikaStdDevice_CAN.c | 8 +- package/PikaStdDevice/PikaStdDevice_GPIO.c | 2 +- package/PikaStdDevice/PikaStdDevice_common.c | 4 +- package/PikaStdDevice/pika_hal.c | 4 +- package/PikaStdDevice/pika_hal_SOFT_IIC.c | 149 +- package/PikaStdDevice/pika_hal_SOFT_SPI.c | 2 +- package/re/cre.c | 1135 +- package/re/cre.h | 92 +- package/re/pcre.h | 299 +- package/re/pcre_chartables.c | 259 +- package/re/pcre_compile.c | 10480 ++++++++-------- package/re/pcre_exec.c | 9265 +++++++------- package/re/pcre_fullinfo.c | 160 +- package/re/pcre_globals.c | 10 +- package/re/pcre_internal.h | 1133 +- package/re/pcre_newline.c | 160 +- package/re/pcre_ord2utf8.c | 35 +- package/re/pcre_tables.c | 26 +- package/re/pcre_try_flipped.c | 86 +- package/re/pcre_valid_utf8.c | 106 +- package/re/pcre_xclass.c | 133 +- package/re/re-api-adapter.c | 604 +- package/re/re_config.h | 58 +- .../PikaStdDevice/pika_hal_SOFT_IIC.c | 149 +- .../pikascript/pikascript-lib/re/pcre_exec.c | 8 +- port/linux/test/python/requests/get_basic.py | 2 +- 26 files changed, 12519 insertions(+), 11850 deletions(-) diff --git a/package/PikaStdDevice/PikaStdDevice_CAN.c b/package/PikaStdDevice/PikaStdDevice_CAN.c index 830c45b7c..0efcffc26 100644 --- a/package/PikaStdDevice/PikaStdDevice_CAN.c +++ b/package/PikaStdDevice/PikaStdDevice_CAN.c @@ -43,7 +43,7 @@ char* PikaStdDevice_CAN_read(PikaObj* self, int length) { return obj_getStr(self, "readData"); } -Arg* PikaStdDevice_CAN_readBytes(PikaObj *self, int length){ +Arg* PikaStdDevice_CAN_readBytes(PikaObj* self, int length) { obj_setInt(self, "length", length); obj_runNativeMethod(self, "platformReadBytes", NULL); return arg_copy(obj_getArg(self, "readData")); @@ -54,7 +54,7 @@ void PikaStdDevice_CAN_write(PikaObj* self, char* data) { obj_runNativeMethod(self, 
"platformWrite", NULL); } -void PikaStdDevice_CAN_writeBytes(PikaObj *self, uint8_t* data, int length){ +void PikaStdDevice_CAN_writeBytes(PikaObj* self, uint8_t* data, int length) { obj_setBytes(self, "writeData", data, length); obj_runNativeMethod(self, "platformWriteBytes", NULL); } @@ -75,10 +75,10 @@ void PikaStdDevice_CAN_platformWrite(PikaObj* self) { ABSTRACT_METHOD_NEED_OVERRIDE_ERROR(); } -void PikaStdDevice_CAN_platformReadBytes(PikaObj *self){ +void PikaStdDevice_CAN_platformReadBytes(PikaObj* self) { ABSTRACT_METHOD_NEED_OVERRIDE_ERROR(); } -void PikaStdDevice_CAN_platformWriteBytes(PikaObj *self){ +void PikaStdDevice_CAN_platformWriteBytes(PikaObj* self) { ABSTRACT_METHOD_NEED_OVERRIDE_ERROR(); } diff --git a/package/PikaStdDevice/PikaStdDevice_GPIO.c b/package/PikaStdDevice/PikaStdDevice_GPIO.c index c4556c763..cd5a0894b 100644 --- a/package/PikaStdDevice/PikaStdDevice_GPIO.c +++ b/package/PikaStdDevice/PikaStdDevice_GPIO.c @@ -166,7 +166,7 @@ void PikaStdDevice_GPIO_setCallBack(PikaObj* self, #endif } -void PikaStdDevice_GPIO_close(PikaObj *self){ +void PikaStdDevice_GPIO_close(PikaObj* self) { pika_dev* dev = _get_dev(self); pika_hal_close(dev); } diff --git a/package/PikaStdDevice/PikaStdDevice_common.c b/package/PikaStdDevice/PikaStdDevice_common.c index 763ce3d3d..6c861236b 100644 --- a/package/PikaStdDevice/PikaStdDevice_common.c +++ b/package/PikaStdDevice/PikaStdDevice_common.c @@ -3,7 +3,7 @@ extern PikaEventListener* g_pika_device_event_listener; void _PikaStdDevice_event_handler(pika_dev* dev, int signal) { pika_eventListener_sendSignal(g_pika_device_event_listener, (uintptr_t)dev, - signal); + signal); } void _PikaStdDevice_setCallBack(PikaObj* self, @@ -21,7 +21,7 @@ void _PikaStdDevice_setCallBack(PikaObj* self, extern volatile PikaObj* __pikaMain; PikaObj* PikaStdDevice_Time(PikaObj* self) { PikaObj* time = obj_getPtr((PikaObj*)__pikaMain, "time"); - if(NULL == time){ + if (NULL == time) { obj_setErrorCode(self, -1); 
obj_setSysOut(self, "Error: please install and import 'time' module"); return NULL; diff --git a/package/PikaStdDevice/pika_hal.c b/package/PikaStdDevice/pika_hal.c index 4cc1c93d8..1ae4f4777 100644 --- a/package/PikaStdDevice/pika_hal.c +++ b/package/PikaStdDevice/pika_hal.c @@ -55,7 +55,7 @@ __exit: } /* error */ __platform_printf("Error: dev_open failed.\r\n"); - if (dev->ioctl_config) { + if (dev && dev->ioctl_config) { pikaFree(dev->ioctl_config, _pika_hal_dev_config_size(dev_type)); dev->ioctl_config = NULL; } @@ -76,7 +76,7 @@ int pika_hal_close(pika_dev* dev) { } ret = impl->close(dev); __exit: - if (NULL != dev->ioctl_config) { + if (NULL != dev && NULL != dev->ioctl_config) { pikaFree(dev->ioctl_config, _pika_hal_dev_config_size(dev->type)); dev->ioctl_config = NULL; } diff --git a/package/PikaStdDevice/pika_hal_SOFT_IIC.c b/package/PikaStdDevice/pika_hal_SOFT_IIC.c index 8c69142d7..8c9f55a04 100644 --- a/package/PikaStdDevice/pika_hal_SOFT_IIC.c +++ b/package/PikaStdDevice/pika_hal_SOFT_IIC.c @@ -1,4 +1,17 @@ -#include "../PikaStdDevice/pika_hal.h" +#include "pika_hal.h" +#include + +static void _IIC_SDA_input(pika_hal_SOFT_IIC_config* iic_cfg) { + pika_hal_GPIO_config cfg_SDA = {0}; + cfg_SDA.dir = PIKA_HAL_GPIO_DIR_IN; + pika_hal_ioctl(iic_cfg->SDA, PIKA_HAL_IOCTL_CONFIG, &cfg_SDA); +} + +static void _IIC_SDA_output(pika_hal_SOFT_IIC_config* iic_cfg) { + pika_hal_GPIO_config cfg_SDA = {0}; + cfg_SDA.dir = PIKA_HAL_GPIO_DIR_OUT; + pika_hal_ioctl(iic_cfg->SDA, PIKA_HAL_IOCTL_CONFIG, &cfg_SDA); +} static int _GPIO_write(pika_dev* dev, uint32_t val) { return pika_hal_write(dev, &val, sizeof(val)); @@ -11,11 +24,12 @@ static uint32_t _GPIO_read(pika_dev* dev) { } static void _IIC_Delay(void) { - // Delay implementation, can be modified based on hardware platform. - // You may need to adjust the delay time to match your hardware. 
+ pika_sleep_ms(3); } static void _IIC_Start(pika_hal_SOFT_IIC_config* cfg) { + pika_debug("iic start"); + _IIC_SDA_output(cfg); _GPIO_write(cfg->SDA, 1); _GPIO_write(cfg->SCL, 1); _IIC_Delay(); @@ -25,6 +39,8 @@ static void _IIC_Start(pika_hal_SOFT_IIC_config* cfg) { } static void _IIC_Stop(pika_hal_SOFT_IIC_config* cfg) { + pika_debug("iic stop"); + _IIC_SDA_output(cfg); _GPIO_write(cfg->SDA, 0); _GPIO_write(cfg->SCL, 1); _IIC_Delay(); @@ -32,7 +48,9 @@ static void _IIC_Stop(pika_hal_SOFT_IIC_config* cfg) { _IIC_Delay(); } -static void _IIC_SendByte(pika_hal_SOFT_IIC_config* cfg, uint8_t byte) { +static pika_bool _IIC_SendByte(pika_hal_SOFT_IIC_config* cfg, uint8_t byte) { + pika_debug(" - iic write: 0x%02X", byte); + _IIC_SDA_output(cfg); for (int i = 0; i < 8; i++) { _GPIO_write(cfg->SCL, 0); _IIC_Delay(); @@ -46,11 +64,52 @@ static void _IIC_SendByte(pika_hal_SOFT_IIC_config* cfg, uint8_t byte) { _IIC_Delay(); byte <<= 1; } + + // 在发送完字节后检查ACK信号 _GPIO_write(cfg->SCL, 0); + _IIC_Delay(); + _IIC_SDA_input(cfg); // 设置SDA为输入 + _GPIO_write(cfg->SCL, 1); // 将SCL线设置为高,让从设备发送ACK信号 + + int timeout = 1000; + uint32_t ack = 0; + do { + _IIC_Delay(); + ack = !_GPIO_read(cfg->SDA); // 如果从设备发送了ACK信号,SDA线会被拉低 + } while (ack == 0 && timeout-- > 0); + + // pika_debug("ack timeout:%d", timeout); + if (timeout <= 0) { + pika_platform_printf("Error: IIC write byte timeout\r\n"); + } + + _GPIO_write(cfg->SCL, 0); // 将SCL线设置为低,完成一个I2C周期 + return ack; +} + +static void _IIC_Ack(pika_hal_SOFT_IIC_config* cfg) { + _GPIO_write(cfg->SCL, 0); // 拉低时钟线 + _IIC_SDA_output(cfg); // 设置SDA为输出 + _GPIO_write(cfg->SDA, 0); // 拉低数据线 + _IIC_Delay(); + _GPIO_write(cfg->SCL, 1); // 产生时钟 + _IIC_Delay(); + _GPIO_write(cfg->SCL, 0); // 拉低时钟线 +} + +static void _IIC_NAck(pika_hal_SOFT_IIC_config* cfg) { + _GPIO_write(cfg->SCL, 0); // 拉低时钟线 + _IIC_SDA_output(cfg); // 设置SDA为输出 + _GPIO_write(cfg->SDA, 1); // 数据线拉高 + _IIC_Delay(); + _GPIO_write(cfg->SCL, 1); // 产生时钟 + _IIC_Delay(); + _GPIO_write(cfg->SCL, 0); 
// 拉低时钟线 } static uint8_t _IIC_ReadByte(pika_hal_SOFT_IIC_config* cfg, uint8_t ack) { uint8_t byte = 0; + _IIC_SDA_input(cfg); for (int i = 0; i < 8; i++) { _GPIO_write(cfg->SCL, 1); _IIC_Delay(); @@ -61,77 +120,75 @@ static uint8_t _IIC_ReadByte(pika_hal_SOFT_IIC_config* cfg, uint8_t ack) { _GPIO_write(cfg->SCL, 0); _IIC_Delay(); } + // 在读取完一个字节后发送ACK信号 if (ack) { - _IIC_SendByte(cfg, 0xFF); + _IIC_Ack(cfg); // 如果ack为真,发送ACK信号 } else { - _IIC_SendByte(cfg, 0x00); + _IIC_NAck(cfg); // 如果ack为假,发送NACK信号 } + pika_debug(" - iic read: 0x%02X", byte); return byte; } -static void set_SDA_input(pika_hal_SOFT_IIC_config* cfg) { - pika_hal_GPIO_config cfg_SDA = {0}; - cfg_SDA.dir = PIKA_HAL_GPIO_DIR_IN; - pika_hal_ioctl(cfg->SDA, PIKA_HAL_IOCTL_CONFIG, &cfg_SDA); -} - -static void set_SDA_output(pika_hal_SOFT_IIC_config* cfg) { - pika_hal_GPIO_config cfg_SDA = {0}; - cfg_SDA.dir = PIKA_HAL_GPIO_DIR_OUT; - pika_hal_ioctl(cfg->SDA, PIKA_HAL_IOCTL_CONFIG, &cfg_SDA); -} - int pika_hal_platform_SOFT_IIC_write(pika_dev* dev, void* buf, size_t count) { - pika_hal_SOFT_IIC_config* cfg = + pika_hal_SOFT_IIC_config* iic_cfg = (pika_hal_SOFT_IIC_config*)dev->ioctl_config; uint8_t* data = (uint8_t*)buf; - set_SDA_output(cfg); - _IIC_Start(cfg); + + _IIC_Start(iic_cfg); + uint8_t addr_write = (iic_cfg->slave_addr << 1) | 0x00; // 方向位为0代表写 + // pika_debug("iic addr_write: 0x%02X", addr_write); + _IIC_SendByte(iic_cfg, addr_write); // 方向位为0代表写 // 如果启用了mem_addr_ena,将设备地址和内存地址发送到I2C总线 - if (cfg->mem_addr_ena == PIKA_HAL_IIC_MEM_ADDR_ENA_ENABLE) { - _IIC_SendByte(cfg, cfg->slave_addr); - if (cfg->mem_addr_size == PIKA_HAL_IIC_MEM_ADDR_SIZE_8BIT) { - _IIC_SendByte(cfg, cfg->mem_addr & 0xFF); - } else if (cfg->mem_addr_size == PIKA_HAL_IIC_MEM_ADDR_SIZE_16BIT) { - _IIC_SendByte(cfg, (cfg->mem_addr >> 8) & 0xFF); - _IIC_SendByte(cfg, cfg->mem_addr & 0xFF); + if (iic_cfg->mem_addr_ena == PIKA_HAL_IIC_MEM_ADDR_ENA_ENABLE) { + if (iic_cfg->mem_addr_size == PIKA_HAL_IIC_MEM_ADDR_SIZE_8BIT) { + 
_IIC_SendByte(iic_cfg, iic_cfg->mem_addr & 0xFF); + } else if (iic_cfg->mem_addr_size == PIKA_HAL_IIC_MEM_ADDR_SIZE_16BIT) { + _IIC_SendByte(iic_cfg, (iic_cfg->mem_addr >> 8) & 0xFF); + _IIC_SendByte(iic_cfg, iic_cfg->mem_addr & 0xFF); } } for (int i = 0; i < count; i++) { - _IIC_SendByte(cfg, data[i]); + _IIC_SendByte(iic_cfg, data[i]); } - _IIC_Stop(cfg); + _IIC_Stop(iic_cfg); return count; } int pika_hal_platform_SOFT_IIC_read(pika_dev* dev, void* buf, size_t count) { - pika_hal_SOFT_IIC_config* cfg = + pika_hal_SOFT_IIC_config* iic_cfg = (pika_hal_SOFT_IIC_config*)dev->ioctl_config; uint8_t* data = (uint8_t*)buf; + _IIC_Start(iic_cfg); + // 如果启用了mem_addr_ena,先写设备地址和内存地址 - if (cfg->mem_addr_ena == PIKA_HAL_IIC_MEM_ADDR_ENA_ENABLE) { - set_SDA_output(cfg); - _IIC_Start(cfg); - _IIC_SendByte(cfg, cfg->slave_addr); - if (cfg->mem_addr_size == PIKA_HAL_IIC_MEM_ADDR_SIZE_8BIT) { - _IIC_SendByte(cfg, cfg->mem_addr & 0xFF); - } else if (cfg->mem_addr_size == PIKA_HAL_IIC_MEM_ADDR_SIZE_16BIT) { - _IIC_SendByte(cfg, (cfg->mem_addr >> 8) & 0xFF); - _IIC_SendByte(cfg, cfg->mem_addr & 0xFF); + if (iic_cfg->mem_addr_ena == PIKA_HAL_IIC_MEM_ADDR_ENA_ENABLE) { + uint8_t addr_write = + (iic_cfg->slave_addr << 1) | 0x00; // 方向位为0代表写 + // pika_debug("iic addr_write: 0x%02X", addr_write); + _IIC_SendByte(iic_cfg, addr_write); // 方向位为0代表写 + if (iic_cfg->mem_addr_size == PIKA_HAL_IIC_MEM_ADDR_SIZE_8BIT) { + _IIC_SendByte(iic_cfg, iic_cfg->mem_addr & 0xFF); + } else if (iic_cfg->mem_addr_size == PIKA_HAL_IIC_MEM_ADDR_SIZE_16BIT) { + _IIC_SendByte(iic_cfg, (iic_cfg->mem_addr >> 8) & 0xFF); + _IIC_SendByte(iic_cfg, iic_cfg->mem_addr & 0xFF); } - _IIC_Stop(cfg); + _IIC_Start(iic_cfg); } - set_SDA_input(cfg); - _IIC_Start(cfg); + uint8_t addr_read = (iic_cfg->slave_addr << 1) | 0x01; // 方向位为1代表读 + // pika_debug("iic addr_read: 0x%02X", addr_read); + _IIC_SendByte(iic_cfg, addr_read); // 方向位为1代表读 + for (int i = 0; i < count - 1; i++) { - data[i] = _IIC_ReadByte(cfg, 1); + // data[i] = 
_IIC_ReadByte(iic_cfg, 1); + data[i] = _IIC_ReadByte(iic_cfg, 1); } - data[count - 1] = _IIC_ReadByte(cfg, 0); - _IIC_Stop(cfg); + data[count - 1] = _IIC_ReadByte(iic_cfg, 0); + _IIC_Stop(iic_cfg); return count; } diff --git a/package/PikaStdDevice/pika_hal_SOFT_SPI.c b/package/PikaStdDevice/pika_hal_SOFT_SPI.c index d34d2bfb3..f41c0a008 100644 --- a/package/PikaStdDevice/pika_hal_SOFT_SPI.c +++ b/package/PikaStdDevice/pika_hal_SOFT_SPI.c @@ -1,4 +1,4 @@ -#include "../PikaStdDevice/pika_hal.h" +#include "pika_hal.h" static int _GPIO_write(pika_dev* dev, uint32_t val) { return pika_hal_write(dev, &val, sizeof(val)); diff --git a/package/re/cre.c b/package/re/cre.c index 20c601ca8..712aa04a5 100644 --- a/package/re/cre.c +++ b/package/re/cre.c @@ -1,641 +1,624 @@ -/* -* -* Generally additional utility functions. -* L flag, also known as re.LOCALE in Python is not available here. -* Wrong results may be returned in re_sub likes funcitones when 'repl' contains '\', '\\\\1' for example. -* -* 4/9/2022 -*/ +/* + * + * Generally additional utility functions. + * L flag, also known as re.LOCALE in Python is not available here. + * Wrong results may be returned in re_sub likes funcitones when 'repl' + *contains '\', '\\\\1' for example. 
+ * + * 4/9/2022 + */ +#include "cre.h" #include #include #include "pcre.h" -#include "cre.h" -int *_re_get_vec_table(pcre *re, int *out_groups_number) -{ - int brackets_number = 0; - pcre_fullinfo(re, NULL, PCRE_INFO_CAPTURECOUNT, &brackets_number); - brackets_number++; +int* _re_get_vec_table(pcre* re, int* out_groups_number) { + int brackets_number = 0; + pcre_fullinfo(re, NULL, PCRE_INFO_CAPTURECOUNT, &brackets_number); + brackets_number++; - if (out_groups_number) - *out_groups_number = brackets_number; + if (out_groups_number) + *out_groups_number = brackets_number; - brackets_number *= 3; + brackets_number *= 3; - int *vec = (int *)malloc(brackets_number * sizeof(int)); - return vec; + int* vec = (int*)malloc(brackets_number * sizeof(int)); + return vec; } -int *pcre_match(const char *_pat, const char *s, int len, int *out_vec_number, int opt) -{ - int *vec = NULL; - pcre *re = re_get_match_re(_pat, opt); - if (!re) - return NULL; +int* pcre_match(const char* _pat, + const char* s, + int len, + int* out_vec_number, + int opt) { + int* vec = NULL; + pcre* re = re_get_match_re(_pat, opt); + if (!re) + return NULL; - vec = re_match2(re, s, len, out_vec_number, opt); - pcre_free(re); - return vec; + vec = re_match2(re, s, len, out_vec_number, opt); + pcre_free(re); + return vec; } -int *re_match2(pcre *re, const char *s, int len, int *out_vec_number, int opt) -{ - int *vec = NULL; - int group_n = 0; - int rc; - int start_offset = 0; - vec = _re_get_vec_table(re, &group_n); - if (out_vec_number) - *out_vec_number = group_n; - group_n *= 3; +int* re_match2(pcre* re, const char* s, int len, int* out_vec_number, int opt) { + int* vec = NULL; + int group_n = 0; + int rc; + int start_offset = 0; + vec = _re_get_vec_table(re, &group_n); + if (out_vec_number) + *out_vec_number = group_n; + group_n *= 3; - if (!vec) - goto e_er; + if (!vec) + goto e_er; match: - rc = pcre_exec(re, NULL, s, len, start_offset, 0, vec, group_n); - if (rc == PCRE_ERROR_NOMATCH) - { - 
free(vec); - return NULL; - } - if (rc <= 0) - goto e_er; - if (vec[0] == vec[1]) - { - start_offset++; - if (start_offset >= len) - goto e_er; - goto match; - } - return vec; + rc = pcre_exec(re, NULL, s, len, start_offset, 0, vec, group_n); + if (rc == PCRE_ERROR_NOMATCH) { + free(vec); + return NULL; + } + if (rc <= 0) + goto e_er; + if (vec[0] == vec[1]) { + start_offset++; + if (start_offset >= len) + goto e_er; + goto match; + } + return vec; e_er: - if (vec) - free(vec); - return NULL; + if (vec) + free(vec); + return NULL; } -int *pcre_fullmatch(const char *_pat, const char *s, int len, int *out_vec_number, int opt) -{ - int *vec = NULL; - opt &= ~PCRE_MULTILINE; - pcre *re = re_get_fullmatch_re(_pat, opt); - if (!re) - return NULL; - vec = re_fullmatch2(re, s, len, out_vec_number, opt); - pcre_free(re); - return vec; +int* pcre_fullmatch(const char* _pat, + const char* s, + int len, + int* out_vec_number, + int opt) { + int* vec = NULL; + opt &= ~PCRE_MULTILINE; + pcre* re = re_get_fullmatch_re(_pat, opt); + if (!re) + return NULL; + vec = re_fullmatch2(re, s, len, out_vec_number, opt); + pcre_free(re); + return vec; } -int *re_fullmatch2(pcre *re, const char *s, int len, int *out_vec_number, int opt) -{ - int *vec = NULL; - int group_n = 0; - int rc; - int start_offset = 0; - vec = _re_get_vec_table(re, &group_n); - if (out_vec_number) - *out_vec_number = group_n; - group_n *= 3; +int* re_fullmatch2(pcre* re, + const char* s, + int len, + int* out_vec_number, + int opt) { + int* vec = NULL; + int group_n = 0; + int rc; + int start_offset = 0; + vec = _re_get_vec_table(re, &group_n); + if (out_vec_number) + *out_vec_number = group_n; + group_n *= 3; - if (!vec) - goto e_er; - // opt &= ~PCRE_MULTILINE; + if (!vec) + goto e_er; + // opt &= ~PCRE_MULTILINE; match: - rc = pcre_exec(re, NULL, s, len, start_offset, 0, vec, group_n); - if (rc == PCRE_ERROR_NOMATCH) - { - free(vec); - return NULL; - } - if (rc <= 0) - goto e_er; - if (vec[0] == vec[1]) - { - 
start_offset++; - if (start_offset >= len) - goto e_er; - goto match; - } - return vec; + rc = pcre_exec(re, NULL, s, len, start_offset, 0, vec, group_n); + if (rc == PCRE_ERROR_NOMATCH) { + free(vec); + return NULL; + } + if (rc <= 0) + goto e_er; + if (vec[0] == vec[1]) { + start_offset++; + if (start_offset >= len) + goto e_er; + goto match; + } + return vec; e_er: - if (vec) - free(vec); - return NULL; + if (vec) + free(vec); + return NULL; } -pcre *re_get_match_re(const char *_pat, int opt) -{ - const char *pat = _pat; - if (!*pat) - { - return NULL; - } - if (*pat != '^') - { - int pat_len = strlen(_pat); - char *p = (char *)pcre_malloc(pat_len + 2); - if (!p) - return NULL; - *p = '^'; - memcpy(p + 1, _pat, pat_len + 1); - pat = p; - } - const char *error; - int erroffset; - pcre *re = pcre_compile(pat, opt, &error, &erroffset, NULL); - if (pat != _pat) - free((void *)pat); +pcre* re_get_match_re(const char* _pat, int opt) { + const char* pat = _pat; + if (!*pat) { + return NULL; + } + if (*pat != '^') { + int pat_len = strlen(_pat); + char* p = (char*)pcre_malloc(pat_len + 2); + if (!p) + return NULL; + *p = '^'; + memcpy(p + 1, _pat, pat_len + 1); + pat = p; + } + const char* error; + int erroffset; + pcre* re = pcre_compile(pat, opt, &error, &erroffset, NULL); + if (pat != _pat) + free((void*)pat); - return re; + return re; } -pcre *re_get_fullmatch_re(const char *_pat, int opt) -{ - const char *pat = _pat; - if (!*pat) - { - return NULL; - } - int prefix = 0, suffix = 0; +pcre* re_get_fullmatch_re(const char* _pat, int opt) { + const char* pat = _pat; + if (!*pat) { + return NULL; + } + int prefix = 0, suffix = 0; - if (*pat != '^') - { - prefix = 1; - } - int pat_len = strlen(_pat); - if (_pat[pat_len - 1] != '$') - suffix = 1; - else - { - int n = pat_len - 2; - int i = 0; - while (_pat[n] == '\\') - { - i++; - n--; - } - if (i % 2) - { - suffix = 1; - } - } - int dn = prefix + suffix; - if (dn) - { - char *q = (char *)malloc(pat_len + dn + 1); - if 
(!q) - return NULL; - pat = q; - if (prefix) - { - *q = '^'; - q++; - } - memcpy(q, _pat, pat_len); - q += pat_len; - if (suffix) - { - *q = '$'; - q++; - } - *q = '\0'; - } + if (*pat != '^') { + prefix = 1; + } + int pat_len = strlen(_pat); + if (_pat[pat_len - 1] != '$') + suffix = 1; + else { + int n = pat_len - 2; + int i = 0; + while (_pat[n] == '\\') { + i++; + n--; + } + if (i % 2) { + suffix = 1; + } + } + int dn = prefix + suffix; + if (dn) { + char* q = (char*)malloc(pat_len + dn + 1); + if (!q) + return NULL; + pat = q; + if (prefix) { + *q = '^'; + q++; + } + memcpy(q, _pat, pat_len); + q += pat_len; + if (suffix) { + *q = '$'; + q++; + } + *q = '\0'; + } - const char *error; - int erroffset; - pcre *re = pcre_compile(pat, opt, &error, &erroffset, NULL); - if (pat != _pat) - free((void *)pat); - return re; + const char* error; + int erroffset; + pcre* re = pcre_compile(pat, opt, &error, &erroffset, NULL); + if (pat != _pat) + free((void*)pat); + return re; } -/* the following functions return (a) vector/table in heap, which means it need to be freed after using*/ +/* the following functions return (a) vector/table in heap, which means it need + * to be freed after using*/ -int *pcre_search(const char *pat, const char *s, int len, int *out_vec_number, int opt) -{ - const char *error; - int erroffset; - pcre *re = pcre_compile(pat, opt, &error, &erroffset, NULL); - if (!re) - return NULL; - int *res = re_search2(re, s, len, out_vec_number, opt); - pcre_free(re); - return res; +int* pcre_search(const char* pat, + const char* s, + int len, + int* out_vec_number, + int opt) { + const char* error; + int erroffset; + pcre* re = pcre_compile(pat, opt, &error, &erroffset, NULL); + if (!re) + return NULL; + int* res = re_search2(re, s, len, out_vec_number, opt); + pcre_free(re); + return res; } -int *re_search2(pcre *re, const char *s, int len, int *out_vec_number, int opt) -{ - int *vec = NULL; - int group_n = 0; - int rc; - int start_offset = 0; - vec = 
_re_get_vec_table(re, &group_n); - if (out_vec_number) - *out_vec_number = group_n; - group_n *= 3; +int* re_search2(pcre* re, + const char* s, + int len, + int* out_vec_number, + int opt) { + int* vec = NULL; + int group_n = 0; + int rc; + int start_offset = 0; + vec = _re_get_vec_table(re, &group_n); + if (out_vec_number) + *out_vec_number = group_n; + group_n *= 3; - if (!vec) - goto e_er; + if (!vec) + goto e_er; match: - rc = pcre_exec(re, NULL, s, len, start_offset, 0, vec, group_n); - if (rc == PCRE_ERROR_NOMATCH) - { - free(vec); - return NULL; - } - if (rc <= 0) - goto e_er; - if (vec[0] == vec[1]) - { - start_offset++; - if (start_offset >= len) - goto e_er; - goto match; - } - return vec; + rc = pcre_exec(re, NULL, s, len, start_offset, 0, vec, group_n); + if (rc == PCRE_ERROR_NOMATCH) { + free(vec); + return NULL; + } + if (rc <= 0) + goto e_er; + if (vec[0] == vec[1]) { + start_offset++; + if (start_offset >= len) + goto e_er; + goto match; + } + return vec; e_er: - if (vec) - free(vec); - return NULL; + if (vec) + free(vec); + return NULL; } -int **re_searchall(const char *pat, const char *s, int len, int *out_number, int *out_vec_number, int opt) -{ - const char *error; - int erroffset; - pcre *re = pcre_compile(pat, opt, &error, &erroffset, NULL); - if (!re) - return NULL; - int **res = re_searchall2(re, s, len, out_number, out_vec_number, opt); - pcre_free(re); - return res; +int** re_searchall(const char* pat, + const char* s, + int len, + int* out_number, + int* out_vec_number, + int opt) { + const char* error; + int erroffset; + pcre* re = pcre_compile(pat, opt, &error, &erroffset, NULL); + if (!re) + return NULL; + int** res = re_searchall2(re, s, len, out_number, out_vec_number, opt); + pcre_free(re); + return res; } -int **re_searchall2(pcre *re, const char *s, int len, int *out_number, int *out_vec_number, int opt) -{ - int start_offset = 0; - int **vecs = NULL; - int vec_cap = 4; - int vec_n = 0; - int *vec = NULL; - int group_n = 0; +int** 
re_searchall2(pcre* re, + const char* s, + int len, + int* out_number, + int* out_vec_number, + int opt) { + int start_offset = 0; + int** vecs = NULL; + int vec_cap = 4; + int vec_n = 0; + int* vec = NULL; + int group_n = 0; - while (1) - { - if (group_n) - vec = (int *)malloc(group_n * sizeof(int)); - else - { - vec = _re_get_vec_table(re, &group_n); - if (out_vec_number) - *out_vec_number = group_n; - group_n *= 3; - } - if (!vec) - { - goto e_er; - } - int rc; - match: - rc = pcre_exec(re, NULL, s, len, start_offset, 0, vec, group_n); - if (rc == PCRE_ERROR_NOMATCH) - { - if (out_number) - *out_number = vec_n; - free(vec); - return vecs; - } - if (rc <= 0) - goto e_er; - if (vec[0] == vec[1]) - { - start_offset++; - if (start_offset >= len) - goto e_er; - goto match; - } - if (!vecs) - { - vecs = (int **)malloc(sizeof(int *) * vec_cap); - if (!vecs) - goto e_er; - } + while (1) { + if (group_n) + vec = (int*)malloc(group_n * sizeof(int)); + else { + vec = _re_get_vec_table(re, &group_n); + if (out_vec_number) + *out_vec_number = group_n; + group_n *= 3; + } + if (!vec) { + goto e_er; + } + int rc; + match: + rc = pcre_exec(re, NULL, s, len, start_offset, 0, vec, group_n); + if (rc == PCRE_ERROR_NOMATCH) { + if (out_number) + *out_number = vec_n; + free(vec); + return vecs; + } + if (rc <= 0) + goto e_er; + if (vec[0] == vec[1]) { + start_offset++; + if (start_offset >= len) + goto e_er; + goto match; + } + if (!vecs) { + vecs = (int**)malloc(sizeof(int*) * vec_cap); + if (!vecs) + goto e_er; + } - if (vec_n >= vec_cap) - { - vec_cap *= 2; - void *p = realloc(vecs, vec_cap * sizeof(int *)); - if (!p) - goto e_er; - vecs = (int **)p; - } - vecs[vec_n++] = vec; - start_offset = vec[1]; - } + if (vec_n >= vec_cap) { + vec_cap *= 2; + void* p = realloc(vecs, vec_cap * sizeof(int*)); + if (!p) + goto e_er; + vecs = (int**)p; + } + vecs[vec_n++] = vec; + start_offset = vec[1]; + } e_er: - if (vec) - free(vec); - if (!vecs) - return NULL; - for (int j = 0; j < vec_n; 
j++) - { - if (vecs[j]) - free((void *)(vecs[j])); - } - free(vecs); - return NULL; + if (vec) + free(vec); + if (!vecs) + return NULL; + for (int j = 0; j < vec_n; j++) { + if (vecs[j]) + free((void*)(vecs[j])); + } + free(vecs); + return NULL; } -void re_free_searchall(int **vecs, int n) -{ - if (!vecs) - return; - for (int j = 0; j < n; j++) - { - if (vecs[j]) - free((void *)(vecs[j])); - } - free(vecs); +void re_free_searchall(int** vecs, int n) { + if (!vecs) + return; + for (int j = 0; j < n; j++) { + if (vecs[j]) + free((void*)(vecs[j])); + } + free(vecs); } -/* the following functions return (a) string in heap, which means it need to be freed after using*/ -char **_re_extract_substring(const char *s, int **vecs, int n) -{ - if (!vecs) - return NULL; - int c = 0; - char **res = (char **)pcre_malloc(sizeof(char *) * n); - if (!res) - return NULL; - for (int j = 0; j < n; j++) - { - int *v = vecs[j]; - int len = v[1] - v[0]; - char *p = (char *)pcre_malloc(len + 1); - if (!p) - goto e_er; - res[c++] = p; - memcpy(p, s + v[0], len); - p[len] = 0; - } - return res; +/* the following functions return (a) string in heap, which means it need to be + * freed after using*/ +char** _re_extract_substring(const char* s, int** vecs, int n) { + if (!vecs) + return NULL; + int c = 0; + char** res = (char**)pcre_malloc(sizeof(char*) * n); + if (!res) + return NULL; + for (int j = 0; j < n; j++) { + int* v = vecs[j]; + int len = v[1] - v[0]; + char* p = (char*)pcre_malloc(len + 1); + if (!p) + goto e_er; + res[c++] = p; + memcpy(p, s + v[0], len); + p[len] = 0; + } + return res; e_er: - if (!res) - return NULL; - for (int i = 0; i < c; i++) - { - free(res[i]); - } - free(res); - return NULL; + if (!res) + return NULL; + for (int i = 0; i < c; i++) { + free(res[i]); + } + free(res); + return NULL; } -char *re_find(const char *pat, const char *s, int len, int opt) -{ - const char *error; - int erroffset; - pcre *re = pcre_compile(pat, opt, &error, &erroffset, NULL); - if (!re) 
- return NULL; - char *res = re_find2(re, s, len, opt); - pcre_free(re); - return res; +char* re_find(const char* pat, const char* s, int len, int opt) { + const char* error; + int erroffset; + pcre* re = pcre_compile(pat, opt, &error, &erroffset, NULL); + if (!re) + return NULL; + char* res = re_find2(re, s, len, opt); + pcre_free(re); + return res; } -char *re_find2(pcre *re, const char *s, int len, int opt) -{ - int *vec = NULL; - int group_n = 0; - int rc; - int start_offset = 0; - char *res_s = NULL; - vec = _re_get_vec_table(re, &group_n); +char* re_find2(pcre* re, const char* s, int len, int opt) { + int* vec = NULL; + int group_n = 0; + int rc; + int start_offset = 0; + char* res_s = NULL; + vec = _re_get_vec_table(re, &group_n); - if (!vec) - goto e_er; - group_n *= 3; + if (!vec) + goto e_er; + group_n *= 3; match: - rc = pcre_exec(re, NULL, s, len, start_offset, 0, vec, group_n); - if (rc == PCRE_ERROR_NOMATCH) - { - free(vec); - return NULL; - } - if (rc <= 0) - goto e_er; - if (vec[0] == vec[1]) - { - start_offset++; - if (start_offset >= len) - goto e_er; - goto match; - } - len = vec[1] - vec[0]; - if (!len) - goto e_er; - res_s = (char *)malloc(len + 1); - if (!res_s) - goto e_er; - memcpy(res_s, s + vec[0], len); - res_s[len] = 0; - if (vec) - free(vec); - return res_s; + rc = pcre_exec(re, NULL, s, len, start_offset, 0, vec, group_n); + if (rc == PCRE_ERROR_NOMATCH) { + free(vec); + return NULL; + } + if (rc <= 0) + goto e_er; + if (vec[0] == vec[1]) { + start_offset++; + if (start_offset >= len) + goto e_er; + goto match; + } + len = vec[1] - vec[0]; + if (!len) + goto e_er; + res_s = (char*)malloc(len + 1); + if (!res_s) + goto e_er; + memcpy(res_s, s + vec[0], len); + res_s[len] = 0; + if (vec) + free(vec); + return res_s; e_er: - if (vec) - free(vec); - return NULL; + if (vec) + free(vec); + return NULL; } -char **pcre_findall(const char *pat, const char *s, int len, int *out_number, int opt) -{ - const char *error; - int erroffset; - pcre *re 
= pcre_compile(pat, opt, &error, &erroffset, NULL); - if (!re) - return NULL; - char **res = re_findall2(re, s, len, out_number, opt); - pcre_free(re); - return res; +char** pcre_findall(const char* pat, + const char* s, + int len, + int* out_number, + int opt) { + const char* error; + int erroffset; + pcre* re = pcre_compile(pat, opt, &error, &erroffset, NULL); + if (!re) + return NULL; + char** res = re_findall2(re, s, len, out_number, opt); + pcre_free(re); + return res; } -char **re_findall2(pcre *re, const char *s, int len, int *out_number, int opt) -{ - int out_vec_number; - int **vecs; - char **res; - vecs = re_searchall2(re, s, len, out_number, &out_vec_number, opt); - if (!vecs) - goto e_er; - res = _re_extract_substring(s, vecs, *out_number); - if (!res) - goto e_er; - re_free_searchall(vecs, *out_number); - return res; +char** re_findall2(pcre* re, const char* s, int len, int* out_number, int opt) { + int out_vec_number; + int** vecs; + char** res; + vecs = re_searchall2(re, s, len, out_number, &out_vec_number, opt); + if (!vecs) + goto e_er; + res = _re_extract_substring(s, vecs, *out_number); + if (!res) + goto e_er; + re_free_searchall(vecs, *out_number); + return res; e_er: - if (vecs) - re_free_searchall(vecs, *out_number); - return NULL; + if (vecs) + re_free_searchall(vecs, *out_number); + return NULL; } -void re_free_findall(char **ss, int n) -{ - if (!ss) - return; - for (int j = 0; j < n; j++) - { - if (ss[j]) - free((void *)(ss[j])); - } - free(ss); +void re_free_findall(char** ss, int n) { + if (!ss) + return; + for (int j = 0; j < n; j++) { + if (ss[j]) + free((void*)(ss[j])); + } + free(ss); } -char *pcre_sub(const char *pat, const char *to, const char *s, int len, int opt) -{ - const char *error; - int erroffset; - pcre *re = pcre_compile(pat, opt, &error, &erroffset, NULL); - if (!re) - return NULL; - char *res = re_sub2(re, to, s, len, opt); - pcre_free(re); - return res; +char* pcre_sub(const char* pat, + const char* to, + const char* 
s, + int len, + int opt) { + const char* error; + int erroffset; + pcre* re = pcre_compile(pat, opt, &error, &erroffset, NULL); + if (!re) + return NULL; + char* res = re_sub2(re, to, s, len, opt); + pcre_free(re); + return res; } -char *pcre_subn(const char *pat, const char *to, const char *s, int len, int n, int opt, int *out_repl_times) -{ - const char *error; - int erroffset; - pcre *re = pcre_compile(pat, opt, &error, &erroffset, NULL); - if (!re) - return NULL; - char *res = re_subn2(re, to, s, len, n, opt, out_repl_times); - pcre_free(re); - return res; +char* pcre_subn(const char* pat, + const char* to, + const char* s, + int len, + int n, + int opt, + int* out_repl_times) { + const char* error; + int erroffset; + pcre* re = pcre_compile(pat, opt, &error, &erroffset, NULL); + if (!re) + return NULL; + char* res = re_subn2(re, to, s, len, n, opt, out_repl_times); + pcre_free(re); + return res; } -char *re_subn2(pcre *re, const char *to, const char *s, int len, int n, int opt, int *out_repl_times) -{ - int group_n = 0; - pcre *re2 = NULL; - int vcs1_n = 0, vcs2_n = 0; - int **vcs1 = re_searchall2(re, s, len, &vcs1_n, &group_n, opt); - int **vcs2 = NULL; - int match_limit = 0; - if (!vcs1_n) - { - return (char *)s; - } - const char *p2 = "(\\\\\\\\|\\\\\\d{1,2})"; - int erroffset; - const char *error; - int len_to, remain_size, remain_length2, pi = 0, qi = 0; - char *new_s = NULL; +char* re_subn2(pcre* re, + const char* to, + const char* s, + int len, + int n, + int opt, + int* out_repl_times) { + int group_n = 0; + pcre* re2 = NULL; + int vcs1_n = 0, vcs2_n = 0; + int** vcs1 = re_searchall2(re, s, len, &vcs1_n, &group_n, opt); + int** vcs2 = NULL; + int match_limit = 0; + if (!vcs1_n) { + return (char*)s; + } + const char* p2 = "(\\\\\\\\|\\\\\\d{1,2})"; + int erroffset; + const char* error; + int len_to, remain_size, remain_length2, pi = 0, qi = 0; + char* new_s = NULL; - re2 = pcre_compile(p2, 0, &error, &erroffset, NULL); - if (!re2) - goto exit_error; - 
len_to = strlen(to); - vcs2 = re_searchall2(re2, to, len_to, &vcs2_n, NULL, 0); - pcre_free(re2); - re2 = NULL; - remain_length2 = len_to; - for (int i = 0; i < vcs2_n; i++) - { - int *vc = vcs2[i]; - int vc0 = vc[0] + 1; - if (to[vc0] == '\\') - { - vc[2] = 0; - remain_length2 -= 2; - } - else - { - int wanted_number = 0; - int l_n = vc[1] - vc0; - if (l_n == 1) - { - wanted_number = to[vc0] - '0'; - remain_length2 -= 2; - } - else - { - wanted_number = (to[vc0] - '0') * 10 + to[vc0 + 1] - '0'; - remain_length2 -= 3; - } - if (wanted_number <= 0 || wanted_number >= group_n) - goto exit_error; - vc[2] = wanted_number; - } - } + re2 = pcre_compile(p2, 0, &error, &erroffset, NULL); + if (!re2) + goto exit_error; + len_to = strlen(to); + vcs2 = re_searchall2(re2, to, len_to, &vcs2_n, NULL, 0); + pcre_free(re2); + re2 = NULL; + remain_length2 = len_to; + for (int i = 0; i < vcs2_n; i++) { + int* vc = vcs2[i]; + int vc0 = vc[0] + 1; + if (to[vc0] == '\\') { + vc[2] = 0; + remain_length2 -= 2; + } else { + int wanted_number = 0; + int l_n = vc[1] - vc0; + if (l_n == 1) { + wanted_number = to[vc0] - '0'; + remain_length2 -= 2; + } else { + wanted_number = (to[vc0] - '0') * 10 + to[vc0 + 1] - '0'; + remain_length2 -= 3; + } + if (wanted_number <= 0 || wanted_number >= group_n) + goto exit_error; + vc[2] = wanted_number; + } + } - match_limit = n ? (n <= vcs1_n ? 
n : vcs1_n) : vcs1_n; - remain_size = len + remain_length2 * match_limit; - for (int i = 0; i < match_limit; i++) - { - int *vc = vcs1[i]; - remain_size -= vc[1] - vc[0]; - for (int j = 0; j < vcs2_n; j++) - { - int *v2 = vcs2[j]; - if (v2[2]) - { - remain_size += GetGroupLen(vc, v2[2]); - } - else - { - remain_size++; - } - } - } - new_s = (char *)malloc(remain_size + 1); - if (!new_s) - goto exit_error; - for (int i = 0; i < match_limit; i++) - { - int *vc = vcs1[i]; - memcpy(new_s + pi, s + qi, vc[0] - qi); - pi += vc[0] - qi; - int m_start = 0, m_len = 0; - for (int j = 0; j < vcs2_n; j++) - { - int *v2 = vcs2[j]; - m_len = v2[0] - m_start; - memcpy(new_s + pi, to + m_start, m_len); - pi += m_len; + match_limit = n ? (n <= vcs1_n ? n : vcs1_n) : vcs1_n; + remain_size = len + remain_length2 * match_limit; + for (int i = 0; i < match_limit; i++) { + int* vc = vcs1[i]; + remain_size -= vc[1] - vc[0]; + for (int j = 0; j < vcs2_n; j++) { + int* v2 = vcs2[j]; + if (v2[2]) { + remain_size += GetGroupLen(vc, v2[2]); + } else { + remain_size++; + } + } + } + new_s = (char*)malloc(remain_size + 1); + if (!new_s) + goto exit_error; + for (int i = 0; i < match_limit; i++) { + int* vc = vcs1[i]; + memcpy(new_s + pi, s + qi, vc[0] - qi); + pi += vc[0] - qi; + int m_start = 0, m_len = 0; + for (int j = 0; j < vcs2_n; j++) { + int* v2 = vcs2[j]; + m_len = v2[0] - m_start; + memcpy(new_s + pi, to + m_start, m_len); + pi += m_len; - int to_group = v2[2]; - if (to_group) - { - int to_group_at = vc[to_group * 2]; - int to_group_end = vc[to_group * 2 + 1]; - int g_l = to_group_end - to_group_at; - memcpy(new_s + pi, s + to_group_at, g_l); - pi += g_l; - } - else - { - new_s[pi++] = '\\'; - } - m_start = v2[1]; - } - m_len = len_to - m_start; - memcpy(new_s + pi, to + m_start, m_len); - pi += m_len; - qi = vc[1]; - } - if (out_repl_times) - *out_repl_times = match_limit; - if (vcs1) - re_free_searchall(vcs1, vcs1_n); - if (vcs2) - re_free_searchall(vcs2, vcs2_n); - len -= qi; - if 
(len) - memcpy(new_s + pi, s + qi, len); - pi += len; - new_s[pi] = '\0'; - return new_s; + int to_group = v2[2]; + if (to_group) { + int to_group_at = vc[to_group * 2]; + int to_group_end = vc[to_group * 2 + 1]; + int g_l = to_group_end - to_group_at; + memcpy(new_s + pi, s + to_group_at, g_l); + pi += g_l; + } else { + new_s[pi++] = '\\'; + } + m_start = v2[1]; + } + m_len = len_to - m_start; + memcpy(new_s + pi, to + m_start, m_len); + pi += m_len; + qi = vc[1]; + } + if (out_repl_times) + *out_repl_times = match_limit; + if (vcs1) + re_free_searchall(vcs1, vcs1_n); + if (vcs2) + re_free_searchall(vcs2, vcs2_n); + len -= qi; + if (len) + memcpy(new_s + pi, s + qi, len); + pi += len; + new_s[pi] = '\0'; + return new_s; exit_error: - if (vcs1) - re_free_searchall(vcs1, vcs1_n); - if (vcs2) - re_free_searchall(vcs2, vcs2_n); - if (re2) - pcre_free(re2); - return NULL; + if (vcs1) + re_free_searchall(vcs1, vcs1_n); + if (vcs2) + re_free_searchall(vcs2, vcs2_n); + if (re2) + pcre_free(re2); + return NULL; } -char *re_sub2(pcre *re, const char *to, const char *s, int len, int opt) -{ - return re_subn2(re, to, s, len, 0, opt, NULL); +char* re_sub2(pcre* re, const char* to, const char* s, int len, int opt) { + return re_subn2(re, to, s, len, 0, opt, NULL); } diff --git a/package/re/cre.h b/package/re/cre.h index 9aa6dfe37..eeaeff6f2 100644 --- a/package/re/cre.h +++ b/package/re/cre.h @@ -4,47 +4,93 @@ #include "pcre.h" #define GetGroupLen(vc, n) (vc[(n)*2 + 1] - vc[(n)*2]) -int *_re_get_vec_table(pcre *re, int *out_groups_number); +int* _re_get_vec_table(pcre* re, int* out_groups_number); -int *pcre_match(const char *_pat, const char *s, int len, int *out_vec_number, int opt); +int* pcre_match(const char* _pat, + const char* s, + int len, + int* out_vec_number, + int opt); -int *re_match2(pcre *re, const char *s, int len, int *out_vec_number, int opt); +int* re_match2(pcre* re, const char* s, int len, int* out_vec_number, int opt); -int *pcre_fullmatch(const char 
*_pat, const char *s, int len, int *out_vec_number, int opt); +int* pcre_fullmatch(const char* _pat, + const char* s, + int len, + int* out_vec_number, + int opt); -int *re_fullmatch2(pcre *re, const char *s, int len, int *out_vec_number, int opt); +int* re_fullmatch2(pcre* re, + const char* s, + int len, + int* out_vec_number, + int opt); -pcre *re_get_match_re(const char *_pat, int opt); +pcre* re_get_match_re(const char* _pat, int opt); -pcre *re_get_fullmatch_re(const char *_pat, int opt); +pcre* re_get_fullmatch_re(const char* _pat, int opt); -int *pcre_search(const char *pat, const char *s, int len, int *out_vec_number, int opt); +int* pcre_search(const char* pat, + const char* s, + int len, + int* out_vec_number, + int opt); -int *re_search2(pcre *re, const char *s, int len, int *out_vec_number, int opt); +int* re_search2(pcre* re, const char* s, int len, int* out_vec_number, int opt); -int **re_searchall(const char *pat, const char *s, int len, int *out_number, int *out_vec_number, int opt); +int** re_searchall(const char* pat, + const char* s, + int len, + int* out_number, + int* out_vec_number, + int opt); -int **re_searchall2(pcre *re, const char *s, int len, int *out_number, int *out_vec_number, int opt); +int** re_searchall2(pcre* re, + const char* s, + int len, + int* out_number, + int* out_vec_number, + int opt); -void re_free_searchall(int **vecs, int n); +void re_free_searchall(int** vecs, int n); -char **_re_extract_substring(const char *s, int **vecs, int n); +char** _re_extract_substring(const char* s, int** vecs, int n); -char *re_find(const char *pat, const char *s, int len, int opt); +char* re_find(const char* pat, const char* s, int len, int opt); -char *re_find2(pcre *re, const char *s, int len, int opt); +char* re_find2(pcre* re, const char* s, int len, int opt); -char **pcre_findall(const char *pat, const char *s, int len, int *out_number, int opt); +char** pcre_findall(const char* pat, + const char* s, + int len, + int* out_number, + int 
opt); -char **re_findall2(pcre *re, const char *s, int len, int *out_number, int opt); +char** re_findall2(pcre* re, const char* s, int len, int* out_number, int opt); -void re_free_findall(char **ss, int n); +void re_free_findall(char** ss, int n); -char *pcre_sub(const char *pat, const char *to, const char *s, int len, int opt); +char* pcre_sub(const char* pat, + const char* to, + const char* s, + int len, + int opt); -char *pcre_subn(const char *pat, const char *to, const char *s, int len, int n, int opt, int *out_repl_times); +char* pcre_subn(const char* pat, + const char* to, + const char* s, + int len, + int n, + int opt, + int* out_repl_times); -char *re_subn2(pcre *re, const char *to, const char *s, int len, int n, int opt, int *out_repl_times); +char* re_subn2(pcre* re, + const char* to, + const char* s, + int len, + int n, + int opt, + int* out_repl_times); -char *re_sub2(pcre *re, const char *to, const char *s, int len, int opt); -#endif \ No newline at end of file +char* re_sub2(pcre* re, const char* to, const char* s, int len, int opt); +#endif diff --git a/package/re/pcre.h b/package/re/pcre.h index 1b15b66af..3e391290d 100644 --- a/package/re/pcre.h +++ b/package/re/pcre.h @@ -8,36 +8,36 @@ export setting is defined in pcre_internal.h, which includes this file. So we don't change existing definitions of PCRE_EXP_DECL and PCRECPP_EXP_DECL. 
*/ #if defined(_WIN32) && !defined(PCRE_STATIC) -# ifndef PCRE_EXP_DECL -# define PCRE_EXP_DECL extern __declspec(dllimport) -# endif -# ifdef __cplusplus -# ifndef PCRECPP_EXP_DECL -# define PCRECPP_EXP_DECL extern __declspec(dllimport) -# endif -# ifndef PCRECPP_EXP_DEFN -# define PCRECPP_EXP_DEFN __declspec(dllimport) -# endif -# endif +#ifndef PCRE_EXP_DECL +#define PCRE_EXP_DECL extern __declspec(dllimport) +#endif +#ifdef __cplusplus +#ifndef PCRECPP_EXP_DECL +#define PCRECPP_EXP_DECL extern __declspec(dllimport) +#endif +#ifndef PCRECPP_EXP_DEFN +#define PCRECPP_EXP_DEFN __declspec(dllimport) +#endif +#endif #endif /* By default, we use the standard "extern" declarations. */ #ifndef PCRE_EXP_DECL -# ifdef __cplusplus -# define PCRE_EXP_DECL extern "C" -# else -# define PCRE_EXP_DECL extern -# endif +#ifdef __cplusplus +#define PCRE_EXP_DECL extern "C" +#else +#define PCRE_EXP_DECL extern +#endif #endif #ifdef __cplusplus -# ifndef PCRECPP_EXP_DECL -# define PCRECPP_EXP_DECL extern -# endif -# ifndef PCRECPP_EXP_DEFN -# define PCRECPP_EXP_DEFN -# endif +#ifndef PCRECPP_EXP_DECL +#define PCRECPP_EXP_DECL extern +#endif +#ifndef PCRECPP_EXP_DEFN +#define PCRECPP_EXP_DEFN +#endif #endif /* Have to include stdlib.h in order to ensure that size_t is defined; @@ -45,161 +45,158 @@ it is needed here for malloc. 
*/ #include - #ifdef __cplusplus extern "C" { #endif +#define PCRE_CASELESS 0x00000001 +#define PCRE_MULTILINE 0x00000002 +#define PCRE_DOTALL 0x00000004 +#define PCRE_EXTENDED 0x00000008 +#define PCRE_ANCHORED 0x00000010 +#define PCRE_DOLLAR_ENDONLY 0x00000020 +#define PCRE_EXTRA 0x00000040 +#define PCRE_NOTBOL 0x00000080 +#define PCRE_NOTEOL 0x00000100 +#define PCRE_UNGREEDY 0x00000200 +#define PCRE_NOTEMPTY 0x00000400 +#define PCRE_UTF8 0x00000800 +#define PCRE_NO_AUTO_CAPTURE 0x00001000 +#define PCRE_NO_UTF8_CHECK 0x00002000 +#define PCRE_AUTO_CALLOUT 0x00004000 +#define PCRE_PARTIAL 0x00008000 +#define PCRE_DFA_SHORTEST 0x00010000 +#define PCRE_DFA_RESTART 0x00020000 +#define PCRE_FIRSTLINE 0x00040000 +#define PCRE_DUPNAMES 0x00080000 +#define PCRE_NEWLINE_CR 0x00100000 +#define PCRE_NEWLINE_LF 0x00200000 +#define PCRE_NEWLINE_CRLF 0x00300000 +#define PCRE_NEWLINE_ANY 0x00400000 +#define PCRE_NEWLINE_ANYCRLF 0x00500000 +#define PCRE_BSR_ANYCRLF 0x00800000 +#define PCRE_BSR_UNICODE 0x01000000 +#define PCRE_ONLY_ASCII 0x02000000 -#define PCRE_CASELESS 0x00000001 -#define PCRE_MULTILINE 0x00000002 -#define PCRE_DOTALL 0x00000004 -#define PCRE_EXTENDED 0x00000008 -#define PCRE_ANCHORED 0x00000010 -#define PCRE_DOLLAR_ENDONLY 0x00000020 -#define PCRE_EXTRA 0x00000040 -#define PCRE_NOTBOL 0x00000080 -#define PCRE_NOTEOL 0x00000100 -#define PCRE_UNGREEDY 0x00000200 -#define PCRE_NOTEMPTY 0x00000400 -#define PCRE_UTF8 0x00000800 -#define PCRE_NO_AUTO_CAPTURE 0x00001000 -#define PCRE_NO_UTF8_CHECK 0x00002000 -#define PCRE_AUTO_CALLOUT 0x00004000 -#define PCRE_PARTIAL 0x00008000 -#define PCRE_DFA_SHORTEST 0x00010000 -#define PCRE_DFA_RESTART 0x00020000 -#define PCRE_FIRSTLINE 0x00040000 -#define PCRE_DUPNAMES 0x00080000 -#define PCRE_NEWLINE_CR 0x00100000 -#define PCRE_NEWLINE_LF 0x00200000 -#define PCRE_NEWLINE_CRLF 0x00300000 -#define PCRE_NEWLINE_ANY 0x00400000 -#define PCRE_NEWLINE_ANYCRLF 0x00500000 -#define PCRE_BSR_ANYCRLF 0x00800000 -#define PCRE_BSR_UNICODE 
0x01000000 -#define PCRE_ONLY_ASCII 0x02000000 - - -#define PCRE_ERROR_NOMATCH (-1) -#define PCRE_ERROR_NULL (-2) -#define PCRE_ERROR_BADOPTION (-3) -#define PCRE_ERROR_BADMAGIC (-4) -#define PCRE_ERROR_UNKNOWN_OPCODE (-5) -#define PCRE_ERROR_UNKNOWN_NODE (-5) -#define PCRE_ERROR_NOMEMORY (-6) -#define PCRE_ERROR_NOSUBSTRING (-7) -#define PCRE_ERROR_MATCHLIMIT (-8) -#define PCRE_ERROR_CALLOUT (-9) -#define PCRE_ERROR_BADUTF8 (-10) +#define PCRE_ERROR_NOMATCH (-1) +#define PCRE_ERROR_NULL (-2) +#define PCRE_ERROR_BADOPTION (-3) +#define PCRE_ERROR_BADMAGIC (-4) +#define PCRE_ERROR_UNKNOWN_OPCODE (-5) +#define PCRE_ERROR_UNKNOWN_NODE (-5) +#define PCRE_ERROR_NOMEMORY (-6) +#define PCRE_ERROR_NOSUBSTRING (-7) +#define PCRE_ERROR_MATCHLIMIT (-8) +#define PCRE_ERROR_CALLOUT (-9) +#define PCRE_ERROR_BADUTF8 (-10) #define PCRE_ERROR_BADUTF8_OFFSET (-11) -#define PCRE_ERROR_PARTIAL (-12) -#define PCRE_ERROR_BADPARTIAL (-13) -#define PCRE_ERROR_INTERNAL (-14) -#define PCRE_ERROR_BADCOUNT (-15) -#define PCRE_ERROR_DFA_UITEM (-16) -#define PCRE_ERROR_DFA_UCOND (-17) -#define PCRE_ERROR_DFA_UMLIMIT (-18) -#define PCRE_ERROR_DFA_WSSIZE (-19) -#define PCRE_ERROR_DFA_RECURSE (-20) +#define PCRE_ERROR_PARTIAL (-12) +#define PCRE_ERROR_BADPARTIAL (-13) +#define PCRE_ERROR_INTERNAL (-14) +#define PCRE_ERROR_BADCOUNT (-15) +#define PCRE_ERROR_DFA_UITEM (-16) +#define PCRE_ERROR_DFA_UCOND (-17) +#define PCRE_ERROR_DFA_UMLIMIT (-18) +#define PCRE_ERROR_DFA_WSSIZE (-19) +#define PCRE_ERROR_DFA_RECURSE (-20) #define PCRE_ERROR_RECURSIONLIMIT (-21) -#define PCRE_ERROR_NULLWSLIMIT (-22) -#define PCRE_ERROR_BADNEWLINE (-23) +#define PCRE_ERROR_NULLWSLIMIT (-22) +#define PCRE_ERROR_BADNEWLINE (-23) +#define PCRE_INFO_OPTIONS 0 +#define PCRE_INFO_SIZE 1 +#define PCRE_INFO_CAPTURECOUNT 2 +#define PCRE_INFO_BACKREFMAX 3 +#define PCRE_INFO_FIRSTBYTE 4 +#define PCRE_INFO_FIRSTCHAR 4 +#define PCRE_INFO_FIRSTTABLE 5 +#define PCRE_INFO_LASTLITERAL 6 +#define PCRE_INFO_NAMEENTRYSIZE 7 +#define 
PCRE_INFO_NAMECOUNT 8 +#define PCRE_INFO_NAMETABLE 9 +#define PCRE_INFO_STUDYSIZE 10 +#define PCRE_INFO_DEFAULT_TABLES 11 +#define PCRE_INFO_OKPARTIAL 12 +#define PCRE_INFO_JCHANGED 13 +#define PCRE_INFO_HASCRORLF 14 -#define PCRE_INFO_OPTIONS 0 -#define PCRE_INFO_SIZE 1 -#define PCRE_INFO_CAPTURECOUNT 2 -#define PCRE_INFO_BACKREFMAX 3 -#define PCRE_INFO_FIRSTBYTE 4 -#define PCRE_INFO_FIRSTCHAR 4 -#define PCRE_INFO_FIRSTTABLE 5 -#define PCRE_INFO_LASTLITERAL 6 -#define PCRE_INFO_NAMEENTRYSIZE 7 -#define PCRE_INFO_NAMECOUNT 8 -#define PCRE_INFO_NAMETABLE 9 -#define PCRE_INFO_STUDYSIZE 10 -#define PCRE_INFO_DEFAULT_TABLES 11 -#define PCRE_INFO_OKPARTIAL 12 -#define PCRE_INFO_JCHANGED 13 -#define PCRE_INFO_HASCRORLF 14 +#define PCRE_CONFIG_UTF8 0 +#define PCRE_CONFIG_NEWLINE 1 +#define PCRE_CONFIG_LINK_SIZE 2 +#define PCRE_CONFIG_POSIX_MALLOC_THRESHOLD 3 +#define PCRE_CONFIG_MATCH_LIMIT 4 +#define PCRE_CONFIG_STACKRECURSE 5 +#define PCRE_CONFIG_UNICODE_PROPERTIES 6 +#define PCRE_CONFIG_MATCH_LIMIT_RECURSION 7 +#define PCRE_CONFIG_BSR 8 +#define PCRE_EXTRA_STUDY_DATA 0x0001 +#define PCRE_EXTRA_MATCH_LIMIT 0x0002 +#define PCRE_EXTRA_CALLOUT_DATA 0x0004 +#define PCRE_EXTRA_TABLES 0x0008 +#define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0x0010 -#define PCRE_CONFIG_UTF8 0 -#define PCRE_CONFIG_NEWLINE 1 -#define PCRE_CONFIG_LINK_SIZE 2 -#define PCRE_CONFIG_POSIX_MALLOC_THRESHOLD 3 -#define PCRE_CONFIG_MATCH_LIMIT 4 -#define PCRE_CONFIG_STACKRECURSE 5 -#define PCRE_CONFIG_UNICODE_PROPERTIES 6 -#define PCRE_CONFIG_MATCH_LIMIT_RECURSION 7 -#define PCRE_CONFIG_BSR 8 - - -#define PCRE_EXTRA_STUDY_DATA 0x0001 -#define PCRE_EXTRA_MATCH_LIMIT 0x0002 -#define PCRE_EXTRA_CALLOUT_DATA 0x0004 -#define PCRE_EXTRA_TABLES 0x0008 -#define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0x0010 - - -struct real_pcre; /* declaration; the definition is private */ +struct real_pcre; /* declaration; the definition is private */ typedef struct real_pcre pcre; - #ifndef PCRE_SPTR -#define PCRE_SPTR const char * +#define 
PCRE_SPTR const char* #endif - typedef struct pcre_extra { - unsigned long int flags; - void *study_data; - unsigned long int match_limit; - void *callout_data; - const unsigned char *tables; - unsigned long int match_limit_recursion; + unsigned long int flags; + void* study_data; + unsigned long int match_limit; + void* callout_data; + const unsigned char* tables; + unsigned long int match_limit_recursion; } pcre_extra; - typedef struct pcre_callout_block { - int version; - int callout_number; - int *offset_vector; - PCRE_SPTR subject; - int subject_length; - int start_match; - int current_position; - int capture_top; - int capture_last; - void *callout_data; - int pattern_position; - int next_item_length; + int version; + int callout_number; + int* offset_vector; + PCRE_SPTR subject; + int subject_length; + int start_match; + int current_position; + int capture_top; + int capture_last; + void* callout_data; + int pattern_position; + int next_item_length; } pcre_callout_block; - #ifndef VPCOMPAT -PCRE_EXP_DECL void *(*pcre_malloc)(size_t); -PCRE_EXP_DECL void (*pcre_free)(void *); -PCRE_EXP_DECL void *(*pcre_stack_malloc)(size_t); -PCRE_EXP_DECL void (*pcre_stack_free)(void *); -PCRE_EXP_DECL int (*pcre_callout)(pcre_callout_block *); +PCRE_EXP_DECL void* (*pcre_malloc)(size_t); +PCRE_EXP_DECL void (*pcre_free)(void*); +PCRE_EXP_DECL void* (*pcre_stack_malloc)(size_t); +PCRE_EXP_DECL void (*pcre_stack_free)(void*); +PCRE_EXP_DECL int (*pcre_callout)(pcre_callout_block*); #else -PCRE_EXP_DECL void *pcre_malloc(size_t); -PCRE_EXP_DECL void pcre_free(void *); -PCRE_EXP_DECL void *pcre_stack_malloc(size_t); -PCRE_EXP_DECL void pcre_stack_free(void *); -PCRE_EXP_DECL int pcre_callout(pcre_callout_block *); +PCRE_EXP_DECL void* pcre_malloc(size_t); +PCRE_EXP_DECL void pcre_free(void*); +PCRE_EXP_DECL void* pcre_stack_malloc(size_t); +PCRE_EXP_DECL void pcre_stack_free(void*); +PCRE_EXP_DECL int pcre_callout(pcre_callout_block*); #endif -pcre *pcre_compile(const char *, 
int, const char **, int *, - const unsigned char *); -pcre *pcre_compile2(const char *, int, int *, const char **, - int *, const unsigned char *); -int pcre_exec(const pcre *, const pcre_extra *, PCRE_SPTR, - int, int, int, int *, int); -int pcre_fullinfo(const pcre *, const pcre_extra *, int, - void *); +pcre* pcre_compile(const char*, int, const char**, int*, const unsigned char*); +pcre* pcre_compile2(const char*, + int, + int*, + const char**, + int*, + const unsigned char*); +int pcre_exec(const pcre*, + const pcre_extra*, + PCRE_SPTR, + int, + int, + int, + int*, + int); +int pcre_fullinfo(const pcre*, const pcre_extra*, int, void*); #ifdef __cplusplus } diff --git a/package/re/pcre_chartables.c b/package/re/pcre_chartables.c index 736adc518..12b788475 100644 --- a/package/re/pcre_chartables.c +++ b/package/re/pcre_chartables.c @@ -3,171 +3,130 @@ const unsigned char _pcre_default_tables[] = { -/* This table is a lower casing table. */ + /* This table is a lower casing table. */ - 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, - 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, - 56, 57, 58, 59, 60, 61, 62, 63, - 64, 97, 98, 99,100,101,102,103, - 104,105,106,107,108,109,110,111, - 112,113,114,115,116,117,118,119, - 120,121,122, 91, 92, 93, 94, 95, - 96, 97, 98, 99,100,101,102,103, - 104,105,106,107,108,109,110,111, - 112,113,114,115,116,117,118,119, - 120,121,122,123,124,125,126,127, - 128,129,130,131,132,133,134,135, - 136,137,138,139,140,141,142,143, - 144,145,146,147,148,149,150,151, - 152,153,154,155,156,157,158,159, - 160,161,162,163,164,165,166,167, - 168,169,170,171,172,173,174,175, - 176,177,178,179,180,181,182,183, - 184,185,186,187,188,189,190,191, - 192,193,194,195,196,197,198,199, - 200,201,202,203,204,205,206,207, - 208,209,210,211,212,213,214,215, - 216,217,218,219,220,221,222,223, - 224,225,226,227,228,229,230,231, - 
232,233,234,235,236,237,238,239, - 240,241,242,243,244,245,246,247, - 248,249,250,251,252,253,254,255, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, + 59, 60, 61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, + 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, + 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, + 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, + 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, + 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, + 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, + 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, + 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, + 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, + 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, + 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, + 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, -/* This table is a case flipping table. */ + /* This table is a case flipping table. 
*/ - 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, - 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, - 56, 57, 58, 59, 60, 61, 62, 63, - 64, 97, 98, 99,100,101,102,103, - 104,105,106,107,108,109,110,111, - 112,113,114,115,116,117,118,119, - 120,121,122, 91, 92, 93, 94, 95, - 96, 65, 66, 67, 68, 69, 70, 71, - 72, 73, 74, 75, 76, 77, 78, 79, - 80, 81, 82, 83, 84, 85, 86, 87, - 88, 89, 90,123,124,125,126,127, - 128,129,130,131,132,133,134,135, - 136,137,138,139,140,141,142,143, - 144,145,146,147,148,149,150,151, - 152,153,154,155,156,157,158,159, - 160,161,162,163,164,165,166,167, - 168,169,170,171,172,173,174,175, - 176,177,178,179,180,181,182,183, - 184,185,186,187,188,189,190,191, - 192,193,194,195,196,197,198,199, - 200,201,202,203,204,205,206,207, - 208,209,210,211,212,213,214,215, - 216,217,218,219,220,221,222,223, - 224,225,226,227,228,229,230,231, - 232,233,234,235,236,237,238,239, - 240,241,242,243,244,245,246,247, - 248,249,250,251,252,253,254,255, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, + 59, 60, 61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, + 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, + 91, 92, 93, 94, 95, 96, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, + 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, + 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, + 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, + 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, + 188, 189, 190, 191, 192, 193, 194, 195, 196, 
197, 198, 199, 200, 201, 202, + 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, + 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, + 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, + 248, 249, 250, 251, 252, 253, 254, 255, -/* This table contains bit maps for various character classes. Each map is 32 -bytes long and the bits run from the least significant end of each byte. The -classes that have their own maps are: space, xdigit, digit, upper, lower, word, -graph, print, punct, and cntrl. Other classes are built from combinations. */ + /* This table contains bit maps for various character classes. Each map is + 32 bytes long and the bits run from the least significant end of each byte. + The classes that have their own maps are: space, xdigit, digit, upper, + lower, word, graph, print, punct, and cntrl. Other classes are built from + combinations. */ - 0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00, 0x3e, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03, - 0x7e,0x00,0x00,0x00,0x7e,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x03, 0x7e, 0x00, 0x00, 0x00, + 0x7e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x03, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0xfe,0xff,0xff,0x07,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0xff, 0xff, 0x07, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0xfe,0xff,0xff,0x07, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xfe, 0xff, 0xff, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03, - 0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x03, 0xfe, 0xff, 0xff, 0x87, + 0xfe, 0xff, 0xff, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00,0x00,0x00,0x00,0xfe,0xff,0xff,0xff, - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x7f, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00, 0x00, 0x00, 0x00, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0x7f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff, - 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x7f, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0x7f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00,0x00,0x00,0x00,0xfe,0xff,0x00,0xfc, - 0x01,0x00,0x00,0xf8,0x01,0x00,0x00,0x78, - 
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00, 0x00, 0x00, 0x00, 0xfe, 0xff, 0x00, 0xfc, 0x01, 0x00, 0x00, 0xf8, + 0x01, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -/* This table identifies various classes of character by individual bits: - 0x01 white space character - 0x02 letter - 0x04 decimal digit - 0x08 hexadecimal digit - 0x10 alphanumeric or '_' - 0x80 regular expression metacharacter or binary zero -*/ + /* This table identifies various classes of character by individual bits: + 0x01 white space character + 0x02 letter + 0x04 decimal digit + 0x08 hexadecimal digit + 0x10 alphanumeric or '_' + 0x80 regular expression metacharacter or binary zero + */ - 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */ - 0x00,0x01,0x01,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */ - 0x01,0x00,0x00,0x00,0x80,0x00,0x00,0x00, /* - ' */ - 0x80,0x80,0x80,0x80,0x00,0x00,0x80,0x00, /* ( - / */ - 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */ - 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x80, /* 8 - ? 
*/ - 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* @ - G */ - 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* H - O */ - 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* P - W */ - 0x12,0x12,0x12,0x80,0x80,0x00,0x80,0x10, /* X - _ */ - 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* ` - g */ - 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* h - o */ - 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* p - w */ - 0x12,0x12,0x12,0x80,0x80,0x00,0x00,0x00, /* x -127 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */ + 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0- 7 */ + 0x00, 0x01, 0x01, 0x00, 0x01, 0x01, 0x00, 0x00, /* 8- 15 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 16- 23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 24- 31 */ + 0x01, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, /* - ' */ + 0x80, 0x80, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, /* ( - / */ + 0x1c, 0x1c, 0x1c, 0x1c, 0x1c, 0x1c, 0x1c, 0x1c, /* 0 - 7 */ + 0x1c, 0x1c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 8 - ? 
*/ + 0x00, 0x1a, 0x1a, 0x1a, 0x1a, 0x1a, 0x1a, 0x12, /* @ - G */ + 0x12, 0x12, 0x12, 0x12, 0x12, 0x12, 0x12, 0x12, /* H - O */ + 0x12, 0x12, 0x12, 0x12, 0x12, 0x12, 0x12, 0x12, /* P - W */ + 0x12, 0x12, 0x12, 0x80, 0x80, 0x00, 0x80, 0x10, /* X - _ */ + 0x00, 0x1a, 0x1a, 0x1a, 0x1a, 0x1a, 0x1a, 0x12, /* ` - g */ + 0x12, 0x12, 0x12, 0x12, 0x12, 0x12, 0x12, 0x12, /* h - o */ + 0x12, 0x12, 0x12, 0x12, 0x12, 0x12, 0x12, 0x12, /* p - w */ + 0x12, 0x12, 0x12, 0x80, 0x80, 0x00, 0x00, 0x00, /* x -127 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 128-135 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 136-143 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 144-151 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 152-159 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 160-167 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 168-175 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 176-183 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 184-191 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 192-199 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 200-207 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 208-215 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 216-223 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 224-231 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 232-239 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 240-247 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; /* 248-255 */ /* End of pcre_chartables.c */ diff --git a/package/re/pcre_compile.c b/package/re/pcre_compile.c index 63d0783fc..7b621cb3a 100644 --- a/package/re/pcre_compile.c +++ b/package/re/pcre_compile.c @@ -2,16 +2,14 @@ /* This module contains the external function pcre_compile(), along with supporting internal functions that are not used by other modules. 
*/ - -#include "re_config.h" #include "PikaObj.h" -#define NLBLOCK cd /* Block containing newline information */ -#define PSSTART start_pattern /* Field containing processed string start */ -#define PSEND end_pattern /* Field containing processed string end */ +#include "re_config.h" +#define NLBLOCK cd /* Block containing newline information */ +#define PSSTART start_pattern /* Field containing processed string start */ +#define PSEND end_pattern /* Field containing processed string end */ #include "pcre_internal.h" - /* When DEBUG is defined, we need the pcre_printint() function, which is also used by pcretest. DEBUG is not defined when building a production library. */ @@ -19,10 +17,9 @@ used by pcretest. DEBUG is not defined when building a production library. */ #include "pcre_printint.src" #endif - /* Macro for setting individual bits in class bitmaps. */ -#define SETBIT(a,b) a[b/8] |= (1 << (b%8)) +#define SETBIT(a, b) a[b / 8] |= (1 << (b % 8)) /* Maximum length value to check against when making sure that the integer that holds the compiled pattern length does not overflow. We make it a bit less than @@ -31,10 +28,9 @@ to check them every time. */ #define OFLOW_MAX (INT_MAX - 20) - /************************************************* -* Code parameters and static tables * -*************************************************/ + * Code parameters and static tables * + *************************************************/ /* This value specifies the size of stack workspace that is used during the first pre-compile phase that determines how much memory is required. The regex @@ -50,85 +46,236 @@ is 4 there is plenty of room. */ #define COMPILE_WORK_SIZE (4096) - /* Table for handling escaped characters in the range '0'-'z'. Positive returns are simple data values; negative values are for special things like \d and so on. Zero means further processing is needed (for things like \x), or the escape is invalid. 
*/ -#ifndef EBCDIC /* This is the "normal" table for ASCII systems */ +#ifndef EBCDIC /* This is the "normal" table for ASCII systems */ static const short int escapes[] = { - 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */ - 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */ - '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */ --ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */ --ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */ --ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */ - '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */ --ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */ --ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */ - 0, 0, -ESC_z /* x - z */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */ + 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */ + '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */ + -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */ + -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */ + -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */ + '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */ + -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */ + -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */ + 0, 0, -ESC_z /* x - z */ }; -#else /* This is the "abnormal" table for EBCDIC systems */ +#else /* This is the "abnormal" table for EBCDIC systems */ static const short int escapes[] = { -/* 48 */ 0, 0, 0, '.', '<', '(', '+', '|', -/* 50 */ '&', 0, 0, 0, 0, 0, 0, 0, -/* 58 */ 0, 0, '!', '$', '*', ')', ';', '~', -/* 60 */ '-', '/', 0, 0, 0, 0, 0, 0, -/* 68 */ 0, 0, '|', ',', '%', '_', '>', '?', -/* 70 */ 0, 0, 0, 0, 0, 0, 0, 0, -/* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"', -/* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, -/* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0, -/* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p, -/* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0, -/* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0, -/* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0, -/* B0 */ 0, 0, 0, 0, 
0, 0, 0, 0, -/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-', -/* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G, -/* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0, -/* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P, -/* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0, -/* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X, -/* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0, -/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0, -/* F8 */ 0, 0, 0, 0, 0, 0, 0, 0 -}; + /* 48 */ 0, + 0, + 0, + '.', + '<', + '(', + '+', + '|', + /* 50 */ '&', + 0, + 0, + 0, + 0, + 0, + 0, + 0, + /* 58 */ 0, + 0, + '!', + '$', + '*', + ')', + ';', + '~', + /* 60 */ '-', + '/', + 0, + 0, + 0, + 0, + 0, + 0, + /* 68 */ 0, + 0, + '|', + ',', + '%', + '_', + '>', + '?', + /* 70 */ 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + /* 78 */ 0, + '`', + ':', + '#', + '@', + '\'', + '=', + '"', + /* 80 */ 0, + 7, + -ESC_b, + 0, + -ESC_d, + ESC_e, + ESC_f, + 0, + /* 88 */ -ESC_h, + 0, + 0, + '{', + 0, + 0, + 0, + 0, + /* 90 */ 0, + 0, + -ESC_k, + 'l', + 0, + ESC_n, + 0, + -ESC_p, + /* 98 */ 0, + ESC_r, + 0, + '}', + 0, + 0, + 0, + 0, + /* A0 */ 0, + '~', + -ESC_s, + ESC_tee, + 0, + -ESC_v, + -ESC_w, + 0, + /* A8 */ 0, + -ESC_z, + 0, + 0, + 0, + '[', + 0, + 0, + /* B0 */ 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + /* B8 */ 0, + 0, + 0, + 0, + 0, + ']', + '=', + '-', + /* C0 */ '{', + -ESC_A, + -ESC_B, + -ESC_C, + -ESC_D, + -ESC_E, + 0, + -ESC_G, + /* C8 */ -ESC_H, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + /* D0 */ '}', + 0, + -ESC_K, + 0, + 0, + 0, + 0, + -ESC_P, + /* D8 */ -ESC_Q, + -ESC_R, + 0, + 0, + 0, + 0, + 0, + 0, + /* E0 */ '\\', + 0, + -ESC_S, + 0, + 0, + -ESC_V, + -ESC_W, + -ESC_X, + /* E8 */ 0, + -ESC_Z, + 0, + 0, + 0, + 0, + 0, + 0, + /* F0 */ 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + /* F8 */ 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0}; #endif - /* Table of special "verbs" like (*PRUNE). This is a short table, so it is searched linearly. Put all the names into a single string, in order to reduce the number of relocations when a shared library is dynamically linked. 
*/ typedef struct verbitem { - int len; - int op; + int len; + int op; } verbitem; static const char verbnames[] = - "ACCEPT\0" - "COMMIT\0" - "F\0" - "FAIL\0" - "PRUNE\0" - "SKIP\0" - "THEN"; + "ACCEPT\0" + "COMMIT\0" + "F\0" + "FAIL\0" + "PRUNE\0" + "SKIP\0" + "THEN"; -static verbitem verbs[] = { - { 6, OP_ACCEPT }, - { 6, OP_COMMIT }, - { 1, OP_FAIL }, - { 4, OP_FAIL }, - { 5, OP_PRUNE }, - { 4, OP_SKIP }, - { 4, OP_THEN } -}; - -static int verbcount = sizeof(verbs)/sizeof(verbitem); +static verbitem verbs[] = {{6, OP_ACCEPT}, {6, OP_COMMIT}, {1, OP_FAIL}, + {4, OP_FAIL}, {5, OP_PRUNE}, {4, OP_SKIP}, + {4, OP_THEN}}; +static int verbcount = sizeof(verbs) / sizeof(verbitem); /* Tables of names of POSIX character classes and their lengths. The names are now all in a single string, to reduce the number of relocations when a shared @@ -137,12 +284,23 @@ length entry. The first three must be alpha, lower, upper, as this is assumed for handling case independence. */ static const char posix_names[] = - "alpha\0" "lower\0" "upper\0" "alnum\0" "ascii\0" "blank\0" - "cntrl\0" "digit\0" "graph\0" "print\0" "punct\0" "space\0" - "word\0" "xdigit"; + "alpha\0" + "lower\0" + "upper\0" + "alnum\0" + "ascii\0" + "blank\0" + "cntrl\0" + "digit\0" + "graph\0" + "print\0" + "punct\0" + "space\0" + "word\0" + "xdigit"; -static const uschar posix_name_lengths[] = { - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 }; +static const uschar posix_name_lengths[] = {5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 4, 6, 0}; /* Table of class bit maps for each POSIX class. Each class is formed from a base map, with an optional addition or removal of another map. Then, for some @@ -155,24 +313,23 @@ absolute value of the third field has these meanings: 0 => no tweaking, 1 => remove vertical space characters, 2 => remove underscore. 
*/ static const int posix_class_maps[] = { - cbit_word, cbit_digit, -2, /* alpha */ - cbit_lower, -1, 0, /* lower */ - cbit_upper, -1, 0, /* upper */ - cbit_word, -1, 2, /* alnum - word without underscore */ - cbit_print, cbit_cntrl, 0, /* ascii */ - cbit_space, -1, 1, /* blank - a GNU extension */ - cbit_cntrl, -1, 0, /* cntrl */ - cbit_digit, -1, 0, /* digit */ - cbit_graph, -1, 0, /* graph */ - cbit_print, -1, 0, /* print */ - cbit_punct, -1, 0, /* punct */ - cbit_space, -1, 0, /* space */ - cbit_word, -1, 0, /* word - a Perl extension */ - cbit_xdigit,-1, 0 /* xdigit */ + cbit_word, cbit_digit, -2, /* alpha */ + cbit_lower, -1, 0, /* lower */ + cbit_upper, -1, 0, /* upper */ + cbit_word, -1, 2, /* alnum - word without underscore */ + cbit_print, cbit_cntrl, 0, /* ascii */ + cbit_space, -1, 1, /* blank - a GNU extension */ + cbit_cntrl, -1, 0, /* cntrl */ + cbit_digit, -1, 0, /* digit */ + cbit_graph, -1, 0, /* graph */ + cbit_print, -1, 0, /* print */ + cbit_punct, -1, 0, /* punct */ + cbit_space, -1, 0, /* space */ + cbit_word, -1, 0, /* word - a Perl extension */ + cbit_xdigit, -1, 0 /* xdigit */ }; - -#define STRING(a) # a +#define STRING(a) #a #define XSTRING(s) STRING(s) /* The texts of compile-time error messages. These are "char *" because they @@ -263,7 +420,6 @@ static const char error_texts[] = "subpattern name expected\0" "digit expected after (?+"; - /* Table to identify digits and hex digits. This is used when compiling patterns. Note that the tables in chartables are dependent on the locale, and may mark arbitrary characters as digits - but the PCRE compiling code expects @@ -280,125 +436,131 @@ For convenience, we use the same bit definitions as in chartables: Then we can use ctype_digit and ctype_xdigit in the code. 
*/ -#ifndef EBCDIC /* This is the "normal" case, for ASCII systems */ -static const unsigned char digitab[] = - { - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */ - 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */ - 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */ - 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */ - 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */ +#ifndef EBCDIC /* This is the "normal" case, for ASCII systems */ +static const unsigned 
char digitab[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0- 7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 8- 15 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 16- 23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 24- 31 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* - ' */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* ( - / */ + 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, /* 0 - 7 */ + 0x0c, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 8 - ? */ + 0x00, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x00, /* @ - G */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* H - O */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* P - W */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* X - _ */ + 0x00, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x00, /* ` - g */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* h - o */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* p - w */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* x -127 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 128-135 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 136-143 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 144-151 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 152-159 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 160-167 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 168-175 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 176-183 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 184-191 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 192-199 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 200-207 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 208-215 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 216-223 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 224-231 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 232-239 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 240-247 */ + 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00}; /* 248-255 */ -#else /* This is the "abnormal" case, for EBCDIC systems */ -static const unsigned char digitab[] = - { - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */ - 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */ - 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */ - 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */ - 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */ +#else /* This is the 
"abnormal" case, for EBCDIC systems */ +static const unsigned char digitab[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0- 7 0 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 8- 15 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 16- 23 10 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 24- 31 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 32- 39 20 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 40- 47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 48- 55 30 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 56- 63 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* - 71 40 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 72- | */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* & - 87 50 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 88- 95 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* - -103 60 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 104- ? */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 112-119 70 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 120- " */ + 0x00, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x00, /* 128- g 80 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* h -143 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 144- p 90 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* q -159 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 160- x A0 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* y -175 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* ^ -183 B0 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 184-191 */ + 0x00, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x00, /* { - G C0 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* H -207 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* } - P D0 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Q -223 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* \ - X E0 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Y 
-239 */ + 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, /* 0 - 7 F0 */ + 0x0c, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; /* 8 -255 */ -static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */ - 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */ - 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */ - 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */ - 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */ - 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */ - 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */ - 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */ - 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */ - 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */ - 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */ - 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */ - 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */ - 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */ - 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */ - 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */ - 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */ - 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */ - 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */ - 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */ - 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */ - 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */ - 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */ - 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */ - 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */ 
+static const unsigned char ebcdic_chartab[] = + { /* chartable partial dup */ + 0x80, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, /* 0- 7 */ + 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, /* 8- 15 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, /* 16- 23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 24- 31 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, /* 32- 39 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 40- 47 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 48- 55 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 56- 63 */ + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* - 71 */ + 0x00, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x80, /* 72- | */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* & - 87 */ + 0x00, 0x00, 0x00, 0x80, 0x80, 0x80, 0x00, 0x00, /* 88- 95 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* - -103 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x80, /* 104- ? */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 112-119 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 120- " */ + 0x00, 0x1a, 0x1a, 0x1a, 0x1a, 0x1a, 0x1a, 0x12, /* 128- g */ + 0x12, 0x12, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* h -143 */ + 0x00, 0x12, 0x12, 0x12, 0x12, 0x12, 0x12, 0x12, /* 144- p */ + 0x12, 0x12, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* q -159 */ + 0x00, 0x00, 0x12, 0x12, 0x12, 0x12, 0x12, 0x12, /* 160- x */ + 0x12, 0x12, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* y -175 */ + 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* ^ -183 */ + 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, /* 184-191 */ + 0x80, 0x1a, 0x1a, 0x1a, 0x1a, 0x1a, 0x1a, 0x12, /* { - G */ + 0x12, 0x12, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* H -207 */ + 0x00, 0x12, 0x12, 0x12, 0x12, 0x12, 0x12, 0x12, /* } - P */ + 0x12, 0x12, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Q -223 */ + 0x00, 0x00, 0x12, 0x12, 0x12, 0x12, 0x12, 0x12, /* \ - X */ + 0x12, 0x12, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Y -239 */ + 0x1c, 0x1c, 0x1c, 0x1c, 0x1c, 0x1c, 
0x1c, 0x1c, /* 0 - 7 */ + 0x1c, 0x1c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; /* 8 -255 */ #endif - /* Definition to allow mutual recursion */ -static BOOL - compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int, - int *, int *, branch_chain *, compile_data *, int *); - - +static BOOL compile_regex(int, + int, + uschar**, + const uschar**, + int*, + BOOL, + BOOL, + int, + int*, + int*, + branch_chain*, + compile_data*, + int*); /************************************************* -* Find an error text * -*************************************************/ + * Find an error text * + *************************************************/ /* The error texts are now all in one long string, to save on relocations. As some of the text is of unknown length, we can't use a table of offsets. @@ -409,18 +571,17 @@ Argument: the error number Returns: pointer to the error string */ -static const char * -find_error_text(int n) -{ -const char *s = error_texts; -for (; n > 0; n--) while (*s++ != 0); -return s; +static const char* find_error_text(int n) { + const char* s = error_texts; + for (; n > 0; n--) + while (*s++ != 0) + ; + return s; } - /************************************************* -* Handle escapes * -*************************************************/ + * Handle escapes * + *************************************************/ /* This function is called when a \ has been encountered. 
It either returns a positive value for a simple escape such as \n, or a negative value which @@ -442,273 +603,285 @@ Returns: zero or positive => a data character on error, errorcodeptr is set */ -static int -check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount, - int options, BOOL isclass) -{ -BOOL utf8 = (options & PCRE_UTF8) != 0; -const uschar *ptr = *ptrptr + 1; -int c, i; +static int check_escape(const uschar** ptrptr, + int* errorcodeptr, + int bracount, + int options, + BOOL isclass) { + BOOL utf8 = (options & PCRE_UTF8) != 0; + const uschar* ptr = *ptrptr + 1; + int c, i; -GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */ -ptr--; /* Set pointer back to the last byte */ + GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */ + ptr--; /* Set pointer back to the last byte */ -/* If backslash is at the end of the pattern, it's an error. */ + /* If backslash is at the end of the pattern, it's an error. */ -if (c == 0) *errorcodeptr = ERR1; - -/* Non-alphanumerics are literals. For digits or letters, do an initial lookup -in a table. A non-zero result is something that can be returned immediately. -Otherwise further processing may be required. */ - -#ifndef EBCDIC /* ASCII coding */ -else if (c < '0' || c > 'z') {} /* Not alphanumeric */ -else if ((i = escapes[c - '0']) != 0) c = i; - -#else /* EBCDIC coding */ -else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */ -else if ((i = escapes[c - 0x48]) != 0) c = i; -#endif - -/* Escapes that need further processing, or are illegal. */ - -else - { - const uschar *oldptr; - BOOL braced, negated; - - switch (c) - { - /* A number of Perl escapes are not handled by PCRE. We give an explicit - error. */ - - case 'l': - case 'L': - case 'N': - case 'u': - case 'U': - *errorcodeptr = ERR37; - break; - - /* \g must be followed by a number, either plain or braced. If positive, it - is an absolute backreference. 
If negative, it is a relative backreference. - This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a - reference to a named group. This is part of Perl's movement towards a - unified syntax for back references. As this is synonymous with \k{name}, we - fudge it up by pretending it really was \k. */ - - case 'g': - if (ptr[1] == '{') - { - const uschar *p; - for (p = ptr+2; *p != 0 && *p != '}'; p++) - if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break; - if (*p != 0 && *p != '}') - { - c = -ESC_k; - break; - } - braced = TRUE; - ptr++; - } - else braced = FALSE; - - if (ptr[1] == '-') - { - negated = TRUE; - ptr++; - } - else negated = FALSE; - - c = 0; - while ((digitab[ptr[1]] & ctype_digit) != 0) - c = c * 10 + *(++ptr) - '0'; - - if (c < 0) - { - *errorcodeptr = ERR61; - break; - } - - if (c == 0 || (braced && *(++ptr) != '}')) - { - *errorcodeptr = ERR57; - break; - } - - if (negated) - { - if (c > bracount) - { - *errorcodeptr = ERR15; - break; - } - c = bracount - (c - 1); - } - - c = -(ESC_REF + c); - break; - - /* The handling of escape sequences consisting of a string of digits - starting with one that is not zero is not straightforward. By experiment, - the way Perl works seems to be as follows: - - Outside a character class, the digits are read as a decimal number. If the - number is less than 10, or if there are that many previous extracting - left brackets, then it is a back reference. Otherwise, up to three octal - digits are read to form an escaped byte. Thus \123 is likely to be octal - 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal - value is greater than 377, the least significant 8 bits are taken. Inside a - character class, \ followed by a digit is always an octal number. 
*/ - - case '1': case '2': case '3': case '4': case '5': - case '6': case '7': case '8': case '9': - - if (!isclass) - { - oldptr = ptr; - c -= '0'; - while ((digitab[ptr[1]] & ctype_digit) != 0) - c = c * 10 + *(++ptr) - '0'; - if (c < 0) - { - *errorcodeptr = ERR61; - break; - } - if (c < 10 || c <= bracount) - { - c = -(ESC_REF + c); - break; - } - ptr = oldptr; /* Put the pointer back and fall through */ - } - - /* Handle an octal number following \. If the first digit is 8 or 9, Perl - generates a binary zero byte and treats the digit as a following literal. - Thus we have to pull back the pointer by one. */ - - if ((c = *ptr) >= '8') - { - ptr--; - c = 0; - break; - } - - /* \0 always starts an octal number, but we may drop through to here with a - larger first octal digit. The original code used just to take the least - significant 8 bits of octal numbers (I think this is what early Perls used - to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more - than 3 octal digits. */ - - case '0': - c -= '0'; - while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7') - c = c * 8 + *(++ptr) - '0'; - if (!utf8 && c > 255) *errorcodeptr = ERR51; - break; - - /* \x is complicated. \x{ddd} is a character number which can be greater - than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is - treated as a data character. */ - - case 'x': - if (ptr[1] == '{') - { - const uschar *pt = ptr + 2; - int count = 0; - - c = 0; - while ((digitab[*pt] & ctype_xdigit) != 0) - { - register int cc = *pt++; - if (c == 0 && cc == '0') continue; /* Leading zeroes */ - count++; - -#ifndef EBCDIC /* ASCII coding */ - if (cc >= 'a') cc -= 32; /* Convert to upper case */ - c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10)); -#else /* EBCDIC coding */ - if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */ - c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10)); -#endif - } - - if (*pt == '}') - { - if (c < 0 || count > (utf8? 
8 : 2)) *errorcodeptr = ERR34; - ptr = pt; - break; - } - - /* If the sequence of hex digits does not end with '}', then we don't - recognize this construct; fall through to the normal \x handling. */ - } - - /* Read just a single-byte hex-defined char */ - - c = 0; - while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0) - { - int cc; /* Some compilers don't like ++ */ - cc = *(++ptr); /* in initializers */ -#ifndef EBCDIC /* ASCII coding */ - if (cc >= 'a') cc -= 32; /* Convert to upper case */ - c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10)); -#else /* EBCDIC coding */ - if (cc <= 'z') cc += 64; /* Convert to upper case */ - c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10)); -#endif - } - break; - - /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped. - This coding is ASCII-specific, but then the whole concept of \cx is - ASCII-specific. (However, an EBCDIC equivalent has now been added.) */ - - case 'c': - c = *(++ptr); if (c == 0) - { - *errorcodeptr = ERR2; - break; - } + *errorcodeptr = ERR1; -#ifndef EBCDIC /* ASCII coding */ - if (c >= 'a' && c <= 'z') c -= 32; - c ^= 0x40; -#else /* EBCDIC coding */ - if (c >= 'a' && c <= 'z') c += 64; - c ^= 0xC0; + /* Non-alphanumerics are literals. For digits or letters, do an initial + lookup in a table. A non-zero result is something that can be returned + immediately. Otherwise further processing may be required. */ + +#ifndef EBCDIC /* ASCII coding */ + else if (c < '0' || c > 'z') { + } /* Not alphanumeric */ + else if ((i = escapes[c - '0']) != 0) + c = i; + +#else /* EBCDIC coding */ + else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) { + } /* Not alphanumeric */ + else if ((i = escapes[c - 0x48]) != 0) + c = i; #endif - break; - /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any - other alphanumeric following \ is an error if PCRE_EXTRA was set; - otherwise, for Perl compatibility, it is a literal. 
This code looks a bit - odd, but there used to be some cases other than the default, and there may - be again in future, so I haven't "optimized" it. */ + /* Escapes that need further processing, or are illegal. */ - default: - if ((options & PCRE_EXTRA) != 0) switch(c) - { - default: - *errorcodeptr = ERR3; - break; - } - break; + else { + const uschar* oldptr; + BOOL braced, negated; + + switch (c) { + /* A number of Perl escapes are not handled by PCRE. We give an + explicit error. */ + + case 'l': + case 'L': + case 'N': + case 'u': + case 'U': + *errorcodeptr = ERR37; + break; + + /* \g must be followed by a number, either plain or braced. If + positive, it is an absolute backreference. If negative, it is a + relative backreference. This is a Perl 5.10 feature. Perl 5.10 + also supports \g{name} as a reference to a named group. This is + part of Perl's movement towards a unified syntax for back + references. As this is synonymous with \k{name}, we fudge it up + by pretending it really was \k. */ + + case 'g': + if (ptr[1] == '{') { + const uschar* p; + for (p = ptr + 2; *p != 0 && *p != '}'; p++) + if (*p != '-' && (digitab[*p] & ctype_digit) == 0) + break; + if (*p != 0 && *p != '}') { + c = -ESC_k; + break; + } + braced = TRUE; + ptr++; + } else + braced = FALSE; + + if (ptr[1] == '-') { + negated = TRUE; + ptr++; + } else + negated = FALSE; + + c = 0; + while ((digitab[ptr[1]] & ctype_digit) != 0) + c = c * 10 + *(++ptr) - '0'; + + if (c < 0) { + *errorcodeptr = ERR61; + break; + } + + if (c == 0 || (braced && *(++ptr) != '}')) { + *errorcodeptr = ERR57; + break; + } + + if (negated) { + if (c > bracount) { + *errorcodeptr = ERR15; + break; + } + c = bracount - (c - 1); + } + + c = -(ESC_REF + c); + break; + + /* The handling of escape sequences consisting of a string of + digits starting with one that is not zero is not + straightforward. 
By experiment, the way Perl works seems to be + as follows: + + Outside a character class, the digits are read as a decimal + number. If the number is less than 10, or if there are that many + previous extracting left brackets, then it is a back reference. + Otherwise, up to three octal digits are read to form an escaped + byte. Thus \123 is likely to be octal 123 (cf \0123, which is + octal 012 followed by the literal 3). If the octal value is + greater than 377, the least significant 8 bits are taken. Inside + a character class, \ followed by a digit is always an octal + number. */ + + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + + if (!isclass) { + oldptr = ptr; + c -= '0'; + while ((digitab[ptr[1]] & ctype_digit) != 0) + c = c * 10 + *(++ptr) - '0'; + if (c < 0) { + *errorcodeptr = ERR61; + break; + } + if (c < 10 || c <= bracount) { + c = -(ESC_REF + c); + break; + } + ptr = oldptr; /* Put the pointer back and fall through */ + } + + /* Handle an octal number following \. If the first digit is 8 + or 9, Perl generates a binary zero byte and treats the digit as + a following literal. Thus we have to pull back the pointer by + one. */ + + if ((c = *ptr) >= '8') { + ptr--; + c = 0; + break; + } + + /* \0 always starts an octal number, but we may drop through to + here with a larger first octal digit. The original code used + just to take the least significant 8 bits of octal numbers (I + think this is what early Perls used to do). Nowadays we allow + for larger numbers in UTF-8 mode, but no more than 3 octal + digits. */ + + case '0': + c -= '0'; + while (i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7') + c = c * 8 + *(++ptr) - '0'; + if (!utf8 && c > 255) + *errorcodeptr = ERR51; + break; + + /* \x is complicated. \x{ddd} is a character number which can be + greater than 0xff in utf8 mode, but only if the ddd are hex + digits. If not, { is treated as a data character. 
*/ + + case 'x': + if (ptr[1] == '{') { + const uschar* pt = ptr + 2; + int count = 0; + + c = 0; + while ((digitab[*pt] & ctype_xdigit) != 0) { + register int cc = *pt++; + if (c == 0 && cc == '0') + continue; /* Leading zeroes */ + count++; + +#ifndef EBCDIC /* ASCII coding */ + if (cc >= 'a') + cc -= 32; /* Convert to upper case */ + c = (c << 4) + cc - ((cc < 'A') ? '0' : ('A' - 10)); +#else /* EBCDIC coding */ + if (cc >= 'a' && cc <= 'z') + cc += 64; /* Convert to upper case */ + c = (c << 4) + cc - ((cc >= '0') ? '0' : ('A' - 10)); +#endif + } + + if (*pt == '}') { + if (c < 0 || count > (utf8 ? 8 : 2)) + *errorcodeptr = ERR34; + ptr = pt; + break; + } + + /* If the sequence of hex digits does not end with '}', then + we don't recognize this construct; fall through to the + normal \x handling. */ + } + + /* Read just a single-byte hex-defined char */ + + c = 0; + while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0) { + int cc; /* Some compilers don't like ++ */ + cc = *(++ptr); /* in initializers */ +#ifndef EBCDIC /* ASCII coding */ + if (cc >= 'a') + cc -= 32; /* Convert to upper case */ + c = c * 16 + cc - ((cc < 'A') ? '0' : ('A' - 10)); +#else /* EBCDIC coding */ + if (cc <= 'z') + cc += 64; /* Convert to upper case */ + c = c * 16 + cc - ((cc >= '0') ? '0' : ('A' - 10)); +#endif + } + break; + + /* For \c, a following letter is upper-cased; then the 0x40 bit + is flipped. This coding is ASCII-specific, but then the whole + concept of \cx is ASCII-specific. (However, an EBCDIC equivalent + has now been added.) */ + + case 'c': + c = *(++ptr); + if (c == 0) { + *errorcodeptr = ERR2; + break; + } + +#ifndef EBCDIC /* ASCII coding */ + if (c >= 'a' && c <= 'z') + c -= 32; + c ^= 0x40; +#else /* EBCDIC coding */ + if (c >= 'a' && c <= 'z') + c += 64; + c ^= 0xC0; +#endif + break; + + /* PCRE_EXTRA enables extensions to Perl in the matter of + escapes. 
Any other alphanumeric following \ is an error if + PCRE_EXTRA was set; otherwise, for Perl compatibility, it is a + literal. This code looks a bit odd, but there used to be some + cases other than the default, and there may be again in future, + so I haven't "optimized" it. */ + + default: + if ((options & PCRE_EXTRA) != 0) + switch (c) { + default: + *errorcodeptr = ERR3; + break; + } + break; + } } - } -*ptrptr = ptr; -return c; + *ptrptr = ptr; + return c; } - - #ifdef SUPPORT_UCP /************************************************* -* Handle \P and \p * -*************************************************/ + * Handle \P and \p * + *************************************************/ /* This function is called after \P or \p has been encountered, provided that PCRE is compiled with support for Unicode properties. On entry, ptrptr is @@ -724,83 +897,82 @@ Argument: Returns: type value from ucp_type_table, or -1 for an invalid type */ -static int -get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr) -{ -int c, i, bot, top; -const uschar *ptr = *ptrptr; -char name[32]; +static int get_ucp(const uschar** ptrptr, + BOOL* negptr, + int* dptr, + int* errorcodeptr) { + int c, i, bot, top; + const uschar* ptr = *ptrptr; + char name[32]; -c = *(++ptr); -if (c == 0) goto ERROR_RETURN; - -*negptr = FALSE; - -/* \P or \p can be followed by a name in {}, optionally preceded by ^ for -negation. */ - -if (c == '{') - { - if (ptr[1] == '^') - { - *negptr = TRUE; - ptr++; - } - for (i = 0; i < (int)sizeof(name) - 1; i++) - { c = *(++ptr); - if (c == 0) goto ERROR_RETURN; - if (c == '}') break; - name[i] = c; + if (c == 0) + goto ERROR_RETURN; + + *negptr = FALSE; + + /* \P or \p can be followed by a name in {}, optionally preceded by ^ for + negation. 
*/ + + if (c == '{') { + if (ptr[1] == '^') { + *negptr = TRUE; + ptr++; + } + for (i = 0; i < (int)sizeof(name) - 1; i++) { + c = *(++ptr); + if (c == 0) + goto ERROR_RETURN; + if (c == '}') + break; + name[i] = c; + } + if (c != '}') + goto ERROR_RETURN; + name[i] = 0; } - if (c !='}') goto ERROR_RETURN; - name[i] = 0; - } -/* Otherwise there is just one following character */ + /* Otherwise there is just one following character */ -else - { - name[0] = c; - name[1] = 0; - } - -*ptrptr = ptr; - -/* Search for a recognized property name using binary chop */ - -bot = 0; -top = _pcre_utt_size; - -while (bot < top) - { - i = (bot + top) >> 1; - c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset); - if (c == 0) - { - *dptr = _pcre_utt[i].value; - return _pcre_utt[i].type; + else { + name[0] = c; + name[1] = 0; } - if (c > 0) bot = i + 1; else top = i; - } -*errorcodeptr = ERR47; -*ptrptr = ptr; -return -1; + *ptrptr = ptr; + + /* Search for a recognized property name using binary chop */ + + bot = 0; + top = _pcre_utt_size; + + while (bot < top) { + i = (bot + top) >> 1; + c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset); + if (c == 0) { + *dptr = _pcre_utt[i].value; + return _pcre_utt[i].type; + } + if (c > 0) + bot = i + 1; + else + top = i; + } + + *errorcodeptr = ERR47; + *ptrptr = ptr; + return -1; ERROR_RETURN: -*errorcodeptr = ERR46; -*ptrptr = ptr; -return -1; + *errorcodeptr = ERR46; + *ptrptr = ptr; + return -1; } #endif - - - /************************************************* -* Check for counted repeat * -*************************************************/ + * Check for counted repeat * + *************************************************/ /* This function is called when a '{' is encountered in a place where it might start a quantifier. It looks ahead to see if it really is a quantifier or not. 
@@ -813,27 +985,30 @@ Arguments: Returns: TRUE or FALSE */ -static BOOL -is_counted_repeat(const uschar *p) -{ -if ((digitab[*p++] & ctype_digit) == 0) return FALSE; -while ((digitab[*p] & ctype_digit) != 0) p++; -if (*p == '}') return TRUE; +static BOOL is_counted_repeat(const uschar* p) { + if ((digitab[*p++] & ctype_digit) == 0) + return FALSE; + while ((digitab[*p] & ctype_digit) != 0) + p++; + if (*p == '}') + return TRUE; -if (*p++ != ',') return FALSE; -if (*p == '}') return TRUE; + if (*p++ != ',') + return FALSE; + if (*p == '}') + return TRUE; -if ((digitab[*p++] & ctype_digit) == 0) return FALSE; -while ((digitab[*p] & ctype_digit) != 0) p++; + if ((digitab[*p++] & ctype_digit) == 0) + return FALSE; + while ((digitab[*p] & ctype_digit) != 0) + p++; -return (*p == '}'); + return (*p == '}'); } - - /************************************************* -* Read repeat counts * -*************************************************/ + * Read repeat counts * + *************************************************/ /* Read an item of the form {n,m} and return the values. This is called only after is_counted_repeat() has confirmed that a repeat-count quantifier exists, @@ -850,57 +1025,56 @@ Returns: pointer to '}' on success; current ptr on error, with errorcodeptr set non-zero */ -static const uschar * -read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr) -{ -int min = 0; -int max = -1; +static const uschar* read_repeat_counts(const uschar* p, + int* minp, + int* maxp, + int* errorcodeptr) { + int min = 0; + int max = -1; -/* Read the minimum value and do a paranoid check: a negative value indicates -an integer overflow. */ + /* Read the minimum value and do a paranoid check: a negative value + indicates an integer overflow. 
*/ -while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0'; -if (min < 0 || min > 65535) - { - *errorcodeptr = ERR5; - return p; - } - -/* Read the maximum value if there is one, and again do a paranoid on its size. -Also, max must not be less than min. */ - -if (*p == '}') max = min; else - { - if (*(++p) != '}') - { - max = 0; - while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0'; - if (max < 0 || max > 65535) - { - *errorcodeptr = ERR5; - return p; - } - if (max < min) - { - *errorcodeptr = ERR4; - return p; - } + while ((digitab[*p] & ctype_digit) != 0) + min = min * 10 + *p++ - '0'; + if (min < 0 || min > 65535) { + *errorcodeptr = ERR5; + return p; } - } -/* Fill in the required variables, and pass back the pointer to the terminating -'}'. */ + /* Read the maximum value if there is one, and again do a paranoid on its + size. Also, max must not be less than min. */ -*minp = min; -*maxp = max; -return p; + if (*p == '}') + max = min; + else { + if (*(++p) != '}') { + max = 0; + while ((digitab[*p] & ctype_digit) != 0) + max = max * 10 + *p++ - '0'; + if (max < 0 || max > 65535) { + *errorcodeptr = ERR5; + return p; + } + if (max < min) { + *errorcodeptr = ERR4; + return p; + } + } + } + + /* Fill in the required variables, and pass back the pointer to the + terminating + '}'. */ + + *minp = min; + *maxp = max; + return p; } - - /************************************************* -* Find forward referenced subpattern * -*************************************************/ + * Find forward referenced subpattern * + *************************************************/ /* This function scans along a pattern's text looking for capturing subpatterns, and counting them. 
If it finds a named pattern that matches the @@ -919,100 +1093,108 @@ Arguments: Returns: the number of the named subpattern, or -1 if not found */ -static int -find_parens(const uschar *ptr, int count, const uschar *name, int lorn, - BOOL xmode) -{ -const uschar *thisname; +static int find_parens(const uschar* ptr, + int count, + const uschar* name, + int lorn, + BOOL xmode) { + const uschar* thisname; -for (; *ptr != 0; ptr++) - { - int term; + for (; *ptr != 0; ptr++) { + int term; - /* Skip over backslashed characters and also entire \Q...\E */ + /* Skip over backslashed characters and also entire \Q...\E */ - if (*ptr == '\\') - { - if (*(++ptr) == 0) return -1; - if (*ptr == 'Q') for (;;) - { - while (*(++ptr) != 0 && *ptr != '\\'); - if (*ptr == 0) return -1; - if (*(++ptr) == 'E') break; - } - continue; - } - - /* Skip over character classes */ - - if (*ptr == '[') - { - while (*(++ptr) != ']') - { - if (*ptr == 0) return -1; - if (*ptr == '\\') - { - if (*(++ptr) == 0) return -1; - if (*ptr == 'Q') for (;;) - { - while (*(++ptr) != 0 && *ptr != '\\'); - if (*ptr == 0) return -1; - if (*(++ptr) == 'E') break; - } - continue; + if (*ptr == '\\') { + if (*(++ptr) == 0) + return -1; + if (*ptr == 'Q') + for (;;) { + while (*(++ptr) != 0 && *ptr != '\\') + ; + if (*ptr == 0) + return -1; + if (*(++ptr) == 'E') + break; + } + continue; } - } - continue; + + /* Skip over character classes */ + + if (*ptr == '[') { + while (*(++ptr) != ']') { + if (*ptr == 0) + return -1; + if (*ptr == '\\') { + if (*(++ptr) == 0) + return -1; + if (*ptr == 'Q') + for (;;) { + while (*(++ptr) != 0 && *ptr != '\\') + ; + if (*ptr == 0) + return -1; + if (*(++ptr) == 'E') + break; + } + continue; + } + } + continue; + } + + /* Skip comments in /x mode */ + + if (xmode && *ptr == '#') { + while (*(++ptr) != 0 && *ptr != '\n') + ; + if (*ptr == 0) + return -1; + continue; + } + + /* An opening parens must now be a real metacharacter */ + + if (*ptr != '(') + continue; + if (ptr[1] != 
'?' && ptr[1] != '*') { + count++; + if (name == NULL && count == lorn) + return count; + continue; + } + + ptr += 2; + if (*ptr == 'P') + ptr++; /* Allow optional P */ + + /* We have to disambiguate (? */ + + if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') && *ptr != '\'') + continue; + + count++; + + if (name == NULL && count == lorn) + return count; + term = *ptr++; + if (term == '<') + term = '>'; + thisname = ptr; + while (*ptr != term) + ptr++; + if (name != NULL && lorn == ptr - thisname && + strncmp((const char*)name, (const char*)thisname, lorn) == 0) + return count; } - /* Skip comments in /x mode */ - - if (xmode && *ptr == '#') - { - while (*(++ptr) != 0 && *ptr != '\n'); - if (*ptr == 0) return -1; - continue; - } - - /* An opening parens must now be a real metacharacter */ - - if (*ptr != '(') continue; - if (ptr[1] != '?' && ptr[1] != '*') - { - count++; - if (name == NULL && count == lorn) return count; - continue; - } - - ptr += 2; - if (*ptr == 'P') ptr++; /* Allow optional P */ - - /* We have to disambiguate (? */ - - if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') && - *ptr != '\'') - continue; - - count++; - - if (name == NULL && count == lorn) return count; - term = *ptr++; - if (term == '<') term = '>'; - thisname = ptr; - while (*ptr != term) ptr++; - if (name != NULL && lorn == ptr - thisname && - strncmp((const char *)name, (const char *)thisname, lorn) == 0) - return count; - } - -return -1; + return -1; } - - /************************************************* -* Find first significant op code * -*************************************************/ + * Find first significant op code * + *************************************************/ /* This is called by several functions that scan a compiled expression looking for a fixed first character, or an anchoring op code etc. 
It skips over things @@ -1030,53 +1212,53 @@ Arguments: Returns: pointer to the first significant opcode */ -static const uschar* -first_significant_code(const uschar *code, int *options, int optbit, - BOOL skipassert) -{ -for (;;) - { - switch ((int)*code) - { - case OP_OPT: - if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit)) - *options = (int)code[1]; - code += 2; - break; +static const uschar* first_significant_code(const uschar* code, + int* options, + int optbit, + BOOL skipassert) { + for (;;) { + switch ((int)*code) { + case OP_OPT: + if (optbit > 0 && + ((int)code[1] & optbit) != (*options & optbit)) + *options = (int)code[1]; + code += 2; + break; - case OP_ASSERT_NOT: - case OP_ASSERTBACK: - case OP_ASSERTBACK_NOT: - if (!skipassert) return code; - do code += GET(code, 1); while (*code == OP_ALT); - code += _pcre_OP_lengths[*code]; - break; + case OP_ASSERT_NOT: + case OP_ASSERTBACK: + case OP_ASSERTBACK_NOT: + if (!skipassert) + return code; + do + code += GET(code, 1); + while (*code == OP_ALT); + code += _pcre_OP_lengths[*code]; + break; - case OP_WORD_BOUNDARY: - case OP_NOT_WORD_BOUNDARY: - if (!skipassert) return code; - /* Fall through */ + case OP_WORD_BOUNDARY: + case OP_NOT_WORD_BOUNDARY: + if (!skipassert) + return code; + /* Fall through */ - case OP_CALLOUT: - case OP_CREF: - case OP_RREF: - case OP_DEF: - code += _pcre_OP_lengths[*code]; - break; + case OP_CALLOUT: + case OP_CREF: + case OP_RREF: + case OP_DEF: + code += _pcre_OP_lengths[*code]; + break; - default: - return code; + default: + return code; + } } - } -/* Control never reaches here */ + /* Control never reaches here */ } - - - /************************************************* -* Find the fixed length of a pattern * -*************************************************/ + * Find the fixed length of a pattern * + *************************************************/ /* Scan a pattern and compute the fixed length of subject that will match it, if the length is fixed. 
This is needed for dealing with backward assertions. @@ -1090,183 +1272,186 @@ Returns: the fixed length, or -1 if there is no fixed length, or -2 if \C was encountered */ -static int -find_fixedlength(uschar *code, int options) -{ -int length = -1; +static int find_fixedlength(uschar* code, int options) { + int length = -1; -register int branchlength = 0; -register uschar *cc = code + 1 + LINK_SIZE; + register int branchlength = 0; + register uschar* cc = code + 1 + LINK_SIZE; -/* Scan along the opcodes for this branch. If we get to the end of the -branch, check the length against that of the other branches. */ + /* Scan along the opcodes for this branch. If we get to the end of the + branch, check the length against that of the other branches. */ -for (;;) - { - int d; - register int op = *cc; - switch (op) - { - case OP_CBRA: - case OP_BRA: - case OP_ONCE: - case OP_COND: - d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options); - if (d < 0) return d; - branchlength += d; - do cc += GET(cc, 1); while (*cc == OP_ALT); - cc += 1 + LINK_SIZE; - break; + for (;;) { + int d; + register int op = *cc; + switch (op) { + case OP_CBRA: + case OP_BRA: + case OP_ONCE: + case OP_COND: + d = find_fixedlength(cc + ((op == OP_CBRA) ? 2 : 0), options); + if (d < 0) + return d; + branchlength += d; + do + cc += GET(cc, 1); + while (*cc == OP_ALT); + cc += 1 + LINK_SIZE; + break; - /* Reached end of a branch; if it's a ket it is the end of a nested - call. If it's ALT it is an alternation in a nested call. If it is - END it's the end of the outer call. All can be handled by the same code. */ + /* Reached end of a branch; if it's a ket it is the end of a + nested call. If it's ALT it is an alternation in a nested call. + If it is END it's the end of the outer call. All can be handled + by the same code. 
*/ - case OP_ALT: - case OP_KET: - case OP_KETRMAX: - case OP_KETRMIN: - case OP_END: - if (length < 0) length = branchlength; - else if (length != branchlength) return -1; - if (*cc != OP_ALT) return length; - cc += 1 + LINK_SIZE; - branchlength = 0; - break; + case OP_ALT: + case OP_KET: + case OP_KETRMAX: + case OP_KETRMIN: + case OP_END: + if (length < 0) + length = branchlength; + else if (length != branchlength) + return -1; + if (*cc != OP_ALT) + return length; + cc += 1 + LINK_SIZE; + branchlength = 0; + break; - /* Skip over assertive subpatterns */ + /* Skip over assertive subpatterns */ - case OP_ASSERT: - case OP_ASSERT_NOT: - case OP_ASSERTBACK: - case OP_ASSERTBACK_NOT: - do cc += GET(cc, 1); while (*cc == OP_ALT); - /* Fall through */ + case OP_ASSERT: + case OP_ASSERT_NOT: + case OP_ASSERTBACK: + case OP_ASSERTBACK_NOT: + do + cc += GET(cc, 1); + while (*cc == OP_ALT); + /* Fall through */ - /* Skip over things that don't match chars */ + /* Skip over things that don't match chars */ - case OP_REVERSE: - case OP_CREF: - case OP_RREF: - case OP_DEF: - case OP_OPT: - case OP_CALLOUT: - case OP_SOD: - case OP_SOM: - case OP_EOD: - case OP_EODN: - case OP_CIRC: - case OP_DOLL: - case OP_NOT_WORD_BOUNDARY: - case OP_WORD_BOUNDARY: - cc += _pcre_OP_lengths[*cc]; - break; + case OP_REVERSE: + case OP_CREF: + case OP_RREF: + case OP_DEF: + case OP_OPT: + case OP_CALLOUT: + case OP_SOD: + case OP_SOM: + case OP_EOD: + case OP_EODN: + case OP_CIRC: + case OP_DOLL: + case OP_NOT_WORD_BOUNDARY: + case OP_WORD_BOUNDARY: + cc += _pcre_OP_lengths[*cc]; + break; - /* Handle literal characters */ + /* Handle literal characters */ - case OP_CHAR: - case OP_CHARNC: - case OP_NOT: - branchlength++; - cc += 2; + case OP_CHAR: + case OP_CHARNC: + case OP_NOT: + branchlength++; + cc += 2; #ifdef SUPPORT_UTF8 - if ((options & PCRE_UTF8) != 0) - { - while ((*cc & 0xc0) == 0x80) cc++; - } + if ((options & PCRE_UTF8) != 0) { + while ((*cc & 0xc0) == 0x80) + cc++; + } #endif - 
break; + break; - /* Handle exact repetitions. The count is already in characters, but we - need to skip over a multibyte character in UTF8 mode. */ + /* Handle exact repetitions. The count is already in characters, + but we need to skip over a multibyte character in UTF8 mode. */ - case OP_EXACT: - branchlength += GET2(cc,1); - cc += 4; + case OP_EXACT: + branchlength += GET2(cc, 1); + cc += 4; #ifdef SUPPORT_UTF8 - if ((options & PCRE_UTF8) != 0) - { - while((*cc & 0x80) == 0x80) cc++; - } + if ((options & PCRE_UTF8) != 0) { + while ((*cc & 0x80) == 0x80) + cc++; + } #endif - break; + break; - case OP_TYPEEXACT: - branchlength += GET2(cc,1); - if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2; - cc += 4; - break; + case OP_TYPEEXACT: + branchlength += GET2(cc, 1); + if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) + cc += 2; + cc += 4; + break; - /* Handle single-char matchers */ + /* Handle single-char matchers */ - case OP_PROP: - case OP_NOTPROP: - cc += 2; - /* Fall through */ + case OP_PROP: + case OP_NOTPROP: + cc += 2; + /* Fall through */ - case OP_NOT_DIGIT: - case OP_DIGIT: - case OP_NOT_WHITESPACE: - case OP_WHITESPACE: - case OP_NOT_WORDCHAR: - case OP_WORDCHAR: - case OP_ANY: - branchlength++; - cc++; - break; + case OP_NOT_DIGIT: + case OP_DIGIT: + case OP_NOT_WHITESPACE: + case OP_WHITESPACE: + case OP_NOT_WORDCHAR: + case OP_WORDCHAR: + case OP_ANY: + branchlength++; + cc++; + break; - /* The single-byte matcher isn't allowed */ + /* The single-byte matcher isn't allowed */ - case OP_ANYBYTE: - return -2; + case OP_ANYBYTE: + return -2; - /* Check a class for variable quantification */ + /* Check a class for variable quantification */ #ifdef SUPPORT_UTF8 - case OP_XCLASS: - cc += GET(cc, 1) - 33; - /* Fall through */ + case OP_XCLASS: + cc += GET(cc, 1) - 33; + /* Fall through */ #endif - case OP_CLASS: - case OP_NCLASS: - cc += 33; + case OP_CLASS: + case OP_NCLASS: + cc += 33; - switch (*cc) - { - case OP_CRSTAR: - case OP_CRMINSTAR: - case 
OP_CRQUERY: - case OP_CRMINQUERY: - return -1; + switch (*cc) { + case OP_CRSTAR: + case OP_CRMINSTAR: + case OP_CRQUERY: + case OP_CRMINQUERY: + return -1; - case OP_CRRANGE: - case OP_CRMINRANGE: - if (GET2(cc,1) != GET2(cc,3)) return -1; - branchlength += GET2(cc,1); - cc += 5; - break; + case OP_CRRANGE: + case OP_CRMINRANGE: + if (GET2(cc, 1) != GET2(cc, 3)) + return -1; + branchlength += GET2(cc, 1); + cc += 5; + break; - default: - branchlength++; - } - break; + default: + branchlength++; + } + break; - /* Anything else is variable length */ + /* Anything else is variable length */ - default: - return -1; + default: + return -1; + } } - } -/* Control never gets here */ + /* Control never gets here */ } - - - /************************************************* -* Scan compiled regex for numbered bracket * -*************************************************/ + * Scan compiled regex for numbered bracket * + *************************************************/ /* This little function scans through a compiled pattern until it finds a capturing bracket with the given number. @@ -1279,96 +1464,94 @@ Arguments: Returns: pointer to the opcode for the bracket, or NULL if not found */ -static const uschar * -find_bracket(const uschar *code, BOOL utf8, int number) -{ -for (;;) - { - register int c = *code; - if (c == OP_END) return NULL; +static const uschar* find_bracket(const uschar* code, BOOL utf8, int number) { + for (;;) { + register int c = *code; + if (c == OP_END) + return NULL; - /* XCLASS is used for classes that cannot be represented just by a bit - map. This includes negated single high-valued characters. The length in - the table is zero; the actual length is stored in the compiled code. */ + /* XCLASS is used for classes that cannot be represented just by a bit + map. This includes negated single high-valued characters. The length in + the table is zero; the actual length is stored in the compiled code. 
*/ - if (c == OP_XCLASS) code += GET(code, 1); + if (c == OP_XCLASS) + code += GET(code, 1); - /* Handle capturing bracket */ + /* Handle capturing bracket */ - else if (c == OP_CBRA) - { - int n = GET2(code, 1+LINK_SIZE); - if (n == number) return (uschar *)code; - code += _pcre_OP_lengths[c]; - } + else if (c == OP_CBRA) { + int n = GET2(code, 1 + LINK_SIZE); + if (n == number) + return (uschar*)code; + code += _pcre_OP_lengths[c]; + } - /* Otherwise, we can get the item's length from the table, except that for - repeated character types, we have to test for \p and \P, which have an extra - two bytes of parameters. */ + /* Otherwise, we can get the item's length from the table, except that + for repeated character types, we have to test for \p and \P, which have + an extra two bytes of parameters. */ - else - { - switch(c) - { - case OP_TYPESTAR: - case OP_TYPEMINSTAR: - case OP_TYPEPLUS: - case OP_TYPEMINPLUS: - case OP_TYPEQUERY: - case OP_TYPEMINQUERY: - case OP_TYPEPOSSTAR: - case OP_TYPEPOSPLUS: - case OP_TYPEPOSQUERY: - if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; - break; + else { + switch (c) { + case OP_TYPESTAR: + case OP_TYPEMINSTAR: + case OP_TYPEPLUS: + case OP_TYPEMINPLUS: + case OP_TYPEQUERY: + case OP_TYPEMINQUERY: + case OP_TYPEPOSSTAR: + case OP_TYPEPOSPLUS: + case OP_TYPEPOSQUERY: + if (code[1] == OP_PROP || code[1] == OP_NOTPROP) + code += 2; + break; - case OP_TYPEUPTO: - case OP_TYPEMINUPTO: - case OP_TYPEEXACT: - case OP_TYPEPOSUPTO: - if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2; - break; - } + case OP_TYPEUPTO: + case OP_TYPEMINUPTO: + case OP_TYPEEXACT: + case OP_TYPEPOSUPTO: + if (code[3] == OP_PROP || code[3] == OP_NOTPROP) + code += 2; + break; + } - /* Add in the fixed length from the table */ + /* Add in the fixed length from the table */ - code += _pcre_OP_lengths[c]; + code += _pcre_OP_lengths[c]; - /* In UTF-8 mode, opcodes that are followed by a character may be followed by - a multi-byte character. 
The length in the table is a minimum, so we have to - arrange to skip the extra bytes. */ + /* In UTF-8 mode, opcodes that are followed by a character may be + followed by a multi-byte character. The length in the table is a + minimum, so we have to arrange to skip the extra bytes. */ #ifdef SUPPORT_UTF8 - if (utf8) switch(c) - { - case OP_CHAR: - case OP_CHARNC: - case OP_EXACT: - case OP_UPTO: - case OP_MINUPTO: - case OP_POSUPTO: - case OP_STAR: - case OP_MINSTAR: - case OP_POSSTAR: - case OP_PLUS: - case OP_MINPLUS: - case OP_POSPLUS: - case OP_QUERY: - case OP_MINQUERY: - case OP_POSQUERY: - if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f]; - break; - } + if (utf8) + switch (c) { + case OP_CHAR: + case OP_CHARNC: + case OP_EXACT: + case OP_UPTO: + case OP_MINUPTO: + case OP_POSUPTO: + case OP_STAR: + case OP_MINSTAR: + case OP_POSSTAR: + case OP_PLUS: + case OP_MINPLUS: + case OP_POSPLUS: + case OP_QUERY: + case OP_MINQUERY: + case OP_POSQUERY: + if (code[-1] >= 0xc0) + code += _pcre_utf8_table4[code[-1] & 0x3f]; + break; + } #endif + } } - } } - - /************************************************* -* Scan compiled regex for recursion reference * -*************************************************/ + * Scan compiled regex for recursion reference * + *************************************************/ /* This little function scans through a compiled pattern until it finds an instance of OP_RECURSE. @@ -1380,88 +1563,87 @@ Arguments: Returns: pointer to the opcode for OP_RECURSE, or NULL if not found */ -static const uschar * -find_recurse(const uschar *code, BOOL utf8) -{ -for (;;) - { - register int c = *code; - if (c == OP_END) return NULL; - if (c == OP_RECURSE) return code; +static const uschar* find_recurse(const uschar* code, BOOL utf8) { + for (;;) { + register int c = *code; + if (c == OP_END) + return NULL; + if (c == OP_RECURSE) + return code; - /* XCLASS is used for classes that cannot be represented just by a bit - map. 
This includes negated single high-valued characters. The length in - the table is zero; the actual length is stored in the compiled code. */ + /* XCLASS is used for classes that cannot be represented just by a bit + map. This includes negated single high-valued characters. The length in + the table is zero; the actual length is stored in the compiled code. */ - if (c == OP_XCLASS) code += GET(code, 1); + if (c == OP_XCLASS) + code += GET(code, 1); - /* Otherwise, we can get the item's length from the table, except that for - repeated character types, we have to test for \p and \P, which have an extra - two bytes of parameters. */ + /* Otherwise, we can get the item's length from the table, except that + for repeated character types, we have to test for \p and \P, which have + an extra two bytes of parameters. */ - else - { - switch(c) - { - case OP_TYPESTAR: - case OP_TYPEMINSTAR: - case OP_TYPEPLUS: - case OP_TYPEMINPLUS: - case OP_TYPEQUERY: - case OP_TYPEMINQUERY: - case OP_TYPEPOSSTAR: - case OP_TYPEPOSPLUS: - case OP_TYPEPOSQUERY: - if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; - break; + else { + switch (c) { + case OP_TYPESTAR: + case OP_TYPEMINSTAR: + case OP_TYPEPLUS: + case OP_TYPEMINPLUS: + case OP_TYPEQUERY: + case OP_TYPEMINQUERY: + case OP_TYPEPOSSTAR: + case OP_TYPEPOSPLUS: + case OP_TYPEPOSQUERY: + if (code[1] == OP_PROP || code[1] == OP_NOTPROP) + code += 2; + break; - case OP_TYPEPOSUPTO: - case OP_TYPEUPTO: - case OP_TYPEMINUPTO: - case OP_TYPEEXACT: - if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2; - break; - } + case OP_TYPEPOSUPTO: + case OP_TYPEUPTO: + case OP_TYPEMINUPTO: + case OP_TYPEEXACT: + if (code[3] == OP_PROP || code[3] == OP_NOTPROP) + code += 2; + break; + } - /* Add in the fixed length from the table */ + /* Add in the fixed length from the table */ - code += _pcre_OP_lengths[c]; + code += _pcre_OP_lengths[c]; - /* In UTF-8 mode, opcodes that are followed by a character may be followed - by a multi-byte 
character. The length in the table is a minimum, so we have - to arrange to skip the extra bytes. */ + /* In UTF-8 mode, opcodes that are followed by a character may be + followed by a multi-byte character. The length in the table is a + minimum, so we have to arrange to skip the extra bytes. */ #ifdef SUPPORT_UTF8 - if (utf8) switch(c) - { - case OP_CHAR: - case OP_CHARNC: - case OP_EXACT: - case OP_UPTO: - case OP_MINUPTO: - case OP_POSUPTO: - case OP_STAR: - case OP_MINSTAR: - case OP_POSSTAR: - case OP_PLUS: - case OP_MINPLUS: - case OP_POSPLUS: - case OP_QUERY: - case OP_MINQUERY: - case OP_POSQUERY: - if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f]; - break; - } + if (utf8) + switch (c) { + case OP_CHAR: + case OP_CHARNC: + case OP_EXACT: + case OP_UPTO: + case OP_MINUPTO: + case OP_POSUPTO: + case OP_STAR: + case OP_MINSTAR: + case OP_POSSTAR: + case OP_PLUS: + case OP_MINPLUS: + case OP_POSPLUS: + case OP_QUERY: + case OP_MINQUERY: + case OP_POSQUERY: + if (code[-1] >= 0xc0) + code += _pcre_utf8_table4[code[-1] & 0x3f]; + break; + } #endif + } } - } } - - /************************************************* -* Scan compiled branch for non-emptiness * -*************************************************/ + * Scan compiled branch for non-emptiness * + *************************************************/ /* This function scans through a branch of a compiled pattern to see whether it can match the empty string or not. 
It is called from could_be_empty() @@ -1479,189 +1661,192 @@ Arguments: Returns: TRUE if what is matched could be empty */ -static BOOL -could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8) -{ -register int c; -for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE); - code < endcode; - code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE)) - { - const uschar *ccode; +static BOOL could_be_empty_branch(const uschar* code, + const uschar* endcode, + BOOL utf8) { + register int c; + for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, + TRUE); + code < endcode; code = first_significant_code( + code + _pcre_OP_lengths[c], NULL, 0, TRUE)) { + const uschar* ccode; - c = *code; + c = *code; - /* Skip over forward assertions; the other assertions are skipped by - first_significant_code() with a TRUE final argument. */ + /* Skip over forward assertions; the other assertions are skipped by + first_significant_code() with a TRUE final argument. */ - if (c == OP_ASSERT) - { - do code += GET(code, 1); while (*code == OP_ALT); - c = *code; - continue; - } + if (c == OP_ASSERT) { + do + code += GET(code, 1); + while (*code == OP_ALT); + c = *code; + continue; + } - /* Groups with zero repeats can of course be empty; skip them. */ + /* Groups with zero repeats can of course be empty; skip them. */ - if (c == OP_BRAZERO || c == OP_BRAMINZERO) - { - code += _pcre_OP_lengths[c]; - do code += GET(code, 1); while (*code == OP_ALT); - c = *code; - continue; - } + if (c == OP_BRAZERO || c == OP_BRAMINZERO) { + code += _pcre_OP_lengths[c]; + do + code += GET(code, 1); + while (*code == OP_ALT); + c = *code; + continue; + } - /* For other groups, scan the branches. */ + /* For other groups, scan the branches. 
*/ - if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND) - { - BOOL empty_branch; - if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */ + if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND) { + BOOL empty_branch; + if (GET(code, 1) == 0) + return TRUE; /* Hit unclosed bracket */ - /* Scan a closed bracket */ + /* Scan a closed bracket */ - empty_branch = FALSE; - do - { - if (!empty_branch && could_be_empty_branch(code, endcode, utf8)) - empty_branch = TRUE; - code += GET(code, 1); - } - while (*code == OP_ALT); - if (!empty_branch) return FALSE; /* All branches are non-empty */ - c = *code; - continue; - } + empty_branch = FALSE; + do { + if (!empty_branch && could_be_empty_branch(code, endcode, utf8)) + empty_branch = TRUE; + code += GET(code, 1); + } while (*code == OP_ALT); + if (!empty_branch) + return FALSE; /* All branches are non-empty */ + c = *code; + continue; + } - /* Handle the other opcodes */ + /* Handle the other opcodes */ - switch (c) - { - /* Check for quantifiers after a class. XCLASS is used for classes that - cannot be represented just by a bit map. This includes negated single - high-valued characters. The length in _pcre_OP_lengths[] is zero; the - actual length is stored in the compiled code, so we must update "code" - here. */ + switch (c) { + /* Check for quantifiers after a class. XCLASS is used for classes + that cannot be represented just by a bit map. This includes negated + single high-valued characters. The length in _pcre_OP_lengths[] is + zero; the actual length is stored in the compiled code, so we must + update "code" here. 
*/ #ifdef SUPPORT_UTF8 - case OP_XCLASS: - ccode = code += GET(code, 1); - goto CHECK_CLASS_REPEAT; + case OP_XCLASS: + ccode = code += GET(code, 1); + goto CHECK_CLASS_REPEAT; #endif - case OP_CLASS: - case OP_NCLASS: - ccode = code + 33; + case OP_CLASS: + case OP_NCLASS: + ccode = code + 33; #ifdef SUPPORT_UTF8 - CHECK_CLASS_REPEAT: + CHECK_CLASS_REPEAT: #endif - switch (*ccode) - { - case OP_CRSTAR: /* These could be empty; continue */ - case OP_CRMINSTAR: - case OP_CRQUERY: - case OP_CRMINQUERY: - break; + switch (*ccode) { + case OP_CRSTAR: /* These could be empty; continue */ + case OP_CRMINSTAR: + case OP_CRQUERY: + case OP_CRMINQUERY: + break; - default: /* Non-repeat => class must match */ - case OP_CRPLUS: /* These repeats aren't empty */ - case OP_CRMINPLUS: - return FALSE; + default: /* Non-repeat => class must match */ + case OP_CRPLUS: /* These repeats aren't empty */ + case OP_CRMINPLUS: + return FALSE; - case OP_CRRANGE: - case OP_CRMINRANGE: - if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */ - break; - } - break; + case OP_CRRANGE: + case OP_CRMINRANGE: + if (GET2(ccode, 1) > 0) + return FALSE; /* Minimum > 0 */ + break; + } + break; - /* Opcodes that must match a character */ + /* Opcodes that must match a character */ - case OP_PROP: - case OP_NOTPROP: - case OP_EXTUNI: - case OP_NOT_DIGIT: - case OP_DIGIT: - case OP_NOT_WHITESPACE: - case OP_WHITESPACE: - case OP_NOT_WORDCHAR: - case OP_WORDCHAR: - case OP_ANY: - case OP_ANYBYTE: - case OP_CHAR: - case OP_CHARNC: - case OP_NOT: - case OP_PLUS: - case OP_MINPLUS: - case OP_POSPLUS: - case OP_EXACT: - case OP_NOTPLUS: - case OP_NOTMINPLUS: - case OP_NOTPOSPLUS: - case OP_NOTEXACT: - case OP_TYPEPLUS: - case OP_TYPEMINPLUS: - case OP_TYPEPOSPLUS: - case OP_TYPEEXACT: - return FALSE; + case OP_PROP: + case OP_NOTPROP: + case OP_EXTUNI: + case OP_NOT_DIGIT: + case OP_DIGIT: + case OP_NOT_WHITESPACE: + case OP_WHITESPACE: + case OP_NOT_WORDCHAR: + case OP_WORDCHAR: + case OP_ANY: + case 
OP_ANYBYTE: + case OP_CHAR: + case OP_CHARNC: + case OP_NOT: + case OP_PLUS: + case OP_MINPLUS: + case OP_POSPLUS: + case OP_EXACT: + case OP_NOTPLUS: + case OP_NOTMINPLUS: + case OP_NOTPOSPLUS: + case OP_NOTEXACT: + case OP_TYPEPLUS: + case OP_TYPEMINPLUS: + case OP_TYPEPOSPLUS: + case OP_TYPEEXACT: + return FALSE; - /* These are going to continue, as they may be empty, but we have to - fudge the length for the \p and \P cases. */ + /* These are going to continue, as they may be empty, but we + have to fudge the length for the \p and \P cases. */ - case OP_TYPESTAR: - case OP_TYPEMINSTAR: - case OP_TYPEPOSSTAR: - case OP_TYPEQUERY: - case OP_TYPEMINQUERY: - case OP_TYPEPOSQUERY: - if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; - break; + case OP_TYPESTAR: + case OP_TYPEMINSTAR: + case OP_TYPEPOSSTAR: + case OP_TYPEQUERY: + case OP_TYPEMINQUERY: + case OP_TYPEPOSQUERY: + if (code[1] == OP_PROP || code[1] == OP_NOTPROP) + code += 2; + break; - /* Same for these */ + /* Same for these */ - case OP_TYPEUPTO: - case OP_TYPEMINUPTO: - case OP_TYPEPOSUPTO: - if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2; - break; + case OP_TYPEUPTO: + case OP_TYPEMINUPTO: + case OP_TYPEPOSUPTO: + if (code[3] == OP_PROP || code[3] == OP_NOTPROP) + code += 2; + break; - /* End of branch */ + /* End of branch */ + + case OP_KET: + case OP_KETRMAX: + case OP_KETRMIN: + case OP_ALT: + return TRUE; + + /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, + POSQUERY, UPTO, MINUPTO, and POSUPTO may be followed by a + multibyte character */ + +#ifdef SUPPORT_UTF8 + case OP_STAR: + case OP_MINSTAR: + case OP_POSSTAR: + case OP_QUERY: + case OP_MINQUERY: + case OP_POSQUERY: + case OP_UPTO: + case OP_MINUPTO: + case OP_POSUPTO: + if (utf8) + while ((code[2] & 0xc0) == 0x80) + code++; + break; +#endif + } + } - case OP_KET: - case OP_KETRMAX: - case OP_KETRMIN: - case OP_ALT: return TRUE; - - /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO, - 
MINUPTO, and POSUPTO may be followed by a multibyte character */ - -#ifdef SUPPORT_UTF8 - case OP_STAR: - case OP_MINSTAR: - case OP_POSSTAR: - case OP_QUERY: - case OP_MINQUERY: - case OP_POSQUERY: - case OP_UPTO: - case OP_MINUPTO: - case OP_POSUPTO: - if (utf8) while ((code[2] & 0xc0) == 0x80) code++; - break; -#endif - } - } - -return TRUE; } - - /************************************************* -* Scan compiled regex for non-emptiness * -*************************************************/ + * Scan compiled regex for non-emptiness * + *************************************************/ /* This function is called to check for left recursive calls. We want to check the current branch of the current pattern to see if it could match the empty @@ -1677,23 +1862,21 @@ Arguments: Returns: TRUE if what is matched could be empty */ -static BOOL -could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr, - BOOL utf8) -{ -while (bcptr != NULL && bcptr->current >= code) - { - if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE; - bcptr = bcptr->outer; - } -return TRUE; +static BOOL could_be_empty(const uschar* code, + const uschar* endcode, + branch_chain* bcptr, + BOOL utf8) { + while (bcptr != NULL && bcptr->current >= code) { + if (!could_be_empty_branch(bcptr->current, endcode, utf8)) + return FALSE; + bcptr = bcptr->outer; + } + return TRUE; } - - /************************************************* -* Check for POSIX class syntax * -*************************************************/ + * Check for POSIX class syntax * + *************************************************/ /* This function is called when the sequence "[:" or "[." or "[=" is encountered in a character class. 
It checks whether this is followed by a @@ -1722,32 +1905,28 @@ Arguments: Returns: TRUE or FALSE */ -static BOOL -check_posix_syntax(const uschar *ptr, const uschar **endptr) -{ -int terminator; /* Don't combine these lines; the Solaris cc */ -terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */ -for (++ptr; *ptr != 0; ptr++) - { - if (*ptr == '\\' && ptr[1] == ']') ptr++; else - { - if (*ptr == ']') return FALSE; - if (*ptr == terminator && ptr[1] == ']') - { - *endptr = ptr; - return TRUE; - } +static BOOL check_posix_syntax(const uschar* ptr, const uschar** endptr) { + int terminator; /* Don't combine these lines; the Solaris cc */ + terminator = + *(++ptr); /* compiler warns about "non-constant" initializer. */ + for (++ptr; *ptr != 0; ptr++) { + if (*ptr == '\\' && ptr[1] == ']') + ptr++; + else { + if (*ptr == ']') + return FALSE; + if (*ptr == terminator && ptr[1] == ']') { + *endptr = ptr; + return TRUE; + } + } } - } -return FALSE; + return FALSE; } - - - /************************************************* -* Check POSIX class name * -*************************************************/ + * Check POSIX class name * + *************************************************/ /* This function is called to check the name given in a POSIX-style class entry such as [:alnum:]. 
@@ -1759,25 +1938,22 @@ Arguments: Returns: a value representing the name, or -1 if unknown */ -static int -check_posix_name(const uschar *ptr, int len) -{ -const char *pn = posix_names; -register int yield = 0; -while (posix_name_lengths[yield] != 0) - { - if (len == posix_name_lengths[yield] && - strncmp((const char *)ptr, pn, len) == 0) return yield; - pn += posix_name_lengths[yield] + 1; - yield++; - } -return -1; +static int check_posix_name(const uschar* ptr, int len) { + const char* pn = posix_names; + register int yield = 0; + while (posix_name_lengths[yield] != 0) { + if (len == posix_name_lengths[yield] && + strncmp((const char*)ptr, pn, len) == 0) + return yield; + pn += posix_name_lengths[yield] + 1; + yield++; + } + return -1; } - /************************************************* -* Adjust OP_RECURSE items in repeated group * -*************************************************/ + * Adjust OP_RECURSE items in repeated group * + *************************************************/ /* OP_RECURSE items contain an offset from the start of the regex to the group that is referenced. This means that groups can be replicated for fixed @@ -1805,48 +1981,44 @@ Arguments: Returns: nothing */ -static void -adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd, - uschar *save_hwm) -{ -uschar *ptr = group; +static void adjust_recurse(uschar* group, + int adjust, + BOOL utf8, + compile_data* cd, + uschar* save_hwm) { + uschar* ptr = group; -while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL) - { - int offset; - uschar *hc; + while ((ptr = (uschar*)find_recurse(ptr, utf8)) != NULL) { + int offset; + uschar* hc; - /* See if this recursion is on the forward reference list. If so, adjust the - reference. */ + /* See if this recursion is on the forward reference list. If so, adjust + the reference. 
*/ - for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE) - { - offset = GET(hc, 0); - if (cd->start_code + offset == ptr + 1) - { - PUT(hc, 0, offset + adjust); - break; - } + for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE) { + offset = GET(hc, 0); + if (cd->start_code + offset == ptr + 1) { + PUT(hc, 0, offset + adjust); + break; + } + } + + /* Otherwise, adjust the recursion offset if it's after the start of + this group. */ + + if (hc >= cd->hwm) { + offset = GET(ptr, 1); + if (cd->start_code + offset >= group) + PUT(ptr, 1, offset + adjust); + } + + ptr += 1 + LINK_SIZE; } - - /* Otherwise, adjust the recursion offset if it's after the start of this - group. */ - - if (hc >= cd->hwm) - { - offset = GET(ptr, 1); - if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust); - } - - ptr += 1 + LINK_SIZE; - } } - - /************************************************* -* Insert an automatic callout point * -*************************************************/ + * Insert an automatic callout point * + *************************************************/ /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert callout points before each pattern item. 
@@ -1859,21 +2031,17 @@ Arguments: Returns: new code pointer */ -static uschar * -auto_callout(uschar *code, const uschar *ptr, compile_data *cd) -{ -*code++ = OP_CALLOUT; -*code++ = 255; -PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */ -PUT(code, LINK_SIZE, 0); /* Default length */ -return code + 2*LINK_SIZE; +static uschar* auto_callout(uschar* code, const uschar* ptr, compile_data* cd) { + *code++ = OP_CALLOUT; + *code++ = 255; + PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */ + PUT(code, LINK_SIZE, 0); /* Default length */ + return code + 2 * LINK_SIZE; } - - /************************************************* -* Complete a callout item * -*************************************************/ + * Complete a callout item * + *************************************************/ /* A callout item contains the length of the next item in the pattern, which we can't fill in till after we have reached the relevant point. This is used @@ -1887,19 +2055,17 @@ Arguments: Returns: nothing */ -static void -complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd) -{ -int length = ptr - cd->start_pattern - GET(previous_callout, 2); -PUT(previous_callout, 2 + LINK_SIZE, length); +static void complete_callout(uschar* previous_callout, + const uschar* ptr, + compile_data* cd) { + int length = ptr - cd->start_pattern - GET(previous_callout, 2); + PUT(previous_callout, 2 + LINK_SIZE, length); } - - #ifdef SUPPORT_UCP /************************************************* -* Get othercase range * -*************************************************/ + * Get othercase range * + *************************************************/ /* This function is passed the start and end of a class range, in UTF-8 mode with UCP support. 
It searches up the characters, looking for internal ranges of @@ -1915,38 +2081,39 @@ Arguments: Yield: TRUE when range returned; FALSE when no more */ -static BOOL -get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr, - unsigned int *odptr) -{ -unsigned int c, othercase, next; +static BOOL get_othercase_range(unsigned int* cptr, + unsigned int d, + unsigned int* ocptr, + unsigned int* odptr) { + unsigned int c, othercase, next; -for (c = *cptr; c <= d; c++) - { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; } + for (c = *cptr; c <= d; c++) { + if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) + break; + } -if (c > d) return FALSE; + if (c > d) + return FALSE; -*ocptr = othercase; -next = othercase + 1; + *ocptr = othercase; + next = othercase + 1; -for (++c; c <= d; c++) - { - if (_pcre_ucp_othercase(c) != next) break; - next++; - } + for (++c; c <= d; c++) { + if (_pcre_ucp_othercase(c) != next) + break; + next++; + } -*odptr = next - 1; -*cptr = c; + *odptr = next - 1; + *cptr = c; -return TRUE; + return TRUE; } -#endif /* SUPPORT_UCP */ - - +#endif /* SUPPORT_UCP */ /************************************************* -* Check if auto-possessifying is possible * -*************************************************/ + * Check if auto-possessifying is possible * + *************************************************/ /* This function is called for unlimited repeats of certain items, to see whether the next thing could possibly match the repeated item. 
If not, it makes @@ -1964,321 +2131,330 @@ Arguments: Returns: TRUE if possessifying is wanted */ -static BOOL -check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char, - const uschar *ptr, int options, compile_data *cd) -{ -int next; +static BOOL check_auto_possessive(int op_code, + int item, + BOOL utf8, + uschar* utf8_char, + const uschar* ptr, + int options, + compile_data* cd) { + int next; -/* Skip whitespace and comments in extended mode */ + /* Skip whitespace and comments in extended mode */ -if ((options & PCRE_EXTENDED) != 0) - { - for (;;) - { - while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++; - if (*ptr == '#') - { - while (*(++ptr) != 0) - if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; } - } - else break; + if ((options & PCRE_EXTENDED) != 0) { + for (;;) { + while ((cd->ctypes[*ptr] & ctype_space) != 0) + ptr++; + if (*ptr == '#') { + while (*(++ptr) != 0) + if (IS_NEWLINE(ptr)) { + ptr += cd->nllen; + break; + } + } else + break; + } } - } -/* If the next item is one that we can handle, get its value. A non-negative -value is a character, a negative value is an escape value. */ + /* If the next item is one that we can handle, get its value. A non-negative + value is a character, a negative value is an escape value. 
*/ -if (*ptr == '\\') - { - int temperrorcode = 0; - next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE); - if (temperrorcode != 0) return FALSE; - ptr++; /* Point after the escape sequence */ - } - -else if ((cd->ctypes[*ptr] & ctype_meta) == 0) - { -#ifdef SUPPORT_UTF8 - if (utf8) { GETCHARINC(next, ptr); } else -#endif - next = *ptr++; - } - -else return FALSE; - -/* Skip whitespace and comments in extended mode */ - -if ((options & PCRE_EXTENDED) != 0) - { - for (;;) - { - while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++; - if (*ptr == '#') - { - while (*(++ptr) != 0) - if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; } - } - else break; + if (*ptr == '\\') { + int temperrorcode = 0; + next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE); + if (temperrorcode != 0) + return FALSE; + ptr++; /* Point after the escape sequence */ } - } -/* If the next thing is itself optional, we have to give up. */ - -if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0) - return FALSE; - -/* Now compare the next item with the previous opcode. If the previous is a -positive single character match, "item" either contains the character or, if -"item" is greater than 127 in utf8 mode, the character's bytes are in -utf8_char. */ - - -/* Handle cases when the next item is a character. */ - -if (next >= 0) switch(op_code) - { - case OP_CHAR: + else if ((cd->ctypes[*ptr] & ctype_meta) == 0) { #ifdef SUPPORT_UTF8 - if (utf8 && item > 127) { GETCHAR(item, utf8_char); } + if (utf8) { + GETCHARINC(next, ptr); + } else #endif - return item != next; + next = *ptr++; + } - /* For CHARNC (caseless character) we must check the other case. If we have - Unicode property support, we can use it to test the other case of - high-valued characters. 
*/ + else + return FALSE; - case OP_CHARNC: + /* Skip whitespace and comments in extended mode */ + + if ((options & PCRE_EXTENDED) != 0) { + for (;;) { + while ((cd->ctypes[*ptr] & ctype_space) != 0) + ptr++; + if (*ptr == '#') { + while (*(++ptr) != 0) + if (IS_NEWLINE(ptr)) { + ptr += cd->nllen; + break; + } + } else + break; + } + } + + /* If the next thing is itself optional, we have to give up. */ + + if (*ptr == '*' || *ptr == '?' || strncmp((char*)ptr, "{0,", 3) == 0) + return FALSE; + + /* Now compare the next item with the previous opcode. If the previous is a + positive single character match, "item" either contains the character or, if + "item" is greater than 127 in utf8 mode, the character's bytes are in + utf8_char. */ + + /* Handle cases when the next item is a character. */ + + if (next >= 0) + switch (op_code) { + case OP_CHAR: #ifdef SUPPORT_UTF8 - if (utf8 && item > 127) { GETCHAR(item, utf8_char); } + if (utf8 && item > 127) { + GETCHAR(item, utf8_char); + } #endif - if (item == next) return FALSE; + return item != next; + + /* For CHARNC (caseless character) we must check the other case. + If we have Unicode property support, we can use it to test the + other case of high-valued characters. 
*/ + + case OP_CHARNC: #ifdef SUPPORT_UTF8 - if (utf8) - { - unsigned int othercase; - if (next < 128) othercase = cd->fcc[next]; else + if (utf8 && item > 127) { + GETCHAR(item, utf8_char); + } +#endif + if (item == next) + return FALSE; +#ifdef SUPPORT_UTF8 + if (utf8) { + unsigned int othercase; + if (next < 128) + othercase = cd->fcc[next]; + else #ifdef SUPPORT_UCP - othercase = _pcre_ucp_othercase((unsigned int)next); + othercase = _pcre_ucp_othercase((unsigned int)next); #else - othercase = NOTACHAR; + othercase = NOTACHAR; #endif - return (unsigned int)item != othercase; - } - else -#endif /* SUPPORT_UTF8 */ - return (item != cd->fcc[next]); /* Non-UTF-8 mode */ + return (unsigned int)item != othercase; + } else +#endif /* SUPPORT_UTF8 */ + return (item != cd->fcc[next]); /* Non-UTF-8 mode */ - /* For OP_NOT, "item" must be a single-byte character. */ + /* For OP_NOT, "item" must be a single-byte character. */ - case OP_NOT: - if (next < 0) return FALSE; /* Not a character */ - if (item == next) return TRUE; - if ((options & PCRE_CASELESS) == 0) return FALSE; + case OP_NOT: + if (next < 0) + return FALSE; /* Not a character */ + if (item == next) + return TRUE; + if ((options & PCRE_CASELESS) == 0) + return FALSE; #ifdef SUPPORT_UTF8 - if (utf8) - { - unsigned int othercase; - if (next < 128) othercase = cd->fcc[next]; else + if (utf8) { + unsigned int othercase; + if (next < 128) + othercase = cd->fcc[next]; + else #ifdef SUPPORT_UCP - othercase = _pcre_ucp_othercase(next); + othercase = _pcre_ucp_othercase(next); #else - othercase = NOTACHAR; + othercase = NOTACHAR; #endif - return (unsigned int)item == othercase; - } - else -#endif /* SUPPORT_UTF8 */ - return (item == cd->fcc[next]); /* Non-UTF-8 mode */ + return (unsigned int)item == othercase; + } else +#endif /* SUPPORT_UTF8 */ + return (item == cd->fcc[next]); /* Non-UTF-8 mode */ - case OP_DIGIT: - return next > 127 || (cd->ctypes[next] & ctype_digit) == 0; + case OP_DIGIT: + return next > 127 || 
(cd->ctypes[next] & ctype_digit) == 0; - case OP_NOT_DIGIT: - return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0; + case OP_NOT_DIGIT: + return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0; - case OP_WHITESPACE: - return next > 127 || (cd->ctypes[next] & ctype_space) == 0; + case OP_WHITESPACE: + return next > 127 || (cd->ctypes[next] & ctype_space) == 0; - case OP_NOT_WHITESPACE: - return next <= 127 && (cd->ctypes[next] & ctype_space) != 0; + case OP_NOT_WHITESPACE: + return next <= 127 && (cd->ctypes[next] & ctype_space) != 0; - case OP_WORDCHAR: - return next > 127 || (cd->ctypes[next] & ctype_word) == 0; + case OP_WORDCHAR: + return next > 127 || (cd->ctypes[next] & ctype_word) == 0; - case OP_NOT_WORDCHAR: - return next <= 127 && (cd->ctypes[next] & ctype_word) != 0; + case OP_NOT_WORDCHAR: + return next <= 127 && (cd->ctypes[next] & ctype_word) != 0; - case OP_HSPACE: - case OP_NOT_HSPACE: - switch(next) - { - case 0x09: - case 0x20: - case 0xa0: - case 0x1680: - case 0x180e: - case 0x2000: - case 0x2001: - case 0x2002: - case 0x2003: - case 0x2004: - case 0x2005: - case 0x2006: - case 0x2007: - case 0x2008: - case 0x2009: - case 0x200A: - case 0x202f: - case 0x205f: - case 0x3000: - return op_code != OP_HSPACE; - default: - return op_code == OP_HSPACE; - } + case OP_HSPACE: + case OP_NOT_HSPACE: + switch (next) { + case 0x09: + case 0x20: + case 0xa0: + case 0x1680: + case 0x180e: + case 0x2000: + case 0x2001: + case 0x2002: + case 0x2003: + case 0x2004: + case 0x2005: + case 0x2006: + case 0x2007: + case 0x2008: + case 0x2009: + case 0x200A: + case 0x202f: + case 0x205f: + case 0x3000: + return op_code != OP_HSPACE; + default: + return op_code == OP_HSPACE; + } - case OP_VSPACE: - case OP_NOT_VSPACE: - switch(next) - { - case 0x0a: - case 0x0b: - case 0x0c: - case 0x0d: - case 0x85: - case 0x2028: - case 0x2029: - return op_code != OP_VSPACE; - default: - return op_code == OP_VSPACE; - } + case OP_VSPACE: + case OP_NOT_VSPACE: + switch 
(next) { + case 0x0a: + case 0x0b: + case 0x0c: + case 0x0d: + case 0x85: + case 0x2028: + case 0x2029: + return op_code != OP_VSPACE; + default: + return op_code == OP_VSPACE; + } - default: - return FALSE; - } + default: + return FALSE; + } + /* Handle the case when the next item is \d, \s, etc. */ -/* Handle the case when the next item is \d, \s, etc. */ - -switch(op_code) - { - case OP_CHAR: - case OP_CHARNC: + switch (op_code) { + case OP_CHAR: + case OP_CHARNC: #ifdef SUPPORT_UTF8 - if (utf8 && item > 127) { GETCHAR(item, utf8_char); } + if (utf8 && item > 127) { + GETCHAR(item, utf8_char); + } #endif - switch(-next) - { - case ESC_d: - return item > 127 || (cd->ctypes[item] & ctype_digit) == 0; + switch (-next) { + case ESC_d: + return item > 127 || (cd->ctypes[item] & ctype_digit) == 0; - case ESC_D: - return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0; + case ESC_D: + return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0; - case ESC_s: - return item > 127 || (cd->ctypes[item] & ctype_space) == 0; + case ESC_s: + return item > 127 || (cd->ctypes[item] & ctype_space) == 0; - case ESC_S: - return item <= 127 && (cd->ctypes[item] & ctype_space) != 0; + case ESC_S: + return item <= 127 && (cd->ctypes[item] & ctype_space) != 0; - case ESC_w: - return item > 127 || (cd->ctypes[item] & ctype_word) == 0; + case ESC_w: + return item > 127 || (cd->ctypes[item] & ctype_word) == 0; - case ESC_W: - return item <= 127 && (cd->ctypes[item] & ctype_word) != 0; + case ESC_W: + return item <= 127 && (cd->ctypes[item] & ctype_word) != 0; - case ESC_h: - case ESC_H: - switch(item) - { - case 0x09: - case 0x20: - case 0xa0: - case 0x1680: - case 0x180e: - case 0x2000: - case 0x2001: - case 0x2002: - case 0x2003: - case 0x2004: - case 0x2005: - case 0x2006: - case 0x2007: - case 0x2008: - case 0x2009: - case 0x200A: - case 0x202f: - case 0x205f: - case 0x3000: - return -next != ESC_h; - default: - return -next == ESC_h; - } + case ESC_h: + case ESC_H: + switch 
(item) { + case 0x09: + case 0x20: + case 0xa0: + case 0x1680: + case 0x180e: + case 0x2000: + case 0x2001: + case 0x2002: + case 0x2003: + case 0x2004: + case 0x2005: + case 0x2006: + case 0x2007: + case 0x2008: + case 0x2009: + case 0x200A: + case 0x202f: + case 0x205f: + case 0x3000: + return -next != ESC_h; + default: + return -next == ESC_h; + } - case ESC_v: - case ESC_V: - switch(item) - { - case 0x0a: - case 0x0b: - case 0x0c: - case 0x0d: - case 0x85: - case 0x2028: - case 0x2029: - return -next != ESC_v; - default: - return -next == ESC_v; - } + case ESC_v: + case ESC_V: + switch (item) { + case 0x0a: + case 0x0b: + case 0x0c: + case 0x0d: + case 0x85: + case 0x2028: + case 0x2029: + return -next != ESC_v; + default: + return -next == ESC_v; + } - default: - return FALSE; + default: + return FALSE; + } + + case OP_DIGIT: + return next == -ESC_D || next == -ESC_s || next == -ESC_W || + next == -ESC_h || next == -ESC_v; + + case OP_NOT_DIGIT: + return next == -ESC_d; + + case OP_WHITESPACE: + return next == -ESC_S || next == -ESC_d || next == -ESC_w; + + case OP_NOT_WHITESPACE: + return next == -ESC_s || next == -ESC_h || next == -ESC_v; + + case OP_HSPACE: + return next == -ESC_S || next == -ESC_H || next == -ESC_d || + next == -ESC_w; + + case OP_NOT_HSPACE: + return next == -ESC_h; + + /* Can't have \S in here because VT matches \S (Perl anomaly) */ + case OP_VSPACE: + return next == -ESC_V || next == -ESC_d || next == -ESC_w; + + case OP_NOT_VSPACE: + return next == -ESC_v; + + case OP_WORDCHAR: + return next == -ESC_W || next == -ESC_s || next == -ESC_h || + next == -ESC_v; + + case OP_NOT_WORDCHAR: + return next == -ESC_w || next == -ESC_d; + + default: + return FALSE; } - case OP_DIGIT: - return next == -ESC_D || next == -ESC_s || next == -ESC_W || - next == -ESC_h || next == -ESC_v; - - case OP_NOT_DIGIT: - return next == -ESC_d; - - case OP_WHITESPACE: - return next == -ESC_S || next == -ESC_d || next == -ESC_w; - - case OP_NOT_WHITESPACE: - return 
next == -ESC_s || next == -ESC_h || next == -ESC_v; - - case OP_HSPACE: - return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w; - - case OP_NOT_HSPACE: - return next == -ESC_h; - - /* Can't have \S in here because VT matches \S (Perl anomaly) */ - case OP_VSPACE: - return next == -ESC_V || next == -ESC_d || next == -ESC_w; - - case OP_NOT_VSPACE: - return next == -ESC_v; - - case OP_WORDCHAR: - return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v; - - case OP_NOT_WORDCHAR: - return next == -ESC_w || next == -ESC_d; - - default: - return FALSE; - } - -/* Control does not reach here */ + /* Control does not reach here */ } - - /************************************************* -* Compile one branch * -*************************************************/ + * Compile one branch * + *************************************************/ /* Scan the pattern, compiling it into the a vector. If the options are changed during the branch, the pointer is used to change the external options @@ -2302,967 +2478,1074 @@ Returns: TRUE on success FALSE, with *errorcodeptr set non-zero on error */ -static BOOL -compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr, - int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, - compile_data *cd, int *lengthptr) -{ -int repeat_type, op_type; -int repeat_min = 0, repeat_max = 0; /* To please picky compilers */ -int bravalue = 0; -int greedy_default, greedy_non_default; -int firstbyte, reqbyte; -int zeroreqbyte, zerofirstbyte; -int req_caseopt, reqvary, tempreqvary; -int options = *optionsptr; -int after_manual_callout = 0; -int length_prevgroup = 0; -register int c; -register uschar *code = *codeptr; -uschar *last_code = code; -uschar *orig_code = code; -uschar *tempcode; -BOOL inescq = FALSE; -BOOL groupsetfirstbyte = FALSE; -const uschar *ptr = *ptrptr; -const uschar *tempptr; -uschar *previous = NULL; -uschar *previous_callout = NULL; -uschar *save_hwm = 
NULL; -uschar classbits[32]; +static BOOL compile_branch(int* optionsptr, + uschar** codeptr, + const uschar** ptrptr, + int* errorcodeptr, + int* firstbyteptr, + int* reqbyteptr, + branch_chain* bcptr, + compile_data* cd, + int* lengthptr) { + int repeat_type, op_type; + int repeat_min = 0, repeat_max = 0; /* To please picky compilers */ + int bravalue = 0; + int greedy_default, greedy_non_default; + int firstbyte, reqbyte; + int zeroreqbyte, zerofirstbyte; + int req_caseopt, reqvary, tempreqvary; + int options = *optionsptr; + int after_manual_callout = 0; + int length_prevgroup = 0; + register int c; + register uschar* code = *codeptr; + uschar* last_code = code; + uschar* orig_code = code; + uschar* tempcode; + BOOL inescq = FALSE; + BOOL groupsetfirstbyte = FALSE; + const uschar* ptr = *ptrptr; + const uschar* tempptr; + uschar* previous = NULL; + uschar* previous_callout = NULL; + uschar* save_hwm = NULL; + uschar classbits[32]; #ifdef SUPPORT_UTF8 -BOOL class_utf8; -BOOL utf8 = (options & PCRE_UTF8) != 0; -uschar *class_utf8data; -uschar *class_utf8data_base; -uschar utf8_char[6]; + BOOL class_utf8; + BOOL utf8 = (options & PCRE_UTF8) != 0; + uschar* class_utf8data; + uschar* class_utf8data_base; + uschar utf8_char[6]; #else -BOOL utf8 = FALSE; -uschar *utf8_char = NULL; + BOOL utf8 = FALSE; + uschar* utf8_char = NULL; #endif #ifdef DEBUG -if (lengthptr != NULL) DPRINTF((">> start branch\n")); -#endif - -/* Set up the default and non-default settings for greediness */ - -greedy_default = ((options & PCRE_UNGREEDY) != 0); -greedy_non_default = greedy_default ^ 1; - -/* Initialize no first byte, no required byte. REQ_UNSET means "no char -matching encountered yet". It gets changed to REQ_NONE if we hit something that -matches a non-fixed char first char; reqbyte just remains unset if we never -find one. - -When we hit a repeat whose minimum is zero, we may have to adjust these values -to take the zero repeat into account. 
This is implemented by setting them to -zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual -item types that can be repeated set these backoff variables appropriately. */ - -firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET; - -/* The variable req_caseopt contains either the REQ_CASELESS value or zero, -according to the current setting of the caseless flag. REQ_CASELESS is a bit -value > 255. It is added into the firstbyte or reqbyte variables to record the -case status of the value. This is used only for ASCII characters. */ - -req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0; - -/* Switch on next character until the end of the branch */ - -for (;; ptr++) - { - BOOL negate_class; - BOOL should_flip_negation; - BOOL possessive_quantifier; - BOOL is_quantifier; - BOOL is_recurse; - BOOL reset_bracount; - int class_charcount; - int class_lastchar; - int newoptions; - int recno; - int refsign; - int skipbytes; - int subreqbyte; - int subfirstbyte; - int terminator; - int mclength; - uschar mcbuffer[8]; - - /* Get next byte in the pattern */ - - c = *ptr; - - /* If we are in the pre-compile phase, accumulate the length used for the - previous cycle of this loop. */ - - if (lengthptr != NULL) - { -#ifdef DEBUG - if (code > cd->hwm) cd->hwm = code; /* High water info */ -#endif - if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */ - { - *errorcodeptr = ERR52; - goto FAILED; - } - - /* There is at least one situation where code goes backwards: this is the - case of a zero quantifier after a class (e.g. [ab]{0}). At compile time, - the class is simply eliminated. However, it is created first, so we have to - allow memory for it. Therefore, don't ever reduce the length at this point. 
- */ - - if (code < last_code) code = last_code; - - /* Paranoid check for integer overflow */ - - if (OFLOW_MAX - *lengthptr < code - last_code) - { - *errorcodeptr = ERR20; - goto FAILED; - } - - *lengthptr += code - last_code; - DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c)); - - /* If "previous" is set and it is not at the start of the work space, move - it back to there, in order to avoid filling up the work space. Otherwise, - if "previous" is NULL, reset the current code pointer to the start. */ - - if (previous != NULL) - { - if (previous > orig_code) - { - memmove(orig_code, previous, code - previous); - code -= previous - orig_code; - previous = orig_code; - } - } - else code = orig_code; - - /* Remember where this code item starts so we can pick up the length - next time round. */ - - last_code = code; - } - - /* In the real compile phase, just check the workspace used by the forward - reference list. */ - - else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE) - { - *errorcodeptr = ERR52; - goto FAILED; - } - - /* If in \Q...\E, check for the end; if not, we have a literal */ - - if (inescq && c != 0) - { - if (c == '\\' && ptr[1] == 'E') - { - inescq = FALSE; - ptr++; - continue; - } - else - { - if (previous_callout != NULL) - { - if (lengthptr == NULL) /* Don't attempt in pre-compile phase */ - complete_callout(previous_callout, ptr, cd); - previous_callout = NULL; - } - if ((options & PCRE_AUTO_CALLOUT) != 0) - { - previous_callout = code; - code = auto_callout(code, ptr, cd); - } - goto NORMAL_CHAR; - } - } - - /* Fill in length of a previous callout, except when the next thing is - a quantifier. */ - - is_quantifier = c == '*' || c == '+' || c == '?' 
|| - (c == '{' && is_counted_repeat(ptr+1)); - - if (!is_quantifier && previous_callout != NULL && - after_manual_callout-- <= 0) - { - if (lengthptr == NULL) /* Don't attempt in pre-compile phase */ - complete_callout(previous_callout, ptr, cd); - previous_callout = NULL; - } - - /* In extended mode, skip white space and comments */ - - if ((options & PCRE_EXTENDED) != 0) - { - if ((cd->ctypes[c] & ctype_space) != 0) continue; - if (c == '#') - { - while (*(++ptr) != 0) - { - if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; } - } - if (*ptr != 0) continue; - - /* Else fall through to handle end of string */ - c = 0; - } - } - - /* No auto callout for quantifiers. */ - - if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier) - { - previous_callout = code; - code = auto_callout(code, ptr, cd); - } - - switch(c) - { - /* ===================================================================*/ - case 0: /* The branch terminates at string end */ - case '|': /* or | or ) */ - case ')': - *firstbyteptr = firstbyte; - *reqbyteptr = reqbyte; - *codeptr = code; - *ptrptr = ptr; if (lengthptr != NULL) - { - if (OFLOW_MAX - *lengthptr < code - last_code) - { - *errorcodeptr = ERR20; - goto FAILED; - } - *lengthptr += code - last_code; /* To include callout length */ - DPRINTF((">> end branch\n")); - } - return TRUE; - - - /* ===================================================================*/ - /* Handle single-character metacharacters. In multiline mode, ^ disables - the setting of any following char as a first character. */ - - case '^': - if ((options & PCRE_MULTILINE) != 0) - { - if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; - } - previous = NULL; - *code++ = OP_CIRC; - break; - - case '$': - previous = NULL; - *code++ = OP_DOLL; - break; - - /* There can never be a first char if '.' is first, whatever happens about - repeats. The value of reqbyte doesn't change either. 
*/ - - case '.': - if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; - zerofirstbyte = firstbyte; - zeroreqbyte = reqbyte; - previous = code; - *code++ = OP_ANY; - break; - - - /* ===================================================================*/ - /* Character classes. If the included characters are all < 256, we build a - 32-byte bitmap of the permitted characters, except in the special case - where there is only one such character. For negated classes, we build the - map as usual, then invert it at the end. However, we use a different opcode - so that data characters > 255 can be handled correctly. - - If the class contains characters outside the 0-255 range, a different - opcode is compiled. It may optionally have a bit map for characters < 256, - but those above are are explicitly listed afterwards. A flag byte tells - whether the bitmap is present, and whether this is a negated class or not. - */ - - case '[': - previous = code; - - /* PCRE supports POSIX class stuff inside a class. Perl gives an error if - they are encountered at the top level, so we'll do that too. */ - - if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') && - check_posix_syntax(ptr, &tempptr)) - { - *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31; - goto FAILED; - } - - /* If the first character is '^', set the negation flag and skip it. Also, - if the first few characters (either before or after ^) are \Q\E or \E we - skip them too. This makes for compatibility with Perl. */ - - negate_class = FALSE; - for (;;) - { - c = *(++ptr); - if (c == '\\') - { - if (ptr[1] == 'E') ptr++; - else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3; - else break; - } - else if (!negate_class && c == '^') - negate_class = TRUE; - else break; - } - - /* If a class contains a negative special such as \S, we need to flip the - negation flag at the end, so that support for characters > 255 works - correctly (they are all included in the class). 
*/ - - should_flip_negation = FALSE; - - /* Keep a count of chars with values < 256 so that we can optimize the case - of just a single character (as long as it's < 256). However, For higher - valued UTF-8 characters, we don't yet do any optimization. */ - - class_charcount = 0; - class_lastchar = -1; - - /* Initialize the 32-char bit map to all zeros. We build the map in a - temporary bit of memory, in case the class contains only 1 character (less - than 256), because in that case the compiled code doesn't use the bit map. - */ - - memset(classbits, 0, 32 * sizeof(uschar)); - -#ifdef SUPPORT_UTF8 - class_utf8 = FALSE; /* No chars >= 256 */ - class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */ - class_utf8data_base = class_utf8data; /* For resetting in pass 1 */ + DPRINTF((">> start branch\n")); #endif - /* Process characters until ] is reached. By writing this as a "do" it - means that an initial ] is taken as a data character. At the start of the - loop, c contains the first byte of the character. */ + /* Set up the default and non-default settings for greediness */ - if (c != 0) do - { - const uschar *oldptr; + greedy_default = ((options & PCRE_UNGREEDY) != 0); + greedy_non_default = greedy_default ^ 1; -#ifdef SUPPORT_UTF8 - if (utf8 && c > 127) - { /* Braces are required because the */ - GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */ - } + /* Initialize no first byte, no required byte. REQ_UNSET means "no char + matching encountered yet". It gets changed to REQ_NONE if we hit something + that matches a non-fixed char first char; reqbyte just remains unset if we + never find one. - /* In the pre-compile phase, accumulate the length of any UTF-8 extra - data and reset the pointer. This is so that very large classes that - contain a zillion UTF-8 characters no longer overwrite the work space - (which is on the stack). 
*/ + When we hit a repeat whose minimum is zero, we may have to adjust these + values to take the zero repeat into account. This is implemented by setting + them to zerofirstbyte and zeroreqbyte when such a repeat is encountered. The + individual item types that can be repeated set these backoff variables + appropriately. */ - if (lengthptr != NULL) - { - *lengthptr += class_utf8data - class_utf8data_base; - class_utf8data = class_utf8data_base; - } + firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET; + /* The variable req_caseopt contains either the REQ_CASELESS value or zero, + according to the current setting of the caseless flag. REQ_CASELESS is a bit + value > 255. It is added into the firstbyte or reqbyte variables to record + the case status of the value. This is used only for ASCII characters. */ + + req_caseopt = ((options & PCRE_CASELESS) != 0) ? REQ_CASELESS : 0; + + /* Switch on next character until the end of the branch */ + + for (;; ptr++) { + BOOL negate_class; + BOOL should_flip_negation; + BOOL possessive_quantifier; + BOOL is_quantifier; + BOOL is_recurse; + BOOL reset_bracount; + int class_charcount; + int class_lastchar; + int newoptions; + int recno; + int refsign; + int skipbytes; + int subreqbyte; + int subfirstbyte; + int terminator; + int mclength; + uschar mcbuffer[8]; + + /* Get next byte in the pattern */ + + c = *ptr; + + /* If we are in the pre-compile phase, accumulate the length used for + the previous cycle of this loop. */ + + if (lengthptr != NULL) { +#ifdef DEBUG + if (code > cd->hwm) + cd->hwm = code; /* High water info */ #endif - - /* Inside \Q...\E everything is literal except \E */ - - if (inescq) - { - if (c == '\\' && ptr[1] == 'E') /* If we are at \E */ - { - inescq = FALSE; /* Reset literal state */ - ptr++; /* Skip the 'E' */ - continue; /* Carry on with next */ - } - goto CHECK_RANGE; /* Could be range if \E follows */ - } - - /* Handle POSIX class names. 
Perl allows a negation extension of the - form [:^name:]. A square bracket that doesn't match the syntax is - treated as a literal. We also recognize the POSIX constructions - [.ch.] and [=ch=] ("collating elements") and fault them, as Perl - 5.6 and 5.8 do. */ - - if (c == '[' && - (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') && - check_posix_syntax(ptr, &tempptr)) - { - BOOL local_negate = FALSE; - int posix_class, taboffset, tabopt; - register const uschar *cbits = cd->cbits; - uschar pbits[32]; - - if (ptr[1] != ':') - { - *errorcodeptr = ERR31; - goto FAILED; - } - - ptr += 2; - if (*ptr == '^') - { - local_negate = TRUE; - should_flip_negation = TRUE; /* Note negative special */ - ptr++; - } - - posix_class = check_posix_name(ptr, tempptr - ptr); - if (posix_class < 0) - { - *errorcodeptr = ERR30; - goto FAILED; - } - - /* If matching is caseless, upper and lower are converted to - alpha. This relies on the fact that the class table starts with - alpha, lower, upper as the first 3 entries. */ - - if ((options & PCRE_CASELESS) != 0 && posix_class <= 2) - posix_class = 0; - - /* We build the bit map for the POSIX class in a chunk of local store - because we may be adding and subtracting from it, and we don't want to - subtract bits that may be in the main map already. At the end we or the - result into the bit map that is being built. */ - - posix_class *= 3; - - /* Copy in the first table (always present) */ - - memcpy(pbits, cbits + posix_class_maps[posix_class], - 32 * sizeof(uschar)); - - /* If there is a second table, add or remove it as required. */ - - taboffset = posix_class_maps[posix_class + 1]; - tabopt = posix_class_maps[posix_class + 2]; - - if (taboffset >= 0) - { - if (tabopt >= 0) - for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset]; - else - for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset]; - } - - /* Not see if we need to remove any special characters. 
An option - value of 1 removes vertical space and 2 removes underscore. */ - - if (tabopt < 0) tabopt = -tabopt; - if (tabopt == 1) pbits[1] &= ~0x3c; - else if (tabopt == 2) pbits[11] &= 0x7f; - - /* Add the POSIX table or its complement into the main table that is - being built and we are done. */ - - if (local_negate) - for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c]; - else - for (c = 0; c < 32; c++) classbits[c] |= pbits[c]; - - ptr = tempptr + 1; - class_charcount = 10; /* Set > 1; assumes more than 1 per class */ - continue; /* End of POSIX syntax handling */ - } - - /* Backslash may introduce a single character, or it may introduce one - of the specials, which just set a flag. The sequence \b is a special - case. Inside a class (and only there) it is treated as backspace. - Elsewhere it marks a word boundary. Other escapes have preset maps ready - to 'or' into the one we are building. We assume they have more than one - character in them, so set class_charcount bigger than one. */ - - if (c == '\\') - { - c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE); - if (*errorcodeptr != 0) goto FAILED; - - if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */ - else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */ - else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */ - else if (-c == ESC_Q) /* Handle start of quoted string */ - { - if (ptr[1] == '\\' && ptr[2] == 'E') + if (code > + cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */ { - ptr += 2; /* avoid empty string */ - } - else inescq = TRUE; - continue; - } - else if (-c == ESC_E) continue; /* Ignore orphan \E */ - - if (c < 0) - { - register const uschar *cbits = cd->cbits; - class_charcount += 2; /* Greater than 1 is what matters */ - - /* Save time by not doing this in the pre-compile phase. 
*/ - - if (lengthptr == NULL) switch (-c) - { - case ESC_d: - for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit]; - continue; - - case ESC_D: - should_flip_negation = TRUE; - for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit]; - continue; - - case ESC_w: - for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word]; - continue; - - case ESC_W: - should_flip_negation = TRUE; - for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word]; - continue; - - case ESC_s: - for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space]; - classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */ - continue; - - case ESC_S: - should_flip_negation = TRUE; - for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space]; - classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */ - continue; - - default: /* Not recognized; fall through */ - break; /* Need "default" setting to stop compiler warning. */ + *errorcodeptr = ERR52; + goto FAILED; } - /* In the pre-compile phase, just do the recognition. */ + /* There is at least one situation where code goes backwards: this + is the case of a zero quantifier after a class (e.g. [ab]{0}). At + compile time, the class is simply eliminated. However, it is created + first, so we have to allow memory for it. Therefore, don't ever + reduce the length at this point. + */ - else if (c == -ESC_d || c == -ESC_D || c == -ESC_w || - c == -ESC_W || c == -ESC_s || c == -ESC_S) continue; + if (code < last_code) + code = last_code; - /* We need to deal with \H, \h, \V, and \v in both phases because - they use extra memory. 
*/ + /* Paranoid check for integer overflow */ - if (-c == ESC_h) - { - SETBIT(classbits, 0x09); /* VT */ - SETBIT(classbits, 0x20); /* SPACE */ - SETBIT(classbits, 0xa0); /* NSBP */ -#ifdef SUPPORT_UTF8 - if (utf8) - { - class_utf8 = TRUE; - *class_utf8data++ = XCL_SINGLE; - class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data); - *class_utf8data++ = XCL_SINGLE; - class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data); - *class_utf8data++ = XCL_RANGE; - class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data); - class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data); - *class_utf8data++ = XCL_SINGLE; - class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data); - *class_utf8data++ = XCL_SINGLE; - class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data); - *class_utf8data++ = XCL_SINGLE; - class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data); - } -#endif - continue; + if (OFLOW_MAX - *lengthptr < code - last_code) { + *errorcodeptr = ERR20; + goto FAILED; } - if (-c == ESC_H) - { - for (c = 0; c < 32; c++) - { - int x = 0xff; - switch (c) - { - case 0x09/8: x ^= 1 << (0x09%8); break; - case 0x20/8: x ^= 1 << (0x20%8); break; - case 0xa0/8: x ^= 1 << (0xa0%8); break; - default: break; + *lengthptr += code - last_code; + DPRINTF( + ("length=%d added %d c=%c\n", *lengthptr, code - last_code, c)); + + /* If "previous" is set and it is not at the start of the work + space, move it back to there, in order to avoid filling up the work + space. Otherwise, if "previous" is NULL, reset the current code + pointer to the start. 
*/ + + if (previous != NULL) { + if (previous > orig_code) { + memmove(orig_code, previous, code - previous); + code -= previous - orig_code; + previous = orig_code; } - classbits[c] |= x; - } + } else + code = orig_code; -#ifdef SUPPORT_UTF8 - if (utf8) - { - class_utf8 = TRUE; - *class_utf8data++ = XCL_RANGE; - class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data); - class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data); - *class_utf8data++ = XCL_RANGE; - class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data); - class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data); - *class_utf8data++ = XCL_RANGE; - class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data); - class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data); - *class_utf8data++ = XCL_RANGE; - class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data); - class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data); - *class_utf8data++ = XCL_RANGE; - class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data); - class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data); - *class_utf8data++ = XCL_RANGE; - class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data); - class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data); - *class_utf8data++ = XCL_RANGE; - class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data); - class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data); - } -#endif - continue; - } + /* Remember where this code item starts so we can pick up the length + next time round. 
*/ - if (-c == ESC_v) - { - SETBIT(classbits, 0x0a); /* LF */ - SETBIT(classbits, 0x0b); /* VT */ - SETBIT(classbits, 0x0c); /* FF */ - SETBIT(classbits, 0x0d); /* CR */ - SETBIT(classbits, 0x85); /* NEL */ -#ifdef SUPPORT_UTF8 - if (utf8) - { - class_utf8 = TRUE; - *class_utf8data++ = XCL_RANGE; - class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data); - class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data); - } -#endif - continue; - } + last_code = code; + } - if (-c == ESC_V) - { - for (c = 0; c < 32; c++) - { - int x = 0xff; - switch (c) - { - case 0x0a/8: x ^= 1 << (0x0a%8); - x ^= 1 << (0x0b%8); - x ^= 1 << (0x0c%8); - x ^= 1 << (0x0d%8); - break; - case 0x85/8: x ^= 1 << (0x85%8); break; - default: break; - } - classbits[c] |= x; - } + /* In the real compile phase, just check the workspace used by the + forward reference list. */ -#ifdef SUPPORT_UTF8 - if (utf8) - { - class_utf8 = TRUE; - *class_utf8data++ = XCL_RANGE; - class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data); - class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data); - *class_utf8data++ = XCL_RANGE; - class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data); - class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data); - } -#endif - continue; - } - - /* We need to deal with \P and \p in both phases. */ - -#ifdef SUPPORT_UCP - if (-c == ESC_p || -c == ESC_P) - { - BOOL negated; - int pdata; - int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr); - if (ptype < 0) goto FAILED; - class_utf8 = TRUE; - *class_utf8data++ = ((-c == ESC_p) != negated)? - XCL_PROP : XCL_NOTPROP; - *class_utf8data++ = ptype; - *class_utf8data++ = pdata; - class_charcount -= 2; /* Not a < 256 character */ - continue; - } -#endif - /* Unrecognized escapes are faulted if PCRE is running in its - strict mode. By default, for compatibility with Perl, they are - treated as literals. 
*/ - - if ((options & PCRE_EXTRA) != 0) - { - *errorcodeptr = ERR7; + else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE) { + *errorcodeptr = ERR52; goto FAILED; - } - - class_charcount -= 2; /* Undo the default count from above */ - c = *ptr; /* Get the final character and fall through */ - } - - /* Fall through if we have a single character (c >= 0). This may be - greater than 256 in UTF-8 mode. */ - - } /* End of backslash handling */ - - /* A single character may be followed by '-' to form a range. However, - Perl does not permit ']' to be the end of the range. A '-' character - at the end is treated as a literal. Perl ignores orphaned \E sequences - entirely. The code for handling \Q and \E is messy. */ - - CHECK_RANGE: - while (ptr[1] == '\\' && ptr[2] == 'E') - { - inescq = FALSE; - ptr += 2; } - oldptr = ptr; + /* If in \Q...\E, check for the end; if not, we have a literal */ - /* Remember \r or \n */ - - if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF; - - /* Check for range */ - - if (!inescq && ptr[1] == '-') - { - int d; - ptr += 2; - while (*ptr == '\\' && ptr[1] == 'E') ptr += 2; - - /* If we hit \Q (not followed by \E) at this point, go into escaped - mode. */ - - while (*ptr == '\\' && ptr[1] == 'Q') - { - ptr += 2; - if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; } - inescq = TRUE; - break; - } - - if (*ptr == 0 || (!inescq && *ptr == ']')) - { - ptr = oldptr; - goto LONE_SINGLE_CHARACTER; - } - -#ifdef SUPPORT_UTF8 - if (utf8) - { /* Braces are required because the */ - GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */ - } - else -#endif - d = *ptr; /* Not UTF-8 mode */ - - /* The second part of a range can be a single-character escape, but - not any of the other escapes. Perl 5.6 treats a hyphen as a literal - in such circumstances. 
*/ - - if (!inescq && d == '\\') - { - d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE); - if (*errorcodeptr != 0) goto FAILED; - - /* \b is backspace; \X is literal X; \R is literal R; any other - special means the '-' was literal */ - - if (d < 0) - { - if (d == -ESC_b) d = '\b'; - else if (d == -ESC_X) d = 'X'; - else if (d == -ESC_R) d = 'R'; else - { - ptr = oldptr; - goto LONE_SINGLE_CHARACTER; /* A few lines below */ - } - } - } - - /* Check that the two values are in the correct order. Optimize - one-character ranges */ - - if (d < c) - { - *errorcodeptr = ERR8; - goto FAILED; - } - - if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */ - - /* Remember \r or \n */ - - if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF; - - /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless - matching, we have to use an XCLASS with extra data items. Caseless - matching for characters > 127 is available only if UCP support is - available. */ - -#ifdef SUPPORT_UTF8 - if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127))) - { - class_utf8 = TRUE; - - /* With UCP support, we can find the other case equivalents of - the relevant characters. There may be several ranges. Optimize how - they fit with the basic range. */ - -#ifdef SUPPORT_UCP - if ((options & PCRE_CASELESS) != 0) - { - unsigned int occ, ocd; - unsigned int cc = c; - unsigned int origd = d; - while (get_othercase_range(&cc, origd, &occ, &ocd)) - { - if (occ >= (unsigned int)c && - ocd <= (unsigned int)d) - continue; /* Skip embedded ranges */ - - if (occ < (unsigned int)c && - ocd >= (unsigned int)c - 1) /* Extend the basic range */ - { /* if there is overlap, */ - c = occ; /* noting that if occ < c */ - continue; /* we can't have ocd > d */ - } /* because a subrange is */ - if (ocd > (unsigned int)d && - occ <= (unsigned int)d + 1) /* always shorter than */ - { /* the basic range. 
*/ - d = ocd; + if (inescq && c != 0) { + if (c == '\\' && ptr[1] == 'E') { + inescq = FALSE; + ptr++; continue; + } else { + if (previous_callout != NULL) { + if (lengthptr == + NULL) /* Don't attempt in pre-compile phase */ + complete_callout(previous_callout, ptr, cd); + previous_callout = NULL; } - - if (occ == ocd) - { - *class_utf8data++ = XCL_SINGLE; + if ((options & PCRE_AUTO_CALLOUT) != 0) { + previous_callout = code; + code = auto_callout(code, ptr, cd); } - else - { - *class_utf8data++ = XCL_RANGE; - class_utf8data += _pcre_ord2utf8(occ, class_utf8data); - } - class_utf8data += _pcre_ord2utf8(ocd, class_utf8data); - } + goto NORMAL_CHAR; } -#endif /* SUPPORT_UCP */ - - /* Now record the original range, possibly modified for UCP caseless - overlapping ranges. */ - - *class_utf8data++ = XCL_RANGE; - class_utf8data += _pcre_ord2utf8(c, class_utf8data); - class_utf8data += _pcre_ord2utf8(d, class_utf8data); - - /* With UCP support, we are done. Without UCP support, there is no - caseless matching for UTF-8 characters > 127; we can use the bit map - for the smaller ones. */ - -#ifdef SUPPORT_UCP - continue; /* With next character in the class */ -#else - if ((options & PCRE_CASELESS) == 0 || c > 127) continue; - - /* Adjust upper limit and fall through to set up the map */ - - d = 127; - -#endif /* SUPPORT_UCP */ - } -#endif /* SUPPORT_UTF8 */ - - /* We use the bit map for all cases when not in UTF-8 mode; else - ranges that lie entirely within 0-127 when there is UCP support; else - for partial ranges without UCP support. */ - - class_charcount += d - c + 1; - class_lastchar = d; - - /* We can save a bit of time by skipping this in the pre-compile. 
*/ - - if (lengthptr == NULL) for (; c <= d; c++) - { - classbits[c/8] |= (1 << (c&7)); - if ((options & PCRE_CASELESS) != 0) - { - int uc = cd->fcc[c]; /* flip case */ - classbits[uc/8] |= (1 << (uc&7)); - } - } - - continue; /* Go get the next char in the class */ } - /* Handle a lone single character - we can get here for a normal - non-escape char, or after \ that introduces a single character or for an - apparent range that isn't. */ + /* Fill in length of a previous callout, except when the next thing is + a quantifier. */ - LONE_SINGLE_CHARACTER: + is_quantifier = c == '*' || c == '+' || c == '?' || + (c == '{' && is_counted_repeat(ptr + 1)); - /* Handle a character that cannot go in the bit map */ + if (!is_quantifier && previous_callout != NULL && + after_manual_callout-- <= 0) { + if (lengthptr == NULL) /* Don't attempt in pre-compile phase */ + complete_callout(previous_callout, ptr, cd); + previous_callout = NULL; + } + + /* In extended mode, skip white space and comments */ + + if ((options & PCRE_EXTENDED) != 0) { + if ((cd->ctypes[c] & ctype_space) != 0) + continue; + if (c == '#') { + while (*(++ptr) != 0) { + if (IS_NEWLINE(ptr)) { + ptr += cd->nllen - 1; + break; + } + } + if (*ptr != 0) + continue; + + /* Else fall through to handle end of string */ + c = 0; + } + } + + /* No auto callout for quantifiers. 
*/ + + if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier) { + previous_callout = code; + code = auto_callout(code, ptr, cd); + } + + switch (c) { + /* ===================================================================*/ + case 0: /* The branch terminates at string end */ + case '|': /* or | or ) */ + case ')': + *firstbyteptr = firstbyte; + *reqbyteptr = reqbyte; + *codeptr = code; + *ptrptr = ptr; + if (lengthptr != NULL) { + if (OFLOW_MAX - *lengthptr < code - last_code) { + *errorcodeptr = ERR20; + goto FAILED; + } + *lengthptr += + code - last_code; /* To include callout length */ + DPRINTF((">> end branch\n")); + } + return TRUE; + + /* ===================================================================*/ + /* Handle single-character metacharacters. In multiline mode, ^ + disables the setting of any following char as a first character. + */ + + case '^': + if ((options & PCRE_MULTILINE) != 0) { + if (firstbyte == REQ_UNSET) + firstbyte = REQ_NONE; + } + previous = NULL; + *code++ = OP_CIRC; + break; + + case '$': + previous = NULL; + *code++ = OP_DOLL; + break; + + /* There can never be a first char if '.' is first, whatever + happens about repeats. The value of reqbyte doesn't change + either. */ + + case '.': + if (firstbyte == REQ_UNSET) + firstbyte = REQ_NONE; + zerofirstbyte = firstbyte; + zeroreqbyte = reqbyte; + previous = code; + *code++ = OP_ANY; + break; + + /* ===================================================================*/ + /* Character classes. If the included characters are all < 256, + we build a 32-byte bitmap of the permitted characters, except in + the special case where there is only one such character. For + negated classes, we build the map as usual, then invert it at + the end. However, we use a different opcode so that data + characters > 255 can be handled correctly. + + If the class contains characters outside the 0-255 range, a + different opcode is compiled. 
It may optionally have a bit map + for characters < 256, but those above are explicitly listed + afterwards. A flag byte tells whether the bitmap is present, and + whether this is a negated class or not. + */ + + case '[': + previous = code; + + /* PCRE supports POSIX class stuff inside a class. Perl gives an + error if they are encountered at the top level, so we'll do that + too. */ + + if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') && + check_posix_syntax(ptr, &tempptr)) { + *errorcodeptr = (ptr[1] == ':') ? ERR13 : ERR31; + goto FAILED; + } + + /* If the first character is '^', set the negation flag and skip + it. Also, if the first few characters (either before or after ^) + are \Q\E or \E we skip them too. This makes for compatibility + with Perl. */ + + negate_class = FALSE; + for (;;) { + c = *(++ptr); + if (c == '\\') { + if (ptr[1] == 'E') + ptr++; + else if (strncmp((const char*)ptr + 1, "Q\\E", 3) == 0) + ptr += 3; + else + break; + } else if (!negate_class && c == '^') + negate_class = TRUE; + else + break; + } + + /* If a class contains a negative special such as \S, we need to + flip the negation flag at the end, so that support for + characters > 255 works correctly (they are all included in the + class). */ + + should_flip_negation = FALSE; + + /* Keep a count of chars with values < 256 so that we can + optimize the case of just a single character (as long as it's < + 256). However, for higher valued UTF-8 characters, we don't yet + do any optimization. */ + + class_charcount = 0; + class_lastchar = -1; + + /* Initialize the 32-char bit map to all zeros. We build the map + in a temporary bit of memory, in case the class contains only 1 + character (less than 256), because in that case the compiled + code doesn't use the bit map.
+ */ + + memset(classbits, 0, 32 * sizeof(uschar)); #ifdef SUPPORT_UTF8 - if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127))) - { - class_utf8 = TRUE; - *class_utf8data++ = XCL_SINGLE; - class_utf8data += _pcre_ord2utf8(c, class_utf8data); + class_utf8 = FALSE; /* No chars >= 256 */ + class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */ + class_utf8data_base = + class_utf8data; /* For resetting in pass 1 */ +#endif + + /* Process characters until ] is reached. By writing this as a + "do" it means that an initial ] is taken as a data character. At + the start of the loop, c contains the first byte of the + character. */ + + if (c != 0) + do { + const uschar* oldptr; + +#ifdef SUPPORT_UTF8 + if (utf8 && + c > 127) { /* Braces are required because the */ + GETCHARLEN( + c, ptr, + ptr); /* macro generates multiple statements */ + } + + /* In the pre-compile phase, accumulate the length of + any UTF-8 extra data and reset the pointer. This is so + that very large classes that contain a zillion UTF-8 + characters no longer overwrite the work space (which is + on the stack). */ + + if (lengthptr != NULL) { + *lengthptr += class_utf8data - class_utf8data_base; + class_utf8data = class_utf8data_base; + } + +#endif + + /* Inside \Q...\E everything is literal except \E */ + + if (inescq) { + if (c == '\\' && + ptr[1] == 'E') /* If we are at \E */ + { + inescq = FALSE; /* Reset literal state */ + ptr++; /* Skip the 'E' */ + continue; /* Carry on with next */ + } + goto CHECK_RANGE; /* Could be range if \E follows */ + } + + /* Handle POSIX class names. Perl allows a negation + extension of the form [:^name:]. A square bracket that + doesn't match the syntax is treated as a literal. We + also recognize the POSIX constructions + [.ch.] and [=ch=] ("collating elements") and fault them, + as Perl 5.6 and 5.8 do. */ + + if (c == '[' && + (ptr[1] == ':' || ptr[1] == '.' 
|| ptr[1] == '=') && + check_posix_syntax(ptr, &tempptr)) { + BOOL local_negate = FALSE; + int posix_class, taboffset, tabopt; + register const uschar* cbits = cd->cbits; + uschar pbits[32]; + + if (ptr[1] != ':') { + *errorcodeptr = ERR31; + goto FAILED; + } + + ptr += 2; + if (*ptr == '^') { + local_negate = TRUE; + should_flip_negation = + TRUE; /* Note negative special */ + ptr++; + } + + posix_class = check_posix_name(ptr, tempptr - ptr); + if (posix_class < 0) { + *errorcodeptr = ERR30; + goto FAILED; + } + + /* If matching is caseless, upper and lower are + converted to alpha. This relies on the fact that the + class table starts with alpha, lower, upper as the + first 3 entries. */ + + if ((options & PCRE_CASELESS) != 0 && + posix_class <= 2) + posix_class = 0; + + /* We build the bit map for the POSIX class in a + chunk of local store because we may be adding and + subtracting from it, and we don't want to subtract + bits that may be in the main map already. At the end + we or the result into the bit map that is being + built. */ + + posix_class *= 3; + + /* Copy in the first table (always present) */ + + memcpy(pbits, cbits + posix_class_maps[posix_class], + 32 * sizeof(uschar)); + + /* If there is a second table, add or remove it as + * required. */ + + taboffset = posix_class_maps[posix_class + 1]; + tabopt = posix_class_maps[posix_class + 2]; + + if (taboffset >= 0) { + if (tabopt >= 0) + for (c = 0; c < 32; c++) + pbits[c] |= cbits[c + taboffset]; + else + for (c = 0; c < 32; c++) + pbits[c] &= ~cbits[c + taboffset]; + } + + /* Not see if we need to remove any special + characters. An option value of 1 removes vertical + space and 2 removes underscore. */ + + if (tabopt < 0) + tabopt = -tabopt; + if (tabopt == 1) + pbits[1] &= ~0x3c; + else if (tabopt == 2) + pbits[11] &= 0x7f; + + /* Add the POSIX table or its complement into the + main table that is being built and we are done. 
*/ + + if (local_negate) + for (c = 0; c < 32; c++) + classbits[c] |= ~pbits[c]; + else + for (c = 0; c < 32; c++) + classbits[c] |= pbits[c]; + + ptr = tempptr + 1; + class_charcount = + 10; /* Set > 1; assumes more than 1 per class */ + continue; /* End of POSIX syntax handling */ + } + + /* Backslash may introduce a single character, or it may + introduce one of the specials, which just set a flag. + The sequence \b is a special case. Inside a class (and + only there) it is treated as backspace. Elsewhere it + marks a word boundary. Other escapes have preset maps + ready to 'or' into the one we are building. We assume + they have more than one character in them, so set + class_charcount bigger than one. */ + + if (c == '\\') { + c = check_escape(&ptr, errorcodeptr, cd->bracount, + options, TRUE); + if (*errorcodeptr != 0) + goto FAILED; + + if (-c == ESC_b) + c = '\b'; /* \b is backspace in a class */ + else if (-c == ESC_X) + c = 'X'; /* \X is literal X in a class */ + else if (-c == ESC_R) + c = 'R'; /* \R is literal R in a class */ + else if (-c == + ESC_Q) /* Handle start of quoted string */ + { + if (ptr[1] == '\\' && ptr[2] == 'E') { + ptr += 2; /* avoid empty string */ + } else + inescq = TRUE; + continue; + } else if (-c == ESC_E) + continue; /* Ignore orphan \E */ + + if (c < 0) { + register const uschar* cbits = cd->cbits; + class_charcount += + 2; /* Greater than 1 is what matters */ + + /* Save time by not doing this in the + * pre-compile phase. 
*/ + + if (lengthptr == NULL) + switch (-c) { + case ESC_d: + for (c = 0; c < 32; c++) + classbits[c] |= + cbits[c + cbit_digit]; + continue; + + case ESC_D: + should_flip_negation = TRUE; + for (c = 0; c < 32; c++) + classbits[c] |= + ~cbits[c + cbit_digit]; + continue; + + case ESC_w: + for (c = 0; c < 32; c++) + classbits[c] |= + cbits[c + cbit_word]; + continue; + + case ESC_W: + should_flip_negation = TRUE; + for (c = 0; c < 32; c++) + classbits[c] |= + ~cbits[c + cbit_word]; + continue; + + case ESC_s: + for (c = 0; c < 32; c++) + classbits[c] |= + cbits[c + cbit_space]; + classbits[1] &= + ~0x08; /* Perl 5.004 onwards + omits VT from \s */ + continue; + + case ESC_S: + should_flip_negation = TRUE; + for (c = 0; c < 32; c++) + classbits[c] |= + ~cbits[c + cbit_space]; + classbits[1] |= + 0x08; /* Perl 5.004 onwards + omits VT from \s */ + continue; + + default: /* Not recognized; fall through + */ + break; /* Need "default" setting to + stop compiler warning. */ + } + + /* In the pre-compile phase, just do the + * recognition. */ + + else if (c == -ESC_d || c == -ESC_D || + c == -ESC_w || c == -ESC_W || + c == -ESC_s || c == -ESC_S) + continue; + + /* We need to deal with \H, \h, \V, and \v in + both phases because they use extra memory. 
*/ + + if (-c == ESC_h) { + SETBIT(classbits, 0x09); /* VT */ + SETBIT(classbits, 0x20); /* SPACE */ + SETBIT(classbits, 0xa0); /* NSBP */ +#ifdef SUPPORT_UTF8 + if (utf8) { + class_utf8 = TRUE; + *class_utf8data++ = XCL_SINGLE; + class_utf8data += _pcre_ord2utf8( + 0x1680, class_utf8data); + *class_utf8data++ = XCL_SINGLE; + class_utf8data += _pcre_ord2utf8( + 0x180e, class_utf8data); + *class_utf8data++ = XCL_RANGE; + class_utf8data += _pcre_ord2utf8( + 0x2000, class_utf8data); + class_utf8data += _pcre_ord2utf8( + 0x200A, class_utf8data); + *class_utf8data++ = XCL_SINGLE; + class_utf8data += _pcre_ord2utf8( + 0x202f, class_utf8data); + *class_utf8data++ = XCL_SINGLE; + class_utf8data += _pcre_ord2utf8( + 0x205f, class_utf8data); + *class_utf8data++ = XCL_SINGLE; + class_utf8data += _pcre_ord2utf8( + 0x3000, class_utf8data); + } +#endif + continue; + } + + if (-c == ESC_H) { + for (c = 0; c < 32; c++) { + int x = 0xff; + switch (c) { + case 0x09 / 8: + x ^= 1 << (0x09 % 8); + break; + case 0x20 / 8: + x ^= 1 << (0x20 % 8); + break; + case 0xa0 / 8: + x ^= 1 << (0xa0 % 8); + break; + default: + break; + } + classbits[c] |= x; + } + +#ifdef SUPPORT_UTF8 + if (utf8) { + class_utf8 = TRUE; + *class_utf8data++ = XCL_RANGE; + class_utf8data += _pcre_ord2utf8( + 0x0100, class_utf8data); + class_utf8data += _pcre_ord2utf8( + 0x167f, class_utf8data); + *class_utf8data++ = XCL_RANGE; + class_utf8data += _pcre_ord2utf8( + 0x1681, class_utf8data); + class_utf8data += _pcre_ord2utf8( + 0x180d, class_utf8data); + *class_utf8data++ = XCL_RANGE; + class_utf8data += _pcre_ord2utf8( + 0x180f, class_utf8data); + class_utf8data += _pcre_ord2utf8( + 0x1fff, class_utf8data); + *class_utf8data++ = XCL_RANGE; + class_utf8data += _pcre_ord2utf8( + 0x200B, class_utf8data); + class_utf8data += _pcre_ord2utf8( + 0x202e, class_utf8data); + *class_utf8data++ = XCL_RANGE; + class_utf8data += _pcre_ord2utf8( + 0x2030, class_utf8data); + class_utf8data += _pcre_ord2utf8( + 0x205e, 
class_utf8data); + *class_utf8data++ = XCL_RANGE; + class_utf8data += _pcre_ord2utf8( + 0x2060, class_utf8data); + class_utf8data += _pcre_ord2utf8( + 0x2fff, class_utf8data); + *class_utf8data++ = XCL_RANGE; + class_utf8data += _pcre_ord2utf8( + 0x3001, class_utf8data); + class_utf8data += _pcre_ord2utf8( + 0x7fffffff, class_utf8data); + } +#endif + continue; + } + + if (-c == ESC_v) { + SETBIT(classbits, 0x0a); /* LF */ + SETBIT(classbits, 0x0b); /* VT */ + SETBIT(classbits, 0x0c); /* FF */ + SETBIT(classbits, 0x0d); /* CR */ + SETBIT(classbits, 0x85); /* NEL */ +#ifdef SUPPORT_UTF8 + if (utf8) { + class_utf8 = TRUE; + *class_utf8data++ = XCL_RANGE; + class_utf8data += _pcre_ord2utf8( + 0x2028, class_utf8data); + class_utf8data += _pcre_ord2utf8( + 0x2029, class_utf8data); + } +#endif + continue; + } + + if (-c == ESC_V) { + for (c = 0; c < 32; c++) { + int x = 0xff; + switch (c) { + case 0x0a / 8: + x ^= 1 << (0x0a % 8); + x ^= 1 << (0x0b % 8); + x ^= 1 << (0x0c % 8); + x ^= 1 << (0x0d % 8); + break; + case 0x85 / 8: + x ^= 1 << (0x85 % 8); + break; + default: + break; + } + classbits[c] |= x; + } + +#ifdef SUPPORT_UTF8 + if (utf8) { + class_utf8 = TRUE; + *class_utf8data++ = XCL_RANGE; + class_utf8data += _pcre_ord2utf8( + 0x0100, class_utf8data); + class_utf8data += _pcre_ord2utf8( + 0x2027, class_utf8data); + *class_utf8data++ = XCL_RANGE; + class_utf8data += _pcre_ord2utf8( + 0x2029, class_utf8data); + class_utf8data += _pcre_ord2utf8( + 0x7fffffff, class_utf8data); + } +#endif + continue; + } + + /* We need to deal with \P and \p in both + * phases. 
*/ #ifdef SUPPORT_UCP - if ((options & PCRE_CASELESS) != 0) - { - unsigned int othercase; - if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) - { - *class_utf8data++ = XCL_SINGLE; - class_utf8data += _pcre_ord2utf8(othercase, class_utf8data); - } - } -#endif /* SUPPORT_UCP */ + if (-c == ESC_p || -c == ESC_P) { + BOOL negated; + int pdata; + int ptype = get_ucp(&ptr, &negated, &pdata, + errorcodeptr); + if (ptype < 0) + goto FAILED; + class_utf8 = TRUE; + *class_utf8data++ = + ((-c == ESC_p) != negated) + ? XCL_PROP + : XCL_NOTPROP; + *class_utf8data++ = ptype; + *class_utf8data++ = pdata; + class_charcount -= + 2; /* Not a < 256 character */ + continue; + } +#endif + /* Unrecognized escapes are faulted if PCRE is + running in its strict mode. By default, for + compatibility with Perl, they are treated as + literals. */ - } - else -#endif /* SUPPORT_UTF8 */ + if ((options & PCRE_EXTRA) != 0) { + *errorcodeptr = ERR7; + goto FAILED; + } - /* Handle a single-byte character */ - { - classbits[c/8] |= (1 << (c&7)); - if ((options & PCRE_CASELESS) != 0) - { - c = cd->fcc[c]; /* flip case */ - classbits[c/8] |= (1 << (c&7)); - } - class_charcount++; - class_lastchar = c; - } - } + class_charcount -= + 2; /* Undo the default count from above */ + c = *ptr; /* Get the final character and fall + through */ + } - /* Loop until ']' reached. This "while" is the end of the "do" above. */ + /* Fall through if we have a single character (c >= + 0). This may be greater than 256 in UTF-8 mode. */ - while ((c = *(++ptr)) != 0 && (c != ']' || inescq)); + } /* End of backslash handling */ - if (c == 0) /* Missing terminating ']' */ - { - *errorcodeptr = ERR6; - goto FAILED; - } + /* A single character may be followed by '-' to form a + range. However, Perl does not permit ']' to be the end + of the range. A '-' character at the end is treated as a + literal. Perl ignores orphaned \E sequences entirely. + The code for handling \Q and \E is messy. 
*/ + CHECK_RANGE: + while (ptr[1] == '\\' && ptr[2] == 'E') { + inescq = FALSE; + ptr += 2; + } -/* This code has been disabled because it would mean that \s counts as -an explicit \r or \n reference, and that's not really what is wanted. Now -we set the flag only if there is a literal "\r" or "\n" in the class. */ + oldptr = ptr; + + /* Remember \r or \n */ + + if (c == '\r' || c == '\n') + cd->external_flags |= PCRE_HASCRORLF; + + /* Check for range */ + + if (!inescq && ptr[1] == '-') { + int d; + ptr += 2; + while (*ptr == '\\' && ptr[1] == 'E') + ptr += 2; + + /* If we hit \Q (not followed by \E) at this point, + go into escaped mode. */ + + while (*ptr == '\\' && ptr[1] == 'Q') { + ptr += 2; + if (*ptr == '\\' && ptr[1] == 'E') { + ptr += 2; + continue; + } + inescq = TRUE; + break; + } + + if (*ptr == 0 || (!inescq && *ptr == ']')) { + ptr = oldptr; + goto LONE_SINGLE_CHARACTER; + } + +#ifdef SUPPORT_UTF8 + if (utf8) { /* Braces are required because the */ + GETCHARLEN(d, ptr, + ptr); /* macro generates multiple + statements */ + } else +#endif + d = *ptr; /* Not UTF-8 mode */ + + /* The second part of a range can be a + single-character escape, but not any of the other + escapes. Perl 5.6 treats a hyphen as a literal in + such circumstances. */ + + if (!inescq && d == '\\') { + d = check_escape(&ptr, errorcodeptr, + cd->bracount, options, TRUE); + if (*errorcodeptr != 0) + goto FAILED; + + /* \b is backspace; \X is literal X; \R is + literal R; any other special means the '-' was + literal */ + + if (d < 0) { + if (d == -ESC_b) + d = '\b'; + else if (d == -ESC_X) + d = 'X'; + else if (d == -ESC_R) + d = 'R'; + else { + ptr = oldptr; + goto LONE_SINGLE_CHARACTER; /* A few + lines + below */ + } + } + } + + /* Check that the two values are in the correct + order. 
Optimize one-character ranges */ + + if (d < c) { + *errorcodeptr = ERR8; + goto FAILED; + } + + if (d == c) + goto LONE_SINGLE_CHARACTER; /* A few lines below + */ + + /* Remember \r or \n */ + + if (d == '\r' || d == '\n') + cd->external_flags |= PCRE_HASCRORLF; + + /* In UTF-8 mode, if the upper limit is > 255, + or > 127 for caseless matching, we have to use + an XCLASS with extra data items. Caseless + matching for characters > 127 is available only + if UCP support is available. */ + +#ifdef SUPPORT_UTF8 + if (utf8 && + (d > 255 || + ((options & PCRE_CASELESS) != 0 && d > 127))) { + class_utf8 = TRUE; + + /* With UCP support, we can find the other case + equivalents of the relevant characters. There + may be several ranges. Optimize how they fit + with the basic range. */ + +#ifdef SUPPORT_UCP + if ((options & PCRE_CASELESS) != 0) { + unsigned int occ, ocd; + unsigned int cc = c; + unsigned int origd = d; + while (get_othercase_range(&cc, origd, &occ, + &ocd)) { + if (occ >= (unsigned int)c && + ocd <= (unsigned int)d) + continue; /* Skip embedded ranges */ + + if (occ < (unsigned int)c && + ocd >= (unsigned int)c - + 1) /* Extend the basic + range */ + { /* if there is overlap, */ + c = occ; /* noting that if occ < c + */ + continue; /* we can't have ocd > d + */ + } /* because a subrange is */ + if (ocd > (unsigned int)d && + occ <= + (unsigned int)d + + 1) /* always shorter than */ + { /* the basic range. */ + d = ocd; + continue; + } + + if (occ == ocd) { + *class_utf8data++ = XCL_SINGLE; + } else { + *class_utf8data++ = XCL_RANGE; + class_utf8data += _pcre_ord2utf8( + occ, class_utf8data); + } + class_utf8data += + _pcre_ord2utf8(ocd, class_utf8data); + } + } +#endif /* SUPPORT_UCP */ + + /* Now record the original range, possibly + modified for UCP caseless overlapping ranges. 
*/ + + *class_utf8data++ = XCL_RANGE; + class_utf8data += + _pcre_ord2utf8(c, class_utf8data); + class_utf8data += + _pcre_ord2utf8(d, class_utf8data); + + /* With UCP support, we are done. Without UCP + support, there is no caseless matching for UTF-8 + characters > 127; we can use the bit map for the + smaller ones. */ + +#ifdef SUPPORT_UCP + continue; /* With next character in the class */ +#else + if ((options & PCRE_CASELESS) == 0 || c > 127) + continue; + + /* Adjust upper limit and fall through to set up + * the map */ + + d = 127; + +#endif /* SUPPORT_UCP */ + } +#endif /* SUPPORT_UTF8 */ + + /* We use the bit map for all cases when not in + UTF-8 mode; else ranges that lie entirely within + 0-127 when there is UCP support; else for partial + ranges without UCP support. */ + + class_charcount += d - c + 1; + class_lastchar = d; + + /* We can save a bit of time by skipping this in the + * pre-compile. */ + + if (lengthptr == NULL) + for (; c <= d; c++) { + classbits[c / 8] |= (1 << (c & 7)); + if ((options & PCRE_CASELESS) != 0) { + int uc = cd->fcc[c]; /* flip case */ + classbits[uc / 8] |= (1 << (uc & 7)); + } + } + + continue; /* Go get the next char in the class */ + } + + /* Handle a lone single character - we can get here for + a normal non-escape char, or after \ that introduces a + single character or for an apparent range that isn't. 
*/ + + LONE_SINGLE_CHARACTER: + + /* Handle a character that cannot go in the bit map */ + +#ifdef SUPPORT_UTF8 + if (utf8 && + (c > 255 || + ((options & PCRE_CASELESS) != 0 && c > 127))) { + class_utf8 = TRUE; + *class_utf8data++ = XCL_SINGLE; + class_utf8data += _pcre_ord2utf8(c, class_utf8data); + +#ifdef SUPPORT_UCP + if ((options & PCRE_CASELESS) != 0) { + unsigned int othercase; + if ((othercase = _pcre_ucp_othercase(c)) != + NOTACHAR) { + *class_utf8data++ = XCL_SINGLE; + class_utf8data += _pcre_ord2utf8( + othercase, class_utf8data); + } + } +#endif /* SUPPORT_UCP */ + + } else +#endif /* SUPPORT_UTF8 */ + + /* Handle a single-byte character */ + { + classbits[c / 8] |= (1 << (c & 7)); + if ((options & PCRE_CASELESS) != 0) { + c = cd->fcc[c]; /* flip case */ + classbits[c / 8] |= (1 << (c & 7)); + } + class_charcount++; + class_lastchar = c; + } + } + + /* Loop until ']' reached. This "while" is the end of the + "do" above. */ + + while ((c = *(++ptr)) != 0 && (c != ']' || inescq)); + + if (c == 0) /* Missing terminating ']' */ + { + *errorcodeptr = ERR6; + goto FAILED; + } + + /* This code has been disabled because it would mean that \s + counts as an explicit \r or \n reference, and that's not really + what is wanted. Now we set the flag only if there is a literal + "\r" or "\n" in the class. */ #if 0 /* Remember whether \r or \n are in this class */ @@ -3277,1943 +3560,2027 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ } #endif + /* If class_charcount is 1, we saw precisely one character whose + value is less than 256. As long as there were no characters >= + 128 and there was no use of \p or \P, in other words, no use of + any XCLASS features, we can optimize. - /* If class_charcount is 1, we saw precisely one character whose value is - less than 256. As long as there were no characters >= 128 and there was no - use of \p or \P, in other words, no use of any XCLASS features, we can - optimize. 
+ In UTF-8 mode, we can optimize the negative case only if there + were no characters >= 128 because OP_NOT and the related opcodes + like OP_NOTSTAR operate on single-bytes only. This is an + historical hangover. Maybe one day we can tidy these opcodes to + handle multi-byte characters. - In UTF-8 mode, we can optimize the negative case only if there were no - characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR - operate on single-bytes only. This is an historical hangover. Maybe one day - we can tidy these opcodes to handle multi-byte characters. - - The optimization throws away the bit map. We turn the item into a - 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note - that OP_NOT does not support multibyte characters. In the positive case, it - can cause firstbyte to be set. Otherwise, there can be no first char if - this item is first, whatever repeat count may follow. In the case of - reqbyte, save the previous value for reinstating. */ + The optimization throws away the bit map. We turn the item into + a 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's + negative. Note that OP_NOT does not support multibyte + characters. In the positive case, it can cause firstbyte to be + set. Otherwise, there can be no first char if this item is + first, whatever repeat count may follow. In the case of reqbyte, + save the previous value for reinstating. */ #ifdef SUPPORT_UTF8 - if (class_charcount == 1 && !class_utf8 && - (!utf8 || !negate_class || class_lastchar < 128)) + if (class_charcount == 1 && !class_utf8 && + (!utf8 || !negate_class || class_lastchar < 128)) #else - if (class_charcount == 1) + if (class_charcount == 1) #endif - { - zeroreqbyte = reqbyte; - - /* The OP_NOT opcode works on one-byte characters only. 
*/ - - if (negate_class) - { - if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; - zerofirstbyte = firstbyte; - *code++ = OP_NOT; - *code++ = class_lastchar; - break; - } - - /* For a single, positive character, get the value into mcbuffer, and - then we can handle this with the normal one-character code. */ - -#ifdef SUPPORT_UTF8 - if (utf8 && class_lastchar > 127) - mclength = _pcre_ord2utf8(class_lastchar, mcbuffer); - else -#endif - { - mcbuffer[0] = class_lastchar; - mclength = 1; - } - goto ONE_CHAR; - } /* End of 1-char optimization */ - - /* The general case - not the one-char optimization. If this is the first - thing in the branch, there can be no first char setting, whatever the - repeat count. Any reqbyte setting must remain unchanged after any kind of - repeat. */ - - if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; - zerofirstbyte = firstbyte; - zeroreqbyte = reqbyte; - - /* If there are characters with values > 255, we have to compile an - extended class, with its own opcode, unless there was a negated special - such as \S in the class, because in that case all characters > 255 are in - the class, so any that were explicitly given as well can be ignored. If - (when there are explicit characters > 255 that must be listed) there are no - characters < 256, we can omit the bitmap in the actual compiled code. */ - -#ifdef SUPPORT_UTF8 - if (class_utf8 && !should_flip_negation) - { - *class_utf8data++ = XCL_END; /* Marks the end of extra data */ - *code++ = OP_XCLASS; - code += LINK_SIZE; - *code = negate_class? XCL_NOT : 0; - - /* If the map is required, move up the extra data to make room for it; - otherwise just move the code pointer to the end of the extra data. 
*/ - - if (class_charcount > 0) - { - *code++ |= XCL_MAP; - memmove(code + 32, code, class_utf8data - code); - memcpy(code, classbits, 32); - code = class_utf8data + 32; - } - else code = class_utf8data; - - /* Now fill in the complete length of the item */ - - PUT(previous, 1, code - previous); - break; /* End of class handling */ - } -#endif - - /* If there are no characters > 255, set the opcode to OP_CLASS or - OP_NCLASS, depending on whether the whole class was negated and whether - there were negative specials such as \S in the class. Then copy the 32-byte - map into the code vector, negating it if necessary. */ - - *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS; - if (negate_class) - { - if (lengthptr == NULL) /* Save time in the pre-compile phase */ - for (c = 0; c < 32; c++) code[c] = ~classbits[c]; - } - else - { - memcpy(code, classbits, 32); - } - code += 32; - break; - - - /* ===================================================================*/ - /* Various kinds of repeat; '{' is not necessarily a quantifier, but this - has been tested above. */ - - case '{': - if (!is_quantifier) goto NORMAL_CHAR; - ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr); - if (*errorcodeptr != 0) goto FAILED; - goto REPEAT; - - case '*': - repeat_min = 0; - repeat_max = -1; - goto REPEAT; - - case '+': - repeat_min = 1; - repeat_max = -1; - goto REPEAT; - - case '?': - repeat_min = 0; - repeat_max = 1; - - REPEAT: - if (previous == NULL) - { - *errorcodeptr = ERR9; - goto FAILED; - } - - if (repeat_min == 0) - { - firstbyte = zerofirstbyte; /* Adjust for zero repeat */ - reqbyte = zeroreqbyte; /* Ditto */ - } - - /* Remember whether this is a variable length repeat */ - - reqvary = (repeat_min == repeat_max)? 
0 : REQ_VARY; - - op_type = 0; /* Default single-char op codes */ - possessive_quantifier = FALSE; /* Default not possessive quantifier */ - - /* Save start of previous item, in case we have to move it up to make space - for an inserted OP_ONCE for the additional '+' extension. */ - - tempcode = previous; - - /* If the next character is '+', we have a possessive quantifier. This - implies greediness, whatever the setting of the PCRE_UNGREEDY option. - If the next character is '?' this is a minimizing repeat, by default, - but if PCRE_UNGREEDY is set, it works the other way round. We change the - repeat type to the non-default. */ - - if (ptr[1] == '+') - { - repeat_type = 0; /* Force greedy */ - possessive_quantifier = TRUE; - ptr++; - } - else if (ptr[1] == '?') - { - repeat_type = greedy_non_default; - ptr++; - } - else repeat_type = greedy_default; - - /* If previous was a character match, abolish the item and generate a - repeat item instead. If a char item has a minumum of more than one, ensure - that it is set in reqbyte - it might not be if a sequence such as x{3} is - the first thing in a branch because the x will have gone into firstbyte - instead. */ - - if (*previous == OP_CHAR || *previous == OP_CHARNC) - { - /* Deal with UTF-8 characters that take up more than one byte. It's - easier to write this out separately than try to macrify it. Use c to - hold the length of the character in bytes, plus 0x80 to flag that it's a - length rather than a small character. */ - -#ifdef SUPPORT_UTF8 - if (utf8 && (code[-1] & 0x80) != 0) - { - uschar *lastchar = code - 1; - while((*lastchar & 0xc0) == 0x80) lastchar--; - c = code - lastchar; /* Length of UTF-8 character */ - memcpy(utf8_char, lastchar, c); /* Save the char */ - c |= 0x80; /* Flag c as a length */ - } - else -#endif - - /* Handle the case of a single byte - either with no UTF8 support, or - with UTF-8 disabled, or for a UTF-8 character < 128. 
*/ - - { - c = code[-1]; - if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt; - } - - /* If the repetition is unlimited, it pays to see if the next thing on - the line is something that cannot possibly match this character. If so, - automatically possessifying this item gains some performance in the case - where the match fails. */ - - if (!possessive_quantifier && - repeat_max < 0 && - check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1, - options, cd)) - { - repeat_type = 0; /* Force greedy */ - possessive_quantifier = TRUE; - } - - goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */ - } - - /* If previous was a single negated character ([^a] or similar), we use - one of the special opcodes, replacing it. The code is shared with single- - character repeats by setting opt_type to add a suitable offset into - repeat_type. We can also test for auto-possessification. OP_NOT is - currently used only for single-byte chars. */ - - else if (*previous == OP_NOT) - { - op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */ - c = previous[1]; - if (!possessive_quantifier && - repeat_max < 0 && - check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd)) - { - repeat_type = 0; /* Force greedy */ - possessive_quantifier = TRUE; - } - goto OUTPUT_SINGLE_REPEAT; - } - - /* If previous was a character type match (\d or similar), abolish it and - create a suitable repeat item. The code is shared with single-character - repeats by setting op_type to add a suitable offset into repeat_type. Note - the the Unicode property types will be present only when SUPPORT_UCP is - defined, but we don't wrap the little bits of code here because it just - makes it horribly messy. 
*/ - - else if (*previous < OP_EODN) - { - uschar *oldcode; - int prop_type, prop_value; - op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */ - c = *previous; - - if (!possessive_quantifier && - repeat_max < 0 && - check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd)) - { - repeat_type = 0; /* Force greedy */ - possessive_quantifier = TRUE; - } - - OUTPUT_SINGLE_REPEAT: - if (*previous == OP_PROP || *previous == OP_NOTPROP) - { - prop_type = previous[1]; - prop_value = previous[2]; - } - else prop_type = prop_value = -1; - - oldcode = code; - code = previous; /* Usually overwrite previous item */ - - /* If the maximum is zero then the minimum must also be zero; Perl allows - this case, so we do too - by simply omitting the item altogether. */ - - if (repeat_max == 0) goto END_REPEAT; - - /* All real repeats make it impossible to handle partial matching (maybe - one day we will be able to remove this restriction). */ - - if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; - - /* Combine the op_type with the repeat_type */ - - repeat_type += op_type; - - /* A minimum of zero is handled either as the special case * or ?, or as - an UPTO, with the maximum given. */ - - if (repeat_min == 0) - { - if (repeat_max == -1) *code++ = OP_STAR + repeat_type; - else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type; - else - { - *code++ = OP_UPTO + repeat_type; - PUT2INC(code, 0, repeat_max); - } - } - - /* A repeat minimum of 1 is optimized into some special cases. If the - maximum is unlimited, we use OP_PLUS. Otherwise, the original item is - left in place and, if the maximum is greater than 1, we use OP_UPTO with - one less than the maximum. 
*/ - - else if (repeat_min == 1) - { - if (repeat_max == -1) - *code++ = OP_PLUS + repeat_type; - else - { - code = oldcode; /* leave previous item in place */ - if (repeat_max == 1) goto END_REPEAT; - *code++ = OP_UPTO + repeat_type; - PUT2INC(code, 0, repeat_max - 1); - } - } - - /* The case {n,n} is just an EXACT, while the general case {n,m} is - handled as an EXACT followed by an UPTO. */ - - else - { - *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */ - PUT2INC(code, 0, repeat_min); - - /* If the maximum is unlimited, insert an OP_STAR. Before doing so, - we have to insert the character for the previous code. For a repeated - Unicode property match, there are two extra bytes that define the - required property. In UTF-8 mode, long characters have their length in - c, with the 0x80 bit as a flag. */ - - if (repeat_max < 0) - { -#ifdef SUPPORT_UTF8 - if (utf8 && c >= 128) - { - memcpy(code, utf8_char, c & 7); - code += c & 7; - } - else -#endif - { - *code++ = c; - if (prop_type >= 0) - { - *code++ = prop_type; - *code++ = prop_value; - } - } - *code++ = OP_STAR + repeat_type; - } - - /* Else insert an UPTO if the max is greater than the min, again - preceded by the character, for the previously inserted code. If the - UPTO is just for 1 instance, we can use QUERY instead. */ - - else if (repeat_max != repeat_min) - { -#ifdef SUPPORT_UTF8 - if (utf8 && c >= 128) - { - memcpy(code, utf8_char, c & 7); - code += c & 7; - } - else -#endif - *code++ = c; - if (prop_type >= 0) - { - *code++ = prop_type; - *code++ = prop_value; - } - repeat_max -= repeat_min; - - if (repeat_max == 1) - { - *code++ = OP_QUERY + repeat_type; - } - else - { - *code++ = OP_UPTO + repeat_type; - PUT2INC(code, 0, repeat_max); - } - } - } - - /* The character or character type itself comes last in all cases. 
*/ - -#ifdef SUPPORT_UTF8 - if (utf8 && c >= 128) - { - memcpy(code, utf8_char, c & 7); - code += c & 7; - } - else -#endif - *code++ = c; - - /* For a repeated Unicode property match, there are two extra bytes that - define the required property. */ - -#ifdef SUPPORT_UCP - if (prop_type >= 0) - { - *code++ = prop_type; - *code++ = prop_value; - } -#endif - } - - /* If previous was a character class or a back reference, we put the repeat - stuff after it, but just skip the item if the repeat was {0,0}. */ - - else if (*previous == OP_CLASS || - *previous == OP_NCLASS || -#ifdef SUPPORT_UTF8 - *previous == OP_XCLASS || -#endif - *previous == OP_REF) - { - if (repeat_max == 0) - { - code = previous; - goto END_REPEAT; - } - - /* All real repeats make it impossible to handle partial matching (maybe - one day we will be able to remove this restriction). */ - - if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; - - if (repeat_min == 0 && repeat_max == -1) - *code++ = OP_CRSTAR + repeat_type; - else if (repeat_min == 1 && repeat_max == -1) - *code++ = OP_CRPLUS + repeat_type; - else if (repeat_min == 0 && repeat_max == 1) - *code++ = OP_CRQUERY + repeat_type; - else - { - *code++ = OP_CRRANGE + repeat_type; - PUT2INC(code, 0, repeat_min); - if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */ - PUT2INC(code, 0, repeat_max); - } - } - - /* If previous was a bracket group, we may have to replicate it in certain - cases. 
*/ - - else if (*previous == OP_BRA || *previous == OP_CBRA || - *previous == OP_ONCE || *previous == OP_COND) - { - register int i; - int ketoffset = 0; - int len = code - previous; - uschar *bralink = NULL; - - /* Repeating a DEFINE group is pointless */ - - if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF) - { - *errorcodeptr = ERR55; - goto FAILED; - } - - /* If the maximum repeat count is unlimited, find the end of the bracket - by scanning through from the start, and compute the offset back to it - from the current code pointer. There may be an OP_OPT setting following - the final KET, so we can't find the end just by going back from the code - pointer. */ - - if (repeat_max == -1) - { - register uschar *ket = previous; - do ket += GET(ket, 1); while (*ket != OP_KET); - ketoffset = code - ket; - } - - /* The case of a zero minimum is special because of the need to stick - OP_BRAZERO in front of it, and because the group appears once in the - data, whereas in other cases it appears the minimum number of times. For - this reason, it is simplest to treat this case separately, as otherwise - the code gets far too messy. There are several special subcases when the - minimum is zero. */ - - if (repeat_min == 0) - { - /* If the maximum is also zero, we just omit the group from the output - altogether. */ - - if (repeat_max == 0) - { - code = previous; - goto END_REPEAT; - } - - /* If the maximum is 1 or unlimited, we just have to stick in the - BRAZERO and do no more at this point. However, we do need to adjust - any OP_RECURSE calls inside the group that refer to the group itself or - any internal or forward referenced group, because the offset is from - the start of the whole regex. Temporarily terminate the pattern while - doing this. 
*/ - - if (repeat_max <= 1) - { - *code = OP_END; - adjust_recurse(previous, 1, utf8, cd, save_hwm); - memmove(previous+1, previous, len); - code++; - *previous++ = OP_BRAZERO + repeat_type; - } - - /* If the maximum is greater than 1 and limited, we have to replicate - in a nested fashion, sticking OP_BRAZERO before each set of brackets. - The first one has to be handled carefully because it's the original - copy, which has to be moved up. The remainder can be handled by code - that is common with the non-zero minimum case below. We have to - adjust the value or repeat_max, since one less copy is required. Once - again, we may have to adjust any OP_RECURSE calls inside the group. */ - - else - { - int offset; - *code = OP_END; - adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm); - memmove(previous + 2 + LINK_SIZE, previous, len); - code += 2 + LINK_SIZE; - *previous++ = OP_BRAZERO + repeat_type; - *previous++ = OP_BRA; - - /* We chain together the bracket offset fields that have to be - filled in later when the ends of the brackets are reached. */ - - offset = (bralink == NULL)? 0 : previous - bralink; - bralink = previous; - PUTINC(previous, 0, offset); - } - - repeat_max--; - } - - /* If the minimum is greater than zero, replicate the group as many - times as necessary, and adjust the maximum to the number of subsequent - copies that we need. If we set a first char from the group, and didn't - set a required char, copy the latter from the former. If there are any - forward reference subroutine calls in the group, there will be entries on - the workspace list; replicate these with an appropriate increment. */ - - else - { - if (repeat_min > 1) - { - /* In the pre-compile phase, we don't actually do the replication. We - just adjust the length as if we had. Do some paranoid checks for - potential integer overflow. 
*/ - - if (lengthptr != NULL) - { - int delta = (repeat_min - 1)*length_prevgroup; - if ((pika_float)(repeat_min - 1)*(pika_float)length_prevgroup > - (pika_float)INT_MAX || - OFLOW_MAX - *lengthptr < delta) - { - *errorcodeptr = ERR20; - goto FAILED; - } - *lengthptr += delta; - } - - /* This is compiling for real */ - - else - { - if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte; - for (i = 1; i < repeat_min; i++) - { - uschar *hc; - uschar *this_hwm = cd->hwm; - memcpy(code, previous, len); - for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE) { - PUT(cd->hwm, 0, GET(hc, 0) + len); - cd->hwm += LINK_SIZE; - } - save_hwm = this_hwm; - code += len; - } - } - } + zeroreqbyte = reqbyte; - if (repeat_max > 0) repeat_max -= repeat_min; - } + /* The OP_NOT opcode works on one-byte characters only. */ - /* This code is common to both the zero and non-zero minimum cases. If - the maximum is limited, it replicates the group in a nested fashion, - remembering the bracket starts on a stack. In the case of a zero minimum, - the first one was set up above. In all cases the repeat_max now specifies - the number of additional copies needed. Again, we must remember to - replicate entries on the forward reference list. */ - - if (repeat_max >= 0) - { - /* In the pre-compile phase, we don't actually do the replication. We - just adjust the length as if we had. For each repetition we must add 1 - to the length for BRAZERO and for all but the last repetition we must - add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some - paranoid checks to avoid integer overflow. 
*/ - - if (lengthptr != NULL && repeat_max > 0) - { - int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) - - 2 - 2*LINK_SIZE; /* Last one doesn't nest */ - if ((pika_float)repeat_max * - (pika_float)(length_prevgroup + 1 + 2 + 2*LINK_SIZE) - > (pika_float)INT_MAX || - OFLOW_MAX - *lengthptr < delta) - { - *errorcodeptr = ERR20; - goto FAILED; - } - *lengthptr += delta; - } - - /* This is compiling for real */ - - else for (i = repeat_max - 1; i >= 0; i--) - { - uschar *hc; - uschar *this_hwm = cd->hwm; - - *code++ = OP_BRAZERO + repeat_type; - - /* All but the final copy start a new nesting, maintaining the - chain of brackets outstanding. */ - - if (i != 0) - { - int offset; - *code++ = OP_BRA; - offset = (bralink == NULL)? 0 : code - bralink; - bralink = code; - PUTINC(code, 0, offset); - } - - memcpy(code, previous, len); - for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE) - { - PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1)); - cd->hwm += LINK_SIZE; - } - save_hwm = this_hwm; - code += len; - } - - /* Now chain through the pending brackets, and fill in their length - fields (which are holding the chain links pro tem). */ - - while (bralink != NULL) - { - int oldlinkoffset; - int offset = code - bralink + 1; - uschar *bra = code - offset; - oldlinkoffset = GET(bra, 1); - bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset; - *code++ = OP_KET; - PUTINC(code, 0, offset); - PUT(bra, 1, offset); - } - } - - /* If the maximum is unlimited, set a repeater in the final copy. We - can't just offset backwards from the current code point, because we - don't know if there's been an options resetting after the ket. The - correct offset was computed above. - - Then, when we are doing the actual compile phase, check to see whether - this group is a non-atomic one that could match an empty string. If so, - convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so - that runtime checking can be done. 
[This check is also applied to - atomic groups at runtime, but in a different way.] */ - - else - { - uschar *ketcode = code - ketoffset; - uschar *bracode = ketcode - GET(ketcode, 1); - *ketcode = OP_KETRMAX + repeat_type; - if (lengthptr == NULL && *bracode != OP_ONCE) - { - uschar *scode = bracode; - do - { - if (could_be_empty_branch(scode, ketcode, utf8)) - { - *bracode += OP_SBRA - OP_BRA; - break; - } - scode += GET(scode, 1); - } - while (*scode == OP_ALT); - } - } - } - - /* Else there's some kind of shambles */ - - else - { - *errorcodeptr = ERR11; - goto FAILED; - } - - /* If the character following a repeat is '+', or if certain optimization - tests above succeeded, possessive_quantifier is TRUE. For some of the - simpler opcodes, there is an special alternative opcode for this. For - anything else, we wrap the entire repeated item inside OP_ONCE brackets. - The '+' notation is just syntactic sugar, taken from Sun's Java package, - but the special opcodes can optimize it a bit. The repeated item starts at - tempcode, not at previous, which might be the first part of a string whose - (former) last char we repeated. - - Possessifying an 'exact' quantifier has no effect, so we can ignore it. But - an 'upto' may follow. We skip over an 'exact' item, and then test the - length of what remains before proceeding. */ - - if (possessive_quantifier) - { - int len; - if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT || - *tempcode == OP_NOTEXACT) - tempcode += _pcre_OP_lengths[*tempcode] + - ((*tempcode == OP_TYPEEXACT && - (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 
2:0); - len = code - tempcode; - if (len > 0) switch (*tempcode) - { - case OP_STAR: *tempcode = OP_POSSTAR; break; - case OP_PLUS: *tempcode = OP_POSPLUS; break; - case OP_QUERY: *tempcode = OP_POSQUERY; break; - case OP_UPTO: *tempcode = OP_POSUPTO; break; - - case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break; - case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break; - case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break; - case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break; - - case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break; - case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break; - case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break; - case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break; - - default: - memmove(tempcode + 1+LINK_SIZE, tempcode, len); - code += 1 + LINK_SIZE; - len += 1 + LINK_SIZE; - tempcode[0] = OP_ONCE; - *code++ = OP_KET; - PUTINC(code, 0, len); - PUT(tempcode, 1, len); - break; - } - } - - /* In all case we no longer have a previous item. We also set the - "follows varying string" flag for subsequently encountered reqbytes if - it isn't already set and we have just passed a varying length item. */ - - END_REPEAT: - previous = NULL; - cd->req_varyopt |= reqvary; - break; - - - /* ===================================================================*/ - /* Start of nested parenthesized sub-expression, or comment or lookahead or - lookbehind or option setting or condition or all the other extended - parenthesis forms. */ - - case '(': - newoptions = options; - skipbytes = 0; - bravalue = OP_CBRA; - save_hwm = cd->hwm; - reset_bracount = FALSE; - - /* First deal with various "verbs" that can be introduced by '*'. 
*/ - - if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0) - { - int i, namelen; - const char *vn = verbnames; - const uschar *name = ++ptr; - previous = NULL; - while ((cd->ctypes[*++ptr] & ctype_letter) != 0); - if (*ptr == ':') - { - *errorcodeptr = ERR59; /* Not supported */ - goto FAILED; - } - if (*ptr != ')') - { - *errorcodeptr = ERR60; - goto FAILED; - } - namelen = ptr - name; - for (i = 0; i < verbcount; i++) - { - if (namelen == verbs[i].len && - strncmp((char *)name, vn, namelen) == 0) - { - *code = verbs[i].op; - if (*code++ == OP_ACCEPT) cd->had_accept = TRUE; - break; - } - vn += verbs[i].len + 1; - } - if (i < verbcount) continue; - *errorcodeptr = ERR60; - goto FAILED; - } - - /* Deal with the extended parentheses; all are introduced by '?', and the - appearance of any of them means that this is not a capturing group. */ - - else if (*ptr == '?') - { - int i, set, unset, namelen; - int *optset; - const uschar *name; - uschar *slot; - - switch (*(++ptr)) - { - case '#': /* Comment; skip to ket */ - ptr++; - while (*ptr != 0 && *ptr != ')') ptr++; - if (*ptr == 0) - { - *errorcodeptr = ERR18; - goto FAILED; - } - continue; - - - /* ------------------------------------------------------------ */ - case '|': /* Reset capture count for each branch */ - reset_bracount = TRUE; - /* Fall through */ - - /* ------------------------------------------------------------ */ - case ':': /* Non-capturing bracket */ - bravalue = OP_BRA; - ptr++; - break; - - - /* ------------------------------------------------------------ */ - case '(': - bravalue = OP_COND; /* Conditional group */ - - /* A condition can be an assertion, a number (referring to a numbered - group), a name (referring to a named group), or 'R', referring to - recursion. R and R&name are also permitted for recursion tests. - - There are several syntaxes for testing a named group: (?(name)) is used - by Python; Perl 5.10 onwards uses (?() or (?('name')). 
- - There are two unfortunate ambiguities, caused by history. (a) 'R' can - be the recursive thing or the name 'R' (and similarly for 'R' followed - by digits), and (b) a number could be a name that consists of digits. - In both cases, we look for a name first; if not found, we try the other - cases. */ - - /* For conditions that are assertions, check the syntax, and then exit - the switch. This will take control down to where bracketed groups, - including assertions, are processed. */ - - if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<')) - break; - - /* Most other conditions use OP_CREF (a couple change to OP_RREF - below), and all need to skip 3 bytes at the start of the group. */ - - code[1+LINK_SIZE] = OP_CREF; - skipbytes = 3; - refsign = -1; - - /* Check for a test for recursion in a named group. */ - - if (ptr[1] == 'R' && ptr[2] == '&') - { - terminator = -1; - ptr += 2; - code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */ - } - - /* Check for a test for a named group's having been set, using the Perl - syntax (?() or (?('name') */ - - else if (ptr[1] == '<') - { - terminator = '>'; - ptr++; - } - else if (ptr[1] == '\'') - { - terminator = '\''; - ptr++; - } - else - { - terminator = 0; - if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr); - } - - /* We now expect to read a name; any thing else is an error */ - - if ((cd->ctypes[ptr[1]] & ctype_word) == 0) - { - ptr += 1; /* To get the right offset */ - *errorcodeptr = ERR28; - goto FAILED; - } - - /* Read the name, but also get it as a number if it's all digits */ - - recno = 0; - name = ++ptr; - while ((cd->ctypes[*ptr] & ctype_word) != 0) - { - if (recno >= 0) - recno = ((digitab[*ptr] & ctype_digit) != 0)? - recno * 10 + *ptr - '0' : -1; - ptr++; - } - namelen = ptr - name; - - if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')') - { - ptr--; /* Error offset */ - *errorcodeptr = ERR26; - goto FAILED; - } - - /* Do no further checking in the pre-compile phase. 
*/ - - if (lengthptr != NULL) break; - - /* In the real compile we do the work of looking for the actual - reference. If the string started with "+" or "-" we require the rest to - be digits, in which case recno will be set. */ - - if (refsign > 0) - { - if (recno <= 0) - { - *errorcodeptr = ERR58; - goto FAILED; - } - recno = (refsign == '-')? - cd->bracount - recno + 1 : recno +cd->bracount; - if (recno <= 0 || recno > cd->final_bracount) - { - *errorcodeptr = ERR15; - goto FAILED; - } - PUT2(code, 2+LINK_SIZE, recno); - break; - } - - /* Otherwise (did not start with "+" or "-"), start by looking for the - name. */ - - slot = cd->name_table; - for (i = 0; i < cd->names_found; i++) - { - if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break; - slot += cd->name_entry_size; - } - - /* Found a previous named subpattern */ - - if (i < cd->names_found) - { - recno = GET2(slot, 0); - PUT2(code, 2+LINK_SIZE, recno); - } - - /* Search the pattern for a forward reference */ - - else if ((i = find_parens(ptr, cd->bracount, name, namelen, - (options & PCRE_EXTENDED) != 0)) > 0) - { - PUT2(code, 2+LINK_SIZE, i); - } - - /* If terminator == 0 it means that the name followed directly after - the opening parenthesis [e.g. (?(abc)...] and in this case there are - some further alternatives to try. For the cases where terminator != 0 - [things like (?(... or (?('name')... or (?(R&name)... ] we have - now checked all the possibilities, so give an error. */ - - else if (terminator != 0) - { - *errorcodeptr = ERR15; - goto FAILED; - } - - /* Check for (?(R) for recursion. Allow digits after R to specify a - specific group number. 
*/ - - else if (*name == 'R') - { - recno = 0; - for (i = 1; i < namelen; i++) - { - if ((digitab[name[i]] & ctype_digit) == 0) - { - *errorcodeptr = ERR15; - goto FAILED; - } - recno = recno * 10 + name[i] - '0'; - } - if (recno == 0) recno = RREF_ANY; - code[1+LINK_SIZE] = OP_RREF; /* Change test type */ - PUT2(code, 2+LINK_SIZE, recno); - } - - /* Similarly, check for the (?(DEFINE) "condition", which is always - false. */ - - else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0) - { - code[1+LINK_SIZE] = OP_DEF; - skipbytes = 1; - } - - /* Check for the "name" actually being a subpattern number. We are - in the second pass here, so final_bracount is set. */ - - else if (recno > 0 && recno <= cd->final_bracount) - { - PUT2(code, 2+LINK_SIZE, recno); - } - - /* Either an unidentified subpattern, or a reference to (?(0) */ - - else - { - *errorcodeptr = (recno == 0)? ERR35: ERR15; - goto FAILED; - } - break; - - - /* ------------------------------------------------------------ */ - case '=': /* Positive lookahead */ - bravalue = OP_ASSERT; - ptr++; - break; - - - /* ------------------------------------------------------------ */ - case '!': /* Negative lookahead */ - ptr++; - if (*ptr == ')') /* Optimize (?!) 
*/ - { - *code++ = OP_FAIL; - previous = NULL; - continue; - } - bravalue = OP_ASSERT_NOT; - break; - - - /* ------------------------------------------------------------ */ - case '<': /* Lookbehind or named define */ - switch (ptr[1]) - { - case '=': /* Positive lookbehind */ - bravalue = OP_ASSERTBACK; - ptr += 2; - break; - - case '!': /* Negative lookbehind */ - bravalue = OP_ASSERTBACK_NOT; - ptr += 2; - break; - - default: /* Could be name define, else bad */ - if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME; - ptr++; /* Correct offset for error */ - *errorcodeptr = ERR24; - goto FAILED; - } - break; - - - /* ------------------------------------------------------------ */ - case '>': /* One-time brackets */ - bravalue = OP_ONCE; - ptr++; - break; - - - /* ------------------------------------------------------------ */ - case 'C': /* Callout - may be followed by digits; */ - previous_callout = code; /* Save for later completion */ - after_manual_callout = 1; /* Skip one item before completing */ - *code++ = OP_CALLOUT; - { - int n = 0; - while ((digitab[*(++ptr)] & ctype_digit) != 0) - n = n * 10 + *ptr - '0'; - if (*ptr != ')') - { - *errorcodeptr = ERR39; - goto FAILED; - } - if (n > 255) - { - *errorcodeptr = ERR38; - goto FAILED; - } - *code++ = n; - PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */ - PUT(code, LINK_SIZE, 0); /* Default length */ - code += 2 * LINK_SIZE; - } - previous = NULL; - continue; - - - /* ------------------------------------------------------------ */ - case 'P': /* Python-style named subpattern handling */ - if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */ - { - is_recurse = *ptr == '>'; - terminator = ')'; - goto NAMED_REF_OR_RECURSE; - } - else if (*ptr != '<') /* Test for Python-style definition */ - { - *errorcodeptr = ERR41; - goto FAILED; - } - /* Fall through to handle (?P< as (?< is handled */ - - - /* ------------------------------------------------------------ */ - DEFINE_NAME: 
/* Come here from (?< handling */ - case '\'': - { - terminator = (*ptr == '<')? '>' : '\''; - name = ++ptr; - - while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++; - namelen = ptr - name; - - /* In the pre-compile phase, just do a syntax check. */ - - if (lengthptr != NULL) - { - if (*ptr != terminator) - { - *errorcodeptr = ERR42; - goto FAILED; - } - if (cd->names_found >= MAX_NAME_COUNT) - { - *errorcodeptr = ERR49; - goto FAILED; - } - if (namelen + 3 > cd->name_entry_size) - { - cd->name_entry_size = namelen + 3; - if (namelen > MAX_NAME_SIZE) - { - *errorcodeptr = ERR48; - goto FAILED; - } - } - } - - /* In the real compile, create the entry in the table */ - - else - { - slot = cd->name_table; - for (i = 0; i < cd->names_found; i++) - { - int crc = memcmp(name, slot+2, namelen); - if (crc == 0) - { - if (slot[2+namelen] == 0) - { - if ((options & PCRE_DUPNAMES) == 0) - { - *errorcodeptr = ERR43; - goto FAILED; + if (negate_class) { + if (firstbyte == REQ_UNSET) + firstbyte = REQ_NONE; + zerofirstbyte = firstbyte; + *code++ = OP_NOT; + *code++ = class_lastchar; + break; } - } - else crc = -1; /* Current name is substring */ + + /* For a single, positive character, get the value into + mcbuffer, and then we can handle this with the normal + one-character code. */ + +#ifdef SUPPORT_UTF8 + if (utf8 && class_lastchar > 127) + mclength = _pcre_ord2utf8(class_lastchar, mcbuffer); + else +#endif + { + mcbuffer[0] = class_lastchar; + mclength = 1; + } + goto ONE_CHAR; + } /* End of 1-char optimization */ + + /* The general case - not the one-char optimization. If this is + the first thing in the branch, there can be no first char + setting, whatever the repeat count. Any reqbyte setting must + remain unchanged after any kind of repeat. 
*/ + + if (firstbyte == REQ_UNSET) + firstbyte = REQ_NONE; + zerofirstbyte = firstbyte; + zeroreqbyte = reqbyte; + + /* If there are characters with values > 255, we have to compile + an extended class, with its own opcode, unless there was a + negated special such as \S in the class, because in that case + all characters > 255 are in the class, so any that were + explicitly given as well can be ignored. If (when there are + explicit characters > 255 that must be listed) there are no + characters < 256, we can omit the bitmap in the actual compiled + code. */ + +#ifdef SUPPORT_UTF8 + if (class_utf8 && !should_flip_negation) { + *class_utf8data++ = + XCL_END; /* Marks the end of extra data */ + *code++ = OP_XCLASS; + code += LINK_SIZE; + *code = negate_class ? XCL_NOT : 0; + + /* If the map is required, move up the extra data to make + room for it; otherwise just move the code pointer to the end + of the extra data. */ + + if (class_charcount > 0) { + *code++ |= XCL_MAP; + memmove(code + 32, code, class_utf8data - code); + memcpy(code, classbits, 32); + code = class_utf8data + 32; + } else + code = class_utf8data; + + /* Now fill in the complete length of the item */ + + PUT(previous, 1, code - previous); + break; /* End of class handling */ } - if (crc < 0) - { - memmove(slot + cd->name_entry_size, slot, - (cd->names_found - i) * cd->name_entry_size); +#endif + + /* If there are no characters > 255, set the opcode to OP_CLASS + or OP_NCLASS, depending on whether the whole class was negated + and whether there were negative specials such as \S in the + class. Then copy the 32-byte map into the code vector, negating + it if necessary. */ + + *code++ = (negate_class == should_flip_negation) ? 
OP_CLASS + : OP_NCLASS; + if (negate_class) { + if (lengthptr == + NULL) /* Save time in the pre-compile phase */ + for (c = 0; c < 32; c++) + code[c] = ~classbits[c]; + } else { + memcpy(code, classbits, 32); + } + code += 32; break; + + /* ===================================================================*/ + /* Various kinds of repeat; '{' is not necessarily a quantifier, + but this has been tested above. */ + + case '{': + if (!is_quantifier) + goto NORMAL_CHAR; + ptr = read_repeat_counts(ptr + 1, &repeat_min, &repeat_max, + errorcodeptr); + if (*errorcodeptr != 0) + goto FAILED; + goto REPEAT; + + case '*': + repeat_min = 0; + repeat_max = -1; + goto REPEAT; + + case '+': + repeat_min = 1; + repeat_max = -1; + goto REPEAT; + + case '?': + repeat_min = 0; + repeat_max = 1; + + REPEAT: + if (previous == NULL) { + *errorcodeptr = ERR9; + goto FAILED; } - slot += cd->name_entry_size; - } - PUT2(slot, 0, cd->bracount + 1); - memcpy(slot + 2, name, namelen); - slot[2+namelen] = 0; - } - } - - /* In both cases, count the number of names we've encountered. */ - - ptr++; /* Move past > or ' */ - cd->names_found++; - goto NUMBERED_GROUP; - - - /* ------------------------------------------------------------ */ - case '&': /* Perl recursion/subroutine syntax */ - terminator = ')'; - is_recurse = TRUE; - /* Fall through */ - - /* We come here from the Python syntax above that handles both - references (?P=name) and recursion (?P>name), as well as falling - through from the Perl recursion syntax (?&name). We also come here from - the Perl \k or \k'name' back reference syntax and the \k{name} - .NET syntax. */ - - NAMED_REF_OR_RECURSE: - name = ++ptr; - while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++; - namelen = ptr - name; - - /* In the pre-compile phase, do a syntax check and set a dummy - reference number. 
*/ - - if (lengthptr != NULL) - { - if (namelen == 0) - { - *errorcodeptr = ERR62; - goto FAILED; - } - if (*ptr != terminator) - { - *errorcodeptr = ERR42; - goto FAILED; - } - if (namelen > MAX_NAME_SIZE) - { - *errorcodeptr = ERR48; - goto FAILED; - } - recno = 0; - } - - /* In the real compile, seek the name in the table. We check the name - first, and then check that we have reached the end of the name in the - table. That way, if the name that is longer than any in the table, - the comparison will fail without reading beyond the table entry. */ - - else - { - slot = cd->name_table; - for (i = 0; i < cd->names_found; i++) - { - if (strncmp((char *)name, (char *)slot+2, namelen) == 0 && - slot[2+namelen] == 0) - break; - slot += cd->name_entry_size; - } - - if (i < cd->names_found) /* Back reference */ - { - recno = GET2(slot, 0); - } - else if ((recno = /* Forward back reference */ - find_parens(ptr, cd->bracount, name, namelen, - (options & PCRE_EXTENDED) != 0)) <= 0) - { - *errorcodeptr = ERR15; - goto FAILED; - } - } - - /* In both phases, we can now go to the code than handles numerical - recursion or backreferences. 
*/ - - if (is_recurse) goto HANDLE_RECURSION; - else goto HANDLE_REFERENCE; - - - /* ------------------------------------------------------------ */ - case 'R': /* Recursion */ - ptr++; /* Same as (?0) */ - /* Fall through */ - - - /* ------------------------------------------------------------ */ - case '-': case '+': - case '0': case '1': case '2': case '3': case '4': /* Recursion or */ - case '5': case '6': case '7': case '8': case '9': /* subroutine */ - { - const uschar *called; - - if ((refsign = *ptr) == '+') - { - ptr++; - if ((digitab[*ptr] & ctype_digit) == 0) - { - *errorcodeptr = ERR63; - goto FAILED; - } - } - else if (refsign == '-') - { - if ((digitab[ptr[1]] & ctype_digit) == 0) - goto OTHER_CHAR_AFTER_QUERY; - ptr++; - } - - recno = 0; - while((digitab[*ptr] & ctype_digit) != 0) - recno = recno * 10 + *ptr++ - '0'; - - if (*ptr != ')') - { - *errorcodeptr = ERR29; - goto FAILED; - } - - if (refsign == '-') - { - if (recno == 0) - { - *errorcodeptr = ERR58; - goto FAILED; - } - recno = cd->bracount - recno + 1; - if (recno <= 0) - { - *errorcodeptr = ERR15; - goto FAILED; - } - } - else if (refsign == '+') - { - if (recno == 0) - { - *errorcodeptr = ERR58; - goto FAILED; - } - recno += cd->bracount; - } - - /* Come here from code above that handles a named recursion */ - - HANDLE_RECURSION: - - previous = code; - called = cd->start_code; - - /* When we are actually compiling, find the bracket that is being - referenced. Temporarily end the regex in case it doesn't exist before - this point. If we end up with a forward reference, first check that - the bracket does occur later so we can give the error (and position) - now. Then remember this forward reference in the workspace so it can - be filled in at the end. 
*/ - - if (lengthptr == NULL) - { - *code = OP_END; - if (recno != 0) called = find_bracket(cd->start_code, utf8, recno); - - /* Forward reference */ - - if (called == NULL) - { - if (find_parens(ptr, cd->bracount, NULL, recno, - (options & PCRE_EXTENDED) != 0) < 0) - { - *errorcodeptr = ERR15; - goto FAILED; + if (repeat_min == 0) { + firstbyte = zerofirstbyte; /* Adjust for zero repeat */ + reqbyte = zeroreqbyte; /* Ditto */ } - called = cd->start_code + recno; - PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code); - } - - /* If not a forward reference, and the subpattern is still open, - this is a recursive call. We check to see if this is a left - recursion that could loop for ever, and diagnose that case. */ - - else if (GET(called, 1) == 0 && - could_be_empty(called, code, bcptr, utf8)) - { - *errorcodeptr = ERR40; - goto FAILED; - } - } - - /* Insert the recursion/subroutine item, automatically wrapped inside - "once" brackets. Set up a "previous group" length so that a - subsequent quantifier will work. 
*/ - - *code = OP_ONCE; - PUT(code, 1, 2 + 2*LINK_SIZE); - code += 1 + LINK_SIZE; - - *code = OP_RECURSE; - PUT(code, 1, called - cd->start_code); - code += 1 + LINK_SIZE; - - *code = OP_KET; - PUT(code, 1, 2 + 2*LINK_SIZE); - code += 1 + LINK_SIZE; - - length_prevgroup = 3 + 3*LINK_SIZE; - } - - /* Can't determine a first byte now */ - - if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; - continue; - - - /* ------------------------------------------------------------ */ - default: /* Other characters: check option setting */ - OTHER_CHAR_AFTER_QUERY: - set = unset = 0; - optset = &set; - - while (*ptr != ')' && *ptr != ':') - { - switch (*ptr++) - { - case '-': optset = &unset; break; - - case 'J': /* Record that it changed in the external options */ - *optset |= PCRE_DUPNAMES; - cd->external_flags |= PCRE_JCHANGED; - break; - - case 'i': *optset |= PCRE_CASELESS; break; - case 'm': *optset |= PCRE_MULTILINE; break; - case 's': *optset |= PCRE_DOTALL; break; - case 'x': *optset |= PCRE_EXTENDED; break; - case 'U': *optset |= PCRE_UNGREEDY; break; - case 'X': *optset |= PCRE_EXTRA; break; - - default: *errorcodeptr = ERR12; - ptr--; /* Correct the offset */ - goto FAILED; - } - } - - /* Set up the changed option bits, but don't change anything yet. */ - - newoptions = (options | set) & (~unset); - - /* If the options ended with ')' this is not the start of a nested - group with option changes, so the options change at this level. If this - item is right at the start of the pattern, the options can be - abstracted and made external in the pre-compile phase, and ignored in - the compile phase. This can be helpful when matching -- for instance in - caseless checking of required bytes. - - If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are - definitely *not* at the start of the pattern because something has been - compiled. 
In the pre-compile phase, however, the code pointer can have - that value after the start, because it gets reset as code is discarded - during the pre-compile. However, this can happen only at top level - if - we are within parentheses, the starting BRA will still be present. At - any parenthesis level, the length value can be used to test if anything - has been compiled at that level. Thus, a test for both these conditions - is necessary to ensure we correctly detect the start of the pattern in - both phases. - - If we are not at the pattern start, compile code to change the ims - options if this setting actually changes any of them. We also pass the - new setting back so that it can be put at the start of any following - branches, and when this group ends (if we are in a group), a resetting - item can be compiled. */ - - if (*ptr == ')') - { - if (code == cd->start_code + 1 + LINK_SIZE && - (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE)) - { - cd->external_options = newoptions; - options = newoptions; - } - else - { - if ((options & PCRE_IMS) != (newoptions & PCRE_IMS)) - { - *code++ = OP_OPT; - *code++ = newoptions & PCRE_IMS; - } - - /* Change options at this level, and pass them back for use - in subsequent branches. Reset the greedy defaults and the case - value for firstbyte and reqbyte. */ - - *optionsptr = options = newoptions; - greedy_default = ((newoptions & PCRE_UNGREEDY) != 0); - greedy_non_default = greedy_default ^ 1; - req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0; - } - - previous = NULL; /* This item can't be repeated */ - continue; /* It is complete */ - } - - /* If the options ended with ':' we are heading into a nested group - with possible change of options. Such groups are non-capturing and are - not assertions of any kind. All we need to do is skip over the ':'; - the newoptions value is handled below. */ - - bravalue = OP_BRA; - ptr++; - } /* End of switch for character following (? */ - } /* End of (? 
handling */ - - /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set, - all unadorned brackets become non-capturing and behave like (?:...) - brackets. */ - - else if ((options & PCRE_NO_AUTO_CAPTURE) != 0) - { - bravalue = OP_BRA; - } - - /* Else we have a capturing group. */ - - else - { - NUMBERED_GROUP: - cd->bracount += 1; - PUT2(code, 1+LINK_SIZE, cd->bracount); - skipbytes = 2; - } - - /* Process nested bracketed regex. Assertions may not be repeated, but - other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a - non-register variable in order to be able to pass its address because some - compilers complain otherwise. Pass in a new setting for the ims options if - they have changed. */ - - previous = (bravalue >= OP_ONCE)? code : NULL; - *code = bravalue; - tempcode = code; - tempreqvary = cd->req_varyopt; /* Save value before bracket */ - length_prevgroup = 0; /* Initialize for pre-compile phase */ - - if (!compile_regex( - newoptions, /* The complete new option state */ - options & PCRE_IMS, /* The previous ims option state */ - &tempcode, /* Where to put code (updated) */ - &ptr, /* Input pointer (updated) */ - errorcodeptr, /* Where to put an error message */ - (bravalue == OP_ASSERTBACK || - bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */ - reset_bracount, /* True if (?| group */ - skipbytes, /* Skip over bracket number */ - &subfirstbyte, /* For possible first char */ - &subreqbyte, /* For possible last char */ - bcptr, /* Current branch chain */ - cd, /* Tables block */ - (lengthptr == NULL)? NULL : /* Actual compile phase */ - &length_prevgroup /* Pre-compile phase */ - )) - goto FAILED; - - /* At the end of compiling, code is still pointing to the start of the - group, while tempcode has been updated to point past the end of the group - and any option resetting that may follow it. The pattern pointer (ptr) - is on the bracket. 
*/ - - /* If this is a conditional bracket, check that there are no more than - two branches in the group, or just one if it's a DEFINE group. We do this - in the real compile phase, not in the pre-pass, where the whole group may - not be available. */ - - if (bravalue == OP_COND && lengthptr == NULL) - { - uschar *tc = code; - int condcount = 0; - - do { - condcount++; - tc += GET(tc,1); - } - while (*tc != OP_KET); - - /* A DEFINE group is never obeyed inline (the "condition" is always - false). It must have only one branch. */ - - if (code[LINK_SIZE+1] == OP_DEF) - { - if (condcount > 1) - { - *errorcodeptr = ERR54; - goto FAILED; - } - bravalue = OP_DEF; /* Just a flag to suppress char handling below */ - } - - /* A "normal" conditional group. If there is just one branch, we must not - make use of its firstbyte or reqbyte, because this is equivalent to an - empty second branch. */ - - else - { - if (condcount > 2) - { - *errorcodeptr = ERR27; - goto FAILED; - } - if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE; - } - } - - /* Error if hit end of pattern */ - - if (*ptr != ')') - { - *errorcodeptr = ERR14; - goto FAILED; - } - - /* In the pre-compile phase, update the length by the length of the group, - less the brackets at either end. Then reduce the compiled code to just a - set of non-capturing brackets so that it doesn't use much memory if it is - duplicated by a quantifier.*/ - - if (lengthptr != NULL) - { - if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE) - { - *errorcodeptr = ERR20; - goto FAILED; - } - *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE; - *code++ = OP_BRA; - PUTINC(code, 0, 1 + LINK_SIZE); - *code++ = OP_KET; - PUTINC(code, 0, 1 + LINK_SIZE); - break; /* No need to waste time with special character handling */ - } - - /* Otherwise update the main code pointer to the end of the group. */ - - code = tempcode; - - /* For a DEFINE group, required and first character settings are not - relevant. 
*/ - - if (bravalue == OP_DEF) break; - - /* Handle updating of the required and first characters for other types of - group. Update for normal brackets of all kinds, and conditions with two - branches (see code above). If the bracket is followed by a quantifier with - zero repeat, we have to back off. Hence the definition of zeroreqbyte and - zerofirstbyte outside the main loop so that they can be accessed for the - back off. */ - - zeroreqbyte = reqbyte; - zerofirstbyte = firstbyte; - groupsetfirstbyte = FALSE; - - if (bravalue >= OP_ONCE) - { - /* If we have not yet set a firstbyte in this branch, take it from the - subpattern, remembering that it was set here so that a repeat of more - than one can replicate it as reqbyte if necessary. If the subpattern has - no firstbyte, set "none" for the whole branch. In both cases, a zero - repeat forces firstbyte to "none". */ - - if (firstbyte == REQ_UNSET) - { - if (subfirstbyte >= 0) - { - firstbyte = subfirstbyte; - groupsetfirstbyte = TRUE; - } - else firstbyte = REQ_NONE; - zerofirstbyte = REQ_NONE; - } - - /* If firstbyte was previously set, convert the subpattern's firstbyte - into reqbyte if there wasn't one, using the vary flag that was in - existence beforehand. */ - - else if (subfirstbyte >= 0 && subreqbyte < 0) - subreqbyte = subfirstbyte | tempreqvary; - - /* If the subpattern set a required byte (or set a first byte that isn't - really the first byte - see above), set it. */ - - if (subreqbyte >= 0) reqbyte = subreqbyte; - } - - /* For a forward assertion, we take the reqbyte, if set. This can be - helpful if the pattern that follows the assertion doesn't set a different - char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte - for an assertion, however because it leads to incorrect effect for patterns - such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead - of a firstbyte. 
This is overcome by a scan at the end if there's no - firstbyte, looking for an asserted first char. */ - - else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte; - break; /* End of processing '(' */ - - - /* ===================================================================*/ - /* Handle metasequences introduced by \. For ones like \d, the ESC_ values - are arranged to be the negation of the corresponding OP_values. For the - back references, the values are ESC_REF plus the reference number. Only - back references and those types that consume a character may be repeated. - We can test for values between ESC_b and ESC_Z for the latter; this may - have to change if any new ones are ever created. */ - - case '\\': - tempptr = ptr; - c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE); - if (*errorcodeptr != 0) goto FAILED; - - if (c < 0) - { - if (-c == ESC_Q) /* Handle start of quoted string */ - { - if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */ - else inescq = TRUE; - continue; - } - - if (-c == ESC_E) continue; /* Perl ignores an orphan \E */ - - /* For metasequences that actually match a character, we disable the - setting of a first character if it hasn't already been set. */ - - if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z) - firstbyte = REQ_NONE; - - /* Set values to reset to if this is followed by a zero repeat. */ - - zerofirstbyte = firstbyte; - zeroreqbyte = reqbyte; - - /* \k or \k'name' is a back reference by name (Perl syntax). - We also support \k{name} (.NET syntax) */ - - if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{')) - { - is_recurse = FALSE; - terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}'; - goto NAMED_REF_OR_RECURSE; - } - - /* Back references are handled specially; must disable firstbyte if - not set to cope with cases like (?=(\w+))\1: which would otherwise set - ':' later. 
*/ - - if (-c >= ESC_REF) - { - recno = -c - ESC_REF; - - HANDLE_REFERENCE: /* Come here from named backref handling */ - if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; - previous = code; - *code++ = OP_REF; - PUT2INC(code, 0, recno); - cd->backref_map |= (recno < 32)? (1 << recno) : 1; - if (recno > cd->top_backref) cd->top_backref = recno; - } - - /* So are Unicode property matches, if supported. */ + + /* Remember whether this is a variable length repeat */ + + reqvary = (repeat_min == repeat_max) ? 0 : REQ_VARY; + + op_type = 0; /* Default single-char op codes */ + possessive_quantifier = + FALSE; /* Default not possessive quantifier */ + + /* Save start of previous item, in case we have to move it up to + make space for an inserted OP_ONCE for the additional '+' + extension. */ + + tempcode = previous; + + /* If the next character is '+', we have a possessive + quantifier. This implies greediness, whatever the setting of the + PCRE_UNGREEDY option. If the next character is '?' this is a + minimizing repeat, by default, but if PCRE_UNGREEDY is set, it + works the other way round. We change the repeat type to the + non-default. */ + + if (ptr[1] == '+') { + repeat_type = 0; /* Force greedy */ + possessive_quantifier = TRUE; + ptr++; + } else if (ptr[1] == '?') { + repeat_type = greedy_non_default; + ptr++; + } else + repeat_type = greedy_default; + + /* If previous was a character match, abolish the item and + generate a repeat item instead. If a char item has a minumum of + more than one, ensure that it is set in reqbyte - it might not + be if a sequence such as x{3} is the first thing in a branch + because the x will have gone into firstbyte instead. */ + + if (*previous == OP_CHAR || *previous == OP_CHARNC) { + /* Deal with UTF-8 characters that take up more than one + byte. It's easier to write this out separately than try to + macrify it. 
Use c to hold the length of the character in + bytes, plus 0x80 to flag that it's a length rather than a + small character. */ + +#ifdef SUPPORT_UTF8 + if (utf8 && (code[-1] & 0x80) != 0) { + uschar* lastchar = code - 1; + while ((*lastchar & 0xc0) == 0x80) + lastchar--; + c = code - lastchar; /* Length of UTF-8 character */ + memcpy(utf8_char, lastchar, c); /* Save the char */ + c |= 0x80; /* Flag c as a length */ + } else +#endif + + /* Handle the case of a single byte - either with no UTF8 + support, or with UTF-8 disabled, or for a UTF-8 character < + 128. */ + + { + c = code[-1]; + if (repeat_min > 1) + reqbyte = c | req_caseopt | cd->req_varyopt; + } + + /* If the repetition is unlimited, it pays to see if the + next thing on the line is something that cannot possibly + match this character. If so, automatically possessifying + this item gains some performance in the case where the match + fails. */ + + if (!possessive_quantifier && repeat_max < 0 && + check_auto_possessive(*previous, c, utf8, utf8_char, + ptr + 1, options, cd)) { + repeat_type = 0; /* Force greedy */ + possessive_quantifier = TRUE; + } + + goto OUTPUT_SINGLE_REPEAT; /* Code shared with single + character types */ + } + + /* If previous was a single negated character ([^a] or similar), + we use one of the special opcodes, replacing it. The code is + shared with single- character repeats by setting opt_type to add + a suitable offset into repeat_type. We can also test for + auto-possessification. OP_NOT is currently used only for + single-byte chars. 
*/ + + else if (*previous == OP_NOT) { + op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */ + c = previous[1]; + if (!possessive_quantifier && repeat_max < 0 && + check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, + options, cd)) { + repeat_type = 0; /* Force greedy */ + possessive_quantifier = TRUE; + } + goto OUTPUT_SINGLE_REPEAT; + } + + /* If previous was a character type match (\d or similar), + abolish it and create a suitable repeat item. The code is shared + with single-character repeats by setting op_type to add a + suitable offset into repeat_type. Note the the Unicode property + types will be present only when SUPPORT_UCP is defined, but we + don't wrap the little bits of code here because it just makes it + horribly messy. */ + + else if (*previous < OP_EODN) { + uschar* oldcode; + int prop_type, prop_value; + op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */ + c = *previous; + + if (!possessive_quantifier && repeat_max < 0 && + check_auto_possessive(c, 0, utf8, NULL, ptr + 1, + options, cd)) { + repeat_type = 0; /* Force greedy */ + possessive_quantifier = TRUE; + } + + OUTPUT_SINGLE_REPEAT: + if (*previous == OP_PROP || *previous == OP_NOTPROP) { + prop_type = previous[1]; + prop_value = previous[2]; + } else + prop_type = prop_value = -1; + + oldcode = code; + code = previous; /* Usually overwrite previous item */ + + /* If the maximum is zero then the minimum must also be + zero; Perl allows this case, so we do too - by simply + omitting the item altogether. */ + + if (repeat_max == 0) + goto END_REPEAT; + + /* All real repeats make it impossible to handle partial + matching (maybe one day we will be able to remove this + restriction). */ + + if (repeat_max != 1) + cd->external_flags |= PCRE_NOPARTIAL; + + /* Combine the op_type with the repeat_type */ + + repeat_type += op_type; + + /* A minimum of zero is handled either as the special case * + or ?, or as an UPTO, with the maximum given. 
*/ + + if (repeat_min == 0) { + if (repeat_max == -1) + *code++ = OP_STAR + repeat_type; + else if (repeat_max == 1) + *code++ = OP_QUERY + repeat_type; + else { + *code++ = OP_UPTO + repeat_type; + PUT2INC(code, 0, repeat_max); + } + } + + /* A repeat minimum of 1 is optimized into some special + cases. If the maximum is unlimited, we use OP_PLUS. + Otherwise, the original item is left in place and, if the + maximum is greater than 1, we use OP_UPTO with one less than + the maximum. */ + + else if (repeat_min == 1) { + if (repeat_max == -1) + *code++ = OP_PLUS + repeat_type; + else { + code = oldcode; /* leave previous item in place */ + if (repeat_max == 1) + goto END_REPEAT; + *code++ = OP_UPTO + repeat_type; + PUT2INC(code, 0, repeat_max - 1); + } + } + + /* The case {n,n} is just an EXACT, while the general case + {n,m} is handled as an EXACT followed by an UPTO. */ + + else { + *code++ = + OP_EXACT + + op_type; /* NB EXACT doesn't have repeat_type */ + PUT2INC(code, 0, repeat_min); + + /* If the maximum is unlimited, insert an OP_STAR. + Before doing so, we have to insert the character for the + previous code. For a repeated Unicode property match, + there are two extra bytes that define the required + property. In UTF-8 mode, long characters have their + length in c, with the 0x80 bit as a flag. */ + + if (repeat_max < 0) { +#ifdef SUPPORT_UTF8 + if (utf8 && c >= 128) { + memcpy(code, utf8_char, c & 7); + code += c & 7; + } else +#endif + { + *code++ = c; + if (prop_type >= 0) { + *code++ = prop_type; + *code++ = prop_value; + } + } + *code++ = OP_STAR + repeat_type; + } + + /* Else insert an UPTO if the max is greater than the + min, again preceded by the character, for the previously + inserted code. If the UPTO is just for 1 instance, we + can use QUERY instead. 
*/ + + else if (repeat_max != repeat_min) { +#ifdef SUPPORT_UTF8 + if (utf8 && c >= 128) { + memcpy(code, utf8_char, c & 7); + code += c & 7; + } else +#endif + *code++ = c; + if (prop_type >= 0) { + *code++ = prop_type; + *code++ = prop_value; + } + repeat_max -= repeat_min; + + if (repeat_max == 1) { + *code++ = OP_QUERY + repeat_type; + } else { + *code++ = OP_UPTO + repeat_type; + PUT2INC(code, 0, repeat_max); + } + } + } + + /* The character or character type itself comes last in all + * cases. */ + +#ifdef SUPPORT_UTF8 + if (utf8 && c >= 128) { + memcpy(code, utf8_char, c & 7); + code += c & 7; + } else +#endif + *code++ = c; + + /* For a repeated Unicode property match, there are two + extra bytes that define the required property. */ #ifdef SUPPORT_UCP - else if (-c == ESC_P || -c == ESC_p) - { - BOOL negated; - int pdata; - int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr); - if (ptype < 0) goto FAILED; - previous = code; - *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP; - *code++ = ptype; - *code++ = pdata; - } + if (prop_type >= 0) { + *code++ = prop_type; + *code++ = prop_value; + } +#endif + } + + /* If previous was a character class or a back reference, we put + the repeat stuff after it, but just skip the item if the repeat + was {0,0}. */ + + else if (*previous == OP_CLASS || *previous == OP_NCLASS || +#ifdef SUPPORT_UTF8 + *previous == OP_XCLASS || +#endif + *previous == OP_REF) { + if (repeat_max == 0) { + code = previous; + goto END_REPEAT; + } + + /* All real repeats make it impossible to handle partial + matching (maybe one day we will be able to remove this + restriction). 
*/ + + if (repeat_max != 1) + cd->external_flags |= PCRE_NOPARTIAL; + + if (repeat_min == 0 && repeat_max == -1) + *code++ = OP_CRSTAR + repeat_type; + else if (repeat_min == 1 && repeat_max == -1) + *code++ = OP_CRPLUS + repeat_type; + else if (repeat_min == 0 && repeat_max == 1) + *code++ = OP_CRQUERY + repeat_type; + else { + *code++ = OP_CRRANGE + repeat_type; + PUT2INC(code, 0, repeat_min); + if (repeat_max == -1) + repeat_max = 0; /* 2-byte encoding for max */ + PUT2INC(code, 0, repeat_max); + } + } + + /* If previous was a bracket group, we may have to replicate it + in certain cases. */ + + else if (*previous == OP_BRA || *previous == OP_CBRA || + *previous == OP_ONCE || *previous == OP_COND) { + register int i; + int ketoffset = 0; + int len = code - previous; + uschar* bralink = NULL; + + /* Repeating a DEFINE group is pointless */ + + if (*previous == OP_COND && + previous[LINK_SIZE + 1] == OP_DEF) { + *errorcodeptr = ERR55; + goto FAILED; + } + + /* If the maximum repeat count is unlimited, find the end of + the bracket by scanning through from the start, and compute + the offset back to it from the current code pointer. There + may be an OP_OPT setting following the final KET, so we + can't find the end just by going back from the code pointer. + */ + + if (repeat_max == -1) { + register uschar* ket = previous; + do + ket += GET(ket, 1); + while (*ket != OP_KET); + ketoffset = code - ket; + } + + /* The case of a zero minimum is special because of the need + to stick OP_BRAZERO in front of it, and because the group + appears once in the data, whereas in other cases it appears + the minimum number of times. For this reason, it is simplest + to treat this case separately, as otherwise the code gets + far too messy. There are several special subcases when the + minimum is zero. */ + + if (repeat_min == 0) { + /* If the maximum is also zero, we just omit the group + from the output altogether. 
*/ + + if (repeat_max == 0) { + code = previous; + goto END_REPEAT; + } + + /* If the maximum is 1 or unlimited, we just have to + stick in the BRAZERO and do no more at this point. + However, we do need to adjust any OP_RECURSE calls + inside the group that refer to the group itself or any + internal or forward referenced group, because the offset + is from the start of the whole regex. Temporarily + terminate the pattern while doing this. */ + + if (repeat_max <= 1) { + *code = OP_END; + adjust_recurse(previous, 1, utf8, cd, save_hwm); + memmove(previous + 1, previous, len); + code++; + *previous++ = OP_BRAZERO + repeat_type; + } + + /* If the maximum is greater than 1 and limited, we have + to replicate in a nested fashion, sticking OP_BRAZERO + before each set of brackets. The first one has to be + handled carefully because it's the original copy, which + has to be moved up. The remainder can be handled by code + that is common with the non-zero minimum case below. We + have to adjust the value or repeat_max, since one less + copy is required. Once again, we may have to adjust any + OP_RECURSE calls inside the group. */ + + else { + int offset; + *code = OP_END; + adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, + save_hwm); + memmove(previous + 2 + LINK_SIZE, previous, len); + code += 2 + LINK_SIZE; + *previous++ = OP_BRAZERO + repeat_type; + *previous++ = OP_BRA; + + /* We chain together the bracket offset fields that + have to be filled in later when the ends of the + brackets are reached. */ + + offset = (bralink == NULL) ? 0 : previous - bralink; + bralink = previous; + PUTINC(previous, 0, offset); + } + + repeat_max--; + } + + /* If the minimum is greater than zero, replicate the group + as many times as necessary, and adjust the maximum to the + number of subsequent copies that we need. If we set a first + char from the group, and didn't set a required char, copy + the latter from the former. 
If there are any forward + reference subroutine calls in the group, there will be + entries on the workspace list; replicate these with an + appropriate increment. */ + + else { + if (repeat_min > 1) { + /* In the pre-compile phase, we don't actually do + the replication. We just adjust the length as if we + had. Do some paranoid checks for potential integer + overflow. */ + + if (lengthptr != NULL) { + int delta = (repeat_min - 1) * length_prevgroup; + if ((pika_float)(repeat_min - 1) * + (pika_float)length_prevgroup > + (pika_float)INT_MAX || + OFLOW_MAX - *lengthptr < delta) { + *errorcodeptr = ERR20; + goto FAILED; + } + *lengthptr += delta; + } + + /* This is compiling for real */ + + else { + if (groupsetfirstbyte && reqbyte < 0) + reqbyte = firstbyte; + for (i = 1; i < repeat_min; i++) { + uschar* hc; + uschar* this_hwm = cd->hwm; + memcpy(code, previous, len); + for (hc = save_hwm; hc < this_hwm; + hc += LINK_SIZE) { + PUT(cd->hwm, 0, GET(hc, 0) + len); + cd->hwm += LINK_SIZE; + } + save_hwm = this_hwm; + code += len; + } + } + } + + if (repeat_max > 0) + repeat_max -= repeat_min; + } + + /* This code is common to both the zero and non-zero minimum + cases. If the maximum is limited, it replicates the group in + a nested fashion, remembering the bracket starts on a stack. + In the case of a zero minimum, the first one was set up + above. In all cases the repeat_max now specifies the number + of additional copies needed. Again, we must remember to + replicate entries on the forward reference list. */ + + if (repeat_max >= 0) { + /* In the pre-compile phase, we don't actually do the + replication. We just adjust the length as if we had. For + each repetition we must add 1 to the length for BRAZERO + and for all but the last repetition we must add 2 + + 2*LINKSIZE to allow for the nesting that occurs. Do some + paranoid checks to avoid integer overflow. 
*/ + + if (lengthptr != NULL && repeat_max > 0) { + int delta = + repeat_max * + (length_prevgroup + 1 + 2 + 2 * LINK_SIZE) - + 2 - 2 * LINK_SIZE; /* Last one doesn't nest */ + if ((pika_float)repeat_max * + (pika_float)(length_prevgroup + 1 + 2 + + 2 * LINK_SIZE) > + (pika_float)INT_MAX || + OFLOW_MAX - *lengthptr < delta) { + *errorcodeptr = ERR20; + goto FAILED; + } + *lengthptr += delta; + } + + /* This is compiling for real */ + + else + for (i = repeat_max - 1; i >= 0; i--) { + uschar* hc; + uschar* this_hwm = cd->hwm; + + *code++ = OP_BRAZERO + repeat_type; + + /* All but the final copy start a new nesting, + maintaining the chain of brackets outstanding. + */ + + if (i != 0) { + int offset; + *code++ = OP_BRA; + offset = + (bralink == NULL) ? 0 : code - bralink; + bralink = code; + PUTINC(code, 0, offset); + } + + memcpy(code, previous, len); + for (hc = save_hwm; hc < this_hwm; + hc += LINK_SIZE) { + PUT(cd->hwm, 0, + GET(hc, 0) + len + + ((i != 0) ? 2 + LINK_SIZE : 1)); + cd->hwm += LINK_SIZE; + } + save_hwm = this_hwm; + code += len; + } + + /* Now chain through the pending brackets, and fill in + their length fields (which are holding the chain links + pro tem). */ + + while (bralink != NULL) { + int oldlinkoffset; + int offset = code - bralink + 1; + uschar* bra = code - offset; + oldlinkoffset = GET(bra, 1); + bralink = (oldlinkoffset == 0) + ? NULL + : bralink - oldlinkoffset; + *code++ = OP_KET; + PUTINC(code, 0, offset); + PUT(bra, 1, offset); + } + } + + /* If the maximum is unlimited, set a repeater in the final + copy. We can't just offset backwards from the current code + point, because we don't know if there's been an options + resetting after the ket. The correct offset was computed + above. + + Then, when we are doing the actual compile phase, check to + see whether this group is a non-atomic one that could match + an empty string. If so, convert the initial operator to the + S form (e.g. 
OP_BRA -> OP_SBRA) so that runtime checking can + be done. [This check is also applied to atomic groups at + runtime, but in a different way.] */ + + else { + uschar* ketcode = code - ketoffset; + uschar* bracode = ketcode - GET(ketcode, 1); + *ketcode = OP_KETRMAX + repeat_type; + if (lengthptr == NULL && *bracode != OP_ONCE) { + uschar* scode = bracode; + do { + if (could_be_empty_branch(scode, ketcode, + utf8)) { + *bracode += OP_SBRA - OP_BRA; + break; + } + scode += GET(scode, 1); + } while (*scode == OP_ALT); + } + } + } + + /* Else there's some kind of shambles */ + + else { + *errorcodeptr = ERR11; + goto FAILED; + } + + /* If the character following a repeat is '+', or if certain + optimization tests above succeeded, possessive_quantifier is + TRUE. For some of the simpler opcodes, there is an special + alternative opcode for this. For anything else, we wrap the + entire repeated item inside OP_ONCE brackets. The '+' notation + is just syntactic sugar, taken from Sun's Java package, but the + special opcodes can optimize it a bit. The repeated item starts + at tempcode, not at previous, which might be the first part of a + string whose (former) last char we repeated. + + Possessifying an 'exact' quantifier has no effect, so we can + ignore it. But an 'upto' may follow. We skip over an 'exact' + item, and then test the length of what remains before + proceeding. */ + + if (possessive_quantifier) { + int len; + if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT || + *tempcode == OP_NOTEXACT) + tempcode += _pcre_OP_lengths[*tempcode] + + ((*tempcode == OP_TYPEEXACT && + (tempcode[3] == OP_PROP || + tempcode[3] == OP_NOTPROP)) + ? 
2 + : 0); + len = code - tempcode; + if (len > 0) + switch (*tempcode) { + case OP_STAR: + *tempcode = OP_POSSTAR; + break; + case OP_PLUS: + *tempcode = OP_POSPLUS; + break; + case OP_QUERY: + *tempcode = OP_POSQUERY; + break; + case OP_UPTO: + *tempcode = OP_POSUPTO; + break; + + case OP_TYPESTAR: + *tempcode = OP_TYPEPOSSTAR; + break; + case OP_TYPEPLUS: + *tempcode = OP_TYPEPOSPLUS; + break; + case OP_TYPEQUERY: + *tempcode = OP_TYPEPOSQUERY; + break; + case OP_TYPEUPTO: + *tempcode = OP_TYPEPOSUPTO; + break; + + case OP_NOTSTAR: + *tempcode = OP_NOTPOSSTAR; + break; + case OP_NOTPLUS: + *tempcode = OP_NOTPOSPLUS; + break; + case OP_NOTQUERY: + *tempcode = OP_NOTPOSQUERY; + break; + case OP_NOTUPTO: + *tempcode = OP_NOTPOSUPTO; + break; + + default: + memmove(tempcode + 1 + LINK_SIZE, tempcode, + len); + code += 1 + LINK_SIZE; + len += 1 + LINK_SIZE; + tempcode[0] = OP_ONCE; + *code++ = OP_KET; + PUTINC(code, 0, len); + PUT(tempcode, 1, len); + break; + } + } + + /* In all case we no longer have a previous item. We also set + the "follows varying string" flag for subsequently encountered + reqbytes if it isn't already set and we have just passed a + varying length item. */ + + END_REPEAT: + previous = NULL; + cd->req_varyopt |= reqvary; + break; + + /* ===================================================================*/ + /* Start of nested parenthesized sub-expression, or comment or + lookahead or lookbehind or option setting or condition or all + the other extended parenthesis forms. */ + + case '(': + newoptions = options; + skipbytes = 0; + bravalue = OP_CBRA; + save_hwm = cd->hwm; + reset_bracount = FALSE; + + /* First deal with various "verbs" that can be introduced by + * '*'. 
*/ + + if (*(++ptr) == '*' && + (cd->ctypes[ptr[1]] & ctype_letter) != 0) { + int i, namelen; + const char* vn = verbnames; + const uschar* name = ++ptr; + previous = NULL; + while ((cd->ctypes[*++ptr] & ctype_letter) != 0) + ; + if (*ptr == ':') { + *errorcodeptr = ERR59; /* Not supported */ + goto FAILED; + } + if (*ptr != ')') { + *errorcodeptr = ERR60; + goto FAILED; + } + namelen = ptr - name; + for (i = 0; i < verbcount; i++) { + if (namelen == verbs[i].len && + strncmp((char*)name, vn, namelen) == 0) { + *code = verbs[i].op; + if (*code++ == OP_ACCEPT) + cd->had_accept = TRUE; + break; + } + vn += verbs[i].len + 1; + } + if (i < verbcount) + continue; + *errorcodeptr = ERR60; + goto FAILED; + } + + /* Deal with the extended parentheses; all are introduced by + '?', and the appearance of any of them means that this is not a + capturing group. */ + + else if (*ptr == '?') { + int i, set, unset, namelen; + int* optset; + const uschar* name; + uschar* slot; + + switch (*(++ptr)) { + case '#': /* Comment; skip to ket */ + ptr++; + while (*ptr != 0 && *ptr != ')') + ptr++; + if (*ptr == 0) { + *errorcodeptr = ERR18; + goto FAILED; + } + continue; + + /* ------------------------------------------------------------ + */ + case '|': /* Reset capture count for each branch */ + reset_bracount = TRUE; + /* Fall through */ + + /* ------------------------------------------------------------ + */ + case ':': /* Non-capturing bracket */ + bravalue = OP_BRA; + ptr++; + break; + + /* ------------------------------------------------------------ + */ + case '(': + bravalue = OP_COND; /* Conditional group */ + + /* A condition can be an assertion, a number + (referring to a numbered group), a name (referring + to a named group), or 'R', referring to recursion. + R and R&name are also permitted for + recursion tests. + + There are several syntaxes for testing a named + group: (?(name)) is used by Python; Perl 5.10 + onwards uses (?() or (?('name')). 
+ + There are two unfortunate ambiguities, caused by + history. (a) 'R' can be the recursive thing or the + name 'R' (and similarly for 'R' followed by digits), + and (b) a number could be a name that consists of + digits. In both cases, we look for a name first; if + not found, we try the other cases. */ + + /* For conditions that are assertions, check the + syntax, and then exit the switch. This will take + control down to where bracketed groups, including + assertions, are processed. */ + + if (ptr[1] == '?' && + (ptr[2] == '=' || ptr[2] == '!' || + ptr[2] == '<')) + break; + + /* Most other conditions use OP_CREF (a couple + change to OP_RREF below), and all need to skip 3 + bytes at the start of the group. */ + + code[1 + LINK_SIZE] = OP_CREF; + skipbytes = 3; + refsign = -1; + + /* Check for a test for recursion in a named group. + */ + + if (ptr[1] == 'R' && ptr[2] == '&') { + terminator = -1; + ptr += 2; + code[1 + LINK_SIZE] = + OP_RREF; /* Change the type of test */ + } + + /* Check for a test for a named group's having been + set, using the Perl syntax (?() or (?('name') + */ + + else if (ptr[1] == '<') { + terminator = '>'; + ptr++; + } else if (ptr[1] == '\'') { + terminator = '\''; + ptr++; + } else { + terminator = 0; + if (ptr[1] == '-' || ptr[1] == '+') + refsign = *(++ptr); + } + + /* We now expect to read a name; any thing else is + * an error */ + + if ((cd->ctypes[ptr[1]] & ctype_word) == 0) { + ptr += 1; /* To get the right offset */ + *errorcodeptr = ERR28; + goto FAILED; + } + + /* Read the name, but also get it as a number if + * it's all digits */ + + recno = 0; + name = ++ptr; + while ((cd->ctypes[*ptr] & ctype_word) != 0) { + if (recno >= 0) + recno = ((digitab[*ptr] & ctype_digit) != 0) + ? 
recno * 10 + *ptr - '0' + : -1; + ptr++; + } + namelen = ptr - name; + + if ((terminator > 0 && *ptr++ != terminator) || + *ptr++ != ')') { + ptr--; /* Error offset */ + *errorcodeptr = ERR26; + goto FAILED; + } + + /* Do no further checking in the pre-compile phase. + */ + + if (lengthptr != NULL) + break; + + /* In the real compile we do the work of looking for + the actual reference. If the string started with "+" + or "-" we require the rest to be digits, in which + case recno will be set. */ + + if (refsign > 0) { + if (recno <= 0) { + *errorcodeptr = ERR58; + goto FAILED; + } + recno = (refsign == '-') + ? cd->bracount - recno + 1 + : recno + cd->bracount; + if (recno <= 0 || recno > cd->final_bracount) { + *errorcodeptr = ERR15; + goto FAILED; + } + PUT2(code, 2 + LINK_SIZE, recno); + break; + } + + /* Otherwise (did not start with "+" or "-"), start + by looking for the name. */ + + slot = cd->name_table; + for (i = 0; i < cd->names_found; i++) { + if (strncmp((char*)name, (char*)slot + 2, + namelen) == 0) + break; + slot += cd->name_entry_size; + } + + /* Found a previous named subpattern */ + + if (i < cd->names_found) { + recno = GET2(slot, 0); + PUT2(code, 2 + LINK_SIZE, recno); + } + + /* Search the pattern for a forward reference */ + + else if ((i = find_parens( + ptr, cd->bracount, name, namelen, + (options & PCRE_EXTENDED) != 0)) > + 0) { + PUT2(code, 2 + LINK_SIZE, i); + } + + /* If terminator == 0 it means that the name + followed directly after the opening parenthesis + [e.g. (?(abc)...] and in this case there are some + further alternatives to try. For the cases where + terminator != 0 [things like (?(... or + (?('name')... or (?(R&name)... ] we have now checked + all the possibilities, so give an error. */ + + else if (terminator != 0) { + *errorcodeptr = ERR15; + goto FAILED; + } + + /* Check for (?(R) for recursion. Allow digits after + R to specify a specific group number. 
*/ + + else if (*name == 'R') { + recno = 0; + for (i = 1; i < namelen; i++) { + if ((digitab[name[i]] & ctype_digit) == 0) { + *errorcodeptr = ERR15; + goto FAILED; + } + recno = recno * 10 + name[i] - '0'; + } + if (recno == 0) + recno = RREF_ANY; + code[1 + LINK_SIZE] = + OP_RREF; /* Change test type */ + PUT2(code, 2 + LINK_SIZE, recno); + } + + /* Similarly, check for the (?(DEFINE) "condition", + which is always false. */ + + else if (namelen == 6 && + strncmp((char*)name, "DEFINE", 6) == 0) { + code[1 + LINK_SIZE] = OP_DEF; + skipbytes = 1; + } + + /* Check for the "name" actually being a subpattern + number. We are in the second pass here, so + final_bracount is set. */ + + else if (recno > 0 && recno <= cd->final_bracount) { + PUT2(code, 2 + LINK_SIZE, recno); + } + + /* Either an unidentified subpattern, or a reference + to (?(0) */ + + else { + *errorcodeptr = (recno == 0) ? ERR35 : ERR15; + goto FAILED; + } + break; + + /* ------------------------------------------------------------ + */ + case '=': /* Positive lookahead */ + bravalue = OP_ASSERT; + ptr++; + break; + + /* ------------------------------------------------------------ + */ + case '!': /* Negative lookahead */ + ptr++; + if (*ptr == ')') /* Optimize (?!) 
*/ + { + *code++ = OP_FAIL; + previous = NULL; + continue; + } + bravalue = OP_ASSERT_NOT; + break; + + /* ------------------------------------------------------------ + */ + case '<': /* Lookbehind or named define */ + switch (ptr[1]) { + case '=': /* Positive lookbehind */ + bravalue = OP_ASSERTBACK; + ptr += 2; + break; + + case '!': /* Negative lookbehind */ + bravalue = OP_ASSERTBACK_NOT; + ptr += 2; + break; + + default: /* Could be name define, else bad */ + if ((cd->ctypes[ptr[1]] & ctype_word) != 0) + goto DEFINE_NAME; + ptr++; /* Correct offset for error */ + *errorcodeptr = ERR24; + goto FAILED; + } + break; + + /* ------------------------------------------------------------ + */ + case '>': /* One-time brackets */ + bravalue = OP_ONCE; + ptr++; + break; + + /* ------------------------------------------------------------ + */ + case 'C': /* Callout - may be followed by digits; */ + previous_callout = + code; /* Save for later completion */ + after_manual_callout = + 1; /* Skip one item before completing */ + *code++ = OP_CALLOUT; + { + int n = 0; + while ((digitab[*(++ptr)] & ctype_digit) != 0) + n = n * 10 + *ptr - '0'; + if (*ptr != ')') { + *errorcodeptr = ERR39; + goto FAILED; + } + if (n > 255) { + *errorcodeptr = ERR38; + goto FAILED; + } + *code++ = n; + PUT(code, 0, + ptr - cd->start_pattern + + 1); /* Pattern offset */ + PUT(code, LINK_SIZE, 0); /* Default length */ + code += 2 * LINK_SIZE; + } + previous = NULL; + continue; + + /* ------------------------------------------------------------ + */ + case 'P': /* Python-style named subpattern handling */ + if (*(++ptr) == '=' || + *ptr == '>') /* Reference or recursion */ + { + is_recurse = *ptr == '>'; + terminator = ')'; + goto NAMED_REF_OR_RECURSE; + } else if (*ptr != '<') /* Test for Python-style + definition */ + { + *errorcodeptr = ERR41; + goto FAILED; + } + /* Fall through to handle (?P< as (?< is handled */ + + /* ------------------------------------------------------------ + */ + 
DEFINE_NAME: /* Come here from (?< handling */ + case '\'': { + terminator = (*ptr == '<') ? '>' : '\''; + name = ++ptr; + + while ((cd->ctypes[*ptr] & ctype_word) != 0) + ptr++; + namelen = ptr - name; + + /* In the pre-compile phase, just do a syntax check. + */ + + if (lengthptr != NULL) { + if (*ptr != terminator) { + *errorcodeptr = ERR42; + goto FAILED; + } + if (cd->names_found >= MAX_NAME_COUNT) { + *errorcodeptr = ERR49; + goto FAILED; + } + if (namelen + 3 > cd->name_entry_size) { + cd->name_entry_size = namelen + 3; + if (namelen > MAX_NAME_SIZE) { + *errorcodeptr = ERR48; + goto FAILED; + } + } + } + + /* In the real compile, create the entry in the + table */ + + else { + slot = cd->name_table; + for (i = 0; i < cd->names_found; i++) { + int crc = memcmp(name, slot + 2, namelen); + if (crc == 0) { + if (slot[2 + namelen] == 0) { + if ((options & PCRE_DUPNAMES) == + 0) { + *errorcodeptr = ERR43; + goto FAILED; + } + } else + crc = -1; /* Current name is + substring */ + } + if (crc < 0) { + memmove(slot + cd->name_entry_size, + slot, + (cd->names_found - i) * + cd->name_entry_size); + break; + } + slot += cd->name_entry_size; + } + + PUT2(slot, 0, cd->bracount + 1); + memcpy(slot + 2, name, namelen); + slot[2 + namelen] = 0; + } + } + + /* In both cases, count the number of names we've + * encountered. */ + + ptr++; /* Move past > or ' */ + cd->names_found++; + goto NUMBERED_GROUP; + + /* ------------------------------------------------------------ + */ + case '&': /* Perl recursion/subroutine syntax */ + terminator = ')'; + is_recurse = TRUE; + /* Fall through */ + + /* We come here from the Python syntax above that + handles both references (?P=name) and recursion + (?P>name), as well as falling through from the Perl + recursion syntax (?&name). We also come here from + the Perl \k or \k'name' back reference syntax + and the \k{name} .NET syntax. 
*/ + + NAMED_REF_OR_RECURSE: + name = ++ptr; + while ((cd->ctypes[*ptr] & ctype_word) != 0) + ptr++; + namelen = ptr - name; + + /* In the pre-compile phase, do a syntax check and + set a dummy reference number. */ + + if (lengthptr != NULL) { + if (namelen == 0) { + *errorcodeptr = ERR62; + goto FAILED; + } + if (*ptr != terminator) { + *errorcodeptr = ERR42; + goto FAILED; + } + if (namelen > MAX_NAME_SIZE) { + *errorcodeptr = ERR48; + goto FAILED; + } + recno = 0; + } + + /* In the real compile, seek the name in the table. + We check the name first, and then check that we have + reached the end of the name in the table. That way, + if the name that is longer than any in the table, + the comparison will fail without reading beyond the + table entry. */ + + else { + slot = cd->name_table; + for (i = 0; i < cd->names_found; i++) { + if (strncmp((char*)name, (char*)slot + 2, + namelen) == 0 && + slot[2 + namelen] == 0) + break; + slot += cd->name_entry_size; + } + + if (i < cd->names_found) /* Back reference */ + { + recno = GET2(slot, 0); + } else if ((recno = /* Forward back reference */ + find_parens( + ptr, cd->bracount, name, + namelen, + (options & PCRE_EXTENDED) != + 0)) <= 0) { + *errorcodeptr = ERR15; + goto FAILED; + } + } + + /* In both phases, we can now go to the code than + handles numerical recursion or backreferences. 
*/ + + if (is_recurse) + goto HANDLE_RECURSION; + else + goto HANDLE_REFERENCE; + + /* ------------------------------------------------------------ + */ + case 'R': /* Recursion */ + ptr++; /* Same as (?0) */ + /* Fall through */ + + /* ------------------------------------------------------------ + */ + case '-': + case '+': + case '0': + case '1': + case '2': + case '3': + case '4': /* Recursion or */ + case '5': + case '6': + case '7': + case '8': + case '9': /* subroutine */ + { + const uschar* called; + + if ((refsign = *ptr) == '+') { + ptr++; + if ((digitab[*ptr] & ctype_digit) == 0) { + *errorcodeptr = ERR63; + goto FAILED; + } + } else if (refsign == '-') { + if ((digitab[ptr[1]] & ctype_digit) == 0) + goto OTHER_CHAR_AFTER_QUERY; + ptr++; + } + + recno = 0; + while ((digitab[*ptr] & ctype_digit) != 0) + recno = recno * 10 + *ptr++ - '0'; + + if (*ptr != ')') { + *errorcodeptr = ERR29; + goto FAILED; + } + + if (refsign == '-') { + if (recno == 0) { + *errorcodeptr = ERR58; + goto FAILED; + } + recno = cd->bracount - recno + 1; + if (recno <= 0) { + *errorcodeptr = ERR15; + goto FAILED; + } + } else if (refsign == '+') { + if (recno == 0) { + *errorcodeptr = ERR58; + goto FAILED; + } + recno += cd->bracount; + } + + /* Come here from code above that handles a named + * recursion */ + + HANDLE_RECURSION: + + previous = code; + called = cd->start_code; + + /* When we are actually compiling, find the bracket + that is being referenced. Temporarily end the regex + in case it doesn't exist before this point. If we + end up with a forward reference, first check that + the bracket does occur later so we can give the + error (and position) now. Then remember this forward + reference in the workspace so it can be filled in at + the end. 
*/ + + if (lengthptr == NULL) { + *code = OP_END; + if (recno != 0) + called = find_bracket(cd->start_code, utf8, + recno); + + /* Forward reference */ + + if (called == NULL) { + if (find_parens( + ptr, cd->bracount, NULL, recno, + (options & PCRE_EXTENDED) != 0) < + 0) { + *errorcodeptr = ERR15; + goto FAILED; + } + called = cd->start_code + recno; + PUTINC( + cd->hwm, 0, + code + 2 + LINK_SIZE - cd->start_code); + } + + /* If not a forward reference, and the + subpattern is still open, this is a recursive + call. We check to see if this is a left + recursion that could loop for ever, and diagnose + that case. */ + + else if (GET(called, 1) == 0 && + could_be_empty(called, code, bcptr, + utf8)) { + *errorcodeptr = ERR40; + goto FAILED; + } + } + + /* Insert the recursion/subroutine item, + automatically wrapped inside "once" brackets. Set up + a "previous group" length so that a subsequent + quantifier will work. */ + + *code = OP_ONCE; + PUT(code, 1, 2 + 2 * LINK_SIZE); + code += 1 + LINK_SIZE; + + *code = OP_RECURSE; + PUT(code, 1, called - cd->start_code); + code += 1 + LINK_SIZE; + + *code = OP_KET; + PUT(code, 1, 2 + 2 * LINK_SIZE); + code += 1 + LINK_SIZE; + + length_prevgroup = 3 + 3 * LINK_SIZE; + } + + /* Can't determine a first byte now */ + + if (firstbyte == REQ_UNSET) + firstbyte = REQ_NONE; + continue; + + /* ------------------------------------------------------------ + */ + default: /* Other characters: check option setting */ + OTHER_CHAR_AFTER_QUERY: + set = unset = 0; + optset = &set; + + while (*ptr != ')' && *ptr != ':') { + switch (*ptr++) { + case '-': + optset = &unset; + break; + + case 'J': /* Record that it changed in the + external options */ + *optset |= PCRE_DUPNAMES; + cd->external_flags |= PCRE_JCHANGED; + break; + + case 'i': + *optset |= PCRE_CASELESS; + break; + case 'm': + *optset |= PCRE_MULTILINE; + break; + case 's': + *optset |= PCRE_DOTALL; + break; + case 'x': + *optset |= PCRE_EXTENDED; + break; + case 'U': + *optset |= 
PCRE_UNGREEDY; + break; + case 'X': + *optset |= PCRE_EXTRA; + break; + + default: + *errorcodeptr = ERR12; + ptr--; /* Correct the offset */ + goto FAILED; + } + } + + /* Set up the changed option bits, but don't change + * anything yet. */ + + newoptions = (options | set) & (~unset); + + /* If the options ended with ')' this is not the + start of a nested group with option changes, so the + options change at this level. If this item is right + at the start of the pattern, the options can be + abstracted and made external in the pre-compile + phase, and ignored in the compile phase. This can be + helpful when matching -- for instance in caseless + checking of required bytes. + + If the code pointer is not (cd->start_code + 1 + + LINK_SIZE), we are definitely *not* at the start of + the pattern because something has been compiled. In + the pre-compile phase, however, the code pointer can + have that value after the start, because it gets + reset as code is discarded during the pre-compile. + However, this can happen only at top level - if we + are within parentheses, the starting BRA will still + be present. At any parenthesis level, the length + value can be used to test if anything has been + compiled at that level. Thus, a test for both these + conditions is necessary to ensure we correctly + detect the start of the pattern in both phases. + + If we are not at the pattern start, compile code to + change the ims options if this setting actually + changes any of them. We also pass the new setting + back so that it can be put at the start of any + following branches, and when this group ends (if we + are in a group), a resetting item can be compiled. 
+ */ + + if (*ptr == ')') { + if (code == cd->start_code + 1 + LINK_SIZE && + (lengthptr == NULL || + *lengthptr == 2 + 2 * LINK_SIZE)) { + cd->external_options = newoptions; + options = newoptions; + } else { + if ((options & PCRE_IMS) != + (newoptions & PCRE_IMS)) { + *code++ = OP_OPT; + *code++ = newoptions & PCRE_IMS; + } + + /* Change options at this level, and pass + them back for use in subsequent branches. + Reset the greedy defaults and the case value + for firstbyte and reqbyte. */ + + *optionsptr = options = newoptions; + greedy_default = + ((newoptions & PCRE_UNGREEDY) != 0); + greedy_non_default = greedy_default ^ 1; + req_caseopt = + ((options & PCRE_CASELESS) != 0) + ? REQ_CASELESS + : 0; + } + + previous = + NULL; /* This item can't be repeated */ + continue; /* It is complete */ + } + + /* If the options ended with ':' we are heading into + a nested group with possible change of options. Such + groups are non-capturing and are not assertions of + any kind. All we need to do is skip over the ':'; + the newoptions value is handled below. */ + + bravalue = OP_BRA; + ptr++; + } /* End of switch for character following (? */ + } /* End of (? handling */ + + /* Opening parenthesis not followed by '?'. If + PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become + non-capturing and behave like (?:...) brackets. */ + + else if ((options & PCRE_NO_AUTO_CAPTURE) != 0) { + bravalue = OP_BRA; + } + + /* Else we have a capturing group. */ + + else { + NUMBERED_GROUP: + cd->bracount += 1; + PUT2(code, 1 + LINK_SIZE, cd->bracount); + skipbytes = 2; + } + + /* Process nested bracketed regex. Assertions may not be + repeated, but other kinds can be. All their opcodes are >= + OP_ONCE. We copy code into a non-register variable in order to + be able to pass its address because some compilers complain + otherwise. Pass in a new setting for the ims options if they + have changed. */ + + previous = (bravalue >= OP_ONCE) ? 
code : NULL; + *code = bravalue; + tempcode = code; + tempreqvary = cd->req_varyopt; /* Save value before bracket */ + length_prevgroup = 0; /* Initialize for pre-compile phase */ + + if (!compile_regex( + newoptions, /* The complete new option state */ + options & PCRE_IMS, /* The previous ims option state */ + &tempcode, /* Where to put code (updated) */ + &ptr, /* Input pointer (updated) */ + errorcodeptr, /* Where to put an error message */ + (bravalue == OP_ASSERTBACK || + bravalue == + OP_ASSERTBACK_NOT), /* TRUE if back assert */ + reset_bracount, /* True if (?| group */ + skipbytes, /* Skip over bracket number */ + &subfirstbyte, /* For possible first char */ + &subreqbyte, /* For possible last char */ + bcptr, /* Current branch chain */ + cd, /* Tables block */ + (lengthptr == NULL) ? NULL : /* Actual compile phase */ + &length_prevgroup /* Pre-compile phase */ + )) + goto FAILED; + + /* At the end of compiling, code is still pointing to the start + of the group, while tempcode has been updated to point past the + end of the group and any option resetting that may follow it. + The pattern pointer (ptr) is on the bracket. */ + + /* If this is a conditional bracket, check that there are no + more than two branches in the group, or just one if it's a + DEFINE group. We do this in the real compile phase, not in the + pre-pass, where the whole group may not be available. */ + + if (bravalue == OP_COND && lengthptr == NULL) { + uschar* tc = code; + int condcount = 0; + + do { + condcount++; + tc += GET(tc, 1); + } while (*tc != OP_KET); + + /* A DEFINE group is never obeyed inline (the "condition" is + always false). It must have only one branch. */ + + if (code[LINK_SIZE + 1] == OP_DEF) { + if (condcount > 1) { + *errorcodeptr = ERR54; + goto FAILED; + } + bravalue = OP_DEF; /* Just a flag to suppress char + handling below */ + } + + /* A "normal" conditional group. 
If there is just one + branch, we must not make use of its firstbyte or reqbyte, + because this is equivalent to an empty second branch. */ + + else { + if (condcount > 2) { + *errorcodeptr = ERR27; + goto FAILED; + } + if (condcount == 1) + subfirstbyte = subreqbyte = REQ_NONE; + } + } + + /* Error if hit end of pattern */ + + if (*ptr != ')') { + *errorcodeptr = ERR14; + goto FAILED; + } + + /* In the pre-compile phase, update the length by the length of + the group, less the brackets at either end. Then reduce the + compiled code to just a set of non-capturing brackets so that it + doesn't use much memory if it is duplicated by a quantifier.*/ + + if (lengthptr != NULL) { + if (OFLOW_MAX - *lengthptr < + length_prevgroup - 2 - 2 * LINK_SIZE) { + *errorcodeptr = ERR20; + goto FAILED; + } + *lengthptr += length_prevgroup - 2 - 2 * LINK_SIZE; + *code++ = OP_BRA; + PUTINC(code, 0, 1 + LINK_SIZE); + *code++ = OP_KET; + PUTINC(code, 0, 1 + LINK_SIZE); + break; /* No need to waste time with special character + handling */ + } + + /* Otherwise update the main code pointer to the end of the + * group. */ + + code = tempcode; + + /* For a DEFINE group, required and first character settings are + not relevant. */ + + if (bravalue == OP_DEF) + break; + + /* Handle updating of the required and first characters for + other types of group. Update for normal brackets of all kinds, + and conditions with two branches (see code above). If the + bracket is followed by a quantifier with zero repeat, we have to + back off. Hence the definition of zeroreqbyte and zerofirstbyte + outside the main loop so that they can be accessed for the back + off. */ + + zeroreqbyte = reqbyte; + zerofirstbyte = firstbyte; + groupsetfirstbyte = FALSE; + + if (bravalue >= OP_ONCE) { + /* If we have not yet set a firstbyte in this branch, take + it from the subpattern, remembering that it was set here so + that a repeat of more than one can replicate it as reqbyte + if necessary. 
If the subpattern has no firstbyte, set "none" + for the whole branch. In both cases, a zero repeat forces + firstbyte to "none". */ + + if (firstbyte == REQ_UNSET) { + if (subfirstbyte >= 0) { + firstbyte = subfirstbyte; + groupsetfirstbyte = TRUE; + } else + firstbyte = REQ_NONE; + zerofirstbyte = REQ_NONE; + } + + /* If firstbyte was previously set, convert the subpattern's + firstbyte into reqbyte if there wasn't one, using the vary + flag that was in existence beforehand. */ + + else if (subfirstbyte >= 0 && subreqbyte < 0) + subreqbyte = subfirstbyte | tempreqvary; + + /* If the subpattern set a required byte (or set a first + byte that isn't really the first byte - see above), set it. + */ + + if (subreqbyte >= 0) + reqbyte = subreqbyte; + } + + /* For a forward assertion, we take the reqbyte, if set. This + can be helpful if the pattern that follows the assertion doesn't + set a different char. For example, it's useful for + /(?=abcde).+/. We can't set firstbyte for an assertion, however + because it leads to incorrect effect for patterns such as + /(?=a)a.+/ when the "real" "a" would then become a reqbyte + instead of a firstbyte. This is overcome by a scan at the end if + there's no firstbyte, looking for an asserted first char. */ + + else if (bravalue == OP_ASSERT && subreqbyte >= 0) + reqbyte = subreqbyte; + break; /* End of processing '(' */ + + /* ===================================================================*/ + /* Handle metasequences introduced by \. For ones like \d, the + ESC_ values are arranged to be the negation of the corresponding + OP_values. For the back references, the values are ESC_REF plus + the reference number. Only back references and those types that + consume a character may be repeated. We can test for values + between ESC_b and ESC_Z for the latter; this may have to change + if any new ones are ever created. 
*/ + + case '\\': + tempptr = ptr; + c = check_escape(&ptr, errorcodeptr, cd->bracount, options, + FALSE); + if (*errorcodeptr != 0) + goto FAILED; + + if (c < 0) { + if (-c == ESC_Q) /* Handle start of quoted string */ + { + if (ptr[1] == '\\' && ptr[2] == 'E') + ptr += 2; /* avoid empty string */ + else + inescq = TRUE; + continue; + } + + if (-c == ESC_E) + continue; /* Perl ignores an orphan \E */ + + /* For metasequences that actually match a character, we + disable the setting of a first character if it hasn't + already been set. */ + + if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z) + firstbyte = REQ_NONE; + + /* Set values to reset to if this is followed by a zero + * repeat. */ + + zerofirstbyte = firstbyte; + zeroreqbyte = reqbyte; + + /* \k or \k'name' is a back reference by name (Perl + syntax). We also support \k{name} (.NET syntax) */ + + if (-c == ESC_k && + (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{')) { + is_recurse = FALSE; + terminator = (*(++ptr) == '<') ? '>' + : (*ptr == '\'') ? '\'' + : '}'; + goto NAMED_REF_OR_RECURSE; + } + + /* Back references are handled specially; must disable + firstbyte if not set to cope with cases like (?=(\w+))\1: + which would otherwise set + ':' later. */ + + if (-c >= ESC_REF) { + recno = -c - ESC_REF; + + HANDLE_REFERENCE + : /* Come here from named backref handling */ + if (firstbyte == REQ_UNSET) + firstbyte = REQ_NONE; + previous = code; + *code++ = OP_REF; + PUT2INC(code, 0, recno); + cd->backref_map |= (recno < 32) ? (1 << recno) : 1; + if (recno > cd->top_backref) + cd->top_backref = recno; + } + /* So are Unicode property matches, if supported. */ + +#ifdef SUPPORT_UCP + else if (-c == ESC_P || -c == ESC_p) { + BOOL negated; + int pdata; + int ptype = + get_ucp(&ptr, &negated, &pdata, errorcodeptr); + if (ptype < 0) + goto FAILED; + previous = code; + *code++ = + ((-c == ESC_p) != negated) ? 
OP_PROP : OP_NOTPROP; + *code++ = ptype; + *code++ = pdata; + } #else - /* If Unicode properties are not supported, \X, \P, and \p are not - allowed. */ + /* If Unicode properties are not supported, \X, \P, and \p + are not allowed. */ - else if (-c == ESC_X || -c == ESC_P || -c == ESC_p) - { - *errorcodeptr = ERR45; - goto FAILED; - } + else if (-c == ESC_X || -c == ESC_P || -c == ESC_p) { + *errorcodeptr = ERR45; + goto FAILED; + } #endif - /* For the rest (including \X when Unicode properties are supported), we - can obtain the OP value by negating the escape value. */ + /* For the rest (including \X when Unicode properties are + supported), we can obtain the OP value by negating the + escape value. */ - else - { - previous = (-c > ESC_b && -c < ESC_Z)? code : NULL; - *code++ = -c; - } - continue; - } + else { + previous = (-c > ESC_b && -c < ESC_Z) ? code : NULL; + *code++ = -c; + } + continue; + } - /* We have a data character whose value is in c. In UTF-8 mode it may have - a value > 127. We set its representation in the length/buffer, and then - handle it as a data character. */ + /* We have a data character whose value is in c. In UTF-8 mode + it may have a value > 127. We set its representation in the + length/buffer, and then handle it as a data character. */ #ifdef SUPPORT_UTF8 - if (utf8 && c > 127) - mclength = _pcre_ord2utf8(c, mcbuffer); - else + if (utf8 && c > 127) + mclength = _pcre_ord2utf8(c, mcbuffer); + else #endif - { - mcbuffer[0] = c; - mclength = 1; - } - goto ONE_CHAR; + { + mcbuffer[0] = c; + mclength = 1; + } + goto ONE_CHAR; + /* ===================================================================*/ + /* Handle a literal character. It is guaranteed not to be + whitespace or # when the extended flag is set. If we are in + UTF-8 mode, it may be a multi-byte literal character. */ - /* ===================================================================*/ - /* Handle a literal character. 
It is guaranteed not to be whitespace or # - when the extended flag is set. If we are in UTF-8 mode, it may be a - multi-byte literal character. */ - - default: - NORMAL_CHAR: - mclength = 1; - mcbuffer[0] = c; + default: + NORMAL_CHAR: + mclength = 1; + mcbuffer[0] = c; #ifdef SUPPORT_UTF8 - if (utf8 && c >= 0xc0) - { - while ((ptr[1] & 0xc0) == 0x80) - mcbuffer[mclength++] = *(++ptr); - } + if (utf8 && c >= 0xc0) { + while ((ptr[1] & 0xc0) == 0x80) + mcbuffer[mclength++] = *(++ptr); + } #endif - /* At this point we have the character's bytes in mcbuffer, and the length - in mclength. When not in UTF-8 mode, the length is always 1. */ + /* At this point we have the character's bytes in mcbuffer, and + the length in mclength. When not in UTF-8 mode, the length is + always 1. */ - ONE_CHAR: - previous = code; - *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR; - for (c = 0; c < mclength; c++) *code++ = mcbuffer[c]; + ONE_CHAR: + previous = code; + *code++ = + ((options & PCRE_CASELESS) != 0) ? OP_CHARNC : OP_CHAR; + for (c = 0; c < mclength; c++) + *code++ = mcbuffer[c]; - /* Remember if \r or \n were seen */ + /* Remember if \r or \n were seen */ - if (mcbuffer[0] == '\r' || mcbuffer[0] == '\n') - cd->external_flags |= PCRE_HASCRORLF; + if (mcbuffer[0] == '\r' || mcbuffer[0] == '\n') + cd->external_flags |= PCRE_HASCRORLF; - /* Set the first and required bytes appropriately. If no previous first - byte, set it from this character, but revert to none on a zero repeat. - Otherwise, leave the firstbyte value alone, and don't change it on a zero - repeat. */ + /* Set the first and required bytes appropriately. If no + previous first byte, set it from this character, but revert to + none on a zero repeat. Otherwise, leave the firstbyte value + alone, and don't change it on a zero repeat. 
*/ - if (firstbyte == REQ_UNSET) - { - zerofirstbyte = REQ_NONE; - zeroreqbyte = reqbyte; + if (firstbyte == REQ_UNSET) { + zerofirstbyte = REQ_NONE; + zeroreqbyte = reqbyte; - /* If the character is more than one byte long, we can set firstbyte - only if it is not to be matched caselessly. */ + /* If the character is more than one byte long, we can set + firstbyte only if it is not to be matched caselessly. */ - if (mclength == 1 || req_caseopt == 0) - { - firstbyte = mcbuffer[0] | req_caseopt; - if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt; + if (mclength == 1 || req_caseopt == 0) { + firstbyte = mcbuffer[0] | req_caseopt; + if (mclength != 1) + reqbyte = code[-1] | cd->req_varyopt; + } else + firstbyte = reqbyte = REQ_NONE; + } + + /* firstbyte was previously set; we can set reqbyte only the + length is 1 or the matching is caseful. */ + + else { + zerofirstbyte = firstbyte; + zeroreqbyte = reqbyte; + if (mclength == 1 || req_caseopt == 0) + reqbyte = code[-1] | req_caseopt | cd->req_varyopt; + } + + break; /* End of literal character handling */ } - else firstbyte = reqbyte = REQ_NONE; - } + } /* end of big loop */ - /* firstbyte was previously set; we can set reqbyte only the length is - 1 or the matching is caseful. */ - - else - { - zerofirstbyte = firstbyte; - zeroreqbyte = reqbyte; - if (mclength == 1 || req_caseopt == 0) - reqbyte = code[-1] | req_caseopt | cd->req_varyopt; - } - - break; /* End of literal character handling */ - } - } /* end of big loop */ - - -/* Control never reaches here by falling through, only by a goto for all the -error states. Pass back the position in the pattern so that it can be displayed -to the user for diagnosing the error. */ + /* Control never reaches here by falling through, only by a goto for all the + error states. Pass back the position in the pattern so that it can be + displayed to the user for diagnosing the error. 
*/ FAILED: -*ptrptr = ptr; -return FALSE; + *ptrptr = ptr; + return FALSE; } - - - /************************************************* -* Compile sequence of alternatives * -*************************************************/ + * Compile sequence of alternatives * + *************************************************/ /* On entry, ptr is pointing past the bracket character, but on return it points to the closing bracket, or vertical bar, or end of string. The code @@ -5246,249 +5613,244 @@ Arguments: Returns: TRUE on success */ -static BOOL -compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr, - int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes, - int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd, - int *lengthptr) -{ -const uschar *ptr = *ptrptr; -uschar *code = *codeptr; -uschar *last_branch = code; -uschar *start_bracket = code; -uschar *reverse_count = NULL; -int firstbyte, reqbyte; -int branchfirstbyte, branchreqbyte; -int length; -int orig_bracount; -int max_bracount; -branch_chain bc; +static BOOL compile_regex(int options, + int oldims, + uschar** codeptr, + const uschar** ptrptr, + int* errorcodeptr, + BOOL lookbehind, + BOOL reset_bracount, + int skipbytes, + int* firstbyteptr, + int* reqbyteptr, + branch_chain* bcptr, + compile_data* cd, + int* lengthptr) { + const uschar* ptr = *ptrptr; + uschar* code = *codeptr; + uschar* last_branch = code; + uschar* start_bracket = code; + uschar* reverse_count = NULL; + int firstbyte, reqbyte; + int branchfirstbyte, branchreqbyte; + int length; + int orig_bracount; + int max_bracount; + branch_chain bc; -bc.outer = bcptr; -bc.current = code; + bc.outer = bcptr; + bc.current = code; -firstbyte = reqbyte = REQ_UNSET; + firstbyte = reqbyte = REQ_UNSET; -/* Accumulate the length for use in the pre-compile phase. Start with the -length of the BRA and KET and any extra bytes that are required at the -beginning. 
We accumulate in a local variable to save frequent testing of -lenthptr for NULL. We cannot do this by looking at the value of code at the -start and end of each alternative, because compiled items are discarded during -the pre-compile phase so that the work space is not exceeded. */ + /* Accumulate the length for use in the pre-compile phase. Start with the + length of the BRA and KET and any extra bytes that are required at the + beginning. We accumulate in a local variable to save frequent testing of + lenthptr for NULL. We cannot do this by looking at the value of code at the + start and end of each alternative, because compiled items are discarded + during the pre-compile phase so that the work space is not exceeded. */ -length = 2 + 2*LINK_SIZE + skipbytes; + length = 2 + 2 * LINK_SIZE + skipbytes; -/* WARNING: If the above line is changed for any reason, you must also change -the code that abstracts option settings at the start of the pattern and makes -them global. It tests the value of length for (2 + 2*LINK_SIZE) in the -pre-compile phase to find out whether anything has yet been compiled or not. */ + /* WARNING: If the above line is changed for any reason, you must also + change the code that abstracts option settings at the start of the pattern + and makes them global. It tests the value of length for (2 + 2*LINK_SIZE) in + the pre-compile phase to find out whether anything has yet been compiled or + not. */ -/* Offset is set zero to mark that this bracket is still open */ + /* Offset is set zero to mark that this bracket is still open */ -PUT(code, 1, 0); -code += 1 + LINK_SIZE + skipbytes; + PUT(code, 1, 0); + code += 1 + LINK_SIZE + skipbytes; -/* Loop for each alternative branch */ + /* Loop for each alternative branch */ -orig_bracount = max_bracount = cd->bracount; -for (;;) - { - /* For a (?| group, reset the capturing bracket count so that each branch - uses the same numbers. 
*/ + orig_bracount = max_bracount = cd->bracount; + for (;;) { + /* For a (?| group, reset the capturing bracket count so that each + branch uses the same numbers. */ - if (reset_bracount) cd->bracount = orig_bracount; + if (reset_bracount) + cd->bracount = orig_bracount; - /* Handle a change of ims options at the start of the branch */ + /* Handle a change of ims options at the start of the branch */ - if ((options & PCRE_IMS) != oldims) - { - *code++ = OP_OPT; - *code++ = options & PCRE_IMS; - length += 2; - } - - /* Set up dummy OP_REVERSE if lookbehind assertion */ - - if (lookbehind) - { - *code++ = OP_REVERSE; - reverse_count = code; - PUTINC(code, 0, 0); - length += 1 + LINK_SIZE; - } - - /* Now compile the branch; in the pre-compile phase its length gets added - into the length. */ - - if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte, - &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length)) - { - *ptrptr = ptr; - return FALSE; - } - - /* Keep the highest bracket count in case (?| was used and some branch - has fewer than the rest. */ - - if (cd->bracount > max_bracount) max_bracount = cd->bracount; - - /* In the real compile phase, there is some post-processing to be done. */ - - if (lengthptr == NULL) - { - /* If this is the first branch, the firstbyte and reqbyte values for the - branch become the values for the regex. */ - - if (*last_branch != OP_ALT) - { - firstbyte = branchfirstbyte; - reqbyte = branchreqbyte; - } - - /* If this is not the first branch, the first char and reqbyte have to - match the values from all the previous branches, except that if the - previous value for reqbyte didn't have REQ_VARY set, it can still match, - and we set REQ_VARY for the regex. */ - - else - { - /* If we previously had a firstbyte, but it doesn't match the new branch, - we have to abandon the firstbyte for the regex, but if there was - previously no reqbyte, it takes on the value of the old firstbyte. 
*/ - - if (firstbyte >= 0 && firstbyte != branchfirstbyte) - { - if (reqbyte < 0) reqbyte = firstbyte; - firstbyte = REQ_NONE; + if ((options & PCRE_IMS) != oldims) { + *code++ = OP_OPT; + *code++ = options & PCRE_IMS; + length += 2; } - /* If we (now or from before) have no firstbyte, a firstbyte from the - branch becomes a reqbyte if there isn't a branch reqbyte. */ + /* Set up dummy OP_REVERSE if lookbehind assertion */ - if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0) - branchreqbyte = branchfirstbyte; - - /* Now ensure that the reqbytes match */ - - if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY)) - reqbyte = REQ_NONE; - else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */ - } - - /* If lookbehind, check that this branch matches a fixed-length string, and - put the length into the OP_REVERSE item. Temporarily mark the end of the - branch with OP_END. */ - - if (lookbehind) - { - int fixed_length; - *code = OP_END; - fixed_length = find_fixedlength(last_branch, options); - DPRINTF(("fixed length = %d\n", fixed_length)); - if (fixed_length < 0) - { - *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25; - *ptrptr = ptr; - return FALSE; + if (lookbehind) { + *code++ = OP_REVERSE; + reverse_count = code; + PUTINC(code, 0, 0); + length += 1 + LINK_SIZE; } - PUT(reverse_count, 0, fixed_length); - } - } - /* Reached end of expression, either ')' or end of pattern. In the real - compile phase, go back through the alternative branches and reverse the chain - of offsets, with the field in the BRA item now becoming an offset to the - first alternative. If there are no alternatives, it points to the end of the - group. The length in the terminating ket is always the length of the whole - bracketed item. If any of the ims options were changed inside the group, - compile a resetting op-code following, except at the very end of the pattern. - Return leaving the pointer at the terminating char. 
*/ + /* Now compile the branch; in the pre-compile phase its length gets + added into the length. */ - if (*ptr != '|') - { - if (lengthptr == NULL) - { - int branch_length = code - last_branch; - do - { - int prev_length = GET(last_branch, 1); - PUT(last_branch, 1, branch_length); - branch_length = prev_length; - last_branch -= branch_length; + if (!compile_branch(&options, &code, &ptr, errorcodeptr, + &branchfirstbyte, &branchreqbyte, &bc, cd, + (lengthptr == NULL) ? NULL : &length)) { + *ptrptr = ptr; + return FALSE; } - while (branch_length > 0); - } - /* Fill in the ket */ + /* Keep the highest bracket count in case (?| was used and some branch + has fewer than the rest. */ - *code = OP_KET; - PUT(code, 1, code - start_bracket); - code += 1 + LINK_SIZE; + if (cd->bracount > max_bracount) + max_bracount = cd->bracount; - /* Resetting option if needed */ + /* In the real compile phase, there is some post-processing to be done. + */ - if ((options & PCRE_IMS) != oldims && *ptr == ')') - { - *code++ = OP_OPT; - *code++ = oldims; - length += 2; - } + if (lengthptr == NULL) { + /* If this is the first branch, the firstbyte and reqbyte values for + the branch become the values for the regex. */ - /* Retain the highest bracket number, in case resetting was used. */ + if (*last_branch != OP_ALT) { + firstbyte = branchfirstbyte; + reqbyte = branchreqbyte; + } - cd->bracount = max_bracount; + /* If this is not the first branch, the first char and reqbyte have + to match the values from all the previous branches, except that if + the previous value for reqbyte didn't have REQ_VARY set, it can + still match, and we set REQ_VARY for the regex. */ - /* Set values to pass back */ + else { + /* If we previously had a firstbyte, but it doesn't match the + new branch, we have to abandon the firstbyte for the regex, but + if there was previously no reqbyte, it takes on the value of the + old firstbyte. 
*/ - *codeptr = code; - *ptrptr = ptr; - *firstbyteptr = firstbyte; - *reqbyteptr = reqbyte; - if (lengthptr != NULL) - { - if (OFLOW_MAX - *lengthptr < length) - { - *errorcodeptr = ERR20; - return FALSE; + if (firstbyte >= 0 && firstbyte != branchfirstbyte) { + if (reqbyte < 0) + reqbyte = firstbyte; + firstbyte = REQ_NONE; + } + + /* If we (now or from before) have no firstbyte, a firstbyte + from the branch becomes a reqbyte if there isn't a branch + reqbyte. */ + + if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0) + branchreqbyte = branchfirstbyte; + + /* Now ensure that the reqbytes match */ + + if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY)) + reqbyte = REQ_NONE; + else + reqbyte |= branchreqbyte; /* To "or" REQ_VARY */ + } + + /* If lookbehind, check that this branch matches a fixed-length + string, and put the length into the OP_REVERSE item. Temporarily + mark the end of the branch with OP_END. */ + + if (lookbehind) { + int fixed_length; + *code = OP_END; + fixed_length = find_fixedlength(last_branch, options); + DPRINTF(("fixed length = %d\n", fixed_length)); + if (fixed_length < 0) { + *errorcodeptr = (fixed_length == -2) ? ERR36 : ERR25; + *ptrptr = ptr; + return FALSE; + } + PUT(reverse_count, 0, fixed_length); + } } - *lengthptr += length; - } - return TRUE; + + /* Reached end of expression, either ')' or end of pattern. In the real + compile phase, go back through the alternative branches and reverse the + chain of offsets, with the field in the BRA item now becoming an offset + to the first alternative. If there are no alternatives, it points to the + end of the group. The length in the terminating ket is always the length + of the whole bracketed item. If any of the ims options were changed + inside the group, compile a resetting op-code following, except at the + very end of the pattern. Return leaving the pointer at the terminating + char. 
*/ + + if (*ptr != '|') { + if (lengthptr == NULL) { + int branch_length = code - last_branch; + do { + int prev_length = GET(last_branch, 1); + PUT(last_branch, 1, branch_length); + branch_length = prev_length; + last_branch -= branch_length; + } while (branch_length > 0); + } + + /* Fill in the ket */ + + *code = OP_KET; + PUT(code, 1, code - start_bracket); + code += 1 + LINK_SIZE; + + /* Resetting option if needed */ + + if ((options & PCRE_IMS) != oldims && *ptr == ')') { + *code++ = OP_OPT; + *code++ = oldims; + length += 2; + } + + /* Retain the highest bracket number, in case resetting was used. */ + + cd->bracount = max_bracount; + + /* Set values to pass back */ + + *codeptr = code; + *ptrptr = ptr; + *firstbyteptr = firstbyte; + *reqbyteptr = reqbyte; + if (lengthptr != NULL) { + if (OFLOW_MAX - *lengthptr < length) { + *errorcodeptr = ERR20; + return FALSE; + } + *lengthptr += length; + } + return TRUE; + } + + /* Another branch follows. In the pre-compile phase, we can move the + code pointer back to where it was for the start of the first branch. + (That is, pretend that each branch is the only one.) + + In the real compile phase, insert an ALT node. Its length field points + back to the previous branch while the bracket remains open. At the end + the chain is reversed. It's done like this so that the start of the + bracket has a zero offset until it is closed, making it possible to + detect recursion. */ + + if (lengthptr != NULL) { + code = *codeptr + 1 + LINK_SIZE + skipbytes; + length += 1 + LINK_SIZE; + } else { + *code = OP_ALT; + PUT(code, 1, code - last_branch); + bc.current = last_branch = code; + code += 1 + LINK_SIZE; + } + + ptr++; } - - /* Another branch follows. In the pre-compile phase, we can move the code - pointer back to where it was for the start of the first branch. (That is, - pretend that each branch is the only one.) - - In the real compile phase, insert an ALT node. 
Its length field points back - to the previous branch while the bracket remains open. At the end the chain - is reversed. It's done like this so that the start of the bracket has a - zero offset until it is closed, making it possible to detect recursion. */ - - if (lengthptr != NULL) - { - code = *codeptr + 1 + LINK_SIZE + skipbytes; - length += 1 + LINK_SIZE; - } - else - { - *code = OP_ALT; - PUT(code, 1, code - last_branch); - bc.current = last_branch = code; - code += 1 + LINK_SIZE; - } - - ptr++; - } -/* Control never reaches here */ + /* Control never reaches here */ } - - - /************************************************* -* Check for anchored expression * -*************************************************/ + * Check for anchored expression * + *************************************************/ /* Try to find out if this is an anchored regular expression. Consider each alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket @@ -5524,64 +5886,61 @@ Arguments: Returns: TRUE or FALSE */ -static BOOL -is_anchored(register const uschar *code, int *options, unsigned int bracket_map, - unsigned int backref_map) -{ -do { - const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code], - options, PCRE_MULTILINE, FALSE); - register int op = *scode; +static BOOL is_anchored(register const uschar* code, + int* options, + unsigned int bracket_map, + unsigned int backref_map) { + do { + const uschar* scode = first_significant_code( + code + _pcre_OP_lengths[*code], options, PCRE_MULTILINE, FALSE); + register int op = *scode; - /* Non-capturing brackets */ + /* Non-capturing brackets */ - if (op == OP_BRA) - { - if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE; - } + if (op == OP_BRA) { + if (!is_anchored(scode, options, bracket_map, backref_map)) + return FALSE; + } - /* Capturing brackets */ + /* Capturing brackets */ - else if (op == OP_CBRA) - { - int n = GET2(scode, 1+LINK_SIZE); - int new_map = 
bracket_map | ((n < 32)? (1 << n) : 1); - if (!is_anchored(scode, options, new_map, backref_map)) return FALSE; - } + else if (op == OP_CBRA) { + int n = GET2(scode, 1 + LINK_SIZE); + int new_map = bracket_map | ((n < 32) ? (1 << n) : 1); + if (!is_anchored(scode, options, new_map, backref_map)) + return FALSE; + } - /* Other brackets */ + /* Other brackets */ - else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND) - { - if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE; - } + else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND) { + if (!is_anchored(scode, options, bracket_map, backref_map)) + return FALSE; + } - /* .* is not anchored unless DOTALL is set and it isn't in brackets that - are or may be referenced. */ + /* .* is not anchored unless DOTALL is set and it isn't in brackets that + are or may be referenced. */ - else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR || - op == OP_TYPEPOSSTAR) && - (*options & PCRE_DOTALL) != 0) - { - if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE; - } + else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR || + op == OP_TYPEPOSSTAR) && + (*options & PCRE_DOTALL) != 0) { + if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) + return FALSE; + } - /* Check for explicit anchoring */ + /* Check for explicit anchoring */ - else if (op != OP_SOD && op != OP_SOM && - ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC)) - return FALSE; - code += GET(code, 1); - } -while (*code == OP_ALT); /* Loop for each alternative */ -return TRUE; + else if (op != OP_SOD && op != OP_SOM && + ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC)) + return FALSE; + code += GET(code, 1); + } while (*code == OP_ALT); /* Loop for each alternative */ + return TRUE; } - - /************************************************* -* Check for starting with ^ or .* * -*************************************************/ + * Check for starting with ^ or .* * + 
*************************************************/ /* This is called to find out if every branch starts with ^ or .* so that "first char" processing can be done to speed things up in multiline @@ -5600,61 +5959,61 @@ Arguments: Returns: TRUE or FALSE */ -static BOOL -is_startline(const uschar *code, unsigned int bracket_map, - unsigned int backref_map) -{ -do { - const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code], - NULL, 0, FALSE); - register int op = *scode; +static BOOL is_startline(const uschar* code, + unsigned int bracket_map, + unsigned int backref_map) { + do { + const uschar* scode = first_significant_code( + code + _pcre_OP_lengths[*code], NULL, 0, FALSE); + register int op = *scode; - /* Non-capturing brackets */ + /* Non-capturing brackets */ - if (op == OP_BRA) - { - if (!is_startline(scode, bracket_map, backref_map)) return FALSE; - } + if (op == OP_BRA) { + if (!is_startline(scode, bracket_map, backref_map)) + return FALSE; + } - /* Capturing brackets */ + /* Capturing brackets */ - else if (op == OP_CBRA) - { - int n = GET2(scode, 1+LINK_SIZE); - int new_map = bracket_map | ((n < 32)? (1 << n) : 1); - if (!is_startline(scode, new_map, backref_map)) return FALSE; - } + else if (op == OP_CBRA) { + int n = GET2(scode, 1 + LINK_SIZE); + int new_map = bracket_map | ((n < 32) ? (1 << n) : 1); + if (!is_startline(scode, new_map, backref_map)) + return FALSE; + } - /* Other brackets */ + /* Other brackets */ - else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND) - { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; } + else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND) { + if (!is_startline(scode, bracket_map, backref_map)) + return FALSE; + } - /* .* means "start at start or after \n" if it isn't in brackets that - may be referenced. */ + /* .* means "start at start or after \n" if it isn't in brackets that + may be referenced. 
*/ - else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR) - { - if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE; - } + else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || + op == OP_TYPEPOSSTAR) { + if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) + return FALSE; + } - /* Check for explicit circumflex */ + /* Check for explicit circumflex */ - else if (op != OP_CIRC) return FALSE; + else if (op != OP_CIRC) + return FALSE; - /* Move on to the next alternative */ + /* Move on to the next alternative */ - code += GET(code, 1); - } -while (*code == OP_ALT); /* Loop for each alternative */ -return TRUE; + code += GET(code, 1); + } while (*code == OP_ALT); /* Loop for each alternative */ + return TRUE; } - - /************************************************* -* Check for asserted fixed first char * -*************************************************/ + * Check for asserted fixed first char * + *************************************************/ /* During compilation, the "first char" settings from forward assertions are discarded, because they can cause conflicts with actual literals that follow. 
@@ -5672,60 +6031,61 @@ Arguments: Returns: -1 or the fixed first char */ -static int -find_firstassertedchar(const uschar *code, int *options, BOOL inassert) -{ -register int c = -1; -do { - int d; - const uschar *scode = - first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE); - register int op = *scode; +static int find_firstassertedchar(const uschar* code, + int* options, + BOOL inassert) { + register int c = -1; + do { + int d; + const uschar* scode = first_significant_code( + code + 1 + LINK_SIZE, options, PCRE_CASELESS, TRUE); + register int op = *scode; - switch(op) - { - default: - return -1; + switch (op) { + default: + return -1; - case OP_BRA: - case OP_CBRA: - case OP_ASSERT: - case OP_ONCE: - case OP_COND: - if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0) - return -1; - if (c < 0) c = d; else if (c != d) return -1; - break; + case OP_BRA: + case OP_CBRA: + case OP_ASSERT: + case OP_ONCE: + case OP_COND: + if ((d = find_firstassertedchar(scode, options, + op == OP_ASSERT)) < 0) + return -1; + if (c < 0) + c = d; + else if (c != d) + return -1; + break; - case OP_EXACT: /* Fall through */ - scode += 2; + case OP_EXACT: /* Fall through */ + scode += 2; - case OP_CHAR: - case OP_CHARNC: - case OP_PLUS: - case OP_MINPLUS: - case OP_POSPLUS: - if (!inassert) return -1; - if (c < 0) - { - c = scode[1]; - if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS; - } - else if (c != scode[1]) return -1; - break; - } + case OP_CHAR: + case OP_CHARNC: + case OP_PLUS: + case OP_MINPLUS: + case OP_POSPLUS: + if (!inassert) + return -1; + if (c < 0) { + c = scode[1]; + if ((*options & PCRE_CASELESS) != 0) + c |= REQ_CASELESS; + } else if (c != scode[1]) + return -1; + break; + } - code += GET(code, 1); - } -while (*code == OP_ALT); -return c; + code += GET(code, 1); + } while (*code == OP_ALT); + return c; } - - /************************************************* -* Compile a Regular Expression * 
-*************************************************/ + * Compile a Regular Expression * + *************************************************/ /* This function takes a string and returns a pointer to a block of store holding a compiled version of the expression. The original API for this @@ -5745,436 +6105,470 @@ Returns: pointer to compiled data block, or NULL on error, with errorptr and erroroffset set */ -pcre * -pcre_compile(const char *pattern, int options, const char **errorptr, - int *erroroffset, const unsigned char *tables) -{ -return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables); +pcre* pcre_compile(const char* pattern, + int options, + const char** errorptr, + int* erroroffset, + const unsigned char* tables) { + return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables); } - -pcre * -pcre_compile2(const char *pattern, int options, int *errorcodeptr, - const char **errorptr, int *erroroffset, const unsigned char *tables) -{ -real_pcre *re; -int length = 1; /* For final END opcode */ -int firstbyte, reqbyte, newline; -int errorcode = 0; -int skipatstart = 0; +pcre* pcre_compile2(const char* pattern, + int options, + int* errorcodeptr, + const char** errorptr, + int* erroroffset, + const unsigned char* tables) { + real_pcre* re; + int length = 1; /* For final END opcode */ + int firstbyte, reqbyte, newline; + int errorcode = 0; + int skipatstart = 0; #ifdef SUPPORT_UTF8 -BOOL utf8; + BOOL utf8; #endif -size_t size; -uschar *code; -const uschar *codestart; -const uschar *ptr; -compile_data compile_block; -compile_data *cd = &compile_block; + size_t size; + uschar* code; + const uschar* codestart; + const uschar* ptr; + compile_data compile_block; + compile_data* cd = &compile_block; -/* This space is used for "compiling" into during the first phase, when we are -computing the amount of memory that is needed. 
Compiled items are thrown away -as soon as possible, so that a fairly large buffer should be sufficient for -this purpose. The same space is used in the second phase for remembering where -to fill in forward references to subpatterns. */ + /* This space is used for "compiling" into during the first phase, when we + are computing the amount of memory that is needed. Compiled items are thrown + away as soon as possible, so that a fairly large buffer should be sufficient + for this purpose. The same space is used in the second phase for remembering + where to fill in forward references to subpatterns. */ -uschar cworkspace[COMPILE_WORK_SIZE]; + uschar cworkspace[COMPILE_WORK_SIZE]; -/* Set this early so that early errors get offset 0. */ + /* Set this early so that early errors get offset 0. */ -ptr = (const uschar *)pattern; + ptr = (const uschar*)pattern; -/* We can't pass back an error message if errorptr is NULL; I guess the best we -can do is just return NULL, but we can set a code value if there is a code -pointer. */ + /* We can't pass back an error message if errorptr is NULL; I guess the best + we can do is just return NULL, but we can set a code value if there is a + code pointer. */ -if (errorptr == NULL) - { - if (errorcodeptr != NULL) *errorcodeptr = 99; - return NULL; - } + if (errorptr == NULL) { + if (errorcodeptr != NULL) + *errorcodeptr = 99; + return NULL; + } -*errorptr = NULL; -if (errorcodeptr != NULL) *errorcodeptr = ERR0; + *errorptr = NULL; + if (errorcodeptr != NULL) + *errorcodeptr = ERR0; -/* However, we can give a message for this error */ + /* However, we can give a message for this error */ -if (erroroffset == NULL) - { - errorcode = ERR16; - goto PCRE_EARLY_ERROR_RETURN2; - } + if (erroroffset == NULL) { + errorcode = ERR16; + goto PCRE_EARLY_ERROR_RETURN2; + } -*erroroffset = 0; + *erroroffset = 0; -/* Can't support UTF8 unless PCRE has been compiled to include the code. 
*/ + /* Can't support UTF8 unless PCRE has been compiled to include the code. */ #ifdef SUPPORT_UTF8 -utf8 = (options & PCRE_UTF8) != 0; -if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 && - (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0) - { - errorcode = ERR44; - goto PCRE_EARLY_ERROR_RETURN2; - } + utf8 = (options & PCRE_UTF8) != 0; + if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 && + (*erroroffset = _pcre_valid_utf8((uschar*)pattern, -1)) >= 0) { + errorcode = ERR44; + goto PCRE_EARLY_ERROR_RETURN2; + } #else -if ((options & PCRE_UTF8) != 0) - { - errorcode = ERR32; - goto PCRE_EARLY_ERROR_RETURN; - } + if ((options & PCRE_UTF8) != 0) { + errorcode = ERR32; + goto PCRE_EARLY_ERROR_RETURN; + } #endif -if ((options & ~PUBLIC_OPTIONS) != 0) - { - errorcode = ERR17; - goto PCRE_EARLY_ERROR_RETURN; - } - -/* Set up pointers to the individual character tables */ - -if (tables == NULL) tables = _pcre_default_tables; -cd->lcc = tables + lcc_offset; -cd->fcc = tables + fcc_offset; -cd->cbits = tables + cbits_offset; -cd->ctypes = tables + ctypes_offset; - -/* Check for global one-time settings at the start of the pattern, and remember -the offset for later. 
*/ - -while (ptr[skipatstart] == '(' && ptr[skipatstart+1] == '*') - { - int newnl = 0; - int newbsr = 0; - - if (strncmp((char *)(ptr+skipatstart+2), "CR)", 3) == 0) - { skipatstart += 5; newnl = PCRE_NEWLINE_CR; } - else if (strncmp((char *)(ptr+skipatstart+2), "LF)", 3) == 0) - { skipatstart += 5; newnl = PCRE_NEWLINE_LF; } - else if (strncmp((char *)(ptr+skipatstart+2), "CRLF)", 5) == 0) - { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; } - else if (strncmp((char *)(ptr+skipatstart+2), "ANY)", 4) == 0) - { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; } - else if (strncmp((char *)(ptr+skipatstart+2), "ANYCRLF)", 8) == 0) - { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; } - - else if (strncmp((char *)(ptr+skipatstart+2), "BSR_ANYCRLF)", 12) == 0) - { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; } - else if (strncmp((char *)(ptr+skipatstart+2), "BSR_UNICODE)", 12) == 0) - { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; } - - if (newnl != 0) - options = (options & ~PCRE_NEWLINE_BITS) | newnl; - else if (newbsr != 0) - options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr; - else break; - } - -/* Check validity of \R options. */ - -switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) - { - case 0: - case PCRE_BSR_ANYCRLF: - case PCRE_BSR_UNICODE: - break; - default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN; - } - -/* Handle different types of newline. The three bits give seven cases. The -current code allows for fixed one- or two-byte sequences, plus "any" and -"anycrlf". 
*/ - -switch (options & PCRE_NEWLINE_BITS) - { - case 0: newline = NEWLINE; break; /* Build-time default */ - case PCRE_NEWLINE_CR: newline = '\r'; break; - case PCRE_NEWLINE_LF: newline = '\n'; break; - case PCRE_NEWLINE_CR+ - PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break; - case PCRE_NEWLINE_ANY: newline = -1; break; - case PCRE_NEWLINE_ANYCRLF: newline = -2; break; - default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN; - } - -if (newline == -2) - { - cd->nltype = NLTYPE_ANYCRLF; - } -else if (newline < 0) - { - cd->nltype = NLTYPE_ANY; - } -else - { - cd->nltype = NLTYPE_FIXED; - if (newline > 255) - { - cd->nllen = 2; - cd->nl[0] = (newline >> 8) & 255; - cd->nl[1] = newline & 255; + if ((options & ~PUBLIC_OPTIONS) != 0) { + errorcode = ERR17; + goto PCRE_EARLY_ERROR_RETURN; } - else - { - cd->nllen = 1; - cd->nl[0] = newline; + + /* Set up pointers to the individual character tables */ + + if (tables == NULL) + tables = _pcre_default_tables; + cd->lcc = tables + lcc_offset; + cd->fcc = tables + fcc_offset; + cd->cbits = tables + cbits_offset; + cd->ctypes = tables + ctypes_offset; + + /* Check for global one-time settings at the start of the pattern, and + remember the offset for later. 
*/ + + while (ptr[skipatstart] == '(' && ptr[skipatstart + 1] == '*') { + int newnl = 0; + int newbsr = 0; + + if (strncmp((char*)(ptr + skipatstart + 2), "CR)", 3) == 0) { + skipatstart += 5; + newnl = PCRE_NEWLINE_CR; + } else if (strncmp((char*)(ptr + skipatstart + 2), "LF)", 3) == 0) { + skipatstart += 5; + newnl = PCRE_NEWLINE_LF; + } else if (strncmp((char*)(ptr + skipatstart + 2), "CRLF)", 5) == 0) { + skipatstart += 7; + newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; + } else if (strncmp((char*)(ptr + skipatstart + 2), "ANY)", 4) == 0) { + skipatstart += 6; + newnl = PCRE_NEWLINE_ANY; + } else if (strncmp((char*)(ptr + skipatstart + 2), "ANYCRLF)", 8) == + 0) { + skipatstart += 10; + newnl = PCRE_NEWLINE_ANYCRLF; + } + + else if (strncmp((char*)(ptr + skipatstart + 2), "BSR_ANYCRLF)", 12) == + 0) { + skipatstart += 14; + newbsr = PCRE_BSR_ANYCRLF; + } else if (strncmp((char*)(ptr + skipatstart + 2), "BSR_UNICODE)", + 12) == 0) { + skipatstart += 14; + newbsr = PCRE_BSR_UNICODE; + } + + if (newnl != 0) + options = (options & ~PCRE_NEWLINE_BITS) | newnl; + else if (newbsr != 0) + options = + (options & ~(PCRE_BSR_ANYCRLF | PCRE_BSR_UNICODE)) | newbsr; + else + break; } - } -/* Maximum back reference and backref bitmap. The bitmap records up to 31 back -references to help in deciding whether (.*) can be treated as anchored or not. -*/ + /* Check validity of \R options. */ -cd->top_backref = 0; -cd->backref_map = 0; + switch (options & (PCRE_BSR_ANYCRLF | PCRE_BSR_UNICODE)) { + case 0: + case PCRE_BSR_ANYCRLF: + case PCRE_BSR_UNICODE: + break; + default: + errorcode = ERR56; + goto PCRE_EARLY_ERROR_RETURN; + } -/* Reflect pattern for debugging output */ + /* Handle different types of newline. The three bits give seven cases. The + current code allows for fixed one- or two-byte sequences, plus "any" and + "anycrlf". 
*/ -DPRINTF(("------------------------------------------------------------------\n")); -DPRINTF(("%s\n", pattern)); + switch (options & PCRE_NEWLINE_BITS) { + case 0: + newline = NEWLINE; + break; /* Build-time default */ + case PCRE_NEWLINE_CR: + newline = '\r'; + break; + case PCRE_NEWLINE_LF: + newline = '\n'; + break; + case PCRE_NEWLINE_CR + PCRE_NEWLINE_LF: + newline = ('\r' << 8) | '\n'; + break; + case PCRE_NEWLINE_ANY: + newline = -1; + break; + case PCRE_NEWLINE_ANYCRLF: + newline = -2; + break; + default: + errorcode = ERR56; + goto PCRE_EARLY_ERROR_RETURN; + } -/* Pretend to compile the pattern while actually just accumulating the length -of memory required. This behaviour is triggered by passing a non-NULL final -argument to compile_regex(). We pass a block of workspace (cworkspace) for it -to compile parts of the pattern into; the compiled code is discarded when it is -no longer needed, so hopefully this workspace will never overflow, though there -is a test for its doing so. */ + if (newline == -2) { + cd->nltype = NLTYPE_ANYCRLF; + } else if (newline < 0) { + cd->nltype = NLTYPE_ANY; + } else { + cd->nltype = NLTYPE_FIXED; + if (newline > 255) { + cd->nllen = 2; + cd->nl[0] = (newline >> 8) & 255; + cd->nl[1] = newline & 255; + } else { + cd->nllen = 1; + cd->nl[0] = newline; + } + } -cd->bracount = cd->final_bracount = 0; -cd->names_found = 0; -cd->name_entry_size = 0; -cd->name_table = NULL; -cd->start_workspace = cworkspace; -cd->start_code = cworkspace; -cd->hwm = cworkspace; -cd->start_pattern = (const uschar *)pattern; -cd->end_pattern = (const uschar *)(pattern + strlen(pattern)); -cd->req_varyopt = 0; -cd->external_options = options; -cd->external_flags = 0; + /* Maximum back reference and backref bitmap. The bitmap records up to 31 + back references to help in deciding whether (.*) can be treated as anchored + or not. + */ -/* Now do the pre-compile. 
On error, errorcode will be set non-zero, so we -don't need to look at the result of the function here. The initial options have -been put into the cd block so that they can be changed if an option setting is -found within the regex right at the beginning. Bringing initial option settings -outside can help speed up starting point checks. */ + cd->top_backref = 0; + cd->backref_map = 0; -ptr += skipatstart; -code = cworkspace; -*code = OP_BRA; -(void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS, - &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, - &length); -if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN; + /* Reflect pattern for debugging output */ -DPRINTF(("end pre-compile: length=%d workspace=%d\n", length, - cd->hwm - cworkspace)); + DPRINTF( + ("------------------------------------------------------------------" + "\n")); + DPRINTF(("%s\n", pattern)); -if (length > MAX_PATTERN_SIZE) - { - errorcode = ERR20; - goto PCRE_EARLY_ERROR_RETURN; - } + /* Pretend to compile the pattern while actually just accumulating the + length of memory required. This behaviour is triggered by passing a non-NULL + final argument to compile_regex(). We pass a block of workspace (cworkspace) + for it to compile parts of the pattern into; the compiled code is discarded + when it is no longer needed, so hopefully this workspace will never + overflow, though there is a test for its doing so. */ -/* Compute the size of data block needed and get it, either from malloc or -externally provided function. Integer overflow should no longer be possible -because nowadays we limit the maximum value of cd->names_found and -cd->name_entry_size. 
*/ + cd->bracount = cd->final_bracount = 0; + cd->names_found = 0; + cd->name_entry_size = 0; + cd->name_table = NULL; + cd->start_workspace = cworkspace; + cd->start_code = cworkspace; + cd->hwm = cworkspace; + cd->start_pattern = (const uschar*)pattern; + cd->end_pattern = (const uschar*)(pattern + strlen(pattern)); + cd->req_varyopt = 0; + cd->external_options = options; + cd->external_flags = 0; -size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3); -re = (real_pcre *)(pcre_malloc)(size); + /* Now do the pre-compile. On error, errorcode will be set non-zero, so we + don't need to look at the result of the function here. The initial options + have been put into the cd block so that they can be changed if an option + setting is found within the regex right at the beginning. Bringing initial + option settings outside can help speed up starting point checks. */ -if (re == NULL) - { - errorcode = ERR21; - goto PCRE_EARLY_ERROR_RETURN; - } + ptr += skipatstart; + code = cworkspace; + *code = OP_BRA; + (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS, + &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, + &reqbyte, NULL, cd, &length); + if (errorcode != 0) + goto PCRE_EARLY_ERROR_RETURN; -/* Put in the magic number, and save the sizes, initial options, internal -flags, and character table pointer. NULL is used for the default character -tables. The nullpad field is at the end; it's there to help in the case when a -regex compiled on a system with 4-byte pointers is run on another with 8-byte -pointers. 
*/ + DPRINTF(("end pre-compile: length=%d workspace=%d\n", length, + cd->hwm - cworkspace)); -re->magic_number = MAGIC_NUMBER; -re->size = size; -re->options = cd->external_options; -re->flags = cd->external_flags; -re->dummy1 = 0; -re->first_byte = 0; -re->req_byte = 0; -re->name_table_offset = sizeof(real_pcre); -re->name_entry_size = cd->name_entry_size; -re->name_count = cd->names_found; -re->ref_count = 0; -re->tables = (tables == _pcre_default_tables)? NULL : tables; -re->nullpad = NULL; + if (length > MAX_PATTERN_SIZE) { + errorcode = ERR20; + goto PCRE_EARLY_ERROR_RETURN; + } -/* The starting points of the name/number translation table and of the code are -passed around in the compile data block. The start/end pattern and initial -options are already set from the pre-compile phase, as is the name_entry_size -field. Reset the bracket count and the names_found field. Also reset the hwm -field; this time it's used for remembering forward references to subpatterns. -*/ + /* Compute the size of data block needed and get it, either from malloc or + externally provided function. Integer overflow should no longer be possible + because nowadays we limit the maximum value of cd->names_found and + cd->name_entry_size. */ -cd->final_bracount = cd->bracount; /* Save for checking forward references */ -cd->bracount = 0; -cd->names_found = 0; -cd->name_table = (uschar *)re + re->name_table_offset; -codestart = cd->name_table + re->name_entry_size * re->name_count; -cd->start_code = codestart; -cd->hwm = cworkspace; -cd->req_varyopt = 0; -cd->had_accept = FALSE; + size = length + sizeof(real_pcre) + + cd->names_found * (cd->name_entry_size + 3); + re = (real_pcre*)(pcre_malloc)(size); -/* Set up a starting, non-extracting bracket, then compile the expression. On -error, errorcode will be set non-zero, so we don't need to look at the result -of the function here. 
*/ + if (re == NULL) { + errorcode = ERR21; + goto PCRE_EARLY_ERROR_RETURN; + } -ptr = (const uschar *)pattern + skipatstart; -code = (uschar *)codestart; -*code = OP_BRA; -(void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr, - &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL); -re->top_bracket = cd->bracount; -re->top_backref = cd->top_backref; -re->flags = cd->external_flags; + /* Put in the magic number, and save the sizes, initial options, internal + flags, and character table pointer. NULL is used for the default character + tables. The nullpad field is at the end; it's there to help in the case when + a regex compiled on a system with 4-byte pointers is run on another with + 8-byte pointers. */ -if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */ + re->magic_number = MAGIC_NUMBER; + re->size = size; + re->options = cd->external_options; + re->flags = cd->external_flags; + re->dummy1 = 0; + re->first_byte = 0; + re->req_byte = 0; + re->name_table_offset = sizeof(real_pcre); + re->name_entry_size = cd->name_entry_size; + re->name_count = cd->names_found; + re->ref_count = 0; + re->tables = (tables == _pcre_default_tables) ? NULL : tables; + re->nullpad = NULL; -/* If not reached end of pattern on success, there's an excess bracket. */ + /* The starting points of the name/number translation table and of the code + are passed around in the compile data block. The start/end pattern and + initial options are already set from the pre-compile phase, as is the + name_entry_size field. Reset the bracket count and the names_found field. + Also reset the hwm field; this time it's used for remembering forward + references to subpatterns. 
+ */ -if (errorcode == 0 && *ptr != 0) errorcode = ERR22; + cd->final_bracount = + cd->bracount; /* Save for checking forward references */ + cd->bracount = 0; + cd->names_found = 0; + cd->name_table = (uschar*)re + re->name_table_offset; + codestart = cd->name_table + re->name_entry_size * re->name_count; + cd->start_code = codestart; + cd->hwm = cworkspace; + cd->req_varyopt = 0; + cd->had_accept = FALSE; -/* Fill in the terminating state and check for disastrous overflow, but -if debugging, leave the test till after things are printed out. */ + /* Set up a starting, non-extracting bracket, then compile the expression. + On error, errorcode will be set non-zero, so we don't need to look at the + result of the function here. */ -*code++ = OP_END; + ptr = (const uschar*)pattern + skipatstart; + code = (uschar*)codestart; + *code = OP_BRA; + (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr, + &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, + cd, NULL); + re->top_bracket = cd->bracount; + re->top_backref = cd->top_backref; + re->flags = cd->external_flags; + + if (cd->had_accept) + reqbyte = -1; /* Must disable after (*ACCEPT) */ + + /* If not reached end of pattern on success, there's an excess bracket. */ + + if (errorcode == 0 && *ptr != 0) + errorcode = ERR22; + + /* Fill in the terminating state and check for disastrous overflow, but + if debugging, leave the test till after things are printed out. */ + + *code++ = OP_END; #ifndef DEBUG -if (code - codestart > length) errorcode = ERR23; + if (code - codestart > length) + errorcode = ERR23; #endif -/* Fill in any forward references that are required. */ + /* Fill in any forward references that are required. 
*/ -while (errorcode == 0 && cd->hwm > cworkspace) - { - int offset, recno; - const uschar *groupptr; - cd->hwm -= LINK_SIZE; - offset = GET(cd->hwm, 0); - recno = GET(codestart, offset); - groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno); - if (groupptr == NULL) errorcode = ERR53; - else PUT(((uschar *)codestart), offset, groupptr - codestart); - } - -/* Give an error if there's back reference to a non-existent capturing -subpattern. */ - -if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15; - -/* Failed to compile, or error while post-processing */ - -if (errorcode != 0) - { - (pcre_free)(re); - PCRE_EARLY_ERROR_RETURN: - *erroroffset = ptr - (const uschar *)pattern; - PCRE_EARLY_ERROR_RETURN2: - *errorptr = find_error_text(errorcode); - if (errorcodeptr != NULL) *errorcodeptr = errorcode; - return NULL; - } - -/* If the anchored option was not passed, set the flag if we can determine that -the pattern is anchored by virtue of ^ characters or \A or anything else (such -as starting with .* when DOTALL is set). - -Otherwise, if we know what the first byte has to be, save it, because that -speeds up unanchored matches no end. If not, see if we can set the -PCRE_STARTLINE flag. This is helpful for multiline matches when all branches -start with ^. and also when all branches start with .* for non-DOTALL matches. -*/ - -if ((re->options & PCRE_ANCHORED) == 0) - { - int temp_options = re->options; /* May get changed during these scans */ - if (is_anchored(codestart, &temp_options, 0, cd->backref_map)) - re->options |= PCRE_ANCHORED; - else - { - if (firstbyte < 0) - firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE); - if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */ - { - int ch = firstbyte & 255; - re->first_byte = ((firstbyte & REQ_CASELESS) != 0 && - cd->fcc[ch] == ch)? 
ch : firstbyte; - re->flags |= PCRE_FIRSTSET; - } - else if (is_startline(codestart, 0, cd->backref_map)) - re->flags |= PCRE_STARTLINE; + while (errorcode == 0 && cd->hwm > cworkspace) { + int offset, recno; + const uschar* groupptr; + cd->hwm -= LINK_SIZE; + offset = GET(cd->hwm, 0); + recno = GET(codestart, offset); + groupptr = + find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno); + if (groupptr == NULL) + errorcode = ERR53; + else + PUT(((uschar*)codestart), offset, groupptr - codestart); } - } -/* For an anchored pattern, we use the "required byte" only if it follows a -variable length item in the regex. Remove the caseless flag for non-caseable -bytes. */ + /* Give an error if there's back reference to a non-existent capturing + subpattern. */ -if (reqbyte >= 0 && - ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0)) - { - int ch = reqbyte & 255; - re->req_byte = ((reqbyte & REQ_CASELESS) != 0 && - cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte; - re->flags |= PCRE_REQCHSET; - } + if (errorcode == 0 && re->top_backref > re->top_bracket) + errorcode = ERR15; -/* Print out the compiled data if debugging is enabled. This is never the -case when building a production library. */ + /* Failed to compile, or error while post-processing */ + + if (errorcode != 0) { + (pcre_free)(re); + PCRE_EARLY_ERROR_RETURN: + *erroroffset = ptr - (const uschar*)pattern; + PCRE_EARLY_ERROR_RETURN2: + *errorptr = find_error_text(errorcode); + if (errorcodeptr != NULL) + *errorcodeptr = errorcode; + return NULL; + } + + /* If the anchored option was not passed, set the flag if we can determine + that the pattern is anchored by virtue of ^ characters or \A or anything + else (such as starting with .* when DOTALL is set). + + Otherwise, if we know what the first byte has to be, save it, because that + speeds up unanchored matches no end. If not, see if we can set the + PCRE_STARTLINE flag. 
This is helpful for multiline matches when all branches + start with ^. and also when all branches start with .* for non-DOTALL + matches. + */ + + if ((re->options & PCRE_ANCHORED) == 0) { + int temp_options = re->options; /* May get changed during these scans */ + if (is_anchored(codestart, &temp_options, 0, cd->backref_map)) + re->options |= PCRE_ANCHORED; + else { + if (firstbyte < 0) + firstbyte = + find_firstassertedchar(codestart, &temp_options, FALSE); + if (firstbyte >= + 0) /* Remove caseless flag for non-caseable chars */ + { + int ch = firstbyte & 255; + re->first_byte = + ((firstbyte & REQ_CASELESS) != 0 && cd->fcc[ch] == ch) + ? ch + : firstbyte; + re->flags |= PCRE_FIRSTSET; + } else if (is_startline(codestart, 0, cd->backref_map)) + re->flags |= PCRE_STARTLINE; + } + } + + /* For an anchored pattern, we use the "required byte" only if it follows a + variable length item in the regex. Remove the caseless flag for non-caseable + bytes. */ + + if (reqbyte >= 0 && + ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0)) { + int ch = reqbyte & 255; + re->req_byte = ((reqbyte & REQ_CASELESS) != 0 && cd->fcc[ch] == ch) + ? (reqbyte & ~REQ_CASELESS) + : reqbyte; + re->flags |= PCRE_REQCHSET; + } + + /* Print out the compiled data if debugging is enabled. This is never the + case when building a production library. */ #ifdef DEBUG -printf("Length = %d top_bracket = %d top_backref = %d\n", - length, re->top_bracket, re->top_backref); + printf("Length = %d top_bracket = %d top_backref = %d\n", length, + re->top_bracket, re->top_backref); -printf("Options=%08x\n", re->options); + printf("Options=%08x\n", re->options); -if ((re->flags & PCRE_FIRSTSET) != 0) - { - int ch = re->first_byte & 255; - const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? 
- "" : " (caseless)"; - if (isprint(ch)) printf("First char = %c%s\n", ch, caseless); - else printf("First char = \\x%02x%s\n", ch, caseless); - } + if ((re->flags & PCRE_FIRSTSET) != 0) { + int ch = re->first_byte & 255; + const char* caseless = + ((re->first_byte & REQ_CASELESS) == 0) ? "" : " (caseless)"; + if (isprint(ch)) + printf("First char = %c%s\n", ch, caseless); + else + printf("First char = \\x%02x%s\n", ch, caseless); + } -if ((re->flags & PCRE_REQCHSET) != 0) - { - int ch = re->req_byte & 255; - const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? - "" : " (caseless)"; - if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless); - else printf("Req char = \\x%02x%s\n", ch, caseless); - } + if ((re->flags & PCRE_REQCHSET) != 0) { + int ch = re->req_byte & 255; + const char* caseless = + ((re->req_byte & REQ_CASELESS) == 0) ? "" : " (caseless)"; + if (isprint(ch)) + printf("Req char = %c%s\n", ch, caseless); + else + printf("Req char = \\x%02x%s\n", ch, caseless); + } -pcre_printint(re, stdout, TRUE); + pcre_printint(re, stdout, TRUE); -/* This check is done here in the debugging case so that the code that -was compiled can be seen. */ + /* This check is done here in the debugging case so that the code that + was compiled can be seen. 
*/ -if (code - codestart > length) - { - (pcre_free)(re); - *errorptr = find_error_text(ERR23); - *erroroffset = ptr - (uschar *)pattern; - if (errorcodeptr != NULL) *errorcodeptr = ERR23; - return NULL; - } -#endif /* DEBUG */ + if (code - codestart > length) { + (pcre_free)(re); + *errorptr = find_error_text(ERR23); + *erroroffset = ptr - (uschar*)pattern; + if (errorcodeptr != NULL) + *errorcodeptr = ERR23; + return NULL; + } +#endif /* DEBUG */ -return (pcre *)re; + return (pcre*)re; } /* End of pcre_compile.c */ diff --git a/package/re/pcre_exec.c b/package/re/pcre_exec.c index e7a8143a2..e3bd1b0ae 100644 --- a/package/re/pcre_exec.c +++ b/package/re/pcre_exec.c @@ -4,9 +4,9 @@ pattern matching using an NFA algorithm, trying to mimic Perl as closely as possible. There are also some static supporting functions. */ #include "re_config.h" -#define NLBLOCK md /* Block containing newline information */ -#define PSSTART start_subject /* Field containing processed string start */ -#define PSEND end_subject /* Field containing processed string end */ +#define NLBLOCK md /* Block containing newline information */ +#define PSSTART start_subject /* Field containing processed string start */ +#define PSEND end_subject /* Field containing processed string end */ #include "pcre_internal.h" @@ -17,22 +17,22 @@ possible. There are also some static supporting functions. */ /* Flag bits for the match() function */ -#define match_condassert 0x01 /* Called to check a condition assertion */ -#define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */ +#define match_condassert 0x01 /* Called to check a condition assertion */ +#define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */ /* Non-error returns from the match() function. Error returns are externally defined PCRE_ERROR_xxx codes, which are all negative. 
*/ -#define MATCH_MATCH 1 -#define MATCH_NOMATCH 0 +#define MATCH_MATCH 1 +#define MATCH_NOMATCH 0 /* Special internal returns from the match() function. Make them sufficiently negative to avoid the external error codes. */ -#define MATCH_COMMIT (-999) -#define MATCH_PRUNE (-998) -#define MATCH_SKIP (-997) -#define MATCH_THEN (-996) +#define MATCH_COMMIT (-999) +#define MATCH_PRUNE (-998) +#define MATCH_SKIP (-997) +#define MATCH_THEN (-996) /* Maximum number of ints of offset to save on the stack for recursive calls. If the offset vector is bigger, malloc is used. This should be a multiple of 3, @@ -42,15 +42,13 @@ because the offset vector is always a multiple of 3 long. */ /* Min and max values for the common repeats; for the maxima, 0 => infinity */ -static const char rep_min[] = { 0, 0, 1, 1, 0, 0 }; -static const char rep_max[] = { 0, 0, 0, 0, 1, 1 }; - - +static const char rep_min[] = {0, 0, 1, 1, 0, 0}; +static const char rep_max[] = {0, 0, 0, 0, 1, 1}; #ifdef DEBUG /************************************************* -* Debugging function to print chars * -*************************************************/ + * Debugging function to print chars * + *************************************************/ /* Print a sequence of chars in printable format, stopping at the end of the subject if the requested. 
@@ -64,21 +62,24 @@ Arguments: Returns: nothing */ -static void -pchars(const uschar *p, int length, BOOL is_subject, match_data *md) -{ -unsigned int c; -if (is_subject && length > md->end_subject - p) length = md->end_subject - p; -while (length-- > 0) - if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c); +static void pchars(const uschar* p, + int length, + BOOL is_subject, + match_data* md) { + unsigned int c; + if (is_subject && length > md->end_subject - p) + length = md->end_subject - p; + while (length-- > 0) + if (isprint(c = *(p++))) + printf("%c", c); + else + printf("\\x%02x", c); } #endif - - /************************************************* -* Match a back-reference * -*************************************************/ + * Match a back-reference * + *************************************************/ /* If a back reference hasn't been set, the length that is passed is greater than the number of characters left in the string, so the match fails. @@ -93,44 +94,45 @@ Arguments: Returns: TRUE if matched */ -static BOOL -match_ref(int offset, register USPTR eptr, int length, match_data *md, - unsigned long int ims) -{ -USPTR p = md->start_subject + md->offset_vector[offset]; +static BOOL match_ref(int offset, + register USPTR eptr, + int length, + match_data* md, + unsigned long int ims) { + USPTR p = md->start_subject + md->offset_vector[offset]; #ifdef DEBUG -if (eptr >= md->end_subject) - printf("matching subject "); -else - { - printf("matching subject "); - pchars(eptr, length, TRUE, md); - } -printf(" against backref "); -pchars(p, length, FALSE, md); -printf("\n"); + if (eptr >= md->end_subject) + printf("matching subject "); + else { + printf("matching subject "); + pchars(eptr, length, TRUE, md); + } + printf(" against backref "); + pchars(p, length, FALSE, md); + printf("\n"); #endif -/* Always fail if not enough characters left */ + /* Always fail if not enough characters left */ -if (length > md->end_subject - eptr) return 
FALSE; + if (length > md->end_subject - eptr) + return FALSE; -/* Separate the caselesss case for speed */ + /* Separate the caselesss case for speed */ -if ((ims & PCRE_CASELESS) != 0) - { - while (length-- > 0) - if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; - } -else - { while (length-- > 0) if (*p++ != *eptr++) return FALSE; } + if ((ims & PCRE_CASELESS) != 0) { + while (length-- > 0) + if (md->lcc[*p++] != md->lcc[*eptr++]) + return FALSE; + } else { + while (length-- > 0) + if (*p++ != *eptr++) + return FALSE; + } -return TRUE; + return TRUE; } - - /*************************************************************************** **************************************************************************** RECURSION IN THE match() FUNCTION @@ -172,12 +174,62 @@ variable instead of being passed in the frame. /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN below must be updated in sync. */ -enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10, - RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20, - RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30, - RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40, - RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50, - RM51, RM52, RM53, RM54 }; +enum { + RM1 = 1, + RM2, + RM3, + RM4, + RM5, + RM6, + RM7, + RM8, + RM9, + RM10, + RM11, + RM12, + RM13, + RM14, + RM15, + RM16, + RM17, + RM18, + RM19, + RM20, + RM21, + RM22, + RM23, + RM24, + RM25, + RM26, + RM27, + RM28, + RM29, + RM30, + RM31, + RM32, + RM33, + RM34, + RM35, + RM36, + RM37, + RM38, + RM39, + RM40, + RM41, + RM42, + RM43, + RM44, + RM45, + RM46, + RM47, + RM48, + RM49, + RM50, + RM51, + RM52, + RM53, + RM54 +}; /* These versions of the macros use the stack, as normal. There are debugging versions and production versions. Note that the "rw" argument of RMATCH isn't @@ -187,143 +239,136 @@ actuall used in this definition. 
*/ #define REGISTER register #ifdef DEBUG -#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \ - { \ - printf("match() called in line %d\n", __LINE__); \ - rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \ - printf("to line %d\n", __LINE__); \ - } -#define RRETURN(ra) \ - { \ - printf("match() returned %d from line %d ", ra, __LINE__); \ - return ra; \ - } +#define RMATCH(ra, rb, rc, rd, re, rf, rg, rw) \ + { \ + printf("match() called in line %d\n", __LINE__); \ + rrc = match(ra, rb, mstart, rc, rd, re, rf, rg, rdepth + 1); \ + printf("to line %d\n", __LINE__); \ + } +#define RRETURN(ra) \ + { \ + printf("match() returned %d from line %d ", ra, __LINE__); \ + return ra; \ + } #else -#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \ - rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1) +#define RMATCH(ra, rb, rc, rd, re, rf, rg, rw) \ + rrc = match(ra, rb, mstart, rc, rd, re, rf, rg, rdepth + 1) #define RRETURN(ra) return ra #endif #else - /* These versions of the macros manage a private stack on the heap. Note that the "rd" argument of RMATCH isn't actually used in this definition. It's the md argument of match(), which never changes. 
*/ #define REGISTER -#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\ - {\ - heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\ - frame->Xwhere = rw; \ - newframe->Xeptr = ra;\ - newframe->Xecode = rb;\ - newframe->Xmstart = mstart;\ - newframe->Xoffset_top = rc;\ - newframe->Xims = re;\ - newframe->Xeptrb = rf;\ - newframe->Xflags = rg;\ - newframe->Xrdepth = frame->Xrdepth + 1;\ - newframe->Xprevframe = frame;\ - frame = newframe;\ - DPRINTF(("restarting from line %d\n", __LINE__));\ - goto HEAP_RECURSE;\ - L_##rw:\ - DPRINTF(("jumped back to line %d\n", __LINE__));\ - } - -#define RRETURN(ra)\ - {\ - heapframe *newframe = frame;\ - frame = newframe->Xprevframe;\ - (pcre_stack_free)(newframe);\ - if (frame != NULL)\ - {\ - rrc = ra;\ - goto HEAP_RETURN;\ - }\ - return ra;\ - } +#define RMATCH(ra, rb, rc, rd, re, rf, rg, rw) \ + { \ + heapframe* newframe = (pcre_stack_malloc)(sizeof(heapframe)); \ + frame->Xwhere = rw; \ + newframe->Xeptr = ra; \ + newframe->Xecode = rb; \ + newframe->Xmstart = mstart; \ + newframe->Xoffset_top = rc; \ + newframe->Xims = re; \ + newframe->Xeptrb = rf; \ + newframe->Xflags = rg; \ + newframe->Xrdepth = frame->Xrdepth + 1; \ + newframe->Xprevframe = frame; \ + frame = newframe; \ + DPRINTF(("restarting from line %d\n", __LINE__)); \ + goto HEAP_RECURSE; \ + L_##rw : DPRINTF(("jumped back to line %d\n", __LINE__)); \ + } +#define RRETURN(ra) \ + { \ + heapframe* newframe = frame; \ + frame = newframe->Xprevframe; \ + (pcre_stack_free)(newframe); \ + if (frame != NULL) { \ + rrc = ra; \ + goto HEAP_RETURN; \ + } \ + return ra; \ + } /* Structure for remembering the local variables in a private frame */ typedef struct heapframe { - struct heapframe *Xprevframe; + struct heapframe* Xprevframe; - /* Function arguments that may change */ + /* Function arguments that may change */ - const uschar *Xeptr; - const uschar *Xecode; - const uschar *Xmstart; - int Xoffset_top; - long int Xims; - eptrblock *Xeptrb; - int Xflags; - unsigned int 
Xrdepth; + const uschar* Xeptr; + const uschar* Xecode; + const uschar* Xmstart; + int Xoffset_top; + long int Xims; + eptrblock* Xeptrb; + int Xflags; + unsigned int Xrdepth; - /* Function local variables */ + /* Function local variables */ - const uschar *Xcallpat; - const uschar *Xcharptr; - const uschar *Xdata; - const uschar *Xnext; - const uschar *Xpp; - const uschar *Xprev; - const uschar *Xsaved_eptr; + const uschar* Xcallpat; + const uschar* Xcharptr; + const uschar* Xdata; + const uschar* Xnext; + const uschar* Xpp; + const uschar* Xprev; + const uschar* Xsaved_eptr; - recursion_info Xnew_recursive; + recursion_info Xnew_recursive; - BOOL Xcur_is_word; - BOOL Xcondition; - BOOL Xprev_is_word; + BOOL Xcur_is_word; + BOOL Xcondition; + BOOL Xprev_is_word; - unsigned long int Xoriginal_ims; + unsigned long int Xoriginal_ims; #ifdef SUPPORT_UCP - int Xprop_type; - int Xprop_value; - int Xprop_fail_result; - int Xprop_category; - int Xprop_chartype; - int Xprop_script; - int Xoclength; - uschar Xocchars[8]; + int Xprop_type; + int Xprop_value; + int Xprop_fail_result; + int Xprop_category; + int Xprop_chartype; + int Xprop_script; + int Xoclength; + uschar Xocchars[8]; #endif - int Xctype; - unsigned int Xfc; - int Xfi; - int Xlength; - int Xmax; - int Xmin; - int Xnumber; - int Xoffset; - int Xop; - int Xsave_capture_last; - int Xsave_offset1, Xsave_offset2, Xsave_offset3; - int Xstacksave[REC_STACK_SAVE_MAX]; + int Xctype; + unsigned int Xfc; + int Xfi; + int Xlength; + int Xmax; + int Xmin; + int Xnumber; + int Xoffset; + int Xop; + int Xsave_capture_last; + int Xsave_offset1, Xsave_offset2, Xsave_offset3; + int Xstacksave[REC_STACK_SAVE_MAX]; - eptrblock Xnewptrb; + eptrblock Xnewptrb; - /* Where to jump back to */ + /* Where to jump back to */ - int Xwhere; + int Xwhere; } heapframe; #endif - /*************************************************************************** ***************************************************************************/ - - 
/************************************************* -* Match from current position * -*************************************************/ + * Match from current position * + *************************************************/ /* This function is called recursively in many circumstances. Whenever it returns a negative (error) response, the outer incarnation must also return the @@ -356,3874 +401,4150 @@ Returns: MATCH_MATCH if matched ) these values are >= 0 (e.g. stopped by repeated call or recursion limit) */ -static int -match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart, - int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb, - int flags, unsigned int rdepth) -{ -/* These variables do not need to be preserved over recursion in this function, -so they can be ordinary variables in all cases. Mark some of them with -"register" because they are used a lot in loops. */ +static int match(REGISTER USPTR eptr, + REGISTER const uschar* ecode, + const uschar* mstart, + int offset_top, + match_data* md, + unsigned long int ims, + eptrblock* eptrb, + int flags, + unsigned int rdepth) { + /* These variables do not need to be preserved over recursion in this + function, so they can be ordinary variables in all cases. Mark some of them + with "register" because they are used a lot in loops. 
*/ -register int rrc; /* Returns from recursive calls */ -register int i; /* Used for loops not involving calls to RMATCH() */ -register unsigned int c; /* Character values not kept over RMATCH() calls */ -register BOOL utf8; /* Local copy of UTF-8 flag for speed */ + register int rrc; /* Returns from recursive calls */ + register int i; /* Used for loops not involving calls to RMATCH() */ + register unsigned int c; /* Character values not kept over RMATCH() calls */ + register BOOL utf8; /* Local copy of UTF-8 flag for speed */ -BOOL minimize, possessive; /* Quantifier options */ + BOOL minimize, possessive; /* Quantifier options */ -/* When recursion is not being used, all "local" variables that have to be -preserved over calls to RMATCH() are part of a "frame" which is obtained from -heap storage. Set up the top-level frame here; others are obtained from the -heap whenever RMATCH() does a "recursion". See the macro definitions above. */ + /* When recursion is not being used, all "local" variables that have to be + preserved over calls to RMATCH() are part of a "frame" which is obtained + from heap storage. Set up the top-level frame here; others are obtained from + the heap whenever RMATCH() does a "recursion". See the macro definitions + above. 
*/ #ifdef NO_RECURSE -heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe)); -frame->Xprevframe = NULL; /* Marks the top level */ + heapframe* frame = (pcre_stack_malloc)(sizeof(heapframe)); + frame->Xprevframe = NULL; /* Marks the top level */ -/* Copy in the original argument variables */ + /* Copy in the original argument variables */ -frame->Xeptr = eptr; -frame->Xecode = ecode; -frame->Xmstart = mstart; -frame->Xoffset_top = offset_top; -frame->Xims = ims; -frame->Xeptrb = eptrb; -frame->Xflags = flags; -frame->Xrdepth = rdepth; + frame->Xeptr = eptr; + frame->Xecode = ecode; + frame->Xmstart = mstart; + frame->Xoffset_top = offset_top; + frame->Xims = ims; + frame->Xeptrb = eptrb; + frame->Xflags = flags; + frame->Xrdepth = rdepth; -/* This is where control jumps back to to effect "recursion" */ + /* This is where control jumps back to to effect "recursion" */ HEAP_RECURSE: -/* Macros make the argument variables come from the current frame */ + /* Macros make the argument variables come from the current frame */ -#define eptr frame->Xeptr -#define ecode frame->Xecode -#define mstart frame->Xmstart -#define offset_top frame->Xoffset_top -#define ims frame->Xims -#define eptrb frame->Xeptrb -#define flags frame->Xflags -#define rdepth frame->Xrdepth +#define eptr frame->Xeptr +#define ecode frame->Xecode +#define mstart frame->Xmstart +#define offset_top frame->Xoffset_top +#define ims frame->Xims +#define eptrb frame->Xeptrb +#define flags frame->Xflags +#define rdepth frame->Xrdepth -/* Ditto for the local variables */ + /* Ditto for the local variables */ #ifdef SUPPORT_UTF8 -#define charptr frame->Xcharptr +#define charptr frame->Xcharptr #endif -#define callpat frame->Xcallpat -#define data frame->Xdata -#define next frame->Xnext -#define pp frame->Xpp -#define prev frame->Xprev -#define saved_eptr frame->Xsaved_eptr +#define callpat frame->Xcallpat +#define data frame->Xdata +#define next frame->Xnext +#define pp frame->Xpp +#define prev frame->Xprev 
+#define saved_eptr frame->Xsaved_eptr -#define new_recursive frame->Xnew_recursive +#define new_recursive frame->Xnew_recursive -#define cur_is_word frame->Xcur_is_word -#define condition frame->Xcondition -#define prev_is_word frame->Xprev_is_word +#define cur_is_word frame->Xcur_is_word +#define condition frame->Xcondition +#define prev_is_word frame->Xprev_is_word -#define original_ims frame->Xoriginal_ims +#define original_ims frame->Xoriginal_ims #ifdef SUPPORT_UCP -#define prop_type frame->Xprop_type -#define prop_value frame->Xprop_value -#define prop_fail_result frame->Xprop_fail_result -#define prop_category frame->Xprop_category -#define prop_chartype frame->Xprop_chartype -#define prop_script frame->Xprop_script -#define oclength frame->Xoclength -#define occhars frame->Xocchars +#define prop_type frame->Xprop_type +#define prop_value frame->Xprop_value +#define prop_fail_result frame->Xprop_fail_result +#define prop_category frame->Xprop_category +#define prop_chartype frame->Xprop_chartype +#define prop_script frame->Xprop_script +#define oclength frame->Xoclength +#define occhars frame->Xocchars #endif -#define ctype frame->Xctype -#define fc frame->Xfc -#define fi frame->Xfi -#define length frame->Xlength -#define max frame->Xmax -#define min frame->Xmin -#define number frame->Xnumber -#define offset frame->Xoffset -#define op frame->Xop -#define save_capture_last frame->Xsave_capture_last -#define save_offset1 frame->Xsave_offset1 -#define save_offset2 frame->Xsave_offset2 -#define save_offset3 frame->Xsave_offset3 -#define stacksave frame->Xstacksave +#define ctype frame->Xctype +#define fc frame->Xfc +#define fi frame->Xfi +#define length frame->Xlength +#define max frame->Xmax +#define min frame->Xmin +#define number frame->Xnumber +#define offset frame->Xoffset +#define op frame->Xop +#define save_capture_last frame->Xsave_capture_last +#define save_offset1 frame->Xsave_offset1 +#define save_offset2 frame->Xsave_offset2 +#define save_offset3 
frame->Xsave_offset3 +#define stacksave frame->Xstacksave -#define newptrb frame->Xnewptrb +#define newptrb frame->Xnewptrb -/* When recursion is being used, local variables are allocated on the stack and -get preserved during recursion in the normal way. In this environment, fi and -i, and fc and c, can be the same variables. */ + /* When recursion is being used, local variables are allocated on the stack + and get preserved during recursion in the normal way. In this environment, + fi and i, and fc and c, can be the same variables. */ -#else /* NO_RECURSE not defined */ +#else /* NO_RECURSE not defined */ #define fi i #define fc c +#ifdef SUPPORT_UTF8 /* Many of these variables are used only */ + const uschar* charptr; /* in small blocks of the code. My normal */ +#endif /* style of coding would have declared */ + const uschar* callpat; /* them within each of those blocks. */ + const uschar* data; /* However, in order to accommodate the */ + const uschar* next; /* version of this code that uses an */ + USPTR pp; /* external "stack" implemented on the */ + const uschar* prev; /* heap, it is easier to declare them all */ + USPTR saved_eptr; /* here, so the declarations can be cut */ + /* out in a block. The only declarations */ + recursion_info new_recursive; /* within blocks below are for variables */ + /* that do not have to be preserved over */ + BOOL cur_is_word; /* a recursive call to RMATCH(). */ + BOOL condition; + BOOL prev_is_word; -#ifdef SUPPORT_UTF8 /* Many of these variables are used only */ -const uschar *charptr; /* in small blocks of the code. My normal */ -#endif /* style of coding would have declared */ -const uschar *callpat; /* them within each of those blocks. 
*/ -const uschar *data; /* However, in order to accommodate the */ -const uschar *next; /* version of this code that uses an */ -USPTR pp; /* external "stack" implemented on the */ -const uschar *prev; /* heap, it is easier to declare them all */ -USPTR saved_eptr; /* here, so the declarations can be cut */ - /* out in a block. The only declarations */ -recursion_info new_recursive; /* within blocks below are for variables */ - /* that do not have to be preserved over */ -BOOL cur_is_word; /* a recursive call to RMATCH(). */ -BOOL condition; -BOOL prev_is_word; - -unsigned long int original_ims; + unsigned long int original_ims; #ifdef SUPPORT_UCP -int prop_type; -int prop_value; -int prop_fail_result; -int prop_category; -int prop_chartype; -int prop_script; -int oclength; -uschar occhars[8]; + int prop_type; + int prop_value; + int prop_fail_result; + int prop_category; + int prop_chartype; + int prop_script; + int oclength; + uschar occhars[8]; #endif -int ctype; -int length; -int max; -int min; -int number; -int offset; -int op; -int save_capture_last; -int save_offset1, save_offset2, save_offset3; -int stacksave[REC_STACK_SAVE_MAX]; + int ctype; + int length; + int max; + int min; + int number; + int offset; + int op; + int save_capture_last; + int save_offset1, save_offset2, save_offset3; + int stacksave[REC_STACK_SAVE_MAX]; -eptrblock newptrb; -#endif /* NO_RECURSE */ + eptrblock newptrb; +#endif /* NO_RECURSE */ -/* These statements are here to stop the compiler complaining about unitialized -variables. */ + /* These statements are here to stop the compiler complaining about + unitialized variables. */ #ifdef SUPPORT_UCP -prop_value = 0; -prop_fail_result = 0; + prop_value = 0; + prop_fail_result = 0; #endif - -/* This label is used for tail recursion, which is used in a few cases even -when NO_RECURSE is not defined, in order to reduce the amount of stack that is -used. Thanks to Ian Taylor for noticing this possibility and sending the -original patch. 
*/ + /* This label is used for tail recursion, which is used in a few cases even + when NO_RECURSE is not defined, in order to reduce the amount of stack that + is used. Thanks to Ian Taylor for noticing this possibility and sending the + original patch. */ TAIL_RECURSE: -/* OK, now we can get on with the real code of the function. Recursive calls -are specified by the macro RMATCH and RRETURN is used to return. When -NO_RECURSE is *not* defined, these just turn into a recursive call to match() -and a "return", respectively (possibly with some debugging if DEBUG is -defined). However, RMATCH isn't like a function call because it's quite a -complicated macro. It has to be used in one particular way. This shouldn't, -however, impact performance when true recursion is being used. */ + /* OK, now we can get on with the real code of the function. Recursive calls + are specified by the macro RMATCH and RRETURN is used to return. When + NO_RECURSE is *not* defined, these just turn into a recursive call to + match() and a "return", respectively (possibly with some debugging if DEBUG + is defined). However, RMATCH isn't like a function call because it's quite a + complicated macro. It has to be used in one particular way. This shouldn't, + however, impact performance when true recursion is being used. */ #ifdef SUPPORT_UTF8 -utf8 = md->utf8; /* Local copy of the flag */ + utf8 = md->utf8; /* Local copy of the flag */ #else -utf8 = FALSE; + utf8 = FALSE; #endif -/* First check that we haven't called match() too many times, or that we -haven't exceeded the recursive call limit. */ - -if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT); -if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT); - -original_ims = ims; /* Save for resetting on ')' */ - -/* At the start of a group with an unlimited repeat that may match an empty -string, the match_cbegroup flag is set. 
When this is the case, add the current -subject pointer to the chain of such remembered pointers, to be checked when we -hit the closing ket, in order to break infinite loops that match no characters. -When match() is called in other circumstances, don't add to the chain. The -match_cbegroup flag must NOT be used with tail recursion, because the memory -block that is used is on the stack, so a new one may be required for each -match(). */ - -if ((flags & match_cbegroup) != 0) - { - newptrb.epb_saved_eptr = eptr; - newptrb.epb_prev = eptrb; - eptrb = &newptrb; - } - -/* Now start processing the opcodes. */ - -for (;;) - { - minimize = possessive = FALSE; - op = *ecode; - - /* For partial matching, remember if we ever hit the end of the subject after - matching at least one subject character. */ - - if (md->partial && - eptr >= md->end_subject && - eptr > mstart) - md->hitend = TRUE; - - switch(op) - { - case OP_FAIL: - RRETURN(MATCH_NOMATCH); - - case OP_PRUNE: - RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, - ims, eptrb, flags, RM51); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - RRETURN(MATCH_PRUNE); - - case OP_COMMIT: - RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, - ims, eptrb, flags, RM52); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - RRETURN(MATCH_COMMIT); - - case OP_SKIP: - RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, - ims, eptrb, flags, RM53); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - md->start_match_ptr = eptr; /* Pass back current position */ - RRETURN(MATCH_SKIP); - - case OP_THEN: - RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, - ims, eptrb, flags, RM54); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - RRETURN(MATCH_THEN); - - /* Handle a capturing bracket. If there is space in the offset vector, save - the current subject position in the working slot at the top of the vector. 
- We mustn't change the current values of the data slot, because they may be - set from a previous iteration of this group, and be referred to by a - reference inside the group. - - If the bracket fails to match, we need to restore this value and also the - values of the final offsets, in case they were set by a previous iteration - of the same bracket. - - If there isn't enough space in the offset vector, treat this as if it were - a non-capturing bracket. Don't worry about setting the flag for the error - case here; that is handled in the code for KET. */ - - case OP_CBRA: - case OP_SCBRA: - number = GET2(ecode, 1+LINK_SIZE); - offset = number << 1; - -#ifdef DEBUG - printf("start bracket %d\n", number); - printf("subject="); - pchars(eptr, 16, TRUE, md); - printf("\n"); -#endif - - if (offset < md->offset_max) - { - save_offset1 = md->offset_vector[offset]; - save_offset2 = md->offset_vector[offset+1]; - save_offset3 = md->offset_vector[md->offset_end - number]; - save_capture_last = md->capture_last; - - DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3)); - md->offset_vector[md->offset_end - number] = eptr - md->start_subject; - - flags = (op == OP_SCBRA)? match_cbegroup : 0; - do - { - RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, - ims, eptrb, flags, RM1); - if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); - md->capture_last = save_capture_last; - ecode += GET(ecode, 1); - } - while (*ecode == OP_ALT); - - DPRINTF(("bracket %d failed\n", number)); - - md->offset_vector[offset] = save_offset1; - md->offset_vector[offset+1] = save_offset2; - md->offset_vector[md->offset_end - number] = save_offset3; - - RRETURN(MATCH_NOMATCH); - } - - /* FALL THROUGH ... Insufficient room for saving captured contents. Treat - as a non-capturing bracket. 
*/ - - /* VVVVVVVVVVVVVVVVVVVVVVVVV */ - /* VVVVVVVVVVVVVVVVVVVVVVVVV */ - - DPRINTF(("insufficient capture room: treat as non-capturing\n")); - - /* VVVVVVVVVVVVVVVVVVVVVVVVV */ - /* VVVVVVVVVVVVVVVVVVVVVVVVV */ - - /* Non-capturing bracket. Loop for all the alternatives. When we get to the - final alternative within the brackets, we would return the result of a - recursive call to match() whatever happened. We can reduce stack usage by - turning this into a tail recursion, except in the case when match_cbegroup - is set.*/ - - case OP_BRA: - case OP_SBRA: - DPRINTF(("start non-capturing bracket\n")); - flags = (op >= OP_SBRA)? match_cbegroup : 0; - for (;;) - { - if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */ - { - if (flags == 0) /* Not a possibly empty group */ - { - ecode += _pcre_OP_lengths[*ecode]; - DPRINTF(("bracket 0 tail recursion\n")); - goto TAIL_RECURSE; - } - - /* Possibly empty group; can't use tail recursion. */ - - RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims, - eptrb, flags, RM48); - RRETURN(rrc); - } - - /* For non-final alternatives, continue the loop for a NOMATCH result; - otherwise return. */ - - RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims, - eptrb, flags, RM2); - if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); - ecode += GET(ecode, 1); - } - /* Control never reaches here. */ - - /* Conditional group: compilation checked that there are no more than - two branches. If the condition is false, skipping the first branch takes us - past the end if there is only one branch, but that's OK because that is - exactly what going to the ket would do. As there is only one branch to be - obeyed, we can use tail recursion to avoid using another stack frame. 
*/ - - case OP_COND: - case OP_SCOND: - if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */ - { - offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/ - condition = md->recursive != NULL && - (offset == RREF_ANY || offset == md->recursive->group_num); - ecode += condition? 3 : GET(ecode, 1); - } - - else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */ - { - offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */ - condition = offset < offset_top && md->offset_vector[offset] >= 0; - ecode += condition? 3 : GET(ecode, 1); - } - - else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */ - { - condition = FALSE; - ecode += GET(ecode, 1); - } - - /* The condition is an assertion. Call match() to evaluate it - setting - the final argument match_condassert causes it to stop at the end of an - assertion. */ - - else - { - RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, - match_condassert, RM3); - if (rrc == MATCH_MATCH) - { - condition = TRUE; - ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2); - while (*ecode == OP_ALT) ecode += GET(ecode, 1); - } - else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) - { - RRETURN(rrc); /* Need braces because of following else */ - } - else - { - condition = FALSE; - ecode += GET(ecode, 1); - } - } - - /* We are now at the branch that is to be obeyed. As there is only one, - we can use tail recursion to avoid using another stack frame, except when - match_cbegroup is required for an unlimited repeat of a possibly empty - group. If the second alternative doesn't exist, we can just plough on. 
*/ - - if (condition || *ecode == OP_ALT) - { - ecode += 1 + LINK_SIZE; - if (op == OP_SCOND) /* Possibly empty group */ - { - RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49); - RRETURN(rrc); - } - else /* Group must match something */ - { - flags = 0; - goto TAIL_RECURSE; - } - } - else /* Condition false & no 2nd alternative */ - { - ecode += 1 + LINK_SIZE; - } - break; - - - /* End of the pattern, either real or forced. If we are in a top-level - recursion, we should restore the offsets appropriately and continue from - after the call. */ - - case OP_ACCEPT: - case OP_END: - if (md->recursive != NULL && md->recursive->group_num == 0) - { - recursion_info *rec = md->recursive; - DPRINTF(("End of pattern in a (?0) recursion\n")); - md->recursive = rec->prevrec; - memmove(md->offset_vector, rec->offset_save, - rec->saved_max * sizeof(int)); - mstart = rec->save_start; - ims = original_ims; - ecode = rec->after_call; - break; - } - - /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty - string - backtracking will then try other alternatives, if any. */ - - if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH); - md->end_match_ptr = eptr; /* Record where we ended */ - md->end_offset_top = offset_top; /* and how many extracts were taken */ - md->start_match_ptr = mstart; /* and the start (\K can modify) */ - RRETURN(MATCH_MATCH); - - /* Change option settings */ - - case OP_OPT: - ims = ecode[1]; - ecode += 2; - DPRINTF(("ims set to %02lx\n", ims)); - break; - - /* Assertion brackets. Check the alternative branches in turn - the - matching won't pass the KET for an assertion. If any one branch matches, - the assertion is true. Lookbehind assertions have an OP_REVERSE item at the - start of each branch to move the current point backwards, so the code at - this level is identical to the lookahead case. 
*/ - - case OP_ASSERT: - case OP_ASSERTBACK: - do - { - RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0, - RM4); - if (rrc == MATCH_MATCH) break; - if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); - ecode += GET(ecode, 1); - } - while (*ecode == OP_ALT); - if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH); - - /* If checking an assertion for a condition, return MATCH_MATCH. */ - - if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH); - - /* Continue from after the assertion, updating the offsets high water - mark, since extracts may have been taken during the assertion. */ - - do ecode += GET(ecode,1); while (*ecode == OP_ALT); - ecode += 1 + LINK_SIZE; - offset_top = md->end_offset_top; - continue; - - /* Negative assertion: all branches must fail to match */ - - case OP_ASSERT_NOT: - case OP_ASSERTBACK_NOT: - do - { - RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0, - RM5); - if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH); - if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); - ecode += GET(ecode,1); - } - while (*ecode == OP_ALT); - - if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH); - - ecode += 1 + LINK_SIZE; - continue; - - /* Move the subject pointer back. This occurs only at the start of - each branch of a lookbehind assertion. If we are too close to the start to - move back, this match function fails. When working with UTF-8 we move - back a number of characters, not bytes. 
*/ - - case OP_REVERSE: -#ifdef SUPPORT_UTF8 - if (utf8) - { - i = GET(ecode, 1); - while (i-- > 0) - { - eptr--; - if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); - BACKCHAR(eptr); - } - } - else -#endif - - /* No UTF-8 support, or not in UTF-8 mode: count is byte count */ - - { - eptr -= GET(ecode, 1); - if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); - } - - /* Skip to next op code */ - - ecode += 1 + LINK_SIZE; - break; - - /* The callout item calls an external function, if one is provided, passing - details of the match so far. This is mainly for debugging, though the - function is able to force a failure. */ - - case OP_CALLOUT: - if (pcre_callout != NULL) - { - pcre_callout_block cb; - cb.version = 1; /* Version 1 of the callout block */ - cb.callout_number = ecode[1]; - cb.offset_vector = md->offset_vector; - cb.subject = (PCRE_SPTR)md->start_subject; - cb.subject_length = md->end_subject - md->start_subject; - cb.start_match = mstart - md->start_subject; - cb.current_position = eptr - md->start_subject; - cb.pattern_position = GET(ecode, 2); - cb.next_item_length = GET(ecode, 2 + LINK_SIZE); - cb.capture_top = offset_top/2; - cb.capture_last = md->capture_last; - cb.callout_data = md->callout_data; - if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH); - if (rrc < 0) RRETURN(rrc); - } - ecode += 2 + 2*LINK_SIZE; - break; - - /* Recursion either matches the current regex, or some subexpression. The - offset data is the offset to the starting bracket from the start of the - whole pattern. (This is so that it works from duplicated subpatterns.) - - If there are any capturing brackets started but not finished, we have to - save their starting points and reinstate them after the recursion. However, - we don't know how many such there are (offset_top records the completed - total) so we just have to save all the potential data. 
There may be up to - 65535 such values, which is too large to put on the stack, but using malloc - for small numbers seems expensive. As a compromise, the stack is used when - there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc - is used. A problem is what to do if the malloc fails ... there is no way of - returning to the top level with an error. Save the top REC_STACK_SAVE_MAX - values on the stack, and accept that the rest may be wrong. - - There are also other values that have to be saved. We use a chained - sequence of blocks that actually live on the stack. Thanks to Robin Houston - for the original version of this logic. */ - - case OP_RECURSE: - { - callpat = md->start_code + GET(ecode, 1); - new_recursive.group_num = (callpat == md->start_code)? 0 : - GET2(callpat, 1 + LINK_SIZE); - - /* Add to "recursing stack" */ - - new_recursive.prevrec = md->recursive; - md->recursive = &new_recursive; - - /* Find where to continue from afterwards */ - - ecode += 1 + LINK_SIZE; - new_recursive.after_call = ecode; - - /* Now save the offset data. */ - - new_recursive.saved_max = md->offset_end; - if (new_recursive.saved_max <= REC_STACK_SAVE_MAX) - new_recursive.offset_save = stacksave; - else - { - new_recursive.offset_save = - (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int)); - if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY); - } - - memcpy(new_recursive.offset_save, md->offset_vector, - new_recursive.saved_max * sizeof(int)); - new_recursive.save_start = mstart; - mstart = eptr; - - /* OK, now we can do the recursion. For each top-level alternative we - restore the offset and recursion data. */ - - DPRINTF(("Recursing into group %d\n", new_recursive.group_num)); - flags = (*callpat >= OP_SBRA)? 
match_cbegroup : 0; - do - { - RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top, - md, ims, eptrb, flags, RM6); - if (rrc == MATCH_MATCH) - { - DPRINTF(("Recursion matched\n")); - md->recursive = new_recursive.prevrec; - if (new_recursive.offset_save != stacksave) - (pcre_free)(new_recursive.offset_save); - RRETURN(MATCH_MATCH); - } - else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) - { - DPRINTF(("Recursion gave error %d\n", rrc)); - RRETURN(rrc); - } - - md->recursive = &new_recursive; - memcpy(md->offset_vector, new_recursive.offset_save, - new_recursive.saved_max * sizeof(int)); - callpat += GET(callpat, 1); - } - while (*callpat == OP_ALT); - - DPRINTF(("Recursion didn't match\n")); - md->recursive = new_recursive.prevrec; - if (new_recursive.offset_save != stacksave) - (pcre_free)(new_recursive.offset_save); - RRETURN(MATCH_NOMATCH); - } - /* Control never reaches here */ - - /* "Once" brackets are like assertion brackets except that after a match, - the point in the subject string is not moved back. Thus there can never be - a move back into the brackets. Friedl calls these "atomic" subpatterns. - Check the alternative branches in turn - the matching won't pass the KET - for this kind of subpattern. If any one branch matches, we carry on as at - the end of a normal bracket, leaving the subject pointer. */ - - case OP_ONCE: - prev = ecode; - saved_eptr = eptr; - - do - { - RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7); - if (rrc == MATCH_MATCH) break; - if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); - ecode += GET(ecode,1); - } - while (*ecode == OP_ALT); - - /* If hit the end of the group (which could be repeated), fail */ - - if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH); - - /* Continue as from after the assertion, updating the offsets high water - mark, since extracts may have been taken. 
*/ - - do ecode += GET(ecode, 1); while (*ecode == OP_ALT); - - offset_top = md->end_offset_top; - eptr = md->end_match_ptr; - - /* For a non-repeating ket, just continue at this level. This also - happens for a repeating ket if no characters were matched in the group. - This is the forcible breaking of infinite loops as implemented in Perl - 5.005. If there is an options reset, it will get obeyed in the normal - course of events. */ - - if (*ecode == OP_KET || eptr == saved_eptr) - { - ecode += 1+LINK_SIZE; - break; - } - - /* The repeating kets try the rest of the pattern or restart from the - preceding bracket, in the appropriate order. The second "call" of match() - uses tail recursion, to avoid using another stack frame. We need to reset - any options that changed within the bracket before re-running it, so - check the next opcode. */ - - if (ecode[1+LINK_SIZE] == OP_OPT) - { - ims = (ims & ~PCRE_IMS) | ecode[4]; - DPRINTF(("ims set to %02lx at group repeat\n", ims)); - } - - if (*ecode == OP_KETRMIN) - { - RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - ecode = prev; - flags = 0; - goto TAIL_RECURSE; - } - else /* OP_KETRMAX */ - { - RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - ecode += 1 + LINK_SIZE; - flags = 0; - goto TAIL_RECURSE; - } - /* Control never gets here */ - - /* An alternation is the end of a branch; scan along to find the end of the - bracketed group and go to there. */ - - case OP_ALT: - do ecode += GET(ecode,1); while (*ecode == OP_ALT); - break; - - /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating - that it may occur zero times. It may repeat infinitely, or not at all - - i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper - repeat limits are compiled as a number of copies, with the optional ones - preceded by BRAZERO or BRAMINZERO. 
*/ - - case OP_BRAZERO: - { - next = ecode+1; - RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - do next += GET(next,1); while (*next == OP_ALT); - ecode = next + 1 + LINK_SIZE; - } - break; - - case OP_BRAMINZERO: - { - next = ecode+1; - do next += GET(next, 1); while (*next == OP_ALT); - RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - ecode++; - } - break; - - /* End of a group, repeated or non-repeating. */ - - case OP_KET: - case OP_KETRMIN: - case OP_KETRMAX: - prev = ecode - GET(ecode, 1); - - /* If this was a group that remembered the subject start, in order to break - infinite repeats of empty string matches, retrieve the subject start from - the chain. Otherwise, set it NULL. */ - - if (*prev >= OP_SBRA) - { - saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */ - eptrb = eptrb->epb_prev; /* Backup to previous group */ - } - else saved_eptr = NULL; - - /* If we are at the end of an assertion group, stop matching and return - MATCH_MATCH, but record the current high water mark for use by positive - assertions. Do this also for the "once" (atomic) groups. */ - - if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || - *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT || - *prev == OP_ONCE) - { - md->end_match_ptr = eptr; /* For ONCE */ - md->end_offset_top = offset_top; - RRETURN(MATCH_MATCH); - } - - /* For capturing groups we have to check the group number back at the start - and if necessary complete handling an extraction by setting the offsets and - bumping the high water mark. Note that whole-pattern recursion is coded as - a recurse into group 0, so it won't be picked up here. Instead, we catch it - when the OP_END is reached. Other recursion is handled here. 
*/ - - if (*prev == OP_CBRA || *prev == OP_SCBRA) - { - number = GET2(prev, 1+LINK_SIZE); - offset = number << 1; - -#ifdef DEBUG - printf("end bracket %d", number); - printf("\n"); -#endif - - md->capture_last = number; - if (offset >= md->offset_max) md->offset_overflow = TRUE; else - { - md->offset_vector[offset] = - md->offset_vector[md->offset_end - number]; - md->offset_vector[offset+1] = eptr - md->start_subject; - if (offset_top <= offset) offset_top = offset + 2; - } - - /* Handle a recursively called group. Restore the offsets - appropriately and continue from after the call. */ - - if (md->recursive != NULL && md->recursive->group_num == number) - { - recursion_info *rec = md->recursive; - DPRINTF(("Recursion (%d) succeeded - continuing\n", number)); - md->recursive = rec->prevrec; - mstart = rec->save_start; - memcpy(md->offset_vector, rec->offset_save, - rec->saved_max * sizeof(int)); - ecode = rec->after_call; - ims = original_ims; - break; - } - } - - /* For both capturing and non-capturing groups, reset the value of the ims - flags, in case they got changed during the group. */ - - ims = original_ims; - DPRINTF(("ims reset to %02lx\n", ims)); - - /* For a non-repeating ket, just continue at this level. This also - happens for a repeating ket if no characters were matched in the group. - This is the forcible breaking of infinite loops as implemented in Perl - 5.005. If there is an options reset, it will get obeyed in the normal - course of events. */ - - if (*ecode == OP_KET || eptr == saved_eptr) - { - ecode += 1 + LINK_SIZE; - break; - } - - /* The repeating kets try the rest of the pattern or restart from the - preceding bracket, in the appropriate order. In the second case, we can use - tail recursion to avoid using another stack frame, unless we have an - unlimited repeat of a group that can match an empty string. */ - - flags = (*prev >= OP_SBRA)? 
match_cbegroup : 0; - - if (*ecode == OP_KETRMIN) - { - RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (flags != 0) /* Could match an empty string */ - { - RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50); - RRETURN(rrc); - } - ecode = prev; - goto TAIL_RECURSE; - } - else /* OP_KETRMAX */ - { - RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - ecode += 1 + LINK_SIZE; - flags = 0; - goto TAIL_RECURSE; - } - /* Control never gets here */ - - /* Start of subject unless notbol, or after internal newline if multiline */ - - case OP_CIRC: - if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH); - if ((ims & PCRE_MULTILINE) != 0) - { - if (eptr != md->start_subject && - (eptr == md->end_subject || !WAS_NEWLINE(eptr))) - RRETURN(MATCH_NOMATCH); - ecode++; - break; - } - /* ... else fall through */ - - /* Start of subject assertion */ - - case OP_SOD: - if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH); - ecode++; - break; - - /* Start of match assertion */ - - case OP_SOM: - if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH); - ecode++; - break; - - /* Reset the start of match point */ - - case OP_SET_SOM: - mstart = eptr; - ecode++; - break; - - /* Assert before internal newline if multiline, or before a terminating - newline unless endonly is set, else end of subject unless noteol is set. */ - - case OP_DOLL: - if ((ims & PCRE_MULTILINE) != 0) - { - if (eptr < md->end_subject) - { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); } - else - { if (md->noteol) RRETURN(MATCH_NOMATCH); } - ecode++; - break; - } - else - { - if (md->noteol) RRETURN(MATCH_NOMATCH); - if (!md->endonly) - { - if (eptr != md->end_subject && - (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen)) - RRETURN(MATCH_NOMATCH); - ecode++; - break; - } - } - /* ... 
else fall through for endonly */ - - /* End of subject assertion (\z) */ - - case OP_EOD: - if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH); - ecode++; - break; - - /* End of subject or ending \n assertion (\Z) */ - - case OP_EODN: - if (eptr != md->end_subject && - (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen)) - RRETURN(MATCH_NOMATCH); - ecode++; - break; - - /* Word boundary assertions */ - - case OP_NOT_WORD_BOUNDARY: - case OP_WORD_BOUNDARY: - { - - /* Find out if the previous and current characters are "word" characters. - It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to - be "non-word" characters. */ - -#ifdef SUPPORT_UTF8 - if (utf8) - { - if (eptr == md->start_subject) prev_is_word = FALSE; else - { - const uschar *lastptr = eptr - 1; - while((*lastptr & 0xc0) == 0x80) lastptr--; - GETCHAR(c, lastptr); - prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0; - } - if (eptr >= md->end_subject) cur_is_word = FALSE; else - { - GETCHAR(c, eptr); - cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0; - } - } - else -#endif - - /* More streamlined when not in UTF-8 mode */ - - { - prev_is_word = (eptr != md->start_subject) && - ((md->ctypes[eptr[-1]] & ctype_word) != 0); - cur_is_word = (eptr < md->end_subject) && - ((md->ctypes[*eptr] & ctype_word) != 0); - } - - /* Now see if the situation is what we want */ - - if ((*ecode++ == OP_WORD_BOUNDARY)? - cur_is_word == prev_is_word : cur_is_word != prev_is_word) - RRETURN(MATCH_NOMATCH); - } - break; - - /* Match a single character type; inline for speed */ - - case OP_ANY: - if ((ims & PCRE_DOTALL) == 0) - { - if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); - } - if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH); - if (utf8) - while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; - ecode++; - break; - - /* Match a single byte, even in UTF-8 mode. This opcode really does match - any byte, even newline, independent of the setting of PCRE_DOTALL. 
*/ - - case OP_ANYBYTE: - if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH); - ecode++; - break; - - case OP_NOT_DIGIT: - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINCTEST(c, eptr); - if ( -#ifdef SUPPORT_UTF8 - c < 256 && -#endif - (md->ctypes[c] & ctype_digit) != 0 - ) - RRETURN(MATCH_NOMATCH); - ecode++; - break; - - case OP_DIGIT: - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINCTEST(c, eptr); - if ( -#ifdef SUPPORT_UTF8 - c >= 256 || -#endif - (md->ctypes[c] & ctype_digit) == 0 - ) - RRETURN(MATCH_NOMATCH); - ecode++; - break; - - case OP_NOT_WHITESPACE: - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINCTEST(c, eptr); - if ( -#ifdef SUPPORT_UTF8 - c < 256 && -#endif - (md->ctypes[c] & ctype_space) != 0 - ) - RRETURN(MATCH_NOMATCH); - ecode++; - break; - - case OP_WHITESPACE: - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINCTEST(c, eptr); - if ( -#ifdef SUPPORT_UTF8 - c >= 256 || -#endif - (md->ctypes[c] & ctype_space) == 0 - ) - RRETURN(MATCH_NOMATCH); - ecode++; - break; - - case OP_NOT_WORDCHAR: - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINCTEST(c, eptr); - if ( -#ifdef SUPPORT_UTF8 - c < 256 && -#endif - (md->ctypes[c] & ctype_word) != 0 - ) - RRETURN(MATCH_NOMATCH); - ecode++; - break; - - case OP_WORDCHAR: - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINCTEST(c, eptr); - if ( -#ifdef SUPPORT_UTF8 - c >= 256 || -#endif - (md->ctypes[c] & ctype_word) == 0 - ) - RRETURN(MATCH_NOMATCH); - ecode++; - break; - - case OP_ANYNL: - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINCTEST(c, eptr); - switch(c) - { - default: RRETURN(MATCH_NOMATCH); - case 0x000d: - if (eptr < md->end_subject && *eptr == 0x0a) eptr++; - break; - - case 0x000a: - break; - - case 0x000b: - case 0x000c: - case 0x0085: - case 0x2028: - case 0x2029: - if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); - break; - } - ecode++; - break; - - case OP_NOT_HSPACE: - if 
(eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINCTEST(c, eptr); - switch(c) - { - default: break; - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ - RRETURN(MATCH_NOMATCH); - } - ecode++; - break; - - case OP_HSPACE: - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINCTEST(c, eptr); - switch(c) - { - default: RRETURN(MATCH_NOMATCH); - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ - break; - } - ecode++; - break; - - case OP_NOT_VSPACE: - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINCTEST(c, eptr); - switch(c) - { - default: break; - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ - case 0x2028: /* LINE SEPARATOR */ - case 0x2029: /* PARAGRAPH SEPARATOR */ - 
RRETURN(MATCH_NOMATCH); - } - ecode++; - break; - - case OP_VSPACE: - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINCTEST(c, eptr); - switch(c) - { - default: RRETURN(MATCH_NOMATCH); - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ - case 0x2028: /* LINE SEPARATOR */ - case 0x2029: /* PARAGRAPH SEPARATOR */ - break; - } - ecode++; - break; - -#ifdef SUPPORT_UCP - /* Check the next character by Unicode property. We will get here only - if the support is in the binary; otherwise a compile-time error occurs. */ - - case OP_PROP: - case OP_NOTPROP: - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINCTEST(c, eptr); - { - int chartype, script; - int category = _pcre_ucp_findprop(c, &chartype, &script); - - switch(ecode[1]) - { - case PT_ANY: - if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH); - break; - - case PT_LAMP: - if ((chartype == ucp_Lu || - chartype == ucp_Ll || - chartype == ucp_Lt) == (op == OP_NOTPROP)) - RRETURN(MATCH_NOMATCH); - break; - - case PT_GC: - if ((ecode[2] != category) == (op == OP_PROP)) - RRETURN(MATCH_NOMATCH); - break; - - case PT_PC: - if ((ecode[2] != chartype) == (op == OP_PROP)) - RRETURN(MATCH_NOMATCH); - break; - - case PT_SC: - if ((ecode[2] != script) == (op == OP_PROP)) - RRETURN(MATCH_NOMATCH); - break; - - default: - RRETURN(PCRE_ERROR_INTERNAL); - } - - ecode += 3; - } - break; - - /* Match an extended Unicode sequence. We will get here only if the support - is in the binary; otherwise a compile-time error occurs. 
*/ - - case OP_EXTUNI: - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINCTEST(c, eptr); - { - int chartype, script; - int category = _pcre_ucp_findprop(c, &chartype, &script); - if (category == ucp_M) RRETURN(MATCH_NOMATCH); - while (eptr < md->end_subject) - { - int len = 1; - if (!utf8) c = *eptr; else - { - GETCHARLEN(c, eptr, len); - } - category = _pcre_ucp_findprop(c, &chartype, &script); - if (category != ucp_M) break; - eptr += len; - } - } - ecode++; - break; -#endif - - - /* Match a back reference, possibly repeatedly. Look past the end of the - item to see if there is repeat information following. The code is similar - to that for character classes, but repeated for efficiency. Then obey - similar code to character type repeats - written out again for speed. - However, if the referenced string is the empty string, always treat - it as matched, any number of times (otherwise there could be infinite - loops). */ - - case OP_REF: - { - offset = GET2(ecode, 1) << 1; /* Doubled ref number */ - ecode += 3; /* Advance past item */ - - /* If the reference is unset, set the length to be longer than the amount - of subject left; this ensures that every attempt at a match fails. We - can't just fail here, because of the possibility of quantifiers with zero - minima. */ - - length = (offset >= offset_top || md->offset_vector[offset] < 0)? 
- md->end_subject - eptr + 1 : - md->offset_vector[offset+1] - md->offset_vector[offset]; - - /* Set up for repetition, or handle the non-repeated case */ - - switch (*ecode) - { - case OP_CRSTAR: - case OP_CRMINSTAR: - case OP_CRPLUS: - case OP_CRMINPLUS: - case OP_CRQUERY: - case OP_CRMINQUERY: - c = *ecode++ - OP_CRSTAR; - minimize = (c & 1) != 0; - min = rep_min[c]; /* Pick up values from tables; */ - max = rep_max[c]; /* zero for max => infinity */ - if (max == 0) max = INT_MAX; - break; - - case OP_CRRANGE: - case OP_CRMINRANGE: - minimize = (*ecode == OP_CRMINRANGE); - min = GET2(ecode, 1); - max = GET2(ecode, 3); - if (max == 0) max = INT_MAX; - ecode += 5; - break; - - default: /* No repeat follows */ - if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH); - eptr += length; - continue; /* With the main loop */ - } - - /* If the length of the reference is zero, just continue with the - main loop. */ - - if (length == 0) continue; - - /* First, ensure the minimum number of matches are present. We get back - the length of the reference string explicitly rather than passing the - address of eptr, so that eptr can be a register variable. */ - - for (i = 1; i <= min; i++) - { - if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH); - eptr += length; - } - - /* If min = max, continue at the same level without recursion. - They are not both allowed to be zero. 
*/ - - if (min == max) continue; - - /* If minimizing, keep trying and advancing the pointer */ - - if (minimize) - { - for (fi = min;; fi++) - { - RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || !match_ref(offset, eptr, length, md, ims)) - RRETURN(MATCH_NOMATCH); - eptr += length; - } - /* Control never gets here */ - } - - /* If maximizing, find the longest string and work backwards */ - - else - { - pp = eptr; - for (i = min; i < max; i++) - { - if (!match_ref(offset, eptr, length, md, ims)) break; - eptr += length; - } - while (eptr >= pp) - { - RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - eptr -= length; - } - RRETURN(MATCH_NOMATCH); - } - } - /* Control never gets here */ - - - - /* Match a bit-mapped character class, possibly repeatedly. This op code is - used when all the characters in the class have values in the range 0-255, - and either the matching is caseful, or the characters are in the range - 0-127 when UTF-8 processing is enabled. The only difference between - OP_CLASS and OP_NCLASS occurs when a data character outside the range is - encountered. - - First, look past the end of the item to see if there is repeat information - following. Then obey similar code to character type repeats - written out - again for speed. 
*/ - - case OP_NCLASS: - case OP_CLASS: - { - data = ecode + 1; /* Save for matching */ - ecode += 33; /* Advance past the item */ - - switch (*ecode) - { - case OP_CRSTAR: - case OP_CRMINSTAR: - case OP_CRPLUS: - case OP_CRMINPLUS: - case OP_CRQUERY: - case OP_CRMINQUERY: - c = *ecode++ - OP_CRSTAR; - minimize = (c & 1) != 0; - min = rep_min[c]; /* Pick up values from tables; */ - max = rep_max[c]; /* zero for max => infinity */ - if (max == 0) max = INT_MAX; - break; - - case OP_CRRANGE: - case OP_CRMINRANGE: - minimize = (*ecode == OP_CRMINRANGE); - min = GET2(ecode, 1); - max = GET2(ecode, 3); - if (max == 0) max = INT_MAX; - ecode += 5; - break; - - default: /* No repeat follows */ - min = max = 1; - break; - } - - /* First, ensure the minimum number of matches are present. */ - -#ifdef SUPPORT_UTF8 - /* UTF-8 mode */ - if (utf8) - { - for (i = 1; i <= min; i++) - { - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINC(c, eptr); - if (c > 255) - { - if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); - } - else - { - if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); - } - } - } - else -#endif - /* Not UTF-8 mode */ - { - for (i = 1; i <= min; i++) - { - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - c = *eptr++; - if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); - } - } - - /* If max == min we can continue with the main loop without the - need to recurse. */ - - if (min == max) continue; - - /* If minimizing, keep testing the rest of the expression and advancing - the pointer while it matches the class. 
*/ - - if (minimize) - { -#ifdef SUPPORT_UTF8 - /* UTF-8 mode */ - if (utf8) - { - for (fi = min;; fi++) - { - RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINC(c, eptr); - if (c > 255) - { - if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); - } - else - { - if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); - } - } - } - else -#endif - /* Not UTF-8 mode */ - { - for (fi = min;; fi++) - { - RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - c = *eptr++; - if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); - } - } - /* Control never gets here */ - } - - /* If maximizing, find the longest possible run, then work backwards. */ - - else - { - pp = eptr; - -#ifdef SUPPORT_UTF8 - /* UTF-8 mode */ - if (utf8) - { - for (i = min; i < max; i++) - { - int len = 1; - if (eptr >= md->end_subject) break; - GETCHARLEN(c, eptr, len); - if (c > 255) - { - if (op == OP_CLASS) break; - } - else - { - if ((data[c/8] & (1 << (c&7))) == 0) break; - } - eptr += len; - } - for (;;) - { - RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (eptr-- == pp) break; /* Stop if tried at original pos */ - BACKCHAR(eptr); - } - } - else -#endif - /* Not UTF-8 mode */ - { - for (i = min; i < max; i++) - { - if (eptr >= md->end_subject) break; - c = *eptr; - if ((data[c/8] & (1 << (c&7))) == 0) break; - eptr++; - } - while (eptr >= pp) - { - RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - eptr--; - } - } - - RRETURN(MATCH_NOMATCH); - } - } - /* Control never gets here */ - - - /* Match an extended character class. This opcode is encountered only - in UTF-8 mode, because that's the only time it is compiled. 
*/ - -#ifdef SUPPORT_UTF8 - case OP_XCLASS: - { - data = ecode + 1 + LINK_SIZE; /* Save for matching */ - ecode += GET(ecode, 1); /* Advance past the item */ - - switch (*ecode) - { - case OP_CRSTAR: - case OP_CRMINSTAR: - case OP_CRPLUS: - case OP_CRMINPLUS: - case OP_CRQUERY: - case OP_CRMINQUERY: - c = *ecode++ - OP_CRSTAR; - minimize = (c & 1) != 0; - min = rep_min[c]; /* Pick up values from tables; */ - max = rep_max[c]; /* zero for max => infinity */ - if (max == 0) max = INT_MAX; - break; - - case OP_CRRANGE: - case OP_CRMINRANGE: - minimize = (*ecode == OP_CRMINRANGE); - min = GET2(ecode, 1); - max = GET2(ecode, 3); - if (max == 0) max = INT_MAX; - ecode += 5; - break; - - default: /* No repeat follows */ - min = max = 1; - break; - } - - /* First, ensure the minimum number of matches are present. */ - - for (i = 1; i <= min; i++) - { - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINC(c, eptr); - if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH); - } - - /* If max == min we can continue with the main loop without the - need to recurse. */ - - if (min == max) continue; - - /* If minimizing, keep testing the rest of the expression and advancing - the pointer while it matches the class. */ - - if (minimize) - { - for (fi = min;; fi++) - { - RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINC(c, eptr); - if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH); - } - /* Control never gets here */ - } - - /* If maximizing, find the longest possible run, then work backwards. 
*/ - - else - { - pp = eptr; - for (i = min; i < max; i++) - { - int len = 1; - if (eptr >= md->end_subject) break; - GETCHARLEN(c, eptr, len); - if (!_pcre_xclass(c, data)) break; - eptr += len; - } - for(;;) - { - RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (eptr-- == pp) break; /* Stop if tried at original pos */ - if (utf8) BACKCHAR(eptr); - } - RRETURN(MATCH_NOMATCH); - } - - /* Control never gets here */ - } -#endif /* End of XCLASS */ - - /* Match a single character, casefully */ - - case OP_CHAR: -#ifdef SUPPORT_UTF8 - if (utf8) - { - length = 1; - ecode++; - GETCHARLEN(fc, ecode, length); - if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); - while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH); - } - else -#endif - - /* Non-UTF-8 mode */ - { - if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH); - if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH); - ecode += 2; - } - break; - - /* Match a single character, caselessly */ - - case OP_CHARNC: -#ifdef SUPPORT_UTF8 - if (utf8) - { - length = 1; - ecode++; - GETCHARLEN(fc, ecode, length); - - if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); - - /* If the pattern character's value is < 128, we have only one byte, and - can use the fast lookup table. */ - - if (fc < 128) - { - if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); - } - - /* Otherwise we must pick up the subject character */ - - else - { - unsigned int dc; - GETCHARINC(dc, eptr); - ecode += length; - - /* If we have Unicode property support, we can use it to test the other - case of the character, if there is one. 
*/ - - if (fc != dc) - { -#ifdef SUPPORT_UCP - if (dc != _pcre_ucp_othercase(fc)) -#endif - RRETURN(MATCH_NOMATCH); - } - } - } - else -#endif /* SUPPORT_UTF8 */ - - /* Non-UTF-8 mode */ - { - if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH); - if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); - ecode += 2; - } - break; - - /* Match a single character repeatedly. */ - - case OP_EXACT: - min = max = GET2(ecode, 1); - ecode += 3; - goto REPEATCHAR; - - case OP_POSUPTO: - possessive = TRUE; - /* Fall through */ - - case OP_UPTO: - case OP_MINUPTO: - min = 0; - max = GET2(ecode, 1); - minimize = *ecode == OP_MINUPTO; - ecode += 3; - goto REPEATCHAR; - - case OP_POSSTAR: - possessive = TRUE; - min = 0; - max = INT_MAX; - ecode++; - goto REPEATCHAR; - - case OP_POSPLUS: - possessive = TRUE; - min = 1; - max = INT_MAX; - ecode++; - goto REPEATCHAR; - - case OP_POSQUERY: - possessive = TRUE; - min = 0; - max = 1; - ecode++; - goto REPEATCHAR; - - case OP_STAR: - case OP_MINSTAR: - case OP_PLUS: - case OP_MINPLUS: - case OP_QUERY: - case OP_MINQUERY: - c = *ecode++ - OP_STAR; - minimize = (c & 1) != 0; - min = rep_min[c]; /* Pick up values from tables; */ - max = rep_max[c]; /* zero for max => infinity */ - if (max == 0) max = INT_MAX; - - /* Common code for all repeated single-character matches. We can give - up quickly if there are fewer than the minimum number of characters left in - the subject. */ - - REPEATCHAR: -#ifdef SUPPORT_UTF8 - if (utf8) - { - length = 1; - charptr = ecode; - GETCHARLEN(fc, ecode, length); - if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); - ecode += length; - - /* Handle multibyte character matching specially here. There is - support for caseless matching if UCP support is present. 
*/ - - if (length > 1) - { -#ifdef SUPPORT_UCP - unsigned int othercase; - if ((ims & PCRE_CASELESS) != 0 && - (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR) - oclength = _pcre_ord2utf8(othercase, occhars); - else oclength = 0; -#endif /* SUPPORT_UCP */ - - for (i = 1; i <= min; i++) - { - if (memcmp(eptr, charptr, length) == 0) eptr += length; -#ifdef SUPPORT_UCP - /* Need braces because of following else */ - else if (oclength == 0) { RRETURN(MATCH_NOMATCH); } - else - { - if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH); - eptr += oclength; - } -#else /* without SUPPORT_UCP */ - else { RRETURN(MATCH_NOMATCH); } -#endif /* SUPPORT_UCP */ - } - - if (min == max) continue; - - if (minimize) - { - for (fi = min;; fi++) - { - RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - if (memcmp(eptr, charptr, length) == 0) eptr += length; -#ifdef SUPPORT_UCP - /* Need braces because of following else */ - else if (oclength == 0) { RRETURN(MATCH_NOMATCH); } - else - { - if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH); - eptr += oclength; - } -#else /* without SUPPORT_UCP */ - else { RRETURN (MATCH_NOMATCH); } -#endif /* SUPPORT_UCP */ - } - /* Control never gets here */ - } - - else /* Maximize */ - { - pp = eptr; - for (i = min; i < max; i++) - { - if (eptr > md->end_subject - length) break; - if (memcmp(eptr, charptr, length) == 0) eptr += length; -#ifdef SUPPORT_UCP - else if (oclength == 0) break; - else - { - if (memcmp(eptr, occhars, oclength) != 0) break; - eptr += oclength; - } -#else /* without SUPPORT_UCP */ - else break; -#endif /* SUPPORT_UCP */ - } - - if (possessive) continue; - for(;;) - { - RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (eptr == pp) RRETURN(MATCH_NOMATCH); -#ifdef SUPPORT_UCP - eptr--; - BACKCHAR(eptr); -#else /* without SUPPORT_UCP 
*/ - eptr -= length; -#endif /* SUPPORT_UCP */ - } - } - /* Control never gets here */ - } - - /* If the length of a UTF-8 character is 1, we fall through here, and - obey the code as for non-UTF-8 characters below, though in this case the - value of fc will always be < 128. */ - } - else -#endif /* SUPPORT_UTF8 */ - - /* When not in UTF-8 mode, load a single-byte character. */ - { - if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); - fc = *ecode++; - } - - /* The value of fc at this point is always less than 256, though we may or - may not be in UTF-8 mode. The code is duplicated for the caseless and - caseful cases, for speed, since matching characters is likely to be quite - common. First, ensure the minimum number of matches are present. If min = - max, continue at the same level without recursing. Otherwise, if - minimizing, keep trying the rest of the expression and advancing one - matching character if failing, up to the maximum. Alternatively, if - maximizing, find the maximum number of characters and work backwards. 
*/ - - DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max, - max, eptr)); - - if ((ims & PCRE_CASELESS) != 0) - { - fc = md->lcc[fc]; - for (i = 1; i <= min; i++) - if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); - if (min == max) continue; - if (minimize) - { - for (fi = min;; fi++) - { - RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || eptr >= md->end_subject || - fc != md->lcc[*eptr++]) - RRETURN(MATCH_NOMATCH); - } - /* Control never gets here */ - } - else /* Maximize */ - { - pp = eptr; - for (i = min; i < max; i++) - { - if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break; - eptr++; - } - if (possessive) continue; - while (eptr >= pp) - { - RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25); - eptr--; - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - } - RRETURN(MATCH_NOMATCH); - } - /* Control never gets here */ - } - - /* Caseful comparisons (includes all multi-byte characters) */ - - else - { - for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH); - if (min == max) continue; - if (minimize) - { - for (fi = min;; fi++) - { - RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || eptr >= md->end_subject || fc != *eptr++) - RRETURN(MATCH_NOMATCH); - } - /* Control never gets here */ - } - else /* Maximize */ - { - pp = eptr; - for (i = min; i < max; i++) - { - if (eptr >= md->end_subject || fc != *eptr) break; - eptr++; - } - if (possessive) continue; - while (eptr >= pp) - { - RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27); - eptr--; - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - } - RRETURN(MATCH_NOMATCH); - } - } - /* Control never gets here */ - - /* Match a negated single one-byte character. The character we are - checking can be multibyte. 
*/ - - case OP_NOT: - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - ecode++; - GETCHARINCTEST(c, eptr); - if ((ims & PCRE_CASELESS) != 0) - { -#ifdef SUPPORT_UTF8 - if (c < 256) -#endif - c = md->lcc[c]; - if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH); - } - else - { - if (*ecode++ == c) RRETURN(MATCH_NOMATCH); - } - break; - - /* Match a negated single one-byte character repeatedly. This is almost a - repeat of the code for a repeated single character, but I haven't found a - nice way of commoning these up that doesn't require a test of the - positive/negative option for each character match. Maybe that wouldn't add - very much to the time taken, but character matching *is* what this is all - about... */ - - case OP_NOTEXACT: - min = max = GET2(ecode, 1); - ecode += 3; - goto REPEATNOTCHAR; - - case OP_NOTUPTO: - case OP_NOTMINUPTO: - min = 0; - max = GET2(ecode, 1); - minimize = *ecode == OP_NOTMINUPTO; - ecode += 3; - goto REPEATNOTCHAR; - - case OP_NOTPOSSTAR: - possessive = TRUE; - min = 0; - max = INT_MAX; - ecode++; - goto REPEATNOTCHAR; - - case OP_NOTPOSPLUS: - possessive = TRUE; - min = 1; - max = INT_MAX; - ecode++; - goto REPEATNOTCHAR; - - case OP_NOTPOSQUERY: - possessive = TRUE; - min = 0; - max = 1; - ecode++; - goto REPEATNOTCHAR; - - case OP_NOTPOSUPTO: - possessive = TRUE; - min = 0; - max = GET2(ecode, 1); - ecode += 3; - goto REPEATNOTCHAR; - - case OP_NOTSTAR: - case OP_NOTMINSTAR: - case OP_NOTPLUS: - case OP_NOTMINPLUS: - case OP_NOTQUERY: - case OP_NOTMINQUERY: - c = *ecode++ - OP_NOTSTAR; - minimize = (c & 1) != 0; - min = rep_min[c]; /* Pick up values from tables; */ - max = rep_max[c]; /* zero for max => infinity */ - if (max == 0) max = INT_MAX; - - /* Common code for all repeated single-byte matches. We can give up quickly - if there are fewer than the minimum number of bytes left in the - subject. 
*/ - - REPEATNOTCHAR: - if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); - fc = *ecode++; - - /* The code is duplicated for the caseless and caseful cases, for speed, - since matching characters is likely to be quite common. First, ensure the - minimum number of matches are present. If min = max, continue at the same - level without recursing. Otherwise, if minimizing, keep trying the rest of - the expression and advancing one matching character if failing, up to the - maximum. Alternatively, if maximizing, find the maximum number of - characters and work backwards. */ - - DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max, - max, eptr)); - - if ((ims & PCRE_CASELESS) != 0) - { - fc = md->lcc[fc]; - -#ifdef SUPPORT_UTF8 - /* UTF-8 mode */ - if (utf8) - { - register unsigned int d; - for (i = 1; i <= min; i++) - { - GETCHARINC(d, eptr); - if (d < 256) d = md->lcc[d]; - if (fc == d) RRETURN(MATCH_NOMATCH); - } - } - else -#endif - - /* Not UTF-8 mode */ - { - for (i = 1; i <= min; i++) - if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); - } - - if (min == max) continue; - - if (minimize) - { -#ifdef SUPPORT_UTF8 - /* UTF-8 mode */ - if (utf8) - { - register unsigned int d; - for (fi = min;; fi++) - { - RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - GETCHARINC(d, eptr); - if (d < 256) d = md->lcc[d]; - if (fi >= max || eptr >= md->end_subject || fc == d) - RRETURN(MATCH_NOMATCH); - } - } - else -#endif - /* Not UTF-8 mode */ - { - for (fi = min;; fi++) - { - RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++]) - RRETURN(MATCH_NOMATCH); - } - } - /* Control never gets here */ - } - - /* Maximize case */ - - else - { - pp = eptr; - -#ifdef SUPPORT_UTF8 - /* UTF-8 mode */ - if (utf8) - { - register unsigned int d; - for (i = min; i < max; i++) - { - int len = 1; - if 
(eptr >= md->end_subject) break; - GETCHARLEN(d, eptr, len); - if (d < 256) d = md->lcc[d]; - if (fc == d) break; - eptr += len; - } - if (possessive) continue; - for(;;) - { - RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (eptr-- == pp) break; /* Stop if tried at original pos */ - BACKCHAR(eptr); - } - } - else -#endif - /* Not UTF-8 mode */ - { - for (i = min; i < max; i++) - { - if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break; - eptr++; - } - if (possessive) continue; - while (eptr >= pp) - { - RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - eptr--; - } - } - - RRETURN(MATCH_NOMATCH); - } - /* Control never gets here */ - } - - /* Caseful comparisons */ - - else - { -#ifdef SUPPORT_UTF8 - /* UTF-8 mode */ - if (utf8) - { - register unsigned int d; - for (i = 1; i <= min; i++) - { - GETCHARINC(d, eptr); - if (fc == d) RRETURN(MATCH_NOMATCH); - } - } - else -#endif - /* Not UTF-8 mode */ - { - for (i = 1; i <= min; i++) - if (fc == *eptr++) RRETURN(MATCH_NOMATCH); - } - - if (min == max) continue; - - if (minimize) - { -#ifdef SUPPORT_UTF8 - /* UTF-8 mode */ - if (utf8) - { - register unsigned int d; - for (fi = min;; fi++) - { - RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - GETCHARINC(d, eptr); - if (fi >= max || eptr >= md->end_subject || fc == d) - RRETURN(MATCH_NOMATCH); - } - } - else -#endif - /* Not UTF-8 mode */ - { - for (fi = min;; fi++) - { - RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || eptr >= md->end_subject || fc == *eptr++) - RRETURN(MATCH_NOMATCH); - } - } - /* Control never gets here */ - } - - /* Maximize case */ - - else - { - pp = eptr; - -#ifdef SUPPORT_UTF8 - /* UTF-8 mode */ - if (utf8) - { - register unsigned int d; - for (i = min; i < max; i++) - { - int len = 1; - if (eptr >= 
md->end_subject) break; - GETCHARLEN(d, eptr, len); - if (fc == d) break; - eptr += len; - } - if (possessive) continue; - for(;;) - { - RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (eptr-- == pp) break; /* Stop if tried at original pos */ - BACKCHAR(eptr); - } - } - else -#endif - /* Not UTF-8 mode */ - { - for (i = min; i < max; i++) - { - if (eptr >= md->end_subject || fc == *eptr) break; - eptr++; - } - if (possessive) continue; - while (eptr >= pp) - { - RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - eptr--; - } - } - - RRETURN(MATCH_NOMATCH); - } - } - /* Control never gets here */ - - /* Match a single character type repeatedly; several different opcodes - share code. This is very similar to the code for single characters, but we - repeat it in the interests of efficiency. */ - - case OP_TYPEEXACT: - min = max = GET2(ecode, 1); - minimize = TRUE; - ecode += 3; - goto REPEATTYPE; - - case OP_TYPEUPTO: - case OP_TYPEMINUPTO: - min = 0; - max = GET2(ecode, 1); - minimize = *ecode == OP_TYPEMINUPTO; - ecode += 3; - goto REPEATTYPE; - - case OP_TYPEPOSSTAR: - possessive = TRUE; - min = 0; - max = INT_MAX; - ecode++; - goto REPEATTYPE; - - case OP_TYPEPOSPLUS: - possessive = TRUE; - min = 1; - max = INT_MAX; - ecode++; - goto REPEATTYPE; - - case OP_TYPEPOSQUERY: - possessive = TRUE; - min = 0; - max = 1; - ecode++; - goto REPEATTYPE; - - case OP_TYPEPOSUPTO: - possessive = TRUE; - min = 0; - max = GET2(ecode, 1); - ecode += 3; - goto REPEATTYPE; - - case OP_TYPESTAR: - case OP_TYPEMINSTAR: - case OP_TYPEPLUS: - case OP_TYPEMINPLUS: - case OP_TYPEQUERY: - case OP_TYPEMINQUERY: - c = *ecode++ - OP_TYPESTAR; - minimize = (c & 1) != 0; - min = rep_min[c]; /* Pick up values from tables; */ - max = rep_max[c]; /* zero for max => infinity */ - if (max == 0) max = INT_MAX; - - /* Common code for all repeated single character type matches. 
Note that - in UTF-8 mode, '.' matches a character of any length, but for the other - character types, the valid characters are all one-byte long. */ - - REPEATTYPE: - ctype = *ecode++; /* Code for the character type */ - -#ifdef SUPPORT_UCP - if (ctype == OP_PROP || ctype == OP_NOTPROP) - { - prop_fail_result = ctype == OP_NOTPROP; - prop_type = *ecode++; - prop_value = *ecode++; - } - else prop_type = -1; -#endif - - /* First, ensure the minimum number of matches are present. Use inline - code for maximizing the speed, and do the type test once at the start - (i.e. keep it out of the loop). Also we can test that there are at least - the minimum number of bytes before we start. This isn't as effective in - UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that - is tidier. Also separate the UCP code, which can be the same for both UTF-8 - and single-bytes. */ - - if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); - if (min > 0) - { -#ifdef SUPPORT_UCP - if (prop_type >= 0) - { - switch(prop_type) - { - case PT_ANY: - if (prop_fail_result) RRETURN(MATCH_NOMATCH); - for (i = 1; i <= min; i++) - { - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINCTEST(c, eptr); - } - break; - - case PT_LAMP: - for (i = 1; i <= min; i++) - { - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINCTEST(c, eptr); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); - if ((prop_chartype == ucp_Lu || - prop_chartype == ucp_Ll || - prop_chartype == ucp_Lt) == prop_fail_result) - RRETURN(MATCH_NOMATCH); - } - break; - - case PT_GC: - for (i = 1; i <= min; i++) - { - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINCTEST(c, eptr); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); - if ((prop_category == prop_value) == prop_fail_result) - RRETURN(MATCH_NOMATCH); - } - break; - - case PT_PC: - for (i = 1; i <= min; i++) - { - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - 
GETCHARINCTEST(c, eptr); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); - if ((prop_chartype == prop_value) == prop_fail_result) - RRETURN(MATCH_NOMATCH); - } - break; - - case PT_SC: - for (i = 1; i <= min; i++) - { - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINCTEST(c, eptr); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); - if ((prop_script == prop_value) == prop_fail_result) - RRETURN(MATCH_NOMATCH); - } - break; - - default: - RRETURN(PCRE_ERROR_INTERNAL); - } - } - - /* Match extended Unicode sequences. We will get here only if the - support is in the binary; otherwise a compile-time error occurs. */ - - else if (ctype == OP_EXTUNI) - { - for (i = 1; i <= min; i++) - { - GETCHARINCTEST(c, eptr); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); - if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH); - while (eptr < md->end_subject) - { - int len = 1; - if (!utf8) c = *eptr; else - { - GETCHARLEN(c, eptr, len); - } - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); - if (prop_category != ucp_M) break; - eptr += len; - } - } - } - - else -#endif /* SUPPORT_UCP */ - -/* Handle all other cases when the coding is UTF-8 */ - -#ifdef SUPPORT_UTF8 - if (utf8) switch(ctype) - { - case OP_ANY: - for (i = 1; i <= min; i++) - { - if (eptr >= md->end_subject || - ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr))) - RRETURN(MATCH_NOMATCH); - eptr++; - while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; - } - break; - - case OP_ANYBYTE: - eptr += min; - break; - - case OP_ANYNL: - for (i = 1; i <= min; i++) - { - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINC(c, eptr); - switch(c) - { - default: RRETURN(MATCH_NOMATCH); - case 0x000d: - if (eptr < md->end_subject && *eptr == 0x0a) eptr++; - break; - - case 0x000a: - break; - - case 0x000b: - case 0x000c: - case 0x0085: - case 0x2028: - case 0x2029: - if (md->bsr_anycrlf) 
RRETURN(MATCH_NOMATCH); - break; - } - } - break; - - case OP_NOT_HSPACE: - for (i = 1; i <= min; i++) - { - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINC(c, eptr); - switch(c) - { - default: break; - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ - RRETURN(MATCH_NOMATCH); - } - } - break; - - case OP_HSPACE: - for (i = 1; i <= min; i++) - { - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINC(c, eptr); - switch(c) - { - default: RRETURN(MATCH_NOMATCH); - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ - break; - } - } - break; - - case OP_NOT_VSPACE: - for (i = 1; i <= min; i++) - { - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINC(c, eptr); - switch(c) - { - default: break; - case 0x0a: /* LF */ - case 0x0b: /* VT */ - 
case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ - case 0x2028: /* LINE SEPARATOR */ - case 0x2029: /* PARAGRAPH SEPARATOR */ - RRETURN(MATCH_NOMATCH); - } - } - break; - - case OP_VSPACE: - for (i = 1; i <= min; i++) - { - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINC(c, eptr); - switch(c) - { - default: RRETURN(MATCH_NOMATCH); - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ - case 0x2028: /* LINE SEPARATOR */ - case 0x2029: /* PARAGRAPH SEPARATOR */ - break; - } - } - break; - - case OP_NOT_DIGIT: - for (i = 1; i <= min; i++) - { - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINC(c, eptr); - if (c < 128 && (md->ctypes[c] & ctype_digit) != 0) - RRETURN(MATCH_NOMATCH); - } - break; - - case OP_DIGIT: - for (i = 1; i <= min; i++) - { - if (eptr >= md->end_subject || - *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0) - RRETURN(MATCH_NOMATCH); - /* No need to skip more bytes - we know it's a 1-byte character */ - } - break; - - case OP_NOT_WHITESPACE: - for (i = 1; i <= min; i++) - { - if (eptr >= md->end_subject || - (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)) - RRETURN(MATCH_NOMATCH); - while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80); - } - break; - - case OP_WHITESPACE: - for (i = 1; i <= min; i++) - { - if (eptr >= md->end_subject || - *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0) - RRETURN(MATCH_NOMATCH); - /* No need to skip more bytes - we know it's a 1-byte character */ - } - break; - - case OP_NOT_WORDCHAR: - for (i = 1; i <= min; i++) - { - if (eptr >= md->end_subject || - (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)) - RRETURN(MATCH_NOMATCH); - while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80); - } - break; - - case OP_WORDCHAR: - for (i = 1; i <= min; i++) - { - if (eptr >= md->end_subject || - *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0) - RRETURN(MATCH_NOMATCH); - 
/* No need to skip more bytes - we know it's a 1-byte character */ - } - break; - - default: - RRETURN(PCRE_ERROR_INTERNAL); - } /* End switch(ctype) */ - - else -#endif /* SUPPORT_UTF8 */ - - /* Code for the non-UTF-8 case for minimum matching of operators other - than OP_PROP and OP_NOTPROP. We can assume that there are the minimum - number of bytes present, as this was tested above. */ - - switch(ctype) - { - case OP_ANY: - if ((ims & PCRE_DOTALL) == 0) - { - for (i = 1; i <= min; i++) - { - if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); - eptr++; - } - } - else eptr += min; - break; - - case OP_ANYBYTE: - eptr += min; - break; - - /* Because of the CRLF case, we can't assume the minimum number of - bytes are present in this case. */ - - case OP_ANYNL: - for (i = 1; i <= min; i++) - { - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - switch(*eptr++) - { - default: RRETURN(MATCH_NOMATCH); - case 0x000d: - if (eptr < md->end_subject && *eptr == 0x0a) eptr++; - break; - case 0x000a: - break; - - case 0x000b: - case 0x000c: - case 0x0085: - if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); - break; - } - } - break; - - case OP_NOT_HSPACE: - for (i = 1; i <= min; i++) - { - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - switch(*eptr++) - { - default: break; - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ - RRETURN(MATCH_NOMATCH); - } - } - break; - - case OP_HSPACE: - for (i = 1; i <= min; i++) - { - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - switch(*eptr++) - { - default: RRETURN(MATCH_NOMATCH); - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ - break; - } - } - break; - - case OP_NOT_VSPACE: - for (i = 1; i <= min; i++) - { - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - switch(*eptr++) - { - default: break; - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ - RRETURN(MATCH_NOMATCH); - } - } - break; - - case OP_VSPACE: - for (i = 
1; i <= min; i++) - { - if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - switch(*eptr++) - { - default: RRETURN(MATCH_NOMATCH); - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ - break; - } - } - break; - - case OP_NOT_DIGIT: - for (i = 1; i <= min; i++) - if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); - break; - - case OP_DIGIT: - for (i = 1; i <= min; i++) - if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); - break; - - case OP_NOT_WHITESPACE: - for (i = 1; i <= min; i++) - if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH); - break; - - case OP_WHITESPACE: - for (i = 1; i <= min; i++) - if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); - break; - - case OP_NOT_WORDCHAR: - for (i = 1; i <= min; i++) - if ((md->ctypes[*eptr++] & ctype_word) != 0) - RRETURN(MATCH_NOMATCH); - break; - - case OP_WORDCHAR: - for (i = 1; i <= min; i++) - if ((md->ctypes[*eptr++] & ctype_word) == 0) - RRETURN(MATCH_NOMATCH); - break; - - default: - RRETURN(PCRE_ERROR_INTERNAL); - } - } - - /* If min = max, continue at the same level without recursing */ - - if (min == max) continue; - - /* If minimizing, we have to test the rest of the pattern before each - subsequent match. Again, separate the UTF-8 case for speed, and also - separate the UCP cases. 
*/ - - if (minimize) - { -#ifdef SUPPORT_UCP - if (prop_type >= 0) - { - switch(prop_type) - { - case PT_ANY: - for (fi = min;; fi++) - { - RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINC(c, eptr); - if (prop_fail_result) RRETURN(MATCH_NOMATCH); - } - /* Control never gets here */ - - case PT_LAMP: - for (fi = min;; fi++) - { - RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINC(c, eptr); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); - if ((prop_chartype == ucp_Lu || - prop_chartype == ucp_Ll || - prop_chartype == ucp_Lt) == prop_fail_result) - RRETURN(MATCH_NOMATCH); - } - /* Control never gets here */ - - case PT_GC: - for (fi = min;; fi++) - { - RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINC(c, eptr); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); - if ((prop_category == prop_value) == prop_fail_result) - RRETURN(MATCH_NOMATCH); - } - /* Control never gets here */ - - case PT_PC: - for (fi = min;; fi++) - { - RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINC(c, eptr); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); - if ((prop_chartype == prop_value) == prop_fail_result) - RRETURN(MATCH_NOMATCH); - } - /* Control never gets here */ - - case PT_SC: - for (fi = min;; fi++) - { - RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINC(c, eptr); - 
prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); - if ((prop_script == prop_value) == prop_fail_result) - RRETURN(MATCH_NOMATCH); - } - /* Control never gets here */ - - default: - RRETURN(PCRE_ERROR_INTERNAL); - } - } - - /* Match extended Unicode sequences. We will get here only if the - support is in the binary; otherwise a compile-time error occurs. */ - - else if (ctype == OP_EXTUNI) - { - for (fi = min;; fi++) - { - RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINCTEST(c, eptr); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); - if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH); - while (eptr < md->end_subject) - { - int len = 1; - if (!utf8) c = *eptr; else - { - GETCHARLEN(c, eptr, len); - } - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); - if (prop_category != ucp_M) break; - eptr += len; - } - } - } - - else -#endif /* SUPPORT_UCP */ - -#ifdef SUPPORT_UTF8 - /* UTF-8 mode */ - if (utf8) - { - for (fi = min;; fi++) - { - RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || eptr >= md->end_subject || - (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 && - IS_NEWLINE(eptr))) - RRETURN(MATCH_NOMATCH); - - GETCHARINC(c, eptr); - switch(ctype) - { - case OP_ANY: /* This is the DOTALL case */ - break; - - case OP_ANYBYTE: - break; - - case OP_ANYNL: - switch(c) - { - default: RRETURN(MATCH_NOMATCH); - case 0x000d: - if (eptr < md->end_subject && *eptr == 0x0a) eptr++; - break; - case 0x000a: - break; - - case 0x000b: - case 0x000c: - case 0x0085: - case 0x2028: - case 0x2029: - if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); - break; - } - break; - - case OP_NOT_HSPACE: - switch(c) - { - default: break; - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ - case 0x1680: /* OGHAM SPACE MARK */ 
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ - RRETURN(MATCH_NOMATCH); - } - break; - - case OP_HSPACE: - switch(c) - { - default: RRETURN(MATCH_NOMATCH); - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ - break; - } - break; - - case OP_NOT_VSPACE: - switch(c) - { - default: break; - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ - case 0x2028: /* LINE SEPARATOR */ - case 0x2029: /* PARAGRAPH SEPARATOR */ - RRETURN(MATCH_NOMATCH); - } - break; - - case OP_VSPACE: - switch(c) - { - default: RRETURN(MATCH_NOMATCH); - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ - case 0x2028: /* LINE SEPARATOR */ - case 0x2029: /* PARAGRAPH SEPARATOR */ - break; - } - break; - - case OP_NOT_DIGIT: - if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) - 
RRETURN(MATCH_NOMATCH); - break; - - case OP_DIGIT: - if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0) - RRETURN(MATCH_NOMATCH); - break; - - case OP_NOT_WHITESPACE: - if (c < 256 && (md->ctypes[c] & ctype_space) != 0) - RRETURN(MATCH_NOMATCH); - break; - - case OP_WHITESPACE: - if (c >= 256 || (md->ctypes[c] & ctype_space) == 0) - RRETURN(MATCH_NOMATCH); - break; - - case OP_NOT_WORDCHAR: - if (c < 256 && (md->ctypes[c] & ctype_word) != 0) - RRETURN(MATCH_NOMATCH); - break; - - case OP_WORDCHAR: - if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) - RRETURN(MATCH_NOMATCH); - break; - - default: - RRETURN(PCRE_ERROR_INTERNAL); - } - } - } - else -#endif - /* Not UTF-8 mode */ - { - for (fi = min;; fi++) - { - RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || eptr >= md->end_subject || - ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr))) - RRETURN(MATCH_NOMATCH); - - c = *eptr++; - switch(ctype) - { - case OP_ANY: /* This is the DOTALL case */ - break; - - case OP_ANYBYTE: - break; - - case OP_ANYNL: - switch(c) - { - default: RRETURN(MATCH_NOMATCH); - case 0x000d: - if (eptr < md->end_subject && *eptr == 0x0a) eptr++; - break; - - case 0x000a: - break; - - case 0x000b: - case 0x000c: - case 0x0085: - if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); - break; - } - break; - - case OP_NOT_HSPACE: - switch(c) - { - default: break; - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ - RRETURN(MATCH_NOMATCH); - } - break; - - case OP_HSPACE: - switch(c) - { - default: RRETURN(MATCH_NOMATCH); - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ - break; - } - break; - - case OP_NOT_VSPACE: - switch(c) - { - default: break; - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ - RRETURN(MATCH_NOMATCH); - } - break; - - case OP_VSPACE: - switch(c) - { - default: RRETURN(MATCH_NOMATCH); - case 0x0a: /* LF */ - case 
0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ - break; - } - break; - - case OP_NOT_DIGIT: - if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); - break; - - case OP_DIGIT: - if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); - break; - - case OP_NOT_WHITESPACE: - if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH); - break; - - case OP_WHITESPACE: - if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); - break; - - case OP_NOT_WORDCHAR: - if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH); - break; - - case OP_WORDCHAR: - if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH); - break; - - default: - RRETURN(PCRE_ERROR_INTERNAL); - } - } - } - /* Control never gets here */ - } - - /* If maximizing, it is worth using inline code for speed, doing the type - test once at the start (i.e. keep it out of the loop). Again, keep the - UTF-8 and UCP stuff separate. */ - - else - { - pp = eptr; /* Remember where we started */ - -#ifdef SUPPORT_UCP - if (prop_type >= 0) - { - switch(prop_type) - { - case PT_ANY: - for (i = min; i < max; i++) - { - int len = 1; - if (eptr >= md->end_subject) break; - GETCHARLEN(c, eptr, len); - if (prop_fail_result) break; - eptr+= len; - } - break; - - case PT_LAMP: - for (i = min; i < max; i++) - { - int len = 1; - if (eptr >= md->end_subject) break; - GETCHARLEN(c, eptr, len); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); - if ((prop_chartype == ucp_Lu || - prop_chartype == ucp_Ll || - prop_chartype == ucp_Lt) == prop_fail_result) - break; - eptr+= len; - } - break; - - case PT_GC: - for (i = min; i < max; i++) - { - int len = 1; - if (eptr >= md->end_subject) break; - GETCHARLEN(c, eptr, len); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); - if ((prop_category == prop_value) == prop_fail_result) - break; - eptr+= len; - } - break; - - case PT_PC: - for (i = min; i < max; i++) - { - int len = 
1; - if (eptr >= md->end_subject) break; - GETCHARLEN(c, eptr, len); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); - if ((prop_chartype == prop_value) == prop_fail_result) - break; - eptr+= len; - } - break; - - case PT_SC: - for (i = min; i < max; i++) - { - int len = 1; - if (eptr >= md->end_subject) break; - GETCHARLEN(c, eptr, len); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); - if ((prop_script == prop_value) == prop_fail_result) - break; - eptr+= len; - } - break; - } - - /* eptr is now past the end of the maximum run */ - - if (possessive) continue; - for(;;) - { - RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (eptr-- == pp) break; /* Stop if tried at original pos */ - if (utf8) BACKCHAR(eptr); - } - } - - /* Match extended Unicode sequences. We will get here only if the - support is in the binary; otherwise a compile-time error occurs. */ - - else if (ctype == OP_EXTUNI) - { - for (i = min; i < max; i++) - { - if (eptr >= md->end_subject) break; - GETCHARINCTEST(c, eptr); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); - if (prop_category == ucp_M) break; - while (eptr < md->end_subject) - { - int len = 1; - if (!utf8) c = *eptr; else - { - GETCHARLEN(c, eptr, len); - } - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); - if (prop_category != ucp_M) break; - eptr += len; - } - } - - /* eptr is now past the end of the maximum run */ - - if (possessive) continue; - for(;;) - { - RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (eptr-- == pp) break; /* Stop if tried at original pos */ - for (;;) /* Move back over one extended */ - { - int len = 1; - if (!utf8) c = *eptr; else - { - BACKCHAR(eptr); - GETCHARLEN(c, eptr, len); - } - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); - if (prop_category != ucp_M) break; - eptr--; - } - } - 
} - - else -#endif /* SUPPORT_UCP */ - -#ifdef SUPPORT_UTF8 - /* UTF-8 mode */ - - if (utf8) - { - switch(ctype) - { - case OP_ANY: - if (max < INT_MAX) - { - if ((ims & PCRE_DOTALL) == 0) - { - for (i = min; i < max; i++) - { - if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; - eptr++; - while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; - } - } - else - { - for (i = min; i < max; i++) - { - if (eptr >= md->end_subject) break; - eptr++; - while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; - } - } - } - - /* Handle unlimited UTF-8 repeat */ - - else - { - if ((ims & PCRE_DOTALL) == 0) - { - for (i = min; i < max; i++) - { - if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; - eptr++; - while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; - } - } - else - { - eptr = md->end_subject; - } - } - break; - - /* The byte case is the same as non-UTF8 */ - - case OP_ANYBYTE: - c = max - min; - if (c > (unsigned int)(md->end_subject - eptr)) - c = md->end_subject - eptr; - eptr += c; - break; - - case OP_ANYNL: - for (i = min; i < max; i++) - { - int len = 1; - if (eptr >= md->end_subject) break; - GETCHARLEN(c, eptr, len); - if (c == 0x000d) - { - if (++eptr >= md->end_subject) break; - if (*eptr == 0x000a) eptr++; - } - else - { - if (c != 0x000a && - (md->bsr_anycrlf || - (c != 0x000b && c != 0x000c && - c != 0x0085 && c != 0x2028 && c != 0x2029))) - break; - eptr += len; - } - } - break; - - case OP_NOT_HSPACE: - case OP_HSPACE: - for (i = min; i < max; i++) - { - BOOL gotspace; - int len = 1; - if (eptr >= md->end_subject) break; - GETCHARLEN(c, eptr, len); - switch(c) - { - default: gotspace = FALSE; break; - case 0x09: /* HT */ - case 0x20: /* SPACE */ - case 0xa0: /* NBSP */ - case 0x1680: /* OGHAM SPACE MARK */ - case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ - case 0x2000: /* EN QUAD */ - case 0x2001: /* EM QUAD */ - case 0x2002: /* EN SPACE */ - case 0x2003: /* EM SPACE */ - case 0x2004: /* THREE-PER-EM SPACE */ - 
case 0x2005: /* FOUR-PER-EM SPACE */ - case 0x2006: /* SIX-PER-EM SPACE */ - case 0x2007: /* FIGURE SPACE */ - case 0x2008: /* PUNCTUATION SPACE */ - case 0x2009: /* THIN SPACE */ - case 0x200A: /* HAIR SPACE */ - case 0x202f: /* NARROW NO-BREAK SPACE */ - case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ - case 0x3000: /* IDEOGRAPHIC SPACE */ - gotspace = TRUE; - break; - } - if (gotspace == (ctype == OP_NOT_HSPACE)) break; - eptr += len; - } - break; - - case OP_NOT_VSPACE: - case OP_VSPACE: - for (i = min; i < max; i++) - { - BOOL gotspace; - int len = 1; - if (eptr >= md->end_subject) break; - GETCHARLEN(c, eptr, len); - switch(c) - { - default: gotspace = FALSE; break; - case 0x0a: /* LF */ - case 0x0b: /* VT */ - case 0x0c: /* FF */ - case 0x0d: /* CR */ - case 0x85: /* NEL */ - case 0x2028: /* LINE SEPARATOR */ - case 0x2029: /* PARAGRAPH SEPARATOR */ - gotspace = TRUE; - break; - } - if (gotspace == (ctype == OP_NOT_VSPACE)) break; - eptr += len; - } - break; - - case OP_NOT_DIGIT: - for (i = min; i < max; i++) - { - int len = 1; - if (eptr >= md->end_subject) break; - GETCHARLEN(c, eptr, len); - if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break; - eptr+= len; - } - break; - - case OP_DIGIT: - for (i = min; i < max; i++) - { - int len = 1; - if (eptr >= md->end_subject) break; - GETCHARLEN(c, eptr, len); - if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break; - eptr+= len; - } - break; - - case OP_NOT_WHITESPACE: - for (i = min; i < max; i++) - { - int len = 1; - if (eptr >= md->end_subject) break; - GETCHARLEN(c, eptr, len); - if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break; - eptr+= len; - } - break; - - case OP_WHITESPACE: - for (i = min; i < max; i++) - { - int len = 1; - if (eptr >= md->end_subject) break; - GETCHARLEN(c, eptr, len); - if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break; - eptr+= len; - } - break; - - case OP_NOT_WORDCHAR: - for (i = min; i < max; i++) - { - int len = 1; - if (eptr >= md->end_subject) break; - 
GETCHARLEN(c, eptr, len); - if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break; - eptr+= len; - } - break; - - case OP_WORDCHAR: - for (i = min; i < max; i++) - { - int len = 1; - if (eptr >= md->end_subject) break; - GETCHARLEN(c, eptr, len); - if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break; - eptr+= len; - } - break; - - default: - RRETURN(PCRE_ERROR_INTERNAL); - } - - /* eptr is now past the end of the maximum run */ - - if (possessive) continue; - for(;;) - { - RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (eptr-- == pp) break; /* Stop if tried at original pos */ - BACKCHAR(eptr); - } - } - else -#endif /* SUPPORT_UTF8 */ - - /* Not UTF-8 mode */ - { - switch(ctype) - { - case OP_ANY: - if ((ims & PCRE_DOTALL) == 0) - { - for (i = min; i < max; i++) - { - if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; - eptr++; - } - break; - } - /* For DOTALL case, fall through and treat as \C */ - - case OP_ANYBYTE: - c = max - min; - if (c > (unsigned int)(md->end_subject - eptr)) - c = md->end_subject - eptr; - eptr += c; - break; - - case OP_ANYNL: - for (i = min; i < max; i++) - { - if (eptr >= md->end_subject) break; - c = *eptr; - if (c == 0x000d) - { - if (++eptr >= md->end_subject) break; - if (*eptr == 0x000a) eptr++; - } - else - { - if (c != 0x000a && - (md->bsr_anycrlf || - (c != 0x000b && c != 0x000c && c != 0x0085))) - break; - eptr++; - } - } - break; - - case OP_NOT_HSPACE: - for (i = min; i < max; i++) - { - if (eptr >= md->end_subject) break; - c = *eptr; - if (c == 0x09 || c == 0x20 || c == 0xa0) break; - eptr++; - } - break; - - case OP_HSPACE: - for (i = min; i < max; i++) - { - if (eptr >= md->end_subject) break; - c = *eptr; - if (c != 0x09 && c != 0x20 && c != 0xa0) break; - eptr++; - } - break; - - case OP_NOT_VSPACE: - for (i = min; i < max; i++) - { - if (eptr >= md->end_subject) break; - c = *eptr; - if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85) 
- break; - eptr++; - } - break; - - case OP_VSPACE: - for (i = min; i < max; i++) - { - if (eptr >= md->end_subject) break; - c = *eptr; - if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85) - break; - eptr++; - } - break; - - case OP_NOT_DIGIT: - for (i = min; i < max; i++) - { - if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0) - break; - eptr++; - } - break; - - case OP_DIGIT: - for (i = min; i < max; i++) - { - if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0) - break; - eptr++; - } - break; - - case OP_NOT_WHITESPACE: - for (i = min; i < max; i++) - { - if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0) - break; - eptr++; - } - break; - - case OP_WHITESPACE: - for (i = min; i < max; i++) - { - if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0) - break; - eptr++; - } - break; - - case OP_NOT_WORDCHAR: - for (i = min; i < max; i++) - { - if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0) - break; - eptr++; - } - break; - - case OP_WORDCHAR: - for (i = min; i < max; i++) - { - if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0) - break; - eptr++; - } - break; - - default: - RRETURN(PCRE_ERROR_INTERNAL); - } - - /* eptr is now past the end of the maximum run */ - - if (possessive) continue; - while (eptr >= pp) - { - RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47); - eptr--; - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - } - } - - /* Get here if we can't make it match with any permitted repetitions */ - - RRETURN(MATCH_NOMATCH); - } - /* Control never gets here */ - - /* There's been some horrible disaster. Arrival here can only mean there is - something seriously wrong in the code above or the OP_xxx definitions. 
*/ - - default: - DPRINTF(("Unknown opcode %d\n", *ecode)); - RRETURN(PCRE_ERROR_UNKNOWN_OPCODE); + /* First check that we haven't called match() too many times, or that we + haven't exceeded the recursive call limit. */ + + if (md->match_call_count++ >= md->match_limit) + RRETURN(PCRE_ERROR_MATCHLIMIT); + if (rdepth >= md->match_limit_recursion) + RRETURN(PCRE_ERROR_RECURSIONLIMIT); + + original_ims = ims; /* Save for resetting on ')' */ + + /* At the start of a group with an unlimited repeat that may match an empty + string, the match_cbegroup flag is set. When this is the case, add the + current subject pointer to the chain of such remembered pointers, to be + checked when we hit the closing ket, in order to break infinite loops that + match no characters. When match() is called in other circumstances, don't + add to the chain. The match_cbegroup flag must NOT be used with tail + recursion, because the memory block that is used is on the stack, so a new + one may be required for each match(). */ + + if ((flags & match_cbegroup) != 0) { + newptrb.epb_saved_eptr = eptr; + newptrb.epb_prev = eptrb; + eptrb = &newptrb; } - /* Do not stick any code in here without much thought; it is assumed - that "continue" in the code above comes out to here to repeat the main - loop. */ + /* Now start processing the opcodes. */ - } /* End of main loop */ -/* Control never reaches here */ + for (;;) { + minimize = possessive = FALSE; + op = *ecode; + /* For partial matching, remember if we ever hit the end of the subject + after matching at least one subject character. */ -/* When compiling to use the heap rather than the stack for recursive calls to -match(), the RRETURN() macro jumps here. The number that is saved in -frame->Xwhere indicates which label we actually want to return to. 
*/ + if (md->partial && eptr >= md->end_subject && eptr > mstart) + md->hitend = TRUE; + + switch (op) { + case OP_FAIL: + RRETURN(MATCH_NOMATCH); + + case OP_PRUNE: + RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, + ims, eptrb, flags, RM51); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + RRETURN(MATCH_PRUNE); + + case OP_COMMIT: + RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, + ims, eptrb, flags, RM52); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + RRETURN(MATCH_COMMIT); + + case OP_SKIP: + RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, + ims, eptrb, flags, RM53); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + md->start_match_ptr = eptr; /* Pass back current position */ + RRETURN(MATCH_SKIP); + + case OP_THEN: + RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, + ims, eptrb, flags, RM54); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + RRETURN(MATCH_THEN); + + /* Handle a capturing bracket. If there is space in the offset + vector, save the current subject position in the working slot at + the top of the vector. We mustn't change the current values of + the data slot, because they may be set from a previous iteration + of this group, and be referred to by a reference inside the + group. + + If the bracket fails to match, we need to restore this value and + also the values of the final offsets, in case they were set by a + previous iteration of the same bracket. + + If there isn't enough space in the offset vector, treat this as + if it were a non-capturing bracket. Don't worry about setting + the flag for the error case here; that is handled in the code + for KET. 
*/ + + case OP_CBRA: + case OP_SCBRA: + number = GET2(ecode, 1 + LINK_SIZE); + offset = number << 1; + +#ifdef DEBUG + printf("start bracket %d\n", number); + printf("subject="); + pchars(eptr, 16, TRUE, md); + printf("\n"); +#endif + + if (offset < md->offset_max) { + save_offset1 = md->offset_vector[offset]; + save_offset2 = md->offset_vector[offset + 1]; + save_offset3 = md->offset_vector[md->offset_end - number]; + save_capture_last = md->capture_last; + + DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, + save_offset3)); + md->offset_vector[md->offset_end - number] = + eptr - md->start_subject; + + flags = (op == OP_SCBRA) ? match_cbegroup : 0; + do { + RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], + offset_top, md, ims, eptrb, flags, RM1); + if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) + RRETURN(rrc); + md->capture_last = save_capture_last; + ecode += GET(ecode, 1); + } while (*ecode == OP_ALT); + + DPRINTF(("bracket %d failed\n", number)); + + md->offset_vector[offset] = save_offset1; + md->offset_vector[offset + 1] = save_offset2; + md->offset_vector[md->offset_end - number] = save_offset3; + + RRETURN(MATCH_NOMATCH); + } + + /* FALL THROUGH ... Insufficient room for saving captured + contents. Treat as a non-capturing bracket. */ + + /* VVVVVVVVVVVVVVVVVVVVVVVVV */ + /* VVVVVVVVVVVVVVVVVVVVVVVVV */ + + DPRINTF( + ("insufficient capture room: treat as non-capturing\n")); + + /* VVVVVVVVVVVVVVVVVVVVVVVVV */ + /* VVVVVVVVVVVVVVVVVVVVVVVVV */ + + /* Non-capturing bracket. Loop for all the alternatives. When we + get to the final alternative within the brackets, we would + return the result of a recursive call to match() whatever + happened. We can reduce stack usage by turning this into a tail + recursion, except in the case when match_cbegroup is set.*/ + + case OP_BRA: + case OP_SBRA: + DPRINTF(("start non-capturing bracket\n")); + flags = (op >= OP_SBRA) ? 
match_cbegroup : 0; + for (;;) { + if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */ + { + if (flags == 0) /* Not a possibly empty group */ + { + ecode += _pcre_OP_lengths[*ecode]; + DPRINTF(("bracket 0 tail recursion\n")); + goto TAIL_RECURSE; + } + + /* Possibly empty group; can't use tail recursion. */ + + RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], + offset_top, md, ims, eptrb, flags, RM48); + RRETURN(rrc); + } + + /* For non-final alternatives, continue the loop for a + NOMATCH result; otherwise return. */ + + RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, + md, ims, eptrb, flags, RM2); + if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) + RRETURN(rrc); + ecode += GET(ecode, 1); + } + /* Control never reaches here. */ + + /* Conditional group: compilation checked that there are no more + than two branches. If the condition is false, skipping the first + branch takes us past the end if there is only one branch, but + that's OK because that is exactly what going to the ket would + do. As there is only one branch to be obeyed, we can use tail + recursion to avoid using another stack frame. */ + + case OP_COND: + case OP_SCOND: + if (ecode[LINK_SIZE + 1] == OP_RREF) /* Recursion test */ + { + offset = + GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/ + condition = md->recursive != NULL && + (offset == RREF_ANY || + offset == md->recursive->group_num); + ecode += condition ? 3 : GET(ecode, 1); + } + + else if (ecode[LINK_SIZE + 1] == OP_CREF) /* Group used test */ + { + offset = GET2(ecode, LINK_SIZE + 2) + << 1; /* Doubled ref number */ + condition = + offset < offset_top && md->offset_vector[offset] >= 0; + ecode += condition ? 3 : GET(ecode, 1); + } + + else if (ecode[LINK_SIZE + 1] == + OP_DEF) /* DEFINE - always false */ + { + condition = FALSE; + ecode += GET(ecode, 1); + } + + /* The condition is an assertion. 
Call match() to evaluate it - + setting the final argument match_condassert causes it to stop at + the end of an assertion. */ + + else { + RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, + NULL, match_condassert, RM3); + if (rrc == MATCH_MATCH) { + condition = TRUE; + ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2); + while (*ecode == OP_ALT) + ecode += GET(ecode, 1); + } else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) { + RRETURN( + rrc); /* Need braces because of following else */ + } else { + condition = FALSE; + ecode += GET(ecode, 1); + } + } + + /* We are now at the branch that is to be obeyed. As there is + only one, we can use tail recursion to avoid using another stack + frame, except when match_cbegroup is required for an unlimited + repeat of a possibly empty group. If the second alternative + doesn't exist, we can just plough on. */ + + if (condition || *ecode == OP_ALT) { + ecode += 1 + LINK_SIZE; + if (op == OP_SCOND) /* Possibly empty group */ + { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, + match_cbegroup, RM49); + RRETURN(rrc); + } else /* Group must match something */ + { + flags = 0; + goto TAIL_RECURSE; + } + } else /* Condition false & no 2nd alternative */ + { + ecode += 1 + LINK_SIZE; + } + break; + + /* End of the pattern, either real or forced. If we are in a + top-level recursion, we should restore the offsets appropriately + and continue from after the call. */ + + case OP_ACCEPT: + case OP_END: + if (md->recursive != NULL && md->recursive->group_num == 0) { + recursion_info* rec = md->recursive; + DPRINTF(("End of pattern in a (?0) recursion\n")); + md->recursive = rec->prevrec; + memmove(md->offset_vector, rec->offset_save, + rec->saved_max * sizeof(int)); + mstart = rec->save_start; + ims = original_ims; + ecode = rec->after_call; + break; + } + + /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched + an empty string - backtracking will then try other alternatives, + if any. 
*/ + + if (md->notempty && eptr == mstart) + RRETURN(MATCH_NOMATCH); + md->end_match_ptr = eptr; /* Record where we ended */ + md->end_offset_top = + offset_top; /* and how many extracts were taken */ + md->start_match_ptr = + mstart; /* and the start (\K can modify) */ + RRETURN(MATCH_MATCH); + + /* Change option settings */ + + case OP_OPT: + ims = ecode[1]; + ecode += 2; + DPRINTF(("ims set to %02lx\n", ims)); + break; + + /* Assertion brackets. Check the alternative branches in turn - + the matching won't pass the KET for an assertion. If any one + branch matches, the assertion is true. Lookbehind assertions + have an OP_REVERSE item at the start of each branch to move the + current point backwards, so the code at this level is identical + to the lookahead case. */ + + case OP_ASSERT: + case OP_ASSERTBACK: + do { + RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, + NULL, 0, RM4); + if (rrc == MATCH_MATCH) + break; + if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) + RRETURN(rrc); + ecode += GET(ecode, 1); + } while (*ecode == OP_ALT); + if (*ecode == OP_KET) + RRETURN(MATCH_NOMATCH); + + /* If checking an assertion for a condition, return MATCH_MATCH. + */ + + if ((flags & match_condassert) != 0) + RRETURN(MATCH_MATCH); + + /* Continue from after the assertion, updating the offsets high + water mark, since extracts may have been taken during the + assertion. 
*/ + + do + ecode += GET(ecode, 1); + while (*ecode == OP_ALT); + ecode += 1 + LINK_SIZE; + offset_top = md->end_offset_top; + continue; + + /* Negative assertion: all branches must fail to match */ + + case OP_ASSERT_NOT: + case OP_ASSERTBACK_NOT: + do { + RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, + NULL, 0, RM5); + if (rrc == MATCH_MATCH) + RRETURN(MATCH_NOMATCH); + if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) + RRETURN(rrc); + ecode += GET(ecode, 1); + } while (*ecode == OP_ALT); + + if ((flags & match_condassert) != 0) + RRETURN(MATCH_MATCH); + + ecode += 1 + LINK_SIZE; + continue; + + /* Move the subject pointer back. This occurs only at the start + of each branch of a lookbehind assertion. If we are too close to + the start to move back, this match function fails. When working + with UTF-8 we move back a number of characters, not bytes. */ + + case OP_REVERSE: +#ifdef SUPPORT_UTF8 + if (utf8) { + i = GET(ecode, 1); + while (i-- > 0) { + eptr--; + if (eptr < md->start_subject) + RRETURN(MATCH_NOMATCH); + BACKCHAR(eptr); + } + } else +#endif + + /* No UTF-8 support, or not in UTF-8 mode: count is byte count + */ + + { + eptr -= GET(ecode, 1); + if (eptr < md->start_subject) + RRETURN(MATCH_NOMATCH); + } + + /* Skip to next op code */ + + ecode += 1 + LINK_SIZE; + break; + + /* The callout item calls an external function, if one is + provided, passing details of the match so far. This is mainly + for debugging, though the function is able to force a failure. 
+ */ + + case OP_CALLOUT: + if (pcre_callout != NULL) { + pcre_callout_block cb; + cb.version = 1; /* Version 1 of the callout block */ + cb.callout_number = ecode[1]; + cb.offset_vector = md->offset_vector; + cb.subject = (PCRE_SPTR)md->start_subject; + cb.subject_length = md->end_subject - md->start_subject; + cb.start_match = mstart - md->start_subject; + cb.current_position = eptr - md->start_subject; + cb.pattern_position = GET(ecode, 2); + cb.next_item_length = GET(ecode, 2 + LINK_SIZE); + cb.capture_top = offset_top / 2; + cb.capture_last = md->capture_last; + cb.callout_data = md->callout_data; + if ((rrc = (*pcre_callout)(&cb)) > 0) + RRETURN(MATCH_NOMATCH); + if (rrc < 0) + RRETURN(rrc); + } + ecode += 2 + 2 * LINK_SIZE; + break; + + /* Recursion either matches the current regex, or some + subexpression. The offset data is the offset to the starting + bracket from the start of the whole pattern. (This is so that it + works from duplicated subpatterns.) + + If there are any capturing brackets started but not finished, we + have to save their starting points and reinstate them after the + recursion. However, we don't know how many such there are + (offset_top records the completed total) so we just have to save + all the potential data. There may be up to 65535 such values, + which is too large to put on the stack, but using malloc for + small numbers seems expensive. As a compromise, the stack is + used when there are no more than REC_STACK_SAVE_MAX values to + store; otherwise malloc is used. A problem is what to do if the + malloc fails ... there is no way of returning to the top level + with an error. Save the top REC_STACK_SAVE_MAX values on the + stack, and accept that the rest may be wrong. + + There are also other values that have to be saved. We use a + chained sequence of blocks that actually live on the stack. + Thanks to Robin Houston for the original version of this logic. 
+ */ + + case OP_RECURSE: { + callpat = md->start_code + GET(ecode, 1); + new_recursive.group_num = (callpat == md->start_code) + ? 0 + : GET2(callpat, 1 + LINK_SIZE); + + /* Add to "recursing stack" */ + + new_recursive.prevrec = md->recursive; + md->recursive = &new_recursive; + + /* Find where to continue from afterwards */ + + ecode += 1 + LINK_SIZE; + new_recursive.after_call = ecode; + + /* Now save the offset data. */ + + new_recursive.saved_max = md->offset_end; + if (new_recursive.saved_max <= REC_STACK_SAVE_MAX) + new_recursive.offset_save = stacksave; + else { + new_recursive.offset_save = + (int*)(pcre_malloc)(new_recursive.saved_max * + sizeof(int)); + if (new_recursive.offset_save == NULL) + RRETURN(PCRE_ERROR_NOMEMORY); + } + + memcpy(new_recursive.offset_save, md->offset_vector, + new_recursive.saved_max * sizeof(int)); + new_recursive.save_start = mstart; + mstart = eptr; + + /* OK, now we can do the recursion. For each top-level + alternative we restore the offset and recursion data. */ + + DPRINTF(("Recursing into group %d\n", new_recursive.group_num)); + flags = (*callpat >= OP_SBRA) ? 
match_cbegroup : 0; + do { + RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], + offset_top, md, ims, eptrb, flags, RM6); + if (rrc == MATCH_MATCH) { + DPRINTF(("Recursion matched\n")); + md->recursive = new_recursive.prevrec; + if (new_recursive.offset_save != stacksave) + (pcre_free)(new_recursive.offset_save); + RRETURN(MATCH_MATCH); + } else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) { + DPRINTF(("Recursion gave error %d\n", rrc)); + RRETURN(rrc); + } + + md->recursive = &new_recursive; + memcpy(md->offset_vector, new_recursive.offset_save, + new_recursive.saved_max * sizeof(int)); + callpat += GET(callpat, 1); + } while (*callpat == OP_ALT); + + DPRINTF(("Recursion didn't match\n")); + md->recursive = new_recursive.prevrec; + if (new_recursive.offset_save != stacksave) + (pcre_free)(new_recursive.offset_save); + RRETURN(MATCH_NOMATCH); + } + /* Control never reaches here */ + + /* "Once" brackets are like assertion brackets except that after + a match, the point in the subject string is not moved back. Thus + there can never be a move back into the brackets. Friedl calls + these "atomic" subpatterns. Check the alternative branches in + turn - the matching won't pass the KET for this kind of + subpattern. If any one branch matches, we carry on as at the end + of a normal bracket, leaving the subject pointer. */ + + case OP_ONCE: + prev = ecode; + saved_eptr = eptr; + + do { + RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, + eptrb, 0, RM7); + if (rrc == MATCH_MATCH) + break; + if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) + RRETURN(rrc); + ecode += GET(ecode, 1); + } while (*ecode == OP_ALT); + + /* If hit the end of the group (which could be repeated), fail + */ + + if (*ecode != OP_ONCE && *ecode != OP_ALT) + RRETURN(MATCH_NOMATCH); + + /* Continue as from after the assertion, updating the offsets + high water mark, since extracts may have been taken. 
*/ + + do + ecode += GET(ecode, 1); + while (*ecode == OP_ALT); + + offset_top = md->end_offset_top; + eptr = md->end_match_ptr; + + /* For a non-repeating ket, just continue at this level. This + also happens for a repeating ket if no characters were matched + in the group. This is the forcible breaking of infinite loops as + implemented in Perl 5.005. If there is an options reset, it will + get obeyed in the normal course of events. */ + + if (*ecode == OP_KET || eptr == saved_eptr) { + ecode += 1 + LINK_SIZE; + break; + } + + /* The repeating kets try the rest of the pattern or restart + from the preceding bracket, in the appropriate order. The second + "call" of match() uses tail recursion, to avoid using another + stack frame. We need to reset any options that changed within + the bracket before re-running it, so check the next opcode. */ + + if (ecode[1 + LINK_SIZE] == OP_OPT) { + ims = (ims & ~PCRE_IMS) | ecode[4]; + DPRINTF(("ims set to %02lx at group repeat\n", ims)); + } + + if (*ecode == OP_KETRMIN) { + RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, + eptrb, 0, RM8); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + ecode = prev; + flags = 0; + goto TAIL_RECURSE; + } else /* OP_KETRMAX */ + { + RMATCH(eptr, prev, offset_top, md, ims, eptrb, + match_cbegroup, RM9); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + ecode += 1 + LINK_SIZE; + flags = 0; + goto TAIL_RECURSE; + } + /* Control never gets here */ + + /* An alternation is the end of a branch; scan along to find the + end of the bracketed group and go to there. */ + + case OP_ALT: + do + ecode += GET(ecode, 1); + while (*ecode == OP_ALT); + break; + + /* BRAZERO and BRAMINZERO occur just before a bracket group, + indicating that it may occur zero times. It may repeat + infinitely, or not at all - i.e. it could be ()* or ()? in the + pattern. Brackets with fixed upper repeat limits are compiled as + a number of copies, with the optional ones preceded by BRAZERO + or BRAMINZERO. 
*/ + + case OP_BRAZERO: { + next = ecode + 1; + RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + do + next += GET(next, 1); + while (*next == OP_ALT); + ecode = next + 1 + LINK_SIZE; + } break; + + case OP_BRAMINZERO: { + next = ecode + 1; + do + next += GET(next, 1); + while (*next == OP_ALT); + RMATCH(eptr, next + 1 + LINK_SIZE, offset_top, md, ims, eptrb, + 0, RM11); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + ecode++; + } break; + + /* End of a group, repeated or non-repeating. */ + + case OP_KET: + case OP_KETRMIN: + case OP_KETRMAX: + prev = ecode - GET(ecode, 1); + + /* If this was a group that remembered the subject start, in + order to break infinite repeats of empty string matches, + retrieve the subject start from the chain. Otherwise, set it + NULL. */ + + if (*prev >= OP_SBRA) { + saved_eptr = + eptrb->epb_saved_eptr; /* Value at start of group */ + eptrb = eptrb->epb_prev; /* Backup to previous group */ + } else + saved_eptr = NULL; + + /* If we are at the end of an assertion group, stop matching and + return MATCH_MATCH, but record the current high water mark for + use by positive assertions. Do this also for the "once" (atomic) + groups. */ + + if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || + *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT || + *prev == OP_ONCE) { + md->end_match_ptr = eptr; /* For ONCE */ + md->end_offset_top = offset_top; + RRETURN(MATCH_MATCH); + } + + /* For capturing groups we have to check the group number back + at the start and if necessary complete handling an extraction by + setting the offsets and bumping the high water mark. Note that + whole-pattern recursion is coded as a recurse into group 0, so + it won't be picked up here. Instead, we catch it when the OP_END + is reached. Other recursion is handled here. 
*/ + + if (*prev == OP_CBRA || *prev == OP_SCBRA) { + number = GET2(prev, 1 + LINK_SIZE); + offset = number << 1; + +#ifdef DEBUG + printf("end bracket %d", number); + printf("\n"); +#endif + + md->capture_last = number; + if (offset >= md->offset_max) + md->offset_overflow = TRUE; + else { + md->offset_vector[offset] = + md->offset_vector[md->offset_end - number]; + md->offset_vector[offset + 1] = + eptr - md->start_subject; + if (offset_top <= offset) + offset_top = offset + 2; + } + + /* Handle a recursively called group. Restore the offsets + appropriately and continue from after the call. */ + + if (md->recursive != NULL && + md->recursive->group_num == number) { + recursion_info* rec = md->recursive; + DPRINTF(("Recursion (%d) succeeded - continuing\n", + number)); + md->recursive = rec->prevrec; + mstart = rec->save_start; + memcpy(md->offset_vector, rec->offset_save, + rec->saved_max * sizeof(int)); + ecode = rec->after_call; + ims = original_ims; + break; + } + } + + /* For both capturing and non-capturing groups, reset the value + of the ims flags, in case they got changed during the group. */ + + ims = original_ims; + DPRINTF(("ims reset to %02lx\n", ims)); + + /* For a non-repeating ket, just continue at this level. This + also happens for a repeating ket if no characters were matched + in the group. This is the forcible breaking of infinite loops as + implemented in Perl 5.005. If there is an options reset, it will + get obeyed in the normal course of events. */ + + if (*ecode == OP_KET || eptr == saved_eptr) { + ecode += 1 + LINK_SIZE; + break; + } + + /* The repeating kets try the rest of the pattern or restart + from the preceding bracket, in the appropriate order. In the + second case, we can use tail recursion to avoid using another + stack frame, unless we have an unlimited repeat of a group that + can match an empty string. */ + + flags = (*prev >= OP_SBRA) ? 
match_cbegroup : 0; + + if (*ecode == OP_KETRMIN) { + RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, + eptrb, 0, RM12); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + if (flags != 0) /* Could match an empty string */ + { + RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, + RM50); + RRETURN(rrc); + } + ecode = prev; + goto TAIL_RECURSE; + } else /* OP_KETRMAX */ + { + RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + ecode += 1 + LINK_SIZE; + flags = 0; + goto TAIL_RECURSE; + } + /* Control never gets here */ + + /* Start of subject unless notbol, or after internal newline if + * multiline */ + + case OP_CIRC: + if (md->notbol && eptr == md->start_subject) + RRETURN(MATCH_NOMATCH); + if ((ims & PCRE_MULTILINE) != 0) { + if (eptr != md->start_subject && + (eptr == md->end_subject || !WAS_NEWLINE(eptr))) + RRETURN(MATCH_NOMATCH); + ecode++; + break; + } + /* ... else fall through */ + + /* Start of subject assertion */ + + case OP_SOD: + if (eptr != md->start_subject) + RRETURN(MATCH_NOMATCH); + ecode++; + break; + + /* Start of match assertion */ + + case OP_SOM: + if (eptr != md->start_subject + md->start_offset) + RRETURN(MATCH_NOMATCH); + ecode++; + break; + + /* Reset the start of match point */ + + case OP_SET_SOM: + mstart = eptr; + ecode++; + break; + + /* Assert before internal newline if multiline, or before a + terminating newline unless endonly is set, else end of subject + unless noteol is set. */ + + case OP_DOLL: + if ((ims & PCRE_MULTILINE) != 0) { + if (eptr < md->end_subject) { + if (!IS_NEWLINE(eptr)) + RRETURN(MATCH_NOMATCH); + } else { + if (md->noteol) + RRETURN(MATCH_NOMATCH); + } + ecode++; + break; + } else { + if (md->noteol) + RRETURN(MATCH_NOMATCH); + if (!md->endonly) { + if (eptr != md->end_subject && + (!IS_NEWLINE(eptr) || + eptr != md->end_subject - md->nllen)) + RRETURN(MATCH_NOMATCH); + ecode++; + break; + } + } + /* ... 
else fall through for endonly */ + + /* End of subject assertion (\z) */ + + case OP_EOD: + if (eptr < md->end_subject) + RRETURN(MATCH_NOMATCH); + ecode++; + break; + + /* End of subject or ending \n assertion (\Z) */ + + case OP_EODN: + if (eptr != md->end_subject && + (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen)) + RRETURN(MATCH_NOMATCH); + ecode++; + break; + + /* Word boundary assertions */ + + case OP_NOT_WORD_BOUNDARY: + case OP_WORD_BOUNDARY: { + /* Find out if the previous and current characters are "word" + characters. It takes a bit more work in UTF-8 mode. Characters > + 255 are assumed to be "non-word" characters. */ + +#ifdef SUPPORT_UTF8 + if (utf8) { + if (eptr == md->start_subject) + prev_is_word = FALSE; + else { + const uschar* lastptr = eptr - 1; + while ((*lastptr & 0xc0) == 0x80) + lastptr--; + GETCHAR(c, lastptr); + prev_is_word = + c < 256 && (md->ctypes[c] & ctype_word) != 0; + } + if (eptr >= md->end_subject) + cur_is_word = FALSE; + else { + GETCHAR(c, eptr); + cur_is_word = + c < 256 && (md->ctypes[c] & ctype_word) != 0; + } + } else +#endif + + /* More streamlined when not in UTF-8 mode */ + + { + prev_is_word = (eptr != md->start_subject) && + ((md->ctypes[eptr[-1]] & ctype_word) != 0); + cur_is_word = (eptr < md->end_subject) && + ((md->ctypes[*eptr] & ctype_word) != 0); + } + + /* Now see if the situation is what we want */ + + if ((*ecode++ == OP_WORD_BOUNDARY) + ? cur_is_word == prev_is_word + : cur_is_word != prev_is_word) + RRETURN(MATCH_NOMATCH); + } break; + + /* Match a single character type; inline for speed */ + + case OP_ANY: + if ((ims & PCRE_DOTALL) == 0) { + if (IS_NEWLINE(eptr)) + RRETURN(MATCH_NOMATCH); + } + if (eptr++ >= md->end_subject) + RRETURN(MATCH_NOMATCH); + if (utf8) + while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) + eptr++; + ecode++; + break; + + /* Match a single byte, even in UTF-8 mode. 
This opcode really + does match any byte, even newline, independent of the setting of + PCRE_DOTALL. */ + + case OP_ANYBYTE: + if (eptr++ >= md->end_subject) + RRETURN(MATCH_NOMATCH); + ecode++; + break; + + case OP_NOT_DIGIT: + if (eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + GETCHARINCTEST(c, eptr); + if ( +#ifdef SUPPORT_UTF8 + c < 256 && +#endif + (md->ctypes[c] & ctype_digit) != 0) + RRETURN(MATCH_NOMATCH); + ecode++; + break; + + case OP_DIGIT: + if (eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + GETCHARINCTEST(c, eptr); + if ( +#ifdef SUPPORT_UTF8 + c >= 256 || +#endif + (md->ctypes[c] & ctype_digit) == 0) + RRETURN(MATCH_NOMATCH); + ecode++; + break; + + case OP_NOT_WHITESPACE: + if (eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + GETCHARINCTEST(c, eptr); + if ( +#ifdef SUPPORT_UTF8 + c < 256 && +#endif + (md->ctypes[c] & ctype_space) != 0) + RRETURN(MATCH_NOMATCH); + ecode++; + break; + + case OP_WHITESPACE: + if (eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + GETCHARINCTEST(c, eptr); + if ( +#ifdef SUPPORT_UTF8 + c >= 256 || +#endif + (md->ctypes[c] & ctype_space) == 0) + RRETURN(MATCH_NOMATCH); + ecode++; + break; + + case OP_NOT_WORDCHAR: + if (eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + GETCHARINCTEST(c, eptr); + if ( +#ifdef SUPPORT_UTF8 + c < 256 && +#endif + (md->ctypes[c] & ctype_word) != 0) + RRETURN(MATCH_NOMATCH); + ecode++; + break; + + case OP_WORDCHAR: + if (eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + GETCHARINCTEST(c, eptr); + if ( +#ifdef SUPPORT_UTF8 + c >= 256 || +#endif + (md->ctypes[c] & ctype_word) == 0) + RRETURN(MATCH_NOMATCH); + ecode++; + break; + + case OP_ANYNL: + if (eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + GETCHARINCTEST(c, eptr); + switch (c) { + default: + RRETURN(MATCH_NOMATCH); + case 0x000d: + if (eptr < md->end_subject && *eptr == 0x0a) + eptr++; + break; + + case 0x000a: + break; + + case 0x000b: + case 0x000c: + case 0x0085: + case 0x2028: + case 0x2029: + if 
(md->bsr_anycrlf) + RRETURN(MATCH_NOMATCH); + break; + } + ecode++; + break; + + case OP_NOT_HSPACE: + if (eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + GETCHARINCTEST(c, eptr); + switch (c) { + default: + break; + case 0x09: /* HT */ + case 0x20: /* SPACE */ + case 0xa0: /* NBSP */ + case 0x1680: /* OGHAM SPACE MARK */ + case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ + case 0x2000: /* EN QUAD */ + case 0x2001: /* EM QUAD */ + case 0x2002: /* EN SPACE */ + case 0x2003: /* EM SPACE */ + case 0x2004: /* THREE-PER-EM SPACE */ + case 0x2005: /* FOUR-PER-EM SPACE */ + case 0x2006: /* SIX-PER-EM SPACE */ + case 0x2007: /* FIGURE SPACE */ + case 0x2008: /* PUNCTUATION SPACE */ + case 0x2009: /* THIN SPACE */ + case 0x200A: /* HAIR SPACE */ + case 0x202f: /* NARROW NO-BREAK SPACE */ + case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ + case 0x3000: /* IDEOGRAPHIC SPACE */ + RRETURN(MATCH_NOMATCH); + } + ecode++; + break; + + case OP_HSPACE: + if (eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + GETCHARINCTEST(c, eptr); + switch (c) { + default: + RRETURN(MATCH_NOMATCH); + case 0x09: /* HT */ + case 0x20: /* SPACE */ + case 0xa0: /* NBSP */ + case 0x1680: /* OGHAM SPACE MARK */ + case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ + case 0x2000: /* EN QUAD */ + case 0x2001: /* EM QUAD */ + case 0x2002: /* EN SPACE */ + case 0x2003: /* EM SPACE */ + case 0x2004: /* THREE-PER-EM SPACE */ + case 0x2005: /* FOUR-PER-EM SPACE */ + case 0x2006: /* SIX-PER-EM SPACE */ + case 0x2007: /* FIGURE SPACE */ + case 0x2008: /* PUNCTUATION SPACE */ + case 0x2009: /* THIN SPACE */ + case 0x200A: /* HAIR SPACE */ + case 0x202f: /* NARROW NO-BREAK SPACE */ + case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ + case 0x3000: /* IDEOGRAPHIC SPACE */ + break; + } + ecode++; + break; + + case OP_NOT_VSPACE: + if (eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + GETCHARINCTEST(c, eptr); + switch (c) { + default: + break; + case 0x0a: /* LF */ + case 0x0b: /* VT */ + case 0x0c: /* FF */ + case 0x0d: /* 
CR */ + case 0x85: /* NEL */ + case 0x2028: /* LINE SEPARATOR */ + case 0x2029: /* PARAGRAPH SEPARATOR */ + RRETURN(MATCH_NOMATCH); + } + ecode++; + break; + + case OP_VSPACE: + if (eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + GETCHARINCTEST(c, eptr); + switch (c) { + default: + RRETURN(MATCH_NOMATCH); + case 0x0a: /* LF */ + case 0x0b: /* VT */ + case 0x0c: /* FF */ + case 0x0d: /* CR */ + case 0x85: /* NEL */ + case 0x2028: /* LINE SEPARATOR */ + case 0x2029: /* PARAGRAPH SEPARATOR */ + break; + } + ecode++; + break; + +#ifdef SUPPORT_UCP + /* Check the next character by Unicode property. We will get + here only if the support is in the binary; otherwise a + compile-time error occurs. */ + + case OP_PROP: + case OP_NOTPROP: + if (eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + GETCHARINCTEST(c, eptr); + { + int chartype, script; + int category = _pcre_ucp_findprop(c, &chartype, &script); + + switch (ecode[1]) { + case PT_ANY: + if (op == OP_NOTPROP) + RRETURN(MATCH_NOMATCH); + break; + + case PT_LAMP: + if ((chartype == ucp_Lu || chartype == ucp_Ll || + chartype == ucp_Lt) == (op == OP_NOTPROP)) + RRETURN(MATCH_NOMATCH); + break; + + case PT_GC: + if ((ecode[2] != category) == (op == OP_PROP)) + RRETURN(MATCH_NOMATCH); + break; + + case PT_PC: + if ((ecode[2] != chartype) == (op == OP_PROP)) + RRETURN(MATCH_NOMATCH); + break; + + case PT_SC: + if ((ecode[2] != script) == (op == OP_PROP)) + RRETURN(MATCH_NOMATCH); + break; + + default: + RRETURN(PCRE_ERROR_INTERNAL); + } + + ecode += 3; + } + break; + + /* Match an extended Unicode sequence. We will get here only if + the support is in the binary; otherwise a compile-time error + occurs. 
*/ + + case OP_EXTUNI: + if (eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + GETCHARINCTEST(c, eptr); + { + int chartype, script; + int category = _pcre_ucp_findprop(c, &chartype, &script); + if (category == ucp_M) + RRETURN(MATCH_NOMATCH); + while (eptr < md->end_subject) { + int len = 1; + if (!utf8) + c = *eptr; + else { + GETCHARLEN(c, eptr, len); + } + category = _pcre_ucp_findprop(c, &chartype, &script); + if (category != ucp_M) + break; + eptr += len; + } + } + ecode++; + break; +#endif + + /* Match a back reference, possibly repeatedly. Look past the + end of the item to see if there is repeat information following. + The code is similar to that for character classes, but repeated + for efficiency. Then obey similar code to character type repeats + - written out again for speed. However, if the referenced string + is the empty string, always treat it as matched, any number of + times (otherwise there could be infinite loops). */ + + case OP_REF: { + offset = GET2(ecode, 1) << 1; /* Doubled ref number */ + ecode += 3; /* Advance past item */ + + /* If the reference is unset, set the length to be longer than + the amount of subject left; this ensures that every attempt at a + match fails. We can't just fail here, because of the possibility + of quantifiers with zero minima. */ + + length = (offset >= offset_top || md->offset_vector[offset] < 0) + ? 
md->end_subject - eptr + 1 + : md->offset_vector[offset + 1] - + md->offset_vector[offset]; + + /* Set up for repetition, or handle the non-repeated case */ + + switch (*ecode) { + case OP_CRSTAR: + case OP_CRMINSTAR: + case OP_CRPLUS: + case OP_CRMINPLUS: + case OP_CRQUERY: + case OP_CRMINQUERY: + c = *ecode++ - OP_CRSTAR; + minimize = (c & 1) != 0; + min = rep_min[c]; /* Pick up values from tables; */ + max = rep_max[c]; /* zero for max => infinity */ + if (max == 0) + max = INT_MAX; + break; + + case OP_CRRANGE: + case OP_CRMINRANGE: + minimize = (*ecode == OP_CRMINRANGE); + min = GET2(ecode, 1); + max = GET2(ecode, 3); + if (max == 0) + max = INT_MAX; + ecode += 5; + break; + + default: /* No repeat follows */ + if (!match_ref(offset, eptr, length, md, ims)) + RRETURN(MATCH_NOMATCH); + eptr += length; + continue; /* With the main loop */ + } + + /* If the length of the reference is zero, just continue with + the main loop. */ + + if (length == 0) + continue; + + /* First, ensure the minimum number of matches are present. We + get back the length of the reference string explicitly rather + than passing the address of eptr, so that eptr can be a register + variable. */ + + for (i = 1; i <= min; i++) { + if (!match_ref(offset, eptr, length, md, ims)) + RRETURN(MATCH_NOMATCH); + eptr += length; + } + + /* If min = max, continue at the same level without recursion. + They are not both allowed to be zero. 
*/ + + if (min == max) + continue; + + /* If minimizing, keep trying and advancing the pointer */ + + if (minimize) { + for (fi = min;; fi++) { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, + RM14); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + if (fi >= max || + !match_ref(offset, eptr, length, md, ims)) + RRETURN(MATCH_NOMATCH); + eptr += length; + } + /* Control never gets here */ + } + + /* If maximizing, find the longest string and work backwards */ + + else { + pp = eptr; + for (i = min; i < max; i++) { + if (!match_ref(offset, eptr, length, md, ims)) + break; + eptr += length; + } + while (eptr >= pp) { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, + RM15); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + eptr -= length; + } + RRETURN(MATCH_NOMATCH); + } + } + /* Control never gets here */ + + /* Match a bit-mapped character class, possibly repeatedly. This + op code is used when all the characters in the class have values + in the range 0-255, and either the matching is caseful, or the + characters are in the range 0-127 when UTF-8 processing is + enabled. The only difference between OP_CLASS and OP_NCLASS + occurs when a data character outside the range is encountered. + + First, look past the end of the item to see if there is repeat + information following. Then obey similar code to character type + repeats - written out again for speed. 
*/ + + case OP_NCLASS: + case OP_CLASS: { + data = ecode + 1; /* Save for matching */ + ecode += 33; /* Advance past the item */ + + switch (*ecode) { + case OP_CRSTAR: + case OP_CRMINSTAR: + case OP_CRPLUS: + case OP_CRMINPLUS: + case OP_CRQUERY: + case OP_CRMINQUERY: + c = *ecode++ - OP_CRSTAR; + minimize = (c & 1) != 0; + min = rep_min[c]; /* Pick up values from tables; */ + max = rep_max[c]; /* zero for max => infinity */ + if (max == 0) + max = INT_MAX; + break; + + case OP_CRRANGE: + case OP_CRMINRANGE: + minimize = (*ecode == OP_CRMINRANGE); + min = GET2(ecode, 1); + max = GET2(ecode, 3); + if (max == 0) + max = INT_MAX; + ecode += 5; + break; + + default: /* No repeat follows */ + min = max = 1; + break; + } + + /* First, ensure the minimum number of matches are present. + */ + +#ifdef SUPPORT_UTF8 + /* UTF-8 mode */ + if (utf8) { + for (i = 1; i <= min; i++) { + if (eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + GETCHARINC(c, eptr); + if (c > 255) { + if (op == OP_CLASS) + RRETURN(MATCH_NOMATCH); + } else { + if ((data[c / 8] & (1 << (c & 7))) == 0) + RRETURN(MATCH_NOMATCH); + } + } + } else +#endif + /* Not UTF-8 mode */ + { + for (i = 1; i <= min; i++) { + if (eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + c = *eptr++; + if ((data[c / 8] & (1 << (c & 7))) == 0) + RRETURN(MATCH_NOMATCH); + } + } + + /* If max == min we can continue with the main loop without the + need to recurse. */ + + if (min == max) + continue; + + /* If minimizing, keep testing the rest of the expression and + advancing the pointer while it matches the class. 
*/ + + if (minimize) { +#ifdef SUPPORT_UTF8 + /* UTF-8 mode */ + if (utf8) { + for (fi = min;; fi++) { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, + RM16); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + GETCHARINC(c, eptr); + if (c > 255) { + if (op == OP_CLASS) + RRETURN(MATCH_NOMATCH); + } else { + if ((data[c / 8] & (1 << (c & 7))) == 0) + RRETURN(MATCH_NOMATCH); + } + } + } else +#endif + /* Not UTF-8 mode */ + { + for (fi = min;; fi++) { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, + RM17); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + c = *eptr++; + if ((data[c / 8] & (1 << (c & 7))) == 0) + RRETURN(MATCH_NOMATCH); + } + } + /* Control never gets here */ + } + + /* If maximizing, find the longest possible run, then work + backwards. */ + + else { + pp = eptr; + +#ifdef SUPPORT_UTF8 + /* UTF-8 mode */ + if (utf8) { + for (i = min; i < max; i++) { + int len = 1; + if (eptr >= md->end_subject) + break; + GETCHARLEN(c, eptr, len); + if (c > 255) { + if (op == OP_CLASS) + break; + } else { + if ((data[c / 8] & (1 << (c & 7))) == 0) + break; + } + eptr += len; + } + for (;;) { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, + RM18); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + if (eptr-- == pp) + break; /* Stop if tried at original pos */ + BACKCHAR(eptr); + } + } else +#endif + /* Not UTF-8 mode */ + { + for (i = min; i < max; i++) { + if (eptr >= md->end_subject) + break; + c = *eptr; + if ((data[c / 8] & (1 << (c & 7))) == 0) + break; + eptr++; + } + while (eptr >= pp) { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, + RM19); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + eptr--; + } + } + + RRETURN(MATCH_NOMATCH); + } + } + /* Control never gets here */ + + /* Match an extended character class. This opcode is encountered + only in UTF-8 mode, because that's the only time it is compiled. 
*/ + +#ifdef SUPPORT_UTF8 + case OP_XCLASS: { + data = ecode + 1 + LINK_SIZE; /* Save for matching */ + ecode += GET(ecode, 1); /* Advance past the item */ + + switch (*ecode) { + case OP_CRSTAR: + case OP_CRMINSTAR: + case OP_CRPLUS: + case OP_CRMINPLUS: + case OP_CRQUERY: + case OP_CRMINQUERY: + c = *ecode++ - OP_CRSTAR; + minimize = (c & 1) != 0; + min = rep_min[c]; /* Pick up values from tables; */ + max = rep_max[c]; /* zero for max => infinity */ + if (max == 0) + max = INT_MAX; + break; + + case OP_CRRANGE: + case OP_CRMINRANGE: + minimize = (*ecode == OP_CRMINRANGE); + min = GET2(ecode, 1); + max = GET2(ecode, 3); + if (max == 0) + max = INT_MAX; + ecode += 5; + break; + + default: /* No repeat follows */ + min = max = 1; + break; + } + + /* First, ensure the minimum number of matches are present. */ + + for (i = 1; i <= min; i++) { + if (eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + GETCHARINC(c, eptr); + if (!_pcre_xclass(c, data)) + RRETURN(MATCH_NOMATCH); + } + + /* If max == min we can continue with the main loop without the + need to recurse. */ + + if (min == max) + continue; + + /* If minimizing, keep testing the rest of the expression and + advancing the pointer while it matches the class. */ + + if (minimize) { + for (fi = min;; fi++) { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, + RM20); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + GETCHARINC(c, eptr); + if (!_pcre_xclass(c, data)) + RRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + } + + /* If maximizing, find the longest possible run, then work + backwards. 
*/ + + else { + pp = eptr; + for (i = min; i < max; i++) { + int len = 1; + if (eptr >= md->end_subject) + break; + GETCHARLEN(c, eptr, len); + if (!_pcre_xclass(c, data)) + break; + eptr += len; + } + for (;;) { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, + RM21); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + if (eptr-- == pp) + break; /* Stop if tried at original pos */ + if (utf8) + BACKCHAR(eptr); + } + RRETURN(MATCH_NOMATCH); + } + + /* Control never gets here */ + } +#endif /* End of XCLASS */ + + /* Match a single character, casefully */ + + case OP_CHAR: +#ifdef SUPPORT_UTF8 + if (utf8) { + length = 1; + ecode++; + GETCHARLEN(fc, ecode, length); + if (length > md->end_subject - eptr) + RRETURN(MATCH_NOMATCH); + while (length-- > 0) + if (*ecode++ != *eptr++) + RRETURN(MATCH_NOMATCH); + } else +#endif + + /* Non-UTF-8 mode */ + { + if (md->end_subject - eptr < 1) + RRETURN(MATCH_NOMATCH); + if (ecode[1] != *eptr++) + RRETURN(MATCH_NOMATCH); + ecode += 2; + } + break; + + /* Match a single character, caselessly */ + + case OP_CHARNC: +#ifdef SUPPORT_UTF8 + if (utf8) { + length = 1; + ecode++; + GETCHARLEN(fc, ecode, length); + + if (length > md->end_subject - eptr) + RRETURN(MATCH_NOMATCH); + + /* If the pattern character's value is < 128, we have only + one byte, and can use the fast lookup table. */ + + if (fc < 128) { + if (md->lcc[*ecode++] != md->lcc[*eptr++]) + RRETURN(MATCH_NOMATCH); + } + + /* Otherwise we must pick up the subject character */ + + else { + unsigned int dc; + GETCHARINC(dc, eptr); + ecode += length; + + /* If we have Unicode property support, we can use it to + test the other case of the character, if there is one. 
+ */ + + if (fc != dc) { +#ifdef SUPPORT_UCP + if (dc != _pcre_ucp_othercase(fc)) +#endif + RRETURN(MATCH_NOMATCH); + } + } + } else +#endif /* SUPPORT_UTF8 */ + + /* Non-UTF-8 mode */ + { + if (md->end_subject - eptr < 1) + RRETURN(MATCH_NOMATCH); + if (md->lcc[ecode[1]] != md->lcc[*eptr++]) + RRETURN(MATCH_NOMATCH); + ecode += 2; + } + break; + + /* Match a single character repeatedly. */ + + case OP_EXACT: + min = max = GET2(ecode, 1); + ecode += 3; + goto REPEATCHAR; + + case OP_POSUPTO: + possessive = TRUE; + /* Fall through */ + + case OP_UPTO: + case OP_MINUPTO: + min = 0; + max = GET2(ecode, 1); + minimize = *ecode == OP_MINUPTO; + ecode += 3; + goto REPEATCHAR; + + case OP_POSSTAR: + possessive = TRUE; + min = 0; + max = INT_MAX; + ecode++; + goto REPEATCHAR; + + case OP_POSPLUS: + possessive = TRUE; + min = 1; + max = INT_MAX; + ecode++; + goto REPEATCHAR; + + case OP_POSQUERY: + possessive = TRUE; + min = 0; + max = 1; + ecode++; + goto REPEATCHAR; + + case OP_STAR: + case OP_MINSTAR: + case OP_PLUS: + case OP_MINPLUS: + case OP_QUERY: + case OP_MINQUERY: + c = *ecode++ - OP_STAR; + minimize = (c & 1) != 0; + min = rep_min[c]; /* Pick up values from tables; */ + max = rep_max[c]; /* zero for max => infinity */ + if (max == 0) + max = INT_MAX; + + /* Common code for all repeated single-character matches. We can + give up quickly if there are fewer than the minimum number of + characters left in the subject. */ + + REPEATCHAR: +#ifdef SUPPORT_UTF8 + if (utf8) { + length = 1; + charptr = ecode; + GETCHARLEN(fc, ecode, length); + if (min * length > md->end_subject - eptr) + RRETURN(MATCH_NOMATCH); + ecode += length; + + /* Handle multibyte character matching specially here. There + is support for caseless matching if UCP support is present. 
+ */ + + if (length > 1) { +#ifdef SUPPORT_UCP + unsigned int othercase; + if ((ims & PCRE_CASELESS) != 0 && + (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR) + oclength = _pcre_ord2utf8(othercase, occhars); + else + oclength = 0; +#endif /* SUPPORT_UCP */ + + for (i = 1; i <= min; i++) { + if (memcmp(eptr, charptr, length) == 0) + eptr += length; +#ifdef SUPPORT_UCP + /* Need braces because of following else */ + else if (oclength == 0) { + RRETURN(MATCH_NOMATCH); + } else { + if (memcmp(eptr, occhars, oclength) != 0) + RRETURN(MATCH_NOMATCH); + eptr += oclength; + } +#else /* without SUPPORT_UCP */ + else { + RRETURN(MATCH_NOMATCH); + } +#endif /* SUPPORT_UCP */ + } + + if (min == max) + continue; + + if (minimize) { + for (fi = min;; fi++) { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, + 0, RM22); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + if (memcmp(eptr, charptr, length) == 0) + eptr += length; +#ifdef SUPPORT_UCP + /* Need braces because of following else */ + else if (oclength == 0) { + RRETURN(MATCH_NOMATCH); + } else { + if (memcmp(eptr, occhars, oclength) != 0) + RRETURN(MATCH_NOMATCH); + eptr += oclength; + } +#else /* without SUPPORT_UCP */ + else { + RRETURN(MATCH_NOMATCH); + } +#endif /* SUPPORT_UCP */ + } + /* Control never gets here */ + } + + else /* Maximize */ + { + pp = eptr; + for (i = min; i < max; i++) { + if (eptr > md->end_subject - length) + break; + if (memcmp(eptr, charptr, length) == 0) + eptr += length; +#ifdef SUPPORT_UCP + else if (oclength == 0) + break; + else { + if (memcmp(eptr, occhars, oclength) != 0) + break; + eptr += oclength; + } +#else /* without SUPPORT_UCP */ + else + break; +#endif /* SUPPORT_UCP */ + } + + if (possessive) + continue; + for (;;) { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, + 0, RM23); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + if (eptr == pp) + RRETURN(MATCH_NOMATCH); +#ifdef SUPPORT_UCP + eptr--; + 
BACKCHAR(eptr); +#else /* without SUPPORT_UCP */ + eptr -= length; +#endif /* SUPPORT_UCP */ + } + } + /* Control never gets here */ + } + + /* If the length of a UTF-8 character is 1, we fall through + here, and obey the code as for non-UTF-8 characters below, + though in this case the value of fc will always be < 128. */ + } else +#endif /* SUPPORT_UTF8 */ + + /* When not in UTF-8 mode, load a single-byte character. */ + { + if (min > md->end_subject - eptr) + RRETURN(MATCH_NOMATCH); + fc = *ecode++; + } + + /* The value of fc at this point is always less than 256, though + we may or may not be in UTF-8 mode. The code is duplicated for + the caseless and caseful cases, for speed, since matching + characters is likely to be quite common. First, ensure the + minimum number of matches are present. If min = max, continue at + the same level without recursing. Otherwise, if minimizing, keep + trying the rest of the expression and advancing one matching + character if failing, up to the maximum. Alternatively, if + maximizing, find the maximum number of characters and work + backwards. 
*/ + + DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, + max, max, eptr)); + + if ((ims & PCRE_CASELESS) != 0) { + fc = md->lcc[fc]; + for (i = 1; i <= min; i++) + if (fc != md->lcc[*eptr++]) + RRETURN(MATCH_NOMATCH); + if (min == max) + continue; + if (minimize) { + for (fi = min;; fi++) { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, + RM24); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject || + fc != md->lcc[*eptr++]) + RRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + } else /* Maximize */ + { + pp = eptr; + for (i = min; i < max; i++) { + if (eptr >= md->end_subject || fc != md->lcc[*eptr]) + break; + eptr++; + } + if (possessive) + continue; + while (eptr >= pp) { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, + RM25); + eptr--; + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + } + RRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + } + + /* Caseful comparisons (includes all multi-byte characters) */ + + else { + for (i = 1; i <= min; i++) + if (fc != *eptr++) + RRETURN(MATCH_NOMATCH); + if (min == max) + continue; + if (minimize) { + for (fi = min;; fi++) { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, + RM26); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject || + fc != *eptr++) + RRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + } else /* Maximize */ + { + pp = eptr; + for (i = min; i < max; i++) { + if (eptr >= md->end_subject || fc != *eptr) + break; + eptr++; + } + if (possessive) + continue; + while (eptr >= pp) { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, + RM27); + eptr--; + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + } + RRETURN(MATCH_NOMATCH); + } + } + /* Control never gets here */ + + /* Match a negated single one-byte character. The character we + are checking can be multibyte. 
*/ + + case OP_NOT: + if (eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + ecode++; + GETCHARINCTEST(c, eptr); + if ((ims & PCRE_CASELESS) != 0) { +#ifdef SUPPORT_UTF8 + if (c < 256) +#endif + c = md->lcc[c]; + if (md->lcc[*ecode++] == c) + RRETURN(MATCH_NOMATCH); + } else { + if (*ecode++ == c) + RRETURN(MATCH_NOMATCH); + } + break; + + /* Match a negated single one-byte character repeatedly. This is + almost a repeat of the code for a repeated single character, but + I haven't found a nice way of commoning these up that doesn't + require a test of the positive/negative option for each + character match. Maybe that wouldn't add very much to the time + taken, but character matching *is* what this is all about... */ + + case OP_NOTEXACT: + min = max = GET2(ecode, 1); + ecode += 3; + goto REPEATNOTCHAR; + + case OP_NOTUPTO: + case OP_NOTMINUPTO: + min = 0; + max = GET2(ecode, 1); + minimize = *ecode == OP_NOTMINUPTO; + ecode += 3; + goto REPEATNOTCHAR; + + case OP_NOTPOSSTAR: + possessive = TRUE; + min = 0; + max = INT_MAX; + ecode++; + goto REPEATNOTCHAR; + + case OP_NOTPOSPLUS: + possessive = TRUE; + min = 1; + max = INT_MAX; + ecode++; + goto REPEATNOTCHAR; + + case OP_NOTPOSQUERY: + possessive = TRUE; + min = 0; + max = 1; + ecode++; + goto REPEATNOTCHAR; + + case OP_NOTPOSUPTO: + possessive = TRUE; + min = 0; + max = GET2(ecode, 1); + ecode += 3; + goto REPEATNOTCHAR; + + case OP_NOTSTAR: + case OP_NOTMINSTAR: + case OP_NOTPLUS: + case OP_NOTMINPLUS: + case OP_NOTQUERY: + case OP_NOTMINQUERY: + c = *ecode++ - OP_NOTSTAR; + minimize = (c & 1) != 0; + min = rep_min[c]; /* Pick up values from tables; */ + max = rep_max[c]; /* zero for max => infinity */ + if (max == 0) + max = INT_MAX; + + /* Common code for all repeated single-byte matches. We can give + up quickly if there are fewer than the minimum number of bytes + left in the subject. 
*/ + + REPEATNOTCHAR: + if (min > md->end_subject - eptr) + RRETURN(MATCH_NOMATCH); + fc = *ecode++; + + /* The code is duplicated for the caseless and caseful cases, + for speed, since matching characters is likely to be quite + common. First, ensure the minimum number of matches are present. + If min = max, continue at the same level without recursing. + Otherwise, if minimizing, keep trying the rest of the expression + and advancing one matching character if failing, up to the + maximum. Alternatively, if maximizing, find the maximum number + of characters and work backwards. */ + + DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", + fc, min, max, max, eptr)); + + if ((ims & PCRE_CASELESS) != 0) { + fc = md->lcc[fc]; + +#ifdef SUPPORT_UTF8 + /* UTF-8 mode */ + if (utf8) { + register unsigned int d; + for (i = 1; i <= min; i++) { + GETCHARINC(d, eptr); + if (d < 256) + d = md->lcc[d]; + if (fc == d) + RRETURN(MATCH_NOMATCH); + } + } else +#endif + + /* Not UTF-8 mode */ + { + for (i = 1; i <= min; i++) + if (fc == md->lcc[*eptr++]) + RRETURN(MATCH_NOMATCH); + } + + if (min == max) + continue; + + if (minimize) { +#ifdef SUPPORT_UTF8 + /* UTF-8 mode */ + if (utf8) { + register unsigned int d; + for (fi = min;; fi++) { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, + 0, RM28); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + GETCHARINC(d, eptr); + if (d < 256) + d = md->lcc[d]; + if (fi >= max || eptr >= md->end_subject || + fc == d) + RRETURN(MATCH_NOMATCH); + } + } else +#endif + /* Not UTF-8 mode */ + { + for (fi = min;; fi++) { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, + 0, RM29); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject || + fc == md->lcc[*eptr++]) + RRETURN(MATCH_NOMATCH); + } + } + /* Control never gets here */ + } + + /* Maximize case */ + + else { + pp = eptr; + +#ifdef SUPPORT_UTF8 + /* UTF-8 mode */ + if (utf8) { + register unsigned int d; + for (i = min; i < max; i++) { + int len = 1; + 
if (eptr >= md->end_subject) + break; + GETCHARLEN(d, eptr, len); + if (d < 256) + d = md->lcc[d]; + if (fc == d) + break; + eptr += len; + } + if (possessive) + continue; + for (;;) { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, + 0, RM30); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + if (eptr-- == pp) + break; /* Stop if tried at original pos */ + BACKCHAR(eptr); + } + } else +#endif + /* Not UTF-8 mode */ + { + for (i = min; i < max; i++) { + if (eptr >= md->end_subject || + fc == md->lcc[*eptr]) + break; + eptr++; + } + if (possessive) + continue; + while (eptr >= pp) { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, + 0, RM31); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + eptr--; + } + } + + RRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + } + + /* Caseful comparisons */ + + else { +#ifdef SUPPORT_UTF8 + /* UTF-8 mode */ + if (utf8) { + register unsigned int d; + for (i = 1; i <= min; i++) { + GETCHARINC(d, eptr); + if (fc == d) + RRETURN(MATCH_NOMATCH); + } + } else +#endif + /* Not UTF-8 mode */ + { + for (i = 1; i <= min; i++) + if (fc == *eptr++) + RRETURN(MATCH_NOMATCH); + } + + if (min == max) + continue; + + if (minimize) { +#ifdef SUPPORT_UTF8 + /* UTF-8 mode */ + if (utf8) { + register unsigned int d; + for (fi = min;; fi++) { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, + 0, RM32); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + GETCHARINC(d, eptr); + if (fi >= max || eptr >= md->end_subject || + fc == d) + RRETURN(MATCH_NOMATCH); + } + } else +#endif + /* Not UTF-8 mode */ + { + for (fi = min;; fi++) { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, + 0, RM33); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject || + fc == *eptr++) + RRETURN(MATCH_NOMATCH); + } + } + /* Control never gets here */ + } + + /* Maximize case */ + + else { + pp = eptr; + +#ifdef SUPPORT_UTF8 + /* UTF-8 mode */ + if (utf8) { + register unsigned int d; + for (i = min; i < max; i++) { + int len = 1; + if (eptr >= 
md->end_subject) + break; + GETCHARLEN(d, eptr, len); + if (fc == d) + break; + eptr += len; + } + if (possessive) + continue; + for (;;) { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, + 0, RM34); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + if (eptr-- == pp) + break; /* Stop if tried at original pos */ + BACKCHAR(eptr); + } + } else +#endif + /* Not UTF-8 mode */ + { + for (i = min; i < max; i++) { + if (eptr >= md->end_subject || fc == *eptr) + break; + eptr++; + } + if (possessive) + continue; + while (eptr >= pp) { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, + 0, RM35); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + eptr--; + } + } + + RRETURN(MATCH_NOMATCH); + } + } + /* Control never gets here */ + + /* Match a single character type repeatedly; several different + opcodes share code. This is very similar to the code for single + characters, but we repeat it in the interests of efficiency. */ + + case OP_TYPEEXACT: + min = max = GET2(ecode, 1); + minimize = TRUE; + ecode += 3; + goto REPEATTYPE; + + case OP_TYPEUPTO: + case OP_TYPEMINUPTO: + min = 0; + max = GET2(ecode, 1); + minimize = *ecode == OP_TYPEMINUPTO; + ecode += 3; + goto REPEATTYPE; + + case OP_TYPEPOSSTAR: + possessive = TRUE; + min = 0; + max = INT_MAX; + ecode++; + goto REPEATTYPE; + + case OP_TYPEPOSPLUS: + possessive = TRUE; + min = 1; + max = INT_MAX; + ecode++; + goto REPEATTYPE; + + case OP_TYPEPOSQUERY: + possessive = TRUE; + min = 0; + max = 1; + ecode++; + goto REPEATTYPE; + + case OP_TYPEPOSUPTO: + possessive = TRUE; + min = 0; + max = GET2(ecode, 1); + ecode += 3; + goto REPEATTYPE; + + case OP_TYPESTAR: + case OP_TYPEMINSTAR: + case OP_TYPEPLUS: + case OP_TYPEMINPLUS: + case OP_TYPEQUERY: + case OP_TYPEMINQUERY: + c = *ecode++ - OP_TYPESTAR; + minimize = (c & 1) != 0; + min = rep_min[c]; /* Pick up values from tables; */ + max = rep_max[c]; /* zero for max => infinity */ + if (max == 0) + max = INT_MAX; + + /* Common code for all repeated single character type matches. 
+ Note that in UTF-8 mode, '.' matches a character of any length, + but for the other character types, the valid characters are all + one-byte long. */ + + REPEATTYPE: + ctype = *ecode++; /* Code for the character type */ + +#ifdef SUPPORT_UCP + if (ctype == OP_PROP || ctype == OP_NOTPROP) { + prop_fail_result = ctype == OP_NOTPROP; + prop_type = *ecode++; + prop_value = *ecode++; + } else + prop_type = -1; +#endif + + /* First, ensure the minimum number of matches are present. Use + inline code for maximizing the speed, and do the type test once + at the start (i.e. keep it out of the loop). Also we can test + that there are at least the minimum number of bytes before we + start. This isn't as effective in UTF-8 mode, but it does no + harm. Separate the UTF-8 code completely as that is tidier. Also + separate the UCP code, which can be the same for both UTF-8 and + single-bytes. */ + + if (min > md->end_subject - eptr) + RRETURN(MATCH_NOMATCH); + if (min > 0) { +#ifdef SUPPORT_UCP + if (prop_type >= 0) { + switch (prop_type) { + case PT_ANY: + if (prop_fail_result) + RRETURN(MATCH_NOMATCH); + for (i = 1; i <= min; i++) { + if (eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + GETCHARINCTEST(c, eptr); + } + break; + + case PT_LAMP: + for (i = 1; i <= min; i++) { + if (eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + GETCHARINCTEST(c, eptr); + prop_category = _pcre_ucp_findprop( + c, &prop_chartype, &prop_script); + if ((prop_chartype == ucp_Lu || + prop_chartype == ucp_Ll || + prop_chartype == ucp_Lt) == + prop_fail_result) + RRETURN(MATCH_NOMATCH); + } + break; + + case PT_GC: + for (i = 1; i <= min; i++) { + if (eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + GETCHARINCTEST(c, eptr); + prop_category = _pcre_ucp_findprop( + c, &prop_chartype, &prop_script); + if ((prop_category == prop_value) == + prop_fail_result) + RRETURN(MATCH_NOMATCH); + } + break; + + case PT_PC: + for (i = 1; i <= min; i++) { + if (eptr >= md->end_subject) + 
RRETURN(MATCH_NOMATCH); + GETCHARINCTEST(c, eptr); + prop_category = _pcre_ucp_findprop( + c, &prop_chartype, &prop_script); + if ((prop_chartype == prop_value) == + prop_fail_result) + RRETURN(MATCH_NOMATCH); + } + break; + + case PT_SC: + for (i = 1; i <= min; i++) { + if (eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + GETCHARINCTEST(c, eptr); + prop_category = _pcre_ucp_findprop( + c, &prop_chartype, &prop_script); + if ((prop_script == prop_value) == + prop_fail_result) + RRETURN(MATCH_NOMATCH); + } + break; + + default: + RRETURN(PCRE_ERROR_INTERNAL); + } + } + + /* Match extended Unicode sequences. We will get here only + if the support is in the binary; otherwise a compile-time + error occurs. */ + + else if (ctype == OP_EXTUNI) { + for (i = 1; i <= min; i++) { + GETCHARINCTEST(c, eptr); + prop_category = _pcre_ucp_findprop( + c, &prop_chartype, &prop_script); + if (prop_category == ucp_M) + RRETURN(MATCH_NOMATCH); + while (eptr < md->end_subject) { + int len = 1; + if (!utf8) + c = *eptr; + else { + GETCHARLEN(c, eptr, len); + } + prop_category = _pcre_ucp_findprop( + c, &prop_chartype, &prop_script); + if (prop_category != ucp_M) + break; + eptr += len; + } + } + } + + else +#endif /* SUPPORT_UCP */ + + /* Handle all other cases when the coding is UTF-8 */ + +#ifdef SUPPORT_UTF8 + if (utf8) + switch (ctype) { + case OP_ANY: + for (i = 1; i <= min; i++) { + if (eptr >= md->end_subject || + ((ims & PCRE_DOTALL) == 0 && + IS_NEWLINE(eptr))) + RRETURN(MATCH_NOMATCH); + eptr++; + while (eptr < md->end_subject && + (*eptr & 0xc0) == 0x80) + eptr++; + } + break; + + case OP_ANYBYTE: + eptr += min; + break; + + case OP_ANYNL: + for (i = 1; i <= min; i++) { + if (eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + GETCHARINC(c, eptr); + switch (c) { + default: + RRETURN(MATCH_NOMATCH); + case 0x000d: + if (eptr < md->end_subject && + *eptr == 0x0a) + eptr++; + break; + + case 0x000a: + break; + + case 0x000b: + case 0x000c: + case 0x0085: + case 0x2028: + 
case 0x2029: + if (md->bsr_anycrlf) + RRETURN(MATCH_NOMATCH); + break; + } + } + break; + + case OP_NOT_HSPACE: + for (i = 1; i <= min; i++) { + if (eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + GETCHARINC(c, eptr); + switch (c) { + default: + break; + case 0x09: /* HT */ + case 0x20: /* SPACE */ + case 0xa0: /* NBSP */ + case 0x1680: /* OGHAM SPACE MARK */ + case 0x180e: /* MONGOLIAN VOWEL + SEPARATOR */ + case 0x2000: /* EN QUAD */ + case 0x2001: /* EM QUAD */ + case 0x2002: /* EN SPACE */ + case 0x2003: /* EM SPACE */ + case 0x2004: /* THREE-PER-EM SPACE */ + case 0x2005: /* FOUR-PER-EM SPACE */ + case 0x2006: /* SIX-PER-EM SPACE */ + case 0x2007: /* FIGURE SPACE */ + case 0x2008: /* PUNCTUATION SPACE */ + case 0x2009: /* THIN SPACE */ + case 0x200A: /* HAIR SPACE */ + case 0x202f: /* NARROW NO-BREAK SPACE */ + case 0x205f: /* MEDIUM MATHEMATICAL + SPACE */ + case 0x3000: /* IDEOGRAPHIC SPACE */ + RRETURN(MATCH_NOMATCH); + } + } + break; + + case OP_HSPACE: + for (i = 1; i <= min; i++) { + if (eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + GETCHARINC(c, eptr); + switch (c) { + default: + RRETURN(MATCH_NOMATCH); + case 0x09: /* HT */ + case 0x20: /* SPACE */ + case 0xa0: /* NBSP */ + case 0x1680: /* OGHAM SPACE MARK */ + case 0x180e: /* MONGOLIAN VOWEL + SEPARATOR */ + case 0x2000: /* EN QUAD */ + case 0x2001: /* EM QUAD */ + case 0x2002: /* EN SPACE */ + case 0x2003: /* EM SPACE */ + case 0x2004: /* THREE-PER-EM SPACE */ + case 0x2005: /* FOUR-PER-EM SPACE */ + case 0x2006: /* SIX-PER-EM SPACE */ + case 0x2007: /* FIGURE SPACE */ + case 0x2008: /* PUNCTUATION SPACE */ + case 0x2009: /* THIN SPACE */ + case 0x200A: /* HAIR SPACE */ + case 0x202f: /* NARROW NO-BREAK SPACE */ + case 0x205f: /* MEDIUM MATHEMATICAL + SPACE */ + case 0x3000: /* IDEOGRAPHIC SPACE */ + break; + } + } + break; + + case OP_NOT_VSPACE: + for (i = 1; i <= min; i++) { + if (eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + GETCHARINC(c, eptr); + switch (c) { + default: + 
break; + case 0x0a: /* LF */ + case 0x0b: /* VT */ + case 0x0c: /* FF */ + case 0x0d: /* CR */ + case 0x85: /* NEL */ + case 0x2028: /* LINE SEPARATOR */ + case 0x2029: /* PARAGRAPH SEPARATOR */ + RRETURN(MATCH_NOMATCH); + } + } + break; + + case OP_VSPACE: + for (i = 1; i <= min; i++) { + if (eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + GETCHARINC(c, eptr); + switch (c) { + default: + RRETURN(MATCH_NOMATCH); + case 0x0a: /* LF */ + case 0x0b: /* VT */ + case 0x0c: /* FF */ + case 0x0d: /* CR */ + case 0x85: /* NEL */ + case 0x2028: /* LINE SEPARATOR */ + case 0x2029: /* PARAGRAPH SEPARATOR */ + break; + } + } + break; + + case OP_NOT_DIGIT: + for (i = 1; i <= min; i++) { + if (eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + GETCHARINC(c, eptr); + if (c < 128 && + (md->ctypes[c] & ctype_digit) != 0) + RRETURN(MATCH_NOMATCH); + } + break; + + case OP_DIGIT: + for (i = 1; i <= min; i++) { + if (eptr >= md->end_subject || + *eptr >= 128 || + (md->ctypes[*eptr++] & ctype_digit) == + 0) + RRETURN(MATCH_NOMATCH); + /* No need to skip more bytes - we know it's + * a 1-byte character */ + } + break; + + case OP_NOT_WHITESPACE: + for (i = 1; i <= min; i++) { + if (eptr >= md->end_subject || + (*eptr < 128 && (md->ctypes[*eptr] & + ctype_space) != 0)) + RRETURN(MATCH_NOMATCH); + while (++eptr < md->end_subject && + (*eptr & 0xc0) == 0x80) + ; + } + break; + + case OP_WHITESPACE: + for (i = 1; i <= min; i++) { + if (eptr >= md->end_subject || + *eptr >= 128 || + (md->ctypes[*eptr++] & ctype_space) == + 0) + RRETURN(MATCH_NOMATCH); + /* No need to skip more bytes - we know it's + * a 1-byte character */ + } + break; + + case OP_NOT_WORDCHAR: + for (i = 1; i <= min; i++) { + if (eptr >= md->end_subject || + (*eptr < 128 && + (md->ctypes[*eptr] & ctype_word) != 0)) + RRETURN(MATCH_NOMATCH); + while (++eptr < md->end_subject && + (*eptr & 0xc0) == 0x80) + ; + } + break; + + case OP_WORDCHAR: + for (i = 1; i <= min; i++) { + if (eptr >= md->end_subject || + *eptr >= 
128 || + (md->ctypes[*eptr++] & ctype_word) == 0) + RRETURN(MATCH_NOMATCH); + /* No need to skip more bytes - we know it's + * a 1-byte character */ + } + break; + + default: + RRETURN(PCRE_ERROR_INTERNAL); + } /* End switch(ctype) */ + + else +#endif /* SUPPORT_UTF8 */ + + /* Code for the non-UTF-8 case for minimum matching of + operators other than OP_PROP and OP_NOTPROP. We can + assume that there are the minimum number of bytes + present, as this was tested above. */ + + switch (ctype) { + case OP_ANY: + if ((ims & PCRE_DOTALL) == 0) { + for (i = 1; i <= min; i++) { + if (IS_NEWLINE(eptr)) + RRETURN(MATCH_NOMATCH); + eptr++; + } + } else + eptr += min; + break; + + case OP_ANYBYTE: + eptr += min; + break; + + /* Because of the CRLF case, we can't assume the + minimum number of bytes are present in this + case. */ + + case OP_ANYNL: + for (i = 1; i <= min; i++) { + if (eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + switch (*eptr++) { + default: + RRETURN(MATCH_NOMATCH); + case 0x000d: + if (eptr < md->end_subject && + *eptr == 0x0a) + eptr++; + break; + case 0x000a: + break; + + case 0x000b: + case 0x000c: + case 0x0085: + if (md->bsr_anycrlf) + RRETURN(MATCH_NOMATCH); + break; + } + } + break; + + case OP_NOT_HSPACE: + for (i = 1; i <= min; i++) { + if (eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + switch (*eptr++) { + default: + break; + case 0x09: /* HT */ + case 0x20: /* SPACE */ + case 0xa0: /* NBSP */ + RRETURN(MATCH_NOMATCH); + } + } + break; + + case OP_HSPACE: + for (i = 1; i <= min; i++) { + if (eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + switch (*eptr++) { + default: + RRETURN(MATCH_NOMATCH); + case 0x09: /* HT */ + case 0x20: /* SPACE */ + case 0xa0: /* NBSP */ + break; + } + } + break; + + case OP_NOT_VSPACE: + for (i = 1; i <= min; i++) { + if (eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + switch (*eptr++) { + default: + break; + case 0x0a: /* LF */ + case 0x0b: /* VT */ + case 0x0c: /* FF */ + case 0x0d: /* CR */ + 
case 0x85: /* NEL */ + RRETURN(MATCH_NOMATCH); + } + } + break; + + case OP_VSPACE: + for (i = 1; i <= min; i++) { + if (eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + switch (*eptr++) { + default: + RRETURN(MATCH_NOMATCH); + case 0x0a: /* LF */ + case 0x0b: /* VT */ + case 0x0c: /* FF */ + case 0x0d: /* CR */ + case 0x85: /* NEL */ + break; + } + } + break; + + case OP_NOT_DIGIT: + for (i = 1; i <= min; i++) + if ((md->ctypes[*eptr++] & ctype_digit) != + 0) + RRETURN(MATCH_NOMATCH); + break; + + case OP_DIGIT: + for (i = 1; i <= min; i++) + if ((md->ctypes[*eptr++] & ctype_digit) == + 0) + RRETURN(MATCH_NOMATCH); + break; + + case OP_NOT_WHITESPACE: + for (i = 1; i <= min; i++) + if ((md->ctypes[*eptr++] & ctype_space) != + 0) + RRETURN(MATCH_NOMATCH); + break; + + case OP_WHITESPACE: + for (i = 1; i <= min; i++) + if ((md->ctypes[*eptr++] & ctype_space) == + 0) + RRETURN(MATCH_NOMATCH); + break; + + case OP_NOT_WORDCHAR: + for (i = 1; i <= min; i++) + if ((md->ctypes[*eptr++] & ctype_word) != 0) + RRETURN(MATCH_NOMATCH); + break; + + case OP_WORDCHAR: + for (i = 1; i <= min; i++) + if ((md->ctypes[*eptr++] & ctype_word) == 0) + RRETURN(MATCH_NOMATCH); + break; + + default: + RRETURN(PCRE_ERROR_INTERNAL); + } + } + + /* If min = max, continue at the same level without recursing */ + + if (min == max) + continue; + + /* If minimizing, we have to test the rest of the pattern before + each subsequent match. Again, separate the UTF-8 case for speed, + and also separate the UCP cases. 
*/ + + if (minimize) { +#ifdef SUPPORT_UCP + if (prop_type >= 0) { + switch (prop_type) { + case PT_ANY: + for (fi = min;; fi++) { + RMATCH(eptr, ecode, offset_top, md, ims, + eptrb, 0, RM36); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + GETCHARINC(c, eptr); + if (prop_fail_result) + RRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + + case PT_LAMP: + for (fi = min;; fi++) { + RMATCH(eptr, ecode, offset_top, md, ims, + eptrb, 0, RM37); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + GETCHARINC(c, eptr); + prop_category = _pcre_ucp_findprop( + c, &prop_chartype, &prop_script); + if ((prop_chartype == ucp_Lu || + prop_chartype == ucp_Ll || + prop_chartype == ucp_Lt) == + prop_fail_result) + RRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + + case PT_GC: + for (fi = min;; fi++) { + RMATCH(eptr, ecode, offset_top, md, ims, + eptrb, 0, RM38); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + GETCHARINC(c, eptr); + prop_category = _pcre_ucp_findprop( + c, &prop_chartype, &prop_script); + if ((prop_category == prop_value) == + prop_fail_result) + RRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + + case PT_PC: + for (fi = min;; fi++) { + RMATCH(eptr, ecode, offset_top, md, ims, + eptrb, 0, RM39); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + GETCHARINC(c, eptr); + prop_category = _pcre_ucp_findprop( + c, &prop_chartype, &prop_script); + if ((prop_chartype == prop_value) == + prop_fail_result) + RRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + + case PT_SC: + for (fi = min;; fi++) { + RMATCH(eptr, ecode, offset_top, md, ims, + eptrb, 0, RM40); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + 
GETCHARINC(c, eptr); + prop_category = _pcre_ucp_findprop( + c, &prop_chartype, &prop_script); + if ((prop_script == prop_value) == + prop_fail_result) + RRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + + default: + RRETURN(PCRE_ERROR_INTERNAL); + } + } + + /* Match extended Unicode sequences. We will get here only + if the support is in the binary; otherwise a compile-time + error occurs. */ + + else if (ctype == OP_EXTUNI) { + for (fi = min;; fi++) { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, + RM41); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject) + RRETURN(MATCH_NOMATCH); + GETCHARINCTEST(c, eptr); + prop_category = _pcre_ucp_findprop( + c, &prop_chartype, &prop_script); + if (prop_category == ucp_M) + RRETURN(MATCH_NOMATCH); + while (eptr < md->end_subject) { + int len = 1; + if (!utf8) + c = *eptr; + else { + GETCHARLEN(c, eptr, len); + } + prop_category = _pcre_ucp_findprop( + c, &prop_chartype, &prop_script); + if (prop_category != ucp_M) + break; + eptr += len; + } + } + } + + else +#endif /* SUPPORT_UCP */ + +#ifdef SUPPORT_UTF8 + /* UTF-8 mode */ + if (utf8) { + for (fi = min;; fi++) { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, + 0, RM42); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject || + (ctype == OP_ANY && + (ims & PCRE_DOTALL) == 0 && + IS_NEWLINE(eptr))) + RRETURN(MATCH_NOMATCH); + + GETCHARINC(c, eptr); + switch (ctype) { + case OP_ANY: /* This is the DOTALL case */ + break; + + case OP_ANYBYTE: + break; + + case OP_ANYNL: + switch (c) { + default: + RRETURN(MATCH_NOMATCH); + case 0x000d: + if (eptr < md->end_subject && + *eptr == 0x0a) + eptr++; + break; + case 0x000a: + break; + + case 0x000b: + case 0x000c: + case 0x0085: + case 0x2028: + case 0x2029: + if (md->bsr_anycrlf) + RRETURN(MATCH_NOMATCH); + break; + } + break; + + case OP_NOT_HSPACE: + switch (c) { + default: + break; + case 0x09: /* HT */ + case 0x20: /* SPACE */ + case 
0xa0: /* NBSP */ + case 0x1680: /* OGHAM SPACE MARK */ + case 0x180e: /* MONGOLIAN VOWEL + SEPARATOR */ + case 0x2000: /* EN QUAD */ + case 0x2001: /* EM QUAD */ + case 0x2002: /* EN SPACE */ + case 0x2003: /* EM SPACE */ + case 0x2004: /* THREE-PER-EM SPACE + */ + case 0x2005: /* FOUR-PER-EM SPACE */ + case 0x2006: /* SIX-PER-EM SPACE */ + case 0x2007: /* FIGURE SPACE */ + case 0x2008: /* PUNCTUATION SPACE */ + case 0x2009: /* THIN SPACE */ + case 0x200A: /* HAIR SPACE */ + case 0x202f: /* NARROW NO-BREAK + SPACE */ + case 0x205f: /* MEDIUM MATHEMATICAL + SPACE */ + case 0x3000: /* IDEOGRAPHIC SPACE */ + RRETURN(MATCH_NOMATCH); + } + break; + + case OP_HSPACE: + switch (c) { + default: + RRETURN(MATCH_NOMATCH); + case 0x09: /* HT */ + case 0x20: /* SPACE */ + case 0xa0: /* NBSP */ + case 0x1680: /* OGHAM SPACE MARK */ + case 0x180e: /* MONGOLIAN VOWEL + SEPARATOR */ + case 0x2000: /* EN QUAD */ + case 0x2001: /* EM QUAD */ + case 0x2002: /* EN SPACE */ + case 0x2003: /* EM SPACE */ + case 0x2004: /* THREE-PER-EM SPACE + */ + case 0x2005: /* FOUR-PER-EM SPACE */ + case 0x2006: /* SIX-PER-EM SPACE */ + case 0x2007: /* FIGURE SPACE */ + case 0x2008: /* PUNCTUATION SPACE */ + case 0x2009: /* THIN SPACE */ + case 0x200A: /* HAIR SPACE */ + case 0x202f: /* NARROW NO-BREAK + SPACE */ + case 0x205f: /* MEDIUM MATHEMATICAL + SPACE */ + case 0x3000: /* IDEOGRAPHIC SPACE */ + break; + } + break; + + case OP_NOT_VSPACE: + switch (c) { + default: + break; + case 0x0a: /* LF */ + case 0x0b: /* VT */ + case 0x0c: /* FF */ + case 0x0d: /* CR */ + case 0x85: /* NEL */ + case 0x2028: /* LINE SEPARATOR */ + case 0x2029: /* PARAGRAPH SEPARATOR + */ + RRETURN(MATCH_NOMATCH); + } + break; + + case OP_VSPACE: + switch (c) { + default: + RRETURN(MATCH_NOMATCH); + case 0x0a: /* LF */ + case 0x0b: /* VT */ + case 0x0c: /* FF */ + case 0x0d: /* CR */ + case 0x85: /* NEL */ + case 0x2028: /* LINE SEPARATOR */ + case 0x2029: /* PARAGRAPH SEPARATOR + */ + break; + } + break; + + case 
OP_NOT_DIGIT: + if (c < 256 && + (md->ctypes[c] & ctype_digit) != 0) + RRETURN(MATCH_NOMATCH); + break; + + case OP_DIGIT: + if (c >= 256 || + (md->ctypes[c] & ctype_digit) == 0) + RRETURN(MATCH_NOMATCH); + break; + + case OP_NOT_WHITESPACE: + if (c < 256 && + (md->ctypes[c] & ctype_space) != 0) + RRETURN(MATCH_NOMATCH); + break; + + case OP_WHITESPACE: + if (c >= 256 || + (md->ctypes[c] & ctype_space) == 0) + RRETURN(MATCH_NOMATCH); + break; + + case OP_NOT_WORDCHAR: + if (c < 256 && + (md->ctypes[c] & ctype_word) != 0) + RRETURN(MATCH_NOMATCH); + break; + + case OP_WORDCHAR: + if (c >= 256 || + (md->ctypes[c] & ctype_word) == 0) + RRETURN(MATCH_NOMATCH); + break; + + default: + RRETURN(PCRE_ERROR_INTERNAL); + } + } + } else +#endif + /* Not UTF-8 mode */ + { + for (fi = min;; fi++) { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, + 0, RM43); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject || + ((ims & PCRE_DOTALL) == 0 && + IS_NEWLINE(eptr))) + RRETURN(MATCH_NOMATCH); + + c = *eptr++; + switch (ctype) { + case OP_ANY: /* This is the DOTALL case */ + break; + + case OP_ANYBYTE: + break; + + case OP_ANYNL: + switch (c) { + default: + RRETURN(MATCH_NOMATCH); + case 0x000d: + if (eptr < md->end_subject && + *eptr == 0x0a) + eptr++; + break; + + case 0x000a: + break; + + case 0x000b: + case 0x000c: + case 0x0085: + if (md->bsr_anycrlf) + RRETURN(MATCH_NOMATCH); + break; + } + break; + + case OP_NOT_HSPACE: + switch (c) { + default: + break; + case 0x09: /* HT */ + case 0x20: /* SPACE */ + case 0xa0: /* NBSP */ + RRETURN(MATCH_NOMATCH); + } + break; + + case OP_HSPACE: + switch (c) { + default: + RRETURN(MATCH_NOMATCH); + case 0x09: /* HT */ + case 0x20: /* SPACE */ + case 0xa0: /* NBSP */ + break; + } + break; + + case OP_NOT_VSPACE: + switch (c) { + default: + break; + case 0x0a: /* LF */ + case 0x0b: /* VT */ + case 0x0c: /* FF */ + case 0x0d: /* CR */ + case 0x85: /* NEL */ + RRETURN(MATCH_NOMATCH); + } + break; + + case 
OP_VSPACE: + switch (c) { + default: + RRETURN(MATCH_NOMATCH); + case 0x0a: /* LF */ + case 0x0b: /* VT */ + case 0x0c: /* FF */ + case 0x0d: /* CR */ + case 0x85: /* NEL */ + break; + } + break; + + case OP_NOT_DIGIT: + if ((md->ctypes[c] & ctype_digit) != 0) + RRETURN(MATCH_NOMATCH); + break; + + case OP_DIGIT: + if ((md->ctypes[c] & ctype_digit) == 0) + RRETURN(MATCH_NOMATCH); + break; + + case OP_NOT_WHITESPACE: + if ((md->ctypes[c] & ctype_space) != 0) + RRETURN(MATCH_NOMATCH); + break; + + case OP_WHITESPACE: + if ((md->ctypes[c] & ctype_space) == 0) + RRETURN(MATCH_NOMATCH); + break; + + case OP_NOT_WORDCHAR: + if ((md->ctypes[c] & ctype_word) != 0) + RRETURN(MATCH_NOMATCH); + break; + + case OP_WORDCHAR: + if ((md->ctypes[c] & ctype_word) == 0) + RRETURN(MATCH_NOMATCH); + break; + + default: + RRETURN(PCRE_ERROR_INTERNAL); + } + } + } + /* Control never gets here */ + } + + /* If maximizing, it is worth using inline code for speed, doing + the type test once at the start (i.e. keep it out of the loop). + Again, keep the UTF-8 and UCP stuff separate. 
*/ + + else { + pp = eptr; /* Remember where we started */ + +#ifdef SUPPORT_UCP + if (prop_type >= 0) { + switch (prop_type) { + case PT_ANY: + for (i = min; i < max; i++) { + int len = 1; + if (eptr >= md->end_subject) + break; + GETCHARLEN(c, eptr, len); + if (prop_fail_result) + break; + eptr += len; + } + break; + + case PT_LAMP: + for (i = min; i < max; i++) { + int len = 1; + if (eptr >= md->end_subject) + break; + GETCHARLEN(c, eptr, len); + prop_category = _pcre_ucp_findprop( + c, &prop_chartype, &prop_script); + if ((prop_chartype == ucp_Lu || + prop_chartype == ucp_Ll || + prop_chartype == ucp_Lt) == + prop_fail_result) + break; + eptr += len; + } + break; + + case PT_GC: + for (i = min; i < max; i++) { + int len = 1; + if (eptr >= md->end_subject) + break; + GETCHARLEN(c, eptr, len); + prop_category = _pcre_ucp_findprop( + c, &prop_chartype, &prop_script); + if ((prop_category == prop_value) == + prop_fail_result) + break; + eptr += len; + } + break; + + case PT_PC: + for (i = min; i < max; i++) { + int len = 1; + if (eptr >= md->end_subject) + break; + GETCHARLEN(c, eptr, len); + prop_category = _pcre_ucp_findprop( + c, &prop_chartype, &prop_script); + if ((prop_chartype == prop_value) == + prop_fail_result) + break; + eptr += len; + } + break; + + case PT_SC: + for (i = min; i < max; i++) { + int len = 1; + if (eptr >= md->end_subject) + break; + GETCHARLEN(c, eptr, len); + prop_category = _pcre_ucp_findprop( + c, &prop_chartype, &prop_script); + if ((prop_script == prop_value) == + prop_fail_result) + break; + eptr += len; + } + break; + } + + /* eptr is now past the end of the maximum run */ + + if (possessive) + continue; + for (;;) { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, + RM44); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + if (eptr-- == pp) + break; /* Stop if tried at original pos */ + if (utf8) + BACKCHAR(eptr); + } + } + + /* Match extended Unicode sequences. 
We will get here only + if the support is in the binary; otherwise a compile-time + error occurs. */ + + else if (ctype == OP_EXTUNI) { + for (i = min; i < max; i++) { + if (eptr >= md->end_subject) + break; + GETCHARINCTEST(c, eptr); + prop_category = _pcre_ucp_findprop( + c, &prop_chartype, &prop_script); + if (prop_category == ucp_M) + break; + while (eptr < md->end_subject) { + int len = 1; + if (!utf8) + c = *eptr; + else { + GETCHARLEN(c, eptr, len); + } + prop_category = _pcre_ucp_findprop( + c, &prop_chartype, &prop_script); + if (prop_category != ucp_M) + break; + eptr += len; + } + } + + /* eptr is now past the end of the maximum run */ + + if (possessive) + continue; + for (;;) { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, + RM45); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + if (eptr-- == pp) + break; /* Stop if tried at original pos */ + for (;;) /* Move back over one extended */ + { + int len = 1; + if (!utf8) + c = *eptr; + else { + BACKCHAR(eptr); + GETCHARLEN(c, eptr, len); + } + prop_category = _pcre_ucp_findprop( + c, &prop_chartype, &prop_script); + if (prop_category != ucp_M) + break; + eptr--; + } + } + } + + else +#endif /* SUPPORT_UCP */ + +#ifdef SUPPORT_UTF8 + /* UTF-8 mode */ + + if (utf8) { + switch (ctype) { + case OP_ANY: + if (max < INT_MAX) { + if ((ims & PCRE_DOTALL) == 0) { + for (i = min; i < max; i++) { + if (eptr >= md->end_subject || + IS_NEWLINE(eptr)) + break; + eptr++; + while (eptr < md->end_subject && + (*eptr & 0xc0) == 0x80) + eptr++; + } + } else { + for (i = min; i < max; i++) { + if (eptr >= md->end_subject) + break; + eptr++; + while (eptr < md->end_subject && + (*eptr & 0xc0) == 0x80) + eptr++; + } + } + } + + /* Handle unlimited UTF-8 repeat */ + + else { + if ((ims & PCRE_DOTALL) == 0) { + for (i = min; i < max; i++) { + if (eptr >= md->end_subject || + IS_NEWLINE(eptr)) + break; + eptr++; + while (eptr < md->end_subject && + (*eptr & 0xc0) == 0x80) + eptr++; + } + } else { + eptr = md->end_subject; + } 
+ } + break; + + /* The byte case is the same as non-UTF8 */ + + case OP_ANYBYTE: + c = max - min; + if (c > + (unsigned int)(md->end_subject - eptr)) + c = md->end_subject - eptr; + eptr += c; + break; + + case OP_ANYNL: + for (i = min; i < max; i++) { + int len = 1; + if (eptr >= md->end_subject) + break; + GETCHARLEN(c, eptr, len); + if (c == 0x000d) { + if (++eptr >= md->end_subject) + break; + if (*eptr == 0x000a) + eptr++; + } else { + if (c != 0x000a && + (md->bsr_anycrlf || + (c != 0x000b && c != 0x000c && + c != 0x0085 && c != 0x2028 && + c != 0x2029))) + break; + eptr += len; + } + } + break; + + case OP_NOT_HSPACE: + case OP_HSPACE: + for (i = min; i < max; i++) { + BOOL gotspace; + int len = 1; + if (eptr >= md->end_subject) + break; + GETCHARLEN(c, eptr, len); + switch (c) { + default: + gotspace = FALSE; + break; + case 0x09: /* HT */ + case 0x20: /* SPACE */ + case 0xa0: /* NBSP */ + case 0x1680: /* OGHAM SPACE MARK */ + case 0x180e: /* MONGOLIAN VOWEL + SEPARATOR */ + case 0x2000: /* EN QUAD */ + case 0x2001: /* EM QUAD */ + case 0x2002: /* EN SPACE */ + case 0x2003: /* EM SPACE */ + case 0x2004: /* THREE-PER-EM SPACE + */ + case 0x2005: /* FOUR-PER-EM SPACE */ + case 0x2006: /* SIX-PER-EM SPACE */ + case 0x2007: /* FIGURE SPACE */ + case 0x2008: /* PUNCTUATION SPACE */ + case 0x2009: /* THIN SPACE */ + case 0x200A: /* HAIR SPACE */ + case 0x202f: /* NARROW NO-BREAK + SPACE */ + case 0x205f: /* MEDIUM MATHEMATICAL + SPACE */ + case 0x3000: /* IDEOGRAPHIC SPACE */ + gotspace = TRUE; + break; + } + if (gotspace == + (ctype == OP_NOT_HSPACE)) + break; + eptr += len; + } + break; + + case OP_NOT_VSPACE: + case OP_VSPACE: + for (i = min; i < max; i++) { + BOOL gotspace; + int len = 1; + if (eptr >= md->end_subject) + break; + GETCHARLEN(c, eptr, len); + switch (c) { + default: + gotspace = FALSE; + break; + case 0x0a: /* LF */ + case 0x0b: /* VT */ + case 0x0c: /* FF */ + case 0x0d: /* CR */ + case 0x85: /* NEL */ + case 0x2028: /* LINE SEPARATOR */ + 
case 0x2029: /* PARAGRAPH SEPARATOR + */ + gotspace = TRUE; + break; + } + if (gotspace == + (ctype == OP_NOT_VSPACE)) + break; + eptr += len; + } + break; + + case OP_NOT_DIGIT: + for (i = min; i < max; i++) { + int len = 1; + if (eptr >= md->end_subject) + break; + GETCHARLEN(c, eptr, len); + if (c < 256 && + (md->ctypes[c] & ctype_digit) != 0) + break; + eptr += len; + } + break; + + case OP_DIGIT: + for (i = min; i < max; i++) { + int len = 1; + if (eptr >= md->end_subject) + break; + GETCHARLEN(c, eptr, len); + if (c >= 256 || + (md->ctypes[c] & ctype_digit) == 0) + break; + eptr += len; + } + break; + + case OP_NOT_WHITESPACE: + for (i = min; i < max; i++) { + int len = 1; + if (eptr >= md->end_subject) + break; + GETCHARLEN(c, eptr, len); + if (c < 256 && + (md->ctypes[c] & ctype_space) != 0) + break; + eptr += len; + } + break; + + case OP_WHITESPACE: + for (i = min; i < max; i++) { + int len = 1; + if (eptr >= md->end_subject) + break; + GETCHARLEN(c, eptr, len); + if (c >= 256 || + (md->ctypes[c] & ctype_space) == 0) + break; + eptr += len; + } + break; + + case OP_NOT_WORDCHAR: + for (i = min; i < max; i++) { + int len = 1; + if (eptr >= md->end_subject) + break; + GETCHARLEN(c, eptr, len); + if (c < 256 && + (md->ctypes[c] & ctype_word) != 0) + break; + eptr += len; + } + break; + + case OP_WORDCHAR: + for (i = min; i < max; i++) { + int len = 1; + if (eptr >= md->end_subject) + break; + GETCHARLEN(c, eptr, len); + if (c >= 256 || + (md->ctypes[c] & ctype_word) == 0) + break; + eptr += len; + } + break; + + default: + RRETURN(PCRE_ERROR_INTERNAL); + } + + /* eptr is now past the end of the maximum run */ + + if (possessive) + continue; + for (;;) { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, + 0, RM46); + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + if (eptr-- == pp) + break; /* Stop if tried at original pos */ + BACKCHAR(eptr); + } + } else +#endif /* SUPPORT_UTF8 */ + + /* Not UTF-8 mode */ + { + switch (ctype) { + case OP_ANY: + if ((ims & 
PCRE_DOTALL) == 0) { + for (i = min; i < max; i++) { + if (eptr >= md->end_subject || + IS_NEWLINE(eptr)) + break; + eptr++; + } + break; + } + /* For DOTALL case, fall through and treat + * as \C */ + + case OP_ANYBYTE: + c = max - min; + if (c > + (unsigned int)(md->end_subject - eptr)) + c = md->end_subject - eptr; + eptr += c; + break; + + case OP_ANYNL: + for (i = min; i < max; i++) { + if (eptr >= md->end_subject) + break; + c = *eptr; + if (c == 0x000d) { + if (++eptr >= md->end_subject) + break; + if (*eptr == 0x000a) + eptr++; + } else { + if (c != 0x000a && + (md->bsr_anycrlf || + (c != 0x000b && c != 0x000c && + c != 0x0085))) + break; + eptr++; + } + } + break; + + case OP_NOT_HSPACE: + for (i = min; i < max; i++) { + if (eptr >= md->end_subject) + break; + c = *eptr; + if (c == 0x09 || c == 0x20 || c == 0xa0) + break; + eptr++; + } + break; + + case OP_HSPACE: + for (i = min; i < max; i++) { + if (eptr >= md->end_subject) + break; + c = *eptr; + if (c != 0x09 && c != 0x20 && c != 0xa0) + break; + eptr++; + } + break; + + case OP_NOT_VSPACE: + for (i = min; i < max; i++) { + if (eptr >= md->end_subject) + break; + c = *eptr; + if (c == 0x0a || c == 0x0b || + c == 0x0c || c == 0x0d || c == 0x85) + break; + eptr++; + } + break; + + case OP_VSPACE: + for (i = min; i < max; i++) { + if (eptr >= md->end_subject) + break; + c = *eptr; + if (c != 0x0a && c != 0x0b && + c != 0x0c && c != 0x0d && c != 0x85) + break; + eptr++; + } + break; + + case OP_NOT_DIGIT: + for (i = min; i < max; i++) { + if (eptr >= md->end_subject || + (md->ctypes[*eptr] & ctype_digit) != + 0) + break; + eptr++; + } + break; + + case OP_DIGIT: + for (i = min; i < max; i++) { + if (eptr >= md->end_subject || + (md->ctypes[*eptr] & ctype_digit) == + 0) + break; + eptr++; + } + break; + + case OP_NOT_WHITESPACE: + for (i = min; i < max; i++) { + if (eptr >= md->end_subject || + (md->ctypes[*eptr] & ctype_space) != + 0) + break; + eptr++; + } + break; + + case OP_WHITESPACE: + for (i = min; 
i < max; i++) { + if (eptr >= md->end_subject || + (md->ctypes[*eptr] & ctype_space) == + 0) + break; + eptr++; + } + break; + + case OP_NOT_WORDCHAR: + for (i = min; i < max; i++) { + if (eptr >= md->end_subject || + (md->ctypes[*eptr] & ctype_word) != + 0) + break; + eptr++; + } + break; + + case OP_WORDCHAR: + for (i = min; i < max; i++) { + if (eptr >= md->end_subject || + (md->ctypes[*eptr] & ctype_word) == + 0) + break; + eptr++; + } + break; + + default: + RRETURN(PCRE_ERROR_INTERNAL); + } + + /* eptr is now past the end of the maximum run */ + + if (possessive) + continue; + while (eptr >= pp) { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, + 0, RM47); + eptr--; + if (rrc != MATCH_NOMATCH) + RRETURN(rrc); + } + } + + /* Get here if we can't make it match with any permitted + * repetitions */ + + RRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + + /* There's been some horrible disaster. Arrival here can only + mean there is something seriously wrong in the code above or the + OP_xxx definitions. */ + + default: + DPRINTF(("Unknown opcode %d\n", *ecode)); + RRETURN(PCRE_ERROR_UNKNOWN_OPCODE); + } + + /* Do not stick any code in here without much thought; it is assumed + that "continue" in the code above comes out to here to repeat the main + loop. */ + + } /* End of main loop */ + /* Control never reaches here */ + + /* When compiling to use the heap rather than the stack for recursive calls + to match(), the RRETURN() macro jumps here. The number that is saved in + frame->Xwhere indicates which label we actually want to return to. 
*/ #ifdef NO_RECURSE -#define LBL(val) case val: goto L_RM##val; +#define LBL(val) \ + case val: \ + goto L_RM##val; HEAP_RETURN: -switch (frame->Xwhere) - { - LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8) - LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17) - LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33) - LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52) - LBL(53) LBL(54) + switch (frame->Xwhere) { + LBL(1) + LBL(2) + LBL(3) + LBL(4) + LBL(5) + LBL(6) + LBL(7) + LBL(8) + LBL(9) + LBL(10) + LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17) LBL(19) LBL(24) LBL(25) + LBL(26) LBL(27) LBL(29) LBL(31) LBL(33) LBL(35) LBL(43) LBL(47) + LBL(48) LBL(49) LBL(50) LBL(51) LBL(52) LBL(53) LBL(54) #ifdef SUPPORT_UTF8 - LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30) - LBL(32) LBL(34) LBL(42) LBL(46) + LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) + LBL(30) LBL(32) LBL(34) LBL(42) LBL(46) #ifdef SUPPORT_UCP - LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45) -#endif /* SUPPORT_UCP */ -#endif /* SUPPORT_UTF8 */ - default: - DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere)); - return PCRE_ERROR_INTERNAL; - } + LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) + LBL(44) LBL(45) +#endif /* SUPPORT_UCP */ +#endif /* SUPPORT_UTF8 */ + default + : DPRINTF(("jump error in pcre match: label %d non-existent\n", + frame->Xwhere)); + return PCRE_ERROR_INTERNAL; + } #undef LBL -#endif /* NO_RECURSE */ +#endif /* NO_RECURSE */ } - /*************************************************************************** **************************************************************************** RECURSION IN THE match() FUNCTION @@ -4280,11 +4601,9 @@ Undefine all the macros that were defined above to handle this. 
*/ /*************************************************************************** ***************************************************************************/ - - /************************************************* -* Execute a Regular Expression * -*************************************************/ + * Execute a Regular Expression * + *************************************************/ /* This function applies a compiled re to a subject string and picks out portions of the string if it matches. Two elements in the vector are set for @@ -4306,593 +4625,595 @@ Returns: > 0 => success; value is the number of elements filled in < -1 => some kind of unexpected problem */ -int -pcre_exec(const pcre *argument_re, const pcre_extra *extra_data, - PCRE_SPTR subject, int length, int start_offset, int options, int *offsets, - int offsetcount) -{ -int rc, resetcount, ocount; -int first_byte = -1; -int req_byte = -1; -int req_byte2 = -1; -int newline; -unsigned long int ims; -BOOL using_temporary_offsets = FALSE; -BOOL anchored; -BOOL startline; -BOOL firstline; -BOOL first_byte_caseless = FALSE; -BOOL req_byte_caseless = FALSE; -BOOL utf8; -match_data match_block; -match_data *md = &match_block; -const uschar *tables; -const uschar *start_bits = NULL; -USPTR start_match = (USPTR)subject + start_offset; -USPTR end_subject; -USPTR req_byte_ptr = start_match - 1; +int pcre_exec(const pcre* argument_re, + const pcre_extra* extra_data, + PCRE_SPTR subject, + int length, + int start_offset, + int options, + int* offsets, + int offsetcount) { + int rc, resetcount, ocount; + int first_byte = -1; + int req_byte = -1; + int req_byte2 = -1; + int newline; + unsigned long int ims; + BOOL using_temporary_offsets = FALSE; + BOOL anchored; + BOOL startline; + BOOL firstline; + BOOL first_byte_caseless = FALSE; + BOOL req_byte_caseless = FALSE; + BOOL utf8; + match_data match_block; + match_data* md = &match_block; + const uschar* tables; + const uschar* start_bits = NULL; + USPTR start_match = 
(USPTR)subject + start_offset; + USPTR end_subject; + USPTR req_byte_ptr = start_match - 1; -pcre_study_data internal_study; -const pcre_study_data *study; + pcre_study_data internal_study; + const pcre_study_data* study; -real_pcre internal_re; -const real_pcre *external_re = (const real_pcre *)argument_re; -const real_pcre *re = external_re; + real_pcre internal_re; + const real_pcre* external_re = (const real_pcre*)argument_re; + const real_pcre* re = external_re; -/* Plausibility checks */ + /* Plausibility checks */ -if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION; -if (re == NULL || subject == NULL || - (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL; -if (offsetcount < 0) return PCRE_ERROR_BADCOUNT; + if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) + return PCRE_ERROR_BADOPTION; + if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0)) + return PCRE_ERROR_NULL; + if (offsetcount < 0) + return PCRE_ERROR_BADCOUNT; -/* Fish out the optional data from the extra_data structure, first setting -the default values. */ + /* Fish out the optional data from the extra_data structure, first setting + the default values. */ -study = NULL; -md->match_limit = MATCH_LIMIT; -md->match_limit_recursion = MATCH_LIMIT_RECURSION; -md->callout_data = NULL; + study = NULL; + md->match_limit = MATCH_LIMIT; + md->match_limit_recursion = MATCH_LIMIT_RECURSION; + md->callout_data = NULL; -/* The table pointer is always in native byte order. */ + /* The table pointer is always in native byte order. 
*/ -tables = external_re->tables; + tables = external_re->tables; -if (extra_data != NULL) - { - register unsigned int flags = extra_data->flags; - if ((flags & PCRE_EXTRA_STUDY_DATA) != 0) - study = (const pcre_study_data *)extra_data->study_data; - if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) - md->match_limit = extra_data->match_limit; - if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0) - md->match_limit_recursion = extra_data->match_limit_recursion; - if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0) - md->callout_data = extra_data->callout_data; - if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables; - } + if (extra_data != NULL) { + register unsigned int flags = extra_data->flags; + if ((flags & PCRE_EXTRA_STUDY_DATA) != 0) + study = (const pcre_study_data*)extra_data->study_data; + if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) + md->match_limit = extra_data->match_limit; + if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0) + md->match_limit_recursion = extra_data->match_limit_recursion; + if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0) + md->callout_data = extra_data->callout_data; + if ((flags & PCRE_EXTRA_TABLES) != 0) + tables = extra_data->tables; + } -/* If the exec call supplied NULL for tables, use the inbuilt ones. This -is a feature that makes it possible to save compiled regex and re-use them -in other programs later. */ + /* If the exec call supplied NULL for tables, use the inbuilt ones. This + is a feature that makes it possible to save compiled regex and re-use them + in other programs later. */ -if (tables == NULL) tables = _pcre_default_tables; + if (tables == NULL) + tables = _pcre_default_tables; -/* Check that the first field in the block is the magic number. If it is not, -test for a regex that was compiled on a host of opposite endianness. If this is -the case, flipped values are put in internal_re and internal_study if there was -study data too. */ + /* Check that the first field in the block is the magic number. 
If it is + not, test for a regex that was compiled on a host of opposite endianness. If + this is the case, flipped values are put in internal_re and internal_study + if there was study data too. */ -if (re->magic_number != MAGIC_NUMBER) - { - re = _pcre_try_flipped(re, &internal_re, study, &internal_study); - if (re == NULL) return PCRE_ERROR_BADMAGIC; - if (study != NULL) study = &internal_study; - } + if (re->magic_number != MAGIC_NUMBER) { + re = _pcre_try_flipped(re, &internal_re, study, &internal_study); + if (re == NULL) + return PCRE_ERROR_BADMAGIC; + if (study != NULL) + study = &internal_study; + } -/* Set up other data */ + /* Set up other data */ -anchored = ((re->options | options) & PCRE_ANCHORED) != 0; -startline = (re->flags & PCRE_STARTLINE) != 0; -firstline = (re->options & PCRE_FIRSTLINE) != 0; + anchored = ((re->options | options) & PCRE_ANCHORED) != 0; + startline = (re->flags & PCRE_STARTLINE) != 0; + firstline = (re->options & PCRE_FIRSTLINE) != 0; -/* The code starts after the real_pcre block and the capture name table. */ + /* The code starts after the real_pcre block and the capture name table. 
*/ -md->start_code = (const uschar *)external_re + re->name_table_offset + - re->name_count * re->name_entry_size; + md->start_code = (const uschar*)external_re + re->name_table_offset + + re->name_count * re->name_entry_size; -md->start_subject = (USPTR)subject; -md->start_offset = start_offset; -md->end_subject = md->start_subject + length; -end_subject = md->end_subject; + md->start_subject = (USPTR)subject; + md->start_offset = start_offset; + md->end_subject = md->start_subject + length; + end_subject = md->end_subject; -md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0; -utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0; + md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0; + utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0; -md->notbol = (options & PCRE_NOTBOL) != 0; -md->noteol = (options & PCRE_NOTEOL) != 0; -md->notempty = (options & PCRE_NOTEMPTY) != 0; -md->partial = (options & PCRE_PARTIAL) != 0; -md->hitend = FALSE; + md->notbol = (options & PCRE_NOTBOL) != 0; + md->noteol = (options & PCRE_NOTEOL) != 0; + md->notempty = (options & PCRE_NOTEMPTY) != 0; + md->partial = (options & PCRE_PARTIAL) != 0; + md->hitend = FALSE; -md->recursive = NULL; /* No recursion at top level */ + md->recursive = NULL; /* No recursion at top level */ -md->lcc = tables + lcc_offset; -md->ctypes = tables + ctypes_offset; + md->lcc = tables + lcc_offset; + md->ctypes = tables + ctypes_offset; -/* Handle different \R options. */ + /* Handle different \R options. 
*/ -switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) - { - case 0: - if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0) - md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0; - else + switch (options & (PCRE_BSR_ANYCRLF | PCRE_BSR_UNICODE)) { + case 0: + if ((re->options & (PCRE_BSR_ANYCRLF | PCRE_BSR_UNICODE)) != 0) + md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0; + else #ifdef BSR_ANYCRLF - md->bsr_anycrlf = TRUE; + md->bsr_anycrlf = TRUE; #else - md->bsr_anycrlf = FALSE; + md->bsr_anycrlf = FALSE; #endif - break; + break; - case PCRE_BSR_ANYCRLF: - md->bsr_anycrlf = TRUE; - break; + case PCRE_BSR_ANYCRLF: + md->bsr_anycrlf = TRUE; + break; - case PCRE_BSR_UNICODE: - md->bsr_anycrlf = FALSE; - break; + case PCRE_BSR_UNICODE: + md->bsr_anycrlf = FALSE; + break; - default: return PCRE_ERROR_BADNEWLINE; - } - -/* Handle different types of newline. The three bits give eight cases. If -nothing is set at run time, whatever was used at compile time applies. */ - -switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : - (pcre_uint32)options) & PCRE_NEWLINE_BITS) - { - case 0: newline = NEWLINE; break; /* Compile-time default */ - case PCRE_NEWLINE_CR: newline = '\r'; break; - case PCRE_NEWLINE_LF: newline = '\n'; break; - case PCRE_NEWLINE_CR+ - PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break; - case PCRE_NEWLINE_ANY: newline = -1; break; - case PCRE_NEWLINE_ANYCRLF: newline = -2; break; - default: return PCRE_ERROR_BADNEWLINE; - } - -if (newline == -2) - { - md->nltype = NLTYPE_ANYCRLF; - } -else if (newline < 0) - { - md->nltype = NLTYPE_ANY; - } -else - { - md->nltype = NLTYPE_FIXED; - if (newline > 255) - { - md->nllen = 2; - md->nl[0] = (newline >> 8) & 255; - md->nl[1] = newline & 255; + default: + return PCRE_ERROR_BADNEWLINE; } - else - { - md->nllen = 1; - md->nl[0] = newline; + + /* Handle different types of newline. The three bits give eight cases. 
If + nothing is set at run time, whatever was used at compile time applies. */ + + switch ((((options & PCRE_NEWLINE_BITS) == 0) ? re->options + : (pcre_uint32)options) & + PCRE_NEWLINE_BITS) { + case 0: + newline = NEWLINE; + break; /* Compile-time default */ + case PCRE_NEWLINE_CR: + newline = '\r'; + break; + case PCRE_NEWLINE_LF: + newline = '\n'; + break; + case PCRE_NEWLINE_CR + PCRE_NEWLINE_LF: + newline = ('\r' << 8) | '\n'; + break; + case PCRE_NEWLINE_ANY: + newline = -1; + break; + case PCRE_NEWLINE_ANYCRLF: + newline = -2; + break; + default: + return PCRE_ERROR_BADNEWLINE; } - } -/* Partial matching is supported only for a restricted set of regexes at the -moment. */ + if (newline == -2) { + md->nltype = NLTYPE_ANYCRLF; + } else if (newline < 0) { + md->nltype = NLTYPE_ANY; + } else { + md->nltype = NLTYPE_FIXED; + if (newline > 255) { + md->nllen = 2; + md->nl[0] = (newline >> 8) & 255; + md->nl[1] = newline & 255; + } else { + md->nllen = 1; + md->nl[0] = newline; + } + } -if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0) - return PCRE_ERROR_BADPARTIAL; + /* Partial matching is supported only for a restricted set of regexes at the + moment. */ -/* Check a UTF-8 string if required. Unfortunately there's no way of passing -back the character offset. */ + if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0) + return PCRE_ERROR_BADPARTIAL; + + /* Check a UTF-8 string if required. Unfortunately there's no way of + passing back the character offset. 
*/ #ifdef SUPPORT_UTF8 -if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0) - { - if (_pcre_valid_utf8((uschar *)subject, length) >= 0) - return PCRE_ERROR_BADUTF8; - if (start_offset > 0 && start_offset < length) - { - int tb = ((uschar *)subject)[start_offset]; - if (tb > 127) - { - tb &= 0xc0; - if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET; - } + if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0) { + if (_pcre_valid_utf8((uschar*)subject, length) >= 0) + return PCRE_ERROR_BADUTF8; + if (start_offset > 0 && start_offset < length) { + int tb = ((uschar*)subject)[start_offset]; + if (tb > 127) { + tb &= 0xc0; + if (tb != 0 && tb != 0xc0) + return PCRE_ERROR_BADUTF8_OFFSET; + } + } } - } #endif -/* The ims options can vary during the matching as a result of the presence -of (?ims) items in the pattern. They are kept in a local variable so that -restoring at the exit of a group is easy. */ + /* The ims options can vary during the matching as a result of the presence + of (?ims) items in the pattern. They are kept in a local variable so that + restoring at the exit of a group is easy. */ -ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL); + ims = re->options & (PCRE_CASELESS | PCRE_MULTILINE | PCRE_DOTALL); -/* If the expression has got more back references than the offsets supplied can -hold, we get a temporary chunk of working store to use during the matching. -Otherwise, we can use the vector supplied, rounding down its size to a multiple -of 3. */ + /* If the expression has got more back references than the offsets supplied + can hold, we get a temporary chunk of working store to use during the + matching. Otherwise, we can use the vector supplied, rounding down its size + to a multiple of 3. 
*/ -ocount = offsetcount - (offsetcount % 3); + ocount = offsetcount - (offsetcount % 3); -if (re->top_backref > 0 && re->top_backref >= ocount/3) - { - ocount = re->top_backref * 3 + 3; - md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int)); - if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY; - using_temporary_offsets = TRUE; - DPRINTF(("Got memory to hold back references\n")); - } -else md->offset_vector = offsets; + if (re->top_backref > 0 && re->top_backref >= ocount / 3) { + ocount = re->top_backref * 3 + 3; + md->offset_vector = (int*)(pcre_malloc)(ocount * sizeof(int)); + if (md->offset_vector == NULL) + return PCRE_ERROR_NOMEMORY; + using_temporary_offsets = TRUE; + DPRINTF(("Got memory to hold back references\n")); + } else + md->offset_vector = offsets; -md->offset_end = ocount; -md->offset_max = (2*ocount)/3; -md->offset_overflow = FALSE; -md->capture_last = -1; + md->offset_end = ocount; + md->offset_max = (2 * ocount) / 3; + md->offset_overflow = FALSE; + md->capture_last = -1; -/* Compute the minimum number of offsets that we need to reset each time. Doing -this makes a huge difference to execution time when there aren't many brackets -in the pattern. */ + /* Compute the minimum number of offsets that we need to reset each time. + Doing this makes a huge difference to execution time when there aren't many + brackets in the pattern. */ -resetcount = 2 + re->top_bracket * 2; -if (resetcount > offsetcount) resetcount = ocount; + resetcount = 2 + re->top_bracket * 2; + if (resetcount > offsetcount) + resetcount = ocount; -/* Reset the working variable associated with each extraction. These should -never be used unless previously set, but they get saved and restored, and so we -initialize them to avoid reading uninitialized locations. */ + /* Reset the working variable associated with each extraction. 
These should + never be used unless previously set, but they get saved and restored, and so + we initialize them to avoid reading uninitialized locations. */ -if (md->offset_vector != NULL) - { - register int *iptr = md->offset_vector + ocount; - register int *iend = iptr - resetcount/2 + 1; - while (--iptr >= iend) *iptr = -1; - } - -/* Set up the first character to match, if available. The first_byte value is -never set for an anchored regular expression, but the anchoring may be forced -at run time, so we have to test for anchoring. The first char may be unset for -an unanchored pattern, of course. If there's no first char and the pattern was -studied, there may be a bitmap of possible first characters. */ - -if (!anchored) - { - if ((re->flags & PCRE_FIRSTSET) != 0) - { - first_byte = re->first_byte & 255; - if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE) - first_byte = md->lcc[first_byte]; - } - else - if (!startline && study != NULL && - (study->options & PCRE_STUDY_MAPPED) != 0) - start_bits = study->start_bits; - } - -/* For anchored or unanchored matches, there may be a "last known required -character" set. */ - -if ((re->flags & PCRE_REQCHSET) != 0) - { - req_byte = re->req_byte & 255; - req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0; - req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */ - } - - -/* ==========================================================================*/ - -/* Loop for handling unanchored repeated matching attempts; for anchored regexs -the loop runs just once. */ - -for(;;) - { - USPTR save_end_subject = end_subject; - USPTR new_start_match; - - /* Reset the maximum number of extractions we might see. 
*/ - - if (md->offset_vector != NULL) - { - register int *iptr = md->offset_vector; - register int *iend = iptr + resetcount; - while (iptr < iend) *iptr++ = -1; + if (md->offset_vector != NULL) { + register int* iptr = md->offset_vector + ocount; + register int* iend = iptr - resetcount / 2 + 1; + while (--iptr >= iend) + *iptr = -1; } - /* Advance to a unique first char if possible. If firstline is TRUE, the - start of the match is constrained to the first line of a multiline string. - That is, the match must be before or at the first newline. Implement this by - temporarily adjusting end_subject so that we stop scanning at a newline. If - the match fails at the newline, later code breaks this loop. */ + /* Set up the first character to match, if available. The first_byte value + is never set for an anchored regular expression, but the anchoring may be + forced at run time, so we have to test for anchoring. The first char may be + unset for an unanchored pattern, of course. If there's no first char and the + pattern was studied, there may be a bitmap of possible first characters. */ - if (firstline) - { - USPTR t = start_match; - while (t < md->end_subject && !IS_NEWLINE(t)) t++; - end_subject = t; + if (!anchored) { + if ((re->flags & PCRE_FIRSTSET) != 0) { + first_byte = re->first_byte & 255; + if ((first_byte_caseless = + ((re->first_byte & REQ_CASELESS) != 0)) == TRUE) + first_byte = md->lcc[first_byte]; + } else if (!startline && study != NULL && + (study->options & PCRE_STUDY_MAPPED) != 0) + start_bits = study->start_bits; } - /* Now test for a unique first byte */ + /* For anchored or unanchored matches, there may be a "last known required + character" set. 
*/ - if (first_byte >= 0) - { - if (first_byte_caseless) - while (start_match < end_subject && - md->lcc[*start_match] != first_byte) - { NEXTCHAR(start_match); } - else - while (start_match < end_subject && *start_match != first_byte) - { NEXTCHAR(start_match); } + if ((re->flags & PCRE_REQCHSET) != 0) { + req_byte = re->req_byte & 255; + req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0; + req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */ } - /* Or to just after a linebreak for a multiline match if possible */ + /* ==========================================================================*/ - else if (startline) - { - if (start_match > md->start_subject + start_offset) - { - while (start_match <= end_subject && !WAS_NEWLINE(start_match)) - { NEXTCHAR(start_match); } + /* Loop for handling unanchored repeated matching attempts; for anchored + regexs the loop runs just once. */ - /* If we have just passed a CR and the newline option is ANY or ANYCRLF, - and we are now at a LF, advance the match position by one more character. + for (;;) { + USPTR save_end_subject = end_subject; + USPTR new_start_match; + + /* Reset the maximum number of extractions we might see. */ + + if (md->offset_vector != NULL) { + register int* iptr = md->offset_vector; + register int* iend = iptr + resetcount; + while (iptr < iend) + *iptr++ = -1; + } + + /* Advance to a unique first char if possible. If firstline is TRUE, the + start of the match is constrained to the first line of a multiline + string. That is, the match must be before or at the first newline. + Implement this by temporarily adjusting end_subject so that we stop + scanning at a newline. If the match fails at the newline, later code + breaks this loop. 
*/ + + if (firstline) { + USPTR t = start_match; + while (t < md->end_subject && !IS_NEWLINE(t)) + t++; + end_subject = t; + } + + /* Now test for a unique first byte */ + + if (first_byte >= 0) { + if (first_byte_caseless) + while (start_match < end_subject && + md->lcc[*start_match] != first_byte) { + NEXTCHAR(start_match); + } + else + while (start_match < end_subject && + *start_match != first_byte) { + NEXTCHAR(start_match); + } + } + + /* Or to just after a linebreak for a multiline match if possible */ + + else if (startline) { + if (start_match > md->start_subject + start_offset) { + while (start_match <= end_subject && + !WAS_NEWLINE(start_match)) { + NEXTCHAR(start_match); + } + + /* If we have just passed a CR and the newline option is ANY or + ANYCRLF, and we are now at a LF, advance the match position by + one more character. + */ + + if (start_match[-1] == '\r' && + (md->nltype == NLTYPE_ANY || + md->nltype == NLTYPE_ANYCRLF) && + start_match < end_subject && *start_match == '\n') + start_match++; + } + } + + /* Or to a non-unique first char after study */ + + else if (start_bits != NULL) { + while (start_match < end_subject) { + register unsigned int c = *start_match; + if ((start_bits[c / 8] & (1 << (c & 7))) == 0) { + NEXTCHAR(start_match); + } else + break; + } + } + + /* Restore fudged end_subject */ + + end_subject = save_end_subject; + +#ifdef DEBUG /* Sigh. Some compilers never learn. */ + printf(">>>> Match against: "); + pchars(start_match, end_subject - start_match, TRUE, md); + printf("\n"); +#endif + + /* If req_byte is set, we know that that character must appear in the + subject for the match to succeed. If the first character is set, + req_byte must be later in the subject; otherwise the test starts at the + match point. This optimization can save a huge amount of backtracking in + patterns with nested unlimited repeats that aren't going to match. 
+ Writing separate code for cased/caseless versions makes it go faster, as + does using an autoincrement and backing off on a match. + + HOWEVER: when the subject string is very, very long, searching to its + end can take a long time, and give bad performance on quite ordinary + patterns. This showed up when somebody was matching something like + /^\d+C/ on a 32-megabyte string... so we don't do this when the string + is sufficiently long. + + ALSO: this processing is disabled when partial matching is requested. + */ + + if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX && + !md->partial) { + register USPTR p = start_match + ((first_byte >= 0) ? 1 : 0); + + /* We don't need to repeat the search if we haven't yet reached the + place we found it at last time. */ + + if (p > req_byte_ptr) { + if (req_byte_caseless) { + while (p < end_subject) { + register int pp = *p++; + if (pp == req_byte || pp == req_byte2) { + p--; + break; + } + } + } else { + while (p < end_subject) { + if (*p++ == req_byte) { + p--; + break; + } + } + } + + /* If we can't find the required character, break the matching + loop, forcing a match failure. */ + + if (p >= end_subject) { + rc = MATCH_NOMATCH; + break; + } + + /* If we have found the required character, save the point where + we found it, so that we don't search again next time round the + loop if the start hasn't passed this character yet. */ + + req_byte_ptr = p; + } + } + + /* OK, we can now run the match. */ + + md->start_match_ptr = start_match; + md->match_call_count = 0; + rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, + 0, 0); + + switch (rc) { + /* NOMATCH and PRUNE advance by one character. THEN at this + level acts exactly like PRUNE. 
*/ + + case MATCH_NOMATCH: + case MATCH_PRUNE: + case MATCH_THEN: + new_start_match = start_match + 1; +#ifdef SUPPORT_UTF8 + if (utf8) + while (new_start_match < end_subject && + (*new_start_match & 0xc0) == 0x80) + new_start_match++; +#endif + break; + + /* SKIP passes back the next starting point explicitly. */ + + case MATCH_SKIP: + new_start_match = md->start_match_ptr; + break; + + /* COMMIT disables the bumpalong, but otherwise behaves as + * NOMATCH. */ + + case MATCH_COMMIT: + rc = MATCH_NOMATCH; + goto ENDLOOP; + + /* Any other return is some kind of error. */ + + default: + goto ENDLOOP; + } + + /* Control reaches here for the various types of "no match at this + point" result. Reset the code to MATCH_NOMATCH for subsequent checking. */ - if (start_match[-1] == '\r' && - (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) && - start_match < end_subject && - *start_match == '\n') - start_match++; - } - } - - /* Or to a non-unique first char after study */ - - else if (start_bits != NULL) - { - while (start_match < end_subject) - { - register unsigned int c = *start_match; - if ((start_bits[c/8] & (1 << (c&7))) == 0) - { NEXTCHAR(start_match); } - else break; - } - } - - /* Restore fudged end_subject */ - - end_subject = save_end_subject; - -#ifdef DEBUG /* Sigh. Some compilers never learn. */ - printf(">>>> Match against: "); - pchars(start_match, end_subject - start_match, TRUE, md); - printf("\n"); -#endif - - /* If req_byte is set, we know that that character must appear in the subject - for the match to succeed. If the first character is set, req_byte must be - later in the subject; otherwise the test starts at the match point. This - optimization can save a huge amount of backtracking in patterns with nested - unlimited repeats that aren't going to match. Writing separate code for - cased/caseless versions makes it go faster, as does using an autoincrement - and backing off on a match. 
- - HOWEVER: when the subject string is very, very long, searching to its end can - take a long time, and give bad performance on quite ordinary patterns. This - showed up when somebody was matching something like /^\d+C/ on a 32-megabyte - string... so we don't do this when the string is sufficiently long. - - ALSO: this processing is disabled when partial matching is requested. - */ - - if (req_byte >= 0 && - end_subject - start_match < REQ_BYTE_MAX && - !md->partial) - { - register USPTR p = start_match + ((first_byte >= 0)? 1 : 0); - - /* We don't need to repeat the search if we haven't yet reached the - place we found it at last time. */ - - if (p > req_byte_ptr) - { - if (req_byte_caseless) - { - while (p < end_subject) - { - register int pp = *p++; - if (pp == req_byte || pp == req_byte2) { p--; break; } - } - } - else - { - while (p < end_subject) - { - if (*p++ == req_byte) { p--; break; } - } - } - - /* If we can't find the required character, break the matching loop, - forcing a match failure. */ - - if (p >= end_subject) - { rc = MATCH_NOMATCH; - break; - } - /* If we have found the required character, save the point where we - found it, so that we don't search again next time round the loop if - the start hasn't passed this character yet. */ + /* If PCRE_FIRSTLINE is set, the match must happen before or at the + first newline in the subject (though it may continue over the newline). + Therefore, if we have just failed to match, starting at a newline, do + not continue. */ - req_byte_ptr = p; - } - } + if (firstline && IS_NEWLINE(start_match)) + break; - /* OK, we can now run the match. */ + /* Advance to new matching position */ - md->start_match_ptr = start_match; - md->match_call_count = 0; - rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0); + start_match = new_start_match; - switch(rc) - { - /* NOMATCH and PRUNE advance by one character. THEN at this level acts - exactly like PRUNE. 
*/ + /* Break the loop if the pattern is anchored or if we have passed the + end of the subject. */ - case MATCH_NOMATCH: - case MATCH_PRUNE: - case MATCH_THEN: - new_start_match = start_match + 1; -#ifdef SUPPORT_UTF8 - if (utf8) - while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80) - new_start_match++; -#endif - break; + if (anchored || start_match > end_subject) + break; - /* SKIP passes back the next starting point explicitly. */ + /* If we have just passed a CR and we are now at a LF, and the pattern + does not contain any explicit matches for \r or \n, and the newline + option is CRLF or ANY or ANYCRLF, advance the match position by one more + character. */ - case MATCH_SKIP: - new_start_match = md->start_match_ptr; - break; + if (start_match[-1] == '\r' && start_match < end_subject && + *start_match == '\n' && (re->flags & PCRE_HASCRORLF) == 0 && + (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF || + md->nllen == 2)) + start_match++; - /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */ + } /* End of for(;;) "bumpalong" loop */ - case MATCH_COMMIT: - rc = MATCH_NOMATCH; - goto ENDLOOP; + /* ==========================================================================*/ - /* Any other return is some kind of error. */ + /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping + conditions is true: - default: - goto ENDLOOP; - } + (1) The pattern is anchored or the match was failed by (*COMMIT); - /* Control reaches here for the various types of "no match at this point" - result. Reset the code to MATCH_NOMATCH for subsequent checking. */ + (2) We are past the end of the subject; - rc = MATCH_NOMATCH; + (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because + this option requests that a match occur at or before the first newline + in the subject. 
- /* If PCRE_FIRSTLINE is set, the match must happen before or at the first - newline in the subject (though it may continue over the newline). Therefore, - if we have just failed to match, starting at a newline, do not continue. */ - - if (firstline && IS_NEWLINE(start_match)) break; - - /* Advance to new matching position */ - - start_match = new_start_match; - - /* Break the loop if the pattern is anchored or if we have passed the end of - the subject. */ - - if (anchored || start_match > end_subject) break; - - /* If we have just passed a CR and we are now at a LF, and the pattern does - not contain any explicit matches for \r or \n, and the newline option is CRLF - or ANY or ANYCRLF, advance the match position by one more character. */ - - if (start_match[-1] == '\r' && - start_match < end_subject && - *start_match == '\n' && - (re->flags & PCRE_HASCRORLF) == 0 && - (md->nltype == NLTYPE_ANY || - md->nltype == NLTYPE_ANYCRLF || - md->nllen == 2)) - start_match++; - - } /* End of for(;;) "bumpalong" loop */ - -/* ==========================================================================*/ - -/* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping -conditions is true: - -(1) The pattern is anchored or the match was failed by (*COMMIT); - -(2) We are past the end of the subject; - -(3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because - this option requests that a match occur at or before the first newline in - the subject. - -When we have a match and the offset vector is big enough to deal with any -backreferences, captured substring offsets will already be set up. In the case -where we had to get some local store to hold offsets for backreference -processing, copy those that we can. In this case there need not be overflow if -certain parts of the pattern were not used, even though there are more -capturing parentheses than vector slots. 
*/ + When we have a match and the offset vector is big enough to deal with any + backreferences, captured substring offsets will already be set up. In the + case where we had to get some local store to hold offsets for backreference + processing, copy those that we can. In this case there need not be overflow + if certain parts of the pattern were not used, even though there are more + capturing parentheses than vector slots. */ ENDLOOP: -if (rc == MATCH_MATCH) - { - if (using_temporary_offsets) - { - if (offsetcount >= 4) - { - memcpy(offsets + 2, md->offset_vector + 2, - (offsetcount - 2) * sizeof(int)); - DPRINTF(("Copied offsets from temporary memory\n")); - } - if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE; - DPRINTF(("Freeing temporary memory\n")); - (pcre_free)(md->offset_vector); + if (rc == MATCH_MATCH) { + if (using_temporary_offsets) { + if (offsetcount >= 4) { + memcpy(offsets + 2, md->offset_vector + 2, + (offsetcount - 2) * sizeof(int)); + DPRINTF(("Copied offsets from temporary memory\n")); + } + if (md->end_offset_top > offsetcount) + md->offset_overflow = TRUE; + DPRINTF(("Freeing temporary memory\n")); + (pcre_free)(md->offset_vector); + } + + /* Set the return code to the number of captured strings, or 0 if there + are too many to fit into the vector. */ + + rc = md->offset_overflow ? 0 : md->end_offset_top / 2; + + /* If there is space, set up the whole thing as substring 0. The value + of md->start_match_ptr might be modified if \K was encountered on the + success matching path. */ + + if (offsetcount < 2) + rc = 0; + else { + offsets[0] = md->start_match_ptr - md->start_subject; + offsets[1] = md->end_match_ptr - md->start_subject; + } + + DPRINTF((">>>> returning %d\n", rc)); + return rc; } - /* Set the return code to the number of captured strings, or 0 if there are - too many to fit into the vector. 
*/ + /* Control gets here if there has been an error, or if the overall match + attempt has failed at all permitted starting positions. */ - rc = md->offset_overflow? 0 : md->end_offset_top/2; - - /* If there is space, set up the whole thing as substring 0. The value of - md->start_match_ptr might be modified if \K was encountered on the success - matching path. */ - - if (offsetcount < 2) rc = 0; else - { - offsets[0] = md->start_match_ptr - md->start_subject; - offsets[1] = md->end_match_ptr - md->start_subject; + if (using_temporary_offsets) { + DPRINTF(("Freeing temporary memory\n")); + (pcre_free)(md->offset_vector); } - DPRINTF((">>>> returning %d\n", rc)); - return rc; - } - -/* Control gets here if there has been an error, or if the overall match -attempt has failed at all permitted starting positions. */ - -if (using_temporary_offsets) - { - DPRINTF(("Freeing temporary memory\n")); - (pcre_free)(md->offset_vector); - } - -if (rc != MATCH_NOMATCH) - { - DPRINTF((">>>> error: returning %d\n", rc)); - return rc; - } -else if (md->partial && md->hitend) - { - DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n")); - return PCRE_ERROR_PARTIAL; - } -else - { - DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n")); - return PCRE_ERROR_NOMATCH; - } + if (rc != MATCH_NOMATCH) { + DPRINTF((">>>> error: returning %d\n", rc)); + return rc; + } else if (md->partial && md->hitend) { + DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n")); + return PCRE_ERROR_PARTIAL; + } else { + DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n")); + return PCRE_ERROR_NOMATCH; + } } /* End of pcre_exec.c */ diff --git a/package/re/pcre_fullinfo.c b/package/re/pcre_fullinfo.c index 9144f2ec8..5859fdde0 100644 --- a/package/re/pcre_fullinfo.c +++ b/package/re/pcre_fullinfo.c @@ -2,14 +2,12 @@ /* This module contains the external function pcre_fullinfo(), which returns information about a compiled pattern. 
*/ - #include "re_config.h" #include "pcre_internal.h" - /************************************************* -* Return info about compiled pattern * -*************************************************/ + * Return info about compiled pattern * + *************************************************/ /* This is a newer "info" function which has an extensible interface so that additional items can be added compatibly. @@ -23,101 +21,107 @@ Arguments: Returns: 0 if data returned, negative on error */ -int -pcre_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, int what, - void *where) -{ -real_pcre internal_re; -pcre_study_data internal_study; -const real_pcre *re = (const real_pcre *)argument_re; -const pcre_study_data *study = NULL; +int pcre_fullinfo(const pcre* argument_re, + const pcre_extra* extra_data, + int what, + void* where) { + real_pcre internal_re; + pcre_study_data internal_study; + const real_pcre* re = (const real_pcre*)argument_re; + const pcre_study_data* study = NULL; -if (re == NULL || where == NULL) return PCRE_ERROR_NULL; + if (re == NULL || where == NULL) + return PCRE_ERROR_NULL; -if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0) - study = (const pcre_study_data *)extra_data->study_data; + if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0) + study = (const pcre_study_data*)extra_data->study_data; -if (re->magic_number != MAGIC_NUMBER) - { - re = _pcre_try_flipped(re, &internal_re, study, &internal_study); - if (re == NULL) return PCRE_ERROR_BADMAGIC; - if (study != NULL) study = &internal_study; - } + if (re->magic_number != MAGIC_NUMBER) { + re = _pcre_try_flipped(re, &internal_re, study, &internal_study); + if (re == NULL) + return PCRE_ERROR_BADMAGIC; + if (study != NULL) + study = &internal_study; + } -switch (what) - { - case PCRE_INFO_OPTIONS: - *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS; - break; + switch (what) { + case PCRE_INFO_OPTIONS: + *((unsigned long 
int*)where) = re->options & PUBLIC_OPTIONS; + break; - case PCRE_INFO_SIZE: - *((size_t *)where) = re->size; - break; + case PCRE_INFO_SIZE: + *((size_t*)where) = re->size; + break; - case PCRE_INFO_STUDYSIZE: - *((size_t *)where) = (study == NULL)? 0 : study->size; - break; + case PCRE_INFO_STUDYSIZE: + *((size_t*)where) = (study == NULL) ? 0 : study->size; + break; - case PCRE_INFO_CAPTURECOUNT: - *((int *)where) = re->top_bracket; - break; + case PCRE_INFO_CAPTURECOUNT: + *((int*)where) = re->top_bracket; + break; - case PCRE_INFO_BACKREFMAX: - *((int *)where) = re->top_backref; - break; + case PCRE_INFO_BACKREFMAX: + *((int*)where) = re->top_backref; + break; - case PCRE_INFO_FIRSTBYTE: - *((int *)where) = - ((re->flags & PCRE_FIRSTSET) != 0)? re->first_byte : - ((re->flags & PCRE_STARTLINE) != 0)? -1 : -2; - break; + case PCRE_INFO_FIRSTBYTE: + *((int*)where) = ((re->flags & PCRE_FIRSTSET) != 0) ? re->first_byte + : ((re->flags & PCRE_STARTLINE) != 0) ? -1 + : -2; + break; - /* Make sure we pass back the pointer to the bit vector in the external - block, not the internal copy (with flipped integer fields). */ + /* Make sure we pass back the pointer to the bit vector in the + external block, not the internal copy (with flipped integer fields). + */ - case PCRE_INFO_FIRSTTABLE: - *((const uschar **)where) = - (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)? - ((const pcre_study_data *)extra_data->study_data)->start_bits : NULL; - break; + case PCRE_INFO_FIRSTTABLE: + *((const uschar**)where) = + (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0) + ? ((const pcre_study_data*)extra_data->study_data) + ->start_bits + : NULL; + break; - case PCRE_INFO_LASTLITERAL: - *((int *)where) = - ((re->flags & PCRE_REQCHSET) != 0)? re->req_byte : -1; - break; + case PCRE_INFO_LASTLITERAL: + *((int*)where) = + ((re->flags & PCRE_REQCHSET) != 0) ? 
re->req_byte : -1; + break; - case PCRE_INFO_NAMEENTRYSIZE: - *((int *)where) = re->name_entry_size; - break; + case PCRE_INFO_NAMEENTRYSIZE: + *((int*)where) = re->name_entry_size; + break; - case PCRE_INFO_NAMECOUNT: - *((int *)where) = re->name_count; - break; + case PCRE_INFO_NAMECOUNT: + *((int*)where) = re->name_count; + break; - case PCRE_INFO_NAMETABLE: - *((const uschar **)where) = (const uschar *)re + re->name_table_offset; - break; + case PCRE_INFO_NAMETABLE: + *((const uschar**)where) = + (const uschar*)re + re->name_table_offset; + break; - case PCRE_INFO_DEFAULT_TABLES: - *((const uschar **)where) = (const uschar *)(_pcre_default_tables); - break; + case PCRE_INFO_DEFAULT_TABLES: + *((const uschar**)where) = (const uschar*)(_pcre_default_tables); + break; - case PCRE_INFO_OKPARTIAL: - *((int *)where) = (re->flags & PCRE_NOPARTIAL) == 0; - break; + case PCRE_INFO_OKPARTIAL: + *((int*)where) = (re->flags & PCRE_NOPARTIAL) == 0; + break; - case PCRE_INFO_JCHANGED: - *((int *)where) = (re->flags & PCRE_JCHANGED) != 0; - break; + case PCRE_INFO_JCHANGED: + *((int*)where) = (re->flags & PCRE_JCHANGED) != 0; + break; - case PCRE_INFO_HASCRORLF: - *((int *)where) = (re->flags & PCRE_HASCRORLF) != 0; - break; + case PCRE_INFO_HASCRORLF: + *((int*)where) = (re->flags & PCRE_HASCRORLF) != 0; + break; - default: return PCRE_ERROR_BADOPTION; - } + default: + return PCRE_ERROR_BADOPTION; + } -return 0; + return 0; } /* End of pcre_fullinfo.c */ diff --git a/package/re/pcre_globals.c b/package/re/pcre_globals.c index 02c03906b..1350f56a2 100644 --- a/package/re/pcre_globals.c +++ b/package/re/pcre_globals.c @@ -11,11 +11,11 @@ differently, and global variables are not used (see pcre.in). 
*/ #include "pcre_internal.h" #ifndef VPCOMPAT -void *(*pcre_malloc)(size_t) = malloc; -void (*pcre_free)(void *) = free; -void *(*pcre_stack_malloc)(size_t) = malloc; -void (*pcre_stack_free)(void *) = free; -int (*pcre_callout)(pcre_callout_block *) = NULL; +void* (*pcre_malloc)(size_t) = malloc; +void (*pcre_free)(void*) = free; +void* (*pcre_stack_malloc)(size_t) = malloc; +void (*pcre_stack_free)(void*) = free; +int (*pcre_callout)(pcre_callout_block*) = NULL; #endif /* End of pcre_globals.c */ diff --git a/package/re/pcre_internal.h b/package/re/pcre_internal.h index f75a669b5..b4e131c39 100644 --- a/package/re/pcre_internal.h +++ b/package/re/pcre_internal.h @@ -2,7 +2,6 @@ #ifndef PCRE_INTERNAL_H #define PCRE_INTERNAL_H - #if 0 #define DEBUG #endif @@ -14,7 +13,6 @@ #define DPRINTF(p) #endif - #include #include #include @@ -24,169 +22,135 @@ #include #include - #if USHRT_MAX == 65535 - typedef unsigned short pcre_uint16; +typedef unsigned short pcre_uint16; #elif UINT_MAX == 65535 - typedef unsigned int pcre_uint16; +typedef unsigned int pcre_uint16; #else - #error Cannot determine a type for 16-bit unsigned integers +#error Cannot determine a type for 16-bit unsigned integers #endif #if UINT_MAX == 4294967295 - typedef unsigned int pcre_uint32; +typedef unsigned int pcre_uint32; #elif ULONG_MAX == 4294967295 - typedef unsigned long int pcre_uint32; +typedef unsigned long int pcre_uint32; #else - #error Cannot determine a type for 32-bit unsigned integers +#error Cannot determine a type for 32-bit unsigned integers #endif - typedef unsigned char uschar; - #define NOTACHAR 0xffffffff +#define NLTYPE_FIXED 0 +#define NLTYPE_ANY 1 +#define NLTYPE_ANYCRLF 2 -#define NLTYPE_FIXED 0 -#define NLTYPE_ANY 1 -#define NLTYPE_ANYCRLF 2 - - -#define IS_NEWLINE(p) \ - ((NLBLOCK->nltype != NLTYPE_FIXED)? 
\ - ((p) < NLBLOCK->PSEND && \ - _pcre_is_newline((p), NLBLOCK->nltype, NLBLOCK->PSEND, &(NLBLOCK->nllen),\ - utf8)) \ - : \ - ((p) <= NLBLOCK->PSEND - NLBLOCK->nllen && \ - (p)[0] == NLBLOCK->nl[0] && \ - (NLBLOCK->nllen == 1 || (p)[1] == NLBLOCK->nl[1]) \ - ) \ - ) - - -#define WAS_NEWLINE(p) \ - ((NLBLOCK->nltype != NLTYPE_FIXED)? \ - ((p) > NLBLOCK->PSSTART && \ - _pcre_was_newline((p), NLBLOCK->nltype, NLBLOCK->PSSTART, \ - &(NLBLOCK->nllen), utf8)) \ - : \ - ((p) >= NLBLOCK->PSSTART + NLBLOCK->nllen && \ - (p)[-NLBLOCK->nllen] == NLBLOCK->nl[0] && \ - (NLBLOCK->nllen == 1 || (p)[-NLBLOCK->nllen+1] == NLBLOCK->nl[1]) \ - ) \ - ) +#define IS_NEWLINE(p) \ + ((NLBLOCK->nltype != NLTYPE_FIXED) \ + ? ((p) < NLBLOCK->PSEND && \ + _pcre_is_newline((p), NLBLOCK->nltype, NLBLOCK->PSEND, \ + &(NLBLOCK->nllen), utf8)) \ + : ((p) <= NLBLOCK->PSEND - NLBLOCK->nllen && \ + (p)[0] == NLBLOCK->nl[0] && \ + (NLBLOCK->nllen == 1 || (p)[1] == NLBLOCK->nl[1]))) +#define WAS_NEWLINE(p) \ + ((NLBLOCK->nltype != NLTYPE_FIXED) \ + ? 
((p) > NLBLOCK->PSSTART && \ + _pcre_was_newline((p), NLBLOCK->nltype, NLBLOCK->PSSTART, \ + &(NLBLOCK->nllen), utf8)) \ + : ((p) >= NLBLOCK->PSSTART + NLBLOCK->nllen && \ + (p)[-NLBLOCK->nllen] == NLBLOCK->nl[0] && \ + (NLBLOCK->nllen == 1 || \ + (p)[-NLBLOCK->nllen + 1] == NLBLOCK->nl[1]))) #ifdef CUSTOM_SUBJECT_PTR #define PCRE_SPTR CUSTOM_SUBJECT_PTR #define USPTR CUSTOM_SUBJECT_PTR #else -#define PCRE_SPTR const char * -#define USPTR const unsigned char * +#define PCRE_SPTR const char* +#define USPTR const unsigned char* #endif - #include "pcre.h" - #ifdef VPCOMPAT -#define strlen(s) _strlen(s) -#define strncmp(s1,s2,m) _strncmp(s1,s2,m) -#define memcmp(s,c,n) _memcmp(s,c,n) -#define memcpy(d,s,n) _memcpy(d,s,n) -#define memmove(d,s,n) _memmove(d,s,n) -#define memset(s,c,n) _memset(s,c,n) +#define strlen(s) _strlen(s) +#define strncmp(s1, s2, m) _strncmp(s1, s2, m) +#define memcmp(s, c, n) _memcmp(s, c, n) +#define memcpy(d, s, n) _memcpy(d, s, n) +#define memmove(d, s, n) _memmove(d, s, n) +#define memset(s, c, n) _memset(s, c, n) #else - #ifndef HAVE_MEMMOVE -#undef memmove +#undef memmove #ifdef HAVE_BCOPY #define memmove(a, b, c) bcopy(b, a, c) #else -static void * -pcre_memmove(void *d, const void *s, size_t n) -{ -size_t i; -unsigned char *dest = (unsigned char *)d; -const unsigned char *src = (const unsigned char *)s; -if (dest > src) - { - dest += n; - src += n; - for (i = 0; i < n; ++i) *(--dest) = *(--src); - return (void *)dest; - } -else - { - for (i = 0; i < n; ++i) *dest++ = *src++; - return (void *)(dest - n); - } +static void* pcre_memmove(void* d, const void* s, size_t n) { + size_t i; + unsigned char* dest = (unsigned char*)d; + const unsigned char* src = (const unsigned char*)s; + if (dest > src) { + dest += n; + src += n; + for (i = 0; i < n; ++i) + *(--dest) = *(--src); + return (void*)dest; + } else { + for (i = 0; i < n; ++i) + *dest++ = *src++; + return (void*)(dest - n); + } } #define memmove(a, b, c) pcre_memmove(a, b, c) #endif 
#endif #endif - #if LINK_SIZE == 2 -#define PUT(a,n,d) \ - (a[n] = (d) >> 8), \ - (a[(n)+1] = (d) & 255) +#define PUT(a, n, d) (a[n] = (d) >> 8), (a[(n) + 1] = (d)&255) -#define GET(a,n) \ - (((a)[n] << 8) | (a)[(n)+1]) +#define GET(a, n) (((a)[n] << 8) | (a)[(n) + 1]) #define MAX_PATTERN_SIZE (1 << 16) - #elif LINK_SIZE == 3 -#define PUT(a,n,d) \ - (a[n] = (d) >> 16), \ - (a[(n)+1] = (d) >> 8), \ - (a[(n)+2] = (d) & 255) +#define PUT(a, n, d) \ + (a[n] = (d) >> 16), (a[(n) + 1] = (d) >> 8), (a[(n) + 2] = (d)&255) -#define GET(a,n) \ - (((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2]) +#define GET(a, n) (((a)[n] << 16) | ((a)[(n) + 1] << 8) | (a)[(n) + 2]) #define MAX_PATTERN_SIZE (1 << 24) - #elif LINK_SIZE == 4 -#define PUT(a,n,d) \ - (a[n] = (d) >> 24), \ - (a[(n)+1] = (d) >> 16), \ - (a[(n)+2] = (d) >> 8), \ - (a[(n)+3] = (d) & 255) +#define PUT(a, n, d) \ + (a[n] = (d) >> 24), (a[(n) + 1] = (d) >> 16), (a[(n) + 2] = (d) >> 8), \ + (a[(n) + 3] = (d)&255) -#define GET(a,n) \ - (((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3]) +#define GET(a, n) \ + (((a)[n] << 24) | ((a)[(n) + 1] << 16) | ((a)[(n) + 2] << 8) | (a)[(n) + 3]) #define MAX_PATTERN_SIZE (1 << 30) - #else #error LINK_SIZE must be either 2, 3, or 4 #endif +#define PUTINC(a, n, d) PUT(a, n, d), a += LINK_SIZE -#define PUTINC(a,n,d) PUT(a,n,d), a += LINK_SIZE +#define PUT2(a, n, d) \ + a[n] = (d) >> 8; \ + a[(n) + 1] = (d)&255 +#define GET2(a, n) (((a)[n] << 8) | (a)[(n) + 1]) -#define PUT2(a,n,d) \ - a[n] = (d) >> 8; \ - a[(n)+1] = (d) & 255 - -#define GET2(a,n) \ - (((a)[n] << 8) | (a)[(n)+1]) - -#define PUT2INC(a,n,d) PUT2(a,n,d), a += 2 - +#define PUT2INC(a, n, d) PUT2(a, n, d), a += 2 #ifndef SUPPORT_UTF8 #define NEXTCHAR(p) p++; @@ -196,145 +160,128 @@ else #define GETCHARINCTEST(c, eptr) c = *eptr++; #define GETCHARLEN(c, eptr, len) c = *eptr; - #else - -#define NEXTCHAR(p) \ - p++; \ - if (utf8) { while((*p & 0xc0) == 0x80) p++; } - - -#define GETCHAR(c, eptr) \ - c = *eptr; 
\ - if (c >= 0xc0) \ - { \ - int gcii; \ - int gcaa = _pcre_utf8_table4[c & 0x3f]; \ - int gcss = 6*gcaa; \ - c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ - for (gcii = 1; gcii <= gcaa; gcii++) \ - { \ - gcss -= 6; \ - c |= (eptr[gcii] & 0x3f) << gcss; \ - } \ +#define NEXTCHAR(p) \ + p++; \ + if (utf8) { \ + while ((*p & 0xc0) == 0x80) \ + p++; \ } - -#define GETCHARTEST(c, eptr) \ - c = *eptr; \ - if (utf8 && c >= 0xc0) \ - { \ - int gcii; \ - int gcaa = _pcre_utf8_table4[c & 0x3f]; \ - int gcss = 6*gcaa; \ - c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ - for (gcii = 1; gcii <= gcaa; gcii++) \ - { \ - gcss -= 6; \ - c |= (eptr[gcii] & 0x3f) << gcss; \ - } \ +#define GETCHAR(c, eptr) \ + c = *eptr; \ + if (c >= 0xc0) { \ + int gcii; \ + int gcaa = _pcre_utf8_table4[c & 0x3f]; \ + int gcss = 6 * gcaa; \ + c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ + for (gcii = 1; gcii <= gcaa; gcii++) { \ + gcss -= 6; \ + c |= (eptr[gcii] & 0x3f) << gcss; \ + } \ } - -#define GETCHARINC(c, eptr) \ - c = *eptr++; \ - if (c >= 0xc0) \ - { \ - int gcaa = _pcre_utf8_table4[c & 0x3f]; \ - int gcss = 6*gcaa; \ - c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ - while (gcaa-- > 0) \ - { \ - gcss -= 6; \ - c |= (*eptr++ & 0x3f) << gcss; \ - } \ +#define GETCHARTEST(c, eptr) \ + c = *eptr; \ + if (utf8 && c >= 0xc0) { \ + int gcii; \ + int gcaa = _pcre_utf8_table4[c & 0x3f]; \ + int gcss = 6 * gcaa; \ + c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ + for (gcii = 1; gcii <= gcaa; gcii++) { \ + gcss -= 6; \ + c |= (eptr[gcii] & 0x3f) << gcss; \ + } \ } - -#define GETCHARINCTEST(c, eptr) \ - c = *eptr++; \ - if (utf8 && c >= 0xc0) \ - { \ - int gcaa = _pcre_utf8_table4[c & 0x3f]; \ - int gcss = 6*gcaa; \ - c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ - while (gcaa-- > 0) \ - { \ - gcss -= 6; \ - c |= (*eptr++ & 0x3f) << gcss; \ - } \ +#define GETCHARINC(c, eptr) \ + c = *eptr++; \ + if (c >= 0xc0) { \ + int gcaa = _pcre_utf8_table4[c & 0x3f]; \ + int gcss = 6 * gcaa; \ + c = (c & 
_pcre_utf8_table3[gcaa]) << gcss; \ + while (gcaa-- > 0) { \ + gcss -= 6; \ + c |= (*eptr++ & 0x3f) << gcss; \ + } \ } - -#define GETCHARLEN(c, eptr, len) \ - c = *eptr; \ - if (c >= 0xc0) \ - { \ - int gcii; \ - int gcaa = _pcre_utf8_table4[c & 0x3f]; \ - int gcss = 6*gcaa; \ - c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ - for (gcii = 1; gcii <= gcaa; gcii++) \ - { \ - gcss -= 6; \ - c |= (eptr[gcii] & 0x3f) << gcss; \ - } \ - len += gcaa; \ +#define GETCHARINCTEST(c, eptr) \ + c = *eptr++; \ + if (utf8 && c >= 0xc0) { \ + int gcaa = _pcre_utf8_table4[c & 0x3f]; \ + int gcss = 6 * gcaa; \ + c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ + while (gcaa-- > 0) { \ + gcss -= 6; \ + c |= (*eptr++ & 0x3f) << gcss; \ + } \ } +#define GETCHARLEN(c, eptr, len) \ + c = *eptr; \ + if (c >= 0xc0) { \ + int gcii; \ + int gcaa = _pcre_utf8_table4[c & 0x3f]; \ + int gcss = 6 * gcaa; \ + c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ + for (gcii = 1; gcii <= gcaa; gcii++) { \ + gcss -= 6; \ + c |= (eptr[gcii] & 0x3f) << gcss; \ + } \ + len += gcaa; \ + } -#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr-- +#define BACKCHAR(eptr) \ + while ((*eptr & 0xc0) == 0x80) \ + eptr-- #endif - #ifndef offsetof -#define offsetof(p_type,field) ((size_t)&(((p_type *)0)->field)) +#define offsetof(p_type, field) ((size_t) & (((p_type*)0)->field)) #endif +#define PCRE_IMS (PCRE_CASELESS | PCRE_MULTILINE | PCRE_DOTALL) -#define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL) +#define PCRE_NOPARTIAL 0x0001 +#define PCRE_FIRSTSET 0x0002 +#define PCRE_REQCHSET 0x0004 +#define PCRE_STARTLINE 0x0008 +#define PCRE_JCHANGED 0x0010 +#define PCRE_HASCRORLF 0x0020 +#define PCRE_STUDY_MAPPED 0x01 -#define PCRE_NOPARTIAL 0x0001 -#define PCRE_FIRSTSET 0x0002 -#define PCRE_REQCHSET 0x0004 -#define PCRE_STARTLINE 0x0008 -#define PCRE_JCHANGED 0x0010 -#define PCRE_HASCRORLF 0x0020 +#define PCRE_NEWLINE_BITS \ + (PCRE_NEWLINE_CR | PCRE_NEWLINE_LF | PCRE_NEWLINE_ANY | \ + PCRE_NEWLINE_ANYCRLF) +#define 
PUBLIC_OPTIONS \ + (PCRE_CASELESS | PCRE_EXTENDED | PCRE_ANCHORED | PCRE_MULTILINE | \ + PCRE_DOTALL | PCRE_DOLLAR_ENDONLY | PCRE_EXTRA | PCRE_UNGREEDY | \ + PCRE_UTF8 | PCRE_NO_AUTO_CAPTURE | PCRE_NO_UTF8_CHECK | \ + PCRE_AUTO_CALLOUT | PCRE_FIRSTLINE | PCRE_DUPNAMES | PCRE_NEWLINE_BITS | \ + PCRE_BSR_ANYCRLF | PCRE_BSR_UNICODE) -#define PCRE_STUDY_MAPPED 0x01 +#define PUBLIC_EXEC_OPTIONS \ + (PCRE_ANCHORED | PCRE_NOTBOL | PCRE_NOTEOL | PCRE_NOTEMPTY | \ + PCRE_NO_UTF8_CHECK | PCRE_PARTIAL | PCRE_NEWLINE_BITS | \ + PCRE_BSR_ANYCRLF | PCRE_BSR_UNICODE) - -#define PCRE_NEWLINE_BITS (PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|PCRE_NEWLINE_ANY| \ - PCRE_NEWLINE_ANYCRLF) - -#define PUBLIC_OPTIONS \ - (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \ - PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \ - PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE| \ - PCRE_DUPNAMES|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE) - -#define PUBLIC_EXEC_OPTIONS \ - (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \ - PCRE_PARTIAL|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE) - -#define PUBLIC_DFA_EXEC_OPTIONS \ - (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \ - PCRE_PARTIAL|PCRE_DFA_SHORTEST|PCRE_DFA_RESTART|PCRE_NEWLINE_BITS| \ - PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE) +#define PUBLIC_DFA_EXEC_OPTIONS \ + (PCRE_ANCHORED | PCRE_NOTBOL | PCRE_NOTEOL | PCRE_NOTEMPTY | \ + PCRE_NO_UTF8_CHECK | PCRE_PARTIAL | PCRE_DFA_SHORTEST | \ + PCRE_DFA_RESTART | PCRE_NEWLINE_BITS | PCRE_BSR_ANYCRLF | \ + PCRE_BSR_UNICODE) #define PUBLIC_STUDY_OPTIONS 0 - -#define MAGIC_NUMBER 0x50435245UL - +#define MAGIC_NUMBER 0x50435245UL #define REQ_UNSET (-2) -#define REQ_NONE (-1) - +#define REQ_NONE (-1) #define REQ_BYTE_MAX 1000 @@ -342,14 +289,12 @@ else variable-length repeat, or a anything other than literal characters. 
*/ #define REQ_CASELESS 0x0100 -#define REQ_VARY 0x0200 - +#define REQ_VARY 0x0200 typedef int BOOL; -#define FALSE 0 -#define TRUE 1 - +#define FALSE 0 +#define TRUE 1 #ifndef ESC_e #define ESC_e 27 @@ -367,27 +312,24 @@ typedef int BOOL; #define ESC_r '\r' #endif - #ifndef ESC_tee #define ESC_tee '\t' #endif +#define PT_ANY 0 +#define PT_LAMP 1 +#define PT_GC 2 +#define PT_PC 3 +#define PT_SC 4 -#define PT_ANY 0 -#define PT_LAMP 1 -#define PT_GC 2 -#define PT_PC 3 -#define PT_SC 4 +#define XCL_NOT 0x01 +#define XCL_MAP 0x02 - -#define XCL_NOT 0x01 -#define XCL_MAP 0x02 - -#define XCL_END 0 -#define XCL_SINGLE 1 -#define XCL_RANGE 2 -#define XCL_PROP 3 -#define XCL_NOTPROP 4 +#define XCL_END 0 +#define XCL_SINGLE 1 +#define XCL_RANGE 2 +#define XCL_PROP 3 +#define XCL_NOTPROP 4 /* These are escaped items that aren't just an encoding of a particular data value such as \n. They must have non-zero values, as check_escape() returns @@ -400,10 +342,35 @@ ESC_Z to detect the types that may be repeated. These are the types that consume characters. If any new escapes are put in between that don't consume a character, that code will have to change. */ -enum { ESC_A = 1, ESC_G, ESC_K, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, - ESC_W, ESC_w, ESC_dum1, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H, ESC_h, - ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z, ESC_E, ESC_Q, ESC_k, ESC_REF }; - +enum { + ESC_A = 1, + ESC_G, + ESC_K, + ESC_B, + ESC_b, + ESC_D, + ESC_d, + ESC_S, + ESC_s, + ESC_W, + ESC_w, + ESC_dum1, + ESC_C, + ESC_P, + ESC_p, + ESC_R, + ESC_H, + ESC_h, + ESC_V, + ESC_v, + ESC_X, + ESC_Z, + ESC_z, + ESC_E, + ESC_Q, + ESC_k, + ESC_REF +}; /* Opcode table: Starting from 1 (i.e. after OP_END), the values up to OP_EOD must correspond in order to the list of escapes immediately above. @@ -413,245 +380,250 @@ that follow must also be updated to match. There is also a table called "coptable" in pcre_dfa_exec.c that must be updated. 
*/ enum { - OP_END, + OP_END, + OP_SOD, + OP_SOM, + OP_SET_SOM, + OP_NOT_WORD_BOUNDARY, + OP_WORD_BOUNDARY, + OP_NOT_DIGIT, + OP_DIGIT, + OP_NOT_WHITESPACE, + OP_WHITESPACE, + OP_NOT_WORDCHAR, + OP_WORDCHAR, + OP_ANY, + OP_ANYBYTE, + OP_NOTPROP, + OP_PROP, + OP_ANYNL, + OP_NOT_HSPACE, + OP_HSPACE, + OP_NOT_VSPACE, + OP_VSPACE, + OP_EXTUNI, + OP_EODN, + OP_EOD, + OP_OPT, + OP_CIRC, + OP_DOLL, + OP_CHAR, + OP_CHARNC, + OP_NOT, - OP_SOD, - OP_SOM, - OP_SET_SOM, - OP_NOT_WORD_BOUNDARY, - OP_WORD_BOUNDARY, - OP_NOT_DIGIT, - OP_DIGIT, - OP_NOT_WHITESPACE, - OP_WHITESPACE, - OP_NOT_WORDCHAR, - OP_WORDCHAR, - OP_ANY, - OP_ANYBYTE, - OP_NOTPROP, - OP_PROP, - OP_ANYNL, - OP_NOT_HSPACE, - OP_HSPACE, - OP_NOT_VSPACE, - OP_VSPACE, - OP_EXTUNI, - OP_EODN, - OP_EOD, + OP_STAR, + OP_MINSTAR, + OP_PLUS, + OP_MINPLUS, + OP_QUERY, + OP_MINQUERY, - OP_OPT, - OP_CIRC, - OP_DOLL, - OP_CHAR, - OP_CHARNC, - OP_NOT, + OP_UPTO, + OP_MINUPTO, + OP_EXACT, - OP_STAR, - OP_MINSTAR, - OP_PLUS, - OP_MINPLUS, - OP_QUERY, - OP_MINQUERY, + OP_POSSTAR, + OP_POSPLUS, + OP_POSQUERY, + OP_POSUPTO, - OP_UPTO, - OP_MINUPTO, - OP_EXACT, + OP_NOTSTAR, + OP_NOTMINSTAR, + OP_NOTPLUS, + OP_NOTMINPLUS, + OP_NOTQUERY, + OP_NOTMINQUERY, - OP_POSSTAR, - OP_POSPLUS, - OP_POSQUERY, - OP_POSUPTO, + OP_NOTUPTO, + OP_NOTMINUPTO, + OP_NOTEXACT, - OP_NOTSTAR, - OP_NOTMINSTAR, - OP_NOTPLUS, - OP_NOTMINPLUS, - OP_NOTQUERY, - OP_NOTMINQUERY, + OP_NOTPOSSTAR, + OP_NOTPOSPLUS, + OP_NOTPOSQUERY, + OP_NOTPOSUPTO, - OP_NOTUPTO, - OP_NOTMINUPTO, - OP_NOTEXACT, + OP_TYPESTAR, + OP_TYPEMINSTAR, + OP_TYPEPLUS, + OP_TYPEMINPLUS, + OP_TYPEQUERY, + OP_TYPEMINQUERY, - OP_NOTPOSSTAR, - OP_NOTPOSPLUS, - OP_NOTPOSQUERY, - OP_NOTPOSUPTO, + OP_TYPEUPTO, + OP_TYPEMINUPTO, + OP_TYPEEXACT, - OP_TYPESTAR, - OP_TYPEMINSTAR, - OP_TYPEPLUS, - OP_TYPEMINPLUS, - OP_TYPEQUERY, - OP_TYPEMINQUERY, + OP_TYPEPOSSTAR, + OP_TYPEPOSPLUS, + OP_TYPEPOSQUERY, + OP_TYPEPOSUPTO, - OP_TYPEUPTO, - OP_TYPEMINUPTO, - OP_TYPEEXACT, + OP_CRSTAR, + OP_CRMINSTAR, + 
OP_CRPLUS, + OP_CRMINPLUS, + OP_CRQUERY, + OP_CRMINQUERY, + OP_CRRANGE, + OP_CRMINRANGE, - OP_TYPEPOSSTAR, - OP_TYPEPOSPLUS, - OP_TYPEPOSQUERY, - OP_TYPEPOSUPTO, + OP_CLASS, + OP_NCLASS, /* 78 Same, but the bitmap was created from a negative + class - the difference is relevant only when a UTF-8 + character > 255 is encountered. */ - OP_CRSTAR, - OP_CRMINSTAR, - OP_CRPLUS, - OP_CRMINPLUS, - OP_CRQUERY, - OP_CRMINQUERY, - OP_CRRANGE, - OP_CRMINRANGE, + OP_XCLASS, - OP_CLASS, - OP_NCLASS, /* 78 Same, but the bitmap was created from a negative - class - the difference is relevant only when a UTF-8 - character > 255 is encountered. */ + OP_REF, + OP_RECURSE, + OP_CALLOUT, - OP_XCLASS, + OP_ALT, + OP_KET, + OP_KETRMAX, + OP_KETRMIN, - OP_REF, - OP_RECURSE, - OP_CALLOUT, + OP_ASSERT, + OP_ASSERT_NOT, + OP_ASSERTBACK, + OP_ASSERTBACK_NOT, + OP_REVERSE, - OP_ALT, - OP_KET, - OP_KETRMAX, - OP_KETRMIN, + /* ONCE, BRA, CBRA, and COND must come after the assertions, with ONCE + first, as there's a test for >= ONCE for a subpattern that isn't an + assertion. */ + OP_ONCE, + OP_BRA, + OP_CBRA, + OP_COND, + /* These three must follow the previous three, in the same order. There's a + check for >= SBRA to distinguish the two sets. */ - OP_ASSERT, - OP_ASSERT_NOT, - OP_ASSERTBACK, - OP_ASSERTBACK_NOT, - OP_REVERSE, + OP_SBRA, + OP_SCBRA, + OP_SCOND, - /* ONCE, BRA, CBRA, and COND must come after the assertions, with ONCE first, - as there's a test for >= ONCE for a subpattern that isn't an assertion. */ + OP_CREF, + OP_RREF, + OP_DEF, - OP_ONCE, - OP_BRA, - OP_CBRA, - OP_COND, + OP_BRAZERO, + OP_BRAMINZERO, - /* These three must follow the previous three, in the same order. There's a - check for >= SBRA to distinguish the two sets. 
*/ + OP_PRUNE, + OP_SKIP, + OP_THEN, + OP_COMMIT, - OP_SBRA, - OP_SCBRA, - OP_SCOND, - - OP_CREF, - OP_RREF, - OP_DEF, - - OP_BRAZERO, - OP_BRAMINZERO, - - - - OP_PRUNE, - OP_SKIP, - OP_THEN, - OP_COMMIT, - - - - OP_FAIL, - OP_ACCEPT + OP_FAIL, + OP_ACCEPT }; +#define OP_NAME_LIST \ + "End", "\\A", "\\G", "\\K", "\\B", "\\b", "\\D", "\\d", "\\S", "\\s", \ + "\\W", "\\w", "Any", "Anybyte", "notprop", "prop", "\\R", "\\H", \ + "\\h", "\\V", "\\v", "extuni", "\\Z", "\\z", "Opt", "^", "$", "char", \ + "charnc", "not", "*", "*?", "+", "+?", "?", "??", "{", "{", "{", "*+", \ + "++", "?+", "{", "*", "*?", "+", "+?", "?", "??", "{", "{", "{", "*+", \ + "++", "?+", "{", "*", "*?", "+", "+?", "?", "??", "{", "{", "{", "*+", \ + "++", "?+", "{", "*", "*?", "+", "+?", "?", "??", "{", "{", "class", \ + "nclass", "xclass", "Ref", "Recurse", "Callout", "Alt", "Ket", \ + "KetRmax", "KetRmin", "Assert", "Assert not", "AssertB", \ + "AssertB not", "Reverse", "Once", "Bra", "CBra", "Cond", "SBra", \ + "SCBra", "SCond", "Cond ref", "Cond rec", "Cond def", "Brazero", \ + "Braminzero", "*PRUNE", "*SKIP", "*THEN", "*COMMIT", "*FAIL", \ + "*ACCEPT" -#define OP_NAME_LIST \ - "End", "\\A", "\\G", "\\K", "\\B", "\\b", "\\D", "\\d", \ - "\\S", "\\s", "\\W", "\\w", "Any", "Anybyte", \ - "notprop", "prop", "\\R", "\\H", "\\h", "\\V", "\\v", \ - "extuni", "\\Z", "\\z", \ - "Opt", "^", "$", "char", "charnc", "not", \ - "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \ - "*+","++", "?+", "{", \ - "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \ - "*+","++", "?+", "{", \ - "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \ - "*+","++", "?+", "{", \ - "*", "*?", "+", "+?", "?", "??", "{", "{", \ - "class", "nclass", "xclass", "Ref", "Recurse", "Callout", \ - "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", \ - "AssertB", "AssertB not", "Reverse", \ - "Once", "Bra", "CBra", "Cond", "SBra", "SCBra", "SCond", \ - "Cond ref", "Cond rec", "Cond def", "Brazero", "Braminzero", \ - "*PRUNE", 
"*SKIP", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT" +#define OP_LENGTHS \ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 2, \ + 1, 1, 2, 2, 2, \ + \ + 2, 2, 2, 2, 2, 2, 4, 4, 4, 2, 2, 2, 4, \ + \ + 2, 2, 2, 2, 2, 2, 4, 4, 4, 2, 2, 2, 4, \ + \ + 2, 2, 2, 2, 2, 2, 4, 4, 4, 2, 2, 2, 4, \ + \ + 1, 1, 1, 1, 1, 1, 5, 5, 33, 33, 0, 3, 1 + LINK_SIZE, \ + 2 + 2 * LINK_SIZE, 1 + LINK_SIZE, 1 + LINK_SIZE, 1 + LINK_SIZE, \ + 1 + LINK_SIZE, 1 + LINK_SIZE, 1 + LINK_SIZE, 1 + LINK_SIZE, \ + 1 + LINK_SIZE, 1 + LINK_SIZE, 1 + LINK_SIZE, 1 + LINK_SIZE, \ + 3 + LINK_SIZE, 1 + LINK_SIZE, 1 + LINK_SIZE, 3 + LINK_SIZE, \ + 1 + LINK_SIZE, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1 +#define RREF_ANY 0xffff -#define OP_LENGTHS \ - 1, \ - 1, 1, 1, 1, 1, \ - 1, 1, 1, 1, 1, 1, \ - 1, 1, \ - 3, 3, 1, \ - 1, 1, 1, 1, 1, \ - 1, 1, 2, 1, 1, \ - 2, \ - 2, \ - 2, \ - \ - 2, 2, 2, 2, 2, 2, \ - 4, 4, 4, \ - 2, 2, 2, 4, \ - \ - 2, 2, 2, 2, 2, 2, \ - 4, 4, 4, \ - 2, 2, 2, 4, \ - \ - 2, 2, 2, 2, 2, 2, \ - 4, 4, 4, \ - 2, 2, 2, 4, \ - \ - 1, 1, 1, 1, 1, 1, \ - 5, 5, \ - 33, \ - 33, \ - 0, \ - 3, \ - 1+LINK_SIZE, \ - 2+2*LINK_SIZE, \ - 1+LINK_SIZE, \ - 1+LINK_SIZE, \ - 1+LINK_SIZE, \ - 1+LINK_SIZE, \ - 1+LINK_SIZE, \ - 1+LINK_SIZE, \ - 1+LINK_SIZE, \ - 1+LINK_SIZE, \ - 1+LINK_SIZE, \ - 1+LINK_SIZE, \ - 1+LINK_SIZE, \ - 3+LINK_SIZE, \ - 1+LINK_SIZE, \ - 1+LINK_SIZE, \ - 3+LINK_SIZE, \ - 1+LINK_SIZE, \ - 3, \ - 3, \ - 1, \ - 1, 1, \ - 1, 1, 1, 1, \ - 1, 1 - - -#define RREF_ANY 0xffff - - -enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, - ERR10, ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, - ERR20, ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, - ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, - ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, - ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, - ERR60, ERR61, ERR62, ERR63 }; +enum { + ERR0, + ERR1, + ERR2, + ERR3, + ERR4, + ERR5, + 
ERR6, + ERR7, + ERR8, + ERR9, + ERR10, + ERR11, + ERR12, + ERR13, + ERR14, + ERR15, + ERR16, + ERR17, + ERR18, + ERR19, + ERR20, + ERR21, + ERR22, + ERR23, + ERR24, + ERR25, + ERR26, + ERR27, + ERR28, + ERR29, + ERR30, + ERR31, + ERR32, + ERR33, + ERR34, + ERR35, + ERR36, + ERR37, + ERR38, + ERR39, + ERR40, + ERR41, + ERR42, + ERR43, + ERR44, + ERR45, + ERR46, + ERR47, + ERR48, + ERR49, + ERR50, + ERR51, + ERR52, + ERR53, + ERR54, + ERR55, + ERR56, + ERR57, + ERR58, + ERR59, + ERR60, + ERR61, + ERR62, + ERR63 +}; /* The real format of the start of the pcre block; the index of names and the code vector run on as long as necessary after the end. We store an explicit @@ -671,172 +643,160 @@ NOTE NOTE NOTE: */ typedef struct real_pcre { - pcre_uint32 magic_number; - pcre_uint32 size; - pcre_uint32 options; - pcre_uint16 flags; - pcre_uint16 dummy1; - pcre_uint16 top_bracket; - pcre_uint16 top_backref; - pcre_uint16 first_byte; - pcre_uint16 req_byte; - pcre_uint16 name_table_offset; - pcre_uint16 name_entry_size; - pcre_uint16 name_count; - pcre_uint16 ref_count; + pcre_uint32 magic_number; + pcre_uint32 size; + pcre_uint32 options; + pcre_uint16 flags; + pcre_uint16 dummy1; + pcre_uint16 top_bracket; + pcre_uint16 top_backref; + pcre_uint16 first_byte; + pcre_uint16 req_byte; + pcre_uint16 name_table_offset; + pcre_uint16 name_entry_size; + pcre_uint16 name_count; + pcre_uint16 ref_count; - const unsigned char *tables; - const unsigned char *nullpad; + const unsigned char* tables; + const unsigned char* nullpad; } real_pcre; - typedef struct pcre_study_data { - pcre_uint32 size; - pcre_uint32 options; - uschar start_bits[32]; + pcre_uint32 size; + pcre_uint32 options; + uschar start_bits[32]; } pcre_study_data; - typedef struct compile_data { - const uschar *lcc; - const uschar *fcc; - const uschar *cbits; - const uschar *ctypes; - const uschar *start_workspace; - const uschar *start_code; - const uschar *start_pattern; - const uschar *end_pattern; - uschar *hwm; - 
uschar *name_table; - int names_found; - int name_entry_size; - int bracount; - int final_bracount; - int top_backref; - unsigned int backref_map; - int external_options; - int external_flags; - int req_varyopt; - BOOL had_accept; - int nltype; - int nllen; - uschar nl[4]; + const uschar* lcc; + const uschar* fcc; + const uschar* cbits; + const uschar* ctypes; + const uschar* start_workspace; + const uschar* start_code; + const uschar* start_pattern; + const uschar* end_pattern; + uschar* hwm; + uschar* name_table; + int names_found; + int name_entry_size; + int bracount; + int final_bracount; + int top_backref; + unsigned int backref_map; + int external_options; + int external_flags; + int req_varyopt; + BOOL had_accept; + int nltype; + int nllen; + uschar nl[4]; } compile_data; - typedef struct branch_chain { - struct branch_chain *outer; - uschar *current; + struct branch_chain* outer; + uschar* current; } branch_chain; - typedef struct recursion_info { - struct recursion_info *prevrec; - int group_num; - const uschar *after_call; - USPTR save_start; - int *offset_save; - int saved_max; + struct recursion_info* prevrec; + int group_num; + const uschar* after_call; + USPTR save_start; + int* offset_save; + int saved_max; } recursion_info; - typedef struct eptrblock { - struct eptrblock *epb_prev; - USPTR epb_saved_eptr; + struct eptrblock* epb_prev; + USPTR epb_saved_eptr; } eptrblock; - typedef struct match_data { - unsigned long int match_call_count; - unsigned long int match_limit; - unsigned long int match_limit_recursion; - int *offset_vector; - int offset_end; - int offset_max; - int nltype; - int nllen; - uschar nl[4]; - const uschar *lcc; - const uschar *ctypes; - BOOL offset_overflow; - BOOL notbol; - BOOL noteol; - BOOL utf8; - BOOL endonly; - BOOL notempty; - BOOL partial; - BOOL hitend; - BOOL bsr_anycrlf; - const uschar *start_code; - USPTR start_subject; - USPTR end_subject; - USPTR start_match_ptr; - USPTR end_match_ptr; - int end_offset_top; - int 
capture_last; - int start_offset; - eptrblock *eptrchain; - int eptrn; - recursion_info *recursive; - void *callout_data; + unsigned long int match_call_count; + unsigned long int match_limit; + unsigned long int match_limit_recursion; + int* offset_vector; + int offset_end; + int offset_max; + int nltype; + int nllen; + uschar nl[4]; + const uschar* lcc; + const uschar* ctypes; + BOOL offset_overflow; + BOOL notbol; + BOOL noteol; + BOOL utf8; + BOOL endonly; + BOOL notempty; + BOOL partial; + BOOL hitend; + BOOL bsr_anycrlf; + const uschar* start_code; + USPTR start_subject; + USPTR end_subject; + USPTR start_match_ptr; + USPTR end_match_ptr; + int end_offset_top; + int capture_last; + int start_offset; + eptrblock* eptrchain; + int eptrn; + recursion_info* recursive; + void* callout_data; } match_data; - typedef struct dfa_match_data { - const uschar *start_code; - const uschar *start_subject; - const uschar *end_subject; - const uschar *tables; - int moptions; - int poptions; - int nltype; - int nllen; - uschar nl[4]; - void *callout_data; + const uschar* start_code; + const uschar* start_subject; + const uschar* end_subject; + const uschar* tables; + int moptions; + int poptions; + int nltype; + int nllen; + uschar nl[4]; + void* callout_data; } dfa_match_data; +#define ctype_space 0x01 +#define ctype_letter 0x02 +#define ctype_digit 0x04 +#define ctype_xdigit 0x08 +#define ctype_word 0x10 +#define ctype_meta 0x80 -#define ctype_space 0x01 -#define ctype_letter 0x02 -#define ctype_digit 0x04 -#define ctype_xdigit 0x08 -#define ctype_word 0x10 -#define ctype_meta 0x80 +#define cbit_space 0 +#define cbit_xdigit 32 +#define cbit_digit 64 +#define cbit_upper 96 +#define cbit_lower 128 +#define cbit_word 160 +#define cbit_graph 192 +#define cbit_print 224 +#define cbit_punct 256 +#define cbit_cntrl 288 +#define cbit_length 320 - -#define cbit_space 0 -#define cbit_xdigit 32 -#define cbit_digit 64 -#define cbit_upper 96 -#define cbit_lower 128 -#define cbit_word 160 
-#define cbit_graph 192 -#define cbit_print 224 -#define cbit_punct 256 -#define cbit_cntrl 288 -#define cbit_length 320 - - -#define lcc_offset 0 -#define fcc_offset 256 -#define cbits_offset 512 +#define lcc_offset 0 +#define fcc_offset 256 +#define cbits_offset 512 #define ctypes_offset (cbits_offset + cbit_length) #define tables_length (ctypes_offset + 256) - typedef struct { - pcre_uint16 name_offset; - pcre_uint16 type; - pcre_uint16 value; + pcre_uint16 name_offset; + pcre_uint16 type; + pcre_uint16 value; } ucp_type_table; - -extern const int _pcre_utf8_table1[]; -extern const int _pcre_utf8_table2[]; -extern const int _pcre_utf8_table3[]; +extern const int _pcre_utf8_table1[]; +extern const int _pcre_utf8_table2[]; +extern const int _pcre_utf8_table3[]; extern const uschar _pcre_utf8_table4[]; -extern const int _pcre_utf8_table1_size; +extern const int _pcre_utf8_table1_size; extern const ucp_type_table _pcre_utt[]; extern const int _pcre_utt_size; @@ -845,17 +805,14 @@ extern const uschar _pcre_default_tables[]; extern const uschar _pcre_OP_lengths[]; - -extern BOOL _pcre_is_newline(const uschar *, int, const uschar *, - int *, BOOL); -extern int _pcre_ord2utf8(int, uschar *); -extern real_pcre *_pcre_try_flipped(const real_pcre *, real_pcre *, - const pcre_study_data *, pcre_study_data *); -extern int _pcre_valid_utf8(const uschar *, int); -extern BOOL _pcre_was_newline(const uschar *, int, const uschar *, - int *, BOOL); -extern BOOL _pcre_xclass(int, const uschar *); +extern BOOL _pcre_is_newline(const uschar*, int, const uschar*, int*, BOOL); +extern int _pcre_ord2utf8(int, uschar*); +extern real_pcre* _pcre_try_flipped(const real_pcre*, + real_pcre*, + const pcre_study_data*, + pcre_study_data*); +extern int _pcre_valid_utf8(const uschar*, int); +extern BOOL _pcre_was_newline(const uschar*, int, const uschar*, int*, BOOL); +extern BOOL _pcre_xclass(int, const uschar*); #endif - - diff --git a/package/re/pcre_newline.c b/package/re/pcre_newline.c 
index 381705982..3f0eda35a 100644 --- a/package/re/pcre_newline.c +++ b/package/re/pcre_newline.c @@ -1,11 +1,9 @@ #include "re_config.h" #include "pcre_internal.h" - - /************************************************* -* Check for newline at given position * -*************************************************/ + * Check for newline at given position * + *************************************************/ /* It is guaranteed that the initial value of ptr is less than the end of the string that is being processed. @@ -20,42 +18,56 @@ Arguments: Returns: TRUE or FALSE */ -BOOL -_pcre_is_newline(const uschar *ptr, int type, const uschar *endptr, - int *lenptr, BOOL utf8) -{ -int c; -if (utf8) { GETCHAR(c, ptr); } else c = *ptr; +BOOL _pcre_is_newline(const uschar* ptr, + int type, + const uschar* endptr, + int* lenptr, + BOOL utf8) { + int c; + if (utf8) { + GETCHAR(c, ptr); + } else + c = *ptr; -if (type == NLTYPE_ANYCRLF) switch(c) - { - case 0x000a: *lenptr = 1; return TRUE; /* LF */ - case 0x000d: *lenptr = (ptr < endptr - 1 && ptr[1] == 0x0a)? 2 : 1; - return TRUE; /* CR */ - default: return FALSE; - } + if (type == NLTYPE_ANYCRLF) + switch (c) { + case 0x000a: + *lenptr = 1; + return TRUE; /* LF */ + case 0x000d: + *lenptr = (ptr < endptr - 1 && ptr[1] == 0x0a) ? 2 : 1; + return TRUE; /* CR */ + default: + return FALSE; + } -/* NLTYPE_ANY */ + /* NLTYPE_ANY */ -else switch(c) - { - case 0x000a: /* LF */ - case 0x000b: /* VT */ - case 0x000c: *lenptr = 1; return TRUE; /* FF */ - case 0x000d: *lenptr = (ptr < endptr - 1 && ptr[1] == 0x0a)? 2 : 1; - return TRUE; /* CR */ - case 0x0085: *lenptr = utf8? 2 : 1; return TRUE; /* NEL */ - case 0x2028: /* LS */ - case 0x2029: *lenptr = 3; return TRUE; /* PS */ - default: return FALSE; - } + else + switch (c) { + case 0x000a: /* LF */ + case 0x000b: /* VT */ + case 0x000c: + *lenptr = 1; + return TRUE; /* FF */ + case 0x000d: + *lenptr = (ptr < endptr - 1 && ptr[1] == 0x0a) ? 
2 : 1; + return TRUE; /* CR */ + case 0x0085: + *lenptr = utf8 ? 2 : 1; + return TRUE; /* NEL */ + case 0x2028: /* LS */ + case 0x2029: + *lenptr = 3; + return TRUE; /* PS */ + default: + return FALSE; + } } - - /************************************************* -* Check for newline at previous position * -*************************************************/ + * Check for newline at previous position * + *************************************************/ /* It is guaranteed that the initial value of ptr is greater than the start of the string that is being processed. @@ -70,43 +82,55 @@ Arguments: Returns: TRUE or FALSE */ -BOOL -_pcre_was_newline(const uschar *ptr, int type, const uschar *startptr, - int *lenptr, BOOL utf8) -{ -int c; -ptr--; +BOOL _pcre_was_newline(const uschar* ptr, + int type, + const uschar* startptr, + int* lenptr, + BOOL utf8) { + int c; + ptr--; #ifdef SUPPORT_UTF8 -if (utf8) - { - BACKCHAR(ptr); - GETCHAR(c, ptr); - } -else c = *ptr; -#else /* no UTF-8 support */ -c = *ptr; -#endif /* SUPPORT_UTF8 */ + if (utf8) { + BACKCHAR(ptr); + GETCHAR(c, ptr); + } else + c = *ptr; +#else /* no UTF-8 support */ + c = *ptr; +#endif /* SUPPORT_UTF8 */ -if (type == NLTYPE_ANYCRLF) switch(c) - { - case 0x000a: *lenptr = (ptr > startptr && ptr[-1] == 0x0d)? 2 : 1; - return TRUE; /* LF */ - case 0x000d: *lenptr = 1; return TRUE; /* CR */ - default: return FALSE; - } + if (type == NLTYPE_ANYCRLF) + switch (c) { + case 0x000a: + *lenptr = (ptr > startptr && ptr[-1] == 0x0d) ? 2 : 1; + return TRUE; /* LF */ + case 0x000d: + *lenptr = 1; + return TRUE; /* CR */ + default: + return FALSE; + } -else switch(c) - { - case 0x000a: *lenptr = (ptr > startptr && ptr[-1] == 0x0d)? 2 : 1; - return TRUE; /* LF */ - case 0x000b: /* VT */ - case 0x000c: /* FF */ - case 0x000d: *lenptr = 1; return TRUE; /* CR */ - case 0x0085: *lenptr = utf8? 
2 : 1; return TRUE; /* NEL */ - case 0x2028: /* LS */ - case 0x2029: *lenptr = 3; return TRUE; /* PS */ - default: return FALSE; - } + else + switch (c) { + case 0x000a: + *lenptr = (ptr > startptr && ptr[-1] == 0x0d) ? 2 : 1; + return TRUE; /* LF */ + case 0x000b: /* VT */ + case 0x000c: /* FF */ + case 0x000d: + *lenptr = 1; + return TRUE; /* CR */ + case 0x0085: + *lenptr = utf8 ? 2 : 1; + return TRUE; /* NEL */ + case 0x2028: /* LS */ + case 0x2029: + *lenptr = 3; + return TRUE; /* PS */ + default: + return FALSE; + } } /* End of pcre_newline.c */ diff --git a/package/re/pcre_ord2utf8.c b/package/re/pcre_ord2utf8.c index ace40064a..1f0bb881a 100644 --- a/package/re/pcre_ord2utf8.c +++ b/package/re/pcre_ord2utf8.c @@ -1,10 +1,9 @@ #include "re_config.h" #include "pcre_internal.h" - /************************************************* -* Convert character value to UTF-8 * -*************************************************/ + * Convert character value to UTF-8 * + *************************************************/ /* This function takes an integer value in the range 0 - 0x7fffffff and encodes it as a UTF-8 character in 0 to 6 bytes. 
@@ -16,24 +15,22 @@ Arguments: Returns: number of characters placed in the buffer */ -int -_pcre_ord2utf8(int cvalue, uschar *buffer) -{ +int _pcre_ord2utf8(int cvalue, uschar* buffer) { #ifdef SUPPORT_UTF8 -register int i, j; -for (i = 0; i < _pcre_utf8_table1_size; i++) - if (cvalue <= _pcre_utf8_table1[i]) break; -buffer += i; -for (j = i; j > 0; j--) - { - *buffer-- = 0x80 | (cvalue & 0x3f); - cvalue >>= 6; - } -*buffer = _pcre_utf8_table2[i] | cvalue; -return i + 1; + register int i, j; + for (i = 0; i < _pcre_utf8_table1_size; i++) + if (cvalue <= _pcre_utf8_table1[i]) + break; + buffer += i; + for (j = i; j > 0; j--) { + *buffer-- = 0x80 | (cvalue & 0x3f); + cvalue >>= 6; + } + *buffer = _pcre_utf8_table2[i] | cvalue; + return i + 1; #else -return 0; /* Keep compiler happy; this function won't ever be */ -#endif /* called when SUPPORT_UTF8 is not defined. */ + return 0; /* Keep compiler happy; this function won't ever be */ +#endif /* called when SUPPORT_UTF8 is not defined. */ } /* End of pcre_ord2utf8.c */ diff --git a/package/re/pcre_tables.c b/package/re/pcre_tables.c index b7bd9fb5a..04e0d4f04 100644 --- a/package/re/pcre_tables.c +++ b/package/re/pcre_tables.c @@ -2,40 +2,36 @@ #include "re_config.h" #include "pcre_internal.h" - /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that the definition is next to the definition of the opcodes in pcre_internal.h. */ -const uschar _pcre_OP_lengths[] = { OP_LENGTHS }; - - +const uschar _pcre_OP_lengths[] = {OP_LENGTHS}; /************************************************* -* Tables for UTF-8 support * -*************************************************/ + * Tables for UTF-8 support * + *************************************************/ /* These are the breakpoints for different numbers of bytes in a UTF-8 character. 
*/ #ifdef SUPPORT_UTF8 -const int _pcre_utf8_table1[] = - { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff}; +const int _pcre_utf8_table1[] = {0x7f, 0x7ff, 0xffff, + 0x1fffff, 0x3ffffff, 0x7fffffff}; -const int _pcre_utf8_table1_size = sizeof(_pcre_utf8_table1)/sizeof(int); +const int _pcre_utf8_table1_size = sizeof(_pcre_utf8_table1) / sizeof(int); /* These are the indicator bits and the mask for the data bits to set in the first byte of a character, indexed by the number of additional bytes. */ -const int _pcre_utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc}; -const int _pcre_utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01}; +const int _pcre_utf8_table2[] = {0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc}; +const int _pcre_utf8_table3[] = {0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01}; /* Table of the number of extra bytes, indexed by the first byte masked with 0x3f. The highest number for a valid UTF-8 first byte is in fact 0x3d. */ const uschar _pcre_utf8_table4[] = { - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 }; + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5}; #endif diff --git a/package/re/pcre_try_flipped.c b/package/re/pcre_try_flipped.c index a60506d29..e21e8fa22 100644 --- a/package/re/pcre_try_flipped.c +++ b/package/re/pcre_try_flipped.c @@ -2,10 +2,9 @@ #include "re_config.h" #include "pcre_internal.h" - /************************************************* -* Flip bytes in an integer * -*************************************************/ + * Flip bytes in an integer * + *************************************************/ /* This function is called when the magic number in a regex doesn't match, in order to flip its bytes to see if we are dealing with a pattern that was @@ -19,21 +18,16 @@ Arguments: Returns: the flipped value 
*/ -static unsigned long int -byteflip(unsigned long int value, int n) -{ -if (n == 2) return ((value & 0x00ff) << 8) | ((value & 0xff00) >> 8); -return ((value & 0x000000ff) << 24) | - ((value & 0x0000ff00) << 8) | - ((value & 0x00ff0000) >> 8) | - ((value & 0xff000000) >> 24); +static unsigned long int byteflip(unsigned long int value, int n) { + if (n == 2) + return ((value & 0x00ff) << 8) | ((value & 0xff00) >> 8); + return ((value & 0x000000ff) << 24) | ((value & 0x0000ff00) << 8) | + ((value & 0x00ff0000) >> 8) | ((value & 0xff000000) >> 24); } - - /************************************************* -* Test for a byte-flipped compiled regex * -*************************************************/ + * Test for a byte-flipped compiled regex * + *************************************************/ /* This function is called from pcre_exec(), pcre_dfa_exec(), and also from pcre_fullinfo(). Its job is to test whether the regex is byte-flipped - that @@ -51,40 +45,40 @@ Returns: the new block if is is indeed a byte-flipped regex NULL if it is not */ -real_pcre * -_pcre_try_flipped(const real_pcre *re, real_pcre *internal_re, - const pcre_study_data *study, pcre_study_data *internal_study) -{ -if (byteflip(re->magic_number, sizeof(re->magic_number)) != MAGIC_NUMBER) - return NULL; +real_pcre* _pcre_try_flipped(const real_pcre* re, + real_pcre* internal_re, + const pcre_study_data* study, + pcre_study_data* internal_study) { + if (byteflip(re->magic_number, sizeof(re->magic_number)) != MAGIC_NUMBER) + return NULL; -*internal_re = *re; /* To copy other fields */ -internal_re->size = byteflip(re->size, sizeof(re->size)); -internal_re->options = byteflip(re->options, sizeof(re->options)); -internal_re->flags = (pcre_uint16)byteflip(re->flags, sizeof(re->flags)); -internal_re->top_bracket = - (pcre_uint16)byteflip(re->top_bracket, sizeof(re->top_bracket)); -internal_re->top_backref = - (pcre_uint16)byteflip(re->top_backref, sizeof(re->top_backref)); -internal_re->first_byte = 
- (pcre_uint16)byteflip(re->first_byte, sizeof(re->first_byte)); -internal_re->req_byte = - (pcre_uint16)byteflip(re->req_byte, sizeof(re->req_byte)); -internal_re->name_table_offset = - (pcre_uint16)byteflip(re->name_table_offset, sizeof(re->name_table_offset)); -internal_re->name_entry_size = - (pcre_uint16)byteflip(re->name_entry_size, sizeof(re->name_entry_size)); -internal_re->name_count = - (pcre_uint16)byteflip(re->name_count, sizeof(re->name_count)); + *internal_re = *re; /* To copy other fields */ + internal_re->size = byteflip(re->size, sizeof(re->size)); + internal_re->options = byteflip(re->options, sizeof(re->options)); + internal_re->flags = (pcre_uint16)byteflip(re->flags, sizeof(re->flags)); + internal_re->top_bracket = + (pcre_uint16)byteflip(re->top_bracket, sizeof(re->top_bracket)); + internal_re->top_backref = + (pcre_uint16)byteflip(re->top_backref, sizeof(re->top_backref)); + internal_re->first_byte = + (pcre_uint16)byteflip(re->first_byte, sizeof(re->first_byte)); + internal_re->req_byte = + (pcre_uint16)byteflip(re->req_byte, sizeof(re->req_byte)); + internal_re->name_table_offset = (pcre_uint16)byteflip( + re->name_table_offset, sizeof(re->name_table_offset)); + internal_re->name_entry_size = + (pcre_uint16)byteflip(re->name_entry_size, sizeof(re->name_entry_size)); + internal_re->name_count = + (pcre_uint16)byteflip(re->name_count, sizeof(re->name_count)); -if (study != NULL) - { - *internal_study = *study; /* To copy other fields */ - internal_study->size = byteflip(study->size, sizeof(study->size)); - internal_study->options = byteflip(study->options, sizeof(study->options)); - } + if (study != NULL) { + *internal_study = *study; /* To copy other fields */ + internal_study->size = byteflip(study->size, sizeof(study->size)); + internal_study->options = + byteflip(study->options, sizeof(study->options)); + } -return internal_re; + return internal_re; } /* End of pcre_tryflipped.c */ diff --git a/package/re/pcre_valid_utf8.c 
b/package/re/pcre_valid_utf8.c index 0755e4aad..942bac4d8 100644 --- a/package/re/pcre_valid_utf8.c +++ b/package/re/pcre_valid_utf8.c @@ -2,10 +2,9 @@ #include "re_config.h" #include "pcre_internal.h" - /************************************************* -* Validate a UTF-8 string * -*************************************************/ + * Validate a UTF-8 string * + *************************************************/ /* This function is called (optionally) at the start of compile or match, to validate that a supposed UTF-8 string is actually valid. The early check means @@ -28,60 +27,60 @@ Returns: < 0 if the string is a valid UTF-8 string >= 0 otherwise; the value is the offset of the bad byte */ -int -_pcre_valid_utf8(const uschar *string, int length) -{ +int _pcre_valid_utf8(const uschar* string, int length) { #ifdef SUPPORT_UTF8 -register const uschar *p; + register const uschar* p; -if (length < 0) - { - for (p = string; *p != 0; p++); - length = (uintptr_t)p - (uintptr_t)string; - } + if (length < 0) { + for (p = string; *p != 0; p++) + ; + length = (uintptr_t)p - (uintptr_t)string; + } -for (p = string; length-- > 0; p++) - { - register int ab; - register int c = *p; - if (c < 128) continue; - if (c < 0xc0) return (uintptr_t)p - (uintptr_t)string; - ab = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ - if (length < ab || ab > 3) return (uintptr_t)p - (uintptr_t)string; - length -= ab; + for (p = string; length-- > 0; p++) { + register int ab; + register int c = *p; + if (c < 128) + continue; + if (c < 0xc0) + return (uintptr_t)p - (uintptr_t)string; + ab = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ + if (length < ab || ab > 3) + return (uintptr_t)p - (uintptr_t)string; + length -= ab; - /* Check top bits in the second byte */ - if ((*(++p) & 0xc0) != 0x80) return (uintptr_t)p - (uintptr_t)string; + /* Check top bits in the second byte */ + if ((*(++p) & 0xc0) != 0x80) + return (uintptr_t)p - (uintptr_t)string; - /* Check for 
overlong sequences for each different length, and for the - excluded range 0xd000 to 0xdfff. */ + /* Check for overlong sequences for each different length, and for the + excluded range 0xd000 to 0xdfff. */ - switch (ab) - { - /* Check for xx00 000x (overlong sequence) */ + switch (ab) { + /* Check for xx00 000x (overlong sequence) */ - case 1: - if ((c & 0x3e) == 0) return (uintptr_t)p - (uintptr_t)string; - continue; /* We know there aren't any more bytes to check */ + case 1: + if ((c & 0x3e) == 0) + return (uintptr_t)p - (uintptr_t)string; + continue; /* We know there aren't any more bytes to check */ - /* Check for 1110 0000, xx0x xxxx (overlong sequence) or - 1110 1101, 1010 xxxx (0xd000 - 0xdfff) */ + /* Check for 1110 0000, xx0x xxxx (overlong sequence) or + 1110 1101, 1010 xxxx (0xd000 - 0xdfff) */ - case 2: - if ((c == 0xe0 && (*p & 0x20) == 0) || - (c == 0xed && *p >= 0xa0)) - return (uintptr_t)p - (uintptr_t)string; - break; + case 2: + if ((c == 0xe0 && (*p & 0x20) == 0) || + (c == 0xed && *p >= 0xa0)) + return (uintptr_t)p - (uintptr_t)string; + break; - /* Check for 1111 0000, xx00 xxxx (overlong sequence) or - greater than 0x0010ffff (f4 8f bf bf) */ + /* Check for 1111 0000, xx00 xxxx (overlong sequence) or + greater than 0x0010ffff (f4 8f bf bf) */ - case 3: - if ((c == 0xf0 && (*p & 0x30) == 0) || - (c > 0xf4 ) || - (c == 0xf4 && *p > 0x8f)) - return (uintptr_t)p - (uintptr_t)string; - break; + case 3: + if ((c == 0xf0 && (*p & 0x30) == 0) || (c > 0xf4) || + (c == 0xf4 && *p > 0x8f)) + return (uintptr_t)p - (uintptr_t)string; + break; #if 0 /* These cases can no longer occur, as we restrict to a maximum of four @@ -99,18 +98,17 @@ for (p = string; length-- > 0; p++) (c == 0xfc && (*p & 0x3c) == 0)) return (uintptr_t)p - (uintptr_t)string; break; #endif + } + /* Check for valid bytes after the 2nd, if any; all must start 10 */ + while (--ab > 0) { + if ((*(++p) & 0xc0) != 0x80) + return (uintptr_t)p - (uintptr_t)string; + } } - - /* Check for 
valid bytes after the 2nd, if any; all must start 10 */ - while (--ab > 0) - { - if ((*(++p) & 0xc0) != 0x80) return (uintptr_t)p - (uintptr_t)string; - } - } #endif -return -1; + return -1; } /* End of pcre_valid_utf8.c */ diff --git a/package/re/pcre_xclass.c b/package/re/pcre_xclass.c index 39184a627..b5f75c6d2 100644 --- a/package/re/pcre_xclass.c +++ b/package/re/pcre_xclass.c @@ -1,11 +1,9 @@ - #include "re_config.h" #include "pcre_internal.h" - /************************************************* -* Match character against an XCLASS * -*************************************************/ + * Match character against an XCLASS * + *************************************************/ /* This function is called to match a character against an extended class that might contain values > 255. @@ -17,85 +15,84 @@ Arguments: Returns: TRUE if character matches, else FALSE */ -BOOL -_pcre_xclass(int c, const uschar *data) -{ -int t; -BOOL negated = (*data & XCL_NOT) != 0; +BOOL _pcre_xclass(int c, const uschar* data) { + int t; + BOOL negated = (*data & XCL_NOT) != 0; -/* Character values < 256 are matched against a bitmap, if one is present. If -not, we still carry on, because there may be ranges that start below 256 in the -additional data. */ + /* Character values < 256 are matched against a bitmap, if one is present. + If not, we still carry on, because there may be ranges that start below 256 + in the additional data. */ -if (c < 256) - { - if ((*data & XCL_MAP) != 0 && (data[1 + c/8] & (1 << (c&7))) != 0) - return !negated; /* char found */ - } - -/* First skip the bit map if present. Then match against the list of Unicode -properties or large chars or ranges that end with a large char. We won't ever -encounter XCL_PROP or XCL_NOTPROP when UCP support is not compiled. 
*/ - -if ((*data++ & XCL_MAP) != 0) data += 32; - -while ((t = *data++) != XCL_END) - { - int x, y; - if (t == XCL_SINGLE) - { - GETCHARINC(x, data); - if (c == x) return !negated; - } - else if (t == XCL_RANGE) - { - GETCHARINC(x, data); - GETCHARINC(y, data); - if (c >= x && c <= y) return !negated; + if (c < 256) { + if ((*data & XCL_MAP) != 0 && (data[1 + c / 8] & (1 << (c & 7))) != 0) + return !negated; /* char found */ } + /* First skip the bit map if present. Then match against the list of Unicode + properties or large chars or ranges that end with a large char. We won't + ever encounter XCL_PROP or XCL_NOTPROP when UCP support is not compiled. */ + + if ((*data++ & XCL_MAP) != 0) + data += 32; + + while ((t = *data++) != XCL_END) { + int x, y; + if (t == XCL_SINGLE) { + GETCHARINC(x, data); + if (c == x) + return !negated; + } else if (t == XCL_RANGE) { + GETCHARINC(x, data); + GETCHARINC(y, data); + if (c >= x && c <= y) + return !negated; + } #ifdef SUPPORT_UCP - else /* XCL_PROP & XCL_NOTPROP */ - { - int chartype, script; - int category = _pcre_ucp_findprop(c, &chartype, &script); + else /* XCL_PROP & XCL_NOTPROP */ + { + int chartype, script; + int category = _pcre_ucp_findprop(c, &chartype, &script); - switch(*data) - { - case PT_ANY: - if (t == XCL_PROP) return !negated; - break; + switch (*data) { + case PT_ANY: + if (t == XCL_PROP) + return !negated; + break; - case PT_LAMP: - if ((chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt) == - (t == XCL_PROP)) return !negated; - break; + case PT_LAMP: + if ((chartype == ucp_Lu || chartype == ucp_Ll || + chartype == ucp_Lt) == (t == XCL_PROP)) + return !negated; + break; - case PT_GC: - if ((data[1] == category) == (t == XCL_PROP)) return !negated; - break; + case PT_GC: + if ((data[1] == category) == (t == XCL_PROP)) + return !negated; + break; - case PT_PC: - if ((data[1] == chartype) == (t == XCL_PROP)) return !negated; - break; + case PT_PC: + if ((data[1] == chartype) == (t == XCL_PROP)) 
+ return !negated; + break; - case PT_SC: - if ((data[1] == script) == (t == XCL_PROP)) return !negated; - break; + case PT_SC: + if ((data[1] == script) == (t == XCL_PROP)) + return !negated; + break; - /* This should never occur, but compilers may mutter if there is no - default. */ + /* This should never occur, but compilers may mutter if + there is no default. */ - default: - return FALSE; - } + default: + return FALSE; + } - data += 2; + data += 2; + } +#endif /* SUPPORT_UCP */ } -#endif /* SUPPORT_UCP */ - } -return negated; /* char did not match */ + return negated; /* char did not match */ } /* End of pcre_xclass.c */ diff --git a/package/re/re-api-adapter.c b/package/re/re-api-adapter.c index ef5f8e8c4..f06728941 100644 --- a/package/re/re-api-adapter.c +++ b/package/re/re-api-adapter.c @@ -18,50 +18,47 @@ #error PikaScript version 1.10.5 or later is required. #endif -#define raise_error \ - { \ - obj_setErrorCode(self, -__LINE__); \ - } +#define raise_error \ + { obj_setErrorCode(self, -__LINE__); } #define tu_getNew(name, obj_name) \ - PikaTuple *name = New_pikaTuple(); \ + PikaTuple* name = New_pikaTuple(); \ Any obj_name = newNormalObj(New_PikaStdData_Tuple); \ obj_setPtr(obj_name, "list", name); -#define tu_append(tup, val, type) \ - { \ - Arg *_arg = arg_new##type(val); \ +#define tu_append(tup, val, type) \ + { \ + Arg* _arg = arg_new##type(val); \ pikaList_append(&(tup)->super, _arg); \ - arg_deinit(_arg); \ + arg_deinit(_arg); \ } #define li_append(list, val, type) \ { \ - Arg *_arg = arg_new##type(val); \ + Arg* _arg = arg_new##type(val); \ PikaStdData_List_append(list, _arg); \ arg_deinit(_arg); \ } -typedef PikaObj *Any; +typedef PikaObj* Any; -void re_Match___init__args(PikaObj *self, char *sub, int *vec, int ven); -int _get_flags(PikaTuple *val); -PikaObj *__split(void *pattern__or__re, - char *subject, +void re_Match___init__args(PikaObj* self, char* sub, int* vec, int ven); +int _get_flags(PikaTuple* val); +PikaObj* __split(void* 
pattern__or__re, + char* subject, int max_split, int flags, int mode_re); -PikaObj *__findall(void *pattern__or__re, - char *subject, +PikaObj* __findall(void* pattern__or__re, + char* subject, int flags, int mode_re); -PikaObj *__subn(void *pattern__or__re, - char *repl, - char *subjet, +PikaObj* __subn(void* pattern__or__re, + char* repl, + char* subjet, int count, int flags, int mode_re); -void re___init__(PikaObj *self) -{ +void re___init__(PikaObj* self) { obj_setInt(self, "A", PCRE_ONLY_ASCII); obj_setInt(self, "I", PCRE_CASELESS); obj_setInt(self, "M", PCRE_MULTILINE); @@ -72,15 +69,13 @@ void re___init__(PikaObj *self) obj_setInt(self, "DOTALL", PCRE_DOTALL); } -PikaObj *re_findall(PikaObj *self, - char *pattern, - char *subject, - PikaTuple *val) -{ +PikaObj* re_findall(PikaObj* self, + char* pattern, + char* subject, + PikaTuple* val) { int flags = 0; flags = _get_flags(val); - if (flags < 0) - { + if (flags < 0) { obj_setErrorCode(self, __LINE__); return NULL; } @@ -89,19 +84,16 @@ PikaObj *re_findall(PikaObj *self, raise_error; return list; } -PikaObj *re_match(PikaObj *self, char *pattern, char *subject, PikaTuple *val) -{ +PikaObj* re_match(PikaObj* self, char* pattern, char* subject, PikaTuple* val) { int flags = 0; flags = _get_flags(val); - if (flags < 0) - { + if (flags < 0) { obj_setErrorCode(self, __LINE__); return NULL; } int ven = -1; - int *vec = pcre_match(pattern, subject, strlen(subject), &ven, flags); - if (!vec) - { + int* vec = pcre_match(pattern, subject, strlen(subject), &ven, flags); + if (!vec) { if (ven < 0) obj_setErrorCode(self, -__LINE__); return NULL; @@ -110,23 +102,20 @@ PikaObj *re_match(PikaObj *self, char *pattern, char *subject, PikaTuple *val) re_Match___init__args(m, subject, vec, ven); return m; } -PikaObj *re_fullmatch(PikaObj *self, - char *pattern, - char *subject, - PikaTuple *val) -{ +PikaObj* re_fullmatch(PikaObj* self, + char* pattern, + char* subject, + PikaTuple* val) { int flags = 0; flags = _get_flags(val); 
- if (flags < 0) - { + if (flags < 0) { obj_setErrorCode(self, __LINE__); return NULL; } int ven = -1; - int *vec = pcre_fullmatch(pattern, subject, strlen(subject), &ven, flags); - if (!vec) - { + int* vec = pcre_fullmatch(pattern, subject, strlen(subject), &ven, flags); + if (!vec) { if (ven < 0) obj_setErrorCode(self, -__LINE__); return NULL; @@ -135,22 +124,19 @@ PikaObj *re_fullmatch(PikaObj *self, re_Match___init__args(m, subject, vec, ven); return m; } -PikaObj *re_search(PikaObj *self, - char *pattern, - char *subject, - PikaTuple *val) -{ +PikaObj* re_search(PikaObj* self, + char* pattern, + char* subject, + PikaTuple* val) { int flags = 0; flags = _get_flags(val); - if (flags < 0) - { + if (flags < 0) { obj_setErrorCode(self, __LINE__); return NULL; } int ven = -1; - int *vec = pcre_search(pattern, subject, strlen(subject), &ven, flags); - if (!vec) - { + int* vec = pcre_search(pattern, subject, strlen(subject), &ven, flags); + if (!vec) { if (ven < 0) obj_setErrorCode(self, -__LINE__); return NULL; @@ -159,57 +145,48 @@ PikaObj *re_search(PikaObj *self, re_Match___init__args(m, subject, vec, ven); return m; } -char *re_sub(PikaObj *self, - char *pattern, - char *repl, - char *subjet, - PikaTuple *val) -{ +char* re_sub(PikaObj* self, + char* pattern, + char* repl, + char* subjet, + PikaTuple* val) { int flags = PCRE_UTF8; int count = 0; int argn = pikaTuple_getSize(val); - if (argn >= 1) - { - Arg *arg_i = pikaTuple_getArg(val, 0); - if (arg_getType(arg_i) != ARG_TYPE_INT) - { + if (argn >= 1) { + Arg* arg_i = pikaTuple_getArg(val, 0); + if (arg_getType(arg_i) != ARG_TYPE_INT) { obj_setErrorCode(self, -__LINE__); return NULL; } count = arg_getInt(arg_i); } - if (argn >= 2) - { - Arg *arg_i = pikaTuple_getArg(val, 1); - if (arg_getType(arg_i) != ARG_TYPE_INT) - { + if (argn >= 2) { + Arg* arg_i = pikaTuple_getArg(val, 1); + if (arg_getType(arg_i) != ARG_TYPE_INT) { obj_setErrorCode(self, -__LINE__); return NULL; } flags = arg_getInt(arg_i); - if (flags | 
PCRE_ONLY_ASCII) - { + if (flags | PCRE_ONLY_ASCII) { flags &= ~(PCRE_ONLY_ASCII | PCRE_UTF8); } } int length = strlen(subjet); - char *s = pcre_subn(pattern, repl, subjet, length, count, flags, NULL); - if (!s) - { + char* s = pcre_subn(pattern, repl, subjet, length, count, flags, NULL); + if (!s) { obj_setErrorCode(self, -__LINE__); return NULL; } - if (s == subjet) - { + if (s == subjet) { obj_setStr(self, "_b", subjet); return obj_getStr(self, "_b"); } int len = strlen(s); - char *b = (char *)malloc(len + 1); - if (!b) - { + char* b = (char*)malloc(len + 1); + if (!b) { free(s); return NULL; } @@ -220,59 +197,49 @@ char *re_sub(PikaObj *self, free(s); return obj_getStr(self, "_b"); } -PikaObj *re_subn(PikaObj *self, - char *pattern, - char *repl, - char *subjet, - PikaTuple *val) -{ +PikaObj* re_subn(PikaObj* self, + char* pattern, + char* repl, + char* subjet, + PikaTuple* val) { int flags = PCRE_UTF8; int count = 0; int argn = pikaTuple_getSize(val); - if (argn >= 1) - { - Arg *arg_i = pikaTuple_getArg(val, 0); - if (arg_getType(arg_i) != ARG_TYPE_INT) - { + if (argn >= 1) { + Arg* arg_i = pikaTuple_getArg(val, 0); + if (arg_getType(arg_i) != ARG_TYPE_INT) { obj_setErrorCode(self, -__LINE__); return NULL; } count = arg_getInt(arg_i); } - if (argn >= 2) - { - Arg *arg_i = pikaTuple_getArg(val, 1); - if (arg_getType(arg_i) != ARG_TYPE_INT) - { + if (argn >= 2) { + Arg* arg_i = pikaTuple_getArg(val, 1); + if (arg_getType(arg_i) != ARG_TYPE_INT) { obj_setErrorCode(self, -__LINE__); return NULL; } flags = arg_getInt(arg_i); - if (flags | PCRE_ONLY_ASCII) - { + if (flags | PCRE_ONLY_ASCII) { flags &= ~(PCRE_ONLY_ASCII | PCRE_UTF8); } } Any res = __subn(pattern, repl, subjet, count, flags, 0); - if (!res) - { + if (!res) { raise_error; } return res; } -PikaObj *re_compile(PikaObj *self, char *pattern, PikaTuple *val) -{ - const char *error; +PikaObj* re_compile(PikaObj* self, char* pattern, PikaTuple* val) { + const char* error; int erroffset; int flags = 
_get_flags(val); - if (flags < 0) - { + if (flags < 0) { raise_error; return NULL; } - pcre *re = pcre_compile(pattern, flags, &error, &erroffset, NULL); - if (!re) - { + pcre* re = pcre_compile(pattern, flags, &error, &erroffset, NULL); + if (!re) { obj_setErrorCode(self, erroffset); return NULL; } @@ -281,32 +248,26 @@ PikaObj *re_compile(PikaObj *self, char *pattern, PikaTuple *val) obj_setPtr(m, "_re", re); return m; } -PikaObj *re_split(PikaObj *self, char *pattern, char *subject, PikaTuple *val) -{ +PikaObj* re_split(PikaObj* self, char* pattern, char* subject, PikaTuple* val) { int flags = PCRE_UTF8; int max_split = 0; int argn = pikaTuple_getSize(val); - if (argn >= 1) - { - Arg *arg_i = pikaTuple_getArg(val, 0); - if (arg_getType(arg_i) != ARG_TYPE_INT) - { + if (argn >= 1) { + Arg* arg_i = pikaTuple_getArg(val, 0); + if (arg_getType(arg_i) != ARG_TYPE_INT) { obj_setErrorCode(self, -__LINE__); return NULL; } max_split = arg_getInt(arg_i); } - if (argn >= 2) - { - Arg *arg_i = pikaTuple_getArg(val, 1); - if (arg_getType(arg_i) != ARG_TYPE_INT) - { + if (argn >= 2) { + Arg* arg_i = pikaTuple_getArg(val, 1); + if (arg_getType(arg_i) != ARG_TYPE_INT) { obj_setErrorCode(self, -__LINE__); return NULL; } flags = arg_getInt(arg_i); - if (flags | PCRE_ONLY_ASCII) - { + if (flags | PCRE_ONLY_ASCII) { flags &= ~(PCRE_ONLY_ASCII | PCRE_UTF8); } } @@ -316,34 +277,29 @@ PikaObj *re_split(PikaObj *self, char *pattern, char *subject, PikaTuple *val) return list; } -char *re_escape(PikaObj *self, char *pattern) -{ - const char *special_chars = "()[]{}?*+-|^$\\.&~# \t\n\r\v\f"; +char* re_escape(PikaObj* self, char* pattern) { + const char* special_chars = "()[]{}?*+-|^$\\.&~# \t\n\r\v\f"; const int special_chars_len = 25; if (!pattern) return NULL; int n = strlen(pattern); int after_size = n; - for (int i = 0; i < n; i++) - { - for (int j = 0; j < special_chars_len; j++) - { + for (int i = 0; i < n; i++) { + for (int j = 0; j < special_chars_len; j++) { if (pattern[i] != 
special_chars[j]) continue; after_size++; break; } } - char *new_s = (char *)malloc(after_size + 1); + char* new_s = (char*)malloc(after_size + 1); if (!new_s) return NULL; int at = 0; - while (*pattern) - { + while (*pattern) { char c = *pattern; int j = 0; - for (; j < special_chars_len; j++) - { + for (; j < special_chars_len; j++) { if (c != special_chars[j]) continue; new_s[at++] = '\\'; @@ -358,61 +314,53 @@ char *re_escape(PikaObj *self, char *pattern) return obj_getStr(self, "_b"); } -void re_Match___del__(PikaObj *self) -{ - void *vec = obj_getPtr(self, "_vec"); +void re_Match___del__(PikaObj* self) { + void* vec = obj_getPtr(self, "_vec"); if (!vec) return; free(vec); } -void re_Match___init__(PikaObj *self) -{ - if (!obj_isArgExist(self, "_vec")) - { +void re_Match___init__(PikaObj* self) { + if (!obj_isArgExist(self, "_vec")) { obj_setPtr(self, "_vec", NULL); obj_setStr(self, "_b", ""); obj_setInt(self, "_ven", 0); obj_setStr(self, "_s", ""); } } -void re_Match___init__args(PikaObj *self, char *sub, int *vec, int ven) -{ +void re_Match___init__args(PikaObj* self, char* sub, int* vec, int ven) { obj_setPtr(self, "_vec", vec); obj_setStr(self, "_b", ""); obj_setInt(self, "_ven", ven); obj_setStr(self, "_s", sub); } -char *re_Match_group(PikaObj *self, PikaTuple *val) -{ +char* re_Match_group(PikaObj* self, PikaTuple* val) { int n = 0; int argn = pikaTuple_getSize(val); - if (argn >= 1) - { - Arg *arg_i = pikaTuple_getArg(val, 0); - if (arg_getType(arg_i) != ARG_TYPE_INT) - { + if (argn >= 1) { + Arg* arg_i = pikaTuple_getArg(val, 0); + if (arg_getType(arg_i) != ARG_TYPE_INT) { obj_setErrorCode(self, -__LINE__); return NULL; } n = arg_getInt(arg_i); } - int *vec = obj_getPtr(self, "_vec"); + int* vec = obj_getPtr(self, "_vec"); if (!vec) return NULL; - char *s = obj_getStr(self, "_s"); + char* s = obj_getStr(self, "_s"); if (!s) return NULL; int ven = obj_getInt(self, "_ven"); - if (n >= ven || n < 0) - { + if (n >= ven || n < 0) { obj_setErrorCode(self, 
-__LINE__); return NULL; } int len = vec[n * 2 + 1] - vec[n * 2]; if (!len) return ""; - char *b = (char *)malloc(len + 1); + char* b = (char*)malloc(len + 1); if (!b) return NULL; memcpy(b, s + vec[n * 2], len); @@ -421,12 +369,11 @@ char *re_Match_group(PikaObj *self, PikaTuple *val) free(b); return obj_getStr(self, "_b"); } -PikaObj *re_Match_groups(PikaObj *self) -{ - int *vec = obj_getPtr(self, "_vec"); +PikaObj* re_Match_groups(PikaObj* self) { + int* vec = obj_getPtr(self, "_vec"); if (!vec) return NULL; - char *s = obj_getStr(self, "_s"); + char* s = obj_getStr(self, "_s"); if (!s) return NULL; int ven = obj_getInt(self, "_ven"); @@ -434,22 +381,18 @@ PikaObj *re_Match_groups(PikaObj *self) return NULL; tu_getNew(tup, tup_obj); - for (int i = 1; i < ven; i++) - { - Arg *str_arg1; + for (int i = 1; i < ven; i++) { + Arg* str_arg1; int len = vec[i * 2 + 1] - vec[i * 2]; - if (len) - { - char *b = (char *)malloc(len + 1); + if (len) { + char* b = (char*)malloc(len + 1); if (!b) return NULL; memcpy(b, s + vec[i * 2], len); b[len] = 0; str_arg1 = arg_newStr(b); free(b); - } - else - { + } else { str_arg1 = arg_newStr(""); } pikaList_append(&(tup)->super, str_arg1); @@ -457,26 +400,24 @@ PikaObj *re_Match_groups(PikaObj *self) } return tup_obj; } -PikaObj *re_Match_span(PikaObj *self, PikaTuple *val) -{ +PikaObj* re_Match_span(PikaObj* self, PikaTuple* val) { int group_n = 0; int argn = pikaTuple_getSize(val); - if (argn >= 1) - { - Arg *arg_i = pikaTuple_getArg(val, 0); - if (arg_getType(arg_i) != ARG_TYPE_INT) - { + if (argn >= 1) { + Arg* arg_i = pikaTuple_getArg(val, 0); + if (arg_getType(arg_i) != ARG_TYPE_INT) { obj_setErrorCode(self, -__LINE__); return NULL; } group_n = arg_getInt(arg_i); } - int *vec = obj_getPtr(self, "_vec"); - if (!vec) + int* vec = obj_getPtr(self, "_vec"); + if (!vec) { raise_error; + return NULL; + } int ven = obj_getInt(self, "_ven"); - if (!ven || group_n >= ven) - { + if (!ven || group_n >= ven) { obj_setErrorCode(self, 
-__LINE__); return NULL; } @@ -486,57 +427,49 @@ PikaObj *re_Match_span(PikaObj *self, PikaTuple *val) return tu_obj; } -void re_Pattern___del__(PikaObj *self) -{ - void *_re = obj_getPtr(self, "_re"); +void re_Pattern___del__(PikaObj* self) { + void* _re = obj_getPtr(self, "_re"); if (!_re) return; - pcre *re = (pcre *)_re; + pcre* re = (pcre*)_re; pcre_free(re); } -void re_Pattern___init__(PikaObj *self) -{ - if (!obj_isArgExist(self, "_re")) - { +void re_Pattern___init__(PikaObj* self) { + if (!obj_isArgExist(self, "_re")) { obj_setPtr(self, "_re", NULL); obj_setStr(self, "_b", ""); obj_setInt(self, "_n", -1); } } -PikaObj *re_Pattern_findall(PikaObj *self, char *subject, PikaTuple *val) -{ +PikaObj* re_Pattern_findall(PikaObj* self, char* subject, PikaTuple* val) { int flags = 0; flags = _get_flags(val); - if (flags < 0) - { + if (flags < 0) { obj_setErrorCode(self, __LINE__); return NULL; } if (!obj_isArgExist(self, "_re")) return NULL; - pcre *re = obj_getPtr(self, "_re"); + pcre* re = obj_getPtr(self, "_re"); Any list = __findall(re, subject, flags, 1); if (!list) raise_error; return list; } -PikaObj *re_Pattern_match(PikaObj *self, char *subject, PikaTuple *val) -{ +PikaObj* re_Pattern_match(PikaObj* self, char* subject, PikaTuple* val) { int flags = 0; flags = _get_flags(val); - if (flags < 0) - { + if (flags < 0) { obj_setErrorCode(self, __LINE__); return NULL; } if (!obj_isArgExist(self, "_re")) return NULL; - pcre *re = obj_getPtr(self, "_re"); + pcre* re = obj_getPtr(self, "_re"); int ven = -1; - int *vec = re_match2(re, subject, strlen(subject), &ven, flags); - if (!vec) - { + int* vec = re_match2(re, subject, strlen(subject), &ven, flags); + if (!vec) { if (ven < 0) obj_setErrorCode(self, -__LINE__); @@ -546,22 +479,19 @@ PikaObj *re_Pattern_match(PikaObj *self, char *subject, PikaTuple *val) re_Match___init__args(m, subject, vec, ven); return m; } -PikaObj *re_Pattern_fullmatch(PikaObj *self, char *subject, PikaTuple *val) -{ +PikaObj* 
re_Pattern_fullmatch(PikaObj* self, char* subject, PikaTuple* val) { int flags = 0; flags = _get_flags(val); - if (flags < 0) - { + if (flags < 0) { obj_setErrorCode(self, __LINE__); return NULL; } if (!obj_isArgExist(self, "_re")) return NULL; - pcre *re = obj_getPtr(self, "_re"); + pcre* re = obj_getPtr(self, "_re"); int ven = -1; - int *vec = re_fullmatch2(re, subject, strlen(subject), &ven, flags); - if (!vec) - { + int* vec = re_fullmatch2(re, subject, strlen(subject), &ven, flags); + if (!vec) { if (ven < 0) obj_setErrorCode(self, -__LINE__); return NULL; @@ -570,24 +500,21 @@ PikaObj *re_Pattern_fullmatch(PikaObj *self, char *subject, PikaTuple *val) re_Match___init__args(m, subject, vec, ven); return m; } -PikaObj *re_Pattern_search(PikaObj *self, char *subject, PikaTuple *val) -{ +PikaObj* re_Pattern_search(PikaObj* self, char* subject, PikaTuple* val) { int flags = 0; flags = _get_flags(val); - if (flags < 0) - { + if (flags < 0) { obj_setErrorCode(self, __LINE__); return NULL; } if (!obj_isArgExist(self, "_re")) return NULL; - pcre *re = obj_getPtr(self, "_re"); + pcre* re = obj_getPtr(self, "_re"); Any m = newNormalObj(New_re_Match); int ven = -1; - int *vec = re_search2(re, subject, strlen(subject), &ven, flags); - if (!vec) - { + int* vec = re_search2(re, subject, strlen(subject), &ven, flags); + if (!vec) { if (ven < 0) obj_setErrorCode(self, -__LINE__); return NULL; @@ -595,59 +522,50 @@ PikaObj *re_Pattern_search(PikaObj *self, char *subject, PikaTuple *val) re_Match___init__args(m, subject, vec, ven); return m; } -char *re_Pattern_sub(PikaObj *self, char *repl, char *subjet, PikaTuple *val) -{ +char* re_Pattern_sub(PikaObj* self, char* repl, char* subjet, PikaTuple* val) { int flags = 0; int count = 0; int argn = pikaTuple_getSize(val); - if (argn >= 1) - { - Arg *arg_i = pikaTuple_getArg(val, 0); - if (arg_getType(arg_i) != ARG_TYPE_INT) - { + if (argn >= 1) { + Arg* arg_i = pikaTuple_getArg(val, 0); + if (arg_getType(arg_i) != ARG_TYPE_INT) { 
obj_setErrorCode(self, -__LINE__); return NULL; } count = arg_getInt(arg_i); } - if (argn >= 2) - { - Arg *arg_i = pikaTuple_getArg(val, 1); - if (arg_getType(arg_i) != ARG_TYPE_INT) - { + if (argn >= 2) { + Arg* arg_i = pikaTuple_getArg(val, 1); + if (arg_getType(arg_i) != ARG_TYPE_INT) { obj_setErrorCode(self, -__LINE__); return NULL; } flags = arg_getInt(arg_i); - if (flags | PCRE_ONLY_ASCII) - { + if (flags | PCRE_ONLY_ASCII) { flags &= ~(PCRE_ONLY_ASCII | PCRE_UTF8); } } if (!obj_isArgExist(self, "_re")) return NULL; - pcre *re = obj_getPtr(self, "_re"); + pcre* re = obj_getPtr(self, "_re"); int length = strlen(subjet); int matched_times = 0; - char *s = re_subn2(re, repl, subjet, length, count, flags, &matched_times); + char* s = re_subn2(re, repl, subjet, length, count, flags, &matched_times); obj_setInt(self, "_n", matched_times); - if (!s) - { + if (!s) { obj_setErrorCode(self, -__LINE__); return NULL; } - if (s == subjet) - { + if (s == subjet) { obj_setStr(self, "_b", subjet); return obj_getStr(self, "_b"); } int len = strlen(s); - char *b = (char *)malloc(len + 1); - if (!b) - { + char* b = (char*)malloc(len + 1); + if (!b) { free(s); return NULL; } @@ -658,72 +576,63 @@ char *re_Pattern_sub(PikaObj *self, char *repl, char *subjet, PikaTuple *val) free(s); return obj_getStr(self, "_b"); } -PikaObj *re_Pattern_subn(PikaObj *self, char *repl, char *subjet, PikaTuple *val) -{ +PikaObj* re_Pattern_subn(PikaObj* self, + char* repl, + char* subjet, + PikaTuple* val) { if (!obj_isArgExist(self, "_re")) return NULL; int flags = 0; int count = 0; int argn = pikaTuple_getSize(val); - if (argn >= 1) - { - Arg *arg_i = pikaTuple_getArg(val, 0); - if (arg_getType(arg_i) != ARG_TYPE_INT) - { + if (argn >= 1) { + Arg* arg_i = pikaTuple_getArg(val, 0); + if (arg_getType(arg_i) != ARG_TYPE_INT) { obj_setErrorCode(self, -__LINE__); return NULL; } count = arg_getInt(arg_i); } - if (argn >= 2) - { - Arg *arg_i = pikaTuple_getArg(val, 1); - if (arg_getType(arg_i) != 
ARG_TYPE_INT) - { + if (argn >= 2) { + Arg* arg_i = pikaTuple_getArg(val, 1); + if (arg_getType(arg_i) != ARG_TYPE_INT) { obj_setErrorCode(self, -__LINE__); return NULL; } flags = arg_getInt(arg_i); - if (flags | PCRE_ONLY_ASCII) - { + if (flags | PCRE_ONLY_ASCII) { flags &= ~(PCRE_ONLY_ASCII | PCRE_UTF8); } } - pcre *re = obj_getPtr(self, "_re"); + pcre* re = obj_getPtr(self, "_re"); Any res = __subn(re, repl, subjet, count, flags, 1); if (!res) raise_error; return res; } -PikaObj *re_Pattern_split(PikaObj *self, char *subject, PikaTuple *val) -{ +PikaObj* re_Pattern_split(PikaObj* self, char* subject, PikaTuple* val) { if (!obj_isArgExist(self, "_re")) return NULL; - pcre *re = obj_getPtr(self, "_re"); + pcre* re = obj_getPtr(self, "_re"); int flags = PCRE_UTF8; int max_split = 0; int argn = pikaTuple_getSize(val); - if (argn >= 1) - { - Arg *arg_i = pikaTuple_getArg(val, 0); - if (arg_getType(arg_i) != ARG_TYPE_INT) - { + if (argn >= 1) { + Arg* arg_i = pikaTuple_getArg(val, 0); + if (arg_getType(arg_i) != ARG_TYPE_INT) { obj_setErrorCode(self, -__LINE__); return NULL; } max_split = arg_getInt(arg_i); } - if (argn >= 2) - { - Arg *arg_i = pikaTuple_getArg(val, 1); - if (arg_getType(arg_i) != ARG_TYPE_INT) - { + if (argn >= 2) { + Arg* arg_i = pikaTuple_getArg(val, 1); + if (arg_getType(arg_i) != ARG_TYPE_INT) { obj_setErrorCode(self, -__LINE__); return NULL; } flags = arg_getInt(arg_i); - if (flags | PCRE_ONLY_ASCII) - { + if (flags | PCRE_ONLY_ASCII) { flags &= ~(PCRE_ONLY_ASCII | PCRE_UTF8); } } @@ -733,28 +642,24 @@ PikaObj *re_Pattern_split(PikaObj *self, char *subject, PikaTuple *val) return list; } -int _get_flags(PikaTuple *val) -{ +int _get_flags(PikaTuple* val) { int flags = PCRE_UTF8; int argn = pikaTuple_getSize(val); - if (argn >= 1) - { - Arg *arg_i = pikaTuple_getArg(val, 0); - if (arg_getType(arg_i) != ARG_TYPE_INT) - { + if (argn >= 1) { + Arg* arg_i = pikaTuple_getArg(val, 0); + if (arg_getType(arg_i) != ARG_TYPE_INT) { return -1; } flags |= 
arg_getInt(arg_i); - if (flags & PCRE_ONLY_ASCII) - { + if (flags & PCRE_ONLY_ASCII) { flags &= ~(PCRE_ONLY_ASCII | PCRE_UTF8); } } return flags; } -PikaObj *__split(void *pattern__or__re, - char *subject, +PikaObj* __split(void* pattern__or__re, + char* subject, int max_split, int flags, int mode_re) @@ -764,17 +669,18 @@ PikaObj *__split(void *pattern__or__re, int j2 = 0; int _m_n = 0, m_n = 0; int brackets = -1; - int **vcs; + int** vcs; if (mode_re) - vcs = re_searchall2((pcre *)pattern__or__re, subject, sub_length, &_m_n, &brackets, flags); + vcs = re_searchall2((pcre*)pattern__or__re, subject, sub_length, &_m_n, + &brackets, flags); else - vcs = re_searchall((char *)pattern__or__re, subject, sub_length, &_m_n, &brackets, flags); + vcs = re_searchall((char*)pattern__or__re, subject, sub_length, &_m_n, + &brackets, flags); m_n = _m_n; - char *b = NULL; - Arg *str_arg1; + char* b = NULL; + Arg* str_arg1; // Arg *sub_arg; - if (!vcs) - { + if (!vcs) { return NULL; } if (max_split && max_split < m_n) @@ -783,23 +689,18 @@ PikaObj *__split(void *pattern__or__re, PikaStdData_List___init__(list); int start = 0; - if (brackets == 1) - { - for (int i = 0; i < m_n; i++) - { - int *v = vcs[i]; + if (brackets == 1) { + for (int i = 0; i < m_n; i++) { + int* v = vcs[i]; int length = v[0] - start; - if (length) - { + if (length) { b = malloc(length + 1); if (!b) goto e_er; b[length] = 0; memcpy(b, subject + start, length); - } - else - { - b = (char *)""; + } else { + b = (char*)""; } str_arg1 = arg_newStr(b); PikaStdData_List_append(list, str_arg1); @@ -808,8 +709,7 @@ PikaObj *__split(void *pattern__or__re, free(b); start = v[1]; } - if (start <= sub_length) - { + if (start <= sub_length) { str_arg1 = arg_newStr(subject + start); PikaStdData_List_append(list, str_arg1); arg_deinit(str_arg1); @@ -817,9 +717,8 @@ PikaObj *__split(void *pattern__or__re, goto exit; } - for (int i = 0; i < m_n; i++) - { - int *v = vcs[i]; + for (int i = 0; i < m_n; i++) { + int* v = vcs[i]; 
int length = v[0] - start; b = malloc(length + 1); if (!b) @@ -830,12 +729,10 @@ PikaObj *__split(void *pattern__or__re, PikaStdData_List_append(list, str_arg1); arg_deinit(str_arg1); - for (int j = 1; j < brackets; j++) - { + for (int j = 1; j < brackets; j++) { j2 = j * 2; int length2 = v[j2 + 1] - v[j2]; - if (length2 > length) - { + if (length2 > length) { free(b); length = length2; b = malloc(length + 1); @@ -852,16 +749,14 @@ PikaObj *__split(void *pattern__or__re, start = v[1]; free(b); } - if (start <= sub_length) - { + if (start <= sub_length) { str_arg1 = arg_newStr(subject + start); PikaStdData_List_append(list, str_arg1); arg_deinit(str_arg1); } goto exit; e_er: - if (list) - { + if (list) { obj_deinit(list); list = NULL; } @@ -871,26 +766,26 @@ exit: return list; } -PikaObj *__findall(void *pattern__or__re, - char *subject, +PikaObj* __findall(void* pattern__or__re, + char* subject, int flags, - int mode_re) -{ + int mode_re) { int length = strlen(subject); int j2 = 0; int m_n = -1; int brackets = -1; - int **vcs; + int** vcs; if (mode_re) - vcs = re_searchall2((pcre *)pattern__or__re, subject, length, &m_n, &brackets, flags); + vcs = re_searchall2((pcre*)pattern__or__re, subject, length, &m_n, + &brackets, flags); else - vcs = re_searchall((char *)pattern__or__re, subject, length, &m_n, &brackets, flags); + vcs = re_searchall((char*)pattern__or__re, subject, length, &m_n, + &brackets, flags); - char *b = NULL; - Arg *str_arg1; - Arg *sub_arg; - if (!vcs) - { + char* b = NULL; + Arg* str_arg1; + Arg* sub_arg; + if (!vcs) { if (m_n < 0) return NULL; Any list = newNormalObj(New_PikaStdData_List); @@ -900,25 +795,20 @@ PikaObj *__findall(void *pattern__or__re, } Any list = newNormalObj(New_PikaStdData_List); PikaStdData_List___init__(list); - PikaTuple *tu; + PikaTuple* tu; Any sub_list = NULL; - if (brackets == 1) - { - for (int i = 0; i < m_n; i++) - { - int *v = vcs[i]; + if (brackets == 1) { + for (int i = 0; i < m_n; i++) { + int* v = vcs[i]; length 
= v[1] - v[0]; - if (length) - { + if (length) { b = malloc(length + 1); if (!b) goto e_er; b[length] = 0; memcpy(b, subject + v[0], length); - } - else - { - b = (char *)""; + } else { + b = (char*)""; } str_arg1 = arg_newStr(b); PikaStdData_List_append(list, str_arg1); @@ -929,17 +819,15 @@ PikaObj *__findall(void *pattern__or__re, goto exit; } - for (int i = 0; i < m_n; i++) - { - int *v = vcs[i]; + for (int i = 0; i < m_n; i++) { + int* v = vcs[i]; length = v[1] - v[0]; b = malloc(length + 1); if (!b) goto e_er; tu = New_pikaTuple(); - for (int j = 1; j < brackets; j++) - { + for (int j = 1; j < brackets; j++) { j2 = j * 2; length = v[j2 + 1] - v[j2]; b[length] = 0; @@ -955,8 +843,7 @@ PikaObj *__findall(void *pattern__or__re, } goto exit; e_er: - if (list) - { + if (list) { obj_deinit(list); list = NULL; } @@ -966,28 +853,27 @@ exit: return list; } -PikaObj *__subn(void *pattern__or__re, - char *repl, - char *subjet, +PikaObj* __subn(void* pattern__or__re, + char* repl, + char* subjet, int count, int flags, - int mode_re) -{ + int mode_re) { int length = strlen(subjet); int matched_times = 0; - char *s; + char* s; if (mode_re) - s = re_subn2((pcre *)pattern__or__re, repl, subjet, length, count, flags, &matched_times); + s = re_subn2((pcre*)pattern__or__re, repl, subjet, length, count, flags, + &matched_times); else - s = pcre_subn((char *)pattern__or__re, repl, subjet, length, count, flags, &matched_times); + s = pcre_subn((char*)pattern__or__re, repl, subjet, length, count, + flags, &matched_times); - if (!s) - { + if (!s) { return NULL; } - if (s == subjet) - { - PikaTuple *yup = New_pikaTuple(); + if (s == subjet) { + PikaTuple* yup = New_pikaTuple(); tu_append(yup, s, Str); tu_append(yup, 0, Int); @@ -996,7 +882,7 @@ PikaObj *__subn(void *pattern__or__re, return tuple_obj; } - PikaTuple *yup = New_pikaTuple(); + PikaTuple* yup = New_pikaTuple(); tu_append(yup, s, Str); free(s); @@ -1005,4 +891,4 @@ PikaObj *__subn(void *pattern__or__re, Any tuple_obj = 
newNormalObj(New_PikaStdData_Tuple); obj_setPtr(tuple_obj, "list", yup); return tuple_obj; -} \ No newline at end of file +} diff --git a/package/re/re_config.h b/package/re/re_config.h index b4d6bfb06..384a03519 100644 --- a/package/re/re_config.h +++ b/package/re/re_config.h @@ -143,59 +143,60 @@ #endif /* The value of MATCH_LIMIT determines the default number of times the - internal match() function can be called during a single execution of - pcre_exec(). There is a runtime interface for setting a different limit. - The limit exists in order to catch runaway regular expressions that take - for ever to determine that they do not match. The default is set very large - so that it does not accidentally catch legitimate cases. On systems that - support it, "configure" can be used to override this default default. */ + internal match() function can be called during a single execution of + pcre_exec(). There is a runtime interface for setting a different + limit. The limit exists in order to catch runaway regular expressions that + take for ever to determine that they do not match. The default is set very + large so that it does not accidentally catch legitimate cases. On systems + that support it, "configure" can be used to override this default default. */ #ifndef MATCH_LIMIT #define MATCH_LIMIT 10000000 #endif /* The above limit applies to all calls of match(), whether or not they - increase the recursion depth. In some environments it is desirable to limit - the depth of recursive calls of match() more strictly, in order to restrict - the maximum amount of stack (or heap, if NO_RECURSE is defined) that is - used. The value of MATCH_LIMIT_RECURSION applies only to recursive calls of - match(). To have any useful effect, it must be less than the value of - MATCH_LIMIT. The default is to use the same value as MATCH_LIMIT. There is - a runtime method for setting a different limit. On systems that support it, - "configure" can be used to override the default. 
*/ + increase the recursion depth. In some environments it is + desirable to limit the depth of recursive calls of match() more strictly, in + order to restrict the maximum amount of stack (or heap, if NO_RECURSE is + defined) that is used. The value of MATCH_LIMIT_RECURSION applies only to + recursive calls of match(). To have any useful effect, it must be less than + the value of MATCH_LIMIT. The default is to use the same value as + MATCH_LIMIT. There is a runtime method for setting a different limit. On + systems that support it, "configure" can be used to override the default. */ #ifndef MATCH_LIMIT_RECURSION #define MATCH_LIMIT_RECURSION MATCH_LIMIT #endif /* This limit is parameterized just in case anybody ever wants to change it. - Care must be taken if it is increased, because it guards against integer - overflow caused by enormously large patterns. */ + Care must be taken if it is increased, because it guards + against integer overflow caused by enormously large patterns. */ #ifndef MAX_NAME_COUNT #define MAX_NAME_COUNT 10000 #endif /* This limit is parameterized just in case anybody ever wants to change it. - Care must be taken if it is increased, because it guards against integer - overflow caused by enormously large patterns. */ + Care must be taken if it is increased, because it + guards against integer overflow caused by enormously large patterns. */ #ifndef MAX_NAME_SIZE #define MAX_NAME_SIZE 32 #endif /* The value of NEWLINE determines the newline character sequence. On systems - that support it, "configure" can be used to override the default, which is - 10. The possible values are 10 (LF), 13 (CR), 3338 (CRLF), -1 (ANY), or -2 - (ANYCRLF). */ + that support it, "configure" can be used to + override the default, which is + 10. The possible values are 10 (LF), 13 (CR), + 3338 (CRLF), -1 (ANY), or -2 (ANYCRLF). 
*/ #ifndef NEWLINE #define NEWLINE 10 #endif /* When calling PCRE via the POSIX interface, additional working storage is - required for holding the pointers to capturing substrings because PCRE - requires three integers per substring, whereas the POSIX interface provides - only two. If the number of expected substrings is small, the wrapper - function uses space on the stack, because this is faster than using - malloc() for each call. The threshold above which the stack is no longer - used is defined by POSIX_MALLOC_THRESHOLD. On systems that support it, - "configure" can be used to override this default. */ + required for holding the pointers to capturing substrings because PCRE + requires three integers per substring, whereas the POSIX interface + provides only two. If the number of expected substrings is small, the wrapper + function uses space on the stack, because this is faster than using + malloc() for each call. The threshold above which the stack is no + longer used is defined by POSIX_MALLOC_THRESHOLD. On systems that support it, + "configure" can be used to override this default. */ #ifndef POSIX_MALLOC_THRESHOLD #define POSIX_MALLOC_THRESHOLD 10 #endif @@ -209,6 +210,5 @@ /* #undef SUPPORT_UTF8 */ #define SUPPORT_UTF8 - /* Define to `unsigned int' if does not define. 
*/ /* #undef size_t */ diff --git a/port/linux/package/pikascript/pikascript-lib/PikaStdDevice/pika_hal_SOFT_IIC.c b/port/linux/package/pikascript/pikascript-lib/PikaStdDevice/pika_hal_SOFT_IIC.c index 8c69142d7..8c9f55a04 100644 --- a/port/linux/package/pikascript/pikascript-lib/PikaStdDevice/pika_hal_SOFT_IIC.c +++ b/port/linux/package/pikascript/pikascript-lib/PikaStdDevice/pika_hal_SOFT_IIC.c @@ -1,4 +1,17 @@ -#include "../PikaStdDevice/pika_hal.h" +#include "pika_hal.h" +#include + +static void _IIC_SDA_input(pika_hal_SOFT_IIC_config* iic_cfg) { + pika_hal_GPIO_config cfg_SDA = {0}; + cfg_SDA.dir = PIKA_HAL_GPIO_DIR_IN; + pika_hal_ioctl(iic_cfg->SDA, PIKA_HAL_IOCTL_CONFIG, &cfg_SDA); +} + +static void _IIC_SDA_output(pika_hal_SOFT_IIC_config* iic_cfg) { + pika_hal_GPIO_config cfg_SDA = {0}; + cfg_SDA.dir = PIKA_HAL_GPIO_DIR_OUT; + pika_hal_ioctl(iic_cfg->SDA, PIKA_HAL_IOCTL_CONFIG, &cfg_SDA); +} static int _GPIO_write(pika_dev* dev, uint32_t val) { return pika_hal_write(dev, &val, sizeof(val)); @@ -11,11 +24,12 @@ static uint32_t _GPIO_read(pika_dev* dev) { } static void _IIC_Delay(void) { - // Delay implementation, can be modified based on hardware platform. - // You may need to adjust the delay time to match your hardware. 
+ pika_sleep_ms(3); } static void _IIC_Start(pika_hal_SOFT_IIC_config* cfg) { + pika_debug("iic start"); + _IIC_SDA_output(cfg); _GPIO_write(cfg->SDA, 1); _GPIO_write(cfg->SCL, 1); _IIC_Delay(); @@ -25,6 +39,8 @@ static void _IIC_Start(pika_hal_SOFT_IIC_config* cfg) { } static void _IIC_Stop(pika_hal_SOFT_IIC_config* cfg) { + pika_debug("iic stop"); + _IIC_SDA_output(cfg); _GPIO_write(cfg->SDA, 0); _GPIO_write(cfg->SCL, 1); _IIC_Delay(); @@ -32,7 +48,9 @@ static void _IIC_Stop(pika_hal_SOFT_IIC_config* cfg) { _IIC_Delay(); } -static void _IIC_SendByte(pika_hal_SOFT_IIC_config* cfg, uint8_t byte) { +static pika_bool _IIC_SendByte(pika_hal_SOFT_IIC_config* cfg, uint8_t byte) { + pika_debug(" - iic write: 0x%02X", byte); + _IIC_SDA_output(cfg); for (int i = 0; i < 8; i++) { _GPIO_write(cfg->SCL, 0); _IIC_Delay(); @@ -46,11 +64,52 @@ static void _IIC_SendByte(pika_hal_SOFT_IIC_config* cfg, uint8_t byte) { _IIC_Delay(); byte <<= 1; } + + // 在发送完字节后检查ACK信号 _GPIO_write(cfg->SCL, 0); + _IIC_Delay(); + _IIC_SDA_input(cfg); // 设置SDA为输入 + _GPIO_write(cfg->SCL, 1); // 将SCL线设置为高,让从设备发送ACK信号 + + int timeout = 1000; + uint32_t ack = 0; + do { + _IIC_Delay(); + ack = !_GPIO_read(cfg->SDA); // 如果从设备发送了ACK信号,SDA线会被拉低 + } while (ack == 0 && timeout-- > 0); + + // pika_debug("ack timeout:%d", timeout); + if (timeout <= 0) { + pika_platform_printf("Error: IIC write byte timeout\r\n"); + } + + _GPIO_write(cfg->SCL, 0); // 将SCL线设置为低,完成一个I2C周期 + return ack; +} + +static void _IIC_Ack(pika_hal_SOFT_IIC_config* cfg) { + _GPIO_write(cfg->SCL, 0); // 拉低时钟线 + _IIC_SDA_output(cfg); // 设置SDA为输出 + _GPIO_write(cfg->SDA, 0); // 拉低数据线 + _IIC_Delay(); + _GPIO_write(cfg->SCL, 1); // 产生时钟 + _IIC_Delay(); + _GPIO_write(cfg->SCL, 0); // 拉低时钟线 +} + +static void _IIC_NAck(pika_hal_SOFT_IIC_config* cfg) { + _GPIO_write(cfg->SCL, 0); // 拉低时钟线 + _IIC_SDA_output(cfg); // 设置SDA为输出 + _GPIO_write(cfg->SDA, 1); // 数据线拉高 + _IIC_Delay(); + _GPIO_write(cfg->SCL, 1); // 产生时钟 + _IIC_Delay(); + _GPIO_write(cfg->SCL, 0); 
// 拉低时钟线 } static uint8_t _IIC_ReadByte(pika_hal_SOFT_IIC_config* cfg, uint8_t ack) { uint8_t byte = 0; + _IIC_SDA_input(cfg); for (int i = 0; i < 8; i++) { _GPIO_write(cfg->SCL, 1); _IIC_Delay(); @@ -61,77 +120,75 @@ static uint8_t _IIC_ReadByte(pika_hal_SOFT_IIC_config* cfg, uint8_t ack) { _GPIO_write(cfg->SCL, 0); _IIC_Delay(); } + // 在读取完一个字节后发送ACK信号 if (ack) { - _IIC_SendByte(cfg, 0xFF); + _IIC_Ack(cfg); // 如果ack为真,发送ACK信号 } else { - _IIC_SendByte(cfg, 0x00); + _IIC_NAck(cfg); // 如果ack为假,发送NACK信号 } + pika_debug(" - iic read: 0x%02X", byte); return byte; } -static void set_SDA_input(pika_hal_SOFT_IIC_config* cfg) { - pika_hal_GPIO_config cfg_SDA = {0}; - cfg_SDA.dir = PIKA_HAL_GPIO_DIR_IN; - pika_hal_ioctl(cfg->SDA, PIKA_HAL_IOCTL_CONFIG, &cfg_SDA); -} - -static void set_SDA_output(pika_hal_SOFT_IIC_config* cfg) { - pika_hal_GPIO_config cfg_SDA = {0}; - cfg_SDA.dir = PIKA_HAL_GPIO_DIR_OUT; - pika_hal_ioctl(cfg->SDA, PIKA_HAL_IOCTL_CONFIG, &cfg_SDA); -} - int pika_hal_platform_SOFT_IIC_write(pika_dev* dev, void* buf, size_t count) { - pika_hal_SOFT_IIC_config* cfg = + pika_hal_SOFT_IIC_config* iic_cfg = (pika_hal_SOFT_IIC_config*)dev->ioctl_config; uint8_t* data = (uint8_t*)buf; - set_SDA_output(cfg); - _IIC_Start(cfg); + + _IIC_Start(iic_cfg); + uint8_t addr_write = (iic_cfg->slave_addr << 1) | 0x00; // 方向位为0代表写 + // pika_debug("iic addr_write: 0x%02X", addr_write); + _IIC_SendByte(iic_cfg, addr_write); // 方向位为0代表写 // 如果启用了mem_addr_ena,将设备地址和内存地址发送到I2C总线 - if (cfg->mem_addr_ena == PIKA_HAL_IIC_MEM_ADDR_ENA_ENABLE) { - _IIC_SendByte(cfg, cfg->slave_addr); - if (cfg->mem_addr_size == PIKA_HAL_IIC_MEM_ADDR_SIZE_8BIT) { - _IIC_SendByte(cfg, cfg->mem_addr & 0xFF); - } else if (cfg->mem_addr_size == PIKA_HAL_IIC_MEM_ADDR_SIZE_16BIT) { - _IIC_SendByte(cfg, (cfg->mem_addr >> 8) & 0xFF); - _IIC_SendByte(cfg, cfg->mem_addr & 0xFF); + if (iic_cfg->mem_addr_ena == PIKA_HAL_IIC_MEM_ADDR_ENA_ENABLE) { + if (iic_cfg->mem_addr_size == PIKA_HAL_IIC_MEM_ADDR_SIZE_8BIT) { + 
_IIC_SendByte(iic_cfg, iic_cfg->mem_addr & 0xFF); + } else if (iic_cfg->mem_addr_size == PIKA_HAL_IIC_MEM_ADDR_SIZE_16BIT) { + _IIC_SendByte(iic_cfg, (iic_cfg->mem_addr >> 8) & 0xFF); + _IIC_SendByte(iic_cfg, iic_cfg->mem_addr & 0xFF); } } for (int i = 0; i < count; i++) { - _IIC_SendByte(cfg, data[i]); + _IIC_SendByte(iic_cfg, data[i]); } - _IIC_Stop(cfg); + _IIC_Stop(iic_cfg); return count; } int pika_hal_platform_SOFT_IIC_read(pika_dev* dev, void* buf, size_t count) { - pika_hal_SOFT_IIC_config* cfg = + pika_hal_SOFT_IIC_config* iic_cfg = (pika_hal_SOFT_IIC_config*)dev->ioctl_config; uint8_t* data = (uint8_t*)buf; + _IIC_Start(iic_cfg); + // 如果启用了mem_addr_ena,先写设备地址和内存地址 - if (cfg->mem_addr_ena == PIKA_HAL_IIC_MEM_ADDR_ENA_ENABLE) { - set_SDA_output(cfg); - _IIC_Start(cfg); - _IIC_SendByte(cfg, cfg->slave_addr); - if (cfg->mem_addr_size == PIKA_HAL_IIC_MEM_ADDR_SIZE_8BIT) { - _IIC_SendByte(cfg, cfg->mem_addr & 0xFF); - } else if (cfg->mem_addr_size == PIKA_HAL_IIC_MEM_ADDR_SIZE_16BIT) { - _IIC_SendByte(cfg, (cfg->mem_addr >> 8) & 0xFF); - _IIC_SendByte(cfg, cfg->mem_addr & 0xFF); + if (iic_cfg->mem_addr_ena == PIKA_HAL_IIC_MEM_ADDR_ENA_ENABLE) { + uint8_t addr_write = + (iic_cfg->slave_addr << 1) | 0x00; // 方向位为0代表写 + // pika_debug("iic addr_write: 0x%02X", addr_write); + _IIC_SendByte(iic_cfg, addr_write); // 方向位为0代表写 + if (iic_cfg->mem_addr_size == PIKA_HAL_IIC_MEM_ADDR_SIZE_8BIT) { + _IIC_SendByte(iic_cfg, iic_cfg->mem_addr & 0xFF); + } else if (iic_cfg->mem_addr_size == PIKA_HAL_IIC_MEM_ADDR_SIZE_16BIT) { + _IIC_SendByte(iic_cfg, (iic_cfg->mem_addr >> 8) & 0xFF); + _IIC_SendByte(iic_cfg, iic_cfg->mem_addr & 0xFF); } - _IIC_Stop(cfg); + _IIC_Start(iic_cfg); } - set_SDA_input(cfg); - _IIC_Start(cfg); + uint8_t addr_read = (iic_cfg->slave_addr << 1) | 0x01; // 方向位为1代表读 + // pika_debug("iic addr_read: 0x%02X", addr_read); + _IIC_SendByte(iic_cfg, addr_read); // 方向位为1代表读 + for (int i = 0; i < count - 1; i++) { - data[i] = _IIC_ReadByte(cfg, 1); + // data[i] = 
_IIC_ReadByte(iic_cfg, 1); + data[i] = _IIC_ReadByte(iic_cfg, 1); } - data[count - 1] = _IIC_ReadByte(cfg, 0); - _IIC_Stop(cfg); + data[count - 1] = _IIC_ReadByte(iic_cfg, 0); + _IIC_Stop(iic_cfg); return count; } diff --git a/port/linux/package/pikascript/pikascript-lib/re/pcre_exec.c b/port/linux/package/pikascript/pikascript-lib/re/pcre_exec.c index af260b068..401cbd301 100644 --- a/port/linux/package/pikascript/pikascript-lib/re/pcre_exec.c +++ b/port/linux/package/pikascript/pikascript-lib/re/pcre_exec.c @@ -4524,9 +4524,11 @@ HEAP_RETURN: LBL(7) LBL(8) LBL(9) - LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17) LBL(19) LBL(24) - LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33) LBL(35) LBL(43) - LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52) LBL(53) LBL(54) + LBL(10) + LBL(11) + LBL(12) LBL(13) LBL(14) LBL(15) LBL(17) LBL(19) LBL(24) LBL(25) LBL(26) + LBL(27) LBL(29) LBL(31) LBL(33) LBL(35) LBL(43) LBL(47) LBL(48) + LBL(49) LBL(50) LBL(51) LBL(52) LBL(53) LBL(54) #ifdef SUPPORT_UTF8 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30) LBL(32) LBL(34) LBL(42) LBL(46) diff --git a/port/linux/test/python/requests/get_basic.py b/port/linux/test/python/requests/get_basic.py index 30b76720c..57b2d0eaa 100644 --- a/port/linux/test/python/requests/get_basic.py +++ b/port/linux/test/python/requests/get_basic.py @@ -2,7 +2,7 @@ import requests b = "kkk" -a = requests.request("GET", "http://pikascript.com/package", params = {"name":"get-test"}) +a = requests.request("GET", "http://pikapython.com/packages", params = {"name":"get-test"}) print(a.headers) print(a.content_length)