From b85c6bc9ecc164b5b328c39350b2beeb81a911d2 Mon Sep 17 00:00:00 2001 From: ringabout <43030857+ringabout@users.noreply.github.com> Date: Wed, 27 May 2026 19:34:03 +0800 Subject: [PATCH] test pcre2 --- .github/workflows/ci_packages.yml | 2 +- changelog.md | 4 +- doc/lib.md | 7 +- doc/nimgrep.md | 4 +- lib/impure/nre.nim | 227 +++++++++++---------- lib/impure/re.nim | 324 ++++++++++++++++-------------- lib/wrappers/pcre2.nim | 260 ++++++++++++++++++++++++ tools/ci_generate.nim | 4 +- tools/kochdocs.nim | 1 + tools/nimgrep.nim | 2 +- 10 files changed, 570 insertions(+), 265 deletions(-) create mode 100644 lib/wrappers/pcre2.nim diff --git a/.github/workflows/ci_packages.yml b/.github/workflows/ci_packages.yml index 6cef86c28a..fa646abe95 100644 --- a/.github/workflows/ci_packages.yml +++ b/.github/workflows/ci_packages.yml @@ -49,7 +49,7 @@ jobs: DEBIAN_FRONTEND='noninteractive' \ sudo apt-get install --no-install-recommends -yq \ libcurl4-openssl-dev libgc-dev libsdl1.2-dev libsfml-dev \ - valgrind libc6-dbg libblas-dev liblapack-dev libpcre3 xorg-dev + valgrind libc6-dbg libblas-dev liblapack-dev libpcre2-dev xorg-dev - name: 'Install dependencies (macOS)' if: runner.os == 'macOS' run: brew install boehmgc make sfml gtk+3 diff --git a/changelog.md b/changelog.md index af91b49773..f064c13932 100644 --- a/changelog.md +++ b/changelog.md @@ -78,8 +78,8 @@ parameter and result types, not just their source-level shape. Use - `min`, `max`, and `sequtils`' `minIndex`, `maxIndex` and `minmax` for `openArray`s now accept a comparison function. - `system.substr` implementation now uses `copymem` (wrapped C `memcpy`) for copying data, if available at compilation. - `system.newStringUninit` is now considered free of side-effects allowing it to be used with `--experimental:strictFuncs`. -- `std/re` and `std/nre` are deprecated as PCRE library is obsolete. - Use https://github.com/nitely/nim-regex or `std/nre2`. +- `std/re` and `std/nre` now use PCRE2. 
They remain deprecated; + use https://github.com/nitely/nim-regex or `std/nre2`. See: https://github.com/nim-lang/Nim/issues/23668. - `std/pegs` now correctly lexes UTF-8 bytes inside bare identifier-style terminals, so case-insensitive matching of non-ASCII terms (e.g. ``\i café``) diff --git a/doc/lib.md b/doc/lib.md index 1507bbaac7..b7ca390c86 100644 --- a/doc/lib.md +++ b/doc/lib.md @@ -596,12 +596,12 @@ Regular expressions * [re](re.html) Procedures and operators for handling regular - expressions. The current implementation uses PCRE. + expressions. The current implementation uses PCRE2. * [nre](nre.html) Many help functions for handling regular expressions. - The current implementation uses PCRE. + The current implementation uses PCRE2. Database support ---------------- @@ -661,6 +661,9 @@ Regular expressions * [pcre](pcre.html) Wrapper for the PCRE library. +* [pcre2](pcre2.html) + Wrapper for the PCRE2 library. + Database support ---------------- diff --git a/doc/nimgrep.md b/doc/nimgrep.md index 63f7600513..8bc1da8dae 100644 --- a/doc/nimgrep.md +++ b/doc/nimgrep.md @@ -86,7 +86,7 @@ That means you can always use only 1 such an option with logical OR, e.g. Meaning of `^`:literal: and `$`:literal: ======================================== -`nimgrep`:cmd: PCRE engine is run in a single-line mode so +`nimgrep`:cmd: PCRE2 engine is run in a single-line mode so `^`:literal: matches the beginning of whole input *file* and `$`:literal: matches the end of *file* (or whole input *string* for options like `--filename`). 
@@ -97,7 +97,7 @@ Add the `(?m)`:literal: modifier to the beginning of your pattern for Examples ======== -All examples below use default PCRE Regex patterns: +All examples below use default PCRE2 Regex patterns: + To search recursively in Nim files using style-insensitive identifiers: diff --git a/lib/impure/nre.nim b/lib/impure/nre.nim index adc2ceb22d..842ec003be 100644 --- a/lib/impure/nre.nim +++ b/lib/impure/nre.nim @@ -7,21 +7,21 @@ # when defined(js): - {.error: "This library needs to be compiled with a c-like backend, and depends on PCRE; See jsre for JS backend.".} + {.error: "This library needs to be compiled with a c-like backend, and depends on PCRE2; See jsre for JS backend.".} ## .. warning:: NRE is deprecated. ## Use [Regex](https://github.com/nitely/nim-regex) or ## `NRE2 `_ that wraps Regex so that you can easily replace NRE. -## PCRE library is now at end of life. +## This compatibility module uses PCRE2. ## ## What is NRE? ## ============ ## -## A regular expression library for Nim using PCRE to do the hard work. +## A regular expression library for Nim using PCRE2 to do the hard work. ## -## For documentation on how to write patterns, there exists `the official PCRE +## For documentation on how to write patterns, there exists `the official PCRE2 ## pattern documentation -## `_. You can also +## `_. You can also ## search the internet for a wide variety of third-party documentation and ## tools. ## @@ -39,10 +39,8 @@ runnableExamples: ## Licencing ## --------- ## -## PCRE has `some additional terms`_ that you must agree to in order to use -## this module. +## PCRE2 is distributed under a BSD-style licence. ## -## .. 
_`some additional terms`: https://pcre.sourceforge.net/license.txt runnableExamples: import std/sugar let vowels = re"[aeoui]" @@ -66,7 +64,7 @@ runnableExamples: assert find("uxabc", re"(?<=x|y)ab", start = 1).get.captures[-1] == "ab" assert find("uxabc", re"ab", start = 3).isNone -from std/pcre import nil +import std/pcre2 as pcre import nre/private/util import std/tables from std/strutils import `%` @@ -82,7 +80,6 @@ type RegexDesc* = object pattern*: string pcreObj: ptr pcre.Pcre ## not nil - pcreExtra: ptr pcre.ExtraData ## nil captureNameToId: Table[string, int] @@ -93,9 +90,9 @@ type ## ## `pattern: string` ## : the string that was used to create the pattern. For details on how - ## to write a pattern, please see `the official PCRE pattern + ## to write a pattern, please see `the official PCRE2 pattern ## documentation. - ## `_ + ## `_ ## ## `captureCount: int` ## : the number of captures that the pattern has. @@ -140,23 +137,23 @@ type ## NEL (next line, U+0085), LS (line separator, U+2028), and PS ## (paragraph separator, U+2029). For the 8-bit library, the last two ## are recognized only in UTF-8 mode. - ## — man pcre + ## -- man pcre2pattern ## ## - `(*JAVASCRIPT_COMPAT)` - JavaScript compatibility ## - `(*NO_STUDY)` - turn off studying; study is enabled by default ## ## For more details on the leading option groups, see the `Option - ## Setting `_ + ## Setting `_ ## and the `Newline - ## Convention `_ - ## sections of the `PCRE syntax - ## manual `_. + ## Convention `_ + ## sections of the `PCRE2 syntax + ## manual `_. ## - ## Some of these options are not part of PCRE and are converted by nre - ## into PCRE flags. These include `NEVER_UTF`, `ANCHORED`, + ## Some of these options are not part of a pattern and are converted by nre + ## into PCRE2 flags. These include `NEVER_UTF`, `ANCHORED`, ## `DOLLAR_ENDONLY`, `FIRSTLINE`, `NO_AUTO_CAPTURE`, - ## `JAVASCRIPT_COMPAT`, `U`, `NO_STUDY`. 
In other PCRE wrappers, you - ## will need to pass these as separate flags to PCRE. + ## `JAVASCRIPT_COMPAT`, `U`, `NO_STUDY`. In other PCRE2 wrappers, you + ## will need to pass these as separate flags to PCRE2. RegexMatch* = object ## Usually seen as `Option[RegexMatch]`, it represents the result of an @@ -196,7 +193,7 @@ type pattern*: Regex ## The regex doing the matching. ## Not nil. str*: string ## The string that was matched against. - pcreMatchBounds: seq[HSlice[cint, cint]] ## First item is the bounds of the match + pcreMatchBounds: seq[HSlice[csize_t, csize_t]] ## First item is the bounds of the match ## Other items are the captures ## `a` is inclusive start, `b` is exclusive end @@ -227,38 +224,32 @@ when defined(gcDestructors): when defined(nimAllowNonVarDestructor) and defined(nimPreviewNonVarDestructor): proc `=destroy`(pattern: RegexDesc) = `=destroy`(pattern.pattern) - pcre.free_substring(cast[cstring](pattern.pcreObj)) - if pattern.pcreExtra != nil: - pcre.free_study(pattern.pcreExtra) + pcre.code_free(pattern.pcreObj) `=destroy`(pattern.captureNameToId) else: proc `=destroy`(pattern: var RegexDesc) = `=destroy`(pattern.pattern) - pcre.free_substring(cast[cstring](pattern.pcreObj)) - if pattern.pcreExtra != nil: - pcre.free_study(pattern.pcreExtra) + pcre.code_free(pattern.pcreObj) `=destroy`(pattern.captureNameToId) else: proc destroyRegex(pattern: Regex) = `=destroy`(pattern.pattern) - pcre.free_substring(cast[cstring](pattern.pcreObj)) - if pattern.pcreExtra != nil: - pcre.free_study(pattern.pcreExtra) + pcre.code_free(pattern.pcreObj) `=destroy`(pattern.captureNameToId) -proc getinfo[T](pattern: Regex, opt: cint): T = +proc getinfo[T](pattern: Regex, opt: uint32): T = result = default(T) - let retcode = pcre.fullinfo(pattern.pcreObj, pattern.pcreExtra, opt, addr result) + let retcode = pcre.pattern_info(pattern.pcreObj, opt, addr result) if retcode < 0: # XXX Error message that doesn't expose implementation details raise 
newException(FieldDefect, "Invalid getinfo for $1, errno $2" % [$opt, $retcode]) proc getNameToNumberTable(pattern: Regex): Table[string, int] = - let entryCount = getinfo[cint](pattern, pcre.INFO_NAMECOUNT) - let entrySize = getinfo[cint](pattern, pcre.INFO_NAMEENTRYSIZE) + let entryCount = getinfo[uint32](pattern, pcre.INFO_NAMECOUNT).int + let entrySize = getinfo[uint32](pattern, pcre.INFO_NAMEENTRYSIZE).int let table = cast[ptr UncheckedArray[uint8]]( - getinfo[int](pattern, pcre.INFO_NAMETABLE)) + getinfo[pointer](pattern, pcre.INFO_NAMETABLE)) result = initTable[string, int]() @@ -274,61 +265,69 @@ proc getNameToNumberTable(pattern: Regex): Table[string, int] = result[name] = num -proc initRegex(pattern: string, flags: int, study = true): Regex = +proc pcreErrorMessage(errorCode: cint): string = + var buffer: array[256, uint8] + let length = pcre.get_error_message(errorCode, addr buffer[0], buffer.len.csize_t) + if length >= 0: + result = newString(length) + if length > 0: + copyMem(addr result[0], addr buffer[0], length) + else: + result = $errorCode + +proc jitCompile(pattern: ptr pcre.Pcre) = + var hasJit: cint = 0 + if pcre.config(pcre.CONFIG_JIT, addr hasJit) == 0 and hasJit == 1: + discard pcre.jit_compile(pattern, pcre.JIT_COMPLETE.uint32) + +proc initRegex(pattern: string, flags: uint32, study = true): Regex = when defined(gcDestructors): result = Regex() else: new(result, destroyRegex) result.pattern = pattern - var errorMsg: cstring = "" - var errOffset: cint = 0 + var + errorCode: cint = 0 + errOffset: csize_t = 0 - result.pcreObj = pcre.compile(cstring(pattern), - # better hope int is at least 4 bytes.. 
- cint(flags), addr errorMsg, + result.pcreObj = pcre.compile(cast[ptr uint8](cstring(pattern)), + pattern.len.csize_t, flags, addr errorCode, addr errOffset, nil) if result.pcreObj == nil: # failed to compile - raise SyntaxError(msg: $errorMsg, pos: errOffset, pattern: pattern) + raise SyntaxError(msg: pcreErrorMessage(errorCode), pos: errOffset.int, + pattern: pattern) if study: - var options: cint = 0 - var hasJit: cint = cint(0) - if pcre.config(pcre.CONFIG_JIT, addr hasJit) == 0: - if hasJit == 1'i32: - options = pcre.STUDY_JIT_COMPILE - result.pcreExtra = pcre.study(result.pcreObj, options, addr errorMsg) - if errorMsg != nil: - raise StudyError(msg: $errorMsg) + jitCompile(result.pcreObj) result.captureNameToId = result.getNameToNumberTable() proc captureCount*(pattern: Regex): int = - return getinfo[cint](pattern, pcre.INFO_CAPTURECOUNT) + return getinfo[uint32](pattern, pcre.INFO_CAPTURECOUNT).int proc captureNameId*(pattern: Regex): Table[string, int] = return pattern.captureNameToId proc matchesCrLf(pattern: Regex): bool = - let flags = uint32(getinfo[culong](pattern, pcre.INFO_OPTIONS)) - let newlineFlags = flags and (pcre.NEWLINE_CRLF or - pcre.NEWLINE_ANY or - pcre.NEWLINE_ANYCRLF) - if newlineFlags > 0u32: + let newline = getinfo[uint32](pattern, pcre.INFO_NEWLINE) + case newline + of pcre.NEWLINE_CRLF, pcre.NEWLINE_ANY, pcre.NEWLINE_ANYCRLF: return true + of pcre.NEWLINE_CR, pcre.NEWLINE_LF, pcre.NEWLINE_NUL: + return false + else: + discard # get flags from build config - var confFlags: cint = cint(0) + var confFlags: uint32 = 0 if pcre.config(pcre.CONFIG_NEWLINE, addr confFlags) != 0: assert(false, "CONFIG_NEWLINE apparently got screwed up") case confFlags - of 13: return false - of 10: return false - of (13 shl 8) or 10: return true - of -2: return true - of -1: return true + of pcre.NEWLINE_CR, pcre.NEWLINE_LF, pcre.NEWLINE_NUL: return false + of pcre.NEWLINE_CRLF, pcre.NEWLINE_ANY, pcre.NEWLINE_ANYCRLF: return true else: return false @@ -338,7 
+337,9 @@ func captures*(pattern: RegexMatch): Captures = return Captures(pattern) func contains*(pattern: CaptureBounds, i: int): bool = let pattern = RegexMatch(pattern) - pattern.pcreMatchBounds[i + 1].a != -1 + let index = i + 1 + index >= 0 and index < pattern.pcreMatchBounds.len and + pattern.pcreMatchBounds[index].a != pcre.UNSET func contains*(pattern: Captures, i: int): bool = i in CaptureBounds(pattern) @@ -349,7 +350,7 @@ func `[]`*(pattern: CaptureBounds, i: int): HSlice[int, int] = raise newException(IndexDefect, "Group '" & $i & "' was not captured") let bounds = pattern.pcreMatchBounds[i + 1] - int(bounds.a)..int(bounds.b-1) + int(bounds.a) .. (int(bounds.b) - 1) func `[]`*(pattern: Captures, i: int): string = let pattern = RegexMatch(pattern) @@ -437,8 +438,7 @@ proc `$`*(pattern: RegexMatch): string = proc `==`*(a, b: Regex): bool = if not a.isNil and not b.isNil: return a.pattern == b.pattern and - a.pcreObj == b.pcreObj and - a.pcreExtra == b.pcreExtra + a.pcreObj == b.pcreObj else: return system.`==`(a, b) @@ -453,7 +453,7 @@ const PcreOptions = { "FIRSTLINE": pcre.FIRSTLINE, "NO_AUTO_CAPTURE": pcre.NO_AUTO_CAPTURE, "JAVASCRIPT_COMPAT": pcre.JAVASCRIPT_COMPAT, - "U": pcre.UTF8 or pcre.UCP + "U": pcre.UTF or pcre.UCP }.toTable # Options that are supported inside regular expressions themselves @@ -503,46 +503,63 @@ proc extractOptions(pattern: string): tuple[pattern: string, flags: int, study: proc re*(pattern: string): Regex = let (pattern, flags, study) = extractOptions(pattern) - initRegex(pattern, flags, study) + initRegex(pattern, cast[uint32](flags), study) -proc matchImpl(str: string, pattern: Regex, start, endpos: int, flags: int): Option[RegexMatch] = +func isInvalidUnicodeError(errorCode: cint): bool = + (errorCode <= pcre.ERROR_UTF8_ERR1 and errorCode >= pcre.ERROR_UTF8_ERR21) or + errorCode == pcre.ERROR_BADUTFOFFSET or + errorCode == pcre.ERROR_DFA_UINVALID_UTF + +proc newMatchData(pattern: Regex): ptr pcre.MatchData = + result = 
pcre.match_data_create_from_pattern(pattern.pcreObj, nil) + if result == nil: + raise RegexInternalError(msg: "could not allocate PCRE2 match data") + +proc matchImpl(str: string, pattern: Regex, start, endpos: int, options: uint32): Option[RegexMatch] = var myResult = RegexMatch(pattern: pattern, str: str) - # See PCRE man pages. - # 2x capture count to make room for start-end pairs - # 1x capture count as slack space for PCRE - let vecsize = (pattern.captureCount() + 1) * 3 - # div 2 because each element is 2 cints long - # plus 1 because we need the ceiling, not the floor - myResult.pcreMatchBounds = newSeq[HSlice[cint, cint]]((vecsize + 1) div 2) - myResult.pcreMatchBounds.setLen(vecsize div 3) + myResult.pcreMatchBounds = newSeq[HSlice[csize_t, csize_t]](pattern.captureCount() + 1) let strlen = if endpos == int.high: str.len else: endpos+1 doAssert(strlen <= str.len) # don't want buffer overflows + if start < 0 or start > strlen: + return none(RegexMatch) + + let matchData = newMatchData(pattern) + defer: pcre.match_data_free(matchData) + let execRet = pcre.match(pattern.pcreObj, + cast[ptr uint8](cstring(str)), + strlen.csize_t, + start.csize_t, + options, + matchData, + nil) + let rawMatches = cast[ptr UncheckedArray[csize_t]](pcre.get_ovector_pointer(matchData)) + let ovectorCount = min(myResult.pcreMatchBounds.len, + pcre.get_ovector_count(matchData).int) + for i in 0 ..< ovectorCount: + myResult.pcreMatchBounds[i] = rawMatches[i * 2] .. rawMatches[i * 2 + 1] - let execRet = pcre.exec(pattern.pcreObj, - pattern.pcreExtra, - cstring(str), - cint(strlen), - cint(start), - cint(flags), - cast[ptr cint](addr myResult.pcreMatchBounds[0]), - cint(vecsize)) if execRet >= 0: return some(myResult) - case execRet: - of pcre.ERROR_NOMATCH: - return none(RegexMatch) - of pcre.ERROR_NULL: - raise newException(AccessViolationDefect, "Expected non-null parameters") - of pcre.ERROR_BADOPTION: - raise RegexInternalError(msg: "Unknown pattern flag. 
Either a bug or " & - "outdated PCRE.") - of pcre.ERROR_BADUTF8, pcre.ERROR_SHORTUTF8, pcre.ERROR_BADUTF8_OFFSET: - raise InvalidUnicodeError(msg: "Invalid unicode byte sequence", - pos: myResult.pcreMatchBounds[0].a) + if isInvalidUnicodeError(execRet): + let errorPos = if myResult.pcreMatchBounds.len > 0 and + myResult.pcreMatchBounds[0].a != pcre.UNSET: + myResult.pcreMatchBounds[0].a.int else: - raise RegexInternalError(msg: "Unknown internal error: " & $execRet) + start + raise InvalidUnicodeError(msg: "Invalid unicode byte sequence", pos: errorPos) + + case execRet + of pcre.ERROR_NOMATCH: + return none(RegexMatch) + of pcre.ERROR_NULL: + raise newException(AccessViolationDefect, "Expected non-null parameters") + of pcre.ERROR_BADOPTION: + raise RegexInternalError(msg: "Unknown pattern flag. Either a bug or " & + "outdated PCRE2.") + else: + raise RegexInternalError(msg: "Unknown internal error: " & $execRet) proc match*(str: string, pattern: Regex, start = 0, endpos = int.high): Option[RegexMatch] = ## Like `find(...)<#find,string,Regex,int>`_, but anchored to the start of the @@ -559,7 +576,7 @@ proc match*(str: string, pattern: Regex, start = 0, endpos = int.high): Option[R assert 0 in "abc".match(re"(\w)").get.captureBounds assert "abc".match(re"").get.captureBounds[-1] == 0 .. -1 assert "abc".match(re"abc").get.captureBounds[-1] == 0 .. 
2 - return str.matchImpl(pattern, start, endpos, pcre.ANCHORED) + return str.matchImpl(pattern, start, endpos, cast[uint32](pcre.ANCHORED)) iterator findIter*(str: string, pattern: Regex, start = 0, endpos = int.high): RegexMatch = ## Works the same as `find(...)<#find,string,Regex,int>`_, but finds every @@ -573,21 +590,21 @@ iterator findIter*(str: string, pattern: Regex, start = 0, endpos = int.high): R ## Variants: ## ## - `proc findAll(...)` returns a `seq[string]` - # see pcredemo for explanation => https://www.pcre.org/original/doc/html/pcredemo.html + # see pcre2demo for explanation => https://www.pcre.org/current/doc/html/pcre2demo.html let matchesCrLf = pattern.matchesCrLf() - let unicode = uint32(getinfo[culong](pattern, pcre.INFO_OPTIONS) and - pcre.UTF8) > 0u32 + let unicode = uint32(getinfo[uint32](pattern, pcre.INFO_ALLOPTIONS) and + pcre.UTF.uint32) > 0u32 let strlen = if endpos == int.high: str.len else: endpos+1 var offset = start var match: Option[RegexMatch] = default(Option[RegexMatch]) var neverMatched = true while true: - var flags = 0 + var flags = 0'u32 if match.isSome and match.get.matchBounds.a > match.get.matchBounds.b: # 0-len match - flags = pcre.NOTEMPTY_ATSTART + flags = pcre.NOTEMPTY_ATSTART.uint32 match = str.matchImpl(pattern, offset, endpos, flags) if match.isNone: @@ -623,7 +640,7 @@ proc find*(str: string, pattern: Regex, start = 0, endpos = int.high): Option[Re ## `endpos` ## : The maximum index for a match; `int.high` means the end of the ## string, otherwise it’s an inclusive upper bound. 
- return str.matchImpl(pattern, start, endpos, 0) + return str.matchImpl(pattern, start, endpos, 0'u32) proc findAll*(str: string, pattern: Regex, start = 0, endpos = int.high): seq[string] = result = @[] diff --git a/lib/impure/re.nim b/lib/impure/re.nim index 72d01b9527..b9b7d3704a 100644 --- a/lib/impure/re.nim +++ b/lib/impure/re.nim @@ -8,27 +8,25 @@ # when defined(js): - {.error: "This library needs to be compiled with a c-like backend, and depends on PCRE; See jsre for JS backend.".} + {.error: "This library needs to be compiled with a c-like backend, and depends on PCRE2; See jsre for JS backend.".} ## .. warning:: This module is deprecated. ## Use [Regex](https://github.com/nitely/nim-regex). -## PCRE library is now at end of life. +## This compatibility module uses PCRE2. ## ## Regular expression support for Nim. ## ## This module is implemented by providing a wrapper around the -## `PCRE (Perl-Compatible Regular Expressions) `_ -## C library. This means that your application will depend on the PCRE +## `PCRE2 (Perl-Compatible Regular Expressions) `_ +## C library. This means that your application will depend on the PCRE2 ## library's licence when using this module, which should not be a problem ## though. ## ## .. note:: There are also alternative nimble packages such as [tinyre](https://github.com/khchen/tinyre) ## and [regex](https://github.com/nitely/nim-regex). ## -## PCRE's licence follows: -## -## .. include:: ../../doc/regexprs.txt -## +## PCRE2 is distributed under a BSD-style licence. 
+ runnableExamples: ## Unless specified otherwise, `start` parameter in each proc indicates @@ -40,7 +38,7 @@ runnableExamples: # can't match start of string since we're starting at 1 import - std/[pcre, strutils, rtarrays] + std/[pcre2, strutils] when defined(nimPreviewSlimSystem): import std/syncio @@ -60,8 +58,7 @@ type ## expression will be used only once) RegexDesc = object - h: ptr Pcre - e: ptr ExtraData + h: ptr pcre2.Pcre Regex* = ref RegexDesc ## a compiled regular expression @@ -71,14 +68,10 @@ type when defined(gcDestructors): when defined(nimAllowNonVarDestructor): proc `=destroy`(x: RegexDesc) = - pcre.free_substring(cast[cstring](x.h)) - if not isNil(x.e): - pcre.free_study(x.e) + pcre2.code_free(x.h) else: proc `=destroy`(x: var RegexDesc) = - pcre.free_substring(cast[cstring](x.h)) - if not isNil(x.e): - pcre.free_study(x.e) + pcre2.code_free(x.h) proc raiseInvalidRegex(msg: string) {.noinline, noreturn.} = var e: ref RegexError @@ -86,21 +79,43 @@ proc raiseInvalidRegex(msg: string) {.noinline, noreturn.} = e.msg = msg raise e -proc rawCompile(pattern: string, flags: cint): ptr Pcre = +proc pcre2ErrorMessage(errorCode: cint): string = + var buffer: array[256, uint8] + let length = pcre2.get_error_message(errorCode, addr buffer[0], buffer.len.csize_t) + if length >= 0: + result = newString(length) + if length > 0: + copyMem(addr result[0], addr buffer[0], length) + else: + result = $errorCode + +proc rawCompile(pattern: string, options: uint32): ptr pcre2.Pcre = var - msg: cstring = "" - offset: cint = 0 - result = pcre.compile(pattern, flags, addr(msg), addr(offset), nil) + errorCode: cint = 0 + offset: csize_t = 0 + result = pcre2.compile(cast[ptr uint8](pattern.cstring), pattern.len.csize_t, + options, addr errorCode, addr offset, nil) if result == nil: - raiseInvalidRegex($msg & "\n" & pattern & "\n" & spaces(offset) & "^\n") + raiseInvalidRegex(pcre2ErrorMessage(errorCode) & "\n" & pattern & "\n" & + spaces(offset.int) & "^\n") proc 
finalizeRegEx(x: Regex) = - # XXX This is a hack, but PCRE does not export its "free" function properly. - # Sigh. The hack relies on PCRE's implementation (see `pcre_get.c`). - # Fortunately the implementation is unlikely to change. - pcre.free_substring(cast[cstring](x.h)) - if not isNil(x.e): - pcre.free_study(x.e) + pcre2.code_free(x.h) + +func toPcre2Options(flags: set[RegexFlag]): uint32 = + if reIgnoreCase in flags: + result = result or pcre2.CASELESS.uint32 + if reMultiLine in flags: + result = result or pcre2.MULTILINE.uint32 + if reDotAll in flags: + result = result or pcre2.DOTALL.uint32 + if reExtended in flags: + result = result or pcre2.EXTENDED.uint32 + +proc jitCompile(pattern: ptr pcre2.Pcre) = + var hasJit: cint = 0 + if pcre2.config(pcre2.CONFIG_JIT, addr hasJit) == 0 and hasJit == 1: + discard pcre2.jit_compile(pattern, pcre2.JIT_COMPLETE.uint32) proc re*(s: string, flags = {reStudy}): Regex = ## Constructor of regular expressions. @@ -116,16 +131,9 @@ proc re*(s: string, flags = {reStudy}): Regex = result = Regex() else: new(result, finalizeRegEx) - result.h = rawCompile(s, cast[cint](flags - {reStudy})) + result.h = rawCompile(s, toPcre2Options(flags)) if reStudy in flags: - var msg: cstring = "" - var options: cint = 0 - var hasJit: cint = 0 - if pcre.config(pcre.CONFIG_JIT, addr hasJit) == 0: - if hasJit == 1'i32: - options = pcre.STUDY_JIT_COMPILE - result.e = pcre.study(result.h, options, addr msg) - if not isNil(msg): raiseInvalidRegex($msg) + jitCompile(result.h) proc rex*(s: string, flags = {reStudy, reExtended}): Regex = ## Constructor for extended regular expressions. 
@@ -142,25 +150,58 @@ proc bufSubstr(b: cstring, sPos, ePos: int): string {.inline.} = copyMem(addr(result[0]), unsafeAddr(b[sPos]), sz) result.setLen(sz) -proc matchOrFind(buf: cstring, pattern: Regex, matches: var openArray[string], - start, bufSize, flags: cint): cint = - var - rtarray = initRtArray[cint]((matches.len+1)*3) - rawMatches = rtarray.getRawData - res = pcre.exec(pattern.h, pattern.e, buf, bufSize, start, flags, - cast[ptr cint](rawMatches), (matches.len+1).cint*3) - if res < 0'i32: return res - for i in 1..int(res)-1: - var a = rawMatches[i * 2] - var b = rawMatches[i * 2 + 1] - if a >= 0'i32: - matches[i-1] = bufSubstr(buf, int(a), int(b)) - else: matches[i-1] = "" - return rawMatches[1] - rawMatches[0] +proc newMatchData(slots: int): ptr pcre2.MatchData = + result = pcre2.match_data_create(max(slots, 1).uint32, nil) + if result == nil: + raiseInvalidRegex("could not allocate PCRE2 match data") -const MaxReBufSize* = high(cint) - ## Maximum PCRE (API 1) buffer start/size equal to `high(cint)`, which even - ## for 64-bit systems can be either 2`31`:sup:-1 or 2`63`:sup:-1. +template ovector(matchData: ptr pcre2.MatchData): ptr UncheckedArray[csize_t] = + cast[ptr UncheckedArray[csize_t]](pcre2.get_ovector_pointer(matchData)) + +proc rawMatch(buf: cstring, pattern: Regex, start, bufSize: int, + options: uint32, matchData: ptr pcre2.MatchData): cint = + if start < 0 or bufSize < 0: + return pcre2.ERROR_BADOFFSET + pcre2.match(pattern.h, cast[ptr uint8](buf), bufSize.csize_t, + start.csize_t, options, matchData, nil) + +proc copyStringMatches(buf: cstring, rawMatches: ptr UncheckedArray[csize_t], + captureCount: int, matches: var openArray[string]) = + let upper = min(captureCount - 1, matches.len) + if upper > 0: + for i in 1 .. 
upper: + let matchStart = rawMatches[i * 2] + let matchEnd = rawMatches[i * 2 + 1] + if matchStart != pcre2.UNSET: + matches[i-1] = bufSubstr(buf, int(matchStart), int(matchEnd)) + else: + matches[i-1] = "" + +proc copyBoundsMatches(rawMatches: ptr UncheckedArray[csize_t], + captureCount: int, + matches: var openArray[tuple[first, last: int]]) = + let upper = min(captureCount - 1, matches.len) + if upper > 0: + for i in 1 .. upper: + let matchStart = rawMatches[i * 2] + let matchEnd = rawMatches[i * 2 + 1] + if matchStart != pcre2.UNSET: + matches[i-1] = (int(matchStart), int(matchEnd) - 1) + else: + matches[i-1] = (-1, 0) + +proc matchOrFind(buf: cstring, pattern: Regex, matches: var openArray[string], + start, bufSize: int, options: uint32): int = + let matchData = newMatchData(matches.len + 1) + defer: pcre2.match_data_free(matchData) + let res = rawMatch(buf, pattern, start, bufSize, options, matchData) + let rawMatches = ovector(matchData) + if res < 0: return int(res) + copyStringMatches(buf, rawMatches, int(res), matches) + return int(rawMatches[1]) - int(rawMatches[0]) + +const MaxReBufSize* = high(int) + ## Maximum PCRE2 buffer start/size accepted by this Nim API. proc findBounds*(buf: cstring, pattern: Regex, matches: var openArray[string], start = 0, bufSize: int): tuple[first, last: int] = @@ -172,17 +213,12 @@ proc findBounds*(buf: cstring, pattern: Regex, matches: var openArray[string], ## ## Note: The memory for `matches` needs to be allocated before this function is ## called, otherwise it will just remain empty. 
- var - rtarray = initRtArray[cint]((matches.len+1)*3) - rawMatches = rtarray.getRawData - res = pcre.exec(pattern.h, pattern.e, buf, bufSize.cint, start.cint, 0'i32, - cast[ptr cint](rawMatches), (matches.len+1).cint*3) - if res < 0'i32: return (-1, 0) - for i in 1..int(res)-1: - var a = rawMatches[i * 2] - var b = rawMatches[i * 2 + 1] - if a >= 0'i32: matches[i-1] = bufSubstr(buf, int(a), int(b)) - else: matches[i-1] = "" + let matchData = newMatchData(matches.len + 1) + defer: pcre2.match_data_free(matchData) + let res = rawMatch(buf, pattern, start, bufSize, 0'u32, matchData) + let rawMatches = ovector(matchData) + if res < 0: return (-1, 0) + copyStringMatches(buf, rawMatches, int(res), matches) return (rawMatches[0].int, rawMatches[1].int - 1) proc findBounds*(s: string, pattern: Regex, matches: var openArray[string], @@ -212,17 +248,12 @@ proc findBounds*(buf: cstring, pattern: Regex, ## `(-1,0)` is returned. ## ## .. note:: The memory for `matches` needs to be allocated before this function is called, otherwise it will just remain empty. 
- var - rtarray = initRtArray[cint]((matches.len+1)*3) - rawMatches = rtarray.getRawData - res = pcre.exec(pattern.h, pattern.e, buf, bufSize.cint, start.cint, 0'i32, - cast[ptr cint](rawMatches), (matches.len+1).cint*3) - if res < 0'i32: return (-1, 0) - for i in 1..int(res)-1: - var a = rawMatches[i * 2] - var b = rawMatches[i * 2 + 1] - if a >= 0'i32: matches[i-1] = (int(a), int(b)-1) - else: matches[i-1] = (-1,0) + let matchData = newMatchData(matches.len + 1) + defer: pcre2.match_data_free(matchData) + let res = rawMatch(buf, pattern, start, bufSize, 0'u32, matchData) + let rawMatches = ovector(matchData) + if res < 0: return (-1, 0) + copyBoundsMatches(rawMatches, int(res), matches) return (rawMatches[0].int, rawMatches[1].int - 1) proc findBounds*(s: string, pattern: Regex, @@ -244,29 +275,28 @@ proc findBounds*(s: string, pattern: Regex, min(start, MaxReBufSize), min(s.len, MaxReBufSize)) proc findBoundsImpl(buf: cstring, pattern: Regex, - start = 0, bufSize = 0, flags = 0): tuple[first, last: int] = - var rtarray = initRtArray[cint](3) - let rawMatches = rtarray.getRawData - let res = pcre.exec(pattern.h, pattern.e, buf, bufSize.cint, start.cint, flags.int32, - cast[ptr cint](rawMatches), 3) - - if res < 0'i32: + start = 0, bufSize = 0, + options = 0'u32): tuple[first, last: int] = + let matchData = newMatchData(1) + defer: pcre2.match_data_free(matchData) + let res = rawMatch(buf, pattern, start, bufSize, options, matchData) + let rawMatches = ovector(matchData) + if res < 0: result = (-1, 0) else: - result = (int(rawMatches[0]), int(rawMatches[1]-1)) + result = (int(rawMatches[0]), int(rawMatches[1]) - 1) proc findBounds*(buf: cstring, pattern: Regex, start = 0, bufSize: int): tuple[first, last: int] = ## returns the `first` and `last` position of `pattern` in `buf`, ## where `buf` has length `bufSize` (not necessarily `'\0'` terminated). ## If it does not match, `(-1,0)` is returned. 
- var - rtarray = initRtArray[cint](3) - rawMatches = rtarray.getRawData - res = pcre.exec(pattern.h, pattern.e, buf, bufSize.cint, start.cint, 0'i32, - cast[ptr cint](rawMatches), 3) - if res < 0'i32: return (int(res), 0) - return (int(rawMatches[0]), int(rawMatches[1]-1)) + let matchData = newMatchData(1) + defer: pcre2.match_data_free(matchData) + let res = rawMatch(buf, pattern, start, bufSize, 0'u32, matchData) + let rawMatches = ovector(matchData) + if res < 0: return (int(res), 0) + return (int(rawMatches[0]), int(rawMatches[1]) - 1) proc findBounds*(s: string, pattern: Regex, start = 0): tuple[first, last: int] {.inline.} = @@ -279,14 +309,16 @@ proc findBounds*(s: string, pattern: Regex, result = findBounds(cstring(s), pattern, min(start, MaxReBufSize), min(s.len, MaxReBufSize)) -proc matchOrFind(buf: cstring, pattern: Regex, start, bufSize: int, flags: cint): cint = - var - rtarray = initRtArray[cint](3) - rawMatches = rtarray.getRawData - result = pcre.exec(pattern.h, pattern.e, buf, bufSize.cint, start.cint, flags, - cast[ptr cint](rawMatches), 3) - if result >= 0'i32: - result = rawMatches[1] - rawMatches[0] +proc matchOrFind(buf: cstring, pattern: Regex, start, bufSize: int, + options: uint32): int = + let matchData = newMatchData(1) + defer: pcre2.match_data_free(matchData) + let res = rawMatch(buf, pattern, start, bufSize, options, matchData) + if res >= 0: + let rawMatches = ovector(matchData) + result = int(rawMatches[1]) - int(rawMatches[0]) + else: + result = int(res) proc matchLen*(s: string, pattern: Regex, matches: var openArray[string], start = 0): int {.inline.} = @@ -295,7 +327,7 @@ proc matchLen*(s: string, pattern: Regex, matches: var openArray[string], ## of zero can happen. ## ## .. note:: The memory for `matches` needs to be allocated before this function is called, otherwise it will just remain empty. 
- result = matchOrFind(cstring(s), pattern, matches, start.cint, s.len.cint, pcre.ANCHORED) + result = matchOrFind(cstring(s), pattern, matches, start, s.len, cast[uint32](pcre2.ANCHORED)) proc matchLen*(buf: cstring, pattern: Regex, matches: var openArray[string], start = 0, bufSize: int): int {.inline.} = @@ -304,7 +336,7 @@ proc matchLen*(buf: cstring, pattern: Regex, matches: var openArray[string], ## of zero can happen. ## ## .. note:: The memory for `matches` needs to be allocated before this function is called, otherwise it will just remain empty. - return matchOrFind(buf, pattern, matches, start.cint, bufSize.cint, pcre.ANCHORED) + return matchOrFind(buf, pattern, matches, start, bufSize, cast[uint32](pcre2.ANCHORED)) proc matchLen*(s: string, pattern: Regex, start = 0): int {.inline.} = ## the same as `match`, but it returns the length of the match, @@ -315,13 +347,13 @@ proc matchLen*(s: string, pattern: Regex, start = 0): int {.inline.} = doAssert matchLen("abcdefg", re"cde", 2) == 3 doAssert matchLen("abcdefg", re"abcde") == 5 doAssert matchLen("abcdefg", re"cde") == -1 - result = matchOrFind(cstring(s), pattern, start.cint, s.len.cint, pcre.ANCHORED) + result = matchOrFind(cstring(s), pattern, start, s.len, cast[uint32](pcre2.ANCHORED)) proc matchLen*(buf: cstring, pattern: Regex, start = 0, bufSize: int): int {.inline.} = ## the same as `match`, but it returns the length of the match, ## if there is no match, `-1` is returned. Note that a match length ## of zero can happen. - result = matchOrFind(buf, pattern, start.cint, bufSize, pcre.ANCHORED) + result = matchOrFind(buf, pattern, start, bufSize, cast[uint32](pcre2.ANCHORED)) proc match*(s: string, pattern: Regex, start = 0): bool {.inline.} = ## returns `true` if `s[start..]` matches the `pattern`. @@ -361,18 +393,13 @@ proc find*(buf: cstring, pattern: Regex, matches: var openArray[string], ## `buf` has length `bufSize` (not necessarily `'\0'` terminated). ## ## .. 
note:: The memory for `matches` needs to be allocated before this function is called, otherwise it will just remain empty. - var - rtarray = initRtArray[cint]((matches.len+1)*3) - rawMatches = rtarray.getRawData - res = pcre.exec(pattern.h, pattern.e, buf, bufSize.cint, start.cint, 0'i32, - cast[ptr cint](rawMatches), (matches.len+1).cint*3) - if res < 0'i32: return res - for i in 1..int(res)-1: - var a = rawMatches[i * 2] - var b = rawMatches[i * 2 + 1] - if a >= 0'i32: matches[i-1] = bufSubstr(buf, int(a), int(b)) - else: matches[i-1] = "" - return rawMatches[0] + let matchData = newMatchData(matches.len + 1) + defer: pcre2.match_data_free(matchData) + let res = rawMatch(buf, pattern, start, bufSize, 0'u32, matchData) + let rawMatches = ovector(matchData) + if res < 0: return int(res) + copyStringMatches(buf, rawMatches, int(res), matches) + return int(rawMatches[0]) proc find*(s: string, pattern: Regex, matches: var openArray[string], start = 0): int {.inline.} = @@ -387,13 +414,12 @@ proc find*(buf: cstring, pattern: Regex, start = 0, bufSize: int): int = ## returns the starting position of `pattern` in `buf`, ## where `buf` has length `bufSize` (not necessarily `'\0'` terminated). ## If it does not match, `-1` is returned. - var - rtarray = initRtArray[cint](3) - rawMatches = rtarray.getRawData - res = pcre.exec(pattern.h, pattern.e, buf, bufSize.cint, start.cint, 0'i32, - cast[ptr cint](rawMatches), 3) - if res < 0'i32: return res - return rawMatches[0] + let matchData = newMatchData(1) + defer: pcre2.match_data_free(matchData) + let res = rawMatch(buf, pattern, start, bufSize, 0'u32, matchData) + let rawMatches = ovector(matchData) + if res < 0: return int(res) + return int(rawMatches[0]) proc find*(s: string, pattern: Regex, start = 0): int {.inline.} = ## returns the starting position of `pattern` in `s`. 
If it does not @@ -413,40 +439,38 @@ iterator findAll*(s: string, pattern: Regex, start = 0): string = ## ## Note that since this is an iterator you should not modify the string you ## are iterating over: bad things could happen. - var - i = int32(start) - rtarray = initRtArray[cint](3) - rawMatches = rtarray.getRawData + var i = start + let matchData = newMatchData(1) + defer: pcre2.match_data_free(matchData) while true: - let res = pcre.exec(pattern.h, pattern.e, s, len(s).cint, i, 0'i32, - cast[ptr cint](rawMatches), 3) - if res < 0'i32: break - let a = rawMatches[0] - let b = rawMatches[1] - if a == b and a == i: break - yield substr(s, int(a), int(b)-1) - i = b + let res = rawMatch(s.cstring, pattern, i, len(s), 0'u32, matchData) + if res < 0: break + let rawMatches = ovector(matchData) + let matchStart = rawMatches[0] + let matchEnd = rawMatches[1] + if matchStart == matchEnd and matchStart.int == i: break + yield substr(s, int(matchStart), int(matchEnd) - 1) + i = matchEnd.int iterator findAll*(buf: cstring, pattern: Regex, start = 0, bufSize: int): string = ## Yields all matching `substrings` of `s` that match `pattern`. ## ## Note that since this is an iterator you should not modify the string you ## are iterating over: bad things could happen. 
- var - i = int32(start) - rtarray = initRtArray[cint](3) - rawMatches = rtarray.getRawData + var i = start + let matchData = newMatchData(1) + defer: pcre2.match_data_free(matchData) while true: - let res = pcre.exec(pattern.h, pattern.e, buf, bufSize.cint, i, 0'i32, - cast[ptr cint](rawMatches), 3) - if res < 0'i32: break - let a = rawMatches[0] - let b = rawMatches[1] - if a == b and a == i: break - var str = newString(b-a) - copyMem(str[0].addr, unsafeAddr(buf[a]), b-a) + let res = rawMatch(buf, pattern, i, bufSize, 0'u32, matchData) + if res < 0: break + let rawMatches = ovector(matchData) + let matchStart = rawMatches[0] + let matchEnd = rawMatches[1] + if matchStart == matchEnd and matchStart.int == i: break + var str = newString(int(matchEnd - matchStart)) + copyMem(str[0].addr, unsafeAddr(buf[int(matchStart)]), int(matchEnd - matchStart)) yield str - i = b + i = matchEnd.int proc findAll*(s: string, pattern: Regex, start = 0): seq[string] {.inline.} = ## returns all matching `substrings` of `s` that match `pattern`. @@ -503,7 +527,7 @@ proc replace*(s: string, sub: Regex, by = ""): string = doAssert "var1=key; var2=key2".replace(re"(\w+)=(\w+)", "?") == "?; ?" result = "" var prev = 0 - var flags = int32(0) + var flags = 0'u32 while prev < s.len: var match = findBoundsImpl(s.cstring, sub, prev, s.len, flags) flags = 0 @@ -512,7 +536,7 @@ proc replace*(s: string, sub: Regex, by = ""): string = add(result, by) if match.first > match.last: # 0-len match - flags = pcre.NOTEMPTY_ATSTART + flags = pcre2.NOTEMPTY_ATSTART.uint32 prev = match.last + 1 add(result, substr(s, prev)) diff --git a/lib/wrappers/pcre2.nim b/lib/wrappers/pcre2.nim new file mode 100644 index 0000000000..721c718dbe --- /dev/null +++ b/lib/wrappers/pcre2.nim @@ -0,0 +1,260 @@ +# +# Nim's Runtime Library +# (c) Copyright 2026 Nim Contributors +# +# See the file "copying.txt", included in this +# distribution, for details about the copyright. +# + +## Wrapper for the 8-bit PCRE2 API. 
+ +when sizeof(int) == 4: + const ANCHORED* = low(int) +else: + const ANCHORED* = int(0x80000000) + +const + NO_UTF_CHECK* = int(0x40000000) + ENDANCHORED* = int(0x20000000) + +const + ALLOW_EMPTY_CLASS* = 0x00000001 + ALT_BSUX* = 0x00000002 + AUTO_CALLOUT* = 0x00000004 + CASELESS* = 0x00000008 + DOLLAR_ENDONLY* = 0x00000010 + DOTALL* = 0x00000020 + DUPNAMES* = 0x00000040 + EXTENDED* = 0x00000080 + FIRSTLINE* = 0x00000100 + MATCH_UNSET_BACKREF* = 0x00000200 + MULTILINE* = 0x00000400 + NEVER_UCP* = 0x00000800 + NEVER_UTF* = 0x00001000 + NO_AUTO_CAPTURE* = 0x00002000 + NO_AUTO_POSSESS* = 0x00004000 + NO_DOTSTAR_ANCHOR* = 0x00008000 + NO_START_OPTIMIZE* = 0x00010000 + NO_START_OPTIMISE* = NO_START_OPTIMIZE + UCP* = 0x00020000 + UNGREEDY* = 0x00040000 + UTF* = 0x00080000 + UTF8* = UTF + NEVER_BACKSLASH_C* = 0x00100000 + ALT_CIRCUMFLEX* = 0x00200000 + ALT_VERBNAMES* = 0x00400000 + USE_OFFSET_LIMIT* = 0x00800000 + EXTENDED_MORE* = 0x01000000 + LITERAL* = 0x02000000 + MATCH_INVALID_UTF* = 0x04000000 + ALT_EXTENDED_CLASS* = 0x08000000 + + ## PCRE2 no longer exposes PCRE's `JAVASCRIPT_COMPAT` option. `ALT_BSUX` + ## preserves the most important JavaScript-style escape handling. 
+ JAVASCRIPT_COMPAT* = ALT_BSUX + +const + JIT_COMPLETE* = 0x00000001 + JIT_PARTIAL_SOFT* = 0x00000002 + JIT_PARTIAL_HARD* = 0x00000004 + JIT_INVALID_UTF* = 0x00000100 + JIT_TEST_ALLOC* = 0x00000200 + +const + NOTBOL* = 0x00000001 + NOTEOL* = 0x00000002 + NOTEMPTY* = 0x00000004 + NOTEMPTY_ATSTART* = 0x00000008 + PARTIAL_SOFT* = 0x00000010 + PARTIAL_HARD* = 0x00000020 + DFA_RESTART* = 0x00000040 + DFA_SHORTEST* = 0x00000080 + NO_JIT* = 0x00002000 + COPY_MATCHED_SUBJECT* = 0x00004000 + DISABLE_RECURSELOOP_CHECK* = 0x00040000 + +const + NEWLINE_CR* = 1 + NEWLINE_LF* = 2 + NEWLINE_CRLF* = 3 + NEWLINE_ANY* = 4 + NEWLINE_ANYCRLF* = 5 + NEWLINE_NUL* = 6 + BSR_UNICODE* = 1 + BSR_ANYCRLF* = 2 + +const + ERROR_NOMATCH* = -1 + ERROR_PARTIAL* = -2 + + ERROR_UTF8_ERR1* = -3 + ERROR_UTF8_ERR2* = -4 + ERROR_UTF8_ERR3* = -5 + ERROR_UTF8_ERR4* = -6 + ERROR_UTF8_ERR5* = -7 + ERROR_UTF8_ERR6* = -8 + ERROR_UTF8_ERR7* = -9 + ERROR_UTF8_ERR8* = -10 + ERROR_UTF8_ERR9* = -11 + ERROR_UTF8_ERR10* = -12 + ERROR_UTF8_ERR11* = -13 + ERROR_UTF8_ERR12* = -14 + ERROR_UTF8_ERR13* = -15 + ERROR_UTF8_ERR14* = -16 + ERROR_UTF8_ERR15* = -17 + ERROR_UTF8_ERR16* = -18 + ERROR_UTF8_ERR17* = -19 + ERROR_UTF8_ERR18* = -20 + ERROR_UTF8_ERR19* = -21 + ERROR_UTF8_ERR20* = -22 + ERROR_UTF8_ERR21* = -23 + + ERROR_BADDATA* = -29 + ERROR_MIXEDTABLES* = -30 + ERROR_BADMAGIC* = -31 + ERROR_BADMODE* = -32 + ERROR_BADOFFSET* = -33 + ERROR_BADOPTION* = -34 + ERROR_BADREPLACEMENT* = -35 + ERROR_BADUTFOFFSET* = -36 + ERROR_CALLOUT* = -37 + ERROR_INTERNAL* = -44 + ERROR_JIT_BADOPTION* = -45 + ERROR_JIT_STACKLIMIT* = -46 + ERROR_MATCHLIMIT* = -47 + ERROR_NOMEMORY* = -48 + ERROR_NOSUBSTRING* = -49 + ERROR_NULL* = -51 + ERROR_RECURSELOOP* = -52 + ERROR_DEPTHLIMIT* = -53 + ERROR_RECURSIONLIMIT* = ERROR_DEPTHLIMIT + ERROR_UNAVAILABLE* = -54 + ERROR_UNSET* = -55 + ERROR_BADOFFSETLIMIT* = -56 + ERROR_HEAPLIMIT* = -63 + ERROR_DFA_UINVALID_UTF* = -66 + ERROR_INVALIDOFFSET* = -67 + ERROR_JIT_UNSUPPORTED* = -68 + +const + 
INFO_ALLOPTIONS* = 0 + INFO_ARGOPTIONS* = 1 + INFO_BACKREFMAX* = 2 + INFO_BSR* = 3 + INFO_CAPTURECOUNT* = 4 + INFO_FIRSTCODEUNIT* = 5 + INFO_FIRSTCODETYPE* = 6 + INFO_FIRSTBITMAP* = 7 + INFO_HASCRORLF* = 8 + INFO_JCHANGED* = 9 + INFO_JITSIZE* = 10 + INFO_LASTCODEUNIT* = 11 + INFO_LASTCODETYPE* = 12 + INFO_MATCHEMPTY* = 13 + INFO_MATCHLIMIT* = 14 + INFO_MAXLOOKBEHIND* = 15 + INFO_MINLENGTH* = 16 + INFO_NAMECOUNT* = 17 + INFO_NAMEENTRYSIZE* = 18 + INFO_NAMETABLE* = 19 + INFO_NEWLINE* = 20 + INFO_DEPTHLIMIT* = 21 + INFO_RECURSIONLIMIT* = INFO_DEPTHLIMIT + INFO_SIZE* = 22 + INFO_HASBACKSLASHC* = 23 + INFO_FRAMESIZE* = 24 + INFO_HEAPLIMIT* = 25 + INFO_EXTRAOPTIONS* = 26 + +const + CONFIG_BSR* = 0 + CONFIG_JIT* = 1 + CONFIG_JITTARGET* = 2 + CONFIG_LINKSIZE* = 3 + CONFIG_MATCHLIMIT* = 4 + CONFIG_NEWLINE* = 5 + CONFIG_PARENSLIMIT* = 6 + CONFIG_DEPTHLIMIT* = 7 + CONFIG_RECURSIONLIMIT* = CONFIG_DEPTHLIMIT + CONFIG_STACKRECURSE* = 8 + CONFIG_UNICODE* = 9 + CONFIG_UNICODE_VERSION* = 10 + CONFIG_VERSION* = 11 + CONFIG_HEAPLIMIT* = 12 + CONFIG_NEVER_BACKSLASH_C* = 13 + CONFIG_COMPILED_WIDTHS* = 14 + CONFIG_TABLES_LENGTH* = 15 + +const + ZERO_TERMINATED* = not 0.csize_t + UNSET* = not 0.csize_t + +type + Pcre* = object + CompileContext* = object + GeneralContext* = object + MatchContext* = object + MatchData* = object + JitStack* = object + +when not defined(usePcreHeader): + when hostOS == "windows": + const pcre2Dll = "pcre2-8.dll" + elif hostOS == "macosx": + const pcre2Dll = "libpcre2-8(.0|).dylib" + else: + const pcre2Dll = "libpcre2-8.so(.0|)" + {.push dynlib: pcre2Dll.} +else: + {.passC: "-DPCRE2_CODE_UNIT_WIDTH=8".} + {.push header: "<pcre2.h>".} + +{.push cdecl, importc: "pcre2_$1_8".} + +proc compile*(pattern: ptr uint8, + length: csize_t, + options: uint32, + errorCode: ptr cint, + errorOffset: ptr csize_t, + context: ptr CompileContext): ptr Pcre + +proc code_free*(code: ptr Pcre) + +proc config*(what: uint32, + where: pointer): cint + +proc get_error_message*(errorCode: cint, 
+ buffer: ptr uint8, + bufferLength: csize_t): cint + +proc match*(code: ptr Pcre, + subject: ptr uint8, + length: csize_t, + startOffset: csize_t, + options: uint32, + matchData: ptr MatchData, + context: ptr MatchContext): cint + +proc match_data_create*(oveccount: uint32, + context: ptr GeneralContext): ptr MatchData + +proc match_data_create_from_pattern*(code: ptr Pcre, + context: ptr GeneralContext): ptr MatchData + +proc match_data_free*(matchData: ptr MatchData) + +proc get_ovector_pointer*(matchData: ptr MatchData): ptr csize_t + +proc get_ovector_count*(matchData: ptr MatchData): uint32 + +proc pattern_info*(code: ptr Pcre, + what: uint32, + where: pointer): cint + +proc jit_compile*(code: ptr Pcre, + options: uint32): cint + +proc jit_free_unused_memory*() + +{.pop.} +{.pop.} diff --git a/tools/ci_generate.nim b/tools/ci_generate.nim index a461be5915..88b078df78 100644 --- a/tools/ci_generate.nim +++ b/tools/ci_generate.nim @@ -110,7 +110,7 @@ image: freebsd/latest packages: - databases/sqlite3 - devel/boehm-gc-threaded -- devel/pcre +- devel/pcre2 - devel/sdl20 - devel/sfml - www/node @@ -124,7 +124,7 @@ packages: - sqlite3 - node - boehm-gc -- pcre +- pcre2 - sfml - sdl2 - libffi diff --git a/tools/kochdocs.nim b/tools/kochdocs.nim index c0f508a74f..33f277a006 100644 --- a/tools/kochdocs.nim +++ b/tools/kochdocs.nim @@ -126,6 +126,7 @@ mm.md withoutIndex = """ lib/wrappers/tinyc.nim lib/wrappers/pcre.nim +lib/wrappers/pcre2.nim lib/wrappers/openssl.nim lib/posix/posix.nim lib/posix/linux.nim diff --git a/tools/nimgrep.nim b/tools/nimgrep.nim index 599c616bae..a76619faf3 100644 --- a/tools/nimgrep.nim +++ b/tools/nimgrep.nim @@ -729,7 +729,7 @@ iterator searchFile(pattern: Pattern; buffer: string): Output = i = t.last+1 when typeof(pattern) is Regex: if buffer.len > MaxReBufSize: - yield Output(kind: openError, msg: "PCRE size limit is " & $MaxReBufSize) + yield Output(kind: openError, msg: "PCRE2 size limit is " & $MaxReBufSize) func detectBin(buffer: 
string): bool = for i in 0 ..< min(1024, buffer.len):