mirror of
https://github.com/nim-lang/Nim.git
synced 2026-05-28 15:55:14 +00:00
test pcre2
This commit is contained in:
2
.github/workflows/ci_packages.yml
vendored
2
.github/workflows/ci_packages.yml
vendored
@@ -49,7 +49,7 @@ jobs:
|
||||
DEBIAN_FRONTEND='noninteractive' \
|
||||
sudo apt-get install --no-install-recommends -yq \
|
||||
libcurl4-openssl-dev libgc-dev libsdl1.2-dev libsfml-dev \
|
||||
valgrind libc6-dbg libblas-dev liblapack-dev libpcre3 xorg-dev
|
||||
valgrind libc6-dbg libblas-dev liblapack-dev libpcre2-dev xorg-dev
|
||||
- name: 'Install dependencies (macOS)'
|
||||
if: runner.os == 'macOS'
|
||||
run: brew install boehmgc make sfml gtk+3
|
||||
|
||||
@@ -78,8 +78,8 @@ parameter and result types, not just their source-level shape. Use
|
||||
- `min`, `max`, and `sequtils`' `minIndex`, `maxIndex` and `minmax` for `openArray`s now accept a comparison function.
|
||||
- `system.substr` implementation now uses `copymem` (wrapped C `memcpy`) for copying data, if available at compilation.
|
||||
- `system.newStringUninit` is now considered free of side-effects allowing it to be used with `--experimental:strictFuncs`.
|
||||
- `std/re` and `std/nre` are deprecated as PCRE library is obsolete.
|
||||
Use https://github.com/nitely/nim-regex or `std/nre2`.
|
||||
- `std/re` and `std/nre` now use PCRE2. They remain deprecated;
|
||||
use https://github.com/nitely/nim-regex or `std/nre2`.
|
||||
See: https://github.com/nim-lang/Nim/issues/23668.
|
||||
- `std/pegs` now correctly lexes UTF-8 bytes inside bare identifier-style
|
||||
terminals, so case-insensitive matching of non-ASCII terms (e.g. ``\i café``)
|
||||
|
||||
@@ -596,12 +596,12 @@ Regular expressions
|
||||
|
||||
* [re](re.html)
|
||||
Procedures and operators for handling regular
|
||||
expressions. The current implementation uses PCRE.
|
||||
expressions. The current implementation uses PCRE2.
|
||||
|
||||
* [nre](nre.html)
|
||||
|
||||
Many help functions for handling regular expressions.
|
||||
The current implementation uses PCRE.
|
||||
The current implementation uses PCRE2.
|
||||
|
||||
Database support
|
||||
----------------
|
||||
@@ -661,6 +661,9 @@ Regular expressions
|
||||
* [pcre](pcre.html)
|
||||
Wrapper for the PCRE library.
|
||||
|
||||
* [pcre2](pcre2.html)
|
||||
Wrapper for the PCRE2 library.
|
||||
|
||||
|
||||
Database support
|
||||
----------------
|
||||
|
||||
@@ -86,7 +86,7 @@ That means you can always use only 1 such an option with logical OR, e.g.
|
||||
Meaning of `^`:literal: and `$`:literal:
|
||||
========================================
|
||||
|
||||
`nimgrep`:cmd: PCRE engine is run in a single-line mode so
|
||||
`nimgrep`:cmd: PCRE2 engine is run in a single-line mode so
|
||||
`^`:literal: matches the beginning of whole input *file* and
|
||||
`$`:literal: matches the end of *file* (or whole input *string* for
|
||||
options like `--filename`).
|
||||
@@ -97,7 +97,7 @@ Add the `(?m)`:literal: modifier to the beginning of your pattern for
|
||||
Examples
|
||||
========
|
||||
|
||||
All examples below use default PCRE Regex patterns:
|
||||
All examples below use default PCRE2 Regex patterns:
|
||||
|
||||
+ To search recursively in Nim files using style-insensitive identifiers:
|
||||
|
||||
|
||||
@@ -7,21 +7,21 @@
|
||||
#
|
||||
|
||||
when defined(js):
|
||||
{.error: "This library needs to be compiled with a c-like backend, and depends on PCRE; See jsre for JS backend.".}
|
||||
{.error: "This library needs to be compiled with a c-like backend, and depends on PCRE2; See jsre for JS backend.".}
|
||||
|
||||
## .. warning:: NRE is deprecated.
|
||||
## Use [Regex](https://github.com/nitely/nim-regex) or
|
||||
## `NRE2 <nre2.html>`_ that wraps Regex so that you can easily replace NRE.
|
||||
## PCRE library is now at end of life.
|
||||
## This compatibility module uses PCRE2.
|
||||
##
|
||||
## What is NRE?
|
||||
## ============
|
||||
##
|
||||
## A regular expression library for Nim using PCRE to do the hard work.
|
||||
## A regular expression library for Nim using PCRE2 to do the hard work.
|
||||
##
|
||||
## For documentation on how to write patterns, there exists `the official PCRE
|
||||
## For documentation on how to write patterns, there exists `the official PCRE2
|
||||
## pattern documentation
|
||||
## <https://www.pcre.org/original/doc/html/pcrepattern.html>`_. You can also
|
||||
## <https://www.pcre.org/current/doc/html/pcre2pattern.html>`_. You can also
|
||||
## search the internet for a wide variety of third-party documentation and
|
||||
## tools.
|
||||
##
|
||||
@@ -39,10 +39,8 @@ runnableExamples:
|
||||
## Licencing
|
||||
## ---------
|
||||
##
|
||||
## PCRE has `some additional terms`_ that you must agree to in order to use
|
||||
## this module.
|
||||
## PCRE2 is distributed under a BSD-style licence.
|
||||
##
|
||||
## .. _`some additional terms`: https://pcre.sourceforge.net/license.txt
|
||||
runnableExamples:
|
||||
import std/sugar
|
||||
let vowels = re"[aeoui]"
|
||||
@@ -66,7 +64,7 @@ runnableExamples:
|
||||
assert find("uxabc", re"(?<=x|y)ab", start = 1).get.captures[-1] == "ab"
|
||||
assert find("uxabc", re"ab", start = 3).isNone
|
||||
|
||||
from std/pcre import nil
|
||||
import std/pcre2 as pcre
|
||||
import nre/private/util
|
||||
import std/tables
|
||||
from std/strutils import `%`
|
||||
@@ -82,7 +80,6 @@ type
|
||||
RegexDesc* = object
|
||||
pattern*: string
|
||||
pcreObj: ptr pcre.Pcre ## not nil
|
||||
pcreExtra: ptr pcre.ExtraData ## nil
|
||||
|
||||
captureNameToId: Table[string, int]
|
||||
|
||||
@@ -93,9 +90,9 @@ type
|
||||
##
|
||||
## `pattern: string`
|
||||
## : the string that was used to create the pattern. For details on how
|
||||
## to write a pattern, please see `the official PCRE pattern
|
||||
## to write a pattern, please see `the official PCRE2 pattern
|
||||
## documentation.
|
||||
## <https://www.pcre.org/original/doc/html/pcrepattern.html>`_
|
||||
## <https://www.pcre.org/current/doc/html/pcre2pattern.html>`_
|
||||
##
|
||||
## `captureCount: int`
|
||||
## : the number of captures that the pattern has.
|
||||
@@ -140,23 +137,23 @@ type
|
||||
## NEL (next line, U+0085), LS (line separator, U+2028), and PS
|
||||
## (paragraph separator, U+2029). For the 8-bit library, the last two
|
||||
## are recognized only in UTF-8 mode.
|
||||
## — man pcre
|
||||
## -- man pcre2pattern
|
||||
##
|
||||
## - `(*JAVASCRIPT_COMPAT)` - JavaScript compatibility
|
||||
## - `(*NO_STUDY)` - turn off studying; study is enabled by default
|
||||
##
|
||||
## For more details on the leading option groups, see the `Option
|
||||
## Setting <https://man7.org/linux/man-pages/man3/pcresyntax.3.html#OPTION_SETTING>`_
|
||||
## Setting <https://www.pcre.org/current/doc/html/pcre2syntax.html#SEC16>`_
|
||||
## and the `Newline
|
||||
## Convention <https://man7.org/linux/man-pages/man3/pcresyntax.3.html#NEWLINE_CONVENTION>`_
|
||||
## sections of the `PCRE syntax
|
||||
## manual <https://man7.org/linux/man-pages/man3/pcresyntax.3.html>`_.
|
||||
## Convention <https://www.pcre.org/current/doc/html/pcre2syntax.html#SEC17>`_
|
||||
## sections of the `PCRE2 syntax
|
||||
## manual <https://www.pcre.org/current/doc/html/pcre2syntax.html>`_.
|
||||
##
|
||||
## Some of these options are not part of PCRE and are converted by nre
|
||||
## into PCRE flags. These include `NEVER_UTF`, `ANCHORED`,
|
||||
## Some of these options are not part of a pattern and are converted by nre
|
||||
## into PCRE2 flags. These include `NEVER_UTF`, `ANCHORED`,
|
||||
## `DOLLAR_ENDONLY`, `FIRSTLINE`, `NO_AUTO_CAPTURE`,
|
||||
## `JAVASCRIPT_COMPAT`, `U`, `NO_STUDY`. In other PCRE wrappers, you
|
||||
## will need to pass these as separate flags to PCRE.
|
||||
## `JAVASCRIPT_COMPAT`, `U`, `NO_STUDY`. In other PCRE2 wrappers, you
|
||||
## will need to pass these as separate flags to PCRE2.
|
||||
|
||||
RegexMatch* = object
|
||||
## Usually seen as `Option[RegexMatch]`, it represents the result of an
|
||||
@@ -196,7 +193,7 @@ type
|
||||
pattern*: Regex ## The regex doing the matching.
|
||||
## Not nil.
|
||||
str*: string ## The string that was matched against.
|
||||
pcreMatchBounds: seq[HSlice[cint, cint]] ## First item is the bounds of the match
|
||||
pcreMatchBounds: seq[HSlice[csize_t, csize_t]] ## First item is the bounds of the match
|
||||
## Other items are the captures
|
||||
## `a` is inclusive start, `b` is exclusive end
|
||||
|
||||
@@ -227,38 +224,32 @@ when defined(gcDestructors):
|
||||
when defined(nimAllowNonVarDestructor) and defined(nimPreviewNonVarDestructor):
|
||||
proc `=destroy`(pattern: RegexDesc) =
|
||||
`=destroy`(pattern.pattern)
|
||||
pcre.free_substring(cast[cstring](pattern.pcreObj))
|
||||
if pattern.pcreExtra != nil:
|
||||
pcre.free_study(pattern.pcreExtra)
|
||||
pcre.code_free(pattern.pcreObj)
|
||||
`=destroy`(pattern.captureNameToId)
|
||||
else:
|
||||
proc `=destroy`(pattern: var RegexDesc) =
|
||||
`=destroy`(pattern.pattern)
|
||||
pcre.free_substring(cast[cstring](pattern.pcreObj))
|
||||
if pattern.pcreExtra != nil:
|
||||
pcre.free_study(pattern.pcreExtra)
|
||||
pcre.code_free(pattern.pcreObj)
|
||||
`=destroy`(pattern.captureNameToId)
|
||||
else:
|
||||
proc destroyRegex(pattern: Regex) =
|
||||
`=destroy`(pattern.pattern)
|
||||
pcre.free_substring(cast[cstring](pattern.pcreObj))
|
||||
if pattern.pcreExtra != nil:
|
||||
pcre.free_study(pattern.pcreExtra)
|
||||
pcre.code_free(pattern.pcreObj)
|
||||
`=destroy`(pattern.captureNameToId)
|
||||
|
||||
proc getinfo[T](pattern: Regex, opt: cint): T =
|
||||
proc getinfo[T](pattern: Regex, opt: uint32): T =
|
||||
result = default(T)
|
||||
let retcode = pcre.fullinfo(pattern.pcreObj, pattern.pcreExtra, opt, addr result)
|
||||
let retcode = pcre.pattern_info(pattern.pcreObj, opt, addr result)
|
||||
|
||||
if retcode < 0:
|
||||
# XXX Error message that doesn't expose implementation details
|
||||
raise newException(FieldDefect, "Invalid getinfo for $1, errno $2" % [$opt, $retcode])
|
||||
|
||||
proc getNameToNumberTable(pattern: Regex): Table[string, int] =
|
||||
let entryCount = getinfo[cint](pattern, pcre.INFO_NAMECOUNT)
|
||||
let entrySize = getinfo[cint](pattern, pcre.INFO_NAMEENTRYSIZE)
|
||||
let entryCount = getinfo[uint32](pattern, pcre.INFO_NAMECOUNT).int
|
||||
let entrySize = getinfo[uint32](pattern, pcre.INFO_NAMEENTRYSIZE).int
|
||||
let table = cast[ptr UncheckedArray[uint8]](
|
||||
getinfo[int](pattern, pcre.INFO_NAMETABLE))
|
||||
getinfo[pointer](pattern, pcre.INFO_NAMETABLE))
|
||||
|
||||
result = initTable[string, int]()
|
||||
|
||||
@@ -274,61 +265,69 @@ proc getNameToNumberTable(pattern: Regex): Table[string, int] =
|
||||
|
||||
result[name] = num
|
||||
|
||||
proc initRegex(pattern: string, flags: int, study = true): Regex =
|
||||
proc pcreErrorMessage(errorCode: cint): string =
|
||||
var buffer: array[256, uint8]
|
||||
let length = pcre.get_error_message(errorCode, addr buffer[0], buffer.len.csize_t)
|
||||
if length >= 0:
|
||||
result = newString(length)
|
||||
if length > 0:
|
||||
copyMem(addr result[0], addr buffer[0], length)
|
||||
else:
|
||||
result = $errorCode
|
||||
|
||||
proc jitCompile(pattern: ptr pcre.Pcre) =
|
||||
var hasJit: cint = 0
|
||||
if pcre.config(pcre.CONFIG_JIT, addr hasJit) == 0 and hasJit == 1:
|
||||
discard pcre.jit_compile(pattern, pcre.JIT_COMPLETE.uint32)
|
||||
|
||||
proc initRegex(pattern: string, flags: uint32, study = true): Regex =
|
||||
when defined(gcDestructors):
|
||||
result = Regex()
|
||||
else:
|
||||
new(result, destroyRegex)
|
||||
result.pattern = pattern
|
||||
|
||||
var errorMsg: cstring = ""
|
||||
var errOffset: cint = 0
|
||||
var
|
||||
errorCode: cint = 0
|
||||
errOffset: csize_t = 0
|
||||
|
||||
result.pcreObj = pcre.compile(cstring(pattern),
|
||||
# better hope int is at least 4 bytes..
|
||||
cint(flags), addr errorMsg,
|
||||
result.pcreObj = pcre.compile(cast[ptr uint8](cstring(pattern)),
|
||||
pattern.len.csize_t, flags, addr errorCode,
|
||||
addr errOffset, nil)
|
||||
if result.pcreObj == nil:
|
||||
# failed to compile
|
||||
raise SyntaxError(msg: $errorMsg, pos: errOffset, pattern: pattern)
|
||||
raise SyntaxError(msg: pcreErrorMessage(errorCode), pos: errOffset.int,
|
||||
pattern: pattern)
|
||||
|
||||
if study:
|
||||
var options: cint = 0
|
||||
var hasJit: cint = cint(0)
|
||||
if pcre.config(pcre.CONFIG_JIT, addr hasJit) == 0:
|
||||
if hasJit == 1'i32:
|
||||
options = pcre.STUDY_JIT_COMPILE
|
||||
result.pcreExtra = pcre.study(result.pcreObj, options, addr errorMsg)
|
||||
if errorMsg != nil:
|
||||
raise StudyError(msg: $errorMsg)
|
||||
jitCompile(result.pcreObj)
|
||||
|
||||
result.captureNameToId = result.getNameToNumberTable()
|
||||
|
||||
proc captureCount*(pattern: Regex): int =
|
||||
return getinfo[cint](pattern, pcre.INFO_CAPTURECOUNT)
|
||||
return getinfo[uint32](pattern, pcre.INFO_CAPTURECOUNT).int
|
||||
|
||||
proc captureNameId*(pattern: Regex): Table[string, int] =
|
||||
return pattern.captureNameToId
|
||||
|
||||
proc matchesCrLf(pattern: Regex): bool =
|
||||
let flags = uint32(getinfo[culong](pattern, pcre.INFO_OPTIONS))
|
||||
let newlineFlags = flags and (pcre.NEWLINE_CRLF or
|
||||
pcre.NEWLINE_ANY or
|
||||
pcre.NEWLINE_ANYCRLF)
|
||||
if newlineFlags > 0u32:
|
||||
let newline = getinfo[uint32](pattern, pcre.INFO_NEWLINE)
|
||||
case newline
|
||||
of pcre.NEWLINE_CRLF, pcre.NEWLINE_ANY, pcre.NEWLINE_ANYCRLF:
|
||||
return true
|
||||
of pcre.NEWLINE_CR, pcre.NEWLINE_LF, pcre.NEWLINE_NUL:
|
||||
return false
|
||||
else:
|
||||
discard
|
||||
|
||||
# get flags from build config
|
||||
var confFlags: cint = cint(0)
|
||||
var confFlags: uint32 = 0
|
||||
if pcre.config(pcre.CONFIG_NEWLINE, addr confFlags) != 0:
|
||||
assert(false, "CONFIG_NEWLINE apparently got screwed up")
|
||||
|
||||
case confFlags
|
||||
of 13: return false
|
||||
of 10: return false
|
||||
of (13 shl 8) or 10: return true
|
||||
of -2: return true
|
||||
of -1: return true
|
||||
of pcre.NEWLINE_CR, pcre.NEWLINE_LF, pcre.NEWLINE_NUL: return false
|
||||
of pcre.NEWLINE_CRLF, pcre.NEWLINE_ANY, pcre.NEWLINE_ANYCRLF: return true
|
||||
else: return false
|
||||
|
||||
|
||||
@@ -338,7 +337,9 @@ func captures*(pattern: RegexMatch): Captures = return Captures(pattern)
|
||||
|
||||
func contains*(pattern: CaptureBounds, i: int): bool =
|
||||
let pattern = RegexMatch(pattern)
|
||||
pattern.pcreMatchBounds[i + 1].a != -1
|
||||
let index = i + 1
|
||||
index >= 0 and index < pattern.pcreMatchBounds.len and
|
||||
pattern.pcreMatchBounds[index].a != pcre.UNSET
|
||||
|
||||
func contains*(pattern: Captures, i: int): bool =
|
||||
i in CaptureBounds(pattern)
|
||||
@@ -349,7 +350,7 @@ func `[]`*(pattern: CaptureBounds, i: int): HSlice[int, int] =
|
||||
raise newException(IndexDefect, "Group '" & $i & "' was not captured")
|
||||
|
||||
let bounds = pattern.pcreMatchBounds[i + 1]
|
||||
int(bounds.a)..int(bounds.b-1)
|
||||
int(bounds.a) .. (int(bounds.b) - 1)
|
||||
|
||||
func `[]`*(pattern: Captures, i: int): string =
|
||||
let pattern = RegexMatch(pattern)
|
||||
@@ -437,8 +438,7 @@ proc `$`*(pattern: RegexMatch): string =
|
||||
proc `==`*(a, b: Regex): bool =
|
||||
if not a.isNil and not b.isNil:
|
||||
return a.pattern == b.pattern and
|
||||
a.pcreObj == b.pcreObj and
|
||||
a.pcreExtra == b.pcreExtra
|
||||
a.pcreObj == b.pcreObj
|
||||
else:
|
||||
return system.`==`(a, b)
|
||||
|
||||
@@ -453,7 +453,7 @@ const PcreOptions = {
|
||||
"FIRSTLINE": pcre.FIRSTLINE,
|
||||
"NO_AUTO_CAPTURE": pcre.NO_AUTO_CAPTURE,
|
||||
"JAVASCRIPT_COMPAT": pcre.JAVASCRIPT_COMPAT,
|
||||
"U": pcre.UTF8 or pcre.UCP
|
||||
"U": pcre.UTF or pcre.UCP
|
||||
}.toTable
|
||||
|
||||
# Options that are supported inside regular expressions themselves
|
||||
@@ -503,46 +503,63 @@ proc extractOptions(pattern: string): tuple[pattern: string, flags: int, study:
|
||||
|
||||
proc re*(pattern: string): Regex =
|
||||
let (pattern, flags, study) = extractOptions(pattern)
|
||||
initRegex(pattern, flags, study)
|
||||
initRegex(pattern, cast[uint32](flags), study)
|
||||
|
||||
proc matchImpl(str: string, pattern: Regex, start, endpos: int, flags: int): Option[RegexMatch] =
|
||||
func isInvalidUnicodeError(errorCode: cint): bool =
|
||||
(errorCode <= pcre.ERROR_UTF8_ERR1 and errorCode >= pcre.ERROR_UTF8_ERR21) or
|
||||
errorCode == pcre.ERROR_BADUTFOFFSET or
|
||||
errorCode == pcre.ERROR_DFA_UINVALID_UTF
|
||||
|
||||
proc newMatchData(pattern: Regex): ptr pcre.MatchData =
|
||||
result = pcre.match_data_create_from_pattern(pattern.pcreObj, nil)
|
||||
if result == nil:
|
||||
raise RegexInternalError(msg: "could not allocate PCRE2 match data")
|
||||
|
||||
proc matchImpl(str: string, pattern: Regex, start, endpos: int, options: uint32): Option[RegexMatch] =
|
||||
var myResult = RegexMatch(pattern: pattern, str: str)
|
||||
# See PCRE man pages.
|
||||
# 2x capture count to make room for start-end pairs
|
||||
# 1x capture count as slack space for PCRE
|
||||
let vecsize = (pattern.captureCount() + 1) * 3
|
||||
# div 2 because each element is 2 cints long
|
||||
# plus 1 because we need the ceiling, not the floor
|
||||
myResult.pcreMatchBounds = newSeq[HSlice[cint, cint]]((vecsize + 1) div 2)
|
||||
myResult.pcreMatchBounds.setLen(vecsize div 3)
|
||||
myResult.pcreMatchBounds = newSeq[HSlice[csize_t, csize_t]](pattern.captureCount() + 1)
|
||||
|
||||
let strlen = if endpos == int.high: str.len else: endpos+1
|
||||
doAssert(strlen <= str.len) # don't want buffer overflows
|
||||
if start < 0 or start > strlen:
|
||||
return none(RegexMatch)
|
||||
|
||||
let matchData = newMatchData(pattern)
|
||||
defer: pcre.match_data_free(matchData)
|
||||
let execRet = pcre.match(pattern.pcreObj,
|
||||
cast[ptr uint8](cstring(str)),
|
||||
strlen.csize_t,
|
||||
start.csize_t,
|
||||
options,
|
||||
matchData,
|
||||
nil)
|
||||
let rawMatches = cast[ptr UncheckedArray[csize_t]](pcre.get_ovector_pointer(matchData))
|
||||
let ovectorCount = min(myResult.pcreMatchBounds.len,
|
||||
pcre.get_ovector_count(matchData).int)
|
||||
for i in 0 ..< ovectorCount:
|
||||
myResult.pcreMatchBounds[i] = rawMatches[i * 2] .. rawMatches[i * 2 + 1]
|
||||
|
||||
let execRet = pcre.exec(pattern.pcreObj,
|
||||
pattern.pcreExtra,
|
||||
cstring(str),
|
||||
cint(strlen),
|
||||
cint(start),
|
||||
cint(flags),
|
||||
cast[ptr cint](addr myResult.pcreMatchBounds[0]),
|
||||
cint(vecsize))
|
||||
if execRet >= 0:
|
||||
return some(myResult)
|
||||
|
||||
case execRet:
|
||||
of pcre.ERROR_NOMATCH:
|
||||
return none(RegexMatch)
|
||||
of pcre.ERROR_NULL:
|
||||
raise newException(AccessViolationDefect, "Expected non-null parameters")
|
||||
of pcre.ERROR_BADOPTION:
|
||||
raise RegexInternalError(msg: "Unknown pattern flag. Either a bug or " &
|
||||
"outdated PCRE.")
|
||||
of pcre.ERROR_BADUTF8, pcre.ERROR_SHORTUTF8, pcre.ERROR_BADUTF8_OFFSET:
|
||||
raise InvalidUnicodeError(msg: "Invalid unicode byte sequence",
|
||||
pos: myResult.pcreMatchBounds[0].a)
|
||||
if isInvalidUnicodeError(execRet):
|
||||
let errorPos = if myResult.pcreMatchBounds.len > 0 and
|
||||
myResult.pcreMatchBounds[0].a != pcre.UNSET:
|
||||
myResult.pcreMatchBounds[0].a.int
|
||||
else:
|
||||
raise RegexInternalError(msg: "Unknown internal error: " & $execRet)
|
||||
start
|
||||
raise InvalidUnicodeError(msg: "Invalid unicode byte sequence", pos: errorPos)
|
||||
|
||||
case execRet
|
||||
of pcre.ERROR_NOMATCH:
|
||||
return none(RegexMatch)
|
||||
of pcre.ERROR_NULL:
|
||||
raise newException(AccessViolationDefect, "Expected non-null parameters")
|
||||
of pcre.ERROR_BADOPTION:
|
||||
raise RegexInternalError(msg: "Unknown pattern flag. Either a bug or " &
|
||||
"outdated PCRE2.")
|
||||
else:
|
||||
raise RegexInternalError(msg: "Unknown internal error: " & $execRet)
|
||||
|
||||
proc match*(str: string, pattern: Regex, start = 0, endpos = int.high): Option[RegexMatch] =
|
||||
## Like `find(...)<#find,string,Regex,int>`_, but anchored to the start of the
|
||||
@@ -559,7 +576,7 @@ proc match*(str: string, pattern: Regex, start = 0, endpos = int.high): Option[R
|
||||
assert 0 in "abc".match(re"(\w)").get.captureBounds
|
||||
assert "abc".match(re"").get.captureBounds[-1] == 0 .. -1
|
||||
assert "abc".match(re"abc").get.captureBounds[-1] == 0 .. 2
|
||||
return str.matchImpl(pattern, start, endpos, pcre.ANCHORED)
|
||||
return str.matchImpl(pattern, start, endpos, cast[uint32](pcre.ANCHORED))
|
||||
|
||||
iterator findIter*(str: string, pattern: Regex, start = 0, endpos = int.high): RegexMatch =
|
||||
## Works the same as `find(...)<#find,string,Regex,int>`_, but finds every
|
||||
@@ -573,21 +590,21 @@ iterator findIter*(str: string, pattern: Regex, start = 0, endpos = int.high): R
|
||||
## Variants:
|
||||
##
|
||||
## - `proc findAll(...)` returns a `seq[string]`
|
||||
# see pcredemo for explanation => https://www.pcre.org/original/doc/html/pcredemo.html
|
||||
# see pcre2demo for explanation => https://www.pcre.org/current/doc/html/pcre2demo.html
|
||||
let matchesCrLf = pattern.matchesCrLf()
|
||||
let unicode = uint32(getinfo[culong](pattern, pcre.INFO_OPTIONS) and
|
||||
pcre.UTF8) > 0u32
|
||||
let unicode = uint32(getinfo[uint32](pattern, pcre.INFO_ALLOPTIONS) and
|
||||
pcre.UTF.uint32) > 0u32
|
||||
let strlen = if endpos == int.high: str.len else: endpos+1
|
||||
var offset = start
|
||||
var match: Option[RegexMatch] = default(Option[RegexMatch])
|
||||
var neverMatched = true
|
||||
|
||||
while true:
|
||||
var flags = 0
|
||||
var flags = 0'u32
|
||||
if match.isSome and
|
||||
match.get.matchBounds.a > match.get.matchBounds.b:
|
||||
# 0-len match
|
||||
flags = pcre.NOTEMPTY_ATSTART
|
||||
flags = pcre.NOTEMPTY_ATSTART.uint32
|
||||
match = str.matchImpl(pattern, offset, endpos, flags)
|
||||
|
||||
if match.isNone:
|
||||
@@ -623,7 +640,7 @@ proc find*(str: string, pattern: Regex, start = 0, endpos = int.high): Option[Re
|
||||
## `endpos`
|
||||
## : The maximum index for a match; `int.high` means the end of the
|
||||
## string, otherwise it’s an inclusive upper bound.
|
||||
return str.matchImpl(pattern, start, endpos, 0)
|
||||
return str.matchImpl(pattern, start, endpos, 0'u32)
|
||||
|
||||
proc findAll*(str: string, pattern: Regex, start = 0, endpos = int.high): seq[string] =
|
||||
result = @[]
|
||||
|
||||
@@ -8,27 +8,25 @@
|
||||
#
|
||||
|
||||
when defined(js):
|
||||
{.error: "This library needs to be compiled with a c-like backend, and depends on PCRE; See jsre for JS backend.".}
|
||||
{.error: "This library needs to be compiled with a c-like backend, and depends on PCRE2; See jsre for JS backend.".}
|
||||
|
||||
## .. warning:: This module is deprecated.
|
||||
## Use [Regex](https://github.com/nitely/nim-regex).
|
||||
## PCRE library is now at end of life.
|
||||
## This compatibility module uses PCRE2.
|
||||
##
|
||||
## Regular expression support for Nim.
|
||||
##
|
||||
## This module is implemented by providing a wrapper around the
|
||||
## `PCRE (Perl-Compatible Regular Expressions) <https://www.pcre.org>`_
|
||||
## C library. This means that your application will depend on the PCRE
|
||||
## `PCRE2 (Perl-Compatible Regular Expressions) <https://www.pcre.org>`_
|
||||
## C library. This means that your application will depend on the PCRE2
|
||||
## library's licence when using this module, which should not be a problem
|
||||
## though.
|
||||
##
|
||||
## .. note:: There are also alternative nimble packages such as [tinyre](https://github.com/khchen/tinyre)
|
||||
## and [regex](https://github.com/nitely/nim-regex).
|
||||
##
|
||||
## PCRE's licence follows:
|
||||
##
|
||||
## .. include:: ../../doc/regexprs.txt
|
||||
##
|
||||
## PCRE2 is distributed under a BSD-style licence.
|
||||
|
||||
|
||||
runnableExamples:
|
||||
## Unless specified otherwise, `start` parameter in each proc indicates
|
||||
@@ -40,7 +38,7 @@ runnableExamples:
|
||||
# can't match start of string since we're starting at 1
|
||||
|
||||
import
|
||||
std/[pcre, strutils, rtarrays]
|
||||
std/[pcre2, strutils]
|
||||
|
||||
when defined(nimPreviewSlimSystem):
|
||||
import std/syncio
|
||||
@@ -60,8 +58,7 @@ type
|
||||
## expression will be used only once)
|
||||
|
||||
RegexDesc = object
|
||||
h: ptr Pcre
|
||||
e: ptr ExtraData
|
||||
h: ptr pcre2.Pcre
|
||||
|
||||
Regex* = ref RegexDesc ## a compiled regular expression
|
||||
|
||||
@@ -71,14 +68,10 @@ type
|
||||
when defined(gcDestructors):
|
||||
when defined(nimAllowNonVarDestructor):
|
||||
proc `=destroy`(x: RegexDesc) =
|
||||
pcre.free_substring(cast[cstring](x.h))
|
||||
if not isNil(x.e):
|
||||
pcre.free_study(x.e)
|
||||
pcre2.code_free(x.h)
|
||||
else:
|
||||
proc `=destroy`(x: var RegexDesc) =
|
||||
pcre.free_substring(cast[cstring](x.h))
|
||||
if not isNil(x.e):
|
||||
pcre.free_study(x.e)
|
||||
pcre2.code_free(x.h)
|
||||
|
||||
proc raiseInvalidRegex(msg: string) {.noinline, noreturn.} =
|
||||
var e: ref RegexError
|
||||
@@ -86,21 +79,43 @@ proc raiseInvalidRegex(msg: string) {.noinline, noreturn.} =
|
||||
e.msg = msg
|
||||
raise e
|
||||
|
||||
proc rawCompile(pattern: string, flags: cint): ptr Pcre =
|
||||
proc pcre2ErrorMessage(errorCode: cint): string =
|
||||
var buffer: array[256, uint8]
|
||||
let length = pcre2.get_error_message(errorCode, addr buffer[0], buffer.len.csize_t)
|
||||
if length >= 0:
|
||||
result = newString(length)
|
||||
if length > 0:
|
||||
copyMem(addr result[0], addr buffer[0], length)
|
||||
else:
|
||||
result = $errorCode
|
||||
|
||||
proc rawCompile(pattern: string, options: uint32): ptr pcre2.Pcre =
|
||||
var
|
||||
msg: cstring = ""
|
||||
offset: cint = 0
|
||||
result = pcre.compile(pattern, flags, addr(msg), addr(offset), nil)
|
||||
errorCode: cint = 0
|
||||
offset: csize_t = 0
|
||||
result = pcre2.compile(cast[ptr uint8](pattern.cstring), pattern.len.csize_t,
|
||||
options, addr errorCode, addr offset, nil)
|
||||
if result == nil:
|
||||
raiseInvalidRegex($msg & "\n" & pattern & "\n" & spaces(offset) & "^\n")
|
||||
raiseInvalidRegex(pcre2ErrorMessage(errorCode) & "\n" & pattern & "\n" &
|
||||
spaces(offset.int) & "^\n")
|
||||
|
||||
proc finalizeRegEx(x: Regex) =
|
||||
# XXX This is a hack, but PCRE does not export its "free" function properly.
|
||||
# Sigh. The hack relies on PCRE's implementation (see `pcre_get.c`).
|
||||
# Fortunately the implementation is unlikely to change.
|
||||
pcre.free_substring(cast[cstring](x.h))
|
||||
if not isNil(x.e):
|
||||
pcre.free_study(x.e)
|
||||
pcre2.code_free(x.h)
|
||||
|
||||
func toPcre2Options(flags: set[RegexFlag]): uint32 =
|
||||
if reIgnoreCase in flags:
|
||||
result = result or pcre2.CASELESS.uint32
|
||||
if reMultiLine in flags:
|
||||
result = result or pcre2.MULTILINE.uint32
|
||||
if reDotAll in flags:
|
||||
result = result or pcre2.DOTALL.uint32
|
||||
if reExtended in flags:
|
||||
result = result or pcre2.EXTENDED.uint32
|
||||
|
||||
proc jitCompile(pattern: ptr pcre2.Pcre) =
|
||||
var hasJit: cint = 0
|
||||
if pcre2.config(pcre2.CONFIG_JIT, addr hasJit) == 0 and hasJit == 1:
|
||||
discard pcre2.jit_compile(pattern, pcre2.JIT_COMPLETE.uint32)
|
||||
|
||||
proc re*(s: string, flags = {reStudy}): Regex =
|
||||
## Constructor of regular expressions.
|
||||
@@ -116,16 +131,9 @@ proc re*(s: string, flags = {reStudy}): Regex =
|
||||
result = Regex()
|
||||
else:
|
||||
new(result, finalizeRegEx)
|
||||
result.h = rawCompile(s, cast[cint](flags - {reStudy}))
|
||||
result.h = rawCompile(s, toPcre2Options(flags))
|
||||
if reStudy in flags:
|
||||
var msg: cstring = ""
|
||||
var options: cint = 0
|
||||
var hasJit: cint = 0
|
||||
if pcre.config(pcre.CONFIG_JIT, addr hasJit) == 0:
|
||||
if hasJit == 1'i32:
|
||||
options = pcre.STUDY_JIT_COMPILE
|
||||
result.e = pcre.study(result.h, options, addr msg)
|
||||
if not isNil(msg): raiseInvalidRegex($msg)
|
||||
jitCompile(result.h)
|
||||
|
||||
proc rex*(s: string, flags = {reStudy, reExtended}): Regex =
|
||||
## Constructor for extended regular expressions.
|
||||
@@ -142,25 +150,58 @@ proc bufSubstr(b: cstring, sPos, ePos: int): string {.inline.} =
|
||||
copyMem(addr(result[0]), unsafeAddr(b[sPos]), sz)
|
||||
result.setLen(sz)
|
||||
|
||||
proc matchOrFind(buf: cstring, pattern: Regex, matches: var openArray[string],
|
||||
start, bufSize, flags: cint): cint =
|
||||
var
|
||||
rtarray = initRtArray[cint]((matches.len+1)*3)
|
||||
rawMatches = rtarray.getRawData
|
||||
res = pcre.exec(pattern.h, pattern.e, buf, bufSize, start, flags,
|
||||
cast[ptr cint](rawMatches), (matches.len+1).cint*3)
|
||||
if res < 0'i32: return res
|
||||
for i in 1..int(res)-1:
|
||||
var a = rawMatches[i * 2]
|
||||
var b = rawMatches[i * 2 + 1]
|
||||
if a >= 0'i32:
|
||||
matches[i-1] = bufSubstr(buf, int(a), int(b))
|
||||
else: matches[i-1] = ""
|
||||
return rawMatches[1] - rawMatches[0]
|
||||
proc newMatchData(slots: int): ptr pcre2.MatchData =
|
||||
result = pcre2.match_data_create(max(slots, 1).uint32, nil)
|
||||
if result == nil:
|
||||
raiseInvalidRegex("could not allocate PCRE2 match data")
|
||||
|
||||
const MaxReBufSize* = high(cint)
|
||||
## Maximum PCRE (API 1) buffer start/size equal to `high(cint)`, which even
|
||||
## for 64-bit systems can be either 2`31`:sup:-1 or 2`63`:sup:-1.
|
||||
template ovector(matchData: ptr pcre2.MatchData): ptr UncheckedArray[csize_t] =
|
||||
cast[ptr UncheckedArray[csize_t]](pcre2.get_ovector_pointer(matchData))
|
||||
|
||||
proc rawMatch(buf: cstring, pattern: Regex, start, bufSize: int,
|
||||
options: uint32, matchData: ptr pcre2.MatchData): cint =
|
||||
if start < 0 or bufSize < 0:
|
||||
return pcre2.ERROR_BADOFFSET
|
||||
pcre2.match(pattern.h, cast[ptr uint8](buf), bufSize.csize_t,
|
||||
start.csize_t, options, matchData, nil)
|
||||
|
||||
proc copyStringMatches(buf: cstring, rawMatches: ptr UncheckedArray[csize_t],
|
||||
captureCount: int, matches: var openArray[string]) =
|
||||
let upper = min(captureCount - 1, matches.len)
|
||||
if upper > 0:
|
||||
for i in 1 .. upper:
|
||||
let matchStart = rawMatches[i * 2]
|
||||
let matchEnd = rawMatches[i * 2 + 1]
|
||||
if matchStart != pcre2.UNSET:
|
||||
matches[i-1] = bufSubstr(buf, int(matchStart), int(matchEnd))
|
||||
else:
|
||||
matches[i-1] = ""
|
||||
|
||||
proc copyBoundsMatches(rawMatches: ptr UncheckedArray[csize_t],
|
||||
captureCount: int,
|
||||
matches: var openArray[tuple[first, last: int]]) =
|
||||
let upper = min(captureCount - 1, matches.len)
|
||||
if upper > 0:
|
||||
for i in 1 .. upper:
|
||||
let matchStart = rawMatches[i * 2]
|
||||
let matchEnd = rawMatches[i * 2 + 1]
|
||||
if matchStart != pcre2.UNSET:
|
||||
matches[i-1] = (int(matchStart), int(matchEnd) - 1)
|
||||
else:
|
||||
matches[i-1] = (-1, 0)
|
||||
|
||||
proc matchOrFind(buf: cstring, pattern: Regex, matches: var openArray[string],
|
||||
start, bufSize: int, options: uint32): int =
|
||||
let matchData = newMatchData(matches.len + 1)
|
||||
defer: pcre2.match_data_free(matchData)
|
||||
let res = rawMatch(buf, pattern, start, bufSize, options, matchData)
|
||||
let rawMatches = ovector(matchData)
|
||||
if res < 0: return int(res)
|
||||
copyStringMatches(buf, rawMatches, int(res), matches)
|
||||
return int(rawMatches[1]) - int(rawMatches[0])
|
||||
|
||||
const MaxReBufSize* = high(int)
|
||||
## Maximum PCRE2 buffer start/size accepted by this Nim API.
|
||||
|
||||
proc findBounds*(buf: cstring, pattern: Regex, matches: var openArray[string],
|
||||
start = 0, bufSize: int): tuple[first, last: int] =
|
||||
@@ -172,17 +213,12 @@ proc findBounds*(buf: cstring, pattern: Regex, matches: var openArray[string],
|
||||
##
|
||||
## Note: The memory for `matches` needs to be allocated before this function is
|
||||
## called, otherwise it will just remain empty.
|
||||
var
|
||||
rtarray = initRtArray[cint]((matches.len+1)*3)
|
||||
rawMatches = rtarray.getRawData
|
||||
res = pcre.exec(pattern.h, pattern.e, buf, bufSize.cint, start.cint, 0'i32,
|
||||
cast[ptr cint](rawMatches), (matches.len+1).cint*3)
|
||||
if res < 0'i32: return (-1, 0)
|
||||
for i in 1..int(res)-1:
|
||||
var a = rawMatches[i * 2]
|
||||
var b = rawMatches[i * 2 + 1]
|
||||
if a >= 0'i32: matches[i-1] = bufSubstr(buf, int(a), int(b))
|
||||
else: matches[i-1] = ""
|
||||
let matchData = newMatchData(matches.len + 1)
|
||||
defer: pcre2.match_data_free(matchData)
|
||||
let res = rawMatch(buf, pattern, start, bufSize, 0'u32, matchData)
|
||||
let rawMatches = ovector(matchData)
|
||||
if res < 0: return (-1, 0)
|
||||
copyStringMatches(buf, rawMatches, int(res), matches)
|
||||
return (rawMatches[0].int, rawMatches[1].int - 1)
|
||||
|
||||
proc findBounds*(s: string, pattern: Regex, matches: var openArray[string],
|
||||
@@ -212,17 +248,12 @@ proc findBounds*(buf: cstring, pattern: Regex,
|
||||
## `(-1,0)` is returned.
|
||||
##
|
||||
## .. note:: The memory for `matches` needs to be allocated before this function is called, otherwise it will just remain empty.
|
||||
var
|
||||
rtarray = initRtArray[cint]((matches.len+1)*3)
|
||||
rawMatches = rtarray.getRawData
|
||||
res = pcre.exec(pattern.h, pattern.e, buf, bufSize.cint, start.cint, 0'i32,
|
||||
cast[ptr cint](rawMatches), (matches.len+1).cint*3)
|
||||
if res < 0'i32: return (-1, 0)
|
||||
for i in 1..int(res)-1:
|
||||
var a = rawMatches[i * 2]
|
||||
var b = rawMatches[i * 2 + 1]
|
||||
if a >= 0'i32: matches[i-1] = (int(a), int(b)-1)
|
||||
else: matches[i-1] = (-1,0)
|
||||
let matchData = newMatchData(matches.len + 1)
|
||||
defer: pcre2.match_data_free(matchData)
|
||||
let res = rawMatch(buf, pattern, start, bufSize, 0'u32, matchData)
|
||||
let rawMatches = ovector(matchData)
|
||||
if res < 0: return (-1, 0)
|
||||
copyBoundsMatches(rawMatches, int(res), matches)
|
||||
return (rawMatches[0].int, rawMatches[1].int - 1)
|
||||
|
||||
proc findBounds*(s: string, pattern: Regex,
|
||||
@@ -244,29 +275,28 @@ proc findBounds*(s: string, pattern: Regex,
|
||||
min(start, MaxReBufSize), min(s.len, MaxReBufSize))
|
||||
|
||||
proc findBoundsImpl(buf: cstring, pattern: Regex,
|
||||
start = 0, bufSize = 0, flags = 0): tuple[first, last: int] =
|
||||
var rtarray = initRtArray[cint](3)
|
||||
let rawMatches = rtarray.getRawData
|
||||
let res = pcre.exec(pattern.h, pattern.e, buf, bufSize.cint, start.cint, flags.int32,
|
||||
cast[ptr cint](rawMatches), 3)
|
||||
|
||||
if res < 0'i32:
|
||||
start = 0, bufSize = 0,
|
||||
options = 0'u32): tuple[first, last: int] =
|
||||
let matchData = newMatchData(1)
|
||||
defer: pcre2.match_data_free(matchData)
|
||||
let res = rawMatch(buf, pattern, start, bufSize, options, matchData)
|
||||
let rawMatches = ovector(matchData)
|
||||
if res < 0:
|
||||
result = (-1, 0)
|
||||
else:
|
||||
result = (int(rawMatches[0]), int(rawMatches[1]-1))
|
||||
result = (int(rawMatches[0]), int(rawMatches[1]) - 1)
|
||||
|
||||
proc findBounds*(buf: cstring, pattern: Regex,
|
||||
start = 0, bufSize: int): tuple[first, last: int] =
|
||||
## returns the `first` and `last` position of `pattern` in `buf`,
|
||||
## where `buf` has length `bufSize` (not necessarily `'\0'` terminated).
|
||||
## If it does not match, `(-1,0)` is returned.
|
||||
var
|
||||
rtarray = initRtArray[cint](3)
|
||||
rawMatches = rtarray.getRawData
|
||||
res = pcre.exec(pattern.h, pattern.e, buf, bufSize.cint, start.cint, 0'i32,
|
||||
cast[ptr cint](rawMatches), 3)
|
||||
if res < 0'i32: return (int(res), 0)
|
||||
return (int(rawMatches[0]), int(rawMatches[1]-1))
|
||||
let matchData = newMatchData(1)
|
||||
defer: pcre2.match_data_free(matchData)
|
||||
let res = rawMatch(buf, pattern, start, bufSize, 0'u32, matchData)
|
||||
let rawMatches = ovector(matchData)
|
||||
if res < 0: return (int(res), 0)
|
||||
return (int(rawMatches[0]), int(rawMatches[1]) - 1)
|
||||
|
||||
proc findBounds*(s: string, pattern: Regex,
|
||||
start = 0): tuple[first, last: int] {.inline.} =
|
||||
@@ -279,14 +309,16 @@ proc findBounds*(s: string, pattern: Regex,
|
||||
result = findBounds(cstring(s), pattern,
|
||||
min(start, MaxReBufSize), min(s.len, MaxReBufSize))
|
||||
|
||||
proc matchOrFind(buf: cstring, pattern: Regex, start, bufSize: int, flags: cint): cint =
|
||||
var
|
||||
rtarray = initRtArray[cint](3)
|
||||
rawMatches = rtarray.getRawData
|
||||
result = pcre.exec(pattern.h, pattern.e, buf, bufSize.cint, start.cint, flags,
|
||||
cast[ptr cint](rawMatches), 3)
|
||||
if result >= 0'i32:
|
||||
result = rawMatches[1] - rawMatches[0]
|
||||
proc matchOrFind(buf: cstring, pattern: Regex, start, bufSize: int,
                 options: uint32): int =
  ## Runs `pattern` on `buf` through `rawMatch` without collecting captures
  ## (the match data holds a single offset pair).
  ##
  ## Returns the length of the overall match, or the negative status from
  ## `rawMatch` when it fails.
  let matchData = newMatchData(1)
  defer: pcre2.match_data_free(matchData)
  let status = rawMatch(buf, pattern, start, bufSize, options, matchData)
  if status < 0:
    return int(status)
  let offsets = ovector(matchData)
  return int(offsets[1]) - int(offsets[0])
|
||||
|
||||
proc matchLen*(s: string, pattern: Regex, matches: var openArray[string],
|
||||
start = 0): int {.inline.} =
|
||||
@@ -295,7 +327,7 @@ proc matchLen*(s: string, pattern: Regex, matches: var openArray[string],
|
||||
## of zero can happen.
|
||||
##
|
||||
## .. note:: The memory for `matches` needs to be allocated before this function is called, otherwise it will just remain empty.
|
||||
result = matchOrFind(cstring(s), pattern, matches, start.cint, s.len.cint, pcre.ANCHORED)
|
||||
result = matchOrFind(cstring(s), pattern, matches, start, s.len, cast[uint32](pcre2.ANCHORED))
|
||||
|
||||
proc matchLen*(buf: cstring, pattern: Regex, matches: var openArray[string],
|
||||
start = 0, bufSize: int): int {.inline.} =
|
||||
@@ -304,7 +336,7 @@ proc matchLen*(buf: cstring, pattern: Regex, matches: var openArray[string],
|
||||
## of zero can happen.
|
||||
##
|
||||
## .. note:: The memory for `matches` needs to be allocated before this function is called, otherwise it will just remain empty.
|
||||
return matchOrFind(buf, pattern, matches, start.cint, bufSize.cint, pcre.ANCHORED)
|
||||
return matchOrFind(buf, pattern, matches, start, bufSize, cast[uint32](pcre2.ANCHORED))
|
||||
|
||||
proc matchLen*(s: string, pattern: Regex, start = 0): int {.inline.} =
|
||||
## the same as `match`, but it returns the length of the match,
|
||||
@@ -315,13 +347,13 @@ proc matchLen*(s: string, pattern: Regex, start = 0): int {.inline.} =
|
||||
doAssert matchLen("abcdefg", re"cde", 2) == 3
|
||||
doAssert matchLen("abcdefg", re"abcde") == 5
|
||||
doAssert matchLen("abcdefg", re"cde") == -1
|
||||
result = matchOrFind(cstring(s), pattern, start.cint, s.len.cint, pcre.ANCHORED)
|
||||
result = matchOrFind(cstring(s), pattern, start, s.len, cast[uint32](pcre2.ANCHORED))
|
||||
|
||||
proc matchLen*(buf: cstring, pattern: Regex, start = 0, bufSize: int): int {.inline.} =
|
||||
## the same as `match`, but it returns the length of the match,
|
||||
## if there is no match, `-1` is returned. Note that a match length
|
||||
## of zero can happen.
|
||||
result = matchOrFind(buf, pattern, start.cint, bufSize, pcre.ANCHORED)
|
||||
result = matchOrFind(buf, pattern, start, bufSize, cast[uint32](pcre2.ANCHORED))
|
||||
|
||||
proc match*(s: string, pattern: Regex, start = 0): bool {.inline.} =
|
||||
## returns `true` if `s[start..]` matches the `pattern`.
|
||||
@@ -361,18 +393,13 @@ proc find*(buf: cstring, pattern: Regex, matches: var openArray[string],
|
||||
## `buf` has length `bufSize` (not necessarily `'\0'` terminated).
|
||||
##
|
||||
## .. note:: The memory for `matches` needs to be allocated before this function is called, otherwise it will just remain empty.
|
||||
var
|
||||
rtarray = initRtArray[cint]((matches.len+1)*3)
|
||||
rawMatches = rtarray.getRawData
|
||||
res = pcre.exec(pattern.h, pattern.e, buf, bufSize.cint, start.cint, 0'i32,
|
||||
cast[ptr cint](rawMatches), (matches.len+1).cint*3)
|
||||
if res < 0'i32: return res
|
||||
for i in 1..int(res)-1:
|
||||
var a = rawMatches[i * 2]
|
||||
var b = rawMatches[i * 2 + 1]
|
||||
if a >= 0'i32: matches[i-1] = bufSubstr(buf, int(a), int(b))
|
||||
else: matches[i-1] = ""
|
||||
return rawMatches[0]
|
||||
let matchData = newMatchData(matches.len + 1)
|
||||
defer: pcre2.match_data_free(matchData)
|
||||
let res = rawMatch(buf, pattern, start, bufSize, 0'u32, matchData)
|
||||
let rawMatches = ovector(matchData)
|
||||
if res < 0: return int(res)
|
||||
copyStringMatches(buf, rawMatches, int(res), matches)
|
||||
return int(rawMatches[0])
|
||||
|
||||
proc find*(s: string, pattern: Regex, matches: var openArray[string],
|
||||
start = 0): int {.inline.} =
|
||||
@@ -387,13 +414,12 @@ proc find*(buf: cstring, pattern: Regex, start = 0, bufSize: int): int =
|
||||
## returns the starting position of `pattern` in `buf`,
|
||||
## where `buf` has length `bufSize` (not necessarily `'\0'` terminated).
|
||||
## If it does not match, `-1` is returned.
|
||||
var
|
||||
rtarray = initRtArray[cint](3)
|
||||
rawMatches = rtarray.getRawData
|
||||
res = pcre.exec(pattern.h, pattern.e, buf, bufSize.cint, start.cint, 0'i32,
|
||||
cast[ptr cint](rawMatches), 3)
|
||||
if res < 0'i32: return res
|
||||
return rawMatches[0]
|
||||
let matchData = newMatchData(1)
|
||||
defer: pcre2.match_data_free(matchData)
|
||||
let res = rawMatch(buf, pattern, start, bufSize, 0'u32, matchData)
|
||||
let rawMatches = ovector(matchData)
|
||||
if res < 0: return int(res)
|
||||
return int(rawMatches[0])
|
||||
|
||||
proc find*(s: string, pattern: Regex, start = 0): int {.inline.} =
|
||||
## returns the starting position of `pattern` in `s`. If it does not
|
||||
@@ -413,40 +439,38 @@ iterator findAll*(s: string, pattern: Regex, start = 0): string =
|
||||
##
|
||||
## Note that since this is an iterator you should not modify the string you
|
||||
## are iterating over: bad things could happen.
|
||||
var
|
||||
i = int32(start)
|
||||
rtarray = initRtArray[cint](3)
|
||||
rawMatches = rtarray.getRawData
|
||||
var i = start
|
||||
let matchData = newMatchData(1)
|
||||
defer: pcre2.match_data_free(matchData)
|
||||
while true:
|
||||
let res = pcre.exec(pattern.h, pattern.e, s, len(s).cint, i, 0'i32,
|
||||
cast[ptr cint](rawMatches), 3)
|
||||
if res < 0'i32: break
|
||||
let a = rawMatches[0]
|
||||
let b = rawMatches[1]
|
||||
if a == b and a == i: break
|
||||
yield substr(s, int(a), int(b)-1)
|
||||
i = b
|
||||
let res = rawMatch(s.cstring, pattern, i, len(s), 0'u32, matchData)
|
||||
if res < 0: break
|
||||
let rawMatches = ovector(matchData)
|
||||
let matchStart = rawMatches[0]
|
||||
let matchEnd = rawMatches[1]
|
||||
if matchStart == matchEnd and matchStart.int == i: break
|
||||
yield substr(s, int(matchStart), int(matchEnd) - 1)
|
||||
i = matchEnd.int
|
||||
|
||||
iterator findAll*(buf: cstring, pattern: Regex, start = 0, bufSize: int): string =
  ## Yields all substrings of `buf` that match `pattern`, searching from
  ## `start`. `bufSize` is passed to the matcher as the length of `buf`.
  ##
  ## Each search resumes where the previous match ended. Iteration stops when
  ## no further match is found, or when an empty match is reported exactly at
  ## the current search position.
  ##
  ## Note that since this is an iterator you should not modify the buffer you
  ## are iterating over: bad things could happen.
  var pos = start
  let matchData = newMatchData(1)
  defer: pcre2.match_data_free(matchData)
  while true:
    let res = rawMatch(buf, pattern, pos, bufSize, 0'u32, matchData)
    if res < 0: break
    let rawMatches = ovector(matchData)
    let matchStart = rawMatches[0]
    let matchEnd = rawMatches[1]
    if matchStart == matchEnd and matchStart.int == pos: break
    let byteCount = int(matchEnd - matchStart)
    var str = newString(byteCount)
    # An empty match ahead of `pos` (e.g. from a lookahead) yields an empty
    # string; indexing `str[0]` would then be out of bounds, so only copy
    # when there are bytes to copy.
    if byteCount > 0:
      copyMem(addr str[0], unsafeAddr(buf[int(matchStart)]), byteCount)
    yield str
    pos = matchEnd.int
|
||||
|
||||
proc findAll*(s: string, pattern: Regex, start = 0): seq[string] {.inline.} =
|
||||
## returns all matching `substrings` of `s` that match `pattern`.
|
||||
@@ -503,7 +527,7 @@ proc replace*(s: string, sub: Regex, by = ""): string =
|
||||
doAssert "var1=key; var2=key2".replace(re"(\w+)=(\w+)", "?") == "?; ?"
|
||||
result = ""
|
||||
var prev = 0
|
||||
var flags = int32(0)
|
||||
var flags = 0'u32
|
||||
while prev < s.len:
|
||||
var match = findBoundsImpl(s.cstring, sub, prev, s.len, flags)
|
||||
flags = 0
|
||||
@@ -512,7 +536,7 @@ proc replace*(s: string, sub: Regex, by = ""): string =
|
||||
add(result, by)
|
||||
if match.first > match.last:
|
||||
# 0-len match
|
||||
flags = pcre.NOTEMPTY_ATSTART
|
||||
flags = pcre2.NOTEMPTY_ATSTART.uint32
|
||||
prev = match.last + 1
|
||||
add(result, substr(s, prev))
|
||||
|
||||
|
||||
260
lib/wrappers/pcre2.nim
Normal file
260
lib/wrappers/pcre2.nim
Normal file
@@ -0,0 +1,260 @@
|
||||
#
|
||||
# Nim's Runtime Library
|
||||
# (c) Copyright 2026 Nim Contributors
|
||||
#
|
||||
# See the file "copying.txt", included in this
|
||||
# distribution, for details about the copyright.
|
||||
#
|
||||
|
||||
## Wrapper for the 8-bit PCRE2 API.
|
||||
|
||||
when sizeof(int) == 4:
|
||||
const ANCHORED* = low(int)
|
||||
else:
|
||||
const ANCHORED* = int(0x80000000)
|
||||
|
||||
const
|
||||
NO_UTF_CHECK* = int(0x40000000)
|
||||
ENDANCHORED* = int(0x20000000)
|
||||
|
||||
const
  ALLOW_EMPTY_CLASS* = 0x00000001
  ALT_BSUX* = 0x00000002
  AUTO_CALLOUT* = 0x00000004
  CASELESS* = 0x00000008
  DOLLAR_ENDONLY* = 0x00000010
  DOTALL* = 0x00000020
  DUPNAMES* = 0x00000040
  EXTENDED* = 0x00000080
  FIRSTLINE* = 0x00000100
  MATCH_UNSET_BACKREF* = 0x00000200
  MULTILINE* = 0x00000400
  NEVER_UCP* = 0x00000800
  NEVER_UTF* = 0x00001000
  NO_AUTO_CAPTURE* = 0x00002000
  NO_AUTO_POSSESS* = 0x00004000
  NO_DOTSTAR_ANCHOR* = 0x00008000
  NO_START_OPTIMIZE* = 0x00010000
  NO_START_OPTIMISE* = NO_START_OPTIMIZE # alias, alternative spelling
  UCP* = 0x00020000
  UNGREEDY* = 0x00040000
  UTF* = 0x00080000
  UTF8* = UTF # alias of `UTF`
  NEVER_BACKSLASH_C* = 0x00100000
  ALT_CIRCUMFLEX* = 0x00200000
  ALT_VERBNAMES* = 0x00400000
  USE_OFFSET_LIMIT* = 0x00800000
  EXTENDED_MORE* = 0x01000000
  LITERAL* = 0x02000000
  MATCH_INVALID_UTF* = 0x04000000
  ALT_EXTENDED_CLASS* = 0x08000000

  JAVASCRIPT_COMPAT* = ALT_BSUX
    ## PCRE2 no longer exposes PCRE's `JAVASCRIPT_COMPAT` option. `ALT_BSUX`
    ## preserves the most important JavaScript-style escape handling.
|
||||
|
||||
const
|
||||
JIT_COMPLETE* = 0x00000001
|
||||
JIT_PARTIAL_SOFT* = 0x00000002
|
||||
JIT_PARTIAL_HARD* = 0x00000004
|
||||
JIT_INVALID_UTF* = 0x00000100
|
||||
JIT_TEST_ALLOC* = 0x00000200
|
||||
|
||||
const
|
||||
NOTBOL* = 0x00000001
|
||||
NOTEOL* = 0x00000002
|
||||
NOTEMPTY* = 0x00000004
|
||||
NOTEMPTY_ATSTART* = 0x00000008
|
||||
PARTIAL_SOFT* = 0x00000010
|
||||
PARTIAL_HARD* = 0x00000020
|
||||
DFA_RESTART* = 0x00000040
|
||||
DFA_SHORTEST* = 0x00000080
|
||||
NO_JIT* = 0x00002000
|
||||
COPY_MATCHED_SUBJECT* = 0x00004000
|
||||
DISABLE_RECURSELOOP_CHECK* = 0x00040000
|
||||
|
||||
const
|
||||
NEWLINE_CR* = 1
|
||||
NEWLINE_LF* = 2
|
||||
NEWLINE_CRLF* = 3
|
||||
NEWLINE_ANY* = 4
|
||||
NEWLINE_ANYCRLF* = 5
|
||||
NEWLINE_NUL* = 6
|
||||
BSR_UNICODE* = 1
|
||||
BSR_ANYCRLF* = 2
|
||||
|
||||
const
|
||||
ERROR_NOMATCH* = -1
|
||||
ERROR_PARTIAL* = -2
|
||||
|
||||
ERROR_UTF8_ERR1* = -3
|
||||
ERROR_UTF8_ERR2* = -4
|
||||
ERROR_UTF8_ERR3* = -5
|
||||
ERROR_UTF8_ERR4* = -6
|
||||
ERROR_UTF8_ERR5* = -7
|
||||
ERROR_UTF8_ERR6* = -8
|
||||
ERROR_UTF8_ERR7* = -9
|
||||
ERROR_UTF8_ERR8* = -10
|
||||
ERROR_UTF8_ERR9* = -11
|
||||
ERROR_UTF8_ERR10* = -12
|
||||
ERROR_UTF8_ERR11* = -13
|
||||
ERROR_UTF8_ERR12* = -14
|
||||
ERROR_UTF8_ERR13* = -15
|
||||
ERROR_UTF8_ERR14* = -16
|
||||
ERROR_UTF8_ERR15* = -17
|
||||
ERROR_UTF8_ERR16* = -18
|
||||
ERROR_UTF8_ERR17* = -19
|
||||
ERROR_UTF8_ERR18* = -20
|
||||
ERROR_UTF8_ERR19* = -21
|
||||
ERROR_UTF8_ERR20* = -22
|
||||
ERROR_UTF8_ERR21* = -23
|
||||
|
||||
ERROR_BADDATA* = -29
|
||||
ERROR_MIXEDTABLES* = -30
|
||||
ERROR_BADMAGIC* = -31
|
||||
ERROR_BADMODE* = -32
|
||||
ERROR_BADOFFSET* = -33
|
||||
ERROR_BADOPTION* = -34
|
||||
ERROR_BADREPLACEMENT* = -35
|
||||
ERROR_BADUTFOFFSET* = -36
|
||||
ERROR_CALLOUT* = -37
|
||||
ERROR_INTERNAL* = -44
|
||||
ERROR_JIT_BADOPTION* = -45
|
||||
ERROR_JIT_STACKLIMIT* = -46
|
||||
ERROR_MATCHLIMIT* = -47
|
||||
ERROR_NOMEMORY* = -48
|
||||
ERROR_NOSUBSTRING* = -49
|
||||
ERROR_NULL* = -51
|
||||
ERROR_RECURSELOOP* = -52
|
||||
ERROR_DEPTHLIMIT* = -53
|
||||
ERROR_RECURSIONLIMIT* = ERROR_DEPTHLIMIT
|
||||
ERROR_UNAVAILABLE* = -54
|
||||
ERROR_UNSET* = -55
|
||||
ERROR_BADOFFSETLIMIT* = -56
|
||||
ERROR_HEAPLIMIT* = -63
|
||||
ERROR_DFA_UINVALID_UTF* = -66
|
||||
ERROR_INVALIDOFFSET* = -67
|
||||
ERROR_JIT_UNSUPPORTED* = -68
|
||||
|
||||
const
|
||||
INFO_ALLOPTIONS* = 0
|
||||
INFO_ARGOPTIONS* = 1
|
||||
INFO_BACKREFMAX* = 2
|
||||
INFO_BSR* = 3
|
||||
INFO_CAPTURECOUNT* = 4
|
||||
INFO_FIRSTCODEUNIT* = 5
|
||||
INFO_FIRSTCODETYPE* = 6
|
||||
INFO_FIRSTBITMAP* = 7
|
||||
INFO_HASCRORLF* = 8
|
||||
INFO_JCHANGED* = 9
|
||||
INFO_JITSIZE* = 10
|
||||
INFO_LASTCODEUNIT* = 11
|
||||
INFO_LASTCODETYPE* = 12
|
||||
INFO_MATCHEMPTY* = 13
|
||||
INFO_MATCHLIMIT* = 14
|
||||
INFO_MAXLOOKBEHIND* = 15
|
||||
INFO_MINLENGTH* = 16
|
||||
INFO_NAMECOUNT* = 17
|
||||
INFO_NAMEENTRYSIZE* = 18
|
||||
INFO_NAMETABLE* = 19
|
||||
INFO_NEWLINE* = 20
|
||||
INFO_DEPTHLIMIT* = 21
|
||||
INFO_RECURSIONLIMIT* = INFO_DEPTHLIMIT
|
||||
INFO_SIZE* = 22
|
||||
INFO_HASBACKSLASHC* = 23
|
||||
INFO_FRAMESIZE* = 24
|
||||
INFO_HEAPLIMIT* = 25
|
||||
INFO_EXTRAOPTIONS* = 26
|
||||
|
||||
const
|
||||
CONFIG_BSR* = 0
|
||||
CONFIG_JIT* = 1
|
||||
CONFIG_JITTARGET* = 2
|
||||
CONFIG_LINKSIZE* = 3
|
||||
CONFIG_MATCHLIMIT* = 4
|
||||
CONFIG_NEWLINE* = 5
|
||||
CONFIG_PARENSLIMIT* = 6
|
||||
CONFIG_DEPTHLIMIT* = 7
|
||||
CONFIG_RECURSIONLIMIT* = CONFIG_DEPTHLIMIT
|
||||
CONFIG_STACKRECURSE* = 8
|
||||
CONFIG_UNICODE* = 9
|
||||
CONFIG_UNICODE_VERSION* = 10
|
||||
CONFIG_VERSION* = 11
|
||||
CONFIG_HEAPLIMIT* = 12
|
||||
CONFIG_NEVER_BACKSLASH_C* = 13
|
||||
CONFIG_COMPILED_WIDTHS* = 14
|
||||
CONFIG_TABLES_LENGTH* = 15
|
||||
|
||||
const
|
||||
ZERO_TERMINATED* = not 0.csize_t
|
||||
UNSET* = not 0.csize_t
|
||||
|
||||
type
|
||||
Pcre* = object
|
||||
CompileContext* = object
|
||||
GeneralContext* = object
|
||||
MatchContext* = object
|
||||
MatchData* = object
|
||||
JitStack* = object
|
||||
|
||||
when not defined(usePcreHeader):
|
||||
when hostOS == "windows":
|
||||
const pcre2Dll = "pcre2-8.dll"
|
||||
elif hostOS == "macosx":
|
||||
const pcre2Dll = "libpcre2-8(.0|).dylib"
|
||||
else:
|
||||
const pcre2Dll = "libpcre2-8.so(.0|)"
|
||||
{.push dynlib: pcre2Dll.}
|
||||
else:
|
||||
{.passC: "-DPCRE2_CODE_UNIT_WIDTH=8".}
|
||||
{.push header: "<pcre2.h>".}
|
||||
|
||||
{.push cdecl, importc: "pcre2_$1_8".}
# Every proc below is imported under the C name `pcre2_<nimName>_8`
# (`$1` in the `importc` pattern stands for the Nim identifier).

proc compile*(pattern: ptr uint8,
              length: csize_t,
              options: uint32,
              errorCode: ptr cint,
              errorOffset: ptr csize_t,
              context: ptr CompileContext): ptr Pcre
  ## Binds `pcre2_compile_8`.

proc code_free*(code: ptr Pcre)
  ## Binds `pcre2_code_free_8`.

proc config*(what: uint32,
             where: pointer): cint
  ## Binds `pcre2_config_8`.

proc get_error_message*(errorCode: cint,
                        buffer: ptr uint8,
                        bufferLength: csize_t): cint
  ## Binds `pcre2_get_error_message_8`.

proc match*(code: ptr Pcre,
            subject: ptr uint8,
            length: csize_t,
            startOffset: csize_t,
            options: uint32,
            matchData: ptr MatchData,
            context: ptr MatchContext): cint
  ## Binds `pcre2_match_8`.

proc match_data_create*(oveccount: uint32,
                        context: ptr GeneralContext): ptr MatchData
  ## Binds `pcre2_match_data_create_8`.

proc match_data_create_from_pattern*(code: ptr Pcre,
                                     context: ptr GeneralContext): ptr MatchData
  ## Binds `pcre2_match_data_create_from_pattern_8`.

proc match_data_free*(matchData: ptr MatchData)
  ## Binds `pcre2_match_data_free_8`.

proc get_ovector_pointer*(matchData: ptr MatchData): ptr csize_t
  ## Binds `pcre2_get_ovector_pointer_8`.

proc get_ovector_count*(matchData: ptr MatchData): uint32
  ## Binds `pcre2_get_ovector_count_8`.

proc pattern_info*(code: ptr Pcre,
                   what: uint32,
                   where: pointer): cint
  ## Binds `pcre2_pattern_info_8`.

proc jit_compile*(code: ptr Pcre,
                  options: uint32): cint
  ## Binds `pcre2_jit_compile_8`.

proc jit_free_unused_memory*(context: ptr GeneralContext = nil)
  ## Binds `pcre2_jit_free_unused_memory_8`.
  ##
  ## The C function takes a general-context pointer; it defaults to `nil`
  ## here so that existing zero-argument calls keep compiling.

{.pop.}
|
||||
{.pop.}
|
||||
@@ -110,7 +110,7 @@ image: freebsd/latest
|
||||
packages:
|
||||
- databases/sqlite3
|
||||
- devel/boehm-gc-threaded
|
||||
- devel/pcre
|
||||
- devel/pcre2
|
||||
- devel/sdl20
|
||||
- devel/sfml
|
||||
- www/node
|
||||
@@ -124,7 +124,7 @@ packages:
|
||||
- sqlite3
|
||||
- node
|
||||
- boehm-gc
|
||||
- pcre
|
||||
- pcre2
|
||||
- sfml
|
||||
- sdl2
|
||||
- libffi
|
||||
|
||||
@@ -126,6 +126,7 @@ mm.md
|
||||
withoutIndex = """
|
||||
lib/wrappers/tinyc.nim
|
||||
lib/wrappers/pcre.nim
|
||||
lib/wrappers/pcre2.nim
|
||||
lib/wrappers/openssl.nim
|
||||
lib/posix/posix.nim
|
||||
lib/posix/linux.nim
|
||||
|
||||
@@ -729,7 +729,7 @@ iterator searchFile(pattern: Pattern; buffer: string): Output =
|
||||
i = t.last+1
|
||||
when typeof(pattern) is Regex:
|
||||
if buffer.len > MaxReBufSize:
|
||||
yield Output(kind: openError, msg: "PCRE size limit is " & $MaxReBufSize)
|
||||
yield Output(kind: openError, msg: "PCRE2 size limit is " & $MaxReBufSize)
|
||||
|
||||
func detectBin(buffer: string): bool =
|
||||
for i in 0 ..< min(1024, buffer.len):
|
||||
|
||||
Reference in New Issue
Block a user