diff --git a/changelog.md b/changelog.md index 53e0c0d476..46bdc015d9 100644 --- a/changelog.md +++ b/changelog.md @@ -72,6 +72,12 @@ errors. - `min`, `max`, and `sequtils`' `minIndex`, `maxIndex` and `minmax` for `openArray`s now accept a comparison function. - `system.substr` implementation now uses `copymem` (wrapped C `memcpy`) for copying data, if available at compilation. - `system.newStringUninit` is now considered free of side-effects allowing it to be used with `--experimental:strictFuncs`. +- `std/re` and `std/nre` are deprecated because the PCRE library is obsolete. + Use https://github.com/nitely/nim-regex or `std/nre2`. + See: https://github.com/nim-lang/Nim/issues/23668. +- `std/pegs` now correctly lexes UTF-8 bytes inside bare identifier-style + terminals, so case-insensitive matching of non-ASCII terms (e.g. ``\i café``) + works without single-quoting. ## Language changes diff --git a/lib/pure/pegs.nim b/lib/pure/pegs.nim index 804572d04b..d0b16c14fc 100644 --- a/lib/pure/pegs.nim +++ b/lib/pure/pegs.nim @@ -1668,7 +1668,10 @@ func getSymbol(c: var PegLexer, tok: var Token) = while pos < c.buf.len: add(tok.literal, c.buf[pos]) inc(pos) - if pos < c.buf.len and c.buf[pos] notin strutils.IdentChars: break + if pos < c.buf.len: + let ch = c.buf[pos] + # Keep non-ASCII bytes so UTF-8 terminals reach the rune-aware matchers. 
+ if ch notin strutils.IdentChars and ord(ch) < 0x80: break c.bufpos = pos tok.kind = tkIdentifier diff --git a/tests/stdlib/tpegs.nim b/tests/stdlib/tpegs.nim index 0c1fe1ef32..537e64e58e 100644 --- a/tests/stdlib/tpegs.nim +++ b/tests/stdlib/tpegs.nim @@ -259,6 +259,11 @@ block: doAssert match("EINE ÜBERSICHT UND AUSSERDEM", peg"(\upper \white*)+") doAssert(not match("456678", peg"(\letter)+")) + block: + doAssert match("CAFÉ", peg"\i café") + doAssert match("Café", peg"\i café") + doAssert "two cafés: Café and CAFÉ".findAll(peg"\i café").len == 3 + doAssert("var1 = key; var2 = key2".replacef( peg"\skip(\s*) {\ident}'='{\ident}", "$1<-$2$2") == "var1<-keykey;var2<-key2key2")