Fix zero-length matches for multibyte characters

This commit is contained in:
Oleh Prypin
2015-04-10 11:20:05 +03:00
parent a665279192
commit 16577f8167
4 changed files with 8 additions and 5 deletions

View File

@@ -352,16 +352,16 @@ iterator findIter*(str: string, pattern: Regex, start = 0, endpos = int.high): R
if match.isNone:
# either the end of the input or the string
# cannot be split here
offset += 1
if matchesCrLf and offset < (str.len - 1) and
str[offset] == '\r' and str[offset + 1] == '\l':
# if PCRE treats CrLf as newline, skip both at the same time
offset += 1
offset += 2
elif unicode:
# XXX what about invalid unicode?
offset += str.runeLenAt(offset)
assert(offset <= strlen)
else:
offset += 1
else:
offset = match.get.matchBounds.b + 1

View File

@@ -19,4 +19,6 @@ suite "find":
test "len 0 find":
check("".findAll(re"\ ") == newSeq[string]())
check("".findAll(re"") == @[""])
check("word word".findAll(nre.re"\b") == @["", "", "", ""])
check("word word".findAll(re"\b") == @["", "", "", ""])
check("word\r\lword".findAll(re(r"$", "m<anycrlf>")) == @["", ""])
check("слово слово".findAll(re(r"\b", "uW")) == @["", "", "", ""])

View File

@@ -4,4 +4,3 @@ suite "Misc tests":
test "unicode":
check("".find(re("", "8")).match == "")
check("перевірка".replace(re(r"\w", "uW"), "") == "")

View File

@@ -21,6 +21,8 @@ suite "string splitting":
check("12345".split(re("")) == @["1", "2", "3", "4", "5"])
check("".split(re"") == newSeq[string]())
check("word word".split(re"\b") == @["word", " ", "word"])
check("word\r\lword".split(re(r"$", "m<anycrlf>")) == @["word", "\r\lword"])
check("слово слово".split(re(r"(\b)", "uW")) == @["", "слово", "", " ", "", "слово", ""])
test "perl split tests":
check("forty-two" .split(re"") .join(",") == "f,o,r,t,y,-,t,w,o")