NOTE(review): this patch was recovered from a copy whose newlines and
indentation had been collapsed. The hunk line counts below match their
headers, but the leading whitespace of every line is reconstructed from Nim
syntax -- confirm against the repository (or regenerate the patch) before
applying.

diff --git a/changelog.md b/changelog.md
index 1790684126..5f1ff21778 100644
--- a/changelog.md
+++ b/changelog.md
@@ -43,6 +43,15 @@ parameter and result types, not just their source-level shape. Use
 
 [//]: # "Additions:"
 
+- Added `system.readRawDataStable`, a companion to `readRawData` that returns a
+  raw `ptr UncheckedArray[char]` into a string's character data which stays valid
+  across moves and copies of the string value. It is available under every string
+  implementation (refc, ARC/ORC and `--strings:sso`) with the same signature, so
+  code can pin an interior buffer pointer today and be ready for `--strings:sso`
+  without `when declared` guards. Under `--strings:sso` it promotes a small inline
+  string to its heap representation first; under the other implementations the data
+  is already heap-resident, so it is equivalent to `readRawData`.
+
 - `setutils.symmetricDifference` along with its operator version
   `` setutils.`-+-` `` and in-place version `setutils.toggle` have been added
   to more efficiently calculate the symmetric difference of bitsets.
diff --git a/lib/system.nim b/lib/system.nim
index 94fe96ea93..313f90969c 100644
--- a/lib/system.nim
+++ b/lib/system.nim
@@ -1723,11 +1723,18 @@ when not (notJSnotNims and defined(nimSeqsV2)):
       let ns = cast[NimString](s)
       if ns == nil: nil
       else: cast[ptr UncheckedArray[char]](addr ns.data[start])
+    template readRawDataStable*(s: var string; start = 0): ptr UncheckedArray[char] =
+      ## Same as `readRawData` here: the data lives in a heap `NimStringDesc` at a
+      ## stable address, so the pointer already survives moves of `s`. Takes `s` by
+      ## `var` to match the `--strings:sso` version, so code can prepare for that
+      ## upgrade without `when declared` guards.
+      readRawData(s, start)
   else:
     # JS/nimscript: callers are guarded by whenNotVmJsNims/when not defined(js)
     proc beginStore*(s: var string; newLen: int; start = 0): ptr UncheckedArray[char] {.inline, noSideEffect, raises: [], tags: [].} = nil
     proc endStore*(s: var string) {.inline, noSideEffect, raises: [], tags: [].} = discard
     template readRawData*(s: string; start = 0): ptr UncheckedArray[char] = nil
+    template readRawDataStable*(s: var string; start = 0): ptr UncheckedArray[char] = nil
 
 when not defined(js):
   template newSeqImpl(T, len) =
diff --git a/lib/system/strs_v2.nim b/lib/system/strs_v2.nim
index 640e8eeb3f..b0a6264cf8 100644
--- a/lib/system/strs_v2.nim
+++ b/lib/system/strs_v2.nim
@@ -261,4 +261,14 @@ template readRawData*(s: string; start = 0): ptr UncheckedArray[char] =
   ## Template ensures no copy of `s`; ptr is valid while `s` is alive.
   rawDataImpl(cast[ptr NimStringV2](unsafeAddr s), start)
 
+template readRawDataStable*(s: var string; start = 0): ptr UncheckedArray[char] =
+  ## Like `readRawData`, but the returned pointer additionally survives moves and
+  ## copies of `s` (while `s` stays alive and is not reassigned). For this string
+  ## implementation the char data already lives in a heap payload at an address
+  ## independent of the `string` value itself, so no promotion is needed and this
+  ## is identical to `readRawData`. Takes `s` by `var` to match the `--strings:sso`
+  ## version (which promotes a small inline string to the heap), so code written
+  ## against `readRawDataStable` compiles unchanged under either implementation.
+  rawDataImpl(cast[ptr NimStringV2](addr s), start)
+
 {.pop.}
diff --git a/lib/system/strs_v3.nim b/lib/system/strs_v3.nim
index 58462ba522..65c0fa88c9 100644
--- a/lib/system/strs_v3.nim
+++ b/lib/system/strs_v3.nim
@@ -770,6 +770,33 @@ template readRawData*(s: string; start = 0): ptr UncheckedArray[char] =
   ## Template ensures no copy of `s` is made; ptr is valid while `s` is alive.
   rawDataImpl(cast[ptr SmallString](unsafeAddr s), start)
 
+proc readRawDataStable*(s: var string; start = 0): ptr UncheckedArray[char] {.inline.} =
+  ## Like `readRawData`, but the returned pointer stays valid across moves and
+  ## copies of `s` (as long as `s` stays alive and is not reassigned). A
+  ## short/medium string keeps its chars *inline* in the string object, so a
+  ## plain `readRawData` pointer dangles the moment the object is moved; this
+  ## promotes `s` to its heap (long) representation first, whose payload address
+  ## is independent of where the string object itself lives. Use this whenever an
+  ## interior pointer must outlive the current scope of the owning string (e.g.
+  ## a cursor cached alongside the buffer it points into).
+  let ss = cast[ptr SmallString](addr s)
+  let slen = ssLen(ss[])
+  if slen > 0 and slen <= PayloadSize:
+    # Promote inline/medium to a long heap block so the payload lives at a
+    # stable address. Mirrors the short/medium -> long transition in `add`.
+    let newCap = max(slen, resize(slen))
+    let p = cast[ptr LongString](alloc(LongStringDataOffset + newCap + 1))
+    p.rc = 1
+    p.fullLen = slen
+    p.capImpl = newCap
+    copyMem(addr p.data[0], inlinePtr(ss[]), slen)
+    p.data[slen] = '\0'
+    ss[].more = p
+    setSSLen(ss[], HeapSlen)
+    # Hot-prefix cache (bytes 1..AlwaysAvail) already mirrors data[0..AlwaysAvail-1]
+    # because setSSLen only rewrote byte 0; the inline chars are untouched.
+  rawDataImpl(ss, start)
+
 # These take `string` (tyString) so the codegen uses them directly, bypassing
 # strmantle.nim's versions which go through nimStrLen/nimStrAtMutV3 compilerproc calls.
 proc cmpStrings(a, b: string): int {.compilerproc, inline.} =
diff --git a/tests/system/treadrawdatastable.nim b/tests/system/treadrawdatastable.nim
new file mode 100644
index 0000000000..0d4aaee54b
--- /dev/null
+++ b/tests/system/treadrawdatastable.nim
@@ -0,0 +1,94 @@
+discard """
+  matrix: "--mm:refc; --mm:orc; --mm:orc --strings:sso; --backend:cpp --mm:orc; --backend:js --mm:orc"
+  output: "OK"
+"""
+
+# Tests for `readRawDataStable` and the SSO static-long-string promotion path.
+# `readRawDataStable` is available under every string implementation (refc / v2 /
+# v3-sso / js) with the same signature, so the code below compiles unchanged on
+# all backends -- the point being that users can prepare for `--strings:sso`
+# without `when declared` hacks.
+
+import std/assertions
+
+const hasNativeSso = defined(nimsso) and
+  (defined(gcArc) or defined(gcAtomicArc) or defined(gcOrc) or defined(gcYrc))
+
+type
+  Reader = object
+    buf: string
+    p: ptr UncheckedArray[char]
+
+proc openFromBuffer(buf: sink string): Reader =
+  # `result` (and thus `buf`) is moved into the caller on return. A plain
+  # `readRawData` pointer into a small SSO string would dangle after that move;
+  # `readRawDataStable` pins the buffer to a stable address first.
+  result = Reader(buf: buf)
+  result.p = readRawDataStable(result.buf)
+
+proc testStable() =
+  when not defined(js): # raw pointers are a degenerate nil no-op on the JS backend
+    block: # short buffer (kept inline under SSO) survives the move
+      var r = openFromBuffer("hello")
+      doAssert r.buf == "hello"
+      doAssert r.p[0] == 'h'
+      doAssert r.p[4] == 'o'
+      # Stable pointer == the live buffer's raw data after the move.
+      doAssert cast[uint](r.p) == cast[uint](readRawData(r.buf))
+    block: # medium buffer (len 12: inline overlay under SSO)
+      var r = openFromBuffer("hello world!")
+      doAssert r.p[11] == '!'
+    block: # already-long buffer: returned as-is (already heap-resident)
+      var r = openFromBuffer("this is a fairly long string buffer")
+      doAssert r.p[0] == 't'
+      doAssert r.p[34] == 'r'
+    block: # empty string: API is callable (the data pointer is implementation-defined)
+      var e = ""
+      discard readRawDataStable(e)
+  else:
+    # On JS the API exists and is callable (returns nil) so call sites are portable.
+    var s = "hello"
+    discard readRawDataStable(s)
+
+proc testStaticLongPromotion() =
+  # Regression for the static-long -> heap promotion: when a string literal
+  # longer than the inline payload (PayloadSize = 14 under SSO) is first
+  # mutated, the new heap block must be filled from the full static payload,
+  # not from the 7-byte inline hot-prefix cache. Reading from the cache copied
+  # 7 valid chars and then ran off into the `more` pointer bytes -- the bug that
+  # corrupted .nif index files on Windows bootstrap (see Nimony tstatic_long_add).
+  # The assertion holds on every backend; only SSO ever risked the corruption.
+  var content = "(.nif27)\n(index\n" # len 16
+  let expected = "(.nif27)\n(index\n"
+  content.add 'X' # triggers static-long -> heap promotion
+  doAssert content.len == 17
+  doAssert content == expected & "X"
+  for i in 0 ..< expected.len:
+    doAssert content[i] == expected[i]
+
+when hasNativeSso:
+  # A few SSO-tier-boundary sanity checks (short / medium / long, COW, shrink).
+  proc testSsoTiers() =
+    var a = "(.nif27)\n(index\n" # static long
+    let b = "(.nif27)\n(index\n"
+    doAssert a == b
+    a.add 'Z'
+    doAssert a == "(.nif27)\n(index\nZ"
+
+    var c = "abcdefghijklmnop" # static long, len 16
+    var d = c # COW share
+    d[0] = 'X'
+    doAssert c == "abcdefghijklmnop" # original untouched
+    doAssert d == "Xbcdefghijklmnop"
+
+    var e = "abcdefghijklmnop"
+    e.setLen 3 # shrink below the inline cache size
+    doAssert e == "abc"
+    doAssert e.len == 3
+else:
+  proc testSsoTiers() = discard
+
+testStable()
+testStaticLongPromotion()
+testSsoTiers()
+echo "OK"