SSO: add readRawDataStable across all string implementations (#25909)

Companion to readRawData whose pointer stays valid across moves/copies
of the string. Under --strings:sso it promotes a small inline string to
its heap representation; under refc/v2 the data is already heap-resident
so it aliases readRawData.

Uniform `var string` signature on every backend so code can prepare for
--strings:sso without `when declared`.
This commit is contained in:
Andreas Rumpf
2026-06-13 19:27:22 +02:00
committed by GitHub
parent c292ab987b
commit 9d7c0cc683
5 changed files with 147 additions and 0 deletions

View File

@@ -43,6 +43,15 @@ parameter and result types, not just their source-level shape. Use
[//]: # "Additions:"
- Added `system.readRawDataStable`, a companion to `readRawData` that returns a
raw `ptr UncheckedArray[char]` into a string's character data which stays valid
across moves and copies of the string value. It is available under every string
implementation (refc, ARC/ORC and `--strings:sso`) with the same signature, so
code can pin an interior buffer pointer today and be ready for `--strings:sso`
without `when declared` guards. Under `--strings:sso` it promotes a small inline
string to its heap representation first; under the other implementations the data
is already heap-resident, so it is equivalent to `readRawData`.
- `setutils.symmetricDifference` along with its operator version
`` setutils.`-+-` `` and in-place version `setutils.toggle` have been added
to more efficiently calculate the symmetric difference of bitsets.

View File

@@ -1723,11 +1723,18 @@ when not (notJSnotNims and defined(nimSeqsV2)):
let ns = cast[NimString](s)
if ns == nil: nil
else: cast[ptr UncheckedArray[char]](addr ns.data[start])
template readRawDataStable*(s: var string; start = 0): ptr UncheckedArray[char] =
## Returns a raw pointer to the character data of `s`, beginning at index
## `start`. Expands to exactly `readRawData(s, start)`.
##
## Same as `readRawData` here: the data lives in a heap `NimStringDesc` at a
## stable address, so the pointer already survives moves of `s`. Takes `s` by
## `var` only to match the `--strings:sso` version, so code can prepare for that
## upgrade without `when declared` guards.
##
## This template adds no range check on `start` of its own; whatever
## `readRawData` does with it applies unchanged.
## NOTE(review): presumably the pointer stops being valid once `s` is reassigned
## or its data block is reallocated (e.g. by growing `s`) -- confirm and state
## it here if so.
readRawData(s, start)
else:
# JS/nimscript: callers are guarded by whenNotVmJsNims/when not defined(js)
# The definitions below are placeholders for these backends: the two
# pointer-returning procs/templates always yield `nil` and `endStore` does
# nothing, so code that merely mentions these names still compiles.
proc beginStore*(s: var string; newLen: int; start = 0): ptr UncheckedArray[char] {.inline, noSideEffect, raises: [], tags: [].} = nil
proc endStore*(s: var string) {.inline, noSideEffect, raises: [], tags: [].} = discard
template readRawData*(s: string; start = 0): ptr UncheckedArray[char] = nil
# Placeholder with the same `var string` signature as the other
# `readRawDataStable` definitions; always evaluates to `nil` here.
template readRawDataStable*(s: var string; start = 0): ptr UncheckedArray[char] = nil
when not defined(js):
template newSeqImpl(T, len) =

View File

@@ -261,4 +261,14 @@ template readRawData*(s: string; start = 0): ptr UncheckedArray[char] =
## Template ensures no copy of `s`; ptr is valid while `s` is alive.
rawDataImpl(cast[ptr NimStringV2](unsafeAddr s), start)
template readRawDataStable*(s: var string; start = 0): ptr UncheckedArray[char] =
## Returns a raw pointer to the chars of `s`, beginning at index `start`.
##
## Like `readRawData`, but the returned pointer additionally survives moves and
## copies of `s` (while `s` stays alive and is not reassigned). For this string
## implementation the char data already lives in a heap payload at an address
## independent of the `string` value itself, so no promotion is needed and this
## is identical to `readRawData`. Takes `s` by `var` to match the `--strings:sso`
## version (which promotes a small inline string to the heap), so code written
## against `readRawDataStable` compiles unchanged under either implementation.
##
## Because `s` is a `var` parameter its address is taken with plain `addr`
## instead of the `unsafeAddr` used by `readRawData`; the value handed to
## `rawDataImpl` is the same `ptr NimStringV2` either way.
## NOTE(review): presumably a mutation that reallocates the payload (e.g.
## growing `s`) also invalidates the pointer -- confirm and document if so.
rawDataImpl(cast[ptr NimStringV2](addr s), start)
{.pop.}

View File

@@ -770,6 +770,33 @@ template readRawData*(s: string; start = 0): ptr UncheckedArray[char] =
## Template ensures no copy of `s` is made; ptr is valid while `s` is alive.
rawDataImpl(cast[ptr SmallString](unsafeAddr s), start)
proc readRawDataStable*(s: var string; start = 0): ptr UncheckedArray[char] {.inline.} =
## Returns a raw pointer to the chars of `s`, beginning at index `start`, after
## making sure a non-empty inline string has been moved to a heap block.
##
## Like `readRawData`, but the returned pointer stays valid across moves and
## copies of `s` (as long as `s` stays alive and is not reassigned). A
## short/medium string keeps its chars *inline* in the string object, so a
## plain `readRawData` pointer dangles the moment the object is moved; this
## promotes `s` to its heap (long) representation first, whose payload address
## is independent of where the string object itself lives. Use this whenever an
## interior pointer must outlive the current scope of the owning string (e.g.
## a cursor cached alongside the buffer it points into).
##
## Side effect: when `0 < ssLen(s) <= PayloadSize` this allocates a new block
## and rewrites `s` in place to reference it, which is why `s` is taken by
## `var`. Every other string is handed to `rawDataImpl` untouched.
## NOTE(review): an empty string is never promoted, so the pointer returned for
## it is whatever `rawDataImpl` yields -- confirm no caller relies on it.
let ss = cast[ptr SmallString](addr s)
let slen = ssLen(ss[])
if slen > 0 and slen <= PayloadSize:
# Promote inline/medium to a long heap block so the payload lives at a
# stable address. Mirrors the short/medium -> long transition in `add`.
let newCap = max(slen, resize(slen))
# Request room for everything before `data` (`LongStringDataOffset`), `newCap`
# chars and one extra byte for the '\0' written below.
let p = cast[ptr LongString](alloc(LongStringDataOffset + newCap + 1))
# NOTE(review): `rc` is presumably the block's reference count, started at 1
# because `s` is the only string referring to the fresh block -- confirm.
p.rc = 1
p.fullLen = slen
p.capImpl = newCap
# Copy the chars out of the inline storage *before* `more` is assigned.
# NOTE(review): presumably a medium string's chars overlay the `more` field,
# so swapping these statements would overwrite the source -- confirm.
copyMem(addr p.data[0], inlinePtr(ss[]), slen)
p.data[slen] = '\0'
ss[].more = p
setSSLen(ss[], HeapSlen)
# Hot-prefix cache (bytes 1..AlwaysAvail) already mirrors data[0..AlwaysAvail-1]
# because setSSLen only rewrote byte 0; the inline chars are untouched.
rawDataImpl(ss, start)
# These take `string` (tyString) so the codegen uses them directly, bypassing
# strmantle.nim's versions which go through nimStrLen/nimStrAtMutV3 compilerproc calls.
proc cmpStrings(a, b: string): int {.compilerproc, inline.} =

View File

@@ -0,0 +1,94 @@
discard """
matrix: "--mm:refc; --mm:orc; --mm:orc --strings:sso; --backend:cpp --mm:orc; --backend:js --mm:orc"
output: "OK"
"""
# Tests for `readRawDataStable` and the SSO static-long-string promotion path.
# `readRawDataStable` is available under every string implementation (refc / v2 /
# v3-sso / js) with the same signature, so the code below compiles unchanged on
# all backends -- the point being that users can prepare for `--strings:sso`
# without `when declared` hacks.
import std/assertions
# True only when `nimsso` is defined together with one of the `gcArc`,
# `gcAtomicArc`, `gcOrc` or `gcYrc` symbols; selects the real `testSsoTiers`
# body further down instead of the empty fallback.
const hasNativeSso = defined(nimsso) and
(defined(gcArc) or defined(gcAtomicArc) or defined(gcOrc) or defined(gcYrc))
type
Reader = object
## Test fixture pairing a string buffer with a raw pointer into that
## buffer's chars (filled in by `openFromBuffer`).
buf: string ## the buffer the reader holds
p: ptr UncheckedArray[char] ## result of `readRawDataStable(buf)`
proc openFromBuffer(buf: sink string): Reader =
## Builds a `Reader` around `buf` and stores in `p` the pointer returned by
## `readRawDataStable` for the reader's own `buf` field.
# `result` (and thus `buf`) is moved into the caller on return. A plain
# `readRawData` pointer into a small SSO string would dangle after that move;
# `readRawDataStable` pins the buffer to a stable address first.
result = Reader(buf: buf)
# The pointer is taken from the field `result.buf`, not from the parameter.
result.p = readRawDataStable(result.buf)
proc testStable() =
  ## Checks that the pointer cached by `openFromBuffer` still addresses the
  ## buffer's chars once the `Reader` has been returned to the caller, for a
  ## short, a medium and a long buffer, and that `readRawDataStable` can be
  ## called on an empty string. On the JS backend only callability is checked.
  when defined(js):
    # On JS the API exists and is callable (returns nil) so call sites are
    # portable; raw pointers are a degenerate nil no-op on that backend.
    var jsBuf = "hello"
    discard readRawDataStable(jsBuf)
  else:
    block shortBuffer: # kept inline under SSO; must survive the move
      var rd = openFromBuffer("hello")
      doAssert rd.buf == "hello"
      doAssert rd.p[0] == 'h'
      doAssert rd.p[4] == 'o'
      # After the move the stable pointer must equal the live buffer's raw
      # data pointer.
      let liveData = readRawData(rd.buf)
      doAssert cast[uint](rd.p) == cast[uint](liveData)
    block mediumBuffer: # len 12: inline overlay under SSO
      var rd = openFromBuffer("hello world!")
      doAssert rd.p[11] == '!'
    block longBuffer: # already heap-resident, so returned as-is
      var rd = openFromBuffer("this is a fairly long string buffer")
      doAssert rd.p[0] == 't'
      doAssert rd.p[34] == 'r'
    block emptyString: # callable; the data pointer is implementation-defined
      var empty = ""
      discard readRawDataStable(empty)
proc testStaticLongPromotion() =
  ## Regression check for the static-long -> heap promotion.
  ##
  ## When a string literal longer than the inline payload (PayloadSize = 14
  ## under SSO) is mutated for the first time, the new heap block has to be
  ## filled from the complete static payload rather than from the 7-byte inline
  ## hot-prefix cache. Filling it from the cache copied 7 valid chars and then
  ## continued into the bytes of the `more` pointer -- the bug that corrupted
  ## .nif index files on Windows bootstrap (see Nimony tstatic_long_add).
  ## The assertions hold on every backend; only SSO ever risked the corruption.
  let want = "(.nif27)\n(index\n"
  # Deliberately a second literal (len 16) instead of a copy of `want`, so the
  # variable starts out backed by the literal itself.
  var buf = "(.nif27)\n(index\n"
  buf.add 'X' # first mutation: triggers the static-long -> heap promotion
  doAssert buf.len == 17
  doAssert buf == want & "X"
  # Every char that was present before the append must be unchanged.
  for idx in low(want) .. high(want):
    doAssert buf[idx] == want[idx]
when not hasNativeSso:
  # Without native SSO there is nothing tier-specific to check.
  proc testSsoTiers() = discard
else:
  proc testSsoTiers() =
    ## A few SSO-tier-boundary sanity checks (short / medium / long, COW,
    ## shrink).
    # Appending to a static long string keeps the original chars.
    var grown = "(.nif27)\n(index\n" # static long
    let reference = "(.nif27)\n(index\n"
    doAssert grown == reference
    grown.add 'Z'
    doAssert grown == "(.nif27)\n(index\nZ"
    # Writing through a copy must leave the string it was copied from alone.
    var source = "abcdefghijklmnop" # static long, len 16
    var shared = source # COW share
    shared[0] = 'X'
    doAssert source == "abcdefghijklmnop" # original untouched
    doAssert shared == "Xbcdefghijklmnop"
    # Shrinking a long string to fewer chars than the inline cache holds.
    var shrunk = "abcdefghijklmnop"
    shrunk.setLen 3
    doAssert shrunk == "abc"
    doAssert shrunk.len == 3
# Run every check; a failing `doAssert` aborts before the final line. The
# trailing `echo` prints the text the test spec at the top of the file expects
# (`output: "OK"`).
testStable()
testStaticLongPromotion()
testSsoTiers()
echo "OK"