Mirror of https://github.com/nim-lang/Nim.git (synced 2026-06-14 23:53:47 +00:00)
SSO: add readRawDataStable across all string implementations (#25909)
Companion to readRawData whose pointer stays valid across moves/copies of the string. Under --strings:sso it promotes a small inline string to its heap representation; under refc/v2 the data is already heap-resident so it aliases readRawData. Uniform `var string` signature on every backend so code can prepare for --strings:sso without `when declared`.
This commit is contained in:
@@ -43,6 +43,15 @@ parameter and result types, not just their source-level shape. Use
|
||||
|
||||
[//]: # "Additions:"
|
||||
|
||||
- Added `system.readRawDataStable`, a companion to `readRawData` that returns a
  raw `ptr UncheckedArray[char]` into a string's character data that stays valid
  across moves and copies of the string value (as long as the string stays alive
  and is not reassigned). It is available under every string implementation
  (refc, ARC/ORC and `--strings:sso`) with the same signature, so code can pin an
  interior buffer pointer today and be ready for `--strings:sso` without
  `when declared` guards. Under `--strings:sso` it promotes a small inline string
  to its heap representation first; under the other implementations the data is
  already heap-resident, so it is equivalent to `readRawData`.
|
||||
|
||||
- `setutils.symmetricDifference`, along with its operator version
  `` setutils.`-+-` `` and its in-place version `setutils.toggle`, has been added
  to calculate the symmetric difference of bitsets more efficiently.
|
||||
|
||||
@@ -1723,11 +1723,18 @@ when not (notJSnotNims and defined(nimSeqsV2)):
|
||||
let ns = cast[NimString](s)
|
||||
if ns == nil: nil
|
||||
else: cast[ptr UncheckedArray[char]](addr ns.data[start])
|
||||
template readRawDataStable*(s: var string; start = 0): ptr UncheckedArray[char] =
  ## Returns a raw pointer into the character data of `s`, beginning at offset
  ## `start`, by forwarding to `readRawData(s, start)`.
  ##
  ## Same as `readRawData` here: the data lives in a heap `NimStringDesc` at a
  ## stable address, so the pointer already survives moves of `s`. Takes `s` by
  ## `var` only to match the `--strings:sso` version, so code can prepare for
  ## that upgrade without `when declared` guards.
  readRawData(s, start)
|
||||
else:
|
||||
# JS/nimscript: callers are guarded by whenNotVmJsNims/when not defined(js)
|
||||
proc beginStore*(s: var string; newLen: int; start = 0): ptr UncheckedArray[char] {.inline, noSideEffect, raises: [], tags: [].} =
  ## JS/nimscript stub: there is no raw string buffer to hand out, so this
  ## always returns `nil`. Exists so guarded call sites still compile.
  nil

proc endStore*(s: var string) {.inline, noSideEffect, raises: [], tags: [].} =
  ## JS/nimscript stub: nothing to finalize, so this is a no-op.
  discard

template readRawData*(s: string; start = 0): ptr UncheckedArray[char] =
  ## JS/nimscript stub: always evaluates to `nil`; `s` and `start` are ignored.
  nil

template readRawDataStable*(s: var string; start = 0): ptr UncheckedArray[char] =
  ## JS/nimscript stub: always evaluates to `nil`; `s` and `start` are ignored.
  ## Takes `s` by `var` so the signature matches the native implementations.
  nil
|
||||
|
||||
when not defined(js):
|
||||
template newSeqImpl(T, len) =
|
||||
|
||||
@@ -261,4 +261,14 @@ template readRawData*(s: string; start = 0): ptr UncheckedArray[char] =
|
||||
## Template ensures no copy of `s`; ptr is valid while `s` is alive.
|
||||
rawDataImpl(cast[ptr NimStringV2](unsafeAddr s), start)
|
||||
|
||||
template readRawDataStable*(s: var string; start = 0): ptr UncheckedArray[char] =
  ## Like `readRawData`, but the returned pointer additionally survives moves and
  ## copies of `s` (while `s` stays alive and is not reassigned).
  ##
  ## For this string implementation the char data already lives in a heap
  ## payload at an address independent of the `string` value itself, so no
  ## promotion is needed and this is identical to `readRawData`: it evaluates
  ## `rawDataImpl` on the address of `s` with the given `start` offset.
  ##
  ## Takes `s` by `var` to match the `--strings:sso` version (which promotes a
  ## small inline string to the heap), so code written against
  ## `readRawDataStable` compiles unchanged under either implementation.
  rawDataImpl(cast[ptr NimStringV2](addr s), start)
|
||||
|
||||
{.pop.}
|
||||
|
||||
@@ -770,6 +770,33 @@ template readRawData*(s: string; start = 0): ptr UncheckedArray[char] =
|
||||
## Template ensures no copy of `s` is made; ptr is valid while `s` is alive.
|
||||
rawDataImpl(cast[ptr SmallString](unsafeAddr s), start)
|
||||
|
||||
proc readRawDataStable*(s: var string; start = 0): ptr UncheckedArray[char] {.inline.} =
  ## Like `readRawData`, but the returned pointer stays valid across moves and
  ## copies of `s` (as long as `s` stays alive and is not reassigned).
  ##
  ## A short/medium string keeps its chars *inline* in the string object, so a
  ## plain `readRawData` pointer dangles the moment the object is moved; this
  ## promotes `s` to its heap (long) representation first, whose payload address
  ## is independent of where the string object itself lives. Use this whenever
  ## an interior pointer must outlive the current scope of the owning string
  ## (e.g. a cursor cached alongside the buffer it points into).
  ##
  ## Strings whose stored length tag is `0` or greater than `PayloadSize` are
  ## left untouched; the result is always `rawDataImpl` applied to `s` at
  ## offset `start`.
  let ss = cast[ptr SmallString](addr s)
  let slen = ssLen(ss[])
  if slen > 0 and slen <= PayloadSize:
    # Promote inline/medium to a long heap block so the payload lives at a
    # stable address. Mirrors the short/medium -> long transition in `add`.
    let newCap = max(slen, resize(slen))
    # Room for the header, `newCap` chars and the trailing '\0'.
    let p = cast[ptr LongString](alloc(LongStringDataOffset + newCap + 1))
    p.rc = 1
    p.fullLen = slen
    p.capImpl = newCap
    # Copy the chars out of the inline storage *before* `more` is stored below.
    # NOTE(review): medium strings presumably overlay their chars with `more`,
    # so this ordering must be kept -- confirm against the SmallString layout.
    copyMem(addr p.data[0], inlinePtr(ss[]), slen)
    p.data[slen] = '\0'
    ss[].more = p
    setSSLen(ss[], HeapSlen)
    # Hot-prefix cache (bytes 1..AlwaysAvail) already mirrors data[0..AlwaysAvail-1]
    # because setSSLen only rewrote byte 0; the inline chars are untouched.
  rawDataImpl(ss, start)
|
||||
|
||||
# These take `string` (tyString) so the codegen uses them directly, bypassing
|
||||
# strmantle.nim's versions which go through nimStrLen/nimStrAtMutV3 compilerproc calls.
|
||||
proc cmpStrings(a, b: string): int {.compilerproc, inline.} =
|
||||
|
||||
94
tests/system/treadrawdatastable.nim
Normal file
94
tests/system/treadrawdatastable.nim
Normal file
@@ -0,0 +1,94 @@
|
||||
discard """
|
||||
matrix: "--mm:refc; --mm:orc; --mm:orc --strings:sso; --backend:cpp --mm:orc; --backend:js --mm:orc"
|
||||
output: "OK"
|
||||
"""
|
||||
|
||||
# Tests for `readRawDataStable` and the SSO static-long-string promotion path.
|
||||
# `readRawDataStable` is available under every string implementation (refc / v2 /
|
||||
# v3-sso / js) with the same signature, so the code below compiles unchanged on
|
||||
# all backends -- the point being that users can prepare for `--strings:sso`
|
||||
# without `when declared` hacks.
|
||||
|
||||
import std/assertions
|
||||
|
||||
# True only when the SSO string implementation is compiled in (`nimsso`)
# together with one of the ARC-family memory managers; gates the SSO-only
# tier checks further down.
const hasNativeSso = defined(nimsso) and
  (defined(gcArc) or defined(gcAtomicArc) or defined(gcOrc) or defined(gcYrc))
|
||||
|
||||
type
  Reader = object
    ## Owns a buffer and caches a raw pointer into that same buffer; used to
    ## check that the cached pointer stays valid once the object is moved.
    buf: string                   # owning buffer
    p: ptr UncheckedArray[char]   # interior pointer into `buf`'s char data
|
||||
|
||||
proc openFromBuffer(buf: sink string): Reader =
  ## Builds a `Reader` that takes ownership of `buf` and caches a stable raw
  ## pointer to the start of its character data in `p`.
  # `result` (and thus `buf`) is moved into the caller on return. A plain
  # `readRawData` pointer into a small SSO string would dangle after that move;
  # `readRawDataStable` pins the buffer to a stable address first.
  result = Reader(buf: buf)
  result.p = readRawDataStable(result.buf)
|
||||
|
||||
proc testStable() =
  ## Checks that the pointer cached by `openFromBuffer` still addresses the
  ## buffer's characters after the `Reader` has been returned to the caller,
  ## for short, medium, long and empty buffers.
  when not defined(js): # raw pointers are a degenerate nil no-op on the JS backend
    block: # short buffer (kept inline under SSO) survives the move
      var r = openFromBuffer("hello")
      doAssert r.buf == "hello"
      doAssert r.p[0] == 'h'
      doAssert r.p[4] == 'o'
      # Stable pointer == the live buffer's raw data after the move.
      doAssert cast[uint](r.p) == cast[uint](readRawData(r.buf))
    block: # medium buffer (len 12: inline overlay under SSO)
      var r = openFromBuffer("hello world!")
      doAssert r.p[11] == '!'
    block: # already-long buffer: returned as-is (already heap-resident)
      var r = openFromBuffer("this is a fairly long string buffer")
      doAssert r.p[0] == 't'
      doAssert r.p[34] == 'r'
    block: # empty string: API is callable (the data pointer is implementation-defined)
      var e = ""
      discard readRawDataStable(e)
  else:
    # On JS the API exists and is callable (returns nil) so call sites are portable.
    var s = "hello"
    discard readRawDataStable(s)
|
||||
|
||||
proc testStaticLongPromotion() =
  ## Appends one char to a 16-char string literal and verifies that the whole
  ## original content is preserved in front of it.
  # Regression for the static-long -> heap promotion: when a string literal
  # longer than the inline payload (PayloadSize = 14 under SSO) is first
  # mutated, the new heap block must be filled from the full static payload,
  # not from the 7-byte inline hot-prefix cache. Reading from the cache copied
  # 7 valid chars and then ran off into the `more` pointer bytes -- the bug that
  # corrupted .nif index files on Windows bootstrap (see Nimony tstatic_long_add).
  # The assertion holds on every backend; only SSO ever risked the corruption.
  var content = "(.nif27)\n(index\n" # len 16
  let expected = "(.nif27)\n(index\n"
  content.add 'X' # triggers static-long -> heap promotion
  doAssert content.len == 17
  doAssert content == expected & "X"
  # Char-by-char comparison of the original prefix.
  for i in 0 ..< expected.len:
    doAssert content[i] == expected[i]
|
||||
|
||||
when hasNativeSso:
  # A few SSO-tier-boundary sanity checks (short / medium / long, COW, shrink).
  proc testSsoTiers() =
    ## Exercises append, copy-then-mutate and shrink on 16-char string literals.
    var a = "(.nif27)\n(index\n" # static long
    let b = "(.nif27)\n(index\n"
    doAssert a == b
    a.add 'Z'
    doAssert a == "(.nif27)\n(index\nZ"

    var c = "abcdefghijklmnop" # static long, len 16
    var d = c # COW share
    d[0] = 'X'
    doAssert c == "abcdefghijklmnop" # original untouched
    doAssert d == "Xbcdefghijklmnop"

    var e = "abcdefghijklmnop"
    e.setLen 3 # shrink below the inline cache size
    doAssert e == "abc"
    doAssert e.len == 3
else:
  # Without native SSO there are no tiers to check; keep the name callable.
  proc testSsoTiers() = discard
|
||||
|
||||
# Run every scenario; "OK" is printed only if no `doAssert` fired, matching
# the expected `output` in the test spec at the top of the file.
testStable()
testStaticLongPromotion()
testSsoTiers()
echo "OK"
|
||||
Reference in New Issue
Block a user