SSO for strings (#25593)

2026-05-25 14:28:15 +00:00 · 2026-04-02 07:19:43 +02:00
parent be29bcd402
commit d389d4fb2f
29 changed files with 2235 additions and 171 deletions
--- a/compiler/ccgcalls.nim
+++ b/compiler/ccgcalls.nim
@@ -230,20 +230,29 @@ proc genOpenArraySlice(p: BProc; q: PNode; formalType, destType: PType; prepareF
  of tyString, tySequence:
    let atyp = skipTypes(a.t, abstractInst)
    if formalType.skipTypes(abstractInst).kind in {tyVar} and atyp.kind == tyString and
-        optSeqDestructors in p.config.globalOptions:
+        optSeqDestructors in p.config.globalOptions and not p.config.isDefined("nimsso"):
      let bra = byRefLoc(p, a)
      p.s(cpsStmts).addCallStmt(cgsymValue(p.module, "nimPrepareStrMutationV2"),
        bra)
-    var val: Snippet
-    if atyp.kind in {tyVar} and not compileToCpp(p.module):
-      val = cDeref(ra)
+    if p.config.isDefined("nimsso") and
+        skipTypes(a.t, abstractVar + abstractInst).kind == tyString:
+      let strPtr = if atyp.kind in {tyVar} and not compileToCpp(p.module): ra
+                   else: addrLoc(p.config, a)
+      result = (
+        cCast(ptrType(dest), cOp(Add, NimInt,
+          cCall(cgsymValue(p.module, "nimStrData"), strPtr), rb)),
+        lengthExpr)
    else:
-      val = ra
-    result = (
-      cIfExpr(dataFieldAccessor(p, val),
-        cCast(ptrType(dest), cOp(Add, NimInt, dataField(p, val), rb)),
-        NimNil),
-      lengthExpr)
+      var val: Snippet
+      if atyp.kind in {tyVar} and not compileToCpp(p.module):
+        val = cDeref(ra)
+      else:
+        val = ra
+      result = (
+        cIfExpr(dataFieldAccessor(p, val),
+          cCast(ptrType(dest), cOp(Add, NimInt, dataField(p, val), rb)),
+          NimNil),
+        lengthExpr)
  else:
    result = ("", "")
    internalError(p.config, "openArrayLoc: " & typeToString(a.t))
@@ -287,11 +296,22 @@ proc openArrayLoc(p: BProc, formalType: PType, n: PNode; result: var Builder) =
    of tyString, tySequence:
      let ntyp = skipTypes(n.typ, abstractInst)
      if formalType.skipTypes(abstractInst).kind in {tyVar} and ntyp.kind == tyString and
-          optSeqDestructors in p.config.globalOptions:
+          optSeqDestructors in p.config.globalOptions and not p.config.isDefined("nimsso"):
        let bra = byRefLoc(p, a)
        p.s(cpsStmts).addCallStmt(cgsymValue(p.module, "nimPrepareStrMutationV2"),
          bra)
-      if ntyp.kind in {tyVar} and not compileToCpp(p.module):
+      if p.config.isDefined("nimsso") and
+          skipTypes(n.typ, abstractVar + abstractInst).kind == tyString:
+        if ntyp.kind in {tyVar} and not compileToCpp(p.module):
+          let ra = a.rdLoc
+          result.add(cCall(cgsymValue(p.module, "nimStrData"), ra))
+          result.addArgumentSeparator()
+          result.add(cCall(cgsymValue(p.module, "nimStrLen"), cDeref(ra)))
+        else:
+          result.add(cCall(cgsymValue(p.module, "nimStrData"), addrLoc(p.config, a)))
+          result.addArgumentSeparator()
+          result.add(lenExpr(p, a))
+      elif ntyp.kind in {tyVar} and not compileToCpp(p.module):
        let ra = a.rdLoc
        var t = TLoc(snippet: cDeref(ra))
        let lt = lenExpr(p, t)
@@ -315,9 +335,14 @@ proc openArrayLoc(p: BProc, formalType: PType, n: PNode; result: var Builder) =
        let ra = a.rdLoc
        var t = TLoc(snippet: cDeref(ra))
        let lt = lenExpr(p, t)
-        result.add(cIfExpr(dataFieldAccessor(p, t.snippet), dataField(p, t.snippet), NimNil))
-        result.addArgumentSeparator()
-        result.add(lt)
+        if p.config.isDefined("nimsso"):
+          result.add(cCall(cgsymValue(p.module, "nimStrData"), ra))
+          result.addArgumentSeparator()
+          result.add(cCall(cgsymValue(p.module, "nimStrLen"), t.snippet))
+        else:
+          result.add(cIfExpr(dataFieldAccessor(p, t.snippet), dataField(p, t.snippet), NimNil))
+          result.addArgumentSeparator()
+          result.add(lt)
      of tyArray:
        let ra = rdLoc(a)
        result.add(ra)
@@ -344,7 +369,8 @@ proc expressionsNeedsTmp(p: BProc, a: TLoc): TLoc =

 proc genArgStringToCString(p: BProc, n: PNode; result: var Builder; needsTmp: bool) {.inline.} =
  var a = initLocExpr(p, n[0])
-  let ra = withTmpIfNeeded(p, a, needsTmp).rdLoc
+  let tmp = withTmpIfNeeded(p, a, needsTmp)
+  let ra = if p.config.isDefined("nimsso"): addrLoc(p.config, tmp) else: tmp.rdLoc
  result.addCall(cgsymValue(p.module, "nimToCStringConv"), ra)

 proc genArg(p: BProc, n: PNode, param: PSym; call: PNode; result: var Builder; needsTmp = false) =
--- a/compiler/ccgexprs.nim
+++ b/compiler/ccgexprs.nim
@@ -320,12 +320,16 @@ proc genOpenArrayConv(p: BProc; d: TLoc; a: TLoc; flags: TAssignmentFlags) =
      p.s(cpsStmts).addCallStmt(
        cgsymValue(p.module, "nimPrepareStrMutationV2"),
        bra)
-
    let rd = d.rdLoc
-    let ra = a.rdLoc
-    p.s(cpsStmts).addFieldAssignment(rd, "Field0",
-      cIfExpr(dataFieldAccessor(p, ra), dataField(p, ra), NimNil))
    let la = lenExpr(p, a)
+    if p.config.isDefined("nimsso"):
+      let bra = byRefLoc(p, a)
+      p.s(cpsStmts).addFieldAssignment(rd, "Field0",
+        cCall(cgsymValue(p.module, "nimStrData"), bra))
+    else:
+      let ra = a.rdLoc
+      p.s(cpsStmts).addFieldAssignment(rd, "Field0",
+        cIfExpr(dataFieldAccessor(p, ra), dataField(p, ra), NimNil))
    p.s(cpsStmts).addFieldAssignment(rd, "Field1", la)
  else:
    internalError(p.config, a.lode.info, "cannot handle " & $a.t.kind)
@@ -958,7 +962,8 @@ proc genDeref(p: BProc, e: PNode, d: var TLoc) =
      putIntoDest(p, d, e, cDeref(rdLoc(a)), a.storage)

 proc cowBracket(p: BProc; n: PNode) =
-  if n.kind == nkBracketExpr and optSeqDestructors in p.config.globalOptions:
+  if n.kind == nkBracketExpr and optSeqDestructors in p.config.globalOptions and
+      not p.config.isDefined("nimsso"):
    let strCandidate = n[0]
    if strCandidate.typ.skipTypes(abstractInst).kind == tyString:
      var a: TLoc = initLocExpr(p, strCandidate)
@@ -984,7 +989,9 @@ proc genAddr(p: BProc, e: PNode, d: var TLoc) =
    # bug #19497
    d.lode = e
  else:
-    var a: TLoc = initLocExpr(p, e[0])
+    let ssoStrSub = p.config.isDefined("nimsso") and e[0].kind == nkBracketExpr and
+        e[0][0].typ.skipTypes(abstractVar).kind == tyString
+    var a: TLoc = initLocExpr(p, e[0], if ssoStrSub: {lfEnforceDeref, lfPrepareForMutation} else: {})
    if e[0].kind in {nkHiddenStdConv, nkHiddenSubConv, nkConv} and not ignoreConv(e[0]):
      # addr (conv x) introduces a temp because `conv x` is not a rvalue
      # transform addr ( conv ( x ) ) -> conv ( addr ( x ) )
@@ -1311,13 +1318,24 @@ proc genSeqElem(p: BProc, n, x, y: PNode, d: var TLoc) =
  if skipTypes(a.t, abstractVar).kind in {tyRef, tyPtr}:
    a.snippet = cDeref(a.snippet)

-  if lfPrepareForMutation in d.flags and ty.kind == tyString and
-      optSeqDestructors in p.config.globalOptions:
+  if p.config.isDefined("nimsso") and ty.kind == tyString:
    let bra = byRefLoc(p, a)
-    p.s(cpsStmts).addCallStmt(cgsymValue(p.module, "nimPrepareStrMutationV2"),
-      bra)
-  let ra = rdLoc(a)
-  putIntoDest(p, d, n, subscript(dataField(p, ra), rcb), a.storage)
+    if lfPrepareForMutation in d.flags:
+      # Use nimStrAtMutV3 to get a mutable reference (char*) to the element.
+      # Only when mutation is requested: avoids calling nimPrepareStrMutationV2
+      # on const string literals (which would SIGSEGV on write to read-only memory).
+      putIntoDest(p, d, n,
+        cDeref(cCall(cgsymValue(p.module, "nimStrAtMutV3"), bra, rcb)), a.storage)
+    else:
+      putIntoDest(p, d, n,
+        cCall(cgsymValue(p.module, "nimStrAtV3"), bra, rcb), a.storage)
+  else:
+    if lfPrepareForMutation in d.flags and ty.kind == tyString and
+        optSeqDestructors in p.config.globalOptions:
+      let bra = byRefLoc(p, a)
+      p.s(cpsStmts).addCallStmt(cgsymValue(p.module, "nimPrepareStrMutationV2"), bra)
+    let ra = rdLoc(a)
+    putIntoDest(p, d, n, subscript(dataField(p, ra), rcb), a.storage)

 proc genBracketExpr(p: BProc; n: PNode; d: var TLoc) =
  var ty = skipTypes(n[0].typ, abstractVarRange + tyUserTypeClasses)
@@ -2124,12 +2142,20 @@ proc genRepr(p: BProc, e: PNode, d: var TLoc) =
      let ra = rdLoc(a)
      putIntoDest(p, b, e, ra & cArgumentSeparator & ra & "Len_0", a.storage)
    of tyString, tySequence:
-      let ra = rdLoc(a)
      let la = lenExpr(p, a)
-      putIntoDest(p, b, e,
-        cIfExpr(dataFieldAccessor(p, ra), dataField(p, ra), NimNil) &
-          cArgumentSeparator & la,
-        a.storage)
+      if p.config.isDefined("nimsso") and
+          skipTypes(a.t, abstractVarRange).kind == tyString:
+        let bra = byRefLoc(p, a)
+        putIntoDest(p, b, e,
+          cCall(cgsymValue(p.module, "nimStrData"), bra) &
+            cArgumentSeparator & la,
+          a.storage)
+      else:
+        let ra = rdLoc(a)
+        putIntoDest(p, b, e,
+          cIfExpr(dataFieldAccessor(p, ra), dataField(p, ra), NimNil) &
+            cArgumentSeparator & la,
+          a.storage)
    of tyArray:
      let ra = rdLoc(a)
      let la = cIntValue(lengthOrd(p.config, a.t))
@@ -2710,9 +2736,9 @@ proc genConv(p: BProc, e: PNode, d: var TLoc) =

 proc convStrToCStr(p: BProc, n: PNode, d: var TLoc) =
  var a: TLoc = initLocExpr(p, n[0])
+  let arg = if p.config.isDefined("nimsso"): addrLoc(p.config, a) else: rdLoc(a)
  putIntoDest(p, d, n,
-    cgCall(p, "nimToCStringConv", rdLoc(a)),
-#   "($1 ? $1->data : (NCSTRING)\"\")" % [a.rdLoc],
+    cgCall(p, "nimToCStringConv", arg),
    a.storage)

 proc convCStrToStr(p: BProc, n: PNode, d: var TLoc) =
@@ -2783,19 +2809,25 @@ proc genWasMoved(p: BProc; n: PNode) =
    #  [addrLoc(p.config, a), getTypeDesc(p.module, a.t)])

 proc genMove(p: BProc; n: PNode; d: var TLoc) =
-  var a: TLoc = initLocExpr(p, n[1].skipAddr, {lfEnforceDeref})
+  var a: TLoc = initLocExpr(p, n[1].skipAddr, {lfEnforceDeref, lfPrepareForMutation})
  if n.len == 4:
    # generated by liftdestructors:
    var src: TLoc = initLocExpr(p, n[2])
    let destVal = rdLoc(a)
    let srcVal = rdLoc(src)
-    p.s(cpsStmts).addSingleIfStmt(
-      cOp(NotEqual,
-        dotField(destVal, "p"),
-        dotField(srcVal, "p"))):
+    if p.config.isDefined("nimsso") and
+        n[1].typ.skipTypes(abstractVar).kind == tyString:
+      # SmallString: destroy dst then struct-copy src; no .p field aliasing needed
      genStmts(p, n[3])
-    p.s(cpsStmts).addFieldAssignment(destVal, "len", dotField(srcVal, "len"))
-    p.s(cpsStmts).addFieldAssignment(destVal, "p", dotField(srcVal, "p"))
+      genAssignment(p, a, src, {})
+    else:
+      p.s(cpsStmts).addSingleIfStmt(
+        cOp(NotEqual,
+          dotField(destVal, "p"),
+          dotField(srcVal, "p"))):
+        genStmts(p, n[3])
+      p.s(cpsStmts).addFieldAssignment(destVal, "len", dotField(srcVal, "len"))
+      p.s(cpsStmts).addFieldAssignment(destVal, "p", dotField(srcVal, "p"))
  else:
    if d.k == locNone: d = getTemp(p, n.typ)
    if p.config.selectedGC in {gcArc, gcAtomicArc, gcOrc, gcYrc}:
@@ -2832,15 +2864,19 @@ proc genDestroy(p: BProc; n: PNode) =
    case t.kind
    of tyString:
      var a: TLoc = initLocExpr(p, arg)
-      let ra = rdLoc(a)
-      let rp = dotField(ra, "p")
-      p.s(cpsStmts).addSingleIfStmt(
-        cOp(And, rp,
-          cOp(Not, cOp(BitAnd, NimInt,
-            derefField(rp, "cap"),
-            NimStrlitFlag)))):
-        let fn = if optThreads in p.config.globalOptions: "deallocShared" else: "dealloc"
-        p.s(cpsStmts).addCallStmt(cgsymValue(p.module, fn), rp)
+      if p.config.isDefined("nimsso"):
+        # SmallString: delegate to nimDestroyStrV1 (rc-based, handles static strings)
+        p.s(cpsStmts).addCallStmt(cgsymValue(p.module, "nimDestroyStrV1"), rdLoc(a))
+      else:
+        let ra = rdLoc(a)
+        let rp = dotField(ra, "p")
+        p.s(cpsStmts).addSingleIfStmt(
+          cOp(And, rp,
+            cOp(Not, cOp(BitAnd, NimInt,
+              derefField(rp, "cap"),
+              NimStrlitFlag)))):
+          let fn = if optThreads in p.config.globalOptions: "deallocShared" else: "dealloc"
+          p.s(cpsStmts).addCallStmt(cgsymValue(p.module, fn), rp)
    of tySequence:
      var a: TLoc = initLocExpr(p, arg)
      let ra = rdLoc(a)
@@ -4200,7 +4236,10 @@ proc genBracedInit(p: BProc, n: PNode; isConst: bool; optionalType: PType; resul
      genConstObjConstr(p, n, isConst, result)
    of tyString, tyCstring:
      if optSeqDestructors in p.config.globalOptions and n.kind != nkNilLit and ty == tyString:
-        genStringLiteralV2Const(p.module, n, isConst, result)
+        if p.config.isDefined("nimsso"):
+          genStringLiteralV3Const(p.module, n, isConst, result)
+        else:
+          genStringLiteralV2Const(p.module, n, isConst, result)
      else:
        var d: TLoc = initLocExpr(p, n)
        result.add rdLoc(d)
--- a/compiler/ccgliterals.nim
+++ b/compiler/ccgliterals.nim
@@ -22,7 +22,11 @@ template detectVersion(field, corename) =
    result = 1

 proc detectStrVersion(m: BModule): int =
-  detectVersion(strVersion, "nimStrVersion")
+  if m.g.config.isDefined("nimsso") and
+      m.g.config.selectedGC in {gcArc, gcOrc, gcYrc, gcAtomicArc, gcHooks}:
+    result = 3  # version 3 selects the SmallString (SSO) code paths
+  else:
+    detectVersion(strVersion, "nimStrVersion")

 proc detectSeqVersion(m: BModule): int =
  detectVersion(seqVersion, "nimSeqVersion")
@@ -128,6 +132,192 @@ proc genStringLiteralV2Const(m: BModule; n: PNode; isConst: bool; result: var Bu
    result.addField(strInit, name = "p"):
      result.add(cCast(ptrType("NimStrPayload"), cAddr(pureLit)))

+proc ssoCharLit(ch: char): string =
+  ## Return a C char literal for ch; control and non-ASCII bytes become \xHH.
+  const hexDigits = "0123456789abcdef"
+  result = "'"
+  case ch
+  of '\'': result.add("\\'")
+  of '\\': result.add("\\\\")
+  of '\0': result.add("\\0")
+  of '\n': result.add("\\n")
+  of '\r': result.add("\\r")
+  of '\t': result.add("\\t")
+  elif ch.ord < 32 or ch.ord >= 127:  # DEL and bytes >= 0x80 must not be emitted raw
+    result.add("\\x")
+    result.add(hexDigits[ch.ord shr 4])
+    result.add(hexDigits[ch.ord and 0xf])
+  else:
+    result.add(ch)
+  result.add('\'')
+
+proc ssoBytesLit(m: BModule; s: string; slen: int): string =
+  ## Build the C initializer for the `bytes` word of a SmallString literal.
+  ## Byte 0 carries `slen`; bytes 1-7 carry the first 7 chars (zero-padded).
+  ## LE targets: slen in bits 0-7, char[i] in bits (i+1)*8 .. (i+1)*8+7.
+  ## BE targets: slen in bits 56-63, char[i] in bits (6-i)*8 .. (6-i)*8+7.
+  const AlwaysAvail = 7
+  let isLE = CPU[m.g.config.target.targetCPU].endian == littleEndian
+  let inlineCount = min(s.len, AlwaysAvail)
+  # Seed the word with the length byte in its endian-specific position.
+  var word = if isLE: uint64(slen) else: uint64(slen) shl 56
+  for i in 0..<inlineCount:
+    # Index of the byte (counted from the least significant) that holds s[i].
+    let byteIdx = if isLE: i + 1 else: AlwaysAvail - 1 - i
+    word = word or (uint64(s[i]) shl (uint(byteIdx) * 8))
+  # `NU` is the C name of Nim's `uint`; the value is spelled as a ULL literal.
+  # NOTE(review): presumably assumes a 64-bit `NU` — confirm for 32-bit targets.
+  result = cCast("NU", $word & "ULL")
+
+proc ssoMoreLit(m: BModule; s: string): string =
+  ## For medium string literals (AlwaysAvail < len <= PayloadSize), encode
+  ## chars[AlwaysAvail..ptrSize-1] in the 'more' pointer field bit-pattern.
+  ## The last pointer byte is always '\0' (null terminator), guaranteed by
+  ## PayloadSize = AlwaysAvail + ptrSize - 1.  slen <= PayloadSize guards
+  ## prevent any code from dereferencing this as an actual pointer.
+  const AlwaysAvail = 7
+  let ptrSize = m.g.config.target.ptrSize
+  let isLE = CPU[m.g.config.target.targetCPU].endian == littleEndian
+  var val: uint64 = 0
+  for i in 0..<ptrSize:
+    let ch: uint64 = if AlwaysAvail + i < s.len: uint64(s[AlwaysAvail + i]) else: 0
+    # Memory byte i of the pointer: low-order first on LE, high-order first on BE.
+    val = val or (ch shl (uint(if isLE: i else: ptrSize - 1 - i) * 8))
+  # ULL suffix (as in ssoBytesLit): on BE the first char fills the top byte, so val may exceed LLONG_MAX.
+  result = cCast(ptrType("LongString"), "(uintptr_t)" & $val & "ULL")
+
+proc genStringLiteralV3Const(m: BModule; n: PNode; isConst: bool; result: var Builder) =
+  # Inline SmallString struct initializer for use inside const aggregate types.
+  # Layout: {bytes: NimUint, more: ptr LongString}
+  # bytes (LE view) = slen (low byte) | char[0]<<8 | ... | char[6]<<56; ssoBytesLit also handles BE.
+  const AlwaysAvail = 7
+  let s = n.strVal
+
+  cgsym(m, "SmallString")
+  cgsym(m, "LongString")
+
+  let payloadSize = AlwaysAvail + m.g.config.target.ptrSize - 1
+  var si: StructInitializer
+  result.addStructInitializer(si, kind = siOrderedStruct):
+    if s.len <= AlwaysAvail:
+      result.addField(si, name = "bytes"):
+        result.add(ssoBytesLit(m, s, s.len))
+      result.addField(si, name = "more"):
+        result.add(NimNil)
+    elif s.len <= payloadSize:
+      # Medium string: bytes holds slen + chars 0-6; more holds chars 7..PayloadSize-1.
+      result.addField(si, name = "bytes"):
+        result.add(ssoBytesLit(m, s, s.len))
+      result.addField(si, name = "more"):
+        result.add(ssoMoreLit(m, s))
+    else:
+      # Emit the LongString block into cfsStrData and reference it inline.
+      let dataName = getTempName(m)
+      var res = newBuilder("")
+      res.addVarWithTypeAndInitializer(
+          if isConst: AlwaysConst else: Global,
+          name = dataName):
+        res.addSimpleStruct(m, name = "", baseType = ""):
+          res.addField(name = "fullLen", typ = NimInt)  # declared in the same order as the ordered initializer below
+          res.addField(name = "rc", typ = NimInt)  # NOTE(review): order must match LongString's real layout — confirm
+          res.addField(name = "capImpl", typ = NimInt)
+          res.addArrayField(name = "data", elementType = NimChar, len = s.len + 1)
+      do:
+        var di: StructInitializer
+        res.addStructInitializer(di, kind = siOrderedStruct):
+          res.addField(di, name = "fullLen"):
+            res.addIntValue(s.len)
+          res.addField(di, name = "rc"):
+            res.addIntValue(1)
+          res.addField(di, name = "capImpl"):
+            res.addIntValue(0)  # static, never freed
+          res.addField(di, name = "data"):
+            res.add(makeCString(s))
+      m.s[cfsStrData].add(extract(res))
+      # slen = StaticSlen (254): marks this as a static (never-freed) long string.
+      result.addField(si, name = "bytes"):
+        result.add(ssoBytesLit(m, s, 254))
+      result.addField(si, name = "more"):
+        result.add(cCast(ptrType("LongString"), cAddr(dataName)))
+
+# ------ Version 3: SmallString (SSO) strings --------------------------------
+
+proc genStringLiteralV3(m: BModule; n: PNode; isConst: bool; result: var Builder) =
+  # SmallString literal. Always generate a fresh SmallString variable (like v2
+  # always generates a fresh outer NimStringV2). For long strings, cache the
+  # LongString payload to avoid duplicates within a module.
+  const AlwaysAvail = 7  # must match strs_v3.nim
+  let s = n.strVal
+  let tmp = getTempName(m)
+  result.add tmp
+
+  cgsym(m, "SmallString")
+  cgsym(m, "LongString")
+
+  let payloadSize = AlwaysAvail + m.g.config.target.ptrSize - 1
+  var res = newBuilder("")
+  if s.len <= AlwaysAvail:
+    # Short: bytes holds slen + all chars (zero-padded), more = NULL.
+    res.addVarWithInitializer(
+        if isConst: AlwaysConst else: Global,
+        name = tmp, typ = "SmallString"):
+      var si: StructInitializer
+      res.addStructInitializer(si, kind = siOrderedStruct):
+        res.addField(si, name = "bytes"):
+          res.add(ssoBytesLit(m, s, s.len))
+        res.addField(si, name = "more"):
+          res.add(NimNil)
+  elif s.len <= payloadSize:
+    # Medium: bytes holds slen + chars 0-6; more holds chars 7..PayloadSize-1 as raw bits.
+    res.addVarWithInitializer(
+        if isConst: AlwaysConst else: Global,
+        name = tmp, typ = "SmallString"):
+      var si: StructInitializer
+      res.addStructInitializer(si, kind = siOrderedStruct):
+        res.addField(si, name = "bytes"):
+          res.add(ssoBytesLit(m, s, s.len))
+        res.addField(si, name = "more"):
+          res.add(ssoMoreLit(m, s))
+  else:
+    # Long: cache the LongString block to emit it only once per module per string.
+    # Always generate a fresh SmallString pointing at the (possibly cached) block.
+    let id = nodeTableTestOrSet(m.dataCache, n, m.labels)
+    var dataName: string
+    if id == m.labels:
+      dataName = getTempName(m)
+      res.addVarWithTypeAndInitializer(
+          if isConst: AlwaysConst else: Global,
+          name = dataName):
+        res.addSimpleStruct(m, name = "", baseType = ""):
+          res.addField(name = "fullLen", typ = NimInt)  # declared in the same order as the ordered initializer below
+          res.addField(name = "rc", typ = NimInt)  # NOTE(review): order must match LongString's real layout — confirm
+          res.addField(name = "capImpl", typ = NimInt)
+          res.addArrayField(name = "data", elementType = NimChar, len = s.len + 1)
+      do:
+        var di: StructInitializer
+        res.addStructInitializer(di, kind = siOrderedStruct):
+          res.addField(di, name = "fullLen"):
+            res.addIntValue(s.len)
+          res.addField(di, name = "rc"):
+            res.addIntValue(1)
+          res.addField(di, name = "capImpl"):
+            res.addIntValue(0)  # bit 0 = 0: static, never freed
+          res.addField(di, name = "data"):
+            res.add(makeCString(s))
+    else:
+      dataName = m.tmpBase & $id
+    # slen = StaticSlen (254): marks this as a static (never-freed) long string.
+    res.addVarWithInitializer(
+        if isConst: AlwaysConst else: Global,
+        name = tmp, typ = "SmallString"):
+      var si: StructInitializer
+      res.addStructInitializer(si, kind = siOrderedStruct):
+        res.addField(si, name = "bytes"):
+          res.add(ssoBytesLit(m, s, 254))
+        res.addField(si, name = "more"):
+          res.add(cCast(ptrType("LongString"), cAddr(dataName)))
+  m.s[cfsStrData].add(extract(res))
+
 # ------ Version selector ---------------------------------------------------

 proc genStringLiteralDataOnly(m: BModule; s: string; info: TLineInfo;
@@ -138,6 +328,8 @@ proc genStringLiteralDataOnly(m: BModule; s: string; info: TLineInfo;
    let tmp = getTempName(m)
    genStringLiteralDataOnlyV2(m, s, tmp, isConst)
    result.add tmp
+  of 3:
+    localError(m.config, info, "genStringLiteralDataOnly not supported for SmallString (nimsso)")
  else:
    localError(m.config, info, "cannot determine how to produce code for string literal")

@@ -148,5 +340,6 @@ proc genStringLiteral(m: BModule; n: PNode; result: var Builder) =
  case detectStrVersion(m)
  of 0, 1: genStringLiteralV1(m, n, result)
  of 2: genStringLiteralV2(m, n, isConst = true, result)
+  of 3: genStringLiteralV3(m, n, isConst = true, result)
  else:
    localError(m.config, n.info, "cannot determine how to produce code for string literal")
--- a/compiler/ccgstmts.nim
+++ b/compiler/ccgstmts.nim
@@ -1940,6 +1940,15 @@ proc genAsgn(p: BProc, e: PNode, fastAsgn: bool) =
  elif optFieldCheck in p.options and isDiscriminantField(e[0]):
    genLineDir(p, e)
    asgnFieldDiscriminant(p, e)
+  elif p.config.isDefined("nimsso") and e[0].kind == nkBracketExpr and
+      e[0][0].typ.skipTypes(abstractVar).kind == tyString:
+    # nimsso: s[i] = c  →  nimStrPutV3(&s, i, c)  (handles COW internally)
+    genLineDir(p, e)
+    var base = initLocExpr(p, e[0][0])
+    var idx  = initLocExpr(p, e[0][1])
+    var rhs  = initLocExpr(p, e[1])
+    p.s(cpsStmts).addCallStmt(cgsymValue(p.module, "nimStrPutV3"),
+      byRefLoc(p, base), rdLoc(idx), rdCharLoc(rhs))
  else:
    let le = e[0]
    let ri = e[1]
--- a/compiler/ccgtypes.nim
+++ b/compiler/ccgtypes.nim
@@ -339,6 +339,10 @@ proc getSimpleTypeDesc(m: BModule; typ: PType): Rope =
      cgsym(m, "NimStrPayload")
      cgsym(m, "NimStringV2")
      result = typeNameOrLiteral(m, typ, "NimStringV2")
+    of 3:
+      cgsym(m, "LongString")
+      cgsym(m, "SmallString")
+      result = typeNameOrLiteral(m, typ, "SmallString")
    else:
      cgsym(m, "NimStringDesc")
      result = typeNameOrLiteral(m, typ, "NimStringDesc*")
--- a/compiler/cgen.nim
+++ b/compiler/cgen.nim
@@ -389,7 +389,11 @@ proc lenField(p: BProc, val: Rope): Rope {.inline.} =

 proc lenExpr(p: BProc; a: TLoc): Rope =
  if optSeqDestructors in p.config.globalOptions:
-    result = dotField(rdLoc(a), "len")
+    if p.config.isDefined("nimsso") and a.lode != nil and a.t != nil and
+        a.t.skipTypes(abstractInst).kind == tyString:
+      result = cCall(cgsymValue(p.module, "nimStrLen"), rdLoc(a))  # nimsso strings: length via runtime call
+    else:
+      result = dotField(rdLoc(a), "len")
  else:
    let ra = rdLoc(a)
    result = cIfExpr(ra, lenField(p, ra), cIntValue(0))
@@ -530,7 +534,15 @@ proc resetLoc(p: BProc, loc: var TLoc) =

    let atyp = skipTypes(loc.t, abstractInst)
    let rl = rdLoc(loc)
-    if atyp.kind in {tyVar, tyLent}:
+    if typ.kind == tyString and p.config.isDefined("nimsso"):
+      # SmallString zero state: bytes=0 (slen=0 in low byte, all inline chars zeroed)
+      if atyp.kind in {tyVar, tyLent}:
+        p.s(cpsStmts).addAssignment(derefField(rl, "bytes"), cIntValue(0))
+        p.s(cpsStmts).addAssignment(derefField(rl, "more"), NimNil)
+      else:
+        p.s(cpsStmts).addAssignment(dotField(rl, "bytes"), cIntValue(0))
+        p.s(cpsStmts).addAssignment(dotField(rl, "more"), NimNil)
+    elif atyp.kind in {tyVar, tyLent}:
      p.s(cpsStmts).addAssignment(derefField(rl, "len"), cIntValue(0))
      p.s(cpsStmts).addAssignment(derefField(rl, "p"), NimNil)
    else:
@@ -580,8 +592,13 @@ proc constructLoc(p: BProc, loc: var TLoc, isTemp = false) =
  let typ = loc.t
  if optSeqDestructors in p.config.globalOptions and skipTypes(typ, abstractInst + {tyStatic}).kind in {tyString, tySequence}:
    let rl = rdLoc(loc)
-    p.s(cpsStmts).addFieldAssignment(rl, "len", cIntValue(0))
-    p.s(cpsStmts).addFieldAssignment(rl, "p", NimNil)
+    if skipTypes(typ, abstractInst + {tyStatic}).kind == tyString and p.config.isDefined("nimsso"):
+      # SmallString zero state: bytes=0 (slen=0 in low byte, all inline chars zeroed)
+      p.s(cpsStmts).addFieldAssignment(rl, "bytes", cIntValue(0))
+      p.s(cpsStmts).addFieldAssignment(rl, "more", NimNil)
+    else:
+      p.s(cpsStmts).addFieldAssignment(rl, "len", cIntValue(0))
+      p.s(cpsStmts).addFieldAssignment(rl, "p", NimNil)
  elif not isComplexValueType(typ):
    if containsGarbageCollectedRef(loc.t):
      var nilLoc: TLoc = initLoc(locTemp, loc.lode, OnStack)
--- a/compiler/int128.nim
+++ b/compiler/int128.nim
@@ -460,7 +460,9 @@ proc addInt128*(result: var string; value: Int128) =
    var i = initialSize
    var j = high(result)
    while i < j:
-      swap(result[i], result[j])
+      let tmp = result[i]
+      result[i] = result[j]
+      result[j] = tmp
      i += 1
      j -= 1

--- a/compiler/layeredtable.nim
+++ b/compiler/layeredtable.nim
@@ -46,12 +46,11 @@ proc setToPreviousLayer*(pt: var LayeredIdTable) {.inline.} =
  when useRef:
    pt = pt.nextLayer
  else:
-    when defined(gcDestructors):
-      pt = pt.nextLayer[]
-    else:
-      # workaround refc
-      let tmp = pt.nextLayer[]
-      pt = tmp
+    # Must read nextLayer into a temp before destroying pt:
+    # `pt = pt.nextLayer[]` would call eqcopy(&pt, &(*pt.nextLayer)) which
+    # decrements pt.nextLayer's rc (freeing it) before reading pt.nextLayer.nextLayer.
+    let tmp = pt.nextLayer[]
+    pt = tmp

 iterator pairs*(pt: LayeredIdTable): (ItemId, PType) =
  var tm = pt
--- a/compiler/liftdestructors.nim
+++ b/compiler/liftdestructors.nim
@@ -701,11 +701,18 @@ proc fillStrOp(c: var TLiftCtx; t: PType; body, x, y: PNode) =
  of attachedAsgn, attachedDeepCopy, attachedDup:
    body.add callCodegenProc(c.g, "nimAsgnStrV2", c.info, genAddr(c, x), y)
  of attachedSink:
-    let moveCall = genBuiltin(c, mMove, "move", x)
-    moveCall.add y
-    doAssert t.destructor != nil
-    moveCall.add destructorCall(c, t.destructor, x)
-    body.add moveCall
+    if c.g.config.isDefined("nimsso"):
+      # SmallString: destroy old dst, then bit-copy src (no rc increment — this is a move).
+      # No .p aliasing check needed; rc-based destroy handles COW sharing correctly.
+      doAssert t.destructor != nil
+      body.add destructorCall(c, t.destructor, x)
+      body.add newAsgnStmt(x, y)
+    else:
+      let moveCall = genBuiltin(c, mMove, "move", x)
+      moveCall.add y
+      doAssert t.destructor != nil
+      moveCall.add destructorCall(c, t.destructor, x)
+      body.add moveCall
  of attachedDestructor:
    body.add genBuiltin(c, mDestroy, "destroy", x)
  of attachedTrace:
--- a/compiler/llstream.nim
+++ b/compiler/llstream.nim
@@ -163,7 +163,7 @@ proc llReadFromStdin(s: PLLStream, buf: pointer, bufLen: int): int =
  inc(s.lineOffset)
  result = min(bufLen, s.s.len - s.rd)
  if result > 0:
-    copyMem(buf, addr(s.s[s.rd]), result)
+    copyMem(buf, readRawData(s.s, s.rd), result)
    inc(s.rd, result)

 proc llStreamRead*(s: PLLStream, buf: pointer, bufLen: int): int =
@@ -173,7 +173,7 @@ proc llStreamRead*(s: PLLStream, buf: pointer, bufLen: int): int =
  of llsString:
    result = min(bufLen, s.s.len - s.rd)
    if result > 0:
-      copyMem(buf, addr(s.s[0 + s.rd]), result)
+      copyMem(buf, readRawData(s.s, s.rd), result)
      inc(s.rd, result)
  of llsFile:
    result = readBuffer(s.f, buf, bufLen)
--- a/lib/pure/lexbase.nim
+++ b/lib/pure/lexbase.nim
@@ -65,7 +65,9 @@ proc fillBuffer(L: var BaseLexer) =
          L.buf[i] = L.buf[L.sentinel + 1 + i]
      else:
        # "moveMem" handles overlapping regions
-        moveMem(addr L.buf[0], addr L.buf[L.sentinel + 1], toCopy)
+        let p = beginStore(L.buf, L.buf.len)
+        moveMem(p, addr p[L.sentinel + 1], toCopy)
+        endStore(L.buf)
  charsRead = L.input.readDataStr(L.buf, toCopy ..< toCopy + L.sentinel + 1)
  s = toCopy + charsRead
  if charsRead < L.sentinel + 1:
--- a/lib/pure/osproc.nim
+++ b/lib/pure/osproc.nim
@@ -921,7 +921,7 @@ elif not defined(useNimRtl):
    for key, val in pairs(t):
      var x = key & "=" & val
      result[i] = cast[cstring](alloc(x.len+1))
-      copyMem(result[i], addr(x[0]), x.len+1)
+      copyMem(result[i], x.cstring, x.len+1)
      inc(i)

  proc envToCStringArray(): cstringArray =
@@ -932,7 +932,7 @@ elif not defined(useNimRtl):
    for key, val in envPairs():
      var x = key & "=" & val
      result[i] = cast[cstring](alloc(x.len+1))
-      copyMem(result[i], addr(x[0]), x.len+1)
+      copyMem(result[i], x.cstring, x.len+1)
      inc(i)

  type
--- a/lib/pure/streams.nim
+++ b/lib/pure/streams.nim
@@ -259,10 +259,8 @@ proc readDataStr*(s: Stream, buffer: var string, slice: Slice[int]): int =
    result = s.readDataStrImpl(s, buffer, slice)
  else:
    # fallback
-    when declared(prepareMutation):
-      # buffer might potentially be a CoW literal with ARC
-      prepareMutation(buffer)
-    result = s.readData(addr buffer[slice.a], slice.b + 1 - slice.a)
+    result = s.readData(beginStore(buffer, slice.b + 1 - slice.a, slice.a), slice.b + 1 - slice.a)
+    endStore(buffer)

 template jsOrVmBlock(caseJsOrVm, caseElse: untyped): untyped =
  when nimvm:
@@ -1228,7 +1226,8 @@ else: # after 1.3 or JS not defined
      jsOrVmBlock:
        buffer[slice.a..<slice.a+result] = s.data[s.pos..<s.pos+result]
      do:
-        copyMem(unsafeAddr buffer[slice.a], addr s.data[s.pos], result)
+        copyMem(beginStore(buffer, result, slice.a), readRawData(s.data, s.pos), result)
+        endStore(buffer)
      inc(s.pos, result)
    else:
      result = 0
@@ -1244,7 +1243,7 @@ else: # after 1.3 or JS not defined
          raise newException(Defect, "could not read string stream, " &
            "did you use a non-string buffer pointer?", getCurrentException())
      elif not defined(nimscript):
-        copyMem(buffer, addr(s.data[s.pos]), result)
+        copyMem(buffer, readRawData(s.data, s.pos), result)
      inc(s.pos, result)
    else:
      result = 0
@@ -1260,7 +1259,7 @@ else: # after 1.3 or JS not defined
          raise newException(Defect, "could not peek string stream, " &
            "did you use a non-string buffer pointer?", getCurrentException())
      elif not defined(nimscript):
-        copyMem(buffer, addr(s.data[s.pos]), result)
+        copyMem(buffer, readRawData(s.data, s.pos), result)
    else:
      result = 0

@@ -1277,7 +1276,8 @@ else: # after 1.3 or JS not defined
        raise newException(Defect, "could not write to string stream, " &
          "did you use a non-string buffer pointer?", getCurrentException())
    elif not defined(nimscript):
-      copyMem(addr(s.data[s.pos]), buffer, bufLen)
+      copyMem(beginStore(s.data, bufLen, s.pos), buffer, bufLen)
+      endStore(s.data)
    inc(s.pos, bufLen)

  proc ssClose(s: Stream) =
@@ -1345,7 +1345,9 @@ proc fsReadData(s: Stream, buffer: pointer, bufLen: int): int =
  result = readBuffer(FileStream(s).f, buffer, bufLen)

 proc fsReadDataStr(s: Stream, buffer: var string, slice: Slice[int]): int =
-  result = readBuffer(FileStream(s).f, addr buffer[slice.a], slice.b + 1 - slice.a)
+  # Reads into `buffer[slice.a .. slice.b]` straight from the underlying file
+  # and returns what `readBuffer` reports.
+  # `len` is the byte count of the inclusive slice.
+  let len = slice.b + 1 - slice.a
+  # `beginStore` hands out the raw destination pointer at offset `slice.a`;
+  # `endStore` closes the bulk write.
+  result = readBuffer(FileStream(s).f, beginStore(buffer, len, slice.a), len)
+  endStore(buffer)

 proc fsPeekData(s: Stream, buffer: pointer, bufLen: int): int =
  let pos = fsGetPosition(s)
--- a/lib/pure/strutils.nim
+++ b/lib/pure/strutils.nim
@@ -1983,9 +1983,10 @@ func find*(s: string, sub: char, start: Natural = 0, last = -1): int {.rtl,
    when hasCStringBuiltin:
      let length = last-start+1
      if length > 0:
-        let found = c_memchr(s[start].unsafeAddr, cint(sub), cast[csize_t](length))
+        let sdata = readRawData(s)
+        let found = c_memchr(addr sdata[start], cint(sub), cast[csize_t](length))
        if not found.isNil:
-          return cast[int](found) -% cast[int](s.cstring)
+          return cast[int](found) -% cast[int](sdata)
    else:
      findImpl()

@@ -2041,9 +2042,10 @@ func find*(s, sub: string, start: Natural = 0, last = -1): int {.rtl,
    when declared(memmem):
      let subLen = sub.len
      if last < 0 and start < s.len and subLen != 0:
-        let found = memmem(s[start].unsafeAddr, csize_t(s.len - start), sub.cstring, csize_t(subLen))
+        let sdata = readRawData(s)
+        let found = memmem(addr sdata[start], csize_t(s.len - start), readRawData(sub), csize_t(subLen))
        result = if not found.isNil:
-            cast[int](found) -% cast[int](s.cstring)
+            cast[int](found) -% cast[int](sdata)
          else:
            -1
      else:
--- a/lib/std/formatfloat.nim
+++ b/lib/std/formatfloat.nim
@@ -19,7 +19,12 @@ proc addCstringN(result: var string, buf: cstring; buflen: int) =
  let oldLen = result.len
  let newLen = oldLen + buflen
  result.setLen newLen
-  c_memcpy(result[oldLen].addr, buf, buflen.csize_t)
+  {.cast(noSideEffect).}:
+    when declared(completeStore):
+      c_memcpy(beginStore(result, buflen, oldLen), buf, buflen.csize_t)
+      endStore(result)
+    else:
+      discard c_memcpy(result[oldLen].addr, buf, buflen.csize_t)

 import std/private/[dragonbox, schubfach]

--- a/lib/std/private/digitsutils.nim
+++ b/lib/std/private/digitsutils.nim
@@ -52,7 +52,7 @@ func addChars[T](result: var string, x: T, start: int, n: int) {.inline, enforce
    for i in 0..<n: result[old + i] = x[start + i]
  when nimvm: impl
  else:
-    when defined(js) or defined(nimscript): impl
+    when defined(js) or defined(nimscript) or defined(nimsso): impl
    else:
      {.noSideEffect.}:
        copyMem result[old].addr, x[start].unsafeAddr, n
--- a/lib/std/strbasics.nim
+++ b/lib/std/strbasics.nim
@@ -84,9 +84,9 @@ func setSlice*(s: var string, slice: Slice[int]) =
      when not declared(moveMem):
        impl()
      else:
-        when defined(nimSeqsV2):
-          prepareMutation(s)
-        moveMem(addr s[0], addr s[first], last - first + 1)
+        let p = beginStore(s, last - first + 1)
+        moveMem(p, addr p[first], last - first + 1)
+        endStore(s)
  s.setLen(last - first + 1)

 func strip*(a: var string, leading = true, trailing = true, chars: set[char] = whitespaces) {.inline.} =
--- a/lib/std/syncio.nim
+++ b/lib/std/syncio.nim
@@ -485,7 +485,8 @@ proc readLine*(f: File, line: var string): bool {.tags: [ReadIOEffect],
    while true:
      # fixes #9634; this pattern may need to be abstracted as a template if reused;
      # likely other io procs need this for correctness.
-      fgetsSuccess = c_fgets(cast[cstring](addr line[pos]), sp.cint, f) != nil
+      fgetsSuccess = c_fgets(cast[cstring](beginStore(line, sp, pos)), sp.cint, f) != nil
+      endStore(line)
      if fgetsSuccess: break
      when not defined(nimscript):
        if errno == EINTR:
@@ -495,10 +496,11 @@ proc readLine*(f: File, line: var string): bool {.tags: [ReadIOEffect],
      checkErr(f)
      break

-    let m = c_memchr(addr line[pos], cint('\L'), cast[csize_t](sp))
+    let lineData = readRawData(line)
+    let m = c_memchr(addr lineData[pos], cint('\L'), cast[csize_t](sp))
    if m != nil:
      # \l found: Could be our own or the one by fgets, in any case, we're done
-      var last = cast[int](m) - cast[int](addr line[0])
+      var last = cast[int](m) - cast[int](lineData)
      if last > 0 and line[last-1] == '\c':
        line.setLen(last-1)
        return last > 1 or fgetsSuccess
@@ -564,7 +566,8 @@ proc readAllBuffer(file: File): string =
  result = ""
  var buffer = newString(BufSize)
  while true:
-    var bytesRead = readBuffer(file, addr(buffer[0]), BufSize)
+    var bytesRead = readBuffer(file, beginStore(buffer, BufSize), BufSize)
+    endStore(buffer)
    if bytesRead == BufSize:
      result.add(buffer)
    else:
@@ -590,7 +593,8 @@ proc readAllFile(file: File, len: int64): string =
  # We acquire the filesize beforehand and hope it doesn't change.
  # Speeds things up.
  result = newString(len)
-  let bytes = readBuffer(file, addr(result[0]), len)
+  let bytes = readBuffer(file, beginStore(result, len.int), len.int)
+  endStore(result)
  if endOfFile(file):
    if bytes.int64 < len:
      result.setLen(bytes)
--- a/lib/system.nim
+++ b/lib/system.nim
@@ -1622,26 +1622,29 @@ when notJSnotNims:
  include system/sysmem

 when notJSnotNims and defined(nimSeqsV2):
-  const nimStrVersion {.core.} = 2
+  when defined(nimsso):
+    const nimStrVersion {.core.} = 3
+  else:
+    const nimStrVersion {.core.} = 2

-  type
-    NimStrPayloadBase = object
-      cap: int
+    type
+      NimStrPayloadBase = object
+        cap: int

-    NimStrPayload {.core.} = object
-      cap: int
-      data: UncheckedArray[char]
+      NimStrPayload {.core.} = object
+        cap: int
+        data: UncheckedArray[char]

-    NimStringV2 {.core.} = object
-      len: int
-      p: ptr NimStrPayload ## can be nil if len == 0.
+      NimStringV2 {.core.} = object
+        len: int
+        p: ptr NimStrPayload ## can be nil if len == 0.

 when defined(windows):
  proc GetLastError(): int32 {.header: "<windows.h>", nodecl.}
  const ERROR_BAD_EXE_FORMAT = 193

 when notJSnotNims:
-  when defined(nimSeqsV2):
+  when defined(nimSeqsV2) and not defined(nimsso):
    proc nimToCStringConv(s: NimStringV2): cstring {.compilerproc, nonReloadable, inline.}

  when hostOS != "standalone" and hostOS != "any":
@@ -1689,9 +1692,32 @@ when not defined(nimIcIntegrityChecks):
  export exceptions

 when notJSnotNims and defined(nimSeqsV2):
-  include "system/strs_v2"
+  when defined(nimsso):
+    include "system/strs_v3"
+  else:
+    include "system/strs_v2"
  include "system/seqs_v2"

+when not (notJSnotNims and defined(nimSeqsV2)):
+  # Fallback implementations for backends where strs_v2/v3 is not included.
+  # Needed so modules imported by system (e.g. syncio) can reference these without guards.
+  when notJSnotNims:
+    # mm:refc: string = ptr NimStringDesc with data: UncheckedArray[char]
+    proc beginStore*(s: var string; ensuredLen: int; start = 0): ptr UncheckedArray[char] {.inline, noSideEffect, raises: [], tags: [].} =
+      ## Returns a raw pointer to `s[start]`, or nil when the string
+      ## representation itself is nil. `ensuredLen` is not used by this
+      ## fallback and no bounds check is performed on `start`.
+      let ns = cast[NimString](s)
+      if ns == nil: nil
+      else: cast[ptr UncheckedArray[char]](addr ns.data[start])
+    # Nothing to finalize in this fallback; exists so callers can always pair it with `beginStore`.
+    proc endStore*(s: var string) {.inline, noSideEffect, raises: [], tags: [].} = discard
+    template readRawData*(s: string; start = 0): ptr UncheckedArray[char] =
+      ## Raw pointer to `s[start]`, or nil when the string representation is nil.
+      ## No bounds check is performed on `start`.
+      let ns = cast[NimString](s)
+      if ns == nil: nil
+      else: cast[ptr UncheckedArray[char]](addr ns.data[start])
+  else:
+    # JS/nimscript: callers are guarded by whenNotVmJsNims/when not defined(js)
+    # These stubs only make the names resolvable: both pointer-returning
+    # routines yield nil and `endStore` does nothing.
+    proc beginStore*(s: var string; ensuredLen: int; start = 0): ptr UncheckedArray[char] {.inline, noSideEffect, raises: [], tags: [].} = nil
+    proc endStore*(s: var string) {.inline, noSideEffect, raises: [], tags: [].} = discard
+    template readRawData*(s: string; start = 0): ptr UncheckedArray[char] = nil
+
 when not defined(js):
  template newSeqImpl(T, len) =
    result = newSeqOfCap[T](len)
@@ -1741,6 +1767,9 @@ when not defined(js):
    else:
      {.error: "The type T cannot contain managed memory or have destructors".}

+  when defined(nimsso) and not declared(newStringUninitWasDeclared):
+    proc newStringUninitImpl(len: Natural): string {.noSideEffect, inline.}
+
  proc newStringUninit*(len: Natural): string {.noSideEffect.} =
    ## Returns a new string of length `len` but with uninitialized
    ## content. One needs to fill the string character after character
@@ -1751,17 +1780,20 @@ when not defined(js):
    when nimvm:
      result = newString(len)
    else:
-      result = newStringOfCap(len)
-      {.cast(noSideEffect).}:
-        when defined(nimSeqsV2):
-          let s = cast[ptr NimStringV2](addr result)
-          if len > 0:
+      when defined(nimsso):
+        result = newStringUninitImpl(len)
+      else:
+        result = newStringOfCap(len)
+        {.cast(noSideEffect).}:
+          when defined(nimSeqsV2):
+            let s = cast[ptr NimStringV2](addr result)
+            if len > 0:
+              s.len = len
+              s.p.data[len] = '\0'
+          else:
+            let s = cast[NimString](result)
            s.len = len
-            s.p.data[len] = '\0'
-        else:
-          let s = cast[NimString](result)
-          s.len = len
-          s.data[len] = '\0'
+            s.data[len] = '\0'
 else:
  proc newStringUninit*(len: Natural): string {.
    magic: "NewString", importc: "mnewString", noSideEffect.}
@@ -2244,10 +2276,13 @@ when not defined(js) or defined(nimscript):
      else: result = 0
    else:
      when not defined(nimscript): # avoid semantic checking
-        let minlen = min(x.len, y.len)
-        result = int(nimCmpMem(x.cstring, y.cstring, cast[csize_t](minlen)))
-        if result == 0:
-          result = x.len - y.len
+        when defined(nimsso):
+          result = cmpStrings(x, y)
+        else:
+          let minlen = min(x.len, y.len)
+          result = int(nimCmpMem(x.cstring, y.cstring, cast[csize_t](minlen)))
+          if result == 0:
+            result = x.len - y.len

  when declared(newSeq):
    proc cstringArrayToSeq*(a: cstringArray, len: Natural): seq[string] =
@@ -2913,7 +2948,9 @@ proc substr*(a: openArray[char]): string =
  result = newStringUninit(a.len)
  whenNotVmJsNims():
    if a.len > 0:
-      copyMem(result[0].addr, a[0].unsafeAddr, a.len)
+      {.cast(noSideEffect).}:
+        copyMem(beginStore(result, a.len), a[0].unsafeAddr, a.len)
+        endStore(result)
  do:
    for i, ch in a:
      result[i] = ch
@@ -2948,7 +2985,8 @@ proc substr*(s: string; first, last: int): string = # A bug with `magic: Slice`
  result = newStringUninit(L)
  whenNotVmJsNims():
    if L > 0:
-      copyMem(result[0].addr, s[first].unsafeAddr, L)
+      copyMem(beginStore(result, L), readRawData(s, first), L)
+      endStore(result)
  do:
    for i in 0..<L:
      result[i] = s[i + first]
@@ -3166,3 +3204,6 @@ when hostOS == "standalone":
  # ssymbols being duplicated.
  proc nimPanic(s: string) {.exportc, noreturn.} = panic(s)
  proc nimRawoutput(s: string) {.exportc.} = rawoutput(s)
+
+when not declared(newStringUninitWasDeclared):
+  proc newStringUninitImpl(len: Natural): string {.noSideEffect, inline.} = discard
--- a/lib/system/assign.nim
+++ b/lib/system/assign.nim
@@ -62,9 +62,14 @@ proc genericAssignAux(dest, src: pointer, mt: PNimType, shallow: bool) =
  case mt.kind
  of tyString:
    when defined(nimSeqsV2):
-      var x = cast[ptr NimStringV2](dest)
-      var s2 = cast[ptr NimStringV2](s)[]
-      nimAsgnStrV2(x[], s2)
+      when defined(nimsso):
+        var x = cast[ptr SmallString](dest)
+        var s2 = cast[ptr SmallString](s)[]
+        nimAsgnStrV2(x[], s2)
+      else:
+        var x = cast[ptr NimStringV2](dest)
+        var s2 = cast[ptr NimStringV2](s)[]
+        nimAsgnStrV2(x[], s2)
    else:
      var x = cast[PPointer](dest)
      var s2 = cast[PPointer](s)[]
@@ -245,8 +250,11 @@ proc genericReset(dest: pointer, mt: PNimType) =
    unsureAsgnRef(cast[PPointer](dest), nil)
  of tyString:
    when defined(nimSeqsV2):
-      var s = cast[ptr NimStringV2](dest)
-      frees(s[])
+      when defined(nimsso):
+        nimDestroyStrV1(cast[ptr SmallString](dest)[])
+      else:
+        var s = cast[ptr NimStringV2](dest)
+        frees(s[])
      zeroMem(dest, mt.size)
    else:
      unsureAsgnRef(cast[PPointer](dest), nil)
--- a/lib/system/deepcopy.nim
+++ b/lib/system/deepcopy.nim
@@ -92,9 +92,14 @@ proc genericDeepCopyAux(dest, src: pointer, mt: PNimType; tab: var PtrTable) =
  case mt.kind
  of tyString:
    when defined(nimSeqsV2):
-      var x = cast[ptr NimStringV2](dest)
-      var s2 = cast[ptr NimStringV2](s)[]
-      nimAsgnStrV2(x[], s2)
+      when defined(nimsso):
+        var x = cast[ptr SmallString](dest)
+        var s2 = cast[ptr SmallString](s)[]
+        nimAsgnStrV2(x[], s2)
+      else:
+        var x = cast[ptr NimStringV2](dest)
+        var s2 = cast[ptr NimStringV2](s)[]
+        nimAsgnStrV2(x[], s2)
    else:
      var x = cast[PPointer](dest)
      var s2 = cast[PPointer](s)[]
--- a/lib/system/indices.nim
+++ b/lib/system/indices.nim
@@ -30,7 +30,8 @@ proc `[]`*[T](s: var openArray[T]; i: BackwardsIndex): var T {.inline, systemRai
  system.`[]`(s, s.len - int(i))
 proc `[]`*[Idx, T](a: var array[Idx, T]; i: BackwardsIndex): var T {.inline, systemRaisesDefect.} =
  a[Idx(a.len - int(i) + int low(a))]
-proc `[]`*(s: var string; i: BackwardsIndex): var char {.inline, systemRaisesDefect.} = s[s.len - int(i)]
+when not defined(nimsso):
+  proc `[]`*(s: var string; i: BackwardsIndex): var char {.inline, systemRaisesDefect.} = s[s.len - int(i)]

 proc `[]=`*[T](s: var openArray[T]; i: BackwardsIndex; x: T) {.inline, systemRaisesDefect.} =
  system.`[]=`(s, s.len - int(i), x)
--- a/lib/system/strmantle.nim
+++ b/lib/system/strmantle.nim
@@ -10,45 +10,46 @@
 # Compilerprocs for strings that do not depend on the string implementation.
 import std/private/digitsutils as digitsutils2

-proc cmpStrings(a, b: string): int {.inline, compilerproc.} =
-  let alen = a.len
-  let blen = b.len
-  let minlen = min(alen, blen)
-  if minlen > 0:
-    result = c_memcmp(unsafeAddr a[0], unsafeAddr b[0], cast[csize_t](minlen)).int
-    if result == 0:
+when not defined(nimsso):
+  proc cmpStrings(a, b: string): int {.inline, compilerproc.} =
+    let alen = a.len
+    let blen = b.len
+    let minlen = min(alen, blen)
+    if minlen > 0:
+      result = c_memcmp(unsafeAddr a[0], unsafeAddr b[0], cast[csize_t](minlen)).int
+      if result == 0:
+        result = alen - blen
+    else:
      result = alen - blen
-  else:
-    result = alen - blen

-proc leStrings(a, b: string): bool {.inline, compilerproc.} =
-  # required by upcoming backends (NIR).
-  cmpStrings(a, b) <= 0
+  proc leStrings(a, b: string): bool {.inline, compilerproc.} =
+    # required by upcoming backends (NIR).
+    cmpStrings(a, b) <= 0

-proc ltStrings(a, b: string): bool {.inline, compilerproc.} =
-  # required by upcoming backends (NIR).
-  cmpStrings(a, b) < 0
+  proc ltStrings(a, b: string): bool {.inline, compilerproc.} =
+    # required by upcoming backends (NIR).
+    cmpStrings(a, b) < 0

-proc eqStrings(a, b: string): bool {.inline, compilerproc.} =
-  result = false
-  let alen = a.len
-  let blen = b.len
-  if alen == blen:
-    if alen == 0: return true
-    return equalMem(unsafeAddr(a[0]), unsafeAddr(b[0]), alen)
+  proc eqStrings(a, b: string): bool {.inline, compilerproc.} =
+    result = false
+    let alen = a.len
+    let blen = b.len
+    if alen == blen:
+      if alen == 0: return true
+      return equalMem(unsafeAddr(a[0]), unsafeAddr(b[0]), alen)

-proc hashString(s: string): int {.compilerproc.} =
-  # the compiler needs exactly the same hash function!
-  # this used to be used for efficient generation of string case statements
-  var h = 0'u
-  for i in 0..len(s)-1:
-    h = h + uint(s[i])
-    h = h + h shl 10
-    h = h xor (h shr 6)
-  h = h + h shl 3
-  h = h xor (h shr 11)
-  h = h + h shl 15
-  result = cast[int](h)
+  proc hashString(s: string): int {.compilerproc.} =
+    # the compiler needs exactly the same hash function!
+    # this used to be used for efficient generation of string case statements
+    var h = 0'u
+    for i in 0..len(s)-1:
+      h = h + uint(s[i])
+      h = h + h shl 10
+      h = h xor (h shr 6)
+    h = h + h shl 3
+    h = h xor (h shr 11)
+    h = h + h shl 15
+    result = cast[int](h)

 proc eqCstrings(a, b: cstring): bool {.inline, compilerproc.} =
  if pointer(a) == pointer(b): result = true
--- a/lib/system/strs_v2.nim
+++ b/lib/system/strs_v2.nim
@@ -176,18 +176,18 @@ proc nimAsgnStrV2(a: var NimStringV2, b: NimStringV2) {.compilerRtl.} =
    a.len = b.len
    copyMem(unsafeAddr a.p.data[0], unsafeAddr b.p.data[0], b.len+1)

-proc nimPrepareStrMutationImpl(s: var NimStringV2) =
+proc nimPrepareStrMutationImpl(s: var NimStringV2) {.raises: [], tags: [].} =
  let oldP = s.p
  # can't mutate a literal, so we need a fresh copy here:
  s.p = allocPayload(s.len)
  s.p.cap = s.len
  copyMem(unsafeAddr s.p.data[0], unsafeAddr oldP.data[0], s.len+1)

-proc nimPrepareStrMutationV2(s: var NimStringV2) {.compilerRtl, inl.} =
+proc nimPrepareStrMutationV2(s: var NimStringV2) {.compilerRtl, inl, raises: [], tags: [].} =
  if s.p != nil and (s.p.cap and strlitFlag) == strlitFlag:
    nimPrepareStrMutationImpl(s)

-proc prepareMutation*(s: var string) {.inline.} =
+proc prepareMutation*(s: var string) {.inline, raises: [], tags: [].} =
  # string literals are "copy on write", so you need to call
  # `prepareMutation` before modifying the strings via `addr`.
  {.cast(noSideEffect).}:
@@ -216,4 +216,25 @@ func capacity*(self: string): int {.inline.} =
  let str = cast[ptr NimStringV2](unsafeAddr self)
  result = if str.p != nil: str.p.cap and not strlitFlag else: 0

+proc beginStore*(s: var string; ensuredLen: int; start = 0): ptr UncheckedArray[char] {.inline, noSideEffect, raises: [], tags: [].} =
+  ## Returns a writable pointer for bulk write of `ensuredLen` bytes starting at `start`.
+  ## Call `endStore(s)` afterwards for portability.
+  ##
+  ## This implementation calls `prepareMutation(s)` first and then returns the
+  ## address of `data[start]` in the payload, or nil when the string has no
+  ## payload. `ensuredLen` is not consulted here and `start` is not
+  ## bounds-checked.
+  {.cast(noSideEffect).}: prepareMutation(s)
+  let str = cast[ptr NimStringV2](unsafeAddr s)
+  if str.p == nil: nil
+  else: cast[ptr UncheckedArray[char]](addr str.p.data[start])
+
+proc endStore*(s: var string) {.inline, noSideEffect, raises: [], tags: [].} =
+  ## No-op for non-SSO strings; call after bulk writes via `beginStore`.
+  ## The body is a plain `discard`: `s` is neither read nor modified.
+  discard
+
+proc rawDataImpl(str: ptr NimStringV2; start: int): ptr UncheckedArray[char] {.inline, noSideEffect, raises: [], tags: [].} =
+  ## Returns the address of `data[start]` inside the payload of `str`, or nil
+  ## when `str.p` is nil. `start` is not bounds-checked.
+  if str.p == nil: nil
+  else: cast[ptr UncheckedArray[char]](addr str.p.data[start])
+
+template readRawData*(s: string; start = 0): ptr UncheckedArray[char] =
+  ## Returns a pointer to `s[start]` for read-only raw access.
+  ## Template ensures no copy of `s`; ptr is valid while `s` is alive.
+  ## Yields nil when `s` has no payload (see `rawDataImpl`); `start` is not
+  ## bounds-checked.
+  rawDataImpl(cast[ptr NimStringV2](unsafeAddr s), start)
+
 {.pop.}
--- a/lib/system/strs_v3.nim
+++ b/lib/system/strs_v3.nim
@@ -0,0 +1,743 @@
+#
+#
+#            Nim's Runtime Library
+#        (c) Copyright 2026 Nim contributors
+#
+#    See the file "copying.txt", included in this
+#    distribution, for details about the copyright.
+#
+
+## Small String Optimization (SSO) implementation used by Nim's core.
+
+const
+  # Size constants of the SmallString layout, derived from the word and pointer sizes.
+  # With 8-byte `uint` and pointers this gives AlwaysAvail = 7 and PayloadSize = 14.
+  AlwaysAvail = sizeof(uint) - 1  # inline chars that fit in the `bytes` field alongside slen
+  PayloadSize = AlwaysAvail + sizeof(pointer) - 1  # -1 reserves the last byte for '\0'
+  HeapSlen   = 255  # slen sentinel: heap-allocated long string; capImpl = raw capacity
+  StaticSlen = 254  # slen sentinel: static/literal long string; capImpl = 0, never freed
+  # Three `int` fields (fullLen, rc, capImpl) precede `data` in LongString.
+  LongStringDataOffset = 3 * sizeof(int)  # byte offset of LongString.data from struct start
+
+when false:
+  # Disabled branch: bindings to the `__sync_*` compiler builtins.
+  proc atomicAddFetch(p: var int; v: int): int {.importc: "__sync_add_and_fetch", nodecl.}
+  proc atomicSubFetch(p: var int; v: int): int {.importc: "__sync_sub_and_fetch", nodecl.}
+else:
+  # Active branch: plain read-modify-write helpers that return the new value.
+  # NOTE(review): despite their names these are NOT atomic; confirm that
+  # reference counts are never updated from more than one thread.
+  proc atomicAddFetch(p: var int; v: int): int {.inline.} =
+    result = p + v
+    p = result
+  proc atomicSubFetch(p: var int; v: int): int {.inline.} =
+    result = p - v
+    p = result
+
+type
+  # Out-of-line representation: a header of three ints followed by the characters.
+  LongString {.core.} = object
+    fullLen: int
+    # NOTE(review): `rc` is updated through atomicAddFetch/atomicSubFetch, which
+    # in this file currently compile to plain (non-atomic) arithmetic.
+    rc: int       # atomic reference count; 1 = unique owner
+    capImpl: int  # raw capacity; 0 for static literals (never freed, slen = StaticSlen)
+    data: UncheckedArray[char]
+
+  # The string value itself: one machine word of tag + inline chars, plus one
+  # pointer-sized field that is either a pointer or more inline chars.
+  SmallString {.core.} = object
+    bytes: uint
+      ## Layout (little-endian): byte 0 = slen; bytes 1..AlwaysAvail = inline chars 0..AlwaysAvail-1.
+      ## Bytes after the null terminator are zero (SWAR invariant).
+      ## When slen == HeapSlen (255), `more` is a heap-owned LongString block.
+      ## When slen == StaticSlen (254), `more` points to a static LongString literal.
+      ## When AlwaysAvail < slen <= PayloadSize, `more` holds raw char bytes AlwaysAvail..PayloadSize-1 (medium string).
+    more: ptr LongString
+
+# Word-sized byte swap and count-trailing-zeros, bound to the compiler builtins
+# of matching width. The builtins are referenced by name only (`nodecl`).
+# NOTE(review): `__builtin_ctz*` is undefined for 0 per the GCC documentation;
+# the caller in this file only invokes `ctzImpl` on a non-zero value.
+when sizeof(uint) == 8:
+  proc bswap(x: uint): uint {.importc: "__builtin_bswap64", nodecl, noSideEffect.}
+  proc ctzImpl(x: uint): int {.inline.} =
+    proc ctz64(x: uint64): int32 {.importc: "__builtin_ctzll", nodecl, noSideEffect.}
+    int(ctz64(uint64(x)))
+else:
+  proc bswap(x: uint): uint {.importc: "__builtin_bswap32", nodecl, noSideEffect.}
+  proc ctzImpl(x: uint): int {.inline.} =
+    proc ctz32(x: uint32): int32 {.importc: "__builtin_ctz", nodecl, noSideEffect.}
+    int(ctz32(uint32(x)))
+
+proc swarKey(x: uint): uint {.inline.} =
+  ## Returns a value where inline char[0] is in the most significant byte,
+  ## so that integer comparison gives lexicographic string order.
+  ## LE: slen in bits 0-7; `bswap(x shr 8)` puts char[0] in MSB.
+  ## BE: slen in bits (sizeof(uint)-1)*8..(sizeof(uint)*8-1) (MSB); `x shl 8` shifts slen out, char[0] lands in MSB.
+  ##
+  ## In both branches the slen byte is discarded, so the key depends only on
+  ## the inline characters (and their padding) held in `x`.
+  when system.cpuEndian == littleEndian:
+    bswap(x shr 8)
+  else:
+    x shl 8
+
+# ---- accessors ----
+# Memory layout is identical on both endiannesses: byte 0 = slen, bytes 1..AlwaysAvail = inline chars.
+# But the integer value of `bytes` differs: on LE slen is in the LSB, on BE in the MSB.
+
+template ssLenOf(bytes: uint): int =
+  ## Extract slen from an already-loaded `bytes` word. Zero-cost (register op only).
+  ## Use when `bytes` is already in a register (e.g. loaded for SWAR comparison).
+  ## The result is the raw tag byte: the 254/255 sentinels are returned as-is,
+  ## not translated into a real length.
+  when system.cpuEndian == littleEndian:
+    int(bytes and 0xFF'u)
+  else:
+    int(bytes shr (8 * (sizeof(uint) - 1)))
+
+proc cmpShortInline(abytes, bbytes: uint; aslen, bslen: int): int {.inline.} =
+  ## Three-way comparison of two strings whose characters all live in their
+  ## `bytes` words (`abytes`/`bbytes`), given their slen values.
+  ## Returns -1 or 1 at the first differing character, otherwise the length
+  ## difference `aslen - bslen` (which is 0 for equal strings).
+  let minLen = min(aslen, bslen)
+  if minLen > 0:
+    when system.cpuEndian == littleEndian:
+      # Mask covering the first `minLen` characters once the slen byte has
+      # been shifted out.
+      let diffMask = (1'u shl (minLen * 8)) - 1'u
+      let diff = ((abytes xor bbytes) shr 8) and diffMask
+      if diff != 0:
+        # Lowest set bit -> index of the first differing character; the
+        # extra 8 steps back over the slen byte in the unshifted words.
+        let byteShift = (ctzImpl(diff) shr 3) * 8 + 8
+        let ac = (abytes shr byteShift) and 0xFF'u
+        let bc = (bbytes shr byteShift) and 0xFF'u
+        if ac < bc: return -1
+        return 1
+    else:
+      # Big-endian: compare the whole keys (not just the first `minLen`
+      # characters) as integers.
+      let aw = swarKey(abytes)
+      let bw = swarKey(bbytes)
+      if aw < bw: return -1
+      if aw > bw: return 1
+  aslen - bslen
+
+template ssLen(s: SmallString): int =
+  ## Load slen via a direct byte access at offset 0 (valid on both LE and BE).
+  ## A byte load (movzx) lets the C compiler prove that slen is at offset 0,
+  ## distinct from inline char writes at offsets 1+, enabling register-caching
+  ## of slen across char-write loops (e.g. nimAddCharV1).
+  ## Returns the raw tag, sentinels included; use `len` for the real length.
+  int(cast[ptr byte](unsafeAddr s.bytes)[])
+
+template setSSLen(s: var SmallString; v: int) =
+  # Single byte store — equivalent to old `s.slen = byte(v)`.
+  # Accessing a uint via byte* is legal in C (char-pointer aliasing exemption).
+  # Only the byte at offset 0 is written; the inline chars in `bytes` are untouched.
+  cast[ptr byte](addr s.bytes)[] = cast[byte](v)
+
+# Pointer to inline chars (offset +1 from `bytes` field / start of struct).
+# Only valid when s is in memory (var/ptr); forces a load from memory.
+# The pointer is unchecked: indices at or past AlwaysAvail address the bytes of
+# the `more` field that follows `bytes` in SmallString.
+template inlinePtr(s: SmallString): ptr UncheckedArray[char] =
+  cast[ptr UncheckedArray[char]](cast[uint](unsafeAddr s.bytes) + 1'u)
+
+# Same but from a ptr SmallString (avoids unsafeAddr dance).
+# Adds 1 to the struct address itself, so it relies on `bytes` being the first
+# field of SmallString.
+template inlinePtrOf(p: ptr SmallString): ptr UncheckedArray[char] =
+  cast[ptr UncheckedArray[char]](cast[uint](p) + 1'u)
+
+proc resize(old: int): int {.inline.} =
+  ## Capacity growth factor shared with seqs_v2.nim.
+  ## Returns 4 for `old <= 0`, doubles `old` up to `high(int16)`, and grows by
+  ## one half (`old div 2 + old`) above that.
+  if old <= 0: result = 4
+  elif old <= high(int16): result = old * 2
+  else: result = old div 2 + old
+
+# No Nim lifecycle hooks: the compiler calls the compilerRtl procs directly
+# for tyString variables (nimDestroyStrV1, nimAsgnStrV2).
+
+proc nimDestroyStrV1(s: SmallString) {.compilerRtl, inline.} =
+  ## Releases one reference to the block behind `s`. Only strings tagged
+  ## `HeapSlen` are touched; the block is deallocated when its count hits 0.
+  if ssLen(s) != HeapSlen: return
+  let remaining = atomicSubFetch(s.more.rc, 1)
+  if remaining == 0:
+    dealloc(s.more)
+
+proc ensureUniqueLong(s: var SmallString; oldLen, newLen: int) =
+  ## Makes `s.more` a block that can be written to and that holds `newLen`
+  ## characters. `oldLen` is the number of characters currently stored.
+  ## A heap block with `rc == 1` and enough capacity is reused (only `fullLen`
+  ## is updated); otherwise a fresh block is allocated and filled from the old one.
+  # Ensure s.more is a unique (rc=1) heap block with capacity >= newLen, preserving existing data.
+  # s must already be a long string (slen >= StaticSlen) on entry.
+  # After return, slen == HeapSlen (s is heap-owned).
+  let isHeap = ssLen(s) == HeapSlen
+  let cap = if isHeap: s.more.capImpl else: 0  # static literals have capImpl=0
+  if isHeap and s.more.rc == 1 and newLen <= cap:
+    s.more.fullLen = newLen
+  else:
+    # Only grow capacity when actually needed; pure COW copies (newLen <= cap)
+    # preserve the existing capacity to avoid exponential growth via repeated COW.
+    let newCap = if newLen > cap: max(newLen, resize(cap)) else: cap
+    let p = cast[ptr LongString](alloc(LongStringDataOffset + newCap + 1))
+    p.rc = 1
+    p.fullLen = newLen
+    p.capImpl = newCap
+    let old = s.more
+    # NOTE(review): this writes `oldLen + 1` bytes into a buffer sized for
+    # `newCap + 1`. The callers visible in this chunk pass newLen >= oldLen;
+    # confirm no caller shrinks a static or shared string through this proc.
+    copyMem(addr p.data[0], addr old.data[0], oldLen + 1)  # +1 preserves the '\0'
+    # The old block is released only if it was heap-owned; static blocks are left alone.
+    if isHeap and atomicSubFetch(old.rc, 1) == 0:
+      dealloc(old)
+    s.more = p
+    setSSLen(s, HeapSlen)  # mark as heap-owned (also handles static→heap promotion)
+
+proc len(s: SmallString): int {.inline.} =
+  ## Length of `s`: the slen byte itself for inline/medium strings, otherwise
+  ## (tag above `PayloadSize`) the `fullLen` stored in the block behind `more`.
+  let tag = ssLen(s)
+  result = if tag > PayloadSize: s.more.fullLen else: tag
+
+template guts(s: SmallString): (int, ptr UncheckedArray[char]) =
+  ## Yields `(length, pointer to char 0)` for `s`: for long strings (slen above
+  ## `PayloadSize`) the `fullLen` and `data` of the block behind `more`,
+  ## otherwise slen and the inline character pointer.
+  let slen = ssLen(s)
+  if slen > PayloadSize:
+    (s.more.fullLen, cast[ptr UncheckedArray[char]](addr s.more.data[0]))
+  else:
+    (slen, inlinePtr(s))
+
+proc nimStrAtV3*(s: var SmallString; i: int): char {.compilerproc, inline.} =
+  ## Reads character `i` of `s` without any bounds check.
+  if ssLen(s) > PayloadSize:
+    # long string: the characters are read from the block behind `more`
+    result = s.more.data[i]
+  else:
+    # short/medium string: the characters sit in the inline bytes overlay
+    result = inlinePtr(s)[i]
+
+proc nimStrPutV3*(s: var SmallString; i: int; c: char) {.compilerproc, inline.} =
+  ## Stores `c` at index `i` of `s`; `i` is not bounds-checked.
+  ## Short/medium strings are written in place. Long strings are first made
+  ## unique via `ensureUniqueLong`, then the block is written and, for
+  ## `i < AlwaysAvail`, the same character is also written to the inline bytes.
+  let slen = ssLen(s)
+  if slen <= PayloadSize:
+    # unchecked: when i >= 7 we store into the `more` overlay
+    inlinePtr(s)[i] = c
+    # Maintain SWAR zeroing invariant: if i < AlwaysAvail and we wrote a non-null,
+    # caller is responsible. Writing '\0' here would break content. No action needed.
+  else:
+    let l = s.more.fullLen
+    ensureUniqueLong(s, l, l)  # COW if shared; length unchanged
+    s.more.data[i] = c
+    if i < AlwaysAvail:
+      inlinePtr(s)[i] = c
+
+proc cmpInlineBytes(a, b: ptr UncheckedArray[char]; n: int): int {.inline.} =
+  ## Compares the first `n` characters of `a` and `b` in order.
+  ## Returns -1 or 1 at the first differing position; when all `n` characters
+  ## match (or `n <= 0`) the default `result` of 0 is returned.
+  for i in 0..<n:
+    let ac = a[i]
+    let bc = b[i]
+    if ac < bc: return -1
+    if ac > bc: return 1
+
+proc cmpStringPtrs(a, b: ptr SmallString): int {.inline.} =
+  ## Three-way comparison of `a[]` and `b[]`: a negative, zero or positive
+  ## result for less, equal or greater. Three cases are handled in turn:
+  ## both short, both inline/medium, and at least one long.
+  # Compare two SmallStrings by pointer to avoid struct copies in the hot path.
+  let abytes = a.bytes
+  let bbytes = b.bytes
+  let aslen = ssLenOf(abytes)
+  let bslen = ssLenOf(bbytes)
+  if aslen <= AlwaysAvail and bslen <= AlwaysAvail:
+    # SWAR path: both short (≤7 bytes). All data lives in the `bytes` field.
+    # Zeroed-padding invariant ensures bytes past the null are 0.
+    # swarKey puts char[0] in the MSB → integer comparison is lexicographic.
+    let aw = swarKey(abytes)
+    let bw = swarKey(bbytes)
+    if aw < bw: return -1
+    if aw > bw: return 1
+    return aslen - bslen
+  if aslen <= PayloadSize and bslen <= PayloadSize:
+    # Both inline/medium: all data lives in the flat struct, no heap access needed.
+    let minLen = min(aslen, bslen)
+    let pfxLen = min(minLen, AlwaysAvail)
+    result = cmpInlineBytes(inlinePtrOf(a), inlinePtrOf(b), pfxLen)
+    if result != 0: return
+    if minLen > AlwaysAvail:
+      let aInl = inlinePtrOf(a)
+      let bInl = inlinePtrOf(b)
+      result = cmpInlineBytes(
+        cast[ptr UncheckedArray[char]](addr aInl[AlwaysAvail]),
+        cast[ptr UncheckedArray[char]](addr bInl[AlwaysAvail]),
+        minLen - AlwaysAvail)
+    if result == 0: result = aslen - bslen
+    return
+  # At least one is long. Hot prefix: inlinePtr[0..AlwaysAvail-1] mirrors heap data.
+  # NOTE(review): `pfxLen` is derived from the raw slen tags (254/255 for a long
+  # string), so for a long string whose `fullLen` is below `AlwaysAvail` the
+  # prefix compare can cover mirror bytes past its real length — confirm those
+  # bytes are kept zero.
+  let pfxLen = min(min(aslen, bslen), AlwaysAvail)
+  result = cmpInlineBytes(inlinePtrOf(a), inlinePtrOf(b), pfxLen)
+  if result != 0: return
+  let la = if aslen > PayloadSize: a.more.fullLen else: aslen
+  let lb = if bslen > PayloadSize: b.more.fullLen else: bslen
+  let minLen = min(la, lb)
+  if minLen <= AlwaysAvail:
+    result = la - lb
+    return
+  # Remaining characters past the prefix: taken from the block for a long
+  # string, from the inline overlay for an inline/medium one.
+  let ap = if aslen > PayloadSize: cast[ptr UncheckedArray[char]](addr a.more.data[0]) else:
+    inlinePtrOf(a)
+  let bp = if bslen > PayloadSize: cast[ptr UncheckedArray[char]](addr b.more.data[0]) else:
+    inlinePtrOf(b)
+  result = cmpMem(addr ap[AlwaysAvail], addr bp[AlwaysAvail], minLen - AlwaysAvail)
+  if result == 0: result = la - lb
+
+proc cmp(a, b: SmallString): int {.inline.} =
+  ## Three-way comparison of two strings. Each `bytes` word is loaded once and
+  ## serves both for the slen check and for the short-string fast path.
+  let
+    wa = a.bytes
+    wb = b.bytes
+    tagA = ssLenOf(wa)
+    tagB = ssLenOf(wb)
+  if tagA > AlwaysAvail or tagB > AlwaysAvail:
+    # at least one operand is not short: use the general comparison
+    result = cmpStringPtrs(unsafeAddr a, unsafeAddr b)
+  else:
+    result = cmpShortInline(wa, wb, tagA, tagB)
+
+proc `==`(a, b: SmallString): bool {.inline.} =
+  ## Equality of two strings. Short pairs are decided by comparing the `bytes`
+  ## words; otherwise the real lengths are compared first, then the contents
+  ## (inline for two medium strings, via `cmpStringPtrs` if one is long).
+  let abytes = a.bytes
+  let bbytes = b.bytes
+  let aslen = ssLenOf(abytes)
+  let bslen = ssLenOf(bbytes)
+  if aslen <= AlwaysAvail and bslen <= AlwaysAvail:
+    return abytes == bbytes  # SWAR: slen equal, data in bytes word
+  # Compute actual lengths (sentinels 254/255 → more.fullLen)
+  let la = if aslen > PayloadSize: a.more.fullLen else: aslen
+  let lb = if bslen > PayloadSize: b.more.fullLen else: bslen
+  if la != lb: return false
+  if la == 0: return true
+  if aslen <= PayloadSize and bslen <= PayloadSize:
+    # Both medium (slen == la == lb, so byte0 equal): compare prefix word + tail
+    if abytes != bbytes: return false
+    let (_, pa) = a.guts
+    let (_, pb) = b.guts
+    # `la > AlwaysAvail` here, since the all-short case returned above.
+    return cmpMem(addr pa[AlwaysAvail], addr pb[AlwaysAvail], la - AlwaysAvail) == 0
+  # At least one long (heap or static): delegate to cmpStringPtrs
+  cmpStringPtrs(unsafeAddr a, unsafeAddr b) == 0
+
+proc continuesWith*(s, sub: SmallString; start: int): bool =
+  ## Returns true when `sub` occurs in `s` beginning at index `start`.
+  ## A negative `start` yields false; an empty `sub` yields true.
+  ##
+  ## The first `AlwaysAvail - start` characters are compared through the inline
+  ## bytes of both strings; anything beyond that is compared through `guts`.
+  if start < 0: return false
+  let subslen = ssLen(sub)
+  # Resolve the real length of `sub` before sizing the prefix. The raw slen of
+  # a long string is a sentinel (254/255); using it would make the prefix
+  # longer than `sub` whenever `fullLen < AlwaysAvail - start`, which leads to
+  # a negative length in the tail compare below.
+  let subLen = if subslen > PayloadSize: sub.more.fullLen else: subslen
+  if subLen == 0: return true
+  let sslen = ssLen(s)
+  # Compare via hot prefix first where possible (no dereference of `s.more`).
+  let pfxLen = min(subLen, max(0, AlwaysAvail - start))
+  if pfxLen > 0:
+    if cmpMem(cast[pointer](cast[uint](unsafeAddr s.bytes) + 1'u + uint(start)),
+              cast[pointer](cast[uint](unsafeAddr sub.bytes) + 1'u), pfxLen) != 0:
+      return false
+  # Fetch the actual length of `s` and compare the remaining tail via guts.
+  let sLen = if sslen > PayloadSize: s.more.fullLen else: sslen
+  if start + subLen > sLen: return false
+  # `pfxLen <= subLen` always holds here, so the tail length is never negative.
+  if pfxLen == subLen: return true
+  let (_, sp) = s.guts
+  let (_, subp) = sub.guts
+  cmpMem(addr sp[start + pfxLen], addr subp[pfxLen], subLen - pfxLen) == 0
+
+proc startsWith*(s, sub: SmallString): bool {.inline.} =
+  ## True when `s` begins with `sub`.
+  result = continuesWith(s, sub, 0)
+
+proc endsWith*(s, sub: SmallString): bool {.inline.} =
+  ## True when `s` ends with `sub`. If `sub` is longer than `s` the computed
+  ## offset is negative and `continuesWith` reports false.
+  let offset = s.len - sub.len
+  result = continuesWith(s, sub, offset)
+
+
+proc add(s: var SmallString; c: char) =
+  ## Appends the single character `c` to `s`.
+  ##
+  ## Three cases: the result still fits inline, the string moves from inline
+  ## storage to a freshly allocated LongString block, or `s` is already long.
+  let slen = ssLen(s)
+  if slen <= PayloadSize:
+    let newLen = slen + 1
+    if newLen <= PayloadSize:
+      # Still inline: write the char, re-terminate, bump the length tag.
+      let inl = inlinePtr(s)
+      inl[slen] = c
+      inl[newLen] = '\0'
+      setSSLen(s, newLen)
+    else:
+      # transition from medium (slen == PayloadSize) to long
+      let cap = newLen * 2
+      let p = cast[ptr LongString](alloc(LongStringDataOffset + cap + 1))
+      p.rc = 1
+      p.fullLen = newLen
+      p.capImpl = cap
+      # Copy the inline chars out before `s.more` is overwritten below.
+      copyMem(addr p.data[0], inlinePtr(s), slen)
+      p.data[slen] = c
+      p.data[newLen] = '\0'
+      s.more = p
+      setSSLen(s, HeapSlen)
+  else:
+    let l = s.more.fullLen  # fetch fullLen only in the long path
+    # NOTE(review): fullLen is not assigned in this branch; presumably
+    # ensureUniqueLong records the new length l + 1 — confirm.
+    ensureUniqueLong(s, l, l + 1)
+    s.more.data[l] = c
+    s.more.data[l + 1] = '\0'
+    if l < AlwaysAvail:
+      # Mirror the char into the inline prefix bytes as well.
+      inlinePtr(s)[l] = c
+
+proc add(s: var SmallString; t: SmallString) =
+  ## Appends the contents of `t` to `s`. Appending an empty `t` is a no-op.
+  ##
+  ## Mirrors the char overload: stay inline, move from inline to a new
+  ## LongString block, or extend an already long string.
+  let slen = ssLen(s)
+  let (tl, tp) = t.guts  # fetch t's guts before any mutation (aliasing safety)
+  if tl == 0: return
+  if slen <= PayloadSize:
+    let sl = slen  # for short/medium, slen IS the actual length
+    let newLen = sl + tl
+    if newLen <= PayloadSize:
+      let inl = inlinePtr(s)
+      copyMem(addr inl[sl], tp, tl)
+      inl[newLen] = '\0'
+      setSSLen(s, newLen)
+    else:
+      # transition to long
+      let cap = newLen * 2
+      let p = cast[ptr LongString](alloc(LongStringDataOffset + cap + 1))
+      p.rc = 1
+      p.fullLen = newLen
+      p.capImpl = cap
+      copyMem(addr p.data[0], inlinePtr(s), sl)
+      copyMem(addr p.data[sl], tp, tl)
+      p.data[newLen] = '\0'
+      if sl < AlwaysAvail:
+        # Fill the rest of the inline prefix bytes from the start of `t`.
+        copyMem(addr inlinePtr(s)[sl], tp, min(AlwaysAvail - sl, tl))
+      s.more = p
+      setSSLen(s, HeapSlen)
+  else:
+    let sl = s.more.fullLen  # fetch fullLen only in the long path
+    let newLen = sl + tl
+    # tp was read before ensureUniqueLong: if t.more == s.more, rc decrements but won't hit 0
+    # NOTE(review): fullLen is not assigned in this branch; presumably
+    # ensureUniqueLong records newLen — confirm.
+    ensureUniqueLong(s, sl, newLen)
+    copyMem(addr s.more.data[sl], tp, tl)
+    s.more.data[newLen] = '\0'
+    if sl < AlwaysAvail:
+      # Keep the inline prefix bytes in step with the heap data.
+      copyMem(addr inlinePtr(s)[sl], tp, min(AlwaysAvail - sl, tl))
+
+{.push overflowChecks: off, rangeChecks: off.}
+
+proc prepareAddLong(s: var SmallString; newLen: int) =
+  ## Makes sure the long string `s` owns a block with `rc == 1` and capacity of
+  ## at least `newLen`, allocating and copying when it does not.
+  # Reserve capacity for newLen in the long-string block without changing logical length.
+  let isHeap = ssLen(s) == HeapSlen
+  # A non-heap long block is treated as having no spare capacity.
+  let cap = if isHeap: s.more.capImpl else: 0
+  if isHeap and s.more.rc == 1 and newLen <= cap:
+    discard  # already unique with sufficient capacity
+  else:
+    let oldLen = s.more.fullLen
+    let newCap = max(newLen, resize(cap))
+    let p = cast[ptr LongString](alloc(LongStringDataOffset + newCap + 1))
+    p.rc = 1
+    p.fullLen = oldLen  # logical length unchanged — caller sets it after writing data
+    p.capImpl = newCap
+    let old = s.more
+    # oldLen + 1 also copies the terminating '\0'.
+    copyMem(addr p.data[0], addr old.data[0], oldLen + 1)
+    # Only heap blocks are reference counted and freed here.
+    if isHeap and atomicSubFetch(old.rc, 1) == 0:
+      dealloc(old)
+    s.more = p
+    setSSLen(s, HeapSlen)
+
+proc prepareAdd(s: var SmallString; addLen: int) {.compilerRtl.} =
+  ## Ensure s has room for addLen more characters without changing its length.
+  ##
+  ## Inline strings that would outgrow `PayloadSize` are moved into a new
+  ## LongString block; long strings are handed to `prepareAddLong`.
+  let slen = ssLen(s)
+  let curLen = if slen > PayloadSize: s.more.fullLen else: slen
+  let newLen = curLen + addLen
+  if slen <= PayloadSize:
+    if newLen > PayloadSize:
+      # transition to long: allocate, copy existing data
+      let newCap = newLen * 2
+      let p = cast[ptr LongString](alloc(LongStringDataOffset + newCap + 1))
+      p.rc = 1
+      p.fullLen = curLen
+      p.capImpl = newCap
+      # curLen + 1 includes the inline terminator.
+      copyMem(addr p.data[0], inlinePtr(s), curLen + 1)
+      s.more = p
+      setSSLen(s, HeapSlen)
+    # else: short/medium — inline capacity always sufficient (struct is fixed size)
+  else:
+    prepareAddLong(s, newLen)
+
+proc nimAddCharV1(s: var SmallString; c: char) {.compilerRtl, inline.} =
+  ## Appends `c` to `s`, handling the two common cases inline (inline string
+  ## with room, unique heap block with room) and deferring everything else to
+  ## `prepareAdd` followed by `add`.
+  let slen = ssLen(s)
+  if slen < PayloadSize:
+    # Hot path: inline/medium with room (slen+1 <= PayloadSize, no heap needed)
+    let inl = inlinePtr(s)
+    inl[slen] = c
+    inl[slen + 1] = '\0'
+    setSSLen(s, slen + 1)
+  elif slen > PayloadSize:
+    # Long string — inline the common case: unique heap block with room
+    let l = s.more.fullLen
+    if slen == HeapSlen and s.more.rc == 1 and l < s.more.capImpl:
+      s.more.data[l] = c
+      s.more.data[l + 1] = '\0'
+      s.more.fullLen = l + 1
+      if l < AlwaysAvail:
+        # Mirror the char into the inline prefix bytes.
+        inlinePtr(s)[l] = c
+    else:
+      # Shared, static, or full block: make room first, then append.
+      prepareAdd(s, 1)
+      s.add(c)
+  else:
+    # slen == PayloadSize: medium→long transition (rare)
+    prepareAdd(s, 1)
+    s.add(c)
+
+proc toNimStr(str: cstring; len: int): SmallString {.compilerproc.} =
+  ## Builds a SmallString from the first `len` bytes at `str`.
+  ## A non-positive `len` yields the default (zero-initialized) result.
+  if len <= 0: return
+  if len <= PayloadSize:
+    setSSLen(result, len)
+    let inl = inlinePtr(result)
+    copyMem(inl, str, len)
+    inl[len] = '\0'
+    # Bytes past inl[len] in `bytes` must be zero for SWAR. `result` is zero-initialized,
+    # and copyMem only fills bytes 0..len-1 of inl; bytes len..6 remain zero.
+  else:
+    # Too long for inline storage: allocate a block sized exactly for `len`.
+    let p = cast[ptr LongString](alloc(LongStringDataOffset + len + 1))
+    p.rc = 1
+    p.fullLen = len
+    p.capImpl = len
+    copyMem(addr p.data[0], str, len)
+    p.data[len] = '\0'
+    # Also keep the first AlwaysAvail chars in the inline prefix bytes.
+    copyMem(inlinePtr(result), str, AlwaysAvail)
+    setSSLen(result, HeapSlen)
+    result.more = p
+
+proc cstrToNimstr(str: cstring): SmallString {.compilerRtl.} =
+  ## Converts a NUL-terminated C string into a SmallString.
+  ## A nil pointer produces the default (zero-initialized) result.
+  if str != nil:
+    result = toNimStr(str, str.len)
+
+proc nimToCStringConv(s: var SmallString): cstring {.compilerproc, nonReloadable, inline.} =
+  ## Yields a C string view of `s` without copying.
+  ## `s` is taken by `var` so that the address of its inline chars stays valid
+  ## for the caller.
+  if ssLen(s) <= PayloadSize:
+    result = cast[cstring](inlinePtr(s))
+  else:
+    result = cast[cstring](addr s.more.data[0])
+
+proc appendString(dest: var SmallString; src: SmallString) {.compilerproc, inline.} =
+  ## Compiler entry point: appends `src` to `dest`.
+  add(dest, src)
+
+proc appendChar(dest: var SmallString; c: char) {.compilerproc, inline.} =
+  ## Compiler entry point: appends the character `c` to `dest`.
+  add(dest, c)
+
+proc rawNewString(space: int): SmallString {.compilerproc.} =
+  ## Creates an empty SmallString able to hold `space` chars (newStringOfCap).
+  ## Requests that fit inline (including non-positive ones) need no allocation
+  ## and return the zero-initialized result.
+  if space > PayloadSize:
+    let blk = cast[ptr LongString](alloc(LongStringDataOffset + space + 1))
+    blk.rc = 1
+    blk.fullLen = 0
+    blk.capImpl = space
+    blk.data[0] = '\0'
+    result.more = blk
+    setSSLen(result, HeapSlen)
+
+proc mnewString(len: int): SmallString {.compilerproc.} =
+  ## Creates a SmallString consisting of `len` zero characters (newString).
+  if len <= 0:
+    discard  # zero-initialized result already is the empty string
+  elif len <= PayloadSize:
+    # `result` starts out all zero, so the chars and the terminator are
+    # already in place; only the length tag has to be stored.
+    setSSLen(result, len)
+  else:
+    # alloc0 delivers zeroed data, which covers the chars and data[len].
+    let blk = cast[ptr LongString](alloc0(LongStringDataOffset + len + 1))
+    blk.rc = 1
+    blk.fullLen = len
+    blk.capImpl = len
+    result.more = blk
+    setSSLen(result, HeapSlen)
+
+proc ssTruncatePadding(s: var SmallString; newLen: int) {.inline.} =
+  ## Stores `newLen` as the inline length tag and zeroes every char slot of
+  ## `s.bytes` past `newLen`, keeping the padding zero for word-wise (SWAR)
+  ## comparison. Requires `0 < newLen < AlwaysAvail`.
+  when system.cpuEndian == littleEndian:
+    # LE: slen in bits 0-7; keep bits 0..(newLen+1)*8-1, clear the rest above.
+    let keepBits = (newLen + 1) * 8
+    let charMask = ((uint(1) shl keepBits) - 1'u) and not 0xFF'u
+    s.bytes = (s.bytes and charMask) or uint(newLen)
+  else:
+    # BE: slen in the top byte; keep top (newLen+1) bytes, zero the rest below.
+    let discardBits = (AlwaysAvail - newLen) * 8
+    let slenBit = 8 * (sizeof(uint) - 1)
+    let charMask = not ((uint(1) shl discardBits) - 1'u) and not (0xFF'u shl slenBit)
+    s.bytes = (s.bytes and charMask) or (uint(newLen) shl slenBit)
+
+proc setLengthStrV2(s: var SmallString; newLen: int) {.compilerRtl.} =
+  ## Sets the length of s to newLen, zeroing new bytes on growth.
+  ##
+  ## A non-positive `newLen` empties the string. Long strings whose new length
+  ## fits inline are moved back to inline storage.
+  let slen = ssLen(s)
+  let curLen = if slen > PayloadSize: s.more.fullLen else: slen
+  if newLen == curLen: return
+  if newLen <= 0:
+    if slen > PayloadSize:
+      if slen == HeapSlen and s.more.rc == 1:
+        # Unique heap block: keep it (and its capacity), just reset the length.
+        s.more.fullLen = 0
+        s.more.data[0] = '\0'
+      else:
+        # shared or static block: detach and go back to empty inline
+        nimDestroyStrV1(s)
+        s.bytes = 0  # slen=0, all inline chars zeroed
+    else:
+      s.bytes = 0  # slen=0, all inline chars zeroed (SWAR safe)
+    return
+  if slen <= PayloadSize:
+    if newLen <= PayloadSize:
+      let inl = inlinePtr(s)
+      if newLen > curLen:
+        zeroMem(addr inl[curLen], newLen - curLen)
+        inl[newLen] = '\0'
+        setSSLen(s, newLen)
+      else:
+        # Shrink: zero out padding bytes for SWAR invariant.
+        inl[newLen] = '\0'
+        if newLen < AlwaysAvail:
+          ssTruncatePadding(s, newLen)
+        else:
+          setSSLen(s, newLen)
+    else:
+      # grow into long
+      let newCap = resize(newLen)
+      let p = cast[ptr LongString](alloc0(LongStringDataOffset + newCap + 1))
+      p.rc = 1
+      p.fullLen = newLen
+      p.capImpl = newCap
+      copyMem(addr p.data[0], inlinePtr(s), curLen)
+      # bytes [curLen..newLen] zeroed by alloc0; p.data[newLen] = '\0' by alloc0
+      s.more = p
+      setSSLen(s, HeapSlen)
+  else:
+    # currently long
+    if newLen <= PayloadSize:
+      # Move back to inline storage. A long block may hold fewer than newLen
+      # chars (e.g. reserved capacity with a small length), so only the chars
+      # that really exist are copied and any growth is zero-filled instead of
+      # exposing whatever bytes the heap block happens to contain.
+      let old = s.more
+      let inl = inlinePtr(s)
+      let keep = min(curLen, newLen)
+      copyMem(inl, addr old.data[0], keep)
+      if newLen > keep:
+        zeroMem(addr inl[keep], newLen - keep)
+      inl[newLen] = '\0'
+      if slen == HeapSlen and atomicSubFetch(old.rc, 1) == 0:
+        dealloc(old)
+      # Zero padding bytes in `bytes` for SWAR invariant
+      if newLen < AlwaysAvail:
+        ssTruncatePadding(s, newLen)
+      else:
+        setSSLen(s, newLen)
+    else:
+      ensureUniqueLong(s, curLen, newLen)
+      if newLen > curLen:
+        zeroMem(addr s.more.data[curLen], newLen - curLen)
+      s.more.data[newLen] = '\0'
+      s.more.fullLen = newLen
+
+proc nimAsgnStrV2(a: var SmallString; b: SmallString) {.compilerRtl, inline.} =
+  ## Assigns `b` to `a`.
+  ##
+  ## Inline strings are copied bytewise. Long strings are not duplicated: `a`
+  ## ends up pointing at the same block as `b`, and for heap blocks the
+  ## reference count is incremented.
+  if ssLen(b) <= PayloadSize:
+    nimDestroyStrV1(a)  # free any existing heap block before overwriting
+    copyMem(addr a, unsafeAddr b, sizeof(SmallString))
+  else:
+    # Self-assignment must not destroy the block that is about to be shared.
+    if addr(a) == unsafeAddr(b): return
+    nimDestroyStrV1(a)
+    # COW: share the block, bump refcount — no allocation needed (static literals: no bump)
+    if ssLenOf(b.bytes) == HeapSlen:
+      discard atomicAddFetch(b.more.rc, 1)
+    copyMem(addr a, unsafeAddr b, sizeof(SmallString))
+
+proc nimPrepareStrMutationImpl(s: var SmallString) =
+  ## Replaces the block referenced by `s.more` with a freshly allocated copy
+  ## (rc = 1, capacity equal to the current length) and tags `s` as heap-owned.
+  ## The previous block is neither released nor has its refcount touched here.
+  # Called when s holds a static (slen=StaticSlen) LongString block. COW: allocate fresh copy.
+  let old = s.more
+  let oldLen = old.fullLen
+  let p = cast[ptr LongString](alloc(LongStringDataOffset + oldLen + 1))
+  p.rc = 1
+  p.fullLen = oldLen
+  p.capImpl = oldLen
+  # oldLen + 1 also copies the terminating '\0'.
+  copyMem(addr p.data[0], addr old.data[0], oldLen + 1)
+  s.more = p
+  setSSLen(s, HeapSlen)  # promote from static to heap-owned
+
+proc nimPrepareStrMutationV2(s: var SmallString) {.compilerRtl, inline.} =
+  ## Makes the storage of `s` safe to write through a raw reference.
+  ##
+  ## Static blocks are copied to the heap. Heap blocks that are shared with
+  ## another string (rc != 1, which assignment produces by bumping the
+  ## refcount instead of copying) are detached as well, so that an in-place
+  ## write cannot alter the other owners. Inline strings need no preparation.
+  let slen = ssLen(s)
+  if slen == StaticSlen:
+    nimPrepareStrMutationImpl(s)
+  elif slen == HeapSlen and s.more.rc != 1:
+    # Length stays the same; only uniqueness of the block is required.
+    let l = s.more.fullLen
+    ensureUniqueLong(s, l, l)
+
+proc prepareMutation*(s: var string) {.inline.} =
+  ## Public wrapper: reinterprets `s` as a SmallString and forwards to
+  ## `nimPrepareStrMutationV2`.
+  # The cast hides the side effect so this proc can be used from `func` contexts.
+  {.cast(noSideEffect).}:
+    nimPrepareStrMutationV2(cast[ptr SmallString](addr s)[])
+
+proc nimStrAtMutV3*(s: var SmallString; i: int): var char {.compilerproc, inline.} =
+  ## Gives mutable access to the char at index `i`.
+  ## Emitted by the codegen when `s[i]` is passed as a `var char` argument.
+  ## Long strings are first run through `nimPrepareStrMutationV2`; inline
+  ## strings are addressed directly. No bounds check is performed here.
+  if ssLen(s) <= PayloadSize:
+    result = inlinePtr(s)[i]
+  else:
+    nimPrepareStrMutationV2(s)
+    result = s.more.data[i]
+
+proc nimAddStrV1(s: var SmallString; src: SmallString) {.compilerRtl, inline.} =
+  ## Runtime entry point: appends `src` to `s`.
+  add(s, src)
+
+func capacity*(self: SmallString): int {.inline.} =
+  ## Number of chars the string can hold in its current storage.
+  ## Static blocks report their length (they have no spare room), heap blocks
+  ## report their recorded capacity, and everything else reports `PayloadSize`.
+  let tag = ssLen(self)
+  if tag == StaticSlen:
+    result = self.more.fullLen
+  elif tag == HeapSlen:
+    result = self.more.capImpl
+  else:
+    result = PayloadSize
+
+proc nimStrLen(s: SmallString): int {.compilerproc, inline.} =
+  ## Length of `s`; the codegen calls this for `mLen` on strings with -d:nimsso.
+  result = len(s)
+
+proc nimStrData(s: var SmallString): ptr UncheckedArray[char] {.compilerproc, inline.} =
+  ## Pointer to the first char of `s`; the codegen uses it for subscript and
+  ## slice operations with -d:nimsso.
+  if ssLen(s) <= PayloadSize:
+    result = inlinePtr(s)
+  else:
+    result = cast[ptr UncheckedArray[char]](addr s.more.data[0])
+
+const
+  # Compile-time marker for the presence of `newStringUninitImpl` below.
+  # NOTE(review): presumably tested by other system modules via `declared` — confirm.
+  newStringUninitWasDeclared = true
+
+proc newStringUninitImpl(len: Natural): string {.noSideEffect, inline.} =
+  ## Returns a new string of length `len` but with uninitialized content.
+  ## One needs to fill the string character after character
+  ## with the index operator `s[i]`.
+  ##
+  ## This procedure exists only for optimization purposes;
+  ## the same effect can be achieved with the `&` operator or with `add`.
+  when nimvm:
+    # Compile-time evaluation has no raw storage to poke at; use newString.
+    result = newString(len)
+  else:
+    result = newStringOfCap(len)  # rawNewString: alloc (not alloc0) for long strings
+    {.cast(noSideEffect).}:
+      if len > 0:
+        # Reinterpret the string as its SmallString representation to set
+        # the length directly.
+        let s = cast[ptr SmallString](addr result)
+        if len <= PayloadSize:
+          setSSLen(s[], len)
+          # Null-terminate; bytes [0..len-1] left uninitialized for caller to fill.
+          inlinePtr(s[])[len] = '\0'
+        else:
+          # rawNewString allocated with alloc (not alloc0), so data[0..len-1] is
+          # intentionally uninitialized. Caller fills it and calls completeStore.
+          s.more.fullLen = len
+          s.more.data[len] = '\0'
+
+proc completeStore(s: var SmallString) {.compilerproc, inline.} =
+  ## Call this once bulk data has been written straight into the string buffer
+  ## through a raw pointer from `nimStrData`/`nimStrAtMutV3` (e.g. `readBuffer`,
+  ## `moveMem`, `copyMem`).
+  ##
+  ## For long strings it refreshes the inline prefix by copying
+  ## `more.data[0..AlwaysAvail-1]` into the inline bytes, so `cmp`/`==` can
+  ## look at the first few chars without dereferencing the heap block.
+  ## Inline strings are left untouched.
+  if ssLen(s) <= PayloadSize:
+    return
+  copyMem(inlinePtr(s), addr s.more.data[0], AlwaysAvail)
+
+proc completeStore*(s: var string) {.inline.} =
+  ## Public `string` overload: reinterprets `s` as a SmallString and forwards
+  ## to the SmallString version of `completeStore`.
+  completeStore(cast[ptr SmallString](addr s)[])
+
+proc beginStore*(s: var string; ensuredLen: int; start = 0): ptr UncheckedArray[char] {.inline, noSideEffect, raises: [], tags: [].} =
+  ## Prepares `s` for a bulk write of `ensuredLen` bytes starting at `start`.
+  ## The caller must ensure `s.len >= start + ensuredLen` (e.g. via `newString` or `setLen`).
+  ## Call `endStore(s)` afterwards to sync the inline cache.
+  ##
+  ## Returns a pointer to the char at index `start`. `ensuredLen` itself is
+  ## not read by this proc.
+  {.cast(noSideEffect).}:
+    let ss = cast[ptr SmallString](addr s)
+    let slen = ssLen(ss[])
+    if slen > PayloadSize:
+      # Same old and new length: only a private block is requested, no resize.
+      ensureUniqueLong(ss[], ss[].more.fullLen, ss[].more.fullLen)
+      result = cast[ptr UncheckedArray[char]](addr ss[].more.data[start])
+    else:
+      # Inline storage: offset the inline char pointer by `start`.
+      result = cast[ptr UncheckedArray[char]](cast[uint](inlinePtr(ss[])) + uint(start))
+
+proc endStore*(s: var string) {.inline, noSideEffect, raises: [], tags: [].} =
+  ## Counterpart of `beginStore`: refreshes the inline cache after bulk writes.
+  ## Does nothing for short/medium strings.
+  {.cast(noSideEffect).}:
+    let ss = cast[ptr SmallString](addr s)
+    completeStore(ss[])
+
+proc rawDataImpl(ss: ptr SmallString; start: int): ptr UncheckedArray[char] {.inline, noSideEffect, raises: [].} =
+  ## Pointer to the char at index `start` of `ss[]`, or nil when the string
+  ## is empty. `start` is not range-checked.
+  let tag = ssLen(ss[])
+  let isLong = tag > PayloadSize
+  let actualLen = if isLong: ss[].more.fullLen else: tag
+  if actualLen == 0:
+    result = nil
+  elif isLong:
+    result = cast[ptr UncheckedArray[char]](addr ss[].more.data[start])
+  else:
+    result = cast[ptr UncheckedArray[char]](cast[uint](inlinePtr(ss[])) + uint(start))
+
+template readRawData*(s: string; start = 0): ptr UncheckedArray[char] =
+  ## Returns a pointer to `s[start]` for read-only raw access.
+  ## Template ensures no copy of `s` is made; ptr is valid while `s` is alive.
+  ## Forwards to `rawDataImpl`, which returns nil for an empty string.
+  rawDataImpl(cast[ptr SmallString](unsafeAddr s), start)
+
+# The procs below are declared on `string` (tyString) so that the codegen binds
+# to them directly rather than to strmantle.nim's versions, which go through
+# the nimStrLen/nimStrAtMutV3 compilerproc calls.
+proc cmpStrings(a, b: string): int {.compilerproc, inline.} =
+  ## Three-way comparison of two strings via their SmallString representation.
+  let pa = cast[ptr SmallString](unsafeAddr a)
+  let pb = cast[ptr SmallString](unsafeAddr b)
+  result = cmpStringPtrs(pa, pb)
+
+proc eqStrings(a, b: string): bool {.compilerproc, inline.} =
+  ## Equality of two strings via SmallString `==`.
+  let pa = cast[ptr SmallString](unsafeAddr a)
+  let pb = cast[ptr SmallString](unsafeAddr b)
+  result = pa[] == pb[]
+
+proc leStrings(a, b: string): bool {.compilerproc, inline.} =
+  ## `a <= b` in terms of `cmpStrings`.
+  let order = cmpStrings(a, b)
+  result = order <= 0
+
+proc ltStrings(a, b: string): bool {.compilerproc, inline.} =
+  ## `a < b` in terms of `cmpStrings`.
+  let order = cmpStrings(a, b)
+  result = order < 0
+
+proc hashString(s: string): int {.compilerproc.} =
+  ## Hashes the bytes of `s` with a shift/add/xor mix per byte followed by a
+  ## final three-step scramble (the shape of Jenkins' one-at-a-time hash).
+  ## All arithmetic is on `uint`; the result is the bit pattern cast to `int`.
+  let ss = cast[ptr SmallString](unsafeAddr s)[]
+  let (L, data) = ss.guts
+  var h = 0'u
+  for i in 0..<L:
+    # Per-byte mixing step.
+    h = h + uint(data[i])
+    h = h + h shl 10
+    h = h xor (h shr 6)
+  # Final scramble.
+  h = h + h shl 3
+  h = h xor (h shr 11)
+  h = h + h shl 15
+  result = cast[int](h)
+
+{.pop.}
--- a/tests/benchmarks/strings/cmpbench.nim
+++ b/tests/benchmarks/strings/cmpbench.nim
@@ -0,0 +1,261 @@
+import std/[monotimes, os, random, strutils, times]
+
+const
+  # Upper length bound of the "short" scenario.
+  # NOTE(review): presumably mirrors the runtime's AlwaysAvail — confirm.
+  AlwaysAvail = 7
+  # Longest length the generators below treat as inline; depends on pointer size.
+  InlineMax = AlwaysAvail + sizeof(pointer) - 1
+  # Characters random strings are drawn from.
+  Alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-"
+  # Leading text used by the scPrefix scenario.
+  SharedPrefixes = [
+    "module/submodule/symbol/",
+    "compiler/semantic/checker/",
+    "core/runtime/string-table/",
+    "aaaaaaaaaaaaaa/shared/prefix/",
+    "zzzzzzzzzzzzzz/shared/prefix/"
+  ]
+  # Display names, indexed by the ordinal of `Scenario`.
+  ScenarioNames = ["short", "inline", "boundary", "long", "prefix", "mixed"]
+
+type
+  # Workload kinds; each selects a length distribution in makeScenarioString.
+  Scenario = enum
+    scShort
+    scInline
+    scBoundary
+    scLong
+    scPrefix
+    scMixed
+
+  # One comparison: `a` is compared against `b`.
+  Pair = tuple[a, b: string]
+
+  # Settings parsed from the command line.
+  Config = object
+    count: int                 # strings (and pairs) generated per scenario
+    rounds: int                # timed passes over the pairs
+    seed: int64                # base seed for the random generator
+    scenarios: seq[Scenario]   # scenarios to run, in order
+
+proc defaultConfig(): Config =
+  ## Settings used when no command-line option overrides them.
+  result.count = 400_000
+  result.rounds = 8
+  result.seed = 20260307'i64
+  result.scenarios = @[scShort, scInline, scBoundary, scLong, scMixed]
+
+proc usage() =
+  ## Prints the command-line help to stdout.
+  const helpLines = [
+    "String comparison benchmark for experimenting with the SSO runtime.",
+    "",
+    "Usage:",
+    "  nim r -d:danger cmpbench.nim [--count=N] [--rounds=N] [--seed=N]",
+    "                                [--scenarios=list]",
+    "",
+    "Scenarios:",
+    "  short, inline, boundary, long, prefix, mixed",
+    ""
+  ]
+  for line in helpLines:
+    echo line
+  echo "Current inline limit on this target: ", InlineMax, " bytes"
+
+proc parseScenario(name: string): Scenario =
+  ## Maps a scenario name (compared after `normalize`) to its enum value.
+  ## Unknown names terminate the program with an error message.
+  let key = normalize(name)
+  result =
+    case key
+    of "short": scShort
+    of "inline": scInline
+    of "boundary": scBoundary
+    of "long": scLong
+    of "prefix": scPrefix
+    of "mixed": scMixed
+    else: quit "unknown scenario: " & name
+
+proc parseConfig(): Config =
+  ## Builds the configuration from the command line on top of `defaultConfig()`.
+  ## `--help`/`-h` prints the usage text and exits with status 0; unknown
+  ## arguments and non-positive or empty settings quit with a message.
+  const
+    countOpt = "--count="
+    roundsOpt = "--rounds="
+    seedOpt = "--seed="
+    scenariosOpt = "--scenarios="
+  result = defaultConfig()
+  for arg in commandLineParams():
+    if arg in ["--help", "-h"]:
+      usage()
+      quit 0
+    elif arg.startsWith(countOpt):
+      result.count = parseInt(arg.substr(countOpt.len))
+    elif arg.startsWith(roundsOpt):
+      result.rounds = parseInt(arg.substr(roundsOpt.len))
+    elif arg.startsWith(seedOpt):
+      result.seed = parseInt(arg.substr(seedOpt.len)).int64
+    elif arg.startsWith(scenariosOpt):
+      # An explicit list replaces the defaults; empty items are ignored.
+      result.scenarios.setLen(0)
+      for item in arg.substr(scenariosOpt.len).split(','):
+        if item.len == 0: continue
+        result.scenarios.add parseScenario(item)
+    else:
+      quit "unknown argument: " & arg
+
+  if result.count <= 0: quit "--count must be > 0"
+  if result.rounds <= 0: quit "--rounds must be > 0"
+  if result.scenarios.len == 0: quit "at least one scenario is required"
+
+proc scenarioName(s: Scenario): string =
+  ## Display name of a scenario, looked up by its ordinal.
+  result = ScenarioNames[ord(s)]
+
+proc scenarioList(scenarios: openArray[Scenario]): string =
+  ## Comma-separated display names of `scenarios`, in order.
+  var needSeparator = false
+  for scenario in scenarios:
+    if needSeparator:
+      result.add ','
+    result.add scenarioName(scenario)
+    needSeparator = true
+
+proc fixed(x: float; digits: range[0..32]): string =
+  ## Formats `x` in decimal notation with exactly `digits` fractional digits.
+  result = x.formatFloat(format = ffDecimal, precision = digits)
+
+proc randomChar(rng: var Rand): char =
+  ## Draws one character from `Alphabet` using `rng`.
+  let idx = rng.rand(Alphabet.high)
+  result = Alphabet[idx]
+
+proc makeRandomString(rng: var Rand; len: int; prefix = ""): string =
+  ## Returns a string of `len` chars: as much of `prefix` as fits, followed by
+  ## random characters from `randomChar`.
+  result = newString(len)
+  let copied = min(len, prefix.len)
+  for i in 0 ..< copied:
+    result[i] = prefix[i]
+  for i in copied ..< len:
+    result[i] = randomChar(rng)
+
+proc pickMixedLength(rng: var Rand): int =
+  ## Picks a length for the mixed scenario: 35% up to `AlwaysAvail`, 35% up to
+  ## `InlineMax`, and the remaining 30% beyond `InlineMax`.
+  let bucket = rng.rand(0..99)
+  let lengths =
+    if bucket < 35: 1 .. AlwaysAvail
+    elif bucket < 70: AlwaysAvail + 1 .. InlineMax
+    else: InlineMax + 1 .. InlineMax + 48
+  result = rng.rand(lengths)
+
+proc makeScenarioString(rng: var Rand; kind: Scenario; serial: int): string =
+  ## Generates one string for scenario `kind`. `serial` is only used by the
+  ## prefix scenario, where it determines the final character.
+  case kind
+  of scShort:
+    let n = rng.rand(1..AlwaysAvail)
+    result = makeRandomString(rng, n)
+  of scInline:
+    let n = rng.rand(AlwaysAvail + 1 .. InlineMax)
+    result = makeRandomString(rng, n)
+  of scBoundary:
+    # Lengths clustered around the inline limit.
+    let choices = [
+      max(1, InlineMax - 2),
+      max(1, InlineMax - 1),
+      InlineMax,
+      InlineMax + 1,
+      InlineMax + 2
+    ]
+    let n = choices[rng.rand(choices.high)]
+    result = makeRandomString(rng, n)
+  of scLong:
+    let n = rng.rand(InlineMax + 1 .. InlineMax + 64)
+    result = makeRandomString(rng, n)
+  of scPrefix:
+    let prefix = SharedPrefixes[rng.rand(SharedPrefixes.high)]
+    let suffixLen = rng.rand(4..24)
+    result = makeRandomString(rng, prefix.len + suffixLen, prefix)
+    if result.len > 0:
+      # Keep the shared-prefix workload adversarial on purpose.
+      result[^1] = char(ord('0') + (serial mod 10))
+  of scMixed:
+    let n = pickMixedLength(rng)
+    result = makeRandomString(rng, n)
+
+proc generateDataset(kind: Scenario; count: int; seed: int64): seq[string] =
+  ## Produces `count` strings for `kind`. The generator seed is derived from
+  ## `seed` and the scenario ordinal, so each scenario gets its own stream.
+  let scenarioSeed = seed + int64(ord(kind)) * 10_000_019'i64
+  var rng = initRand(scenarioSeed)
+  result = newSeq[string](count)
+  for i, slot in result.mpairs:
+    slot = makeScenarioString(rng, kind, i)
+
+proc tweakTail(s: string; salt: int): string =
+  ## Returns a copy of `s` whose last character is replaced by a lowercase
+  ## letter chosen from `salt`; an empty `s` yields "x".
+  if s.len == 0:
+    return "x"
+  result = s
+  result[result.high] = char(ord('a') + (salt mod 26))
+
+proc buildPairs(kind: Scenario; data: openArray[string]): seq[Pair] =
+  ## Pairs every string in `data` with a partner to compare against.
+  ##
+  ## The partner is chosen by the element index: the string itself, a copy with
+  ## a modified last char, the next element, or a pseudo-randomly selected
+  ## element. The prefix scenario cycles through these every 4 items; all other
+  ## scenarios use a 10-item cycle dominated by unrelated partners.
+  let n = max(1, data.len)
+  result = newSeq[Pair](data.len)
+  for i, a in data:
+    let
+      j = (i * 48271 + 17) mod n
+      k = (i * 69621 + 91) mod n
+      next = (i + 1) mod n
+    let b =
+      if kind == scPrefix:
+        case i mod 4
+        of 0: data[j]
+        of 1: a
+        of 2: tweakTail(a, i)
+        else: data[next]
+      else:
+        # Default workload: mostly unrelated words, with a small minority of harder cases.
+        case i mod 10
+        of 0: a
+        of 1: tweakTail(a, i)
+        of 2: data[next]
+        else: data[if j == i: k else: j]
+    result[i] = (a, b)
+
+proc averageLen(data: openArray[string]): float =
+  ## Mean length of the strings in `data`; 0.0 for an empty input.
+  var sum = 0
+  for item in data:
+    sum += item.len
+  let divisor = max(1, data.len)
+  result = sum.float / divisor.float
+
+proc pairChecksum(pairs: openArray[Pair]): uint64 =
+  ## Folds the lengths, the first char of `a`, and the last char of `b` of every
+  ## pair into a single 64-bit value.
+  const multiplier = 0x9E3779B185EBCA87'u64
+  for i, pair in pairs:
+    let lens = uint64(pair.a.len + pair.b.len)
+    result = result * multiplier + lens
+    if pair.a.len > 0:
+      let head = uint64(ord(pair.a[0]))
+      result = result xor (head shl (i and 7))
+    if pair.b.len > 0:
+      let tail = uint64(ord(pair.b[^1]))
+      result = result xor (tail shl ((i + 3) and 7))
+
+proc bench(kind: Scenario; cfg: Config) =
+  ## Runs one scenario: generates the data set and pairs, performs an untimed
+  ## warm-up pass, then times `cfg.rounds` passes of `system.cmp` over all pairs
+  ## and prints one summary line.
+  let
+    data = generateDataset(kind, cfg.count, cfg.seed)
+    workload = buildPairs(kind, data)
+    avgLen = averageLen(data)
+
+  # Warm-up pass; its sum also feeds the printed checksum.
+  var warm = 0
+  for item in workload:
+    warm += system.cmp(item.a, item.b)
+
+  var
+    totalNs = 0.0
+    bestNs = Inf
+    worstNs = 0.0
+    combined = uint64(cast[uint](warm)) xor pairChecksum(workload)
+
+  for round in 0..<cfg.rounds:
+    var acc = 0
+    let t0 = getMonoTime()
+    for item in workload:
+      acc += system.cmp(item.a, item.b)
+    let elapsedNs = float((getMonoTime() - t0).inNanoseconds)
+    totalNs += elapsedNs
+    bestNs = min(bestNs, elapsedNs)
+    worstNs = max(worstNs, elapsedNs)
+    # Fold the comparison results in so the timed loop has an observable effect.
+    combined = combined * 0x9E3779B185EBCA87'u64 + uint64(cast[uint](acc)) + uint64(round + 1)
+
+  let avgNs = totalNs / cfg.rounds.float
+  let nsPerCmp = avgNs / workload.len.float
+  echo align(scenarioName(kind), 8), "  n=", align($workload.len, 8),
+      "  avgLen=", align(fixed(avgLen, 1), 6),
+      "  avg=", align(fixed(avgNs / 1e6, 3), 9), " ms",
+      "  best=", align(fixed(bestNs / 1e6, 3), 9), " ms",
+      "  worst=", align(fixed(worstNs / 1e6, 3), 9), " ms",
+      "  ns/cmp=", align(fixed(nsPerCmp, 1), 8),
+      "  check=0x", toHex(combined, 16)
+
+proc main() =
+  ## Entry point: prints the effective settings, then benchmarks every
+  ## selected scenario in order.
+  let cfg = parseConfig()
+  echo "inline limit=", InlineMax, " bytes  count=", cfg.count,
+      "  rounds=", cfg.rounds, "  seed=", cfg.seed
+  echo "scenarios=", scenarioList(cfg.scenarios)
+  for kind in cfg.scenarios:
+    bench(kind, cfg)
+  when not defined(useMalloc):
+    echo "MAXMEM=", formatSize getMaxMem()
+
+when isMainModule:
+  main()
--- a/tests/benchmarks/strings/csvbench.nim
+++ b/tests/benchmarks/strings/csvbench.nim
@@ -0,0 +1,171 @@
+import std/[monotimes, os, parsecsv, random, strutils, times]
+
+const
+  # Building blocks for the generated `name` column (see makeName).
+  FirstNames = [
+    "amy", "ben", "chris", "dora", "ella", "finn", "gina", "hugo",
+    "ivan", "june", "kyle", "lena", "mona", "nina", "owen", "paul"
+  ]
+  LastNames = [
+    "li", "ng", "kim", "ross", "miles", "stone", "young", "ward",
+    "reed", "clark", "hall", "price", "woods", "perry", "cohen", "moore"
+  ]
+
+type
+  # One parsed CSV record; every column is kept as a string.
+  StoredRow = object
+    id: string
+    name: string
+    age: string
+    score: string
+    visits: string
+    zip: string
+    timestamp: string
+    url: string
+
+  # Settings parsed from the command line.
+  Config = object
+    rows: int      # data rows written to the generated CSV file
+    rounds: int    # timed parse passes
+    seed: int64    # seed for the random generator
+
+proc defaultConfig(): Config =
+  ## Settings used when no command-line option overrides them.
+  result.rows = 100_000
+  result.rounds = 4
+  result.seed = 20260307'i64
+
+proc usage() =
+  ## Prints the command-line help to stdout.
+  const helpLines = [
+    "CSV parse/materialize benchmark for experimenting with the SSO runtime.",
+    "",
+    "Usage:",
+    "  nim r -d:danger csvbench.nim [--rows=N] [--rounds=N] [--seed=N]"
+  ]
+  for line in helpLines:
+    echo line
+
+proc parseConfig(): Config =
+  ## Builds the configuration from the command line on top of `defaultConfig()`.
+  ## `--help`/`-h` prints the usage text and exits with status 0; unknown
+  ## arguments and non-positive settings quit with a message.
+  const
+    rowsOpt = "--rows="
+    roundsOpt = "--rounds="
+    seedOpt = "--seed="
+  result = defaultConfig()
+  for arg in commandLineParams():
+    if arg in ["--help", "-h"]:
+      usage()
+      quit 0
+    elif arg.startsWith(rowsOpt):
+      result.rows = parseInt(arg.substr(rowsOpt.len))
+    elif arg.startsWith(roundsOpt):
+      result.rounds = parseInt(arg.substr(roundsOpt.len))
+    elif arg.startsWith(seedOpt):
+      result.seed = parseInt(arg.substr(seedOpt.len)).int64
+    else:
+      quit "unknown argument: " & arg
+  if result.rows <= 0: quit "--rows must be > 0"
+  if result.rounds <= 0: quit "--rounds must be > 0"
+
+proc fixed(x: float; digits: range[0..32]): string =
+  ## Formats `x` in decimal notation with exactly `digits` fractional digits.
+  result = x.formatFloat(format = ffDecimal, precision = digits)
+
+proc makeName(rng: var Rand; serial: int): string =
+  ## Builds "<first>_<last>": the first name is drawn at random, the last name
+  ## index combines `serial` with a random offset.
+  let first = FirstNames[rng.rand(FirstNames.high)]
+  let lastIdx = (serial + rng.rand(LastNames.high)) mod LastNames.len
+  result = first & "_" & LastNames[lastIdx]
+
+proc makeUrl(name: string; serial: int; score: int): string =
+  ## Builds the URL column value from the row's name, serial number and score.
+  result = "https://data.example/api/u/"
+  result.add name
+  result.add "/"
+  result.add $serial
+  result.add "?score="
+  result.add $score
+  result.add "&src=csv"
+
+proc csvPath(cfg: Config): string =
+  ## Location of the generated CSV file inside the temp directory; the file
+  ## name encodes the row count and the seed.
+  let fileName = "nim_csvbench_" & $cfg.rows & "_" & $cfg.seed & ".csv"
+  result = getTempDir() / fileName
+
+proc writeCsv(path: string; cfg: Config) =
+  ## Writes the benchmark input: a header line followed by `cfg.rows` records
+  ## with eight comma-separated columns, generated from `cfg.seed`.
+  var rng = initRand(cfg.seed)
+  var f = open(path, fmWrite)
+  defer: close(f)
+
+  f.writeLine("id,name,age,score,visits,zip,timestamp,url")
+  for i in 0..<cfg.rows:
+    # The order of these draws determines the generated data; keep it stable.
+    let
+      name = makeName(rng, i)
+      age = 18 + (i mod 63)
+      score = 1000 + rng.rand(0..900_000)
+      visits = rng.rand(0..20_000)
+      zip = 10000 + rng.rand(0..89999)
+      ts = 1700000000'i64 + i.int64 * 17 + rng.rand(0..999).int64
+    let columns = [$i, name, $age, $score, $visits, $zip, $ts,
+                   makeUrl(name, i, score)]
+    f.writeLine(columns.join(","))
+
+proc checksum(row: StoredRow): uint64 =
+  ## Folds the length plus the first and last char of every column of `row`
+  ## into a 64-bit value.
+  const multiplier = 0x9E3779B185EBCA87'u64
+  let fields = [
+    row.id, row.name, row.age, row.score,
+    row.visits, row.zip, row.timestamp, row.url
+  ]
+  for i, field in fields:
+    result = result * multiplier + uint64(field.len + i)
+    if field.len == 0:
+      continue
+    let head = uint64(ord(field[0]))
+    let tail = uint64(ord(field[^1]))
+    result = result xor (head shl (i and 7))
+    result = result xor (tail shl ((i + 3) and 7))
+
+proc parseAndMaterialize(path: string; rowsExpected: int): tuple[elapsedNs: float, check: uint64] =
+  ## Parses the CSV file at `path`, copying every record into a `StoredRow`
+  ## kept in a seq. Returns the time spent in the read loop and a checksum
+  ## over all rows; asserts that exactly `rowsExpected` records were read.
+  var parser: CsvParser
+  parser.open(path)
+  defer: parser.close()
+  parser.readHeaderRow()
+
+  var stored = newSeqOfCap[StoredRow](rowsExpected)
+  let t0 = getMonoTime()
+  while parser.readRow():
+    let row = StoredRow(
+      id: parser.row[0],
+      name: parser.row[1],
+      age: parser.row[2],
+      score: parser.row[3],
+      visits: parser.row[4],
+      zip: parser.row[5],
+      timestamp: parser.row[6],
+      url: parser.row[7]
+    )
+    result.check = result.check * 0x9E3779B185EBCA87'u64 + checksum(row)
+    stored.add row
+  result.elapsedNs = float((getMonoTime() - t0).inNanoseconds)
+  doAssert stored.len == rowsExpected
+
+proc main() =
+  ## Entry point: generates the CSV input, performs one untimed warm-up parse,
+  ## times `cfg.rounds` parses, and prints a summary. The generated file is
+  ## removed on exit.
+  let cfg = parseConfig()
+  let path = csvPath(cfg)
+  writeCsv(path, cfg)
+  defer:
+    if fileExists(path):
+      removeFile(path)
+
+  let fileSize = getFileSize(path)
+  # Warm-up run; its result is intentionally ignored.
+  discard parseAndMaterialize(path, cfg.rows)
+
+  var
+    totalNs = 0.0
+    bestNs = Inf
+    worstNs = 0.0
+    combined = uint64(fileSize) + uint64(cfg.rows)
+
+  for round in 0..<cfg.rounds:
+    let (elapsedNs, check) = parseAndMaterialize(path, cfg.rows)
+    totalNs += elapsedNs
+    bestNs = min(bestNs, elapsedNs)
+    worstNs = max(worstNs, elapsedNs)
+    combined = combined * 0x9E3779B185EBCA87'u64 + check + uint64(round + 1)
+
+  let avgNs = totalNs / cfg.rounds.float
+  let nsPerRow = avgNs / cfg.rows.float
+  echo "rows=", cfg.rows, "  rounds=", cfg.rounds, "  seed=", cfg.seed,
+      "  file=", formatSize(fileSize)
+  echo "avg=", fixed(avgNs / 1e6, 3), " ms",
+      "  best=", fixed(bestNs / 1e6, 3), " ms",
+      "  worst=", fixed(worstNs / 1e6, 3), " ms",
+      "  ns/row=", fixed(nsPerRow, 1),
+      "  check=0x", toHex(combined, 16)
+  when not defined(useMalloc):
+    echo "MAXMEM=", formatSize getMaxMem()
+
+when isMainModule:
+  main()
--- a/tests/benchmarks/strings/hashbench.nim
+++ b/tests/benchmarks/strings/hashbench.nim
@@ -0,0 +1,277 @@
+import std/[monotimes, os, random, strutils, tables, times]
+
+const
+  AlwaysAvail = 7  # upper bound of the "short" length range (1..AlwaysAvail)
+  InlineMax = AlwaysAvail + sizeof(pointer) - 1  # reported as the inline limit; upper bound of the "inline" length range
+  Alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-"  # characters randomChar draws from
+  SharedPrefixes = [  # fixed prefixes used by the scPrefix scenario
+    "module/submodule/symbol/",
+    "compiler/semantic/checker/",
+    "core/runtime/string-table/",
+    "aaaaaaaaaaaaaa/shared/prefix/",
+    "zzzzzzzzzzzzzz/shared/prefix/"
+  ]
+  ScenarioNames = ["short", "inline", "boundary", "long", "prefix", "mixed"]  # display names, indexed by Scenario ordinal
+
+type
+  Scenario = enum  # key-shape scenarios; declaration order matches ScenarioNames
+    scShort  # lengths 1..AlwaysAvail
+    scInline  # lengths AlwaysAvail+1..InlineMax
+    scBoundary  # lengths InlineMax-2..InlineMax+2
+    scLong  # lengths InlineMax+1..InlineMax+64
+    scPrefix  # one of SharedPrefixes followed by 4..24 random characters
+    scMixed  # length drawn by pickMixedLength
+
+  Config = object  # benchmark settings, filled in by parseConfig
+    count: int  # number of keys per scenario (--count)
+    rounds: int  # measured repetitions per scenario (--rounds)
+    seed: int64  # base RNG seed (--seed)
+    scenarios: seq[Scenario]  # scenarios to run, in order (--scenarios)
+
+proc defaultConfig(): Config =
+  ## Returns the settings used when no command-line overrides are given:
+  ## 200_000 keys, 5 rounds, a fixed seed, and every scenario enabled.
+  result.count = 200_000
+  result.rounds = 5
+  result.seed = 20260307'i64
+  result.scenarios = @[scShort, scInline, scBoundary, scLong, scPrefix, scMixed]
+
+proc usage() =
+  ## Prints the command-line help text, ending with the inline limit
+  ## (`InlineMax`) computed for the current target.
+  const helpLines = [
+    "String hash-table benchmark for experimenting with the SSO runtime.",
+    "",
+    "Usage:",
+    "  nim r -d:danger hashbench.nim [--count=N] [--rounds=N] [--seed=N]",
+    "                                 [--scenarios=list]",
+    "",
+    "Scenarios:",
+    "  short, inline, boundary, long, prefix, mixed",
+    ""
+  ]
+  for line in helpLines:
+    echo line
+  echo "Current inline limit on this target: ", InlineMax, " bytes"
+
+proc parseScenario(name: string): Scenario =
+  ## Maps a scenario name (compared after `normalize`) to its `Scenario`
+  ## value. An unrecognised name terminates the program with an error.
+  case normalize(name)
+  of "short": result = scShort
+  of "inline": result = scInline
+  of "boundary": result = scBoundary
+  of "long": result = scLong
+  of "prefix": result = scPrefix
+  of "mixed": result = scMixed
+  else: quit("unknown scenario: " & name)
+
+proc parseConfig(): Config =
+  ## Builds the run configuration: starts from `defaultConfig()` and applies
+  ## `--count=`, `--rounds=`, `--seed=` and `--scenarios=` overrides from the
+  ## command line. `--help`/`-h` prints the usage text and exits with status 0.
+  ## Any other argument, a non-positive count or round number, or an empty
+  ## scenario list terminates the program with an error message.
+  const
+    countFlag = "--count="
+    roundsFlag = "--rounds="
+    seedFlag = "--seed="
+    scenariosFlag = "--scenarios="
+
+  result = defaultConfig()
+  for arg in commandLineParams():
+    if arg in ["--help", "-h"]:
+      usage()
+      quit(0)
+    elif arg.startsWith(countFlag):
+      result.count = parseInt(arg.substr(countFlag.len))
+    elif arg.startsWith(roundsFlag):
+      result.rounds = parseInt(arg.substr(roundsFlag.len))
+    elif arg.startsWith(seedFlag):
+      result.seed = int64(parseInt(arg.substr(seedFlag.len)))
+    elif arg.startsWith(scenariosFlag):
+      # An explicit list replaces the defaults; empty items are skipped.
+      result.scenarios.setLen(0)
+      for item in arg.substr(scenariosFlag.len).split(','):
+        if item.len == 0: continue
+        result.scenarios.add parseScenario(item)
+    else:
+      quit("unknown argument: " & arg)
+
+  if result.count < 1:
+    quit("--count must be > 0")
+  if result.rounds < 1:
+    quit("--rounds must be > 0")
+  if result.scenarios.len == 0:
+    quit("at least one scenario is required")
+
+proc scenarioName(s: Scenario): string =
+  ## Returns the display name stored in `ScenarioNames` at the ordinal of `s`.
+  result = ScenarioNames[ord(s)]
+
+proc scenarioList(scenarios: openArray[Scenario]): string =
+  ## Joins the display names of `scenarios` with commas, in order.
+  result = ""
+  for idx in 0 ..< scenarios.len:
+    if idx != 0:
+      result.add ','
+    result.add scenarioName(scenarios[idx])
+
+proc fixed(x: float; digits: range[0..32]): string =
+  ## Formats `x` in decimal (non-scientific) notation, passing `digits` as
+  ## the precision to `formatFloat`.
+  result = x.formatFloat(format = ffDecimal, precision = digits)
+
+proc randomChar(rng: var Rand): char =
+  ## Draws one random character from `Alphabet`, advancing `rng` once.
+  let pick = rng.rand(Alphabet.high)
+  result = Alphabet[pick]
+
+proc makeRandomString(rng: var Rand; len: int; prefix = ""): string =
+  ## Creates a string of exactly `len` characters. The leading characters are
+  ## copied from `prefix` (cut off at `len` when the prefix is longer) and the
+  ## remaining positions are filled with `randomChar` draws.
+  result = newString(len)
+  let copied = min(len, prefix.len)
+  for pos in 0 ..< copied:
+    result[pos] = prefix[pos]
+  for pos in copied ..< len:
+    result[pos] = randomChar(rng)
+
+proc pickMixedLength(rng: var Rand): int =
+  ## Picks a length for the "mixed" scenario. A first draw in 0..99 selects
+  ## the range: below 35 gives 1..AlwaysAvail, below 70 gives
+  ## AlwaysAvail+1..InlineMax, otherwise InlineMax+1..InlineMax+48. A second
+  ## draw picks the length inside that range.
+  let roll = rng.rand(0..99)
+  let span =
+    if roll < 35: 1 .. AlwaysAvail
+    elif roll < 70: AlwaysAvail + 1 .. InlineMax
+    else: InlineMax + 1 .. InlineMax + 48
+  result = rng.rand(span)
+
+proc makeScenarioString(rng: var Rand; kind: Scenario; serial: int): string =
+  ## Generates one key for scenario `kind`. The length (and, for `scPrefix`,
+  ## the leading text) depends on the scenario; afterwards the first character
+  ## is overwritten with a lowercase letter chosen by `serial mod 26` and the
+  ## last character with a digit chosen by `serial mod 10`.
+  case kind
+  of scShort:
+    let n = rng.rand(1..AlwaysAvail)
+    result = makeRandomString(rng, n)
+  of scInline:
+    let n = rng.rand(AlwaysAvail + 1 .. InlineMax)
+    result = makeRandomString(rng, n)
+  of scBoundary:
+    # Offsets -2..+2 around the inline limit, clamped to at least one char.
+    let offset = rng.rand(4) - 2
+    result = makeRandomString(rng, max(1, InlineMax + offset))
+  of scLong:
+    let n = rng.rand(InlineMax + 1 .. InlineMax + 64)
+    result = makeRandomString(rng, n)
+  of scPrefix:
+    let head = SharedPrefixes[rng.rand(SharedPrefixes.high)]
+    let total = head.len + rng.rand(4..24)
+    result = makeRandomString(rng, total, head)
+  of scMixed:
+    let n = pickMixedLength(rng)
+    result = makeRandomString(rng, n)
+
+  if result.len > 0:
+    # For a one-character string the digit written last wins.
+    result[0] = char(ord('a') + (serial mod 26))
+    result[result.high] = char(ord('0') + (serial mod 10))
+
+proc generateDataset(kind: Scenario; count: int; seed: int64): seq[string] =
+  ## Produces `count` strings for scenario `kind`. The RNG seed is `seed`
+  ## plus a multiple of the scenario ordinal, so different scenarios use
+  ## different seeds while the same inputs always give the same dataset.
+  var rng = initRand(seed + int64(ord(kind)) * 10_000_019'i64)
+  result = newSeqOfCap[string](count)
+  for serial in 0 ..< count:
+    result.add makeScenarioString(rng, kind, serial)
+
+proc averageLen(data: openArray[string]): float =
+  ## Mean length of the strings in `data`; `0.0` for an empty input.
+  var chars = 0
+  for idx in 0 ..< data.len:
+    inc chars, data[idx].len
+  # Divide by one for an empty input so the result is 0.0 rather than NaN.
+  let divisor = if data.len == 0: 1 else: data.len
+  result = float(chars) / float(divisor)
+
+proc checksum(data: openArray[string]): uint64 =
+  ## Order-sensitive digest of `data`. Each element multiplies the running
+  ## value by a fixed odd constant and adds the element's length; non-empty
+  ## elements additionally xor in their first and last characters, shifted
+  ## by an amount derived from the element's index.
+  const Mult = 0x9E3779B185EBCA87'u64
+  for pos, item in data:
+    result = result * Mult + uint64(item.len)
+    if item.len == 0:
+      continue
+    let head = uint64(ord(item[0])) shl (pos and 7)
+    let tail = uint64(ord(item[item.high])) shl ((pos + 3) and 7)
+    result = result xor head xor tail
+
+proc makeMissQueries(kind: Scenario; count: int; seed: int64): seq[string] =
+  ## Builds `count` lookup keys intended to be absent from the table. A
+  ## dataset is generated from a shifted seed and the last character of each
+  ## string is replaced by one of 'Q'..'W' (an empty string becomes "!").
+  ## `makeScenarioString` ends every non-empty key in a digit, so these
+  ## queries cannot equal a generated key.
+  result = generateDataset(kind, count, seed + 0x6A09E667'i64)
+  for idx, query in result.mpairs:
+    if query.len > 0:
+      query[^1] = char(ord('Q') + (idx mod 7))
+    else:
+      query = "!"
+
+proc bench(kind: Scenario; cfg: Config) =  # times insert, hit-lookup and miss-lookup passes over a string-keyed Table for one scenario and prints one summary line
+  let keys = generateDataset(kind, cfg.count, cfg.seed)
+  let hitQueries = keys  # every hit query is a key that gets inserted
+  let missQueries = makeMissQueries(kind, cfg.count, cfg.seed)
+  let avgLen = averageLen(keys)
+  let keyCheck = checksum(keys) xor checksum(missQueries)
+
+  var warm = initTable[string, int](cfg.count * 2)  # warm-up table; none of the work on it is timed
+  for i, key in keys:
+    warm[key] = i
+  var warmHits = 0
+  for key in hitQueries:
+    warmHits += warm[key]
+  var warmMisses = 0
+  for key in missQueries:
+    if warm.hasKey(key):
+      inc warmMisses
+  doAssert warmHits >= 0
+  doAssert warmMisses == 0  # a miss query found in the table would invalidate the miss timing
+
+  var insertTotalNs = 0.0
+  var hitTotalNs = 0.0
+  var missTotalNs = 0.0
+  var insertBestNs = Inf
+  var hitBestNs = Inf
+  var missBestNs = Inf
+  var insertWorstNs = 0.0
+  var hitWorstNs = 0.0
+  var missWorstNs = 0.0
+  var combined = keyCheck + uint64(cfg.count)  # running check value printed at the end
+
+  for round in 0..<cfg.rounds:
+    var table = initTable[string, int](cfg.count * 2)  # fresh table per round so every insert pass starts empty
+
+    let insertStarted = getMonoTime()
+    for i, key in keys:
+      table[key] = i
+    let insertNs = float((getMonoTime() - insertStarted).inNanoseconds)
+
+    var hitSum = 0
+    let hitStarted = getMonoTime()
+    for key in hitQueries:
+      hitSum += table[key]
+    let hitNs = float((getMonoTime() - hitStarted).inNanoseconds)
+
+    var missSum = 0
+    let missStarted = getMonoTime()
+    for key in missQueries:
+      if table.hasKey(key):
+        inc missSum
+    let missNs = float((getMonoTime() - missStarted).inNanoseconds)
+
+    doAssert hitSum >= 0  # NOTE(review): presumably keeps the lookup results observably used — confirm
+    doAssert missSum == 0
+
+    insertTotalNs += insertNs
+    hitTotalNs += hitNs
+    missTotalNs += missNs
+    insertBestNs = min(insertBestNs, insertNs)
+    hitBestNs = min(hitBestNs, hitNs)
+    missBestNs = min(missBestNs, missNs)
+    insertWorstNs = max(insertWorstNs, insertNs)
+    hitWorstNs = max(hitWorstNs, hitNs)
+    missWorstNs = max(missWorstNs, missNs)
+    combined = combined * 0x9E3779B185EBCA87'u64 +
+        uint64(cast[uint](hitSum xor missSum xor round))
+  let insertAvgNs = insertTotalNs / cfg.rounds.float
+  let hitAvgNs = hitTotalNs / cfg.rounds.float
+  let missAvgNs = missTotalNs / cfg.rounds.float
+  echo align(scenarioName(kind), 8), "  n=", align($cfg.count, 8),
+      "  avgLen=", align(fixed(avgLen, 1), 6),
+      "  ins=", align(fixed(insertAvgNs / 1e6, 3), 9), " ms",
+      "  hit=", align(fixed(hitAvgNs / 1e6, 3), 9), " ms",
+      "  miss=", align(fixed(missAvgNs / 1e6, 3), 9), " ms",
+      "  ns/op=", align(fixed((insertAvgNs + hitAvgNs + missAvgNs) / (3.0 * cfg.count.float), 1), 8),  # mean cost per operation across the three passes
+      "  check=0x", toHex(combined, 16)
+  discard insertBestNs  # best/worst figures are tracked but not part of the printed report
+  discard hitBestNs
+  discard missBestNs
+  discard insertWorstNs
+  discard hitWorstNs
+  discard missWorstNs
+
+proc main() =
+  ## Benchmark driver: parses the command line, prints the run parameters,
+  ## benchmarks each requested scenario in order, and finally reports the
+  ## peak memory figure unless built with `-d:useMalloc`.
+  let settings = parseConfig()
+  echo "inline limit=", InlineMax, " bytes  count=", settings.count,
+      "  rounds=", settings.rounds, "  seed=", settings.seed
+  echo "scenarios=", scenarioList(settings.scenarios)
+  for idx in 0 ..< settings.scenarios.len:
+    bench(settings.scenarios[idx], settings)
+  when not defined(useMalloc):
+    echo "MAXMEM=", formatSize getMaxMem()
+
+when isMainModule:  # run the benchmark only when this file is the main module
+  main()
--- a/tests/benchmarks/strings/sortbench.nim
+++ b/tests/benchmarks/strings/sortbench.nim
@@ -0,0 +1,224 @@
+import std/[algorithm, monotimes, os, random, strutils, times]
+
+const
+  # Upper bound of the "short" length range (1..AlwaysAvail).
+  AlwaysAvail = 7
+  # Reported as the inline limit; upper bound of the "inline" length range.
+  InlineMax = AlwaysAvail + sizeof(pointer) - 1
+  # Characters that randomChar draws from.
+  Alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-"
+  # NOTE: no scenario in this benchmark references these prefixes.
+  SharedPrefixes = [
+    "module/submodule/symbol/",
+    "compiler/semantic/checker/",
+    "core/runtime/string-table/",
+    "aaaaaaaaaaaaaa/shared/prefix/",
+    "zzzzzzzzzzzzzz/shared/prefix/"
+  ]
+  # Display names indexed by `Scenario` ordinal: exactly one entry per enum
+  # member, in declaration order. This file has no "prefix" scenario, so a
+  # "prefix" entry here would make scMixed print as "prefix".
+  ScenarioNames = ["short", "inline", "boundary", "long", "mixed"]
+
+type
+  Scenario = enum  # string-shape scenarios; the ordinal indexes ScenarioNames
+    scShort  # lengths 1..AlwaysAvail
+    scInline  # lengths up to InlineMax
+    scBoundary  # lengths InlineMax-2..InlineMax+2
+    scLong  # lengths InlineMax+1..InlineMax+64
+    scMixed  # length drawn by pickMixedLength
+
+  Config = object  # benchmark settings, filled in by parseConfig
+    count: int  # number of strings per scenario (--count)
+    rounds: int  # measured repetitions per scenario (--rounds)
+    seed: int64  # base RNG seed (--seed)
+    scenarios: seq[Scenario]  # scenarios to run, in order (--scenarios)
+
+proc defaultConfig(): Config =
+  ## Returns the settings used when no command-line overrides are given:
+  ## 200_000 strings, 5 rounds, a fixed seed, and every scenario enabled.
+  result.count = 200_000
+  result.rounds = 5
+  result.seed = 20260307'i64
+  result.scenarios = @[scShort, scInline, scBoundary, scLong, scMixed]
+
+proc usage() =
+  ## Prints the command-line help text, ending with the inline limit
+  ## (`InlineMax`) computed for the current target.
+  ##
+  ## The scenario list names only what `parseScenario` accepts; this
+  ## benchmark has no "prefix" scenario.
+  echo "String sorting benchmark for experimenting with the SSO runtime."
+  echo ""
+  echo "Usage:"
+  echo "  nim r -d:danger sortbench.nim [--count=N] [--rounds=N] [--seed=N]"
+  echo "                                 [--scenarios=list]"
+  echo ""
+  echo "Scenarios:"
+  echo "  short, inline, boundary, long, mixed"
+  echo ""
+  echo "Current inline limit on this target: ", InlineMax, " bytes"
+
+proc parseScenario(name: string): Scenario =
+  ## Maps a scenario name (compared after `normalize`) to its `Scenario`
+  ## value. An unrecognised name terminates the program with an error.
+  case normalize(name)
+  of "short": result = scShort
+  of "inline": result = scInline
+  of "boundary": result = scBoundary
+  of "long": result = scLong
+  of "mixed": result = scMixed
+  else: quit("unknown scenario: " & name)
+
+proc parseConfig(): Config =
+  ## Builds the run configuration: starts from `defaultConfig()` and applies
+  ## `--count=`, `--rounds=`, `--seed=` and `--scenarios=` overrides from the
+  ## command line. `--help`/`-h` prints the usage text and exits with status 0.
+  ## Any other argument, a non-positive count or round number, or an empty
+  ## scenario list terminates the program with an error message.
+  const
+    countFlag = "--count="
+    roundsFlag = "--rounds="
+    seedFlag = "--seed="
+    scenariosFlag = "--scenarios="
+
+  result = defaultConfig()
+  for arg in commandLineParams():
+    if arg in ["--help", "-h"]:
+      usage()
+      quit(0)
+    elif arg.startsWith(countFlag):
+      result.count = parseInt(arg.substr(countFlag.len))
+    elif arg.startsWith(roundsFlag):
+      result.rounds = parseInt(arg.substr(roundsFlag.len))
+    elif arg.startsWith(seedFlag):
+      result.seed = int64(parseInt(arg.substr(seedFlag.len)))
+    elif arg.startsWith(scenariosFlag):
+      # An explicit list replaces the defaults; empty items are skipped.
+      result.scenarios.setLen(0)
+      for item in arg.substr(scenariosFlag.len).split(','):
+        if item.len == 0: continue
+        result.scenarios.add parseScenario(item)
+    else:
+      quit("unknown argument: " & arg)
+
+  if result.count < 1:
+    quit("--count must be > 0")
+  if result.rounds < 1:
+    quit("--rounds must be > 0")
+  if result.scenarios.len == 0:
+    quit("at least one scenario is required")
+
+proc scenarioName(s: Scenario): string =
+  ## Returns the display name stored in `ScenarioNames` at the ordinal of `s`.
+  result = ScenarioNames[ord(s)]
+
+proc randomChar(rng: var Rand): char =
+  ## Draws one random character from `Alphabet`, advancing `rng` once.
+  let pick = rng.rand(Alphabet.high)
+  result = Alphabet[pick]
+
+proc makeRandomString(rng: var Rand; len: int): string =
+  ## Creates a string of `len` characters, each filled in order with a
+  ## `randomChar` draw.
+  result = newString(len)
+  for ch in result.mitems:
+    ch = randomChar(rng)
+
+proc pickMixedLength(rng: var Rand): int =
+  ## Picks a length for the "mixed" scenario. A first draw in 0..99 selects
+  ## the range: below 35 gives 1..AlwaysAvail, below 70 gives
+  ## AlwaysAvail+1..InlineMax, otherwise InlineMax+1..InlineMax+48. A second
+  ## draw picks the length inside that range.
+  let roll = rng.rand(0..99)
+  let span =
+    if roll < 35: 1 .. AlwaysAvail
+    elif roll < 70: AlwaysAvail + 1 .. InlineMax
+    else: InlineMax + 1 .. InlineMax + 48
+  result = rng.rand(span)
+
+proc makeScenarioString(rng: var Rand; kind: Scenario; serial: int): string =
+  ## Generates one string for scenario `kind`. The length depends on the
+  ## scenario; afterwards the first character is overwritten with a lowercase
+  ## letter chosen by `serial mod 26` and the last character with a digit
+  ## chosen by `serial mod 10`.
+  case kind
+  of scShort:
+    result = makeRandomString(rng, rng.rand(1..AlwaysAvail))
+  of scInline:
+    # Start above AlwaysAvail so "inline" does not overlap the "short" range;
+    # this matches the middle bucket of pickMixedLength.
+    result = makeRandomString(rng, rng.rand(AlwaysAvail + 1 .. InlineMax))
+  of scBoundary:
+    # Lengths just below, at, and just above the inline limit.
+    let choices = [
+      max(1, InlineMax - 2),
+      max(1, InlineMax - 1),
+      InlineMax,
+      InlineMax + 1,
+      InlineMax + 2
+    ]
+    result = makeRandomString(rng, choices[rng.rand(choices.high)])
+  of scLong:
+    result = makeRandomString(rng, rng.rand(InlineMax + 1 .. InlineMax + 64))
+  of scMixed:
+    result = makeRandomString(rng, pickMixedLength(rng))
+
+  # Inject a little deterministic structure so equal prefixes are common but not identical.
+  if result.len > 0:
+    result[0] = char(ord('a') + (serial mod 26))
+    result[^1] = char(ord('0') + (serial mod 10))
+
+proc generateDataset(kind: Scenario; count: int; seed: int64): seq[string] =
+  ## Produces `count` strings for scenario `kind`. The RNG seed is `seed`
+  ## plus a multiple of the scenario ordinal, so different scenarios use
+  ## different seeds while the same inputs always give the same dataset.
+  var rng = initRand(seed + int64(ord(kind)) * 10_000_019'i64)
+  result = newSeqOfCap[string](count)
+  for serial in 0 ..< count:
+    result.add makeScenarioString(rng, kind, serial)
+
+proc cloneStrings(src: seq[string]): seq[string] =
+  ## Returns a new seq with one entry per element of `src`, each assigned
+  ## from the corresponding source string.
+  result = newSeqOfCap[string](src.len)
+  for item in src:
+    result.add item
+
+proc isSorted(a: openArray[string]): bool =
+  ## True when no element of `a` compares greater (via `cmp`) than its
+  ## successor; empty and single-element inputs count as sorted.
+  result = true
+  var idx = 1
+  while idx < a.len:
+    if cmp(a[idx - 1], a[idx]) > 0:
+      return false
+    inc idx
+
+proc checksum(a: openArray[string]): uint64 =
+  ## Order-sensitive digest of `a`. Each element multiplies the running
+  ## value by a fixed odd constant and adds the element's length; non-empty
+  ## elements additionally xor in their first and last characters, shifted
+  ## by an amount derived from the element's index.
+  const Mult = 0x9E3779B185EBCA87'u64
+  for pos, item in a:
+    result = result * Mult + uint64(item.len)
+    if item.len == 0:
+      continue
+    let head = uint64(ord(item[0])) shl (pos and 7)
+    let tail = uint64(ord(item[item.high])) shl ((pos + 3) and 7)
+    result = result xor head xor tail
+
+proc averageLen(data: openArray[string]): float =
+  ## Mean length of the strings in `data`; `0.0` for an empty input.
+  var chars = 0
+  for idx in 0 ..< data.len:
+    inc chars, data[idx].len
+  # Divide by one for an empty input so the result is 0.0 rather than NaN.
+  let divisor = if data.len == 0: 1 else: data.len
+  result = float(chars) / float(divisor)
+
+proc scenarioList(scenarios: openArray[Scenario]): string =
+  ## Joins the display names of `scenarios` with commas, in order.
+  result = ""
+  for idx in 0 ..< scenarios.len:
+    if idx != 0:
+      result.add ','
+    result.add scenarioName(scenarios[idx])
+
+proc fixed(x: float; digits: range[0..32]): string =
+  ## Formats `x` in decimal (non-scientific) notation, passing `digits` as
+  ## the precision to `formatFloat`.
+  result = x.formatFloat(format = ffDecimal, precision = digits)
+
+proc bench(kind: Scenario; cfg: Config) =
+  ## Benchmarks sorting for one scenario. After one warm-up sort, each of
+  ## `cfg.rounds` rounds sorts a fresh copy of the dataset with `system.cmp`
+  ## and times only the `sort` call. Prints average, best and worst times,
+  ## the per-item cost, and a checksum folded over the sorted results.
+  let source = generateDataset(kind, cfg.count, cfg.seed)
+  let meanLen = averageLen(source)
+
+  # Warm-up sort on a separate copy; it is not timed.
+  var scratch = cloneStrings(source)
+  scratch.sort(system.cmp)
+  doAssert isSorted(scratch)
+
+  var
+    sumNs = 0.0
+    fastestNs = Inf
+    slowestNs = 0.0
+    digest = 0'u64
+
+  for round in 0..<cfg.rounds:
+    # Copy outside the timed region so only the sort itself is measured.
+    var batch = cloneStrings(source)
+    let t0 = getMonoTime()
+    batch.sort(system.cmp)
+    let spentNs = float((getMonoTime() - t0).inNanoseconds)
+    doAssert isSorted(batch)
+    sumNs += spentNs
+    fastestNs = min(fastestNs, spentNs)
+    slowestNs = max(slowestNs, spentNs)
+    digest = digest * 0x9E3779B185EBCA87'u64 +
+        checksum(batch) + uint64(round + 1)
+
+  let meanNs = sumNs / float(cfg.rounds)
+  echo align(scenarioName(kind), 8), "  n=", align($cfg.count, 8),
+      "  avgLen=", align(fixed(meanLen, 1), 6),
+      "  avg=", align(fixed(meanNs / 1e6, 3), 9), " ms",
+      "  best=", align(fixed(fastestNs / 1e6, 3), 9), " ms",
+      "  worst=", align(fixed(slowestNs / 1e6, 3), 9), " ms",
+      "  ns/item=", align(fixed(meanNs / float(cfg.count), 1), 8),
+      "  check=0x", toHex(digest, 16)
+
+proc main() =
+  ## Benchmark driver: parses the command line, prints the run parameters,
+  ## benchmarks each requested scenario in order, and finally reports the
+  ## peak memory figure unless built with `-d:useMalloc`.
+  let settings = parseConfig()
+  echo "inline limit=", InlineMax, " bytes  count=", settings.count,
+      "  rounds=", settings.rounds, "  seed=", settings.seed
+  echo "scenarios=", scenarioList(settings.scenarios)
+  for idx in 0 ..< settings.scenarios.len:
+    bench(settings.scenarios[idx], settings)
+
+  when not defined(useMalloc):
+    echo "MAXMEM=", formatSize getMaxMem()
+
+when isMainModule:  # run the benchmark only when this file is the main module
+  main()