This adds parseutils.parseSize, an inverse to strutils.formatSize (#21349)

* This adds `parseutils.parseSize`, an inverse to `strutils.formatSize` which has existed since 2017. It is useful for parsing the compiler's own output logs (like SuccessX) or many other scenarios where "human readable" units have been chosen. The doc comment and tests explain accepted syntax in detail. Big units lead to small numbers, often with a fractional part, but we parse into an `int64` since that is what `formatSize` stringifies and this is an inverse over partial function slots. Although metric prefixes z & y for zettabyte & yottabyte are accepted, these will saturate the result at `int64.high` unless the qualified number is a small fraction. This should not be much of a problem until such sizes are common (at which point another overload with the parse result either `float64` or `int128` could be added). Tests avoids `test()` because of a weakly related static: test() failure as mentioned in https://github.com/nim-lang/Nim/pull/21325. This is a more elemental VM failure. As such, it needs its own failure exhibition issue that is a smaller test case. (I am working on that, but unless there is a burning need to `parseSize` at compile-time before run-time it need not hold up this PR.) * This worked with `int` but fails with `int64`. Try for green tests. * Lift 2-result matching into a `checkParseSize` template and format as a table of input & 2 expected outputs which seems nicer and to address https://github.com/nim-lang/Nim/pull/21349#pullrequestreview-1294407679 * Fix (probably) the i386 trouble by using `int64` consistently. * Improve documentation by mentioning saturation. * Improve documentation with `runnableExamples` and a little more detail in the main doc comment based on excellent code review by @juancarlospaco: https://github.com/nim-lang/Nim/pull/21349#pullrequestreview-1294564155 * Address some more @juancarlospaco code review concerns. * Remove a stray space. * Mention milli-bytes in docs to maybe help clarify why wild conventions are so prone to going case-insensitive-metric. * Add some parens. (cherry picked from commit 1d06c2b6cf)
2026-07-16 22:11:18 +00:00 · 2023-02-14 02:00:30 -05:00
parent 7fa782e3a0
commit c546ba5d23
2 changed files with 104 additions and 0 deletions
--- a/lib/pure/parseutils.nim
+++ b/lib/pure/parseutils.nim
@@ -597,6 +597,71 @@ proc parseFloat*(s: string, number: var float, start = 0): int {.
  if result != 0:
    number = bf

+func toLowerAscii(c: char): char =
+  if c in {'A'..'Z'}: char(uint8(c) xor 0b0010_0000'u8) else: c
+
+func parseSize*(s: openArray[char], size: var int64, alwaysBin=false): int =
+  ## Parse a size qualified by binary or metric units into `size`.  This format
+  ## is often called "human readable".  Result is the number of processed chars
+  ## or 0 on parse errors and size is rounded to the nearest integer.  Trailing
+  ## garbage like "/s" in "1k/s" is allowed and detected by `result < s.len`.
+  ##
+  ## To simplify use, following non-rare wild conventions, and since fractional
+  ## data like milli-bytes is so rare, unit matching is case-insensitive but for
+  ## the 'i' distinguishing binary-metric from metric (which cannot be 'I').
+  ##
+  ## An optional trailing 'B|b' is ignored but processed.  I.e., you must still
+  ## know if units are bytes | bits or infer this fact via the case of s[^1] (if
+  ## users can even be relied upon to use 'B' for byte and 'b' for bit or have
+  ## that be s[^1]).
+  ##
+  ## If `alwaysBin==true` then scales are always binary-metric, but e.g. "KiB"
+  ## is still accepted for clarity.  If the value would exceed the range of
+  ## `int64`, `size` saturates to `int64.high`.  Supported metric prefix chars
+  ## include k, m, g, t, p, e, z, y (but z & y saturate unless the number is a
+  ## small fraction).
+  ##
+  ## **See also:**
+  ## * https://en.wikipedia.org/wiki/Binary_prefix
+  ## * `formatSize module<strutils.html>`_ for formatting
+  runnableExamples:
+    var res: int64  # caller must still know if 'b' refers to bytes|bits
+    doAssert parseSize("10.5 MB", res) == 7
+    doAssert res == 10_500_000  # decimal metric Mega prefix
+    doAssert parseSize("64 mib", res) == 6
+    doAssert res == 67108864    # 64 shl 20
+    doAssert parseSize("1G/h", res, true) == 2 # '/' stops parse
+    doAssert res == 1073741824  # 1 shl 30, forced binary metric
+  const prefix = "b" & "kmgtpezy"       # byte|bit & lowCase metric-ish prefixes
+  const scaleM = [1.0, 1e3, 1e6, 1e9, 1e12, 1e15, 1e18, 1e21, 1e24] # 10^(3*idx)
+  const scaleB = [1.0, 1024, 1048576, 1073741824, 1099511627776.0,  # 2^(10*idx)
+                  1125899906842624.0, 1152921504606846976.0,        # ldexp?
+                  1.180591620717411303424e21, 1.208925819614629174706176e24]
+  var number: float
+  var scale = 1.0
+  result = parseFloat(s, number)
+  if number < 0:                        # While parseFloat accepts negatives ..
+    result = 0                          #.. we do not since sizes cannot be < 0
+  if result > 0:
+    let start = result                  # Save spot to maybe unwind white to EOS
+    while result < s.len and s[result] in Whitespace:
+      inc result
+    if result < s.len:                  # Illegal starting char => unity
+      if (let si = prefix.find(s[result].toLowerAscii); si >= 0):
+        inc result                      # Now parse the scale
+        scale = if alwaysBin: scaleB[si] else: scaleM[si]
+        if result < s.len and s[result] == 'i':
+          scale = scaleB[si]            # Switch from default to binary-metric
+          inc result
+        if result < s.len and s[result].toLowerAscii == 'b':
+          inc result                    # Skip optional '[bB]'
+    else:                               # Unwind result advancement when there..
+      result = start                    #..is no unit to the end of `s`.
+    var sizeF = number * scale + 0.5    # Saturate to int64.high when too big
+    size = if sizeF > 9223372036854774784.0: int64.high else: sizeF.int64
+# Above constant=2^63-1024 avoids C UB; github.com/nim-lang/Nim/issues/20102 or
+# stackoverflow.com/questions/20923556/math-pow2-63-1-math-pow2-63-512-is-true
+
 type
  InterpolatedKind* = enum ## Describes for `interpolatedFragments`
                           ## which part of the interpolated string is
--- a/tests/stdlib/tparseutils.nim
+++ b/tests/stdlib/tparseutils.nim
@@ -50,3 +50,42 @@ block:
  doAssert res == @[(17, "9.123456789012344"), (18, "11.123456789012344"),
                    (17, "9.123456789012344"), (17, "8.123456789012344"),
                    (16, "9.12345678901234"), (17, "9.123456789012344")]
+
+block:
+  var sz: int64
+  template checkParseSize(s, expectLen, expectVal) =
+    if (let got = parseSize(s, sz); got != expectLen):
+      raise newException(IOError, "got len " & $got & " != " & $expectLen)
+    if sz != expectVal:
+      raise newException(IOError, "got sz " & $sz & " != " & $expectVal)
+  #              STRING    LEN SZ
+  # Good, complete parses
+  checkParseSize "1  b"   , 4, 1
+  checkParseSize "1  B"   , 4, 1
+  checkParseSize "1k"     , 2, 1000
+  checkParseSize "1 kib"  , 5, 1024
+  checkParseSize "1 ki"   , 4, 1024
+  checkParseSize "1mi"    , 3, 1048576
+  checkParseSize "1 mi"   , 4, 1048576
+  checkParseSize "1 mib"  , 5, 1048576
+  checkParseSize "1 Mib"  , 5, 1048576
+  checkParseSize "1 MiB"  , 5, 1048576
+  checkParseSize "1.23GiB", 7, 1320702444 # 1320702443.52 rounded
+  checkParseSize "0.001k" , 6, 1
+  checkParseSize "0.0004k", 7, 0
+  checkParseSize "0.0006k", 7, 1
+  # Incomplete parses
+  checkParseSize "1  "    , 1, 1          # Trailing white IGNORED
+  checkParseSize "1  B "  , 4, 1          # Trailing white IGNORED
+  checkParseSize "1  B/s" , 4, 1          # Trailing junk IGNORED
+  checkParseSize "1 kX"   , 3, 1000
+  checkParseSize "1 kiX"  , 4, 1024
+  checkParseSize "1j"     , 1, 1          # Unknown prefix IGNORED
+  checkParseSize "1 jib"  , 2, 1          # Unknown prefix post space
+  checkParseSize "1  ji"  , 3, 1
+  # Bad parses; `sz` should stay last good|incomplete value
+  checkParseSize "-1b"    , 0, 1          # Negative numbers
+  checkParseSize "abc"    , 0, 1          # Non-numeric
+  checkParseSize " 12"    , 0, 1          # Leading white
+  # Value Edge cases
+  checkParseSize "9223372036854775807", 19, int64.high