This adds parseutils.parseSize, an inverse to strutils.formatSize (#21349)

* This adds `parseutils.parseSize`, an inverse to `strutils.formatSize`
which has existed since 2017.

It is useful for parsing the compiler's own output logs (like SuccessX)
or many other scenarios where "human readable" units have been chosen.
The doc comment and tests explain accepted syntax in detail.

Big units lead to small numbers, often with a fractional part, but we
parse into an `int64` since that is what `formatSize` stringifies and
this is an inverse over partial function slots.  Although metric
prefixes z & y for zettabyte & yottabyte are accepted, these will
saturate the result at `int64.high` unless the qualified number is a
small fraction.  This should not be much of a problem until such sizes
are common (at which point another overload with the parse result
either `float64` or `int128` could be added).

Tests avoid `test()` because of a weakly related static: test() failure
as mentioned in https://github.com/nim-lang/Nim/pull/21325. This is a
more elemental VM failure.  As such, it needs its own failure exhibition
issue that is a smaller test case.  (I am working on that, but unless
there is a burning need to `parseSize` at compile-time before run-time
it need not hold up this PR.)

* This worked with `int` but fails with `int64`.  Try for green tests.

* Lift 2-result matching into a `checkParseSize` template and format as a
table of input & 2 expected outputs which seems nicer and to address
https://github.com/nim-lang/Nim/pull/21349#pullrequestreview-1294407679

* Fix (probably) the i386 trouble by using `int64` consistently.

* Improve documentation by mentioning saturation.

* Improve documentation with `runnableExamples` and a little more detail in
the main doc comment based on excellent code review by @juancarlospaco:
https://github.com/nim-lang/Nim/pull/21349#pullrequestreview-1294564155

* Address some more @juancarlospaco code review concerns.

* Remove a stray space.

* Mention milli-bytes in docs to maybe help clarify why wild conventions
are so prone to going case-insensitive-metric.

* Add some parens.

(cherry picked from commit 1d06c2b6cf)
This commit is contained in:
c-blake
2023-02-14 02:00:30 -05:00
committed by narimiran
parent 7fa782e3a0
commit c546ba5d23
2 changed files with 104 additions and 0 deletions

View File

@@ -597,6 +597,71 @@ proc parseFloat*(s: string, number: var float, start = 0): int {.
if result != 0:
number = bf
func toLowerAscii(c: char): char =
  ## ASCII-only lowercasing: maps 'A'..'Z' onto 'a'..'z' and returns every
  ## other char unchanged.
  case c
  of 'A'..'Z': chr(ord(c) + 32)  # Upper & lower ASCII letters differ by 32
  else: c
func parseSize*(s: openArray[char], size: var int64, alwaysBin=false): int =
  ## Parse a size qualified by binary or metric units into `size`.  This format
  ## is often called "human readable".  Result is the number of processed chars
  ## or 0 on parse errors and size is rounded to the nearest integer.  Trailing
  ## garbage like "/s" in "1k/s" is allowed and detected by `result < s.len`.
  ##
  ## Negative numbers and NaN are parse errors: the result is 0 and `size` is
  ## left unchanged.
  ##
  ## To simplify use, following non-rare wild conventions, and since fractional
  ## data like milli-bytes is so rare, unit matching is case-insensitive but for
  ## the 'i' distinguishing binary-metric from metric (which cannot be 'I').
  ##
  ## An optional trailing 'B|b' is ignored but processed.  I.e., you must still
  ## know if units are bytes | bits or infer this fact via the case of s[^1] (if
  ## users can even be relied upon to use 'B' for byte and 'b' for bit or have
  ## that be s[^1]).
  ##
  ## If `alwaysBin==true` then scales are always binary-metric, but e.g. "KiB"
  ## is still accepted for clarity.  If the value would exceed the range of
  ## `int64`, `size` saturates to `int64.high`.  Supported metric prefix chars
  ## include k, m, g, t, p, e, z, y (but z & y saturate unless the number is a
  ## small fraction).
  ##
  ## **See also:**
  ## * https://en.wikipedia.org/wiki/Binary_prefix
  ## * `formatSize module<strutils.html>`_ for formatting
  runnableExamples:
    var res: int64  # caller must still know if 'b' refers to bytes|bits
    doAssert parseSize("10.5 MB", res) == 7
    doAssert res == 10_500_000  # decimal metric Mega prefix
    doAssert parseSize("64 mib", res) == 6
    doAssert res == 67108864    # 64 shl 20
    doAssert parseSize("1G/h", res, true) == 2 # '/' stops parse
    doAssert res == 1073741824  # 1 shl 30, forced binary metric
  const prefix = "b" & "kmgtpezy"       # byte|bit & lowCase metric-ish prefixes
  const scaleM = [1.0, 1e3, 1e6, 1e9, 1e12, 1e15, 1e18, 1e21, 1e24] # 10^(3*idx)
  const scaleB = [1.0, 1024, 1048576, 1073741824, 1099511627776.0,  # 2^(10*idx)
                  1125899906842624.0, 1152921504606846976.0,        # ldexp?
                  1.180591620717411303424e21, 1.208925819614629174706176e24]
  var number: float
  var scale = 1.0
  result = parseFloat(s, number)
  # While parseFloat accepts negatives & NaN, we do not: sizes cannot be < 0
  # and NaN would slip past the saturation test below into an undefined
  # float->int64 conversion.  `number != number` is true only for NaN.
  if number < 0 or number != number:
    result = 0
  if result > 0:
    let start = result  # Save spot to maybe unwind white to EOS
    while result < s.len and s[result] in Whitespace:
      inc result
    if result < s.len:  # Illegal starting char => unity
      if (let si = prefix.find(s[result].toLowerAscii); si >= 0):
        inc result      # Now parse the scale
        scale = if alwaysBin: scaleB[si] else: scaleM[si]
        if result < s.len and s[result] == 'i':
          scale = scaleB[si]    # Switch from default to binary-metric
          inc result
        if result < s.len and s[result].toLowerAscii == 'b':
          inc result    # Skip optional '[bB]'
    else:               # Unwind result advancement when there..
      result = start    #..is no unit to the end of `s`.
    let sizeF = number * scale + 0.5    # +0.5 so truncation rounds to nearest
    # Saturate to int64.high when too big
    size = if sizeF > 9223372036854774784.0: int64.high else: sizeF.int64
# Above constant=2^63-1024 avoids C UB; github.com/nim-lang/Nim/issues/20102 or
# stackoverflow.com/questions/20923556/math-pow2-63-1-math-pow2-63-512-is-true
type
InterpolatedKind* = enum ## Describes for `interpolatedFragments`
## which part of the interpolated string is

View File

@@ -50,3 +50,42 @@ block:
doAssert res == @[(17, "9.123456789012344"), (18, "11.123456789012344"),
(17, "9.123456789012344"), (17, "8.123456789012344"),
(16, "9.12345678901234"), (17, "9.123456789012344")]
block:
  # Table-driven checks for `parseSize`.  Each row gives an input string, the
  # expected number of processed chars and the expected parsed value.  `sz` is
  # shared across rows on purpose: failed parses must leave it untouched.
  var sz: int64
  template checkParseSize(s, expectLen, expectVal) =
    ## Parse `s` into `sz`; raise if either the processed length or the
    ## resulting value differs from what the table row expects.
    if (let got = parseSize(s, sz); got != expectLen):
      raise newException(IOError, "got len " & $got & " != " & $expectLen)
    if sz != expectVal:
      raise newException(IOError, "got sz " & $sz & " != " & $expectVal)
  #              STRING    LEN SZ
  # Good, complete parses
  checkParseSize "1  b"   , 4, 1  # Two spaces before the unit are skipped
  checkParseSize "1  B"   , 4, 1
  checkParseSize "1k"     , 2, 1000
  checkParseSize "1 kib"  , 5, 1024
  checkParseSize "1 ki"   , 4, 1024
  checkParseSize "1mi"    , 3, 1048576
  checkParseSize "1 mi"   , 4, 1048576
  checkParseSize "1 mib"  , 5, 1048576
  checkParseSize "1 Mib"  , 5, 1048576
  checkParseSize "1 MiB"  , 5, 1048576
  checkParseSize "1.23GiB", 7, 1320702444 # 1320702443.52 rounded
  checkParseSize "0.001k" , 6, 1
  checkParseSize "0.0004k", 7, 0
  checkParseSize "0.0006k", 7, 1
  # Incomplete parses
  checkParseSize "1 "     , 1, 1  # Trailing white IGNORED
  checkParseSize "1  B "  , 4, 1  # Trailing white IGNORED
  checkParseSize "1  B/s" , 4, 1  # Trailing junk IGNORED
  checkParseSize "1 kX"   , 3, 1000
  checkParseSize "1 kiX"  , 4, 1024
  checkParseSize "1j"     , 1, 1  # Unknown prefix IGNORED
  checkParseSize "1 jib"  , 2, 1  # Unknown prefix post space
  checkParseSize "1  ji"  , 3, 1
  # Bad parses; `sz` should stay last good|incomplete value
  checkParseSize "-1b"    , 0, 1  # Negative numbers
  checkParseSize "abc"    , 0, 1  # Non-numeric
  checkParseSize " 12"    , 0, 1  # Leading white
  # Value Edge cases
  checkParseSize "9223372036854775807", 19, int64.high