synthetic package

This introduces a new package `src/synthetic` for generating synthetic
data, currently primarily for benchmarking but other use cases can
emerge.

The synthetic package exports a runtime-dispatched type `Generator` that
can generate data of various types. To start, we have bytes, UTF-8,
and OSC generators. The goal of each generator is to expose knobs to tune the
probabilities of various outcomes. For example, the UTF-8 generator has
a knob to tune the probability of generating 1, 2, 3, or 4-byte UTF-8
sequences.

Ultimately, the goal is to be able to collect probability data
empirically that we can then use for benchmarks so we can optimize
various parts of the codebase on real-world data shape distributions.
This commit is contained in:
Mitchell Hashimoto
2025-05-16 10:14:39 -07:00
parent 4c50a4d487
commit f1c42c9f8c
9 changed files with 497 additions and 288 deletions

View File

@@ -12,10 +12,9 @@ const std = @import("std");
const assert = std.debug.assert;
const Allocator = std.mem.Allocator;
const ArenaAllocator = std.heap.ArenaAllocator;
const ziglyph = @import("ziglyph");
const cli = @import("../cli.zig");
const terminal = @import("../terminal/main.zig");
const synth = @import("synth/main.zig");
const synthetic = @import("../synthetic/main.zig");
const Args = struct {
mode: Mode = .noop,
@@ -102,16 +101,57 @@ pub fn main() !void {
const writer = std.io.getStdOut().writer();
const buf = try alloc.alloc(u8, args.@"buffer-size");
// Build our RNG
const seed: u64 = if (args.seed >= 0) @bitCast(args.seed) else @truncate(@as(u128, @bitCast(std.time.nanoTimestamp())));
var prng = std.Random.DefaultPrng.init(seed);
const rand = prng.random();
// Handle the modes that do not depend on terminal state first.
switch (args.mode) {
.@"gen-ascii" => try genAscii(writer, seed),
.@"gen-utf8" => try genUtf8(writer, seed),
.@"gen-rand" => try genRand(writer, seed),
.@"gen-osc" => try genOsc(writer, seed, 0.5),
.@"gen-osc-valid" => try genOsc(writer, seed, 1.0),
.@"gen-osc-invalid" => try genOsc(writer, seed, 0.0),
.@"gen-ascii" => {
var gen: synthetic.Bytes = .{
.rand = rand,
.alphabet = synthetic.Bytes.Alphabet.ascii,
};
try generate(writer, gen.generator());
},
.@"gen-utf8" => {
var gen: synthetic.Utf8 = .{
.rand = rand,
};
try generate(writer, gen.generator());
},
.@"gen-rand" => {
var gen: synthetic.Bytes = .{ .rand = rand };
try generate(writer, gen.generator());
},
.@"gen-osc" => {
var gen: synthetic.Osc = .{
.rand = rand,
.p_valid = 0.5,
};
try generate(writer, gen.generator());
},
.@"gen-osc-valid" => {
var gen: synthetic.Osc = .{
.rand = rand,
.p_valid = 1.0,
};
try generate(writer, gen.generator());
},
.@"gen-osc-invalid" => {
var gen: synthetic.Osc = .{
.rand = rand,
.p_valid = 0.0,
};
try generate(writer, gen.generator());
},
.noop => try benchNoop(reader, buf),
// Handle the ones that depend on terminal state next
@@ -145,75 +185,14 @@ pub fn main() !void {
}
}
/// Streams random printable ASCII to `writer` until the stream is closed.
/// The alphabet holds letters, digits, and punctuation only, so the output
/// never contains control characters.
fn genAscii(writer: anytype, seed: u64) !void {
    return genData(
        writer,
        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!@#$%^&*()_+-=[]{}|;':\\\",./<>?`~",
        seed,
    );
}
/// Generates an infinite stream of bytes from the given alphabet.
fn genData(writer: anytype, alphabet: []const u8, seed: u64) !void {
var prng = std.Random.DefaultPrng.init(seed);
const rnd = prng.random();
fn generate(
writer: anytype,
gen: synthetic.Generator,
) !void {
var buf: [1024]u8 = undefined;
while (true) {
for (&buf) |*c| {
const idx = rnd.uintLessThanBiased(usize, alphabet.len);
c.* = alphabet[idx];
}
writer.writeAll(&buf) catch |err| switch (err) {
error.BrokenPipe => return, // stdout closed
else => return err,
};
}
}
/// Streams UTF-8 encoded printable code points to `writer` until the
/// stream is closed. Code points are drawn from the `u18` range and
/// rejected until `ziglyph.isPrint` accepts one.
fn genUtf8(writer: anytype, seed: u64) !void {
    var prng = std.Random.DefaultPrng.init(seed);
    const random = prng.random();

    var chunk: [1024]u8 = undefined;
    // Keep encoding while at least four bytes are free, which is the
    // longest UTF-8 sequence a single code point can need.
    const fill_limit = chunk.len - 4;

    while (true) {
        var used: usize = 0;
        while (used <= fill_limit) {
            var candidate = random.int(u18);
            while (!ziglyph.isPrint(candidate)) candidate = random.int(u18);
            used += try std.unicode.utf8Encode(candidate, chunk[used..]);
        }

        writer.writeAll(chunk[0..used]) catch |err| switch (err) {
            error.BrokenPipe => return, // stdout closed
            else => |other| return other,
        };
    }
}
/// Streams synthetic OSC requests to `writer` until the stream is closed.
/// `p_valid` is forwarded to the OSC generator as the probability that a
/// given request is valid.
fn genOsc(writer: anytype, seed: u64, p_valid: f64) !void {
    var prng = std.Random.DefaultPrng.init(seed);
    const osc: synth.OSC = .{
        .rand = prng.random(),
        .p_valid = p_valid,
    };

    var scratch: [1024]u8 = undefined;
    while (true) {
        const request = try osc.next(&scratch);
        writer.writeAll(request) catch |err| switch (err) {
            error.BrokenPipe => return, // stdout closed
            else => |other| return other,
        };
    }
}
fn genRand(writer: anytype, seed: u64) !void {
var prng = std.Random.DefaultPrng.init(seed);
const rnd = prng.random();
var buf: [1024]u8 = undefined;
while (true) {
rnd.bytes(&buf);
writer.writeAll(&buf) catch |err| switch (err) {
const data = try gen.next(&buf);
writer.writeAll(data) catch |err| switch (err) {
error.BrokenPipe => return, // stdout closed
else => return err,
};

View File

@@ -1,15 +0,0 @@
//! Package synth contains functions for generating synthetic data for
//! the purpose of benchmarking, primarily. This can also probably be used
//! for testing and fuzzing (probably generating a corpus rather than
//! directly fuzzing) and more.
//!
//! The synthetic data generators in this package are usually not performant
//! enough to be streamed in real time. They should instead be used to
//! generate a large amount of data in a single go and then streamed
//! from there.
pub const OSC = @import("osc.zig").Generator;
// Reference every declaration of this package so that the tests nested
// in the imported files are picked up by the test runner.
test {
    @import("std").testing.refAllDecls(@This());
}

View File

@@ -1,197 +0,0 @@
const std = @import("std");
const assert = std.debug.assert;
/// Synthetic OSC request generator.
///
/// I tried to balance generality and practicality. I implemented mainly
/// all I need at the time of writing this, but I think this can be iterated
/// over time to be a general purpose OSC generator with a lot of
/// configurability. I limited the configurability to what I need but still
/// tried to lay out the code in a way that it can be extended easily.
pub const Generator = struct {
    /// Random number generator.
    rand: std.Random,

    /// Probability of a valid OSC sequence being generated.
    p_valid: f64 = 1.0,

    /// The only error `next` can return: the request did not fit in the
    /// caller's buffer.
    pub const Error = error{NoSpaceLeft};

    /// We use a FBS as a direct parameter below in non-pub functions,
    /// but we should probably just switch to `[]u8`.
    const FBS = std.io.FixedBufferStream([]u8);

    /// Get the next OSC request in bytes. The generated OSC request will
    /// have the prefix `ESC ]` and the terminator `BEL` (0x07).
    ///
    /// This will generate both valid and invalid OSC requests (based on
    /// the `p_valid` probability value). Invalid requests still have the
    /// prefix and terminator, but the content in between is not a valid
    /// OSC request.
    ///
    /// The buffer must be at least 3 bytes long to accommodate the
    /// prefix and terminator. Returns `error.NoSpaceLeft` when the chosen
    /// request does not fit in `buf`.
    pub fn next(self: *const Generator, buf: []u8) Error![]const u8 {
        assert(buf.len >= 3);
        var fbs: FBS = std.io.fixedBufferStream(buf);
        const writer = fbs.writer();

        // Start OSC (ESC ])
        try writer.writeAll("\x1b]");

        // Determine if we are generating a valid or invalid OSC request.
        switch (self.chooseValidity()) {
            .valid => try self.nextValid(&fbs),
            .invalid => try self.nextInvalid(&fbs),
        }

        // Terminate OSC
        try writer.writeAll("\x07");
        return fbs.getWritten();
    }

    /// Write the payload of a uniformly chosen valid request kind.
    fn nextValid(self: *const Generator, fbs: *FBS) Error!void {
        try self.nextValidExact(fbs, self.rand.enumValue(ValidKind));
    }

    /// Write the payload of the valid request kind `k`.
    fn nextValidExact(self: *const Generator, fbs: *FBS, k: ValidKind) Error!void {
        switch (k) {
            .change_window_title => {
                try fbs.writer().writeAll("0;"); // Set window title
                try self.randomBytes(fbs, 1, fbs.buffer.len);
            },

            .prompt_start => {
                try fbs.writer().writeAll("133;A"); // Start prompt

                // aid
                if (self.rand.boolean()) {
                    try fbs.writer().writeAll(";aid=");
                    try self.randomBytes(fbs, 1, 16);
                }

                // redraw
                if (self.rand.boolean()) {
                    try fbs.writer().writeAll(";redraw=");
                    if (self.rand.boolean()) {
                        try fbs.writer().writeAll("1");
                    } else {
                        try fbs.writer().writeAll("0");
                    }
                }
            },

            .prompt_end => try fbs.writer().writeAll("133;B"), // End prompt
        }
    }

    /// Write the payload of a uniformly chosen invalid request kind.
    fn nextInvalid(self: *const Generator, fbs: *FBS) Error!void {
        switch (self.rand.enumValue(InvalidKind)) {
            .random => try self.randomBytes(fbs, 1, fbs.buffer.len),
            .good_prefix => {
                try fbs.writer().writeAll("133;");
                try self.randomBytes(fbs, 2, fbs.buffer.len);
            },
        }
    }

    /// Generate a random string of bytes up to `max_len` bytes or
    /// until we run out of space in the buffer, whichever is
    /// smaller. One byte is always left free for the terminator, so
    /// fewer than `min_len` bytes (possibly none) are written when the
    /// buffer is nearly full.
    ///
    /// This will avoid the terminator characters (0x1B and 0x07) and
    /// replace them by incrementing them by one.
    fn randomBytes(
        self: *const Generator,
        fbs: *FBS,
        min_len: usize,
        max_len: usize,
    ) Error!void {
        // Saturating subtraction: when the stream is already full there is
        // no room at all, and a plain `-` would underflow and panic instead
        // of letting the caller report NoSpaceLeft for the terminator.
        const available = fbs.buffer.len -| fbs.pos -| 1;
        const len = @min(
            self.rand.intRangeAtMostBiased(usize, min_len, max_len),
            available,
        );

        var rem: usize = len;
        var buf: [1024]u8 = undefined;
        while (rem > 0) {
            self.rand.bytes(&buf);
            std.mem.replaceScalar(u8, &buf, 0x1B, 0x1C);
            std.mem.replaceScalar(u8, &buf, 0x07, 0x08);

            const n = @min(rem, buf.len);
            try fbs.writer().writeAll(buf[0..n]);
            rem -= n;
        }
    }

    /// Choose whether to generate a valid or invalid OSC request based
    /// on the validity probability.
    fn chooseValidity(self: *const Generator) Validity {
        // The sample lies in [0, 1), so `>=` makes the chance of `.valid`
        // exactly `p_valid`: never for 0.0 and always for 1.0.
        return if (self.rand.float(f64) >= self.p_valid)
            .invalid
        else
            .valid;
    }

    const Validity = enum { valid, invalid };

    const ValidKind = enum {
        change_window_title,
        prompt_start,
        prompt_end,
    };

    const InvalidKind = enum {
        /// Literally random bytes. Might even be valid, but probably not.
        random,

        /// A good prefix, but ultimately invalid format.
        good_prefix,
    };
};
/// A fixed seed we can use for our tests to avoid flakes.
const test_seed = 0xC0FFEEEEEEEEEEEE;
// Smoke test: fifty requests with the default settings must all fit in a
// 4 KiB buffer.
test "OSC generator" {
    var prng = std.Random.DefaultPrng.init(test_seed);
    const osc: Generator = .{ .rand = prng.random() };

    var scratch: [4096]u8 = undefined;
    var round: usize = 0;
    while (round < 50) : (round += 1) {
        _ = try osc.next(&scratch);
    }
}
// With `p_valid = 1.0` the payload of every generated request (the bytes
// between the prefix and the terminator) must be accepted by the parser.
test "OSC generator valid" {
    const terminal = @import("../../terminal/main.zig");

    var prng = std.Random.DefaultPrng.init(test_seed);
    const osc: Generator = .{
        .p_valid = 1.0,
        .rand = prng.random(),
    };

    var scratch: [256]u8 = undefined;
    for (0..50) |_| {
        const request = try osc.next(&scratch);
        const payload = request[2 .. request.len - 1];

        var parser: terminal.osc.Parser = .{};
        for (payload) |byte| parser.next(byte);
        try std.testing.expect(parser.end(null) != null);
    }
}
// With `p_valid = 0.0` the payload of every generated request (the bytes
// between the prefix and the terminator) must be rejected by the parser.
test "OSC generator invalid" {
    const terminal = @import("../../terminal/main.zig");

    var prng = std.Random.DefaultPrng.init(test_seed);
    const osc: Generator = .{
        .p_valid = 0.0,
        .rand = prng.random(),
    };

    var scratch: [256]u8 = undefined;
    for (0..50) |_| {
        const request = try osc.next(&scratch);
        const payload = request[2 .. request.len - 1];

        var parser: terminal.osc.Parser = .{};
        for (payload) |byte| parser.next(byte);
        try std.testing.expect(parser.end(null) == null);
    }
}

View File

@@ -182,12 +182,12 @@ test {
_ = @import("surface_mouse.zig");
// Libraries
_ = @import("bench/synth/main.zig");
_ = @import("crash/main.zig");
_ = @import("datastruct/main.zig");
_ = @import("inspector/main.zig");
_ = @import("terminal/main.zig");
_ = @import("terminfo/main.zig");
_ = @import("simd/main.zig");
_ = @import("synthetic/main.zig");
_ = @import("unicode/main.zig");
}

53
src/synthetic/Bytes.zig Normal file
View File

@@ -0,0 +1,53 @@
/// Generates bytes.
const Bytes = @This();
const std = @import("std");
const Generator = @import("Generator.zig");
/// Random number generator.
rand: std.Random,
/// The minimum and maximum length of the generated bytes. The maximum
/// length will be capped to the length of the buffer passed in if the
/// buffer length is smaller.
min_len: usize = 1,
max_len: usize = std.math.maxInt(usize),
/// The possible bytes that can be generated, or null to allow every byte
/// value. A byte that appears more than once in the alphabet is more
/// likely to be generated; that is a side effect of how bytes are picked,
/// not a feature to rely on. The alphabet should not be empty.
alphabet: ?[]const u8 = null,
/// Predefined alphabets that can be assigned to the `alphabet` field.
pub const Alphabet = struct {
    /// ASCII letters, digits, and punctuation. Contains no space and no
    /// control characters.
    pub const ascii = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!@#$%^&*()_+-=[]{}|;':\\\",./<>?`~";
};
/// Expose this byte generator through the runtime-dispatched `Generator`
/// interface. The returned value stores `self`, so `self` must stay alive
/// and in place for as long as the interface value is used.
pub fn generator(self: *Bytes) Generator {
    return Generator.init(self, next);
}
/// Fill the front of `buf` with random bytes and return the filled part.
///
/// The length is drawn from `min_len...max_len` and then capped to
/// `buf.len`, so the result can be shorter than `min_len` when the buffer
/// is small. When an alphabet is set, every returned byte is one of its
/// entries and each entry is equally likely. Only the first 256 entries of
/// a longer alphabet can be selected. An empty alphabet cannot produce any
/// byte, so an empty slice is returned in that case.
///
/// This function never fails; the error union only exists to satisfy the
/// `Generator` interface.
pub fn next(self: *Bytes, buf: []u8) Generator.Error![]const u8 {
    const maybe_alphabet = self.alphabet;

    // Guard against dividing by zero below: nothing can be generated
    // from an alphabet without entries.
    if (maybe_alphabet) |alphabet| {
        if (alphabet.len == 0) return buf[0..0];
    }

    const len = @min(
        self.rand.intRangeAtMostBiased(usize, self.min_len, self.max_len),
        buf.len,
    );
    const result = buf[0..len];
    self.rand.bytes(result);

    if (maybe_alphabet) |alphabet| {
        // A single random byte can address at most 256 entries.
        const n: usize = @min(alphabet.len, 256);

        // Largest multiple of `n` within the byte range. A plain
        // `byte % n` would pick the first `256 % n` entries more often
        // than the others, so bytes at or above the limit are redrawn.
        // When `n` divides 256 the limit is 256 and nothing is redrawn.
        const limit: usize = 256 - (256 % n);

        for (result) |*byte| {
            var value: usize = byte.*;
            while (value >= limit) value = self.rand.int(u8);
            byte.* = alphabet[value % n];
        }
    }

    return result;
}
// The default configuration has `min_len = 1`, so a non-empty buffer must
// always yield at least one byte.
test "bytes" {
    var prng = std.Random.DefaultPrng.init(0);
    var source: Bytes = .{ .rand = prng.random() };

    var scratch: [256]u8 = undefined;
    const produced = try source.generator().next(&scratch);
    try std.testing.expect(produced.len > 0);
}

View File

@@ -0,0 +1,42 @@
/// A common interface for all generators.
const Generator = @This();
const std = @import("std");
const assert = std.debug.assert;
/// For generators, this is the only error that is allowed to be
/// returned by the next function.
pub const Error = error{NoSpaceLeft};
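/// Type-erased pointer to the generator implementation, followed by the
/// function that is called with that pointer to produce the next data.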
ptr: *anyopaque,
nextFn: *const fn (ptr: *anyopaque, buf: []u8) Error![]const u8,
/// Wrap a concrete generator in the type-erased `Generator` interface.
///
/// `pointer` must be a single-item pointer to a struct. `nextFn` is the
/// function that gets called with that pointer each time `next` is called
/// on the returned value. This is normally called by generator
/// implementations rather than by generator users.
pub fn init(
    pointer: anytype,
    comptime nextFn: fn (ptr: @TypeOf(pointer), buf: []u8) Error![]const u8,
) Generator {
    const Impl = @TypeOf(pointer);
    const info = @typeInfo(Impl);
    assert(info == .pointer); // Must be a pointer
    assert(info.pointer.size == .one); // Must be a single-item pointer
    assert(@typeInfo(info.pointer.child) == .@"struct"); // Must point to a struct

    // Trampoline that restores the concrete pointer type before
    // forwarding to the implementation.
    const Erased = struct {
        fn call(opaque_ptr: *anyopaque, out: []u8) Error![]const u8 {
            const impl: Impl = @ptrCast(@alignCast(opaque_ptr));
            return nextFn(impl, out);
        }
    };

    return .{ .ptr = pointer, .nextFn = Erased.call };
}
/// Ask the underlying generator for its next value. The value is written
/// into `buf` and the written data is returned.
pub fn next(self: Generator, buf: []u8) Error![]const u8 {
    const produced = try self.nextFn(self.ptr, buf);
    return produced;
}

221
src/synthetic/Osc.zig Normal file
View File

@@ -0,0 +1,221 @@
/// Generates random terminal OSC requests.
const Osc = @This();
const std = @import("std");
const assert = std.debug.assert;
const Generator = @import("Generator.zig");
const Bytes = @import("Bytes.zig");
/// Valid OSC request kinds that can be generated.
pub const ValidKind = enum {
    /// `0;` followed by random bytes used as the window title.
    change_window_title,
    /// `133;A`, optionally followed by an `aid=` option with a random
    /// value and/or a `redraw=` option set to `0` or `1`.
    prompt_start,
    /// `133;B` with no further content.
    prompt_end,
};
/// Invalid OSC request kinds that can be generated.
pub const InvalidKind = enum {
    /// Literally random bytes. Might even be valid, but probably not.
    /// The bytes never include the ESC (0x1B) or BEL (0x07) values.
    random,
    /// A good prefix (`133;`) followed by random bytes, so the request
    /// is expected to end up with an invalid format.
    good_prefix,
};
/// Random number generator.
rand: std.Random,
/// Probability of a valid OSC sequence being generated.
p_valid: f64 = 1.0,
/// Probabilities of specific valid or invalid OSC request kinds.
/// The probabilities are weighted relative to each other, so they
/// can sum greater than 1.0. A kind of weight 1.0 and a kind of
/// weight 2.0 will have a 2:1 chance of the latter being selected.
p_valid_kind: std.enums.EnumArray(ValidKind, f64) = .initFill(1.0),
p_invalid_kind: std.enums.EnumArray(InvalidKind, f64) = .initFill(1.0),
/// The alphabet used for random payload bytes. It has one entry per byte
/// value, except that ESC (0x1B) and BEL (0x07) are replaced by the next
/// byte value (0x1C and 0x08), so those two bytes can never appear inside
/// a generated payload.
const bytes_alphabet: []const u8 = alphabet: {
    var table: [256]u8 = undefined;

    // Start from the identity mapping ...
    for (&table, 0..) |*entry, value| entry.* = @intCast(value);

    // ... then redirect the two bytes that must not occur.
    table[0x07] = 0x08;
    table[0x1B] = 0x1C;

    const frozen = table;
    break :alphabet &frozen;
};
/// Expose this OSC generator through the runtime-dispatched `Generator`
/// interface. The returned value stores `self`, so `self` must stay alive
/// and in place for as long as the interface value is used.
pub fn generator(self: *Osc) Generator {
    return Generator.init(self, next);
}
/// Produce the next OSC request in `buf` and return the bytes written.
/// Every request starts with `ESC ]` and ends with `BEL` (0x07).
///
/// Whether the content between prefix and terminator is a valid OSC
/// request is decided randomly according to `p_valid`. Invalid requests
/// are framed exactly like valid ones; only their content is not a valid
/// OSC request.
///
/// Returns `error.NoSpaceLeft` when `buf` is shorter than the 3 bytes
/// needed for prefix and terminator, or when the chosen content does not
/// fit in the space between them.
pub fn next(self: *Osc, buf: []u8) Generator.Error![]const u8 {
    const prefix = "\x1b]"; // ESC ]
    const terminator: u8 = 0x07; // BEL
    if (buf.len < prefix.len + 1) return error.NoSpaceLeft;

    // The content is generated directly into its final position, leaving
    // room for the prefix in front and the terminator behind it.
    const payload = try self.nextUnwrapped(buf[prefix.len .. buf.len - 1]);

    @memcpy(buf[0..prefix.len], prefix);
    const end = prefix.len + payload.len;
    buf[end] = terminator;
    return buf[0 .. end + 1];
}
/// Generate the content of a request (without prefix and terminator) at
/// the start of `buf`. First decides between valid and invalid, then
/// picks a concrete kind using the matching weight table.
fn nextUnwrapped(self: *Osc, buf: []u8) Generator.Error![]const u8 {
    switch (self.chooseValidity()) {
        .valid => {
            const weights = &self.p_valid_kind.values;
            const kind = @TypeOf(self.p_valid_kind).Indexer.keyForIndex(
                self.rand.weightedIndex(f64, weights),
            );
            return self.nextUnwrappedValidExact(buf, kind);
        },

        .invalid => {
            const weights = &self.p_invalid_kind.values;
            const kind = @TypeOf(self.p_invalid_kind).Indexer.keyForIndex(
                self.rand.weightedIndex(f64, weights),
            );
            return self.nextUnwrappedInvalidExact(buf, kind);
        },
    }
}
/// Generate the content of the valid request kind `k` at the start of
/// `buf` and return the written bytes. Returns `error.NoSpaceLeft` when
/// a fixed part of the request does not fit.
fn nextUnwrappedValidExact(self: *const Osc, buf: []u8, k: ValidKind) Generator.Error![]const u8 {
    var stream = std.io.fixedBufferStream(buf);
    const out = stream.writer();

    switch (k) {
        .change_window_title => {
            // Set window title: the rest of the buffer is available
            // for the title text.
            try out.writeAll("0;");
            var title_gen = self.bytes();
            const title = try title_gen.next(stream.buffer[stream.pos..]);
            try stream.seekBy(@intCast(title.len));
        },

        .prompt_start => {
            // Start prompt
            try out.writeAll("133;A");

            // Optional `aid` option with a value of at most 16 bytes.
            if (self.rand.boolean()) {
                var aid_gen = self.bytes();
                aid_gen.max_len = 16;
                try out.writeAll(";aid=");
                const aid = try aid_gen.next(stream.buffer[stream.pos..]);
                try stream.seekBy(@intCast(aid.len));
            }

            // Optional `redraw` option with a random boolean value.
            if (self.rand.boolean()) {
                try out.writeAll(";redraw=");
                try out.writeAll(if (self.rand.boolean()) "1" else "0");
            }
        },

        // End prompt
        .prompt_end => try out.writeAll("133;B"),
    }

    return stream.getWritten();
}
/// Generate the content of the invalid request kind `k` at the start of
/// `buf` and return the written bytes.
fn nextUnwrappedInvalidExact(
    self: *const Osc,
    buf: []u8,
    k: InvalidKind,
) Generator.Error![]const u8 {
    var noise = self.bytes();

    switch (k) {
        // Nothing but random bytes.
        .random => return noise.next(buf),

        // A recognizable prefix followed by random bytes.
        .good_prefix => {
            var stream = std.io.fixedBufferStream(buf);
            try stream.writer().writeAll("133;");
            const tail = try noise.next(stream.buffer[stream.pos..]);
            try stream.seekBy(@intCast(tail.len));
            return stream.getWritten();
        },
    }
}
/// Build a byte generator that shares this generator's random source and
/// only emits bytes from `bytes_alphabet`. Callers may adjust the
/// returned value (for example its `max_len`) before using it.
fn bytes(self: *const Osc) Bytes {
    const payload_bytes: Bytes = .{
        .alphabet = bytes_alphabet,
        .rand = self.rand,
    };
    return payload_bytes;
}
/// Choose whether to generate a valid or invalid OSC request based
/// on the validity probability.
fn chooseValidity(self: *const Osc) Validity {
    // The sample lies in [0, 1). Comparing with `>=` makes the chance of
    // `.valid` exactly `p_valid`: a sample of 0.0 can no longer yield a
    // valid request when `p_valid` is 0.0, and 1.0 still always yields
    // a valid one.
    return if (self.rand.float(f64) >= self.p_valid)
        .invalid
    else
        .valid;
}

/// Outcome of the valid/invalid decision made by `chooseValidity`.
const Validity = enum { valid, invalid };
/// A fixed seed we can use for our tests to avoid flakes.
const test_seed = 0xC0FFEEEEEEEEEEEE;
// Smoke test through the type-erased interface: fifty requests with the
// default settings must all fit in a 4 KiB buffer.
test "OSC generator" {
    var prng = std.Random.DefaultPrng.init(test_seed);
    var osc: Osc = .{ .rand = prng.random() };
    const iface = osc.generator();

    var scratch: [4096]u8 = undefined;
    var round: usize = 0;
    while (round < 50) : (round += 1) {
        _ = try iface.next(&scratch);
    }
}
// With `p_valid = 1.0` the content of every generated request (the bytes
// between the prefix and the terminator) must be accepted by the parser.
test "OSC generator valid" {
    const terminal = @import("../terminal/main.zig");

    var prng = std.Random.DefaultPrng.init(test_seed);
    var osc: Osc = .{
        .p_valid = 1.0,
        .rand = prng.random(),
    };

    var scratch: [256]u8 = undefined;
    for (0..50) |_| {
        const request = try osc.next(&scratch);
        const payload = request[2 .. request.len - 1];

        var parser: terminal.osc.Parser = .{};
        for (payload) |byte| parser.next(byte);
        try std.testing.expect(parser.end(null) != null);
    }
}
// With `p_valid = 0.0` the content of every generated request (the bytes
// between the prefix and the terminator) must be rejected by the parser.
test "OSC generator invalid" {
    const terminal = @import("../terminal/main.zig");

    var prng = std.Random.DefaultPrng.init(test_seed);
    var osc: Osc = .{
        .p_valid = 0.0,
        .rand = prng.random(),
    };

    var scratch: [256]u8 = undefined;
    for (0..50) |_| {
        const request = try osc.next(&scratch);
        const payload = request[2 .. request.len - 1];

        var parser: terminal.osc.Parser = .{};
        for (payload) |byte| parser.next(byte);
        try std.testing.expect(parser.end(null) == null);
    }
}

103
src/synthetic/Utf8.zig Normal file
View File

@@ -0,0 +1,103 @@
/// Generates UTF-8.
///
/// This doesn't yet generate multi-codepoint graphemes, but it
/// has the ability to generate a custom distribution of UTF-8
/// encoding lengths (1, 2, 3, or 4 bytes).
const Utf8 = @This();
const std = @import("std");
const assert = std.debug.assert;
const Generator = @import("Generator.zig");
/// Possible UTF-8 encoding lengths. The integer value of each tag is the
/// number of bytes in the encoded sequence.
pub const Utf8Len = enum(u3) {
    /// One byte: code points 0x00 through 0x7F.
    one = 1,
    /// Two bytes: code points 0x80 through 0x7FF.
    two = 2,
    /// Three bytes: code points 0x800 through 0xFFFF.
    three = 3,
    /// Four bytes: code points 0x10000 through 0x10FFFF.
    four = 4,
};
/// Random number generator.
rand: std.Random,
/// The minimum and maximum length of the generated bytes. The maximum
/// length will be capped to the length of the buffer passed in if the
/// buffer length is smaller.
min_len: usize = 1,
max_len: usize = std.math.maxInt(usize),
/// Probability of a specific UTF-8 encoding length being generated.
/// The probabilities are weighted relative to each other, so they
/// can sum greater than 1.0. A length of weight 1.0 and a length
/// of weight 2.0 will have a 2:1 chance of the latter being
/// selected.
///
/// If a UTF-8 encoding of a chosen length can't fit into the remaining
/// buffer, a smaller length will be chosen. For small buffers this may
/// skew the distribution of lengths.
p_length: std.enums.EnumArray(Utf8Len, f64) = .initFill(1.0),
/// Expose this UTF-8 generator through the runtime-dispatched `Generator`
/// interface. The returned value stores `self`, so `self` must stay alive
/// and in place for as long as the interface value is used.
pub fn generator(self: *Utf8) Generator {
    return Generator.init(self, next);
}
/// Fill the front of `buf` with valid UTF-8 and return the filled part.
///
/// The total byte length is drawn from `min_len...max_len` and then
/// capped to `buf.len`, so the result can be shorter than `min_len` when
/// the buffer is small. The result is filled completely with whole
/// encoded code points whose encoding lengths follow `p_length`. When a
/// chosen length does not fit into the bytes that remain, the next
/// smaller length that fits is used instead.
///
/// This function never fails; the error union only exists to satisfy the
/// `Generator` interface.
pub fn next(self: *Utf8, buf: []u8) Generator.Error![]const u8 {
    // The surrogate range 0xD800...0xDFFF lies inside the three-byte
    // range but cannot be encoded as UTF-8.
    const surrogate_first: u21 = 0xD800;
    const surrogate_count: u21 = 0x800;

    const len = @min(
        self.rand.intRangeAtMostBiased(usize, self.min_len, self.max_len),
        buf.len,
    );
    const result = buf[0..len];

    var rem: usize = len;
    while (rem > 0) {
        // Pick a utf8 byte count to generate.
        const utf8_len: Utf8Len = pick: {
            const Indexer = @TypeOf(self.p_length).Indexer;
            const idx = self.rand.weightedIndex(f64, &self.p_length.values);
            var picked = Indexer.keyForIndex(idx);
            while (@intFromEnum(picked) > rem) {
                // If the chosen length can't fit into the remaining buffer,
                // choose a smaller length. This stops at `.one` at the
                // latest because `rem` is at least 1 here.
                picked = @enumFromInt(@intFromEnum(picked) - 1);
            }
            break :pick picked;
        };

        // Generate a code point that encodes to exactly this length.
        const cp: u21 = switch (utf8_len) {
            .one => self.rand.intRangeAtMostBiased(u21, 0x00, 0x7F),
            .two => self.rand.intRangeAtMostBiased(u21, 0x80, 0x7FF),
            .three => three: {
                // Sample from a range shortened by the number of
                // surrogates and shift everything at or above the
                // surrogate block past it. This stays uniform over the
                // encodable code points and never needs a retry, which
                // would re-roll the length and skew `p_length`.
                const raw = self.rand.intRangeAtMostBiased(
                    u21,
                    0x800,
                    0xFFFF - surrogate_count,
                );
                break :three if (raw >= surrogate_first)
                    raw + surrogate_count
                else
                    raw;
            },
            .four => self.rand.intRangeAtMostBiased(u21, 0x10000, 0x10FFFF),
        };

        assert(std.unicode.utf8CodepointSequenceLength(
            cp,
        ) catch unreachable == @intFromEnum(utf8_len));

        const written = std.unicode.utf8Encode(
            cp,
            result[result.len - rem ..],
        ) catch |err| switch (err) {
            // Both are impossible because the ranges above only contain
            // encodable code points. If not, a bug.
            error.CodepointTooLarge,
            error.Utf8CannotEncodeSurrogateHalf,
            => unreachable,
        };
        rem -= written;
    }

    return result;
}
// The default configuration must produce non-empty output that passes
// the standard library's UTF-8 validation.
test "utf8" {
    var prng = std.Random.DefaultPrng.init(0);
    var source: Utf8 = .{ .rand = prng.random() };

    var scratch: [256]u8 = undefined;
    const produced = try source.generator().next(&scratch);

    try std.testing.expect(produced.len > 0);
    try std.testing.expect(std.unicode.utf8ValidateSlice(produced));
}

23
src/synthetic/main.zig Normal file
View File

@@ -0,0 +1,23 @@
//! The synthetic package contains an abstraction for generating
//! synthetic data. The motivating use case for this package is to
//! generate synthetic data for benchmarking, but it may also expand
//! to other use cases such as fuzzing (e.g. to generate a corpus
//! rather than directly fuzzing).
//!
//! The generators in this package are typically not performant
//! enough to be streamed in real time. They should instead be
//! used to generate a large amount of data in a single go
//! and then streamed from there.
//!
//! The generators are aimed for terminal emulation, but the package
//! is not limited to that and we may want to extract this to a
//! standalone package one day.
pub const Generator = @import("Generator.zig");
pub const Bytes = @import("Bytes.zig");
pub const Utf8 = @import("Utf8.zig");
pub const Osc = @import("Osc.zig");
// Reference every declaration of this package so that the tests nested
// in the imported generator files are picked up by the test runner.
test {
    @import("std").testing.refAllDecls(@This());
}