Merge pull request #6362 from Yawning/feature/argon2id

core/crypto/argon2id: Initial import
2026-07-11 10:29:32 +00:00 · 2026-03-04 14:27:13 +01:00
parent 1a5126c6b7 86b629ba84
commit fc570d3210
8 changed files with 855 additions and 43 deletions
--- a/core/crypto/_blake2/blake2.odin
+++ b/core/crypto/_blake2/blake2.odin
@@ -19,17 +19,12 @@ BLAKE2S_SIZE :: 32
 BLAKE2B_BLOCK_SIZE :: 128
 BLAKE2B_SIZE :: 64

-MAX_SIZE :: 255
-
 Blake2s_Context :: struct {
 	h:            [8]u32,
 	t:            [2]u32,
 	f:            [2]u32,
 	x:            [BLAKE2S_BLOCK_SIZE]byte,
 	nx:           int,
-	ih:           [8]u32,
-	padded_key:   [BLAKE2S_BLOCK_SIZE]byte,
-	is_keyed:     bool,
 	size:         byte,
 	is_last_node: bool,

@@ -42,9 +37,6 @@ Blake2b_Context :: struct {
 	f:            [2]u64,
 	x:            [BLAKE2B_BLOCK_SIZE]byte,
 	nx:           int,
-	ih:           [8]u64,
-	padded_key:   [BLAKE2B_BLOCK_SIZE]byte,
-	is_keyed:     bool,
 	size:         byte,
 	is_last_node: bool,

@@ -87,11 +79,12 @@ BLAKE2B_IV := [8]u64 {

 init :: proc "contextless" (ctx: ^$T, cfg: ^Blake2_Config) {
 	when T == Blake2s_Context {
-		max_size :: BLAKE2S_SIZE
+		MAX_SIZE :: BLAKE2S_SIZE
 	} else when T == Blake2b_Context {
-		max_size :: BLAKE2B_SIZE
+		MAX_SIZE :: BLAKE2B_SIZE
 	}
-	ensure_contextless(cfg.size <= max_size, "blake2: requested output size exceeeds algorithm max")
+	ensure_contextless(cfg.size <= MAX_SIZE, "blake2: requested output size exceeeds algorithm max")
+	ensure_contextless(len(cfg.key) <= MAX_SIZE, "blake2: requested key size exceeeds algorithm max")

 	// To save having to allocate a scratch buffer, use the internal
 	// data buffer (`ctx.x`), as it is exactly the correct size.
@@ -152,17 +145,11 @@ init :: proc "contextless" (ctx: ^$T, cfg: ^Blake2_Config) {
 		ctx.is_last_node = true
 	}
 	if len(cfg.key) > 0 {
-		copy(ctx.padded_key[:], cfg.key)
-		update(ctx, ctx.padded_key[:])
-		ctx.is_keyed = true
+		copy(ctx.x[:], cfg.key)
+		ctx.nx = len(ctx.x)
+	} else {
+		ctx.nx = 0
 	}
-	copy(ctx.ih[:], ctx.h[:])
-	copy(ctx.h[:], ctx.ih[:])
-	if ctx.is_keyed {
-		update(ctx, ctx.padded_key[:])
-	}
-
-	ctx.nx = 0

 	ctx.is_initialized = true
 }
@@ -172,22 +159,22 @@ update :: proc "contextless" (ctx: ^$T, p: []byte) {

 	p := p
 	when T == Blake2s_Context {
-		block_size :: BLAKE2S_BLOCK_SIZE
+		BLOCK_SIZE :: BLAKE2S_BLOCK_SIZE
 	} else when T == Blake2b_Context {
-		block_size :: BLAKE2B_BLOCK_SIZE
+		BLOCK_SIZE :: BLAKE2B_BLOCK_SIZE
 	}

-	left := block_size - ctx.nx
+	left := BLOCK_SIZE - ctx.nx
 	if len(p) > left {
 		copy(ctx.x[ctx.nx:], p[:left])
 		p = p[left:]
 		blocks(ctx, ctx.x[:])
 		ctx.nx = 0
 	}
-	if len(p) > block_size {
-		n := len(p) &~ (block_size - 1)
+	if len(p) > BLOCK_SIZE {
+		n := len(p) &~ (BLOCK_SIZE - 1)
 		if n == len(p) {
-			n -= block_size
+			n -= BLOCK_SIZE
 		}
 		blocks(ctx, p[:n])
 		p = p[n:]
@@ -228,12 +215,6 @@ reset :: proc "contextless" (ctx: ^$T) {

@(private)
 blake2s_final :: proc "contextless" (ctx: ^Blake2s_Context, hash: []byte) {
-	if ctx.is_keyed {
-		for i := 0; i < len(ctx.padded_key); i += 1 {
-			ctx.padded_key[i] = 0
-		}
-	}
-
 	dec := BLAKE2S_BLOCK_SIZE - u32(ctx.nx)
 	if ctx.t[0] < dec {
 		ctx.t[1] -= 1
@@ -254,17 +235,11 @@ blake2s_final :: proc "contextless" (ctx: ^Blake2s_Context, hash: []byte) {
 	for i := 0; i < BLAKE2S_SIZE / 4; i += 1 {
 		endian.unchecked_put_u32le(dst[i * 4:], ctx.h[i])
 	}
-	copy(hash, dst[:])
+	copy(hash, dst[:ctx.size])
 }

@(private)
 blake2b_final :: proc "contextless" (ctx: ^Blake2b_Context, hash: []byte) {
-	if ctx.is_keyed {
-		for i := 0; i < len(ctx.padded_key); i += 1 {
-			ctx.padded_key[i] = 0
-		}
-	}
-
 	dec := BLAKE2B_BLOCK_SIZE - u64(ctx.nx)
 	if ctx.t[0] < dec {
 		ctx.t[1] -= 1
--- a/core/crypto/argon2id/argon2id.odin
+++ b/core/crypto/argon2id/argon2id.odin
@@ -0,0 +1,622 @@
+/*
+package argon2id implements the Argon2id password hashing algorithm.
+
+See: [[ https://datatracker.ietf.org/doc/rfc9106/ ]]
+*/
+package argon2id
+
+import "core:crypto/blake2b"
+import "core:encoding/endian"
+import "core:math/bits"
+import "core:mem"
+
+// Implementation based on the RFC, Monocypher (CC0-1.0), and the reference
+// code (CC0-1.0).
+
+// MAX_INPUT_SIZE is the maximum size of the various inputs (password,
+// salt, secret, ad) in bytes.
+MAX_INPUT_SIZE :: (1 << 32) - 1
+
+// MIN_PARALLELISM is the minimum allowed parallelism.
+MIN_PARALLELISM :: 1
+// MAX_PARALLELISM is the maximum allowed parallelism.
+MAX_PARALLELISM :: (1 << 24) - 1
+
+// MIN_TAG_SIZE is the minimum digest size in bytes.
+MIN_TAG_SIZE :: 4
+// MAX_TAG_SIZE is the maximum digest size in bytes.
+MAX_TAG_SIZE :: (1 << 32) - 1
+
+// RECOMMENDED_TAG_SIZE is the recommended tag size in bytes.
+RECOMMENDED_TAG_SIZE :: 32 // 256-bits
+// RECOMMENDED_SALT_SIZE is the recommended salt size in bytes.
+RECOMMENDED_SALT_SIZE :: 16 // 128-bits
+
+@(private)
+V_RFC9106 :: 0x13
+@(private)
+Y_ID :: 0x02
+@(private)
+BLOCK_SIZE_BYTES :: 1024
+@(private)
+BLOCK_SIZE_U64 :: 128
+
+// PARAMS_RFC9106 is the first recommended "uniformly safe" parameter set
+// per RFC 9106.
+@(rodata)
+PARAMS_RFC9106 := Parameters{
+	memory_size = 2 * 1024 * 1024, // 2 GiB
+	passes      = 1,
+	parallelism = 4,
+}
+
+// PARAMS_RFC9106_SMALL is the second recommended "uniformly safe" parameter
+// set per RFC 9106 tailored for memory constrained environments.
+@(rodata)
+PARAMS_RFC9106_SMALL := Parameters{
+	memory_size = 64 * 1024, // 64 MiB
+	passes      = 3,
+	parallelism = 4,
+}
+
+// PARAMS_OWASP is one of the recommended parameter sets from the OWASP
+// Password Storage Cheat Sheet (as of 2026/02).  The cheat sheet contains
+// additional variations to this parameter set with various trade-offs
+// between `memory_size` and `passes` that are intended to provide
+// equivalent security.
+//
+// See: [[ https://cheatsheetseries.owasp.org/cheatsheets/Password_Storage_Cheat_Sheet.html ]]
+@(rodata)
+PARAMS_OWASP := Parameters{
+	memory_size = 19 * 1024, // 19 MiB
+	passes      = 2,
+	parallelism = 1,
+}
+
+// PARAMS_OWASP_SMALL is equivalent in strength to PARAMS_OWASP, but
+// trades off less memory use for more CPU usage.
+@(rodata)
+PARAMS_OWASP_SMALL := Parameters{
+	memory_size = 7 * 1024, // 7 MiB
+	passes      = 5,
+	parallelism = 1,
+}
+
+// Parameters is an Argon2id parameter set.
+Parameters :: struct {
+	memory_size: u32,  // m (KiB)
+	passes:      u32,  // t
+	parallelism: u32,  // p
+}
+
+@(private)
+Block :: [BLOCK_SIZE_U64]u64
+
+// derive invokes Argon2id with the specified parameter set and inputs,
+// and outputs the derived key to dst.
+@(require_results)
+derive :: proc(
+	parameters: ^Parameters,
+	password:   []byte, // P
+	salt:       []byte, // S
+	dst:        []byte,
+	secret:     []byte = nil, // K (aka `pepper`)
+	ad:         []byte = nil, // X
+	sanitize  := true,
+	allocator := context.allocator, // Not temp as this can be large.
+) -> mem.Allocator_Error #no_bounds_check {
+	if u64(len(password)) > MAX_INPUT_SIZE {
+		panic("crypto/argon2id: invalid password size")
+	}
+	if u64(len(salt)) > MAX_INPUT_SIZE {
+		panic("crypto/argon2id: invalid salt size")
+	}
+	if u64(len(secret)) > MAX_INPUT_SIZE {
+		panic("crypto/argon2id: invalid secret size")
+	}
+	if u64(len(ad)) > MAX_INPUT_SIZE {
+		panic("crypto/argon2id: invalid ad size")
+	}
+	if l := u64(len(dst)); l > MAX_TAG_SIZE || l < MIN_TAG_SIZE {
+		panic("crypto/argon2id: invalid dst size")
+	}
+
+	p, t, m := parameters.parallelism, parameters.passes, u64(parameters.memory_size)
+	if p < MIN_PARALLELISM || p > MAX_PARALLELISM {
+		panic("crypto/argon2id: invalid parallelism")
+	}
+	if t < 1 {
+		panic("crypto/argon2id: invalid passes")
+	}
+	if m < 8 * u64(p) {
+		panic("crypto/argon2id: insufficient memory size")
+	}
+	if m * BLOCK_SIZE_BYTES > u64(max(int)) {
+		panic("crypto/argon2id: excessive memory size")
+	}
+
+	// Allocate the memory as m' 1024-byte blocks, where m' is derived as:
+	// m' = 4 * p * floor (m / 4p)
+	//
+	// For p lanes, the memory is organized in a matrix B[i][j] of
+	// blocks with p rows (lanes) and q = m' / p columns.
+	m_ := 4 * u64(p) * (m / u64(4 * p))
+	b := mem.alloc_bytes_non_zeroed(
+		int(m_) * BLOCK_SIZE_BYTES,
+		alignment = mem.DEFAULT_PAGE_SIZE,
+		allocator = allocator,
+	) or_return
+	defer delete(b, allocator)
+
+	block_buf: [BLOCK_SIZE_BYTES]byte = ---
+
+	blocks := ([^]Block)(raw_data(b))[:m_]
+	segment_size := u32(m_ / u64(p) / 4)
+	lane_size := segment_size * 4
+
+	// Establish H_0 as the 64-byte value as shown below.  If K, X, or S
+	// has zero length, it is just absent, but its length field remains.
+	//
+	// H_0 = H^(64)(LE32(p) || LE32(T) || LE32(m) || LE32(t) ||
+	//     LE32(v) || LE32(y) || LE32(length(P)) || P ||
+	//     LE32(length(S)) || S ||  LE32(length(K)) || K ||
+	//     LE32(length(X)) || X)
+	{
+		ctx: blake2b.Context
+		blake2b.init(&ctx)
+
+		blake2b_update_u32le(&ctx, u32(p))
+		blake2b_update_u32le(&ctx, u32(len(dst)))
+		blake2b_update_u32le(&ctx, parameters.memory_size)
+		blake2b_update_u32le(&ctx, t)
+		blake2b_update_u32le(&ctx, V_RFC9106)
+		blake2b_update_u32le(&ctx, Y_ID)
+		blake2b_update_u32le(&ctx, u32(len(password)))
+		blake2b.update(&ctx, password)
+		blake2b_update_u32le(&ctx, u32(len(salt)))
+		blake2b.update(&ctx, salt)
+		blake2b_update_u32le(&ctx, u32(len(secret)))
+		blake2b.update(&ctx, secret)
+		blake2b_update_u32le(&ctx, u32(len(ad)))
+		blake2b.update(&ctx, ad)
+
+		h_0: [blake2b.DIGEST_SIZE+8]byte
+		blake2b.final(&ctx, h_0[:blake2b.DIGEST_SIZE])
+
+		// Compute B[i][0] for all i ranging from (and including) 0 to (not
+		// including) p.
+		//
+		// B[i][0] = H'^(1024)(H_0 || LE32(0) || LE32(i))
+		//
+		// Compute B[i][1] for all i ranging from (and including) 0 to (not
+		// including) p.
+		//
+		// B[i][1] = H'^(1024)(H_0 || LE32(1) || LE32(i))
+		for l in u32(0) ..< p {
+			for i in u32(0) ..< 2 {
+				endian.unchecked_put_u32le(h_0[blake2b.DIGEST_SIZE:], i)   // LE32({0,1})
+				endian.unchecked_put_u32le(h_0[blake2b.DIGEST_SIZE+4:], l) // LE32(i)
+				h_prime(block_buf[:], h_0[:])
+				blk := &blocks[l * lane_size + i]
+				for j in 0 ..< BLOCK_SIZE_U64 {
+					blk[j] = endian.unchecked_get_u64le(block_buf[j*8:])
+				}
+			}
+		}
+
+		mem.zero_explicit(&h_0, size_of(h_0)) // No longer needed.
+	}
+
+	// Compute B[i][j] for all i ranging from (and including) 0 to (not
+	// including) p and for all j ranging from (and including) 2 to (not
+	// including) q.  The computation MUST proceed slicewise
+	// (Section 3.4): first, blocks from slice 0 are computed for all
+	// lanes (in an arbitrary order of lanes), then blocks from slice 1
+	// are computed, etc.  The block indices l and z are determined for
+	// each i, j differently for Argon2d, Argon2i, and Argon2id.
+	//
+	// B[i][j] = G(B[i][j-1], B[l][z])
+	//
+	// If the number of passes t is larger than 1, we repeat step 5.  We
+	// compute B[i][0] and B[i][j] for all i ranging from (and including)
+	// 0 to (not including) p and for all j ranging from (and including)
+	// 1 to (not including) q.  However, blocks are computed differently
+	// as the old value is XORed with the new one:
+	//
+	// B[i][0] = G(B[i][q-1], B[l][z]) XOR B[i][0];
+	// B[i][j] = G(B[i][j-1], B[l][z]) XOR B[i][j].
+	constant_time := true // Start with constant time indexing.
+	tmp, index_block: Block = ---, ---
+	for pass in u32(0) ..< t {
+		for slice in u32(0) ..< 4 {
+			// The first slice of the first pass has blocks 0 and 1
+			// pre-filled.
+			pass_offset: u32 = pass == 0 && slice == 0 ? 2 : 0
+			slice_offset := slice * segment_size
+
+			// 3.4.1.3.  Argon2id
+			//
+			//    If the pass number is 0 and the slice number is 0 or 1, then compute
+			//    J_1 and J_2 as for Argon2i, else compute J_1 and J_2 as for Argon2d.
+			if slice == 2 {
+				constant_time = false
+			}
+
+			// Each segment can be processed in parallel, as long as
+			// each iteration of the loop completes before proceeding
+			// to the next.  For simplicity we do this in serial
+			// instead of using threads.
+			for segment in u32(0) ..< u32(p) {
+				index_ctr: u64 = 1
+				for block in pass_offset ..< segment_size {
+					// Current and previous blocks (indexes, not pointers)
+					lane_offset := segment * lane_size
+					segment_start := lane_offset + slice_offset
+					current := segment_start + block
+					previous := segment_start - 1
+					switch {
+					case block == 0 && slice_offset == 0:
+						previous += lane_size
+					case:
+						previous += block
+					}
+
+					index_seed: u64
+					if constant_time {
+						// 3.4.1.2.  Argon2i
+						//
+						//    For each segment, we do the following.  First, we compute the value Z
+						//    as:
+						//
+						//    Z= ( LE64(r) || LE64(l) || LE64(sl) || LE64(m') ||
+						//         LE64(t) || LE64(y) )
+						//
+						//                 Figure 11: Input to Compute J1,J2 in Argon2i
+						//
+						//    where
+						//
+						//    r:   the pass number
+						//    l:   the lane number
+						//    sl:  the slice number
+						//    m':  the total number of memory blocks
+						//    t:   the total number of passes
+						//    y:   the Argon2 type (0 for Argon2d, 1 for Argon2i, 2 for Argon2id)
+						//
+						//    Then we compute:
+						//
+						//    q/(128*SL) 1024-byte values
+						//    G(ZERO(1024),G(ZERO(1024),
+						//    Z || LE64(1) || ZERO(968) )),
+						//    G(ZERO(1024),G(ZERO(1024),
+						//    Z || LE64(2) || ZERO(968) )),... ,
+						//    G(ZERO(1024),G(ZERO(1024),
+						//    Z || LE64(q/(128*SL)) || ZERO(968) )),
+						//
+						//    which are partitioned into q/(SL) 8-byte values X, which are viewed
+						//    as X1||X2 and converted to J_1=int32(X1) and J_2=int32(X2).
+						//
+						//    The values r, l, sl, m', t, y, and i are represented as 8 bytes in
+						//    little endian.
+						if block == pass_offset || (block % 128) == 0 {
+							mem.zero(&index_block, size_of(index_block))
+							index_block[0] = u64(pass)
+							index_block[1] = u64(segment)
+							index_block[2] = u64(slice)
+							index_block[3] = u64(lane_size * p)
+							index_block[4] = u64(t) // passes
+							index_block[5] = Y_ID
+							index_block[6] = index_ctr
+							index_ctr += 1
+
+							copy(tmp[:], index_block[:])
+							g_rounds(&index_block)
+							xor_block(&index_block, &tmp)
+							copy(tmp[:], index_block[:])
+							g_rounds(&index_block)
+							xor_block(&index_block, &tmp)
+						}
+						index_seed = index_block[block % 128]
+					} else {
+						// 3.4.1.1.  Argon2d
+						//
+						//    J_1 is given by the first 32 bits of block B[i][j-1], while J_2 is
+						//    given by the next 32 bits of block B[i][j-1]:
+						//
+						//    J_1 = int32(extract(B[i][j-1], 0))
+						//    J_2 = int32(extract(B[i][j-1], 1))
+						//
+						//                   Figure 10: Deriving J1,J2 in Argon2d
+						index_seed = blocks[previous][0]
+					}
+
+					// 3.4.2.  Mapping J_1 and J_2 to Reference Block Index [l][z]
+					//
+					//    The value of l = J_2 mod p gives the index of the lane from which the
+					//    block will be taken.  For the first pass (r=0) and the first slice
+					//    (sl=0), the block is taken from the current lane.
+					//
+					//    The set W contains the indices that are referenced according to the
+					//    following rules:
+					//
+					//    1.  If l is the current lane, then W includes the indices of all
+					//        blocks in the last SL - 1 = 3 segments computed and finished, as
+					//        well as the blocks computed in the current segment in the current
+					//        pass excluding B[i][j-1].
+					//
+					//    2.  If l is not the current lane, then W includes the indices of all
+					//        blocks in the last SL - 1 = 3 segments computed and finished in
+					//        lane l.  If B[i][j] is the first block of a segment, then the
+					//        very last index from W is excluded.
+					//
+					//    Then take a block from W with a nonuniform distribution over [0, |W|)
+					//    using the following mapping:
+					//
+					//    J_1 -> |W|(1 - J_1^2 / 2^(64))
+					//
+					//                           Figure 12: Computing J1
+					//
+					//    To avoid floating point computation, the following approximation is
+					//    used:
+					//
+					//    x = J_1^2 / 2^(32)
+					//    y = (|W| * x) / 2^(32)
+					//    zz = |W| - 1 - y
+					//
+					//                       Figure 13: Computing J1, Part 2
+					//
+					//    Then take the zz-th index from W; it will be the z value for the
+					//    reference block index [l][z].
+					next_slice: u32 = ((slice + 1) % 4) * segment_size
+					window_start, nb_segments: u32
+					lane := u32(index_seed >> 32) % p
+					switch {
+					case pass == 0:
+						nb_segments = slice
+						if slice == 0 {
+							lane = segment
+						}
+					case:
+						window_start = next_slice
+						nb_segments = 3
+					}
+					window_size := nb_segments * segment_size
+					if lane == segment {
+						window_size += block - 1
+					} else if block == 0 {
+						window_size += ~u32(0)
+					}
+
+					j1 := index_seed & 0xffffffff
+					x := (j1 * j1) >> 32
+					y := (u64(window_size) * x) >> 32
+					z := (u64(window_size) - 1) - y
+					ref := u32((u64(window_start) + z) % u64(lane_size))
+					reference: u32 = lane * lane_size + ref
+
+					copy(tmp[:], blocks[previous][:])
+					xor_block(&tmp, &blocks[reference])
+					if pass == 0 {
+						copy(blocks[current][:], tmp[:])
+					} else {
+						xor_block(&blocks[current], &tmp)
+					}
+					g_rounds(&tmp)
+					xor_block(&blocks[current], &tmp)
+				}
+			}
+		}
+	}
+	mem.zero_explicit(&tmp, size_of(tmp))
+	mem.zero_explicit(&index_block, size_of(index_block))
+
+	// After t steps have been iterated, the final block C is computed
+	// as the XOR of the last column:
+	//
+	// C = B[0][q-1] XOR B[1][q-1] XOR ... XOR B[p-1][q-1]
+	idx := lane_size - 1
+	last_block := &blocks[idx]
+	for _ in 1 ..< p {
+		idx += lane_size
+		next_block := &blocks[idx]
+		xor_block(next_block, last_block)
+		last_block = next_block
+	}
+
+	for v, i in last_block {
+		endian.unchecked_put_u64le(block_buf[i*8:], v)
+	}
+
+	// The output tag is computed as H'^T(C).
+	h_prime(dst, block_buf[:])
+	mem.zero_explicit(&block_buf, size_of(block_buf))
+
+	// Sanitize the working memory.  While the RFC implies that this is
+	// optional ("enable the memory-wiping option in the library call"),
+	// the reference code defaults to enabling it.
+	//
+	// An opt-out is provided, as this can get somewhat expensive when
+	// m gets large.
+	if sanitize {
+		mem.zero_explicit(raw_data(b), len(b))
+	}
+
+	return nil
+}
+
+@(private)
+xor_block :: #force_inline proc(dst, src: ^Block) {
+	for v, i in src {
+		dst[i] ~= v
+	}
+}
+
+@(private)
+blake2b_update_u32le :: #force_inline proc(ctx: ^blake2b.Context, i: u32) {
+	tmp: [4]byte = ---
+	endian.unchecked_put_u32le(tmp[:], i)
+	blake2b.update(ctx, tmp[:])
+	mem.zero_explicit(&tmp, size_of(tmp)) // Probably overkill.
+}
+
+// 3.3.  Variable-Length Hash Function H'
+//
+//    Let V_i be a 64-byte block and W_i be its first 32 bytes.  Then we
+//    define function H' as follows:
+//
+//            if T <= 64
+//                H'^T(A) = H^T(LE32(T)||A)
+//            else
+//                r = ceil(T/32)-2
+//                V_1 = H^(64)(LE32(T)||A)
+//                V_2 = H^(64)(V_1)
+//                ...
+//                V_r = H^(64)(V_{r-1})
+//                V_{r+1} = H^(T-32*r)(V_{r})
+//                H'^T(X) = W_1 || W_2 || ... || W_r || V_{r+1}
+//
+//         Figure 8: Function H' for Tag and Initial Block Computations
+@(private)
+h_prime :: proc(dst, src: []byte) {
+	t := len(dst)
+	ctx: blake2b.Context
+	blake2b.init(&ctx, min(t, blake2b.DIGEST_SIZE))
+	blake2b_update_u32le(&ctx, u32(t))
+	blake2b.update(&ctx, src)
+	blake2b.final(&ctx, dst)
+
+	if t > 64 {
+		r := u32((u64(t) + 31) >> 5) - 2
+		i: u32 = 1
+		off_in := 0
+		off_out := 32
+		for i < r {
+			blake2b.init(&ctx, blake2b.DIGEST_SIZE)
+			blake2b.update(&ctx, dst[off_in:off_in+64])
+			blake2b.final(&ctx, dst[off_out:])
+			i += 1
+			off_in += 32
+			off_out += 32
+		}
+		blake2b.init(&ctx, t - int(32 * r))
+		blake2b.update(&ctx, dst[off_in:off_in+64])
+		blake2b.final(&ctx, dst[off_out:])
+	}
+}
+
+// GB(a, b, c, d) is defined as follows:
+//
+//         a = (a + b + 2 * trunc(a) * trunc(b)) mod 2^(64)
+//         d = (d XOR a) >>> 32
+//         c = (c + d + 2 * trunc(c) * trunc(d)) mod 2^(64)
+//         b = (b XOR c) >>> 24
+//
+//         a = (a + b + 2 * trunc(a) * trunc(b)) mod 2^(64)
+//         d = (d XOR a) >>> 16
+//         c = (c + d + 2 * trunc(c) * trunc(d)) mod 2^(64)
+//         b = (b XOR c) >>> 63
+//
+//                        Figure 19: Details of GB
+//
+// The modular additions in GB are combined with 64-bit multiplications.
+// Multiplications are the only difference from the original BLAKE2b
+// design.  This choice is done to increase the circuit depth and thus
+// the running time of ASIC implementations, while having roughly the
+// same running time on CPUs thanks to parallelism and pipelining.
+@(private,require_results)
+gb :: #force_inline proc(a, b, c, d: u64) -> (u64, u64, u64, u64) {
+	a, b, c, d := a, b, c, d
+
+	trunc := #force_inline proc(v: u64) -> u64 {
+		return u64(u32(v))
+	}
+
+	a += b + ((trunc(a) * trunc(b)) << 1)
+	d = bits.rotate_left64(d ~ a, 32) // >>> 32
+	c += d + ((trunc(c) * trunc(d)) << 1)
+	b = bits.rotate_left64((b ~ c), 40) // >>> 24
+
+	a += b + ((trunc(a) * trunc(b)) << 1)
+	d = bits.rotate_left64(d ~ a, 48) // >>> 16
+	c += d + ((trunc(c) * trunc(d)) << 1)
+	b = bits.rotate_left64((b ~ c), 1) // >>> 63
+
+	return a, b, c, d
+}
+
+// 3.6.  Permutation P
+//
+//    Permutation P is based on the round function of BLAKE2b.  The eight
+//    16-byte inputs S_0, S_1, ... , S_7 are viewed as a 4x4 matrix of
+//    64-bit words, where S_i = (v_{2*i+1} || v_{2*i}):
+//
+//             v_0  v_1  v_2  v_3
+//             v_4  v_5  v_6  v_7
+//             v_8  v_9 v_10 v_11
+//            v_12 v_13 v_14 v_15
+//
+//                      Figure 17: Matrix Element Labeling
+//
+//    It works as follows:
+//
+//            GB(v_0, v_4,  v_8, v_12)
+//            GB(v_1, v_5,  v_9, v_13)
+//            GB(v_2, v_6, v_10, v_14)
+//            GB(v_3, v_7, v_11, v_15)
+//
+//            GB(v_0, v_5, v_10, v_15)
+//            GB(v_1, v_6, v_11, v_12)
+//            GB(v_2, v_7,  v_8, v_13)
+//            GB(v_3, v_4,  v_9, v_14)
+//
+//                   Figure 18: Feeding Matrix Elements to GB
+@(private,require_results)
+perm_p :: #force_inline proc(v_0, v_1, v_2, v_3, v_4, v_5, v_6, v_7, v_8, v_9, v_10, v_11, v_12, v_13, v_14, v_15: u64) -> (u64, u64, u64, u64, u64, u64, u64, u64, u64, u64, u64, u64, u64, u64, u64, u64) {
+	v_0, v_1, v_2, v_3, v_4, v_5, v_6, v_7, v_8, v_9, v_10, v_11, v_12, v_13, v_14, v_15 := v_0, v_1, v_2, v_3, v_4, v_5, v_6, v_7, v_8, v_9, v_10, v_11, v_12, v_13, v_14, v_15
+
+	v_0, v_4, v_8, v_12 = gb(v_0, v_4, v_8, v_12)
+	v_1, v_5, v_9, v_13 = gb(v_1, v_5, v_9, v_13)
+	v_2, v_6, v_10, v_14 = gb(v_2, v_6, v_10, v_14)
+	v_3, v_7, v_11, v_15 = gb(v_3, v_7, v_11, v_15)
+
+	v_0, v_5, v_10, v_15 = gb(v_0, v_5, v_10, v_15)
+	v_1, v_6, v_11, v_12 = gb(v_1, v_6, v_11, v_12)
+	v_2, v_7, v_8, v_13 = gb(v_2, v_7, v_8, v_13)
+	v_3, v_4, v_9, v_14 = gb(v_3, v_4, v_9, v_14)
+
+	return v_0, v_1, v_2, v_3, v_4, v_5, v_6, v_7, v_8, v_9, v_10, v_11, v_12, v_13, v_14, v_15
+}
+
+// 3.5.  Compression Function G
+//
+//    The compression function G is built upon the BLAKE2b-based
+//    transformation P.  P operates on the 128-byte input, which can be
+//    viewed as eight 16-byte registers:
+//
+//    P(A_0, A_1, ... ,A_7) = (B_0, B_1, ... ,B_7)
+//
+//                      Figure 14: Blake Round Function P
+//
+//    The compression function G(X, Y) operates on two 1024-byte blocks X
+//    and Y.  It first computes R = X XOR Y.  Then R is viewed as an 8x8
+//    matrix of 16-byte registers R_0, R_1, ... , R_63.  Then P is first
+//    applied to each row, and then to each column to get Z:
+//
+//    ( Q_0,  Q_1,  Q_2, ... ,  Q_7) <- P( R_0,  R_1,  R_2, ... ,  R_7)
+//    ( Q_8,  Q_9, Q_10, ... , Q_15) <- P( R_8,  R_9, R_10, ... , R_15)
+//                                  ...
+//    (Q_56, Q_57, Q_58, ... , Q_63) <- P(R_56, R_57, R_58, ... , R_63)
+//    ( Z_0,  Z_8, Z_16, ... , Z_56) <- P( Q_0,  Q_8, Q_16, ... , Q_56)
+//    ( Z_1,  Z_9, Z_17, ... , Z_57) <- P( Q_1,  Q_9, Q_17, ... , Q_57)
+//                                  ...
+//    ( Z_7, Z_15, Z_23, ... , Z_63) <- P( Q_7, Q_15, Q_23, ... , Q_63)
+//
+//                  Figure 15: Core of Compression Function G
+@(private)
+g_rounds :: proc(b: ^Block) {
+	for i := 0; i < 128; i += 16 {
+		b[i], b[i+1], b[i+2], b[i+3], b[i+4], b[i+5], b[i+6], b[i+7], b[i+8], b[i+9], b[i+10], b[i+11], b[i+12], b[i+13], b[i+14], b[i+15] = perm_p(b[i], b[i+1], b[i+2], b[i+3], b[i+4], b[i+5], b[i+6], b[i+7], b[i+8], b[i+9], b[i+10], b[i+11], b[i+12], b[i+13], b[i+14], b[i+15])
+	}
+	for i := 0; i < 16; i += 2 {
+		b[i], b[i+1], b[i+16], b[i+17], b[i+32], b[i+33], b[i+48], b[i+49], b[i+64], b[i+65], b[i+80], b[i+81], b[i+96], b[i+97], b[i+112], b[i+113] = perm_p(b[i], b[i+1], b[i+16], b[i+17], b[i+32], b[i+33], b[i+48], b[i+49], b[i+64], b[i+65], b[i+80], b[i+81], b[i+96], b[i+97], b[i+112], b[i+113])
+	}
+}
--- a/core/crypto/blake2b/blake2b.odin
+++ b/core/crypto/blake2b/blake2b.odin
@@ -28,13 +28,24 @@ Context :: _blake2.Blake2b_Context

 // init initializes a Context with the default BLAKE2b config.
 init :: proc(ctx: ^Context, digest_size := DIGEST_SIZE) {
-	ensure(digest_size <= _blake2.MAX_SIZE, "crypto/blake2b: invalid digest size")
+	ensure(digest_size <= DIGEST_SIZE, "crypto/blake2b: invalid digest size")

 	cfg: _blake2.Blake2_Config
 	cfg.size = u8(digest_size)
 	_blake2.init(ctx, &cfg)
 }

+// init_mac initializes a Context with a user provided key.
+init_mac :: proc(ctx: ^Context, key: []byte, digest_size := DIGEST_SIZE) {
+	ensure(digest_size <= DIGEST_SIZE, "crypto/blake2b: invalid digest size")
+	ensure(len(key) <= DIGEST_SIZE, "crypto/blake2b: invalid key size")
+
+	cfg: _blake2.Blake2_Config
+	cfg.size = u8(digest_size)
+	cfg.key = key
+	_blake2.init(ctx, &cfg)
+}
+
 // update adds more data to the Context.
 update :: proc(ctx: ^Context, data: []byte) {
 	_blake2.update(ctx, data)
--- a/core/crypto/blake2s/blake2s.odin
+++ b/core/crypto/blake2s/blake2s.odin
@@ -28,13 +28,24 @@ Context :: _blake2.Blake2s_Context

 // init initializes a Context with the default BLAKE2s config.
 init :: proc(ctx: ^Context, digest_size := DIGEST_SIZE) {
-	ensure(digest_size <= _blake2.MAX_SIZE, "crypto/blake2s: invalid digest size")
+	ensure(digest_size <= DIGEST_SIZE, "crypto/blake2s: invalid digest size")

 	cfg: _blake2.Blake2_Config
 	cfg.size = u8(digest_size)
 	_blake2.init(ctx, &cfg)
 }

+// init_mac initializes a Context with a user provided key.
+init_mac :: proc(ctx: ^Context, key: []byte, digest_size := DIGEST_SIZE) {
+	ensure(digest_size <= DIGEST_SIZE, "crypto/blake2s: invalid digest size")
+	ensure(len(key) <= DIGEST_SIZE, "crypto/blake2s: invalid key size")
+
+	cfg: _blake2.Blake2_Config
+	cfg.size = u8(digest_size)
+	cfg.key = key
+	_blake2.init(ctx, &cfg)
+}
+
 // update adds more data to the Context.
 update :: proc(ctx: ^Context, data: []byte) {
 	_blake2.update(ctx, data)
--- a/examples/all/all_js.odin
+++ b/examples/all/all_js.odin
@@ -27,6 +27,7 @@ package all
@(require) import "core:crypto/aead"
@(require) import "core:crypto/aegis"
@(require) import "core:crypto/aes"
+@(require) import "core:crypto/argon2id"
@(require) import "core:crypto/blake2b"
@(require) import "core:crypto/blake2s"
@(require) import "core:crypto/chacha20"
--- a/examples/all/all_main.odin
+++ b/examples/all/all_main.odin
@@ -32,6 +32,7 @@ package all
@(require) import "core:crypto/aead"
@(require) import "core:crypto/aegis"
@(require) import "core:crypto/aes"
+@(require) import "core:crypto/argon2id"
@(require) import "core:crypto/blake2b"
@(require) import "core:crypto/blake2s"
@(require) import "core:crypto/chacha20"
--- a/tests/core/crypto/test_core_crypto_hash.odin
+++ b/tests/core/crypto/test_core_crypto_hash.odin
@@ -5,6 +5,9 @@ import "core:bytes"
 import "core:encoding/hex"
 import "core:strings"
 import "core:testing"
+
+import "core:crypto/blake2b"
+import "core:crypto/blake2s"
 import "core:crypto/hash"

@(test)
@@ -596,4 +599,139 @@ test_hash :: proc(t: ^testing.T) {
 			c_str,
 		)
 	}
-}
+}
+
+@(private="file")
+selftest_seq :: proc(dst: []byte, seed: u32) {
+	a := 0xdead4bad * seed
+	b: u32 = 1
+
+	for i in 0 ..< len(dst) {
+		a, b = b, a + b
+		dst[i] = byte(b >> 24)
+	}
+}
+
+@(test)
+test_blake2b_self :: proc(t: ^testing.T) {
+	expected := []byte{
+		0xC2, 0x3A, 0x78, 0x00, 0xD9, 0x81, 0x23, 0xBD,
+		0x10, 0xF5, 0x06, 0xC6, 0x1E, 0x29, 0xDA, 0x56,
+		0x03, 0xD7, 0x63, 0xB8, 0xBB, 0xAD, 0x2E, 0x73,
+		0x7F, 0x5E, 0x76, 0x5A, 0x7B, 0xCC, 0xD4, 0x75,
+	}
+	md_lens := []int{20, 32, 48, 64}
+	src_lens := []int{0, 3, 128, 129, 255, 1024}
+
+	b2b := proc(dst, src: []byte) {
+		ctx: blake2b.Context
+
+		blake2b.init(&ctx, len(dst))
+		blake2b.update(&ctx, src)
+		blake2b.final(&ctx, dst)
+	}
+	b2b_keyed := proc(dst, key, src: []byte) {
+		ctx: blake2b.Context
+
+		blake2b.init_mac(&ctx, key, len(dst))
+		blake2b.update(&ctx, src)
+		blake2b.final(&ctx, dst)
+	}
+
+	buf: [1024]byte
+	md, key: [64]byte
+
+	ctx: blake2b.Context
+	blake2b.init(&ctx, 32)
+
+	for md_len in md_lens {
+		dst := md[:md_len]
+		for src_len in src_lens {
+			src := buf[:src_len]
+
+			selftest_seq(src, u32(src_len))
+			b2b(dst, src)
+			blake2b.update(&ctx, dst)
+
+			k := key[:md_len]
+			selftest_seq(k, u32(md_len))
+			b2b_keyed(dst, k, src)
+			blake2b.update(&ctx, dst)
+		}
+	}
+
+	blake2b.final(&ctx, md[:32])
+
+	expected_str := string(hex.encode(expected, context.temp_allocator))
+	actual_str := string(hex.encode(md[:32], context.temp_allocator))
+
+	testing.expectf(
+		t,
+		expected_str == actual_str,
+		"blake2b/self-test: Expected: %s Got %s",
+		expected_str,
+		actual_str,
+	)
+}
+
+@(test)
+test_blake2s_self :: proc(t: ^testing.T) {
+	expected := []byte{
+		0x6A, 0x41, 0x1F, 0x08, 0xCE, 0x25, 0xAD, 0xCD,
+		0xFB, 0x02, 0xAB, 0xA6, 0x41, 0x45, 0x1C, 0xEC,
+		0x53, 0xC5, 0x98, 0xB2, 0x4F, 0x4F, 0xC7, 0x87,
+		0xFB, 0xDC, 0x88, 0x79, 0x7F, 0x4C, 0x1D, 0xFE,
+	}
+	md_lens := []int{16, 20, 28, 32}
+	src_lens := []int{0, 3, 64, 65, 255, 1024}
+
+	b2s := proc(dst, src: []byte) {
+		ctx: blake2s.Context
+
+		blake2s.init(&ctx, len(dst))
+		blake2s.update(&ctx, src)
+		blake2s.final(&ctx, dst)
+	}
+	b2s_keyed := proc(dst, key, src: []byte) {
+		ctx: blake2s.Context
+
+		blake2s.init_mac(&ctx, key, len(dst))
+		blake2s.update(&ctx, src)
+		blake2s.final(&ctx, dst)
+	}
+
+	buf: [1024]byte
+	md, key: [32]byte
+
+	ctx: blake2s.Context
+	blake2s.init(&ctx)
+
+	for md_len in md_lens {
+		dst := md[:md_len]
+		for src_len in src_lens {
+			src := buf[:src_len]
+
+			selftest_seq(src, u32(src_len))
+			b2s(dst, src)
+			blake2s.update(&ctx, dst)
+
+			k := key[:md_len]
+			selftest_seq(k, u32(md_len))
+			b2s_keyed(dst, k, src)
+			blake2s.update(&ctx, dst)
+		}
+	}
+
+	blake2s.final(&ctx, md[:])
+
+	expected_str := string(hex.encode(expected, context.temp_allocator))
+	actual_str := string(hex.encode(md[:], context.temp_allocator))
+
+	testing.expectf(
+		t,
+		expected_str == actual_str,
+		"blake2s/self-test: Expected: %s Got %s",
+		expected_str,
+		actual_str,
+	)
+}
--- a/tests/core/crypto/test_core_crypto_kdf.odin
+++ b/tests/core/crypto/test_core_crypto_kdf.odin
@@ -3,10 +3,63 @@ package test_core_crypto
 import "base:runtime"
 import "core:encoding/hex"
 import "core:testing"
+import "core:crypto/argon2id"
 import "core:crypto/hash"
 import "core:crypto/hkdf"
 import "core:crypto/pbkdf2"

+@(test)
+test_argon2id :: proc(t: ^testing.T) {
+	runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD()
+
+	test_vectors := []struct {
+		params:   ^argon2id.Parameters,
+		password: string,
+		salt:     string,
+		secret:   string,
+		ad:       string,
+		tag:      string,
+	} {
+		// RFC 9106 5.3.
+		{
+			&argon2id.Parameters{
+				32,
+				3,
+				4,
+			},
+			"0101010101010101010101010101010101010101010101010101010101010101",
+			"02020202020202020202020202020202",
+			"0303030303030303",
+			"040404040404040404040404",
+			"0d640df58d78766c08c037a34a8b53c9d01ef0452d75b65eb52520e96b01e659",
+		},
+	}
+	for v, _ in test_vectors {
+		tag := make([]byte, len(v.tag)/2, context.temp_allocator)
+
+		password, _ := hex.decode(transmute([]byte)(v.password), context.temp_allocator)
+		salt, _ := hex.decode(transmute([]byte)(v.salt), context.temp_allocator)
+		secret, _ := hex.decode(transmute([]byte)(v.secret), context.temp_allocator)
+		ad, _ := hex.decode(transmute([]byte)(v.ad), context.temp_allocator)
+
+		_ = argon2id.derive(v.params, password, salt, tag, secret, ad)
+
+		tag_str := string(hex.encode(tag, context.temp_allocator))
+
+		testing.expectf(
+			t,
+			tag_str == v.tag,
+			"argon2id: Expected: %s for input of (%s, %s, %s, %s), but got %s instead",
+			v.tag,
+			v.password,
+			v.salt,
+			v.secret,
+			v.ad,
+			tag_str,
+		)
+	}
+}
+
@(test)
 test_hkdf :: proc(t: ^testing.T) {
 	runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD()