Replace normal sort procedure with a simpler unified type-erased one

2026-02-12 22:33:36 +00:00 · 2025-10-07 12:38:01 +01:00
parent 1400952530
commit f40fc2792f
2 changed files with 309 additions and 334 deletions
--- a/core/slice/sort.odin
+++ b/core/slice/sort.odin
@@ -6,6 +6,8 @@ Ordering :: enum {
 	Greater = +1,
 }

+Generic_Cmp :: #type proc(lhs, rhs: rawptr, user_data: rawptr) -> Ordering
+
@(require_results)
 cmp :: proc(a, b: $E) -> Ordering where ORD(E) {
 	switch {
@@ -35,7 +37,16 @@ cmp_proc :: proc($E: typeid) -> (proc(E, E) -> Ordering) where ORD(E) {
 sort :: proc(data: $T/[]$E) where ORD(E) {
 	when size_of(E) != 0 {
 		if n := len(data); n > 1 {
-			_quick_sort_general(data, 0, n, _max_depth(n), struct{}{}, .Ordered)
+			raw := ([^]byte)(raw_data(data))
+			_smoothsort(raw, uint(len(data)), size_of(E), proc(lhs, rhs: rawptr, user_data: rawptr) -> Ordering {
+				x, y := (^E)(lhs)^, (^E)(rhs)^
+				if x < y {
+					return .Less
+				} else if x > y {
+					return .Greater
+				}
+				return .Equal
+			}, nil)
 		}
 	}
 }
@@ -79,7 +90,18 @@ sort_with_indices :: proc(data: $T/[]$E, allocator := context.allocator) -> (ind
 			for _, idx in indices {
 				indices[idx] = idx
 			}
-			_quick_sort_general_with_indices(data, indices, 0, n, _max_depth(n), struct{}{}, .Ordered)
+			raw := ([^]byte)(raw_data(indices))
+			_smoothsort(raw, uint(len(indices)), size_of(int), proc(lhs, rhs: rawptr, user_data: rawptr) -> Ordering {
+				data := ([^]E)(user_data)
+				xi, yi := (^int)(lhs)^, (^int)(rhs)^
+				x, y := data[xi], data[yi]
+				if x < y {
+					return .Less
+				} else if x > y {
+					return .Greater
+				}
+				return .Equal
+			}, raw_data(data))
 		}
 		return indices
 	}
@@ -91,7 +113,39 @@ sort_with_indices :: proc(data: $T/[]$E, allocator := context.allocator) -> (ind
 sort_by :: proc(data: $T/[]$E, less: proc(i, j: E) -> bool) {
 	when size_of(E) != 0 {
 		if n := len(data); n > 1 {
-			_quick_sort_general(data, 0, n, _max_depth(n), less, .Less)
+			raw := ([^]byte)(raw_data(data))
+			_smoothsort(raw, uint(len(data)), size_of(E), proc(lhs, rhs: rawptr, user_data: rawptr) -> Ordering {
+				x, y := (^E)(lhs)^, (^E)(rhs)^
+				less := (proc(E, E) -> bool)(user_data)
+				switch {
+				case less(x, y): return .Less
+				case less(y, x): return .Greater
+				}
+				return .Equal
+			}, rawptr(less))
+		}
+	}
+}
+
+sort_by_with_data :: proc(data: $T/[]$E, less: proc(i, j: E, user_data: rawptr) -> bool, user_data: rawptr) {
+	when size_of(E) != 0 {
+		if n := len(data); n > 1 {
+			Context :: struct {
+				less:      proc(i, j: E, user_data: rawptr) -> bool,
+				user_data: rawptr,
+			}
+			ctx := &Context{less, user_data}
+
+			raw := ([^]byte)(raw_data(data))
+			_smoothsort(raw, uint(len(data)), size_of(E), proc(lhs, rhs: rawptr, user_data: rawptr) -> Ordering {
+				x, y := (^E)(lhs)^, (^E)(rhs)^
+				ctx := (^Context)(user_data)
+				switch {
+				case ctx.less(x, y, ctx.user_data): return .Less
+				case ctx.less(y, x, ctx.user_data): return .Greater
+				}
+				return .Equal
+			}, ctx)
 		}
 	}
 }
@@ -105,8 +159,55 @@ sort_by_with_indices :: proc(data: $T/[]$E, less: proc(i, j: E) -> bool, allocat
 			for _, idx in indices {
 				indices[idx] = idx
 			}
-			_quick_sort_general_with_indices(data, indices, 0, n, _max_depth(n), less, .Less)
-			return indices
+
+			Context :: struct{
+				less: proc(i, j: E) -> bool,
+				data: T,
+			}
+			ctx := &Context{less, data}
+
+			raw := ([^]byte)(raw_data(indices))
+			_smoothsort(raw, uint(len(indices)), size_of(int), proc(lhs, rhs: rawptr, user_data: rawptr) -> Ordering {
+				ctx := (^Context)(user_data)
+				xi, yi := (^int)(lhs)^, (^int)(rhs)^
+				x, y := ctx.data[xi], ctx.data[yi]
+				switch {
+				case ctx.less(x, y): return .Less
+				case ctx.less(y, x): return .Greater
+				}
+				return .Equal
+			}, ctx)
+		}
+	}
+	return indices
+}
+
+sort_by_with_indices_with_data :: proc(data: $T/[]$E, less: proc(i, j: E, user_data: rawptr) -> bool, user_data: rawptr, allocator := context.allocator) -> (indices : []int) {
+	indices = make([]int, len(data), allocator)
+	when size_of(E) != 0 {
+		if n := len(data); n > 1 {
+			for _, idx in indices {
+				indices[idx] = idx
+			}
+
+			Context :: struct{
+				less: proc(i, j: E, user_data: rawptr) -> bool,
+				data: T,
+				user_data: rawptr,
+			}
+			ctx := &Context{less, data, user_data}
+
+			raw := ([^]byte)(raw_data(indices))
+			_smoothsort(raw, uint(len(indices)), size_of(int), proc(lhs, rhs: rawptr, user_data: rawptr) -> Ordering {
+				ctx := (^Context)(user_data)
+				xi, yi := (^int)(lhs)^, (^int)(rhs)^
+				x, y := ctx.data[xi], ctx.data[yi]
+				switch {
+				case ctx.less(x, y, ctx.user_data): return .Less
+				case ctx.less(y, x, ctx.user_data): return .Greater
+				}
+				return .Equal
+			}, ctx)
 		}
 	}
 	return indices
@@ -115,11 +216,47 @@ sort_by_with_indices :: proc(data: $T/[]$E, less: proc(i, j: E) -> bool, allocat
 sort_by_cmp :: proc(data: $T/[]$E, cmp: proc(i, j: E) -> Ordering) {
 	when size_of(E) != 0 {
 		if n := len(data); n > 1 {
-			_quick_sort_general(data, 0, n, _max_depth(n), cmp, .Cmp)
+			raw := ([^]byte)(raw_data(data))
+			_smoothsort(raw, uint(len(data)), size_of(E), proc(lhs, rhs: rawptr, user_data: rawptr) -> Ordering {
+				x, y := (^E)(lhs)^, (^E)(rhs)^
+				cmp := cast(proc(E, E) -> Ordering)(user_data)
+				return cmp(x, y)
+			}, rawptr(cmp))
 		}
 	}
 }

+
+sort_by_cmp_with_data :: proc(data: $T/[]$E, cmp: proc(i, j: E, user_data: rawptr) -> Ordering, user_data: rawptr) {
+	when size_of(E) != 0 {
+		if n := len(data); n > 1 {
+			Context :: struct{
+				cmp: proc(i, j: E, user_data: rawptr) -> Ordering,
+				user_data: rawptr,
+			}
+			ctx := &Context{cmp, user_data}
+
+			raw := ([^]byte)(raw_data(data))
+			_smoothsort(raw, uint(len(data)), size_of(E), proc(lhs, rhs: rawptr, user_data: rawptr) -> Ordering {
+				x, y := (^E)(lhs)^, (^E)(rhs)^
+				ctx := (^Context)(user_data)
+				return ctx.cmp(x, y, ctx.user_data)
+			}, ctx)
+		}
+	}
+}
+
+
+sort_by_generic_cmp :: proc(data: $T/[]$E, cmp: Generic_Cmp, user_data: rawptr) {
+	when size_of(E) != 0 {
+		if n := len(data); n > 1 {
+			raw := ([^]byte)(raw_data(data))
+			_smoothsort(raw, uint(len(data)), size_of(E), cmp, user_data)
+		}
+	}
+}
+
+
 // stable_sort sorts a slice
 stable_sort :: proc(data: $T/[]$E) where ORD(E) {
 	when size_of(E) != 0 {
--- a/core/slice/sort_private.odin
+++ b/core/slice/sort_private.odin
@@ -1,6 +1,7 @@
 #+private
 package slice

+import "base:builtin"
 import "base:intrinsics"
 _ :: intrinsics

@@ -12,171 +13,6 @@ Sort_Kind :: enum {
 	Cmp,
 }

-_quick_sort_general :: proc(data: $T/[]$E, a, b, max_depth: int, call: $P, $KIND: Sort_Kind) where (ORD(E) && KIND == .Ordered) || (KIND != .Ordered) #no_bounds_check {
-	less :: #force_inline proc(a, b: E, call: P) -> bool {
-		when KIND == .Ordered {
-			return a < b
-		} else when KIND == .Less {
-			return call(a, b)
-		} else when KIND == .Cmp {
-			return call(a, b) == .Less
-		} else {
-			#panic("unhandled Sort_Kind")
-		}
-	}
-
-	insertion_sort :: proc(data: $T/[]$E, a, b: int, call: P) #no_bounds_check {
-		for i in a+1..<b {
-			for j := i; j > a && less(data[j], data[j-1], call); j -= 1 {
-				swap(data, j, j-1)
-			}
-		}
-	}
-
-	heap_sort :: proc(data: $T/[]$E, a, b: int, call: P) #no_bounds_check {
-		sift_down :: proc(data: T, lo, hi, first: int, call: P) #no_bounds_check {
-			root := lo
-			for {
-				child := 2*root + 1
-				if child >= hi {
-					break
-				}
-				if child+1 < hi && less(data[first+child], data[first+child+1], call) {
-					child += 1
-				}
-				if !less(data[first+root], data[first+child], call) {
-					return
-				}
-				swap(data, first+root, first+child)
-				root = child
-			}
-		}
-
-
-		first, lo, hi := a, 0, b-a
-
-		for i := (hi-1)/2; i >= 0; i -= 1 {
-			sift_down(data, i, hi, first, call)
-		}
-
-		for i := hi-1; i >= 0; i -= 1 {
-			swap(data, first, first+i)
-			sift_down(data, lo, i, first, call)
-		}
-	}
-
-	median3 :: proc(data: T, m1, m0, m2: int, call: P) #no_bounds_check {
-		if less(data[m1], data[m0], call) {
-			swap(data, m1, m0)
-		}
-		if less(data[m2], data[m1], call) {
-			swap(data, m2, m1)
-			if less(data[m1], data[m0], call) {
-				swap(data, m1, m0)
-			}
-		}
-	}
-
-	do_pivot :: proc(data: T, lo, hi: int, call: P) -> (midlo, midhi: int) #no_bounds_check {
-		m := int(uint(lo+hi)>>1)
-		if hi-lo > 40 {
-			s := (hi-lo)/8
-			median3(data, lo, lo+s, lo+s*2, call)
-			median3(data, m, m-s, m+s, call)
-			median3(data, hi-1, hi-1-s, hi-1-s*2, call)
-		}
-		median3(data, lo, m, hi-1, call)
-
-		pivot := lo
-		a, c := lo+1, hi-1
-
-
-		for ; a < c && less(data[a], data[pivot], call); a += 1 {
-		}
-		b := a
-
-		for {
-			for ; b < c && !less(data[pivot], data[b], call); b += 1 { // data[b] <= pivot
-			}
-			for ; b < c && less(data[pivot], data[c-1], call); c -=1 { // data[c-1] > pivot
-			}
-			if b >= c {
-				break
-			}
-
-			swap(data, b, c-1)
-			b += 1
-			c -= 1
-		}
-
-		protect := hi-c < 5
-		if !protect && hi-c < (hi-lo)/4 {
-			dups := 0
-			if !less(data[pivot], data[hi-1], call) {
-				swap(data, c, hi-1)
-				c += 1
-				dups += 1
-			}
-			if !less(data[b-1], data[pivot], call) {
-				b -= 1
-				dups += 1
-			}
-
-			if !less(data[m], data[pivot], call) {
-				swap(data, m, b-1)
-				b -= 1
-				dups += 1
-			}
-			protect = dups > 1
-		}
-		if protect {
-			for {
-				for ; a < b && !less(data[b-1], data[pivot], call); b -= 1 {
-				}
-				for ; a < b && less(data[a], data[pivot], call); a += 1 {
-				}
-				if a >= b {
-					break
-				}
-				swap(data, a, b-1)
-				a += 1
-				b -= 1
-			}
-		}
-		swap(data, pivot, b-1)
-		return b-1, c
-	}
-
-
-	a, b, max_depth := a, b, max_depth
-
-	for b-a > 12 { // only use shell sort for lengths <= 12
-		if max_depth == 0 {
-			heap_sort(data, a, b, call)
-			return
-		}
-		max_depth -= 1
-		mlo, mhi := do_pivot(data, a, b, call)
-		if mlo-a < b-mhi {
-			_quick_sort_general(data, a, mlo, max_depth, call, KIND)
-			a = mhi
-		} else {
-			_quick_sort_general(data, mhi, b, max_depth, call, KIND)
-			b = mlo
-		}
-	}
-	if b-a > 1 {
-		// Shell short with gap 6
-		for i in a+6..<b {
-			if less(data[i], data[i-6], call) {
-				swap(data, i, i-6)
-			}
-		}
-		insertion_sort(data, a, b, call)
-	}
-}
-
-
 _stable_sort_general :: proc(data: $T/[]$E, call: $P, $KIND: Sort_Kind) where (ORD(E) && KIND == .Ordered) || (KIND != .Ordered) #no_bounds_check {
 	less :: #force_inline proc(a, b: E, call: P) -> bool {
 		when KIND == .Ordered {
@@ -200,179 +36,181 @@ _stable_sort_general :: proc(data: $T/[]$E, call: $P, $KIND: Sort_Kind) where (O
 	}
 }

-_quick_sort_general_with_indices :: proc(data: $T/[]$E, indices: []int, a, b, max_depth: int, call: $P, $KIND: Sort_Kind) where (ORD(E) && KIND == .Ordered) || (KIND != .Ordered) #no_bounds_check {
-	less :: #force_inline proc(a, b: E, call: P) -> bool {
-		when KIND == .Ordered {
-			return a < b
-		} else when KIND == .Less {
-			return call(a, b)
-		} else when KIND == .Cmp {
-			return call(a, b) == .Less
-		} else {
-			#panic("unhandled Sort_Kind")
+@(private)
+_smoothsort :: proc(base: [^]byte, nel: uint, width: uint, cmp: Generic_Cmp, arg: rawptr) {
+	pntz :: proc "contextless" (p: [2]uint) -> int {
+		r := intrinsics.count_trailing_zeros(p[0] - 1)
+		if r != 0 {
+			return int(r)
 		}
+		r = (8*size_of(uint) + intrinsics.count_trailing_zeros(p[1]))
+		if r != 8*size_of(uint) {
+			return int(r)
+		}
+		return 0
 	}

-	insertion_sort :: proc(data: $T/[]$E, indices: []int, a, b: int, call: P) #no_bounds_check {
-		for i in a+1..<b {
-			for j := i; j > a && less(data[j], data[j-1], call); j -= 1 {
-				swap(data, j, j-1)
-				swap(indices, j, j-1)
-			}
+	shl :: proc "contextless" (p: []uint, n: int) {
+		n := n
+		if n >= 8*size_of(uint) {
+			n -= 8*size_of(uint)
+			p[1] = p[0]
+			p[0] = 0
 		}
+		p[1] <<= uint(n)
+		p[0] |= p[0] >> uint(8*size_of(uint) - n)
+		p[0] <<= uint(n)
+	}
+	shr :: proc "contextless" (p: []uint, n: int) {
+		n := n
+		if n >= 8*size_of(uint) {
+			n -= 8*size_of(uint)
+			p[0] = p[1]
+			p[1] = 0
+		}
+		p[0] >>= uint(n)
+		p[0] |= p[1] << uint(8*size_of(uint) - n)
+		p[1] >>= uint(n)
 	}

-	heap_sort :: proc(data: $T/[]$E, indices: []int, a, b: int, call: P) #no_bounds_check {
-		sift_down :: proc(data: T, indices: []int, lo, hi, first: int, call: P) #no_bounds_check {
-			root := lo
-			for {
-				child := 2*root + 1
-				if child >= hi {
-					break
-				}
-				if child+1 < hi && less(data[first+child], data[first+child+1], call) {
-					child += 1
-				}
-				if !less(data[first+root], data[first+child], call) {
-					return
-				}
-				swap(data, first+root, first+child)
-				swap(indices, first+root, first+child)
-				root = child
-			}
-		}
-
-
-		first, lo, hi := a, 0, b-a
-
-		for i := (hi-1)/2; i >= 0; i -= 1 {
-			sift_down(data, indices, i, hi, first, call)
-		}
-
-		for i := hi-1; i >= 0; i -= 1 {
-			swap(data, first, first+i)
-			swap(indices, first, first+i)
-			sift_down(data, indices, lo, i, first, call)
-		}
-	}
-
-	median3 :: proc(data: T, indices: []int, m1, m0, m2: int, call: P) #no_bounds_check {
-		if less(data[m1], data[m0], call) {
-			swap(data, m1, m0)
-			swap(indices, m1, m0)
-		}
-		if less(data[m2], data[m1], call) {
-			swap(data, m2, m1)
-			swap(indices, m2, m1)
-			if less(data[m1], data[m0], call) {
-				swap(data, m1, m0)
-				swap(indices, m1, m0)
-			}
-		}
-	}
-
-	do_pivot :: proc(data: T, indices: []int, lo, hi: int, call: P) -> (midlo, midhi: int) #no_bounds_check {
-		m := int(uint(lo+hi)>>1)
-		if hi-lo > 40 {
-			s := (hi-lo)/8
-			median3(data, indices, lo, lo+s, lo+s*2, call)
-			median3(data, indices, m, m-s, m+s, call)
-			median3(data, indices, hi-1, hi-1-s, hi-1-s*2, call)
-		}
-		median3(data, indices, lo, m, hi-1, call)
-
-		pivot := lo
-		a, c := lo+1, hi-1
-
-
-		for ; a < c && less(data[a], data[pivot], call); a += 1 {
-		}
-		b := a
-
-		for {
-			for ; b < c && !less(data[pivot], data[b], call); b += 1 { // data[b] <= pivot
-			}
-			for ; b < c && less(data[pivot], data[c-1], call); c -=1 { // data[c-1] > pivot
-			}
-			if b >= c {
-				break
-			}
-
-			swap(data, b, c-1)
-			swap(indices, b, c-1)
-			b += 1
-			c -= 1
-		}
-
-		protect := hi-c < 5
-		if !protect && hi-c < (hi-lo)/4 {
-			dups := 0
-			if !less(data[pivot], data[hi-1], call) {
-				swap(data, c, hi-1)
-				swap(indices, c, hi-1)
-				c += 1
-				dups += 1
-			}
-			if !less(data[b-1], data[pivot], call) {
-				b -= 1
-				dups += 1
-			}
-
-			if !less(data[m], data[pivot], call) {
-				swap(data, m, b-1)
-				swap(indices, m, b-1)
-				b -= 1
-				dups += 1
-			}
-			protect = dups > 1
-		}
-		if protect {
-			for {
-				for ; a < b && !less(data[b-1], data[pivot], call); b -= 1 {
-				}
-				for ; a < b && less(data[a], data[pivot], call); a += 1 {
-				}
-				if a >= b {
-					break
-				}
-				swap(data, a, b-1)
-				swap(indices, a, b-1)
-				a += 1
-				b -= 1
-			}
-		}
-		swap(data, pivot, b-1)
-		swap(indices, pivot, b-1)
-		return b-1, c
-	}
-
-	assert(len(data) == len(indices))
-
-	a, b, max_depth := a, b, max_depth
-
-	for b-a > 12 { // only use shell sort for lengths <= 12
-		if max_depth == 0 {
-			heap_sort(data, indices, a, b, call)
+	cycle :: proc "contextless" (width: uint, data: [][^]byte, n: int) {
+		if len(data) < 2 {
 			return
 		}
-		max_depth -= 1
-		mlo, mhi := do_pivot(data, indices, a, b, call)
-		if mlo-a < b-mhi {
-			_quick_sort_general_with_indices(data, indices, a, mlo, max_depth, call, KIND)
-			a = mhi
-		} else {
-			_quick_sort_general_with_indices(data, indices, mhi, b, max_depth, call, KIND)
-			b = mlo
+		buf: [256]u8 = ---
+		data[n] = raw_data(buf[:])
+		width := width
+		for width != 0 {
+			l := builtin.min(size_of(buf), int(width))
+			copy(data[n][:l], data[0][:l])
+			for i in 0..<n {
+				copy(data[i][:l], data[i+1][:l])
+				data[i] = data[i][l:]
+			}
+			width -= uint(l)
 		}
 	}
-	if b-a > 1 {
-		// Shell short with gap 6
-		for i in a+6..<b {
-			if less(data[i], data[i-6], call) {
-				swap(data, i, i-6)
-				swap(indices, i, i-6)
+
+	sift :: proc(head: [^]byte, width: uint, cmp: Generic_Cmp, arg: rawptr, pshift: int, lp: []uint) {
+		head := head
+		buf: [14*size_of(uint)+1][^]byte = ---
+		buf[0] = head
+		i := 1
+		pshift := pshift
+		for pshift > 1 {
+			rt := head[-width:]
+			lf := head[-width:][-lp[pshift - 2]:]
+			if cmp(buf[0], lf, arg) >= .Equal && cmp(buf[0], rt, arg) >= .Equal {
+				break
+			}
+			if cmp(lf, rt, arg) >= .Equal {
+				buf[i], head = lf, lf
+				pshift -= 1
+			} else {
+				buf[i], head = rt, rt
+				pshift -= 2
+			}
+			i += 1
+		}
+		cycle(width, buf[:], i)
+	}
+
+	trinkle :: proc(head: [^]byte, width: uint, cmp: Generic_Cmp, arg: rawptr, pp: []uint, pshift: int, trusty: bool, lp: []uint) {
+		head := head
+
+		p := [2]uint{pp[0], pp[1]}
+
+		buf: [14*size_of(uint)+1][^]byte = ---
+		buf[0] = head
+
+		i := 1
+		trail := 0
+		pshift := pshift
+		trusty := trusty
+		for p[0] != 1 || p[1] != 0 {
+			stepson := head[-lp[pshift]:]
+			if cmp(stepson, buf[0], arg) <= .Equal {
+				break
+			}
+			if !trusty && pshift > 1 {
+				rt := head[-width:]
+				lf := head[-width:][-lp[pshift-2]:]
+				if cmp(rt, stepson, arg) >= .Equal || cmp(lf, stepson, arg) >= .Equal {
+					break
+				}
+			}
+			buf[i] = stepson
+			head = stepson
+			trail = pntz(p)
+			shr(p[:], trail)
+			pshift += trail
+			trusty = false
+			i += 1
+		}
+		if trusty {
+			return
+		}
+		cycle(width, buf[:], i)
+		sift(head, width, cmp, arg, pshift, lp)
+	}
+
+	size := nel * width
+	if size == 0 {
+		return
+	}
+
+	lp: [12*size_of(uint)]uint = ---
+	lp[1] = width
+	lp[0] = lp[1]
+	for i := 2; true; i += 1 {
+		lp[i] = lp[i-2] + lp[i-1] + width
+		if lp[i] >= size {
+			break
+		}
+	}
+
+	head := base
+	high := head[size - width:]
+	p := [2]uint{1, 0}
+	pshift := 1
+	for head < high {
+		if (p[0] & 3) == 3 {
+			sift(head, width, cmp, arg, pshift, lp[:])
+			shr(p[:], 2)
+			pshift += 2
+		} else {
+			if lp[pshift - 1] >= uint(uintptr(high) - uintptr(head)) {
+				trinkle(head, width, cmp, arg, p[:], pshift, false, lp[:])
+			} else {
+				sift(head, width, cmp, arg, pshift, lp[:])
+			}
+			if pshift == 1 {
+				shl(p[:], 1)
+				pshift = 0
+			} else {
+				shl(p[:], pshift - 1)
+				pshift = 1
 			}
 		}
-		insertion_sort(data, indices, a, b, call)
+		p[0] |= 1
+		head = head[width:]
 	}
-}
+	trinkle(head, width, cmp, arg, p[:], pshift, false, lp[:])
+	for pshift != 1 || p[0] != 1 || p[1] != 0 {
+		if pshift <= 1 {
+			trail := pntz(p)
+			shr(p[:], trail)
+			pshift += trail
+		} else {
+			shl(p[:], 2)
+			pshift -= 2
+			p[0] ~= 7
+			shr(p[:], 1)
+			trinkle(head[-width:][-lp[pshift]:], width, cmp, arg, p[:], pshift + 1, true, lp[:])
+			shl(p[:], 1)
+			p[0] |= 1
+			trinkle(head[-width:], width, cmp, arg, p[:], pshift, true, lp[:])
+		}
+		head = head[-width:]
+	}
+}