Copy bit_field values with byte-exact bulk accesses instead of bit by bit

Reading or writing a bit_field member no longer calls the runtime helpers
`__read_bits`/`__write_bits` one bit at a time. The compiler emits the copy
inline (`lb_copy_bits`), and the runtime helpers share `__copy_bits`.

Review changes folded into this revision of the patch:

  * lb_copy_bits: the source word was assembled with `b << (64 - src_shift)`.
    With `src_shift == 0` (every store, and every byte-aligned load) that is
    a shift by the full bit width, which LLVM defines as poison.
  * lb_copy_bits / __copy_bits: memory was accessed through fixed 4, 8 or 16
    byte windows regardless of the size of either buffer, reading and writing
    past small backing types and temporaries. Both now touch only the bytes
    that actually hold part of the field.
  * Both routines are no longer limited to copying 64 bits.
  * Commented-out copies of the old implementation are not added.
  * Five whitespace-only hunks (RUNTIME_LINKAGE, NATIVE_SIMD_BIT_WIDTH,
    floattidf, floattidf_unsigned) were dropped: they are unrelated to this
    change and belong in a separate formatting commit.

NOTE(review): this patch text was rebuilt from a copy that had lost its line
breaks and indentation. Context whitespace and the position of blank context
lines are best effort; the post-image blob ids on the `index` lines predate
the review changes. Verify against the tree before applying.

diff --git a/base/runtime/internal.odin b/base/runtime/internal.odin
index bb9fc4b36..091feece7 100644
--- a/base/runtime/internal.odin
+++ b/base/runtime/internal.odin
@@ -1374,24 +1374,51 @@ fixdfti :: proc "c" (a: u64) -> i128 {
 
 }
 
-
-
-__write_bits :: proc "contextless" (dst, src: [^]byte, offset: uintptr, size: uintptr) {
-	for i in 0..<size {
-		j := offset+i
-		the_bit := byte((src[i>>3]) & (1<<(i&7)) != 0)
-		dst[j>>3] &~= 1<<(j&7)
-		dst[j>>3]  |= the_bit<<(j&7)
-	}
-}
-
-__read_bits :: proc "contextless" (dst, src: [^]byte, offset: uintptr, size: uintptr) {
-	for j in 0..<size {
-		i := offset+j
-		the_bit := byte((src[i>>3]) & (1<<(i&7)) != 0)
-		dst[j>>3] &~= 1<<(j&7)
-		dst[j>>3]  |= the_bit<<(j&7)
-	}
+// Copies `size_bits` bits from `src`, starting at bit `src_bit`, into `dst`, starting at bit `dst_bit`.
+// Bit `i` of a buffer is bit `i & 7` (least significant first) of byte `i >> 3`.
+//
+// Only bytes that hold part of the field are read from `src` or modified in `dst`; all other
+// bits of `dst` keep their value. `buf_bytes` (the byte size of the buffer that is addressed
+// with a bit offset) is therefore not needed for bounds and is accepted only to keep the
+// signature stable.
+__copy_bits :: #force_inline proc "contextless" (
+	dst:       [^]byte,
+	src:       [^]byte,
+	buf_bytes: uintptr,
+	dst_bit:   uintptr,
+	src_bit:   uintptr,
+	size_bits: uintptr,
+) #no_bounds_check {
+	src_pos   := src_bit
+	dst_pos   := dst_bit
+	remaining := size_bits
+	for remaining > 0 {
+		src_shift := src_pos & 7
+		dst_shift := dst_pos & 7
+
+		// Longest run that stays within one source byte and one destination byte (1..8 bits).
+		n := min(remaining, 8 - src_shift, 8 - dst_shift)
+
+		mask  := byte(0xff) >> (8 - n)
+		chunk := (src[src_pos >> 3] >> src_shift) & mask
+
+		// Replace exactly `n` bits of the destination byte, keeping its other bits.
+		dst[dst_pos >> 3] = (dst[dst_pos >> 3] &~ (mask << dst_shift)) | (chunk << dst_shift)
+
+		src_pos   += n
+		dst_pos   += n
+		remaining -= n
+	}
+}
+
+// Stores the first `size_bits` bits of `src` into `dst` at bit offset `offset_bits`.
+__write_bits :: proc "contextless" (dst, src: [^]byte, dst_size_bytes: uintptr, offset_bits: uintptr, size_bits: uintptr) {
+	__copy_bits(dst, src, dst_size_bytes, offset_bits, 0, size_bits)
+}
+
+// Loads the `size_bits` bits found at bit offset `offset_bits` of `src` into the first bits of `dst`.
+__read_bits :: proc "contextless" (dst, src: [^]byte, src_size_bytes: uintptr, offset_bits: uintptr, size_bits: uintptr) {
+	__copy_bits(dst, src, src_size_bytes, 0, offset_bits, size_bits)
 }
 
 when .Address in ODIN_SANITIZER_FLAGS {
diff --git a/src/llvm_backend_general.cpp b/src/llvm_backend_general.cpp
index 27a8c66e3..d36dd002d 100644
--- a/src/llvm_backend_general.cpp
+++ b/src/llvm_backend_general.cpp
@@ -957,6 +957,91 @@ gb_internal LLVMValueRef OdinLLVMBuildLoadAligned(lbProcedure *p, LLVMTypeRef ty
 	return result;
 }
 
+// Loads the value `src` points at, reinterpreted as `ptr_type`, with an alignment of 1 so that
+// the address does not have to be naturally aligned for the pointee type.
+gb_internal LLVMValueRef OdinLLVMBuildUnalignedLoad(lbProcedure *p, LLVMValueRef src, Type *ptr_type) {
+	LLVMTypeRef type = lb_type(p->module, type_deref(ptr_type));
+
+	src = LLVMBuildPointerCast(p->builder, src, lb_type(p->module, ptr_type), "");
+	LLVMValueRef load = LLVMBuildLoad2(p->builder, type, src, "");
+	LLVMSetAlignment(load, 1);
+	return load;
+}
+
+// Emits code that copies `size_bits` bits from `src` (starting at bit `src_bit`) into `dst`
+// (starting at bit `dst_bit`). All offsets and sizes are compile-time constants.
+//
+// Each side is accessed as a single integer that is exactly as wide as the bytes the field
+// occupies on that side, so no byte outside the field's own bytes is read or written.
+// Bit `i` is taken to be bit `i & 7` of byte `i >> 3`, which wide integer accesses only honour
+// on little-endian targets.
+//
+// `buf_bytes` is not needed for bounds because the accesses are exact; it is kept so that the
+// existing callers do not have to change.
+gb_internal void lb_copy_bits(lbProcedure *p,
+	LLVMValueRef dst,
+	LLVMValueRef src,
+	u64 buf_bytes,
+	u64 dst_bit,
+	u64 src_bit,
+	u64 size_bits
+) {
+	(void)buf_bytes;
+	if (size_bits == 0) {
+		// Nothing to copy, and LLVM has no zero-width integer type.
+		return;
+	}
+
+	auto ptr_offset = [](lbProcedure *p, LLVMValueRef ptr, u64 offset) -> LLVMValueRef {
+		LLVMValueRef indices[1] = {LLVMConstInt(lb_type(p->module, t_u64), offset, false)};
+		ptr = LLVMBuildPointerCast(p->builder, ptr, lb_type(p->module, t_u8_ptr), "");
+		return LLVMBuildGEP2(p->builder, lb_type(p->module, t_u8), ptr, indices, 1, "");
+	};
+
+	u64 src_byte  = src_bit>>3;
+	u64 dst_byte  = dst_bit>>3;
+	u64 src_shift = src_bit&7;
+	u64 dst_shift = dst_bit&7;
+	u64 src_need_bytes = (src_shift + size_bits + 7)>>3;
+	u64 dst_need_bytes = (dst_shift + size_bits + 7)>>3;
+
+	LLVMContextRef ctx = LLVMGetTypeContext(lb_type(p->module, t_u8));
+	LLVMTypeRef src_int   = LLVMIntTypeInContext(ctx, cast(unsigned)(8*src_need_bytes));
+	LLVMTypeRef dst_int   = LLVMIntTypeInContext(ctx, cast(unsigned)(8*dst_need_bytes));
+	LLVMTypeRef field_int = LLVMIntTypeInContext(ctx, cast(unsigned)size_bits);
+
+	LLVMValueRef src_ptr = LLVMBuildPointerCast(p->builder, ptr_offset(p, src, src_byte), LLVMPointerType(src_int, 0), "");
+	LLVMValueRef dst_ptr = LLVMBuildPointerCast(p->builder, ptr_offset(p, dst, dst_byte), LLVMPointerType(dst_int, 0), "");
+
+	// bits := field_int(load(src_int) >> src_shift)
+	LLVMValueRef bits = LLVMBuildLoad2(p->builder, src_int, src_ptr, "");
+	LLVMSetAlignment(bits, 1);
+	if (src_shift != 0) {
+		bits = LLVMBuildLShr(p->builder, bits, LLVMConstInt(src_int, src_shift, false), "");
+	}
+	if (8*src_need_bytes > size_bits) {
+		bits = LLVMBuildTrunc(p->builder, bits, field_int, "");
+	}
+
+	LLVMValueRef result = bits;
+	if (8*dst_need_bytes > size_bits) {
+		// The field shares its bytes with neighbouring bits, so read-modify-write:
+		// result := (load(dst_int) & ~(field_mask << dst_shift)) | (dst_int(bits) << dst_shift)
+		LLVMValueRef shift = LLVMConstInt(dst_int, dst_shift, false);
+		LLVMValueRef mask  = LLVMBuildShl(p->builder, LLVMBuildZExt(p->builder, LLVMConstAllOnes(field_int), dst_int, ""), shift, "");
+		LLVMValueRef value = LLVMBuildShl(p->builder, LLVMBuildZExt(p->builder, bits, dst_int, ""), shift, "");
+
+		LLVMValueRef old = LLVMBuildLoad2(p->builder, dst_int, dst_ptr, "");
+		LLVMSetAlignment(old, 1);
+
+		result = LLVMBuildAnd(p->builder, old, LLVMBuildNot(p->builder, mask, ""), "");
+		result = LLVMBuildOr(p->builder, result, value, "");
+	}
+
+	LLVMValueRef store = LLVMBuildStore(p->builder, result, dst_ptr);
+	LLVMSetAlignment(store, 1);
+}
+
 gb_internal void lb_addr_store(lbProcedure *p, lbAddr addr, lbValue value) {
 	if (addr.addr.value == nullptr) {
 		return;
@@ -974,6 +1059,7 @@ gb_internal void lb_addr_store(lbProcedure *p, lbAddr addr, lbValue value) {
 
 	if (addr.kind == lbAddr_BitField) {
 		lbValue dst = addr.addr;
+		lbValue src = {};
 		if (is_type_endian_big(addr.bitfield.type)) {
 			i64 shift_amount = 8*type_size_of(value.type) - addr.bitfield.bit_size;
 			lbValue shifted_value = value;
@@ -981,33 +1067,17 @@ gb_internal void lb_addr_store(lbProcedure *p, lbAddr addr, lbValue value) {
 				shifted_value.value,
 				LLVMConstInt(LLVMTypeOf(shifted_value.value), shift_amount, false),
 				"");
-			lbValue src = lb_address_from_load_or_generate_local(p, shifted_value);
-
-			auto args = array_make<lbValue>(temporary_allocator(), 4);
-			args[0] = dst;
-			args[1] = src;
-			args[2] = lb_const_int(p->module, t_uintptr, addr.bitfield.bit_offset);
-			args[3] = lb_const_int(p->module, t_uintptr, addr.bitfield.bit_size);
-			lb_emit_runtime_call(p, "__write_bits", args);
-		} else if ((addr.bitfield.bit_offset % 8) == 0 &&
-		           (addr.bitfield.bit_size % 8) == 0) {
-			lbValue src = lb_address_from_load_or_generate_local(p, value);
-
-			lbValue byte_offset = lb_const_int(p->module, t_uintptr, addr.bitfield.bit_offset/8);
-			lbValue byte_size = lb_const_int(p->module, t_uintptr, addr.bitfield.bit_size/8);
-			lbValue dst_offset = lb_emit_conv(p, dst, t_u8_ptr);
-			dst_offset = lb_emit_ptr_offset(p, dst_offset, byte_offset);
-			lb_mem_copy_non_overlapping(p, dst_offset, src, byte_size);
+			src = lb_address_from_load_or_generate_local(p, shifted_value);
 		} else {
-			lbValue src = lb_address_from_load_or_generate_local(p, value);
-
-			auto args = array_make<lbValue>(temporary_allocator(), 4);
-			args[0] = dst;
-			args[1] = src;
-			args[2] = lb_const_int(p->module, t_uintptr, addr.bitfield.bit_offset);
-			args[3] = lb_const_int(p->module, t_uintptr, addr.bitfield.bit_size);
-			lb_emit_runtime_call(p, "__write_bits", args);
+			src = lb_address_from_load_or_generate_local(p, value);
 		}
+
+		u64 buf_bytes = cast(u64)type_size_of(type_deref(dst.type));
+		u64 dst_bit = cast(u64)addr.bitfield.bit_offset;
+		u64 src_bit = cast(u64)0;
+		u64 size_bits = cast(u64)addr.bitfield.bit_size;
+
+		lb_copy_bits(p, dst.value, src.value, buf_bytes, dst_bit, src_bit, size_bits);
 		return;
 	} else if (addr.kind == lbAddr_Map) {
 		lb_internal_dynamic_map_set(p, addr.addr, addr.map.type, addr.map.key, value, p->curr_stmt);
@@ -1289,53 +1359,28 @@ gb_internal lbValue lb_addr_load(lbProcedure *p, lbAddr const &addr) {
 			}
 		}
 
-		i64 total_bitfield_bit_size = 8*type_size_of(lb_addr_type(addr));
 		i64 dst_byte_size = type_size_of(addr.bitfield.type);
 		lbAddr dst = lb_add_local_generated(p, addr.bitfield.type, true);
 		lbValue src = addr.addr;
 
-		lbValue bit_offset = lb_const_int(p->module, t_uintptr, addr.bitfield.bit_offset);
-		lbValue bit_size = lb_const_int(p->module, t_uintptr, addr.bitfield.bit_size);
-		lbValue byte_offset = lb_const_int(p->module, t_uintptr, (addr.bitfield.bit_offset+7)/8);
-		lbValue byte_size = lb_const_int(p->module, t_uintptr, (addr.bitfield.bit_size+7)/8);
 
-		GB_ASSERT(type_size_of(addr.bitfield.type) >= ((addr.bitfield.bit_size+7)/8));
 
 		lbValue r = {};
 
-		if (is_type_endian_big(addr.bitfield.type)) {
-			auto args = array_make<lbValue>(temporary_allocator(), 4);
-			args[0] = dst.addr;
-			args[1] = src;
-			args[2] = bit_offset;
-			args[3] = bit_size;
-			lb_emit_runtime_call(p, "__read_bits", args);
+		u64 buf_bytes = cast(u64)type_size_of(type_deref(src.type));
+		u64 dst_bit = cast(u64)0;
+		u64 src_bit = cast(u64)addr.bitfield.bit_offset;
+		u64 size_bits = cast(u64)addr.bitfield.bit_size;
+
+		lb_copy_bits(p, dst.addr.value, src.value, buf_bytes, dst_bit, src_bit, size_bits);
+		r = lb_addr_load(p, dst);
+		if (is_type_endian_big(addr.bitfield.type)) {
 			LLVMValueRef shift_amount = LLVMConstInt(
 				lb_type(p->module, lb_addr_type(dst)),
 				8*dst_byte_size - addr.bitfield.bit_size,
 				false
 			);
-			r = lb_addr_load(p, dst);
 			r.value = LLVMBuildShl(p->builder, r.value, shift_amount, "");
-		} else if ((addr.bitfield.bit_offset % 8) == 0) {
-			do_mask = 8*dst_byte_size != addr.bitfield.bit_size;
-
-			lbValue copy_size = byte_size;
-			lbValue src_offset = lb_emit_conv(p, src, t_u8_ptr);
-			src_offset = lb_emit_ptr_offset(p, src_offset, byte_offset);
-			if (addr.bitfield.bit_offset + 8*dst_byte_size <= total_bitfield_bit_size) {
-				copy_size = lb_const_int(p->module, t_uintptr, dst_byte_size);
-			}
-			lb_mem_copy_non_overlapping(p, dst.addr, src_offset, copy_size, false);
-			r = lb_addr_load(p, dst);
-		} else {
-			auto args = array_make<lbValue>(temporary_allocator(), 4);
-			args[0] = dst.addr;
-			args[1] = src;
-			args[2] = bit_offset;
-			args[3] = bit_size;
-			lb_emit_runtime_call(p, "__read_bits", args);
-			r = lb_addr_load(p, dst);
 		}
 
 		Type *t = addr.bitfield.type;