sys/linux: add io uring API

This commit is contained in:
Laytan
2025-11-10 20:03:57 +01:00
parent ad9dc4d31b
commit 3db76bc367
4 changed files with 516 additions and 6 deletions

View File

@@ -1964,3 +1964,265 @@ RISCV_HWProbe_Misaligned_Scalar_Perf :: enum {
UNSUPPORTED,
}
// Flags for the io_uring_enter(2) `flags` argument.
// Bit order matches the IORING_ENTER_* constants in <linux/io_uring.h>; do not reorder.
IO_Uring_Enter_Flags_Bits :: enum {
// Wait until at least `min_complete` completions are available.
GETEVENTS,
// Wake up the SQPOLL kernel thread so it picks up new submissions.
SQ_WAKEUP,
// Wait for space in the SQ ring before submitting.
SQ_WAIT,
EXT_ARG, // Available since Linux 5.11
// The fd argument is a registered ring fd (see IO_Uring_Register_Opcode.REGISTER_RING_FDS).
REGISTERED_RING,
}
// Opcodes for io_uring_register(2).
// Values match the IORING_REGISTER_*/IORING_UNREGISTER_* constants
// in <linux/io_uring.h>; they are ABI, do not renumber.
IO_Uring_Register_Opcode :: enum uint {
REGISTER_BUFFERS = 0,
UNREGISTER_BUFFERS = 1,
REGISTER_FILES = 2,
UNREGISTER_FILES = 3,
REGISTER_EVENTFD = 4,
UNREGISTER_EVENTFD = 5,
REGISTER_FILES_UPDATE = 6,
REGISTER_EVENTFD_ASYNC = 7,
REGISTER_PROBE = 8,
REGISTER_PERSONALITY = 9,
UNREGISTER_PERSONALITY = 10,
REGISTER_RESTRICTIONS = 11,
REGISTER_ENABLE_RINGS = 12,
/* extended with tagging */
REGISTER_FILES2 = 13,
REGISTER_FILES_UPDATE2 = 14,
REGISTER_BUFFERS2 = 15,
REGISTER_BUFFERS_UPDATE = 16,
/* set/clear io-wq thread affinities */
REGISTER_IOWQ_AFF = 17,
UNREGISTER_IOWQ_AFF = 18,
/* set/get max number of io-wq workers */
REGISTER_IOWQ_MAX_WORKERS = 19,
/* register/unregister io_uring fd with the ring */
REGISTER_RING_FDS = 20,
UNREGISTER_RING_FDS = 21,
/* register ring based provide buffer group */
REGISTER_PBUF_RING = 22,
UNREGISTER_PBUF_RING = 23,
/* sync cancelation API */
REGISTER_SYNC_CANCEL = 24,
/* register a range of fixed file slots for automatic slot allocation */
REGISTER_FILE_ALLOC_RANGE = 25,
/* this goes last */
REGISTER_LAST,
/* flag added to the opcode to use a registered ring fd */
REGISTER_USE_REGISTERED_RING = 1 << 31,
}
// Flags for io_uring_setup(2), passed via IO_Uring_Params.flags.
// Bit order matches the IORING_SETUP_* constants in <linux/io_uring.h>; do not reorder.
IO_Uring_Setup_Flags_Bits :: enum {
// io_context is polled.
IOPOLL,
// SQ poll thread.
SQPOLL,
// sq_thread_cpu is valid.
SQ_AFF,
// app defines CQ size.
CQSIZE,
// clamp SQ/CQ ring sizes.
CLAMP,
// attach to existing wq.
ATTACH_WQ,
// start with ring disabled.
R_DISABLED,
// continue submit on error.
SUBMIT_ALL,
// Cooperative task running. When requests complete, they often require
// forcing the submitter to transition to the kernel to complete. If this
// flag is set, work will be done when the task transitions anyway, rather
// than force an inter-processor interrupt reschedule. This avoids interrupting
// a task running in userspace, and saves an IPI.
COOP_TASKRUN,
// If COOP_TASKRUN is set, get notified if task work is available for
// running and a kernel transition would be needed to run it. This sets
// IORING_SQ_TASKRUN in the sq ring flags. Not valid with COOP_TASKRUN.
TASKRUN_FLAG,
// SQEs are 128 bytes.
SQE128,
// CQEs are 32 bytes.
CQE32,
// Only one task is allowed to submit requests
SINGLE_ISSUER,
// Defer running task work to get events.
// Rather than running bits of task work whenever the task transitions
// try to do it just before it is needed.
DEFER_TASKRUN,
}
// Feature bits reported by the kernel in IO_Uring_Params.features after
// io_uring_setup(2). Bit order matches the IORING_FEAT_* constants in
// <linux/io_uring.h>; do not reorder.
IO_Uring_Features_Bits :: enum {
// A single mmap maps both SQ and CQ rings.
SINGLE_MMAP,
// Completions are never dropped; they are buffered on CQ overflow.
NODROP,
// SQE data is stable once submission is acknowledged.
SUBMIT_STABLE,
// Reads/writes honour the file's current position (offset -1).
RW_CUR_POS,
// Personality of the submitting task is used, not the SQPOLL thread's.
CUR_PERSONALITY,
// Internal fast poll is used instead of punting to io-wq workers.
FAST_POLL,
// Poll events are 32 bits (EPOLLEXCLUSIVE etc. usable).
POLL_32BITS,
// SQPOLL no longer requires registered (fixed) files.
SQPOLL_NONFIXED,
// io_uring_enter supports the extended argument (EXT_ARG).
EXT_ARG,
// io-wq workers are native kernel threads.
NATIVE_WORKERS,
// Resource registration supports tagging (REGISTER_*2 opcodes).
RSRC_TAGS,
// IOSQE_CQE_SKIP_SUCCESS (SQE flag CQE_SKIP_SUCCESS) is supported.
CQE_SKIP,
// Linked requests resolve their file at submit-prep time correctly.
LINKED_FILE,
// io_uring_register accepts REGISTER_USE_REGISTERED_RING fds.
REG_REG_RING,
}
// Bits of IO_Uring_CQE.flags.
// Bit order matches the IORING_CQE_F_* constants in <linux/io_uring.h>; do not reorder.
IO_Uring_CQE_Flags_Bits :: enum {
// If set, the upper 16 bits are the buffer ID.
BUFFER,
// If set, parent SQE will generate more CQE entries.
MORE,
// If set, more data to read after socket recv.
SOCK_NONEMPTY,
// Set for notification CQEs. Can be used to distinct them from sends.
NOTIF,
}
// Submission opcodes for IO_Uring_SQE.opcode.
// Values match `enum io_uring_op` in <linux/io_uring.h>; the order is ABI, do not reorder.
// Later entries require kernels newer than the 5.1 baseline; probe support at
// runtime via IO_Uring_Register_Opcode.REGISTER_PROBE.
IO_Uring_OP :: enum u8 {
NOP,
READV,
WRITEV,
FSYNC,
READ_FIXED,
WRITE_FIXED,
POLL_ADD,
POLL_REMOVE,
SYNC_FILE_RANGE,
SENDMSG,
RECVMSG,
TIMEOUT,
TIMEOUT_REMOVE,
ACCEPT,
ASYNC_CANCEL,
LINK_TIMEOUT,
CONNECT,
FALLOCATE,
OPENAT,
CLOSE,
FILES_UPDATE,
STATX,
READ,
WRITE,
FADVISE,
MADVISE,
SEND,
RECV,
OPENAT2,
EPOLL_CTL,
SPLICE,
PROVIDE_BUFFERS,
REMOVE_BUFFERS,
TEE,
SHUTDOWN,
RENAMEAT,
UNLINKAT,
MKDIRAT,
SYMLINKAT,
LINKAT,
MSG_RING,
FSETXATTR,
SETXATTR,
FGETXATTR,
GETXATTR,
SOCKET,
URING_CMD,
SEND_ZC,
SENDMSG_ZC,
READ_MULTISHOT,
WAITID,
FUTEX_WAIT,
FUTEX_WAKE,
FUTEX_WAITV,
FIXED_FD_INSTALL,
FTRUNCATE,
BIND,
LISTEN,
}
// Bits of IO_Uring_SQE.flags.
// Bit order matches the IOSQE_* constants in <linux/io_uring.h>; do not reorder.
IO_Uring_SQE_Flags_Bits :: enum {
// Use fixed fileset.
FIXED_FILE,
// Issue after inflight IO.
IO_DRAIN,
// Links next sqe.
IO_LINK,
// Like LINK, but stronger.
IO_HARDLINK,
// Always go async.
ASYNC,
// Select buffer from sq.buf_group.
BUFFER_SELECT,
// Don't post CQE if request succeeded.
CQE_SKIP_SUCCESS,
}
// Flags for POLL_ADD requests (IO_Uring_SQE's len union, poll_flags).
// Bit order matches the IORING_POLL_* constants in <linux/io_uring.h>.
IO_Uring_Poll_Add_Flags_Bits :: enum {
// Multishot poll: keep posting CQEs until the poll is canceled.
ADD_MULTI,
// Update the events of an existing poll request.
UPDATE_EVENTS,
// Update the user_data of an existing poll request.
UPDATE_USER_DATA,
// Level-triggered poll instead of edge-triggered.
ADD_LEVEL,
}
// Flags for FSYNC requests (fsync_flags); matches IORING_FSYNC_DATASYNC.
IO_Uring_Fsync_Flags_Bits :: enum {
DATASYNC,
}
// Flags for TIMEOUT/TIMEOUT_REMOVE/LINK_TIMEOUT requests (timeout_flags).
// Bit order matches the IORING_TIMEOUT_* constants in <linux/io_uring.h>; do not reorder.
IO_Uring_Timeout_Flags_Bits :: enum {
// The timespec is an absolute time, not a relative one.
ABS,
// Update an existing timeout instead of arming a new one.
UPDATE,
// Use CLOCK_BOOTTIME (see IORING_TIMEOUT_CLOCK_MASK).
BOOTTIME,
// Use CLOCK_REALTIME (see IORING_TIMEOUT_CLOCK_MASK).
REALTIME,
// TIMEOUT_REMOVE targets a link timeout (see IORING_TIMEOUT_UPDATE_MASK).
LINK_TIMEOUT_UPDATE,
// An expiring timeout posts success (0) instead of -ETIME.
ETIME_SUCCESS,
// Repeating (multishot) timeout; kernel IORING_TIMEOUT_MULTISHOT.
// Available since Linux 6.4.
MULTISHOT,
}
// Flags for URING_CMD requests (uring_cmd_flags); matches IORING_URING_CMD_FIXED.
IO_Uring_Cmd_Flags_Bits :: enum {
// use registered buffer; pass this flag along with setting sqe.buf_index.
FIXED,
}
// Flags for SPLICE/TEE requests (splice_flags).
// Bits 0..3 are the generic SPLICE_F_* flags from splice(2);
// bit 31 is io_uring's own SPLICE_F_FD_IN_FIXED.
IO_Uring_Splice_Flags_Bits :: enum {
MOVE,
NONBLOCK,
MORE,
GIFT,
// splice_fd_in is an index into the registered fileset, not a raw fd.
F_FD_IN_FIXED = 31,
}
// Flags for ACCEPT requests, passed in sqe.ioprio; matches IORING_ACCEPT_MULTISHOT.
IO_Uring_Accept_Flags_Bits :: enum {
MULTISHOT,
}
// Flags for SEND/RECV family requests, passed in sqe.ioprio.
// Bit order matches the IORING_RECVSEND_* / IORING_SEND_* constants
// in <linux/io_uring.h>; do not reorder.
IO_Uring_Send_Recv_Flags_Bits :: enum {
/*
If set, instead of first attempting to send
or receive and arm poll if that yields an
-EAGAIN result, arm poll upfront and skip
the initial transfer attempt.
*/
RECVSEND_POLL_FIRST,
/*
Multishot recv. Sets IORING_CQE_F_MORE if
the handler will continue to report
CQEs on behalf of the same SQE.
*/
RECV_MULTISHOT,
/*
Use registered buffers, the index is stored in
the buf_index field.
*/
RECVSEND_FIXED_BUF,
/*
If set, SEND[MSG]_ZC should report
the zerocopy usage in cqe.res
for the IORING_CQE_F_NOTIF cqe.
0 is reported if zerocopy was actually possible.
IORING_NOTIF_USAGE_ZC_COPIED if data was copied
(at least partially).
*/
SEND_ZC_REPORT_USAGE,
}
// Bits of the SQ ring `flags` field (mapped at IO_SQ_Ring_Offsets.flags).
// Bit order matches the IORING_SQ_* constants in <linux/io_uring.h>.
IO_Uring_Submission_Queue_Flags_Bits :: enum {
// The SQPOLL thread went idle; io_uring_enter with .SQ_WAKEUP is needed.
NEED_WAKEUP,
// The CQ ring overflowed.
CQ_OVERFLOW,
// Task work is pending (see setup flag TASKRUN_FLAG).
TASKRUN,
}

View File

@@ -395,3 +395,13 @@ MAP_HUGE_16GB :: transmute(Map_Flags)(u32(34) << MAP_HUGE_SHIFT)
/* Get window size */
TIOCGWINSZ :: 0x5413
// Mask of the clock-selection bits in IO_Uring_Timeout_Flags.
IORING_TIMEOUT_CLOCK_MASK :: IO_Uring_Timeout_Flags{.BOOTTIME, .REALTIME}
// Mask of the update-selection bits in IO_Uring_Timeout_Flags.
IORING_TIMEOUT_UPDATE_MASK :: IO_Uring_Timeout_Flags{.UPDATE, .LINK_TIMEOUT_UPDATE}
// Magic mmap(2) offsets used to map the rings of an io_uring fd;
// see io_uring_setup(2).
IORING_OFF_SQ_RING :: 0
IORING_OFF_CQ_RING :: 0x8000000
IORING_OFF_SQES :: 0x10000000
// Provided-buffer rings are mapped at (bgid << IORING_OFF_PBUF_SHIFT) | IORING_OFF_PBUF_RING.
IORING_OFF_PBUF_RING :: 0x80000000
IORING_OFF_PBUF_SHIFT :: 16
IORING_OFF_MMAP_MASK :: 0xf8000000

View File

@@ -510,7 +510,7 @@ sendfile :: proc "contextless" (out_fd: Fd, in_fd: Fd, offset: ^i64, count: uint
Available since Linux 2.0.
*/
socket :: proc "contextless" (domain: Address_Family, socktype: Socket_Type, sockflags: Socket_FD_Flags, protocol: Protocol) -> (Fd, Errno) {
sock_type_flags: int = cast(int) socktype | transmute(int) sockflags
sock_type_flags: int = cast(int) socktype | cast(int) transmute(i32) sockflags
ret := syscall(SYS_socket, domain, sock_type_flags, protocol)
return errno_unwrap(ret, Fd)
}
@@ -543,7 +543,7 @@ where
T == Sock_Addr_Any
{
addr_len: i32 = size_of(T)
ret := syscall(SYS_accept4, sock, addr, &addr_len, transmute(int) sockflags)
ret := syscall(SYS_accept4, sock, addr, &addr_len, transmute(i32) sockflags)
return errno_unwrap(ret, Fd)
}
@@ -2927,11 +2927,46 @@ statx :: proc "contextless" (dir: Fd, pathname: cstring, flags: FD_Flags, mask:
// TODO(flysand): pidfd_send_signal
// TODO(flysand): io_uring_setup
/*
Set up a context for performing asynchronous I/O, returning a ring file
descriptor on success.
Available since Linux 5.1
*/
io_uring_setup :: proc "contextless" (entries: u32, params: ^IO_Uring_Params) -> (Fd, Errno) {
// The kernel fills in the sq_off/cq_off and feature fields of params.
return errno_unwrap(syscall(SYS_io_uring_setup, entries, params), Fd)
}
// TODO(flysand): io_uring_register
/*
Initiate and/or complete I/O using the shared submission and completion queues.
Available since Linux 5.1
*/
io_uring_enter :: proc "contextless" (fd: Fd, to_submit: u32, min_complete: u32, flags: IO_Uring_Enter_Flags, sig: ^Sig_Set) -> (int, Errno) {
// Only report a sigset size when a signal mask is actually supplied.
sig_size := uint(0)
if sig != nil {
sig_size = size_of(Sig_Set)
}
return errno_unwrap(syscall(SYS_io_uring_enter, fd, to_submit, min_complete, transmute(u32)flags, sig, sig_size), int)
}
/*
Initiate and/or complete I/O using the shared submission and completion queues,
passing an extended argument structure instead of a plain signal mask.
Requires .EXT_ARG to be set in `flags`.
Available since Linux 5.11
*/
io_uring_enter2 :: proc "contextless" (fd: Fd, to_submit: u32, min_complete: u32, flags: IO_Uring_Enter_Flags, arg: ^IO_Uring_Getevents_Arg) -> (int, Errno) {
// The kernel only interprets the last argument pair as a getevents
// struct when IORING_ENTER_EXT_ARG is set, so enforce it here.
assert_contextless(.EXT_ARG in flags)
ret := syscall(SYS_io_uring_enter, fd, to_submit, min_complete, transmute(u32)flags, arg, size_of(IO_Uring_Getevents_Arg))
return errno_unwrap(ret, int)
}
/*
Register files or user buffers for asynchronous I/O.
Available since Linux 5.1
*/
io_uring_register :: proc "contextless" (fd: Fd, opcode: IO_Uring_Register_Opcode, arg: rawptr, nr_args: u32) -> Errno {
ret := syscall(SYS_io_uring_register, fd, opcode, arg, nr_args)
// NOTE(review): per io_uring_register(2), some opcodes (e.g.
// REGISTER_RING_FDS) return a positive count on success; that value is
// discarded here and would be negated into a bogus Errno — confirm
// whether an (int, Errno) return via errno_unwrap is wanted instead.
return Errno(-ret)
}
// TODO(flysand): open_tree

View File

@@ -763,7 +763,7 @@ Sig_Action :: struct($T: typeid) {
Note, on linux these are technically passed by OR'ing together
with Socket_Type, our wrapper does this under the hood.
*/
Socket_FD_Flags :: bit_set[Socket_FD_Flags_Bits; int]
Socket_FD_Flags :: bit_set[Socket_FD_Flags_Bits; i32]
/*
Address family for the socket.
@@ -1488,3 +1488,206 @@ RISCV_HWProbe :: struct {
raw: u64,
},
}
// Parameter block passed to (and filled in by) io_uring_setup(2).
// Mirrors struct io_uring_params in <linux/io_uring.h>.
IO_Uring_Params :: struct {
// Number of SQ entries (kernel output).
sq_entries: u32,
// Number of CQ entries (kernel output; input when .CQSIZE is set).
cq_entries: u32,
// Setup flags (input).
flags: IO_Uring_Setup_Flags,
// CPU for the SQPOLL thread; valid when .SQ_AFF is set (input).
sq_thread_cpu: u32,
// SQPOLL idle timeout in milliseconds (input).
sq_thread_idle: u32,
// Features supported by the running kernel (kernel output).
features: IO_Uring_Features,
// Existing ring fd to share a workqueue with; valid with .ATTACH_WQ (input).
wq_fd: u32,
resv: [3]u32,
// Offsets for mmap()ing the SQ ring (kernel output).
sq_off: IO_SQ_Ring_Offsets,
// Offsets for mmap()ing the CQ ring (kernel output).
cq_off: IO_CQ_Ring_Offsets,
}
IO_Uring_Setup_Flags :: bit_set[IO_Uring_Setup_Flags_Bits; u32]
IO_Uring_Features :: bit_set[IO_Uring_Features_Bits; u32]
// Byte offsets into the IORING_OFF_SQ_RING mmap region for each SQ ring field.
// Mirrors struct io_sqring_offsets in <linux/io_uring.h>.
IO_SQ_Ring_Offsets :: struct {
head: u32,
tail: u32,
ring_mask: u32,
ring_entries: u32,
flags: u32,
dropped: u32,
array: u32,
resv1: u32,
user_addr: u64,
}
// Byte offsets into the IORING_OFF_CQ_RING mmap region for each CQ ring field.
// Mirrors struct io_cqring_offsets in <linux/io_uring.h>.
IO_CQ_Ring_Offsets :: struct {
head: u32,
tail: u32,
ring_mask: u32,
ring_entries: u32,
overflow: u32,
cqes: u32,
flags: u32,
resv1: u32,
user_addr: u64,
}
IO_Uring_Enter_Flags :: bit_set[IO_Uring_Enter_Flags_Bits; u32]
// Extended argument for io_uring_enter2 (.EXT_ARG).
// Mirrors struct io_uring_getevents_arg, where sigmask/ts are __u64;
// NOTE(review): pointer-sized fields only match that layout on 64-bit
// targets — confirm if 32-bit Linux targets are supported here.
IO_Uring_Getevents_Arg :: struct #min_field_align(8) {
sigmask: ^Sig_Set,
sigmask_sz: u32,
// pad: u32,
// Optional wait timeout.
ts: ^Time_Spec,
}
#assert(align_of(IO_Uring_Getevents_Arg) == 8)
// Argument for the tagged REGISTER_FILES2/REGISTER_BUFFERS2 opcodes.
// Mirrors struct io_uring_rsrc_register in <linux/io_uring.h>.
// NOTE(review): newer kernel headers name the second field `flags`
// (IORING_RSRC_REGISTER_SPARSE) rather than a reserved word — confirm.
IO_Uring_Rsrc_Register :: struct($T: typeid) {
// Number of entries in data/tags.
nr: u32,
resv: u32,
resv2: u64,
using _: struct #min_field_align(8) {
data: [^]T,
tags: [^]u64,
},
}
// Argument for the tagged REGISTER_FILES_UPDATE2/REGISTER_BUFFERS_UPDATE opcodes.
// Mirrors struct io_uring_rsrc_update2 in <linux/io_uring.h>.
IO_Uring_Rsrc_Update2 :: struct($T: typeid) {
// First slot to update.
offset: u32,
resv: u32,
using _: struct #min_field_align(8) {
data: [^]T,
tags: [^]u64,
},
// Number of entries in data/tags.
nr: u32,
resv2: u32,
}
// The completion queue entry when the .CQE32 flag is not set on setup.
// Mirrors struct io_uring_cqe in <linux/io_uring.h>.
IO_Uring_CQE :: struct {
// sq.data submission passed back.
user_data: u64,
// result code for this event; negative values are -errno
// (see io_uring_enter(2)).
res: i32,
flags: IO_Uring_CQE_Flags,
}
#assert(size_of(IO_Uring_CQE) == 16)
// The completion queue entry when the .CQE32 flag is set on setup.
IO_Uring_CQE32 :: struct {
using _: IO_Uring_CQE,
// Extra per-command payload (kernel big_cqe space).
pad: u64,
pad2: u64,
}
#assert(size_of(IO_Uring_CQE32) == 32)
IO_Uring_CQE_Flags :: bit_set[IO_Uring_CQE_Flags_Bits; u32]
IO_Uring_SQE_Flags :: bit_set[IO_Uring_SQE_Flags_Bits; u8]
// The submission queue entry when the .SQE128 flag is not set on setup.
// Mirrors struct io_uring_sqe in <linux/io_uring.h>: the raw_unions encode
// the kernel's per-opcode reuse of the same bytes, so which member is
// meaningful depends on `opcode`. Field order and sizes are ABI.
IO_Uring_SQE :: struct {
opcode: IO_Uring_OP,
flags: IO_Uring_SQE_Flags,
// Request priority, or per-opcode flags the kernel overlays on ioprio.
using __ioprio: struct #raw_union {
ioprio: u16,
sq_accept_flags: IO_Uring_Accept_Flags,
sq_send_recv_flags: IO_Uring_Send_Recv_Flags,
},
fd: Fd,
using __offset: struct #raw_union {
// Offset into file.
off: u64,
addr2: u64,
using _: struct {
cmd_op: u32,
__pad1: u32,
},
statx: ^Statx,
},
using __iovecs: struct #raw_union {
// Pointer to buffer or iovecs.
addr: u64,
splice_off_in: u64,
using _: struct {
level: u32,
optname: u32,
},
},
using __len: struct #raw_union {
// Buffer size or number of iovecs.
len: u32,
poll_flags: IO_Uring_Poll_Add_Flags,
statx_mask: Statx_Mask,
epoll_ctl_op: EPoll_Ctl_Opcode,
shutdown_how: Shutdown_How,
},
// Per-opcode operation flags (kernel's union of *_flags fields).
using __contents: struct #raw_union {
rw_flags: i32,
fsync_flags: IO_Uring_Fsync_Flags,
// compatibility.
poll_events: Fd_Poll_Events,
// word-reversed for BE.
poll32_events: u32,
sync_range_flags: u32,
msg_flags: Socket_Msg,
timeout_flags: IO_Uring_Timeout_Flags,
accept_flags: Socket_FD_Flags,
cancel_flags: u32,
open_flags: Open_Flags,
statx_flags: FD_Flags,
fadvise_advice: u32,
splice_flags: IO_Uring_Splice_Flags,
rename_flags: u32,
unlink_flags: u32,
hardlink_flags: u32,
xattr_flags: u32,
msg_ring_flags: u32,
uring_cmd_flags: IO_Uring_Cmd_Flags,
},
// Data to be passed back at completion time.
user_data: u64,
using __buffer: struct #raw_union {
// Index into fixed buffers, if used.
buf_index: u16,
// For grouped buffer selection.
buf_group: u16,
},
// Personality to use, if used.
personality: u16,
using _: struct #raw_union {
splice_fd_in: Fd,
file_index: u32,
using _: struct {
addr_len: u16,
__pad3: [1]u16,
},
},
using __: struct #raw_union {
using _: struct {
addr3: u64,
__pad2: [1]u64,
},
},
}
#assert(size_of(IO_Uring_SQE) == 64)
// The submission queue entry when the .SQE128 flag is set on setup.
IO_Uring_SQE128 :: struct {
using _: IO_Uring_SQE,
// Extra command payload for pass-through commands (URING_CMD).
cmd: [64]byte,
}
#assert(size_of(IO_Uring_SQE128) == 128)
// bit_set wrappers for the io_uring flag enums; the backing integer width
// of each matches the corresponding kernel field's type.
IO_Uring_Poll_Add_Flags :: bit_set[IO_Uring_Poll_Add_Flags_Bits; u32]
IO_Uring_Fsync_Flags :: bit_set[IO_Uring_Fsync_Flags_Bits; u32]
IO_Uring_Timeout_Flags :: bit_set[IO_Uring_Timeout_Flags_Bits; u32]
IO_Uring_Cmd_Flags :: bit_set[IO_Uring_Cmd_Flags_Bits; u32]
IO_Uring_Splice_Flags :: bit_set[IO_Uring_Splice_Flags_Bits; u32]
IO_Uring_Accept_Flags :: bit_set[IO_Uring_Accept_Flags_Bits; u16]
IO_Uring_Send_Recv_Flags :: bit_set[IO_Uring_Send_Recv_Flags_Bits; u16]
IO_Uring_Submission_Queue_Flags :: bit_set[IO_Uring_Submission_Queue_Flags_Bits; u32]