// package csv reads and writes comma-separated values (CSV) files.
//
// This package supports the format described in [[ RFC 4180; https://tools.ietf.org/html/rfc4180.html ]]
package encoding_csv

import "core:bufio"
import "core:bytes"
import "core:io"
import "core:strings"
import "core:unicode/utf8"

// Reader is a data structure used for reading records from a CSV-encoded file
//
// The associated procedures for Reader expect its input to conform to RFC 4180.
Reader :: struct {
	// comma is the field delimiter
	// reader_init will set it to be ','
	// A "comma" must be a valid rune and cannot be \r, \n, or the Unicode replacement character (0xfffd)
	comma: rune,

	// comment, if not 0, is the comment character
	// Lines beginning with the comment character without preceding whitespace are ignored
	comment: rune,

	// fields_per_record is the number of expected fields per record
	// if fields_per_record is >0, 'read' requires each record to have that field count
	// if fields_per_record is  0, 'read' sets it to the field count in the first record
	// if fields_per_record is <0, no check is made and records may have a variable field count
	fields_per_record: int,

	// If trim_leading_space is true, leading whitespace in a field is ignored
	// This is done even if the field delimiter (comma) is whitespace
	trim_leading_space: bool,

	// If lazy_quotes is true, a quote may appear in an unquoted field and a non-doubled quote may appear in a quoted field
	lazy_quotes: bool,

	// multiline_fields, when set to true, will treat a field starting with a " as a multiline string;
	// therefore, instead of reading until the next \n, it'll read until the next closing "
	multiline_fields: bool,

	// reuse_record controls whether calls to 'read' may return a slice using the backing buffer
	// for performance
	// By default, each call to 'read' returns a newly allocated slice
	reuse_record: bool,

	// reuse_record_buffer controls whether calls to 'read' clone the strings of each field or use
	// the data stored in the record buffer for performance
	// By default, each call to 'read' clones the strings of each field
	reuse_record_buffer: bool,

	// internal buffers
	r:             bufio.Reader,
	line_count:    int, // current line being read in the CSV file
	raw_buffer:    [dynamic]byte,
	record_buffer: [dynamic]byte,
	field_indices: [dynamic]int,
	last_record:   [dynamic]string,
	sr:            strings.Reader, // used by reader_init_with_string

	// Set and used by the iterator. Query using `iterator_last_error`
	last_iterator_error: Error,
}
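
/*
	Example: configuring a Reader before use. A minimal sketch; the
	semicolon-delimited input, the '#' comment character, and the field
	count below are illustrative choices, not defaults of this package:

		r: csv.Reader
		r.comma              = ';'  // parse semicolon-delimited input
		r.comment            = '#'  // skip lines starting with '#'
		r.trim_leading_space = true
		r.fields_per_record  = 3    // error unless every record has 3 fields
		csv.reader_init_with_string(&r, "a; b; c\n# a comment\nd; e; f\n")
		defer csv.reader_destroy(&r)
*/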

Reader_Error_Kind :: enum {
	Bare_Quote,
	Quote,
	Field_Count,
	Invalid_Delim,
}

reader_error_kind_string := [Reader_Error_Kind]string{
	.Bare_Quote    = "bare \" in non-quoted field",
	.Quote         = "extra or missing \" in quoted field",
	.Field_Count   = "wrong field count",
	.Invalid_Delim = "invalid delimiter",
}

Reader_Error :: struct {
	kind:          Reader_Error_Kind,
	start_line:    int,
	line:          int,
	column:        int,
	expected, got: int, // used by .Field_Count
}

Error :: union {
	Reader_Error,
	io.Error,
}

DEFAULT_RECORD_BUFFER_CAPACITY :: 256

// reader_init initializes a new Reader from r
reader_init :: proc(reader: ^Reader, r: io.Reader, buffer_allocator := context.allocator) {
	switch reader.comma {
	case '\x00', '\n', '\r', 0xfffd:
		reader.comma = ','
	}

	context.allocator = buffer_allocator
	reserve(&reader.record_buffer, DEFAULT_RECORD_BUFFER_CAPACITY)
	reserve(&reader.raw_buffer, 0)
	reserve(&reader.field_indices, 0)
	reserve(&reader.last_record, 0)
	bufio.reader_init(&reader.r, r)
}

// reader_init_with_string initializes a new Reader from s
reader_init_with_string :: proc(reader: ^Reader, s: string, buffer_allocator := context.allocator) {
	strings.reader_init(&reader.sr, s)
	r, _ := io.to_reader(strings.reader_to_stream(&reader.sr))
	reader_init(reader, r, buffer_allocator)
}

// reader_destroy destroys a Reader
reader_destroy :: proc(r: ^Reader) {
	delete(r.raw_buffer)
	delete(r.record_buffer)
	delete(r.field_indices)
	delete(r.last_record)
	bufio.reader_destroy(&r.r)
}
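
/*
	Example: a minimal read loop over a string. A sketch; the input and
	the use of `core:fmt` for printing are illustrative:

		r: csv.Reader
		csv.reader_init_with_string(&r, "a,b,c\n1,2,3\n")
		defer csv.reader_destroy(&r)

		for {
			record, err := csv.read(&r)
			if err != nil { break } // an io.Error of .EOF signals the end of input
			fmt.println(record)
			// by default both the field strings and the record slice are allocated:
			for field in record { delete(field) }
			delete(record)
		}
*/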

/*
	Returns a record at a time.

	for record, row_idx in csv.iterator_next(&r) { ... }

	TIP: If you process the results within the loop and don't need to own the results,
	you can set the Reader's `reuse_record` and `reuse_record_buffer` to true;
	you won't need to delete the record or its fields.
*/
iterator_next :: proc(r: ^Reader) -> (record: []string, idx: int, err: Error, more: bool) {
	record, r.last_iterator_error = read(r)
	return record, r.line_count - 1, r.last_iterator_error, r.last_iterator_error == nil
}

// Get the last CSV parse error if it was ignored in the iterator loop
//
// for record, row_idx in csv.iterator_next(&r) { ... }
iterator_last_error :: proc(r: Reader) -> (err: Error) {
	return r.last_iterator_error
}
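
/*
	Example: iterating with `iterator_next` and checking the last error
	afterwards. A sketch; the input is illustrative:

		r: csv.Reader
		r.reuse_record        = true // reuse the record slice between iterations
		r.reuse_record_buffer = true // reuse the field memory between iterations
		csv.reader_init_with_string(&r, "a,b,c\n1,2,3\n")
		defer csv.reader_destroy(&r)

		for record, row_idx in csv.iterator_next(&r) {
			fmt.println(row_idx, record)
		}
		if err := csv.iterator_last_error(r); err != nil && !csv.is_io_error(err, .EOF) {
			fmt.eprintln("parse error:", err)
		}
*/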

// read reads a single record (a slice of fields) from r
//
// All \r\n sequences are normalized to \n, including in multi-line fields
@(require_results)
read :: proc(r: ^Reader, allocator := context.allocator) -> (record: []string, err: Error) {
	if r.reuse_record {
		record, err = _read_record(r, &r.last_record, allocator)
		resize(&r.last_record, len(record))
		copy(r.last_record[:], record)
	} else {
		record, err = _read_record(r, nil, allocator)
	}
	return
}

// is_io_error checks whether an Error is a specific io.Error kind
@(require_results)
is_io_error :: proc(err: Error, io_err: io.Error) -> bool {
	if v, ok := err.(io.Error); ok {
		return v == io_err
	}
	return false
}
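
/*
	Example: distinguishing end-of-input from a real failure. A sketch:

		record, err := csv.read(&r)
		if csv.is_io_error(err, .EOF) {
			// end of input; not a parse error
		}
*/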

// read_all reads all the remaining records from r.
// Each record is a slice of fields.
// read_all is defined to read until EOF, and does not treat EOF as an error
@(require_results)
read_all :: proc(r: ^Reader, allocator := context.allocator) -> ([][]string, Error) {
	context.allocator = allocator
	records: [dynamic][]string
	for {
		record, rerr := _read_record(r, nil, allocator)
		if is_io_error(rerr, .EOF) {
			return records[:], nil
		}
		if rerr != nil {
			// allow for a partial read
			if record != nil {
				append(&records, record)
			}
			return records[:], rerr
		}
		append(&records, record)
	}
}

// read_from_string reads a single record (a slice of fields) from the provided input.
@(require_results)
read_from_string :: proc(input: string, record_allocator := context.allocator, buffer_allocator := context.allocator) -> (record: []string, n: int, err: Error) {
	ir: strings.Reader
	strings.reader_init(&ir, input)
	input_reader, _ := io.to_reader(strings.reader_to_stream(&ir))

	r: Reader
	reader_init(&r, input_reader, buffer_allocator)
	defer reader_destroy(&r)
	record, err = read(&r, record_allocator)
	n = int(r.r.r) // bytes consumed from the buffered input
	return
}
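
/*
	Example: reading a single record from a string. A sketch; the input is
	illustrative, and treating `n` as the resume offset assumes the whole
	input fits within the reader's buffer:

		record, n, err := csv.read_from_string("1,2,3\n4,5,6\n")
		// record is {"1", "2", "3"}; the unread remainder starts at input byte n
		for field in record { delete(field) }
		delete(record)
*/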

// read_all_from_string reads all the remaining records from the provided input.
@(require_results)
read_all_from_string :: proc(input: string, records_allocator := context.allocator, buffer_allocator := context.allocator) -> ([][]string, Error) {
	ir: strings.Reader
	strings.reader_init(&ir, input)
	input_reader, _ := io.to_reader(strings.reader_to_stream(&ir))

	r: Reader
	reader_init(&r, input_reader, buffer_allocator)
	defer reader_destroy(&r)
	return read_all(&r, records_allocator)
}
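
/*
	Example: parsing a whole document in one call and freeing the results.
	A sketch; the input is illustrative and `core:fmt` is assumed:

		records, err := csv.read_all_from_string("name,age\nalice,30\nbob,25\n")
		if err != nil {
			fmt.eprintln("parse error:", err)
		}
		for record, i in records {
			fmt.println(i, record)
		}
		// each field string, each record slice, and the outer slice are allocated:
		for record in records {
			for field in record { delete(field) }
			delete(record)
		}
		delete(records)
*/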

@(private, require_results)
is_valid_delim :: proc(r: rune) -> bool {
	switch r {
	case 0, '"', '\r', '\n', utf8.RUNE_ERROR:
		return false
	}
	return utf8.valid_rune(r)
}

@(private, require_results)
_read_record :: proc(r: ^Reader, dst: ^[dynamic]string, allocator := context.allocator) -> ([]string, Error) {
	@(require_results)
	read_line :: proc(r: ^Reader) -> ([]byte, io.Error) {
		if !r.multiline_fields {
			line, err := bufio.reader_read_slice(&r.r, '\n')
			if err == .Buffer_Full {
				// the line is longer than bufio's buffer; accumulate the chunks in raw_buffer
				clear(&r.raw_buffer)
				append(&r.raw_buffer, ..line)
				for err == .Buffer_Full {
					line, err = bufio.reader_read_slice(&r.r, '\n')
					append(&r.raw_buffer, ..line)
				}
				line = r.raw_buffer[:]
			}
			if len(line) > 0 && err == .EOF {
				err = nil
				// drop a trailing \r on a final, unterminated line
				if line[len(line)-1] == '\r' {
					line = line[:len(line)-1]
				}
			}
			r.line_count += 1

			// normalize \r\n to \n
			n := len(line)
			for n >= 2 && string(line[n-2:]) == "\r\n" {
				line[n-2] = '\n'
				line = line[:n-1]
			}
			return line, err

		} else {
			// Reading a "line" that can possibly contain multiline fields.
			// Unfortunately, this means we need to read a character at a time.

			err:       io.Error
			cur:       rune
			is_quoted: bool

			field_length := 0

			clear(&r.raw_buffer)

			read_loop: for err == .None {
				cur, _, err = bufio.reader_read_rune(&r.r)

				if err != .None { break read_loop }

				switch cur {
				case '"':
					is_quoted = field_length == 0
					field_length += 1

				case '\n', '\r':
					// a newline only terminates the line when we're not inside a quoted field
					is_quoted or_break read_loop

				case r.comma:
					field_length = 0

				case:
					field_length += 1
				}

				rune_buf, rune_len := utf8.encode_rune(cur)
				append(&r.raw_buffer, ..rune_buf[:rune_len])
			}

			return r.raw_buffer[:], err
		}
		unreachable()
	}

	@(require_results)
	length_newline :: proc(b: []byte) -> int {
		if len(b) > 0 && b[len(b)-1] == '\n' {
			return 1
		}
		return 0
	}

	@(require_results)
	next_rune :: proc(b: []byte) -> rune {
		r, _ := utf8.decode_rune(b)
		return r
	}

	if r.comma == r.comment ||
	   !is_valid_delim(r.comma) ||
	   (r.comment != 0 && !is_valid_delim(r.comment)) {
		err := Reader_Error{
			kind = .Invalid_Delim,
			line = r.line_count,
		}
		return nil, err
	}

	line, full_line: []byte
	err_read: io.Error
	for err_read == nil {
		line, err_read = read_line(r)
		if r.comment != 0 && next_rune(line) == r.comment {
			line = nil
			continue // skip comment lines
		}
		if err_read == nil && len(line) == length_newline(line) {
			line = nil
			continue // skip empty lines
		}
		full_line = line
		break
	}

	if is_io_error(err_read, .EOF) {
		return nil, err_read
	}

	err: Error
	quote_len :: len(`"`)
	comma_len := utf8.rune_size(r.comma)
	record_line := r.line_count
	clear(&r.record_buffer)
	clear(&r.field_indices)

	parse_field: for {
		if r.trim_leading_space {
			line = bytes.trim_left_space(line)
		}
		if len(line) == 0 || line[0] != '"' {
			// non-quoted field: take everything up to the next comma
			i := bytes.index_rune(line, r.comma)
			field := line
			if i >= 0 {
				field = field[:i]
			} else {
				field = field[:len(field) - length_newline(field)]
			}

			if !r.lazy_quotes {
				if j := bytes.index_byte(field, '"'); j >= 0 {
					column := utf8.rune_count(full_line[:len(full_line) - len(line[j:])])
					err = Reader_Error{
						kind       = .Bare_Quote,
						start_line = record_line,
						line       = r.line_count,
						column     = column,
					}
					break parse_field
				}
			}
			append(&r.record_buffer, ..field)
			append(&r.field_indices, len(r.record_buffer))
			if i >= 0 {
				line = line[i+comma_len:]
				continue parse_field
			}
			break parse_field

		} else {
			// quoted field
			line = line[quote_len:]
			for {
				i := bytes.index_byte(line, '"')
				switch {
				case i >= 0:
					// found the next quote; copy everything before it
					append(&r.record_buffer, ..line[:i])
					line = line[i+quote_len:]
					switch ch := next_rune(line); {
					case ch == '"': // append quote
						append(&r.record_buffer, '"')
						line = line[quote_len:]
					case ch == r.comma: // end of field
						line = line[comma_len:]
						append(&r.field_indices, len(r.record_buffer))
						continue parse_field
					case length_newline(line) == len(line): // end of line
						append(&r.field_indices, len(r.record_buffer))
						break parse_field
					case r.lazy_quotes: // bare quote
						append(&r.record_buffer, '"')
					case: // invalid non-escaped quote
						column := utf8.rune_count(full_line[:len(full_line) - len(line) - quote_len])
						err = Reader_Error{
							kind       = .Quote,
							start_line = record_line,
							line       = r.line_count,
							column     = column,
						}
						break parse_field
					}

				case len(line) > 0:
					// the quoted field continues on the next line
					append(&r.record_buffer, ..line)
					if err_read != nil {
						break parse_field
					}
					line, err_read = read_line(r)
					if is_io_error(err_read, .EOF) {
						err_read = nil
					}
					full_line = line

				case:
					// no closing quote before the input ended
					if !r.lazy_quotes && err_read == nil {
						column := utf8.rune_count(full_line)
						err = Reader_Error{
							kind       = .Quote,
							start_line = record_line,
							line       = r.line_count,
							column     = column,
						}
						break parse_field
					}
					append(&r.field_indices, len(r.record_buffer))
					break parse_field
				}
			}
		}
	}

	if err == nil && err_read != nil {
		err = err_read
	}

	context.allocator = allocator
	dst := dst
	str := string(r.record_buffer[:])
	if dst == nil {
		// use local variable
		dst = &([dynamic]string){}
	}
	clear(dst)
	resize(dst, len(r.field_indices))
	pre_idx: int
	for idx, i in r.field_indices {
		// field_indices holds the cumulative end offset of each field within record_buffer
		field := str[pre_idx:idx]
		if !r.reuse_record_buffer {
			field = strings.clone(field)
		}
		dst[i] = field
		pre_idx = idx
	}

	if r.fields_per_record > 0 {
		if len(dst) != r.fields_per_record && err == nil {
			err = Reader_Error{
				kind       = .Field_Count,
				start_line = record_line,
				line       = r.line_count,
				expected   = r.fields_per_record,
				got        = len(dst),
			}
		}
	} else if r.fields_per_record == 0 {
		r.fields_per_record = len(dst)
	}
	return dst[:], err
}