1#include "../include/csvparser.h"
3#include "../include/arena.h"
4#include "../include/cstr.h"
5#include "../include/str_utils.h"
27typedef struct csv_line_params {
37static size_t line_count(CsvReader* reader);
38static size_t get_num_fields(
const char* line,
char delim,
char quote);
39static bool parse_csv_line(csv_line_params* args);
41static inline void set_default_config(CsvReader* reader) {
43 reader->comment =
'#';
44 reader->has_header =
true;
45 reader->skip_header =
false;
52 fprintf(stderr,
"error allocating memory for CsvReader\n");
56 FILE* stream = fopen(filename,
"r");
58 fprintf(stderr,
"error opening file %s\n", filename);
64 Arena* arena = arena_create((arena_memory ? arena_memory : CSV_ARENA_BLOCK_SIZE));
66 fprintf(stderr,
"error creating memory arena\n");
72 reader->arena = arena;
74 reader->stream = stream;
76 set_default_config(reader);
81static Row** csv_allocate_rows(Arena* arena,
size_t num_rows) {
86 Row** rows = ARENA_ALLOC_ARRAY(arena,
Row*, num_rows);
88 fprintf(stderr,
"csv_allocate_rows(): arena out of memory\n");
92 for (
size_t i = 0; i < num_rows; i++) {
93 rows[i] = arena_alloc(arena,
sizeof(
Row));
95 fprintf(stderr,
"csv_allocate_rows(): arena_alloc failed on row %zu\n", i);
102static inline bool read_first_valid_line(CsvReader* reader,
char* line,
size_t line_size) {
103 bool found_valid_line =
false;
105 while (fgets(line, line_size, reader->stream)) {
107 char* end = line + strlen(line) - 1;
108 while (end > line && isspace(*end)) {
120 if (line[0] == reader->comment) {
125 found_valid_line =
true;
129 if (!found_valid_line) {
134 fseek(reader->stream, 0, SEEK_SET);
139 char line[MAX_FIELD_SIZE] = {0};
141 bool headerSkipped =
false;
144 reader->num_rows = line_count(reader);
145 reader->rows = csv_allocate_rows(reader->arena, reader->num_rows);
147 fclose(reader->stream);
152 if (!read_first_valid_line(reader, line,
sizeof(line))) {
153 fclose(reader->stream);
158 size_t num_fields = get_num_fields(line, reader->delim, reader->quote);
159 if (num_fields == 0) {
160 fclose(reader->stream);
164 bool parse_success =
true;
165 while (fgets(line, MAX_FIELD_SIZE, reader->stream) && rowIndex < reader->num_rows) {
167 char* end = line + strlen(line) - 1;
168 while (end > line && isspace(*end)) {
181 if (line[0] == reader->comment) {
185 if (reader->has_header && reader->skip_header && rowIndex == 0 && !headerSkipped) {
186 headerSkipped =
true;
190 csv_line_params args = {
191 .arena = reader->arena,
193 .rowIndex = rowIndex,
194 .row = reader->rows[rowIndex],
195 .delim = reader->delim,
196 .quote = reader->quote,
197 .num_fields = num_fields,
200 parse_success = parse_csv_line(&args);
201 if (!parse_success) {
207 fclose(reader->stream);
209 if (!parse_success) {
210 fprintf(stderr,
"csv_reader_parse() failed\n");
211 fprintf(stderr,
"Line: %s\n", line);
220 bool headerSkipped =
false;
221 char line[MAX_FIELD_SIZE] = {0};
223 reader->num_rows = line_count(reader);
226 reader->num_rows = (maxrows > 0 && maxrows < reader->num_rows) ? maxrows : reader->num_rows;
227 reader->rows = csv_allocate_rows(reader->arena, reader->num_rows);
229 fclose(reader->stream);
233 if (!read_first_valid_line(reader, line,
sizeof(line))) {
234 fclose(reader->stream);
239 size_t num_fields = get_num_fields(line, reader->delim, reader->quote);
240 if (num_fields == 0) {
241 fprintf(stderr,
"Error: no fields found in CSV file\n");
242 fclose(reader->stream);
246 while (fgets(line, MAX_FIELD_SIZE, reader->stream) && rowIndex < reader->num_rows) {
248 char* end = line + strlen(line) - 1;
249 while (end > line && isspace(*end)) {
262 if (line[0] == reader->comment) {
266 if (reader->has_header && reader->skip_header && rowIndex == 0 && !headerSkipped) {
267 headerSkipped =
true;
271 csv_line_params args = {
272 .arena = reader->arena,
274 .rowIndex = rowIndex,
275 .row = reader->rows[rowIndex],
276 .delim = reader->delim,
277 .quote = reader->quote,
278 .num_fields = num_fields,
281 if (!parse_csv_line(&args)) {
282 fprintf(stderr,
"csv_reader_parse_async() failed\n");
287 callback(rowIndex, reader->rows[rowIndex]);
291 fclose(reader->stream);
300 arena_destroy(reader->arena);
306void csv_reader_setconfig(CsvReader* reader, CsvReaderConfig config) {
307 if (config.delim !=
'\0') {
308 reader->delim = config.delim;
311 if (config.quote !=
'\0') {
312 reader->quote = config.quote;
315 if (config.comment !=
'\0') {
316 reader->comment = config.comment;
319 reader->has_header = config.has_header;
320 reader->skip_header = config.skip_header;
323CsvReaderConfig csv_reader_getconfig(CsvReader* reader) {
324 CsvReaderConfig config = {
325 .comment = reader->comment,
326 .delim = reader->delim,
327 .has_header = reader->has_header,
328 .skip_header = reader->skip_header,
329 .quote = reader->quote,
/*
 * Count the fields in one CSV line.
 *
 * A field boundary is a `delim` character that is not enclosed in a pair of
 * `quote` characters. Each `quote` character toggles the "inside quotes"
 * state, so delimiters inside a quoted section are not counted.
 *
 * line:  NUL-terminated line to scan; NULL is treated as an empty line.
 * delim: field delimiter character.
 * quote: quoting character.
 *
 * Returns 0 for an empty (or NULL) line, otherwise the number of unquoted
 * delimiters plus one.
 */
static size_t get_num_fields(const char* line, char delim, char quote) {
    if (line == NULL || line[0] == '\0') {
        return 0;
    }

    size_t numFields = 0;
    int insideQuotes = 0;

    /* size_t index: an int could overflow on a very long line. */
    for (size_t i = 0; line[i] != '\0'; i++) {
        if (line[i] == quote) {
            insideQuotes = !insideQuotes;
        } else if (line[i] == delim && !insideQuotes) {
            numFields++;
        }
    }

    /* N separators delimit N + 1 fields on a non-empty line. */
    return numFields + 1;
}
357static bool parse_csv_line(csv_line_params* args) {
359 char field[MAX_FIELD_SIZE] = {0};
360 int insideQuotes = 0;
362 Row* row = args->row;
363 row->
fields = arena_alloc(args->arena, args->num_fields *
sizeof(
char*));
365 fprintf(stderr,
"ERROR: unable to allocate memory for fields\n");
369 char** fields = row->
fields;
370 size_t fieldIndex = 0;
373 for (
size_t i = 0; args->line[i] !=
'\0'; i++) {
374 if (args->line[i] == args->quote) {
375 insideQuotes = !insideQuotes;
376 }
else if (args->line[i] == args->delim && !insideQuotes) {
377 field[fieldIndex] =
'\0';
378 char* trimmed = trim_string(field);
379 fields[row->
count] = arena_strdup(args->arena, trimmed);
380 if (!fields[row->
count]) {
387 if (fieldIndex >= MAX_FIELD_SIZE - 1) {
389 "ERROR: field in row %zu exceeds MAX_FIELD_SIZE (%d), "
391 args->rowIndex, MAX_FIELD_SIZE - 1);
394 field[fieldIndex++] = args->line[i];
400 fprintf(stderr,
"ERROR: unterminated quoted field:%s in line %zu\n", args->line, args->rowIndex);
405 field[fieldIndex] =
'\0';
406 char* trimmed = trim_string(field);
407 fields[row->
count] = arena_strdup(args->arena, trimmed);
408 if (!fields[row->
count]) {
409 fprintf(stderr,
"ERROR: unable to allocate memory for fields[%zu]\n", row->
count);
415 if (row->
count != args->num_fields) {
416 fprintf(stderr,
"ERROR: invalid number of fields in line %zu\n", args->rowIndex);
425#define _CSV_READ_BUFSIZE (64u * 1024u)
427static size_t line_count(CsvReader* reader) {
429 bool headerSkipped =
false;
430 bool line_first_char =
true;
431 bool skip_this_line =
false;
432 bool blank_line =
true;
435 char buf[_CSV_READ_BUFSIZE];
438 rewind(reader->stream);
440 while ((nread = fread(buf, 1,
sizeof(buf), reader->stream)) > 0) {
442 const char* end = buf + nread;
446 const char* nl = (
const char*)memchr(p,
'\n', (
size_t)(end - p));
447 const char* chunk_end = nl ? nl + 1 : end;
450 for (
const char* c = p; c < chunk_end; c++) {
453 if (!skip_this_line && !blank_line) {
455 if (reader->has_header && reader->skip_header && !headerSkipped && lines == 0) {
456 headerSkipped =
true;
462 skip_this_line =
false;
464 line_first_char =
true;
469 if (skip_this_line)
continue;
471 if (line_first_char) {
472 line_first_char =
false;
473 if (*c == reader->comment) {
474 skip_this_line =
true;
479 if (*c !=
'\r' && (*c !=
' ' && *c !=
'\t')) {
489 if (!skip_this_line && !blank_line) {
490 if (reader->has_header && reader->skip_header && !headerSkipped && lines == 0) {
497 rewind(reader->stream);
510CsvWriter* csvwriter_new(
const char* filename) {
511 CsvWriter* writer = malloc(
sizeof(CsvWriter));
513 fprintf(stderr,
"error allocating memory for CsvWriter\n");
517 writer->stream = fopen(filename,
"w");
518 if (!writer->stream) {
519 fprintf(stderr,
"error opening file %s\n", filename);
526 writer->newline =
'\n';
527 writer->quote_all =
false;
528 writer->flush =
false;
/*
 * Decide whether a field must be wrapped in quotes when written.
 *
 * A field needs quoting when it contains the delimiter, the quote character
 * or the record terminator, since any of these would otherwise be
 * misinterpreted when the output is read back.
 *
 * field:   NUL-terminated field text; NULL is reported as not needing quotes.
 * delim:   field delimiter character.
 * quote:   quoting character.
 * newline: record terminator character.
 *
 * Returns true if the field contains at least one of the three characters.
 */
static inline bool field_needs_quoting(const char* field, char delim, char quote, char newline) {
    if (field == NULL) {
        return false;
    }

    return (strchr(field, delim) != NULL || strchr(field, quote) != NULL || strchr(field, newline) != NULL);
}
552static bool write_quoted_field(FILE* fp,
const char* field,
char quote) {
554 if (fputc(quote, fp) == EOF) {
559 for (
const char* ptr = field; *ptr !=
'\0'; ptr++) {
562 if (fputc(quote, fp) == EOF || fputc(quote, fp) == EOF) {
566 if (fputc(*ptr, fp) == EOF) {
573 if (fputc(quote, fp) == EOF) {
590static bool write_single_field(FILE* fp,
const char* field,
bool quote_all,
char delim,
char quote,
char newline) {
596 if (quote_all || field_needs_quoting(field, delim, quote, newline)) {
597 return write_quoted_field(fp, field, quote);
600 return fputs(field, fp) != EOF;
613 if (writer == NULL) {
618 if (fields == NULL && numfields > 0) {
625 if (numfields == 0) {
627 if (fputc(writer->newline, writer->stream) == EOF) {
641 for (
size_t i = 0; i < numfields; i++) {
644 if (fputc(writer->delim, fp) == EOF) {
650 if (!write_single_field(fp, fields[i], writer->quote_all, writer->delim, writer->quote, writer->newline)) {
656 if (fputc(writer->newline, fp) == EOF) {
662 if (writer->flush && fp) {
663 if (fflush(fp) != 0) {
672void csvwriter_free(CsvWriter* writer) {
674 if (writer->stream) fclose(writer->stream);
679void csvwriter_setconfig(CsvWriter* writer, CsvWriterConfig config) {
680 if (config.delim !=
'\0') {
681 writer->delim = config.delim;
684 if (config.quote !=
'\0') {
685 writer->quote = config.quote;
688 writer->quote_all = config.quote_all;
689 writer->flush = config.flush;
size_t csv_reader_numrows(const CsvReader *reader)
Get the number of rows in the CSV data.
void csv_reader_free(CsvReader *reader)
Free memory used by the CsvReader and CsvRow structures.
struct CsvReader CsvReader
Opaque structure representing a CSV parser. Create a new CSV parser with csv_reader_new and free it with csv_reader_free.
bool csvwriter_write_row(CsvWriter *writer, const char **fields, size_t numfields)
Row ** csv_reader_parse(CsvReader *reader)
Parse the CSV data and retrieve all the rows at once.
CsvReader * csv_reader_new(const char *filename, size_t arena_memory)
Create a new CSV reader associated with a filename.
void csv_reader_parse_async(CsvReader *reader, CsvRowCallback callback, size_t alloc_max)
Parse the CSV data and pass each processed row back in a callback. Return true from the callback to s...
struct CsvWriter CsvWriter
Structure representing a CSV row.
size_t count
Number of fields in each row.
char ** fields
Array of fields in each row.