1#include "../include/csvparser.h"
3#include "../include/arena.h"
4#include "../include/cstr.h"
5#include "../include/str.h"
27typedef struct csv_line_params {
37static size_t line_count(CsvReader* reader);
38static size_t get_num_fields(
const char* line,
char delim,
char quote);
39static bool parse_csv_line(csv_line_params* args);
41static inline void set_default_config(CsvReader* reader) {
43 reader->comment =
'#';
44 reader->has_header =
true;
45 reader->skip_header =
false;
52 fprintf(stderr,
"error allocating memory for CsvReader\n");
56 FILE* stream = fopen(filename,
"r");
58 fprintf(stderr,
"error opening file %s\n", filename);
64 Arena* arena = arena_create((arena_memory ? arena_memory : CSV_ARENA_BLOCK_SIZE));
66 fprintf(stderr,
"error creating memory arena\n");
72 reader->arena = arena;
74 reader->stream = stream;
76 set_default_config(reader);
81static Row** csv_allocate_rows(Arena* arena,
size_t num_rows) {
86 Row** rows = ARENA_ALLOC_ARRAY(arena,
Row*, num_rows);
88 fprintf(stderr,
"csv_allocate_rows(): arena out of memory\n");
92 for (
size_t i = 0; i < num_rows; i++) {
93 rows[i] = arena_alloc(arena,
sizeof(
Row));
95 fprintf(stderr,
"csv_allocate_rows(): arena_alloc failed on row %zu\n", i);
102static inline bool read_first_valid_line(CsvReader* reader,
char* line,
size_t line_size) {
103 bool found_valid_line =
false;
105 while (fgets(line, line_size, reader->stream)) {
107 char* end = line + strlen(line) - 1;
108 while (end > line && isspace(*end)) {
120 if (line[0] == reader->comment) {
125 found_valid_line =
true;
129 if (!found_valid_line) {
134 fseek(reader->stream, 0, SEEK_SET);
139 char line[MAX_FIELD_SIZE] = {0};
141 bool headerSkipped =
false;
144 reader->num_rows = line_count(reader);
145 reader->rows = csv_allocate_rows(reader->arena, reader->num_rows);
147 fclose(reader->stream);
152 if (!read_first_valid_line(reader, line,
sizeof(line))) {
153 fclose(reader->stream);
158 size_t num_fields = get_num_fields(line, reader->delim, reader->quote);
159 if (num_fields == 0) {
160 fclose(reader->stream);
164 bool parse_success =
true;
165 while (fgets(line, MAX_FIELD_SIZE, reader->stream) && rowIndex < reader->num_rows) {
167 char* end = line + strlen(line) - 1;
168 while (end > line && isspace(*end)) {
181 if (line[0] == reader->comment) {
185 if (reader->has_header && reader->skip_header && rowIndex == 0 && !headerSkipped) {
186 headerSkipped =
true;
190 csv_line_params args = {
191 .arena = reader->arena,
193 .rowIndex = rowIndex,
194 .row = reader->rows[rowIndex],
195 .delim = reader->delim,
196 .quote = reader->quote,
197 .num_fields = num_fields,
200 parse_success = parse_csv_line(&args);
201 if (!parse_success) {
207 fclose(reader->stream);
209 if (!parse_success) {
210 fprintf(stderr,
"csv_reader_parse() failed\n");
211 fprintf(stderr,
"Line: %s\n", line);
220 bool headerSkipped =
false;
221 char line[MAX_FIELD_SIZE] = {0};
223 reader->num_rows = line_count(reader);
226 reader->num_rows = (maxrows > 0 && maxrows < reader->num_rows) ? maxrows : reader->num_rows;
227 reader->rows = csv_allocate_rows(reader->arena, reader->num_rows);
229 fclose(reader->stream);
233 if (!read_first_valid_line(reader, line,
sizeof(line))) {
234 fclose(reader->stream);
239 size_t num_fields = get_num_fields(line, reader->delim, reader->quote);
240 if (num_fields == 0) {
241 fprintf(stderr,
"Error: no fields found in CSV file\n");
242 fclose(reader->stream);
246 while (fgets(line, MAX_FIELD_SIZE, reader->stream) && rowIndex < reader->num_rows) {
248 char* end = line + strlen(line) - 1;
249 while (end > line && isspace(*end)) {
262 if (line[0] == reader->comment) {
266 if (reader->has_header && reader->skip_header && rowIndex == 0 && !headerSkipped) {
267 headerSkipped =
true;
271 csv_line_params args = {
272 .arena = reader->arena,
274 .rowIndex = rowIndex,
275 .row = reader->rows[rowIndex],
276 .delim = reader->delim,
277 .quote = reader->quote,
278 .num_fields = num_fields,
281 if (!parse_csv_line(&args)) {
282 fprintf(stderr,
"csv_reader_parse_async() failed\n");
287 callback(rowIndex, reader->rows[rowIndex]);
291 fclose(reader->stream);
300 arena_destroy(reader->arena);
306void csv_reader_setconfig(CsvReader* reader, CsvReaderConfig config) {
307 if (config.delim !=
'\0') {
308 reader->delim = config.delim;
311 if (config.quote !=
'\0') {
312 reader->quote = config.quote;
315 if (config.comment !=
'\0') {
316 reader->comment = config.comment;
319 reader->has_header = config.has_header;
320 reader->skip_header = config.skip_header;
323CsvReaderConfig csv_reader_getconfig(CsvReader* reader) {
324 CsvReaderConfig config = {
325 .comment = reader->comment,
326 .delim = reader->delim,
327 .has_header = reader->has_header,
328 .skip_header = reader->skip_header,
329 .quote = reader->quote,
/**
 * Count the fields in a single CSV line.
 *
 * Every occurrence of `quote` toggles the "inside quotes" state, and a
 * `delim` only separates fields while outside quotes. A non-empty line
 * therefore has (unquoted delimiters + 1) fields; an empty line has 0.
 *
 * @param line  NUL-terminated line to scan (must not be NULL).
 * @param delim Field delimiter character.
 * @param quote Quote character.
 * @return Number of fields in `line`.
 */
static size_t get_num_fields(const char* line, char delim, char quote) {
    size_t numFields = 0;
    bool insideQuotes = false;

    /* size_t index: an int index could overflow on a very long line. */
    for (size_t i = 0; line[i] != '\0'; i++) {
        if (line[i] == quote) {
            insideQuotes = !insideQuotes;
        } else if (line[i] == delim && !insideQuotes) {
            numFields++;
        }
    }

    /* The last field has no trailing delimiter, so count it separately. */
    if (line[0] != '\0') {
        numFields++;
    }

    return numFields;
}
357static bool parse_csv_line(csv_line_params* args) {
359 char field[MAX_FIELD_SIZE] = {0};
360 int insideQuotes = 0;
362 Row* row = args->row;
363 row->
fields = arena_alloc(args->arena, args->num_fields *
sizeof(
char*));
365 fprintf(stderr,
"ERROR: unable to allocate memory for fields\n");
369 char** fields = row->
fields;
370 size_t fieldIndex = 0;
373 for (
size_t i = 0; args->line[i] !=
'\0'; i++) {
374 if (args->line[i] == args->quote) {
375 insideQuotes = !insideQuotes;
376 }
else if (args->line[i] == args->delim && !insideQuotes) {
377 field[fieldIndex] =
'\0';
379 fields[row->
count] = arena_strdup(args->arena, field);
380 if (!fields[row->
count]) {
387 if (fieldIndex >= MAX_FIELD_SIZE - 1) {
389 "ERROR: field in row %zu exceeds MAX_FIELD_SIZE (%d), "
391 args->rowIndex, MAX_FIELD_SIZE - 1);
394 field[fieldIndex++] = args->line[i];
400 fprintf(stderr,
"ERROR: unterminated quoted field:%s in line %zu\n", args->line, args->rowIndex);
405 field[fieldIndex] =
'\0';
407 fields[row->
count] = arena_strdup(args->arena, field);
408 if (!fields[row->
count]) {
409 fprintf(stderr,
"ERROR: unable to allocate memory for fields[%zu]\n", row->
count);
415 if (row->
count != args->num_fields) {
416 fprintf(stderr,
"ERROR: invalid number of fields in line %zu\n", args->rowIndex);
424#define _CSV_READ_BUFSIZE (64u * 1024u)
426static size_t line_count(CsvReader* reader) {
428 bool headerSkipped =
false;
429 bool line_first_char =
true;
430 bool skip_this_line =
false;
431 bool blank_line =
true;
434 char buf[_CSV_READ_BUFSIZE];
437 rewind(reader->stream);
439 while ((nread = fread(buf, 1,
sizeof(buf), reader->stream)) > 0) {
441 const char* end = buf + nread;
445 const char* nl = (
const char*)memchr(p,
'\n', (
size_t)(end - p));
446 const char* chunk_end = nl ? nl + 1 : end;
449 for (
const char* c = p; c < chunk_end; c++) {
452 if (!skip_this_line && !blank_line) {
454 if (reader->has_header && reader->skip_header && !headerSkipped && lines == 0) {
455 headerSkipped =
true;
461 skip_this_line =
false;
463 line_first_char =
true;
468 if (skip_this_line)
continue;
470 if (line_first_char) {
471 line_first_char =
false;
472 if (*c == reader->comment) {
473 skip_this_line =
true;
478 if (*c !=
'\r' && (*c !=
' ' && *c !=
'\t')) {
488 if (!skip_this_line && !blank_line) {
489 if (reader->has_header && reader->skip_header && !headerSkipped && lines == 0) {
496 rewind(reader->stream);
509CsvWriter* csvwriter_new(
const char* filename) {
510 CsvWriter* writer = malloc(
sizeof(CsvWriter));
512 fprintf(stderr,
"error allocating memory for CsvWriter\n");
516 writer->stream = fopen(filename,
"w");
517 if (!writer->stream) {
518 fprintf(stderr,
"error opening file %s\n", filename);
525 writer->newline =
'\n';
526 writer->quote_all =
false;
527 writer->flush =
false;
539static inline bool field_needs_quoting(
const char* field,
char delim,
char quote,
char newline) {
541 return (strchr(field, delim) != NULL || strchr(field, quote) != NULL || strchr(field, newline) != NULL);
551static bool write_quoted_field(FILE* fp,
const char* field,
char quote) {
553 if (fputc(quote, fp) == EOF) {
558 for (
const char* ptr = field; *ptr !=
'\0'; ptr++) {
561 if (fputc(quote, fp) == EOF || fputc(quote, fp) == EOF) {
565 if (fputc(*ptr, fp) == EOF) {
572 if (fputc(quote, fp) == EOF) {
589static bool write_single_field(FILE* fp,
const char* field,
bool quote_all,
char delim,
char quote,
char newline) {
595 if (quote_all || field_needs_quoting(field, delim, quote, newline)) {
596 return write_quoted_field(fp, field, quote);
599 return fputs(field, fp) != EOF;
612 if (writer == NULL) {
617 if (fields == NULL && numfields > 0) {
624 if (numfields == 0) {
626 if (fputc(writer->newline, writer->stream) == EOF) {
640 for (
size_t i = 0; i < numfields; i++) {
643 if (fputc(writer->delim, fp) == EOF) {
649 if (!write_single_field(fp, fields[i], writer->quote_all, writer->delim, writer->quote, writer->newline)) {
655 if (fputc(writer->newline, fp) == EOF) {
661 if (writer->flush && fp) {
662 if (fflush(fp) != 0) {
671void csvwriter_free(CsvWriter* writer) {
673 if (writer->stream) fclose(writer->stream);
678void csvwriter_setconfig(CsvWriter* writer, CsvWriterConfig config) {
679 if (config.delim !=
'\0') {
680 writer->delim = config.delim;
683 if (config.quote !=
'\0') {
684 writer->quote = config.quote;
687 writer->quote_all = config.quote_all;
688 writer->flush = config.flush;
size_t csv_reader_numrows(const CsvReader *reader)
Get the number of rows in the CSV data.
void csv_reader_free(CsvReader *reader)
Free memory used by the CsvReader and CsvRow structures.
struct CsvReader CsvReader
Opaque structure representing a CSV parser. Create a new CSV parser with csv_reader_new and free it with csv_reader_free.
bool csvwriter_write_row(CsvWriter *writer, const char **fields, size_t numfields)
Row ** csv_reader_parse(CsvReader *reader)
Parse the CSV data and retrieve all the rows at once.
CsvReader * csv_reader_new(const char *filename, size_t arena_memory)
Create a new CSV reader associated with a filename.
void csv_reader_parse_async(CsvReader *reader, CsvRowCallback callback, size_t alloc_max)
Parse the CSV data and pass each processed row back in a callback. Return true from the callback to s...
struct CsvWriter CsvWriter
Structure representing a CSV row.
size_t count
Number of fields in each row.
char ** fields
Array of fields in each row.