solidc
Robust collection of general-purpose cross-platform C libraries and data structures designed for rapid and safe development in C
Loading...
Searching...
No Matches
csvparser.c
1#include "../include/csvparser.h"
2
3#include "../include/arena.h"
4#include "../include/cstr.h"
5#include "../include/str.h"
6
7#include <ctype.h>
8#include <errno.h>
9#include <limits.h>
10#include <stdarg.h>
11#include <stdio.h>
12#include <stdlib.h>
13#include <string.h>
14
15typedef struct CsvReader {
16 FILE* stream; // file_t pointer corresponding to the file stream.
17 Row** rows; // Array of row pointers
18 size_t num_rows; // Number of rows in csv, excluding empty lines
19 char delim; // Delimiter character
20 char quote; // Quote character
21 char comment; // Comment character
22 bool has_header; // Whether the CSV file has a header
23 bool skip_header; // Whether to skip the header when parsing
24 Arena* arena; // single-threaded arena for memory allocation
25} CsvReader;
26
27typedef struct csv_line_params {
28 Arena* arena;
29 const char* line;
30 size_t num_fields;
31 size_t rowIndex;
32 Row* row;
33 char delim;
34 char quote;
35} csv_line_params;
36
37static size_t line_count(CsvReader* reader);
38static size_t get_num_fields(const char* line, char delim, char quote);
39static bool parse_csv_line(csv_line_params* args);
40
41static inline void set_default_config(CsvReader* reader) {
42 reader->delim = ',';
43 reader->comment = '#';
44 reader->has_header = true;
45 reader->skip_header = false;
46 reader->quote = '"';
47}
48
49CsvReader* csv_reader_new(const char* filename, size_t arena_memory) {
50 CsvReader* reader = malloc(sizeof(CsvReader));
51 if (!reader) {
52 fprintf(stderr, "error allocating memory for CsvReader\n");
53 return NULL;
54 }
55
56 FILE* stream = fopen(filename, "r");
57 if (!stream) {
58 fprintf(stderr, "error opening file %s\n", filename);
59 free(reader);
60 return NULL;
61 }
62
63 // Use passed in argument if provided or use default value.
64 Arena* arena = arena_create((arena_memory ? arena_memory : CSV_ARENA_BLOCK_SIZE));
65 if (!arena) {
66 fprintf(stderr, "error creating memory arena\n");
67 fclose(stream);
68 free(reader);
69 return NULL;
70 }
71
72 reader->arena = arena;
73 reader->num_rows = 0;
74 reader->stream = stream;
75 reader->rows = NULL;
76 set_default_config(reader);
77 return reader;
78}
79
80// Allocate memory for rows and set num_rows.
81static Row** csv_allocate_rows(Arena* arena, size_t num_rows) {
82 if (num_rows == 0) {
83 return NULL;
84 }
85
86 Row** rows = ARENA_ALLOC_ARRAY(arena, Row*, num_rows);
87 if (!rows) {
88 fprintf(stderr, "csv_allocate_rows(): arena out of memory\n");
89 return NULL;
90 }
91
92 for (size_t i = 0; i < num_rows; i++) {
93 rows[i] = arena_alloc(arena, sizeof(Row));
94 if (!rows[i]) {
95 fprintf(stderr, "csv_allocate_rows(): arena_alloc failed on row %zu\n", i);
96 return NULL;
97 }
98 }
99 return rows;
100}
101
102static inline bool read_first_valid_line(CsvReader* reader, char* line, size_t line_size) {
103 bool found_valid_line = false;
104
105 while (fgets(line, line_size, reader->stream)) {
106 // Trim whitespace from end
107 char* end = line + strlen(line) - 1;
108 while (end > line && isspace(*end)) {
109 end--;
110 }
111
112 // Skip empty lines
113 if (end == line) {
114 continue;
115 }
116
117 end[1] = '\0';
118
119 // Skip comment lines
120 if (line[0] == reader->comment) {
121 continue;
122 }
123
124 // This is a valid data/header line
125 found_valid_line = true;
126 break;
127 }
128
129 if (!found_valid_line) {
130 return false;
131 }
132
133 // Reset the file pointer to the beginning of the file
134 fseek(reader->stream, 0, SEEK_SET);
135 return true;
136}
137
138Row** csv_reader_parse(CsvReader* reader) {
139 char line[MAX_FIELD_SIZE] = {0};
140 size_t rowIndex = 0;
141 bool headerSkipped = false;
142
143 // read num_rows and allocate them on heap.
144 reader->num_rows = line_count(reader);
145 reader->rows = csv_allocate_rows(reader->arena, reader->num_rows);
146 if (!reader->rows) {
147 fclose(reader->stream);
148 return NULL;
149 }
150
151 // Read lines until we find a non-comment, non-empty line to determine field count
152 if (!read_first_valid_line(reader, line, sizeof(line))) {
153 fclose(reader->stream);
154 return NULL;
155 }
156
157 // Get the number of fields in the CSV file
158 size_t num_fields = get_num_fields(line, reader->delim, reader->quote);
159 if (num_fields == 0) {
160 fclose(reader->stream);
161 return NULL;
162 }
163
164 bool parse_success = true;
165 while (fgets(line, MAX_FIELD_SIZE, reader->stream) && rowIndex < reader->num_rows) {
166 // trim white space from end of line and skip empty lines
167 char* end = line + strlen(line) - 1;
168 while (end > line && isspace(*end)) {
169 end--;
170 }
171
172 // If the line is empty, skip it
173 if (end == line) {
174 continue;
175 }
176
177 // Terminate the line with a null character
178 end[1] = '\0';
179
180 // skip comment lines
181 if (line[0] == reader->comment) {
182 continue;
183 }
184
185 if (reader->has_header && reader->skip_header && rowIndex == 0 && !headerSkipped) {
186 headerSkipped = true;
187 continue;
188 }
189
190 csv_line_params args = {
191 .arena = reader->arena,
192 .line = line,
193 .rowIndex = rowIndex,
194 .row = reader->rows[rowIndex],
195 .delim = reader->delim,
196 .quote = reader->quote,
197 .num_fields = num_fields,
198 };
199
200 parse_success = parse_csv_line(&args);
201 if (!parse_success) {
202 break;
203 }
204 rowIndex++;
205 }
206
207 fclose(reader->stream);
208
209 if (!parse_success) {
210 fprintf(stderr, "csv_reader_parse() failed\n");
211 fprintf(stderr, "Line: %s\n", line);
212 return NULL;
213 }
214
215 return reader->rows;
216}
217
218void csv_reader_parse_async(CsvReader* reader, CsvRowCallback callback, size_t maxrows) {
219 size_t rowIndex = 0;
220 bool headerSkipped = false;
221 char line[MAX_FIELD_SIZE] = {0};
222
223 reader->num_rows = line_count(reader);
224
225 // Limit the number of rows to parse if maxrows is set
226 reader->num_rows = (maxrows > 0 && maxrows < reader->num_rows) ? maxrows : reader->num_rows;
227 reader->rows = csv_allocate_rows(reader->arena, reader->num_rows);
228 if (!reader->rows) {
229 fclose(reader->stream);
230 return;
231 }
232
233 if (!read_first_valid_line(reader, line, sizeof(line))) {
234 fclose(reader->stream);
235 return;
236 }
237
238 // Get the number of fields in the CSV file
239 size_t num_fields = get_num_fields(line, reader->delim, reader->quote);
240 if (num_fields == 0) {
241 fprintf(stderr, "Error: no fields found in CSV file\n");
242 fclose(reader->stream);
243 return;
244 }
245
246 while (fgets(line, MAX_FIELD_SIZE, reader->stream) && rowIndex < reader->num_rows) {
247 // trim white space from end of line and skip empty lines
248 char* end = line + strlen(line) - 1;
249 while (end > line && isspace(*end)) {
250 end--;
251 }
252
253 // If the line is empty, skip it
254 if (end == line) {
255 continue;
256 }
257
258 // Terminate the line with a null character
259 end[1] = '\0';
260
261 // skip comment lines
262 if (line[0] == reader->comment) {
263 continue;
264 }
265
266 if (reader->has_header && reader->skip_header && rowIndex == 0 && !headerSkipped) {
267 headerSkipped = true;
268 continue;
269 }
270
271 csv_line_params args = {
272 .arena = reader->arena,
273 .line = line,
274 .rowIndex = rowIndex,
275 .row = reader->rows[rowIndex],
276 .delim = reader->delim,
277 .quote = reader->quote,
278 .num_fields = num_fields,
279 };
280
281 if (!parse_csv_line(&args)) {
282 fprintf(stderr, "csv_reader_parse_async() failed\n");
283 break;
284 }
285
286 // Pass the processed row to the caller.
287 callback(rowIndex, reader->rows[rowIndex]);
288 rowIndex++;
289 }
290
291 fclose(reader->stream);
292}
293
294size_t csv_reader_numrows(const CsvReader* reader) { return reader->num_rows; }
295
296void csv_reader_free(CsvReader* reader) {
297 if (!reader) return;
298
299 // The row are allocated in the arena, so we only need to free the arena.
300 arena_destroy(reader->arena);
301
302 free(reader);
303 reader = NULL;
304}
305
306void csv_reader_setconfig(CsvReader* reader, CsvReaderConfig config) {
307 if (config.delim != '\0') {
308 reader->delim = config.delim;
309 }
310
311 if (config.quote != '\0') {
312 reader->quote = config.quote;
313 }
314
315 if (config.comment != '\0') {
316 reader->comment = config.comment;
317 }
318
319 reader->has_header = config.has_header;
320 reader->skip_header = config.skip_header;
321}
322
323CsvReaderConfig csv_reader_getconfig(CsvReader* reader) {
324 CsvReaderConfig config = {
325 .comment = reader->comment,
326 .delim = reader->delim,
327 .has_header = reader->has_header,
328 .skip_header = reader->skip_header,
329 .quote = reader->quote,
330 };
331 return config;
332}
333
// Count the fields on one CSV line.
//
// A delimiter only separates fields while outside a quoted section; every
// occurrence of `quote` flips the quoted state. An empty line has zero
// fields; any other line has one more field than it has separators.
static size_t get_num_fields(const char* line, char delim, char quote) {
    if (*line == '\0') {
        return 0;
    }

    size_t separators = 0;
    int in_quotes = 0;

    for (const char* p = line; *p != '\0'; ++p) {
        if (*p == quote) {
            in_quotes = !in_quotes;
            continue;
        }
        if (!in_quotes && *p == delim) {
            separators++;
        }
    }

    return separators + 1;
}
353
357static bool parse_csv_line(csv_line_params* args) {
358 // statically MUST be <= 4096 to prevent stack overflow
359 char field[MAX_FIELD_SIZE] = {0};
360 int insideQuotes = 0;
361
362 Row* row = args->row;
363 row->fields = arena_alloc(args->arena, args->num_fields * sizeof(char*));
364 if (!row->fields) {
365 fprintf(stderr, "ERROR: unable to allocate memory for fields\n");
366 return false;
367 }
368
369 char** fields = row->fields;
370 size_t fieldIndex = 0;
371 row->count = 0;
372
373 for (size_t i = 0; args->line[i] != '\0'; i++) {
374 if (args->line[i] == args->quote) {
375 insideQuotes = !insideQuotes;
376 } else if (args->line[i] == args->delim && !insideQuotes) {
377 field[fieldIndex] = '\0';
378 str_trim(field);
379 fields[row->count] = arena_strdup(args->arena, field);
380 if (!fields[row->count]) {
381 return false;
382 }
383 row->count++;
384 fieldIndex = 0;
385 } else {
386 /* SECURITY: Prevent stack buffer overflow on oversized fields. */
387 if (fieldIndex >= MAX_FIELD_SIZE - 1) {
388 fprintf(stderr,
389 "ERROR: field in row %zu exceeds MAX_FIELD_SIZE (%d), "
390 "aborting parse\n",
391 args->rowIndex, MAX_FIELD_SIZE - 1);
392 return false;
393 }
394 field[fieldIndex++] = args->line[i];
395 }
396 }
397
398 /* If inside quotes at the end of the line, the line is not terminated */
399 if (insideQuotes) {
400 fprintf(stderr, "ERROR: unterminated quoted field:%s in line %zu\n", args->line, args->rowIndex);
401 return false;
402 }
403
404 /* Add the last field with whitespace trimming */
405 field[fieldIndex] = '\0';
406 str_trim(field);
407 fields[row->count] = arena_strdup(args->arena, field);
408 if (!fields[row->count]) {
409 fprintf(stderr, "ERROR: unable to allocate memory for fields[%zu]\n", row->count);
410 return false;
411 }
412 row->count++;
413
414 /* Validate the number of fields */
415 if (row->count != args->num_fields) {
416 fprintf(stderr, "ERROR: invalid number of fields in line %zu\n", args->rowIndex);
417 return false;
418 }
419 return true;
420}
421
422// count the number of lines in a csv file.
423// ignore comments. Optionally skip header.
424#define _CSV_READ_BUFSIZE (64u * 1024u) /* 64 KB — fits comfortably in L2 */
425
426static size_t line_count(CsvReader* reader) {
427 size_t lines = 0;
428 bool headerSkipped = false;
429 bool line_first_char = true; /* first char of a new logical line? */
430 bool skip_this_line = false; /* skip remainder of current line */
431 bool blank_line = true; /* is the current line blank? */
432
433 /* One stack buffer; we never hold a reference across a fread() call. */
434 char buf[_CSV_READ_BUFSIZE];
435 size_t nread;
436
437 rewind(reader->stream);
438
439 while ((nread = fread(buf, 1, sizeof(buf), reader->stream)) > 0) {
440 const char* p = buf;
441 const char* end = buf + nread;
442
443 while (p < end) {
444 /* Find the next newline in the remaining block. */
445 const char* nl = (const char*)memchr(p, '\n', (size_t)(end - p));
446 const char* chunk_end = nl ? nl + 1 : end;
447
448 /* ---- process characters in [p, chunk_end) ---- */
449 for (const char* c = p; c < chunk_end; c++) {
450 if (*c == '\n') {
451 /* End of line: count it if it had real content. */
452 if (!skip_this_line && !blank_line) {
453 /* Header skip (only first real data line). */
454 if (reader->has_header && reader->skip_header && !headerSkipped && lines == 0) {
455 headerSkipped = true;
456 } else {
457 lines++;
458 }
459 }
460 /* Reset state for next line. */
461 skip_this_line = false;
462 blank_line = true;
463 line_first_char = true;
464 continue;
465 }
466
467 /* Non-newline character. */
468 if (skip_this_line) continue;
469
470 if (line_first_char) {
471 line_first_char = false;
472 if (*c == reader->comment) {
473 skip_this_line = true;
474 continue;
475 }
476 }
477
478 if (*c != '\r' && (*c != ' ' && *c != '\t')) {
479 blank_line = false;
480 }
481 }
482
483 p = chunk_end;
484 }
485 }
486
487 /* Handle last line if it did not end with '\n'. */
488 if (!skip_this_line && !blank_line) {
489 if (reader->has_header && reader->skip_header && !headerSkipped && lines == 0) {
490 /* header only — nothing to count */
491 } else {
492 lines++;
493 }
494 }
495
496 rewind(reader->stream);
497 return lines;
498}
499
// State of a CSV writer. Instances are created by csvwriter_new() and
// released with csvwriter_free().
typedef struct CsvWriter {
    // Output stream opened on the destination file (a standard FILE*).
    FILE* stream;
    // Character written between fields.
    char delim;
    // Character used to quote fields.
    char quote;
    // Character written at the end of every row.
    char newline;
    // When true, every field is quoted.
    bool quote_all;
    // When true, the stream is flushed after each row.
    bool flush;
} CsvWriter;
508
509CsvWriter* csvwriter_new(const char* filename) {
510 CsvWriter* writer = malloc(sizeof(CsvWriter));
511 if (!writer) {
512 fprintf(stderr, "error allocating memory for CsvWriter\n");
513 return NULL;
514 }
515
516 writer->stream = fopen(filename, "w");
517 if (!writer->stream) {
518 fprintf(stderr, "error opening file %s\n", filename);
519 free(writer);
520 return NULL;
521 }
522
523 writer->delim = ',';
524 writer->quote = '"';
525 writer->newline = '\n';
526 writer->quote_all = false;
527 writer->flush = false;
528 return writer;
529}
530
// Decide whether a field must be wrapped in quotes when written.
//
// A field needs quoting when it contains the delimiter, the quote character,
// the configured row terminator, or any carriage return / line feed. CR and LF
// are checked regardless of the configured terminator because either one
// would otherwise break the row apart for a consumer.
static inline bool field_needs_quoting(const char* field, char delim, char quote, char newline) {
    return strchr(field, delim) != NULL || strchr(field, quote) != NULL ||
           strchr(field, newline) != NULL || strpbrk(field, "\r\n") != NULL;
}
543
// Write `field` to `fp` wrapped in `quote` characters.
//
// Every quote character inside the field is written twice, which is the usual
// CSV escaping. Returns false as soon as a write reports EOF, true otherwise.
static bool write_quoted_field(FILE* fp, const char* field, char quote) {
    // Opening quote.
    if (fputc(quote, fp) == EOF) {
        return false;
    }

    const char* cursor = field;
    while (*cursor != '\0') {
        const char ch = *cursor++;

        // An embedded quote gets one extra quote written in front of it.
        if (ch == quote && fputc(quote, fp) == EOF) {
            return false;
        }
        if (fputc(ch, fp) == EOF) {
            return false;
        }
    }

    // Closing quote.
    return fputc(quote, fp) != EOF;
}
578
// Write one field to `fp`, quoting it when requested or required.
//
// A NULL field is written as an empty string. The field is quoted when
// `quote_all` is set or when field_needs_quoting() says so; otherwise it is
// written verbatim. Returns false when the underlying write fails.
static bool write_single_field(FILE* fp, const char* field, bool quote_all, char delim, char quote, char newline) {
    const char* text = (field != NULL) ? field : "";

    // Plain path first: nothing in the field forces quoting.
    if (!quote_all && !field_needs_quoting(text, delim, quote, newline)) {
        return fputs(text, fp) != EOF;
    }

    return write_quoted_field(fp, text, quote);
}
602
610bool csvwriter_write_row(CsvWriter* writer, const char** fields, size_t numfields) {
611 // Input validation
612 if (writer == NULL) {
613 errno = EINVAL;
614 return false;
615 }
616
617 if (fields == NULL && numfields > 0) {
618 errno = EINVAL;
619 return false;
620 }
621
622 FILE* fp = NULL;
623
624 if (numfields == 0) {
625 // Writing empty row - just write newline
626 if (fputc(writer->newline, writer->stream) == EOF) {
627 return false;
628 }
629 goto flush_and_exit;
630 }
631
632 fp = writer->stream;
633
634 // Check if stream is valid before proceeding
635 if (ferror(fp)) {
636 return false;
637 }
638
639 // Write all fields with delimiters
640 for (size_t i = 0; i < numfields; i++) {
641 // Write delimiter before all fields except the first
642 if (i > 0) {
643 if (fputc(writer->delim, fp) == EOF) {
644 return false;
645 }
646 }
647
648 // Write the field content
649 if (!write_single_field(fp, fields[i], writer->quote_all, writer->delim, writer->quote, writer->newline)) {
650 return false;
651 }
652 }
653
654 // Write row terminator
655 if (fputc(writer->newline, fp) == EOF) {
656 return false;
657 }
658
659flush_and_exit:
660 // Flush if requested
661 if (writer->flush && fp) {
662 if (fflush(fp) != 0) {
663 return false;
664 }
665 }
666
667 // Final error check
668 return !ferror(fp);
669}
670
671void csvwriter_free(CsvWriter* writer) {
672 if (!writer) return;
673 if (writer->stream) fclose(writer->stream);
674 free(writer);
675}
676
677// configure the csv writer
678void csvwriter_setconfig(CsvWriter* writer, CsvWriterConfig config) {
679 if (config.delim != '\0') {
680 writer->delim = config.delim;
681 }
682
683 if (config.quote != '\0') {
684 writer->quote = config.quote;
685 }
686
687 writer->quote_all = config.quote_all;
688 writer->flush = config.flush;
689}
size_t csv_reader_numrows(const CsvReader *reader)
Get the number of rows in the CSV data.
Definition csvparser.c:294
void csv_reader_free(CsvReader *reader)
Free memory used by the CsvReader and CsvRow structures.
Definition csvparser.c:296
struct CsvReader CsvReader
Opaque structure representing a CSV parser. Create a new CSV parser with csv_reader_new and free it w...
Definition csvparser.h:48
bool csvwriter_write_row(CsvWriter *writer, const char **fields, size_t numfields)
Definition csvparser.c:610
Row ** csv_reader_parse(CsvReader *reader)
Parse the CSV data and retrieve all the rows at once.
Definition csvparser.c:138
CsvReader * csv_reader_new(const char *filename, size_t arena_memory)
Create a new CSV reader associated with a filename.
Definition csvparser.c:49
void csv_reader_parse_async(CsvReader *reader, CsvRowCallback callback, size_t alloc_max)
Parse the CSV data and pass each processed row back in a callback. Return true from the callback to s...
Definition csvparser.c:218
struct CsvWriter CsvWriter
Definition csvparser.h:164
Row: structure representing a CSV row.
Definition csvparser.h:53
size_t count
Number of fields in each row.
Definition csvparser.h:55
char ** fields
Array of fields in each row.
Definition csvparser.h:54