solidc
Robust collection of general-purpose cross-platform C libraries and data structures designed for rapid and safe development in C
Loading...
Searching...
No Matches
csvparser.c
1#include "../include/csvparser.h"
2
3#include "../include/arena.h"
4#include "../include/cstr.h"
5#include "../include/str_utils.h"
6
7#include <ctype.h>
8#include <errno.h>
9#include <limits.h>
10#include <stdarg.h>
11#include <stdio.h>
12#include <stdlib.h>
13#include <string.h>
14
// Internal state of a CSV reader: the input stream, the parsed rows,
// the parsing configuration and the arena the rows are allocated from.
typedef struct CsvReader {
    FILE* stream;      // Input stream opened with fopen() in csv_reader_new().
    Row** rows;        // Array of row pointers (allocated from `arena`)
    size_t num_rows;   // Number of rows counted by line_count(): blank and comment lines excluded
    char delim;        // Delimiter character
    char quote;        // Quote character
    char comment;      // Comment character (lines starting with it are skipped)
    bool has_header;   // Whether the CSV file has a header
    bool skip_header;  // Whether to skip the header when parsing
    Arena* arena;      // Arena used for all row and field allocations
} CsvReader;
26
// Argument bundle passed to parse_csv_line() for a single input line.
typedef struct csv_line_params {
    Arena* arena;       // Arena the field array and field strings are allocated from
    const char* line;   // NUL-terminated line to split into fields
    size_t num_fields;  // Expected number of fields (taken from the first valid line)
    size_t rowIndex;    // Index of the row being parsed; used in error messages
    Row* row;           // Destination row whose `fields` and `count` are filled in
    char delim;         // Delimiter character
    char quote;         // Quote character
} csv_line_params;
36
// Forward declarations of the file-local helpers defined further down.
static size_t line_count(CsvReader* reader);                              // counts data lines in the stream
static size_t get_num_fields(const char* line, char delim, char quote);  // counts fields in one line
static bool parse_csv_line(csv_line_params* args);                        // splits one line into a Row
40
41static inline void set_default_config(CsvReader* reader) {
42 reader->delim = ',';
43 reader->comment = '#';
44 reader->has_header = true;
45 reader->skip_header = false;
46 reader->quote = '"';
47}
48
49CsvReader* csv_reader_new(const char* filename, size_t arena_memory) {
50 CsvReader* reader = malloc(sizeof(CsvReader));
51 if (!reader) {
52 fprintf(stderr, "error allocating memory for CsvReader\n");
53 return NULL;
54 }
55
56 FILE* stream = fopen(filename, "r");
57 if (!stream) {
58 fprintf(stderr, "error opening file %s\n", filename);
59 free(reader);
60 return NULL;
61 }
62
63 // Use passed in argument if provided or use default value.
64 Arena* arena = arena_create((arena_memory ? arena_memory : CSV_ARENA_BLOCK_SIZE));
65 if (!arena) {
66 fprintf(stderr, "error creating memory arena\n");
67 fclose(stream);
68 free(reader);
69 return NULL;
70 }
71
72 reader->arena = arena;
73 reader->num_rows = 0;
74 reader->stream = stream;
75 reader->rows = NULL;
76 set_default_config(reader);
77 return reader;
78}
79
80// Allocate memory for rows and set num_rows.
81static Row** csv_allocate_rows(Arena* arena, size_t num_rows) {
82 if (num_rows == 0) {
83 return NULL;
84 }
85
86 Row** rows = ARENA_ALLOC_ARRAY(arena, Row*, num_rows);
87 if (!rows) {
88 fprintf(stderr, "csv_allocate_rows(): arena out of memory\n");
89 return NULL;
90 }
91
92 for (size_t i = 0; i < num_rows; i++) {
93 rows[i] = arena_alloc(arena, sizeof(Row));
94 if (!rows[i]) {
95 fprintf(stderr, "csv_allocate_rows(): arena_alloc failed on row %zu\n", i);
96 return NULL;
97 }
98 }
99 return rows;
100}
101
102static inline bool read_first_valid_line(CsvReader* reader, char* line, size_t line_size) {
103 bool found_valid_line = false;
104
105 while (fgets(line, line_size, reader->stream)) {
106 // Trim whitespace from end
107 char* end = line + strlen(line) - 1;
108 while (end > line && isspace(*end)) {
109 end--;
110 }
111
112 // Skip empty lines
113 if (end == line) {
114 continue;
115 }
116
117 end[1] = '\0';
118
119 // Skip comment lines
120 if (line[0] == reader->comment) {
121 continue;
122 }
123
124 // This is a valid data/header line
125 found_valid_line = true;
126 break;
127 }
128
129 if (!found_valid_line) {
130 return false;
131 }
132
133 // Reset the file pointer to the beginning of the file
134 fseek(reader->stream, 0, SEEK_SET);
135 return true;
136}
137
138Row** csv_reader_parse(CsvReader* reader) {
139 char line[MAX_FIELD_SIZE] = {0};
140 size_t rowIndex = 0;
141 bool headerSkipped = false;
142
143 // read num_rows and allocate them on heap.
144 reader->num_rows = line_count(reader);
145 reader->rows = csv_allocate_rows(reader->arena, reader->num_rows);
146 if (!reader->rows) {
147 fclose(reader->stream);
148 return NULL;
149 }
150
151 // Read lines until we find a non-comment, non-empty line to determine field count
152 if (!read_first_valid_line(reader, line, sizeof(line))) {
153 fclose(reader->stream);
154 return NULL;
155 }
156
157 // Get the number of fields in the CSV file
158 size_t num_fields = get_num_fields(line, reader->delim, reader->quote);
159 if (num_fields == 0) {
160 fclose(reader->stream);
161 return NULL;
162 }
163
164 bool parse_success = true;
165 while (fgets(line, MAX_FIELD_SIZE, reader->stream) && rowIndex < reader->num_rows) {
166 // trim white space from end of line and skip empty lines
167 char* end = line + strlen(line) - 1;
168 while (end > line && isspace(*end)) {
169 end--;
170 }
171
172 // If the line is empty, skip it
173 if (end == line) {
174 continue;
175 }
176
177 // Terminate the line with a null character
178 end[1] = '\0';
179
180 // skip comment lines
181 if (line[0] == reader->comment) {
182 continue;
183 }
184
185 if (reader->has_header && reader->skip_header && rowIndex == 0 && !headerSkipped) {
186 headerSkipped = true;
187 continue;
188 }
189
190 csv_line_params args = {
191 .arena = reader->arena,
192 .line = line,
193 .rowIndex = rowIndex,
194 .row = reader->rows[rowIndex],
195 .delim = reader->delim,
196 .quote = reader->quote,
197 .num_fields = num_fields,
198 };
199
200 parse_success = parse_csv_line(&args);
201 if (!parse_success) {
202 break;
203 }
204 rowIndex++;
205 }
206
207 fclose(reader->stream);
208
209 if (!parse_success) {
210 fprintf(stderr, "csv_reader_parse() failed\n");
211 fprintf(stderr, "Line: %s\n", line);
212 return NULL;
213 }
214
215 return reader->rows;
216}
217
218void csv_reader_parse_async(CsvReader* reader, CsvRowCallback callback, size_t maxrows) {
219 size_t rowIndex = 0;
220 bool headerSkipped = false;
221 char line[MAX_FIELD_SIZE] = {0};
222
223 reader->num_rows = line_count(reader);
224
225 // Limit the number of rows to parse if maxrows is set
226 reader->num_rows = (maxrows > 0 && maxrows < reader->num_rows) ? maxrows : reader->num_rows;
227 reader->rows = csv_allocate_rows(reader->arena, reader->num_rows);
228 if (!reader->rows) {
229 fclose(reader->stream);
230 return;
231 }
232
233 if (!read_first_valid_line(reader, line, sizeof(line))) {
234 fclose(reader->stream);
235 return;
236 }
237
238 // Get the number of fields in the CSV file
239 size_t num_fields = get_num_fields(line, reader->delim, reader->quote);
240 if (num_fields == 0) {
241 fprintf(stderr, "Error: no fields found in CSV file\n");
242 fclose(reader->stream);
243 return;
244 }
245
246 while (fgets(line, MAX_FIELD_SIZE, reader->stream) && rowIndex < reader->num_rows) {
247 // trim white space from end of line and skip empty lines
248 char* end = line + strlen(line) - 1;
249 while (end > line && isspace(*end)) {
250 end--;
251 }
252
253 // If the line is empty, skip it
254 if (end == line) {
255 continue;
256 }
257
258 // Terminate the line with a null character
259 end[1] = '\0';
260
261 // skip comment lines
262 if (line[0] == reader->comment) {
263 continue;
264 }
265
266 if (reader->has_header && reader->skip_header && rowIndex == 0 && !headerSkipped) {
267 headerSkipped = true;
268 continue;
269 }
270
271 csv_line_params args = {
272 .arena = reader->arena,
273 .line = line,
274 .rowIndex = rowIndex,
275 .row = reader->rows[rowIndex],
276 .delim = reader->delim,
277 .quote = reader->quote,
278 .num_fields = num_fields,
279 };
280
281 if (!parse_csv_line(&args)) {
282 fprintf(stderr, "csv_reader_parse_async() failed\n");
283 break;
284 }
285
286 // Pass the processed row to the caller.
287 callback(rowIndex, reader->rows[rowIndex]);
288 rowIndex++;
289 }
290
291 fclose(reader->stream);
292}
293
294size_t csv_reader_numrows(const CsvReader* reader) { return reader->num_rows; }
295
296void csv_reader_free(CsvReader* reader) {
297 if (!reader) return;
298
299 // The row are allocated in the arena, so we only need to free the arena.
300 arena_destroy(reader->arena);
301
302 free(reader);
303 reader = NULL;
304}
305
306void csv_reader_setconfig(CsvReader* reader, CsvReaderConfig config) {
307 if (config.delim != '\0') {
308 reader->delim = config.delim;
309 }
310
311 if (config.quote != '\0') {
312 reader->quote = config.quote;
313 }
314
315 if (config.comment != '\0') {
316 reader->comment = config.comment;
317 }
318
319 reader->has_header = config.has_header;
320 reader->skip_header = config.skip_header;
321}
322
323CsvReaderConfig csv_reader_getconfig(CsvReader* reader) {
324 CsvReaderConfig config = {
325 .comment = reader->comment,
326 .delim = reader->delim,
327 .has_header = reader->has_header,
328 .skip_header = reader->skip_header,
329 .quote = reader->quote,
330 };
331 return config;
332}
333
// Count the fields in one CSV line.
//
// A field boundary is every `delim` that is not inside a quoted section;
// each `quote` character toggles the quoted state. A NULL or empty line has
// 0 fields; any other line has (number of unquoted delimiters + 1) fields,
// so a trailing delimiter yields a final empty field.
static size_t get_num_fields(const char* line, char delim, char quote) {
    if (line == NULL || line[0] == '\0') {
        return 0;
    }

    size_t numFields = 1;  // a non-empty line always holds at least one field
    bool insideQuotes = false;

    for (const char* p = line; *p != '\0'; p++) {
        if (*p == quote) {
            insideQuotes = !insideQuotes;  // Toggle insideQuotes flag
        } else if (*p == delim && !insideQuotes) {
            numFields++;
        }
    }

    return numFields;
}
353
357static bool parse_csv_line(csv_line_params* args) {
358 // statically MUST be <= 4096 to prevent stack overflow
359 char field[MAX_FIELD_SIZE] = {0};
360 int insideQuotes = 0;
361
362 Row* row = args->row;
363 row->fields = arena_alloc(args->arena, args->num_fields * sizeof(char*));
364 if (!row->fields) {
365 fprintf(stderr, "ERROR: unable to allocate memory for fields\n");
366 return false;
367 }
368
369 char** fields = row->fields;
370 size_t fieldIndex = 0;
371 row->count = 0;
372
373 for (size_t i = 0; args->line[i] != '\0'; i++) {
374 if (args->line[i] == args->quote) {
375 insideQuotes = !insideQuotes;
376 } else if (args->line[i] == args->delim && !insideQuotes) {
377 field[fieldIndex] = '\0';
378 char* trimmed = trim_string(field);
379 fields[row->count] = arena_strdup(args->arena, trimmed);
380 if (!fields[row->count]) {
381 return false;
382 }
383 row->count++;
384 fieldIndex = 0;
385 } else {
386 /* SECURITY: Prevent stack buffer overflow on oversized fields. */
387 if (fieldIndex >= MAX_FIELD_SIZE - 1) {
388 fprintf(stderr,
389 "ERROR: field in row %zu exceeds MAX_FIELD_SIZE (%d), "
390 "aborting parse\n",
391 args->rowIndex, MAX_FIELD_SIZE - 1);
392 return false;
393 }
394 field[fieldIndex++] = args->line[i];
395 }
396 }
397
398 /* If inside quotes at the end of the line, the line is not terminated */
399 if (insideQuotes) {
400 fprintf(stderr, "ERROR: unterminated quoted field:%s in line %zu\n", args->line, args->rowIndex);
401 return false;
402 }
403
404 /* Add the last field with whitespace trimming */
405 field[fieldIndex] = '\0';
406 char* trimmed = trim_string(field);
407 fields[row->count] = arena_strdup(args->arena, trimmed);
408 if (!fields[row->count]) {
409 fprintf(stderr, "ERROR: unable to allocate memory for fields[%zu]\n", row->count);
410 return false;
411 }
412 row->count++;
413
414 /* Validate the number of fields */
415 if (row->count != args->num_fields) {
416 fprintf(stderr, "ERROR: invalid number of fields in line %zu\n", args->rowIndex);
417 return false;
418 }
419
420 return true;
421}
422
423// count the number of lines in a csv file.
424// ignore comments. Optionally skip header.
425#define _CSV_READ_BUFSIZE (64u * 1024u) /* 64 KB — fits comfortably in L2 */
426
427static size_t line_count(CsvReader* reader) {
428 size_t lines = 0;
429 bool headerSkipped = false;
430 bool line_first_char = true; /* first char of a new logical line? */
431 bool skip_this_line = false; /* skip remainder of current line */
432 bool blank_line = true; /* is the current line blank? */
433
434 /* One stack buffer; we never hold a reference across a fread() call. */
435 char buf[_CSV_READ_BUFSIZE];
436 size_t nread;
437
438 rewind(reader->stream);
439
440 while ((nread = fread(buf, 1, sizeof(buf), reader->stream)) > 0) {
441 const char* p = buf;
442 const char* end = buf + nread;
443
444 while (p < end) {
445 /* Find the next newline in the remaining block. */
446 const char* nl = (const char*)memchr(p, '\n', (size_t)(end - p));
447 const char* chunk_end = nl ? nl + 1 : end;
448
449 /* ---- process characters in [p, chunk_end) ---- */
450 for (const char* c = p; c < chunk_end; c++) {
451 if (*c == '\n') {
452 /* End of line: count it if it had real content. */
453 if (!skip_this_line && !blank_line) {
454 /* Header skip (only first real data line). */
455 if (reader->has_header && reader->skip_header && !headerSkipped && lines == 0) {
456 headerSkipped = true;
457 } else {
458 lines++;
459 }
460 }
461 /* Reset state for next line. */
462 skip_this_line = false;
463 blank_line = true;
464 line_first_char = true;
465 continue;
466 }
467
468 /* Non-newline character. */
469 if (skip_this_line) continue;
470
471 if (line_first_char) {
472 line_first_char = false;
473 if (*c == reader->comment) {
474 skip_this_line = true;
475 continue;
476 }
477 }
478
479 if (*c != '\r' && (*c != ' ' && *c != '\t')) {
480 blank_line = false;
481 }
482 }
483
484 p = chunk_end;
485 }
486 }
487
488 /* Handle last line if it did not end with '\n'. */
489 if (!skip_this_line && !blank_line) {
490 if (reader->has_header && reader->skip_header && !headerSkipped && lines == 0) {
491 /* header only — nothing to count */
492 } else {
493 lines++;
494 }
495 }
496
497 rewind(reader->stream);
498 return lines;
499}
500
// Internal state of a CSV writer: the output stream and formatting options.
typedef struct CsvWriter {
    FILE* stream;    // Output stream opened with fopen(..., "w") in csvwriter_new().
    char delim;      // Delimiter character written between fields
    char quote;      // Quote character used to wrap and escape fields
    char newline;    // Character written at the end of every row
    bool quote_all;  // Quote all fields, even those that do not need it
    bool flush;      // Flush the stream after writing each row
} CsvWriter;
509
510CsvWriter* csvwriter_new(const char* filename) {
511 CsvWriter* writer = malloc(sizeof(CsvWriter));
512 if (!writer) {
513 fprintf(stderr, "error allocating memory for CsvWriter\n");
514 return NULL;
515 }
516
517 writer->stream = fopen(filename, "w");
518 if (!writer->stream) {
519 fprintf(stderr, "error opening file %s\n", filename);
520 free(writer);
521 return NULL;
522 }
523
524 writer->delim = ',';
525 writer->quote = '"';
526 writer->newline = '\n';
527 writer->quote_all = false;
528 writer->flush = false;
529 return writer;
530}
531
// Decide whether `field` must be wrapped in quotes when written.
//
// A field needs quoting when it contains the delimiter, the quote character,
// the writer's row terminator, or a CR / LF line-break character (a bare
// '\r' left unquoted is read as a row break by many CSV readers).
// The field is scanned once; `field` must not be NULL.
static inline bool field_needs_quoting(const char* field, char delim, char quote, char newline) {
    for (const char* p = field; *p != '\0'; p++) {
        const char c = *p;
        if (c == delim || c == quote || c == newline || c == '\r' || c == '\n') {
            return true;
        }
    }
    return false;
}
544
// Write `field` to `fp` wrapped in `quote` characters. Every occurrence of
// `quote` inside the field is emitted twice (the usual CSV escaping).
// Returns false as soon as any fputc() reports EOF, true otherwise.
static bool write_quoted_field(FILE* fp, const char* field, char quote) {
    // Opening quote
    bool ok = (fputc(quote, fp) != EOF);

    // Field content; the loop stops at the first failed write
    for (const char* cursor = field; ok && *cursor != '\0'; cursor++) {
        if (*cursor != quote) {
            ok = (fputc(*cursor, fp) != EOF);
        } else {
            // Escape the quote by doubling it
            ok = (fputc(quote, fp) != EOF) && (fputc(quote, fp) != EOF);
        }
    }

    // Closing quote (only attempted when everything before it succeeded)
    return ok && (fputc(quote, fp) != EOF);
}
579
// Write one field to `fp`. A NULL field is written as an empty string.
// The field is quoted when `quote_all` is set or when field_needs_quoting()
// says so; otherwise it is written verbatim with fputs().
// Returns true on success, false when a write fails.
static bool write_single_field(FILE* fp, const char* field, bool quote_all, char delim, char quote, char newline) {
    const char* text = (field != NULL) ? field : "";

    // Plain case first: nothing forces quoting, so emit the text as-is.
    if (!quote_all && !field_needs_quoting(text, delim, quote, newline)) {
        return fputs(text, fp) != EOF;
    }

    return write_quoted_field(fp, text, quote);
}
603
611bool csvwriter_write_row(CsvWriter* writer, const char** fields, size_t numfields) {
612 // Input validation
613 if (writer == NULL) {
614 errno = EINVAL;
615 return false;
616 }
617
618 if (fields == NULL && numfields > 0) {
619 errno = EINVAL;
620 return false;
621 }
622
623 FILE* fp = NULL;
624
625 if (numfields == 0) {
626 // Writing empty row - just write newline
627 if (fputc(writer->newline, writer->stream) == EOF) {
628 return false;
629 }
630 goto flush_and_exit;
631 }
632
633 fp = writer->stream;
634
635 // Check if stream is valid before proceeding
636 if (ferror(fp)) {
637 return false;
638 }
639
640 // Write all fields with delimiters
641 for (size_t i = 0; i < numfields; i++) {
642 // Write delimiter before all fields except the first
643 if (i > 0) {
644 if (fputc(writer->delim, fp) == EOF) {
645 return false;
646 }
647 }
648
649 // Write the field content
650 if (!write_single_field(fp, fields[i], writer->quote_all, writer->delim, writer->quote, writer->newline)) {
651 return false;
652 }
653 }
654
655 // Write row terminator
656 if (fputc(writer->newline, fp) == EOF) {
657 return false;
658 }
659
660flush_and_exit:
661 // Flush if requested
662 if (writer->flush && fp) {
663 if (fflush(fp) != 0) {
664 return false;
665 }
666 }
667
668 // Final error check
669 return !ferror(fp);
670}
671
672void csvwriter_free(CsvWriter* writer) {
673 if (!writer) return;
674 if (writer->stream) fclose(writer->stream);
675 free(writer);
676}
677
678// configure the csv writer
679void csvwriter_setconfig(CsvWriter* writer, CsvWriterConfig config) {
680 if (config.delim != '\0') {
681 writer->delim = config.delim;
682 }
683
684 if (config.quote != '\0') {
685 writer->quote = config.quote;
686 }
687
688 writer->quote_all = config.quote_all;
689 writer->flush = config.flush;
690}
size_t csv_reader_numrows(const CsvReader *reader)
Get the number of rows in the CSV data.
Definition csvparser.c:294
void csv_reader_free(CsvReader *reader)
Free memory used by the CsvReader and CsvRow structures.
Definition csvparser.c:296
struct CsvReader CsvReader
Opaque structure representing a CSV parser. Create a new CSV parser with csv_reader_new and free it with csv_reader_free.
Definition csvparser.h:48
bool csvwriter_write_row(CsvWriter *writer, const char **fields, size_t numfields)
Definition csvparser.c:611
Row ** csv_reader_parse(CsvReader *reader)
Parse the CSV data and retrieve all the rows at once.
Definition csvparser.c:138
CsvReader * csv_reader_new(const char *filename, size_t arena_memory)
Create a new CSV reader associated with a filename.
Definition csvparser.c:49
void csv_reader_parse_async(CsvReader *reader, CsvRowCallback callback, size_t alloc_max)
Parse the CSV data and pass each processed row back in a callback. Return true from the callback to s...
Definition csvparser.c:218
struct CsvWriter CsvWriter
Definition csvparser.h:164
Structure representing a CSV row.
Definition csvparser.h:53
size_t count
Number of fields in each row.
Definition csvparser.h:55
char ** fields
Array of fields in each row.
Definition csvparser.h:54