solidc
Robust collection of general-purpose cross-platform C libraries and data structures designed for rapid and safe development in C
Loading...
Searching...
No Matches
unicode.h
Go to the documentation of this file.
1#ifndef SOLIDC_UNICODE_H
2#define SOLIDC_UNICODE_H
3
23#include <stdbool.h>
24#include <stddef.h>
25#include <stdint.h>
26
27#ifdef __cplusplus
28extern "C" {
29#endif
30
32#define UNICODE_VERSION 0x0100 // 1.0
33
35#define UNICODE_MAX_CODEPOINT 0x10FFFF // 1,114,111
36
38#define UNICODE_MAX_UTF8_BYTES 4
39
40#define UTF8_MAX_LEN (UNICODE_MAX_UTF8_BYTES + 1)
41
48typedef struct utf8_string {
49 char* data;
50 size_t length;
51 size_t count;
52} utf8_string;
53
54/* ============================================================================
55 * Core Encoding/Decoding Functions
56 * ============================================================================ */
57
64void ucp_to_utf8(uint32_t codepoint, char utf8[UTF8_MAX_LEN]);
65
72uint32_t utf8_to_codepoint(const char* utf8);
73
79size_t utf8_count_codepoints(const char* utf8);
80
87size_t utf8_valid_byte_count(const char* s);
88
94size_t utf8_char_length(const char* str);
95
96/* ============================================================================
97 * Validation Functions
98 * ============================================================================ */
99
105bool is_valid_codepoint(uint32_t codepoint);
106
113bool is_valid_utf8(const char* utf8);
114
115/* ============================================================================
116 * Character Classification Functions
117 * ============================================================================ */
118
124bool is_codepoint_whitespace(uint32_t codepoint);
125
131bool is_utf8_whitespace(const char* utf8);
132
138bool is_codepoint_digit(uint32_t codepoint);
139
145bool is_utf8_digit(const char* utf8);
146
152bool is_codepoint_alpha(uint32_t codepoint);
153
159bool is_utf8_alpha(const char* utf8);
160
166bool is_codepoint_alnum(uint32_t codepoint);
167
173bool is_utf8_alnum(const char* utf8);
174
180bool is_codepoint_punct(uint32_t codepoint);
181
187bool is_utf8_punct(const char* utf8);
188
189/* ============================================================================
190 * String Object Lifecycle Functions
191 * ============================================================================ */
192
199utf8_string* utf8_new(const char* data);
200
207utf8_string* utf8_new_with_capacity(size_t capacity);
208
213void utf8_free(utf8_string* s);
214
221char* utf8_copy(const char* data);
222
229utf8_string* utf8_clone(const utf8_string* s);
230
231/* ============================================================================
232 * String Access and Information Functions
233 * ============================================================================ */
234
241const char* utf8_data(const utf8_string* s);
242
247void utf8_print(const utf8_string* s);
248
253void utf8_print_info(const utf8_string* s);
254
259void utf8_print_codepoints(const utf8_string* s);
260
261/* ============================================================================
262 * String Search and Comparison Functions
263 * ============================================================================ */
264
271int utf8_index_of(const utf8_string* s, const char* utf8);
272
279int utf8_last_index_of(const utf8_string* s, const char* utf8);
280
287bool utf8_starts_with(const char* str, const char* prefix);
288
295bool utf8_ends_with(const char* str, const char* suffix);
296
303bool utf8_contains(const char* str, const char* substr);
304
311int utf8_compare(const char* s1, const char* s2);
312
319bool utf8_equals(const char* s1, const char* s2);
320
321/* ============================================================================
322 * String Modification Functions
323 * ============================================================================ */
324
331bool utf8_append(utf8_string* s, const char* data);
332
341char* utf8_substr(const utf8_string* s, size_t index, size_t utf8_byte_len);
342
350bool utf8_insert(utf8_string* s, size_t index, const char* data);
351
359bool utf8_remove(utf8_string* s, size_t index, size_t count);
360
368bool utf8_replace(utf8_string* s, const char* old_str, const char* new_str);
369
377size_t utf8_replace_all(utf8_string* s, const char* old_str, const char* new_str);
378
384bool utf8_reverse(utf8_string* s);
385
393utf8_string* utf8_concat(const utf8_string* s1, const utf8_string* s2);
394
395/* ============================================================================
396 * String Transformation Functions (In-Place)
397 * ============================================================================ */
398
403void utf8_ltrim(char* str);
404
409void utf8_rtrim(char* str);
410
415void utf8_trim(char* str);
416
422void utf8_trim_chars(char* str, const char* c);
423
429void utf8_trim_char(char* str, char c);
430
436void utf8_tolower(char* str);
437
443void utf8_toupper(char* str);
444
445/* ============================================================================
446 * String Splitting and Array Functions
447 * ============================================================================ */
448
457utf8_string** utf8_split(const utf8_string* str, const char* delim, size_t* num_parts);
458
464void utf8_split_free(utf8_string** str, size_t size);
465
472void utf8_array_remove(utf8_string** array, size_t size, size_t index);
473
474/* ============================================================================
475 * File I/O Functions
476 * ============================================================================ */
477
484long utf8_writeto(const utf8_string* s, const char* filename);
485
492utf8_string* utf8_readfrom(const char* filename);
493
494/* ============================================================================
495 * Pattern Matching Functions
496 * ============================================================================ */
497
505bool regex_match(const char* str, const char* pattern);
506
507#ifdef __cplusplus
508}
509#endif
510
511#endif /* SOLIDC_UNICODE_H */
Represents a mutable UTF-8 encoded string with metadata.
Definition unicode.h:48
char * data
Definition unicode.h:49
size_t length
Definition unicode.h:50
size_t count
Definition unicode.h:51
uint32_t utf8_to_codepoint(const char *utf8)
Decodes a UTF-8 byte sequence to its Unicode codepoint.
Definition unicode.c:69
void utf8_toupper(char *str)
Converts all lowercase characters to uppercase in-place.
Definition unicode.c:1191
utf8_string * utf8_readfrom(const char *filename)
Reads a UTF-8 string from a file.
Definition unicode.c:882
utf8_string * utf8_new_with_capacity(size_t capacity)
Creates an empty utf8_string with preallocated capacity.
Definition unicode.c:481
size_t utf8_replace_all(utf8_string *s, const char *old_str, const char *new_str)
Replaces all occurrences of a substring with another string.
Definition unicode.c:773
bool is_utf8_digit(const char *utf8)
Checks if a UTF-8 character represents a digit.
Definition unicode.c:339
bool utf8_replace(utf8_string *s, const char *old_str, const char *new_str)
Replaces the first occurrence of a substring with another string.
Definition unicode.c:734
void utf8_array_remove(utf8_string **array, size_t size, size_t index)
Removes an element from a utf8_string array and frees it.
Definition unicode.c:1318
utf8_string * utf8_concat(const utf8_string *s1, const utf8_string *s2)
Concatenates two utf8_string objects into a new string.
Definition unicode.c:1463
bool utf8_reverse(utf8_string *s)
Reverses a UTF-8 string by codepoints.
Definition unicode.c:821
bool is_codepoint_alnum(uint32_t codepoint)
Checks if a codepoint represents an alphanumeric character.
Definition unicode.c:373
char * utf8_substr(const utf8_string *s, size_t index, size_t utf8_byte_len)
Extracts a substring by byte range.
Definition unicode.c:648
int utf8_last_index_of(const utf8_string *s, const char *utf8)
Finds the byte index of the last occurrence of a substring.
Definition unicode.c:587
utf8_string ** utf8_split(const utf8_string *str, const char *delim, size_t *num_parts)
Splits a UTF-8 string into parts using a delimiter.
Definition unicode.c:1241
const char * utf8_data(const utf8_string *s)
Returns a pointer to the internal UTF-8 data buffer.
Definition unicode.c:437
void utf8_free(utf8_string *s)
Frees all resources associated with a utf8_string.
Definition unicode.c:505
void utf8_ltrim(char *str)
Removes leading whitespace from a UTF-8 string in-place.
Definition unicode.c:930
char * utf8_copy(const char *data)
Creates a copy of a UTF-8 string containing only valid UTF-8 sequences.
Definition unicode.c:416
size_t utf8_valid_byte_count(const char *s)
Counts the number of valid UTF-8 bytes in a string.
Definition unicode.c:162
bool utf8_ends_with(const char *str, const char *suffix)
Checks if a string ends with a given suffix.
Definition unicode.c:1375
bool utf8_equals(const char *s1, const char *s2)
Compares two UTF-8 strings for equality.
Definition unicode.c:1434
bool is_utf8_punct(const char *utf8)
Checks if a UTF-8 character represents a punctuation character.
Definition unicode.c:402
bool utf8_remove(utf8_string *s, size_t index, size_t count)
Removes a specified number of codepoints starting at a byte index.
Definition unicode.c:702
bool utf8_insert(utf8_string *s, size_t index, const char *data)
Inserts UTF-8 data at a specific byte index.
Definition unicode.c:673
bool is_codepoint_whitespace(uint32_t codepoint)
Checks if a codepoint represents whitespace.
Definition unicode.c:310
bool is_codepoint_punct(uint32_t codepoint)
Checks if a codepoint represents a punctuation character.
Definition unicode.c:394
bool is_utf8_alnum(const char *utf8)
Checks if a UTF-8 character represents an alphanumeric character.
Definition unicode.c:381
void utf8_rtrim(char *str)
Removes trailing whitespace from a UTF-8 string in-place.
Definition unicode.c:957
bool utf8_append(utf8_string *s, const char *data)
Appends UTF-8 data to the end of a utf8_string.
Definition unicode.c:617
int utf8_index_of(const utf8_string *s, const char *utf8)
Finds the byte index of the first occurrence of a substring.
Definition unicode.c:568
bool is_codepoint_digit(uint32_t codepoint)
Checks if a codepoint represents a digit.
Definition unicode.c:331
void utf8_split_free(utf8_string **str, size_t size)
Frees an array of utf8_string objects returned by utf8_split().
Definition unicode.c:1336
void utf8_trim(char *str)
Removes leading and trailing whitespace from a UTF-8 string in-place.
Definition unicode.c:988
void utf8_print(const utf8_string *s)
Prints the UTF-8 string content to stdout followed by a newline.
Definition unicode.c:519
int utf8_compare(const char *s1, const char *s2)
Compares two UTF-8 strings lexicographically.
Definition unicode.c:1414
bool is_valid_utf8(const char *utf8)
Comprehensively validates a UTF-8 encoded string.
Definition unicode.c:256
bool regex_match(const char *str, const char *pattern)
Tests if a string matches a regular expression pattern.
size_t utf8_char_length(const char *str)
Determines the byte length of a UTF-8 character from its first byte.
Definition unicode.c:214
void utf8_trim_chars(char *str, const char *c)
Removes leading and trailing characters from a UTF-8 string in-place.
Definition unicode.c:1004
void utf8_print_info(const utf8_string *s)
Prints metadata about the UTF-8 string to stdout.
Definition unicode.c:531
void utf8_trim_char(char *str, char c)
Removes leading and trailing occurrences of a single character.
Definition unicode.c:1114
utf8_string * utf8_clone(const utf8_string *s)
Duplicates a utf8_string object.
Definition unicode.c:1448
void ucp_to_utf8(uint32_t codepoint, char utf8[UTF8_MAX_LEN])
Converts a Unicode codepoint to its UTF-8 byte sequence.
Definition unicode.c:27
utf8_string * utf8_new(const char *data)
Creates a new utf8_string object from a C string.
Definition unicode.c:453
size_t utf8_count_codepoints(const char *utf8)
Counts the number of Unicode codepoints in a UTF-8 string.
Definition unicode.c:133
bool is_utf8_whitespace(const char *utf8)
Checks if a UTF-8 character represents whitespace.
Definition unicode.c:318
bool is_valid_codepoint(uint32_t codepoint)
Validates whether a codepoint is within the valid Unicode range.
Definition unicode.c:240
bool utf8_contains(const char *str, const char *substr)
Checks if a string contains a substring.
Definition unicode.c:1400
void utf8_tolower(char *str)
Converts all uppercase characters to lowercase in-place.
Definition unicode.c:1144
bool is_codepoint_alpha(uint32_t codepoint)
Checks if a codepoint represents an alphabetic character.
Definition unicode.c:352
bool utf8_starts_with(const char *str, const char *prefix)
Checks if a string starts with a given prefix.
Definition unicode.c:1354
bool is_utf8_alpha(const char *utf8)
Checks if a UTF-8 character represents an alphabetic character.
Definition unicode.c:360
long utf8_writeto(const utf8_string *s, const char *filename)
Writes a utf8_string to a file.
Definition unicode.c:855
void utf8_print_codepoints(const utf8_string *s)
Prints the Unicode codepoints in U+XXXX format to stdout.
Definition unicode.c:544