1#include "../include/unicode.h"
27void ucp_to_utf8(uint32_t codepoint,
char utf8[UTF8_MAX_LEN]) {
32 if (codepoint <= 0x7F) {
33 utf8[0] = (char)codepoint;
35 }
else if (codepoint <= 0x7FF) {
36 utf8[0] = (char)(0xC0 | (codepoint >> 6));
37 utf8[1] = (char)(0x80 | (codepoint & 0x3F));
39 }
else if (codepoint <= 0xFFFF) {
40 utf8[0] = (char)(0xE0 | (codepoint >> 12));
41 utf8[1] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
42 utf8[2] = (char)(0x80 | (codepoint & 0x3F));
44 }
else if (codepoint <= 0x10FFFF) {
45 utf8[0] = (char)(0xF0 | (codepoint >> 18));
46 utf8[1] = (char)(0x80 | ((codepoint >> 12) & 0x3F));
47 utf8[2] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
48 utf8[3] = (char)(0x80 | (codepoint & 0x3F));
74 uint32_t codepoint = 0;
75 const uint8_t* u = (
const uint8_t*)utf8;
77 if ((u[0] & 0x80) == 0) {
79 }
else if ((u[0] & 0xE0) == 0xC0) {
80 if ((u[1] & 0xC0) == 0x80) {
81 codepoint = ((u[0] & 0x1FU) << 6) | (u[1] & 0x3F);
82 if (codepoint < 0x80) {
88 }
else if ((u[0] & 0xF0) == 0xE0) {
89 if ((u[1] & 0xC0) == 0x80 && (u[2] & 0xC0) == 0x80) {
90 codepoint = ((u[0] & 0x0FU) << 12) | ((u[1] & 0x3FU) << 6) | (u[2] & 0x3F);
91 if (codepoint < 0x800) {
94 if (codepoint >= 0xD800 && codepoint <= 0xDFFF) {
100 }
else if ((u[0] & 0xF8) == 0xF0) {
101 if ((u[1] & 0xC0) == 0x80 && (u[2] & 0xC0) == 0x80 && (u[3] & 0xC0) == 0x80) {
102 codepoint = ((u[0] & 0x07U) << 18) | ((u[1] & 0x3FU) << 12) | ((u[2] & 0x3FU) << 6) | (u[3] & 0x3F);
103 if (codepoint < 0x10000) {
106 if (codepoint > 0x10FFFF) {
139 for (
size_t i = 0; s[i] !=
'\0'; i++) {
140 if ((s[i] & 0xC0) != 0x80) {
168 for (
size_t i = 0; s[i] !=
'\0';) {
169 unsigned char byte = (
unsigned char)s[i];
170 if ((
byte & 0x80) == 0) {
173 }
else if ((
byte & 0xE0) == 0xC0 && s[i + 1] !=
'\0') {
174 if ((s[i + 1] & 0xC0) == 0x80) {
180 }
else if ((
byte & 0xF0) == 0xE0 && s[i + 1] !=
'\0' && s[i + 2] !=
'\0') {
181 if ((s[i + 1] & 0xC0) == 0x80 && (s[i + 2] & 0xC0) == 0x80) {
187 }
else if ((
byte & 0xF8) == 0xF0 && s[i + 1] !=
'\0' && s[i + 2] !=
'\0' && s[i + 3] !=
'\0') {
188 if ((s[i + 1] & 0xC0) == 0x80 && (s[i + 2] & 0xC0) == 0x80 && (s[i + 3] & 0xC0) == 0x80) {
219 uint8_t
byte = (uint8_t)*str;
222 }
else if (
byte <= 0xDF) {
224 }
else if (
byte <= 0xEF) {
226 }
else if (
byte <= 0xF7) {
261 for (
size_t i = 0; utf8[i] !=
'\0';) {
262 unsigned char byte = (
unsigned char)utf8[i];
263 if ((
byte & 0x80) == 0) {
265 }
else if ((
byte & 0xE0) == 0xC0) {
266 if (utf8[i + 1] ==
'\0' || (utf8[i + 1] & 0xC0) != 0x80) {
269 uint32_t codepoint = (uint32_t)((
byte & 0x1F) << 6) | (utf8[i + 1] & 0x3F);
270 if (codepoint < 0x80) {
274 }
else if ((
byte & 0xF0) == 0xE0) {
275 if (utf8[i + 1] ==
'\0' || utf8[i + 2] ==
'\0' || (utf8[i + 1] & 0xC0) != 0x80 ||
276 (utf8[i + 2] & 0xC0) != 0x80) {
280 (uint32_t)((
byte & 0x0F) << 12) | (uint32_t)((utf8[i + 1] & 0x3F) << 6) | (utf8[i + 2] & 0x3F);
281 if (codepoint < 0x800 || (codepoint >= 0xD800 && codepoint <= 0xDFFF)) {
285 }
else if ((
byte & 0xF8) == 0xF0) {
286 if (utf8[i + 1] ==
'\0' || utf8[i + 2] ==
'\0' || utf8[i + 3] ==
'\0' || (utf8[i + 1] & 0xC0) != 0x80 ||
287 (utf8[i + 2] & 0xC0) != 0x80 || (utf8[i + 3] & 0xC0) != 0x80) {
290 uint32_t codepoint = (uint32_t)((
byte & 0x07) << 18) | ((uint32_t)(utf8[i + 1] & 0x3F) << 12) |
291 ((uint32_t)(utf8[i + 2] & 0x3F) << 6) | (utf8[i + 3] & 0x3F);
292 if (codepoint < 0x10000 || codepoint > 0x10FFFF) {
422 char* copy = (
char*)malloc(length + 1);
424 memcpy(copy, data, length);
487 s->
data = (
char*)malloc(capacity + 1);
520 if (!s || !s->
data) {
523 printf(
"%s\n", s->
data);
535 printf(
"Byte Length: %zu\n", s->
length);
536 printf(
"Code Points: %zu\n", s->
count);
545 if (!s || !s->
data) {
549 for (
size_t i = 0; s->
data[i] !=
'\0';) {
551 printf(
"U+%04X ", codepoint);
569 if (!s || !s->
data || !utf8) {
573 char* index = strstr(s->
data, utf8);
575 return (
int)(index - s->
data);
588 if (!s || !s->
data || !utf8) {
592 size_t needle_len = strlen(utf8);
593 if (needle_len == 0 || needle_len > s->
length) {
597 for (
size_t i = s->
length - needle_len; i > 0; i--) {
598 if (memcmp(&s->
data[i], utf8, needle_len) == 0) {
603 if (memcmp(s->
data, utf8, needle_len) == 0) {
625 char* new_data = (
char*)realloc(s->
data, s->
length + length + 1);
649 if (!s || !s->
data || index >= s->
length || utf8_byte_len == 0) {
653 if (index + utf8_byte_len > s->
length) {
654 utf8_byte_len = s->
length - index;
657 char* substr = (
char*)malloc(utf8_byte_len + 1);
659 memcpy(substr, &s->
data[index], utf8_byte_len);
660 substr[utf8_byte_len] =
'\0';
674 if (!s || !s->
data || !data || index > s->
length) {
680 char* new_data = (
char*)realloc(s->
data, s->
length + length + 1);
687 memmove(&s->
data[index + length], &s->
data[index], s->
length - index + 1);
688 memcpy(&s->
data[index], data, length);
703 if (!s || !s->
data || index >= s->
length || count == 0) {
708 for (
size_t j = 0; j < count && i < s->
length; j++) {
735 if (!s || !s->
data || !old_str || !new_str) {
744 char* index = strstr(s->
data, old_str);
749 size_t offset = (size_t)(index - s->
data);
750 if (old_byte_len != new_byte_len) {
751 char* new_data = (
char*)realloc(s->
data, s->
length - old_byte_len + new_byte_len + 1);
758 memmove(&s->
data[offset + new_byte_len], &s->
data[offset + old_byte_len], s->
length - offset - old_byte_len + 1);
759 memcpy(&s->
data[offset], new_str, new_byte_len);
774 if (!s || !s->
data || !old_str || !new_str) {
779 if (old_byte_len == 0) {
786 size_t replacements = 0;
788 char* index = s->
data;
789 while ((index = strstr(index, old_str)) != NULL) {
790 size_t offset = (size_t)(index - s->
data);
792 if (old_byte_len != new_byte_len) {
793 char* new_data = (
char*)realloc(s->
data, s->
length - old_byte_len + new_byte_len + 1);
798 index = s->
data + offset;
801 memmove(&s->
data[offset + new_byte_len], &s->
data[offset + old_byte_len],
802 s->
length - offset - old_byte_len + 1);
803 memcpy(&s->
data[offset], new_str, new_byte_len);
806 index += new_byte_len;
826 char* reversed = (
char*)malloc(s->
length + 1);
832 for (
size_t i = 0; i < s->
length;) {
834 if (len == 0 || j < len) {
839 memcpy(&reversed[j], &s->
data[i], len);
842 reversed[s->
length] =
'\0';
856 if (!s || !s->
data || !filename) {
860 FILE* file = fopen(filename,
"w");
865 size_t bytes = fwrite(s->
data, 1, s->
length, file);
887 FILE* file = fopen(filename,
"r");
892 if (fseek(file, 0, SEEK_END) != 0) {
897 long length = ftell(file);
903 if (fseek(file, 0, SEEK_SET) != 0) {
908 char* data = (
char*)malloc((
size_t)length + 1);
914 size_t bytes = fread(data, 1, (
size_t)length, file);
935 size_t len = strlen(str);
946 memmove(str, &str[i], len - i + 1);
962 size_t len = strlen(str);
966 size_t char_start = i - 1;
967 while (char_start > 0 && (str[char_start] & 0xC0) == 0x80) {
973 if (char_len > 0 && char_start + char_len == i &&
is_utf8_whitespace(&str[char_start])) {
1005 if (!str || !chars) {
1009 uint32_t trim_codepoints[256] = {0};
1010 size_t num_trim_chars = 0;
1011 size_t chars_len = strlen(chars);
1013 for (
size_t k = 0; k < chars_len && num_trim_chars < 256;) {
1015 if (current_char_len == 0 || k + current_char_len > chars_len) {
1021 if (codepoint != 0xFFFD) {
1023 for (
size_t idx = 0; idx < num_trim_chars; ++idx) {
1024 if (trim_codepoints[idx] == codepoint) {
1029 if (!found && num_trim_chars < 256) {
1030 trim_codepoints[num_trim_chars++] = codepoint;
1033 k += current_char_len;
1036 size_t len = strlen(str);
1040 if (current_char_len == 0 || i + current_char_len > len) {
1045 bool should_trim =
false;
1047 if (codepoint != 0xFFFD) {
1048 for (
size_t j = 0; j < num_trim_chars; j++) {
1049 if (codepoint == trim_codepoints[j]) {
1060 i += current_char_len;
1064 memmove(str, &str[i], len - i + 1);
1070 size_t char_start = i - 1;
1071 while (char_start > 0 && (str[char_start] & 0xC0) == 0x80) {
1075 if ((str[char_start] & 0xC0) == 0x80) {
1081 if (char_len > 0 && char_start + char_len == i) {
1083 bool should_trim =
false;
1085 if (codepoint != 0xFFFD) {
1086 for (
size_t j = 0; j < num_trim_chars; j++) {
1087 if (codepoint == trim_codepoints[j]) {
1119 size_t len = strlen(str);
1121 while (i < len && str[i] == c) {
1126 memmove(str, &str[i], len - i + 1);
1130 while (len > 0 && str[len - 1] == c) {
1147 for (
size_t i = 0; str[i] !=
'\0';) {
1148 unsigned char byte = (
unsigned char)str[i];
1154 unsigned int is_upper = (
byte -
'A' + 1u <= 26u) ? 1u : 0u;
1155 str[i] = (char)(
byte | (is_upper << 5));
1164 if (old_len == 0)
break;
1166 if (iswupper((wint_t)codepoint)) {
1167 char utf8[UTF8_MAX_LEN] = {0};
1168 ucp_to_utf8((uint32_t)towlower((wint_t)codepoint), utf8);
1171 if (new_len != old_len) {
1172 size_t remaining = strlen(&str[i + old_len]);
1173 memmove(&str[i + new_len], &str[i + old_len], remaining + 1);
1175 memcpy(&str[i], utf8, new_len);
1194 for (
size_t i = 0; str[i] !=
'\0';) {
1195 unsigned char byte = (
unsigned char)str[i];
1200 unsigned int is_lower = (
byte -
'a' + 1u <= 26u) ? 1u : 0u;
1201 str[i] = (char)(
byte & (
unsigned char)~(is_lower << 5));
1210 if (old_len == 0)
break;
1212 if (iswlower((wint_t)codepoint)) {
1213 char utf8[UTF8_MAX_LEN] = {0};
1214 ucp_to_utf8((uint32_t)towupper((wint_t)codepoint), utf8);
1217 if (new_len != old_len) {
1218 size_t remaining = strlen(&str[i + old_len]);
1219 memmove(&str[i + new_len], &str[i + old_len], remaining + 1);
1221 memcpy(&str[i], utf8, new_len);
1242 if (!str || !str->
data || !delim || !num_parts) {
1250 if (delim_len == 0) {
1256 size_t len = str->
length;
1257 for (
size_t i = 0; i < len;) {
1263 if (char_len == 0) {
1278 for (
size_t i = 0; i < len;) {
1282 parts[index]->
data[i - start] =
'\0';
1283 parts[index]->
length = i - start;
1291 if (char_len == 0) {
1300 parts[index]->
data[len - start] =
'\0';
1301 parts[index]->
length = len - start;
1319 if (!array || index >= size) {
1324 for (
size_t i = index; i < size - 1; i++) {
1325 array[i] = array[i + 1];
1327 array[size - 1] = NULL;
1341 for (
size_t i = 0; i < size; i++) {
1355 if (!str || !prefix) {
1360 for (
size_t i = 0; i < len; i++) {
1361 if (str[i] != prefix[i]) {
1376 if (!str || !suffix) {
1385 for (
size_t i = 0; i < len2; i++) {
1386 if (str[len - len2 + i] != suffix[i]) {
1401 if (!str || !substr) {
1404 return strstr(str, substr) != NULL;
1424 return strcmp(s1, s2);
1438 return strcmp(s1, s2) == 0;
1449 if (!s || !s->
data) {
1464 if (!s1 || !s1->
data || !s2 || !s2->
data) {
Represents a mutable UTF-8 encoded string with metadata.
uint32_t utf8_to_codepoint(const char *utf8)
Decodes a UTF-8 byte sequence to its Unicode codepoint.
void utf8_toupper(char *str)
Converts all lowercase characters to uppercase in-place.
utf8_string * utf8_readfrom(const char *filename)
Reads a UTF-8 string from a file.
utf8_string * utf8_new_with_capacity(size_t capacity)
Creates an empty utf8_string with preallocated capacity.
size_t utf8_replace_all(utf8_string *s, const char *old_str, const char *new_str)
Replaces all occurrences of a substring with another string.
bool is_utf8_digit(const char *utf8)
Checks if a UTF-8 character represents a digit.
bool utf8_replace(utf8_string *s, const char *old_str, const char *new_str)
Replaces the first occurrence of a substring with another string.
void utf8_array_remove(utf8_string **array, size_t size, size_t index)
Removes an element from a utf8_string array and frees it.
utf8_string * utf8_concat(const utf8_string *s1, const utf8_string *s2)
Concatenates two utf8_string objects into a new string.
bool utf8_reverse(utf8_string *s)
Reverses a UTF-8 string by codepoints.
bool is_codepoint_alnum(uint32_t codepoint)
Checks if a codepoint represents an alphanumeric character.
char * utf8_substr(const utf8_string *s, size_t index, size_t utf8_byte_len)
Extracts a substring by byte range.
int utf8_last_index_of(const utf8_string *s, const char *utf8)
Finds the byte index of the last occurrence of a substring.
utf8_string ** utf8_split(const utf8_string *str, const char *delim, size_t *num_parts)
Splits a UTF-8 string into parts using a delimiter.
const char * utf8_data(const utf8_string *s)
Returns a pointer to the internal UTF-8 data buffer.
void utf8_free(utf8_string *s)
Frees all resources associated with a utf8_string.
void utf8_ltrim(char *str)
Removes leading whitespace from a UTF-8 string in-place.
char * utf8_copy(const char *data)
Creates a copy of a UTF-8 string containing only valid UTF-8 sequences.
size_t utf8_valid_byte_count(const char *s)
Counts the number of valid UTF-8 bytes in a string.
bool utf8_ends_with(const char *str, const char *suffix)
Checks if a string ends with a given suffix.
bool utf8_equals(const char *s1, const char *s2)
Compares two UTF-8 strings for equality.
bool is_utf8_punct(const char *utf8)
Checks if a UTF-8 character represents a punctuation character.
bool utf8_remove(utf8_string *s, size_t index, size_t count)
Removes a specified number of codepoints starting at a byte index.
bool utf8_insert(utf8_string *s, size_t index, const char *data)
Inserts UTF-8 data at a specific byte index.
bool is_codepoint_whitespace(uint32_t codepoint)
Checks if a codepoint represents whitespace.
bool is_codepoint_punct(uint32_t codepoint)
Checks if a codepoint represents a punctuation character.
bool is_utf8_alnum(const char *utf8)
Checks if a UTF-8 character represents an alphanumeric character.
void utf8_rtrim(char *str)
Removes trailing whitespace from a UTF-8 string in-place.
bool utf8_append(utf8_string *s, const char *data)
Appends UTF-8 data to the end of a utf8_string.
int utf8_index_of(const utf8_string *s, const char *utf8)
Finds the byte index of the first occurrence of a substring.
bool is_codepoint_digit(uint32_t codepoint)
Checks if a codepoint represents a digit.
void utf8_split_free(utf8_string **str, size_t size)
Frees an array of utf8_string objects returned by utf8_split().
void utf8_trim(char *str)
Removes leading and trailing whitespace from a UTF-8 string in-place.
void utf8_print(const utf8_string *s)
Prints the UTF-8 string content to stdout followed by a newline.
int utf8_compare(const char *s1, const char *s2)
Compares two UTF-8 strings lexicographically.
bool is_valid_utf8(const char *utf8)
Comprehensively validates a UTF-8 encoded string.
size_t utf8_char_length(const char *str)
Determines the byte length of a UTF-8 character from its first byte.
#define UNICODE_MAX_CODEPOINT
void utf8_trim_chars(char *str, const char *c)
Removes leading and trailing characters from a UTF-8 string in-place.
void utf8_print_info(const utf8_string *s)
Prints metadata about the UTF-8 string to stdout.
void utf8_trim_char(char *str, char c)
Removes leading and trailing occurrences of a single character.
utf8_string * utf8_clone(const utf8_string *s)
Duplicates a utf8_string object.
void ucp_to_utf8(uint32_t codepoint, char utf8[UTF8_MAX_LEN])
Converts a Unicode codepoint to its UTF-8 byte sequence.
utf8_string * utf8_new(const char *data)
Creates a new utf8_string object from a C string.
size_t utf8_count_codepoints(const char *utf8)
Counts the number of Unicode codepoints in a UTF-8 string.
bool is_utf8_whitespace(const char *utf8)
Checks if a UTF-8 character represents whitespace.
bool is_valid_codepoint(uint32_t codepoint)
Validates whether a codepoint is within the valid Unicode range.
bool utf8_contains(const char *str, const char *substr)
Checks if a string contains a substring.
void utf8_tolower(char *str)
Converts all uppercase characters to lowercase in-place.
bool is_codepoint_alpha(uint32_t codepoint)
Checks if a codepoint represents an alphabetic character.
bool utf8_starts_with(const char *str, const char *prefix)
Checks if a string starts with a given prefix.
bool is_utf8_alpha(const char *utf8)
Checks if a UTF-8 character represents an alphabetic character.
long utf8_writeto(const utf8_string *s, const char *filename)
Writes a utf8_string to a file.
void utf8_print_codepoints(const utf8_string *s)
Prints the Unicode codepoints in U+XXXX format to stdout.