solidc
Robust collection of general-purpose cross-platform C libraries and data structures designed for rapid and safe development in C
Loading...
Searching...
No Matches
unicode.c
1#include "../include/unicode.h"
2
3#include <assert.h>
4#include <errno.h>
5#include <locale.h>
6#include <stdio.h>
7#include <stdlib.h>
8#include <string.h>
9#include <wctype.h>
10
27void ucp_to_utf8(uint32_t codepoint, char utf8[UTF8_MAX_LEN]) {
28 if (!utf8) {
29 return;
30 }
31
32 if (codepoint <= 0x7F) {
33 utf8[0] = (char)codepoint;
34 utf8[1] = '\0';
35 } else if (codepoint <= 0x7FF) {
36 utf8[0] = (char)(0xC0 | (codepoint >> 6));
37 utf8[1] = (char)(0x80 | (codepoint & 0x3F));
38 utf8[2] = '\0';
39 } else if (codepoint <= 0xFFFF) {
40 utf8[0] = (char)(0xE0 | (codepoint >> 12));
41 utf8[1] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
42 utf8[2] = (char)(0x80 | (codepoint & 0x3F));
43 utf8[3] = '\0';
44 } else if (codepoint <= 0x10FFFF) {
45 utf8[0] = (char)(0xF0 | (codepoint >> 18));
46 utf8[1] = (char)(0x80 | ((codepoint >> 12) & 0x3F));
47 utf8[2] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
48 utf8[3] = (char)(0x80 | (codepoint & 0x3F));
49 utf8[4] = '\0';
50 } else {
51 utf8[0] = '\0';
52 }
53}
54
/**
 * Decodes the single UTF-8 sequence at the start of @p utf8.
 *
 * @param utf8 NUL-terminated byte string; only the first sequence is read.
 * @return The decoded codepoint, or U+FFFD (replacement character) when the
 *         pointer is NULL, the lead byte is invalid, a continuation byte is
 *         missing, the encoding is overlong, the value is a surrogate, or it
 *         exceeds U+10FFFF. An empty string decodes to 0.
 */
uint32_t utf8_to_codepoint(const char* utf8) {
    const uint32_t replacement = 0xFFFD;

    if (utf8 == NULL) {
        return replacement;
    }

    const unsigned char* bytes = (const unsigned char*)utf8;
    const unsigned char lead = bytes[0];

    if (lead < 0x80) {
        return lead;
    }

    size_t width;       /* total bytes in the sequence */
    uint32_t value;     /* payload bits accumulated so far */
    uint32_t smallest;  /* lowest value that legitimately needs this width */

    if ((lead & 0xE0) == 0xC0) {
        width = 2;
        value = lead & 0x1FU;
        smallest = 0x80;
    } else if ((lead & 0xF0) == 0xE0) {
        width = 3;
        value = lead & 0x0FU;
        smallest = 0x800;
    } else if ((lead & 0xF8) == 0xF0) {
        width = 4;
        value = lead & 0x07U;
        smallest = 0x10000;
    } else {
        return replacement;
    }

    /* Stop at the first non-continuation byte; a NUL terminator is never a
     * continuation byte, so this cannot read past the end of the string. */
    for (size_t k = 1; k < width; k++) {
        if ((bytes[k] & 0xC0) != 0x80) {
            return replacement;
        }
        value = (value << 6) | (bytes[k] & 0x3FU);
    }

    if (value < smallest) {
        return replacement; /* overlong encoding */
    }
    if (value >= 0xD800 && value <= 0xDFFF) {
        return replacement; /* surrogate */
    }
    if (value > 0x10FFFF) {
        return replacement;
    }
    return value;
}
118
/**
 * Counts codepoints by counting every byte that is not a UTF-8
 * continuation byte (10xxxxxx). No validation is performed.
 *
 * @param s NUL-terminated string, or NULL.
 * @return Number of non-continuation bytes before the terminator; 0 for NULL.
 */
size_t utf8_count_codepoints(const char* s) {
    size_t total = 0;

    if (s != NULL) {
        for (const char* p = s; *p != '\0'; ++p) {
            const unsigned char b = (unsigned char)*p;
            if ((b & 0xC0) != 0x80) {
                ++total;
            }
        }
    }
    return total;
}
146
/**
 * Sums the bytes that belong to structurally complete UTF-8 sequences.
 *
 * A sequence counts when its lead byte announces a width of 1-4 and every
 * following byte of that width is a continuation byte. Bytes that do not
 * start such a sequence are skipped one at a time. Overlong forms and
 * surrogates are not rejected here.
 *
 * @param s NUL-terminated string, or NULL.
 * @return Number of bytes in complete sequences; 0 for NULL.
 */
size_t utf8_valid_byte_count(const char* s) {
    if (s == NULL) {
        return 0;
    }

    const unsigned char* p = (const unsigned char*)s;
    size_t valid = 0;

    while (*p != 0) {
        size_t width;

        if (*p < 0x80) {
            width = 1;
        } else if ((*p & 0xE0) == 0xC0) {
            width = 2;
        } else if ((*p & 0xF0) == 0xE0) {
            width = 3;
        } else if ((*p & 0xF8) == 0xF0) {
            width = 4;
        } else {
            width = 0; /* not a lead byte */
        }

        /* Walk the continuation bytes; the NUL terminator fails the mask
         * test, so the scan never leaves the string. */
        size_t seen = 1;
        while (seen < width && (p[seen] & 0xC0) == 0x80) {
            ++seen;
        }

        if (width != 0 && seen == width) {
            valid += width;
            p += width;
        } else {
            ++p;
        }
    }
    return valid;
}
200
/**
 * Returns the length announced by the first byte of a UTF-8 sequence.
 *
 * @param str Pointer to the first byte of a character, or NULL.
 * @return 1-4 for a lead byte, 0 when @p str is NULL or the byte cannot
 *         start a sequence (continuation bytes 0x80..0xBF and 0xF8..0xFF).
 *
 * Only the first byte is inspected; the following bytes are not checked.
 * A NUL byte is reported as length 1, so callers must test for the
 * terminator themselves.
 */
size_t utf8_char_length(const char* str) {
    if (!str) {
        return 0;
    }

    const uint8_t byte = (uint8_t)*str;
    if (byte <= 0x7F) {
        return 1;
    }
    if (byte <= 0xBF) {
        /* 10xxxxxx is a continuation byte, never the start of a character.
         * Reporting it as a 2-byte lead let callers skip over unrelated
         * bytes (including the terminator). */
        return 0;
    }
    if (byte <= 0xDF) {
        return 2;
    }
    if (byte <= 0xEF) {
        return 3;
    }
    if (byte <= 0xF7) {
        return 4;
    }
    return 0;
}
232
240bool is_valid_codepoint(uint32_t codepoint) { return codepoint <= UNICODE_MAX_CODEPOINT; }
241
/**
 * Validates an entire NUL-terminated string as UTF-8.
 *
 * Rejects invalid lead bytes, missing or malformed continuation bytes,
 * overlong encodings, surrogates (U+D800..U+DFFF) and values above
 * U+10FFFF.
 *
 * @param utf8 String to check, or NULL.
 * @return true when every sequence is well formed (the empty string is
 *         valid); false for NULL or on the first malformed sequence.
 */
bool is_valid_utf8(const char* utf8) {
    if (utf8 == NULL) {
        return false;
    }

    const unsigned char* p = (const unsigned char*)utf8;

    while (*p != 0) {
        const unsigned char lead = *p;

        if (lead < 0x80) {
            ++p;
            continue;
        }

        size_t width;
        uint32_t value;
        uint32_t smallest; /* lowest value that needs this many bytes */

        if ((lead & 0xE0) == 0xC0) {
            width = 2;
            value = lead & 0x1FU;
            smallest = 0x80;
        } else if ((lead & 0xF0) == 0xE0) {
            width = 3;
            value = lead & 0x0FU;
            smallest = 0x800;
        } else if ((lead & 0xF8) == 0xF0) {
            width = 4;
            value = lead & 0x07U;
            smallest = 0x10000;
        } else {
            return false;
        }

        /* A terminator in the middle of a sequence fails the mask test. */
        for (size_t k = 1; k < width; k++) {
            if ((p[k] & 0xC0) != 0x80) {
                return false;
            }
            value = (value << 6) | (p[k] & 0x3FU);
        }

        if (value < smallest || value > 0x10FFFF) {
            return false;
        }
        if (value >= 0xD800 && value <= 0xDFFF) {
            return false;
        }
        p += width;
    }
    return true;
}
302
/**
 * Reports whether iswspace() classifies @p codepoint as whitespace.
 * The value is passed to iswspace() as-is (converted to wint_t).
 */
bool is_codepoint_whitespace(uint32_t codepoint) {
    const int classified = iswspace(codepoint);
    return classified != 0;
}
311
/**
 * Checks whether the UTF-8 character at the start of @p utf8 is whitespace.
 *
 * @param utf8 Pointer to the first byte of a character, or NULL.
 * @return false for NULL; otherwise the result of classifying the decoded
 *         codepoint with is_codepoint_whitespace(). A malformed sequence is
 *         classified as U+FFFD, which is what utf8_to_codepoint() returns.
 */
bool is_utf8_whitespace(const char* utf8) {
    if (!utf8) {
        return false;
    }
    /* Without this return the non-NULL path fell off the end of a
     * non-void function. */
    return is_codepoint_whitespace(utf8_to_codepoint(utf8));
}
324
/**
 * Reports whether iswdigit() classifies @p codepoint as a decimal digit.
 * The value is passed to iswdigit() as-is (converted to wint_t).
 */
bool is_codepoint_digit(uint32_t codepoint) {
    const int classified = iswdigit(codepoint);
    return classified != 0;
}
332
/**
 * Checks whether the UTF-8 character at the start of @p utf8 is a digit.
 *
 * @param utf8 Pointer to the first byte of a character, or NULL.
 * @return false for NULL; otherwise the result of classifying the decoded
 *         codepoint with is_codepoint_digit(). A malformed sequence is
 *         classified as U+FFFD, which is what utf8_to_codepoint() returns.
 */
bool is_utf8_digit(const char* utf8) {
    if (!utf8) {
        return false;
    }
    /* Without this return the non-NULL path fell off the end of a
     * non-void function. */
    return is_codepoint_digit(utf8_to_codepoint(utf8));
}
345
/**
 * Reports whether iswalpha() classifies @p codepoint as alphabetic.
 * The value is passed to iswalpha() as-is (converted to wint_t).
 */
bool is_codepoint_alpha(uint32_t codepoint) {
    const int classified = iswalpha(codepoint);
    return classified != 0;
}
353
/**
 * Checks whether the UTF-8 character at the start of @p utf8 is alphabetic.
 *
 * @param utf8 Pointer to the first byte of a character, or NULL.
 * @return false for NULL; otherwise the result of classifying the decoded
 *         codepoint with is_codepoint_alpha(). A malformed sequence is
 *         classified as U+FFFD, which is what utf8_to_codepoint() returns.
 */
bool is_utf8_alpha(const char* utf8) {
    if (!utf8) {
        return false;
    }
    /* Without this return the non-NULL path fell off the end of a
     * non-void function. */
    return is_codepoint_alpha(utf8_to_codepoint(utf8));
}
366
/**
 * Reports whether iswalnum() classifies @p codepoint as alphanumeric.
 * The value is passed to iswalnum() as-is (converted to wint_t).
 */
bool is_codepoint_alnum(uint32_t codepoint) {
    const int classified = iswalnum(codepoint);
    return classified != 0;
}
374
/**
 * Checks whether the UTF-8 character at the start of @p utf8 is alphanumeric.
 *
 * @param utf8 Pointer to the first byte of a character, or NULL.
 * @return false for NULL; otherwise the result of classifying the decoded
 *         codepoint with is_codepoint_alnum(). A malformed sequence is
 *         classified as U+FFFD, which is what utf8_to_codepoint() returns.
 */
bool is_utf8_alnum(const char* utf8) {
    if (!utf8) {
        return false;
    }
    /* Without this return the non-NULL path fell off the end of a
     * non-void function. */
    return is_codepoint_alnum(utf8_to_codepoint(utf8));
}
387
/**
 * Reports whether iswpunct() classifies @p codepoint as punctuation.
 * The value is passed to iswpunct() as-is (converted to wint_t).
 */
bool is_codepoint_punct(uint32_t codepoint) {
    const int classified = iswpunct(codepoint);
    return classified != 0;
}
395
/**
 * Checks whether the UTF-8 character at the start of @p utf8 is punctuation.
 *
 * @param utf8 Pointer to the first byte of a character, or NULL.
 * @return false for NULL; otherwise the result of classifying the decoded
 *         codepoint with is_codepoint_punct(). A malformed sequence is
 *         classified as U+FFFD, which is what utf8_to_codepoint() returns.
 */
bool is_utf8_punct(const char* utf8) {
    if (!utf8) {
        return false;
    }
    /* Without this return the non-NULL path fell off the end of a
     * non-void function. */
    return is_codepoint_punct(utf8_to_codepoint(utf8));
}
408
/*
 * Returns the width (1-4) of the structurally complete UTF-8 sequence that
 * starts at p, or 0 if p does not start one. p must not point at the NUL
 * terminator. Never reads past the terminator.
 */
static size_t utf8_copy_sequence_width(const unsigned char* p) {
    size_t width;

    if (*p < 0x80) {
        width = 1;
    } else if ((*p & 0xE0) == 0xC0) {
        width = 2;
    } else if ((*p & 0xF0) == 0xE0) {
        width = 3;
    } else if ((*p & 0xF8) == 0xF0) {
        width = 4;
    } else {
        return 0;
    }

    for (size_t k = 1; k < width; k++) {
        if ((p[k] & 0xC0) != 0x80) {
            return 0;
        }
    }
    return width;
}

/**
 * Creates a heap copy of @p data that keeps only structurally complete
 * UTF-8 sequences (the same rule utf8_valid_byte_count() applies).
 *
 * Previously the first N bytes were copied, where N was the number of valid
 * bytes; with malformed input that kept the bad bytes and cut off the valid
 * tail. Malformed bytes are now skipped, so the result length equals N.
 *
 * @param data NUL-terminated source string, or NULL.
 * @return Newly allocated NUL-terminated string the caller must free(),
 *         or NULL if @p data is NULL or allocation fails.
 */
char* utf8_copy(const char* data) {
    if (!data) {
        return NULL;
    }

    const unsigned char* src = (const unsigned char*)data;

    /* Pass 1: size the result exactly. */
    size_t length = 0;
    for (const unsigned char* p = src; *p != 0;) {
        size_t width = utf8_copy_sequence_width(p);
        length += width;
        p += (width != 0) ? width : 1;
    }

    char* copy = (char*)malloc(length + 1);
    if (!copy) {
        return NULL;
    }

    /* Pass 2: copy complete sequences, drop everything else. */
    char* out = copy;
    for (const unsigned char* p = src; *p != 0;) {
        size_t width = utf8_copy_sequence_width(p);
        if (width != 0) {
            memcpy(out, p, width);
            out += width;
            p += width;
        } else {
            p++;
        }
    }
    *out = '\0';
    return copy;
}
429
437const char* utf8_data(const utf8_string* s) {
438 if (!s) {
439 return NULL;
440 }
441 return s->data;
442}
443
453utf8_string* utf8_new(const char* data) {
454 if (!data) {
455 return NULL;
456 }
457
458 utf8_string* s = (utf8_string*)malloc(sizeof(utf8_string));
459 if (!s) {
460 return NULL;
461 }
462
463 s->data = utf8_copy(data);
464 if (!s->data) {
465 free(s);
466 return NULL;
467 }
468
469 s->length = utf8_valid_byte_count(data);
470 s->count = utf8_count_codepoints(data);
471 return s;
472}
473
482 utf8_string* s = (utf8_string*)malloc(sizeof(utf8_string));
483 if (!s) {
484 return NULL;
485 }
486
487 s->data = (char*)malloc(capacity + 1);
488 if (!s->data) {
489 free(s);
490 return NULL;
491 }
492
493 s->data[0] = '\0';
494 s->length = 0;
495 s->count = 0;
496 return s;
497}
498
506 if (!s) {
507 return;
508 }
509
510 free(s->data);
511 free(s);
512}
513
519void utf8_print(const utf8_string* s) {
520 if (!s || !s->data) {
521 return;
522 }
523 printf("%s\n", s->data);
524}
525
532 if (!s) {
533 return;
534 }
535 printf("Byte Length: %zu\n", s->length);
536 printf("Code Points: %zu\n", s->count);
537}
538
545 if (!s || !s->data) {
546 return;
547 }
548
549 for (size_t i = 0; s->data[i] != '\0';) {
550 uint32_t codepoint = utf8_to_codepoint(&s->data[i]);
551 printf("U+%04X ", codepoint);
552 size_t len = utf8_char_length(&s->data[i]);
553 if (len == 0) {
554 break;
555 }
556 i += len;
557 }
558 printf("\n");
559}
560
568int utf8_index_of(const utf8_string* s, const char* utf8) {
569 if (!s || !s->data || !utf8) {
570 return -1;
571 }
572
573 char* index = strstr(s->data, utf8);
574 if (index) {
575 return (int)(index - s->data);
576 }
577 return -1;
578}
579
587int utf8_last_index_of(const utf8_string* s, const char* utf8) {
588 if (!s || !s->data || !utf8) {
589 return -1;
590 }
591
592 size_t needle_len = strlen(utf8);
593 if (needle_len == 0 || needle_len > s->length) {
594 return -1;
595 }
596
597 for (size_t i = s->length - needle_len; i > 0; i--) {
598 if (memcmp(&s->data[i], utf8, needle_len) == 0) {
599 return (int)i;
600 }
601 }
602
603 if (memcmp(s->data, utf8, needle_len) == 0) {
604 return 0;
605 }
606
607 return -1;
608}
609
617bool utf8_append(utf8_string* s, const char* data) {
618 if (!s || !data) {
619 return false;
620 }
621
622 size_t length = utf8_valid_byte_count(data);
623 size_t count = utf8_count_codepoints(data);
624
625 char* new_data = (char*)realloc(s->data, s->length + length + 1);
626 if (!new_data) {
627 return false;
628 }
629
630 s->data = new_data;
631 memcpy(&s->data[s->length], data, length);
632 s->length += length;
633 s->count += count;
634 s->data[s->length] = '\0';
635 return true;
636}
637
648char* utf8_substr(const utf8_string* s, size_t index, size_t utf8_byte_len) {
649 if (!s || !s->data || index >= s->length || utf8_byte_len == 0) {
650 return NULL;
651 }
652
653 if (index + utf8_byte_len > s->length) {
654 utf8_byte_len = s->length - index;
655 }
656
657 char* substr = (char*)malloc(utf8_byte_len + 1);
658 if (substr) {
659 memcpy(substr, &s->data[index], utf8_byte_len);
660 substr[utf8_byte_len] = '\0';
661 }
662 return substr;
663}
664
673bool utf8_insert(utf8_string* s, size_t index, const char* data) {
674 if (!s || !s->data || !data || index > s->length) {
675 return false;
676 }
677
678 size_t length = utf8_valid_byte_count(data);
679 size_t count = utf8_count_codepoints(data);
680 char* new_data = (char*)realloc(s->data, s->length + length + 1);
681
682 if (!new_data) {
683 return false;
684 }
685
686 s->data = new_data;
687 memmove(&s->data[index + length], &s->data[index], s->length - index + 1);
688 memcpy(&s->data[index], data, length);
689 s->length += length;
690 s->count += count;
691 return true;
692}
693
702bool utf8_remove(utf8_string* s, size_t index, size_t count) {
703 if (!s || !s->data || index >= s->length || count == 0) {
704 return false;
705 }
706
707 size_t i = index;
708 for (size_t j = 0; j < count && i < s->length; j++) {
709 size_t len = utf8_char_length(&s->data[i]);
710 if (len == 0) {
711 break;
712 }
713 i += len;
714 }
715
716 if (i > s->length) {
717 i = s->length;
718 }
719
720 memmove(&s->data[index], &s->data[i], s->length - i + 1);
721 s->length -= i - index;
722 s->count -= count;
723 return true;
724}
725
734bool utf8_replace(utf8_string* s, const char* old_str, const char* new_str) {
735 if (!s || !s->data || !old_str || !new_str) {
736 return false;
737 }
738
739 size_t old_byte_len = utf8_valid_byte_count(old_str);
740 size_t new_byte_len = utf8_valid_byte_count(new_str);
741 size_t old_count = utf8_count_codepoints(old_str);
742 size_t new_count = utf8_count_codepoints(new_str);
743
744 char* index = strstr(s->data, old_str);
745 if (index == NULL) {
746 return false;
747 }
748
749 size_t offset = (size_t)(index - s->data);
750 if (old_byte_len != new_byte_len) {
751 char* new_data = (char*)realloc(s->data, s->length - old_byte_len + new_byte_len + 1);
752 if (!new_data) {
753 return false;
754 }
755 s->data = new_data;
756 }
757
758 memmove(&s->data[offset + new_byte_len], &s->data[offset + old_byte_len], s->length - offset - old_byte_len + 1);
759 memcpy(&s->data[offset], new_str, new_byte_len);
760 s->length = s->length - old_byte_len + new_byte_len;
761 s->count = s->count - old_count + new_count;
762 return true;
763}
764
773size_t utf8_replace_all(utf8_string* s, const char* old_str, const char* new_str) {
774 if (!s || !s->data || !old_str || !new_str) {
775 return 0;
776 }
777
778 size_t old_byte_len = utf8_valid_byte_count(old_str);
779 if (old_byte_len == 0) {
780 return 0;
781 }
782
783 size_t new_byte_len = utf8_valid_byte_count(new_str);
784 size_t old_count = utf8_count_codepoints(old_str);
785 size_t new_count = utf8_count_codepoints(new_str);
786 size_t replacements = 0;
787
788 char* index = s->data;
789 while ((index = strstr(index, old_str)) != NULL) {
790 size_t offset = (size_t)(index - s->data);
791
792 if (old_byte_len != new_byte_len) {
793 char* new_data = (char*)realloc(s->data, s->length - old_byte_len + new_byte_len + 1);
794 if (!new_data) {
795 return replacements;
796 }
797 s->data = new_data;
798 index = s->data + offset;
799 }
800
801 memmove(&s->data[offset + new_byte_len], &s->data[offset + old_byte_len],
802 s->length - offset - old_byte_len + 1);
803 memcpy(&s->data[offset], new_str, new_byte_len);
804 s->length = s->length - old_byte_len + new_byte_len;
805 s->count = s->count - old_count + new_count;
806 index += new_byte_len;
807 replacements++;
808 }
809
810 return replacements;
811}
812
822 if (!s || !s->data || s->length == 0) {
823 return false;
824 }
825
826 char* reversed = (char*)malloc(s->length + 1);
827 if (!reversed) {
828 return false;
829 }
830
831 size_t j = s->length;
832 for (size_t i = 0; i < s->length;) {
833 size_t len = utf8_char_length(&s->data[i]);
834 if (len == 0 || j < len) {
835 free(reversed);
836 return false;
837 }
838 j -= len;
839 memcpy(&reversed[j], &s->data[i], len);
840 i += len;
841 }
842 reversed[s->length] = '\0';
843 free(s->data);
844 s->data = reversed;
845 return true;
846}
847
855long utf8_writeto(const utf8_string* s, const char* filename) {
856 if (!s || !s->data || !filename) {
857 return -1;
858 }
859
860 FILE* file = fopen(filename, "w");
861 if (!file) {
862 return -1;
863 }
864
865 size_t bytes = fwrite(s->data, 1, s->length, file);
866 fclose(file);
867
868 if (bytes != s->length) {
869 return -1;
870 }
871
872 return (long)bytes;
873}
874
882utf8_string* utf8_readfrom(const char* filename) {
883 if (!filename) {
884 return NULL;
885 }
886
887 FILE* file = fopen(filename, "r");
888 if (!file) {
889 return NULL;
890 }
891
892 if (fseek(file, 0, SEEK_END) != 0) {
893 fclose(file);
894 return NULL;
895 }
896
897 long length = ftell(file);
898 if (length < 0) {
899 fclose(file);
900 return NULL;
901 }
902
903 if (fseek(file, 0, SEEK_SET) != 0) {
904 fclose(file);
905 return NULL;
906 }
907
908 char* data = (char*)malloc((size_t)length + 1);
909 if (!data) {
910 fclose(file);
911 return NULL;
912 }
913
914 size_t bytes = fread(data, 1, (size_t)length, file);
915 data[bytes] = '\0';
916 fclose(file);
917
918 utf8_string* s = utf8_new(data);
919 free(data);
920 return s;
921}
922
/**
 * Removes leading whitespace characters in place.
 *
 * Characters are tested with is_utf8_whitespace() and stepped over using
 * utf8_char_length(); the remainder (with its terminator) is shifted to the
 * front of the buffer.
 *
 * @param str NUL-terminated string to modify; NULL is ignored.
 */
void utf8_ltrim(char* str) {
    if (str == NULL) {
        return;
    }

    const size_t total = strlen(str);
    size_t skip = 0;

    for (;;) {
        if (skip >= total || !is_utf8_whitespace(str + skip)) {
            break;
        }
        const size_t width = utf8_char_length(str + skip);
        if (width == 0) {
            break;
        }
        skip += width;
    }

    if (skip != 0) {
        memmove(str, str + skip, total - skip + 1);
    }
}
949
/**
 * Removes trailing whitespace characters in place.
 *
 * Works backwards from the end: finds the start of the last character by
 * skipping continuation bytes, checks that the character ends exactly at
 * the current end and is whitespace, and if so cuts the string there.
 *
 * @param str NUL-terminated string to modify; NULL is ignored.
 */
void utf8_rtrim(char* str) {
    if (str == NULL) {
        return;
    }

    size_t end = strlen(str);

    for (;;) {
        if (end == 0) {
            break;
        }

        /* Back up over continuation bytes to the character's first byte. */
        size_t first = end - 1;
        while (first > 0 && (str[first] & 0xC0) == 0x80) {
            first--;
        }

        const size_t width = utf8_char_length(str + first);
        const bool ends_here = (width > 0) && (first + width == end);
        if (!ends_here || !is_utf8_whitespace(str + first)) {
            break;
        }
        end = first;
    }

    str[end] = '\0';
}
982
/**
 * Removes leading, then trailing, whitespace in place by calling
 * utf8_ltrim() and utf8_rtrim(). NULL is ignored.
 */
void utf8_trim(char* str) {
    if (str != NULL) {
        utf8_ltrim(str);
        utf8_rtrim(str);
    }
}
995
/* Linear search: is cp among the first n entries of set? */
static bool utf8_trim_set_has(const uint32_t* set, size_t n, uint32_t cp) {
    for (size_t k = 0; k < n; k++) {
        if (set[k] == cp) {
            return true;
        }
    }
    return false;
}

/**
 * Removes, in place, every leading and trailing character of @p str that
 * appears in @p chars.
 *
 * @param str   NUL-terminated string to modify.
 * @param chars NUL-terminated list of characters to strip. At most 256
 *              distinct codepoints are used; bytes that do not decode
 *              (utf8_to_codepoint() returns U+FFFD) are ignored, which
 *              also means U+FFFD itself can never be trimmed.
 *
 * Nothing happens when either argument is NULL.
 */
void utf8_trim_chars(char* str, const char* chars) {
    if (str == NULL || chars == NULL) {
        return;
    }

    /* --- Build the set of codepoints to strip. --- */
    uint32_t set[256] = {0};
    size_t set_size = 0;
    const size_t chars_len = strlen(chars);

    size_t pos = 0;
    while (pos < chars_len && set_size < 256) {
        const size_t width = utf8_char_length(chars + pos);
        if (width == 0 || pos + width > chars_len) {
            pos++; /* undecodable byte: skip just this one */
            continue;
        }

        const uint32_t cp = utf8_to_codepoint(chars + pos);
        if (cp != 0xFFFD && !utf8_trim_set_has(set, set_size, cp)) {
            set[set_size++] = cp;
        }
        pos += width;
    }

    /* --- Strip from the front. --- */
    size_t len = strlen(str);
    size_t head = 0;
    while (head < len) {
        const size_t width = utf8_char_length(str + head);
        if (width == 0 || head + width > len) {
            break;
        }

        const uint32_t cp = utf8_to_codepoint(str + head);
        if (cp == 0xFFFD || !utf8_trim_set_has(set, set_size, cp)) {
            break;
        }
        head += width;
    }

    if (head > 0) {
        memmove(str, str + head, len - head + 1);
        len -= head;
    }

    /* --- Strip from the back. --- */
    size_t tail = len;
    while (tail > 0) {
        /* Back up over continuation bytes to the character's first byte. */
        size_t first = tail - 1;
        while (first > 0 && (str[first] & 0xC0) == 0x80) {
            first--;
        }
        if ((str[first] & 0xC0) == 0x80) {
            break; /* reached the start without finding a lead byte */
        }

        const size_t width = utf8_char_length(str + first);
        if (width == 0 || first + width != tail) {
            break;
        }

        const uint32_t cp = utf8_to_codepoint(str + first);
        if (cp == 0xFFFD || !utf8_trim_set_has(set, set_size, cp)) {
            break;
        }
        tail = first;
    }

    str[tail] = '\0';
}
1106
/**
 * Removes, in place, every leading and trailing occurrence of the single
 * byte @p c.
 *
 * @param str NUL-terminated string to modify; NULL is ignored.
 * @param c   Byte value to strip from both ends.
 */
void utf8_trim_char(char* str, char c) {
    if (str == NULL) {
        return;
    }

    const size_t total = strlen(str);

    size_t lead = 0;
    while (lead < total && str[lead] == c) {
        ++lead;
    }

    size_t kept = total - lead;
    if (lead != 0) {
        memmove(str, str + lead, kept + 1); /* +1 carries the terminator */
    }

    while (kept > 0 && str[kept - 1] == c) {
        --kept;
    }
    str[kept] = '\0';
}
1135
1144void utf8_tolower(char* str) {
1145 if (!str) return;
1146
1147 for (size_t i = 0; str[i] != '\0';) {
1148 unsigned char byte = (unsigned char)str[i];
1149
1150 /* ---- ASCII fast path (covers the vast majority of real text) ---- */
1151 if (byte < 0x80u) {
1152 /* Branchless lowercase: set bit 5 only when byte is A-Z. */
1153 /* Mask is 1 for characters in 'A'..'Z' (0x41..0x5A), 0 otherwise */
1154 unsigned int is_upper = (byte - 'A' + 1u <= 26u) ? 1u : 0u;
1155 str[i] = (char)(byte | (is_upper << 5));
1156 i++;
1157 continue;
1158 }
1159
1160 /* ---- Multibyte path ---- */
1161 uint32_t codepoint = utf8_to_codepoint(&str[i]);
1162 size_t old_len = utf8_char_length(&str[i]);
1163
1164 if (old_len == 0) break;
1165
1166 if (iswupper((wint_t)codepoint)) {
1167 char utf8[UTF8_MAX_LEN] = {0};
1168 ucp_to_utf8((uint32_t)towlower((wint_t)codepoint), utf8);
1169 size_t new_len = utf8_valid_byte_count(utf8);
1170
1171 if (new_len != old_len) {
1172 size_t remaining = strlen(&str[i + old_len]);
1173 memmove(&str[i + new_len], &str[i + old_len], remaining + 1);
1174 }
1175 memcpy(&str[i], utf8, new_len);
1176 i += new_len;
1177 } else {
1178 i += old_len;
1179 }
1180 }
1181}
1182
1191void utf8_toupper(char* str) {
1192 if (!str) return;
1193
1194 for (size_t i = 0; str[i] != '\0';) {
1195 unsigned char byte = (unsigned char)str[i];
1196
1197 /* ---- ASCII fast path ---- */
1198 if (byte < 0x80u) {
1199 /* Branchless uppercase: clear bit 5 only when byte is a-z. */
1200 unsigned int is_lower = (byte - 'a' + 1u <= 26u) ? 1u : 0u;
1201 str[i] = (char)(byte & (unsigned char)~(is_lower << 5));
1202 i++;
1203 continue;
1204 }
1205
1206 /* ---- Multibyte path ---- */
1207 uint32_t codepoint = utf8_to_codepoint(&str[i]);
1208 size_t old_len = utf8_char_length(&str[i]);
1209
1210 if (old_len == 0) break;
1211
1212 if (iswlower((wint_t)codepoint)) {
1213 char utf8[UTF8_MAX_LEN] = {0};
1214 ucp_to_utf8((uint32_t)towupper((wint_t)codepoint), utf8);
1215 size_t new_len = utf8_valid_byte_count(utf8);
1216
1217 if (new_len != old_len) {
1218 size_t remaining = strlen(&str[i + old_len]);
1219 memmove(&str[i + new_len], &str[i + old_len], remaining + 1);
1220 }
1221 memcpy(&str[i], utf8, new_len);
1222 i += new_len;
1223 } else {
1224 i += old_len;
1225 }
1226 }
1227}
1228
1241utf8_string** utf8_split(const utf8_string* str, const char* delim, size_t* num_parts) {
1242 if (!str || !str->data || !delim || !num_parts) {
1243 if (num_parts) {
1244 *num_parts = 0;
1245 }
1246 return NULL;
1247 }
1248
1249 size_t delim_len = utf8_valid_byte_count(delim);
1250 if (delim_len == 0) {
1251 *num_parts = 0;
1252 return NULL;
1253 }
1254
1255 size_t count = 1;
1256 size_t len = str->length;
1257 for (size_t i = 0; i < len;) {
1258 if (i + delim_len <= len && utf8_starts_with(&str->data[i], delim)) {
1259 count++;
1260 i += delim_len;
1261 } else {
1262 size_t char_len = utf8_char_length(&str->data[i]);
1263 if (char_len == 0) {
1264 break;
1265 }
1266 i += char_len;
1267 }
1268 }
1269
1270 utf8_string** parts = (utf8_string**)malloc(count * sizeof(utf8_string*));
1271 if (!parts) {
1272 *num_parts = 0;
1273 return NULL;
1274 }
1275
1276 size_t index = 0;
1277 size_t start = 0;
1278 for (size_t i = 0; i < len;) {
1279 if (i + delim_len <= len && utf8_starts_with(&str->data[i], delim)) {
1280 parts[index] = utf8_new(&str->data[start]);
1281 if (parts[index]) {
1282 parts[index]->data[i - start] = '\0';
1283 parts[index]->length = i - start;
1284 parts[index]->count = utf8_count_codepoints(parts[index]->data);
1285 }
1286 index++;
1287 i += delim_len;
1288 start = i;
1289 } else {
1290 size_t char_len = utf8_char_length(&str->data[i]);
1291 if (char_len == 0) {
1292 break;
1293 }
1294 i += char_len;
1295 }
1296 }
1297
1298 parts[index] = utf8_new(&str->data[start]);
1299 if (parts[index]) {
1300 parts[index]->data[len - start] = '\0';
1301 parts[index]->length = len - start;
1302 parts[index]->count = utf8_count_codepoints(parts[index]->data);
1303 }
1304
1305 *num_parts = count;
1306 return parts;
1307}
1308
1318void utf8_array_remove(utf8_string** array, size_t size, size_t index) {
1319 if (!array || index >= size) {
1320 return;
1321 }
1322
1323 utf8_free(array[index]);
1324 for (size_t i = index; i < size - 1; i++) {
1325 array[i] = array[i + 1];
1326 }
1327 array[size - 1] = NULL;
1328}
1329
1336void utf8_split_free(utf8_string** str, size_t size) {
1337 if (!str) {
1338 return;
1339 }
1340
1341 for (size_t i = 0; i < size; i++) {
1342 utf8_free(str[i]);
1343 }
1344 free(str);
1345}
1346
/**
 * Checks whether @p str begins with @p prefix.
 *
 * The number of bytes compared is utf8_valid_byte_count(prefix).
 *
 * @return true when all those bytes match; false on a mismatch or when
 *         either argument is NULL.
 */
bool utf8_starts_with(const char* str, const char* prefix) {
    if (str == NULL || prefix == NULL) {
        return false;
    }

    const size_t wanted = utf8_valid_byte_count(prefix);
    size_t matched = 0;

    /* A shorter str stops the scan at its terminator, which cannot equal
     * a prefix byte inside the compared range. */
    while (matched < wanted && str[matched] == prefix[matched]) {
        matched++;
    }
    return matched == wanted;
}
1367
/**
 * Checks whether @p str ends with @p suffix.
 *
 * Both lengths are taken from utf8_valid_byte_count(); the last
 * suffix-length bytes within that length of @p str are compared.
 *
 * @return true when they match; false on a mismatch, when the suffix is
 *         longer than the string, or when either argument is NULL.
 */
bool utf8_ends_with(const char* str, const char* suffix) {
    if (str == NULL || suffix == NULL) {
        return false;
    }

    const size_t str_len = utf8_valid_byte_count(str);
    const size_t suffix_len = utf8_valid_byte_count(suffix);
    if (suffix_len > str_len) {
        return false;
    }

    const char* tail = str + (str_len - suffix_len);
    size_t matched = 0;
    while (matched < suffix_len && tail[matched] == suffix[matched]) {
        matched++;
    }
    return matched == suffix_len;
}
1392
/**
 * Checks whether @p substr occurs anywhere in @p str (byte-wise, via
 * strstr()).
 *
 * @return true on a match; false otherwise or when either argument is NULL.
 */
bool utf8_contains(const char* str, const char* substr) {
    if (str == NULL || substr == NULL) {
        return false;
    }

    const char* hit = strstr(str, substr);
    return hit != NULL;
}
1406
/**
 * Orders two strings byte-wise with strcmp(), treating NULL as smaller
 * than any string.
 *
 * @return 0 when both are NULL, -1 when only @p s1 is NULL, 1 when only
 *         @p s2 is NULL, otherwise the strcmp() result.
 */
int utf8_compare(const char* s1, const char* s2) {
    if (s1 == NULL || s2 == NULL) {
        if (s1 == s2) {
            return 0;
        }
        return (s1 == NULL) ? -1 : 1;
    }
    return strcmp(s1, s2);
}
1426
/**
 * Tests two strings for byte-wise equality.
 *
 * @return true when both are non-NULL and strcmp() reports equality, or
 *         when both are NULL; false otherwise.
 */
bool utf8_equals(const char* s1, const char* s2) {
    if (s1 != NULL && s2 != NULL) {
        return strcmp(s1, s2) == 0;
    }
    return s1 == s2;
}
1440
1449 if (!s || !s->data) {
1450 return NULL;
1451 }
1452 return utf8_new(s->data);
1453}
1454
1464 if (!s1 || !s1->data || !s2 || !s2->data) {
1465 return NULL;
1466 }
1467
1468 utf8_string* result = utf8_new_with_capacity(s1->length + s2->length);
1469 if (!result) {
1470 return NULL;
1471 }
1472
1473 memcpy(result->data, s1->data, s1->length);
1474 memcpy(result->data + s1->length, s2->data, s2->length);
1475 result->data[s1->length + s2->length] = '\0';
1476 result->length = s1->length + s2->length;
1477 result->count = s1->count + s2->count;
1478
1479 return result;
1480}
Represents a mutable UTF-8 encoded string with metadata.
Definition unicode.h:48
char * data
Definition unicode.h:49
size_t length
Definition unicode.h:50
size_t count
Definition unicode.h:51
uint32_t utf8_to_codepoint(const char *utf8)
Decodes a UTF-8 byte sequence to its Unicode codepoint.
Definition unicode.c:69
void utf8_toupper(char *str)
Converts all lowercase characters to uppercase in-place.
Definition unicode.c:1191
utf8_string * utf8_readfrom(const char *filename)
Reads a UTF-8 string from a file.
Definition unicode.c:882
utf8_string * utf8_new_with_capacity(size_t capacity)
Creates an empty utf8_string with preallocated capacity.
Definition unicode.c:481
size_t utf8_replace_all(utf8_string *s, const char *old_str, const char *new_str)
Replaces all occurrences of a substring with another string.
Definition unicode.c:773
bool is_utf8_digit(const char *utf8)
Checks if a UTF-8 character represents a digit.
Definition unicode.c:339
bool utf8_replace(utf8_string *s, const char *old_str, const char *new_str)
Replaces the first occurrence of a substring with another string.
Definition unicode.c:734
void utf8_array_remove(utf8_string **array, size_t size, size_t index)
Removes an element from a utf8_string array and frees it.
Definition unicode.c:1318
utf8_string * utf8_concat(const utf8_string *s1, const utf8_string *s2)
Concatenates two utf8_string objects into a new string.
Definition unicode.c:1463
bool utf8_reverse(utf8_string *s)
Reverses a UTF-8 string by codepoints.
Definition unicode.c:821
bool is_codepoint_alnum(uint32_t codepoint)
Checks if a codepoint represents an alphanumeric character.
Definition unicode.c:373
char * utf8_substr(const utf8_string *s, size_t index, size_t utf8_byte_len)
Extracts a substring by byte range.
Definition unicode.c:648
int utf8_last_index_of(const utf8_string *s, const char *utf8)
Finds the byte index of the last occurrence of a substring.
Definition unicode.c:587
utf8_string ** utf8_split(const utf8_string *str, const char *delim, size_t *num_parts)
Splits a UTF-8 string into parts using a delimiter.
Definition unicode.c:1241
const char * utf8_data(const utf8_string *s)
Returns a pointer to the internal UTF-8 data buffer.
Definition unicode.c:437
void utf8_free(utf8_string *s)
Frees all resources associated with a utf8_string.
Definition unicode.c:505
void utf8_ltrim(char *str)
Removes leading whitespace from a UTF-8 string in-place.
Definition unicode.c:930
char * utf8_copy(const char *data)
Creates a copy of a UTF-8 string containing only valid UTF-8 sequences.
Definition unicode.c:416
size_t utf8_valid_byte_count(const char *s)
Counts the number of valid UTF-8 bytes in a string.
Definition unicode.c:162
bool utf8_ends_with(const char *str, const char *suffix)
Checks if a string ends with a given suffix.
Definition unicode.c:1375
bool utf8_equals(const char *s1, const char *s2)
Compares two UTF-8 strings for equality.
Definition unicode.c:1434
bool is_utf8_punct(const char *utf8)
Checks if a UTF-8 character represents a punctuation character.
Definition unicode.c:402
bool utf8_remove(utf8_string *s, size_t index, size_t count)
Removes a specified number of codepoints starting at a byte index.
Definition unicode.c:702
bool utf8_insert(utf8_string *s, size_t index, const char *data)
Inserts UTF-8 data at a specific byte index.
Definition unicode.c:673
bool is_codepoint_whitespace(uint32_t codepoint)
Checks if a codepoint represents whitespace.
Definition unicode.c:310
bool is_codepoint_punct(uint32_t codepoint)
Checks if a codepoint represents a punctuation character.
Definition unicode.c:394
bool is_utf8_alnum(const char *utf8)
Checks if a UTF-8 character represents an alphanumeric character.
Definition unicode.c:381
void utf8_rtrim(char *str)
Removes trailing whitespace from a UTF-8 string in-place.
Definition unicode.c:957
bool utf8_append(utf8_string *s, const char *data)
Appends UTF-8 data to the end of a utf8_string.
Definition unicode.c:617
int utf8_index_of(const utf8_string *s, const char *utf8)
Finds the byte index of the first occurrence of a substring.
Definition unicode.c:568
bool is_codepoint_digit(uint32_t codepoint)
Checks if a codepoint represents a digit.
Definition unicode.c:331
void utf8_split_free(utf8_string **str, size_t size)
Frees an array of utf8_string objects returned by utf8_split().
Definition unicode.c:1336
void utf8_trim(char *str)
Removes leading and trailing whitespace from a UTF-8 string in-place.
Definition unicode.c:988
void utf8_print(const utf8_string *s)
Prints the UTF-8 string content to stdout followed by a newline.
Definition unicode.c:519
int utf8_compare(const char *s1, const char *s2)
Compares two UTF-8 strings lexicographically.
Definition unicode.c:1414
bool is_valid_utf8(const char *utf8)
Comprehensively validates a UTF-8 encoded string.
Definition unicode.c:256
size_t utf8_char_length(const char *str)
Determines the byte length of a UTF-8 character from its first byte.
Definition unicode.c:214
#define UNICODE_MAX_CODEPOINT
Definition unicode.h:35
void utf8_trim_chars(char *str, const char *c)
Removes leading and trailing characters from a UTF-8 string in-place.
Definition unicode.c:1004
void utf8_print_info(const utf8_string *s)
Prints metadata about the UTF-8 string to stdout.
Definition unicode.c:531
void utf8_trim_char(char *str, char c)
Removes leading and trailing occurrences of a single character.
Definition unicode.c:1114
utf8_string * utf8_clone(const utf8_string *s)
Duplicates a utf8_string object.
Definition unicode.c:1448
void ucp_to_utf8(uint32_t codepoint, char utf8[UTF8_MAX_LEN])
Converts a Unicode codepoint to its UTF-8 byte sequence.
Definition unicode.c:27
utf8_string * utf8_new(const char *data)
Creates a new utf8_string object from a C string.
Definition unicode.c:453
size_t utf8_count_codepoints(const char *utf8)
Counts the number of Unicode codepoints in a UTF-8 string.
Definition unicode.c:133
bool is_utf8_whitespace(const char *utf8)
Checks if a UTF-8 character represents whitespace.
Definition unicode.c:318
bool is_valid_codepoint(uint32_t codepoint)
Validates whether a codepoint is within the valid Unicode range.
Definition unicode.c:240
bool utf8_contains(const char *str, const char *substr)
Checks if a string contains a substring.
Definition unicode.c:1400
void utf8_tolower(char *str)
Converts all uppercase characters to lowercase in-place.
Definition unicode.c:1144
bool is_codepoint_alpha(uint32_t codepoint)
Checks if a codepoint represents an alphabetic character.
Definition unicode.c:352
bool utf8_starts_with(const char *str, const char *prefix)
Checks if a string starts with a given prefix.
Definition unicode.c:1354
bool is_utf8_alpha(const char *utf8)
Checks if a UTF-8 character represents an alphabetic character.
Definition unicode.c:360
long utf8_writeto(const utf8_string *s, const char *filename)
Writes a utf8_string to a file.
Definition unicode.c:855
void utf8_print_codepoints(const utf8_string *s)
Prints the Unicode codepoints in U+XXXX format to stdout.
Definition unicode.c:544