solidc/unicode_8c_source.html

#include "../include/unicode.h"


#include <assert.h>

#include <errno.h>

#include <locale.h>

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#include <wctype.h>


/**
 * Encodes a Unicode codepoint as a NUL-terminated UTF-8 byte sequence.
 *
 * @param codepoint Codepoint to encode.
 * @param utf8      Output buffer; receives 1-4 encoded bytes followed by
 *                  '\0'. Nothing is written when it is NULL.
 *
 * Codepoints above 0x10FFFF yield an empty string.
 */
void ucp_to_utf8(uint32_t codepoint, char utf8[UTF8_MAX_LEN]) {
    if (utf8 == NULL) {
        return;
    }

    size_t n = 0;
    if (codepoint < 0x80) {
        utf8[n++] = (char)codepoint;
    } else if (codepoint < 0x800) {
        utf8[n++] = (char)(0xC0 | (codepoint >> 6));
        utf8[n++] = (char)(0x80 | (codepoint & 0x3F));
    } else if (codepoint < 0x10000) {
        utf8[n++] = (char)(0xE0 | (codepoint >> 12));
        utf8[n++] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
        utf8[n++] = (char)(0x80 | (codepoint & 0x3F));
    } else if (codepoint < 0x110000) {
        utf8[n++] = (char)(0xF0 | (codepoint >> 18));
        utf8[n++] = (char)(0x80 | ((codepoint >> 12) & 0x3F));
        utf8[n++] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
        utf8[n++] = (char)(0x80 | (codepoint & 0x3F));
    }
    /* Out-of-range codepoints fall through with n == 0: empty string. */
    utf8[n] = '\0';
}


/**
 * Decodes the first UTF-8 sequence at utf8 into its codepoint.
 *
 * @param utf8 Pointer to the lead byte of a NUL-terminated UTF-8 string.
 * @return The decoded codepoint, or U+FFFD (replacement character) when the
 *         pointer is NULL, a continuation byte is missing, the encoding is
 *         overlong, the value is a surrogate (U+D800..U+DFFF), or the value
 *         exceeds U+10FFFF.
 */
uint32_t utf8_to_codepoint(const char* utf8) {
    const uint32_t replacement = 0xFFFD;

    if (utf8 == NULL) {
        return replacement;
    }

    const uint8_t* b = (const uint8_t*)utf8;
    const uint8_t lead = b[0];

    /* Single byte (ASCII, including the terminating NUL). */
    if ((lead & 0x80) == 0) {
        return lead;
    }

    /* Two-byte sequence: 110xxxxx 10xxxxxx. */
    if ((lead & 0xE0) == 0xC0) {
        if ((b[1] & 0xC0) != 0x80) {
            return replacement;
        }
        const uint32_t cp = ((lead & 0x1FU) << 6) | (b[1] & 0x3FU);
        return (cp < 0x80) ? replacement : cp;
    }

    /* Three-byte sequence. The || short-circuit keeps b[2] unread when b[1]
     * is not a continuation byte (e.g. the terminator). */
    if ((lead & 0xF0) == 0xE0) {
        if ((b[1] & 0xC0) != 0x80 || (b[2] & 0xC0) != 0x80) {
            return replacement;
        }
        const uint32_t cp = ((lead & 0x0FU) << 12) | ((b[1] & 0x3FU) << 6) | (b[2] & 0x3FU);
        if (cp < 0x800) {
            return replacement;
        }
        if (cp >= 0xD800 && cp <= 0xDFFF) {
            return replacement;
        }
        return cp;
    }

    /* Four-byte sequence. */
    if ((lead & 0xF8) == 0xF0) {
        if ((b[1] & 0xC0) != 0x80 || (b[2] & 0xC0) != 0x80 || (b[3] & 0xC0) != 0x80) {
            return replacement;
        }
        const uint32_t cp =
            ((lead & 0x07U) << 18) | ((b[1] & 0x3FU) << 12) | ((b[2] & 0x3FU) << 6) | (b[3] & 0x3FU);
        if (cp < 0x10000 || cp > 0x10FFFF) {
            return replacement;
        }
        return cp;
    }

    /* Stray continuation byte or invalid lead byte. */
    return replacement;
}


/**
 * Counts codepoints by counting every byte that is not a UTF-8 continuation
 * byte (10xxxxxx).
 *
 * @param s NUL-terminated string; NULL yields 0.
 * @return Number of non-continuation bytes before the terminator.
 */
size_t utf8_count_codepoints(const char* s) {
    size_t total = 0;

    if (s != NULL) {
        for (const char* p = s; *p != '\0'; ++p) {
            if ((*p & 0xC0) != 0x80) {
                ++total;
            }
        }
    }
    return total;
}


/**
 * Counts the bytes that belong to structurally well-formed UTF-8 sequences.
 *
 * A sequence is accepted when its lead byte is followed by the required
 * number of continuation bytes (10xxxxxx); overlong forms and surrogates are
 * not rejected here. Any byte that does not start such a sequence is skipped
 * one byte at a time and not counted.
 *
 * @param s NUL-terminated string; NULL yields 0.
 * @return Total number of bytes in accepted sequences.
 */
size_t utf8_valid_byte_count(const char* s) {
    if (s == NULL) {
        return 0;
    }

    const unsigned char* p = (const unsigned char*)s;
    size_t valid = 0;

    while (*p != '\0') {
        const unsigned char lead = *p;
        size_t seq = 0; /* 0 means "not a well-formed sequence here" */

        /* A NUL is never a continuation byte, so each && chain stops before
         * reading past the terminator. */
        if ((lead & 0x80) == 0) {
            seq = 1;
        } else if ((lead & 0xE0) == 0xC0) {
            if ((p[1] & 0xC0) == 0x80) {
                seq = 2;
            }
        } else if ((lead & 0xF0) == 0xE0) {
            if ((p[1] & 0xC0) == 0x80 && (p[2] & 0xC0) == 0x80) {
                seq = 3;
            }
        } else if ((lead & 0xF8) == 0xF0) {
            if ((p[1] & 0xC0) == 0x80 && (p[2] & 0xC0) == 0x80 && (p[3] & 0xC0) == 0x80) {
                seq = 4;
            }
        }

        if (seq == 0) {
            p += 1;
        } else {
            valid += seq;
            p += seq;
        }
    }
    return valid;
}


/**
 * Reports the sequence length implied by the first byte at str.
 *
 * Only the first byte is inspected; following bytes are not validated.
 *
 * @param str Pointer to a byte; NULL yields 0.
 * @return 1 for 0x00-0x7F, 2 for 0x80-0xDF (this range includes the
 *         continuation bytes 0x80-0xBF), 3 for 0xE0-0xEF, 4 for 0xF0-0xF7,
 *         and 0 for 0xF8-0xFF.
 */
size_t utf8_char_length(const char* str) {
    if (str == NULL) {
        return 0;
    }

    const unsigned int lead = (unsigned char)*str;

    if (lead >= 0xF8) {
        return 0;
    }
    if (lead >= 0xF0) {
        return 4;
    }
    if (lead >= 0xE0) {
        return 3;
    }
    if (lead >= 0x80) {
        return 2;
    }
    return 1;
}


/**
 * Range check against UNICODE_MAX_CODEPOINT.
 *
 * @param codepoint Value to test.
 * @return true when codepoint does not exceed UNICODE_MAX_CODEPOINT.
 */
bool is_valid_codepoint(uint32_t codepoint) {
    if (codepoint > UNICODE_MAX_CODEPOINT) {
        return false;
    }
    return true;
}


/**
 * Validates an entire NUL-terminated string as UTF-8.
 *
 * Rejects missing or malformed continuation bytes, overlong encodings,
 * surrogates (U+D800..U+DFFF), values above U+10FFFF and invalid lead bytes.
 *
 * @param utf8 String to validate; NULL is reported as invalid.
 * @return true when every sequence up to the terminator is valid
 *         (the empty string is valid).
 */
bool is_valid_utf8(const char* utf8) {
    if (utf8 == NULL) {
        return false;
    }

    const unsigned char* p = (const unsigned char*)utf8;

    while (*p != '\0') {
        const unsigned char lead = *p;

        if ((lead & 0x80) == 0) {
            p += 1;
            continue;
        }

        /* In each branch a NUL fails the continuation test, and the ||
         * short-circuit stops before reading beyond it. */
        if ((lead & 0xE0) == 0xC0) {
            if ((p[1] & 0xC0) != 0x80) {
                return false;
            }
            const uint32_t cp = ((uint32_t)(lead & 0x1F) << 6) | (uint32_t)(p[1] & 0x3F);
            if (cp < 0x80) {
                return false;
            }
            p += 2;
            continue;
        }

        if ((lead & 0xF0) == 0xE0) {
            if ((p[1] & 0xC0) != 0x80 || (p[2] & 0xC0) != 0x80) {
                return false;
            }
            const uint32_t cp = ((uint32_t)(lead & 0x0F) << 12) | ((uint32_t)(p[1] & 0x3F) << 6) |
                                (uint32_t)(p[2] & 0x3F);
            if (cp < 0x800) {
                return false;
            }
            if (cp >= 0xD800 && cp <= 0xDFFF) {
                return false;
            }
            p += 3;
            continue;
        }

        if ((lead & 0xF8) == 0xF0) {
            if ((p[1] & 0xC0) != 0x80 || (p[2] & 0xC0) != 0x80 || (p[3] & 0xC0) != 0x80) {
                return false;
            }
            const uint32_t cp = ((uint32_t)(lead & 0x07) << 18) | ((uint32_t)(p[1] & 0x3F) << 12) |
                                ((uint32_t)(p[2] & 0x3F) << 6) | (uint32_t)(p[3] & 0x3F);
            if (cp < 0x10000 || cp > 0x10FFFF) {
                return false;
            }
            p += 4;
            continue;
        }

        /* Stray continuation byte or 0xF8..0xFF. */
        return false;
    }

    return true;
}


/**
 * Whitespace test for a codepoint; delegates to iswspace().
 *
 * @param codepoint Codepoint to classify.
 * @return true when iswspace() reports a non-zero result.
 */
bool is_codepoint_whitespace(uint32_t codepoint) {
    return iswspace((wint_t)codepoint) != 0;
}


/**
 * Decodes the first UTF-8 sequence at utf8 and classifies the resulting
 * codepoint with is_codepoint_whitespace().
 *
 * @param utf8 Pointer to a UTF-8 sequence; NULL yields false.
 * @return true when the decoded codepoint is whitespace.
 */
bool is_utf8_whitespace(const char* utf8) {
    return utf8 ? is_codepoint_whitespace(utf8_to_codepoint(utf8)) : false;
}


/**
 * Digit test for a codepoint; delegates to iswdigit().
 *
 * @param codepoint Codepoint to classify.
 * @return true when iswdigit() reports a non-zero result.
 */
bool is_codepoint_digit(uint32_t codepoint) {
    return iswdigit((wint_t)codepoint) != 0;
}


/**
 * Decodes the first UTF-8 sequence at utf8 and classifies the resulting
 * codepoint with is_codepoint_digit().
 *
 * @param utf8 Pointer to a UTF-8 sequence; NULL yields false.
 * @return true when the decoded codepoint is a digit.
 */
bool is_utf8_digit(const char* utf8) {
    return utf8 ? is_codepoint_digit(utf8_to_codepoint(utf8)) : false;
}


/**
 * Alphabetic test for a codepoint; delegates to iswalpha().
 *
 * @param codepoint Codepoint to classify.
 * @return true when iswalpha() reports a non-zero result.
 */
bool is_codepoint_alpha(uint32_t codepoint) {
    return iswalpha((wint_t)codepoint) != 0;
}


/**
 * Decodes the first UTF-8 sequence at utf8 and classifies the resulting
 * codepoint with is_codepoint_alpha().
 *
 * @param utf8 Pointer to a UTF-8 sequence; NULL yields false.
 * @return true when the decoded codepoint is alphabetic.
 */
bool is_utf8_alpha(const char* utf8) {
    return utf8 ? is_codepoint_alpha(utf8_to_codepoint(utf8)) : false;
}


/**
 * Alphanumeric test for a codepoint; delegates to iswalnum().
 *
 * @param codepoint Codepoint to classify.
 * @return true when iswalnum() reports a non-zero result.
 */
bool is_codepoint_alnum(uint32_t codepoint) {
    return iswalnum((wint_t)codepoint) != 0;
}


/**
 * Decodes the first UTF-8 sequence at utf8 and classifies the resulting
 * codepoint with is_codepoint_alnum().
 *
 * @param utf8 Pointer to a UTF-8 sequence; NULL yields false.
 * @return true when the decoded codepoint is alphanumeric.
 */
bool is_utf8_alnum(const char* utf8) {
    return utf8 ? is_codepoint_alnum(utf8_to_codepoint(utf8)) : false;
}


/**
 * Punctuation test for a codepoint; delegates to iswpunct().
 *
 * @param codepoint Codepoint to classify.
 * @return true when iswpunct() reports a non-zero result.
 */
bool is_codepoint_punct(uint32_t codepoint) {
    return iswpunct((wint_t)codepoint) != 0;
}


/**
 * Decodes the first UTF-8 sequence at utf8 and classifies the resulting
 * codepoint with is_codepoint_punct().
 *
 * @param utf8 Pointer to a UTF-8 sequence; NULL yields false.
 * @return true when the decoded codepoint is punctuation.
 */
bool is_utf8_punct(const char* utf8) {
    return utf8 ? is_codepoint_punct(utf8_to_codepoint(utf8)) : false;
}


/**
 * Returns the length of the structurally well-formed UTF-8 sequence that
 * starts at p (lead byte followed by the required continuation bytes), or 0
 * when no such sequence starts there. A NUL never counts as a continuation
 * byte, so the && chains stop before reading past the terminator.
 */
static size_t utf8_copy_sequence_length(const unsigned char* p) {
    const unsigned char lead = p[0];

    if ((lead & 0x80) == 0) {
        return 1;
    }
    if ((lead & 0xE0) == 0xC0) {
        return ((p[1] & 0xC0) == 0x80) ? 2 : 0;
    }
    if ((lead & 0xF0) == 0xE0) {
        return ((p[1] & 0xC0) == 0x80 && (p[2] & 0xC0) == 0x80) ? 3 : 0;
    }
    if ((lead & 0xF8) == 0xF0) {
        return ((p[1] & 0xC0) == 0x80 && (p[2] & 0xC0) == 0x80 && (p[3] & 0xC0) == 0x80) ? 4 : 0;
    }
    return 0;
}

/**
 * Creates a heap copy of data that contains only its well-formed UTF-8
 * sequences; bytes that do not start a well-formed sequence are dropped.
 *
 * Copying the first N bytes of the input (N = number of valid bytes) would
 * keep leading invalid bytes and drop trailing valid ones, so the input is
 * filtered sequence by sequence instead. For input with no invalid bytes the
 * result is a plain duplicate.
 *
 * @param data NUL-terminated input; NULL yields NULL.
 * @return Newly allocated NUL-terminated string owned by the caller
 *         (release with free()), or NULL on allocation failure.
 */
char* utf8_copy(const char* data) {
    if (!data) {
        return NULL;
    }

    /* The filtered result can never be longer than the input. */
    char* copy = (char*)malloc(strlen(data) + 1);
    if (!copy) {
        return NULL;
    }

    const unsigned char* src = (const unsigned char*)data;
    size_t out = 0;

    while (*src != '\0') {
        const size_t seq = utf8_copy_sequence_length(src);
        if (seq == 0) {
            src++; /* skip a byte that starts no valid sequence */
            continue;
        }
        memcpy(&copy[out], src, seq);
        out += seq;
        src += seq;
    }

    copy[out] = '\0';
    return copy;
}


/**
 * Accessor for the string's byte buffer.
 *
 * @param s String object; may be NULL.
 * @return s->data, or NULL when s is NULL.
 */
const char* utf8_data(const utf8_string* s) {
    return s ? s->data : NULL;
}


/**
 * Allocates a utf8_string initialised from a C string.
 *
 * The buffer comes from utf8_copy(); length is utf8_valid_byte_count(data)
 * and count is utf8_count_codepoints(data).
 *
 * @param data NUL-terminated input; NULL yields NULL.
 * @return New object (release with utf8_free()), or NULL when either
 *         allocation fails.
 */
utf8_string* utf8_new(const char* data) {
    if (data == NULL) {
        return NULL;
    }

    utf8_string* str = (utf8_string*)malloc(sizeof(utf8_string));
    if (str == NULL) {
        return NULL;
    }

    char* buffer = utf8_copy(data);
    if (buffer == NULL) {
        free(str);
        return NULL;
    }

    str->data = buffer;
    str->length = utf8_valid_byte_count(data);
    str->count = utf8_count_codepoints(data);
    return str;
}


/**
 * Allocates an empty utf8_string whose buffer holds capacity + 1 bytes.
 *
 * @param capacity Number of bytes to reserve, excluding the terminator.
 * @return New object with an empty string, length 0 and count 0, or NULL
 *         when either allocation fails.
 */
utf8_string* utf8_new_with_capacity(size_t capacity) {
    utf8_string* str = (utf8_string*)malloc(sizeof(utf8_string));
    char* buffer = str ? (char*)malloc(capacity + 1) : NULL;

    if (buffer == NULL) {
        free(str); /* no-op when the struct allocation itself failed */
        return NULL;
    }

    buffer[0] = '\0';
    str->data = buffer;
    str->length = 0;
    str->count = 0;
    return str;
}


/**
 * Releases a utf8_string and its buffer.
 *
 * @param s Object to free; NULL is ignored.
 */
void utf8_free(utf8_string* s) {
    if (s != NULL) {
        free(s->data);
        free(s);
    }
}


/**
 * Writes the string followed by a newline to stdout.
 *
 * @param s String to print; nothing is printed when s or s->data is NULL.
 */
void utf8_print(const utf8_string* s) {
    if (s != NULL && s->data != NULL) {
        printf("%s\n", s->data);
    }
}


/**
 * Prints the stored byte length and codepoint count to stdout, one per line.
 *
 * @param s String whose metadata is printed; NULL prints nothing.
 */
void utf8_print_info(const utf8_string* s) {
    if (s != NULL) {
        printf("Byte Length: %zu\n", s->length);
        printf("Code Points: %zu\n", s->count);
    }
}


/**
 * Prints each decoded codepoint as "U+XXXX " on one line, then a newline.
 *
 * The cursor advances by utf8_char_length() of each lead byte; iteration
 * stops early if that length is 0.
 *
 * @param s String to dump; nothing is printed when s or s->data is NULL.
 */
void utf8_print_codepoints(const utf8_string* s) {
    if (s == NULL || s->data == NULL) {
        return;
    }

    const char* cursor = s->data;
    while (*cursor != '\0') {
        const uint32_t codepoint = utf8_to_codepoint(cursor);
        printf("U+%04X ", codepoint);

        const size_t step = utf8_char_length(cursor);
        if (step == 0) {
            break;
        }
        cursor += step;
    }
    printf("\n");
}


/**
 * Finds the first occurrence of utf8 inside s using strstr().
 *
 * @param s    String to search.
 * @param utf8 Needle.
 * @return Byte offset of the first match, or -1 when there is no match or
 *         any argument (or s->data) is NULL.
 */
int utf8_index_of(const utf8_string* s, const char* utf8) {
    if (s == NULL || s->data == NULL || utf8 == NULL) {
        return -1;
    }

    const char* hit = strstr(s->data, utf8);
    return hit ? (int)(hit - s->data) : -1;
}


/**
 * Finds the last occurrence of utf8 inside s by scanning byte offsets from
 * the end of the string towards the start.
 *
 * @param s    String to search.
 * @param utf8 Needle.
 * @return Byte offset of the last match, or -1 when there is no match, the
 *         needle is empty or longer than s->length, or an argument is NULL.
 */
int utf8_last_index_of(const utf8_string* s, const char* utf8) {
    if (s == NULL || s->data == NULL || utf8 == NULL) {
        return -1;
    }

    const size_t needle_len = strlen(utf8);
    if (needle_len == 0 || needle_len > s->length) {
        return -1;
    }

    /* pos runs from (length - needle_len) down to 0 inclusive. */
    size_t pos = s->length - needle_len + 1;
    while (pos-- > 0) {
        if (memcmp(s->data + pos, utf8, needle_len) == 0) {
            return (int)pos;
        }
    }
    return -1;
}


/**
 * Appends data to the end of s, growing the buffer with realloc().
 *
 * The number of bytes appended is utf8_valid_byte_count(data), taken from
 * the start of data; count grows by utf8_count_codepoints(data).
 *
 * @param s    Destination string.
 * @param data NUL-terminated bytes to append.
 * @return true on success; false when an argument is NULL or realloc()
 *         fails (s is left unchanged in that case).
 */
bool utf8_append(utf8_string* s, const char* data) {
    if (s == NULL || data == NULL) {
        return false;
    }

    const size_t extra_bytes = utf8_valid_byte_count(data);
    const size_t extra_codepoints = utf8_count_codepoints(data);
    const size_t joined = s->length + extra_bytes;

    char* grown = (char*)realloc(s->data, joined + 1);
    if (grown == NULL) {
        return false;
    }

    memcpy(grown + s->length, data, extra_bytes);
    grown[joined] = '\0';

    s->data = grown;
    s->length = joined;
    s->count += extra_codepoints;
    return true;
}


/**
 * Extracts a byte range of s into a newly allocated C string.
 *
 * @param s             Source string.
 * @param index         Byte offset of the first byte to copy.
 * @param utf8_byte_len Number of bytes requested; clamped to the bytes
 *                      available after index.
 * @return Newly allocated NUL-terminated copy owned by the caller (release
 *         with free()), or NULL when s or s->data is NULL, index is at or
 *         past s->length, utf8_byte_len is 0, or allocation fails.
 */
char* utf8_substr(const utf8_string* s, size_t index, size_t utf8_byte_len) {
    if (!s || !s->data || index >= s->length || utf8_byte_len == 0) {
        return NULL;
    }

    /* Clamp against the remaining byte count instead of testing
     * index + utf8_byte_len, which wraps around for very large lengths
     * (e.g. SIZE_MAX meaning "to the end") and would skip the clamp. */
    const size_t available = s->length - index;
    if (utf8_byte_len > available) {
        utf8_byte_len = available;
    }

    char* substr = (char*)malloc(utf8_byte_len + 1);
    if (substr) {
        memcpy(substr, &s->data[index], utf8_byte_len);
        substr[utf8_byte_len] = '\0';
    }
    return substr;
}


/**
 * Inserts data into s at a byte offset, growing the buffer with realloc().
 *
 * The number of bytes inserted is utf8_valid_byte_count(data), taken from
 * the start of data; count grows by utf8_count_codepoints(data).
 *
 * @param s     Destination string.
 * @param index Byte offset at which to insert; may equal s->length.
 * @param data  NUL-terminated bytes to insert.
 * @return true on success; false when an argument (or s->data) is NULL,
 *         index is past s->length, or realloc() fails.
 */
bool utf8_insert(utf8_string* s, size_t index, const char* data) {
    if (s == NULL || s->data == NULL || data == NULL || index > s->length) {
        return false;
    }

    const size_t added_bytes = utf8_valid_byte_count(data);
    const size_t added_codepoints = utf8_count_codepoints(data);

    char* grown = (char*)realloc(s->data, s->length + added_bytes + 1);
    if (grown == NULL) {
        return false;
    }

    /* Open a gap at index; the +1 carries the terminator along. */
    char* gap = grown + index;
    memmove(gap + added_bytes, gap, s->length - index + 1);
    memcpy(gap, data, added_bytes);

    s->data = grown;
    s->length += added_bytes;
    s->count += added_codepoints;
    return true;
}


/**
 * Removes up to count characters from s, starting at byte offset index.
 *
 * Character widths come from utf8_char_length(); removal stops early at the
 * end of the string or when a width of 0 is reported.
 *
 * @param s     String to modify.
 * @param index Byte offset of the first character to remove.
 * @param count Maximum number of characters to remove.
 * @return false when s or s->data is NULL, index is at or past s->length,
 *         or count is 0; true otherwise.
 */
bool utf8_remove(utf8_string* s, size_t index, size_t count) {
    if (!s || !s->data || index >= s->length || count == 0) {
        return false;
    }

    size_t end = index;
    size_t removed = 0; /* characters actually stepped over */
    while (removed < count && end < s->length) {
        const size_t len = utf8_char_length(&s->data[end]);
        if (len == 0) {
            break;
        }
        end += len;
        removed++;
    }

    /* A lead byte near the end can announce more bytes than remain. */
    if (end > s->length) {
        end = s->length;
    }

    memmove(&s->data[index], &s->data[end], s->length - end + 1);
    s->length -= end - index;

    /* Subtract only what was really removed (the request may exceed what the
     * string held) and never let the unsigned counter wrap below zero. */
    s->count -= (removed < s->count) ? removed : s->count;
    return true;
}


/**
 * Replaces the first occurrence of old_str in s with new_str.
 *
 * The match is located with strstr(); the byte lengths used for the splice
 * are utf8_valid_byte_count() of each argument, and count is adjusted by the
 * difference of their utf8_count_codepoints() values.
 *
 * @param s       String to modify.
 * @param old_str Text to search for.
 * @param new_str Replacement text.
 * @return true when a replacement was made; false when an argument (or
 *         s->data) is NULL, old_str is not found, or the buffer could not be
 *         grown (s is left unchanged in that case).
 */
bool utf8_replace(utf8_string* s, const char* old_str, const char* new_str) {
    if (!s || !s->data || !old_str || !new_str) {
        return false;
    }

    const size_t old_byte_len = utf8_valid_byte_count(old_str);
    const size_t new_byte_len = utf8_valid_byte_count(new_str);
    const size_t old_count = utf8_count_codepoints(old_str);
    const size_t new_count = utf8_count_codepoints(new_str);

    char* match = strstr(s->data, old_str);
    if (match == NULL) {
        return false;
    }

    const size_t offset = (size_t)(match - s->data);
    const size_t tail_len = s->length - offset - old_byte_len + 1; /* includes the terminator */
    const size_t new_length = s->length - old_byte_len + new_byte_len;

    /* Growing: enlarge the buffer before the tail is moved right. */
    if (new_byte_len > old_byte_len) {
        char* grown = (char*)realloc(s->data, new_length + 1);
        if (!grown) {
            return false;
        }
        s->data = grown;
    }

    memmove(&s->data[offset + new_byte_len], &s->data[offset + old_byte_len], tail_len);
    memcpy(&s->data[offset], new_str, new_byte_len);

    /* Shrinking: only trim the buffer after the tail has moved left.
     * Trimming first would make the memmove read bytes beyond the new
     * allocation. A failed trim is harmless; the larger buffer stays valid. */
    if (new_byte_len < old_byte_len) {
        char* shrunk = (char*)realloc(s->data, new_length + 1);
        if (shrunk) {
            s->data = shrunk;
        }
    }

    s->length = new_length;
    s->count = s->count - old_count + new_count;
    return true;
}


/**
 * Replaces every occurrence of old_str in s with new_str, scanning left to
 * right and resuming after each inserted replacement.
 *
 * Matches are located with strstr(); the byte lengths used for each splice
 * are utf8_valid_byte_count() of the arguments, and count is adjusted by the
 * difference of their utf8_count_codepoints() values per replacement.
 *
 * @param s       String to modify.
 * @param old_str Text to search for; a valid byte count of 0 performs no
 *                replacements.
 * @param new_str Replacement text.
 * @return Number of replacements made. Returns 0 when an argument (or
 *         s->data) is NULL. If growing the buffer fails part-way, the
 *         replacements already made are kept and their number is returned.
 */
size_t utf8_replace_all(utf8_string* s, const char* old_str, const char* new_str) {
    if (!s || !s->data || !old_str || !new_str) {
        return 0;
    }

    const size_t old_byte_len = utf8_valid_byte_count(old_str);
    if (old_byte_len == 0) {
        return 0;
    }

    const size_t new_byte_len = utf8_valid_byte_count(new_str);
    const size_t old_count = utf8_count_codepoints(old_str);
    const size_t new_count = utf8_count_codepoints(new_str);
    size_t replacements = 0;

    /* Track an offset rather than a pointer: s->data may move on realloc. */
    size_t search_from = 0;
    char* match;
    while ((match = strstr(s->data + search_from, old_str)) != NULL) {
        const size_t offset = (size_t)(match - s->data);
        const size_t tail_len = s->length - offset - old_byte_len + 1; /* includes the terminator */
        const size_t new_length = s->length - old_byte_len + new_byte_len;

        /* Growing: enlarge the buffer before the tail is moved right. */
        if (new_byte_len > old_byte_len) {
            char* grown = (char*)realloc(s->data, new_length + 1);
            if (!grown) {
                return replacements;
            }
            s->data = grown;
        }

        memmove(&s->data[offset + new_byte_len], &s->data[offset + old_byte_len], tail_len);
        memcpy(&s->data[offset], new_str, new_byte_len);

        s->length = new_length;
        s->count = s->count - old_count + new_count;
        search_from = offset + new_byte_len;
        replacements++;
    }

    /* Shrinking: trim once, after all tails have moved left. Trimming before
     * a memmove would make it read bytes beyond the new allocation. A failed
     * trim is harmless; the larger buffer stays valid. */
    if (new_byte_len < old_byte_len && replacements > 0) {
        char* shrunk = (char*)realloc(s->data, s->length + 1);
        if (shrunk) {
            s->data = shrunk;
        }
    }

    return replacements;
}


/**
 * Reverses the order of characters in s, keeping the bytes of each
 * character in their original order.
 *
 * Character widths come from utf8_char_length(). The result is built in a
 * new buffer that replaces s->data on success.
 *
 * @param s String to reverse.
 * @return true on success; false when s or s->data is NULL, the string is
 *         empty, allocation fails, or a character width is 0 or larger than
 *         the space still unfilled (s is left unchanged in those cases).
 */
bool utf8_reverse(utf8_string* s) {
    if (s == NULL || s->data == NULL || s->length == 0) {
        return false;
    }

    const size_t total = s->length;
    char* out = (char*)malloc(total + 1);
    if (out == NULL) {
        return false;
    }

    size_t read_pos = 0;
    size_t write_pos = total; /* output is filled from the back */

    while (read_pos < total) {
        const size_t step = utf8_char_length(s->data + read_pos);
        if (step == 0 || step > write_pos) {
            free(out);
            return false;
        }
        write_pos -= step;
        memcpy(out + write_pos, s->data + read_pos, step);
        read_pos += step;
    }

    out[total] = '\0';
    free(s->data);
    s->data = out;
    return true;
}


/**
 * Writes the bytes of s to a file, creating or truncating it.
 *
 * @param s        String to write (s->length bytes of s->data).
 * @param filename Path of the file, opened with mode "w".
 * @return Number of bytes written, or -1 when an argument (or s->data) is
 *         NULL, the file cannot be opened, the write is short, or closing
 *         the file fails.
 */
long utf8_writeto(const utf8_string* s, const char* filename) {
    if (!s || !s->data || !filename) {
        return -1;
    }

    FILE* file = fopen(filename, "w");
    if (!file) {
        return -1;
    }

    const size_t bytes = fwrite(s->data, 1, s->length, file);

    /* fclose() flushes buffered output, so a failure here means the data may
     * not have reached the file even though fwrite() accepted it. */
    const int close_failed = (fclose(file) != 0);

    if (bytes != s->length || close_failed) {
        return -1;
    }

    return (long)bytes;
}


/**
 * Reads a whole file into a new utf8_string.
 *
 * The file size is obtained with fseek()/ftell(), the content is read into a
 * temporary buffer, and the result is built with utf8_new().
 *
 * @param filename Path of the file, opened with mode "r".
 * @return New string (release with utf8_free()), or NULL when filename is
 *         NULL, the file cannot be opened, sized or read, or an allocation
 *         fails.
 */
utf8_string* utf8_readfrom(const char* filename) {
    if (!filename) {
        return NULL;
    }

    FILE* file = fopen(filename, "r");
    if (!file) {
        return NULL;
    }

    if (fseek(file, 0, SEEK_END) != 0) {
        fclose(file);
        return NULL;
    }

    const long length = ftell(file);
    if (length < 0) {
        fclose(file);
        return NULL;
    }

    if (fseek(file, 0, SEEK_SET) != 0) {
        fclose(file);
        return NULL;
    }

    char* data = (char*)malloc((size_t)length + 1);
    if (!data) {
        fclose(file);
        return NULL;
    }

    /* A short count alone is not an error (fewer bytes than ftell() reported
     * are still terminated correctly below), so ask the stream whether the
     * read actually failed. */
    const size_t bytes = fread(data, 1, (size_t)length, file);
    const int read_failed = ferror(file);
    fclose(file);

    if (read_failed) {
        free(data);
        return NULL;
    }

    data[bytes] = '\0';

    utf8_string* s = utf8_new(data);
    free(data);
    return s;
}


/**
 * Removes leading whitespace characters from str in place.
 *
 * Whitespace is decided by is_utf8_whitespace(); each step advances by
 * utf8_char_length() and scanning stops if that length is 0.
 *
 * @param str NUL-terminated buffer to modify; NULL is ignored.
 */
void utf8_ltrim(char* str) {
    if (str == NULL) {
        return;
    }

    const size_t total = strlen(str);
    size_t skip = 0;

    while (skip < total && is_utf8_whitespace(str + skip)) {
        const size_t step = utf8_char_length(str + skip);
        if (step == 0) {
            break;
        }
        skip += step;
    }

    if (skip != 0) {
        /* +1 moves the terminator together with the remaining text. */
        memmove(str, str + skip, total - skip + 1);
    }
}


/**
 * Removes trailing whitespace characters from str in place.
 *
 * Working backwards, the start of the last character is found by skipping
 * continuation bytes. The character is dropped only when its
 * utf8_char_length() reaches exactly to the current end and
 * is_utf8_whitespace() accepts it.
 *
 * @param str NUL-terminated buffer to modify; NULL is ignored.
 */
void utf8_rtrim(char* str) {
    if (str == NULL) {
        return;
    }

    size_t end = strlen(str);

    while (end > 0) {
        size_t start = end - 1;
        while (start > 0 && (str[start] & 0xC0) == 0x80) {
            start--;
        }

        const size_t width = utf8_char_length(str + start);
        const bool trailing_space = width > 0 && start + width == end && is_utf8_whitespace(str + start);
        if (!trailing_space) {
            break;
        }
        end = start;
    }

    str[end] = '\0';
}


/**
 * Removes leading and then trailing whitespace from str in place by calling
 * utf8_ltrim() followed by utf8_rtrim().
 *
 * @param str NUL-terminated buffer to modify; NULL is ignored.
 */
void utf8_trim(char* str) {
    if (str != NULL) {
        utf8_ltrim(str);
        utf8_rtrim(str);
    }
}


/** Linear search: true when codepoint is among the first set_size entries of set. */
static bool utf8_trim_set_contains(const uint32_t* set, size_t set_size, uint32_t codepoint) {
    for (size_t n = 0; n < set_size; n++) {
        if (set[n] == codepoint) {
            return true;
        }
    }
    return false;
}

/**
 * Removes, in place, leading and trailing characters of str that appear in
 * chars.
 *
 * chars is decoded into a set of at most 256 distinct codepoints; sequences
 * that decode to U+FFFD are not added, and a character of str that decodes
 * to U+FFFD is never trimmed.
 *
 * @param str   NUL-terminated buffer to modify; NULL is ignored.
 * @param chars NUL-terminated list of characters to trim; NULL is ignored.
 */
void utf8_trim_chars(char* str, const char* chars) {
    if (str == NULL || chars == NULL) {
        return;
    }

    /* ---- Build the set of codepoints to trim ---- */
    uint32_t trim_set[256] = {0};
    size_t trim_set_size = 0;
    const size_t chars_len = strlen(chars);

    size_t k = 0;
    while (k < chars_len && trim_set_size < 256) {
        const size_t width = utf8_char_length(&chars[k]);
        if (width == 0 || k + width > chars_len) {
            k++; /* unusable byte: step over it */
            continue;
        }

        const uint32_t cp = utf8_to_codepoint(&chars[k]);
        if (cp != 0xFFFD && !utf8_trim_set_contains(trim_set, trim_set_size, cp)) {
            /* The loop condition guarantees room for one more entry. */
            trim_set[trim_set_size++] = cp;
        }
        k += width;
    }

    /* ---- Leading side ---- */
    size_t len = strlen(str);
    size_t lead = 0;
    while (lead < len) {
        const size_t width = utf8_char_length(&str[lead]);
        if (width == 0 || lead + width > len) {
            break;
        }

        const uint32_t cp = utf8_to_codepoint(&str[lead]);
        if (cp == 0xFFFD || !utf8_trim_set_contains(trim_set, trim_set_size, cp)) {
            break;
        }
        lead += width;
    }

    if (lead > 0) {
        memmove(str, &str[lead], len - lead + 1);
        len -= lead;
    }

    /* ---- Trailing side ---- */
    size_t end = len;
    while (end > 0) {
        /* Walk back over continuation bytes to the start of the last character. */
        size_t start = end - 1;
        while (start > 0 && (str[start] & 0xC0) == 0x80) {
            start--;
        }

        /* Still on a continuation byte: the string begins mid-sequence. */
        if ((str[start] & 0xC0) == 0x80) {
            break;
        }

        const size_t width = utf8_char_length(&str[start]);
        if (width == 0 || start + width != end) {
            break;
        }

        const uint32_t cp = utf8_to_codepoint(&str[start]);
        if (cp == 0xFFFD || !utf8_trim_set_contains(trim_set, trim_set_size, cp)) {
            break;
        }
        end = start;
    }

    str[end] = '\0';
}


/**
 * Removes, in place, every leading and trailing occurrence of the single
 * byte c from str.
 *
 * @param str NUL-terminated buffer to modify; NULL is ignored.
 * @param c   Byte to strip from both ends.
 */
void utf8_trim_char(char* str, char c) {
    if (str == NULL) {
        return;
    }

    /* First byte to keep. */
    char* first = str;
    while (*first != '\0' && *first == c) {
        first++;
    }

    /* One past the last byte to keep. */
    char* last = first + strlen(first);
    while (last > first && last[-1] == c) {
        last--;
    }

    const size_t kept = (size_t)(last - first);
    memmove(str, first, kept);
    str[kept] = '\0';
}


/**
 * Converts uppercase characters of str to lowercase in place.
 *
 * ASCII letters are handled directly. Other characters are decoded,
 * classified with iswupper() and mapped with towlower() (both depend on the
 * current C locale), then re-encoded into the buffer.
 *
 * NOTE(review): when the lowercase form needs more bytes than the original
 * character, the rest of the string is shifted right inside the caller's
 * buffer; the buffer must have room for that growth.
 *
 * @param str NUL-terminated buffer to modify; NULL is ignored.
 */
void utf8_tolower(char* str) {
    if (!str) {
        return;
    }

    for (size_t i = 0; str[i] != '\0';) {
        const unsigned char byte = (unsigned char)str[i];

        /* ---- ASCII fast path (covers the vast majority of real text) ---- */
        if (byte < 0x80u) {
            /* Explicit two-sided range test. A one-sided unsigned trick such
             * as `byte - 'A' + 1u <= 26u` also accepts '@' (0x40), because
             * -1 + 1u wraps to 0, and would turn '@' into '`'. */
            if (byte >= 'A' && byte <= 'Z') {
                str[i] = (char)(byte | 0x20u);
            }
            i++;
            continue;
        }

        /* ---- Multibyte path ---- */
        const uint32_t codepoint = utf8_to_codepoint(&str[i]);
        const size_t old_len = utf8_char_length(&str[i]);

        if (old_len == 0) {
            break;
        }

        if (iswupper((wint_t)codepoint)) {
            char utf8[UTF8_MAX_LEN] = {0};
            ucp_to_utf8((uint32_t)towlower((wint_t)codepoint), utf8);
            const size_t new_len = utf8_valid_byte_count(utf8);

            /* Re-seat the tail (terminator included) when the width changes. */
            if (new_len != old_len) {
                const size_t remaining = strlen(&str[i + old_len]);
                memmove(&str[i + new_len], &str[i + old_len], remaining + 1);
            }
            memcpy(&str[i], utf8, new_len);
            i += new_len;
        } else {
            i += old_len;
        }
    }
}


/**
 * Converts lowercase characters of str to uppercase in place.
 *
 * ASCII letters are handled directly. Other characters are decoded,
 * classified with iswlower() and mapped with towupper() (both depend on the
 * current C locale), then re-encoded into the buffer.
 *
 * NOTE(review): when the uppercase form needs more bytes than the original
 * character, the rest of the string is shifted right inside the caller's
 * buffer; the buffer must have room for that growth.
 *
 * @param str NUL-terminated buffer to modify; NULL is ignored.
 */
void utf8_toupper(char* str) {
    if (!str) {
        return;
    }

    for (size_t i = 0; str[i] != '\0';) {
        const unsigned char byte = (unsigned char)str[i];

        /* ---- ASCII fast path ---- */
        if (byte < 0x80u) {
            /* Explicit two-sided range test. A one-sided unsigned trick such
             * as `byte - 'a' + 1u <= 26u` also accepts '`' (0x60), because
             * -1 + 1u wraps to 0, and would turn '`' into '@'. */
            if (byte >= 'a' && byte <= 'z') {
                str[i] = (char)(byte & 0xDFu);
            }
            i++;
            continue;
        }

        /* ---- Multibyte path ---- */
        const uint32_t codepoint = utf8_to_codepoint(&str[i]);
        const size_t old_len = utf8_char_length(&str[i]);

        if (old_len == 0) {
            break;
        }

        if (iswlower((wint_t)codepoint)) {
            char utf8[UTF8_MAX_LEN] = {0};
            ucp_to_utf8((uint32_t)towupper((wint_t)codepoint), utf8);
            const size_t new_len = utf8_valid_byte_count(utf8);

            /* Re-seat the tail (terminator included) when the width changes. */
            if (new_len != old_len) {
                const size_t remaining = strlen(&str[i + old_len]);
                memmove(&str[i + new_len], &str[i + old_len], remaining + 1);
            }
            memcpy(&str[i], utf8, new_len);
            i += new_len;
        } else {
            i += old_len;
        }
    }
}


/**
 * Builds one split part from exactly byte_len bytes starting at bytes.
 * Returns NULL when allocation fails.
 */
static utf8_string* utf8_split_make_part(const char* bytes, size_t byte_len) {
    utf8_string* part = utf8_new_with_capacity(byte_len);
    if (!part) {
        return NULL;
    }

    memcpy(part->data, bytes, byte_len);
    part->data[byte_len] = '\0';
    part->length = byte_len;
    part->count = utf8_count_codepoints(part->data);
    return part;
}

/**
 * Splits str at every occurrence of delim.
 *
 * The string is scanned character by character (widths from
 * utf8_char_length()); wherever the delimiter bytes match, the text since
 * the previous delimiter becomes a part. Adjacent delimiters therefore
 * produce empty parts, and a string without delimiters yields one part.
 *
 * Each part is copied directly from its own byte range, so the work is
 * linear in the input and a part never depends on the bytes after it.
 *
 * @param str       String to split.
 * @param delim     Delimiter; its utf8_valid_byte_count() must be non-zero.
 * @param num_parts Receives the number of parts; set to 0 on failure.
 * @return Array of num_parts strings (release with utf8_split_free()), or
 *         NULL when an argument (or str->data) is NULL, the delimiter is
 *         empty, or any allocation fails. On failure nothing is leaked.
 */
utf8_string** utf8_split(const utf8_string* str, const char* delim, size_t* num_parts) {
    if (num_parts) {
        *num_parts = 0;
    }
    if (!str || !str->data || !delim || !num_parts) {
        return NULL;
    }

    const size_t delim_len = utf8_valid_byte_count(delim);
    if (delim_len == 0) {
        return NULL;
    }

    const char* data = str->data;
    const size_t len = str->length;

    /* ---- Pass 1: count the parts ---- */
    size_t count = 1;
    for (size_t i = 0; i < len;) {
        if (len - i >= delim_len && memcmp(data + i, delim, delim_len) == 0) {
            count++;
            i += delim_len;
        } else {
            const size_t char_len = utf8_char_length(data + i);
            if (char_len == 0) {
                break;
            }
            i += char_len;
        }
    }

    utf8_string** parts = (utf8_string**)malloc(count * sizeof(utf8_string*));
    if (!parts) {
        return NULL;
    }

    /* ---- Pass 2: same scan, now materialising each part ---- */
    size_t index = 0;
    size_t start = 0;
    bool failed = false;

    for (size_t i = 0; i < len && !failed;) {
        if (len - i >= delim_len && memcmp(data + i, delim, delim_len) == 0) {
            parts[index] = utf8_split_make_part(data + start, i - start);
            if (!parts[index]) {
                failed = true;
                break;
            }
            index++;
            i += delim_len;
            start = i;
        } else {
            const size_t char_len = utf8_char_length(data + i);
            if (char_len == 0) {
                break;
            }
            i += char_len;
        }
    }

    /* Whatever follows the last delimiter (possibly nothing) is the final part. */
    if (!failed) {
        parts[index] = utf8_split_make_part(data + start, len - start);
        if (!parts[index]) {
            failed = true;
        }
    }

    if (failed) {
        /* Release the parts built so far instead of handing back NULL slots. */
        for (size_t k = 0; k < index; k++) {
            utf8_free(parts[k]);
        }
        free(parts);
        return NULL;
    }

    *num_parts = count;
    return parts;
}


/**
 * Frees the element at index and closes the gap by shifting the following
 * elements down one slot; the last slot is set to NULL.
 *
 * @param array Array of string pointers; NULL is ignored.
 * @param size  Number of slots in array.
 * @param index Slot to remove; ignored when not below size.
 */
void utf8_array_remove(utf8_string** array, size_t size, size_t index) {
    if (array == NULL || index >= size) {
        return;
    }

    utf8_free(array[index]);

    const size_t trailing = size - index - 1;
    memmove(&array[index], &array[index + 1], trailing * sizeof(array[0]));
    array[size - 1] = NULL;
}


/**
 * Frees every string in the array with utf8_free(), then the array itself.
 *
 * @param str  Array of string pointers; NULL is ignored.
 * @param size Number of slots in the array.
 */
void utf8_split_free(utf8_string** str, size_t size) {
    if (str == NULL) {
        return;
    }

    utf8_string** const stop = str + size;
    for (utf8_string** slot = str; slot != stop; ++slot) {
        utf8_free(*slot);
    }
    free(str);
}


/**
 * Tests whether str begins with prefix, comparing the first
 * utf8_valid_byte_count(prefix) bytes.
 *
 * @param str    String to inspect.
 * @param prefix Expected beginning.
 * @return true when all compared bytes match (including when no bytes are
 *         compared); false on a mismatch or when an argument is NULL.
 */
bool utf8_starts_with(const char* str, const char* prefix) {
    if (str == NULL || prefix == NULL) {
        return false;
    }

    /* Byte-wise walk: a shorter str mismatches at its terminator, so no
     * byte past it is ever read. */
    size_t remaining = utf8_valid_byte_count(prefix);
    while (remaining > 0) {
        if (*str != *prefix) {
            return false;
        }
        str++;
        prefix++;
        remaining--;
    }
    return true;
}


/**
 * Tests whether str ends with suffix. Both lengths are taken from
 * utf8_valid_byte_count(), and the last bytes of str within that length are
 * compared with the start of suffix.
 *
 * @param str    String to inspect.
 * @param suffix Expected ending.
 * @return true when the compared bytes match; false on a mismatch, when
 *         suffix is longer than str, or when an argument is NULL.
 */
bool utf8_ends_with(const char* str, const char* suffix) {
    if (str == NULL || suffix == NULL) {
        return false;
    }

    const size_t str_len = utf8_valid_byte_count(str);
    const size_t suffix_len = utf8_valid_byte_count(suffix);
    if (suffix_len > str_len) {
        return false;
    }

    return memcmp(str + (str_len - suffix_len), suffix, suffix_len) == 0;
}


/**
 * Tests whether substr occurs anywhere in str, using strstr().
 *
 * @param str    String to search.
 * @param substr Text to look for.
 * @return true when strstr() finds a match; false otherwise or when an
 *         argument is NULL.
 */
bool utf8_contains(const char* str, const char* substr) {
    return str != NULL && substr != NULL && strstr(str, substr) != NULL;
}


/**
 * Orders two strings with strcmp(), treating NULL as smaller than any
 * string.
 *
 * @param s1 First string; may be NULL.
 * @param s2 Second string; may be NULL.
 * @return 0 when both are NULL, -1 when only s1 is NULL, 1 when only s2 is
 *         NULL, otherwise the result of strcmp(s1, s2).
 */
int utf8_compare(const char* s1, const char* s2) {
    if (s1 == NULL || s2 == NULL) {
        /* (present) - (present): 0 for both NULL, -1 or 1 otherwise. */
        return (s1 != NULL) - (s2 != NULL);
    }
    return strcmp(s1, s2);
}


/**
 * Tests two strings for byte-wise equality.
 *
 * @param s1 First string; may be NULL.
 * @param s2 Second string; may be NULL.
 * @return When both are non-NULL, whether strcmp() reports equality;
 *         otherwise true only if both are NULL.
 */
bool utf8_equals(const char* s1, const char* s2) {
    if (s1 != NULL && s2 != NULL) {
        return strcmp(s1, s2) == 0;
    }
    return s1 == s2;
}


/**
 * Duplicates a string object by passing its buffer to utf8_new().
 *
 * @param s String to duplicate.
 * @return New object from utf8_new(s->data), or NULL when s or s->data is
 *         NULL.
 */
utf8_string* utf8_clone(const utf8_string* s) {
    const char* source = s ? s->data : NULL;
    if (source == NULL) {
        return NULL;
    }
    return utf8_new(source);
}


/**
 * Creates a new string holding the bytes of s1 followed by the bytes of s2.
 *
 * The result's length and count are the sums of the inputs' stored values.
 *
 * @param s1 Left operand.
 * @param s2 Right operand.
 * @return New object from utf8_new_with_capacity(), or NULL when an operand
 *         (or its data) is NULL or allocation fails.
 */
utf8_string* utf8_concat(const utf8_string* s1, const utf8_string* s2) {
    if (s1 == NULL || s1->data == NULL || s2 == NULL || s2->data == NULL) {
        return NULL;
    }

    const size_t left = s1->length;
    const size_t right = s2->length;
    const size_t total = left + right;

    utf8_string* joined = utf8_new_with_capacity(total);
    if (joined == NULL) {
        return NULL;
    }

    char* out = joined->data;
    memcpy(out, s1->data, left);
    memcpy(out + left, s2->data, right);
    out[total] = '\0';

    joined->length = total;
    joined->count = s1->count + s2->count;
    return joined;
}


utf8_string
Represents a mutable UTF-8 encoded string with metadata.
Definition unicode.h:48

utf8_string::data
char * data
Definition unicode.h:49

utf8_string::length
size_t length
Definition unicode.h:50

utf8_string::count
size_t count
Definition unicode.h:51

utf8_to_codepoint
uint32_t utf8_to_codepoint(const char *utf8)
Decodes a UTF-8 byte sequence to its Unicode codepoint.
Definition unicode.c:69

utf8_toupper
void utf8_toupper(char *str)
Converts all lowercase characters to uppercase in-place.
Definition unicode.c:1191

utf8_readfrom
utf8_string * utf8_readfrom(const char *filename)
Reads a UTF-8 string from a file.
Definition unicode.c:882

utf8_new_with_capacity
utf8_string * utf8_new_with_capacity(size_t capacity)
Creates an empty utf8_string with preallocated capacity.
Definition unicode.c:481

utf8_replace_all
size_t utf8_replace_all(utf8_string *s, const char *old_str, const char *new_str)
Replaces all occurrences of a substring with another string.
Definition unicode.c:773

is_utf8_digit
bool is_utf8_digit(const char *utf8)
Checks if a UTF-8 character represents a digit.
Definition unicode.c:339

utf8_replace
bool utf8_replace(utf8_string *s, const char *old_str, const char *new_str)
Replaces the first occurrence of a substring with another string.
Definition unicode.c:734

utf8_array_remove
void utf8_array_remove(utf8_string **array, size_t size, size_t index)
Removes an element from a utf8_string array and frees it.
Definition unicode.c:1318

utf8_concat
utf8_string * utf8_concat(const utf8_string *s1, const utf8_string *s2)
Concatenates two utf8_string objects into a new string.
Definition unicode.c:1463

utf8_reverse
bool utf8_reverse(utf8_string *s)
Reverses a UTF-8 string by codepoints.
Definition unicode.c:821

is_codepoint_alnum
bool is_codepoint_alnum(uint32_t codepoint)
Checks if a codepoint represents an alphanumeric character.
Definition unicode.c:373

utf8_substr
char * utf8_substr(const utf8_string *s, size_t index, size_t utf8_byte_len)
Extracts a substring by byte range.
Definition unicode.c:648

utf8_last_index_of
int utf8_last_index_of(const utf8_string *s, const char *utf8)
Finds the byte index of the last occurrence of a substring.
Definition unicode.c:587

utf8_split
utf8_string ** utf8_split(const utf8_string *str, const char *delim, size_t *num_parts)
Splits a UTF-8 string into parts using a delimiter.
Definition unicode.c:1241

utf8_data
const char * utf8_data(const utf8_string *s)
Returns a pointer to the internal UTF-8 data buffer.
Definition unicode.c:437

utf8_free
void utf8_free(utf8_string *s)
Frees all resources associated with a utf8_string.
Definition unicode.c:505

utf8_ltrim
void utf8_ltrim(char *str)
Removes leading whitespace from a UTF-8 string in-place.
Definition unicode.c:930

utf8_copy
char * utf8_copy(const char *data)
Creates a copy of a UTF-8 string containing only valid UTF-8 sequences.
Definition unicode.c:416

utf8_valid_byte_count
size_t utf8_valid_byte_count(const char *s)
Counts the number of valid UTF-8 bytes in a string.
Definition unicode.c:162

utf8_ends_with
bool utf8_ends_with(const char *str, const char *suffix)
Checks if a string ends with a given suffix.
Definition unicode.c:1375

utf8_equals
bool utf8_equals(const char *s1, const char *s2)
Compares two UTF-8 strings for equality.
Definition unicode.c:1434

is_utf8_punct
bool is_utf8_punct(const char *utf8)
Checks if a UTF-8 character represents a punctuation character.
Definition unicode.c:402

utf8_remove
bool utf8_remove(utf8_string *s, size_t index, size_t count)
Removes a specified number of codepoints starting at a byte index.
Definition unicode.c:702

utf8_insert
bool utf8_insert(utf8_string *s, size_t index, const char *data)
Inserts UTF-8 data at a specific byte index.
Definition unicode.c:673

is_codepoint_whitespace
bool is_codepoint_whitespace(uint32_t codepoint)
Checks if a codepoint represents whitespace.
Definition unicode.c:310

is_codepoint_punct
bool is_codepoint_punct(uint32_t codepoint)
Checks if a codepoint represents a punctuation character.
Definition unicode.c:394

is_utf8_alnum
bool is_utf8_alnum(const char *utf8)
Checks if a UTF-8 character represents an alphanumeric character.
Definition unicode.c:381

utf8_rtrim
void utf8_rtrim(char *str)
Removes trailing whitespace from a UTF-8 string in-place.
Definition unicode.c:957

utf8_append
bool utf8_append(utf8_string *s, const char *data)
Appends UTF-8 data to the end of a utf8_string.
Definition unicode.c:617

utf8_index_of
int utf8_index_of(const utf8_string *s, const char *utf8)
Finds the byte index of the first occurrence of a substring.
Definition unicode.c:568

is_codepoint_digit
bool is_codepoint_digit(uint32_t codepoint)
Checks if a codepoint represents a digit.
Definition unicode.c:331

utf8_split_free
void utf8_split_free(utf8_string **str, size_t size)
Frees an array of utf8_string objects returned by utf8_split().
Definition unicode.c:1336

utf8_trim
void utf8_trim(char *str)
Removes leading and trailing whitespace from a UTF-8 string in-place.
Definition unicode.c:988

utf8_print
void utf8_print(const utf8_string *s)
Prints the UTF-8 string content to stdout followed by a newline.
Definition unicode.c:519

utf8_compare
int utf8_compare(const char *s1, const char *s2)
Compares two UTF-8 strings lexicographically.
Definition unicode.c:1414

is_valid_utf8
bool is_valid_utf8(const char *utf8)
Comprehensively validates a UTF-8 encoded string.
Definition unicode.c:256

utf8_char_length
size_t utf8_char_length(const char *str)
Determines the byte length of a UTF-8 character from its first byte.
Definition unicode.c:214

UNICODE_MAX_CODEPOINT
#define UNICODE_MAX_CODEPOINT
Definition unicode.h:35

utf8_trim_chars
void utf8_trim_chars(char *str, const char *c)
Removes leading and trailing characters from a UTF-8 string in-place.
Definition unicode.c:1004

utf8_print_info
void utf8_print_info(const utf8_string *s)
Prints metadata about the UTF-8 string to stdout.
Definition unicode.c:531

utf8_trim_char
void utf8_trim_char(char *str, char c)
Removes leading and trailing occurrences of a single character.
Definition unicode.c:1114

utf8_clone
utf8_string * utf8_clone(const utf8_string *s)
Duplicates a utf8_string object.
Definition unicode.c:1448

ucp_to_utf8
void ucp_to_utf8(uint32_t codepoint, char utf8[UTF8_MAX_LEN])
Converts a Unicode codepoint to its UTF-8 byte sequence.
Definition unicode.c:27

utf8_new
utf8_string * utf8_new(const char *data)
Creates a new utf8_string object from a C string.
Definition unicode.c:453

utf8_count_codepoints
size_t utf8_count_codepoints(const char *utf8)
Counts the number of Unicode codepoints in a UTF-8 string.
Definition unicode.c:133

is_utf8_whitespace
bool is_utf8_whitespace(const char *utf8)
Checks if a UTF-8 character represents whitespace.
Definition unicode.c:318

is_valid_codepoint
bool is_valid_codepoint(uint32_t codepoint)
Validates whether a codepoint is within the valid Unicode range.
Definition unicode.c:240

utf8_contains
bool utf8_contains(const char *str, const char *substr)
Checks if a string contains a substring.
Definition unicode.c:1400

utf8_tolower
void utf8_tolower(char *str)
Converts all uppercase characters to lowercase in-place.
Definition unicode.c:1144

is_codepoint_alpha
bool is_codepoint_alpha(uint32_t codepoint)
Checks if a codepoint represents an alphabetic character.
Definition unicode.c:352

utf8_starts_with
bool utf8_starts_with(const char *str, const char *prefix)
Checks if a string starts with a given prefix.
Definition unicode.c:1354

is_utf8_alpha
bool is_utf8_alpha(const char *utf8)
Checks if a UTF-8 character represents an alphabetic character.
Definition unicode.c:360

utf8_writeto
long utf8_writeto(const utf8_string *s, const char *filename)
Writes a utf8_string to a file.
Definition unicode.c:855

utf8_print_codepoints
void utf8_print_codepoints(const utf8_string *s)
Prints the Unicode codepoints in U+XXXX format to stdout.
Definition unicode.c:544