#ifndef SOLIDC_UNICODE_H
#define SOLIDC_UNICODE_H
#define UNICODE_VERSION 0x0100
#define UNICODE_MAX_CODEPOINT 0x10FFFF
#define UNICODE_MAX_UTF8_BYTES 4
#define UTF8_MAX_LEN (UNICODE_MAX_UTF8_BYTES + 1)
void ucp_to_utf8(uint32_t codepoint,
char utf8[UTF8_MAX_LEN]);
utf8_string
Represents a mutable UTF-8 encoded string with metadata.
uint32_t utf8_to_codepoint(const char *utf8)
Decodes a UTF-8 byte sequence to its Unicode codepoint.
void utf8_toupper(char *str)
Converts all lowercase characters to uppercase in-place.
utf8_string * utf8_readfrom(const char *filename)
Reads a UTF-8 string from a file.
utf8_string * utf8_new_with_capacity(size_t capacity)
Creates an empty utf8_string with preallocated capacity.
size_t utf8_replace_all(utf8_string *s, const char *old_str, const char *new_str)
Replaces all occurrences of a substring with another string.
bool is_utf8_digit(const char *utf8)
Checks if a UTF-8 character represents a digit.
bool utf8_replace(utf8_string *s, const char *old_str, const char *new_str)
Replaces the first occurrence of a substring with another string.
void utf8_array_remove(utf8_string **array, size_t size, size_t index)
Removes an element from a utf8_string array and frees it.
utf8_string * utf8_concat(const utf8_string *s1, const utf8_string *s2)
Concatenates two utf8_string objects into a new string.
bool utf8_reverse(utf8_string *s)
Reverses a UTF-8 string by codepoints.
bool is_codepoint_alnum(uint32_t codepoint)
Checks if a codepoint represents an alphanumeric character.
char * utf8_substr(const utf8_string *s, size_t index, size_t utf8_byte_len)
Extracts a substring by byte range.
int utf8_last_index_of(const utf8_string *s, const char *utf8)
Finds the byte index of the last occurrence of a substring.
utf8_string ** utf8_split(const utf8_string *str, const char *delim, size_t *num_parts)
Splits a UTF-8 string into parts using a delimiter.
const char * utf8_data(const utf8_string *s)
Returns a pointer to the internal UTF-8 data buffer.
void utf8_free(utf8_string *s)
Frees all resources associated with a utf8_string.
void utf8_ltrim(char *str)
Removes leading whitespace from a UTF-8 string in-place.
char * utf8_copy(const char *data)
Creates a copy of a UTF-8 string containing only valid UTF-8 sequences.
size_t utf8_valid_byte_count(const char *s)
Counts the number of valid UTF-8 bytes in a string.
bool utf8_ends_with(const char *str, const char *suffix)
Checks if a string ends with a given suffix.
bool utf8_equals(const char *s1, const char *s2)
Compares two UTF-8 strings for equality.
bool is_utf8_punct(const char *utf8)
Checks if a UTF-8 character represents a punctuation character.
bool utf8_remove(utf8_string *s, size_t index, size_t count)
Removes a specified number of codepoints starting at a byte index.
bool utf8_insert(utf8_string *s, size_t index, const char *data)
Inserts UTF-8 data at a specific byte index.
bool is_codepoint_whitespace(uint32_t codepoint)
Checks if a codepoint represents whitespace.
bool is_codepoint_punct(uint32_t codepoint)
Checks if a codepoint represents a punctuation character.
bool is_utf8_alnum(const char *utf8)
Checks if a UTF-8 character represents an alphanumeric character.
void utf8_rtrim(char *str)
Removes trailing whitespace from a UTF-8 string in-place.
bool utf8_append(utf8_string *s, const char *data)
Appends UTF-8 data to the end of a utf8_string.
int utf8_index_of(const utf8_string *s, const char *utf8)
Finds the byte index of the first occurrence of a substring.
bool is_codepoint_digit(uint32_t codepoint)
Checks if a codepoint represents a digit.
void utf8_split_free(utf8_string **str, size_t size)
Frees an array of utf8_string objects returned by utf8_split().
void utf8_trim(char *str)
Removes leading and trailing whitespace from a UTF-8 string in-place.
void utf8_print(const utf8_string *s)
Prints the UTF-8 string content to stdout followed by a newline.
int utf8_compare(const char *s1, const char *s2)
Compares two UTF-8 strings lexicographically.
bool is_valid_utf8(const char *utf8)
Comprehensively validates a UTF-8 encoded string.
bool regex_match(const char *str, const char *pattern)
Tests if a string matches a regular expression pattern.
size_t utf8_char_length(const char *str)
Determines the byte length of a UTF-8 character from its first byte.
void utf8_trim_chars(char *str, const char *c)
Removes leading and trailing characters from a UTF-8 string in-place.
void utf8_print_info(const utf8_string *s)
Prints metadata about the UTF-8 string to stdout.
void utf8_trim_char(char *str, char c)
Removes leading and trailing occurrences of a single character.
utf8_string * utf8_clone(const utf8_string *s)
Duplicates a utf8_string object.
void ucp_to_utf8(uint32_t codepoint, char utf8[UTF8_MAX_LEN])
Converts a Unicode codepoint to its UTF-8 byte sequence.
utf8_string * utf8_new(const char *data)
Creates a new utf8_string object from a C string.
size_t utf8_count_codepoints(const char *utf8)
Counts the number of Unicode codepoints in a UTF-8 string.
bool is_utf8_whitespace(const char *utf8)
Checks if a UTF-8 character represents whitespace.
bool is_valid_codepoint(uint32_t codepoint)
Validates whether a codepoint is within the valid Unicode range.
bool utf8_contains(const char *str, const char *substr)
Checks if a string contains a substring.
void utf8_tolower(char *str)
Converts all uppercase characters to lowercase in-place.
bool is_codepoint_alpha(uint32_t codepoint)
Checks if a codepoint represents an alphabetic character.
bool utf8_starts_with(const char *str, const char *prefix)
Checks if a string starts with a given prefix.
bool is_utf8_alpha(const char *utf8)
Checks if a UTF-8 character represents an alphabetic character.
long utf8_writeto(const utf8_string *s, const char *filename)
Writes a utf8_string to a file.
void utf8_print_codepoints(const utf8_string *s)
Prints the Unicode codepoints in U+XXXX format to stdout.