|
solidc
Robust collection of general-purpose cross-platform C libraries and data structures designed for rapid and safe development in C
|
Unicode handling library for C with UTF-8 encoding support. More...
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
Go to the source code of this file.
Classes | |
| struct | utf8_string |
| Represents a mutable UTF-8 encoded string with metadata. More... | |
Macros | |
| #define | UNICODE_VERSION 0x0100 |
| #define | UNICODE_MAX_CODEPOINT 0x10FFFF |
| #define | UNICODE_MAX_UTF8_BYTES 4 |
Typedefs | |
| typedef struct utf8_string | utf8_string |
| Represents a mutable UTF-8 encoded string with metadata. | |
Functions | |
| void | ucp_to_utf8 (uint32_t codepoint, char utf8[UTF8_MAX_LEN]) |
| Converts a Unicode codepoint to its UTF-8 byte sequence. | |
| uint32_t | utf8_to_codepoint (const char *utf8) |
| Decodes a UTF-8 byte sequence to its Unicode codepoint. | |
| size_t | utf8_count_codepoints (const char *utf8) |
| Counts the number of Unicode codepoints in a UTF-8 string. | |
| size_t | utf8_valid_byte_count (const char *s) |
| Counts the number of valid UTF-8 bytes in a string. | |
| size_t | utf8_char_length (const char *str) |
| Determines the byte length of a UTF-8 character from its first byte. | |
| bool | is_valid_codepoint (uint32_t codepoint) |
| Validates whether a codepoint is within the valid Unicode range. | |
| bool | is_valid_utf8 (const char *utf8) |
| Comprehensively validates a UTF-8 encoded string. | |
| bool | is_codepoint_whitespace (uint32_t codepoint) |
| Checks if a codepoint represents whitespace. | |
| bool | is_utf8_whitespace (const char *utf8) |
| Checks if a UTF-8 character represents whitespace. | |
| bool | is_codepoint_digit (uint32_t codepoint) |
| Checks if a codepoint represents a digit. | |
| bool | is_utf8_digit (const char *utf8) |
| Checks if a UTF-8 character represents a digit. | |
| bool | is_codepoint_alpha (uint32_t codepoint) |
| Checks if a codepoint represents an alphabetic character. | |
| bool | is_utf8_alpha (const char *utf8) |
| Checks if a UTF-8 character represents an alphabetic character. | |
| bool | is_codepoint_alnum (uint32_t codepoint) |
| Checks if a codepoint represents an alphanumeric character. | |
| bool | is_utf8_alnum (const char *utf8) |
| Checks if a UTF-8 character represents an alphanumeric character. | |
| bool | is_codepoint_punct (uint32_t codepoint) |
| Checks if a codepoint represents a punctuation character. | |
| bool | is_utf8_punct (const char *utf8) |
| Checks if a UTF-8 character represents a punctuation character. | |
| utf8_string * | utf8_new (const char *data) |
| Creates a new utf8_string object from a C string. | |
| utf8_string * | utf8_new_with_capacity (size_t capacity) |
| Creates an empty utf8_string with preallocated capacity. | |
| void | utf8_free (utf8_string *s) |
| Frees all resources associated with a utf8_string. | |
| char * | utf8_copy (const char *data) |
| Creates a copy of a UTF-8 string containing only valid UTF-8 sequences. | |
| utf8_string * | utf8_clone (const utf8_string *s) |
| Duplicates a utf8_string object. | |
| const char * | utf8_data (const utf8_string *s) |
| Returns a pointer to the internal UTF-8 data buffer. | |
| void | utf8_print (const utf8_string *s) |
| Prints the UTF-8 string content to stdout followed by a newline. | |
| void | utf8_print_info (const utf8_string *s) |
| Prints metadata about the UTF-8 string to stdout. | |
| void | utf8_print_codepoints (const utf8_string *s) |
| Prints the Unicode codepoints in U+XXXX format to stdout. | |
| int | utf8_index_of (const utf8_string *s, const char *utf8) |
| Finds the byte index of the first occurrence of a substring. | |
| int | utf8_last_index_of (const utf8_string *s, const char *utf8) |
| Finds the byte index of the last occurrence of a substring. | |
| bool | utf8_starts_with (const char *str, const char *prefix) |
| Checks if a string starts with a given prefix. | |
| bool | utf8_ends_with (const char *str, const char *suffix) |
| Checks if a string ends with a given suffix. | |
| bool | utf8_contains (const char *str, const char *substr) |
| Checks if a string contains a substring. | |
| int | utf8_compare (const char *s1, const char *s2) |
| Compares two UTF-8 strings lexicographically. | |
| bool | utf8_equals (const char *s1, const char *s2) |
| Compares two UTF-8 strings for equality. | |
| bool | utf8_append (utf8_string *s, const char *data) |
| Appends UTF-8 data to the end of a utf8_string. | |
| char * | utf8_substr (const utf8_string *s, size_t index, size_t utf8_byte_len) |
| Extracts a substring by byte range. | |
| bool | utf8_insert (utf8_string *s, size_t index, const char *data) |
| Inserts UTF-8 data at a specific byte index. | |
| bool | utf8_remove (utf8_string *s, size_t index, size_t count) |
| Removes a specified number of codepoints starting at a byte index. | |
| bool | utf8_replace (utf8_string *s, const char *old_str, const char *new_str) |
| Replaces the first occurrence of a substring with another string. | |
| size_t | utf8_replace_all (utf8_string *s, const char *old_str, const char *new_str) |
| Replaces all occurrences of a substring with another string. | |
| bool | utf8_reverse (utf8_string *s) |
| Reverses a UTF-8 string by codepoints. | |
| utf8_string * | utf8_concat (const utf8_string *s1, const utf8_string *s2) |
| Concatenates two utf8_string objects into a new string. | |
| void | utf8_ltrim (char *str) |
| Removes leading whitespace from a UTF-8 string in-place. | |
| void | utf8_rtrim (char *str) |
| Removes trailing whitespace from a UTF-8 string in-place. | |
| void | utf8_trim (char *str) |
| Removes leading and trailing whitespace from a UTF-8 string in-place. | |
| void | utf8_trim_chars (char *str, const char *c) |
| Removes leading and trailing characters from a UTF-8 string in-place. | |
| void | utf8_trim_char (char *str, char c) |
| Removes leading and trailing occurrences of a single character. | |
| void | utf8_tolower (char *str) |
| Converts all uppercase characters to lowercase in-place. | |
| void | utf8_toupper (char *str) |
| Converts all lowercase characters to uppercase in-place. | |
| utf8_string ** | utf8_split (const utf8_string *str, const char *delim, size_t *num_parts) |
| Splits a UTF-8 string into parts using a delimiter. | |
| void | utf8_split_free (utf8_string **str, size_t size) |
| Frees an array of utf8_string objects returned by utf8_split(). | |
| void | utf8_array_remove (utf8_string **array, size_t size, size_t index) |
| Removes an element from a utf8_string array and frees it. | |
| long | utf8_writeto (const utf8_string *s, const char *filename) |
| Writes a utf8_string to a file. | |
| utf8_string * | utf8_readfrom (const char *filename) |
| Reads a UTF-8 string from a file. | |
| bool | regex_match (const char *str, const char *pattern) |
| Tests if a string matches a regular expression pattern. | |
Unicode handling library for C with UTF-8 encoding support.
This library provides comprehensive UTF-8 string manipulation capabilities including encoding/decoding, validation, searching, transformation, and character classification using Unicode standards.
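UTF-8 is a variable-length encoding that uses 1-4 bytes per codepoint:
- U+0000 to U+007F: 1 byte
- U+0080 to U+07FF: 2 bytes
- U+0800 to U+FFFF: 3 bytes
- U+10000 to U+10FFFF: 4 bytes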
Definition in file unicode.h.
| #define UNICODE_MAX_CODEPOINT 0x10FFFF |
| #define UNICODE_MAX_UTF8_BYTES 4 |
| #define UNICODE_VERSION 0x0100 |
| typedef struct utf8_string utf8_string |
Represents a mutable UTF-8 encoded string with metadata.
This structure maintains both the UTF-8 byte data and precomputed statistics about the string for efficient operations.
| bool is_codepoint_alnum | ( | uint32_t | codepoint | ) |
Checks if a codepoint represents an alphanumeric character.
| codepoint | The codepoint to test. |
Checks if a Unicode codepoint represents an alphanumeric character.
| codepoint | The Unicode codepoint to test. |
Definition at line 373 of file unicode.c.
References is_codepoint_alnum().
Referenced by is_codepoint_alnum(), and is_utf8_alnum().
| bool is_codepoint_alpha | ( | uint32_t | codepoint | ) |
Checks if a codepoint represents an alphabetic character.
| codepoint | The codepoint to test. |
Checks if a Unicode codepoint represents an alphabetic character.
| codepoint | The Unicode codepoint to test. |
Definition at line 352 of file unicode.c.
References is_codepoint_alpha().
Referenced by is_codepoint_alpha(), and is_utf8_alpha().
| bool is_codepoint_digit | ( | uint32_t | codepoint | ) |
Checks if a codepoint represents a digit.
| codepoint | The codepoint to test. |
Checks if a Unicode codepoint represents a digit.
| codepoint | The Unicode codepoint to test. |
Definition at line 331 of file unicode.c.
References is_codepoint_digit().
Referenced by is_codepoint_digit(), and is_utf8_digit().
| bool is_codepoint_punct | ( | uint32_t | codepoint | ) |
Checks if a codepoint represents a punctuation character.
| codepoint | The codepoint to test. |
Checks if a Unicode codepoint represents a punctuation character.
| codepoint | The Unicode codepoint to test. |
Definition at line 394 of file unicode.c.
References is_codepoint_punct().
Referenced by is_codepoint_punct(), and is_utf8_punct().
| bool is_codepoint_whitespace | ( | uint32_t | codepoint | ) |
Checks if a codepoint represents whitespace.
| codepoint | The codepoint to test. |
Checks if a Unicode codepoint represents whitespace.
| codepoint | The Unicode codepoint to test. |
Definition at line 310 of file unicode.c.
References is_codepoint_whitespace().
Referenced by is_codepoint_whitespace(), and is_utf8_whitespace().
| bool is_utf8_alnum | ( | const char * | utf8 | ) |
Checks if a UTF-8 character represents an alphanumeric character.
| utf8 | Pointer to UTF-8 character. Must not be NULL. |
Checks if a UTF-8 character represents an alphanumeric character.
| utf8 | Pointer to a UTF-8 encoded character. Must not be NULL. |
Definition at line 381 of file unicode.c.
References is_codepoint_alnum(), is_utf8_alnum(), and utf8_to_codepoint().
Referenced by is_utf8_alnum().
| bool is_utf8_alpha | ( | const char * | utf8 | ) |
Checks if a UTF-8 character represents an alphabetic character.
| utf8 | Pointer to UTF-8 character. Must not be NULL. |
Checks if a UTF-8 character represents an alphabetic character.
| utf8 | Pointer to a UTF-8 encoded character. Must not be NULL. |
Definition at line 360 of file unicode.c.
References is_codepoint_alpha(), is_utf8_alpha(), and utf8_to_codepoint().
Referenced by is_utf8_alpha().
| bool is_utf8_digit | ( | const char * | utf8 | ) |
Checks if a UTF-8 character represents a digit.
| utf8 | Pointer to UTF-8 character. Must not be NULL. |
Checks if a UTF-8 character represents a digit.
| utf8 | Pointer to a UTF-8 encoded character. Must not be NULL. |
Definition at line 339 of file unicode.c.
References is_codepoint_digit(), is_utf8_digit(), and utf8_to_codepoint().
Referenced by is_utf8_digit().
| bool is_utf8_punct | ( | const char * | utf8 | ) |
Checks if a UTF-8 character represents a punctuation character.
| utf8 | Pointer to UTF-8 character. Must not be NULL. |
Checks if a UTF-8 character represents a punctuation character.
| utf8 | Pointer to a UTF-8 encoded character. Must not be NULL. |
Definition at line 402 of file unicode.c.
References is_codepoint_punct(), is_utf8_punct(), and utf8_to_codepoint().
Referenced by is_utf8_punct().
| bool is_utf8_whitespace | ( | const char * | utf8 | ) |
Checks if a UTF-8 character represents whitespace.
| utf8 | Pointer to UTF-8 character. Must not be NULL. |
Checks if a UTF-8 character represents whitespace.
| utf8 | Pointer to a UTF-8 encoded character. Must not be NULL. |
Definition at line 318 of file unicode.c.
References is_codepoint_whitespace(), is_utf8_whitespace(), and utf8_to_codepoint().
Referenced by is_utf8_whitespace(), utf8_ltrim(), and utf8_rtrim().
| bool is_valid_codepoint | ( | uint32_t | codepoint | ) |
Validates whether a codepoint is within the valid Unicode range.
| codepoint | The codepoint to validate. |
Validates whether a codepoint is within the valid Unicode range.
| codepoint | The Unicode codepoint to validate. |
Definition at line 240 of file unicode.c.
References is_valid_codepoint(), and UNICODE_MAX_CODEPOINT.
Referenced by is_valid_codepoint().
| bool is_valid_utf8 | ( | const char * | utf8 | ) |
Comprehensively validates a UTF-8 encoded string.
| utf8 | Null-terminated UTF-8 string. NULL returns false. |
Comprehensively validates a UTF-8 encoded string.
Validation checks include:
| utf8 | The null-terminated UTF-8 string to validate. NULL returns false. |
Definition at line 256 of file unicode.c.
References is_valid_utf8().
Referenced by is_valid_utf8().
| bool regex_match | ( | const char * | str, |
| const char * | pattern | ||
| ) |
Tests if a string matches a regular expression pattern.
| str | Null-terminated UTF-8 string to test. Must not be NULL. |
| pattern | Regular expression pattern. Must not be NULL. |
| void ucp_to_utf8 | ( | uint32_t | codepoint, |
| char | utf8[UTF8_MAX_LEN] | ||
| ) |
Converts a Unicode codepoint to its UTF-8 byte sequence.
| codepoint | The Unicode codepoint to encode (must be <= 0x10FFFF). |
| utf8 | Output buffer (minimum 5 bytes) that receives the null-terminated UTF-8 string. |
Converts a Unicode codepoint to its UTF-8 byte sequence representation.
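UTF-8 encoding uses 1-4 bytes per codepoint:
- U+0000 to U+007F: 1 byte (0xxxxxxx)
- U+0080 to U+07FF: 2 bytes (110xxxxx 10xxxxxx)
- U+0800 to U+FFFF: 3 bytes (1110xxxx 10xxxxxx 10xxxxxx)
- U+10000 to U+10FFFF: 4 bytes (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)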
| codepoint | The Unicode codepoint to convert (must be <= 0x10FFFF). |
| utf8 | Output buffer that receives the UTF-8 bytes. Must be at least 5 bytes (4 bytes for max UTF-8 sequence + 1 for null terminator). Buffer is always null-terminated on success. |
Definition at line 27 of file unicode.c.
References ucp_to_utf8().
Referenced by ucp_to_utf8(), utf8_tolower(), and utf8_toupper().
| bool utf8_append | ( | utf8_string * | s, |
| const char * | data | ||
| ) |
Appends UTF-8 data to the end of a utf8_string.
| s | The utf8_string to append to. Must not be NULL. |
| data | Null-terminated UTF-8 string to append. NULL is safely ignored. |
Appends UTF-8 data to the end of a utf8_string.
| s | The utf8_string to append to. Must not be NULL. |
| data | The null-terminated UTF-8 string to append. NULL is safely ignored. |
Definition at line 617 of file unicode.c.
References utf8_string::count, utf8_string::data, utf8_string::length, utf8_append(), utf8_count_codepoints(), and utf8_valid_byte_count().
Referenced by utf8_append().
| void utf8_array_remove | ( | utf8_string ** | array, |
| size_t | size, | ||
| size_t | index | ||
| ) |
Removes an element from a utf8_string array and frees it.
| array | The array of utf8_string pointers. Must not be NULL. |
| size | The current size of the array. |
| index | The index to remove. Must be < size. |
Removes an element from a utf8_string array and frees it.
Elements after the removed index are shifted down.
| array | The array of utf8_string pointers. Must not be NULL. |
| size | The current size of the array. |
| index | The index to remove. Must be < size. |
Definition at line 1318 of file unicode.c.
References utf8_array_remove(), and utf8_free().
Referenced by utf8_array_remove().
| size_t utf8_char_length | ( | const char * | str | ) |
Determines the byte length of a UTF-8 character from its first byte.
| str | Pointer to the first byte. Must not be NULL. |
Determines the byte length of a UTF-8 character from its first byte.
UTF-8 character lengths by leading byte:
- 0xxxxxxx: 1 byte (ASCII)
- 110xxxxx: 2 bytes
- 1110xxxx: 3 bytes
- 11110xxx: 4 bytes
| str | Pointer to the first byte of a UTF-8 character. Must not be NULL. |
Definition at line 214 of file unicode.c.
References utf8_char_length().
Referenced by utf8_char_length(), utf8_ltrim(), utf8_print_codepoints(), utf8_remove(), utf8_reverse(), utf8_rtrim(), utf8_split(), utf8_tolower(), utf8_toupper(), and utf8_trim_chars().
| utf8_string * utf8_clone | ( | const utf8_string * | s | ) |
Duplicates a utf8_string object.
| s | The utf8_string to duplicate. Must not be NULL. |
Duplicates a utf8_string object.
| s | The utf8_string to duplicate. Must not be NULL. |
Definition at line 1448 of file unicode.c.
References utf8_string::data, utf8_clone(), and utf8_new().
Referenced by utf8_clone().
| int utf8_compare | ( | const char * | s1, |
| const char * | s2 | ||
| ) |
Compares two UTF-8 strings lexicographically.
| s1 | First UTF-8 string. NULL is treated as empty. |
| s2 | Second UTF-8 string. NULL is treated as empty. |
Compares two UTF-8 strings lexicographically.
| s1 | First UTF-8 string. NULL is treated as empty. |
| s2 | Second UTF-8 string. NULL is treated as empty. |
Definition at line 1414 of file unicode.c.
References utf8_compare().
Referenced by utf8_compare().
| utf8_string * utf8_concat | ( | const utf8_string * | s1, |
| const utf8_string * | s2 | ||
| ) |
Concatenates two utf8_string objects into a new string.
| s1 | First utf8_string. Must not be NULL. |
| s2 | Second utf8_string. Must not be NULL. |
Concatenates two utf8_string objects into a new string.
| s1 | First utf8_string. Must not be NULL. |
| s2 | Second utf8_string. Must not be NULL. |
Definition at line 1463 of file unicode.c.
References utf8_string::count, utf8_string::data, utf8_string::length, utf8_concat(), and utf8_new_with_capacity().
Referenced by utf8_concat().
| bool utf8_contains | ( | const char * | str, |
| const char * | substr | ||
| ) |
Checks if a string contains a substring.
| str | Null-terminated UTF-8 string. Must not be NULL. |
| substr | The substring to search for. Must not be NULL. |
Checks if a string contains a substring.
| str | The null-terminated UTF-8 string to search in. Must not be NULL. |
| substr | The substring to search for. Must not be NULL. |
Definition at line 1400 of file unicode.c.
References utf8_contains().
Referenced by utf8_contains().
| char * utf8_copy | ( | const char * | data | ) |
Creates a copy of a UTF-8 string containing only valid UTF-8 sequences.
| data | Null-terminated UTF-8 string. NULL returns NULL. |
Creates a copy of a UTF-8 string containing only valid UTF-8 sequences.
| data | The null-terminated UTF-8 string to copy. NULL returns NULL. |
Definition at line 416 of file unicode.c.
References utf8_copy(), and utf8_valid_byte_count().
Referenced by utf8_copy(), and utf8_new().
| size_t utf8_count_codepoints | ( | const char * | s | ) |
Counts the number of Unicode codepoints in a UTF-8 string.
| s | Null-terminated UTF-8 string. NULL returns 0. |
Counts the number of Unicode codepoints in a UTF-8 string.
A codepoint is counted by identifying UTF-8 leading bytes, that is, any byte that does NOT match the continuation byte pattern (10xxxxxx). This includes:
- ASCII bytes (0xxxxxxx)
- Leading bytes of multi-byte sequences (110xxxxx, 1110xxxx, 11110xxx)
| s | The null-terminated UTF-8 string. NULL returns 0. |
Definition at line 133 of file unicode.c.
References utf8_count_codepoints().
Referenced by utf8_append(), utf8_count_codepoints(), utf8_insert(), utf8_new(), utf8_replace(), utf8_replace_all(), and utf8_split().
| const char * utf8_data | ( | const utf8_string * | s | ) |
Returns a pointer to the internal UTF-8 data buffer.
| s | The utf8_string object. Must not be NULL. |
Returns a pointer to the internal UTF-8 data buffer.
| s | The utf8_string object. Must not be NULL. |
Definition at line 437 of file unicode.c.
References utf8_string::data, and utf8_data().
Referenced by utf8_data().
| bool utf8_ends_with | ( | const char * | str, |
| const char * | suffix | ||
| ) |
Checks if a string ends with a given suffix.
| str | Null-terminated UTF-8 string. Must not be NULL. |
| suffix | The suffix to test for. Must not be NULL. |
Checks if a string ends with a given suffix.
| str | The null-terminated UTF-8 string to check. Must not be NULL. |
| suffix | The suffix to test for. Must not be NULL. |
Definition at line 1375 of file unicode.c.
References utf8_ends_with(), and utf8_valid_byte_count().
Referenced by utf8_ends_with().
| bool utf8_equals | ( | const char * | s1, |
| const char * | s2 | ||
| ) |
Compares two UTF-8 strings for equality.
| s1 | First UTF-8 string. Must not be NULL. |
| s2 | Second UTF-8 string. Must not be NULL. |
Compares two UTF-8 strings for equality.
| s1 | First UTF-8 string. Must not be NULL. |
| s2 | Second UTF-8 string. Must not be NULL. |
Definition at line 1434 of file unicode.c.
References utf8_equals().
Referenced by utf8_equals().
| void utf8_free | ( | utf8_string * | s | ) |
Frees all resources associated with a utf8_string.
| s | The utf8_string to free. NULL is safely ignored. |
Frees all resources associated with a utf8_string.
| s | The utf8_string to free. NULL is safely ignored. |
Definition at line 505 of file unicode.c.
References utf8_string::data, and utf8_free().
Referenced by utf8_array_remove(), utf8_free(), and utf8_split_free().
| int utf8_index_of | ( | const utf8_string * | s, |
| const char * | utf8 | ||
| ) |
Finds the byte index of the first occurrence of a substring.
| s | The utf8_string to search in. Must not be NULL. |
| utf8 | The substring to search for. Must not be NULL. |
Finds the byte index of the first occurrence of a substring.
| s | The utf8_string to search in. Must not be NULL. |
| utf8 | The substring to search for. Must not be NULL. |
Definition at line 568 of file unicode.c.
References utf8_string::data, and utf8_index_of().
Referenced by utf8_index_of().
| bool utf8_insert | ( | utf8_string * | s, |
| size_t | index, | ||
| const char * | data | ||
| ) |
Inserts UTF-8 data at a specific byte index.
| s | The utf8_string to modify. Must not be NULL. |
| index | The byte index at which to insert. |
| data | Null-terminated UTF-8 string to insert. NULL is safely ignored. |
Inserts UTF-8 data at a specific byte index.
| s | The utf8_string to modify. Must not be NULL. |
| index | The byte index at which to insert. |
| data | The null-terminated UTF-8 string to insert. NULL is safely ignored. |
Definition at line 673 of file unicode.c.
References utf8_string::count, utf8_string::data, utf8_string::length, utf8_count_codepoints(), utf8_insert(), and utf8_valid_byte_count().
Referenced by utf8_insert().
| int utf8_last_index_of | ( | const utf8_string * | s, |
| const char * | utf8 | ||
| ) |
Finds the byte index of the last occurrence of a substring.
| s | The utf8_string to search in. Must not be NULL. |
| utf8 | The substring to search for. Must not be NULL. |
Finds the byte index of the last occurrence of a substring.
| s | The utf8_string to search in. Must not be NULL. |
| utf8 | The substring to search for. Must not be NULL. |
Definition at line 587 of file unicode.c.
References utf8_string::data, utf8_string::length, and utf8_last_index_of().
Referenced by utf8_last_index_of().
| void utf8_ltrim | ( | char * | str | ) |
Removes leading whitespace from a UTF-8 string in-place.
| str | Null-terminated UTF-8 string to modify. NULL is safely ignored. |
Removes leading whitespace from a UTF-8 string in-place.
Whitespace is determined by is_utf8_whitespace(), which uses locale settings.
| str | The null-terminated UTF-8 string to modify. NULL is safely ignored. |
Definition at line 930 of file unicode.c.
References is_utf8_whitespace(), utf8_char_length(), and utf8_ltrim().
Referenced by utf8_ltrim(), and utf8_trim().
| utf8_string * utf8_new | ( | const char * | data | ) |
Creates a new utf8_string object from a C string.
| data | Null-terminated UTF-8 string. NULL returns NULL. |
Creates a new utf8_string object from a C string.
The string is validated and only valid UTF-8 bytes are stored.
| data | The null-terminated UTF-8 string. NULL returns NULL. |
Definition at line 453 of file unicode.c.
References utf8_string::count, utf8_string::data, utf8_string::length, utf8_copy(), utf8_count_codepoints(), utf8_new(), and utf8_valid_byte_count().
Referenced by utf8_clone(), utf8_new(), utf8_readfrom(), and utf8_split().
| utf8_string * utf8_new_with_capacity | ( | size_t | capacity | ) |
Creates an empty utf8_string with preallocated capacity.
| capacity | Initial byte capacity to allocate. |
Creates an empty utf8_string with preallocated capacity.
| capacity | The initial byte capacity to allocate. |
Definition at line 481 of file unicode.c.
References utf8_string::count, utf8_string::data, utf8_string::length, and utf8_new_with_capacity().
Referenced by utf8_concat(), and utf8_new_with_capacity().
| void utf8_print | ( | const utf8_string * | s | ) |
Prints the UTF-8 string content to stdout followed by a newline.
| s | The utf8_string to print. Must not be NULL. |
Prints the UTF-8 string content to stdout followed by a newline.
| s | The utf8_string to print. Must not be NULL. |
Definition at line 519 of file unicode.c.
References utf8_string::data, and utf8_print().
Referenced by utf8_print().
| void utf8_print_codepoints | ( | const utf8_string * | s | ) |
Prints the Unicode codepoints in U+XXXX format to stdout.
| s | The utf8_string to print. Must not be NULL. |
Prints the Unicode codepoints of the string in U+XXXX format.
| s | The utf8_string to print. Must not be NULL. |
Definition at line 544 of file unicode.c.
References utf8_string::data, utf8_char_length(), utf8_print_codepoints(), and utf8_to_codepoint().
Referenced by utf8_print_codepoints().
| void utf8_print_info | ( | const utf8_string * | s | ) |
Prints metadata about the UTF-8 string to stdout.
| s | The utf8_string to inspect. Must not be NULL. |
Prints metadata about the UTF-8 string to stdout.
| s | The utf8_string to inspect. Must not be NULL. |
Definition at line 531 of file unicode.c.
References utf8_string::count, utf8_string::length, and utf8_print_info().
Referenced by utf8_print_info().
| utf8_string * utf8_readfrom | ( | const char * | filename | ) |
Reads a UTF-8 string from a file.
| filename | The file path to read from. Must not be NULL. |
Reads a UTF-8 string from a file.
| filename | The file path to read from. Must not be NULL. |
Definition at line 882 of file unicode.c.
References utf8_new(), and utf8_readfrom().
Referenced by utf8_readfrom().
| bool utf8_remove | ( | utf8_string * | s, |
| size_t | index, | ||
| size_t | count | ||
| ) |
Removes a specified number of codepoints starting at a byte index.
| s | The utf8_string to modify. Must not be NULL. |
| index | The starting byte index. |
| count | The number of codepoints (not bytes) to remove. |
Removes a specified number of codepoints starting at a byte index.
| s | The utf8_string to modify. Must not be NULL. |
| index | The starting byte index. |
| count | The number of codepoints (not bytes) to remove. |
Definition at line 702 of file unicode.c.
References utf8_string::count, utf8_string::data, utf8_string::length, utf8_char_length(), and utf8_remove().
Referenced by utf8_remove().
| bool utf8_replace | ( | utf8_string * | s, |
| const char * | old_str, | ||
| const char * | new_str | ||
| ) |
Replaces the first occurrence of a substring with another string.
| s | The utf8_string to modify. Must not be NULL. |
| old_str | The substring to find and replace. Must not be NULL. |
| new_str | The replacement string. Must not be NULL. |
Replaces the first occurrence of a substring with another string.
| s | The utf8_string to modify. Must not be NULL. |
| old_str | The substring to find and replace. Must not be NULL. |
| new_str | The replacement string. Must not be NULL. |
Definition at line 734 of file unicode.c.
References utf8_string::count, utf8_string::data, utf8_string::length, utf8_count_codepoints(), utf8_replace(), and utf8_valid_byte_count().
Referenced by utf8_replace().
| size_t utf8_replace_all | ( | utf8_string * | s, |
| const char * | old_str, | ||
| const char * | new_str | ||
| ) |
Replaces all occurrences of a substring with another string.
| s | The utf8_string to modify. Must not be NULL. |
| old_str | The substring to find and replace. Must not be NULL or empty. |
| new_str | The replacement string. Must not be NULL (can be empty). |
Replaces all occurrences of a substring with another string.
| s | The utf8_string to modify. Must not be NULL. |
| old_str | The substring to find and replace. Must not be NULL or empty. |
| new_str | The replacement string. Must not be NULL (can be empty for deletion). |
Definition at line 773 of file unicode.c.
References utf8_string::count, utf8_string::data, utf8_string::length, utf8_count_codepoints(), utf8_replace_all(), and utf8_valid_byte_count().
Referenced by utf8_replace_all().
| bool utf8_reverse | ( | utf8_string * | s | ) |
Reverses a UTF-8 string by codepoints.
| s | The utf8_string to reverse in-place. Must not be NULL. |
Reverses a UTF-8 string by codepoints (not bytes).
Each complete UTF-8 character is treated as an atomic unit.
| s | The utf8_string to reverse in-place. Must not be NULL. |
Definition at line 821 of file unicode.c.
References utf8_string::data, utf8_string::length, utf8_char_length(), and utf8_reverse().
Referenced by utf8_reverse().
| void utf8_rtrim | ( | char * | str | ) |
Removes trailing whitespace from a UTF-8 string in-place.
| str | Null-terminated UTF-8 string to modify. NULL is safely ignored. |
Removes trailing whitespace from a UTF-8 string in-place.
Whitespace is determined by is_utf8_whitespace(), which uses locale settings.
| str | The null-terminated UTF-8 string to modify. NULL is safely ignored. |
Definition at line 957 of file unicode.c.
References is_utf8_whitespace(), utf8_char_length(), and utf8_rtrim().
Referenced by utf8_rtrim(), and utf8_trim().
| utf8_string ** utf8_split | ( | const utf8_string * | str, |
| const char * | delim, | ||
| size_t * | num_parts | ||
| ) |
Splits a UTF-8 string into parts using a delimiter.
| str | The utf8_string to split. Must not be NULL. |
| delim | The delimiter string. Must not be NULL or empty. |
| num_parts | Output parameter receiving the number of parts. Must not be NULL. |
Splits a UTF-8 string into parts using a delimiter.
The string is divided at each occurrence of the delimiter. Empty parts (when delimiter appears consecutively) are included in the result.
| str | The utf8_string to split. Must not be NULL. |
| delim | The delimiter string. Must not be NULL or empty. |
| num_parts | Output parameter receiving the number of parts. Must not be NULL. |
Definition at line 1241 of file unicode.c.
References utf8_string::count, utf8_string::data, utf8_string::length, utf8_char_length(), utf8_count_codepoints(), utf8_new(), utf8_split(), utf8_starts_with(), and utf8_valid_byte_count().
Referenced by utf8_split().
| void utf8_split_free | ( | utf8_string ** | str, |
| size_t | size | ||
| ) |
Frees an array of utf8_string objects returned by utf8_split().
| str | The array of utf8_string pointers. NULL is safely ignored. |
| size | The number of elements in the array. |
Frees an array of utf8_string objects returned by utf8_split().
| str | The array of utf8_string pointers. NULL is safely ignored. |
| size | The number of elements in the array. |
Definition at line 1336 of file unicode.c.
References utf8_free(), and utf8_split_free().
Referenced by utf8_split_free().
| bool utf8_starts_with | ( | const char * | str, |
| const char * | prefix | ||
| ) |
Checks if a string starts with a given prefix.
| str | Null-terminated UTF-8 string. Must not be NULL. |
| prefix | The prefix to test for. Must not be NULL. |
Checks if a string starts with a given prefix.
| str | The null-terminated UTF-8 string to check. Must not be NULL. |
| prefix | The prefix to test for. Must not be NULL. |
Definition at line 1354 of file unicode.c.
References utf8_starts_with(), and utf8_valid_byte_count().
Referenced by utf8_split(), and utf8_starts_with().
| char * utf8_substr | ( | const utf8_string * | s, |
| size_t | index, | ||
| size_t | utf8_byte_len | ||
| ) |
Extracts a substring by byte range.
| s | The source utf8_string. Must not be NULL. |
| index | The starting byte index. |
| utf8_byte_len | The number of bytes to extract. |
Extracts a substring by byte range.
| s | The source utf8_string. Must not be NULL. |
| index | The starting byte index. |
| utf8_byte_len | The number of bytes to extract. |
Definition at line 648 of file unicode.c.
References utf8_string::data, utf8_string::length, and utf8_substr().
Referenced by utf8_substr().
| uint32_t utf8_to_codepoint | ( | const char * | utf8 | ) |
Decodes a UTF-8 byte sequence to its Unicode codepoint.
| utf8 | Pointer to UTF-8 encoded byte sequence. Must not be NULL. |
Decodes a UTF-8 byte sequence to its Unicode codepoint.
This function validates the UTF-8 sequence while decoding it.
| utf8 | Pointer to UTF-8 encoded byte sequence. Must not be NULL. |
Definition at line 69 of file unicode.c.
References utf8_to_codepoint().
Referenced by is_utf8_alnum(), is_utf8_alpha(), is_utf8_digit(), is_utf8_punct(), is_utf8_whitespace(), utf8_print_codepoints(), utf8_to_codepoint(), utf8_tolower(), utf8_toupper(), and utf8_trim_chars().
| void utf8_tolower | ( | char * | str | ) |
Converts all uppercase characters to lowercase in-place.
| str | Null-terminated UTF-8 string to modify. NULL is safely ignored. |
Converts all uppercase characters in a UTF-8 string to lowercase in-place.
Uses locale-aware conversion via towlower().
| str | The null-terminated UTF-8 string to modify. NULL is safely ignored. |
Definition at line 1144 of file unicode.c.
References ucp_to_utf8(), utf8_char_length(), utf8_to_codepoint(), utf8_tolower(), and utf8_valid_byte_count().
Referenced by utf8_tolower().
| void utf8_toupper | ( | char * | str | ) |
Converts all lowercase characters to uppercase in-place.
| str | Null-terminated UTF-8 string to modify. NULL is safely ignored. |
Converts all lowercase characters in a UTF-8 string to uppercase in-place.
Uses locale-aware conversion via towupper().
| str | The null-terminated UTF-8 string to modify. NULL is safely ignored. |
Definition at line 1191 of file unicode.c.
References ucp_to_utf8(), utf8_char_length(), utf8_to_codepoint(), utf8_toupper(), and utf8_valid_byte_count().
Referenced by utf8_toupper().
| void utf8_trim | ( | char * | str | ) |
Removes leading and trailing whitespace from a UTF-8 string in-place.
| str | Null-terminated UTF-8 string to modify. NULL is safely ignored. |
Removes leading and trailing whitespace from a UTF-8 string in-place.
| str | The null-terminated UTF-8 string to modify. NULL is safely ignored. |
Definition at line 988 of file unicode.c.
References utf8_ltrim(), utf8_rtrim(), and utf8_trim().
Referenced by utf8_trim().
| void utf8_trim_char | ( | char * | str, |
| char | c | ||
| ) |
Removes leading and trailing occurrences of a single character.
| str | Null-terminated UTF-8 string to modify. NULL is safely ignored. |
| c | The ASCII character to trim (single-byte only). |
Removes leading and trailing occurrences of a single character.
| str | The null-terminated UTF-8 string to modify. NULL is safely ignored. |
| c | The ASCII character to trim (only works for single-byte chars). |
Definition at line 1114 of file unicode.c.
References utf8_trim_char().
Referenced by utf8_trim_char().
| void utf8_trim_chars | ( | char * | str, |
| const char * | chars | ||
| ) |
Removes leading and trailing characters from a UTF-8 string in-place.
| str | Null-terminated UTF-8 string to modify. NULL is safely ignored. |
| chars | String containing codepoints to trim. NULL is safely ignored. |
Removes leading and trailing characters from a UTF-8 string in-place.
Any codepoint appearing in the 'chars' string will be trimmed from both ends.
| str | The null-terminated UTF-8 string to modify. NULL is safely ignored. |
| chars | String containing codepoints to trim. NULL is safely ignored. |
Definition at line 1004 of file unicode.c.
References utf8_char_length(), utf8_to_codepoint(), and utf8_trim_chars().
Referenced by utf8_trim_chars().
| size_t utf8_valid_byte_count | ( | const char * | s | ) |
Counts the number of valid UTF-8 bytes in a string.
| s | Null-terminated string. NULL returns 0. |
Counts the number of valid UTF-8 bytes in a string.
This function validates each UTF-8 sequence in the string as it counts.
Invalid sequences are skipped (counted as 0 bytes), allowing partial processing of corrupted UTF-8 data.
| s | The null-terminated string to validate. NULL returns 0. |
Definition at line 162 of file unicode.c.
References utf8_valid_byte_count().
Referenced by utf8_append(), utf8_copy(), utf8_ends_with(), utf8_insert(), utf8_new(), utf8_replace(), utf8_replace_all(), utf8_split(), utf8_starts_with(), utf8_tolower(), utf8_toupper(), and utf8_valid_byte_count().
| long utf8_writeto | ( | const utf8_string * | s, |
| const char * | filename | ||
| ) |
Writes a utf8_string to a file.
| s | The utf8_string to write. Must not be NULL. |
| filename | The file path. Existing files are overwritten. Must not be NULL. |
Writes a utf8_string to a file.
| s | The utf8_string to write. Must not be NULL. |
| filename | The file path. Existing files are overwritten. Must not be NULL. |
Definition at line 855 of file unicode.c.
References utf8_string::data, utf8_string::length, and utf8_writeto().
Referenced by utf8_writeto().