// Static-binding declarations of the utf8proc C API. Everything after the
// `version (DerelictUTF8Proc_Static):` line is compiled only when that
// version identifier is set.
module derelict.utf8proc.statfun;

// The generic Derelict_Static switch implies the utf8proc-specific one.
version (Derelict_Static)
    version = DerelictUTF8Proc_Static;

version (DerelictUTF8Proc_Static):

public import derelict.utf8proc.types;

extern (C) @nogc nothrow
{
    /**
     * Lookup table, indexed by the first byte of a UTF-8 sequence, that
     * holds the byte length of the encoded codepoint.
     */
    extern const utf8proc_int8_t[256] utf8proc_utf8class;

    /**
     * Reports the version of the utf8proc API.
     *
     * @return
     *   A string of the form MAJOR.MINOR.PATCH (http://semver.org format);
     *   development versions may carry a "-dev" suffix.
     */
    extern const(char)* utf8proc_version();

    /**
     * Translates a utf8proc error code (for example one returned by
     * @ref utf8proc_map) into an informative error string.
     */
    extern const(char)* utf8proc_errmsg(utf8proc_ssize_t errcode);

    /**
     * Decodes one codepoint from the UTF-8 sequence that `str` points to.
     *
     * At most `strlen` bytes are read; when `strlen` is negative, up to
     * 4 bytes are read instead.
     *
     * When a valid codepoint could be read it is stored in the variable
     * pointed to by `codepoint_ref`; otherwise that variable is set to -1.
     *
     * @return
     *   The number of bytes read on success, or a negative error code on
     *   failure.
     */
    extern utf8proc_ssize_t utf8proc_iterate(
        const utf8proc_uint8_t* str,
        utf8proc_ssize_t strlen,
        utf8proc_int32_t* codepoint_ref);

    /**
     * Tests whether a codepoint is valid, irrespective of whether the
     * current Unicode standard has assigned it a value.
     *
     * @return 1 when `codepoint` is valid, 0 otherwise.
     */
    extern utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t codepoint);

    /**
     * Writes the UTF-8 encoding of `codepoint` into the byte array pointed
     * to by `dst`, which must be at least 4 bytes long.
     *
     * No check is made that `codepoint` is valid Unicode.
     *
     * @return
     *   The number of bytes written on success, and 0 otherwise.
     */
    extern utf8proc_ssize_t utf8proc_encode_char(
        utf8proc_int32_t codepoint,
        utf8proc_uint8_t* dst);

    /**
     * Fetches the property record of a codepoint.
     *
     * @param codepoint The Unicode codepoint.
     *
     * @returns
     *   A pointer to a (constant) struct describing the codepoint.
     * @par
     *   For an unassigned or invalid codepoint the pointer refers to a
     *   special struct whose `category` is 0 (@ref UTF8PROC_CATEGORY_CN).
     */
    extern const(utf8proc_property_t)* utf8proc_get_property(utf8proc_int32_t codepoint);

    /**
     * Decomposes a single codepoint into an array of codepoints.
     *
     * @param codepoint the codepoint.
     * @param dst       the destination buffer.
     * @param bufsize   the size of the destination buffer.
     * @param options   one or more of the following flags:
     * - @ref UTF8PROC_REJECTNA  - return an error if `codepoint` is unassigned
     * - @ref UTF8PROC_IGNORE    - strip "default ignorable" codepoints
     * - @ref UTF8PROC_CASEFOLD  - apply Unicode casefolding
     * - @ref UTF8PROC_COMPAT    - replace certain codepoints with their
     *                             compatibility decomposition
     * - @ref UTF8PROC_CHARBOUND - insert 0xFF bytes before each grapheme cluster
     * - @ref UTF8PROC_LUMP      - lump certain different codepoints together
     * - @ref UTF8PROC_STRIPMARK - remove all character marks
     * @param last_boundclass
     *   Pointer to an integer variable holding the boundary class of the
     *   previous codepoint when the @ref UTF8PROC_CHARBOUND option is used;
     *   ignored otherwise.
     *
     * @return
     *   The number of codepoints written on success, or a negative error
     *   code on failure (@ref utf8proc_errmsg).
     * @par
     *   When more codepoints would be written than `bufsize` allows, the
     *   required buffer size is returned and the buffer is overwritten with
     *   undefined data.
     */
    extern utf8proc_ssize_t utf8proc_decompose_char(
        utf8proc_int32_t codepoint,
        utf8proc_int32_t* dst,
        utf8proc_ssize_t bufsize,
        utf8proc_option_t options,
        int* last_boundclass);

    /**
     * Whole-string counterpart of @ref utf8proc_decompose_char: operates on
     * a complete UTF-8 string and orders the decomposed sequences correctly.
     *
     * With the @ref UTF8PROC_NULLTERM flag set in `options`, processing stops
     * when a NULL byte is encountered; otherwise `strlen` bytes are
     * processed. The result, as 32-bit Unicode codepoints, is written to the
     * buffer pointed to by `buffer`, which must contain at least `bufsize`
     * entries.
     *
     * See @ref utf8proc_decompose_custom to supply additional
     * transformations.
     *
     * @return
     *   The number of codepoints written on success, or a negative error
     *   code on failure (@ref utf8proc_errmsg).
     * @par
     *   When more codepoints would be written than `bufsize` allows, the
     *   required buffer size is returned and the buffer is overwritten with
     *   undefined data.
     */
    extern utf8proc_ssize_t utf8proc_decompose(
        const utf8proc_uint8_t* str,
        utf8proc_ssize_t strlen,
        utf8proc_int32_t* buffer,
        utf8proc_ssize_t bufsize,
        utf8proc_option_t options);

    /**
     * Variant of @ref utf8proc_decompose that additionally accepts a
     * `custom_func` mapping function. The function is called on each
     * codepoint in `str` before any other transformation, and receives the
     * `custom_data` pointer unchanged.
     *
     * A `NULL` `custom_func` is ignored. See also @ref utf8proc_map_custom.
     */
    extern utf8proc_ssize_t utf8proc_decompose_custom(
        const utf8proc_uint8_t* str,
        utf8proc_ssize_t strlen,
        utf8proc_int32_t* buffer,
        utf8proc_ssize_t bufsize,
        utf8proc_option_t options,
        utf8proc_custom_func custom_func,
        void* custom_data);

    /**
     * Normalizes, in place, the `length` codepoints pointed to by `buffer`
     * (the result is stored back into `buffer`).
     *
     * @param buffer  the (native-endian UTF-32) Unicode codepoints to re-encode.
     * @param length  the length (in codepoints) of the buffer.
     * @param options a bitwise or (`|`) of one or more of the following flags:
     * - @ref UTF8PROC_NLF2LS  - convert LF, CRLF, CR and NEL into LS
     * - @ref UTF8PROC_NLF2PS  - convert LF, CRLF, CR and NEL into PS
     * - @ref UTF8PROC_NLF2LF  - convert LF, CRLF, CR and NEL into LF
     * - @ref UTF8PROC_STRIPCC - strip or convert all non-affected control characters
     * - @ref UTF8PROC_COMPOSE - try to combine decomposed codepoints into
     *                           composite codepoints
     * - @ref UTF8PROC_STABLE  - prohibit combining characters that would
     *                           violate the Unicode versioning stability
     *
     * @return
     *   The length (in codepoints) of the normalized UTF-32 string on
     *   success, or a negative error code on failure (@ref utf8proc_errmsg).
     *
     * @warning Every entry of the array pointed to by `buffer` has to lie in
     *          the range `0x0000` to `0x10FFFF`. Otherwise, the program
     *          might crash!
     */
    extern utf8proc_ssize_t utf8proc_normalize_utf32(
        utf8proc_int32_t* buffer,
        utf8proc_ssize_t length,
        utf8proc_option_t options);

    /**
     * Re-encodes, in place, the `length` codepoints pointed to by `buffer`
     * as UTF-8 data (the result is stored back into `buffer`). The UTF-32
     * sequence can optionally be normalized before the UTF-8 conversion.
     *
     * @param buffer  the (native-endian UTF-32) Unicode codepoints to re-encode.
     * @param length  the length (in codepoints) of the buffer.
     * @param options a bitwise or (`|`) of one or more of the following flags:
     * - @ref UTF8PROC_NLF2LS    - convert LF, CRLF, CR and NEL into LS
     * - @ref UTF8PROC_NLF2PS    - convert LF, CRLF, CR and NEL into PS
     * - @ref UTF8PROC_NLF2LF    - convert LF, CRLF, CR and NEL into LF
     * - @ref UTF8PROC_STRIPCC   - strip or convert all non-affected control characters
     * - @ref UTF8PROC_COMPOSE   - try to combine decomposed codepoints into
     *                             composite codepoints
     * - @ref UTF8PROC_STABLE    - prohibit combining characters that would
     *                             violate the Unicode versioning stability
     * - @ref UTF8PROC_CHARBOUND - insert 0xFF bytes before each grapheme cluster
     *
     * @return
     *   The length (in bytes) of the resulting nul-terminated UTF-8 string
     *   on success, or a negative error code on failure
     *   (@ref utf8proc_errmsg).
     *
     * @warning The amount of free space pointed to by `buffer` must exceed
     *          the amount of the input data by one byte, and every entry of
     *          the array pointed to by `buffer` has to lie in the range
     *          `0x0000` to `0x10FFFF`. Otherwise, the program might crash!
     */
    extern utf8proc_ssize_t utf8proc_reencode(
        utf8proc_int32_t* buffer,
        utf8proc_ssize_t length,
        utf8proc_option_t options);

    /**
     * Tells whether a grapheme break is permitted between two consecutive
     * codepoints, as defined by the extended grapheme clusters in UAX#29.
     *
     * @param state
     *   Beginning with Version 29 (Unicode 9.0.0), the algorithm needs state
     *   to break graphemes. Pass a pointer to that state here; it should
     *   initially be set to 0. When a null pointer is passed, the UAX#29
     *   rules GB10/12/13, which require this state, are not applied, which
     *   essentially matches the rules in Unicode 8.0.0.
     *
     * @warning When the state parameter is used,
     *          `utf8proc_grapheme_break_stateful` must be called IN ORDER on
     *          ALL potential breaks in a string.
     */
    extern utf8proc_bool utf8proc_grapheme_break_stateful(
        utf8proc_int32_t codepoint1,
        utf8proc_int32_t codepoint2,
        utf8proc_int32_t* state);

    /**
     * Stateless form of @ref utf8proc_grapheme_break_stateful, lacking
     * support for the Unicode 9 additions to the algorithm. Supported for
     * legacy reasons.
     */
    extern utf8proc_bool utf8proc_grapheme_break(
        utf8proc_int32_t codepoint1,
        utf8proc_int32_t codepoint2);

    /**
     * Maps codepoint `c` to its lower-case counterpart.
     *
     * @return
     *   The codepoint of the corresponding lower-case character, or `c`
     *   itself when there is no lower-case variant or `c` is not a valid
     *   codepoint.
     */
    extern utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c);

    /**
     * Maps codepoint `c` to its upper-case counterpart.
     *
     * @return
     *   The codepoint of the corresponding upper-case character, or `c`
     *   itself when there is no upper-case variant or `c` is not a valid
     *   codepoint.
     */
    extern utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c);

    /**
     * Maps codepoint `c` to its title-case counterpart.
     *
     * @return
     *   The codepoint of the corresponding title-case character, or `c`
     *   itself when there is no title-case variant or `c` is not a valid
     *   codepoint.
     */
    extern utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c);

    /**
     * Character width of a codepoint, analogous to `wcwidth(codepoint)`
     * except that non-printable codepoints yield a width of 0 rather than
     * the -1 that `wcwidth` gives.
     *
     * @note
     *   To check for particular types of non-printable characters
     *   (analogous to `isprint` or `iscntrl`), use @ref utf8proc_category.
     */
    extern int utf8proc_charwidth(utf8proc_int32_t codepoint);

    /**
     * Unicode category of the codepoint, as one of the
     * @ref utf8proc_category_t constants.
     */
    extern utf8proc_category_t utf8proc_category(utf8proc_int32_t codepoint);

    /**
     * Two-letter, nul-terminated Unicode category string of the codepoint
     * (e.g. `"Lu"` or `"Co"`).
     */
    extern const(char)* utf8proc_category_string(utf8proc_int32_t codepoint);

    /**
     * Maps the UTF-8 string pointed to by `str` to a new UTF-8 string that
     * is allocated dynamically by `malloc` and handed back through `dstptr`.
     *
     * With the @ref UTF8PROC_NULLTERM flag set in `options`, the input
     * length is determined by a NULL terminator; otherwise `strlen` gives
     * the string length. Either way the result is NULL terminated, though it
     * may contain NULL characters within the string if `str` contained NULL
     * characters. The remaining flags in `options` are passed to the
     * functions declared above and interpreted as described there. See also
     * @ref utf8proc_map_custom to supply a custom codepoint transformation.
     *
     * @return
     *   The length of the new string on success, or a negative error code
     *   on failure.
     *
     * @note The memory of the new UTF-8 string has been allocated with
     *       `malloc` and should therefore be deallocated with `free`.
     */
    extern utf8proc_ssize_t utf8proc_map(
        const utf8proc_uint8_t* str,
        utf8proc_ssize_t strlen,
        utf8proc_uint8_t** dstptr,
        utf8proc_option_t options);

    /**
     * Variant of @ref utf8proc_map that additionally accepts a `custom_func`
     * mapping function. The function is called on each codepoint in `str`
     * before any other transformation, and receives the `custom_data`
     * pointer unchanged.
     *
     * A `NULL` `custom_func` is ignored.
     */
    extern utf8proc_ssize_t utf8proc_map_custom(
        const utf8proc_uint8_t* str,
        utf8proc_ssize_t strlen,
        utf8proc_uint8_t** dstptr,
        utf8proc_option_t options,
        utf8proc_custom_func custom_func,
        void* custom_data);

    /** @name Unicode normalization
     *
     * Each function returns a pointer to newly allocated memory holding the
     * NFD, NFC, NFKD or NFKC normalized version of the null-terminated
     * string `str`. They are shortcuts for calling @ref utf8proc_map with
     * @ref UTF8PROC_NULLTERM combined with @ref UTF8PROC_STABLE and the
     * flags that select the normalization form.
     */
    /** @{ */
    /** NFD normalization (@ref UTF8PROC_DECOMPOSE). */
    extern utf8proc_uint8_t* utf8proc_NFD(const utf8proc_uint8_t* str);
    /** NFC normalization (@ref UTF8PROC_COMPOSE). */
    extern utf8proc_uint8_t* utf8proc_NFC(const utf8proc_uint8_t* str);
    /** NFKD normalization (@ref UTF8PROC_DECOMPOSE and @ref UTF8PROC_COMPAT). */
    extern utf8proc_uint8_t* utf8proc_NFKD(const utf8proc_uint8_t* str);
    /** NFKC normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */
    extern utf8proc_uint8_t* utf8proc_NFKC(const utf8proc_uint8_t* str);
    /** @} */
}