derelict.utf8proc.statfun source code

1 module derelict.utf8proc.statfun;
2 
3 version (Derelict_Static) version = DerelictUTF8Proc_Static;
4 version (DerelictUTF8Proc_Static)  : public import derelict.utf8proc.types;
5 
6 extern (C) @nogc nothrow
7 {
8 	/**
9  * Array containing the byte lengths of a UTF-8 encoded codepoint based
10  * on the first byte.
11  */
12 	extern const utf8proc_int8_t[256] utf8proc_utf8class;
13 
14 	/**
15  * Returns the utf8proc API version as a string MAJOR.MINOR.PATCH
16  * (http://semver.org format), possibly with a "-dev" suffix for
17  * development versions.
18  */
19 	extern const(char)* utf8proc_version();
20 
21 	/**
22  * Returns an informative error string for the given utf8proc error code
23  * (e.g. the error codes returned by @ref utf8proc_map).
24  */
25 	extern const(char)* utf8proc_errmsg(utf8proc_ssize_t errcode);
26 
27 	/**
28  * Reads a single codepoint from the UTF-8 sequence being pointed to by `str`.
29  * The maximum number of bytes read is `strlen`, unless `strlen` is
30  * negative (in which case up to 4 bytes are read).
31  *
32  * If a valid codepoint could be read, it is stored in the variable
33  * pointed to by `codepoint_ref`, otherwise that variable will be set to -1.
34  * In case of success, the number of bytes read is returned; otherwise, a
35  * negative error code is returned.
36  */
37 	extern utf8proc_ssize_t utf8proc_iterate(const utf8proc_uint8_t* str,
38 			utf8proc_ssize_t strlen, utf8proc_int32_t* codepoint_ref);
39 
40 	/**
41  * Check if a codepoint is valid (regardless of whether it has been
42  * assigned a value by the current Unicode standard).
43  *
44  * @return 1 if the given `codepoint` is valid and otherwise return 0.
45  */
46 	extern utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t codepoint);
47 
48 	/**
49  * Encodes the codepoint as a UTF-8 string in the byte array pointed
50  * to by `dst`. This array must be at least 4 bytes long.
51  *
52  * In case of success the number of bytes written is returned, and
53  * otherwise 0 is returned.
54  *
55  * This function does not check whether `codepoint` is valid Unicode.
56  */
57 	extern utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t codepoint, utf8proc_uint8_t* dst);
58 
59 	/**
60  * Look up the properties for a given codepoint.
61  *
62  * @param codepoint The Unicode codepoint.
63  *
64  * @returns
65  * A pointer to a (constant) struct containing information about
66  * the codepoint.
67  * @par
68  * If the codepoint is unassigned or invalid, a pointer to a special struct is
69  * returned in which `category` is 0 (@ref UTF8PROC_CATEGORY_CN).
70  */
71 	extern const(utf8proc_property_t)* utf8proc_get_property(utf8proc_int32_t codepoint);
72 
73 	/** Decompose a codepoint into an array of codepoints.
74  *
75  * @param codepoint the codepoint.
76  * @param dst the destination buffer.
77  * @param bufsize the size of the destination buffer.
78  * @param options one or more of the following flags:
79  * - @ref UTF8PROC_REJECTNA  - return an error if `codepoint` is unassigned
80  * - @ref UTF8PROC_IGNORE    - strip "default ignorable" codepoints
81  * - @ref UTF8PROC_CASEFOLD  - apply Unicode casefolding
82  * - @ref UTF8PROC_COMPAT    - replace certain codepoints with their
83  *                             compatibility decomposition
84  * - @ref UTF8PROC_CHARBOUND - insert 0xFF bytes before each grapheme cluster
85  * - @ref UTF8PROC_LUMP      - lump certain different codepoints together
86  * - @ref UTF8PROC_STRIPMARK - remove all character marks
87  * @param last_boundclass
88  * Pointer to an integer variable containing
89  * the previous codepoint's boundary class if the @ref UTF8PROC_CHARBOUND
90  * option is used.  Otherwise, this parameter is ignored.
91  *
92  * @return
93  * In case of success, the number of codepoints written is returned; in case
94  * of an error, a negative error code is returned (@ref utf8proc_errmsg).
95  * @par
96  * If the number of written codepoints would be bigger than `bufsize`, the
97  * required buffer size is returned, while the buffer will be overwritten with
98  * undefined data.
99  */
100 	extern utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t codepoint,
101 			utf8proc_int32_t* dst, utf8proc_ssize_t bufsize, utf8proc_option_t options,
102 			int* last_boundclass);
103 
104 	/**
105  * The same as @ref utf8proc_decompose_char, but acts on a whole UTF-8
106  * string and orders the decomposed sequences correctly.
107  *
108  * If the @ref UTF8PROC_NULLTERM flag in `options` is set, processing
109  * will be stopped when a NULL byte is encountered, otherwise `strlen`
110  * bytes are processed.  The result (in the form of 32-bit unicode
111  * codepoints) is written into the buffer being pointed to by
112  * `buffer` (which must contain at least `bufsize` entries).  In case of
113  * success, the number of codepoints written is returned; in case of an
114  * error, a negative error code is returned (@ref utf8proc_errmsg).
115  * See @ref utf8proc_decompose_custom to supply additional transformations.
116  *
117  * If the number of written codepoints would be bigger than `bufsize`, the
118  * required buffer size is returned, while the buffer will be overwritten with
119  * undefined data.
120  */
121 	extern utf8proc_ssize_t utf8proc_decompose(const utf8proc_uint8_t* str, utf8proc_ssize_t strlen,
122 			utf8proc_int32_t* buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options);
123 
124 	/**
125  * The same as @ref utf8proc_decompose, but also takes a `custom_func` mapping function
126  * that is called on each codepoint in `str` before any other transformations
127  * (along with a `custom_data` pointer that is passed through to `custom_func`).
128  * The `custom_func` argument is ignored if it is `NULL`.  See also @ref utf8proc_map_custom.
129  */
130 	extern utf8proc_ssize_t utf8proc_decompose_custom(const utf8proc_uint8_t* str,
131 			utf8proc_ssize_t strlen, utf8proc_int32_t* buffer,
132 			utf8proc_ssize_t bufsize, utf8proc_option_t options,
133 			utf8proc_custom_func custom_func, void* custom_data);
134 
135 	/**
136  * Normalizes the sequence of `length` codepoints pointed to by `buffer`
137  * in-place (i.e., the result is also stored in `buffer`).
138  *
139  * @param buffer the (native-endian UTF-32) unicode codepoints to normalize.
140  * @param length the length (in codepoints) of the buffer.
141  * @param options a bitwise or (`|`) of one or more of the following flags:
142  * - @ref UTF8PROC_NLF2LS  - convert LF, CRLF, CR and NEL into LS
143  * - @ref UTF8PROC_NLF2PS  - convert LF, CRLF, CR and NEL into PS
144  * - @ref UTF8PROC_NLF2LF  - convert LF, CRLF, CR and NEL into LF
145  * - @ref UTF8PROC_STRIPCC - strip or convert all non-affected control characters
146  * - @ref UTF8PROC_COMPOSE - try to combine decomposed codepoints into composite
147  *                           codepoints
148  * - @ref UTF8PROC_STABLE  - prohibit combining characters that would violate
149  *                           the unicode versioning stability
150  *
151  * @return
152  * In case of success, the length (in codepoints) of the normalized UTF-32 string is
153  * returned; otherwise, a negative error code is returned (@ref utf8proc_errmsg).
154  *
155  * @warning The entries of the array pointed to by `buffer` have to be in the
156  *          range `0x0000` to `0x10FFFF`. Otherwise, the program might crash!
157  */
158 	extern utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t* buffer,
159 			utf8proc_ssize_t length, utf8proc_option_t options);
160 
161 	/**
162  * Reencodes the sequence of `length` codepoints pointed to by `buffer`
163  * as UTF-8 data in-place (i.e., the result is also stored in `buffer`).
164  * Can optionally normalize the UTF-32 sequence prior to UTF-8 conversion.
165  *
166  * @param buffer the (native-endian UTF-32) unicode codepoints to re-encode.
167  * @param length the length (in codepoints) of the buffer.
168  * @param options a bitwise or (`|`) of one or more of the following flags:
169  * - @ref UTF8PROC_NLF2LS  - convert LF, CRLF, CR and NEL into LS
170  * - @ref UTF8PROC_NLF2PS  - convert LF, CRLF, CR and NEL into PS
171  * - @ref UTF8PROC_NLF2LF  - convert LF, CRLF, CR and NEL into LF
172  * - @ref UTF8PROC_STRIPCC - strip or convert all non-affected control characters
173  * - @ref UTF8PROC_COMPOSE - try to combine decomposed codepoints into composite
174  *                           codepoints
175  * - @ref UTF8PROC_STABLE  - prohibit combining characters that would violate
176  *                           the unicode versioning stability
177  * - @ref UTF8PROC_CHARBOUND - insert 0xFF bytes before each grapheme cluster
178  *
179  * @return
180  * In case of success, the length (in bytes) of the resulting nul-terminated
181  * UTF-8 string is returned; otherwise, a negative error code is returned
182  * (@ref utf8proc_errmsg).
183  *
184  * @warning The amount of free space pointed to by `buffer` must
185  *          exceed the amount of the input data by one byte, and the
186  *          entries of the array pointed to by `buffer` have to be in the
187  *          range `0x0000` to `0x10FFFF`. Otherwise, the program might crash!
188  */
189 	extern utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t* buffer,
190 			utf8proc_ssize_t length, utf8proc_option_t options);
191 
192 	/**
193  * Given a pair of consecutive codepoints, return whether a grapheme break is
194  * permitted between them (as defined by the extended grapheme clusters in UAX#29).
195  *
196  * @param state Beginning with Version 29 (Unicode 9.0.0), this algorithm requires
197  *              state to break graphemes. This state can be passed in as a pointer
198  *              in the `state` argument and should initially be set to 0. If the
199  *              state is not passed in (i.e. a null pointer is passed), UAX#29 rules
200  *              GB10/12/13 which require this state will not be applied, essentially
201  *              matching the rules in Unicode 8.0.0.
202  *
203  * @warning If the state parameter is used, `utf8proc_grapheme_break_stateful` must
204  *          be called IN ORDER on ALL potential breaks in a string.
205  */
206 	extern utf8proc_bool utf8proc_grapheme_break_stateful(utf8proc_int32_t codepoint1,
207 			utf8proc_int32_t codepoint2, utf8proc_int32_t* state);
208 
209 	/**
210  * Same as @ref utf8proc_grapheme_break_stateful, except without support for the
211  * Unicode 9 additions to the algorithm. Supported for legacy reasons.
212  */
213 	extern utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t codepoint1,
214 			utf8proc_int32_t codepoint2);
215 
216 	/**
217  * Given a codepoint `c`, return the codepoint of the corresponding
218  * lower-case character, if any; otherwise (if there is no lower-case
219  * variant, or if `c` is not a valid codepoint) return `c`.
220  */
221 	extern utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c);
222 
223 	/**
224  * Given a codepoint `c`, return the codepoint of the corresponding
225  * upper-case character, if any; otherwise (if there is no upper-case
226  * variant, or if `c` is not a valid codepoint) return `c`.
227  */
228 	extern utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c);
229 
230 	/**
231  * Given a codepoint `c`, return the codepoint of the corresponding
232  * title-case character, if any; otherwise (if there is no title-case
233  * variant, or if `c` is not a valid codepoint) return `c`.
234  */
235 	extern utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c);
236 
237 	/**
238  * Given a codepoint, return a character width analogous to `wcwidth(codepoint)`,
239  * except that a width of 0 is returned for non-printable codepoints
240  * instead of -1 as in `wcwidth`.
241  *
242  * @note
243  * If you want to check for particular types of non-printable characters,
244  * (analogous to `isprint` or `iscntrl`), use @ref utf8proc_category. */
245 	extern int utf8proc_charwidth(utf8proc_int32_t codepoint);
246 
247 	/**
248  * Return the Unicode category for the codepoint (one of the
249  * @ref utf8proc_category_t constants.)
250  */
251 	extern utf8proc_category_t utf8proc_category(utf8proc_int32_t codepoint);
252 
253 	/**
254  * Return the two-letter (nul-terminated) Unicode category string for
255  * the codepoint (e.g. `"Lu"` or `"Co"`).
256  */
257 	extern const(char)* utf8proc_category_string(utf8proc_int32_t codepoint);
258 
259 	/**
260  * Maps the given UTF-8 string pointed to by `str` to a new UTF-8
261  * string, allocated dynamically by `malloc` and returned via `dstptr`.
262  *
263  * If the @ref UTF8PROC_NULLTERM flag in the `options` field is set,
264  * the length is determined by a NULL terminator, otherwise the
265  * parameter `strlen` is evaluated to determine the string length, but
266  * in any case the result will be NULL terminated (though it might
267  * contain NULL characters within the string if `str` contained NULL
268  * characters). Other flags in the `options` field are passed to the
269  * functions defined above, and regarded as described.  See also
270  * @ref utf8proc_map_custom to supply a custom codepoint transformation.
271  *
272  * In case of success the length of the new string is returned,
273  * otherwise a negative error code is returned.
274  *
275  * @note The memory of the new UTF-8 string will have been allocated
276  * with `malloc`, and should therefore be deallocated with `free`.
277  */
278 	extern utf8proc_ssize_t utf8proc_map(const utf8proc_uint8_t* str,
279 			utf8proc_ssize_t strlen, utf8proc_uint8_t** dstptr, utf8proc_option_t options);
280 
281 	/**
282  * Like @ref utf8proc_map, but also takes a `custom_func` mapping function
283  * that is called on each codepoint in `str` before any other transformations
284  * (along with a `custom_data` pointer that is passed through to `custom_func`).
285  * The `custom_func` argument is ignored if it is `NULL`.
286  */
287 	extern utf8proc_ssize_t utf8proc_map_custom(const utf8proc_uint8_t* str,
288 			utf8proc_ssize_t strlen, utf8proc_uint8_t** dstptr,
289 			utf8proc_option_t options, utf8proc_custom_func custom_func, void* custom_data);
290 
291 	/** @name Unicode normalization
292  *
293  * Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC
294  * normalized version of the null-terminated string `str`.  These
295  * are shortcuts to calling @ref utf8proc_map with @ref UTF8PROC_NULLTERM
296  * combined with @ref UTF8PROC_STABLE and flags indicating the normalization.
297  */
298 	/** @{ */
299 	/** NFD normalization (@ref UTF8PROC_DECOMPOSE). */
300 	extern utf8proc_uint8_t* utf8proc_NFD(const utf8proc_uint8_t* str);
301 	/** NFC normalization (@ref UTF8PROC_COMPOSE). */
302 	extern utf8proc_uint8_t* utf8proc_NFC(const utf8proc_uint8_t* str);
303 	/** NFKD normalization (@ref UTF8PROC_DECOMPOSE and @ref UTF8PROC_COMPAT). */
304 	extern utf8proc_uint8_t* utf8proc_NFKD(const utf8proc_uint8_t* str);
305 	/** NFKC normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */
306 	extern utf8proc_uint8_t* utf8proc_NFKC(const utf8proc_uint8_t* str);
307 	/** @} */
308 }