utf-8.cpp@ 40091

Last change on this file since 40091 was 40091, checked in by vboxsync, 13 years ago
Runtime/strings: add Utf-8 and Utf-16 sanitising to a white list of characters. Do not validate the string encoding in advance.
Property svn:eol-style set to `native` Property svn:keywords set to `Id`
File size: 54.4 KB

Line
1	/* $Id: utf-8.cpp 40091 2012-02-13 10:14:00Z vboxsync $ */
2	/** @file
3	* IPRT - UTF-8 Decoding.
4	*/
5
6	/*
7	* Copyright (C) 2006-2010 Oracle Corporation
8	*
9	* This file is part of VirtualBox Open Source Edition (OSE), as
10	* available from http://www.215389.xyz. This file is free software;
11	* you can redistribute it and/or modify it under the terms of the GNU
12	* General Public License (GPL) as published by the Free Software
13	* Foundation, in version 2 as it comes in the "COPYING" file of the
14	* VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15	* hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16	*
17	* The contents of this file may alternatively be used under the terms
18	* of the Common Development and Distribution License Version 1.0
19	* (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20	* VirtualBox OSE distribution, in which case the provisions of the
21	* CDDL are applicable instead of those of the GPL.
22	*
23	* You may elect to license modified versions of this file under the
24	* terms and conditions of either the GPL or the CDDL or both.
25	*/
26
27
28	/*******************************************************************************
29	* Header Files *
30	*******************************************************************************/
31	#include <iprt/string.h>
32	#include "internal/iprt.h"
33
34	#include <iprt/uni.h>
35	#include <iprt/alloc.h>
36	#include <iprt/assert.h>
37	#include <iprt/err.h>
38	#include "internal/string.h"
39
40
41
42	/**
43	* Get get length in code points of a UTF-8 encoded string.
44	* The string is validated while doing this.
45	*
46	* @returns IPRT status code.
47	* @param psz Pointer to the UTF-8 string.
48	* @param cch The max length of the string. (btw cch = cb)
49	* Use RTSTR_MAX if all of the string is to be examined.
50	* @param pcuc Where to store the length in unicode code points.
51	* @param pcchActual Where to store the actual size of the UTF-8 string
52	* on success (cch = cb again). Optional.
53	*/
54	DECLHIDDEN(int) rtUtf8Length(const char psz, size_t cch, size_t pcuc, size_t *pcchActual)
55	{
56	const unsigned char puch = (const unsigned char )psz;
57	size_t cCodePoints = 0;
58	while (cch > 0)
59	{
60	const unsigned char uch = *puch;
61	if (!uch)
62	break;
63	if (uch & RT_BIT(7))
64	{
65	/* figure sequence length and validate the first byte */
66	/** @todo RT_USE_RTC_3629 */
67	unsigned cb;
68	if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
69	cb = 2;
70	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
71	cb = 3;
72	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)))
73	cb = 4;
74	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3)))
75	cb = 5;
76	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2) \| RT_BIT(1))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2)))
77	cb = 6;
78	else
79	{
80	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
81	return VERR_INVALID_UTF8_ENCODING;
82	}
83
84	/* check length */
85	if (cb > cch)
86	{
87	RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
88	return VERR_INVALID_UTF8_ENCODING;
89	}
90
91	/* validate the rest */
92	switch (cb)
93	{
94	case 6:
95	RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
96	case 5:
97	RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
98	case 4:
99	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
100	case 3:
101	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
102	case 2:
103	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
104	break;
105	}
106
107	/* validate the code point. */
108	RTUNICP uc;
109	switch (cb)
110	{
111	case 6:
112	uc = (puch[5] & 0x3f)
113	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
114	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
115	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
116	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
117	\| ((RTUNICP)(uch & 0x01) << 30);
118	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
119	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
120	break;
121	case 5:
122	uc = (puch[4] & 0x3f)
123	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
124	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
125	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
126	\| ((RTUNICP)(uch & 0x03) << 24);
127	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
128	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
129	break;
130	case 4:
131	uc = (puch[3] & 0x3f)
132	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
133	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
134	\| ((RTUNICP)(uch & 0x07) << 18);
135	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
136	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
137	break;
138	case 3:
139	uc = (puch[2] & 0x3f)
140	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
141	\| ((RTUNICP)(uch & 0x0f) << 12);
142	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
143	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
144	uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
145	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
146	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
147	break;
148	case 2:
149	uc = (puch[1] & 0x3f)
150	\| ((RTUNICP)(uch & 0x1f) << 6);
151	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
152	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
153	break;
154	}
155
156	/* advance */
157	cch -= cb;
158	puch += cb;
159	}
160	else
161	{
162	/* one ASCII byte */
163	puch++;
164	cch--;
165	}
166	cCodePoints++;
167	}
168
169	/* done */
170	*pcuc = cCodePoints;
171	if (pcchActual)
172	pcchActual = puch - (unsigned char const )psz;
173	return VINF_SUCCESS;
174	}
175
176
177	/**
178	* Decodes and UTF-8 string into an array of unicode code point.
179	*
180	* Since we know the input is valid, we do not perform encoding or length checks.
181	*
182	* @returns iprt status code.
183	* @param psz The UTF-8 string to recode. This is a valid encoding.
184	* @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
185	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
186	* @param paCps Where to store the code points array.
187	* @param cCps The number of RTUNICP items the paCps buffer can hold, excluding the terminator ('\\0').
188	*/
189	static int rtUtf8Decode(const char *psz, size_t cch, PRTUNICP paCps, size_t cCps)
190	{
191	int rc = VINF_SUCCESS;
192	const unsigned char puch = (const unsigned char )psz;
193	PRTUNICP pCp = paCps;
194	while (cch > 0)
195	{
196	/* read the next char and check for terminator. */
197	const unsigned char uch = *puch;
198	if (!uch)
199	break;
200
201	/* check for output overflow */
202	if (RT_UNLIKELY(cCps < 1))
203	{
204	rc = VERR_BUFFER_OVERFLOW;
205	break;
206	}
207	cCps--;
208
209	/* decode and recode the code point */
210	if (!(uch & RT_BIT(7)))
211	{
212	*pCp++ = uch;
213	puch++;
214	cch--;
215	}
216	#ifdef RT_STRICT
217	else if (!(uch & RT_BIT(6)))
218	AssertMsgFailed(("Internal error!\n"));
219	#endif
220	else if (!(uch & RT_BIT(5)))
221	{
222	*pCp++ = (puch[1] & 0x3f)
223	\| ((uint16_t)(uch & 0x1f) << 6);
224	puch += 2;
225	cch -= 2;
226	}
227	else if (!(uch & RT_BIT(4)))
228	{
229	*pCp++ = (puch[2] & 0x3f)
230	\| ((uint16_t)(puch[1] & 0x3f) << 6)
231	\| ((uint16_t)(uch & 0x0f) << 12);
232	puch += 3;
233	cch -= 3;
234	}
235	else if (!(uch & RT_BIT(3)))
236	{
237	*pCp++ = (puch[3] & 0x3f)
238	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
239	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
240	\| ((RTUNICP)(uch & 0x07) << 18);
241	puch += 4;
242	cch -= 4;
243	}
244	else if (!(uch & RT_BIT(2)))
245	{
246	*pCp++ = (puch[4] & 0x3f)
247	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
248	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
249	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
250	\| ((RTUNICP)(uch & 0x03) << 24);
251	puch += 5;
252	cch -= 6;
253	}
254	else
255	{
256	Assert(!(uch & RT_BIT(1)));
257	*pCp++ = (puch[5] & 0x3f)
258	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
259	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
260	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
261	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
262	\| ((RTUNICP)(uch & 0x01) << 30);
263	puch += 6;
264	cch -= 6;
265	}
266	}
267
268	/* done */
269	*pCp = 0;
270	return rc;
271	}
272
273
274	RTDECL(size_t) RTStrUniLen(const char *psz)
275	{
276	size_t cCodePoints;
277	int rc = rtUtf8Length(psz, RTSTR_MAX, &cCodePoints, NULL);
278	return RT_SUCCESS(rc) ? cCodePoints : 0;
279	}
280	RT_EXPORT_SYMBOL(RTStrUniLen);
281
282
283	RTDECL(int) RTStrUniLenEx(const char psz, size_t cch, size_t pcCps)
284	{
285	size_t cCodePoints;
286	int rc = rtUtf8Length(psz, cch, &cCodePoints, NULL);
287	if (pcCps)
288	*pcCps = RT_SUCCESS(rc) ? cCodePoints : 0;
289	return rc;
290	}
291	RT_EXPORT_SYMBOL(RTStrUniLenEx);
292
293
294	RTDECL(int) RTStrValidateEncoding(const char *psz)
295	{
296	return RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
297	}
298	RT_EXPORT_SYMBOL(RTStrValidateEncoding);
299
300
301	RTDECL(int) RTStrValidateEncodingEx(const char *psz, size_t cch, uint32_t fFlags)
302	{
303	AssertReturn(!(fFlags & ~(RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)), VERR_INVALID_PARAMETER);
304	AssertPtr(psz);
305
306	/*
307	* Use rtUtf8Length for the job.
308	*/
309	size_t cchActual;
310	size_t cCpsIgnored;
311	int rc = rtUtf8Length(psz, cch, &cCpsIgnored, &cchActual);
312	if (RT_SUCCESS(rc))
313	{
314	if ( (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
315	&& cchActual >= cch)
316	rc = VERR_BUFFER_OVERFLOW;
317	}
318	return rc;
319	}
320	RT_EXPORT_SYMBOL(RTStrValidateEncodingEx);
321
322
323	RTDECL(bool) RTStrIsValidEncoding(const char *psz)
324	{
325	int rc = RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
326	return RT_SUCCESS(rc);
327	}
328	RT_EXPORT_SYMBOL(RTStrIsValidEncoding);
329
330
331	RTDECL(size_t) RTStrPurgeEncoding(char *psz)
332	{
333	size_t cErrors = 0;
334	for (;;)
335	{
336	RTUNICP Cp;
337	int rc = RTStrGetCpEx((const char **)&psz, &Cp);
338	if (RT_SUCCESS(rc))
339	{
340	if (!Cp)
341	break;
342	}
343	else
344	{
345	psz[-1] = '?';
346	cErrors++;
347	}
348	}
349	return cErrors;
350	}
351	RT_EXPORT_SYMBOL(RTStrPurgeEncoding);
352
353
354	RTDECL(ssize_t) RTStrPurgeComplementSet(char *psz, PCRTUNICP puszValidSet, char chReplacement)
355	{
356	size_t cReplacements = 0;
357	AssertReturn(chReplacement && (unsigned)chReplacement < 128, -1);
358	for (;;)
359	{
360	RTUNICP Cp;
361	PCRTUNICP pCp;
362	char *pszOld = psz;
363	if (RT_FAILURE(RTStrGetCpEx((const char **)&psz, &Cp)))
364	return -1;
365	if (!Cp)
366	break;
367	for (pCp = puszValidSet; ; ++pCp)
368	if (!pCp \|\| pCp == Cp)
369	break;
370	if (!*pCp)
371	{
372	for (; pszOld != psz; ++pszOld)
373	*pszOld = chReplacement;
374	++cReplacements;
375	}
376	}
377	return cReplacements;
378	}
379	RT_EXPORT_SYMBOL(RTStrPurgeComplementSet);
380
381
382	RTDECL(int) RTStrToUni(const char pszString, PRTUNICP ppaCps)
383	{
384	/*
385	* Validate input.
386	*/
387	Assert(VALID_PTR(pszString));
388	Assert(VALID_PTR(ppaCps));
389	*ppaCps = NULL;
390
391	/*
392	* Validate the UTF-8 input and count its code points.
393	*/
394	size_t cCps;
395	int rc = rtUtf8Length(pszString, RTSTR_MAX, &cCps, NULL);
396	if (RT_SUCCESS(rc))
397	{
398	/*
399	* Allocate buffer.
400	*/
401	PRTUNICP paCps = (PRTUNICP)RTMemAlloc((cCps + 1) * sizeof(RTUNICP));
402	if (paCps)
403	{
404	/*
405	* Decode the string.
406	*/
407	rc = rtUtf8Decode(pszString, RTSTR_MAX, paCps, cCps);
408	if (RT_SUCCESS(rc))
409	{
410	*ppaCps = paCps;
411	return rc;
412	}
413	RTMemFree(paCps);
414	}
415	else
416	rc = VERR_NO_CODE_POINT_MEMORY;
417	}
418	return rc;
419	}
420	RT_EXPORT_SYMBOL(RTStrToUni);
421
422
423	RTDECL(int) RTStrToUniEx(const char pszString, size_t cchString, PRTUNICP ppaCps, size_t cCps, size_t *pcCps)
424	{
425	/*
426	* Validate input.
427	*/
428	Assert(VALID_PTR(pszString));
429	Assert(VALID_PTR(ppaCps));
430	Assert(!pcCps \|\| VALID_PTR(pcCps));
431
432	/*
433	* Validate the UTF-8 input and count the code points.
434	*/
435	size_t cCpsResult;
436	int rc = rtUtf8Length(pszString, cchString, &cCpsResult, NULL);
437	if (RT_SUCCESS(rc))
438	{
439	if (pcCps)
440	*pcCps = cCpsResult;
441
442	/*
443	* Check buffer size / Allocate buffer.
444	*/
445	bool fShouldFree;
446	PRTUNICP paCpsResult;
447	if (cCps > 0 && *ppaCps)
448	{
449	fShouldFree = false;
450	if (cCps <= cCpsResult)
451	return VERR_BUFFER_OVERFLOW;
452	paCpsResult = *ppaCps;
453	}
454	else
455	{
456	*ppaCps = NULL;
457	fShouldFree = true;
458	cCps = RT_MAX(cCpsResult + 1, cCps);
459	paCpsResult = (PRTUNICP)RTMemAlloc(cCps * sizeof(RTUNICP));
460	}
461	if (paCpsResult)
462	{
463	/*
464	* Encode the UTF-16 string.
465	*/
466	rc = rtUtf8Decode(pszString, cchString, paCpsResult, cCps - 1);
467	if (RT_SUCCESS(rc))
468	{
469	*ppaCps = paCpsResult;
470	return rc;
471	}
472	if (fShouldFree)
473	RTMemFree(paCpsResult);
474	}
475	else
476	rc = VERR_NO_CODE_POINT_MEMORY;
477	}
478	return rc;
479	}
480	RT_EXPORT_SYMBOL(RTStrToUniEx);
481
482
483	/**
484	* Calculates the UTF-16 length of a string, validating the encoding while doing so.
485	*
486	* @returns IPRT status code.
487	* @param psz Pointer to the UTF-8 string.
488	* @param cch The max length of the string. (btw cch = cb)
489	* Use RTSTR_MAX if all of the string is to be examined.
490	* @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters.
491	*/
492	static int rtUtf8CalcUtf16Length(const char psz, size_t cch, size_t pcwc)
493	{
494	const unsigned char puch = (const unsigned char )psz;
495	size_t cwc = 0;
496	while (cch > 0)
497	{
498	const unsigned char uch = *puch;
499	if (!uch)
500	break;
501	if (!(uch & RT_BIT(7)))
502	{
503	/* one ASCII byte */
504	cwc++;
505	puch++;
506	cch--;
507	}
508	else
509	{
510	/* figure sequence length and validate the first byte */
511	unsigned cb;
512	if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
513	cb = 2;
514	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
515	cb = 3;
516	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)))
517	cb = 4;
518	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3)))
519	cb = 5;
520	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2) \| RT_BIT(1))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2)))
521	cb = 6;
522	else
523	{
524	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
525	return VERR_INVALID_UTF8_ENCODING;
526	}
527
528	/* check length */
529	if (cb > cch)
530	{
531	RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
532	return VERR_INVALID_UTF8_ENCODING;
533	}
534
535	/* validate the rest */
536	switch (cb)
537	{
538	case 6:
539	RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
540	case 5:
541	RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
542	case 4:
543	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
544	case 3:
545	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
546	case 2:
547	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
548	break;
549	}
550
551	/* validate the code point. */
552	RTUNICP uc;
553	switch (cb)
554	{
555	case 6:
556	uc = (puch[5] & 0x3f)
557	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
558	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
559	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
560	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
561	\| ((RTUNICP)(uch & 0x01) << 30);
562	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
563	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
564	RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
565	return VERR_CANT_RECODE_AS_UTF16;
566	case 5:
567	uc = (puch[4] & 0x3f)
568	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
569	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
570	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
571	\| ((RTUNICP)(uch & 0x03) << 24);
572	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
573	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
574	RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
575	return VERR_CANT_RECODE_AS_UTF16;
576	case 4:
577	uc = (puch[3] & 0x3f)
578	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
579	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
580	\| ((RTUNICP)(uch & 0x07) << 18);
581	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
582	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
583	RTStrAssertMsgReturn(uc <= 0x0010ffff,
584	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
585	cwc++;
586	break;
587	case 3:
588	uc = (puch[2] & 0x3f)
589	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
590	\| ((RTUNICP)(uch & 0x0f) << 12);
591	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
592	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
593	uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
594	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
595	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
596	break;
597	case 2:
598	uc = (puch[1] & 0x3f)
599	\| ((RTUNICP)(uch & 0x1f) << 6);
600	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
601	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
602	break;
603	}
604
605	/* advance */
606	cch -= cb;
607	puch += cb;
608	cwc++;
609	}
610	}
611
612	/* done */
613	*pcwc = cwc;
614	return VINF_SUCCESS;
615	}
616
617
618	/**
619	* Recodes a valid UTF-8 string as UTF-16.
620	*
621	* Since we know the input is valid, we do not perform encoding or length checks.
622	*
623	* @returns iprt status code.
624	* @param psz The UTF-8 string to recode. This is a valid encoding.
625	* @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
626	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
627	* @param pwsz Where to store the UTF-16 string.
628	* @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
629	*/
630	static int rtUtf8RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc)
631	{
632	int rc = VINF_SUCCESS;
633	const unsigned char puch = (const unsigned char )psz;
634	PRTUTF16 pwc = pwsz;
635	while (cch > 0)
636	{
637	/* read the next char and check for terminator. */
638	const unsigned char uch = *puch;
639	if (!uch)
640	break;
641
642	/* check for output overflow */
643	if (RT_UNLIKELY(cwc < 1))
644	{
645	rc = VERR_BUFFER_OVERFLOW;
646	break;
647	}
648	cwc--;
649
650	/* decode and recode the code point */
651	if (!(uch & RT_BIT(7)))
652	{
653	*pwc++ = uch;
654	puch++;
655	cch--;
656	}
657	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
658	{
659	uint16_t uc = (puch[1] & 0x3f)
660	\| ((uint16_t)(uch & 0x1f) << 6);
661	*pwc++ = uc;
662	puch += 2;
663	cch -= 2;
664	}
665	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
666	{
667	uint16_t uc = (puch[2] & 0x3f)
668	\| ((uint16_t)(puch[1] & 0x3f) << 6)
669	\| ((uint16_t)(uch & 0x0f) << 12);
670	*pwc++ = uc;
671	puch += 3;
672	cch -= 3;
673	}
674	else
675	{
676	/* generate surrogate pair */
677	Assert((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)));
678	RTUNICP uc = (puch[3] & 0x3f)
679	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
680	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
681	\| ((RTUNICP)(uch & 0x07) << 18);
682	if (RT_UNLIKELY(cwc < 1))
683	{
684	rc = VERR_BUFFER_OVERFLOW;
685	break;
686	}
687	cwc--;
688
689	uc -= 0x10000;
690	*pwc++ = 0xd800 \| (uc >> 10);
691	*pwc++ = 0xdc00 \| (uc & 0x3ff);
692	puch += 4;
693	cch -= 4;
694	}
695	}
696
697	/* done */
698	*pwc = '\0';
699	return rc;
700	}
701
702
703	RTDECL(int) RTStrToUtf16Tag(const char pszString, PRTUTF16 ppwszString, const char *pszTag)
704	{
705	/*
706	* Validate input.
707	*/
708	Assert(VALID_PTR(ppwszString));
709	Assert(VALID_PTR(pszString));
710	*ppwszString = NULL;
711
712	/*
713	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
714	*/
715	size_t cwc;
716	int rc = rtUtf8CalcUtf16Length(pszString, RTSTR_MAX, &cwc);
717	if (RT_SUCCESS(rc))
718	{
719	/*
720	* Allocate buffer.
721	*/
722	PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag((cwc + 1) * sizeof(RTUTF16), pszTag);
723	if (pwsz)
724	{
725	/*
726	* Encode the UTF-16 string.
727	*/
728	rc = rtUtf8RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc);
729	if (RT_SUCCESS(rc))
730	{
731	*ppwszString = pwsz;
732	return rc;
733	}
734	RTMemFree(pwsz);
735	}
736	else
737	rc = VERR_NO_UTF16_MEMORY;
738	}
739	return rc;
740	}
741	RT_EXPORT_SYMBOL(RTStrToUtf16Tag);
742
743
744	RTDECL(int) RTStrToUtf16ExTag(const char *pszString, size_t cchString,
745	PRTUTF16 ppwsz, size_t cwc, size_t pcwc, const char *pszTag)
746	{
747	/*
748	* Validate input.
749	*/
750	Assert(VALID_PTR(pszString));
751	Assert(VALID_PTR(ppwsz));
752	Assert(!pcwc \|\| VALID_PTR(pcwc));
753
754	/*
755	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
756	*/
757	size_t cwcResult;
758	int rc = rtUtf8CalcUtf16Length(pszString, cchString, &cwcResult);
759	if (RT_SUCCESS(rc))
760	{
761	if (pcwc)
762	*pcwc = cwcResult;
763
764	/*
765	* Check buffer size / Allocate buffer.
766	*/
767	bool fShouldFree;
768	PRTUTF16 pwszResult;
769	if (cwc > 0 && *ppwsz)
770	{
771	fShouldFree = false;
772	if (cwc <= cwcResult)
773	return VERR_BUFFER_OVERFLOW;
774	pwszResult = *ppwsz;
775	}
776	else
777	{
778	*ppwsz = NULL;
779	fShouldFree = true;
780	cwc = RT_MAX(cwcResult + 1, cwc);
781	pwszResult = (PRTUTF16)RTMemAllocTag(cwc * sizeof(RTUTF16), pszTag);
782	}
783	if (pwszResult)
784	{
785	/*
786	* Encode the UTF-16 string.
787	*/
788	rc = rtUtf8RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1);
789	if (RT_SUCCESS(rc))
790	{
791	*ppwsz = pwszResult;
792	return rc;
793	}
794	if (fShouldFree)
795	RTMemFree(pwszResult);
796	}
797	else
798	rc = VERR_NO_UTF16_MEMORY;
799	}
800	return rc;
801	}
802	RT_EXPORT_SYMBOL(RTStrToUtf16ExTag);
803
804
805	RTDECL(size_t) RTStrCalcUtf16Len(const char *psz)
806	{
807	size_t cwc;
808	int rc = rtUtf8CalcUtf16Length(psz, RTSTR_MAX, &cwc);
809	return RT_SUCCESS(rc) ? cwc : 0;
810	}
811	RT_EXPORT_SYMBOL(RTStrCalcUtf16Len);
812
813
814	RTDECL(int) RTStrCalcUtf16LenEx(const char psz, size_t cch, size_t pcwc)
815	{
816	size_t cwc;
817	int rc = rtUtf8CalcUtf16Length(psz, cch, &cwc);
818	if (pcwc)
819	*pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
820	return rc;
821	}
822	RT_EXPORT_SYMBOL(RTStrCalcUtf16LenEx);
823
824
825	/**
826	* Calculates the length of the UTF-8 encoding of a Latin-1 string.
827	*
828	* @returns iprt status code.
829	* @param psz The Latin-1 string.
830	* @param cchIn The max length of the Latin-1 string to consider.
831	* @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
832	*/
833	static int rtLatin1CalcUtf8Length(const char psz, size_t cchIn, size_t pcch)
834	{
835	size_t cch = 0;
836	for (;;)
837	{
838	RTUNICP Cp;
839	int rc = RTLatin1GetCpNEx(&psz, &cchIn, &Cp);
840	if (Cp == 0 \|\| rc == VERR_END_OF_STRING)
841	break;
842	if (RT_FAILURE(rc))
843	return rc;
844	cch += RTStrCpSize(Cp); /* cannot fail */
845	}
846
847	/* done */
848	*pcch = cch;
849	return VINF_SUCCESS;
850	}
851
852
853	/**
854	* Recodes a Latin-1 string as UTF-8.
855	*
856	* @returns iprt status code.
857	* @param psz The Latin-1 string.
858	* @param cchIn The number of characters to process from psz. The recoding
859	* will stop when cch or '\\0' is reached.
860	* @param psz Where to store the UTF-8 string.
861	* @param cch The size of the UTF-8 buffer, excluding the terminator.
862	*/
863	static int rtLatin1RecodeAsUtf8(const char pszIn, size_t cchIn, char psz, size_t cch)
864	{
865	int rc;
866	for (;;)
867	{
868	RTUNICP Cp;
869	size_t cchCp;
870	rc = RTLatin1GetCpNEx(&pszIn, &cchIn, &Cp);
871	if (Cp == 0 \|\| RT_FAILURE(rc))
872	break;
873	cchCp = RTStrCpSize(Cp);
874	if (RT_UNLIKELY(cch < cchCp))
875	{
876	RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
877	rc = VERR_BUFFER_OVERFLOW;
878	break;
879	}
880	cch -= cchCp;
881	psz = RTStrPutCp(psz, Cp);
882	}
883
884	/* done */
885	if (rc == VERR_END_OF_STRING)
886	rc = VINF_SUCCESS;
887	*psz = '\0';
888	return rc;
889	}
890
891
892
893	RTDECL(int) RTLatin1ToUtf8Tag(const char pszString, char ppszString, const char pszTag)
894	{
895	/*
896	* Validate input.
897	*/
898	Assert(VALID_PTR(ppszString));
899	Assert(VALID_PTR(pszString));
900	*ppszString = NULL;
901
902	/*
903	* Calculate the length of the UTF-8 encoding of the Latin-1 string.
904	*/
905	size_t cch;
906	int rc = rtLatin1CalcUtf8Length(pszString, RTSTR_MAX, &cch);
907	if (RT_SUCCESS(rc))
908	{
909	/*
910	* Allocate buffer and recode it.
911	*/
912	char pszResult = (char )RTMemAllocTag(cch + 1, pszTag);
913	if (pszResult)
914	{
915	rc = rtLatin1RecodeAsUtf8(pszString, RTSTR_MAX, pszResult, cch);
916	if (RT_SUCCESS(rc))
917	{
918	*ppszString = pszResult;
919	return rc;
920	}
921
922	RTMemFree(pszResult);
923	}
924	else
925	rc = VERR_NO_STR_MEMORY;
926	}
927	return rc;
928	}
929	RT_EXPORT_SYMBOL(RTLatin1ToUtf8Tag);
930
931
932	RTDECL(int) RTLatin1ToUtf8ExTag(const char pszString, size_t cchString, char ppsz, size_t cch, size_t pcch, const char *pszTag)
933	{
934	/*
935	* Validate input.
936	*/
937	Assert(VALID_PTR(pszString));
938	Assert(VALID_PTR(ppsz));
939	Assert(!pcch \|\| VALID_PTR(pcch));
940
941	/*
942	* Calculate the length of the UTF-8 encoding of the Latin-1 string.
943	*/
944	size_t cchResult;
945	int rc = rtLatin1CalcUtf8Length(pszString, cchString, &cchResult);
946	if (RT_SUCCESS(rc))
947	{
948	if (pcch)
949	*pcch = cchResult;
950
951	/*
952	* Check buffer size / Allocate buffer and recode it.
953	*/
954	bool fShouldFree;
955	char *pszResult;
956	if (cch > 0 && *ppsz)
957	{
958	fShouldFree = false;
959	if (RT_UNLIKELY(cch <= cchResult))
960	return VERR_BUFFER_OVERFLOW;
961	pszResult = *ppsz;
962	}
963	else
964	{
965	*ppsz = NULL;
966	fShouldFree = true;
967	cch = RT_MAX(cch, cchResult + 1);
968	pszResult = (char *)RTStrAllocTag(cch, pszTag);
969	}
970	if (pszResult)
971	{
972	rc = rtLatin1RecodeAsUtf8(pszString, cchString, pszResult, cch - 1);
973	if (RT_SUCCESS(rc))
974	{
975	*ppsz = pszResult;
976	return rc;
977	}
978
979	if (fShouldFree)
980	RTStrFree(pszResult);
981	}
982	else
983	rc = VERR_NO_STR_MEMORY;
984	}
985	return rc;
986	}
987	RT_EXPORT_SYMBOL(RTLatin1ToUtf8ExTag);
988
989
990	RTDECL(size_t) RTLatin1CalcUtf8Len(const char *psz)
991	{
992	size_t cch;
993	int rc = rtLatin1CalcUtf8Length(psz, RTSTR_MAX, &cch);
994	return RT_SUCCESS(rc) ? cch : 0;
995	}
996	RT_EXPORT_SYMBOL(RTLatin1CalcUtf8Len);
997
998
999	RTDECL(int) RTLatin1CalcUtf8LenEx(const char psz, size_t cchIn, size_t pcch)
1000	{
1001	size_t cch;
1002	int rc = rtLatin1CalcUtf8Length(psz, cchIn, &cch);
1003	if (pcch)
1004	*pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1005	return rc;
1006	}
1007	RT_EXPORT_SYMBOL(RTLatin1CalcUtf8LenEx);
1008
1009
1010	/**
1011	* Calculates the Latin-1 length of a string, validating the encoding while
1012	* doing so.
1013	*
1014	* @returns IPRT status code.
1015	* @param psz Pointer to the UTF-8 string.
1016	* @param cchIn The max length of the string. (btw cch = cb)
1017	* Use RTSTR_MAX if all of the string is to be examined.
1018	* @param pcch Where to store the length of the Latin-1 string in bytes.
1019	*/
1020	static int rtUtf8CalcLatin1Length(const char psz, size_t cchIn, size_t pcch)
1021	{
1022	size_t cch = 0;
1023	for (;;)
1024	{
1025	RTUNICP Cp;
1026	size_t cchCp;
1027	int rc = RTStrGetCpNEx(&psz, &cchIn, &Cp);
1028	if (Cp == 0 \|\| rc == VERR_END_OF_STRING)
1029	break;
1030	if (RT_FAILURE(rc))
1031	return rc;
1032	cchCp = RTLatin1CpSize(Cp);
1033	if (cchCp == 0)
1034	return VERR_NO_TRANSLATION;
1035	cch += cchCp;
1036	}
1037
1038	/* done */
1039	*pcch = cch;
1040	return VINF_SUCCESS;
1041	}
1042
1043
1044	/**
1045	* Recodes a valid UTF-8 string as Latin-1.
1046	*
1047	* Since we know the input is valid, we do not perform encoding or length checks.
1048	*
1049	* @returns iprt status code.
1050	* @param pszIn The UTF-8 string to recode. This is a valid encoding.
1051	* @param cchIn The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
1052	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
1053	* @param psz Where to store the Latin-1 string.
1054	* @param cch The number of characters the pszOut buffer can hold, excluding the terminator ('\\0').
1055	*/
1056	static int rtUtf8RecodeAsLatin1(const char pszIn, size_t cchIn, char psz, size_t cch)
1057	{
1058	int rc;
1059	for (;;)
1060	{
1061	RTUNICP Cp;
1062	size_t cchCp;
1063	rc = RTStrGetCpNEx(&pszIn, &cchIn, &Cp);
1064	if (Cp == 0 \|\| RT_FAILURE(rc))
1065	break;
1066	cchCp = RTLatin1CpSize(Cp);
1067	if (RT_UNLIKELY(cch < cchCp))
1068	{
1069	RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
1070	rc = VERR_BUFFER_OVERFLOW;
1071	break;
1072	}
1073	cch -= cchCp;
1074	psz = RTLatin1PutCp(psz, Cp);
1075	}
1076
1077	/* done */
1078	if (rc == VERR_END_OF_STRING)
1079	rc = VINF_SUCCESS;
1080	*psz = '\0';
1081	return rc;
1082	}
1083
1084
1085
1086	RTDECL(int) RTStrToLatin1Tag(const char pszString, char ppszString, const char pszTag)
1087	{
1088	/*
1089	* Validate input.
1090	*/
1091	Assert(VALID_PTR(ppszString));
1092	Assert(VALID_PTR(pszString));
1093	*ppszString = NULL;
1094
1095	/*
1096	* Validate the UTF-8 input and calculate the length of the Latin-1 string.
1097	*/
1098	size_t cch;
1099	int rc = rtUtf8CalcLatin1Length(pszString, RTSTR_MAX, &cch);
1100	if (RT_SUCCESS(rc))
1101	{
1102	/*
1103	* Allocate buffer.
1104	*/
1105	char psz = (char )RTMemAllocTag(cch + 1, pszTag);
1106	if (psz)
1107	{
1108	/*
1109	* Encode the UTF-16 string.
1110	*/
1111	rc = rtUtf8RecodeAsLatin1(pszString, RTSTR_MAX, psz, cch);
1112	if (RT_SUCCESS(rc))
1113	{
1114	*ppszString = psz;
1115	return rc;
1116	}
1117	RTMemFree(psz);
1118	}
1119	else
1120	rc = VERR_NO_STR_MEMORY;
1121	}
1122	return rc;
1123	}
1124	RT_EXPORT_SYMBOL(RTStrToLatin1Tag);
1125
1126
1127	RTDECL(int) RTStrToLatin1ExTag(const char *pszString, size_t cchString,
1128	char *ppsz, size_t cch, size_t pcch, const char *pszTag)
1129	{
1130	/*
1131	* Validate input.
1132	*/
1133	Assert(VALID_PTR(pszString));
1134	Assert(VALID_PTR(ppsz));
1135	Assert(!pcch \|\| VALID_PTR(pcch));
1136
1137	/*
1138	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
1139	*/
1140	size_t cchResult;
1141	int rc = rtUtf8CalcLatin1Length(pszString, cchString, &cchResult);
1142	if (RT_SUCCESS(rc))
1143	{
1144	if (pcch)
1145	*pcch = cchResult;
1146
1147	/*
1148	* Check buffer size / Allocate buffer.
1149	*/
1150	bool fShouldFree;
1151	char *pszResult;
1152	if (cch > 0 && *ppsz)
1153	{
1154	fShouldFree = false;
1155	if (cch <= cchResult)
1156	return VERR_BUFFER_OVERFLOW;
1157	pszResult = *ppsz;
1158	}
1159	else
1160	{
1161	*ppsz = NULL;
1162	fShouldFree = true;
1163	cch = RT_MAX(cchResult + 1, cch);
1164	pszResult = (char *)RTMemAllocTag(cch, pszTag);
1165	}
1166	if (pszResult)
1167	{
1168	/*
1169	* Encode the Latin-1 string.
1170	*/
1171	rc = rtUtf8RecodeAsLatin1(pszString, cchString, pszResult, cch - 1);
1172	if (RT_SUCCESS(rc))
1173	{
1174	*ppsz = pszResult;
1175	return rc;
1176	}
1177	if (fShouldFree)
1178	RTMemFree(pszResult);
1179	}
1180	else
1181	rc = VERR_NO_STR_MEMORY;
1182	}
1183	return rc;
1184	}
1185	RT_EXPORT_SYMBOL(RTStrToLatin1Tag);
1186
1187
1188	RTDECL(size_t) RTStrCalcLatin1Len(const char *psz)
1189	{
1190	size_t cch;
1191	int rc = rtUtf8CalcLatin1Length(psz, RTSTR_MAX, &cch);
1192	return RT_SUCCESS(rc) ? cch : 0;
1193	}
1194	RT_EXPORT_SYMBOL(RTStrCalcLatin1Len);
1195
1196
1197	RTDECL(int) RTStrCalcLatin1LenEx(const char psz, size_t cchIn, size_t pcch)
1198	{
1199	size_t cch;
1200	int rc = rtUtf8CalcLatin1Length(psz, cchIn, &cch);
1201	if (pcch)
1202	*pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1203	return rc;
1204	}
1205	RT_EXPORT_SYMBOL(RTStrCalcLatin1LenEx);
1206
1207
1208	/**
1209	* Handle invalid encodings passed to RTStrGetCp() and RTStrGetCpEx().
1210	* @returns rc
1211	* @param ppsz The pointer to the string position point.
1212	* @param pCp Where to store RTUNICP_INVALID.
1213	* @param rc The iprt error code.
1214	*/
1215	static int rtStrGetCpExFailure(const char **ppsz, PRTUNICP pCp, int rc)
1216	{
1217	/*
1218	* Try find a valid encoding.
1219	*/
1220	(ppsz)++; /* @todo code this! */
1221	*pCp = RTUNICP_INVALID;
1222	return rc;
1223	}
1224
1225
1226	RTDECL(RTUNICP) RTStrGetCpInternal(const char *psz)
1227	{
1228	RTUNICP Cp;
1229	RTStrGetCpExInternal(&psz, &Cp);
1230	return Cp;
1231	}
1232	RT_EXPORT_SYMBOL(RTStrGetCpInternal);
1233
1234
1235	RTDECL(int) RTStrGetCpExInternal(const char **ppsz, PRTUNICP pCp)
1236	{
1237	const unsigned char puch = (const unsigned char )*ppsz;
1238	const unsigned char uch = *puch;
1239	RTUNICP uc;
1240
1241	/* ASCII ? */
1242	if (!(uch & RT_BIT(7)))
1243	{
1244	uc = uch;
1245	puch++;
1246	}
1247	else if (uch & RT_BIT(6))
1248	{
1249	/* figure the length and validate the first octet. */
1250	/** @todo RT_USE_RTC_3629 */
1251	unsigned cb;
1252	if (!(uch & RT_BIT(5)))
1253	cb = 2;
1254	else if (!(uch & RT_BIT(4)))
1255	cb = 3;
1256	else if (!(uch & RT_BIT(3)))
1257	cb = 4;
1258	else if (!(uch & RT_BIT(2)))
1259	cb = 5;
1260	else if (!(uch & RT_BIT(1)))
1261	cb = 6;
1262	else
1263	{
1264	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1265	return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
1266	}
1267
1268	/* validate the rest */
1269	switch (cb)
1270	{
1271	case 6:
1272	RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1273	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1274	case 5:
1275	RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1276	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1277	case 4:
1278	RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1279	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1280	case 3:
1281	RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1282	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1283	case 2:
1284	RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1285	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1286	break;
1287	}
1288
1289	/* get and validate the code point. */
1290	switch (cb)
1291	{
1292	case 6:
1293	uc = (puch[5] & 0x3f)
1294	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
1295	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
1296	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
1297	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
1298	\| ((RTUNICP)(uch & 0x01) << 30);
1299	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1300	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1301	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1302	break;
1303	case 5:
1304	uc = (puch[4] & 0x3f)
1305	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
1306	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
1307	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
1308	\| ((RTUNICP)(uch & 0x03) << 24);
1309	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1310	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1311	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1312	break;
1313	case 4:
1314	uc = (puch[3] & 0x3f)
1315	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
1316	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
1317	\| ((RTUNICP)(uch & 0x07) << 18);
1318	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1319	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1320	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1321	break;
1322	case 3:
1323	uc = (puch[2] & 0x3f)
1324	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
1325	\| ((RTUNICP)(uch & 0x0f) << 12);
1326	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1327	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1328	rtStrGetCpExFailure(ppsz, pCp, uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1329	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
1330	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1331	rtStrGetCpExFailure(ppsz, pCp, VERR_CODE_POINT_SURROGATE));
1332	break;
1333	case 2:
1334	uc = (puch[1] & 0x3f)
1335	\| ((RTUNICP)(uch & 0x1f) << 6);
1336	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1337	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1338	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1339	break;
1340	default: /* impossible, but GCC is bitching. */
1341	uc = RTUNICP_INVALID;
1342	break;
1343	}
1344	puch += cb;
1345	}
1346	else
1347	{
1348	/* 6th bit is always set. */
1349	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1350	return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
1351	}
1352	*pCp = uc;
1353	ppsz = (const char )puch;
1354	return VINF_SUCCESS;
1355	}
1356	RT_EXPORT_SYMBOL(RTStrGetCpExInternal);
1357
1358
1359	/**
1360	* Handle invalid encodings passed to RTStrGetCpNEx().
1361	* @returns rc
1362	* @param ppsz The pointer to the string position point.
1363	* @param pcch Pointer to the string length.
1364	* @param pCp Where to store RTUNICP_INVALID.
1365	* @param rc The iprt error code.
1366	*/
1367	static int rtStrGetCpNExFailure(const char *ppsz, size_t pcch, PRTUNICP pCp, int rc)
1368	{
1369	/*
1370	* Try find a valid encoding.
1371	*/
1372	(ppsz)++; /* @todo code this! */
1373	(*pcch)--;
1374	*pCp = RTUNICP_INVALID;
1375	return rc;
1376	}
1377
1378
1379	RTDECL(int) RTStrGetCpNExInternal(const char *ppsz, size_t pcch, PRTUNICP pCp)
1380	{
1381	const unsigned char puch = (const unsigned char )*ppsz;
1382	const unsigned char uch = *puch;
1383	size_t cch = *pcch;
1384	RTUNICP uc;
1385
1386	if (cch == 0)
1387	{
1388	*pCp = RTUNICP_INVALID;
1389	return VERR_END_OF_STRING;
1390	}
1391
1392	/* ASCII ? */
1393	if (!(uch & RT_BIT(7)))
1394	{
1395	uc = uch;
1396	puch++;
1397	cch--;
1398	}
1399	else if (uch & RT_BIT(6))
1400	{
1401	/* figure the length and validate the first octet. */
1402	/** @todo RT_USE_RTC_3629 */
1403	unsigned cb;
1404	if (!(uch & RT_BIT(5)))
1405	cb = 2;
1406	else if (!(uch & RT_BIT(4)))
1407	cb = 3;
1408	else if (!(uch & RT_BIT(3)))
1409	cb = 4;
1410	else if (!(uch & RT_BIT(2)))
1411	cb = 5;
1412	else if (!(uch & RT_BIT(1)))
1413	cb = 6;
1414	else
1415	{
1416	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1417	return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1418	}
1419
1420	if (cb > cch)
1421	return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1422
1423	/* validate the rest */
1424	switch (cb)
1425	{
1426	case 6:
1427	RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1428	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1429	case 5:
1430	RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1431	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1432	case 4:
1433	RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1434	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1435	case 3:
1436	RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1437	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1438	case 2:
1439	RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1440	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1441	break;
1442	}
1443
1444	/* get and validate the code point. */
1445	switch (cb)
1446	{
1447	case 6:
1448	uc = (puch[5] & 0x3f)
1449	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
1450	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
1451	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
1452	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
1453	\| ((RTUNICP)(uch & 0x01) << 30);
1454	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1455	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1456	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1457	break;
1458	case 5:
1459	uc = (puch[4] & 0x3f)
1460	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
1461	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
1462	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
1463	\| ((RTUNICP)(uch & 0x03) << 24);
1464	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1465	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1466	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1467	break;
1468	case 4:
1469	uc = (puch[3] & 0x3f)
1470	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
1471	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
1472	\| ((RTUNICP)(uch & 0x07) << 18);
1473	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1474	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1475	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1476	break;
1477	case 3:
1478	uc = (puch[2] & 0x3f)
1479	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
1480	\| ((RTUNICP)(uch & 0x0f) << 12);
1481	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1482	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1483	rtStrGetCpNExFailure(ppsz, pcch, pCp, uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1484	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
1485	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1486	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_CODE_POINT_SURROGATE));
1487	break;
1488	case 2:
1489	uc = (puch[1] & 0x3f)
1490	\| ((RTUNICP)(uch & 0x1f) << 6);
1491	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1492	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1493	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1494	break;
1495	default: /* impossible, but GCC is bitching. */
1496	uc = RTUNICP_INVALID;
1497	break;
1498	}
1499	puch += cb;
1500	cch -= cb;
1501	}
1502	else
1503	{
1504	/* 6th bit is always set. */
1505	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1506	return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1507	}
1508	*pCp = uc;
1509	ppsz = (const char )puch;
1510	(*pcch) = cch;
1511	return VINF_SUCCESS;
1512	}
1513	RT_EXPORT_SYMBOL(RTStrGetCpNExInternal);
1514
1515
1516	RTDECL(char ) RTStrPutCpInternal(char psz, RTUNICP uc)
1517	{
1518	unsigned char puch = (unsigned char )psz;
1519	if (uc < 0x80)
1520	*puch++ = (unsigned char )uc;
1521	else if (uc < 0x00000800)
1522	{
1523	*puch++ = 0xc0 \| (uc >> 6);
1524	*puch++ = 0x80 \| (uc & 0x3f);
1525	}
1526	else if (uc < 0x00010000)
1527	{
1528	/** @todo RT_USE_RTC_3629 */
1529	if ( uc < 0x0000d8000
1530	\|\| ( uc > 0x0000dfff
1531	&& uc < 0x0000fffe))
1532	{
1533	*puch++ = 0xe0 \| (uc >> 12);
1534	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1535	*puch++ = 0x80 \| (uc & 0x3f);
1536	}
1537	else
1538	{
1539	AssertMsgFailed(("Invalid code point U+%05x!\n", uc));
1540	*puch++ = 0x7f;
1541	}
1542	}
1543	/** @todo RT_USE_RTC_3629 */
1544	else if (uc < 0x00200000)
1545	{
1546	*puch++ = 0xf0 \| (uc >> 18);
1547	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
1548	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1549	*puch++ = 0x80 \| (uc & 0x3f);
1550	}
1551	else if (uc < 0x04000000)
1552	{
1553	*puch++ = 0xf8 \| (uc >> 24);
1554	*puch++ = 0x80 \| ((uc >> 18) & 0x3f);
1555	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
1556	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1557	*puch++ = 0x80 \| (uc & 0x3f);
1558	}
1559	else if (uc <= 0x7fffffff)
1560	{
1561	*puch++ = 0xfc \| (uc >> 30);
1562	*puch++ = 0x80 \| ((uc >> 24) & 0x3f);
1563	*puch++ = 0x80 \| ((uc >> 18) & 0x3f);
1564	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
1565	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1566	*puch++ = 0x80 \| (uc & 0x3f);
1567	}
1568	else
1569	{
1570	AssertMsgFailed(("Invalid code point U+%08x!\n", uc));
1571	*puch++ = 0x7f;
1572	}
1573
1574	return (char *)puch;
1575	}
1576	RT_EXPORT_SYMBOL(RTStrPutCpInternal);
1577
1578
1579	RTDECL(char ) RTStrPrevCp(const char pszStart, const char *psz)
1580	{
1581	if (pszStart < psz)
1582	{
1583	/* simple char? */
1584	const unsigned char puch = (const unsigned char )psz;
1585	unsigned uch = *--puch;
1586	if (!(uch & RT_BIT(7)))
1587	return (char *)puch;
1588	RTStrAssertMsgReturn(!(uch & RT_BIT(6)), ("uch=%#x\n", uch), (char *)pszStart);
1589
1590	/* two or more. */
1591	uint32_t uMask = 0xffffffc0;
1592	while ( (const unsigned char *)pszStart < puch
1593	&& !(uMask & 1))
1594	{
1595	uch = *--puch;
1596	if ((uch & 0xc0) != 0x80)
1597	{
1598	RTStrAssertMsgReturn((uch & (uMask >> 1)) == (uMask & 0xff),
1599	("Invalid UTF-8 encoding: %.Rhxs puch=%p psz=%p\n", psz - (char )puch, puch, psz),
1600	(char *)pszStart);
1601	return (char *)puch;
1602	}
1603	uMask >>= 1;
1604	}
1605	RTStrAssertMsgFailed(("Invalid UTF-8 encoding: %.Rhxs puch=%p psz=%p\n", psz - (char )puch, puch, psz));
1606	}
1607	return (char *)pszStart;
1608	}
1609	RT_EXPORT_SYMBOL(RTStrPrevCp);
1610

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/VBox/Runtime/common/string/utf-8.cpp@ 40091

Download in other formats: