utf-8.cpp@ 40071

Last change on this file since 40071 was 40071, checked in by vboxsync, 13 years ago
Runtime/strings: add Utf-8 and Utf-16 sanitising to a white list of characters.
Property svn:eol-style set to `native` Property svn:keywords set to `Id`
File size: 54.4 KB

Line
1	/* $Id: utf-8.cpp 40071 2012-02-10 21:35:27Z vboxsync $ */
2	/** @file
3	* IPRT - UTF-8 Decoding.
4	*/
5
6	/*
7	* Copyright (C) 2006-2010 Oracle Corporation
8	*
9	* This file is part of VirtualBox Open Source Edition (OSE), as
10	* available from http://www.virtualbox.org. This file is free software;
11	* you can redistribute it and/or modify it under the terms of the GNU
12	* General Public License (GPL) as published by the Free Software
13	* Foundation, in version 2 as it comes in the "COPYING" file of the
14	* VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15	* hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16	*
17	* The contents of this file may alternatively be used under the terms
18	* of the Common Development and Distribution License Version 1.0
19	* (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20	* VirtualBox OSE distribution, in which case the provisions of the
21	* CDDL are applicable instead of those of the GPL.
22	*
23	* You may elect to license modified versions of this file under the
24	* terms and conditions of either the GPL or the CDDL or both.
25	*/
26
27
28	/*******************************************************************************
29	* Header Files *
30	*******************************************************************************/
31	#include <iprt/string.h>
32	#include "internal/iprt.h"
33
34	#include <iprt/uni.h>
35	#include <iprt/alloc.h>
36	#include <iprt/assert.h>
37	#include <iprt/err.h>
38	#include "internal/string.h"
39
40
41
42	/**
43	* Get get length in code points of a UTF-8 encoded string.
44	* The string is validated while doing this.
45	*
46	* @returns IPRT status code.
47	* @param psz Pointer to the UTF-8 string.
48	* @param cch The max length of the string. (btw cch = cb)
49	* Use RTSTR_MAX if all of the string is to be examined.
50	* @param pcuc Where to store the length in unicode code points.
51	* @param pcchActual Where to store the actual size of the UTF-8 string
52	* on success (cch = cb again). Optional.
53	*/
54	DECLHIDDEN(int) rtUtf8Length(const char psz, size_t cch, size_t pcuc, size_t *pcchActual)
55	{
56	const unsigned char puch = (const unsigned char )psz;
57	size_t cCodePoints = 0;
58	while (cch > 0)
59	{
60	const unsigned char uch = *puch;
61	if (!uch)
62	break;
63	if (uch & RT_BIT(7))
64	{
65	/* figure sequence length and validate the first byte */
66	/** @todo RT_USE_RTC_3629 */
67	unsigned cb;
68	if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
69	cb = 2;
70	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
71	cb = 3;
72	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)))
73	cb = 4;
74	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3)))
75	cb = 5;
76	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2) \| RT_BIT(1))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2)))
77	cb = 6;
78	else
79	{
80	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
81	return VERR_INVALID_UTF8_ENCODING;
82	}
83
84	/* check length */
85	if (cb > cch)
86	{
87	RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
88	return VERR_INVALID_UTF8_ENCODING;
89	}
90
91	/* validate the rest */
92	switch (cb)
93	{
94	case 6:
95	RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
96	case 5:
97	RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
98	case 4:
99	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
100	case 3:
101	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
102	case 2:
103	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
104	break;
105	}
106
107	/* validate the code point. */
108	RTUNICP uc;
109	switch (cb)
110	{
111	case 6:
112	uc = (puch[5] & 0x3f)
113	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
114	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
115	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
116	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
117	\| ((RTUNICP)(uch & 0x01) << 30);
118	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
119	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
120	break;
121	case 5:
122	uc = (puch[4] & 0x3f)
123	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
124	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
125	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
126	\| ((RTUNICP)(uch & 0x03) << 24);
127	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
128	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
129	break;
130	case 4:
131	uc = (puch[3] & 0x3f)
132	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
133	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
134	\| ((RTUNICP)(uch & 0x07) << 18);
135	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
136	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
137	break;
138	case 3:
139	uc = (puch[2] & 0x3f)
140	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
141	\| ((RTUNICP)(uch & 0x0f) << 12);
142	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
143	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
144	uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
145	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
146	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
147	break;
148	case 2:
149	uc = (puch[1] & 0x3f)
150	\| ((RTUNICP)(uch & 0x1f) << 6);
151	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
152	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
153	break;
154	}
155
156	/* advance */
157	cch -= cb;
158	puch += cb;
159	}
160	else
161	{
162	/* one ASCII byte */
163	puch++;
164	cch--;
165	}
166	cCodePoints++;
167	}
168
169	/* done */
170	*pcuc = cCodePoints;
171	if (pcchActual)
172	pcchActual = puch - (unsigned char const )psz;
173	return VINF_SUCCESS;
174	}
175
176
177	/**
178	* Decodes and UTF-8 string into an array of unicode code point.
179	*
180	* Since we know the input is valid, we do not perform encoding or length checks.
181	*
182	* @returns iprt status code.
183	* @param psz The UTF-8 string to recode. This is a valid encoding.
184	* @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
185	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
186	* @param paCps Where to store the code points array.
187	* @param cCps The number of RTUNICP items the paCps buffer can hold, excluding the terminator ('\\0').
188	*/
189	static int rtUtf8Decode(const char *psz, size_t cch, PRTUNICP paCps, size_t cCps)
190	{
191	int rc = VINF_SUCCESS;
192	const unsigned char puch = (const unsigned char )psz;
193	PRTUNICP pCp = paCps;
194	while (cch > 0)
195	{
196	/* read the next char and check for terminator. */
197	const unsigned char uch = *puch;
198	if (!uch)
199	break;
200
201	/* check for output overflow */
202	if (RT_UNLIKELY(cCps < 1))
203	{
204	rc = VERR_BUFFER_OVERFLOW;
205	break;
206	}
207	cCps--;
208
209	/* decode and recode the code point */
210	if (!(uch & RT_BIT(7)))
211	{
212	*pCp++ = uch;
213	puch++;
214	cch--;
215	}
216	#ifdef RT_STRICT
217	else if (!(uch & RT_BIT(6)))
218	AssertMsgFailed(("Internal error!\n"));
219	#endif
220	else if (!(uch & RT_BIT(5)))
221	{
222	*pCp++ = (puch[1] & 0x3f)
223	\| ((uint16_t)(uch & 0x1f) << 6);
224	puch += 2;
225	cch -= 2;
226	}
227	else if (!(uch & RT_BIT(4)))
228	{
229	*pCp++ = (puch[2] & 0x3f)
230	\| ((uint16_t)(puch[1] & 0x3f) << 6)
231	\| ((uint16_t)(uch & 0x0f) << 12);
232	puch += 3;
233	cch -= 3;
234	}
235	else if (!(uch & RT_BIT(3)))
236	{
237	*pCp++ = (puch[3] & 0x3f)
238	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
239	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
240	\| ((RTUNICP)(uch & 0x07) << 18);
241	puch += 4;
242	cch -= 4;
243	}
244	else if (!(uch & RT_BIT(2)))
245	{
246	*pCp++ = (puch[4] & 0x3f)
247	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
248	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
249	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
250	\| ((RTUNICP)(uch & 0x03) << 24);
251	puch += 5;
252	cch -= 6;
253	}
254	else
255	{
256	Assert(!(uch & RT_BIT(1)));
257	*pCp++ = (puch[5] & 0x3f)
258	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
259	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
260	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
261	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
262	\| ((RTUNICP)(uch & 0x01) << 30);
263	puch += 6;
264	cch -= 6;
265	}
266	}
267
268	/* done */
269	*pCp = 0;
270	return rc;
271	}
272
273
274	RTDECL(size_t) RTStrUniLen(const char *psz)
275	{
276	size_t cCodePoints;
277	int rc = rtUtf8Length(psz, RTSTR_MAX, &cCodePoints, NULL);
278	return RT_SUCCESS(rc) ? cCodePoints : 0;
279	}
280	RT_EXPORT_SYMBOL(RTStrUniLen);
281
282
283	RTDECL(int) RTStrUniLenEx(const char psz, size_t cch, size_t pcCps)
284	{
285	size_t cCodePoints;
286	int rc = rtUtf8Length(psz, cch, &cCodePoints, NULL);
287	if (pcCps)
288	*pcCps = RT_SUCCESS(rc) ? cCodePoints : 0;
289	return rc;
290	}
291	RT_EXPORT_SYMBOL(RTStrUniLenEx);
292
293
294	RTDECL(int) RTStrValidateEncoding(const char *psz)
295	{
296	return RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
297	}
298	RT_EXPORT_SYMBOL(RTStrValidateEncoding);
299
300
301	RTDECL(int) RTStrValidateEncodingEx(const char *psz, size_t cch, uint32_t fFlags)
302	{
303	AssertReturn(!(fFlags & ~(RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)), VERR_INVALID_PARAMETER);
304	AssertPtr(psz);
305
306	/*
307	* Use rtUtf8Length for the job.
308	*/
309	size_t cchActual;
310	size_t cCpsIgnored;
311	int rc = rtUtf8Length(psz, cch, &cCpsIgnored, &cchActual);
312	if (RT_SUCCESS(rc))
313	{
314	if ( (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
315	&& cchActual >= cch)
316	rc = VERR_BUFFER_OVERFLOW;
317	}
318	return rc;
319	}
320	RT_EXPORT_SYMBOL(RTStrValidateEncodingEx);
321
322
323	RTDECL(bool) RTStrIsValidEncoding(const char *psz)
324	{
325	int rc = RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
326	return RT_SUCCESS(rc);
327	}
328	RT_EXPORT_SYMBOL(RTStrIsValidEncoding);
329
330
331	RTDECL(size_t) RTStrPurgeEncoding(char *psz)
332	{
333	size_t cErrors = 0;
334	for (;;)
335	{
336	RTUNICP Cp;
337	int rc = RTStrGetCpEx((const char **)&psz, &Cp);
338	if (RT_SUCCESS(rc))
339	{
340	if (!Cp)
341	break;
342	}
343	else
344	{
345	psz[-1] = '?';
346	cErrors++;
347	}
348	}
349	return cErrors;
350	}
351	RT_EXPORT_SYMBOL(RTStrPurgeEncoding);
352
353
354	ssize_t RTStrPurgeComplementSet(char *psz, PCRTUNICP puszValidSet, char chReplacement)
355	{
356	size_t cReplacements = 0;
357	AssertReturn(chReplacement && (unsigned)chReplacement < 128, -1);
358	if (RT_FAILURE(RTStrValidateEncoding(psz)))
359	return -1;
360	for (;;)
361	{
362	RTUNICP Cp;
363	PCRTUNICP pCp;
364	char *pszOld = psz;
365	RTStrGetCpEx((const char **)&psz, &Cp);
366	if (!Cp)
367	break;
368	for (pCp = puszValidSet; ; ++pCp)
369	if (!pCp \|\| pCp == Cp)
370	break;
371	if (!*pCp)
372	{
373	for (; pszOld != psz; ++pszOld)
374	*pszOld = chReplacement;
375	++cReplacements;
376	}
377	}
378	return cReplacements;
379	}
380	RT_EXPORT_SYMBOL(RTStrPurgeComplementSet);
381
382
383	RTDECL(int) RTStrToUni(const char pszString, PRTUNICP ppaCps)
384	{
385	/*
386	* Validate input.
387	*/
388	Assert(VALID_PTR(pszString));
389	Assert(VALID_PTR(ppaCps));
390	*ppaCps = NULL;
391
392	/*
393	* Validate the UTF-8 input and count its code points.
394	*/
395	size_t cCps;
396	int rc = rtUtf8Length(pszString, RTSTR_MAX, &cCps, NULL);
397	if (RT_SUCCESS(rc))
398	{
399	/*
400	* Allocate buffer.
401	*/
402	PRTUNICP paCps = (PRTUNICP)RTMemAlloc((cCps + 1) * sizeof(RTUNICP));
403	if (paCps)
404	{
405	/*
406	* Decode the string.
407	*/
408	rc = rtUtf8Decode(pszString, RTSTR_MAX, paCps, cCps);
409	if (RT_SUCCESS(rc))
410	{
411	*ppaCps = paCps;
412	return rc;
413	}
414	RTMemFree(paCps);
415	}
416	else
417	rc = VERR_NO_CODE_POINT_MEMORY;
418	}
419	return rc;
420	}
421	RT_EXPORT_SYMBOL(RTStrToUni);
422
423
424	RTDECL(int) RTStrToUniEx(const char pszString, size_t cchString, PRTUNICP ppaCps, size_t cCps, size_t *pcCps)
425	{
426	/*
427	* Validate input.
428	*/
429	Assert(VALID_PTR(pszString));
430	Assert(VALID_PTR(ppaCps));
431	Assert(!pcCps \|\| VALID_PTR(pcCps));
432
433	/*
434	* Validate the UTF-8 input and count the code points.
435	*/
436	size_t cCpsResult;
437	int rc = rtUtf8Length(pszString, cchString, &cCpsResult, NULL);
438	if (RT_SUCCESS(rc))
439	{
440	if (pcCps)
441	*pcCps = cCpsResult;
442
443	/*
444	* Check buffer size / Allocate buffer.
445	*/
446	bool fShouldFree;
447	PRTUNICP paCpsResult;
448	if (cCps > 0 && *ppaCps)
449	{
450	fShouldFree = false;
451	if (cCps <= cCpsResult)
452	return VERR_BUFFER_OVERFLOW;
453	paCpsResult = *ppaCps;
454	}
455	else
456	{
457	*ppaCps = NULL;
458	fShouldFree = true;
459	cCps = RT_MAX(cCpsResult + 1, cCps);
460	paCpsResult = (PRTUNICP)RTMemAlloc(cCps * sizeof(RTUNICP));
461	}
462	if (paCpsResult)
463	{
464	/*
465	* Encode the UTF-16 string.
466	*/
467	rc = rtUtf8Decode(pszString, cchString, paCpsResult, cCps - 1);
468	if (RT_SUCCESS(rc))
469	{
470	*ppaCps = paCpsResult;
471	return rc;
472	}
473	if (fShouldFree)
474	RTMemFree(paCpsResult);
475	}
476	else
477	rc = VERR_NO_CODE_POINT_MEMORY;
478	}
479	return rc;
480	}
481	RT_EXPORT_SYMBOL(RTStrToUniEx);
482
483
484	/**
485	* Calculates the UTF-16 length of a string, validating the encoding while doing so.
486	*
487	* @returns IPRT status code.
488	* @param psz Pointer to the UTF-8 string.
489	* @param cch The max length of the string. (btw cch = cb)
490	* Use RTSTR_MAX if all of the string is to be examined.
491	* @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters.
492	*/
493	static int rtUtf8CalcUtf16Length(const char psz, size_t cch, size_t pcwc)
494	{
495	const unsigned char puch = (const unsigned char )psz;
496	size_t cwc = 0;
497	while (cch > 0)
498	{
499	const unsigned char uch = *puch;
500	if (!uch)
501	break;
502	if (!(uch & RT_BIT(7)))
503	{
504	/* one ASCII byte */
505	cwc++;
506	puch++;
507	cch--;
508	}
509	else
510	{
511	/* figure sequence length and validate the first byte */
512	unsigned cb;
513	if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
514	cb = 2;
515	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
516	cb = 3;
517	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)))
518	cb = 4;
519	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3)))
520	cb = 5;
521	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2) \| RT_BIT(1))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2)))
522	cb = 6;
523	else
524	{
525	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
526	return VERR_INVALID_UTF8_ENCODING;
527	}
528
529	/* check length */
530	if (cb > cch)
531	{
532	RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
533	return VERR_INVALID_UTF8_ENCODING;
534	}
535
536	/* validate the rest */
537	switch (cb)
538	{
539	case 6:
540	RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
541	case 5:
542	RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
543	case 4:
544	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
545	case 3:
546	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
547	case 2:
548	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
549	break;
550	}
551
552	/* validate the code point. */
553	RTUNICP uc;
554	switch (cb)
555	{
556	case 6:
557	uc = (puch[5] & 0x3f)
558	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
559	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
560	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
561	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
562	\| ((RTUNICP)(uch & 0x01) << 30);
563	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
564	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
565	RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
566	return VERR_CANT_RECODE_AS_UTF16;
567	case 5:
568	uc = (puch[4] & 0x3f)
569	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
570	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
571	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
572	\| ((RTUNICP)(uch & 0x03) << 24);
573	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
574	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
575	RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
576	return VERR_CANT_RECODE_AS_UTF16;
577	case 4:
578	uc = (puch[3] & 0x3f)
579	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
580	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
581	\| ((RTUNICP)(uch & 0x07) << 18);
582	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
583	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
584	RTStrAssertMsgReturn(uc <= 0x0010ffff,
585	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
586	cwc++;
587	break;
588	case 3:
589	uc = (puch[2] & 0x3f)
590	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
591	\| ((RTUNICP)(uch & 0x0f) << 12);
592	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
593	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
594	uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
595	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
596	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
597	break;
598	case 2:
599	uc = (puch[1] & 0x3f)
600	\| ((RTUNICP)(uch & 0x1f) << 6);
601	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
602	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
603	break;
604	}
605
606	/* advance */
607	cch -= cb;
608	puch += cb;
609	cwc++;
610	}
611	}
612
613	/* done */
614	*pcwc = cwc;
615	return VINF_SUCCESS;
616	}
617
618
619	/**
620	* Recodes a valid UTF-8 string as UTF-16.
621	*
622	* Since we know the input is valid, we do not perform encoding or length checks.
623	*
624	* @returns iprt status code.
625	* @param psz The UTF-8 string to recode. This is a valid encoding.
626	* @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
627	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
628	* @param pwsz Where to store the UTF-16 string.
629	* @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
630	*/
631	static int rtUtf8RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc)
632	{
633	int rc = VINF_SUCCESS;
634	const unsigned char puch = (const unsigned char )psz;
635	PRTUTF16 pwc = pwsz;
636	while (cch > 0)
637	{
638	/* read the next char and check for terminator. */
639	const unsigned char uch = *puch;
640	if (!uch)
641	break;
642
643	/* check for output overflow */
644	if (RT_UNLIKELY(cwc < 1))
645	{
646	rc = VERR_BUFFER_OVERFLOW;
647	break;
648	}
649	cwc--;
650
651	/* decode and recode the code point */
652	if (!(uch & RT_BIT(7)))
653	{
654	*pwc++ = uch;
655	puch++;
656	cch--;
657	}
658	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
659	{
660	uint16_t uc = (puch[1] & 0x3f)
661	\| ((uint16_t)(uch & 0x1f) << 6);
662	*pwc++ = uc;
663	puch += 2;
664	cch -= 2;
665	}
666	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
667	{
668	uint16_t uc = (puch[2] & 0x3f)
669	\| ((uint16_t)(puch[1] & 0x3f) << 6)
670	\| ((uint16_t)(uch & 0x0f) << 12);
671	*pwc++ = uc;
672	puch += 3;
673	cch -= 3;
674	}
675	else
676	{
677	/* generate surrogate pair */
678	Assert((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)));
679	RTUNICP uc = (puch[3] & 0x3f)
680	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
681	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
682	\| ((RTUNICP)(uch & 0x07) << 18);
683	if (RT_UNLIKELY(cwc < 1))
684	{
685	rc = VERR_BUFFER_OVERFLOW;
686	break;
687	}
688	cwc--;
689
690	uc -= 0x10000;
691	*pwc++ = 0xd800 \| (uc >> 10);
692	*pwc++ = 0xdc00 \| (uc & 0x3ff);
693	puch += 4;
694	cch -= 4;
695	}
696	}
697
698	/* done */
699	*pwc = '\0';
700	return rc;
701	}
702
703
704	RTDECL(int) RTStrToUtf16Tag(const char pszString, PRTUTF16 ppwszString, const char *pszTag)
705	{
706	/*
707	* Validate input.
708	*/
709	Assert(VALID_PTR(ppwszString));
710	Assert(VALID_PTR(pszString));
711	*ppwszString = NULL;
712
713	/*
714	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
715	*/
716	size_t cwc;
717	int rc = rtUtf8CalcUtf16Length(pszString, RTSTR_MAX, &cwc);
718	if (RT_SUCCESS(rc))
719	{
720	/*
721	* Allocate buffer.
722	*/
723	PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag((cwc + 1) * sizeof(RTUTF16), pszTag);
724	if (pwsz)
725	{
726	/*
727	* Encode the UTF-16 string.
728	*/
729	rc = rtUtf8RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc);
730	if (RT_SUCCESS(rc))
731	{
732	*ppwszString = pwsz;
733	return rc;
734	}
735	RTMemFree(pwsz);
736	}
737	else
738	rc = VERR_NO_UTF16_MEMORY;
739	}
740	return rc;
741	}
742	RT_EXPORT_SYMBOL(RTStrToUtf16Tag);
743
744
745	RTDECL(int) RTStrToUtf16ExTag(const char *pszString, size_t cchString,
746	PRTUTF16 ppwsz, size_t cwc, size_t pcwc, const char *pszTag)
747	{
748	/*
749	* Validate input.
750	*/
751	Assert(VALID_PTR(pszString));
752	Assert(VALID_PTR(ppwsz));
753	Assert(!pcwc \|\| VALID_PTR(pcwc));
754
755	/*
756	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
757	*/
758	size_t cwcResult;
759	int rc = rtUtf8CalcUtf16Length(pszString, cchString, &cwcResult);
760	if (RT_SUCCESS(rc))
761	{
762	if (pcwc)
763	*pcwc = cwcResult;
764
765	/*
766	* Check buffer size / Allocate buffer.
767	*/
768	bool fShouldFree;
769	PRTUTF16 pwszResult;
770	if (cwc > 0 && *ppwsz)
771	{
772	fShouldFree = false;
773	if (cwc <= cwcResult)
774	return VERR_BUFFER_OVERFLOW;
775	pwszResult = *ppwsz;
776	}
777	else
778	{
779	*ppwsz = NULL;
780	fShouldFree = true;
781	cwc = RT_MAX(cwcResult + 1, cwc);
782	pwszResult = (PRTUTF16)RTMemAllocTag(cwc * sizeof(RTUTF16), pszTag);
783	}
784	if (pwszResult)
785	{
786	/*
787	* Encode the UTF-16 string.
788	*/
789	rc = rtUtf8RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1);
790	if (RT_SUCCESS(rc))
791	{
792	*ppwsz = pwszResult;
793	return rc;
794	}
795	if (fShouldFree)
796	RTMemFree(pwszResult);
797	}
798	else
799	rc = VERR_NO_UTF16_MEMORY;
800	}
801	return rc;
802	}
803	RT_EXPORT_SYMBOL(RTStrToUtf16ExTag);
804
805
806	RTDECL(size_t) RTStrCalcUtf16Len(const char *psz)
807	{
808	size_t cwc;
809	int rc = rtUtf8CalcUtf16Length(psz, RTSTR_MAX, &cwc);
810	return RT_SUCCESS(rc) ? cwc : 0;
811	}
812	RT_EXPORT_SYMBOL(RTStrCalcUtf16Len);
813
814
815	RTDECL(int) RTStrCalcUtf16LenEx(const char psz, size_t cch, size_t pcwc)
816	{
817	size_t cwc;
818	int rc = rtUtf8CalcUtf16Length(psz, cch, &cwc);
819	if (pcwc)
820	*pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
821	return rc;
822	}
823	RT_EXPORT_SYMBOL(RTStrCalcUtf16LenEx);
824
825
826	/**
827	* Calculates the length of the UTF-8 encoding of a Latin-1 string.
828	*
829	* @returns iprt status code.
830	* @param psz The Latin-1 string.
831	* @param cchIn The max length of the Latin-1 string to consider.
832	* @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
833	*/
834	static int rtLatin1CalcUtf8Length(const char psz, size_t cchIn, size_t pcch)
835	{
836	size_t cch = 0;
837	for (;;)
838	{
839	RTUNICP Cp;
840	int rc = RTLatin1GetCpNEx(&psz, &cchIn, &Cp);
841	if (Cp == 0 \|\| rc == VERR_END_OF_STRING)
842	break;
843	if (RT_FAILURE(rc))
844	return rc;
845	cch += RTStrCpSize(Cp); /* cannot fail */
846	}
847
848	/* done */
849	*pcch = cch;
850	return VINF_SUCCESS;
851	}
852
853
854	/**
855	* Recodes a Latin-1 string as UTF-8.
856	*
857	* @returns iprt status code.
858	* @param psz The Latin-1 string.
859	* @param cchIn The number of characters to process from psz. The recoding
860	* will stop when cch or '\\0' is reached.
861	* @param psz Where to store the UTF-8 string.
862	* @param cch The size of the UTF-8 buffer, excluding the terminator.
863	*/
864	static int rtLatin1RecodeAsUtf8(const char pszIn, size_t cchIn, char psz, size_t cch)
865	{
866	int rc;
867	for (;;)
868	{
869	RTUNICP Cp;
870	size_t cchCp;
871	rc = RTLatin1GetCpNEx(&pszIn, &cchIn, &Cp);
872	if (Cp == 0 \|\| RT_FAILURE(rc))
873	break;
874	cchCp = RTStrCpSize(Cp);
875	if (RT_UNLIKELY(cch < cchCp))
876	{
877	RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
878	rc = VERR_BUFFER_OVERFLOW;
879	break;
880	}
881	cch -= cchCp;
882	psz = RTStrPutCp(psz, Cp);
883	}
884
885	/* done */
886	if (rc == VERR_END_OF_STRING)
887	rc = VINF_SUCCESS;
888	*psz = '\0';
889	return rc;
890	}
891
892
893
894	RTDECL(int) RTLatin1ToUtf8Tag(const char pszString, char ppszString, const char pszTag)
895	{
896	/*
897	* Validate input.
898	*/
899	Assert(VALID_PTR(ppszString));
900	Assert(VALID_PTR(pszString));
901	*ppszString = NULL;
902
903	/*
904	* Calculate the length of the UTF-8 encoding of the Latin-1 string.
905	*/
906	size_t cch;
907	int rc = rtLatin1CalcUtf8Length(pszString, RTSTR_MAX, &cch);
908	if (RT_SUCCESS(rc))
909	{
910	/*
911	* Allocate buffer and recode it.
912	*/
913	char pszResult = (char )RTMemAllocTag(cch + 1, pszTag);
914	if (pszResult)
915	{
916	rc = rtLatin1RecodeAsUtf8(pszString, RTSTR_MAX, pszResult, cch);
917	if (RT_SUCCESS(rc))
918	{
919	*ppszString = pszResult;
920	return rc;
921	}
922
923	RTMemFree(pszResult);
924	}
925	else
926	rc = VERR_NO_STR_MEMORY;
927	}
928	return rc;
929	}
930	RT_EXPORT_SYMBOL(RTLatin1ToUtf8Tag);
931
932
933	RTDECL(int) RTLatin1ToUtf8ExTag(const char pszString, size_t cchString, char ppsz, size_t cch, size_t pcch, const char *pszTag)
934	{
935	/*
936	* Validate input.
937	*/
938	Assert(VALID_PTR(pszString));
939	Assert(VALID_PTR(ppsz));
940	Assert(!pcch \|\| VALID_PTR(pcch));
941
942	/*
943	* Calculate the length of the UTF-8 encoding of the Latin-1 string.
944	*/
945	size_t cchResult;
946	int rc = rtLatin1CalcUtf8Length(pszString, cchString, &cchResult);
947	if (RT_SUCCESS(rc))
948	{
949	if (pcch)
950	*pcch = cchResult;
951
952	/*
953	* Check buffer size / Allocate buffer and recode it.
954	*/
955	bool fShouldFree;
956	char *pszResult;
957	if (cch > 0 && *ppsz)
958	{
959	fShouldFree = false;
960	if (RT_UNLIKELY(cch <= cchResult))
961	return VERR_BUFFER_OVERFLOW;
962	pszResult = *ppsz;
963	}
964	else
965	{
966	*ppsz = NULL;
967	fShouldFree = true;
968	cch = RT_MAX(cch, cchResult + 1);
969	pszResult = (char *)RTStrAllocTag(cch, pszTag);
970	}
971	if (pszResult)
972	{
973	rc = rtLatin1RecodeAsUtf8(pszString, cchString, pszResult, cch - 1);
974	if (RT_SUCCESS(rc))
975	{
976	*ppsz = pszResult;
977	return rc;
978	}
979
980	if (fShouldFree)
981	RTStrFree(pszResult);
982	}
983	else
984	rc = VERR_NO_STR_MEMORY;
985	}
986	return rc;
987	}
988	RT_EXPORT_SYMBOL(RTLatin1ToUtf8ExTag);
989
990
991	RTDECL(size_t) RTLatin1CalcUtf8Len(const char *psz)
992	{
993	size_t cch;
994	int rc = rtLatin1CalcUtf8Length(psz, RTSTR_MAX, &cch);
995	return RT_SUCCESS(rc) ? cch : 0;
996	}
997	RT_EXPORT_SYMBOL(RTLatin1CalcUtf8Len);
998
999
1000	RTDECL(int) RTLatin1CalcUtf8LenEx(const char psz, size_t cchIn, size_t pcch)
1001	{
1002	size_t cch;
1003	int rc = rtLatin1CalcUtf8Length(psz, cchIn, &cch);
1004	if (pcch)
1005	*pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1006	return rc;
1007	}
1008	RT_EXPORT_SYMBOL(RTLatin1CalcUtf8LenEx);
1009
1010
1011	/**
1012	* Calculates the Latin-1 length of a string, validating the encoding while
1013	* doing so.
1014	*
1015	* @returns IPRT status code.
1016	* @param psz Pointer to the UTF-8 string.
1017	* @param cchIn The max length of the string. (btw cch = cb)
1018	* Use RTSTR_MAX if all of the string is to be examined.
1019	* @param pcch Where to store the length of the Latin-1 string in bytes.
1020	*/
1021	static int rtUtf8CalcLatin1Length(const char psz, size_t cchIn, size_t pcch)
1022	{
1023	size_t cch = 0;
1024	for (;;)
1025	{
1026	RTUNICP Cp;
1027	size_t cchCp;
1028	int rc = RTStrGetCpNEx(&psz, &cchIn, &Cp);
1029	if (Cp == 0 \|\| rc == VERR_END_OF_STRING)
1030	break;
1031	if (RT_FAILURE(rc))
1032	return rc;
1033	cchCp = RTLatin1CpSize(Cp);
1034	if (cchCp == 0)
1035	return VERR_NO_TRANSLATION;
1036	cch += cchCp;
1037	}
1038
1039	/* done */
1040	*pcch = cch;
1041	return VINF_SUCCESS;
1042	}
1043
1044
1045	/**
1046	* Recodes a valid UTF-8 string as Latin-1.
1047	*
1048	* Since we know the input is valid, we do not perform encoding or length checks.
1049	*
1050	* @returns iprt status code.
1051	* @param pszIn The UTF-8 string to recode. This is a valid encoding.
1052	* @param cchIn The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
1053	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
1054	* @param psz Where to store the Latin-1 string.
1055	* @param cch The number of characters the pszOut buffer can hold, excluding the terminator ('\\0').
1056	*/
1057	static int rtUtf8RecodeAsLatin1(const char pszIn, size_t cchIn, char psz, size_t cch)
1058	{
1059	int rc;
1060	for (;;)
1061	{
1062	RTUNICP Cp;
1063	size_t cchCp;
1064	rc = RTStrGetCpNEx(&pszIn, &cchIn, &Cp);
1065	if (Cp == 0 \|\| RT_FAILURE(rc))
1066	break;
1067	cchCp = RTLatin1CpSize(Cp);
1068	if (RT_UNLIKELY(cch < cchCp))
1069	{
1070	RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
1071	rc = VERR_BUFFER_OVERFLOW;
1072	break;
1073	}
1074	cch -= cchCp;
1075	psz = RTLatin1PutCp(psz, Cp);
1076	}
1077
1078	/* done */
1079	if (rc == VERR_END_OF_STRING)
1080	rc = VINF_SUCCESS;
1081	*psz = '\0';
1082	return rc;
1083	}
1084
1085
1086
1087	RTDECL(int) RTStrToLatin1Tag(const char pszString, char ppszString, const char pszTag)
1088	{
1089	/*
1090	* Validate input.
1091	*/
1092	Assert(VALID_PTR(ppszString));
1093	Assert(VALID_PTR(pszString));
1094	*ppszString = NULL;
1095
1096	/*
1097	* Validate the UTF-8 input and calculate the length of the Latin-1 string.
1098	*/
1099	size_t cch;
1100	int rc = rtUtf8CalcLatin1Length(pszString, RTSTR_MAX, &cch);
1101	if (RT_SUCCESS(rc))
1102	{
1103	/*
1104	* Allocate buffer.
1105	*/
1106	char psz = (char )RTMemAllocTag(cch + 1, pszTag);
1107	if (psz)
1108	{
1109	/*
1110	* Encode the UTF-16 string.
1111	*/
1112	rc = rtUtf8RecodeAsLatin1(pszString, RTSTR_MAX, psz, cch);
1113	if (RT_SUCCESS(rc))
1114	{
1115	*ppszString = psz;
1116	return rc;
1117	}
1118	RTMemFree(psz);
1119	}
1120	else
1121	rc = VERR_NO_STR_MEMORY;
1122	}
1123	return rc;
1124	}
1125	RT_EXPORT_SYMBOL(RTStrToLatin1Tag);
1126
1127
1128	RTDECL(int) RTStrToLatin1ExTag(const char *pszString, size_t cchString,
1129	char *ppsz, size_t cch, size_t pcch, const char *pszTag)
1130	{
1131	/*
1132	* Validate input.
1133	*/
1134	Assert(VALID_PTR(pszString));
1135	Assert(VALID_PTR(ppsz));
1136	Assert(!pcch \|\| VALID_PTR(pcch));
1137
1138	/*
1139	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
1140	*/
1141	size_t cchResult;
1142	int rc = rtUtf8CalcLatin1Length(pszString, cchString, &cchResult);
1143	if (RT_SUCCESS(rc))
1144	{
1145	if (pcch)
1146	*pcch = cchResult;
1147
1148	/*
1149	* Check buffer size / Allocate buffer.
1150	*/
1151	bool fShouldFree;
1152	char *pszResult;
1153	if (cch > 0 && *ppsz)
1154	{
1155	fShouldFree = false;
1156	if (cch <= cchResult)
1157	return VERR_BUFFER_OVERFLOW;
1158	pszResult = *ppsz;
1159	}
1160	else
1161	{
1162	*ppsz = NULL;
1163	fShouldFree = true;
1164	cch = RT_MAX(cchResult + 1, cch);
1165	pszResult = (char *)RTMemAllocTag(cch, pszTag);
1166	}
1167	if (pszResult)
1168	{
1169	/*
1170	* Encode the Latin-1 string.
1171	*/
1172	rc = rtUtf8RecodeAsLatin1(pszString, cchString, pszResult, cch - 1);
1173	if (RT_SUCCESS(rc))
1174	{
1175	*ppsz = pszResult;
1176	return rc;
1177	}
1178	if (fShouldFree)
1179	RTMemFree(pszResult);
1180	}
1181	else
1182	rc = VERR_NO_STR_MEMORY;
1183	}
1184	return rc;
1185	}
1186	RT_EXPORT_SYMBOL(RTStrToLatin1Tag);
1187
1188
1189	RTDECL(size_t) RTStrCalcLatin1Len(const char *psz)
1190	{
1191	size_t cch;
1192	int rc = rtUtf8CalcLatin1Length(psz, RTSTR_MAX, &cch);
1193	return RT_SUCCESS(rc) ? cch : 0;
1194	}
1195	RT_EXPORT_SYMBOL(RTStrCalcLatin1Len);
1196
1197
1198	RTDECL(int) RTStrCalcLatin1LenEx(const char psz, size_t cchIn, size_t pcch)
1199	{
1200	size_t cch;
1201	int rc = rtUtf8CalcLatin1Length(psz, cchIn, &cch);
1202	if (pcch)
1203	*pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1204	return rc;
1205	}
1206	RT_EXPORT_SYMBOL(RTStrCalcLatin1LenEx);
1207
1208
1209	/**
1210	* Handle invalid encodings passed to RTStrGetCp() and RTStrGetCpEx().
1211	* @returns rc
1212	* @param ppsz The pointer to the string position point.
1213	* @param pCp Where to store RTUNICP_INVALID.
1214	* @param rc The iprt error code.
1215	*/
1216	static int rtStrGetCpExFailure(const char **ppsz, PRTUNICP pCp, int rc)
1217	{
1218	/*
1219	* Try find a valid encoding.
1220	*/
1221	(ppsz)++; /* @todo code this! */
1222	*pCp = RTUNICP_INVALID;
1223	return rc;
1224	}
1225
1226
1227	RTDECL(RTUNICP) RTStrGetCpInternal(const char *psz)
1228	{
1229	RTUNICP Cp;
1230	RTStrGetCpExInternal(&psz, &Cp);
1231	return Cp;
1232	}
1233	RT_EXPORT_SYMBOL(RTStrGetCpInternal);
1234
1235
1236	RTDECL(int) RTStrGetCpExInternal(const char **ppsz, PRTUNICP pCp)
1237	{
1238	const unsigned char puch = (const unsigned char )*ppsz;
1239	const unsigned char uch = *puch;
1240	RTUNICP uc;
1241
1242	/* ASCII ? */
1243	if (!(uch & RT_BIT(7)))
1244	{
1245	uc = uch;
1246	puch++;
1247	}
1248	else if (uch & RT_BIT(6))
1249	{
1250	/* figure the length and validate the first octet. */
1251	/** @todo RT_USE_RTC_3629 */
1252	unsigned cb;
1253	if (!(uch & RT_BIT(5)))
1254	cb = 2;
1255	else if (!(uch & RT_BIT(4)))
1256	cb = 3;
1257	else if (!(uch & RT_BIT(3)))
1258	cb = 4;
1259	else if (!(uch & RT_BIT(2)))
1260	cb = 5;
1261	else if (!(uch & RT_BIT(1)))
1262	cb = 6;
1263	else
1264	{
1265	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1266	return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
1267	}
1268
1269	/* validate the rest */
1270	switch (cb)
1271	{
1272	case 6:
1273	RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1274	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1275	case 5:
1276	RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1277	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1278	case 4:
1279	RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1280	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1281	case 3:
1282	RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1283	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1284	case 2:
1285	RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1286	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1287	break;
1288	}
1289
1290	/* get and validate the code point. */
1291	switch (cb)
1292	{
1293	case 6:
1294	uc = (puch[5] & 0x3f)
1295	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
1296	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
1297	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
1298	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
1299	\| ((RTUNICP)(uch & 0x01) << 30);
1300	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1301	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1302	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1303	break;
1304	case 5:
1305	uc = (puch[4] & 0x3f)
1306	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
1307	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
1308	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
1309	\| ((RTUNICP)(uch & 0x03) << 24);
1310	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1311	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1312	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1313	break;
1314	case 4:
1315	uc = (puch[3] & 0x3f)
1316	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
1317	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
1318	\| ((RTUNICP)(uch & 0x07) << 18);
1319	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1320	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1321	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1322	break;
1323	case 3:
1324	uc = (puch[2] & 0x3f)
1325	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
1326	\| ((RTUNICP)(uch & 0x0f) << 12);
1327	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1328	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1329	rtStrGetCpExFailure(ppsz, pCp, uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1330	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
1331	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1332	rtStrGetCpExFailure(ppsz, pCp, VERR_CODE_POINT_SURROGATE));
1333	break;
1334	case 2:
1335	uc = (puch[1] & 0x3f)
1336	\| ((RTUNICP)(uch & 0x1f) << 6);
1337	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1338	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1339	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1340	break;
1341	default: /* impossible, but GCC is bitching. */
1342	uc = RTUNICP_INVALID;
1343	break;
1344	}
1345	puch += cb;
1346	}
1347	else
1348	{
1349	/* 6th bit is always set. */
1350	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1351	return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
1352	}
1353	*pCp = uc;
1354	ppsz = (const char )puch;
1355	return VINF_SUCCESS;
1356	}
1357	RT_EXPORT_SYMBOL(RTStrGetCpExInternal);
1358
1359
1360	/**
1361	* Handle invalid encodings passed to RTStrGetCpNEx().
1362	* @returns rc
1363	* @param ppsz The pointer to the string position point.
1364	* @param pcch Pointer to the string length.
1365	* @param pCp Where to store RTUNICP_INVALID.
1366	* @param rc The iprt error code.
1367	*/
1368	static int rtStrGetCpNExFailure(const char *ppsz, size_t pcch, PRTUNICP pCp, int rc)
1369	{
1370	/*
1371	* Try find a valid encoding.
1372	*/
1373	(ppsz)++; /* @todo code this! */
1374	(*pcch)--;
1375	*pCp = RTUNICP_INVALID;
1376	return rc;
1377	}
1378
1379
1380	RTDECL(int) RTStrGetCpNExInternal(const char *ppsz, size_t pcch, PRTUNICP pCp)
1381	{
1382	const unsigned char puch = (const unsigned char )*ppsz;
1383	const unsigned char uch = *puch;
1384	size_t cch = *pcch;
1385	RTUNICP uc;
1386
1387	if (cch == 0)
1388	{
1389	*pCp = RTUNICP_INVALID;
1390	return VERR_END_OF_STRING;
1391	}
1392
1393	/* ASCII ? */
1394	if (!(uch & RT_BIT(7)))
1395	{
1396	uc = uch;
1397	puch++;
1398	cch--;
1399	}
1400	else if (uch & RT_BIT(6))
1401	{
1402	/* figure the length and validate the first octet. */
1403	/** @todo RT_USE_RTC_3629 */
1404	unsigned cb;
1405	if (!(uch & RT_BIT(5)))
1406	cb = 2;
1407	else if (!(uch & RT_BIT(4)))
1408	cb = 3;
1409	else if (!(uch & RT_BIT(3)))
1410	cb = 4;
1411	else if (!(uch & RT_BIT(2)))
1412	cb = 5;
1413	else if (!(uch & RT_BIT(1)))
1414	cb = 6;
1415	else
1416	{
1417	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1418	return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1419	}
1420
1421	if (cb > cch)
1422	return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1423
1424	/* validate the rest */
1425	switch (cb)
1426	{
1427	case 6:
1428	RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1429	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1430	case 5:
1431	RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1432	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1433	case 4:
1434	RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1435	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1436	case 3:
1437	RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1438	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1439	case 2:
1440	RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1441	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1442	break;
1443	}
1444
1445	/* get and validate the code point. */
1446	switch (cb)
1447	{
1448	case 6:
1449	uc = (puch[5] & 0x3f)
1450	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
1451	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
1452	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
1453	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
1454	\| ((RTUNICP)(uch & 0x01) << 30);
1455	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1456	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1457	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1458	break;
1459	case 5:
1460	uc = (puch[4] & 0x3f)
1461	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
1462	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
1463	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
1464	\| ((RTUNICP)(uch & 0x03) << 24);
1465	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1466	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1467	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1468	break;
1469	case 4:
1470	uc = (puch[3] & 0x3f)
1471	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
1472	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
1473	\| ((RTUNICP)(uch & 0x07) << 18);
1474	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1475	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1476	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1477	break;
1478	case 3:
1479	uc = (puch[2] & 0x3f)
1480	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
1481	\| ((RTUNICP)(uch & 0x0f) << 12);
1482	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1483	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1484	rtStrGetCpNExFailure(ppsz, pcch, pCp, uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1485	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
1486	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1487	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_CODE_POINT_SURROGATE));
1488	break;
1489	case 2:
1490	uc = (puch[1] & 0x3f)
1491	\| ((RTUNICP)(uch & 0x1f) << 6);
1492	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1493	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1494	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1495	break;
1496	default: /* impossible, but GCC is bitching. */
1497	uc = RTUNICP_INVALID;
1498	break;
1499	}
1500	puch += cb;
1501	cch -= cb;
1502	}
1503	else
1504	{
1505	/* 6th bit is always set. */
1506	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1507	return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1508	}
1509	*pCp = uc;
1510	ppsz = (const char )puch;
1511	(*pcch) = cch;
1512	return VINF_SUCCESS;
1513	}
1514	RT_EXPORT_SYMBOL(RTStrGetCpNExInternal);
1515
1516
1517	RTDECL(char ) RTStrPutCpInternal(char psz, RTUNICP uc)
1518	{
1519	unsigned char puch = (unsigned char )psz;
1520	if (uc < 0x80)
1521	*puch++ = (unsigned char )uc;
1522	else if (uc < 0x00000800)
1523	{
1524	*puch++ = 0xc0 \| (uc >> 6);
1525	*puch++ = 0x80 \| (uc & 0x3f);
1526	}
1527	else if (uc < 0x00010000)
1528	{
1529	/** @todo RT_USE_RTC_3629 */
1530	if ( uc < 0x0000d8000
1531	\|\| ( uc > 0x0000dfff
1532	&& uc < 0x0000fffe))
1533	{
1534	*puch++ = 0xe0 \| (uc >> 12);
1535	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1536	*puch++ = 0x80 \| (uc & 0x3f);
1537	}
1538	else
1539	{
1540	AssertMsgFailed(("Invalid code point U+%05x!\n", uc));
1541	*puch++ = 0x7f;
1542	}
1543	}
1544	/** @todo RT_USE_RTC_3629 */
1545	else if (uc < 0x00200000)
1546	{
1547	*puch++ = 0xf0 \| (uc >> 18);
1548	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
1549	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1550	*puch++ = 0x80 \| (uc & 0x3f);
1551	}
1552	else if (uc < 0x04000000)
1553	{
1554	*puch++ = 0xf8 \| (uc >> 24);
1555	*puch++ = 0x80 \| ((uc >> 18) & 0x3f);
1556	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
1557	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1558	*puch++ = 0x80 \| (uc & 0x3f);
1559	}
1560	else if (uc <= 0x7fffffff)
1561	{
1562	*puch++ = 0xfc \| (uc >> 30);
1563	*puch++ = 0x80 \| ((uc >> 24) & 0x3f);
1564	*puch++ = 0x80 \| ((uc >> 18) & 0x3f);
1565	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
1566	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1567	*puch++ = 0x80 \| (uc & 0x3f);
1568	}
1569	else
1570	{
1571	AssertMsgFailed(("Invalid code point U+%08x!\n", uc));
1572	*puch++ = 0x7f;
1573	}
1574
1575	return (char *)puch;
1576	}
1577	RT_EXPORT_SYMBOL(RTStrPutCpInternal);
1578
1579
1580	RTDECL(char ) RTStrPrevCp(const char pszStart, const char *psz)
1581	{
1582	if (pszStart < psz)
1583	{
1584	/* simple char? */
1585	const unsigned char puch = (const unsigned char )psz;
1586	unsigned uch = *--puch;
1587	if (!(uch & RT_BIT(7)))
1588	return (char *)puch;
1589	RTStrAssertMsgReturn(!(uch & RT_BIT(6)), ("uch=%#x\n", uch), (char *)pszStart);
1590
1591	/* two or more. */
1592	uint32_t uMask = 0xffffffc0;
1593	while ( (const unsigned char *)pszStart < puch
1594	&& !(uMask & 1))
1595	{
1596	uch = *--puch;
1597	if ((uch & 0xc0) != 0x80)
1598	{
1599	RTStrAssertMsgReturn((uch & (uMask >> 1)) == (uMask & 0xff),
1600	("Invalid UTF-8 encoding: %.Rhxs puch=%p psz=%p\n", psz - (char )puch, puch, psz),
1601	(char *)pszStart);
1602	return (char *)puch;
1603	}
1604	uMask >>= 1;
1605	}
1606	RTStrAssertMsgFailed(("Invalid UTF-8 encoding: %.Rhxs puch=%p psz=%p\n", psz - (char )puch, puch, psz));
1607	}
1608	return (char *)pszStart;
1609	}
1610	RT_EXPORT_SYMBOL(RTStrPrevCp);
1611

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/VBox/Runtime/common/string/utf-8.cpp@ 40071

Download in other formats: