VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/string/utf-8.cpp@ 40071

Last change on this file since 40071 was 40071, checked in by vboxsync, 13 years ago

Runtime/strings: add Utf-8 and Utf-16 sanitising to a white list of characters.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 54.4 KB
Line 
1/* $Id: utf-8.cpp 40071 2012-02-10 21:35:27Z vboxsync $ */
2/** @file
3 * IPRT - UTF-8 Decoding.
4 */
5
6/*
7 * Copyright (C) 2006-2010 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.215389.xyz. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27
28/*******************************************************************************
29* Header Files *
30*******************************************************************************/
31#include <iprt/string.h>
32#include "internal/iprt.h"
33
34#include <iprt/uni.h>
35#include <iprt/alloc.h>
36#include <iprt/assert.h>
37#include <iprt/err.h>
38#include "internal/string.h"
39
40
41
42/**
43 * Get get length in code points of a UTF-8 encoded string.
44 * The string is validated while doing this.
45 *
46 * @returns IPRT status code.
47 * @param psz Pointer to the UTF-8 string.
48 * @param cch The max length of the string. (btw cch = cb)
49 * Use RTSTR_MAX if all of the string is to be examined.
50 * @param pcuc Where to store the length in unicode code points.
51 * @param pcchActual Where to store the actual size of the UTF-8 string
52 * on success (cch = cb again). Optional.
53 */
54DECLHIDDEN(int) rtUtf8Length(const char *psz, size_t cch, size_t *pcuc, size_t *pcchActual)
55{
56 const unsigned char *puch = (const unsigned char *)psz;
57 size_t cCodePoints = 0;
58 while (cch > 0)
59 {
60 const unsigned char uch = *puch;
61 if (!uch)
62 break;
63 if (uch & RT_BIT(7))
64 {
65 /* figure sequence length and validate the first byte */
66/** @todo RT_USE_RTC_3629 */
67 unsigned cb;
68 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
69 cb = 2;
70 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
71 cb = 3;
72 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
73 cb = 4;
74 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
75 cb = 5;
76 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
77 cb = 6;
78 else
79 {
80 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
81 return VERR_INVALID_UTF8_ENCODING;
82 }
83
84 /* check length */
85 if (cb > cch)
86 {
87 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
88 return VERR_INVALID_UTF8_ENCODING;
89 }
90
91 /* validate the rest */
92 switch (cb)
93 {
94 case 6:
95 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
96 case 5:
97 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
98 case 4:
99 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
100 case 3:
101 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
102 case 2:
103 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
104 break;
105 }
106
107 /* validate the code point. */
108 RTUNICP uc;
109 switch (cb)
110 {
111 case 6:
112 uc = (puch[5] & 0x3f)
113 | ((RTUNICP)(puch[4] & 0x3f) << 6)
114 | ((RTUNICP)(puch[3] & 0x3f) << 12)
115 | ((RTUNICP)(puch[2] & 0x3f) << 18)
116 | ((RTUNICP)(puch[1] & 0x3f) << 24)
117 | ((RTUNICP)(uch & 0x01) << 30);
118 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
119 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
120 break;
121 case 5:
122 uc = (puch[4] & 0x3f)
123 | ((RTUNICP)(puch[3] & 0x3f) << 6)
124 | ((RTUNICP)(puch[2] & 0x3f) << 12)
125 | ((RTUNICP)(puch[1] & 0x3f) << 18)
126 | ((RTUNICP)(uch & 0x03) << 24);
127 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
128 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
129 break;
130 case 4:
131 uc = (puch[3] & 0x3f)
132 | ((RTUNICP)(puch[2] & 0x3f) << 6)
133 | ((RTUNICP)(puch[1] & 0x3f) << 12)
134 | ((RTUNICP)(uch & 0x07) << 18);
135 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
136 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
137 break;
138 case 3:
139 uc = (puch[2] & 0x3f)
140 | ((RTUNICP)(puch[1] & 0x3f) << 6)
141 | ((RTUNICP)(uch & 0x0f) << 12);
142 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
143 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
144 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
145 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
146 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
147 break;
148 case 2:
149 uc = (puch[1] & 0x3f)
150 | ((RTUNICP)(uch & 0x1f) << 6);
151 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
152 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
153 break;
154 }
155
156 /* advance */
157 cch -= cb;
158 puch += cb;
159 }
160 else
161 {
162 /* one ASCII byte */
163 puch++;
164 cch--;
165 }
166 cCodePoints++;
167 }
168
169 /* done */
170 *pcuc = cCodePoints;
171 if (pcchActual)
172 *pcchActual = puch - (unsigned char const *)psz;
173 return VINF_SUCCESS;
174}
175
176
177/**
178 * Decodes and UTF-8 string into an array of unicode code point.
179 *
180 * Since we know the input is valid, we do *not* perform encoding or length checks.
181 *
182 * @returns iprt status code.
183 * @param psz The UTF-8 string to recode. This is a valid encoding.
184 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
185 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
186 * @param paCps Where to store the code points array.
187 * @param cCps The number of RTUNICP items the paCps buffer can hold, excluding the terminator ('\\0').
188 */
189static int rtUtf8Decode(const char *psz, size_t cch, PRTUNICP paCps, size_t cCps)
190{
191 int rc = VINF_SUCCESS;
192 const unsigned char *puch = (const unsigned char *)psz;
193 PRTUNICP pCp = paCps;
194 while (cch > 0)
195 {
196 /* read the next char and check for terminator. */
197 const unsigned char uch = *puch;
198 if (!uch)
199 break;
200
201 /* check for output overflow */
202 if (RT_UNLIKELY(cCps < 1))
203 {
204 rc = VERR_BUFFER_OVERFLOW;
205 break;
206 }
207 cCps--;
208
209 /* decode and recode the code point */
210 if (!(uch & RT_BIT(7)))
211 {
212 *pCp++ = uch;
213 puch++;
214 cch--;
215 }
216#ifdef RT_STRICT
217 else if (!(uch & RT_BIT(6)))
218 AssertMsgFailed(("Internal error!\n"));
219#endif
220 else if (!(uch & RT_BIT(5)))
221 {
222 *pCp++ = (puch[1] & 0x3f)
223 | ((uint16_t)(uch & 0x1f) << 6);
224 puch += 2;
225 cch -= 2;
226 }
227 else if (!(uch & RT_BIT(4)))
228 {
229 *pCp++ = (puch[2] & 0x3f)
230 | ((uint16_t)(puch[1] & 0x3f) << 6)
231 | ((uint16_t)(uch & 0x0f) << 12);
232 puch += 3;
233 cch -= 3;
234 }
235 else if (!(uch & RT_BIT(3)))
236 {
237 *pCp++ = (puch[3] & 0x3f)
238 | ((RTUNICP)(puch[2] & 0x3f) << 6)
239 | ((RTUNICP)(puch[1] & 0x3f) << 12)
240 | ((RTUNICP)(uch & 0x07) << 18);
241 puch += 4;
242 cch -= 4;
243 }
244 else if (!(uch & RT_BIT(2)))
245 {
246 *pCp++ = (puch[4] & 0x3f)
247 | ((RTUNICP)(puch[3] & 0x3f) << 6)
248 | ((RTUNICP)(puch[2] & 0x3f) << 12)
249 | ((RTUNICP)(puch[1] & 0x3f) << 18)
250 | ((RTUNICP)(uch & 0x03) << 24);
251 puch += 5;
252 cch -= 6;
253 }
254 else
255 {
256 Assert(!(uch & RT_BIT(1)));
257 *pCp++ = (puch[5] & 0x3f)
258 | ((RTUNICP)(puch[4] & 0x3f) << 6)
259 | ((RTUNICP)(puch[3] & 0x3f) << 12)
260 | ((RTUNICP)(puch[2] & 0x3f) << 18)
261 | ((RTUNICP)(puch[1] & 0x3f) << 24)
262 | ((RTUNICP)(uch & 0x01) << 30);
263 puch += 6;
264 cch -= 6;
265 }
266 }
267
268 /* done */
269 *pCp = 0;
270 return rc;
271}
272
273
274RTDECL(size_t) RTStrUniLen(const char *psz)
275{
276 size_t cCodePoints;
277 int rc = rtUtf8Length(psz, RTSTR_MAX, &cCodePoints, NULL);
278 return RT_SUCCESS(rc) ? cCodePoints : 0;
279}
280RT_EXPORT_SYMBOL(RTStrUniLen);
281
282
283RTDECL(int) RTStrUniLenEx(const char *psz, size_t cch, size_t *pcCps)
284{
285 size_t cCodePoints;
286 int rc = rtUtf8Length(psz, cch, &cCodePoints, NULL);
287 if (pcCps)
288 *pcCps = RT_SUCCESS(rc) ? cCodePoints : 0;
289 return rc;
290}
291RT_EXPORT_SYMBOL(RTStrUniLenEx);
292
293
294RTDECL(int) RTStrValidateEncoding(const char *psz)
295{
296 return RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
297}
298RT_EXPORT_SYMBOL(RTStrValidateEncoding);
299
300
301RTDECL(int) RTStrValidateEncodingEx(const char *psz, size_t cch, uint32_t fFlags)
302{
303 AssertReturn(!(fFlags & ~(RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)), VERR_INVALID_PARAMETER);
304 AssertPtr(psz);
305
306 /*
307 * Use rtUtf8Length for the job.
308 */
309 size_t cchActual;
310 size_t cCpsIgnored;
311 int rc = rtUtf8Length(psz, cch, &cCpsIgnored, &cchActual);
312 if (RT_SUCCESS(rc))
313 {
314 if ( (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
315 && cchActual >= cch)
316 rc = VERR_BUFFER_OVERFLOW;
317 }
318 return rc;
319}
320RT_EXPORT_SYMBOL(RTStrValidateEncodingEx);
321
322
323RTDECL(bool) RTStrIsValidEncoding(const char *psz)
324{
325 int rc = RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
326 return RT_SUCCESS(rc);
327}
328RT_EXPORT_SYMBOL(RTStrIsValidEncoding);
329
330
331RTDECL(size_t) RTStrPurgeEncoding(char *psz)
332{
333 size_t cErrors = 0;
334 for (;;)
335 {
336 RTUNICP Cp;
337 int rc = RTStrGetCpEx((const char **)&psz, &Cp);
338 if (RT_SUCCESS(rc))
339 {
340 if (!Cp)
341 break;
342 }
343 else
344 {
345 psz[-1] = '?';
346 cErrors++;
347 }
348 }
349 return cErrors;
350}
351RT_EXPORT_SYMBOL(RTStrPurgeEncoding);
352
353
354ssize_t RTStrPurgeComplementSet(char *psz, PCRTUNICP puszValidSet, char chReplacement)
355{
356 size_t cReplacements = 0;
357 AssertReturn(chReplacement && (unsigned)chReplacement < 128, -1);
358 if (RT_FAILURE(RTStrValidateEncoding(psz)))
359 return -1;
360 for (;;)
361 {
362 RTUNICP Cp;
363 PCRTUNICP pCp;
364 char *pszOld = psz;
365 RTStrGetCpEx((const char **)&psz, &Cp);
366 if (!Cp)
367 break;
368 for (pCp = puszValidSet; ; ++pCp)
369 if (!*pCp || *pCp == Cp)
370 break;
371 if (!*pCp)
372 {
373 for (; pszOld != psz; ++pszOld)
374 *pszOld = chReplacement;
375 ++cReplacements;
376 }
377 }
378 return cReplacements;
379}
380RT_EXPORT_SYMBOL(RTStrPurgeComplementSet);
381
382
383RTDECL(int) RTStrToUni(const char *pszString, PRTUNICP *ppaCps)
384{
385 /*
386 * Validate input.
387 */
388 Assert(VALID_PTR(pszString));
389 Assert(VALID_PTR(ppaCps));
390 *ppaCps = NULL;
391
392 /*
393 * Validate the UTF-8 input and count its code points.
394 */
395 size_t cCps;
396 int rc = rtUtf8Length(pszString, RTSTR_MAX, &cCps, NULL);
397 if (RT_SUCCESS(rc))
398 {
399 /*
400 * Allocate buffer.
401 */
402 PRTUNICP paCps = (PRTUNICP)RTMemAlloc((cCps + 1) * sizeof(RTUNICP));
403 if (paCps)
404 {
405 /*
406 * Decode the string.
407 */
408 rc = rtUtf8Decode(pszString, RTSTR_MAX, paCps, cCps);
409 if (RT_SUCCESS(rc))
410 {
411 *ppaCps = paCps;
412 return rc;
413 }
414 RTMemFree(paCps);
415 }
416 else
417 rc = VERR_NO_CODE_POINT_MEMORY;
418 }
419 return rc;
420}
421RT_EXPORT_SYMBOL(RTStrToUni);
422
423
424RTDECL(int) RTStrToUniEx(const char *pszString, size_t cchString, PRTUNICP *ppaCps, size_t cCps, size_t *pcCps)
425{
426 /*
427 * Validate input.
428 */
429 Assert(VALID_PTR(pszString));
430 Assert(VALID_PTR(ppaCps));
431 Assert(!pcCps || VALID_PTR(pcCps));
432
433 /*
434 * Validate the UTF-8 input and count the code points.
435 */
436 size_t cCpsResult;
437 int rc = rtUtf8Length(pszString, cchString, &cCpsResult, NULL);
438 if (RT_SUCCESS(rc))
439 {
440 if (pcCps)
441 *pcCps = cCpsResult;
442
443 /*
444 * Check buffer size / Allocate buffer.
445 */
446 bool fShouldFree;
447 PRTUNICP paCpsResult;
448 if (cCps > 0 && *ppaCps)
449 {
450 fShouldFree = false;
451 if (cCps <= cCpsResult)
452 return VERR_BUFFER_OVERFLOW;
453 paCpsResult = *ppaCps;
454 }
455 else
456 {
457 *ppaCps = NULL;
458 fShouldFree = true;
459 cCps = RT_MAX(cCpsResult + 1, cCps);
460 paCpsResult = (PRTUNICP)RTMemAlloc(cCps * sizeof(RTUNICP));
461 }
462 if (paCpsResult)
463 {
464 /*
465 * Encode the UTF-16 string.
466 */
467 rc = rtUtf8Decode(pszString, cchString, paCpsResult, cCps - 1);
468 if (RT_SUCCESS(rc))
469 {
470 *ppaCps = paCpsResult;
471 return rc;
472 }
473 if (fShouldFree)
474 RTMemFree(paCpsResult);
475 }
476 else
477 rc = VERR_NO_CODE_POINT_MEMORY;
478 }
479 return rc;
480}
481RT_EXPORT_SYMBOL(RTStrToUniEx);
482
483
484/**
485 * Calculates the UTF-16 length of a string, validating the encoding while doing so.
486 *
487 * @returns IPRT status code.
488 * @param psz Pointer to the UTF-8 string.
489 * @param cch The max length of the string. (btw cch = cb)
490 * Use RTSTR_MAX if all of the string is to be examined.
491 * @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters.
492 */
493static int rtUtf8CalcUtf16Length(const char *psz, size_t cch, size_t *pcwc)
494{
495 const unsigned char *puch = (const unsigned char *)psz;
496 size_t cwc = 0;
497 while (cch > 0)
498 {
499 const unsigned char uch = *puch;
500 if (!uch)
501 break;
502 if (!(uch & RT_BIT(7)))
503 {
504 /* one ASCII byte */
505 cwc++;
506 puch++;
507 cch--;
508 }
509 else
510 {
511 /* figure sequence length and validate the first byte */
512 unsigned cb;
513 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
514 cb = 2;
515 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
516 cb = 3;
517 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
518 cb = 4;
519 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
520 cb = 5;
521 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
522 cb = 6;
523 else
524 {
525 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
526 return VERR_INVALID_UTF8_ENCODING;
527 }
528
529 /* check length */
530 if (cb > cch)
531 {
532 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
533 return VERR_INVALID_UTF8_ENCODING;
534 }
535
536 /* validate the rest */
537 switch (cb)
538 {
539 case 6:
540 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
541 case 5:
542 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
543 case 4:
544 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
545 case 3:
546 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
547 case 2:
548 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
549 break;
550 }
551
552 /* validate the code point. */
553 RTUNICP uc;
554 switch (cb)
555 {
556 case 6:
557 uc = (puch[5] & 0x3f)
558 | ((RTUNICP)(puch[4] & 0x3f) << 6)
559 | ((RTUNICP)(puch[3] & 0x3f) << 12)
560 | ((RTUNICP)(puch[2] & 0x3f) << 18)
561 | ((RTUNICP)(puch[1] & 0x3f) << 24)
562 | ((RTUNICP)(uch & 0x01) << 30);
563 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
564 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
565 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
566 return VERR_CANT_RECODE_AS_UTF16;
567 case 5:
568 uc = (puch[4] & 0x3f)
569 | ((RTUNICP)(puch[3] & 0x3f) << 6)
570 | ((RTUNICP)(puch[2] & 0x3f) << 12)
571 | ((RTUNICP)(puch[1] & 0x3f) << 18)
572 | ((RTUNICP)(uch & 0x03) << 24);
573 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
574 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
575 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
576 return VERR_CANT_RECODE_AS_UTF16;
577 case 4:
578 uc = (puch[3] & 0x3f)
579 | ((RTUNICP)(puch[2] & 0x3f) << 6)
580 | ((RTUNICP)(puch[1] & 0x3f) << 12)
581 | ((RTUNICP)(uch & 0x07) << 18);
582 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
583 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
584 RTStrAssertMsgReturn(uc <= 0x0010ffff,
585 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
586 cwc++;
587 break;
588 case 3:
589 uc = (puch[2] & 0x3f)
590 | ((RTUNICP)(puch[1] & 0x3f) << 6)
591 | ((RTUNICP)(uch & 0x0f) << 12);
592 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
593 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
594 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
595 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
596 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
597 break;
598 case 2:
599 uc = (puch[1] & 0x3f)
600 | ((RTUNICP)(uch & 0x1f) << 6);
601 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
602 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
603 break;
604 }
605
606 /* advance */
607 cch -= cb;
608 puch += cb;
609 cwc++;
610 }
611 }
612
613 /* done */
614 *pcwc = cwc;
615 return VINF_SUCCESS;
616}
617
618
619/**
620 * Recodes a valid UTF-8 string as UTF-16.
621 *
622 * Since we know the input is valid, we do *not* perform encoding or length checks.
623 *
624 * @returns iprt status code.
625 * @param psz The UTF-8 string to recode. This is a valid encoding.
626 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
627 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
628 * @param pwsz Where to store the UTF-16 string.
629 * @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
630 */
631static int rtUtf8RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc)
632{
633 int rc = VINF_SUCCESS;
634 const unsigned char *puch = (const unsigned char *)psz;
635 PRTUTF16 pwc = pwsz;
636 while (cch > 0)
637 {
638 /* read the next char and check for terminator. */
639 const unsigned char uch = *puch;
640 if (!uch)
641 break;
642
643 /* check for output overflow */
644 if (RT_UNLIKELY(cwc < 1))
645 {
646 rc = VERR_BUFFER_OVERFLOW;
647 break;
648 }
649 cwc--;
650
651 /* decode and recode the code point */
652 if (!(uch & RT_BIT(7)))
653 {
654 *pwc++ = uch;
655 puch++;
656 cch--;
657 }
658 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
659 {
660 uint16_t uc = (puch[1] & 0x3f)
661 | ((uint16_t)(uch & 0x1f) << 6);
662 *pwc++ = uc;
663 puch += 2;
664 cch -= 2;
665 }
666 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
667 {
668 uint16_t uc = (puch[2] & 0x3f)
669 | ((uint16_t)(puch[1] & 0x3f) << 6)
670 | ((uint16_t)(uch & 0x0f) << 12);
671 *pwc++ = uc;
672 puch += 3;
673 cch -= 3;
674 }
675 else
676 {
677 /* generate surrogate pair */
678 Assert((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)));
679 RTUNICP uc = (puch[3] & 0x3f)
680 | ((RTUNICP)(puch[2] & 0x3f) << 6)
681 | ((RTUNICP)(puch[1] & 0x3f) << 12)
682 | ((RTUNICP)(uch & 0x07) << 18);
683 if (RT_UNLIKELY(cwc < 1))
684 {
685 rc = VERR_BUFFER_OVERFLOW;
686 break;
687 }
688 cwc--;
689
690 uc -= 0x10000;
691 *pwc++ = 0xd800 | (uc >> 10);
692 *pwc++ = 0xdc00 | (uc & 0x3ff);
693 puch += 4;
694 cch -= 4;
695 }
696 }
697
698 /* done */
699 *pwc = '\0';
700 return rc;
701}
702
703
704RTDECL(int) RTStrToUtf16Tag(const char *pszString, PRTUTF16 *ppwszString, const char *pszTag)
705{
706 /*
707 * Validate input.
708 */
709 Assert(VALID_PTR(ppwszString));
710 Assert(VALID_PTR(pszString));
711 *ppwszString = NULL;
712
713 /*
714 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
715 */
716 size_t cwc;
717 int rc = rtUtf8CalcUtf16Length(pszString, RTSTR_MAX, &cwc);
718 if (RT_SUCCESS(rc))
719 {
720 /*
721 * Allocate buffer.
722 */
723 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag((cwc + 1) * sizeof(RTUTF16), pszTag);
724 if (pwsz)
725 {
726 /*
727 * Encode the UTF-16 string.
728 */
729 rc = rtUtf8RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc);
730 if (RT_SUCCESS(rc))
731 {
732 *ppwszString = pwsz;
733 return rc;
734 }
735 RTMemFree(pwsz);
736 }
737 else
738 rc = VERR_NO_UTF16_MEMORY;
739 }
740 return rc;
741}
742RT_EXPORT_SYMBOL(RTStrToUtf16Tag);
743
744
745RTDECL(int) RTStrToUtf16ExTag(const char *pszString, size_t cchString,
746 PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc, const char *pszTag)
747{
748 /*
749 * Validate input.
750 */
751 Assert(VALID_PTR(pszString));
752 Assert(VALID_PTR(ppwsz));
753 Assert(!pcwc || VALID_PTR(pcwc));
754
755 /*
756 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
757 */
758 size_t cwcResult;
759 int rc = rtUtf8CalcUtf16Length(pszString, cchString, &cwcResult);
760 if (RT_SUCCESS(rc))
761 {
762 if (pcwc)
763 *pcwc = cwcResult;
764
765 /*
766 * Check buffer size / Allocate buffer.
767 */
768 bool fShouldFree;
769 PRTUTF16 pwszResult;
770 if (cwc > 0 && *ppwsz)
771 {
772 fShouldFree = false;
773 if (cwc <= cwcResult)
774 return VERR_BUFFER_OVERFLOW;
775 pwszResult = *ppwsz;
776 }
777 else
778 {
779 *ppwsz = NULL;
780 fShouldFree = true;
781 cwc = RT_MAX(cwcResult + 1, cwc);
782 pwszResult = (PRTUTF16)RTMemAllocTag(cwc * sizeof(RTUTF16), pszTag);
783 }
784 if (pwszResult)
785 {
786 /*
787 * Encode the UTF-16 string.
788 */
789 rc = rtUtf8RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1);
790 if (RT_SUCCESS(rc))
791 {
792 *ppwsz = pwszResult;
793 return rc;
794 }
795 if (fShouldFree)
796 RTMemFree(pwszResult);
797 }
798 else
799 rc = VERR_NO_UTF16_MEMORY;
800 }
801 return rc;
802}
803RT_EXPORT_SYMBOL(RTStrToUtf16ExTag);
804
805
806RTDECL(size_t) RTStrCalcUtf16Len(const char *psz)
807{
808 size_t cwc;
809 int rc = rtUtf8CalcUtf16Length(psz, RTSTR_MAX, &cwc);
810 return RT_SUCCESS(rc) ? cwc : 0;
811}
812RT_EXPORT_SYMBOL(RTStrCalcUtf16Len);
813
814
815RTDECL(int) RTStrCalcUtf16LenEx(const char *psz, size_t cch, size_t *pcwc)
816{
817 size_t cwc;
818 int rc = rtUtf8CalcUtf16Length(psz, cch, &cwc);
819 if (pcwc)
820 *pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
821 return rc;
822}
823RT_EXPORT_SYMBOL(RTStrCalcUtf16LenEx);
824
825
826/**
827 * Calculates the length of the UTF-8 encoding of a Latin-1 string.
828 *
829 * @returns iprt status code.
830 * @param psz The Latin-1 string.
831 * @param cchIn The max length of the Latin-1 string to consider.
832 * @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
833 */
834static int rtLatin1CalcUtf8Length(const char *psz, size_t cchIn, size_t *pcch)
835{
836 size_t cch = 0;
837 for (;;)
838 {
839 RTUNICP Cp;
840 int rc = RTLatin1GetCpNEx(&psz, &cchIn, &Cp);
841 if (Cp == 0 || rc == VERR_END_OF_STRING)
842 break;
843 if (RT_FAILURE(rc))
844 return rc;
845 cch += RTStrCpSize(Cp); /* cannot fail */
846 }
847
848 /* done */
849 *pcch = cch;
850 return VINF_SUCCESS;
851}
852
853
854/**
855 * Recodes a Latin-1 string as UTF-8.
856 *
857 * @returns iprt status code.
858 * @param psz The Latin-1 string.
859 * @param cchIn The number of characters to process from psz. The recoding
860 * will stop when cch or '\\0' is reached.
861 * @param psz Where to store the UTF-8 string.
862 * @param cch The size of the UTF-8 buffer, excluding the terminator.
863 */
864static int rtLatin1RecodeAsUtf8(const char *pszIn, size_t cchIn, char *psz, size_t cch)
865{
866 int rc;
867 for (;;)
868 {
869 RTUNICP Cp;
870 size_t cchCp;
871 rc = RTLatin1GetCpNEx(&pszIn, &cchIn, &Cp);
872 if (Cp == 0 || RT_FAILURE(rc))
873 break;
874 cchCp = RTStrCpSize(Cp);
875 if (RT_UNLIKELY(cch < cchCp))
876 {
877 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
878 rc = VERR_BUFFER_OVERFLOW;
879 break;
880 }
881 cch -= cchCp;
882 psz = RTStrPutCp(psz, Cp);
883 }
884
885 /* done */
886 if (rc == VERR_END_OF_STRING)
887 rc = VINF_SUCCESS;
888 *psz = '\0';
889 return rc;
890}
891
892
893
894RTDECL(int) RTLatin1ToUtf8Tag(const char *pszString, char **ppszString, const char *pszTag)
895{
896 /*
897 * Validate input.
898 */
899 Assert(VALID_PTR(ppszString));
900 Assert(VALID_PTR(pszString));
901 *ppszString = NULL;
902
903 /*
904 * Calculate the length of the UTF-8 encoding of the Latin-1 string.
905 */
906 size_t cch;
907 int rc = rtLatin1CalcUtf8Length(pszString, RTSTR_MAX, &cch);
908 if (RT_SUCCESS(rc))
909 {
910 /*
911 * Allocate buffer and recode it.
912 */
913 char *pszResult = (char *)RTMemAllocTag(cch + 1, pszTag);
914 if (pszResult)
915 {
916 rc = rtLatin1RecodeAsUtf8(pszString, RTSTR_MAX, pszResult, cch);
917 if (RT_SUCCESS(rc))
918 {
919 *ppszString = pszResult;
920 return rc;
921 }
922
923 RTMemFree(pszResult);
924 }
925 else
926 rc = VERR_NO_STR_MEMORY;
927 }
928 return rc;
929}
930RT_EXPORT_SYMBOL(RTLatin1ToUtf8Tag);
931
932
933RTDECL(int) RTLatin1ToUtf8ExTag(const char *pszString, size_t cchString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
934{
935 /*
936 * Validate input.
937 */
938 Assert(VALID_PTR(pszString));
939 Assert(VALID_PTR(ppsz));
940 Assert(!pcch || VALID_PTR(pcch));
941
942 /*
943 * Calculate the length of the UTF-8 encoding of the Latin-1 string.
944 */
945 size_t cchResult;
946 int rc = rtLatin1CalcUtf8Length(pszString, cchString, &cchResult);
947 if (RT_SUCCESS(rc))
948 {
949 if (pcch)
950 *pcch = cchResult;
951
952 /*
953 * Check buffer size / Allocate buffer and recode it.
954 */
955 bool fShouldFree;
956 char *pszResult;
957 if (cch > 0 && *ppsz)
958 {
959 fShouldFree = false;
960 if (RT_UNLIKELY(cch <= cchResult))
961 return VERR_BUFFER_OVERFLOW;
962 pszResult = *ppsz;
963 }
964 else
965 {
966 *ppsz = NULL;
967 fShouldFree = true;
968 cch = RT_MAX(cch, cchResult + 1);
969 pszResult = (char *)RTStrAllocTag(cch, pszTag);
970 }
971 if (pszResult)
972 {
973 rc = rtLatin1RecodeAsUtf8(pszString, cchString, pszResult, cch - 1);
974 if (RT_SUCCESS(rc))
975 {
976 *ppsz = pszResult;
977 return rc;
978 }
979
980 if (fShouldFree)
981 RTStrFree(pszResult);
982 }
983 else
984 rc = VERR_NO_STR_MEMORY;
985 }
986 return rc;
987}
988RT_EXPORT_SYMBOL(RTLatin1ToUtf8ExTag);
989
990
991RTDECL(size_t) RTLatin1CalcUtf8Len(const char *psz)
992{
993 size_t cch;
994 int rc = rtLatin1CalcUtf8Length(psz, RTSTR_MAX, &cch);
995 return RT_SUCCESS(rc) ? cch : 0;
996}
997RT_EXPORT_SYMBOL(RTLatin1CalcUtf8Len);
998
999
1000RTDECL(int) RTLatin1CalcUtf8LenEx(const char *psz, size_t cchIn, size_t *pcch)
1001{
1002 size_t cch;
1003 int rc = rtLatin1CalcUtf8Length(psz, cchIn, &cch);
1004 if (pcch)
1005 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1006 return rc;
1007}
1008RT_EXPORT_SYMBOL(RTLatin1CalcUtf8LenEx);
1009
1010
1011/**
1012 * Calculates the Latin-1 length of a string, validating the encoding while
1013 * doing so.
1014 *
1015 * @returns IPRT status code.
1016 * @param psz Pointer to the UTF-8 string.
1017 * @param cchIn The max length of the string. (btw cch = cb)
1018 * Use RTSTR_MAX if all of the string is to be examined.
1019 * @param pcch Where to store the length of the Latin-1 string in bytes.
1020 */
1021static int rtUtf8CalcLatin1Length(const char *psz, size_t cchIn, size_t *pcch)
1022{
1023 size_t cch = 0;
1024 for (;;)
1025 {
1026 RTUNICP Cp;
1027 size_t cchCp;
1028 int rc = RTStrGetCpNEx(&psz, &cchIn, &Cp);
1029 if (Cp == 0 || rc == VERR_END_OF_STRING)
1030 break;
1031 if (RT_FAILURE(rc))
1032 return rc;
1033 cchCp = RTLatin1CpSize(Cp);
1034 if (cchCp == 0)
1035 return VERR_NO_TRANSLATION;
1036 cch += cchCp;
1037 }
1038
1039 /* done */
1040 *pcch = cch;
1041 return VINF_SUCCESS;
1042}
1043
1044
1045/**
1046 * Recodes a valid UTF-8 string as Latin-1.
1047 *
1048 * Since we know the input is valid, we do *not* perform encoding or length checks.
1049 *
1050 * @returns iprt status code.
1051 * @param pszIn The UTF-8 string to recode. This is a valid encoding.
1052 * @param cchIn The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
1053 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
1054 * @param psz Where to store the Latin-1 string.
1055 * @param cch The number of characters the pszOut buffer can hold, excluding the terminator ('\\0').
1056 */
1057static int rtUtf8RecodeAsLatin1(const char *pszIn, size_t cchIn, char *psz, size_t cch)
1058{
1059 int rc;
1060 for (;;)
1061 {
1062 RTUNICP Cp;
1063 size_t cchCp;
1064 rc = RTStrGetCpNEx(&pszIn, &cchIn, &Cp);
1065 if (Cp == 0 || RT_FAILURE(rc))
1066 break;
1067 cchCp = RTLatin1CpSize(Cp);
1068 if (RT_UNLIKELY(cch < cchCp))
1069 {
1070 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
1071 rc = VERR_BUFFER_OVERFLOW;
1072 break;
1073 }
1074 cch -= cchCp;
1075 psz = RTLatin1PutCp(psz, Cp);
1076 }
1077
1078 /* done */
1079 if (rc == VERR_END_OF_STRING)
1080 rc = VINF_SUCCESS;
1081 *psz = '\0';
1082 return rc;
1083}
1084
1085
1086
1087RTDECL(int) RTStrToLatin1Tag(const char *pszString, char **ppszString, const char *pszTag)
1088{
1089 /*
1090 * Validate input.
1091 */
1092 Assert(VALID_PTR(ppszString));
1093 Assert(VALID_PTR(pszString));
1094 *ppszString = NULL;
1095
1096 /*
1097 * Validate the UTF-8 input and calculate the length of the Latin-1 string.
1098 */
1099 size_t cch;
1100 int rc = rtUtf8CalcLatin1Length(pszString, RTSTR_MAX, &cch);
1101 if (RT_SUCCESS(rc))
1102 {
1103 /*
1104 * Allocate buffer.
1105 */
1106 char *psz = (char *)RTMemAllocTag(cch + 1, pszTag);
1107 if (psz)
1108 {
1109 /*
1110 * Encode the UTF-16 string.
1111 */
1112 rc = rtUtf8RecodeAsLatin1(pszString, RTSTR_MAX, psz, cch);
1113 if (RT_SUCCESS(rc))
1114 {
1115 *ppszString = psz;
1116 return rc;
1117 }
1118 RTMemFree(psz);
1119 }
1120 else
1121 rc = VERR_NO_STR_MEMORY;
1122 }
1123 return rc;
1124}
1125RT_EXPORT_SYMBOL(RTStrToLatin1Tag);
1126
1127
1128RTDECL(int) RTStrToLatin1ExTag(const char *pszString, size_t cchString,
1129 char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
1130{
1131 /*
1132 * Validate input.
1133 */
1134 Assert(VALID_PTR(pszString));
1135 Assert(VALID_PTR(ppsz));
1136 Assert(!pcch || VALID_PTR(pcch));
1137
1138 /*
1139 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
1140 */
1141 size_t cchResult;
1142 int rc = rtUtf8CalcLatin1Length(pszString, cchString, &cchResult);
1143 if (RT_SUCCESS(rc))
1144 {
1145 if (pcch)
1146 *pcch = cchResult;
1147
1148 /*
1149 * Check buffer size / Allocate buffer.
1150 */
1151 bool fShouldFree;
1152 char *pszResult;
1153 if (cch > 0 && *ppsz)
1154 {
1155 fShouldFree = false;
1156 if (cch <= cchResult)
1157 return VERR_BUFFER_OVERFLOW;
1158 pszResult = *ppsz;
1159 }
1160 else
1161 {
1162 *ppsz = NULL;
1163 fShouldFree = true;
1164 cch = RT_MAX(cchResult + 1, cch);
1165 pszResult = (char *)RTMemAllocTag(cch, pszTag);
1166 }
1167 if (pszResult)
1168 {
1169 /*
1170 * Encode the Latin-1 string.
1171 */
1172 rc = rtUtf8RecodeAsLatin1(pszString, cchString, pszResult, cch - 1);
1173 if (RT_SUCCESS(rc))
1174 {
1175 *ppsz = pszResult;
1176 return rc;
1177 }
1178 if (fShouldFree)
1179 RTMemFree(pszResult);
1180 }
1181 else
1182 rc = VERR_NO_STR_MEMORY;
1183 }
1184 return rc;
1185}
1186RT_EXPORT_SYMBOL(RTStrToLatin1Tag);
1187
1188
1189RTDECL(size_t) RTStrCalcLatin1Len(const char *psz)
1190{
1191 size_t cch;
1192 int rc = rtUtf8CalcLatin1Length(psz, RTSTR_MAX, &cch);
1193 return RT_SUCCESS(rc) ? cch : 0;
1194}
1195RT_EXPORT_SYMBOL(RTStrCalcLatin1Len);
1196
1197
1198RTDECL(int) RTStrCalcLatin1LenEx(const char *psz, size_t cchIn, size_t *pcch)
1199{
1200 size_t cch;
1201 int rc = rtUtf8CalcLatin1Length(psz, cchIn, &cch);
1202 if (pcch)
1203 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1204 return rc;
1205}
1206RT_EXPORT_SYMBOL(RTStrCalcLatin1LenEx);
1207
1208
1209/**
1210 * Handle invalid encodings passed to RTStrGetCp() and RTStrGetCpEx().
1211 * @returns rc
1212 * @param ppsz The pointer to the string position point.
1213 * @param pCp Where to store RTUNICP_INVALID.
1214 * @param rc The iprt error code.
1215 */
1216static int rtStrGetCpExFailure(const char **ppsz, PRTUNICP pCp, int rc)
1217{
1218 /*
1219 * Try find a valid encoding.
1220 */
1221 (*ppsz)++; /** @todo code this! */
1222 *pCp = RTUNICP_INVALID;
1223 return rc;
1224}
1225
1226
1227RTDECL(RTUNICP) RTStrGetCpInternal(const char *psz)
1228{
1229 RTUNICP Cp;
1230 RTStrGetCpExInternal(&psz, &Cp);
1231 return Cp;
1232}
1233RT_EXPORT_SYMBOL(RTStrGetCpInternal);
1234
1235
1236RTDECL(int) RTStrGetCpExInternal(const char **ppsz, PRTUNICP pCp)
1237{
1238 const unsigned char *puch = (const unsigned char *)*ppsz;
1239 const unsigned char uch = *puch;
1240 RTUNICP uc;
1241
1242 /* ASCII ? */
1243 if (!(uch & RT_BIT(7)))
1244 {
1245 uc = uch;
1246 puch++;
1247 }
1248 else if (uch & RT_BIT(6))
1249 {
1250 /* figure the length and validate the first octet. */
1251/** @todo RT_USE_RTC_3629 */
1252 unsigned cb;
1253 if (!(uch & RT_BIT(5)))
1254 cb = 2;
1255 else if (!(uch & RT_BIT(4)))
1256 cb = 3;
1257 else if (!(uch & RT_BIT(3)))
1258 cb = 4;
1259 else if (!(uch & RT_BIT(2)))
1260 cb = 5;
1261 else if (!(uch & RT_BIT(1)))
1262 cb = 6;
1263 else
1264 {
1265 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1266 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
1267 }
1268
1269 /* validate the rest */
1270 switch (cb)
1271 {
1272 case 6:
1273 RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1274 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1275 case 5:
1276 RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1277 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1278 case 4:
1279 RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1280 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1281 case 3:
1282 RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1283 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1284 case 2:
1285 RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1286 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1287 break;
1288 }
1289
1290 /* get and validate the code point. */
1291 switch (cb)
1292 {
1293 case 6:
1294 uc = (puch[5] & 0x3f)
1295 | ((RTUNICP)(puch[4] & 0x3f) << 6)
1296 | ((RTUNICP)(puch[3] & 0x3f) << 12)
1297 | ((RTUNICP)(puch[2] & 0x3f) << 18)
1298 | ((RTUNICP)(puch[1] & 0x3f) << 24)
1299 | ((RTUNICP)(uch & 0x01) << 30);
1300 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1301 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1302 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1303 break;
1304 case 5:
1305 uc = (puch[4] & 0x3f)
1306 | ((RTUNICP)(puch[3] & 0x3f) << 6)
1307 | ((RTUNICP)(puch[2] & 0x3f) << 12)
1308 | ((RTUNICP)(puch[1] & 0x3f) << 18)
1309 | ((RTUNICP)(uch & 0x03) << 24);
1310 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1311 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1312 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1313 break;
1314 case 4:
1315 uc = (puch[3] & 0x3f)
1316 | ((RTUNICP)(puch[2] & 0x3f) << 6)
1317 | ((RTUNICP)(puch[1] & 0x3f) << 12)
1318 | ((RTUNICP)(uch & 0x07) << 18);
1319 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1320 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1321 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1322 break;
1323 case 3:
1324 uc = (puch[2] & 0x3f)
1325 | ((RTUNICP)(puch[1] & 0x3f) << 6)
1326 | ((RTUNICP)(uch & 0x0f) << 12);
1327 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1328 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1329 rtStrGetCpExFailure(ppsz, pCp, uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1330 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
1331 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1332 rtStrGetCpExFailure(ppsz, pCp, VERR_CODE_POINT_SURROGATE));
1333 break;
1334 case 2:
1335 uc = (puch[1] & 0x3f)
1336 | ((RTUNICP)(uch & 0x1f) << 6);
1337 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1338 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1339 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1340 break;
1341 default: /* impossible, but GCC is bitching. */
1342 uc = RTUNICP_INVALID;
1343 break;
1344 }
1345 puch += cb;
1346 }
1347 else
1348 {
1349 /* 6th bit is always set. */
1350 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1351 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
1352 }
1353 *pCp = uc;
1354 *ppsz = (const char *)puch;
1355 return VINF_SUCCESS;
1356}
1357RT_EXPORT_SYMBOL(RTStrGetCpExInternal);
1358
1359
1360/**
1361 * Handle invalid encodings passed to RTStrGetCpNEx().
1362 * @returns rc
1363 * @param ppsz The pointer to the string position point.
1364 * @param pcch Pointer to the string length.
1365 * @param pCp Where to store RTUNICP_INVALID.
1366 * @param rc The iprt error code.
1367 */
1368static int rtStrGetCpNExFailure(const char **ppsz, size_t *pcch, PRTUNICP pCp, int rc)
1369{
1370 /*
1371 * Try find a valid encoding.
1372 */
1373 (*ppsz)++; /** @todo code this! */
1374 (*pcch)--;
1375 *pCp = RTUNICP_INVALID;
1376 return rc;
1377}
1378
1379
1380RTDECL(int) RTStrGetCpNExInternal(const char **ppsz, size_t *pcch, PRTUNICP pCp)
1381{
1382 const unsigned char *puch = (const unsigned char *)*ppsz;
1383 const unsigned char uch = *puch;
1384 size_t cch = *pcch;
1385 RTUNICP uc;
1386
1387 if (cch == 0)
1388 {
1389 *pCp = RTUNICP_INVALID;
1390 return VERR_END_OF_STRING;
1391 }
1392
1393 /* ASCII ? */
1394 if (!(uch & RT_BIT(7)))
1395 {
1396 uc = uch;
1397 puch++;
1398 cch--;
1399 }
1400 else if (uch & RT_BIT(6))
1401 {
1402 /* figure the length and validate the first octet. */
1403/** @todo RT_USE_RTC_3629 */
1404 unsigned cb;
1405 if (!(uch & RT_BIT(5)))
1406 cb = 2;
1407 else if (!(uch & RT_BIT(4)))
1408 cb = 3;
1409 else if (!(uch & RT_BIT(3)))
1410 cb = 4;
1411 else if (!(uch & RT_BIT(2)))
1412 cb = 5;
1413 else if (!(uch & RT_BIT(1)))
1414 cb = 6;
1415 else
1416 {
1417 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1418 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1419 }
1420
1421 if (cb > cch)
1422 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1423
1424 /* validate the rest */
1425 switch (cb)
1426 {
1427 case 6:
1428 RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1429 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1430 case 5:
1431 RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1432 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1433 case 4:
1434 RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1435 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1436 case 3:
1437 RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1438 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1439 case 2:
1440 RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1441 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1442 break;
1443 }
1444
1445 /* get and validate the code point. */
1446 switch (cb)
1447 {
1448 case 6:
1449 uc = (puch[5] & 0x3f)
1450 | ((RTUNICP)(puch[4] & 0x3f) << 6)
1451 | ((RTUNICP)(puch[3] & 0x3f) << 12)
1452 | ((RTUNICP)(puch[2] & 0x3f) << 18)
1453 | ((RTUNICP)(puch[1] & 0x3f) << 24)
1454 | ((RTUNICP)(uch & 0x01) << 30);
1455 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1456 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1457 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1458 break;
1459 case 5:
1460 uc = (puch[4] & 0x3f)
1461 | ((RTUNICP)(puch[3] & 0x3f) << 6)
1462 | ((RTUNICP)(puch[2] & 0x3f) << 12)
1463 | ((RTUNICP)(puch[1] & 0x3f) << 18)
1464 | ((RTUNICP)(uch & 0x03) << 24);
1465 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1466 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1467 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1468 break;
1469 case 4:
1470 uc = (puch[3] & 0x3f)
1471 | ((RTUNICP)(puch[2] & 0x3f) << 6)
1472 | ((RTUNICP)(puch[1] & 0x3f) << 12)
1473 | ((RTUNICP)(uch & 0x07) << 18);
1474 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1475 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1476 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1477 break;
1478 case 3:
1479 uc = (puch[2] & 0x3f)
1480 | ((RTUNICP)(puch[1] & 0x3f) << 6)
1481 | ((RTUNICP)(uch & 0x0f) << 12);
1482 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1483 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1484 rtStrGetCpNExFailure(ppsz, pcch, pCp, uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1485 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
1486 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1487 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_CODE_POINT_SURROGATE));
1488 break;
1489 case 2:
1490 uc = (puch[1] & 0x3f)
1491 | ((RTUNICP)(uch & 0x1f) << 6);
1492 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1493 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1494 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1495 break;
1496 default: /* impossible, but GCC is bitching. */
1497 uc = RTUNICP_INVALID;
1498 break;
1499 }
1500 puch += cb;
1501 cch -= cb;
1502 }
1503 else
1504 {
1505 /* 6th bit is always set. */
1506 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1507 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1508 }
1509 *pCp = uc;
1510 *ppsz = (const char *)puch;
1511 (*pcch) = cch;
1512 return VINF_SUCCESS;
1513}
1514RT_EXPORT_SYMBOL(RTStrGetCpNExInternal);
1515
1516
1517RTDECL(char *) RTStrPutCpInternal(char *psz, RTUNICP uc)
1518{
1519 unsigned char *puch = (unsigned char *)psz;
1520 if (uc < 0x80)
1521 *puch++ = (unsigned char )uc;
1522 else if (uc < 0x00000800)
1523 {
1524 *puch++ = 0xc0 | (uc >> 6);
1525 *puch++ = 0x80 | (uc & 0x3f);
1526 }
1527 else if (uc < 0x00010000)
1528 {
1529/** @todo RT_USE_RTC_3629 */
1530 if ( uc < 0x0000d8000
1531 || ( uc > 0x0000dfff
1532 && uc < 0x0000fffe))
1533 {
1534 *puch++ = 0xe0 | (uc >> 12);
1535 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1536 *puch++ = 0x80 | (uc & 0x3f);
1537 }
1538 else
1539 {
1540 AssertMsgFailed(("Invalid code point U+%05x!\n", uc));
1541 *puch++ = 0x7f;
1542 }
1543 }
1544/** @todo RT_USE_RTC_3629 */
1545 else if (uc < 0x00200000)
1546 {
1547 *puch++ = 0xf0 | (uc >> 18);
1548 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1549 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1550 *puch++ = 0x80 | (uc & 0x3f);
1551 }
1552 else if (uc < 0x04000000)
1553 {
1554 *puch++ = 0xf8 | (uc >> 24);
1555 *puch++ = 0x80 | ((uc >> 18) & 0x3f);
1556 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1557 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1558 *puch++ = 0x80 | (uc & 0x3f);
1559 }
1560 else if (uc <= 0x7fffffff)
1561 {
1562 *puch++ = 0xfc | (uc >> 30);
1563 *puch++ = 0x80 | ((uc >> 24) & 0x3f);
1564 *puch++ = 0x80 | ((uc >> 18) & 0x3f);
1565 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1566 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1567 *puch++ = 0x80 | (uc & 0x3f);
1568 }
1569 else
1570 {
1571 AssertMsgFailed(("Invalid code point U+%08x!\n", uc));
1572 *puch++ = 0x7f;
1573 }
1574
1575 return (char *)puch;
1576}
1577RT_EXPORT_SYMBOL(RTStrPutCpInternal);
1578
1579
1580RTDECL(char *) RTStrPrevCp(const char *pszStart, const char *psz)
1581{
1582 if (pszStart < psz)
1583 {
1584 /* simple char? */
1585 const unsigned char *puch = (const unsigned char *)psz;
1586 unsigned uch = *--puch;
1587 if (!(uch & RT_BIT(7)))
1588 return (char *)puch;
1589 RTStrAssertMsgReturn(!(uch & RT_BIT(6)), ("uch=%#x\n", uch), (char *)pszStart);
1590
1591 /* two or more. */
1592 uint32_t uMask = 0xffffffc0;
1593 while ( (const unsigned char *)pszStart < puch
1594 && !(uMask & 1))
1595 {
1596 uch = *--puch;
1597 if ((uch & 0xc0) != 0x80)
1598 {
1599 RTStrAssertMsgReturn((uch & (uMask >> 1)) == (uMask & 0xff),
1600 ("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz),
1601 (char *)pszStart);
1602 return (char *)puch;
1603 }
1604 uMask >>= 1;
1605 }
1606 RTStrAssertMsgFailed(("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz));
1607 }
1608 return (char *)pszStart;
1609}
1610RT_EXPORT_SYMBOL(RTStrPrevCp);
1611
Note: See TracBrowser for help on using the repository browser.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette