VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/string/utf-8.cpp@ 40091

Last change on this file since 40091 was 40091, checked in by vboxsync, 13 years ago

Runtime/strings: add Utf-8 and Utf-16 sanitising to a white list of characters. Do not validate the string encoding in advance.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 54.4 KB
Line 
1/* $Id: utf-8.cpp 40091 2012-02-13 10:14:00Z vboxsync $ */
2/** @file
3 * IPRT - UTF-8 Decoding.
4 */
5
6/*
7 * Copyright (C) 2006-2010 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.215389.xyz. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27
28/*******************************************************************************
29* Header Files *
30*******************************************************************************/
31#include <iprt/string.h>
32#include "internal/iprt.h"
33
34#include <iprt/uni.h>
35#include <iprt/alloc.h>
36#include <iprt/assert.h>
37#include <iprt/err.h>
38#include "internal/string.h"
39
40
41
42/**
43 * Get get length in code points of a UTF-8 encoded string.
44 * The string is validated while doing this.
45 *
46 * @returns IPRT status code.
47 * @param psz Pointer to the UTF-8 string.
48 * @param cch The max length of the string. (btw cch = cb)
49 * Use RTSTR_MAX if all of the string is to be examined.
50 * @param pcuc Where to store the length in unicode code points.
51 * @param pcchActual Where to store the actual size of the UTF-8 string
52 * on success (cch = cb again). Optional.
53 */
54DECLHIDDEN(int) rtUtf8Length(const char *psz, size_t cch, size_t *pcuc, size_t *pcchActual)
55{
56 const unsigned char *puch = (const unsigned char *)psz;
57 size_t cCodePoints = 0;
58 while (cch > 0)
59 {
60 const unsigned char uch = *puch;
61 if (!uch)
62 break;
63 if (uch & RT_BIT(7))
64 {
65 /* figure sequence length and validate the first byte */
66/** @todo RT_USE_RTC_3629 */
67 unsigned cb;
68 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
69 cb = 2;
70 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
71 cb = 3;
72 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
73 cb = 4;
74 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
75 cb = 5;
76 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
77 cb = 6;
78 else
79 {
80 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
81 return VERR_INVALID_UTF8_ENCODING;
82 }
83
84 /* check length */
85 if (cb > cch)
86 {
87 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
88 return VERR_INVALID_UTF8_ENCODING;
89 }
90
91 /* validate the rest */
92 switch (cb)
93 {
94 case 6:
95 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
96 case 5:
97 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
98 case 4:
99 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
100 case 3:
101 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
102 case 2:
103 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
104 break;
105 }
106
107 /* validate the code point. */
108 RTUNICP uc;
109 switch (cb)
110 {
111 case 6:
112 uc = (puch[5] & 0x3f)
113 | ((RTUNICP)(puch[4] & 0x3f) << 6)
114 | ((RTUNICP)(puch[3] & 0x3f) << 12)
115 | ((RTUNICP)(puch[2] & 0x3f) << 18)
116 | ((RTUNICP)(puch[1] & 0x3f) << 24)
117 | ((RTUNICP)(uch & 0x01) << 30);
118 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
119 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
120 break;
121 case 5:
122 uc = (puch[4] & 0x3f)
123 | ((RTUNICP)(puch[3] & 0x3f) << 6)
124 | ((RTUNICP)(puch[2] & 0x3f) << 12)
125 | ((RTUNICP)(puch[1] & 0x3f) << 18)
126 | ((RTUNICP)(uch & 0x03) << 24);
127 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
128 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
129 break;
130 case 4:
131 uc = (puch[3] & 0x3f)
132 | ((RTUNICP)(puch[2] & 0x3f) << 6)
133 | ((RTUNICP)(puch[1] & 0x3f) << 12)
134 | ((RTUNICP)(uch & 0x07) << 18);
135 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
136 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
137 break;
138 case 3:
139 uc = (puch[2] & 0x3f)
140 | ((RTUNICP)(puch[1] & 0x3f) << 6)
141 | ((RTUNICP)(uch & 0x0f) << 12);
142 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
143 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
144 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
145 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
146 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
147 break;
148 case 2:
149 uc = (puch[1] & 0x3f)
150 | ((RTUNICP)(uch & 0x1f) << 6);
151 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
152 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
153 break;
154 }
155
156 /* advance */
157 cch -= cb;
158 puch += cb;
159 }
160 else
161 {
162 /* one ASCII byte */
163 puch++;
164 cch--;
165 }
166 cCodePoints++;
167 }
168
169 /* done */
170 *pcuc = cCodePoints;
171 if (pcchActual)
172 *pcchActual = puch - (unsigned char const *)psz;
173 return VINF_SUCCESS;
174}
175
176
177/**
178 * Decodes and UTF-8 string into an array of unicode code point.
179 *
180 * Since we know the input is valid, we do *not* perform encoding or length checks.
181 *
182 * @returns iprt status code.
183 * @param psz The UTF-8 string to recode. This is a valid encoding.
184 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
185 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
186 * @param paCps Where to store the code points array.
187 * @param cCps The number of RTUNICP items the paCps buffer can hold, excluding the terminator ('\\0').
188 */
189static int rtUtf8Decode(const char *psz, size_t cch, PRTUNICP paCps, size_t cCps)
190{
191 int rc = VINF_SUCCESS;
192 const unsigned char *puch = (const unsigned char *)psz;
193 PRTUNICP pCp = paCps;
194 while (cch > 0)
195 {
196 /* read the next char and check for terminator. */
197 const unsigned char uch = *puch;
198 if (!uch)
199 break;
200
201 /* check for output overflow */
202 if (RT_UNLIKELY(cCps < 1))
203 {
204 rc = VERR_BUFFER_OVERFLOW;
205 break;
206 }
207 cCps--;
208
209 /* decode and recode the code point */
210 if (!(uch & RT_BIT(7)))
211 {
212 *pCp++ = uch;
213 puch++;
214 cch--;
215 }
216#ifdef RT_STRICT
217 else if (!(uch & RT_BIT(6)))
218 AssertMsgFailed(("Internal error!\n"));
219#endif
220 else if (!(uch & RT_BIT(5)))
221 {
222 *pCp++ = (puch[1] & 0x3f)
223 | ((uint16_t)(uch & 0x1f) << 6);
224 puch += 2;
225 cch -= 2;
226 }
227 else if (!(uch & RT_BIT(4)))
228 {
229 *pCp++ = (puch[2] & 0x3f)
230 | ((uint16_t)(puch[1] & 0x3f) << 6)
231 | ((uint16_t)(uch & 0x0f) << 12);
232 puch += 3;
233 cch -= 3;
234 }
235 else if (!(uch & RT_BIT(3)))
236 {
237 *pCp++ = (puch[3] & 0x3f)
238 | ((RTUNICP)(puch[2] & 0x3f) << 6)
239 | ((RTUNICP)(puch[1] & 0x3f) << 12)
240 | ((RTUNICP)(uch & 0x07) << 18);
241 puch += 4;
242 cch -= 4;
243 }
244 else if (!(uch & RT_BIT(2)))
245 {
246 *pCp++ = (puch[4] & 0x3f)
247 | ((RTUNICP)(puch[3] & 0x3f) << 6)
248 | ((RTUNICP)(puch[2] & 0x3f) << 12)
249 | ((RTUNICP)(puch[1] & 0x3f) << 18)
250 | ((RTUNICP)(uch & 0x03) << 24);
251 puch += 5;
252 cch -= 6;
253 }
254 else
255 {
256 Assert(!(uch & RT_BIT(1)));
257 *pCp++ = (puch[5] & 0x3f)
258 | ((RTUNICP)(puch[4] & 0x3f) << 6)
259 | ((RTUNICP)(puch[3] & 0x3f) << 12)
260 | ((RTUNICP)(puch[2] & 0x3f) << 18)
261 | ((RTUNICP)(puch[1] & 0x3f) << 24)
262 | ((RTUNICP)(uch & 0x01) << 30);
263 puch += 6;
264 cch -= 6;
265 }
266 }
267
268 /* done */
269 *pCp = 0;
270 return rc;
271}
272
273
274RTDECL(size_t) RTStrUniLen(const char *psz)
275{
276 size_t cCodePoints;
277 int rc = rtUtf8Length(psz, RTSTR_MAX, &cCodePoints, NULL);
278 return RT_SUCCESS(rc) ? cCodePoints : 0;
279}
280RT_EXPORT_SYMBOL(RTStrUniLen);
281
282
283RTDECL(int) RTStrUniLenEx(const char *psz, size_t cch, size_t *pcCps)
284{
285 size_t cCodePoints;
286 int rc = rtUtf8Length(psz, cch, &cCodePoints, NULL);
287 if (pcCps)
288 *pcCps = RT_SUCCESS(rc) ? cCodePoints : 0;
289 return rc;
290}
291RT_EXPORT_SYMBOL(RTStrUniLenEx);
292
293
294RTDECL(int) RTStrValidateEncoding(const char *psz)
295{
296 return RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
297}
298RT_EXPORT_SYMBOL(RTStrValidateEncoding);
299
300
301RTDECL(int) RTStrValidateEncodingEx(const char *psz, size_t cch, uint32_t fFlags)
302{
303 AssertReturn(!(fFlags & ~(RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)), VERR_INVALID_PARAMETER);
304 AssertPtr(psz);
305
306 /*
307 * Use rtUtf8Length for the job.
308 */
309 size_t cchActual;
310 size_t cCpsIgnored;
311 int rc = rtUtf8Length(psz, cch, &cCpsIgnored, &cchActual);
312 if (RT_SUCCESS(rc))
313 {
314 if ( (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
315 && cchActual >= cch)
316 rc = VERR_BUFFER_OVERFLOW;
317 }
318 return rc;
319}
320RT_EXPORT_SYMBOL(RTStrValidateEncodingEx);
321
322
323RTDECL(bool) RTStrIsValidEncoding(const char *psz)
324{
325 int rc = RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
326 return RT_SUCCESS(rc);
327}
328RT_EXPORT_SYMBOL(RTStrIsValidEncoding);
329
330
331RTDECL(size_t) RTStrPurgeEncoding(char *psz)
332{
333 size_t cErrors = 0;
334 for (;;)
335 {
336 RTUNICP Cp;
337 int rc = RTStrGetCpEx((const char **)&psz, &Cp);
338 if (RT_SUCCESS(rc))
339 {
340 if (!Cp)
341 break;
342 }
343 else
344 {
345 psz[-1] = '?';
346 cErrors++;
347 }
348 }
349 return cErrors;
350}
351RT_EXPORT_SYMBOL(RTStrPurgeEncoding);
352
353
354RTDECL(ssize_t) RTStrPurgeComplementSet(char *psz, PCRTUNICP puszValidSet, char chReplacement)
355{
356 size_t cReplacements = 0;
357 AssertReturn(chReplacement && (unsigned)chReplacement < 128, -1);
358 for (;;)
359 {
360 RTUNICP Cp;
361 PCRTUNICP pCp;
362 char *pszOld = psz;
363 if (RT_FAILURE(RTStrGetCpEx((const char **)&psz, &Cp)))
364 return -1;
365 if (!Cp)
366 break;
367 for (pCp = puszValidSet; ; ++pCp)
368 if (!*pCp || *pCp == Cp)
369 break;
370 if (!*pCp)
371 {
372 for (; pszOld != psz; ++pszOld)
373 *pszOld = chReplacement;
374 ++cReplacements;
375 }
376 }
377 return cReplacements;
378}
379RT_EXPORT_SYMBOL(RTStrPurgeComplementSet);
380
381
382RTDECL(int) RTStrToUni(const char *pszString, PRTUNICP *ppaCps)
383{
384 /*
385 * Validate input.
386 */
387 Assert(VALID_PTR(pszString));
388 Assert(VALID_PTR(ppaCps));
389 *ppaCps = NULL;
390
391 /*
392 * Validate the UTF-8 input and count its code points.
393 */
394 size_t cCps;
395 int rc = rtUtf8Length(pszString, RTSTR_MAX, &cCps, NULL);
396 if (RT_SUCCESS(rc))
397 {
398 /*
399 * Allocate buffer.
400 */
401 PRTUNICP paCps = (PRTUNICP)RTMemAlloc((cCps + 1) * sizeof(RTUNICP));
402 if (paCps)
403 {
404 /*
405 * Decode the string.
406 */
407 rc = rtUtf8Decode(pszString, RTSTR_MAX, paCps, cCps);
408 if (RT_SUCCESS(rc))
409 {
410 *ppaCps = paCps;
411 return rc;
412 }
413 RTMemFree(paCps);
414 }
415 else
416 rc = VERR_NO_CODE_POINT_MEMORY;
417 }
418 return rc;
419}
420RT_EXPORT_SYMBOL(RTStrToUni);
421
422
423RTDECL(int) RTStrToUniEx(const char *pszString, size_t cchString, PRTUNICP *ppaCps, size_t cCps, size_t *pcCps)
424{
425 /*
426 * Validate input.
427 */
428 Assert(VALID_PTR(pszString));
429 Assert(VALID_PTR(ppaCps));
430 Assert(!pcCps || VALID_PTR(pcCps));
431
432 /*
433 * Validate the UTF-8 input and count the code points.
434 */
435 size_t cCpsResult;
436 int rc = rtUtf8Length(pszString, cchString, &cCpsResult, NULL);
437 if (RT_SUCCESS(rc))
438 {
439 if (pcCps)
440 *pcCps = cCpsResult;
441
442 /*
443 * Check buffer size / Allocate buffer.
444 */
445 bool fShouldFree;
446 PRTUNICP paCpsResult;
447 if (cCps > 0 && *ppaCps)
448 {
449 fShouldFree = false;
450 if (cCps <= cCpsResult)
451 return VERR_BUFFER_OVERFLOW;
452 paCpsResult = *ppaCps;
453 }
454 else
455 {
456 *ppaCps = NULL;
457 fShouldFree = true;
458 cCps = RT_MAX(cCpsResult + 1, cCps);
459 paCpsResult = (PRTUNICP)RTMemAlloc(cCps * sizeof(RTUNICP));
460 }
461 if (paCpsResult)
462 {
463 /*
464 * Encode the UTF-16 string.
465 */
466 rc = rtUtf8Decode(pszString, cchString, paCpsResult, cCps - 1);
467 if (RT_SUCCESS(rc))
468 {
469 *ppaCps = paCpsResult;
470 return rc;
471 }
472 if (fShouldFree)
473 RTMemFree(paCpsResult);
474 }
475 else
476 rc = VERR_NO_CODE_POINT_MEMORY;
477 }
478 return rc;
479}
480RT_EXPORT_SYMBOL(RTStrToUniEx);
481
482
483/**
484 * Calculates the UTF-16 length of a string, validating the encoding while doing so.
485 *
486 * @returns IPRT status code.
487 * @param psz Pointer to the UTF-8 string.
488 * @param cch The max length of the string. (btw cch = cb)
489 * Use RTSTR_MAX if all of the string is to be examined.
490 * @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters.
491 */
492static int rtUtf8CalcUtf16Length(const char *psz, size_t cch, size_t *pcwc)
493{
494 const unsigned char *puch = (const unsigned char *)psz;
495 size_t cwc = 0;
496 while (cch > 0)
497 {
498 const unsigned char uch = *puch;
499 if (!uch)
500 break;
501 if (!(uch & RT_BIT(7)))
502 {
503 /* one ASCII byte */
504 cwc++;
505 puch++;
506 cch--;
507 }
508 else
509 {
510 /* figure sequence length and validate the first byte */
511 unsigned cb;
512 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
513 cb = 2;
514 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
515 cb = 3;
516 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
517 cb = 4;
518 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
519 cb = 5;
520 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
521 cb = 6;
522 else
523 {
524 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
525 return VERR_INVALID_UTF8_ENCODING;
526 }
527
528 /* check length */
529 if (cb > cch)
530 {
531 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
532 return VERR_INVALID_UTF8_ENCODING;
533 }
534
535 /* validate the rest */
536 switch (cb)
537 {
538 case 6:
539 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
540 case 5:
541 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
542 case 4:
543 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
544 case 3:
545 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
546 case 2:
547 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
548 break;
549 }
550
551 /* validate the code point. */
552 RTUNICP uc;
553 switch (cb)
554 {
555 case 6:
556 uc = (puch[5] & 0x3f)
557 | ((RTUNICP)(puch[4] & 0x3f) << 6)
558 | ((RTUNICP)(puch[3] & 0x3f) << 12)
559 | ((RTUNICP)(puch[2] & 0x3f) << 18)
560 | ((RTUNICP)(puch[1] & 0x3f) << 24)
561 | ((RTUNICP)(uch & 0x01) << 30);
562 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
563 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
564 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
565 return VERR_CANT_RECODE_AS_UTF16;
566 case 5:
567 uc = (puch[4] & 0x3f)
568 | ((RTUNICP)(puch[3] & 0x3f) << 6)
569 | ((RTUNICP)(puch[2] & 0x3f) << 12)
570 | ((RTUNICP)(puch[1] & 0x3f) << 18)
571 | ((RTUNICP)(uch & 0x03) << 24);
572 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
573 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
574 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
575 return VERR_CANT_RECODE_AS_UTF16;
576 case 4:
577 uc = (puch[3] & 0x3f)
578 | ((RTUNICP)(puch[2] & 0x3f) << 6)
579 | ((RTUNICP)(puch[1] & 0x3f) << 12)
580 | ((RTUNICP)(uch & 0x07) << 18);
581 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
582 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
583 RTStrAssertMsgReturn(uc <= 0x0010ffff,
584 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
585 cwc++;
586 break;
587 case 3:
588 uc = (puch[2] & 0x3f)
589 | ((RTUNICP)(puch[1] & 0x3f) << 6)
590 | ((RTUNICP)(uch & 0x0f) << 12);
591 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
592 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
593 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
594 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
595 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
596 break;
597 case 2:
598 uc = (puch[1] & 0x3f)
599 | ((RTUNICP)(uch & 0x1f) << 6);
600 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
601 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
602 break;
603 }
604
605 /* advance */
606 cch -= cb;
607 puch += cb;
608 cwc++;
609 }
610 }
611
612 /* done */
613 *pcwc = cwc;
614 return VINF_SUCCESS;
615}
616
617
618/**
619 * Recodes a valid UTF-8 string as UTF-16.
620 *
621 * Since we know the input is valid, we do *not* perform encoding or length checks.
622 *
623 * @returns iprt status code.
624 * @param psz The UTF-8 string to recode. This is a valid encoding.
625 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
626 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
627 * @param pwsz Where to store the UTF-16 string.
628 * @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
629 */
630static int rtUtf8RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc)
631{
632 int rc = VINF_SUCCESS;
633 const unsigned char *puch = (const unsigned char *)psz;
634 PRTUTF16 pwc = pwsz;
635 while (cch > 0)
636 {
637 /* read the next char and check for terminator. */
638 const unsigned char uch = *puch;
639 if (!uch)
640 break;
641
642 /* check for output overflow */
643 if (RT_UNLIKELY(cwc < 1))
644 {
645 rc = VERR_BUFFER_OVERFLOW;
646 break;
647 }
648 cwc--;
649
650 /* decode and recode the code point */
651 if (!(uch & RT_BIT(7)))
652 {
653 *pwc++ = uch;
654 puch++;
655 cch--;
656 }
657 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
658 {
659 uint16_t uc = (puch[1] & 0x3f)
660 | ((uint16_t)(uch & 0x1f) << 6);
661 *pwc++ = uc;
662 puch += 2;
663 cch -= 2;
664 }
665 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
666 {
667 uint16_t uc = (puch[2] & 0x3f)
668 | ((uint16_t)(puch[1] & 0x3f) << 6)
669 | ((uint16_t)(uch & 0x0f) << 12);
670 *pwc++ = uc;
671 puch += 3;
672 cch -= 3;
673 }
674 else
675 {
676 /* generate surrogate pair */
677 Assert((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)));
678 RTUNICP uc = (puch[3] & 0x3f)
679 | ((RTUNICP)(puch[2] & 0x3f) << 6)
680 | ((RTUNICP)(puch[1] & 0x3f) << 12)
681 | ((RTUNICP)(uch & 0x07) << 18);
682 if (RT_UNLIKELY(cwc < 1))
683 {
684 rc = VERR_BUFFER_OVERFLOW;
685 break;
686 }
687 cwc--;
688
689 uc -= 0x10000;
690 *pwc++ = 0xd800 | (uc >> 10);
691 *pwc++ = 0xdc00 | (uc & 0x3ff);
692 puch += 4;
693 cch -= 4;
694 }
695 }
696
697 /* done */
698 *pwc = '\0';
699 return rc;
700}
701
702
703RTDECL(int) RTStrToUtf16Tag(const char *pszString, PRTUTF16 *ppwszString, const char *pszTag)
704{
705 /*
706 * Validate input.
707 */
708 Assert(VALID_PTR(ppwszString));
709 Assert(VALID_PTR(pszString));
710 *ppwszString = NULL;
711
712 /*
713 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
714 */
715 size_t cwc;
716 int rc = rtUtf8CalcUtf16Length(pszString, RTSTR_MAX, &cwc);
717 if (RT_SUCCESS(rc))
718 {
719 /*
720 * Allocate buffer.
721 */
722 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag((cwc + 1) * sizeof(RTUTF16), pszTag);
723 if (pwsz)
724 {
725 /*
726 * Encode the UTF-16 string.
727 */
728 rc = rtUtf8RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc);
729 if (RT_SUCCESS(rc))
730 {
731 *ppwszString = pwsz;
732 return rc;
733 }
734 RTMemFree(pwsz);
735 }
736 else
737 rc = VERR_NO_UTF16_MEMORY;
738 }
739 return rc;
740}
741RT_EXPORT_SYMBOL(RTStrToUtf16Tag);
742
743
744RTDECL(int) RTStrToUtf16ExTag(const char *pszString, size_t cchString,
745 PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc, const char *pszTag)
746{
747 /*
748 * Validate input.
749 */
750 Assert(VALID_PTR(pszString));
751 Assert(VALID_PTR(ppwsz));
752 Assert(!pcwc || VALID_PTR(pcwc));
753
754 /*
755 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
756 */
757 size_t cwcResult;
758 int rc = rtUtf8CalcUtf16Length(pszString, cchString, &cwcResult);
759 if (RT_SUCCESS(rc))
760 {
761 if (pcwc)
762 *pcwc = cwcResult;
763
764 /*
765 * Check buffer size / Allocate buffer.
766 */
767 bool fShouldFree;
768 PRTUTF16 pwszResult;
769 if (cwc > 0 && *ppwsz)
770 {
771 fShouldFree = false;
772 if (cwc <= cwcResult)
773 return VERR_BUFFER_OVERFLOW;
774 pwszResult = *ppwsz;
775 }
776 else
777 {
778 *ppwsz = NULL;
779 fShouldFree = true;
780 cwc = RT_MAX(cwcResult + 1, cwc);
781 pwszResult = (PRTUTF16)RTMemAllocTag(cwc * sizeof(RTUTF16), pszTag);
782 }
783 if (pwszResult)
784 {
785 /*
786 * Encode the UTF-16 string.
787 */
788 rc = rtUtf8RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1);
789 if (RT_SUCCESS(rc))
790 {
791 *ppwsz = pwszResult;
792 return rc;
793 }
794 if (fShouldFree)
795 RTMemFree(pwszResult);
796 }
797 else
798 rc = VERR_NO_UTF16_MEMORY;
799 }
800 return rc;
801}
802RT_EXPORT_SYMBOL(RTStrToUtf16ExTag);
803
804
805RTDECL(size_t) RTStrCalcUtf16Len(const char *psz)
806{
807 size_t cwc;
808 int rc = rtUtf8CalcUtf16Length(psz, RTSTR_MAX, &cwc);
809 return RT_SUCCESS(rc) ? cwc : 0;
810}
811RT_EXPORT_SYMBOL(RTStrCalcUtf16Len);
812
813
814RTDECL(int) RTStrCalcUtf16LenEx(const char *psz, size_t cch, size_t *pcwc)
815{
816 size_t cwc;
817 int rc = rtUtf8CalcUtf16Length(psz, cch, &cwc);
818 if (pcwc)
819 *pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
820 return rc;
821}
822RT_EXPORT_SYMBOL(RTStrCalcUtf16LenEx);
823
824
825/**
826 * Calculates the length of the UTF-8 encoding of a Latin-1 string.
827 *
828 * @returns iprt status code.
829 * @param psz The Latin-1 string.
830 * @param cchIn The max length of the Latin-1 string to consider.
831 * @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
832 */
833static int rtLatin1CalcUtf8Length(const char *psz, size_t cchIn, size_t *pcch)
834{
835 size_t cch = 0;
836 for (;;)
837 {
838 RTUNICP Cp;
839 int rc = RTLatin1GetCpNEx(&psz, &cchIn, &Cp);
840 if (Cp == 0 || rc == VERR_END_OF_STRING)
841 break;
842 if (RT_FAILURE(rc))
843 return rc;
844 cch += RTStrCpSize(Cp); /* cannot fail */
845 }
846
847 /* done */
848 *pcch = cch;
849 return VINF_SUCCESS;
850}
851
852
853/**
854 * Recodes a Latin-1 string as UTF-8.
855 *
856 * @returns iprt status code.
857 * @param psz The Latin-1 string.
858 * @param cchIn The number of characters to process from psz. The recoding
859 * will stop when cch or '\\0' is reached.
860 * @param psz Where to store the UTF-8 string.
861 * @param cch The size of the UTF-8 buffer, excluding the terminator.
862 */
863static int rtLatin1RecodeAsUtf8(const char *pszIn, size_t cchIn, char *psz, size_t cch)
864{
865 int rc;
866 for (;;)
867 {
868 RTUNICP Cp;
869 size_t cchCp;
870 rc = RTLatin1GetCpNEx(&pszIn, &cchIn, &Cp);
871 if (Cp == 0 || RT_FAILURE(rc))
872 break;
873 cchCp = RTStrCpSize(Cp);
874 if (RT_UNLIKELY(cch < cchCp))
875 {
876 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
877 rc = VERR_BUFFER_OVERFLOW;
878 break;
879 }
880 cch -= cchCp;
881 psz = RTStrPutCp(psz, Cp);
882 }
883
884 /* done */
885 if (rc == VERR_END_OF_STRING)
886 rc = VINF_SUCCESS;
887 *psz = '\0';
888 return rc;
889}
890
891
892
893RTDECL(int) RTLatin1ToUtf8Tag(const char *pszString, char **ppszString, const char *pszTag)
894{
895 /*
896 * Validate input.
897 */
898 Assert(VALID_PTR(ppszString));
899 Assert(VALID_PTR(pszString));
900 *ppszString = NULL;
901
902 /*
903 * Calculate the length of the UTF-8 encoding of the Latin-1 string.
904 */
905 size_t cch;
906 int rc = rtLatin1CalcUtf8Length(pszString, RTSTR_MAX, &cch);
907 if (RT_SUCCESS(rc))
908 {
909 /*
910 * Allocate buffer and recode it.
911 */
912 char *pszResult = (char *)RTMemAllocTag(cch + 1, pszTag);
913 if (pszResult)
914 {
915 rc = rtLatin1RecodeAsUtf8(pszString, RTSTR_MAX, pszResult, cch);
916 if (RT_SUCCESS(rc))
917 {
918 *ppszString = pszResult;
919 return rc;
920 }
921
922 RTMemFree(pszResult);
923 }
924 else
925 rc = VERR_NO_STR_MEMORY;
926 }
927 return rc;
928}
929RT_EXPORT_SYMBOL(RTLatin1ToUtf8Tag);
930
931
932RTDECL(int) RTLatin1ToUtf8ExTag(const char *pszString, size_t cchString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
933{
934 /*
935 * Validate input.
936 */
937 Assert(VALID_PTR(pszString));
938 Assert(VALID_PTR(ppsz));
939 Assert(!pcch || VALID_PTR(pcch));
940
941 /*
942 * Calculate the length of the UTF-8 encoding of the Latin-1 string.
943 */
944 size_t cchResult;
945 int rc = rtLatin1CalcUtf8Length(pszString, cchString, &cchResult);
946 if (RT_SUCCESS(rc))
947 {
948 if (pcch)
949 *pcch = cchResult;
950
951 /*
952 * Check buffer size / Allocate buffer and recode it.
953 */
954 bool fShouldFree;
955 char *pszResult;
956 if (cch > 0 && *ppsz)
957 {
958 fShouldFree = false;
959 if (RT_UNLIKELY(cch <= cchResult))
960 return VERR_BUFFER_OVERFLOW;
961 pszResult = *ppsz;
962 }
963 else
964 {
965 *ppsz = NULL;
966 fShouldFree = true;
967 cch = RT_MAX(cch, cchResult + 1);
968 pszResult = (char *)RTStrAllocTag(cch, pszTag);
969 }
970 if (pszResult)
971 {
972 rc = rtLatin1RecodeAsUtf8(pszString, cchString, pszResult, cch - 1);
973 if (RT_SUCCESS(rc))
974 {
975 *ppsz = pszResult;
976 return rc;
977 }
978
979 if (fShouldFree)
980 RTStrFree(pszResult);
981 }
982 else
983 rc = VERR_NO_STR_MEMORY;
984 }
985 return rc;
986}
987RT_EXPORT_SYMBOL(RTLatin1ToUtf8ExTag);
988
989
990RTDECL(size_t) RTLatin1CalcUtf8Len(const char *psz)
991{
992 size_t cch;
993 int rc = rtLatin1CalcUtf8Length(psz, RTSTR_MAX, &cch);
994 return RT_SUCCESS(rc) ? cch : 0;
995}
996RT_EXPORT_SYMBOL(RTLatin1CalcUtf8Len);
997
998
999RTDECL(int) RTLatin1CalcUtf8LenEx(const char *psz, size_t cchIn, size_t *pcch)
1000{
1001 size_t cch;
1002 int rc = rtLatin1CalcUtf8Length(psz, cchIn, &cch);
1003 if (pcch)
1004 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1005 return rc;
1006}
1007RT_EXPORT_SYMBOL(RTLatin1CalcUtf8LenEx);
1008
1009
1010/**
1011 * Calculates the Latin-1 length of a string, validating the encoding while
1012 * doing so.
1013 *
1014 * @returns IPRT status code.
1015 * @param psz Pointer to the UTF-8 string.
1016 * @param cchIn The max length of the string. (btw cch = cb)
1017 * Use RTSTR_MAX if all of the string is to be examined.
1018 * @param pcch Where to store the length of the Latin-1 string in bytes.
1019 */
1020static int rtUtf8CalcLatin1Length(const char *psz, size_t cchIn, size_t *pcch)
1021{
1022 size_t cch = 0;
1023 for (;;)
1024 {
1025 RTUNICP Cp;
1026 size_t cchCp;
1027 int rc = RTStrGetCpNEx(&psz, &cchIn, &Cp);
1028 if (Cp == 0 || rc == VERR_END_OF_STRING)
1029 break;
1030 if (RT_FAILURE(rc))
1031 return rc;
1032 cchCp = RTLatin1CpSize(Cp);
1033 if (cchCp == 0)
1034 return VERR_NO_TRANSLATION;
1035 cch += cchCp;
1036 }
1037
1038 /* done */
1039 *pcch = cch;
1040 return VINF_SUCCESS;
1041}
1042
1043
1044/**
1045 * Recodes a valid UTF-8 string as Latin-1.
1046 *
1047 * Since we know the input is valid, we do *not* perform encoding or length checks.
1048 *
1049 * @returns iprt status code.
1050 * @param pszIn The UTF-8 string to recode. This is a valid encoding.
1051 * @param cchIn The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
1052 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
1053 * @param psz Where to store the Latin-1 string.
1054 * @param cch The number of characters the pszOut buffer can hold, excluding the terminator ('\\0').
1055 */
1056static int rtUtf8RecodeAsLatin1(const char *pszIn, size_t cchIn, char *psz, size_t cch)
1057{
1058 int rc;
1059 for (;;)
1060 {
1061 RTUNICP Cp;
1062 size_t cchCp;
1063 rc = RTStrGetCpNEx(&pszIn, &cchIn, &Cp);
1064 if (Cp == 0 || RT_FAILURE(rc))
1065 break;
1066 cchCp = RTLatin1CpSize(Cp);
1067 if (RT_UNLIKELY(cch < cchCp))
1068 {
1069 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
1070 rc = VERR_BUFFER_OVERFLOW;
1071 break;
1072 }
1073 cch -= cchCp;
1074 psz = RTLatin1PutCp(psz, Cp);
1075 }
1076
1077 /* done */
1078 if (rc == VERR_END_OF_STRING)
1079 rc = VINF_SUCCESS;
1080 *psz = '\0';
1081 return rc;
1082}
1083
1084
1085
1086RTDECL(int) RTStrToLatin1Tag(const char *pszString, char **ppszString, const char *pszTag)
1087{
1088 /*
1089 * Validate input.
1090 */
1091 Assert(VALID_PTR(ppszString));
1092 Assert(VALID_PTR(pszString));
1093 *ppszString = NULL;
1094
1095 /*
1096 * Validate the UTF-8 input and calculate the length of the Latin-1 string.
1097 */
1098 size_t cch;
1099 int rc = rtUtf8CalcLatin1Length(pszString, RTSTR_MAX, &cch);
1100 if (RT_SUCCESS(rc))
1101 {
1102 /*
1103 * Allocate buffer.
1104 */
1105 char *psz = (char *)RTMemAllocTag(cch + 1, pszTag);
1106 if (psz)
1107 {
1108 /*
1109 * Encode the UTF-16 string.
1110 */
1111 rc = rtUtf8RecodeAsLatin1(pszString, RTSTR_MAX, psz, cch);
1112 if (RT_SUCCESS(rc))
1113 {
1114 *ppszString = psz;
1115 return rc;
1116 }
1117 RTMemFree(psz);
1118 }
1119 else
1120 rc = VERR_NO_STR_MEMORY;
1121 }
1122 return rc;
1123}
1124RT_EXPORT_SYMBOL(RTStrToLatin1Tag);
1125
1126
1127RTDECL(int) RTStrToLatin1ExTag(const char *pszString, size_t cchString,
1128 char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
1129{
1130 /*
1131 * Validate input.
1132 */
1133 Assert(VALID_PTR(pszString));
1134 Assert(VALID_PTR(ppsz));
1135 Assert(!pcch || VALID_PTR(pcch));
1136
1137 /*
1138 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
1139 */
1140 size_t cchResult;
1141 int rc = rtUtf8CalcLatin1Length(pszString, cchString, &cchResult);
1142 if (RT_SUCCESS(rc))
1143 {
1144 if (pcch)
1145 *pcch = cchResult;
1146
1147 /*
1148 * Check buffer size / Allocate buffer.
1149 */
1150 bool fShouldFree;
1151 char *pszResult;
1152 if (cch > 0 && *ppsz)
1153 {
1154 fShouldFree = false;
1155 if (cch <= cchResult)
1156 return VERR_BUFFER_OVERFLOW;
1157 pszResult = *ppsz;
1158 }
1159 else
1160 {
1161 *ppsz = NULL;
1162 fShouldFree = true;
1163 cch = RT_MAX(cchResult + 1, cch);
1164 pszResult = (char *)RTMemAllocTag(cch, pszTag);
1165 }
1166 if (pszResult)
1167 {
1168 /*
1169 * Encode the Latin-1 string.
1170 */
1171 rc = rtUtf8RecodeAsLatin1(pszString, cchString, pszResult, cch - 1);
1172 if (RT_SUCCESS(rc))
1173 {
1174 *ppsz = pszResult;
1175 return rc;
1176 }
1177 if (fShouldFree)
1178 RTMemFree(pszResult);
1179 }
1180 else
1181 rc = VERR_NO_STR_MEMORY;
1182 }
1183 return rc;
1184}
1185RT_EXPORT_SYMBOL(RTStrToLatin1Tag);
1186
1187
1188RTDECL(size_t) RTStrCalcLatin1Len(const char *psz)
1189{
1190 size_t cch;
1191 int rc = rtUtf8CalcLatin1Length(psz, RTSTR_MAX, &cch);
1192 return RT_SUCCESS(rc) ? cch : 0;
1193}
1194RT_EXPORT_SYMBOL(RTStrCalcLatin1Len);
1195
1196
1197RTDECL(int) RTStrCalcLatin1LenEx(const char *psz, size_t cchIn, size_t *pcch)
1198{
1199 size_t cch;
1200 int rc = rtUtf8CalcLatin1Length(psz, cchIn, &cch);
1201 if (pcch)
1202 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1203 return rc;
1204}
1205RT_EXPORT_SYMBOL(RTStrCalcLatin1LenEx);
1206
1207
1208/**
1209 * Handle invalid encodings passed to RTStrGetCp() and RTStrGetCpEx().
1210 * @returns rc
1211 * @param ppsz The pointer to the string position point.
1212 * @param pCp Where to store RTUNICP_INVALID.
1213 * @param rc The iprt error code.
1214 */
1215static int rtStrGetCpExFailure(const char **ppsz, PRTUNICP pCp, int rc)
1216{
1217 /*
1218 * Try find a valid encoding.
1219 */
1220 (*ppsz)++; /** @todo code this! */
1221 *pCp = RTUNICP_INVALID;
1222 return rc;
1223}
1224
1225
1226RTDECL(RTUNICP) RTStrGetCpInternal(const char *psz)
1227{
1228 RTUNICP Cp;
1229 RTStrGetCpExInternal(&psz, &Cp);
1230 return Cp;
1231}
1232RT_EXPORT_SYMBOL(RTStrGetCpInternal);
1233
1234
1235RTDECL(int) RTStrGetCpExInternal(const char **ppsz, PRTUNICP pCp)
1236{
1237 const unsigned char *puch = (const unsigned char *)*ppsz;
1238 const unsigned char uch = *puch;
1239 RTUNICP uc;
1240
1241 /* ASCII ? */
1242 if (!(uch & RT_BIT(7)))
1243 {
1244 uc = uch;
1245 puch++;
1246 }
1247 else if (uch & RT_BIT(6))
1248 {
1249 /* figure the length and validate the first octet. */
1250/** @todo RT_USE_RTC_3629 */
1251 unsigned cb;
1252 if (!(uch & RT_BIT(5)))
1253 cb = 2;
1254 else if (!(uch & RT_BIT(4)))
1255 cb = 3;
1256 else if (!(uch & RT_BIT(3)))
1257 cb = 4;
1258 else if (!(uch & RT_BIT(2)))
1259 cb = 5;
1260 else if (!(uch & RT_BIT(1)))
1261 cb = 6;
1262 else
1263 {
1264 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1265 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
1266 }
1267
1268 /* validate the rest */
1269 switch (cb)
1270 {
1271 case 6:
1272 RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1273 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1274 case 5:
1275 RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1276 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1277 case 4:
1278 RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1279 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1280 case 3:
1281 RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1282 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1283 case 2:
1284 RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1285 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1286 break;
1287 }
1288
1289 /* get and validate the code point. */
1290 switch (cb)
1291 {
1292 case 6:
1293 uc = (puch[5] & 0x3f)
1294 | ((RTUNICP)(puch[4] & 0x3f) << 6)
1295 | ((RTUNICP)(puch[3] & 0x3f) << 12)
1296 | ((RTUNICP)(puch[2] & 0x3f) << 18)
1297 | ((RTUNICP)(puch[1] & 0x3f) << 24)
1298 | ((RTUNICP)(uch & 0x01) << 30);
1299 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1300 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1301 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1302 break;
1303 case 5:
1304 uc = (puch[4] & 0x3f)
1305 | ((RTUNICP)(puch[3] & 0x3f) << 6)
1306 | ((RTUNICP)(puch[2] & 0x3f) << 12)
1307 | ((RTUNICP)(puch[1] & 0x3f) << 18)
1308 | ((RTUNICP)(uch & 0x03) << 24);
1309 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1310 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1311 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1312 break;
1313 case 4:
1314 uc = (puch[3] & 0x3f)
1315 | ((RTUNICP)(puch[2] & 0x3f) << 6)
1316 | ((RTUNICP)(puch[1] & 0x3f) << 12)
1317 | ((RTUNICP)(uch & 0x07) << 18);
1318 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1319 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1320 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1321 break;
1322 case 3:
1323 uc = (puch[2] & 0x3f)
1324 | ((RTUNICP)(puch[1] & 0x3f) << 6)
1325 | ((RTUNICP)(uch & 0x0f) << 12);
1326 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1327 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1328 rtStrGetCpExFailure(ppsz, pCp, uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1329 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
1330 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1331 rtStrGetCpExFailure(ppsz, pCp, VERR_CODE_POINT_SURROGATE));
1332 break;
1333 case 2:
1334 uc = (puch[1] & 0x3f)
1335 | ((RTUNICP)(uch & 0x1f) << 6);
1336 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1337 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1338 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1339 break;
1340 default: /* impossible, but GCC is bitching. */
1341 uc = RTUNICP_INVALID;
1342 break;
1343 }
1344 puch += cb;
1345 }
1346 else
1347 {
1348 /* 6th bit is always set. */
1349 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1350 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
1351 }
1352 *pCp = uc;
1353 *ppsz = (const char *)puch;
1354 return VINF_SUCCESS;
1355}
1356RT_EXPORT_SYMBOL(RTStrGetCpExInternal);
1357
1358
1359/**
1360 * Handle invalid encodings passed to RTStrGetCpNEx().
1361 * @returns rc
1362 * @param ppsz The pointer to the string position point.
1363 * @param pcch Pointer to the string length.
1364 * @param pCp Where to store RTUNICP_INVALID.
1365 * @param rc The iprt error code.
1366 */
1367static int rtStrGetCpNExFailure(const char **ppsz, size_t *pcch, PRTUNICP pCp, int rc)
1368{
1369 /*
1370 * Try find a valid encoding.
1371 */
1372 (*ppsz)++; /** @todo code this! */
1373 (*pcch)--;
1374 *pCp = RTUNICP_INVALID;
1375 return rc;
1376}
1377
1378
1379RTDECL(int) RTStrGetCpNExInternal(const char **ppsz, size_t *pcch, PRTUNICP pCp)
1380{
1381 const unsigned char *puch = (const unsigned char *)*ppsz;
1382 const unsigned char uch = *puch;
1383 size_t cch = *pcch;
1384 RTUNICP uc;
1385
1386 if (cch == 0)
1387 {
1388 *pCp = RTUNICP_INVALID;
1389 return VERR_END_OF_STRING;
1390 }
1391
1392 /* ASCII ? */
1393 if (!(uch & RT_BIT(7)))
1394 {
1395 uc = uch;
1396 puch++;
1397 cch--;
1398 }
1399 else if (uch & RT_BIT(6))
1400 {
1401 /* figure the length and validate the first octet. */
1402/** @todo RT_USE_RTC_3629 */
1403 unsigned cb;
1404 if (!(uch & RT_BIT(5)))
1405 cb = 2;
1406 else if (!(uch & RT_BIT(4)))
1407 cb = 3;
1408 else if (!(uch & RT_BIT(3)))
1409 cb = 4;
1410 else if (!(uch & RT_BIT(2)))
1411 cb = 5;
1412 else if (!(uch & RT_BIT(1)))
1413 cb = 6;
1414 else
1415 {
1416 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1417 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1418 }
1419
1420 if (cb > cch)
1421 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1422
1423 /* validate the rest */
1424 switch (cb)
1425 {
1426 case 6:
1427 RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1428 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1429 case 5:
1430 RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1431 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1432 case 4:
1433 RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1434 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1435 case 3:
1436 RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1437 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1438 case 2:
1439 RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1440 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1441 break;
1442 }
1443
1444 /* get and validate the code point. */
1445 switch (cb)
1446 {
1447 case 6:
1448 uc = (puch[5] & 0x3f)
1449 | ((RTUNICP)(puch[4] & 0x3f) << 6)
1450 | ((RTUNICP)(puch[3] & 0x3f) << 12)
1451 | ((RTUNICP)(puch[2] & 0x3f) << 18)
1452 | ((RTUNICP)(puch[1] & 0x3f) << 24)
1453 | ((RTUNICP)(uch & 0x01) << 30);
1454 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1455 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1456 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1457 break;
1458 case 5:
1459 uc = (puch[4] & 0x3f)
1460 | ((RTUNICP)(puch[3] & 0x3f) << 6)
1461 | ((RTUNICP)(puch[2] & 0x3f) << 12)
1462 | ((RTUNICP)(puch[1] & 0x3f) << 18)
1463 | ((RTUNICP)(uch & 0x03) << 24);
1464 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1465 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1466 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1467 break;
1468 case 4:
1469 uc = (puch[3] & 0x3f)
1470 | ((RTUNICP)(puch[2] & 0x3f) << 6)
1471 | ((RTUNICP)(puch[1] & 0x3f) << 12)
1472 | ((RTUNICP)(uch & 0x07) << 18);
1473 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1474 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1475 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1476 break;
1477 case 3:
1478 uc = (puch[2] & 0x3f)
1479 | ((RTUNICP)(puch[1] & 0x3f) << 6)
1480 | ((RTUNICP)(uch & 0x0f) << 12);
1481 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1482 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1483 rtStrGetCpNExFailure(ppsz, pcch, pCp, uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1484 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
1485 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1486 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_CODE_POINT_SURROGATE));
1487 break;
1488 case 2:
1489 uc = (puch[1] & 0x3f)
1490 | ((RTUNICP)(uch & 0x1f) << 6);
1491 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1492 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1493 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1494 break;
1495 default: /* impossible, but GCC is bitching. */
1496 uc = RTUNICP_INVALID;
1497 break;
1498 }
1499 puch += cb;
1500 cch -= cb;
1501 }
1502 else
1503 {
1504 /* 6th bit is always set. */
1505 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1506 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1507 }
1508 *pCp = uc;
1509 *ppsz = (const char *)puch;
1510 (*pcch) = cch;
1511 return VINF_SUCCESS;
1512}
1513RT_EXPORT_SYMBOL(RTStrGetCpNExInternal);
1514
1515
1516RTDECL(char *) RTStrPutCpInternal(char *psz, RTUNICP uc)
1517{
1518 unsigned char *puch = (unsigned char *)psz;
1519 if (uc < 0x80)
1520 *puch++ = (unsigned char )uc;
1521 else if (uc < 0x00000800)
1522 {
1523 *puch++ = 0xc0 | (uc >> 6);
1524 *puch++ = 0x80 | (uc & 0x3f);
1525 }
1526 else if (uc < 0x00010000)
1527 {
1528/** @todo RT_USE_RTC_3629 */
1529 if ( uc < 0x0000d8000
1530 || ( uc > 0x0000dfff
1531 && uc < 0x0000fffe))
1532 {
1533 *puch++ = 0xe0 | (uc >> 12);
1534 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1535 *puch++ = 0x80 | (uc & 0x3f);
1536 }
1537 else
1538 {
1539 AssertMsgFailed(("Invalid code point U+%05x!\n", uc));
1540 *puch++ = 0x7f;
1541 }
1542 }
1543/** @todo RT_USE_RTC_3629 */
1544 else if (uc < 0x00200000)
1545 {
1546 *puch++ = 0xf0 | (uc >> 18);
1547 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1548 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1549 *puch++ = 0x80 | (uc & 0x3f);
1550 }
1551 else if (uc < 0x04000000)
1552 {
1553 *puch++ = 0xf8 | (uc >> 24);
1554 *puch++ = 0x80 | ((uc >> 18) & 0x3f);
1555 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1556 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1557 *puch++ = 0x80 | (uc & 0x3f);
1558 }
1559 else if (uc <= 0x7fffffff)
1560 {
1561 *puch++ = 0xfc | (uc >> 30);
1562 *puch++ = 0x80 | ((uc >> 24) & 0x3f);
1563 *puch++ = 0x80 | ((uc >> 18) & 0x3f);
1564 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1565 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1566 *puch++ = 0x80 | (uc & 0x3f);
1567 }
1568 else
1569 {
1570 AssertMsgFailed(("Invalid code point U+%08x!\n", uc));
1571 *puch++ = 0x7f;
1572 }
1573
1574 return (char *)puch;
1575}
1576RT_EXPORT_SYMBOL(RTStrPutCpInternal);
1577
1578
1579RTDECL(char *) RTStrPrevCp(const char *pszStart, const char *psz)
1580{
1581 if (pszStart < psz)
1582 {
1583 /* simple char? */
1584 const unsigned char *puch = (const unsigned char *)psz;
1585 unsigned uch = *--puch;
1586 if (!(uch & RT_BIT(7)))
1587 return (char *)puch;
1588 RTStrAssertMsgReturn(!(uch & RT_BIT(6)), ("uch=%#x\n", uch), (char *)pszStart);
1589
1590 /* two or more. */
1591 uint32_t uMask = 0xffffffc0;
1592 while ( (const unsigned char *)pszStart < puch
1593 && !(uMask & 1))
1594 {
1595 uch = *--puch;
1596 if ((uch & 0xc0) != 0x80)
1597 {
1598 RTStrAssertMsgReturn((uch & (uMask >> 1)) == (uMask & 0xff),
1599 ("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz),
1600 (char *)pszStart);
1601 return (char *)puch;
1602 }
1603 uMask >>= 1;
1604 }
1605 RTStrAssertMsgFailed(("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz));
1606 }
1607 return (char *)pszStart;
1608}
1609RT_EXPORT_SYMBOL(RTStrPrevCp);
1610
Note: See TracBrowser for help on using the repository browser.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette