VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/string/utf-8.cpp@ 28800

Last change on this file since 28800 was 28800, checked in by vboxsync, 15 years ago

Automated rebranding to Oracle copyright/license strings via filemuncher

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 43.3 KB
Line 
1/* $Id: utf-8.cpp 28800 2010-04-27 08:22:32Z vboxsync $ */
2/** @file
3 * IPRT - UTF-8 Decoding.
4 */
5
6/*
7 * Copyright (C) 2006-2009 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.215389.xyz. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27
28/*******************************************************************************
29* Header Files *
30*******************************************************************************/
31#include <iprt/string.h>
32#include "internal/iprt.h"
33
34#include <iprt/uni.h>
35#include <iprt/alloc.h>
36#include <iprt/assert.h>
37#include <iprt/err.h>
38#include "internal/string.h"
39
40
41
42/**
43 * Get get length in code points of a UTF-8 encoded string.
44 * The string is validated while doing this.
45 *
46 * @returns IPRT status code.
47 * @param psz Pointer to the UTF-8 string.
48 * @param cch The max length of the string. (btw cch = cb)
49 * Use RTSTR_MAX if all of the string is to be examined.
50 * @param pcuc Where to store the length in unicode code points.
51 * @param pcchActual Where to store the actual size of the UTF-8 string
52 * on success (cch = cb again). Optional.
53 */
54static int rtUtf8Length(const char *psz, size_t cch, size_t *pcuc, size_t *pcchActual)
55{
56 const unsigned char *puch = (const unsigned char *)psz;
57 size_t cCodePoints = 0;
58 while (cch > 0)
59 {
60 const unsigned char uch = *puch;
61 if (!uch)
62 break;
63 if (uch & RT_BIT(7))
64 {
65 /* figure sequence length and validate the first byte */
66 unsigned cb;
67 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
68 cb = 2;
69 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
70 cb = 3;
71 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
72 cb = 4;
73 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
74 cb = 5;
75 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
76 cb = 6;
77 else
78 {
79 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
80 return VERR_INVALID_UTF8_ENCODING;
81 }
82
83 /* check length */
84 if (cb > cch)
85 {
86 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
87 return VERR_INVALID_UTF8_ENCODING;
88 }
89
90 /* validate the rest */
91 switch (cb)
92 {
93 case 6:
94 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
95 case 5:
96 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
97 case 4:
98 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
99 case 3:
100 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
101 case 2:
102 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
103 break;
104 }
105
106 /* validate the code point. */
107 RTUNICP uc;
108 switch (cb)
109 {
110 case 6:
111 uc = (puch[5] & 0x3f)
112 | ((RTUNICP)(puch[4] & 0x3f) << 6)
113 | ((RTUNICP)(puch[3] & 0x3f) << 12)
114 | ((RTUNICP)(puch[2] & 0x3f) << 18)
115 | ((RTUNICP)(puch[1] & 0x3f) << 24)
116 | ((RTUNICP)(uch & 0x01) << 30);
117 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
118 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
119 break;
120 case 5:
121 uc = (puch[4] & 0x3f)
122 | ((RTUNICP)(puch[3] & 0x3f) << 6)
123 | ((RTUNICP)(puch[2] & 0x3f) << 12)
124 | ((RTUNICP)(puch[1] & 0x3f) << 18)
125 | ((RTUNICP)(uch & 0x03) << 24);
126 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
127 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
128 break;
129 case 4:
130 uc = (puch[3] & 0x3f)
131 | ((RTUNICP)(puch[2] & 0x3f) << 6)
132 | ((RTUNICP)(puch[1] & 0x3f) << 12)
133 | ((RTUNICP)(uch & 0x07) << 18);
134 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
135 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
136 break;
137 case 3:
138 uc = (puch[2] & 0x3f)
139 | ((RTUNICP)(puch[1] & 0x3f) << 6)
140 | ((RTUNICP)(uch & 0x0f) << 12);
141 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
142 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
143 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
144 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
145 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
146 break;
147 case 2:
148 uc = (puch[1] & 0x3f)
149 | ((RTUNICP)(uch & 0x1f) << 6);
150 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
151 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
152 break;
153 }
154
155 /* advance */
156 cch -= cb;
157 puch += cb;
158 }
159 else
160 {
161 /* one ASCII byte */
162 puch++;
163 cch--;
164 }
165 cCodePoints++;
166 }
167
168 /* done */
169 *pcuc = cCodePoints;
170 if (pcchActual)
171 *pcchActual = puch - (unsigned char const *)psz;
172 return VINF_SUCCESS;
173}
174
175
176/**
177 * Decodes and UTF-8 string into an array of unicode code point.
178 *
179 * Since we know the input is valid, we do *not* perform encoding or length checks.
180 *
181 * @returns iprt status code.
182 * @param psz The UTF-8 string to recode. This is a valid encoding.
183 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
184 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
185 * @param paCps Where to store the code points array.
186 * @param cCps The number of RTUNICP items the paCps buffer can hold, excluding the terminator ('\\0').
187 */
188static int rtUtf8Decode(const char *psz, size_t cch, PRTUNICP paCps, size_t cCps)
189{
190 int rc = VINF_SUCCESS;
191 const unsigned char *puch = (const unsigned char *)psz;
192 PRTUNICP pCp = paCps;
193 while (cch > 0)
194 {
195 /* read the next char and check for terminator. */
196 const unsigned char uch = *puch;
197 if (!uch)
198 break;
199
200 /* check for output overflow */
201 if (RT_UNLIKELY(cCps < 1))
202 {
203 rc = VERR_BUFFER_OVERFLOW;
204 break;
205 }
206 cCps--;
207
208 /* decode and recode the code point */
209 if (!(uch & RT_BIT(7)))
210 {
211 *pCp++ = uch;
212 puch++;
213 cch--;
214 }
215#ifdef RT_STRICT
216 else if (!(uch & RT_BIT(6)))
217 AssertMsgFailed(("Internal error!\n"));
218#endif
219 else if (!(uch & RT_BIT(5)))
220 {
221 *pCp++ = (puch[1] & 0x3f)
222 | ((uint16_t)(uch & 0x1f) << 6);
223 puch += 2;
224 cch -= 2;
225 }
226 else if (!(uch & RT_BIT(4)))
227 {
228 *pCp++ = (puch[2] & 0x3f)
229 | ((uint16_t)(puch[1] & 0x3f) << 6)
230 | ((uint16_t)(uch & 0x0f) << 12);
231 puch += 3;
232 cch -= 3;
233 }
234 else if (!(uch & RT_BIT(3)))
235 {
236 *pCp++ = (puch[3] & 0x3f)
237 | ((RTUNICP)(puch[2] & 0x3f) << 6)
238 | ((RTUNICP)(puch[1] & 0x3f) << 12)
239 | ((RTUNICP)(uch & 0x07) << 18);
240 puch += 4;
241 cch -= 4;
242 }
243 else if (!(uch & RT_BIT(2)))
244 {
245 *pCp++ = (puch[4] & 0x3f)
246 | ((RTUNICP)(puch[3] & 0x3f) << 6)
247 | ((RTUNICP)(puch[2] & 0x3f) << 12)
248 | ((RTUNICP)(puch[1] & 0x3f) << 18)
249 | ((RTUNICP)(uch & 0x03) << 24);
250 puch += 5;
251 cch -= 6;
252 }
253 else
254 {
255 Assert(!(uch & RT_BIT(1)));
256 *pCp++ = (puch[5] & 0x3f)
257 | ((RTUNICP)(puch[4] & 0x3f) << 6)
258 | ((RTUNICP)(puch[3] & 0x3f) << 12)
259 | ((RTUNICP)(puch[2] & 0x3f) << 18)
260 | ((RTUNICP)(puch[1] & 0x3f) << 24)
261 | ((RTUNICP)(uch & 0x01) << 30);
262 puch += 6;
263 cch -= 6;
264 }
265 }
266
267 /* done */
268 *pCp = 0;
269 return rc;
270}
271
272
273RTDECL(size_t) RTStrUniLen(const char *psz)
274{
275 size_t cCodePoints;
276 int rc = rtUtf8Length(psz, RTSTR_MAX, &cCodePoints, NULL);
277 return RT_SUCCESS(rc) ? cCodePoints : 0;
278}
279RT_EXPORT_SYMBOL(RTStrUniLen);
280
281
282RTDECL(int) RTStrUniLenEx(const char *psz, size_t cch, size_t *pcCps)
283{
284 size_t cCodePoints;
285 int rc = rtUtf8Length(psz, cch, &cCodePoints, NULL);
286 if (pcCps)
287 *pcCps = RT_SUCCESS(rc) ? cCodePoints : 0;
288 return rc;
289}
290RT_EXPORT_SYMBOL(RTStrUniLenEx);
291
292
293RTDECL(int) RTStrValidateEncoding(const char *psz)
294{
295 return RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
296}
297RT_EXPORT_SYMBOL(RTStrValidateEncoding);
298
299
300RTDECL(int) RTStrValidateEncodingEx(const char *psz, size_t cch, uint32_t fFlags)
301{
302 AssertReturn(!(fFlags & ~(RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)), VERR_INVALID_PARAMETER);
303 AssertPtr(psz);
304
305 /*
306 * Use rtUtf8Length for the job.
307 */
308 size_t cchActual;
309 size_t cCpsIgnored;
310 int rc = rtUtf8Length(psz, cch, &cCpsIgnored, &cchActual);
311 if (RT_SUCCESS(rc))
312 {
313 if ( (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
314 && cchActual >= cch)
315 rc = VERR_BUFFER_OVERFLOW;
316 }
317 return rc;
318
319
320 return RTStrUniLenEx(psz, cch, &cCpsIgnored);
321}
322RT_EXPORT_SYMBOL(RTStrValidateEncodingEx);
323
324
325RTDECL(bool) RTStrIsValidEncoding(const char *psz)
326{
327 int rc = RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
328 return RT_SUCCESS(rc);
329}
330RT_EXPORT_SYMBOL(RTStrIsValidEncoding);
331
332
333RTDECL(int) RTStrToUni(const char *pszString, PRTUNICP *ppaCps)
334{
335 /*
336 * Validate input.
337 */
338 Assert(VALID_PTR(pszString));
339 Assert(VALID_PTR(ppaCps));
340 *ppaCps = NULL;
341
342 /*
343 * Validate the UTF-8 input and count its code points.
344 */
345 size_t cCps;
346 int rc = rtUtf8Length(pszString, RTSTR_MAX, &cCps, NULL);
347 if (RT_SUCCESS(rc))
348 {
349 /*
350 * Allocate buffer.
351 */
352 PRTUNICP paCps = (PRTUNICP)RTMemAlloc((cCps + 1) * sizeof(RTUNICP));
353 if (paCps)
354 {
355 /*
356 * Decode the string.
357 */
358 rc = rtUtf8Decode(pszString, RTSTR_MAX, paCps, cCps);
359 if (RT_SUCCESS(rc))
360 {
361 *ppaCps = paCps;
362 return rc;
363 }
364 RTMemFree(paCps);
365 }
366 else
367 rc = VERR_NO_CODE_POINT_MEMORY;
368 }
369 return rc;
370}
371RT_EXPORT_SYMBOL(RTStrToUni);
372
373
374RTDECL(int) RTStrToUniEx(const char *pszString, size_t cchString, PRTUNICP *ppaCps, size_t cCps, size_t *pcCps)
375{
376 /*
377 * Validate input.
378 */
379 Assert(VALID_PTR(pszString));
380 Assert(VALID_PTR(ppaCps));
381 Assert(!pcCps || VALID_PTR(pcCps));
382
383 /*
384 * Validate the UTF-8 input and count the code points.
385 */
386 size_t cCpsResult;
387 int rc = rtUtf8Length(pszString, cchString, &cCpsResult, NULL);
388 if (RT_SUCCESS(rc))
389 {
390 if (pcCps)
391 *pcCps = cCpsResult;
392
393 /*
394 * Check buffer size / Allocate buffer.
395 */
396 bool fShouldFree;
397 PRTUNICP paCpsResult;
398 if (cCps > 0 && *ppaCps)
399 {
400 fShouldFree = false;
401 if (cCps <= cCpsResult)
402 return VERR_BUFFER_OVERFLOW;
403 paCpsResult = *ppaCps;
404 }
405 else
406 {
407 *ppaCps = NULL;
408 fShouldFree = true;
409 cCps = RT_MAX(cCpsResult + 1, cCps);
410 paCpsResult = (PRTUNICP)RTMemAlloc(cCps * sizeof(RTUNICP));
411 }
412 if (paCpsResult)
413 {
414 /*
415 * Encode the UTF-16 string.
416 */
417 rc = rtUtf8Decode(pszString, cchString, paCpsResult, cCps - 1);
418 if (RT_SUCCESS(rc))
419 {
420 *ppaCps = paCpsResult;
421 return rc;
422 }
423 if (fShouldFree)
424 RTMemFree(paCpsResult);
425 }
426 else
427 rc = VERR_NO_CODE_POINT_MEMORY;
428 }
429 return rc;
430}
431RT_EXPORT_SYMBOL(RTStrToUniEx);
432
433
434/**
435 * Calculates the UTF-16 length of a string, validating the encoding while doing so.
436 *
437 * @returns IPRT status code.
438 * @param psz Pointer to the UTF-8 string.
439 * @param cch The max length of the string. (btw cch = cb)
440 * Use RTSTR_MAX if all of the string is to be examined.
441 * @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters.
442 */
443static int rtUtf8CalcUtf16Length(const char *psz, size_t cch, size_t *pcwc)
444{
445 const unsigned char *puch = (const unsigned char *)psz;
446 size_t cwc = 0;
447 while (cch > 0)
448 {
449 const unsigned char uch = *puch;
450 if (!uch)
451 break;
452 if (!(uch & RT_BIT(7)))
453 {
454 /* one ASCII byte */
455 cwc++;
456 puch++;
457 cch--;
458 }
459 else
460 {
461 /* figure sequence length and validate the first byte */
462 unsigned cb;
463 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
464 cb = 2;
465 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
466 cb = 3;
467 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
468 cb = 4;
469 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
470 cb = 5;
471 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
472 cb = 6;
473 else
474 {
475 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
476 return VERR_INVALID_UTF8_ENCODING;
477 }
478
479 /* check length */
480 if (cb > cch)
481 {
482 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
483 return VERR_INVALID_UTF8_ENCODING;
484 }
485
486 /* validate the rest */
487 switch (cb)
488 {
489 case 6:
490 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
491 case 5:
492 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
493 case 4:
494 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
495 case 3:
496 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
497 case 2:
498 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
499 break;
500 }
501
502 /* validate the code point. */
503 RTUNICP uc;
504 switch (cb)
505 {
506 case 6:
507 uc = (puch[5] & 0x3f)
508 | ((RTUNICP)(puch[4] & 0x3f) << 6)
509 | ((RTUNICP)(puch[3] & 0x3f) << 12)
510 | ((RTUNICP)(puch[2] & 0x3f) << 18)
511 | ((RTUNICP)(puch[1] & 0x3f) << 24)
512 | ((RTUNICP)(uch & 0x01) << 30);
513 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
514 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
515 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
516 return VERR_CANT_RECODE_AS_UTF16;
517 case 5:
518 uc = (puch[4] & 0x3f)
519 | ((RTUNICP)(puch[3] & 0x3f) << 6)
520 | ((RTUNICP)(puch[2] & 0x3f) << 12)
521 | ((RTUNICP)(puch[1] & 0x3f) << 18)
522 | ((RTUNICP)(uch & 0x03) << 24);
523 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
524 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
525 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
526 return VERR_CANT_RECODE_AS_UTF16;
527 case 4:
528 uc = (puch[3] & 0x3f)
529 | ((RTUNICP)(puch[2] & 0x3f) << 6)
530 | ((RTUNICP)(puch[1] & 0x3f) << 12)
531 | ((RTUNICP)(uch & 0x07) << 18);
532 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
533 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
534 RTStrAssertMsgReturn(uc <= 0x0010ffff,
535 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
536 cwc++;
537 break;
538 case 3:
539 uc = (puch[2] & 0x3f)
540 | ((RTUNICP)(puch[1] & 0x3f) << 6)
541 | ((RTUNICP)(uch & 0x0f) << 12);
542 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
543 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
544 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
545 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
546 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
547 break;
548 case 2:
549 uc = (puch[1] & 0x3f)
550 | ((RTUNICP)(uch & 0x1f) << 6);
551 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
552 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
553 break;
554 }
555
556 /* advance */
557 cch -= cb;
558 puch += cb;
559 cwc++;
560 }
561 }
562
563 /* done */
564 *pcwc = cwc;
565 return VINF_SUCCESS;
566}
567
568
569/**
570 * Recodes a valid UTF-8 string as UTF-16.
571 *
572 * Since we know the input is valid, we do *not* perform encoding or length checks.
573 *
574 * @returns iprt status code.
575 * @param psz The UTF-8 string to recode. This is a valid encoding.
576 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
577 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
578 * @param pwsz Where to store the UTF-16 string.
579 * @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
580 */
581static int rtUtf8RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc)
582{
583 int rc = VINF_SUCCESS;
584 const unsigned char *puch = (const unsigned char *)psz;
585 PRTUTF16 pwc = pwsz;
586 while (cch > 0)
587 {
588 /* read the next char and check for terminator. */
589 const unsigned char uch = *puch;
590 if (!uch)
591 break;
592
593 /* check for output overflow */
594 if (RT_UNLIKELY(cwc < 1))
595 {
596 rc = VERR_BUFFER_OVERFLOW;
597 break;
598 }
599 cwc--;
600
601 /* decode and recode the code point */
602 if (!(uch & RT_BIT(7)))
603 {
604 *pwc++ = uch;
605 puch++;
606 cch--;
607 }
608 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
609 {
610 uint16_t uc = (puch[1] & 0x3f)
611 | ((uint16_t)(uch & 0x1f) << 6);
612 *pwc++ = uc;
613 puch += 2;
614 cch -= 2;
615 }
616 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
617 {
618 uint16_t uc = (puch[2] & 0x3f)
619 | ((uint16_t)(puch[1] & 0x3f) << 6)
620 | ((uint16_t)(uch & 0x0f) << 12);
621 *pwc++ = uc;
622 puch += 3;
623 cch -= 3;
624 }
625 else
626 {
627 /* generate surrugate pair */
628 Assert((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)));
629 RTUNICP uc = (puch[3] & 0x3f)
630 | ((RTUNICP)(puch[2] & 0x3f) << 6)
631 | ((RTUNICP)(puch[1] & 0x3f) << 12)
632 | ((RTUNICP)(uch & 0x07) << 18);
633 if (RT_UNLIKELY(cwc < 1))
634 {
635 rc = VERR_BUFFER_OVERFLOW;
636 break;
637 }
638 cwc--;
639
640 uc -= 0x10000;
641 *pwc++ = 0xd800 | (uc >> 10);
642 *pwc++ = 0xdc00 | (uc & 0x3ff);
643 puch += 4;
644 cch -= 4;
645 }
646 }
647
648 /* done */
649 *pwc = '\0';
650 return rc;
651}
652
653
654RTDECL(int) RTStrToUtf16(const char *pszString, PRTUTF16 *ppwszString)
655{
656 /*
657 * Validate input.
658 */
659 Assert(VALID_PTR(ppwszString));
660 Assert(VALID_PTR(pszString));
661 *ppwszString = NULL;
662
663 /*
664 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
665 */
666 size_t cwc;
667 int rc = rtUtf8CalcUtf16Length(pszString, RTSTR_MAX, &cwc);
668 if (RT_SUCCESS(rc))
669 {
670 /*
671 * Allocate buffer.
672 */
673 PRTUTF16 pwsz = (PRTUTF16)RTMemAlloc((cwc + 1) * sizeof(RTUTF16));
674 if (pwsz)
675 {
676 /*
677 * Encode the UTF-16 string.
678 */
679 rc = rtUtf8RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc);
680 if (RT_SUCCESS(rc))
681 {
682 *ppwszString = pwsz;
683 return rc;
684 }
685 RTMemFree(pwsz);
686 }
687 else
688 rc = VERR_NO_UTF16_MEMORY;
689 }
690 return rc;
691}
692RT_EXPORT_SYMBOL(RTStrToUtf16);
693
694
695RTDECL(int) RTStrToUtf16Ex(const char *pszString, size_t cchString, PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc)
696{
697 /*
698 * Validate input.
699 */
700 Assert(VALID_PTR(pszString));
701 Assert(VALID_PTR(ppwsz));
702 Assert(!pcwc || VALID_PTR(pcwc));
703
704 /*
705 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
706 */
707 size_t cwcResult;
708 int rc = rtUtf8CalcUtf16Length(pszString, cchString, &cwcResult);
709 if (RT_SUCCESS(rc))
710 {
711 if (pcwc)
712 *pcwc = cwcResult;
713
714 /*
715 * Check buffer size / Allocate buffer.
716 */
717 bool fShouldFree;
718 PRTUTF16 pwszResult;
719 if (cwc > 0 && *ppwsz)
720 {
721 fShouldFree = false;
722 if (cwc <= cwcResult)
723 return VERR_BUFFER_OVERFLOW;
724 pwszResult = *ppwsz;
725 }
726 else
727 {
728 *ppwsz = NULL;
729 fShouldFree = true;
730 cwc = RT_MAX(cwcResult + 1, cwc);
731 pwszResult = (PRTUTF16)RTMemAlloc(cwc * sizeof(RTUTF16));
732 }
733 if (pwszResult)
734 {
735 /*
736 * Encode the UTF-16 string.
737 */
738 rc = rtUtf8RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1);
739 if (RT_SUCCESS(rc))
740 {
741 *ppwsz = pwszResult;
742 return rc;
743 }
744 if (fShouldFree)
745 RTMemFree(pwszResult);
746 }
747 else
748 rc = VERR_NO_UTF16_MEMORY;
749 }
750 return rc;
751}
752RT_EXPORT_SYMBOL(RTStrToUtf16Ex);
753
754
755RTDECL(size_t) RTStrCalcUtf16Len(const char *psz)
756{
757 size_t cwc;
758 int rc = rtUtf8CalcUtf16Length(psz, RTSTR_MAX, &cwc);
759 return RT_SUCCESS(rc) ? cwc : 0;
760}
761RT_EXPORT_SYMBOL(RTStrCalcUtf16Len);
762
763
764RTDECL(int) RTStrCalcUtf16LenEx(const char *psz, size_t cch, size_t *pcwc)
765{
766 size_t cwc;
767 int rc = rtUtf8CalcUtf16Length(psz, cch, &cwc);
768 if (pcwc)
769 *pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
770 return rc;
771}
772RT_EXPORT_SYMBOL(RTStrCalcUtf16LenEx);
773
774
775/**
776 * Handle invalid encodings passed to RTStrGetCp() and RTStrGetCpEx().
777 * @returns rc
778 * @param ppsz The pointer to the string position point.
779 * @param pCp Where to store RTUNICP_INVALID.
780 * @param rc The iprt error code.
781 */
782static int rtStrGetCpExFailure(const char **ppsz, PRTUNICP pCp, int rc)
783{
784 /*
785 * Try find a valid encoding.
786 */
787 (*ppsz)++; /** @todo code this! */
788 *pCp = RTUNICP_INVALID;
789 return rc;
790}
791
792
793RTDECL(RTUNICP) RTStrGetCpInternal(const char *psz)
794{
795 RTUNICP Cp;
796 RTStrGetCpExInternal(&psz, &Cp);
797 return Cp;
798}
799RT_EXPORT_SYMBOL(RTStrGetCpInternal);
800
801
802RTDECL(int) RTStrGetCpExInternal(const char **ppsz, PRTUNICP pCp)
803{
804 const unsigned char *puch = (const unsigned char *)*ppsz;
805 const unsigned char uch = *puch;
806 RTUNICP uc;
807
808 /* ASCII ? */
809 if (!(uch & RT_BIT(7)))
810 {
811 uc = uch;
812 puch++;
813 }
814 else if (uch & RT_BIT(6))
815 {
816 /* figure the length and validate the first octet. */
817 unsigned cb;
818 if (!(uch & RT_BIT(5)))
819 cb = 2;
820 else if (!(uch & RT_BIT(4)))
821 cb = 3;
822 else if (!(uch & RT_BIT(3)))
823 cb = 4;
824 else if (!(uch & RT_BIT(2)))
825 cb = 5;
826 else if (!(uch & RT_BIT(1)))
827 cb = 6;
828 else
829 {
830 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
831 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
832 }
833
834 /* validate the rest */
835 switch (cb)
836 {
837 case 6:
838 RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
839 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
840 case 5:
841 RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
842 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
843 case 4:
844 RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
845 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
846 case 3:
847 RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
848 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
849 case 2:
850 RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
851 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
852 break;
853 }
854
855 /* get and validate the code point. */
856 switch (cb)
857 {
858 case 6:
859 uc = (puch[5] & 0x3f)
860 | ((RTUNICP)(puch[4] & 0x3f) << 6)
861 | ((RTUNICP)(puch[3] & 0x3f) << 12)
862 | ((RTUNICP)(puch[2] & 0x3f) << 18)
863 | ((RTUNICP)(puch[1] & 0x3f) << 24)
864 | ((RTUNICP)(uch & 0x01) << 30);
865 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
866 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
867 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
868 break;
869 case 5:
870 uc = (puch[4] & 0x3f)
871 | ((RTUNICP)(puch[3] & 0x3f) << 6)
872 | ((RTUNICP)(puch[2] & 0x3f) << 12)
873 | ((RTUNICP)(puch[1] & 0x3f) << 18)
874 | ((RTUNICP)(uch & 0x03) << 24);
875 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
876 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
877 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
878 break;
879 case 4:
880 uc = (puch[3] & 0x3f)
881 | ((RTUNICP)(puch[2] & 0x3f) << 6)
882 | ((RTUNICP)(puch[1] & 0x3f) << 12)
883 | ((RTUNICP)(uch & 0x07) << 18);
884 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
885 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
886 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
887 break;
888 case 3:
889 uc = (puch[2] & 0x3f)
890 | ((RTUNICP)(puch[1] & 0x3f) << 6)
891 | ((RTUNICP)(uch & 0x0f) << 12);
892 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
893 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
894 rtStrGetCpExFailure(ppsz, pCp, uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
895 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
896 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
897 rtStrGetCpExFailure(ppsz, pCp, VERR_CODE_POINT_SURROGATE));
898 break;
899 case 2:
900 uc = (puch[1] & 0x3f)
901 | ((RTUNICP)(uch & 0x1f) << 6);
902 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
903 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
904 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
905 break;
906 default: /* impossible, but GCC is bitching. */
907 uc = RTUNICP_INVALID;
908 break;
909 }
910 puch += cb;
911 }
912 else
913 {
914 /* 6th bit is always set. */
915 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
916 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
917 }
918 *pCp = uc;
919 *ppsz = (const char *)puch;
920 return VINF_SUCCESS;
921}
922RT_EXPORT_SYMBOL(RTStrGetCpExInternal);
923
924
925/**
926 * Handle invalid encodings passed to RTStrGetCpNEx().
927 * @returns rc
928 * @param ppsz The pointer to the string position point.
929 * @param pcch Pointer to the string length.
930 * @param pCp Where to store RTUNICP_INVALID.
931 * @param rc The iprt error code.
932 */
933static int rtStrGetCpNExFailure(const char **ppsz, size_t *pcch, PRTUNICP pCp, int rc)
934{
935 /*
936 * Try find a valid encoding.
937 */
938 (*ppsz)++; /** @todo code this! */
939 (*pcch)--;
940 *pCp = RTUNICP_INVALID;
941 return rc;
942}
943
944
945RTDECL(int) RTStrGetCpNExInternal(const char **ppsz, size_t *pcch, PRTUNICP pCp)
946{
947 const unsigned char *puch = (const unsigned char *)*ppsz;
948 const unsigned char uch = *puch;
949 size_t cch = *pcch;
950 RTUNICP uc;
951
952 if (cch == 0)
953 {
954 *pCp = RTUNICP_INVALID;
955 return VERR_END_OF_STRING;
956 }
957
958 /* ASCII ? */
959 if (!(uch & RT_BIT(7)))
960 {
961 uc = uch;
962 puch++;
963 cch--;
964 }
965 else if (uch & RT_BIT(6))
966 {
967 /* figure the length and validate the first octet. */
968 unsigned cb;
969 if (!(uch & RT_BIT(5)))
970 cb = 2;
971 else if (!(uch & RT_BIT(4)))
972 cb = 3;
973 else if (!(uch & RT_BIT(3)))
974 cb = 4;
975 else if (!(uch & RT_BIT(2)))
976 cb = 5;
977 else if (!(uch & RT_BIT(1)))
978 cb = 6;
979 else
980 {
981 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
982 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
983 }
984
985 if (cb > cch)
986 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
987
988 /* validate the rest */
989 switch (cb)
990 {
991 case 6:
992 RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
993 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
994 case 5:
995 RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
996 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
997 case 4:
998 RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
999 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1000 case 3:
1001 RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1002 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1003 case 2:
1004 RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1005 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1006 break;
1007 }
1008
1009 /* get and validate the code point. */
1010 switch (cb)
1011 {
1012 case 6:
1013 uc = (puch[5] & 0x3f)
1014 | ((RTUNICP)(puch[4] & 0x3f) << 6)
1015 | ((RTUNICP)(puch[3] & 0x3f) << 12)
1016 | ((RTUNICP)(puch[2] & 0x3f) << 18)
1017 | ((RTUNICP)(puch[1] & 0x3f) << 24)
1018 | ((RTUNICP)(uch & 0x01) << 30);
1019 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1020 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1021 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1022 break;
1023 case 5:
1024 uc = (puch[4] & 0x3f)
1025 | ((RTUNICP)(puch[3] & 0x3f) << 6)
1026 | ((RTUNICP)(puch[2] & 0x3f) << 12)
1027 | ((RTUNICP)(puch[1] & 0x3f) << 18)
1028 | ((RTUNICP)(uch & 0x03) << 24);
1029 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1030 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1031 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1032 break;
1033 case 4:
1034 uc = (puch[3] & 0x3f)
1035 | ((RTUNICP)(puch[2] & 0x3f) << 6)
1036 | ((RTUNICP)(puch[1] & 0x3f) << 12)
1037 | ((RTUNICP)(uch & 0x07) << 18);
1038 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1039 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1040 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1041 break;
1042 case 3:
1043 uc = (puch[2] & 0x3f)
1044 | ((RTUNICP)(puch[1] & 0x3f) << 6)
1045 | ((RTUNICP)(uch & 0x0f) << 12);
1046 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1047 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1048 rtStrGetCpNExFailure(ppsz, pcch, pCp, uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1049 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
1050 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1051 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_CODE_POINT_SURROGATE));
1052 break;
1053 case 2:
1054 uc = (puch[1] & 0x3f)
1055 | ((RTUNICP)(uch & 0x1f) << 6);
1056 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1057 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1058 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1059 break;
1060 default: /* impossible, but GCC is bitching. */
1061 uc = RTUNICP_INVALID;
1062 break;
1063 }
1064 puch += cb;
1065 cch -= cb;
1066 }
1067 else
1068 {
1069 /* 6th bit is always set. */
1070 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1071 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1072 }
1073 *pCp = uc;
1074 *ppsz = (const char *)puch;
1075 (*pcch) = cch;
1076 return VINF_SUCCESS;
1077}
1078RT_EXPORT_SYMBOL(RTStrGetCpNExInternal);
1079
1080
1081RTDECL(char *) RTStrPutCpInternal(char *psz, RTUNICP uc)
1082{
1083 unsigned char *puch = (unsigned char *)psz;
1084 if (uc < 0x80)
1085 *puch++ = (unsigned char )uc;
1086 else if (uc < 0x00000800)
1087 {
1088 *puch++ = 0xc0 | (uc >> 6);
1089 *puch++ = 0x80 | (uc & 0x3f);
1090 }
1091 else if (uc < 0x00010000)
1092 {
1093 if ( uc < 0x0000d8000
1094 || ( uc > 0x0000dfff
1095 && uc < 0x0000fffe))
1096 {
1097 *puch++ = 0xe0 | (uc >> 12);
1098 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1099 *puch++ = 0x80 | (uc & 0x3f);
1100 }
1101 else
1102 {
1103 AssertMsgFailed(("Invalid code point U+%05x!\n", uc));
1104 *puch++ = 0x7f;
1105 }
1106 }
1107 else if (uc < 0x00200000)
1108 {
1109 *puch++ = 0xf0 | (uc >> 18);
1110 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1111 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1112 *puch++ = 0x80 | (uc & 0x3f);
1113 }
1114 else if (uc < 0x04000000)
1115 {
1116 *puch++ = 0xf8 | (uc >> 24);
1117 *puch++ = 0x80 | ((uc >> 18) & 0x3f);
1118 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1119 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1120 *puch++ = 0x80 | (uc & 0x3f);
1121 }
1122 else if (uc <= 0x7fffffff)
1123 {
1124 *puch++ = 0xfc | (uc >> 30);
1125 *puch++ = 0x80 | ((uc >> 24) & 0x3f);
1126 *puch++ = 0x80 | ((uc >> 18) & 0x3f);
1127 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1128 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1129 *puch++ = 0x80 | (uc & 0x3f);
1130 }
1131 else
1132 {
1133 AssertMsgFailed(("Invalid code point U+%08x!\n", uc));
1134 *puch++ = 0x7f;
1135 }
1136
1137 return (char *)puch;
1138}
1139RT_EXPORT_SYMBOL(RTStrPutCpInternal);
1140
1141
1142RTDECL(char *) RTStrPrevCp(const char *pszStart, const char *psz)
1143{
1144 if (pszStart < psz)
1145 {
1146 /* simple char? */
1147 const unsigned char *puch = (const unsigned char *)psz;
1148 unsigned uch = *--puch;
1149 if (!(uch & RT_BIT(7)))
1150 return (char *)puch;
1151 RTStrAssertMsgReturn(!(uch & RT_BIT(6)), ("uch=%#x\n", uch), (char *)pszStart);
1152
1153 /* two or more. */
1154 uint32_t uMask = 0xffffffc0;
1155 while ( (const unsigned char *)pszStart < puch
1156 && !(uMask & 1))
1157 {
1158 uch = *--puch;
1159 if ((uch & 0xc0) != 0x80)
1160 {
1161 RTStrAssertMsgReturn((uch & (uMask >> 1)) == (uMask & 0xff),
1162 ("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz),
1163 (char *)pszStart);
1164 return (char *)puch;
1165 }
1166 uMask >>= 1;
1167 }
1168 RTStrAssertMsgFailed(("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz));
1169 }
1170 return (char *)pszStart;
1171}
1172RT_EXPORT_SYMBOL(RTStrPrevCp);
1173
Note: See TracBrowser for help on using the repository browser.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette