VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/string/utf-16.cpp@ 21728

Last change on this file since 21728 was 21728, checked in by vboxsync, 16 years ago

iprt/string: change behaviour of Utf16 to Latin1 to reject untranslatable strings instead of doing a best-effort translation

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 30.1 KB
Line 
1/* $Id: utf-16.cpp 21728 2009-07-20 15:11:45Z vboxsync $ */
2/** @file
3 * IPRT - UTF-16.
4 */
5
6/*
7 * Copyright (C) 2006-2007 Sun Microsystems, Inc.
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.215389.xyz. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 *
26 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
27 * Clara, CA 95054 USA or visit http://www.sun.com if you need
28 * additional information or have any questions.
29 */
30
31
32/*******************************************************************************
33* Header Files *
34*******************************************************************************/
35#include <iprt/string.h>
36#include "internal/iprt.h"
37
38#include <iprt/uni.h>
39#include <iprt/alloc.h>
40#include <iprt/assert.h>
41#include <iprt/err.h>
42#include "internal/string.h"
43
44
45
46RTDECL(void) RTUtf16Free(PRTUTF16 pwszString)
47{
48 if (pwszString)
49 RTMemTmpFree(pwszString);
50}
51RT_EXPORT_SYMBOL(RTUtf16Free);
52
53
54RTDECL(PRTUTF16) RTUtf16Dup(PCRTUTF16 pwszString)
55{
56 Assert(pwszString);
57 size_t cb = (RTUtf16Len(pwszString) + 1) * sizeof(RTUTF16);
58 PRTUTF16 pwsz = (PRTUTF16)RTMemAlloc(cb);
59 if (pwsz)
60 memcpy(pwsz, pwszString, cb);
61 return pwsz;
62}
63RT_EXPORT_SYMBOL(RTUtf16Dup);
64
65
66RTDECL(int) RTUtf16DupEx(PRTUTF16 *ppwszString, PCRTUTF16 pwszString, size_t cwcExtra)
67{
68 Assert(pwszString);
69 size_t cb = (RTUtf16Len(pwszString) + 1) * sizeof(RTUTF16);
70 PRTUTF16 pwsz = (PRTUTF16)RTMemAlloc(cb + cwcExtra * sizeof(RTUTF16));
71 if (pwsz)
72 {
73 memcpy(pwsz, pwszString, cb);
74 *ppwszString = pwsz;
75 return VINF_SUCCESS;
76 }
77 return VERR_NO_MEMORY;
78}
79RT_EXPORT_SYMBOL(RTUtf16DupEx);
80
81
82RTDECL(size_t) RTUtf16Len(PCRTUTF16 pwszString)
83{
84 if (!pwszString)
85 return 0;
86
87 PCRTUTF16 pwsz = pwszString;
88 while (*pwsz)
89 pwsz++;
90 return pwsz - pwszString;
91}
92RT_EXPORT_SYMBOL(RTUtf16Len);
93
94
95RTDECL(int) RTUtf16Cmp(register PCRTUTF16 pwsz1, register PCRTUTF16 pwsz2)
96{
97 if (pwsz1 == pwsz2)
98 return 0;
99 if (!pwsz1)
100 return -1;
101 if (!pwsz2)
102 return 1;
103
104 for (;;)
105 {
106 register RTUTF16 wcs = *pwsz1;
107 register int iDiff = wcs - *pwsz2;
108 if (iDiff || !wcs)
109 return iDiff;
110 pwsz1++;
111 pwsz2++;
112 }
113}
114RT_EXPORT_SYMBOL(RTUtf16Cmp);
115
116
117RTDECL(int) RTUtf16ICmp(register PCRTUTF16 pwsz1, register PCRTUTF16 pwsz2)
118{
119 if (pwsz1 == pwsz2)
120 return 0;
121 if (!pwsz1)
122 return -1;
123 if (!pwsz2)
124 return 1;
125
126 PCRTUTF16 pwsz1Start = pwsz1; /* keep it around in case we have to backtrack on a surrogate pair */
127 for (;;)
128 {
129 register RTUTF16 wc1 = *pwsz1;
130 register RTUTF16 wc2 = *pwsz2;
131 register int iDiff = wc1 - wc2;
132 if (iDiff)
133 {
134 /* unless they are *both* surrogate pairs, there is no chance they'll be identical. */
135 if ( wc1 < 0xd800
136 || wc2 < 0xd800
137 || wc1 > 0xdfff
138 || wc2 > 0xdfff)
139 {
140 /* simple UCS-2 char */
141 iDiff = RTUniCpToUpper(wc1) - RTUniCpToUpper(wc2);
142 if (iDiff)
143 iDiff = RTUniCpToLower(wc1) - RTUniCpToLower(wc2);
144 }
145 else
146 {
147 /* a damned pair */
148 RTUNICP uc1;
149 RTUNICP uc2;
150 if (wc1 >= 0xdc00)
151 {
152 if (pwsz1Start == pwsz1)
153 return iDiff;
154 uc1 = pwsz1[-1];
155 if (uc1 < 0xd800 || uc1 >= 0xdc00)
156 return iDiff;
157 uc1 = 0x10000 + (((uc1 & 0x3ff) << 10) | (wc1 & 0x3ff));
158 uc2 = 0x10000 + (((pwsz2[-1] & 0x3ff) << 10) | (wc2 & 0x3ff));
159 }
160 else
161 {
162 uc1 = *++pwsz1;
163 if (uc1 < 0xdc00 || uc1 >= 0xe000)
164 return iDiff;
165 uc1 = 0x10000 + (((wc1 & 0x3ff) << 10) | (uc1 & 0x3ff));
166 uc2 = 0x10000 + (((wc2 & 0x3ff) << 10) | (*++pwsz2 & 0x3ff));
167 }
168 iDiff = RTUniCpToUpper(uc1) - RTUniCpToUpper(uc2);
169 if (iDiff)
170 iDiff = RTUniCpToLower(uc1) - RTUniCpToLower(uc2); /* serious paranoia! */
171 }
172 if (iDiff)
173 return iDiff;
174 }
175 if (!wc1)
176 return 0;
177 pwsz1++;
178 pwsz2++;
179 }
180}
181RT_EXPORT_SYMBOL(RTUtf16ICmp);
182
183
184RTDECL(PRTUTF16) RTUtf16ToLower(PRTUTF16 pwsz)
185{
186 PRTUTF16 pwc = pwsz;
187 for (;;)
188 {
189 RTUTF16 wc = *pwc;
190 if (!wc)
191 break;
192 if (wc < 0xd800 || wc >= 0xdc00)
193 {
194 RTUNICP ucFolded = RTUniCpToLower(wc);
195 if (ucFolded < 0x10000)
196 *pwc++ = RTUniCpToLower(wc);
197 }
198 else
199 {
200 /* surrogate */
201 RTUTF16 wc2 = pwc[1];
202 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
203 {
204 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
205 RTUNICP ucFolded = RTUniCpToLower(uc);
206 if (uc != ucFolded && ucFolded >= 0x10000) /* we don't support shrinking the string */
207 {
208 uc -= 0x10000;
209 *pwc++ = 0xd800 | (uc >> 10);
210 *pwc++ = 0xdc00 | (uc & 0x3ff);
211 }
212 }
213 else /* invalid encoding. */
214 pwc++;
215 }
216 }
217 return pwsz;
218}
219RT_EXPORT_SYMBOL(RTUtf16ToLower);
220
221
222RTDECL(PRTUTF16) RTUtf16ToUpper(PRTUTF16 pwsz)
223{
224 PRTUTF16 pwc = pwsz;
225 for (;;)
226 {
227 RTUTF16 wc = *pwc;
228 if (!wc)
229 break;
230 if (wc < 0xd800 || wc >= 0xdc00)
231 *pwc++ = RTUniCpToUpper(wc);
232 else
233 {
234 /* surrogate */
235 RTUTF16 wc2 = pwc[1];
236 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
237 {
238 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
239 RTUNICP ucFolded = RTUniCpToUpper(uc);
240 if (uc != ucFolded && ucFolded >= 0x10000) /* we don't support shrinking the string */
241 {
242 uc -= 0x10000;
243 *pwc++ = 0xd800 | (uc >> 10);
244 *pwc++ = 0xdc00 | (uc & 0x3ff);
245 }
246 }
247 else /* invalid encoding. */
248 pwc++;
249 }
250 }
251 return pwsz;
252}
253RT_EXPORT_SYMBOL(RTUtf16ToUpper);
254
255
256/**
257 * Validate the UTF-16 encoding and calculates the length of an UTF-8 encoding.
258 *
259 * @returns iprt status code.
260 * @param pwsz The UTF-16 string.
261 * @param cwc The max length of the UTF-16 string to consider.
262 * @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
263 */
264static int rtUtf16CalcUtf8Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
265{
266 int rc = VINF_SUCCESS;
267 size_t cch = 0;
268 while (cwc > 0)
269 {
270 RTUTF16 wc = *pwsz++; cwc--;
271 if (!wc)
272 break;
273 else if (wc < 0xd800 || wc > 0xdfff)
274 {
275 if (wc < 0x80)
276 cch++;
277 else if (wc < 0x800)
278 cch += 2;
279 else if (wc < 0xfffe)
280 cch += 3;
281 else
282 {
283 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
284 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
285 break;
286 }
287 }
288 else
289 {
290 if (wc >= 0xdc00)
291 {
292 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
293 rc = VERR_INVALID_UTF16_ENCODING;
294 break;
295 }
296 if (cwc <= 0)
297 {
298 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
299 rc = VERR_INVALID_UTF16_ENCODING;
300 break;
301 }
302 wc = *pwsz++; cwc--;
303 if (wc < 0xdc00 || wc > 0xdfff)
304 {
305 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
306 rc = VERR_INVALID_UTF16_ENCODING;
307 break;
308 }
309 cch += 4;
310 }
311 }
312
313
314 /* done */
315 *pcch = cch;
316 return rc;
317}
318
319
320/**
321 * Recodes an valid UTF-16 string as UTF-8.
322 *
323 * @returns iprt status code.
324 * @param pwsz The UTF-16 string.
325 * @param cwc The number of RTUTF16 characters to process from pwsz. The recoding
326 * will stop when cwc or '\\0' is reached.
327 * @param psz Where to store the UTF-8 string.
328 * @param cch The size of the UTF-8 buffer, excluding the terminator.
329 * @param pcch Where to store the number of octets actually encoded.
330 */
331static int rtUtf16RecodeAsUtf8(PCRTUTF16 pwsz, size_t cwc, char *psz, size_t cch, size_t *pcch)
332{
333 unsigned char *pwch = (unsigned char *)psz;
334 int rc = VINF_SUCCESS;
335 while (cwc > 0)
336 {
337 RTUTF16 wc = *pwsz++; cwc--;
338 if (!wc)
339 break;
340 else if (wc < 0xd800 || wc > 0xdfff)
341 {
342 if (wc < 0x80)
343 {
344 if (cch < 1)
345 {
346 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
347 rc = VERR_BUFFER_OVERFLOW;
348 break;
349 }
350 cch--;
351 *pwch++ = (unsigned char)wc;
352 }
353 else if (wc < 0x800)
354 {
355 if (cch < 2)
356 {
357 RTStrAssertMsgFailed(("Buffer overflow! 2\n"));
358 rc = VERR_BUFFER_OVERFLOW;
359 break;
360 }
361 cch -= 2;
362 *pwch++ = 0xc0 | (wc >> 6);
363 *pwch++ = 0x80 | (wc & 0x3f);
364 }
365 else if (wc < 0xfffe)
366 {
367 if (cch < 3)
368 {
369 RTStrAssertMsgFailed(("Buffer overflow! 3\n"));
370 rc = VERR_BUFFER_OVERFLOW;
371 break;
372 }
373 cch -= 3;
374 *pwch++ = 0xe0 | (wc >> 12);
375 *pwch++ = 0x80 | ((wc >> 6) & 0x3f);
376 *pwch++ = 0x80 | (wc & 0x3f);
377 }
378 else
379 {
380 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
381 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
382 break;
383 }
384 }
385 else
386 {
387 if (wc >= 0xdc00)
388 {
389 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
390 rc = VERR_INVALID_UTF16_ENCODING;
391 break;
392 }
393 if (cwc <= 0)
394 {
395 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
396 rc = VERR_INVALID_UTF16_ENCODING;
397 break;
398 }
399 RTUTF16 wc2 = *pwsz++; cwc--;
400 if (wc2 < 0xdc00 || wc2 > 0xdfff)
401 {
402 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
403 rc = VERR_INVALID_UTF16_ENCODING;
404 break;
405 }
406 uint32_t CodePoint = 0x10000
407 + ( ((wc & 0x3ff) << 10)
408 | (wc2 & 0x3ff));
409 if (cch < 4)
410 {
411 RTStrAssertMsgFailed(("Buffer overflow! 4\n"));
412 rc = VERR_BUFFER_OVERFLOW;
413 break;
414 }
415 cch -= 4;
416 *pwch++ = 0xf0 | (CodePoint >> 18);
417 *pwch++ = 0x80 | ((CodePoint >> 12) & 0x3f);
418 *pwch++ = 0x80 | ((CodePoint >> 6) & 0x3f);
419 *pwch++ = 0x80 | (CodePoint & 0x3f);
420 }
421 }
422
423 /* done */
424 *pwch = '\0';
425 *pcch = (char *)pwch - psz;
426 return rc;
427}
428
429
430
431RTDECL(int) RTUtf16ToUtf8(PCRTUTF16 pwszString, char **ppszString)
432{
433 /*
434 * Validate input.
435 */
436 Assert(VALID_PTR(ppszString));
437 Assert(VALID_PTR(pwszString));
438 *ppszString = NULL;
439
440 /*
441 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
442 */
443 size_t cch;
444 int rc = rtUtf16CalcUtf8Length(pwszString, RTSTR_MAX, &cch);
445 if (RT_SUCCESS(rc))
446 {
447 /*
448 * Allocate buffer and recode it.
449 */
450 char *pszResult = (char *)RTMemAlloc(cch + 1);
451 if (pszResult)
452 {
453 rc = rtUtf16RecodeAsUtf8(pwszString, RTSTR_MAX, pszResult, cch, &cch);
454 if (RT_SUCCESS(rc))
455 {
456 *ppszString = pszResult;
457 return rc;
458 }
459
460 RTMemFree(pszResult);
461 }
462 else
463 rc = VERR_NO_STR_MEMORY;
464 }
465 return rc;
466}
467RT_EXPORT_SYMBOL(RTUtf16ToUtf8);
468
469
470RTDECL(int) RTUtf16ToUtf8Ex(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch)
471{
472 /*
473 * Validate input.
474 */
475 Assert(VALID_PTR(pwszString));
476 Assert(VALID_PTR(ppsz));
477 Assert(!pcch || VALID_PTR(pcch));
478
479 /*
480 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
481 */
482 size_t cchResult;
483 int rc = rtUtf16CalcUtf8Length(pwszString, cwcString, &cchResult);
484 if (RT_SUCCESS(rc))
485 {
486 if (pcch)
487 *pcch = cchResult;
488
489 /*
490 * Check buffer size / Allocate buffer and recode it.
491 */
492 bool fShouldFree;
493 char *pszResult;
494 if (cch > 0 && *ppsz)
495 {
496 fShouldFree = false;
497 if (cch <= cchResult)
498 return VERR_BUFFER_OVERFLOW;
499 pszResult = *ppsz;
500 }
501 else
502 {
503 *ppsz = NULL;
504 fShouldFree = true;
505 cch = RT_MAX(cch, cchResult + 1);
506 pszResult = (char *)RTMemAlloc(cch);
507 }
508 if (pszResult)
509 {
510 rc = rtUtf16RecodeAsUtf8(pwszString, cwcString, pszResult, cch - 1, &cch);
511 if (RT_SUCCESS(rc))
512 {
513 *ppsz = pszResult;
514 return rc;
515 }
516
517 if (fShouldFree)
518 RTMemFree(pszResult);
519 }
520 else
521 rc = VERR_NO_STR_MEMORY;
522 }
523 return rc;
524}
525RT_EXPORT_SYMBOL(RTUtf16ToUtf8Ex);
526
527
528RTDECL(size_t) RTUtf16CalcUtf8Len(PCRTUTF16 pwsz)
529{
530 size_t cch;
531 int rc = rtUtf16CalcUtf8Length(pwsz, RTSTR_MAX, &cch);
532 return RT_SUCCESS(rc) ? cch : 0;
533}
534RT_EXPORT_SYMBOL(RTUtf16CalcUtf8Len);
535
536
537RTDECL(int) RTUtf16CalcUtf8LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
538{
539 size_t cch;
540 int rc = rtUtf16CalcUtf8Length(pwsz, cwc, &cch);
541 if (pcch)
542 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
543 return rc;
544}
545RT_EXPORT_SYMBOL(RTUtf16CalcUtf8LenEx);
546
547
548RTDECL(RTUNICP) RTUtf16GetCpInternal(PCRTUTF16 pwsz)
549{
550 const RTUTF16 wc = *pwsz;
551
552 /* simple */
553 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
554 return wc;
555 if (wc < 0xfffe)
556 {
557 /* surrogate pair */
558 if (wc < 0xdc00)
559 {
560 const RTUTF16 wc2 = pwsz[1];
561 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
562 {
563 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
564 return uc;
565 }
566
567 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
568 }
569 else
570 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
571 }
572 else
573 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
574 return RTUNICP_INVALID;
575}
576RT_EXPORT_SYMBOL(RTUtf16GetCpInternal);
577
578
579RTDECL(int) RTUtf16GetCpExInternal(PCRTUTF16 *ppwsz, PRTUNICP pCp)
580{
581 const RTUTF16 wc = **ppwsz;
582
583 /* simple */
584 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
585 {
586 (*ppwsz)++;
587 *pCp = wc;
588 return VINF_SUCCESS;
589 }
590
591 int rc;
592 if (wc < 0xfffe)
593 {
594 /* surrogate pair */
595 if (wc < 0xdc00)
596 {
597 const RTUTF16 wc2 = (*ppwsz)[1];
598 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
599 {
600 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
601 *pCp = uc;
602 (*ppwsz) += 2;
603 return VINF_SUCCESS;
604 }
605
606 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
607 }
608 else
609 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
610 rc = VERR_INVALID_UTF16_ENCODING;
611 }
612 else
613 {
614 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
615 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
616 }
617 *pCp = RTUNICP_INVALID;
618 (*ppwsz)++;
619 return rc;
620}
621RT_EXPORT_SYMBOL(RTUtf16GetCpExInternal);
622
623
624RTDECL(PRTUTF16) RTUtf16PutCpInternal(PRTUTF16 pwsz, RTUNICP CodePoint)
625{
626 /* simple */
627 if ( CodePoint < 0xd800
628 || ( CodePoint > 0xdfff
629 && CodePoint < 0xfffe))
630 {
631 *pwsz++ = (RTUTF16)CodePoint;
632 return pwsz;
633 }
634
635 /* surrogate pair */
636 if (CodePoint >= 0x10000 && CodePoint <= 0x0010ffff)
637 {
638 CodePoint -= 0x10000;
639 *pwsz++ = 0xd800 | (CodePoint >> 10);
640 *pwsz++ = 0xdc00 | (CodePoint & 0x3ff);
641 return pwsz;
642 }
643
644 /* invalid code point. */
645 RTStrAssertMsgFailed(("Invalid codepoint %#x\n", CodePoint));
646 *pwsz++ = 0x7f;
647 return pwsz;
648}
649RT_EXPORT_SYMBOL(RTUtf16PutCpInternal);
650
651
652/**
653 * Validate the UTF-16 encoding and calculates the length of a Latin1 encoding.
654 *
655 * @returns iprt status code.
656 * @param pwsz The UTF-16 string.
657 * @param cwc The max length of the UTF-16 string to consider.
658 * @param pcch Where to store the length (excluding '\\0') of the Latin1 string. (cch == cb, btw)
659 */
660static int rtUtf16CalcLatin1Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
661{
662 int rc = VINF_SUCCESS;
663 size_t cch = 0;
664 while (cwc > 0)
665 {
666 RTUTF16 wc = *pwsz++; cwc--;
667 if (!wc)
668 break;
669 else if (wc < 256)
670 ++cch;
671 else if (wc < 0xd800 || wc > 0xdfff)
672 {
673 if (wc < 0xfffe)
674 {
675 rc = VERR_NO_TRANSLATION;
676 break;
677 }
678 else
679 {
680 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
681 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
682 break;
683 }
684 }
685 else
686 {
687 if (wc >= 0xdc00)
688 {
689 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
690 rc = VERR_INVALID_UTF16_ENCODING;
691 break;
692 }
693 if (cwc <= 0)
694 {
695 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
696 rc = VERR_INVALID_UTF16_ENCODING;
697 break;
698 }
699 wc = *pwsz++; cwc--;
700 if (wc < 0xdc00 || wc > 0xdfff)
701 {
702 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
703 rc = VERR_INVALID_UTF16_ENCODING;
704 break;
705 }
706 rc = VERR_NO_TRANSLATION;
707 break;
708 }
709 }
710
711
712 /* done */
713 *pcch = cch;
714 return rc;
715}
716
717
718/**
719 * Recodes an valid UTF-16 string as Latin1.
720 *
721 * @returns iprt status code.
722 * @param pwsz The UTF-16 string.
723 * @param cwc The number of RTUTF16 characters to process from pwsz. The recoding
724 * will stop when cwc or '\\0' is reached.
725 * @param psz Where to store the Latin1 string.
726 * @param cch The size of the Latin1 buffer, excluding the terminator.
727 * @param pcch Where to store the number of octets actually encoded.
728 */
729static int rtUtf16RecodeAsLatin1(PCRTUTF16 pwsz, size_t cwc, char *psz, size_t cch, size_t *pcch)
730{
731 unsigned char *pwch = (unsigned char *)psz;
732 int rc = VINF_SUCCESS;
733 while (cwc > 0)
734 {
735 RTUTF16 wc = *pwsz++; cwc--;
736 if (!wc)
737 break;
738 else if (wc < 0xd800 || wc > 0xdfff)
739 {
740 if (wc < 0x100)
741 {
742 if (cch < 1)
743 {
744 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
745 rc = VERR_BUFFER_OVERFLOW;
746 break;
747 }
748 cch--;
749 *pwch++ = (char)wc;
750 }
751 else if (wc < 0xfffe)
752 {
753 rc = VERR_NO_TRANSLATION;
754 break;
755 }
756 else
757 {
758 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
759 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
760 break;
761 }
762 }
763 else
764 {
765 if (wc >= 0xdc00)
766 {
767 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
768 rc = VERR_INVALID_UTF16_ENCODING;
769 break;
770 }
771 if (cwc <= 0)
772 {
773 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
774 rc = VERR_INVALID_UTF16_ENCODING;
775 break;
776 }
777 RTUTF16 wc2 = *pwsz++; cwc--;
778 if (wc2 < 0xdc00 || wc2 > 0xdfff)
779 {
780 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
781 rc = VERR_INVALID_UTF16_ENCODING;
782 break;
783 }
784 rc = VERR_NO_TRANSLATION;
785 break;
786 }
787 }
788
789 /* done */
790 *pwch = '\0';
791 *pcch = (char *)pwch - psz;
792 return rc;
793}
794
795
796RTDECL(int) RTUtf16ToLatin1(PCRTUTF16 pwszString, char **ppszString)
797{
798 /*
799 * Validate input.
800 */
801 Assert(VALID_PTR(ppszString));
802 Assert(VALID_PTR(pwszString));
803 *ppszString = NULL;
804
805 /*
806 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
807 */
808 size_t cch;
809 int rc = rtUtf16CalcLatin1Length(pwszString, RTSTR_MAX, &cch);
810 if (RT_SUCCESS(rc))
811 {
812 /*
813 * Allocate buffer and recode it.
814 */
815 char *pszResult = (char *)RTMemAlloc(cch + 1);
816 if (pszResult)
817 {
818 rc = rtUtf16RecodeAsLatin1(pwszString, RTSTR_MAX, pszResult, cch, &cch);
819 if (RT_SUCCESS(rc))
820 {
821 *ppszString = pszResult;
822 return rc;
823 }
824
825 RTMemFree(pszResult);
826 }
827 else
828 rc = VERR_NO_STR_MEMORY;
829 }
830 return rc;
831}
832RT_EXPORT_SYMBOL(RTUtf16ToLatin1);
833
834
835RTDECL(int) RTUtf16ToLatin1Ex(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch)
836{
837 /*
838 * Validate input.
839 */
840 Assert(VALID_PTR(pwszString));
841 Assert(VALID_PTR(ppsz));
842 Assert(!pcch || VALID_PTR(pcch));
843
844 /*
845 * Validate the UTF-16 string and calculate the length of the Latin1 encoding of it.
846 */
847 size_t cchResult;
848 int rc = rtUtf16CalcLatin1Length(pwszString, cwcString, &cchResult);
849 if (RT_SUCCESS(rc))
850 {
851 if (pcch)
852 *pcch = cchResult;
853
854 /*
855 * Check buffer size / Allocate buffer and recode it.
856 */
857 bool fShouldFree;
858 char *pszResult;
859 if (cch > 0 && *ppsz)
860 {
861 fShouldFree = false;
862 if (cch <= cchResult)
863 return VERR_BUFFER_OVERFLOW;
864 pszResult = *ppsz;
865 }
866 else
867 {
868 *ppsz = NULL;
869 fShouldFree = true;
870 cch = RT_MAX(cch, cchResult + 1);
871 pszResult = (char *)RTMemAlloc(cch);
872 }
873 if (pszResult)
874 {
875 rc = rtUtf16RecodeAsLatin1(pwszString, cwcString, pszResult, cch - 1, &cch);
876 if (RT_SUCCESS(rc))
877 {
878 *ppsz = pszResult;
879 return rc;
880 }
881
882 if (fShouldFree)
883 RTMemFree(pszResult);
884 }
885 else
886 rc = VERR_NO_STR_MEMORY;
887 }
888 return rc;
889}
890RT_EXPORT_SYMBOL(RTUtf16ToLatin1Ex);
891
892
893RTDECL(size_t) RTUtf16CalcLatin1Len(PCRTUTF16 pwsz)
894{
895 size_t cch;
896 int rc = rtUtf16CalcLatin1Length(pwsz, RTSTR_MAX, &cch);
897 return RT_SUCCESS(rc) ? cch : 0;
898}
899RT_EXPORT_SYMBOL(RTUtf16CalcLatin1Len);
900
901
902RTDECL(int) RTUtf16CalcLatin1LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
903{
904 size_t cch;
905 int rc = rtUtf16CalcLatin1Length(pwsz, cwc, &cch);
906 if (pcch)
907 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
908 return rc;
909}
910RT_EXPORT_SYMBOL(RTUtf16CalcLatin1LenEx);
911
912
913/**
914 * Calculates the UTF-16 length of a Latin1 string. In fact this is just the
915 * original length, but the function saves us nasty comments to that effect
916 * all over the place.
917 *
918 * @returns IPRT status code.
919 * @param psz Pointer to the Latin1 string.
920 * @param cch The max length of the string. (btw cch = cb)
921 * Use RTSTR_MAX if all of the string is to be examined.s
922 * @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters.
923 */
924static int rtLatin1CalcUtf16Length(const char *psz, size_t cch, size_t *pcwc)
925{
926 *pcwc = RTStrNLen(psz, cch);
927 return VINF_SUCCESS;
928}
929
930
931/**
932 * Recodes a Latin1 string as UTF-16. This is just a case of expanding it to
933 * sixteen bits, as Unicode is a superset of Latin1.
934 *
935 * Since we know the input is valid, we do *not* perform length checks.
936 *
937 * @returns iprt status code.
938 * @param psz The Latin1 string to recode.
939 * @param cch The number of chars (the type char, so bytes if you like) to process of the Latin1 string.
940 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
941 * @param pwsz Where to store the UTF-16 string.
942 * @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
943 * @param pcwc Where to store the actual number of RTUTF16 items encoded into the UTF-16. This excludes the terminator.
944 */
945static int rtLatin1RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc, size_t *pcwc)
946{
947 int rc = VINF_SUCCESS;
948 const unsigned char *puch = (const unsigned char *)psz;
949 const PRTUTF16 pwszEnd = pwsz + cwc;
950 PRTUTF16 pwc = pwsz;
951 Assert(pwszEnd >= pwc);
952 while (cch > 0)
953 {
954 /* read the next char and check for terminator. */
955 const unsigned char uch = *puch;
956 if (!uch)
957 break;
958
959 /* check for output overflow */
960 if (pwc >= pwszEnd)
961 {
962 rc = VERR_BUFFER_OVERFLOW;
963 break;
964 }
965
966 /* expand the code point */
967 *pwc++ = uch;
968 puch++;
969 cch--;
970 }
971
972 /* done */
973 *pwc = '\0';
974 *pcwc = pwc - pwsz;
975 return rc;
976}
977
978
979RTDECL(int) RTLatin1ToUtf16(const char *pszString, PRTUTF16 *ppwszString)
980{
981 /*
982 * Validate input.
983 */
984 Assert(VALID_PTR(ppwszString));
985 Assert(VALID_PTR(pszString));
986 *ppwszString = NULL;
987
988 /*
989 * Validate the input and calculate the length of the UTF-16 string.
990 */
991 size_t cwc;
992 int rc = rtLatin1CalcUtf16Length(pszString, RTSTR_MAX, &cwc);
993 if (RT_SUCCESS(rc))
994 {
995 /*
996 * Allocate buffer.
997 */
998 PRTUTF16 pwsz = (PRTUTF16)RTMemAlloc((cwc + 1) * sizeof(RTUTF16));
999 if (pwsz)
1000 {
1001 /*
1002 * Encode the UTF-16 string.
1003 */
1004 rc = rtLatin1RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc, &cwc);
1005 if (RT_SUCCESS(rc))
1006 {
1007 *ppwszString = pwsz;
1008 return rc;
1009 }
1010 RTMemFree(pwsz);
1011 }
1012 else
1013 rc = VERR_NO_UTF16_MEMORY;
1014 }
1015 return rc;
1016}
1017RT_EXPORT_SYMBOL(RTLatin1ToUtf16);
1018
1019
1020RTDECL(int) RTLatin1ToUtf16Ex(const char *pszString, size_t cchString, PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc)
1021{
1022 /*
1023 * Validate input.
1024 */
1025 Assert(VALID_PTR(pszString));
1026 Assert(VALID_PTR(ppwsz));
1027 Assert(!pcwc || VALID_PTR(pcwc));
1028
1029 /*
1030 * Validate the input and calculate the length of the UTF-16 string.
1031 */
1032 size_t cwcResult;
1033 int rc = rtLatin1CalcUtf16Length(pszString, cchString, &cwcResult);
1034 if (RT_SUCCESS(rc))
1035 {
1036 if (pcwc)
1037 *pcwc = cwcResult;
1038
1039 /*
1040 * Check buffer size / Allocate buffer.
1041 */
1042 bool fShouldFree;
1043 PRTUTF16 pwszResult;
1044 if (cwc > 0 && *ppwsz)
1045 {
1046 fShouldFree = false;
1047 if (cwc <= cwcResult)
1048 return VERR_BUFFER_OVERFLOW;
1049 pwszResult = *ppwsz;
1050 }
1051 else
1052 {
1053 *ppwsz = NULL;
1054 fShouldFree = true;
1055 cwc = RT_MAX(cwcResult + 1, cwc);
1056 pwszResult = (PRTUTF16)RTMemAlloc(cwc * sizeof(RTUTF16));
1057 }
1058 if (pwszResult)
1059 {
1060 /*
1061 * Encode the UTF-16 string.
1062 */
1063 rc = rtLatin1RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1, &cwcResult);
1064 if (RT_SUCCESS(rc))
1065 {
1066 *ppwsz = pwszResult;
1067 return rc;
1068 }
1069 if (fShouldFree)
1070 RTMemFree(pwszResult);
1071 }
1072 else
1073 rc = VERR_NO_UTF16_MEMORY;
1074 }
1075 return rc;
1076}
1077RT_EXPORT_SYMBOL(RTLatin1ToUtf16Ex);
1078
1079
1080RTDECL(size_t) RTLatin1CalcUtf16Len(const char *psz)
1081{
1082 size_t cwc;
1083 int rc = rtLatin1CalcUtf16Length(psz, RTSTR_MAX, &cwc);
1084 return RT_SUCCESS(rc) ? cwc : 0;
1085}
1086RT_EXPORT_SYMBOL(RTLatin1CalcUtf16Len);
1087
1088
1089RTDECL(int) RTLatin1CalcUtf16LenEx(const char *psz, size_t cch, size_t *pcwc)
1090{
1091 size_t cwc;
1092 int rc = rtLatin1CalcUtf16Length(psz, cch, &cwc);
1093 if (pcwc)
1094 *pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
1095 return rc;
1096}
1097RT_EXPORT_SYMBOL(RTLatin1CalcUtf16LenEx);
Note: See TracBrowser for help on using the repository browser.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette