OpenVPN 3 Core Library
Loading...
Searching...
No Matches
unicode-impl.hpp
Go to the documentation of this file.
1/* Source: pkg:generic/CVTUTF@02-Nov-2004?download_url=https%3A%2F%2Fweb.archive.org%2Fweb%2F20041122041550%2Fhttp%3A%2F%2Fwww.unicode.org%2FPublic%2FPROGRAMS%2FCVTUTF%2F */
2/*
3 * Copyright 2001-2004 Unicode, Inc.
4 *
5 * Disclaimer
6 *
7 * This source code is provided as is by Unicode, Inc. No claims are
8 * made as to fitness for any particular purpose. No warranties of any
9 * kind are expressed or implied. The recipient agrees to determine
10 * applicability of information provided. If this file has been
11 * purchased on magnetic or optical media from Unicode, Inc., the
12 * sole remedy for any claim will be exchange of defective media
13 * within 90 days of receipt.
14 *
15 * Limitations on Rights to Redistribute This Code
16 *
17 * Unicode, Inc. hereby grants the right to freely use the information
18 * supplied in this file in the creation of products supporting the
19 * Unicode Standard, and to make copies of this file in any form
20 * for internal or external distribution as long as this notice
21 * remains attached.
22 */
23
24#ifndef OPENVPN_COMMON_UNICODE_IMPL_H
25#define OPENVPN_COMMON_UNICODE_IMPL_H
26
28 /* ---------------------------------------------------------------------
29
30 Conversions between UTF32, UTF-16, and UTF-8. Header file.
31
32 Several functions are included here, forming a complete set of
33 conversions between the three formats. UTF-7 is not included
34 here, but is handled in a separate source file.
35
36 Each of these routines takes pointers to input buffers and output
37 buffers. The input buffers are const.
38
39 Each routine converts the text between *sourceStart and sourceEnd,
40 putting the result into the buffer between *targetStart and
41 targetEnd. Note: the end pointers are *after* the last item: e.g.
42 *(sourceEnd - 1) is the last item.
43
44 The return result indicates whether the conversion was successful,
45 and if not, whether the problem was in the source or target buffers.
46 (Only the first encountered problem is indicated.)
47
48 After the conversion, *sourceStart and *targetStart are both
49 updated to point to the end of last text successfully converted in
50 the respective buffers.
51
52 Input parameters:
53 sourceStart - pointer to a pointer to the source buffer.
54 The contents of this are modified on return so that
55 it points at the next thing to be converted.
56 targetStart - similarly, pointer to pointer to the target buffer.
57 sourceEnd, targetEnd - respectively pointers to the ends of the
58 two buffers, for overflow checking only.
59
60 These conversion functions take a ConversionFlags argument. When this
61 flag is set to strict, both irregular sequences and isolated surrogates
62 will cause an error. When the flag is set to lenient, both irregular
63 sequences and isolated surrogates are converted.
64
65 Whether the flag is strict or lenient, all illegal sequences will cause
66 an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
67 or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
68 must check for illegal sequences.
69
70 When the flag is set to lenient, characters over 0x10FFFF are converted
71 to the replacement character; otherwise (when the flag is set to strict)
72 they constitute an error.
73
74 Output parameters:
75 The value "sourceIllegal" is returned from some routines if the input
76 sequence is malformed. When "sourceIllegal" is returned, the source
77 value will point to the illegal value that caused the problem. E.g.,
78 in UTF-8 when a sequence is malformed, it points to the start of the
79 malformed sequence.
80
81 Author: Mark E. Davis, 1994.
82 Rev History: Rick McGowan, fixes & updates May 2001.
83 Fixes & updates, Sept 2001.
84
85 ------------------------------------------------------------------------ */
86
87 /* ---------------------------------------------------------------------
88 The following 3 definitions are compiler-specific.
89 The C standard does not guarantee that wchar_t has at least
90 16 bits, so wchar_t is no less portable than unsigned short!
91 All should be unsigned values to avoid sign extension during
92 bit mask & shift operations.
93 ------------------------------------------------------------------------ */
94
/* Code-unit types for the three encodings. All are unsigned so that
 * masking and shifting never sign-extend. */
typedef unsigned char  UTF8;  /* typically 8 bits */
typedef unsigned short UTF16; /* at least 16 bits */
typedef unsigned int   UTF32; /* at least 32 bits */

/* Fundamental code-point limits used by the converters. */
const UTF32 UNI_REPLACEMENT_CHAR = 0xFFFDu;     /* substituted for unconvertible input */
const UTF32 UNI_MAX_BMP          = 0xFFFFu;     /* largest value that fits one UTF-16 unit */
const UTF32 UNI_MAX_UTF16        = 0x10FFFFu;   /* largest value reachable with a surrogate pair */
const UTF32 UNI_MAX_UTF32        = 0x7FFFFFFFu;
const UTF32 UNI_MAX_LEGAL_UTF32  = 0x10FFFFu;   /* largest legal code point */
105
106 typedef enum {
107 conversionOK, /* conversion successful */
108 sourceExhausted, /* partial character in source, but hit end */
109 targetExhausted, /* insuff. room in target for conversion */
110 sourceIllegal /* source sequence is illegal/malformed */
112
117
118 /* --------------------------------------------------------------------- */
119 /*
120 * Copyright 2001-2004 Unicode, Inc.
121 *
122 * Disclaimer
123 *
124 * This source code is provided as is by Unicode, Inc. No claims are
125 * made as to fitness for any particular purpose. No warranties of any
126 * kind are expressed or implied. The recipient agrees to determine
127 * applicability of information provided. If this file has been
128 * purchased on magnetic or optical media from Unicode, Inc., the
129 * sole remedy for any claim will be exchange of defective media
130 * within 90 days of receipt.
131 *
132 * Limitations on Rights to Redistribute This Code
133 *
134 * Unicode, Inc. hereby grants the right to freely use the information
135 * supplied in this file in the creation of products supporting the
136 * Unicode Standard, and to make copies of this file in any form
137 * for internal or external distribution as long as this notice
138 * remains attached.
139 */
140
141 /* ---------------------------------------------------------------------
142
143 Conversions between UTF32, UTF-16, and UTF-8. Source code file.
144 Author: Mark E. Davis, 1994.
145 Rev History: Rick McGowan, fixes & updates May 2001.
146 Sept 2001: fixed const & error conditions per
147 mods suggested by S. Parent & A. Lillich.
148 June 2002: Tim Dodd added detection and handling of incomplete
149 source sequences, enhanced error detection, added casts
150 to eliminate compiler warnings.
151 July 2003: slight mods to back out aggressive FFFE detection.
152 Jan 2004: updated switches in from-UTF8 conversions.
153 Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
154
155 See the header file "ConvertUTF.h" for complete documentation.
156
157 ------------------------------------------------------------------------ */
158
159 const int halfShift = 10; /* used for shifting by 10 bits */
160
161 const UTF32 halfBase = 0x0010000UL;
162 const UTF32 halfMask = 0x3FFUL;
163
165 const UTF32 UNI_SUR_HIGH_END = (UTF32)0xDBFF;
167 const UTF32 UNI_SUR_LOW_END = (UTF32)0xDFFF;
168
169 /* --------------------------------------------------------------------- */
170
172 const UTF32** sourceStart, const UTF32* sourceEnd,
173 UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
175 const UTF32* source = *sourceStart;
176 UTF16* target = *targetStart;
177 while (source < sourceEnd) {
178 UTF32 ch;
179 if (target >= targetEnd) {
180 result = targetExhausted; break;
181 }
182 ch = *source++;
183 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
184 /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
185 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
186 if (flags == strictConversion) {
187 --source; /* return to the illegal value itself */
188 result = sourceIllegal;
189 break;
190 } else {
191 *target++ = UNI_REPLACEMENT_CHAR;
192 }
193 } else {
194 *target++ = (UTF16)ch; /* normal case */
195 }
196 } else if (ch > UNI_MAX_LEGAL_UTF32) {
197 if (flags == strictConversion) {
198 result = sourceIllegal;
199 } else {
200 *target++ = UNI_REPLACEMENT_CHAR;
201 }
202 } else {
203 /* target is a character in range 0xFFFF - 0x10FFFF. */
204 if (target + 1 >= targetEnd) {
205 --source; /* Back up source pointer! */
206 result = targetExhausted; break;
207 }
208 ch -= halfBase;
209 *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
210 *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
211 }
212 }
213 *sourceStart = source;
214 *targetStart = target;
215 return result;
216 }
217
218 /* --------------------------------------------------------------------- */
219
221 const UTF16** sourceStart, const UTF16* sourceEnd,
222 UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
224 const UTF16* source = *sourceStart;
225 UTF32* target = *targetStart;
226 UTF32 ch, ch2;
227 while (source < sourceEnd) {
228 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
229 ch = *source++;
230 /* If we have a surrogate pair, convert to UTF32 first. */
231 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
232 /* If the 16 bits following the high surrogate are in the source buffer... */
233 if (source < sourceEnd) {
234 ch2 = *source;
235 /* If it's a low surrogate, convert to UTF32. */
236 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
237 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
238 + (ch2 - UNI_SUR_LOW_START) + halfBase;
239 ++source;
240 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
241 --source; /* return to the illegal value itself */
242 result = sourceIllegal;
243 break;
244 }
245 } else { /* We don't have the 16 bits following the high surrogate. */
246 --source; /* return to the high surrogate */
247 result = sourceExhausted;
248 break;
249 }
250 } else if (flags == strictConversion) {
251 /* UTF-16 surrogate values are illegal in UTF-32 */
252 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
253 --source; /* return to the illegal value itself */
254 result = sourceIllegal;
255 break;
256 }
257 }
258 if (target >= targetEnd) {
259 source = oldSource; /* Back up source pointer! */
260 result = targetExhausted; break;
261 }
262 *target++ = ch;
263 }
264 *sourceStart = source;
265 *targetStart = target;
266#ifdef CVTUTF_DEBUG
267 if (result == sourceIllegal) {
268 fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
269 fflush(stderr);
270 }
271#endif
272 return result;
273 }
274
275 /* --------------------------------------------------------------------- */
276
277 /*
278 * Index into the table below with the first byte of a UTF-8 sequence to
279 * get the number of trailing bytes that are supposed to follow it.
280 * Note that *legal* UTF-8 sequences can't have 4 or 5 trailing bytes. The table is
281 * left as-is for anyone who may want to do such conversion, which was
282 * allowed in earlier algorithms.
283 */
/* Number of bytes expected to follow a given first byte, indexed by that
 * byte's value. Laid out 16 entries per row; the label is the index range. */
const char trailingBytesForUTF8[256] = {
    /* 0x00-0x0F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x10-0x1F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x20-0x2F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x30-0x3F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x40-0x4F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x50-0x5F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x60-0x6F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x70-0x7F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80-0x8F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x90-0x9F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xA0-0xAF */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xB0-0xBF */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0-0xCF */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xD0-0xDF */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0-0xEF */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xF0-0xFF */ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
294
295 /*
296 * Magic values subtracted from a buffer value during UTF8 conversion.
297 * This table contains as many values as there might be trailing bytes
298 * in a UTF-8 sequence.
299 */
300 const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
301 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
302
303 /*
304 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
305 * into the first byte, depending on how many bytes follow. There are
306 * as many entries in this table as there are UTF-8 sequence types.
307 * (I.e., one byte sequence, two byte... etc.). Remember that sequences
308 * for *legal* UTF-8 will be 4 or fewer bytes total.
309 */
310 const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
311
312 /* --------------------------------------------------------------------- */
313
314 /* The interface converts a whole buffer to avoid function-call overhead.
315 * Constants have been gathered. Loops & conditionals have been removed as
316 * much as possible for efficiency, in favor of drop-through switches.
317 * (See "Note A" at the bottom of the file for equivalent code.)
318 * If your compiler supports it, the "isLegalUTF8" call can be turned
319 * into an inline function.
320 */
321
322 /* --------------------------------------------------------------------- */
323
325 const UTF16** sourceStart, const UTF16* sourceEnd,
326 UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
328 const UTF16* source = *sourceStart;
329 UTF8* target = *targetStart;
330 while (source < sourceEnd) {
331 UTF32 ch;
332 unsigned short bytesToWrite = 0;
333 const UTF32 byteMask = 0xBF;
334 const UTF32 byteMark = 0x80;
335 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
336 ch = *source++;
337 /* If we have a surrogate pair, convert to UTF32 first. */
338 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
339 /* If the 16 bits following the high surrogate are in the source buffer... */
340 if (source < sourceEnd) {
341 UTF32 ch2 = *source;
342 /* If it's a low surrogate, convert to UTF32. */
343 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
344 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
345 + (ch2 - UNI_SUR_LOW_START) + halfBase;
346 ++source;
347 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
348 --source; /* return to the illegal value itself */
349 result = sourceIllegal;
350 break;
351 }
352 } else { /* We don't have the 16 bits following the high surrogate. */
353 --source; /* return to the high surrogate */
354 result = sourceExhausted;
355 break;
356 }
357 } else if (flags == strictConversion) {
358 /* UTF-16 surrogate values are illegal in UTF-32 */
359 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
360 --source; /* return to the illegal value itself */
361 result = sourceIllegal;
362 break;
363 }
364 }
365 /* Figure out how many bytes the result will require */
366 if (ch < (UTF32)0x80) { bytesToWrite = 1;
367 } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
368 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
369 } else if (ch < (UTF32)0x110000) { bytesToWrite = 4;
370 } else { bytesToWrite = 3;
372 }
373
374 target += bytesToWrite;
375 if (target > targetEnd) {
376 source = oldSource; /* Back up source pointer! */
377 target -= bytesToWrite; result = targetExhausted; break;
378 }
379 switch (bytesToWrite) { /* note: everything falls through. */
380 case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
381 case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
382 case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
383 case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
384 }
385 target += bytesToWrite;
386 }
387 *sourceStart = source;
388 *targetStart = target;
389 return result;
390 }
391
392 /* --------------------------------------------------------------------- */
393
394 /*
395 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
396 * This must be called with the length pre-determined by the first byte.
397 * If not calling this from ConvertUTF8to*, then the length can be set by:
398 * length = trailingBytesForUTF8[*source]+1;
399 * and the sequence is illegal right away if there aren't that many bytes
400 * available.
401 * If presented with a length > 4, this returns false. The Unicode
402 * definition of UTF-8 goes up to 4-byte sequences.
403 */
404
405 inline bool isLegalUTF8(const UTF8 *source, int length) {
406 UTF8 a;
407 const UTF8 *srcptr = source+length;
408 switch (length) {
409 default: return false;
410 /* Everything else falls through when "true"... */
411 case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
412 case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
413 case 2: if ((a = (*--srcptr)) > 0xBF) return false;
414
415 switch (*source) {
416 /* no fall-through in this inner switch */
417 case 0xE0: if (a < 0xA0) return false; break;
418 case 0xED: if (a > 0x9F) return false; break;
419 case 0xF0: if (a < 0x90) return false; break;
420 case 0xF4: if (a > 0x8F) return false; break;
421 default: if (a < 0x80) return false;
422 }
423
424 case 1: if (*source >= 0x80 && *source < 0xC2) return false;
425 }
426 if (*source > 0xF4) return false;
427 return true;
428 }
429
430 /* --------------------------------------------------------------------- */
431
432 /*
433 * Exported function to return whether a UTF-8 sequence is legal or not.
434 * This is not used here; it's just exported.
435 */
436 inline bool isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
437 int length = trailingBytesForUTF8[*source]+1;
438 if (source+length > sourceEnd) {
439 return false;
440 }
441 return isLegalUTF8(source, length);
442 }
443
444 /* --------------------------------------------------------------------- */
445
447 const UTF8** sourceStart, const UTF8* sourceEnd,
448 UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
450 const UTF8* source = *sourceStart;
451 UTF16* target = *targetStart;
452 while (source < sourceEnd) {
453 UTF32 ch = 0;
454 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
455 if (source + extraBytesToRead >= sourceEnd) {
456 result = sourceExhausted; break;
457 }
458 /* Do this check whether lenient or strict */
459 if (! isLegalUTF8(source, extraBytesToRead+1)) {
460 result = sourceIllegal;
461 break;
462 }
463 /*
464 * The cases all fall through. See "Note A" below.
465 */
466 switch (extraBytesToRead) {
467 case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
468 case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
469 case 3: ch += *source++; ch <<= 6;
470 case 2: ch += *source++; ch <<= 6;
471 case 1: ch += *source++; ch <<= 6;
472 case 0: ch += *source++;
473 }
474 ch -= offsetsFromUTF8[extraBytesToRead];
475
476 if (target >= targetEnd) {
477 source -= (extraBytesToRead+1); /* Back up source pointer! */
478 result = targetExhausted; break;
479 }
480 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
481 /* UTF-16 surrogate values are illegal in UTF-32 */
482 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
483 if (flags == strictConversion) {
484 source -= (extraBytesToRead+1); /* return to the illegal value itself */
485 result = sourceIllegal;
486 break;
487 } else {
488 *target++ = UNI_REPLACEMENT_CHAR;
489 }
490 } else {
491 *target++ = (UTF16)ch; /* normal case */
492 }
493 } else if (ch > UNI_MAX_UTF16) {
494 if (flags == strictConversion) {
495 result = sourceIllegal;
496 source -= (extraBytesToRead+1); /* return to the start */
497 break; /* Bail out; shouldn't continue */
498 } else {
499 *target++ = UNI_REPLACEMENT_CHAR;
500 }
501 } else {
502 /* target is a character in range 0xFFFF - 0x10FFFF. */
503 if (target + 1 >= targetEnd) {
504 source -= (extraBytesToRead+1); /* Back up source pointer! */
505 result = targetExhausted; break;
506 }
507 ch -= halfBase;
508 *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
509 *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
510 }
511 }
512 *sourceStart = source;
513 *targetStart = target;
514 return result;
515 }
516
517 /* --------------------------------------------------------------------- */
518
520 const UTF32** sourceStart, const UTF32* sourceEnd,
521 UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
523 const UTF32* source = *sourceStart;
524 UTF8* target = *targetStart;
525 while (source < sourceEnd) {
526 UTF32 ch;
527 unsigned short bytesToWrite = 0;
528 const UTF32 byteMask = 0xBF;
529 const UTF32 byteMark = 0x80;
530 ch = *source++;
531 if (flags == strictConversion ) {
532 /* UTF-16 surrogate values are illegal in UTF-32 */
533 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
534 --source; /* return to the illegal value itself */
535 result = sourceIllegal;
536 break;
537 }
538 }
539 /*
540 * Figure out how many bytes the result will require. Turn any
541 * illegally large UTF32 things (> Plane 17) into replacement chars.
542 */
543 if (ch < (UTF32)0x80) { bytesToWrite = 1;
544 } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
545 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
546 } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4;
547 } else { bytesToWrite = 3;
549 result = sourceIllegal;
550 }
551
552 target += bytesToWrite;
553 if (target > targetEnd) {
554 --source; /* Back up source pointer! */
555 target -= bytesToWrite; result = targetExhausted; break;
556 }
557 switch (bytesToWrite) { /* note: everything falls through. */
558 case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
559 case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
560 case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
561 case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
562 }
563 target += bytesToWrite;
564 }
565 *sourceStart = source;
566 *targetStart = target;
567 return result;
568 }
569
570 /* --------------------------------------------------------------------- */
571
573 const UTF8** sourceStart, const UTF8* sourceEnd,
574 UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
576 const UTF8* source = *sourceStart;
577 UTF32* target = *targetStart;
578 while (source < sourceEnd) {
579 UTF32 ch = 0;
580 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
581 if (source + extraBytesToRead >= sourceEnd) {
582 result = sourceExhausted; break;
583 }
584 /* Do this check whether lenient or strict */
585 if (! isLegalUTF8(source, extraBytesToRead+1)) {
586 result = sourceIllegal;
587 break;
588 }
589 /*
590 * The cases all fall through. See "Note A" below.
591 */
592 switch (extraBytesToRead) {
593 case 5: ch += *source++; ch <<= 6;
594 case 4: ch += *source++; ch <<= 6;
595 case 3: ch += *source++; ch <<= 6;
596 case 2: ch += *source++; ch <<= 6;
597 case 1: ch += *source++; ch <<= 6;
598 case 0: ch += *source++;
599 }
600 ch -= offsetsFromUTF8[extraBytesToRead];
601
602 if (target >= targetEnd) {
603 source -= (extraBytesToRead+1); /* Back up the source pointer! */
604 result = targetExhausted; break;
605 }
606 if (ch <= UNI_MAX_LEGAL_UTF32) {
607 /*
608 * UTF-16 surrogate values are illegal in UTF-32, and anything
609 * over Plane 17 (> 0x10FFFF) is illegal.
610 */
611 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
612 if (flags == strictConversion) {
613 source -= (extraBytesToRead+1); /* return to the illegal value itself */
614 result = sourceIllegal;
615 break;
616 } else {
617 *target++ = UNI_REPLACEMENT_CHAR;
618 }
619 } else {
620 *target++ = ch;
621 }
622 } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
623 result = sourceIllegal;
624 *target++ = UNI_REPLACEMENT_CHAR;
625 }
626 }
627 *sourceStart = source;
628 *targetStart = target;
629 return result;
630 }
631
632 /* ---------------------------------------------------------------------
633
634 Note A.
635 The fall-through switches in UTF-8 reading code save a
636 temp variable, some decrements & conditionals. The switches
637 are equivalent to the following loop:
638 {
639 int tmpBytesToRead = extraBytesToRead+1;
640 do {
641 ch += *source++;
642 --tmpBytesToRead;
643 if (tmpBytesToRead) ch <<= 6;
644 } while (tmpBytesToRead > 0);
645 }
646 In UTF-8 writing code, the switches on "bytesToWrite" are
647 similarly unrolled loops.
648
649 --------------------------------------------------------------------- */
650}
651
652#endif
const UTF32 UNI_SUR_LOW_START
const UTF32 UNI_SUR_HIGH_START
const UTF32 halfBase
const char trailingBytesForUTF8[256]
ConversionResult ConvertUTF32toUTF8(const UTF32 **sourceStart, const UTF32 *sourceEnd, UTF8 **targetStart, UTF8 *targetEnd, ConversionFlags flags)
const UTF32 UNI_SUR_LOW_END
const UTF8 firstByteMark[7]
const UTF32 UNI_SUR_HIGH_END
const UTF32 halfMask
ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags)
ConversionResult ConvertUTF16toUTF8(const UTF16 **sourceStart, const UTF16 *sourceEnd, UTF8 **targetStart, UTF8 *targetEnd, ConversionFlags flags)
unsigned int UTF32
const UTF32 UNI_MAX_UTF16
const UTF32 UNI_MAX_UTF32
const UTF32 UNI_REPLACEMENT_CHAR
ConversionResult ConvertUTF8toUTF16(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF16 **targetStart, UTF16 *targetEnd, ConversionFlags flags)
bool isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd)
const UTF32 UNI_MAX_BMP
ConversionResult ConvertUTF16toUTF32(const UTF16 **sourceStart, const UTF16 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags)
const UTF32 offsetsFromUTF8[6]
unsigned char UTF8
ConversionResult ConvertUTF32toUTF16(const UTF32 **sourceStart, const UTF32 *sourceEnd, UTF16 **targetStart, UTF16 *targetEnd, ConversionFlags flags)
unsigned short UTF16
const UTF32 UNI_MAX_LEGAL_UTF32
bool isLegalUTF8(const UTF8 *source, int length)
reroute_gw flags