Cogs.Core
utf8_strings.cpp
1#ifdef _WIN32
2#pragma warning(disable: 4244) // conversion from 'unsigned int' to '_Elem', possible loss of data
3#endif
4
5#include "html.h"
6#include "utf8_strings.h"
7
8
9litehtml::utf8_to_wchar::utf8_to_wchar(const char* val)
10{
11 m_utf8 = (const byte*) val;
12 while (true)
13 {
14 ucode_t wch = get_char();
15 if (!wch) break;
16 m_str += wch;
17 }
18}
19
20litehtml::ucode_t litehtml::utf8_to_wchar::get_char()
21{
22 ucode_t b1 = getb();
23
24 if (!b1)
25 {
26 return 0;
27 }
28
29 // Determine whether we are dealing
30 // with a one-, two-, three-, or four-
31 // byte sequence.
32 if ((b1 & 0x80) == 0)
33 {
34 // 1-byte sequence: 000000000xxxxxxx = 0xxxxxxx
35 return b1;
36 }
37 else if ((b1 & 0xe0) == 0xc0)
38 {
39 // 2-byte sequence: 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
40 ucode_t r = (b1 & 0x1f) << 6;
41 r |= get_next_utf8(getb());
42 return r;
43 }
44 else if ((b1 & 0xf0) == 0xe0)
45 {
46 // 3-byte sequence: zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
47 ucode_t r = (b1 & 0x0f) << 12;
48 r |= get_next_utf8(getb()) << 6;
49 r |= get_next_utf8(getb());
50 return r;
51 }
52 else if ((b1 & 0xf8) == 0xf0)
53 {
54 // 4-byte sequence: 11101110wwwwzzzzyy + 110111yyyyxxxxxx
55 // = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
56 // (uuuuu = wwww + 1)
57 int b2 = get_next_utf8(getb());
58 int b3 = get_next_utf8(getb());
59 int b4 = get_next_utf8(getb());
60 return ((b1 & 7) << 18) | ((b2 & 0x3f) << 12) |
61 ((b3 & 0x3f) << 6) | (b4 & 0x3f);
62 }
63
64 //bad start for UTF-8 multi-byte sequence
65 return '?';
66}
67
68litehtml::wchar_to_utf8::wchar_to_utf8(const wchar_t* val)
69{
70 unsigned int code;
71 for (int i = 0; val[i]; i++)
72 {
73 code = val[i];
74 if (code <= 0x7F)
75 {
76 m_str += (char)code;
77 }
78 else if (code <= 0x7FF)
79 {
80 m_str += (code >> 6) + 192;
81 m_str += (code & 63) + 128;
82 }
83 else if (0xd800 <= code && code <= 0xdfff)
84 {
85 //invalid block of utf8
86 }
87 else if (code <= 0xFFFF)
88 {
89 m_str += (code >> 12) + 224;
90 m_str += ((code >> 6) & 63) + 128;
91 m_str += (code & 63) + 128;
92 }
93 else if (code <= 0x10FFFF)
94 {
95 m_str += (code >> 18) + 240;
96 m_str += ((code >> 12) & 63) + 128;
97 m_str += ((code >> 6) & 63) + 128;
98 m_str += (code & 63) + 128;
99 }
100 }
101}