1 | // Copyright 2006 Nemanja Trifunovic |
---|
2 | |
---|
3 | /* |
---|
4 | Permission is hereby granted, free of charge, to any person or organization |
---|
5 | obtaining a copy of the software and accompanying documentation covered by |
---|
6 | this license (the "Software") to use, reproduce, display, distribute, |
---|
7 | execute, and transmit the Software, and to prepare derivative works of the |
---|
8 | Software, and to permit third-parties to whom the Software is furnished to |
---|
9 | do so, all subject to the following: |
---|
10 | |
---|
11 | The copyright notices in the Software and this entire statement, including |
---|
12 | the above license grant, this restriction and the following disclaimer, |
---|
13 | must be included in all copies of the Software, in whole or in part, and |
---|
14 | all derivative works of the Software, unless such copies or derivative |
---|
15 | works are solely in the form of machine-executable object code generated by |
---|
16 | a source language processor. |
---|
17 | |
---|
18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
---|
19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
---|
20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT |
---|
21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE |
---|
22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, |
---|
23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
---|
24 | DEALINGS IN THE SOFTWARE. |
---|
25 | */ |
---|
26 | |
---|
27 | |
---|
28 | #ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 |
---|
29 | #define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 |
---|
30 | |
---|
31 | #include <iterator> |
---|
32 | |
---|
33 | // use Trinity core types |
---|
34 | #include "Platform/Define.h" |
---|
35 | |
---|
36 | namespace utf8 |
---|
37 | { |
---|
38 | // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers |
---|
39 | // You may need to change them to match your system. |
---|
40 | // These typedefs have the same names as ones from cstdint, or boost/cstdint |
---|
41 | |
---|
42 | /* use Trinity alternatives |
---|
43 | typedef unsigned char uint8_t; |
---|
44 | typedef unsigned short uint16_t; |
---|
45 | typedef unsigned int uint32_t; |
---|
46 | */ |
---|
47 | typedef uint8 uint8_t; |
---|
48 | typedef uint16 uint16_t; |
---|
49 | typedef uint32 uint32_t; |
---|
50 | |
---|
51 | // Helper code - not intended to be directly called by the library users. May be changed at any time |
---|
namespace internal
{
    // Unicode constants
    // Leading (high) surrogates: 0xd800 - 0xdbff
    // Trailing (low) surrogates: 0xdc00 - 0xdfff
    const uint16_t LEAD_SURROGATE_MIN  = 0xd800u;
    const uint16_t LEAD_SURROGATE_MAX  = 0xdbffu;
    const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
    const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
    // Derived from the surrogate bounds above. Not referenced by any function in
    // this namespace (presumably consumed by UTF-16 conversion code elsewhere).
    const uint16_t LEAD_OFFSET         = LEAD_SURROGATE_MIN - (0x10000 >> 10);
    const uint32_t SURROGATE_OFFSET    = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN;

    // Maximum valid value for a Unicode code point
    const uint32_t CODE_POINT_MAX      = 0x0010ffffu;

    // Returns the low 8 bits of oc as an unsigned octet, so that a negative
    // (signed char) input still yields a value in 0x80..0xff.
    template<typename octet_type>
    inline uint8_t mask8(octet_type oc)
    {
        return static_cast<uint8_t>(0xff & oc);
    }

    // Returns the low 16 bits of oc as an unsigned 16-bit value.
    template<typename u16_type>
    inline uint16_t mask16(u16_type oc)
    {
        return static_cast<uint16_t>(0xffff & oc);
    }

    // True when oc has the bit pattern 10xxxxxx, i.e. is a UTF-8 trail octet.
    template<typename octet_type>
    inline bool is_trail(octet_type oc)
    {
        return ((mask8(oc) >> 6) == 0x2);
    }

    // True when cp lies in the surrogate range 0xd800..0xdfff (lead or trail).
    template <typename u16>
    inline bool is_surrogate(u16 cp)
    {
        return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
    }

    // True when cp is accepted by this library: not above CODE_POINT_MAX, not a
    // surrogate, and neither 0xfffe nor 0xffff.
    template <typename u32>
    inline bool is_code_point_valid(u32 cp)
    {
        return (cp <= CODE_POINT_MAX && !is_surrogate(cp) && cp != 0xfffe && cp != 0xffff);
    }

    // Returns the number of octets (1..4) announced by the lead octet *lead_it,
    // or 0 when that octet cannot start a sequence (a trail octet or 0xf8..0xff).
    // The iterator is dereferenced, so it must not be the end of the range.
    template <typename octet_iterator>
    inline typename std::iterator_traits<octet_iterator>::difference_type
    sequence_length(octet_iterator lead_it)
    {
        uint8_t lead = mask8(*lead_it);
        if (lead < 0x80)
            return 1;
        else if ((lead >> 5) == 0x6)
            return 2;
        else if ((lead >> 4) == 0xe)
            return 3;
        else if ((lead >> 3) == 0x1e)
            return 4;
        else
            return 0;
    }

    // Result codes of validate_next:
    //   OK                  - one well-formed sequence was consumed
    //   NOT_ENOUGH_ROOM     - the range is empty or ends before the sequence does
    //   INVALID_LEAD        - the first octet cannot start a sequence
    //   INCOMPLETE_SEQUENCE - an expected trail octet is not of the form 10xxxxxx
    //   OVERLONG_SEQUENCE   - the code point was encoded with more octets than needed
    //   INVALID_CODE_POINT  - the decoded value fails is_code_point_valid
    enum utf_error {OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};

    // True when at least `needed` octets are available in [it, end).
    // Takes at most `needed` steps, so the check stays cheap for iterators that
    // are not random access (std::distance would walk the whole remaining range).
    template <typename octet_iterator, typename distance_type>
    inline bool has_room(octet_iterator it, octet_iterator end, distance_type needed)
    {
        for (; needed > 0; --needed, ++it) {
            if (it == end)
                return false;
        }
        return true;
    }

    // Validates and decodes the UTF-8 sequence that starts at `it`.
    //
    //   it         - in/out: start of the sequence. Moved just past the sequence
    //                on OK; left untouched for every other result.
    //   end        - one past the last readable octet.
    //   code_point - optional out-parameter (may be null). Written only when the
    //                function returns OK.
    //
    // Returns one of the utf_error codes. When several problems apply, the checks
    // run in this order: room, lead octet, trail octets, code point validity,
    // overlong encoding.
    //
    // Decoding is done on a copy of `it`, so the function only moves forward and
    // a multi-pass forward iterator is sufficient.
    template <typename octet_iterator>
    utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        // Never dereference the end of the range.
        if (it == end)
            return NOT_ENOUGH_ROOM;

        typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
        const octet_difference_type length = sequence_length(it);

        if (length == 0)
            return INVALID_LEAD;

        // Do we have enough memory?
        if (!has_room(it, end, length))
            return NOT_ENOUGH_ROOM;

        octet_iterator cursor = it;
        uint32_t cp = mask8(*cursor);

        // A single-octet sequence is plain ASCII and needs no further checks.
        if (length > 1) {
            // Keep only the payload bits of the lead octet:
            // 5 bits for 2-octet, 4 bits for 3-octet, 3 bits for 4-octet sequences.
            cp &= 0x7fu >> length;

            // Check trail octets and accumulate six payload bits from each.
            for (octet_difference_type i = 1; i < length; ++i) {
                ++cursor;
                const uint8_t octet = mask8(*cursor);
                if (!is_trail(octet))
                    return INCOMPLETE_SEQUENCE;
                cp = (cp << 6) | (octet & 0x3fu);
            }

            // Is the code point valid?
            if (!is_code_point_valid(cp))
                return INVALID_CODE_POINT;

            // Was it encoded with the minimal number of octets?
            const octet_difference_type shortest =
                (cp < 0x80) ? 1 : (cp < 0x800) ? 2 : (cp < 0x10000) ? 3 : 4;
            if (length != shortest)
                return OVERLONG_SEQUENCE;
        }

        // Success: publish the result and step past the last octet of the sequence.
        if (code_point)
            *code_point = cp;
        it = ++cursor;
        return OK;
    }

    // Convenience overload: validates the next sequence without reporting the
    // decoded code point.
    template <typename octet_iterator>
    inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
        return validate_next(it, end, 0);
    }

} // namespace internal
---|
231 | |
---|
232 | /// The library API - functions intended to be called by the users |
---|
233 | |
---|
// The three octets EF BB BF that is_bom compares its input against
// (the UTF-8 byte order mark).
const uint8_t bom[3] = {
    0xef,
    0xbb,
    0xbf
};
---|
236 | |
---|
237 | template <typename octet_iterator> |
---|
238 | octet_iterator find_invalid(octet_iterator start, octet_iterator end) |
---|
239 | { |
---|
240 | octet_iterator result = start; |
---|
241 | while (result != end) { |
---|
242 | internal::utf_error err_code = internal::validate_next(result, end); |
---|
243 | if (err_code != internal::OK) |
---|
244 | return result; |
---|
245 | } |
---|
246 | return result; |
---|
247 | } |
---|
248 | |
---|
// True when find_invalid reaches `end` without finding a rejected sequence,
// i.e. the whole range [start, end) validates. An empty range is valid.
template <typename octet_iterator>
inline bool is_valid(octet_iterator start, octet_iterator end)
{
    const octet_iterator first_bad = find_invalid(start, end);
    return first_bad == end;
}
---|
254 | |
---|
255 | template <typename octet_iterator> |
---|
256 | inline bool is_bom (octet_iterator it) |
---|
257 | { |
---|
258 | return ( |
---|
259 | (internal::mask8(*it++)) == bom[0] && |
---|
260 | (internal::mask8(*it++)) == bom[1] && |
---|
261 | (internal::mask8(*it)) == bom[2] |
---|
262 | ); |
---|
263 | } |
---|
264 | } // namespace utf8 |
---|
265 | |
---|
266 | #endif // header guard |
---|
267 | |
---|
268 | |
---|