1 | // Copyright 2006 Nemanja Trifunovic |
---|
2 | |
---|
3 | /* |
---|
4 | Permission is hereby granted, free of charge, to any person or organization |
---|
5 | obtaining a copy of the software and accompanying documentation covered by |
---|
6 | this license (the "Software") to use, reproduce, display, distribute, |
---|
7 | execute, and transmit the Software, and to prepare derivative works of the |
---|
8 | Software, and to permit third-parties to whom the Software is furnished to |
---|
9 | do so, all subject to the following: |
---|
10 | |
---|
11 | The copyright notices in the Software and this entire statement, including |
---|
12 | the above license grant, this restriction and the following disclaimer, |
---|
13 | must be included in all copies of the Software, in whole or in part, and |
---|
14 | all derivative works of the Software, unless such copies or derivative |
---|
15 | works are solely in the form of machine-executable object code generated by |
---|
16 | a source language processor. |
---|
17 | |
---|
18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
---|
19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
---|
20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT |
---|
21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE |
---|
22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, |
---|
23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
---|
24 | DEALINGS IN THE SOFTWARE. |
---|
25 | */ |
---|
26 | |
---|
27 | |
---|
28 | #ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 |
---|
29 | #define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 |
---|
30 | |
---|
31 | #include "core.h" |
---|
32 | #include <stdexcept> |
---|
33 | |
---|
34 | namespace utf8 |
---|
35 | { |
---|
36 | // Exceptions that may be thrown from the library functions. |
---|
// Exception carrying a 32-bit value that was rejected as a code point.
class invalid_code_point : public std::exception
{
public:
    // Stores the offending value so that handlers can inspect it later.
    invalid_code_point(uint32_t cp)
        : rejected_value(cp)
    {
    }

    // Fixed, human-readable description of the error.
    virtual const char* what() const throw()
    {
        return "Invalid code point";
    }

    // The value that was passed to the constructor.
    uint32_t code_point() const
    {
        return rejected_value;
    }

private:
    uint32_t rejected_value;
};
---|
44 | |
---|
// Exception carrying the octet at which UTF-8 input was found to be invalid.
class invalid_utf8 : public std::exception
{
public:
    // Stores the offending octet so that handlers can inspect it later.
    invalid_utf8 (uint8_t u)
        : offending_octet(u)
    {
    }

    // Fixed, human-readable description of the error.
    virtual const char* what() const throw()
    {
        return "Invalid UTF-8";
    }

    // The octet that was passed to the constructor.
    uint8_t utf8_octet() const
    {
        return offending_octet;
    }

private:
    uint8_t offending_octet;
};
---|
52 | |
---|
// Exception carrying the 16-bit unit at which UTF-16 input was found to be invalid.
class invalid_utf16 : public std::exception
{
public:
    // Stores the offending 16-bit unit so that handlers can inspect it later.
    invalid_utf16 (uint16_t u)
        : offending_word(u)
    {
    }

    // Fixed, human-readable description of the error.
    virtual const char* what() const throw()
    {
        return "Invalid UTF-16";
    }

    // The 16-bit unit that was passed to the constructor.
    uint16_t utf16_word() const
    {
        return offending_word;
    }

private:
    uint16_t offending_word;
};
---|
60 | |
---|
// Exception thrown by the library functions when a range is too short
// for the operation that was requested on it.
class not_enough_room : public std::exception
{
public:
    // Fixed, human-readable description of the error.
    virtual const char* what() const throw()
    {
        return "Not enough space";
    }
};
---|
65 | |
---|
66 | /// The library API - functions intended to be called by the users |
---|
67 | |
---|
68 | template <typename octet_iterator, typename output_iterator> |
---|
69 | output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement) |
---|
70 | { |
---|
71 | while (start != end) { |
---|
72 | octet_iterator sequence_start = start; |
---|
73 | internal::utf_error err_code = internal::validate_next(start, end); |
---|
74 | switch (err_code) { |
---|
75 | case internal::OK : |
---|
76 | for (octet_iterator it = sequence_start; it != start; ++it) |
---|
77 | *out++ = *it; |
---|
78 | break; |
---|
79 | case internal::NOT_ENOUGH_ROOM: |
---|
80 | throw not_enough_room(); |
---|
81 | case internal::INVALID_LEAD: |
---|
82 | append (replacement, out); |
---|
83 | ++start; |
---|
84 | break; |
---|
85 | case internal::INCOMPLETE_SEQUENCE: |
---|
86 | case internal::OVERLONG_SEQUENCE: |
---|
87 | case internal::INVALID_CODE_POINT: |
---|
88 | append (replacement, out); |
---|
89 | ++start; |
---|
90 | // just one replacement mark for the sequence |
---|
91 | while (internal::is_trail(*start) && start != end) |
---|
92 | ++start; |
---|
93 | break; |
---|
94 | } |
---|
95 | } |
---|
96 | return out; |
---|
97 | } |
---|
98 | |
---|
99 | template <typename octet_iterator, typename output_iterator> |
---|
100 | inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) |
---|
101 | { |
---|
102 | static const uint32_t replacement_marker = internal::mask16(0xfffd); |
---|
103 | return replace_invalid(start, end, out, replacement_marker); |
---|
104 | } |
---|
105 | |
---|
106 | template <typename octet_iterator> |
---|
107 | octet_iterator append(uint32_t cp, octet_iterator result) |
---|
108 | { |
---|
109 | if (!internal::is_code_point_valid(cp)) |
---|
110 | throw invalid_code_point(cp); |
---|
111 | |
---|
112 | if (cp < 0x80) // one octet |
---|
113 | *(result++) = static_cast<uint8_t>(cp); |
---|
114 | else if (cp < 0x800) { // two octets |
---|
115 | *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0); |
---|
116 | *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); |
---|
117 | } |
---|
118 | else if (cp < 0x10000) { // three octets |
---|
119 | *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0); |
---|
120 | *(result++) = static_cast<uint8_t>((cp >> 6) & 0x3f | 0x80); |
---|
121 | *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); |
---|
122 | } |
---|
123 | else if (cp <= internal::CODE_POINT_MAX) { // four octets |
---|
124 | *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0); |
---|
125 | *(result++) = static_cast<uint8_t>((cp >> 12)& 0x3f | 0x80); |
---|
126 | *(result++) = static_cast<uint8_t>((cp >> 6) & 0x3f | 0x80); |
---|
127 | *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); |
---|
128 | } |
---|
129 | else |
---|
130 | throw invalid_code_point(cp); |
---|
131 | |
---|
132 | return result; |
---|
133 | } |
---|
134 | |
---|
135 | template <typename octet_iterator> |
---|
136 | uint32_t next(octet_iterator& it, octet_iterator end) |
---|
137 | { |
---|
138 | uint32_t cp = 0; |
---|
139 | internal::utf_error err_code = internal::validate_next(it, end, &cp); |
---|
140 | switch (err_code) { |
---|
141 | case internal::OK : |
---|
142 | break; |
---|
143 | case internal::NOT_ENOUGH_ROOM : |
---|
144 | throw not_enough_room(); |
---|
145 | case internal::INVALID_LEAD : |
---|
146 | case internal::INCOMPLETE_SEQUENCE : |
---|
147 | case internal::OVERLONG_SEQUENCE : |
---|
148 | throw invalid_utf8(*it); |
---|
149 | case internal::INVALID_CODE_POINT : |
---|
150 | throw invalid_code_point(cp); |
---|
151 | } |
---|
152 | return cp; |
---|
153 | } |
---|
154 | |
---|
/// Returns the code point that next() would decode at `it`, without changing
/// the caller's iterator (it is taken by value here).
///
/// Throws whatever next() throws for the same range.
template <typename octet_iterator>
uint32_t peek_next(octet_iterator it, octet_iterator end)
{
    octet_iterator cursor = it;
    return next(cursor, end);
}
---|
160 | |
---|
161 | template <typename octet_iterator> |
---|
162 | uint32_t prior(octet_iterator& it, octet_iterator start) |
---|
163 | { |
---|
164 | octet_iterator end = it; |
---|
165 | while (internal::is_trail(*(--it))) |
---|
166 | if (it < start) |
---|
167 | throw invalid_utf8(*it); // error - no lead byte in the sequence |
---|
168 | octet_iterator temp = it; |
---|
169 | return next(temp, end); |
---|
170 | } |
---|
171 | |
---|
172 | /// Deprecated in versions that include "prior" |
---|
173 | template <typename octet_iterator> |
---|
174 | uint32_t previous(octet_iterator& it, octet_iterator pass_start) |
---|
175 | { |
---|
176 | octet_iterator end = it; |
---|
177 | while (internal::is_trail(*(--it))) |
---|
178 | if (it == pass_start) |
---|
179 | throw invalid_utf8(*it); // error - no lead byte in the sequence |
---|
180 | octet_iterator temp = it; |
---|
181 | return next(temp, end); |
---|
182 | } |
---|
183 | |
---|
/// Moves `it` forward by calling next() once for every step counted up to
/// `n`; nothing happens when `n` is not greater than zero.
///
/// Throws whatever next() throws.
template <typename octet_iterator, typename distance_type>
void advance (octet_iterator& it, distance_type n, octet_iterator end)
{
    distance_type consumed = 0;
    while (consumed < n) {
        next(it, end);
        ++consumed;
    }
}
---|
190 | |
---|
/// Counts how many times next() has to be called to take `first` up to (or
/// past) `last`, i.e. the number of code points decoded from the range.
///
/// The loop condition uses operator<, so the iterator type has to provide it.
/// Throws whatever next() throws.
template <typename octet_iterator>
typename std::iterator_traits<octet_iterator>::difference_type
distance (octet_iterator first, octet_iterator last)
{
    typedef typename std::iterator_traits<octet_iterator>::difference_type count_type;

    count_type decoded = 0;
    while (first < last) {
        next(first, last);
        ++decoded;
    }
    return decoded;
}
---|
200 | |
---|
201 | template <typename u16bit_iterator, typename octet_iterator> |
---|
202 | octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) |
---|
203 | { |
---|
204 | while (start != end) { |
---|
205 | uint32_t cp = internal::mask16(*start++); |
---|
206 | // Take care of surrogate pairs first |
---|
207 | if (internal::is_surrogate(cp)) { |
---|
208 | if (start != end) { |
---|
209 | uint32_t trail_surrogate = internal::mask16(*start++); |
---|
210 | if (trail_surrogate >= internal::TRAIL_SURROGATE_MIN && trail_surrogate <= internal::TRAIL_SURROGATE_MAX) |
---|
211 | cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; |
---|
212 | else |
---|
213 | throw invalid_utf16(static_cast<uint16_t>(trail_surrogate)); |
---|
214 | } |
---|
215 | else |
---|
216 | throw invalid_utf16(static_cast<uint16_t>(*start)); |
---|
217 | |
---|
218 | } |
---|
219 | result = append(cp, result); |
---|
220 | } |
---|
221 | return result; |
---|
222 | } |
---|
223 | |
---|
224 | template <typename u16bit_iterator, typename octet_iterator> |
---|
225 | u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) |
---|
226 | { |
---|
227 | while (start != end) { |
---|
228 | uint32_t cp = next(start, end); |
---|
229 | if (cp > 0xffff) { //make a surrogate pair |
---|
230 | *result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET); |
---|
231 | *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); |
---|
232 | } |
---|
233 | else |
---|
234 | *result++ = static_cast<uint16_t>(cp); |
---|
235 | } |
---|
236 | return result; |
---|
237 | } |
---|
238 | |
---|
/// Passes every value in [start, end) to append(), which writes its UTF-8
/// encoding through `result`.
///
/// Returns the output iterator positioned after the last octet written.
/// Throws whatever append() throws.
template <typename octet_iterator, typename u32bit_iterator>
octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
{
    for (; start != end; ++start)
        result = append(*start, result);

    return result;
}
---|
247 | |
---|
/// Decodes the UTF-8 octets in [start, end) with next() and writes each
/// resulting code point through `result`.
///
/// The loop condition uses operator<, so the octet iterator type has to
/// provide it. Returns the output iterator positioned after the last value
/// written. Throws whatever next() throws.
template <typename octet_iterator, typename u32bit_iterator>
u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
{
    for (; start < end; ++result)
        *result = next(start, end);

    return result;
}
---|
256 | |
---|
257 | // The iterator class |
---|
// Bidirectional iterator adaptor that walks an octet range one code point at
// a time. It remembers the range it was created for; decoding and stepping
// are delegated to next() and prior().
template <typename octet_iterator>
class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> {
    octet_iterator position;   // current octet position
    octet_iterator first;      // start of the range (lower bound for --)
    octet_iterator last;       // end of the range (upper bound for * and ++)

    public:
    // Leaves all three underlying iterators default-initialized.
    iterator () {}

    // Creates an iterator at octet_it inside [range_start, range_end].
    // Throws std::out_of_range when octet_it lies outside that range.
    explicit iterator (const octet_iterator& octet_it,
                       const octet_iterator& range_start,
                       const octet_iterator& range_end) :
        position(octet_it), first(range_start), last(range_end)
    {
        const bool before_range = position < first;
        if (before_range || position > last)
            throw std::out_of_range("Invalid utf-8 iterator position");
    }

    // the default "big three" are OK

    // The underlying octet iterator at the current position.
    octet_iterator base () const { return position; }

    // Decodes the code point at the current position without moving.
    uint32_t operator * () const
    {
        octet_iterator cursor = position;
        return next(cursor, last);
    }

    // Two iterators are equal when they are at the same octet position.
    // Throws std::logic_error when they were created for different ranges.
    bool operator == (const iterator& rhs) const
    {
        if (first != rhs.first || last != rhs.last)
            throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
        return (position == rhs.position);
    }

    bool operator != (const iterator& rhs) const
    {
        return !(*this == rhs);
    }

    // Pre-increment: step over one code point.
    iterator& operator ++ ()
    {
        next(position, last);
        return *this;
    }

    // Post-increment: step over one code point, return the old position.
    iterator operator ++ (int)
    {
        iterator snapshot(*this);
        ++(*this);
        return snapshot;
    }

    // Pre-decrement: step back over one code point.
    iterator& operator -- ()
    {
        prior(position, first);
        return *this;
    }

    // Post-decrement: step back over one code point, return the old position.
    iterator operator -- (int)
    {
        iterator snapshot(*this);
        --(*this);
        return snapshot;
    }
}; // class iterator
---|
313 | |
---|
314 | } // namespace utf8 |
---|
315 | |
---|
316 | #endif //header guard |
---|
317 | |
---|
318 | |
---|