#include "util/utf8_iterator.hpp"
#include <stdexcept>
#include "util/log.hpp"
Go to the source code of this file.
Functions | |
bool | has_multibyte_mark (unsigned char c) |
uint32_t | decode_utf8 (const std::string &text, size_t &p) |
std::string | encode_utf8 (uint32_t code) |
bool | has_multibyte_mark (unsigned char c) |
returns true if this byte matches a bitmask of 10xx.xxxx, i.e. it is the 2nd, 3rd or 4th byte of a multibyte utf8 string | |
uint32_t | decode_utf8 (const std::string &text, size_t &p) |
gets unicode character at byte position p of UTF-8 encoded text, then advances p to the next character. |
uint32_t @497::decode_utf8 | ( | const std::string & | text, | |
size_t & | p | |||
) | [static] |
gets unicode character at byte position p of UTF-8 encoded text, then advances p to the next character.
std::runtime_error | if decoding fails (a truncated sequence raises std::range_error, which derives from std::runtime_error). See the Unicode Standard, section 3.10, tables 3-5 and 3-6, for details. |
Definition at line 43 of file utf8_iterator.cpp.
References has_multibyte_mark().
00044 { 00045 uint32_t c1 = (unsigned char) text[p+0]; 00046 00047 if (has_multibyte_mark(c1)) std::runtime_error("Malformed utf-8 sequence"); 00048 00049 if ((c1 & 0200) == 0000) { 00050 // 0xxx.xxxx: 1 byte sequence 00051 p+=1; 00052 return c1; 00053 } 00054 else if ((c1 & 0340) == 0300) { 00055 // 110x.xxxx: 2 byte sequence 00056 if(p+1 >= text.size()) throw std::range_error("Malformed utf-8 sequence"); 00057 uint32_t c2 = (unsigned char) text[p+1]; 00058 if (!has_multibyte_mark(c2)) throw std::runtime_error("Malformed utf-8 sequence"); 00059 p+=2; 00060 return (c1 & 0037) << 6 | (c2 & 0077); 00061 } 00062 else if ((c1 & 0360) == 0340) { 00063 // 1110.xxxx: 3 byte sequence 00064 if(p+2 >= text.size()) throw std::range_error("Malformed utf-8 sequence"); 00065 uint32_t c2 = (unsigned char) text[p+1]; 00066 uint32_t c3 = (unsigned char) text[p+2]; 00067 if (!has_multibyte_mark(c2)) throw std::runtime_error("Malformed utf-8 sequence"); 00068 if (!has_multibyte_mark(c3)) throw std::runtime_error("Malformed utf-8 sequence"); 00069 p+=3; 00070 return (c1 & 0017) << 12 | (c2 & 0077) << 6 | (c3 & 0077); 00071 } 00072 else if ((c1 & 0370) == 0360) { 00073 // 1111.0xxx: 4 byte sequence 00074 if(p+3 >= text.size()) throw std::range_error("Malformed utf-8 sequence"); 00075 uint32_t c2 = (unsigned char) text[p+1]; 00076 uint32_t c3 = (unsigned char) text[p+2]; 00077 uint32_t c4 = (unsigned char) text[p+4]; 00078 if (!has_multibyte_mark(c2)) throw std::runtime_error("Malformed utf-8 sequence"); 00079 if (!has_multibyte_mark(c3)) throw std::runtime_error("Malformed utf-8 sequence"); 00080 if (!has_multibyte_mark(c4)) throw std::runtime_error("Malformed utf-8 sequence"); 00081 p+=4; 00082 return (c1 & 0007) << 18 | (c2 & 0077) << 12 | (c3 & 0077) << 6 | (c4 & 0077); 00083 } 00084 throw std::runtime_error("Malformed utf-8 sequence"); 00085 }
uint32_t @497::decode_utf8 | ( | const std::string & | text, | |
size_t & | p | |||
) | [static] |
Referenced by UTF8Iterator::operator++(), and UTF8Iterator::UTF8Iterator().
std::string @497::encode_utf8 | ( | uint32_t | code | ) | [static] |
bool @497::has_multibyte_mark | ( | unsigned char | c | ) | [static] |
Returns true if this byte matches the bitmask 10xx.xxxx,
i.e. it is a continuation byte (the 2nd, 3rd or 4th byte) of a multibyte UTF-8 sequence.
Definition at line 32 of file utf8_iterator.cpp.
bool @497::has_multibyte_mark | ( | unsigned char | c | ) | [static] |
Referenced by decode_utf8().