00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017 #include "util/utf8_iterator.hpp"
00018
00019 #include <stdexcept>
00020
00021 #include "util/log.hpp"
00022
namespace {

bool has_multibyte_mark(unsigned char c);
uint32_t decode_utf8(const std::string& text, size_t& p);
// NOTE(review): encode_utf8 is only declared here; no definition is visible
// in this part of the file - confirm whether it is still needed.
std::string encode_utf8(uint32_t code);

/**
 * Tells whether @p c is a UTF-8 continuation byte, i.e. whether its two
 * most significant bits are `10` (bit pattern 10xxxxxx).
 *
 * @param c  byte to test
 * @return true for a continuation byte, false for anything else
 */
bool has_multibyte_mark(unsigned char c) {
  return (c & 0xC0) == 0x80;
}

/**
 * Decodes the UTF-8 sequence whose lead byte is at byte offset @p p of
 * @p text and moves @p p past that sequence.
 *
 * When @p p equals text.size() the string's terminating '\0' is decoded as
 * code point 0 and @p p becomes text.size() + 1; UTF8Iterator::done()
 * depends on that final step.
 *
 * Only the structure of the sequence is validated (lead byte pattern,
 * number and pattern of continuation bytes). Overlong encodings, surrogate
 * code points and values above U+10FFFF are not rejected.
 *
 * @param text  string holding UTF-8 encoded data
 * @param p     in: offset of the lead byte; out: offset just behind the
 *              decoded sequence. Left unchanged when an exception is thrown.
 * @return the decoded code point
 * @throws std::range_error    if @p p lies beyond text.size() or the
 *                             sequence is cut off by the end of the string
 * @throws std::runtime_error  if the lead byte is invalid or a continuation
 *                             byte is missing
 */
uint32_t decode_utf8(const std::string& text, size_t& p)
{
  // Indexing beyond size() would be undefined behaviour.
  if (p > text.size()) throw std::range_error("Malformed utf-8 sequence");

  // For p == size() a const std::string yields '\0' here.
  const unsigned char lead = static_cast<unsigned char>(text[p]);

  // A continuation byte can never start a sequence.
  if (has_multibyte_mark(lead)) throw std::runtime_error("Malformed utf-8 sequence");

  size_t length = 0;   // total number of bytes in the sequence
  uint32_t code = 0;   // payload bits collected so far

  if ((lead & 0x80) == 0x00) {        // 0xxxxxxx
    length = 1;
    code = lead;
  } else if ((lead & 0xE0) == 0xC0) { // 110xxxxx
    length = 2;
    code = lead & 0x1F;
  } else if ((lead & 0xF0) == 0xE0) { // 1110xxxx
    length = 3;
    code = lead & 0x0F;
  } else if ((lead & 0xF8) == 0xF0) { // 11110xxx
    length = 4;
    code = lead & 0x07;
  } else {                            // 11111xxx is not a valid lead byte
    throw std::runtime_error("Malformed utf-8 sequence");
  }

  // Multi-byte sequences must fit completely inside the string.
  if (length > 1 && p + length > text.size()) {
    throw std::range_error("Malformed utf-8 sequence");
  }

  // Each continuation byte contributes its low six bits.
  for (size_t i = 1; i < length; ++i) {
    const unsigned char next = static_cast<unsigned char>(text[p + i]);
    if (!has_multibyte_mark(next)) throw std::runtime_error("Malformed utf-8 sequence");
    code = (code << 6) | (next & 0x3Fu);
  }

  p += length;
  return code;
}

}
00088
00089
00090 UTF8Iterator::UTF8Iterator(const std::string& text_) :
00091 text(text_),
00092 pos(0),
00093 chr()
00094 {
00095 try {
00096 chr = decode_utf8(text, pos);
00097 } catch (std::exception) {
00098 log_debug << "Malformed utf-8 sequence beginning with " << *((uint32_t*)(text.c_str() + pos)) << " found " << std::endl;
00099 chr = 0;
00100 }
00101 }
00102
00103 bool
00104 UTF8Iterator::done() const
00105 {
00106 return pos > text.size();
00107 }
00108
00109 UTF8Iterator&
00110 UTF8Iterator::operator++() {
00111 try {
00112 chr = decode_utf8(text, pos);
00113 } catch (std::exception) {
00114 log_debug << "Malformed utf-8 sequence beginning with " << *((uint32_t*)(text.c_str() + pos)) << " found " << std::endl;
00115 chr = 0;
00116 ++pos;
00117 }
00118
00119 return *this;
00120 }
00121
00122 uint32_t
00123 UTF8Iterator::operator*() const {
00124 return chr;
00125 }
00126
00127