15 #include "textEncoder.h" 16 #include "stringDecoder.h" 17 #include "unicodeLatinMap.h" 18 #include "config_dtoolutil.h" 20 TextEncoder::Encoding TextEncoder::_default_encoding = TextEncoder::E_iso8859;
33 for (si = _wtext.begin(); si != _wtext.end(); ++si) {
36 _flags &= ~F_got_text;
50 for (si = _wtext.begin(); si != _wtext.end(); ++si) {
53 _flags &= ~F_got_text;
79 wstring::const_iterator si;
80 for (si = _wtext.begin(); si != _wtext.end(); ++si) {
81 wchar_t character = (*si);
85 if (map_entry != NULL && map_entry->_ascii_equiv != 0) {
86 result += (wchar_t)map_entry->_ascii_equiv;
87 if (map_entry->_ascii_additional != 0) {
88 result += (wchar_t)map_entry->_ascii_additional;
110 wstring::const_iterator ti;
111 for (ti = _wtext.begin(); ti != _wtext.end(); ++ti) {
112 if (((*ti) & ~0x7f) != 0) {
131 if ((ch & ~0xff) == 0) {
132 return string(1, (
char)ch);
139 if (map_entry != NULL && map_entry->_ascii_equiv != 0) {
141 if (map_entry->_ascii_additional != 0) {
144 string(1, map_entry->_ascii_equiv) +
145 string(1, map_entry->_ascii_additional);
147 return string(1, map_entry->_ascii_equiv);
154 if ((ch & ~0x7f) == 0) {
155 return string(1, (
char)ch);
156 }
else if ((ch & ~0x7ff) == 0) {
158 string(1, (
char)((ch >> 6) | 0xc0)) +
159 string(1, (
char)((ch & 0x3f) | 0x80));
162 string(1, (
char)((ch >> 12) | 0xe0)) +
163 string(1, (
char)(((ch >> 6) & 0x3f) | 0x80)) +
164 string(1, (
char)((ch & 0x3f) | 0x80));
169 string(1, (
char)(ch >> 8)) +
170 string(1, (
char)(ch & 0xff));
186 for (wstring::const_iterator pi = wtext.begin(); pi != wtext.end(); ++pi) {
205 return decode_text_impl(decoder);
211 return decode_text_impl(decoder);
218 return decode_text_impl(decoder);
229 wstring TextEncoder::
235 while (!decoder.
is_eof()) {
252 // Function: TextEncoder::expand_amp_sequence
254 // Description: Given that we have just read an ampersand from the
255 // StringDecoder, and that we have expand_amp in effect
256 // and are therefore expected to expand the sequence
257 // that this ampersand begins into a single unicode
258 // character, do the expansion and return the character.
261 expand_amp_sequence(StringDecoder &decoder) const {
264 int character = decoder.get_next_character();
265 if (!decoder.is_eof() && character == '#') {
266 // An explicit numeric sequence: &#nnn;
268 character = decoder.get_next_character();
269 while (!decoder.is_eof() && character < 128 && isdigit((unsigned int)character)) {
270 result = (result * 10) + (character - '0');
271 character = decoder.get_next_character();
273 if (character != ';') {
283 // Some non-numeric sequence.
284 while (!decoder.is_eof() && character < 128 && isalpha((unsigned int)character)) {
285 sequence += character;
286 character = decoder.get_next_character();
288 if (character != ';') {
293 static const struct {
297 { "amp", '&' }, { "lt", '<' }, { "gt", '>' }, { "quot", '"' },
300 { "iexcl", 161 }, { "cent", 162 }, { "pound", 163 }, { "curren", 164 },
301 { "yen", 165 }, { "brvbar", 166 }, { "brkbar", 166 }, { "sect", 167 },
302 { "uml", 168 }, { "die", 168 }, { "copy", 169 }, { "ordf", 170 },
303 { "laquo", 171 }, { "not", 172 }, { "shy", 173 }, { "reg", 174 },
304 { "macr", 175 }, { "hibar", 175 }, { "deg", 176 }, { "plusmn", 177 },
305 { "sup2", 178 }, { "sup3", 179 }, { "acute", 180 }, { "micro", 181 },
306 { "para", 182 }, { "middot", 183 }, { "cedil", 184 }, { "sup1", 185 },
307 { "ordm", 186 }, { "raquo", 187 }, { "frac14", 188 }, { "frac12", 189 },
308 { "frac34", 190 }, { "iquest", 191 }, { "Agrave", 192 }, { "Aacute", 193 },
309 { "Acirc", 194 }, { "Atilde", 195 }, { "Auml", 196 }, { "Aring", 197 },
310 { "AElig", 198 }, { "Ccedil", 199 }, { "Egrave", 200 }, { "Eacute", 201 },
311 { "Ecirc", 202 }, { "Euml", 203 }, { "Igrave", 204 }, { "Iacute", 205 },
312 { "Icirc", 206 }, { "Iuml", 207 }, { "ETH", 208 }, { "Dstrok", 208 },
313 { "Ntilde", 209 }, { "Ograve", 210 }, { "Oacute", 211 }, { "Ocirc", 212 },
314 { "Otilde", 213 }, { "Ouml", 214 }, { "times", 215 }, { "Oslash", 216 },
315 { "Ugrave", 217 }, { "Uacute", 218 }, { "Ucirc", 219 }, { "Uuml", 220 },
316 { "Yacute", 221 }, { "THORN", 222 }, { "szlig", 223 }, { "agrave", 224 },
317 { "aacute", 225 }, { "acirc", 226 }, { "atilde", 227 }, { "auml", 228 },
318 { "aring", 229 }, { "aelig", 230 }, { "ccedil", 231 }, { "egrave", 232 },
319 { "eacute", 233 }, { "ecirc", 234 }, { "euml", 235 }, { "igrave", 236 },
320 { "iacute", 237 }, { "icirc", 238 }, { "iuml", 239 }, { "eth", 240 },
321 { "ntilde", 241 }, { "ograve", 242 }, { "oacute", 243 }, { "ocirc", 244 },
322 { "otilde", 245 }, { "ouml", 246 }, { "divide", 247 }, { "oslash", 248 },
323 { "ugrave", 249 }, { "uacute", 250 }, { "ucirc", 251 }, { "uuml", 252 },
324 { "yacute", 253 }, { "thorn", 254 }, { "yuml", 255 },
329 for (int i = 0; tokens[i].name != NULL; i++) {
330 if (sequence == tokens[i].name) {
332 return tokens[i].code;
336 // Some unrecognized sequence.
347 operator << (ostream &out, TextEncoder::Encoding encoding) {
349 case TextEncoder::E_iso8859:
350 return out <<
"iso8859";
352 case TextEncoder::E_utf8:
353 return out <<
"utf8";
355 case TextEncoder::E_unicode:
356 return out <<
"unicode";
359 return out <<
"**invalid TextEncoder::Encoding(" << (int)encoding <<
")**";
367 operator >> (istream &in, TextEncoder::Encoding &encoding) {
371 if (word ==
"iso8859") {
372 encoding = TextEncoder::E_iso8859;
373 }
else if (word ==
"utf8" || word ==
"utf-8") {
374 encoding = TextEncoder::E_utf8;
375 }
else if (word ==
"unicode") {
376 encoding = TextEncoder::E_unicode;
379 if (notify_ptr != (ostream *)NULL) {
381 <<
"Invalid TextEncoder::Encoding: " << word <<
"\n";
383 encoding = TextEncoder::E_iso8859;
This decoder extracts utf-8 sequences.
wstring decode_text(const string &text) const
Returns the given encoded string decoded to a wide-character string, via the current encoding system...
void make_lower()
Adjusts the text stored within the encoder to all lowercase letters (preserving accent marks correctl...
static int unicode_tolower(int character)
Returns the lowercase equivalent of the given Unicode character.
const wstring & get_wtext() const
Returns the text associated with the TextEncoder, as a wide-character string.
bool is_wtext() const
Returns true if any of the characters in the string returned by get_wtext() are out of the range of a...
virtual int get_next_character()
Returns the next character in sequence.
static string encode_wchar(wchar_t ch, Encoding encoding)
Encodes a single wide char into a one-, two-, or three-byte string, according to the given encoding s...
string encode_wtext(const wstring &wtext) const
Encodes a wide-text string into a single-char string, according to the current encoding.
This decoder extracts characters two at a time to get a plain wide character sequence.
wstring get_wtext_as_ascii() const
Returns the text associated with the node, converted as nearly as possible to a fully-ASCII represent...
The base class to a family of classes that decode various kinds of encoded byte streams.
static ostream * get_notify_ptr()
Returns the ostream that is used to write error messages to.
bool is_eof()
Returns true if the decoder has returned the last character in the string, false if there are more to...
static const Entry * look_up(wchar_t character)
Returns the Entry associated with the indicated character, if there is one.
void make_upper()
Adjusts the text stored within the encoder to all uppercase letters (preserving accent marks correctl...
static int unicode_toupper(int character)
Returns the uppercase equivalent of the given Unicode character.