00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015 #include "textEncoder.h"
00016 #include "stringDecoder.h"
00017 #include "unicodeLatinMap.h"
00018 #include "config_dtoolutil.h"
00019
00020 TextEncoder::Encoding TextEncoder::_default_encoding = TextEncoder::E_iso8859;
00021
00022
00023
00024
00025
00026
00027
00028
00029 void TextEncoder::
00030 make_upper() {
00031 get_wtext();
00032 wstring::iterator si;
00033 for (si = _wtext.begin(); si != _wtext.end(); ++si) {
00034 (*si) = unicode_toupper(*si);
00035 }
00036 _flags &= ~F_got_text;
00037 }
00038
00039
00040
00041
00042
00043
00044
00045
00046 void TextEncoder::
00047 make_lower() {
00048 get_wtext();
00049 wstring::iterator si;
00050 for (si = _wtext.begin(); si != _wtext.end(); ++si) {
00051 (*si) = unicode_tolower(*si);
00052 }
00053 _flags &= ~F_got_text;
00054 }
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070
00071
00072
00073
00074
00075 wstring TextEncoder::
00076 get_wtext_as_ascii() const {
00077 get_wtext();
00078 wstring result;
00079 wstring::const_iterator si;
00080 for (si = _wtext.begin(); si != _wtext.end(); ++si) {
00081 wchar_t character = (*si);
00082
00083 const UnicodeLatinMap::Entry *map_entry =
00084 UnicodeLatinMap::look_up(character);
00085 if (map_entry != NULL && map_entry->_ascii_equiv != 0) {
00086 result += (wchar_t)map_entry->_ascii_equiv;
00087 if (map_entry->_ascii_additional != 0) {
00088 result += (wchar_t)map_entry->_ascii_additional;
00089 }
00090
00091 } else {
00092 result += character;
00093 }
00094 }
00095
00096 return result;
00097 }
00098
00099
00100
00101
00102
00103
00104
00105
00106
00107 bool TextEncoder::
00108 is_wtext() const {
00109 get_wtext();
00110 wstring::const_iterator ti;
00111 for (ti = _wtext.begin(); ti != _wtext.end(); ++ti) {
00112 if (((*ti) & ~0x7f) != 0) {
00113 return true;
00114 }
00115 }
00116
00117 return false;
00118 }
00119
00120
00121
00122
00123
00124
00125
00126
00127 string TextEncoder::
00128 encode_wchar(wchar_t ch, TextEncoder::Encoding encoding) {
00129 switch (encoding) {
00130 case E_iso8859:
00131 if ((ch & ~0xff) == 0) {
00132 return string(1, (char)ch);
00133 } else {
00134
00135
00136
00137 const UnicodeLatinMap::Entry *map_entry =
00138 UnicodeLatinMap::look_up(ch);
00139 if (map_entry != NULL && map_entry->_ascii_equiv != 0) {
00140
00141 if (map_entry->_ascii_additional != 0) {
00142
00143 return
00144 string(1, map_entry->_ascii_equiv) +
00145 string(1, map_entry->_ascii_additional);
00146 }
00147 return string(1, map_entry->_ascii_equiv);
00148 }
00149
00150 return ".";
00151 }
00152
00153 case E_utf8:
00154 if ((ch & ~0x7f) == 0) {
00155 return string(1, (char)ch);
00156 } else if ((ch & ~0x7ff) == 0) {
00157 return
00158 string(1, (char)((ch >> 6) | 0xc0)) +
00159 string(1, (char)((ch & 0x3f) | 0x80));
00160 } else {
00161 return
00162 string(1, (char)((ch >> 12) | 0xe0)) +
00163 string(1, (char)(((ch >> 6) & 0x3f) | 0x80)) +
00164 string(1, (char)((ch & 0x3f) | 0x80));
00165 }
00166
00167 case E_unicode:
00168 return
00169 string(1, (char)(ch >> 8)) +
00170 string(1, (char)(ch & 0xff));
00171 }
00172
00173 return "";
00174 }
00175
00176
00177
00178
00179
00180
00181
00182 string TextEncoder::
00183 encode_wtext(const wstring &wtext, TextEncoder::Encoding encoding) {
00184 string result;
00185
00186 for (wstring::const_iterator pi = wtext.begin(); pi != wtext.end(); ++pi) {
00187 result += encode_wchar(*pi, encoding);
00188 }
00189
00190 return result;
00191 }
00192
00193
00194
00195
00196
00197
00198
00199 wstring TextEncoder::
00200 decode_text(const string &text, TextEncoder::Encoding encoding) {
00201 switch (encoding) {
00202 case E_utf8:
00203 {
00204 StringUtf8Decoder decoder(text);
00205 return decode_text_impl(decoder);
00206 }
00207
00208 case E_unicode:
00209 {
00210 StringUnicodeDecoder decoder(text);
00211 return decode_text_impl(decoder);
00212 }
00213
00214 case E_iso8859:
00215 default:
00216 {
00217 StringDecoder decoder(text);
00218 return decode_text_impl(decoder);
00219 }
00220 };
00221 }
00222
00223
00224
00225
00226
00227
00228
00229 wstring TextEncoder::
00230 decode_text_impl(StringDecoder &decoder) {
00231 wstring result;
00232
00233
00234 wchar_t character = decoder.get_next_character();
00235 while (!decoder.is_eof()) {
00236
00237
00238
00239
00240
00241
00242
00243 result += character;
00244 character = decoder.get_next_character();
00245 }
00246
00247 return result;
00248 }
00249
00250
00251
00252 // Function: TextEncoder::expand_amp_sequence
00253 // Access: Private
00254 // Description: Given that we have just read an ampersand from the
00255 // StringDecoder, and that we have expand_amp in effect
00256 // and are therefore expected to expand the sequence
00257 // that this ampersand begins into a single unicode
00258 // character, do the expansion and return the character.
00259
00260 int TextEncoder::
00261 expand_amp_sequence(StringDecoder &decoder) const {
00262 int result = 0;
00263
00264 int character = decoder.get_next_character();
00265 if (!decoder.is_eof() && character == '#') {
00266 // An explicit numeric sequence: &#nnn;
00267 result = 0;
00268 character = decoder.get_next_character();
00269 while (!decoder.is_eof() && character < 128 && isdigit((unsigned int)character)) {
00270 result = (result * 10) + (character - '0');
00271 character = decoder.get_next_character();
00272 }
00273 if (character != ';') {
00274 // Invalid sequence.
00275 return 0;
00276 }
00277
00278 return result;
00279 }
00280
00281 string sequence;
00282
00283 // Some non-numeric sequence.
00284 while (!decoder.is_eof() && character < 128 && isalpha((unsigned int)character)) {
00285 sequence += character;
00286 character = decoder.get_next_character();
00287 }
00288 if (character != ';') {
00289 // Invalid sequence.
00290 return 0;
00291 }
00292
00293 static const struct {
00294 const char *name;
00295 int code;
00296 } tokens[] = {
00297 { "amp", '&' }, { "lt", '<' }, { "gt", '>' }, { "quot", '"' },
00298 { "nbsp", ' ' },
00299
00300 { "iexcl", 161 }, { "cent", 162 }, { "pound", 163 }, { "curren", 164 },
00301 { "yen", 165 }, { "brvbar", 166 }, { "brkbar", 166 }, { "sect", 167 },
00302 { "uml", 168 }, { "die", 168 }, { "copy", 169 }, { "ordf", 170 },
00303 { "laquo", 171 }, { "not", 172 }, { "shy", 173 }, { "reg", 174 },
00304 { "macr", 175 }, { "hibar", 175 }, { "deg", 176 }, { "plusmn", 177 },
00305 { "sup2", 178 }, { "sup3", 179 }, { "acute", 180 }, { "micro", 181 },
00306 { "para", 182 }, { "middot", 183 }, { "cedil", 184 }, { "sup1", 185 },
00307 { "ordm", 186 }, { "raquo", 187 }, { "frac14", 188 }, { "frac12", 189 },
00308 { "frac34", 190 }, { "iquest", 191 }, { "Agrave", 192 }, { "Aacute", 193 },
00309 { "Acirc", 194 }, { "Atilde", 195 }, { "Auml", 196 }, { "Aring", 197 },
00310 { "AElig", 198 }, { "Ccedil", 199 }, { "Egrave", 200 }, { "Eacute", 201 },
00311 { "Ecirc", 202 }, { "Euml", 203 }, { "Igrave", 204 }, { "Iacute", 205 },
00312 { "Icirc", 206 }, { "Iuml", 207 }, { "ETH", 208 }, { "Dstrok", 208 },
00313 { "Ntilde", 209 }, { "Ograve", 210 }, { "Oacute", 211 }, { "Ocirc", 212 },
00314 { "Otilde", 213 }, { "Ouml", 214 }, { "times", 215 }, { "Oslash", 216 },
00315 { "Ugrave", 217 }, { "Uacute", 218 }, { "Ucirc", 219 }, { "Uuml", 220 },
00316 { "Yacute", 221 }, { "THORN", 222 }, { "szlig", 223 }, { "agrave", 224 },
00317 { "aacute", 225 }, { "acirc", 226 }, { "atilde", 227 }, { "auml", 228 },
00318 { "aring", 229 }, { "aelig", 230 }, { "ccedil", 231 }, { "egrave", 232 },
00319 { "eacute", 233 }, { "ecirc", 234 }, { "euml", 235 }, { "igrave", 236 },
00320 { "iacute", 237 }, { "icirc", 238 }, { "iuml", 239 }, { "eth", 240 },
00321 { "ntilde", 241 }, { "ograve", 242 }, { "oacute", 243 }, { "ocirc", 244 },
00322 { "otilde", 245 }, { "ouml", 246 }, { "divide", 247 }, { "oslash", 248 },
00323 { "ugrave", 249 }, { "uacute", 250 }, { "ucirc", 251 }, { "uuml", 252 },
00324 { "yacute", 253 }, { "thorn", 254 }, { "yuml", 255 },
00325
00326 { NULL, 0 },
00327 };
00328
00329 for (int i = 0; tokens[i].name != NULL; i++) {
00330 if (sequence == tokens[i].name) {
00331 // Here's a match.
00332 return tokens[i].code;
00333 }
00334 }
00335
00336 // Some unrecognized sequence.
00337 return 0;
00338 }
00339 */
00340
00341
00342
00343
00344
00345
00346 ostream &
00347 operator << (ostream &out, TextEncoder::Encoding encoding) {
00348 switch (encoding) {
00349 case TextEncoder::E_iso8859:
00350 return out << "iso8859";
00351
00352 case TextEncoder::E_utf8:
00353 return out << "utf8";
00354
00355 case TextEncoder::E_unicode:
00356 return out << "unicode";
00357 };
00358
00359 return out << "**invalid TextEncoder::Encoding(" << (int)encoding << ")**";
00360 }
00361
00362
00363
00364
00365
00366 istream &
00367 operator >> (istream &in, TextEncoder::Encoding &encoding) {
00368 string word;
00369 in >> word;
00370
00371 if (word == "iso8859") {
00372 encoding = TextEncoder::E_iso8859;
00373 } else if (word == "utf8" || word == "utf-8") {
00374 encoding = TextEncoder::E_utf8;
00375 } else if (word == "unicode") {
00376 encoding = TextEncoder::E_unicode;
00377 } else {
00378 ostream *notify_ptr = StringDecoder::get_notify_ptr();
00379 if (notify_ptr != (ostream *)NULL) {
00380 (*notify_ptr)
00381 << "Invalid TextEncoder::Encoding: " << word << "\n";
00382 }
00383 encoding = TextEncoder::E_iso8859;
00384 }
00385
00386 return in;
00387 }