Panda3D
 All Classes Functions Variables Enumerations
textEncoder.cxx
00001 // Filename: textEncoder.cxx
00002 // Created by:  drose (26Mar03)
00003 //
00004 ////////////////////////////////////////////////////////////////////
00005 //
00006 // PANDA 3D SOFTWARE
00007 // Copyright (c) Carnegie Mellon University.  All rights reserved.
00008 //
00009 // All use of this software is subject to the terms of the revised BSD
00010 // license.  You should have received a copy of this license along
00011 // with this source code in a file named "LICENSE."
00012 //
00013 ////////////////////////////////////////////////////////////////////
00014 
00015 #include "textEncoder.h"
00016 #include "stringDecoder.h"
00017 #include "unicodeLatinMap.h"
00018 #include "config_dtoolutil.h"
00019 
// Definition of the static member TextEncoder::_default_encoding.  It
// is initialized to E_iso8859.
TextEncoder::Encoding TextEncoder::_default_encoding = TextEncoder::E_iso8859;
00021 
00022 ////////////////////////////////////////////////////////////////////
00023 //     Function: TextEncoder::make_upper
00024 //       Access: Published
00025 //  Description: Adjusts the text stored within the encoder to all
00026 //               uppercase letters (preserving accent marks
00027 //               correctly).
00028 ////////////////////////////////////////////////////////////////////
00029 void TextEncoder::
00030 make_upper() {
00031   get_wtext();
00032   wstring::iterator si;
00033   for (si = _wtext.begin(); si != _wtext.end(); ++si) {
00034     (*si) = unicode_toupper(*si);
00035   }
00036   _flags &= ~F_got_text;
00037 }
00038 
00039 ////////////////////////////////////////////////////////////////////
00040 //     Function: TextEncoder::make_lower
00041 //       Access: Published
00042 //  Description: Adjusts the text stored within the encoder to all
00043 //               lowercase letters (preserving accent marks
00044 //               correctly).
00045 ////////////////////////////////////////////////////////////////////
00046 void TextEncoder::
00047 make_lower() {
00048   get_wtext();
00049   wstring::iterator si;
00050   for (si = _wtext.begin(); si != _wtext.end(); ++si) {
00051     (*si) = unicode_tolower(*si);
00052   }
00053   _flags &= ~F_got_text;
00054 }
00055 
00056 ////////////////////////////////////////////////////////////////////
00057 //     Function: TextEncoder::get_wtext_as_ascii
00058 //       Access: Published
00059 //  Description: Returns the text associated with the node, converted
00060 //               as nearly as possible to a fully-ASCII
00061 //               representation.  This means replacing accented
00062 //               letters with their unaccented ASCII equivalents.
00063 //
00064 //               It is possible that some characters in the string
00065 //               cannot be converted to ASCII.  (The string may
00066 //               involve symbols like the copyright symbol, for
00067 //               instance, or it might involve letters in some other
00068 //               alphabet such as Greek or Cyrillic, or even Latin
00069 //               letters like thorn or eth that are not part of the
00070 //               ASCII character set.)  In this case, as much of the
00071 //               string as possible will be converted to ASCII, and
00072 //               the nonconvertible characters will remain in their
00073 //               original form.
00074 ////////////////////////////////////////////////////////////////////
00075 wstring TextEncoder::
00076 get_wtext_as_ascii() const {
00077   get_wtext();
00078   wstring result;
00079   wstring::const_iterator si;
00080   for (si = _wtext.begin(); si != _wtext.end(); ++si) {
00081     wchar_t character = (*si);
00082 
00083     const UnicodeLatinMap::Entry *map_entry = 
00084       UnicodeLatinMap::look_up(character);
00085     if (map_entry != NULL && map_entry->_ascii_equiv != 0) {
00086       result += (wchar_t)map_entry->_ascii_equiv;
00087       if (map_entry->_ascii_additional != 0) {
00088         result += (wchar_t)map_entry->_ascii_additional;
00089       }
00090 
00091     } else {
00092       result += character;
00093     }
00094   }
00095 
00096   return result;
00097 }
00098 
00099 ////////////////////////////////////////////////////////////////////
00100 //     Function: TextEncoder::is_wtext
00101 //       Access: Published
00102 //  Description: Returns true if any of the characters in the string
00103 //               returned by get_wtext() are out of the range of an
00104 //               ASCII character (and, therefore, get_wtext() should
00105 //               be called in preference to get_text()).
00106 ////////////////////////////////////////////////////////////////////
00107 bool TextEncoder:: 
00108 is_wtext() const {
00109   get_wtext();
00110   wstring::const_iterator ti;
00111   for (ti = _wtext.begin(); ti != _wtext.end(); ++ti) {
00112     if (((*ti) & ~0x7f) != 0) {
00113       return true;
00114     }
00115   }
00116 
00117   return false;
00118 }
00119 
00120 ////////////////////////////////////////////////////////////////////
00121 //     Function: TextEncoder::encode_wchar
00122 //       Access: Published, Static
00123 //  Description: Encodes a single wide char into a one-, two-, or
00124 //               three-byte string, according to the given encoding
00125 //               system.
00126 ////////////////////////////////////////////////////////////////////
00127 string TextEncoder::
00128 encode_wchar(wchar_t ch, TextEncoder::Encoding encoding) {
00129   switch (encoding) {
00130   case E_iso8859:
00131     if ((ch & ~0xff) == 0) {
00132       return string(1, (char)ch);
00133     } else {
00134       // The character won't fit in the 8-bit ISO 8859.  See if we can
00135       // make it fit by reducing it to its ascii equivalent
00136       // (essentially stripping off an unusual accent mark).
00137       const UnicodeLatinMap::Entry *map_entry = 
00138         UnicodeLatinMap::look_up(ch);
00139       if (map_entry != NULL && map_entry->_ascii_equiv != 0) {
00140         // Yes, it has an ascii equivalent.
00141         if (map_entry->_ascii_additional != 0) {
00142           // In fact, it has two of them.
00143           return
00144             string(1, map_entry->_ascii_equiv) +
00145             string(1, map_entry->_ascii_additional);
00146         }
00147         return string(1, map_entry->_ascii_equiv);
00148       }
00149       // Nope; return "." for lack of anything better.
00150       return ".";
00151     }
00152 
00153   case E_utf8:
00154     if ((ch & ~0x7f) == 0) {
00155       return string(1, (char)ch);
00156     } else if ((ch & ~0x7ff) == 0) {
00157       return 
00158         string(1, (char)((ch >> 6) | 0xc0)) +
00159         string(1, (char)((ch & 0x3f) | 0x80));
00160     } else {
00161       return 
00162         string(1, (char)((ch >> 12) | 0xe0)) +
00163         string(1, (char)(((ch >> 6) & 0x3f) | 0x80)) +
00164         string(1, (char)((ch & 0x3f) | 0x80));
00165     }
00166 
00167   case E_unicode:
00168     return
00169       string(1, (char)(ch >> 8)) + 
00170       string(1, (char)(ch & 0xff));
00171   }
00172 
00173   return "";
00174 }
00175 
00176 ////////////////////////////////////////////////////////////////////
00177 //     Function: TextEncoder::encode_wtext
00178 //       Access: Published, Static
00179 //  Description: Encodes a wide-text string into a single-char string,
00180 //               according to the given encoding.
00181 ////////////////////////////////////////////////////////////////////
00182 string TextEncoder::
00183 encode_wtext(const wstring &wtext, TextEncoder::Encoding encoding) {
00184   string result;
00185 
00186   for (wstring::const_iterator pi = wtext.begin(); pi != wtext.end(); ++pi) {
00187     result += encode_wchar(*pi, encoding);
00188   }
00189 
00190   return result;
00191 }
00192 
00193 ////////////////////////////////////////////////////////////////////
00194 //     Function: TextEncoder::decode_text
00195 //       Access: Published, Static
00196 //  Description: Returns the given wstring decoded to a single-byte
00197 //               string, via the given encoding system.
00198 ////////////////////////////////////////////////////////////////////
00199 wstring TextEncoder::
00200 decode_text(const string &text, TextEncoder::Encoding encoding) {
00201   switch (encoding) {
00202   case E_utf8:
00203     {
00204       StringUtf8Decoder decoder(text);
00205       return decode_text_impl(decoder);
00206     }
00207 
00208   case E_unicode:
00209     {
00210       StringUnicodeDecoder decoder(text);
00211       return decode_text_impl(decoder);
00212     }
00213 
00214   case E_iso8859:
00215   default:
00216     {
00217       StringDecoder decoder(text);
00218       return decode_text_impl(decoder);
00219     }
00220   };
00221 }
00222 
00223 ////////////////////////////////////////////////////////////////////
00224 //     Function: TextEncoder::decode_text_impl
00225 //       Access: Private, Static
00226 //  Description: Decodes the eight-bit stream from the indicated
00227 //               decoder, returning the decoded wide-char string.
00228 ////////////////////////////////////////////////////////////////////
00229 wstring TextEncoder::
00230 decode_text_impl(StringDecoder &decoder) {
00231   wstring result;
00232   //  bool expand_amp = get_expand_amp();
00233 
00234   wchar_t character = decoder.get_next_character();
00235   while (!decoder.is_eof()) {
00236     /*
00237     if (character == '&' && expand_amp) {
00238       // An ampersand in expand_amp mode is treated as an escape
00239       // character.
00240       character = expand_amp_sequence(decoder);
00241     }
00242     */
00243     result += character;
00244     character = decoder.get_next_character();
00245   }
00246 
00247   return result;
00248 }
00249 
00250 /*
00251 ////////////////////////////////////////////////////////////////////
00252 //     Function: TextEncoder::expand_amp_sequence
00253 //       Access: Private
00254 //  Description: Given that we have just read an ampersand from the
00255 //               StringDecoder, and that we have expand_amp in effect
00256 //               and are therefore expected to expand the sequence
00257 //               that this ampersand begins into a single unicode
00258 //               character, do the expansion and return the character.
00259 ////////////////////////////////////////////////////////////////////
00260 int TextEncoder::
00261 expand_amp_sequence(StringDecoder &decoder) const {
00262   int result = 0;
00263 
00264   int character = decoder.get_next_character();
00265   if (!decoder.is_eof() && character == '#') {
00266     // An explicit numeric sequence: &#nnn;
00267     result = 0;
00268     character = decoder.get_next_character();
00269     while (!decoder.is_eof() && character < 128 && isdigit((unsigned int)character)) {
00270       result = (result * 10) + (character - '0');
00271       character = decoder.get_next_character();
00272     }
00273     if (character != ';') {
00274       // Invalid sequence.
00275       return 0;
00276     }
00277 
00278     return result;
00279   }
00280 
00281   string sequence;
00282   
00283   // Some non-numeric sequence.
00284   while (!decoder.is_eof() && character < 128 && isalpha((unsigned int)character)) {
00285     sequence += character;
00286     character = decoder.get_next_character();
00287   }
00288   if (character != ';') {
00289     // Invalid sequence.
00290     return 0;
00291   }
00292 
00293   static const struct {
00294     const char *name;
00295     int code;
00296   } tokens[] = {
00297     { "amp", '&' }, { "lt", '<' }, { "gt", '>' }, { "quot", '"' },
00298     { "nbsp", ' ' },
00299 
00300     { "iexcl", 161 }, { "cent", 162 }, { "pound", 163 }, { "curren", 164 },
00301     { "yen", 165 }, { "brvbar", 166 }, { "brkbar", 166 }, { "sect", 167 },
00302     { "uml", 168 }, { "die", 168 }, { "copy", 169 }, { "ordf", 170 },
00303     { "laquo", 171 }, { "not", 172 }, { "shy", 173 }, { "reg", 174 },
00304     { "macr", 175 }, { "hibar", 175 }, { "deg", 176 }, { "plusmn", 177 },
00305     { "sup2", 178 }, { "sup3", 179 }, { "acute", 180 }, { "micro", 181 },
00306     { "para", 182 }, { "middot", 183 }, { "cedil", 184 }, { "sup1", 185 },
00307     { "ordm", 186 }, { "raquo", 187 }, { "frac14", 188 }, { "frac12", 189 },
00308     { "frac34", 190 }, { "iquest", 191 }, { "Agrave", 192 }, { "Aacute", 193 },
00309     { "Acirc", 194 }, { "Atilde", 195 }, { "Auml", 196 }, { "Aring", 197 },
00310     { "AElig", 198 }, { "Ccedil", 199 }, { "Egrave", 200 }, { "Eacute", 201 },
00311     { "Ecirc", 202 }, { "Euml", 203 }, { "Igrave", 204 }, { "Iacute", 205 },
00312     { "Icirc", 206 }, { "Iuml", 207 }, { "ETH", 208 }, { "Dstrok", 208 },
00313     { "Ntilde", 209 }, { "Ograve", 210 }, { "Oacute", 211 }, { "Ocirc", 212 },
00314     { "Otilde", 213 }, { "Ouml", 214 }, { "times", 215 }, { "Oslash", 216 },
00315     { "Ugrave", 217 }, { "Uacute", 218 }, { "Ucirc", 219 }, { "Uuml", 220 },
00316     { "Yacute", 221 }, { "THORN", 222 }, { "szlig", 223 }, { "agrave", 224 },
00317     { "aacute", 225 }, { "acirc", 226 }, { "atilde", 227 }, { "auml", 228 },
00318     { "aring", 229 }, { "aelig", 230 }, { "ccedil", 231 }, { "egrave", 232 },
00319     { "eacute", 233 }, { "ecirc", 234 }, { "euml", 235 }, { "igrave", 236 },
00320     { "iacute", 237 }, { "icirc", 238 }, { "iuml", 239 }, { "eth", 240 },
00321     { "ntilde", 241 }, { "ograve", 242 }, { "oacute", 243 }, { "ocirc", 244 },
00322     { "otilde", 245 }, { "ouml", 246 }, { "divide", 247 }, { "oslash", 248 },
00323     { "ugrave", 249 }, { "uacute", 250 }, { "ucirc", 251 }, { "uuml", 252 },
00324     { "yacute", 253 }, { "thorn", 254 }, { "yuml", 255 },
00325 
00326     { NULL, 0 },
00327   };
00328 
00329   for (int i = 0; tokens[i].name != NULL; i++) {
00330     if (sequence == tokens[i].name) {
00331       // Here's a match.
00332       return tokens[i].code;
00333     }
00334   }
00335 
00336   // Some unrecognized sequence.
00337   return 0;
00338 }
00339 */
00340 
00341 
00342 ////////////////////////////////////////////////////////////////////
00343 //     Function: TextEncoder::Encoding ostream operator
00344 //  Description:
00345 ////////////////////////////////////////////////////////////////////
00346 ostream &
00347 operator << (ostream &out, TextEncoder::Encoding encoding) {
00348   switch (encoding) {
00349   case TextEncoder::E_iso8859:
00350     return out << "iso8859";
00351 
00352   case TextEncoder::E_utf8:
00353     return out << "utf8";
00354 
00355   case TextEncoder::E_unicode:
00356     return out << "unicode";
00357   };
00358 
00359   return out << "**invalid TextEncoder::Encoding(" << (int)encoding << ")**";
00360 }
00361 
00362 ////////////////////////////////////////////////////////////////////
00363 //     Function: TextEncoder::Encoding istream operator
00364 //  Description:
00365 ////////////////////////////////////////////////////////////////////
00366 istream &
00367 operator >> (istream &in, TextEncoder::Encoding &encoding) {
00368   string word;
00369   in >> word;
00370 
00371   if (word == "iso8859") {
00372     encoding = TextEncoder::E_iso8859;
00373   } else if (word == "utf8" || word == "utf-8") {
00374     encoding = TextEncoder::E_utf8;
00375   } else if (word == "unicode") {
00376     encoding = TextEncoder::E_unicode;
00377   } else {
00378     ostream *notify_ptr = StringDecoder::get_notify_ptr();
00379     if (notify_ptr != (ostream *)NULL) {
00380       (*notify_ptr)
00381         << "Invalid TextEncoder::Encoding: " << word << "\n";
00382     }
00383     encoding = TextEncoder::E_iso8859;
00384   }
00385 
00386   return in;
00387 }
 All Classes Functions Variables Enumerations