Panda3D

textEncoder.I

00001 // Filename: textEncoder.I
00002 // Created by:  drose (26Mar03)
00003 //
00004 ////////////////////////////////////////////////////////////////////
00005 //
00006 // PANDA 3D SOFTWARE
00007 // Copyright (c) Carnegie Mellon University.  All rights reserved.
00008 //
00009 // All use of this software is subject to the terms of the revised BSD
00010 // license.  You should have received a copy of this license along
00011 // with this source code in a file named "LICENSE."
00012 //
00013 ////////////////////////////////////////////////////////////////////
00014 
00015 
00016 ////////////////////////////////////////////////////////////////////
00017 //     Function: TextEncoder::Constructor
00018 //       Access: Published
00019 //  Description:
00020 ////////////////////////////////////////////////////////////////////
00021 INLINE TextEncoder::
00022 TextEncoder() {
00023   _encoding = _default_encoding;
00024   
00025   // Initially, since the text string is empty, we know that both
00026   // _text and _wtext accurately reflect the empty state; so we "got"
00027   // both of them.
00028   _flags = (F_got_text | F_got_wtext);
00029 }
00030 
00031 ////////////////////////////////////////////////////////////////////
00032 //     Function: TextEncoder::Copy Constructor
00033 //       Access: Published
00034 //  Description:
00035 ////////////////////////////////////////////////////////////////////
00036 INLINE TextEncoder::
00037 TextEncoder(const TextEncoder &copy) :
00038   _flags(copy._flags),
00039   _encoding(copy._encoding),
00040   _text(copy._text),
00041   _wtext(copy._wtext)
00042 {
00043 }
00044 
00045 ////////////////////////////////////////////////////////////////////
00046 //     Function: TextEncoder::set_encoding
00047 //       Access: Published
00048 //  Description: Specifies how the string set via set_text() is to be
00049 //               interpreted.  The default, E_iso8859, means a
00050 //               standard string with one-byte characters
00051 //               (i.e. ASCII).  Other encodings are possible to take
00052 //               advantage of character sets with more than 256
00053 //               characters.
00054 //
00055 //               This affects only future calls to set_text(); it does
00056 //               not change text that was set previously.
00057 ////////////////////////////////////////////////////////////////////
00058 INLINE void TextEncoder::
00059 set_encoding(TextEncoder::Encoding encoding) {
00060   // Force the previously-set strings to be encoded or decoded now.
00061   get_text();
00062   get_wtext();
00063   _encoding = encoding;
00064 }
00065 
00066 ////////////////////////////////////////////////////////////////////
00067 //     Function: TextEncoder::get_encoding
00068 //       Access: Published
00069 //  Description: Returns the encoding by which the string set via
00070 //               set_text() is to be interpreted.  See set_encoding().
00071 ////////////////////////////////////////////////////////////////////
00072 INLINE TextEncoder::Encoding TextEncoder::
00073 get_encoding() const {
00074   return _encoding;
00075 }
00076 
00077 ////////////////////////////////////////////////////////////////////
00078 //     Function: TextEncoder::set_default_encoding
00079 //       Access: Published, Static
00080 //  Description: Specifies the default encoding to be used for all
00081 //               subsequently created TextEncoder objects.  See
00082 //               set_encoding().
00083 ////////////////////////////////////////////////////////////////////
00084 INLINE void TextEncoder::
00085 set_default_encoding(TextEncoder::Encoding encoding) {
00086   _default_encoding = encoding;
00087 }
00088 
00089 ////////////////////////////////////////////////////////////////////
00090 //     Function: TextEncoder::get_default_encoding
00091 //       Access: Published, Static
00092 //  Description: Specifies the default encoding to be used for all
00093 //               subsequently created TextEncoder objects.  See
00094 //               set_encoding().
00095 ////////////////////////////////////////////////////////////////////
00096 INLINE TextEncoder::Encoding TextEncoder::
00097 get_default_encoding() {
00098   return _default_encoding;
00099 }
00100 
00101 ////////////////////////////////////////////////////////////////////
00102 //     Function: TextEncoder::set_text
00103 //       Access: Published
00104 //  Description: Changes the text that is stored in the encoder.  The
00105 //               text should be encoded according to the method
00106 //               indicated by set_encoding().  Subsequent calls to
00107 //               get_text() will return this same string, while
00108 //               get_wtext() will return the decoded version of the
00109 //               string.
00110 ////////////////////////////////////////////////////////////////////
00111 INLINE void TextEncoder::
00112 set_text(const string &text) {
00113   if (!has_text() || _text != text) {
00114     _text = text;
00115     _flags = (_flags | F_got_text) & ~F_got_wtext;
00116   }
00117 }
00118 
00119 ////////////////////////////////////////////////////////////////////
00120 //     Function: TextEncoder::set_text
00121 //       Access: Published
00122 //  Description: The two-parameter version of set_text() accepts an
00123 //               explicit encoding; the text is immediately decoded
00124 //               and stored as a wide-character string.  Subsequent
00125 //               calls to get_text() will return the same text
00126 //               re-encoded using whichever encoding is specified by
00127 //               set_encoding().
00128 ////////////////////////////////////////////////////////////////////
00129 INLINE void TextEncoder::
00130 set_text(const string &text, TextEncoder::Encoding encoding) {
00131   set_wtext(decode_text(text, encoding));
00132 }
00133 
00134 ////////////////////////////////////////////////////////////////////
00135 //     Function: TextEncoder::clear_text
00136 //       Access: Published
00137 //  Description: Removes the text from the TextEncoder.
00138 ////////////////////////////////////////////////////////////////////
00139 INLINE void TextEncoder::
00140 clear_text() {
00141   _text = string();
00142   _wtext = wstring();
00143   _flags |= (F_got_text | F_got_wtext);
00144 }
00145 
00146 ////////////////////////////////////////////////////////////////////
00147 //     Function: TextEncoder::has_text
00148 //       Access: Published
00149 //  Description:
00150 ////////////////////////////////////////////////////////////////////
00151 INLINE bool TextEncoder::
00152 has_text() const {
00153   if (_flags & F_got_wtext) {
00154     return !_wtext.empty();
00155   } else {
00156     return !_text.empty();
00157   }
00158 }
00159 
00160 ////////////////////////////////////////////////////////////////////
00161 //     Function: TextEncoder::get_text
00162 //       Access: Published
00163 //  Description: Returns the current text, as encoded via the current
00164 //               encoding system.
00165 ////////////////////////////////////////////////////////////////////
00166 INLINE string TextEncoder::
00167 get_text() const {
00168   if ((_flags & F_got_text) == 0) {
00169     ((TextEncoder *)this)->_text = encode_wtext(_wtext);
00170     ((TextEncoder *)this)->_flags |= F_got_text;
00171   }
00172   return _text;
00173 }
00174 
00175 ////////////////////////////////////////////////////////////////////
00176 //     Function: TextEncoder::get_text
00177 //       Access: Published
00178 //  Description: Returns the current text, as encoded via the indicated
00179 //               encoding system.
00180 ////////////////////////////////////////////////////////////////////
00181 INLINE string TextEncoder::
00182 get_text(TextEncoder::Encoding encoding) const {
00183   return encode_wtext(get_wtext(), encoding);
00184 }
00185 
00186 ////////////////////////////////////////////////////////////////////
00187 //     Function: TextEncoder::append_text
00188 //       Access: Published
00189 //  Description: Appends the indicates string to the end of the stored
00190 //               text.
00191 ////////////////////////////////////////////////////////////////////
00192 INLINE void TextEncoder::
00193 append_text(const string &text) {
00194   _text = get_text() + text;
00195   _flags = (_flags | F_got_text) & ~F_got_wtext;
00196 }
00197 
00198 ////////////////////////////////////////////////////////////////////
00199 //     Function: TextEncoder::append_unicode_char
00200 //       Access: Published
00201 //  Description: Appends a single character to the end of the stored
00202 //               text.  This may be a wide character, up to 16 bits in
00203 //               Unicode.
00204 ////////////////////////////////////////////////////////////////////
00205 INLINE void TextEncoder::
00206 append_unicode_char(int character) {
00207   _wtext = get_wtext() + wstring(1, (wchar_t)character);
00208   _flags = (_flags | F_got_wtext) & ~F_got_text;
00209 }
00210 
00211 ////////////////////////////////////////////////////////////////////
00212 //     Function: TextEncoder::get_num_chars
00213 //       Access: Published
00214 //  Description: Returns the number of characters in the stored text.
00215 //               This is a count of wide characters, after the string
00216 //               has been decoded according to set_encoding().
00217 ////////////////////////////////////////////////////////////////////
00218 INLINE int TextEncoder::
00219 get_num_chars() const {
00220   return get_wtext().length();
00221 }
00222 
00223 ////////////////////////////////////////////////////////////////////
00224 //     Function: TextEncoder::get_unicode_char
00225 //       Access: Published
00226 //  Description: Returns the Unicode value of the nth character in the
00227 //               stored text.  This may be a wide character (greater
00228 //               than 255), after the string has been decoded
00229 //               according to set_encoding().
00230 ////////////////////////////////////////////////////////////////////
00231 INLINE int TextEncoder::
00232 get_unicode_char(int index) const {
00233   get_wtext();
00234   nassertr(index >= 0 && index < (int)_wtext.length(), 0);
00235   return _wtext[index];
00236 }
00237 
00238 ////////////////////////////////////////////////////////////////////
00239 //     Function: TextEncoder::set_unicode_char
00240 //       Access: Published
00241 //  Description: Sets the Unicode value of the nth character in the
00242 //               stored text.  This may be a wide character (greater
00243 //               than 255), after the string has been decoded
00244 //               according to set_encoding().
00245 ////////////////////////////////////////////////////////////////////
00246 INLINE void TextEncoder::
00247 set_unicode_char(int index, int character) {
00248   get_wtext();
00249   nassertv(index >= 0 && index < (int)_wtext.length());
00250   _wtext[index] = character;
00251   _flags &= ~F_got_text;
00252 }
00253 
00254 ////////////////////////////////////////////////////////////////////
00255 //     Function: TextEncoder::get_encoded_char
00256 //       Access: Published
00257 //  Description: Returns the nth char of the stored text, as a one-,
00258 //               two-, or three-byte encoded string.
00259 ////////////////////////////////////////////////////////////////////
00260 INLINE string TextEncoder::
00261 get_encoded_char(int index) const {
00262   return get_encoded_char(index, get_encoding());
00263 }
00264 
00265 ////////////////////////////////////////////////////////////////////
00266 //     Function: TextEncoder::get_encoded_char
00267 //       Access: Published
00268 //  Description: Returns the nth char of the stored text, as a one-,
00269 //               two-, or three-byte encoded string.
00270 ////////////////////////////////////////////////////////////////////
00271 INLINE string TextEncoder::
00272 get_encoded_char(int index, TextEncoder::Encoding encoding) const {
00273   wstring wch(1, (wchar_t)get_unicode_char(index));
00274   return encode_wtext(wch, encoding);
00275 }
00276 
00277 ////////////////////////////////////////////////////////////////////
00278 //     Function: TextEncoder::get_text_as_ascii
00279 //       Access: Published
00280 //  Description: Returns the text associated with the node, converted
00281 //               as nearly as possible to a fully-ASCII
00282 //               representation.  This means replacing accented
00283 //               letters with their unaccented ASCII equivalents.
00284 //
00285 //               It is possible that some characters in the string
00286 //               cannot be converted to ASCII.  (The string may
00287 //               involve symbols like the copyright symbol, for
00288 //               instance, or it might involve letters in some other
00289 //               alphabet such as Greek or Cyrillic, or even Latin
00290 //               letters like thorn or eth that are not part of the
00291 //               ASCII character set.)  In this case, as much of the
00292 //               string as possible will be converted to ASCII, and
00293 //               the nonconvertible characters will remain encoded in
00294 //               the encoding specified by set_encoding().
00295 ////////////////////////////////////////////////////////////////////
00296 INLINE string TextEncoder::
00297 get_text_as_ascii() const {
00298   return encode_wtext(get_wtext_as_ascii());
00299 }
00300 
00301 ////////////////////////////////////////////////////////////////////
00302 //     Function: TextEncoder::reencode_text
00303 //       Access: Published, Static
00304 //  Description: Given the indicated text string, which is assumed to
00305 //               be encoded via the encoding "from", decodes it and
00306 //               then reencodes it into the encoding "to", and returns
00307 //               the newly encoded string.  This does not change or
00308 //               affect any properties on the TextEncoder itself.
00309 ////////////////////////////////////////////////////////////////////
00310 INLINE string TextEncoder::
00311 reencode_text(const string &text, TextEncoder::Encoding from, 
00312               TextEncoder::Encoding to) {
00313   return encode_wtext(decode_text(text, from), to);
00314 }
00315 
00316 ////////////////////////////////////////////////////////////////////
00317 //     Function: TextEncoder::unicode_isalpha
00318 //       Access: Published, Static
00319 //  Description: Returns true if the indicated character is an
00320 //               alphabetic letter, false otherwise.  This is akin to
00321 //               ctype's isalpha(), extended to Unicode.
00322 ////////////////////////////////////////////////////////////////////
00323 INLINE bool TextEncoder::
00324 unicode_isalpha(int character) {
00325   const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
00326   if (entry == (const UnicodeLatinMap::Entry *)NULL) {
00327     return false;
00328   }
00329   return entry->_char_type == UnicodeLatinMap::CT_upper ||
00330     entry->_char_type == UnicodeLatinMap::CT_lower;
00331 }
00332 
00333 ////////////////////////////////////////////////////////////////////
00334 //     Function: TextEncoder::unicode_isdigit
00335 //       Access: Published, Static
00336 //  Description: Returns true if the indicated character is a
00337 //               numeric digit, false otherwise.  This is akin to
00338 //               ctype's isdigit(), extended to Unicode.
00339 ////////////////////////////////////////////////////////////////////
00340 INLINE bool TextEncoder::
00341 unicode_isdigit(int character) {
00342   const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
00343   if (entry == (const UnicodeLatinMap::Entry *)NULL) {
00344     // The digits aren't actually listed in the map.
00345     return (character >= '0' && character <= '9');
00346   }
00347   // This silly test (!= 0) is necessary to prevent a VC++ warning.
00348   return (isdigit(entry->_ascii_equiv) != 0);
00349 }
00350 
00351 ////////////////////////////////////////////////////////////////////
00352 //     Function: TextEncoder::unicode_ispunct
00353 //       Access: Published, Static
00354 //  Description: Returns true if the indicated character is a
00355 //               punctuation mark, false otherwise.  This is akin to
00356 //               ctype's ispunct(), extended to Unicode.
00357 ////////////////////////////////////////////////////////////////////
00358 INLINE bool TextEncoder::
00359 unicode_ispunct(int character) {
00360   const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
00361   if (entry == (const UnicodeLatinMap::Entry *)NULL) {
00362     // Some punctuation marks aren't listed in the map.
00363     return (character >= 0 && character < 128 && ispunct(character));
00364   }
00365   return entry->_char_type == UnicodeLatinMap::CT_punct;
00366 }
00367 
00368 ////////////////////////////////////////////////////////////////////
00369 //     Function: TextEncoder::unicode_isupper
00370 //       Access: Published, Static
00371 //  Description: Returns true if the indicated character is an
00372 //               uppercase letter, false otherwise.  This is akin to
00373 //               ctype's isupper(), extended to Unicode.
00374 ////////////////////////////////////////////////////////////////////
00375 INLINE bool TextEncoder::
00376 unicode_isupper(int character) {
00377   const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
00378   if (entry == (const UnicodeLatinMap::Entry *)NULL) {
00379     return false;
00380   }
00381   return entry->_char_type == UnicodeLatinMap::CT_upper;
00382 }
00383 
00384 ////////////////////////////////////////////////////////////////////
00385 //     Function: TextEncoder::unicode_isspace
00386 //       Access: Published, Static
00387 //  Description: Returns true if the indicated character is a
00388 //               whitespace letter, false otherwise.  This is akin to
00389 //               ctype's isspace(), extended to Unicode.
00390 ////////////////////////////////////////////////////////////////////
00391 INLINE bool TextEncoder::
00392 unicode_isspace(int character) {
00393   switch (character) {
00394   case ' ':
00395   case '\t':
00396   case '\n':
00397     return true;
00398 
00399   default:
00400     return false;
00401   }
00402 }
00403 
00404 ////////////////////////////////////////////////////////////////////
00405 //     Function: TextEncoder::unicode_islower
00406 //       Access: Published, Static
00407 //  Description: Returns true if the indicated character is a
00408 //               lowercase letter, false otherwise.  This is akin to
00409 //               ctype's islower(), extended to Unicode.
00410 ////////////////////////////////////////////////////////////////////
00411 INLINE bool TextEncoder::
00412 unicode_islower(int character) {
00413   const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
00414   if (entry == (const UnicodeLatinMap::Entry *)NULL) {
00415     return false;
00416   }
00417   return entry->_char_type == UnicodeLatinMap::CT_lower;
00418 }
00419 
00420 ////////////////////////////////////////////////////////////////////
00421 //     Function: TextEncoder::unicode_toupper
00422 //       Access: Published, Static
00423 //  Description: Returns the uppercase equivalent of the given Unicode
00424 //               character.  This is akin to ctype's toupper(),
00425 //               extended to Unicode.
00426 ////////////////////////////////////////////////////////////////////
00427 INLINE int TextEncoder::
00428 unicode_toupper(int character) {
00429   const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
00430   if (entry == (const UnicodeLatinMap::Entry *)NULL) {
00431     return character;
00432   } 
00433   return entry->_toupper_character;
00434 }
00435 
00436 ////////////////////////////////////////////////////////////////////
00437 //     Function: TextEncoder::unicode_tolower
00438 //       Access: Published, Static
00439 //  Description: Returns the uppercase equivalent of the given Unicode
00440 //               character.  This is akin to ctype's tolower(),
00441 //               extended to Unicode.
00442 ////////////////////////////////////////////////////////////////////
00443 INLINE int TextEncoder::
00444 unicode_tolower(int character) {
00445   const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
00446   if (entry == (const UnicodeLatinMap::Entry *)NULL) {
00447     return character;
00448   } 
00449   return entry->_tolower_character;
00450 }
00451 
00452 ////////////////////////////////////////////////////////////////////
00453 //     Function: TextEncoder::upper
00454 //       Access: Published, Static
00455 //  Description: Converts the string to uppercase, assuming the string
00456 //               is encoded in the default encoding.
00457 ////////////////////////////////////////////////////////////////////
00458 INLINE string TextEncoder::
00459 upper(const string &source) {
00460   return upper(source, get_default_encoding());
00461 }
00462 
00463 ////////////////////////////////////////////////////////////////////
00464 //     Function: TextEncoder::upper
00465 //       Access: Published, Static
00466 //  Description: Converts the string to uppercase, assuming the string
00467 //               is encoded in the indicated encoding.
00468 ////////////////////////////////////////////////////////////////////
00469 INLINE string TextEncoder::
00470 upper(const string &source, TextEncoder::Encoding encoding) {
00471   TextEncoder encoder;
00472   encoder.set_encoding(encoding);
00473   encoder.set_text(source);
00474   encoder.make_upper();
00475   return encoder.get_text();
00476 }
00477 
00478 ////////////////////////////////////////////////////////////////////
00479 //     Function: TextEncoder::lower
00480 //       Access: Published, Static
00481 //  Description: Converts the string to lowercase, assuming the string
00482 //               is encoded in the default encoding.
00483 ////////////////////////////////////////////////////////////////////
00484 INLINE string TextEncoder::
00485 lower(const string &source) {
00486   return lower(source, get_default_encoding());
00487 }
00488 
00489 ////////////////////////////////////////////////////////////////////
00490 //     Function: TextEncoder::lower
00491 //       Access: Published, Static
00492 //  Description: Converts the string to lowercase, assuming the string
00493 //               is encoded in the indicated encoding.
00494 ////////////////////////////////////////////////////////////////////
00495 INLINE string TextEncoder::
00496 lower(const string &source, TextEncoder::Encoding encoding) {
00497   TextEncoder encoder;
00498   encoder.set_encoding(encoding);
00499   encoder.set_text(source);
00500   encoder.make_lower();
00501   return encoder.get_text();
00502 }
00503 
00504 ////////////////////////////////////////////////////////////////////
00505 //     Function: TextEncoder::set_wtext
00506 //       Access: Published
00507 //  Description: Changes the text that is stored in the encoder.
00508 //               Subsequent calls to get_wtext() will return this same
00509 //               string, while get_text() will return the encoded
00510 //               version of the string.
00511 ////////////////////////////////////////////////////////////////////
00512 INLINE void TextEncoder::
00513 set_wtext(const wstring &wtext) {
00514   if (!has_text() || _wtext != wtext) {
00515     _wtext = wtext;
00516     _flags = (_flags | F_got_wtext) & ~F_got_text;
00517   }
00518 }
00519 
00520 ////////////////////////////////////////////////////////////////////
00521 //     Function: TextEncoder::get_wtext
00522 //       Access: Published
00523 //  Description: Returns the text associated with the TextEncoder, as
00524 //               a wide-character string.
00525 ////////////////////////////////////////////////////////////////////
00526 INLINE const wstring &TextEncoder::
00527 get_wtext() const {
00528   if ((_flags & F_got_wtext) == 0) {
00529     ((TextEncoder *)this)->_wtext = decode_text(_text);
00530     ((TextEncoder *)this)->_flags |= F_got_wtext;
00531   }
00532   return _wtext;
00533 }
00534 
00535 ////////////////////////////////////////////////////////////////////
00536 //     Function: TextEncoder::append_wtext
00537 //       Access: Published
00538 //  Description: Appends the indicates string to the end of the stored
00539 //               wide-character text.
00540 ////////////////////////////////////////////////////////////////////
00541 INLINE void TextEncoder::
00542 append_wtext(const wstring &wtext) {
00543   _wtext = get_wtext() + wtext;
00544   _flags = (_flags | F_got_wtext) & ~F_got_text;
00545 }
00546 
00547 ////////////////////////////////////////////////////////////////////
00548 //     Function: TextEncoder::encode_wtext
00549 //       Access: Published
00550 //  Description: Encodes a wide-text string into a single-char string,
00551 //               according to the current encoding.
00552 ////////////////////////////////////////////////////////////////////
00553 INLINE string TextEncoder::
00554 encode_wtext(const wstring &wtext) const {
00555   return encode_wtext(wtext, _encoding);
00556 }
00557 
00558 ////////////////////////////////////////////////////////////////////
00559 //     Function: TextEncoder::decode_text
00560 //       Access: Published
00561 //  Description: Returns the given wstring decoded to a single-byte
00562 //               string, via the current encoding system.
00563 ////////////////////////////////////////////////////////////////////
00564 INLINE wstring TextEncoder::
00565 decode_text(const string &text) const {
00566   return decode_text(text, _encoding);
00567 }
 All Classes Functions Variables Enumerations