Panda3D
textEncoder.I
Go to the documentation of this file.
1 /**
2  * PANDA 3D SOFTWARE
3  * Copyright (c) Carnegie Mellon University. All rights reserved.
4  *
5  * All use of this software is subject to the terms of the revised BSD
6  * license. You should have received a copy of this license along
7  * with this source code in a file named "LICENSE."
8  *
9  * @file textEncoder.I
10  * @author drose
11  * @date 2003-03-26
12  */
13 
14 /**
15  *
16  */
17 INLINE TextEncoder::
18 TextEncoder() {
19  _encoding = _default_encoding;
20 
21  // Initially, since the text string is empty, we know that both _text and
22  // _wtext accurately reflect the empty state; so we "got" both of them.
23  _flags = (F_got_text | F_got_wtext);
24 }
25 
26 /**
27  *
28  */
29 INLINE TextEncoder::
30 TextEncoder(const TextEncoder &copy) :
31  _flags(copy._flags),
32  _encoding(copy._encoding),
33  _text(copy._text),
34  _wtext(copy._wtext)
35 {
36 }
37 
38 /**
39  * Specifies how the string set via set_text() is to be interpreted. The
40  * default, E_iso8859, means a standard string with one-byte characters (i.e.
41  * ASCII). Other encodings are possible to take advantage of character sets
42  * with more than 256 characters.
43  *
44  * This affects only future calls to set_text(); it does not change text that
45  * was set previously.
46  */
47 INLINE void TextEncoder::
48 set_encoding(TextEncoder::Encoding encoding) {
49  // Force the previously-set strings to be encoded or decoded now.
50  get_text();
51  get_wtext();
52  _encoding = encoding;
53 }
54 
55 /**
56  * Returns the encoding by which the string set via set_text() is to be
57  * interpreted. See set_encoding().
58  */
59 INLINE TextEncoder::Encoding TextEncoder::
60 get_encoding() const {
61  return _encoding;
62 }
63 
64 /**
65  * Specifies the default encoding to be used for all subsequently created
66  * TextEncoder objects. See set_encoding().
67  */
68 INLINE void TextEncoder::
69 set_default_encoding(TextEncoder::Encoding encoding) {
70  _default_encoding = encoding;
71 }
72 
73 /**
74  * Specifies the default encoding to be used for all subsequently created
75  * TextEncoder objects. See set_encoding().
76  */
77 INLINE TextEncoder::Encoding TextEncoder::
78 get_default_encoding() {
79  return _default_encoding;
80 }
81 
82 /**
83  * Changes the text that is stored in the encoder. The text should be encoded
84  * according to the method indicated by set_encoding(). Subsequent calls to
85  * get_text() will return this same string, while get_wtext() will return the
86  * decoded version of the string.
87  */
88 INLINE void TextEncoder::
89 set_text(const std::string &text) {
90  if (!has_text() || _text != text) {
91  _text = text;
92  _flags = (_flags | F_got_text) & ~F_got_wtext;
93  text_changed();
94  }
95 }
96 
97 /**
98  * The two-parameter version of set_text() accepts an explicit encoding; the
99  * text is immediately decoded and stored as a wide-character string.
100  * Subsequent calls to get_text() will return the same text re-encoded using
101  * whichever encoding is specified by set_encoding().
102  */
103 INLINE void TextEncoder::
104 set_text(const std::string &text, TextEncoder::Encoding encoding) {
105  if (encoding == _encoding) {
106  set_text(text);
107  } else {
108  set_wtext(decode_text(text, encoding));
109  }
110 }
111 
112 /**
113  * Removes the text from the TextEncoder.
114  */
115 INLINE void TextEncoder::
117  _text = std::string();
118  _wtext = std::wstring();
119  _flags |= (F_got_text | F_got_wtext);
120  text_changed();
121 }
122 
123 /**
124  *
125  */
126 INLINE bool TextEncoder::
127 has_text() const {
128  if (_flags & F_got_wtext) {
129  return !_wtext.empty();
130  } else {
131  return !_text.empty();
132  }
133 }
134 
135 /**
136  * Returns the current text, as encoded via the current encoding system.
137  */
138 INLINE std::string TextEncoder::
139 get_text() const {
140  if ((_flags & F_got_text) == 0) {
141  ((TextEncoder *)this)->_text = encode_wtext(_wtext);
142  ((TextEncoder *)this)->_flags |= F_got_text;
143  }
144  return _text;
145 }
146 
147 /**
148  * Returns the current text, as encoded via the indicated encoding system.
149  */
150 INLINE std::string TextEncoder::
151 get_text(TextEncoder::Encoding encoding) const {
152  return encode_wtext(get_wtext(), encoding);
153 }
154 
155 /**
156  * Appends the indicates string to the end of the stored text.
157  */
158 INLINE void TextEncoder::
159 append_text(const std::string &text) {
160  if (!text.empty()) {
161  _text = get_text() + text;
162  _flags = (_flags | F_got_text) & ~F_got_wtext;
163  text_changed();
164  }
165 }
166 
167 /**
168  * Appends a single character to the end of the stored text. This may be a
169  * wide character, up to 16 bits in Unicode.
170  */
171 INLINE void TextEncoder::
172 append_unicode_char(char32_t character) {
173 #if WCHAR_MAX >= 0x10FFFF
174  // wchar_t might be UTF-32.
175  _wtext = get_wtext() + std::wstring(1, (wchar_t)character);
176 #else
177  if ((character & ~0xffff) == 0) {
178  _wtext = get_wtext() + std::wstring(1, (wchar_t)character);
179  } else {
180  // Encode as a surrogate pair.
181  uint32_t v = (uint32_t)character - 0x10000u;
182  wchar_t wstr[2] = {
183  (wchar_t)((v >> 10u) | 0xd800u),
184  (wchar_t)((v & 0x3ffu) | 0xdc00u),
185  };
186  _wtext = get_wtext() + std::wstring(wstr, 2);
187  }
188 #endif
189  _flags = (_flags | F_got_wtext) & ~F_got_text;
190  text_changed();
191 }
192 
193 /**
194  * Returns the number of characters in the stored text. This is a count of
195  * wide characters, after the string has been decoded according to
196  * set_encoding().
197  */
198 INLINE size_t TextEncoder::
199 get_num_chars() const {
200  return get_wtext().length();
201 }
202 
203 /**
204  * Returns the Unicode value of the nth character in the stored text. This
205  * may be a wide character (greater than 255), after the string has been
206  * decoded according to set_encoding().
207  */
208 INLINE int TextEncoder::
209 get_unicode_char(size_t index) const {
210  get_wtext();
211  if (index < _wtext.length()) {
212  return _wtext[index];
213  }
214  return 0;
215 }
216 
217 /**
218  * Sets the Unicode value of the nth character in the stored text. This may
219  * be a wide character (greater than 255), after the string has been decoded
220  * according to set_encoding().
221  */
222 INLINE void TextEncoder::
223 set_unicode_char(size_t index, char32_t character) {
224  get_wtext();
225  if (index < _wtext.length()) {
226  _wtext[index] = character;
227  _flags &= ~F_got_text;
228  text_changed();
229  }
230 }
231 
232 /**
233  * Returns the nth char of the stored text, as a one-, two-, or three-byte
234  * encoded string.
235  */
236 INLINE std::string TextEncoder::
237 get_encoded_char(size_t index) const {
238  return get_encoded_char(index, get_encoding());
239 }
240 
241 /**
242  * Returns the nth char of the stored text, as a one-, two-, or three-byte
243  * encoded string.
244  */
245 INLINE std::string TextEncoder::
246 get_encoded_char(size_t index, TextEncoder::Encoding encoding) const {
247  std::wstring wch(1, (wchar_t)get_unicode_char(index));
248  return encode_wtext(wch, encoding);
249 }
250 
251 /**
252  * Returns the text associated with the node, converted as nearly as possible
253  * to a fully-ASCII representation. This means replacing accented letters
254  * with their unaccented ASCII equivalents.
255  *
256  * It is possible that some characters in the string cannot be converted to
257  * ASCII. (The string may involve symbols like the copyright symbol, for
258  * instance, or it might involve letters in some other alphabet such as Greek
259  * or Cyrillic, or even Latin letters like thorn or eth that are not part of
260  * the ASCII character set.) In this case, as much of the string as possible
261  * will be converted to ASCII, and the nonconvertible characters will remain
262  * encoded in the encoding specified by set_encoding().
263  */
264 INLINE std::string TextEncoder::
267 }
268 
269 /**
270  * Given the indicated text string, which is assumed to be encoded via the
271  * encoding "from", decodes it and then reencodes it into the encoding "to",
272  * and returns the newly encoded string. This does not change or affect any
273  * properties on the TextEncoder itself.
274  */
275 INLINE std::string TextEncoder::
276 reencode_text(const std::string &text, TextEncoder::Encoding from,
277  TextEncoder::Encoding to) {
278  return encode_wtext(decode_text(text, from), to);
279 }
280 
281 /**
282  * Returns true if the indicated character is an alphabetic letter, false
283  * otherwise. This is akin to ctype's isalpha(), extended to Unicode.
284  */
285 INLINE bool TextEncoder::
286 unicode_isalpha(char32_t character) {
287  const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
288  if (entry == nullptr) {
289  return false;
290  }
291  return entry->_char_type == UnicodeLatinMap::CT_upper ||
292  entry->_char_type == UnicodeLatinMap::CT_lower;
293 }
294 
295 /**
296  * Returns true if the indicated character is a numeric digit, false
297  * otherwise. This is akin to ctype's isdigit(), extended to Unicode.
298  */
299 INLINE bool TextEncoder::
300 unicode_isdigit(char32_t character) {
301  const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
302  if (entry == nullptr) {
303  // The digits aren't actually listed in the map.
304  return (character >= '0' && character <= '9');
305  }
306  // This silly test (!= 0) is necessary to prevent a VC++ warning.
307  return (isdigit(entry->_ascii_equiv) != 0);
308 }
309 
310 /**
311  * Returns true if the indicated character is a punctuation mark, false
312  * otherwise. This is akin to ctype's ispunct(), extended to Unicode.
313  */
314 INLINE bool TextEncoder::
315 unicode_ispunct(char32_t character) {
316  const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
317  if (entry == nullptr) {
318  // Some punctuation marks aren't listed in the map.
319  return (character < 128 && ispunct(character));
320  }
321  return entry->_char_type == UnicodeLatinMap::CT_punct;
322 }
323 
324 /**
325  * Returns true if the indicated character is an uppercase letter, false
326  * otherwise. This is akin to ctype's isupper(), extended to Unicode.
327  */
328 INLINE bool TextEncoder::
329 unicode_isupper(char32_t character) {
330  const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
331  if (entry == nullptr) {
332  return false;
333  }
334  return entry->_char_type == UnicodeLatinMap::CT_upper;
335 }
336 
337 /**
338  * Returns true if the indicated character is a whitespace letter, false
339  * otherwise. This is akin to ctype's isspace(), extended to Unicode.
340  */
341 INLINE bool TextEncoder::
342 unicode_isspace(char32_t character) {
343  switch (character) {
344  case ' ':
345  case '\t':
346  case '\n':
347  return true;
348 
349  default:
350  return false;
351  }
352 }
353 
354 /**
355  * Returns true if the indicated character is a lowercase letter, false
356  * otherwise. This is akin to ctype's islower(), extended to Unicode.
357  */
358 INLINE bool TextEncoder::
359 unicode_islower(char32_t character) {
360  const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
361  if (entry == nullptr) {
362  return false;
363  }
364  return entry->_char_type == UnicodeLatinMap::CT_lower;
365 }
366 
367 /**
368  * Returns the uppercase equivalent of the given Unicode character. This is
369  * akin to ctype's toupper(), extended to Unicode.
370  */
371 INLINE int TextEncoder::
372 unicode_toupper(char32_t character) {
373  const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
374  if (entry == nullptr) {
375  return character;
376  }
377  return entry->_toupper_character;
378 }
379 
380 /**
381  * Returns the uppercase equivalent of the given Unicode character. This is
382  * akin to ctype's tolower(), extended to Unicode.
383  */
384 INLINE int TextEncoder::
385 unicode_tolower(char32_t character) {
386  const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
387  if (entry == nullptr) {
388  return character;
389  }
390  return entry->_tolower_character;
391 }
392 
393 /**
394  * Converts the string to uppercase, assuming the string is encoded in the
395  * default encoding.
396  */
397 INLINE std::string TextEncoder::
398 upper(const std::string &source) {
399  return upper(source, get_default_encoding());
400 }
401 
402 /**
403  * Converts the string to uppercase, assuming the string is encoded in the
404  * indicated encoding.
405  */
406 INLINE std::string TextEncoder::
407 upper(const std::string &source, TextEncoder::Encoding encoding) {
408  TextEncoder encoder;
409  encoder.set_encoding(encoding);
410  encoder.set_text(source);
411  encoder.make_upper();
412  return encoder.get_text();
413 }
414 
415 /**
416  * Converts the string to lowercase, assuming the string is encoded in the
417  * default encoding.
418  */
419 INLINE std::string TextEncoder::
420 lower(const std::string &source) {
421  return lower(source, get_default_encoding());
422 }
423 
424 /**
425  * Converts the string to lowercase, assuming the string is encoded in the
426  * indicated encoding.
427  */
428 INLINE std::string TextEncoder::
429 lower(const std::string &source, TextEncoder::Encoding encoding) {
430  TextEncoder encoder;
431  encoder.set_encoding(encoding);
432  encoder.set_text(source);
433  encoder.make_lower();
434  return encoder.get_text();
435 }
436 
437 /**
438  * Changes the text that is stored in the encoder. Subsequent calls to
439  * get_wtext() will return this same string, while get_text() will return the
440  * encoded version of the string.
441  */
442 INLINE void TextEncoder::
443 set_wtext(const std::wstring &wtext) {
444  if (!has_text() || _wtext != wtext) {
445  _wtext = wtext;
446  _flags = (_flags | F_got_wtext) & ~F_got_text;
447  text_changed();
448  }
449 }
450 
451 /**
452  * Returns the text associated with the TextEncoder, as a wide-character
453  * string.
454  */
455 INLINE const std::wstring &TextEncoder::
456 get_wtext() const {
457  if ((_flags & F_got_wtext) == 0) {
458  ((TextEncoder *)this)->_wtext = decode_text(_text);
459  ((TextEncoder *)this)->_flags |= F_got_wtext;
460  }
461  return _wtext;
462 }
463 
464 /**
465  * Appends the indicates string to the end of the stored wide-character text.
466  */
467 INLINE void TextEncoder::
468 append_wtext(const std::wstring &wtext) {
469  if (!wtext.empty()) {
470  _wtext = get_wtext() + wtext;
471  _flags = (_flags | F_got_wtext) & ~F_got_text;
472  text_changed();
473  }
474 }
475 
476 /**
477  * Encodes a wide-text string into a single-char string, according to the
478  * current encoding.
479  */
480 INLINE std::string TextEncoder::
481 encode_wtext(const std::wstring &wtext) const {
482  return encode_wtext(wtext, _encoding);
483 }
484 
485 /**
486  * Returns the given wstring decoded to a single-byte string, via the current
487  * encoding system.
488  */
489 INLINE std::wstring TextEncoder::
490 decode_text(const std::string &text) const {
491  return decode_text(text, _encoding);
492 }
493 
494 /**
495  * Uses the current default encoding to output the wstring.
496  */
497 INLINE std::ostream &
498 operator << (std::ostream &out, const std::wstring &str) {
499  TextEncoder encoder;
500  encoder.set_wtext(str);
501  out << encoder.get_text();
502  return out;
503 }
void append_text(const std::string &text)
Appends the indicated string to the end of the stored text.
Definition: textEncoder.I:159
std::ostream & operator<<(std::ostream &out, const std::wstring &str)
Uses the current default encoding to output the wstring.
Definition: textEncoder.I:498
static std::string reencode_text(const std::string &text, Encoding from, Encoding to)
Given the indicated text string, which is assumed to be encoded via the encoding "from",...
Definition: textEncoder.I:276
static int unicode_toupper(char32_t character)
Returns the uppercase equivalent of the given Unicode character.
Definition: textEncoder.I:372
int get_unicode_char(size_t index) const
Returns the Unicode value of the nth character in the stored text.
Definition: textEncoder.I:209
This class can be used to convert text between multiple representations, e.g.
Definition: textEncoder.h:33
static bool unicode_ispunct(char32_t character)
Returns true if the indicated character is a punctuation mark, false otherwise.
Definition: textEncoder.I:315
static std::string upper(const std::string &source)
Converts the string to uppercase, assuming the string is encoded in the default encoding.
Definition: textEncoder.I:398
set_default_encoding
Specifies the default encoding to be used for all subsequently created TextEncoder objects.
Definition: textEncoder.h:54
void make_lower()
Adjusts the text stored within the encoder to all lowercase letters (preserving accent marks correctl...
Definition: textEncoder.cxx:46
void clear_text()
Removes the text from the TextEncoder.
Definition: textEncoder.I:116
void append_wtext(const std::wstring &text)
Appends the indicated string to the end of the stored wide-character text.
Definition: textEncoder.I:468
set_text
Changes the text that is stored in the encoder.
Definition: textEncoder.h:124
std::string get_text_as_ascii() const
Returns the text associated with the node, converted as nearly as possible to a fully-ASCII represent...
Definition: textEncoder.I:265
static const Entry * look_up(char32_t character)
Returns the Entry associated with the indicated character, if there is one.
void set_unicode_char(size_t index, char32_t character)
Sets the Unicode value of the nth character in the stored text.
Definition: textEncoder.I:223
get_default_encoding
Returns the default encoding to be used for all subsequently created TextEncoder objects.
Definition: textEncoder.h:54
static int unicode_tolower(char32_t character)
Returns the lowercase equivalent of the given Unicode character.
Definition: textEncoder.I:385
static bool unicode_isalpha(char32_t character)
Returns true if the indicated character is an alphabetic letter, false otherwise.
Definition: textEncoder.I:286
std::string get_encoded_char(size_t index) const
Returns the nth char of the stored text, as a one-, two-, or three-byte encoded string.
Definition: textEncoder.I:237
std::wstring get_wtext_as_ascii() const
Returns the text associated with the node, converted as nearly as possible to a fully-ASCII represent...
Definition: textEncoder.cxx:70
get_text
Returns the current text, as encoded via the current encoding system.
Definition: textEncoder.h:124
std::string encode_wtext(const std::wstring &wtext) const
Encodes a wide-text string into a single-char string, according to the current encoding.
Definition: textEncoder.I:481
static bool unicode_isdigit(char32_t character)
Returns true if the indicated character is a numeric digit, false otherwise.
Definition: textEncoder.I:300
void set_encoding(Encoding encoding)
Specifies how the string set via set_text() is to be interpreted.
Definition: textEncoder.I:48
static std::string lower(const std::string &source)
Converts the string to lowercase, assuming the string is encoded in the default encoding.
Definition: textEncoder.I:420
const std::wstring & get_wtext() const
Returns the text associated with the TextEncoder, as a wide-character string.
Definition: textEncoder.I:456
Encoding get_encoding() const
Returns the encoding by which the string set via set_text() is to be interpreted.
Definition: textEncoder.I:60
void append_unicode_char(char32_t character)
Appends a single character to the end of the stored text.
Definition: textEncoder.I:172
std::wstring decode_text(const std::string &text) const
Returns the given single-byte string decoded to a wstring, via the current encoding system.
Definition: textEncoder.I:490
size_t get_num_chars() const
Returns the number of characters in the stored text.
Definition: textEncoder.I:199
void make_upper()
Adjusts the text stored within the encoder to all uppercase letters (preserving accent marks correctl...
Definition: textEncoder.cxx:31
static bool unicode_islower(char32_t character)
Returns true if the indicated character is a lowercase letter, false otherwise.
Definition: textEncoder.I:359
void set_wtext(const std::wstring &wtext)
Changes the text that is stored in the encoder.
Definition: textEncoder.I:443
static bool unicode_isupper(char32_t character)
Returns true if the indicated character is an uppercase letter, false otherwise.
Definition: textEncoder.I:329
static bool unicode_isspace(char32_t character)
Returns true if the indicated character is a whitespace character, false otherwise.
Definition: textEncoder.I:342