Panda3D
textEncoder.cxx
Go to the documentation of this file.
1 /**
2  * PANDA 3D SOFTWARE
3  * Copyright (c) Carnegie Mellon University. All rights reserved.
4  *
5  * All use of this software is subject to the terms of the revised BSD
6  * license. You should have received a copy of this license along
7  * with this source code in a file named "LICENSE."
8  *
9  * @file textEncoder.cxx
10  * @author drose
11  * @date 2003-03-26
12  */
13 
14 #include "textEncoder.h"
15 #include "stringDecoder.h"
16 #include "unicodeLatinMap.h"
17 #include "config_dtoolutil.h"
18 
19 using std::istream;
20 using std::ostream;
21 using std::string;
22 using std::wstring;
23 
// Definition of the static _default_encoding member; it starts out as E_utf8.
24 TextEncoder::Encoding TextEncoder::_default_encoding = TextEncoder::E_utf8;
25 
26 /**
27  * Adjusts the text stored within the encoder to all uppercase letters
28  * (preserving accent marks correctly).
29  */
30 void TextEncoder::
32  get_wtext();
33  wstring::iterator si;
34  for (si = _wtext.begin(); si != _wtext.end(); ++si) {
35  (*si) = unicode_toupper(*si);
36  }
37  _flags &= ~F_got_text;
38  text_changed();
39 }
40 
41 /**
42  * Adjusts the text stored within the encoder to all lowercase letters
43  * (preserving accent marks correctly).
44  */
45 void TextEncoder::
47  get_wtext();
48  wstring::iterator si;
49  for (si = _wtext.begin(); si != _wtext.end(); ++si) {
50  (*si) = unicode_tolower(*si);
51  }
52  _flags &= ~F_got_text;
53  text_changed();
54 }
55 
56 /**
57  * Returns the text associated with the node, converted as nearly as possible
58  * to a fully-ASCII representation. This means replacing accented letters
59  * with their unaccented ASCII equivalents.
60  *
61  * It is possible that some characters in the string cannot be converted to
62  * ASCII. (The string may involve symbols like the copyright symbol, for
63  * instance, or it might involve letters in some other alphabet such as Greek
64  * or Cyrillic, or even Latin letters like thorn or eth that are not part of
65  * the ASCII character set.) In this case, as much of the string as possible
66  * will be converted to ASCII, and the nonconvertible characters will remain
67  * in their original form.
68  */
69 wstring TextEncoder::
71  get_wtext();
72  wstring result;
73  wstring::const_iterator si;
74  for (si = _wtext.begin(); si != _wtext.end(); ++si) {
75  wchar_t character = (*si);
76 
77  const UnicodeLatinMap::Entry *map_entry =
78  UnicodeLatinMap::look_up(character);
79  if (map_entry != nullptr && map_entry->_ascii_equiv != 0) {
80  result += (wchar_t)map_entry->_ascii_equiv;
81  if (map_entry->_ascii_additional != 0) {
82  result += (wchar_t)map_entry->_ascii_additional;
83  }
84 
85  } else {
86  result += character;
87  }
88  }
89 
90  return result;
91 }
92 
93 /**
94  * Returns true if any of the characters in the string returned by get_wtext()
95  * are out of the range of an ASCII character (and, therefore, get_wtext()
96  * should be called in preference to get_text()).
97  */
98 bool TextEncoder::
99 is_wtext() const {
100  get_wtext();
101  wstring::const_iterator ti;
102  for (ti = _wtext.begin(); ti != _wtext.end(); ++ti) {
103  if (((*ti) & ~0x7f) != 0) {
104  return true;
105  }
106  }
107 
108  return false;
109 }
110 
111 /**
112  * Encodes a single Unicode character into a one-, two-, three-, or four-byte
113  * string, according to the given encoding system.
114  */
115 string TextEncoder::
116 encode_wchar(char32_t ch, TextEncoder::Encoding encoding) {
117  switch (encoding) {
118  case E_iso8859:
119  if ((ch & ~0xff) == 0) {
120  return string(1, (char)ch);
121  } else {
122  // The character won't fit in the 8-bit ISO 8859. See if we can make it
123  // fit by reducing it to its ascii equivalent (essentially stripping off
124  // an unusual accent mark).
125  const UnicodeLatinMap::Entry *map_entry =
127  if (map_entry != nullptr && map_entry->_ascii_equiv != 0) {
128  // Yes, it has an ascii equivalent.
129  if (map_entry->_ascii_additional != 0) {
130  // In fact, it has two of them.
131  return
132  string(1, map_entry->_ascii_equiv) +
133  string(1, map_entry->_ascii_additional);
134  }
135  return string(1, map_entry->_ascii_equiv);
136  }
137  // Nope; return "." for lack of anything better.
138  return ".";
139  }
140 
141  case E_utf8:
142  if ((ch & ~0x7f) == 0) {
143  return string(1, (char)ch);
144  } else if ((ch & ~0x7ff) == 0) {
145  return
146  string(1, (char)((ch >> 6) | 0xc0)) +
147  string(1, (char)((ch & 0x3f) | 0x80));
148  } else if ((ch & ~0xffff) == 0) {
149  return
150  string(1, (char)((ch >> 12) | 0xe0)) +
151  string(1, (char)(((ch >> 6) & 0x3f) | 0x80)) +
152  string(1, (char)((ch & 0x3f) | 0x80));
153  } else {
154  return
155  string(1, (char)((ch >> 18) | 0xf0)) +
156  string(1, (char)(((ch >> 12) & 0x3f) | 0x80)) +
157  string(1, (char)(((ch >> 6) & 0x3f) | 0x80)) +
158  string(1, (char)((ch & 0x3f) | 0x80));
159  }
160 
161  case E_utf16be:
162  if ((ch & ~0xffff) == 0) {
163  // Note that this passes through surrogates and BOMs unharmed.
164  return
165  string(1, (char)(ch >> 8)) +
166  string(1, (char)(ch & 0xff));
167  } else {
168  // Use a surrogate pair.
169  uint32_t v = (uint32_t)ch - 0x10000u;
170  uint16_t hi = (v >> 10u) | 0xd800u;
171  uint16_t lo = (v & 0x3ffu) | 0xdc00u;
172  char encoded[4] = {
173  (char)(hi >> 8),
174  (char)(hi & 0xff),
175  (char)(lo >> 8),
176  (char)(lo & 0xff),
177  };
178  return string(encoded, 4);
179  }
180  }
181 
182  return "";
183 }
184 
185 /**
186  * Encodes a wide-text string into a single-char string, according to the
187  * given encoding.
188  */
189 string TextEncoder::
190 encode_wtext(const wstring &wtext, TextEncoder::Encoding encoding) {
191  string result;
192 
193  for (size_t i = 0; i < wtext.size(); ++i) {
194  wchar_t ch = wtext[i];
195 
196  // On some systems, wstring may be UTF-16, and contain surrogate pairs.
197 #if WCHAR_MAX < 0x10FFFF
198  if (ch >= 0xd800 && ch < 0xdc00 && (i + 1) < wtext.size()) {
199  // This is a high surrogate. Look for a subsequent low surrogate.
200  wchar_t ch2 = wtext[i + 1];
201  if (ch2 >= 0xdc00 && ch2 < 0xe000) {
202  // Yes, this is a low surrogate.
203  char32_t code_point = 0x10000 + ((ch - 0xd800) << 10) + (ch2 - 0xdc00);
204  result += encode_wchar(code_point, encoding);
205  i++;
206  continue;
207  }
208  }
209 #endif
210 
211  result += encode_wchar(ch, encoding);
212  }
213 
214  return result;
215 }
216 
217 /**
218  * Returns the given wstring decoded to a single-byte string, via the given
219  * encoding system.
220  */
221 wstring TextEncoder::
222 decode_text(const string &text, TextEncoder::Encoding encoding) {
223  switch (encoding) {
224  case E_utf8:
225  {
226  StringUtf8Decoder decoder(text);
227  return decode_text_impl(decoder);
228  }
229 
230  case E_utf16be:
231  {
232  StringUtf16Decoder decoder(text);
233  return decode_text_impl(decoder);
234  }
235 
236  case E_iso8859:
237  default:
238  {
239  StringDecoder decoder(text);
240  return decode_text_impl(decoder);
241  }
242  };
243 }
244 
245 /**
246  * Decodes the eight-bit stream from the indicated decoder, returning the
247  * decoded wide-char string.
248  */
249 wstring TextEncoder::
250 decode_text_impl(StringDecoder &decoder) {
251  wstring result;
252  // bool expand_amp = get_expand_amp();
253 
254  char32_t character = decoder.get_next_character();
255  while (!decoder.is_eof()) {
256  /*
257  if (character == '&' && expand_amp) {
258  // An ampersand in expand_amp mode is treated as an escape character.
259  character = expand_amp_sequence(decoder);
260  }
261  */
262  if (character <= WCHAR_MAX) {
263  result += character;
264  } else {
265  // We need to encode this as a surrogate pair.
266  uint32_t v = (uint32_t)character - 0x10000u;
267  result += (wchar_t)((v >> 10u) | 0xd800u);
268  result += (wchar_t)((v & 0x3ffu) | 0xdc00u);
269  }
270  character = decoder.get_next_character();
271  }
272 
273  return result;
274 }
275 
276 /**
277  * Given that we have just read an ampersand from the StringDecoder, and that
278  * we have expand_amp in effect and are therefore expected to expand the
279  * sequence that this ampersand begins into a single unicode character, do the
280  * expansion and return the character.
281  */
282 /*
283 int TextEncoder::
284 expand_amp_sequence(StringDecoder &decoder) const {
285  int result = 0;
286 
287  int character = decoder.get_next_character();
288  if (!decoder.is_eof() && character == '#') {
289  // An explicit numeric sequence: &#nnn;
290  result = 0;
291  character = decoder.get_next_character();
292  while (!decoder.is_eof() && character < 128 && isdigit((unsigned int)character)) {
293  result = (result * 10) + (character - '0');
294  character = decoder.get_next_character();
295  }
296  if (character != ';') {
297  // Invalid sequence.
298  return 0;
299  }
300 
301  return result;
302  }
303 
304  string sequence;
305 
306  // Some non-numeric sequence.
307  while (!decoder.is_eof() && character < 128 && isalpha((unsigned int)character)) {
308  sequence += character;
309  character = decoder.get_next_character();
310  }
311  if (character != ';') {
312  // Invalid sequence.
313  return 0;
314  }
315 
316  static const struct {
317  const char *name;
318  int code;
319  } tokens[] = {
320  { "amp", '&' }, { "lt", '<' }, { "gt", '>' }, { "quot", '"' },
321  { "nbsp", ' ' },
322 
323  { "iexcl", 161 }, { "cent", 162 }, { "pound", 163 }, { "curren", 164 },
324  { "yen", 165 }, { "brvbar", 166 }, { "brkbar", 166 }, { "sect", 167 },
325  { "uml", 168 }, { "die", 168 }, { "copy", 169 }, { "ordf", 170 },
326  { "laquo", 171 }, { "not", 172 }, { "shy", 173 }, { "reg", 174 },
327  { "macr", 175 }, { "hibar", 175 }, { "deg", 176 }, { "plusmn", 177 },
328  { "sup2", 178 }, { "sup3", 179 }, { "acute", 180 }, { "micro", 181 },
329  { "para", 182 }, { "middot", 183 }, { "cedil", 184 }, { "sup1", 185 },
330  { "ordm", 186 }, { "raquo", 187 }, { "frac14", 188 }, { "frac12", 189 },
331  { "frac34", 190 }, { "iquest", 191 }, { "Agrave", 192 }, { "Aacute", 193 },
332  { "Acirc", 194 }, { "Atilde", 195 }, { "Auml", 196 }, { "Aring", 197 },
333  { "AElig", 198 }, { "Ccedil", 199 }, { "Egrave", 200 }, { "Eacute", 201 },
334  { "Ecirc", 202 }, { "Euml", 203 }, { "Igrave", 204 }, { "Iacute", 205 },
335  { "Icirc", 206 }, { "Iuml", 207 }, { "ETH", 208 }, { "Dstrok", 208 },
336  { "Ntilde", 209 }, { "Ograve", 210 }, { "Oacute", 211 }, { "Ocirc", 212 },
337  { "Otilde", 213 }, { "Ouml", 214 }, { "times", 215 }, { "Oslash", 216 },
338  { "Ugrave", 217 }, { "Uacute", 218 }, { "Ucirc", 219 }, { "Uuml", 220 },
339  { "Yacute", 221 }, { "THORN", 222 }, { "szlig", 223 }, { "agrave", 224 },
340  { "aacute", 225 }, { "acirc", 226 }, { "atilde", 227 }, { "auml", 228 },
341  { "aring", 229 }, { "aelig", 230 }, { "ccedil", 231 }, { "egrave", 232 },
342  { "eacute", 233 }, { "ecirc", 234 }, { "euml", 235 }, { "igrave", 236 },
343  { "iacute", 237 }, { "icirc", 238 }, { "iuml", 239 }, { "eth", 240 },
344  { "ntilde", 241 }, { "ograve", 242 }, { "oacute", 243 }, { "ocirc", 244 },
345  { "otilde", 245 }, { "ouml", 246 }, { "divide", 247 }, { "oslash", 248 },
346  { "ugrave", 249 }, { "uacute", 250 }, { "ucirc", 251 }, { "uuml", 252 },
347  { "yacute", 253 }, { "thorn", 254 }, { "yuml", 255 },
348 
349  { NULL, 0 },
350  };
351 
352  for (int i = 0; tokens[i].name != NULL; i++) {
353  if (sequence == tokens[i].name) {
354  // Here's a match.
355  return tokens[i].code;
356  }
357  }
358 
359  // Some unrecognized sequence.
360  return 0;
361 }
362 */
363 
364 /**
365  * Called whenever the text has been changed.
366  */
367 void TextEncoder::
368 text_changed() {
369 }
370 
371 /**
372  *
373  */
374 ostream &
375 operator << (ostream &out, TextEncoder::Encoding encoding) {
376  switch (encoding) {
377  case TextEncoder::E_iso8859:
378  return out << "iso8859";
379 
380  case TextEncoder::E_utf8:
381  return out << "utf8";
382 
383  case TextEncoder::E_utf16be:
384  return out << "utf16be";
385  };
386 
387  return out << "**invalid TextEncoder::Encoding(" << (int)encoding << ")**";
388 }
389 
390 /**
391  *
392  */
393 istream &
394 operator >> (istream &in, TextEncoder::Encoding &encoding) {
395  string word;
396  in >> word;
397 
398  if (word == "iso8859") {
399  encoding = TextEncoder::E_iso8859;
400  } else if (word == "utf8" || word == "utf-8") {
401  encoding = TextEncoder::E_utf8;
402  } else if (word == "unicode" || word == "utf16be" || word == "utf-16be" ||
403  word == "utf16-be" || word == "utf-16-be") {
404  encoding = TextEncoder::E_utf16be;
405  } else {
406  ostream *notify_ptr = StringDecoder::get_notify_ptr();
407  if (notify_ptr != nullptr) {
408  (*notify_ptr)
409  << "Invalid TextEncoder::Encoding: " << word << "\n";
410  }
411  encoding = TextEncoder::E_iso8859;
412  }
413 
414  return in;
415 }
PANDA 3D SOFTWARE Copyright (c) Carnegie Mellon University.
static int unicode_toupper(char32_t character)
Returns the uppercase equivalent of the given Unicode character.
Definition: textEncoder.I:372
static std::string encode_wchar(char32_t ch, Encoding encoding)
Encodes a single Unicode character into a one-, two-, three-, or four-byte string,...
This decoder extracts characters two at a time to get a plain wide character sequence.
Definition: stringDecoder.h:58
This decoder extracts utf-8 sequences.
Definition: stringDecoder.h:47
void make_lower()
Adjusts the text stored within the encoder to all lowercase letters (preserving accent marks correctl...
Definition: textEncoder.cxx:46
virtual char32_t get_next_character()
Returns the next character in sequence.
bool is_wtext() const
Returns true if any of the characters in the string returned by get_wtext() are out of the range of a...
Definition: textEncoder.cxx:99
static const Entry * look_up(char32_t character)
Returns the Entry associated with the indicated character, if there is one.
PANDA 3D SOFTWARE Copyright (c) Carnegie Mellon University.
static int unicode_tolower(char32_t character)
Returns the lowercase equivalent of the given Unicode character.
Definition: textEncoder.I:385
std::wstring get_wtext_as_ascii() const
Returns the text associated with the node, converted as nearly as possible to a fully-ASCII represent...
Definition: textEncoder.cxx:70
std::string encode_wtext(const std::wstring &wtext) const
Encodes a wide-text string into a single-char string, according to the current encoding.
Definition: textEncoder.I:481
The base class to a family of classes that decode various kinds of encoded byte streams.
Definition: stringDecoder.h:24
PANDA 3D SOFTWARE Copyright (c) Carnegie Mellon University.
bool is_eof()
Returns true if the decoder has returned the last character in the string, false if there are more to...
Definition: stringDecoder.I:28
const std::wstring & get_wtext() const
Returns the text associated with the TextEncoder, as a wide-character string.
Definition: textEncoder.I:456
std::wstring decode_text(const std::string &text) const
Returns the given single-byte string decoded to a wide-character string, via the current encoding system.
Definition: textEncoder.I:490
void make_upper()
Adjusts the text stored within the encoder to all uppercase letters (preserving accent marks correctl...
Definition: textEncoder.cxx:31
static std::ostream * get_notify_ptr()
Returns the ostream that is used to write error messages to.
PANDA 3D SOFTWARE Copyright (c) Carnegie Mellon University.