Panda3D
textEncoder.cxx
Go to the documentation of this file.
1 /**
2  * PANDA 3D SOFTWARE
3  * Copyright (c) Carnegie Mellon University. All rights reserved.
4  *
5  * All use of this software is subject to the terms of the revised BSD
6  * license. You should have received a copy of this license along
7  * with this source code in a file named "LICENSE."
8  *
9  * @file textEncoder.cxx
10  * @author drose
11  * @date 2003-03-26
12  */
13 
14 #include "textEncoder.h"
15 #include "stringDecoder.h"
16 #include "unicodeLatinMap.h"
17 #include "config_dtoolutil.h"
18 
19 using std::istream;
20 using std::ostream;
21 using std::string;
22 using std::wstring;
23 
// Definition of the class-wide static _default_encoding member; its initial
// value is E_utf8.
TextEncoder::Encoding TextEncoder::_default_encoding = TextEncoder::E_utf8;
25 
26 /**
27  * Adjusts the text stored within the encoder to all uppercase letters
28  * (preserving accent marks correctly).
29  */
31 make_upper() {
32  get_wtext();
33  wstring::iterator si;
34  for (si = _wtext.begin(); si != _wtext.end(); ++si) {
35  (*si) = unicode_toupper(*si);
36  }
37  _flags &= ~F_got_text;
38  text_changed();
39 }
40 
41 /**
42  * Adjusts the text stored within the encoder to all lowercase letters
43  * (preserving accent marks correctly).
44  */
46 make_lower() {
47  get_wtext();
48  wstring::iterator si;
49  for (si = _wtext.begin(); si != _wtext.end(); ++si) {
50  (*si) = unicode_tolower(*si);
51  }
52  _flags &= ~F_got_text;
53  text_changed();
54 }
55 
56 /**
57  * Returns the text associated with the node, converted as nearly as possible
58  * to a fully-ASCII representation. This means replacing accented letters
59  * with their unaccented ASCII equivalents.
60  *
61  * It is possible that some characters in the string cannot be converted to
62  * ASCII. (The string may involve symbols like the copyright symbol, for
63  * instance, or it might involve letters in some other alphabet such as Greek
64  * or Cyrillic, or even Latin letters like thorn or eth that are not part of
65  * the ASCII character set.) In this case, as much of the string as possible
66  * will be converted to ASCII, and the nonconvertible characters will remain
67  * in their original form.
68  */
70 get_wtext_as_ascii() const {
71  get_wtext();
72  wstring result;
73  wstring::const_iterator si;
74  for (si = _wtext.begin(); si != _wtext.end(); ++si) {
75  wchar_t character = (*si);
76 
77  const UnicodeLatinMap::Entry *map_entry =
78  UnicodeLatinMap::look_up(character);
79  if (map_entry != nullptr && map_entry->_ascii_equiv != 0) {
80  result += (wchar_t)map_entry->_ascii_equiv;
81  if (map_entry->_ascii_additional != 0) {
82  result += (wchar_t)map_entry->_ascii_additional;
83  }
84 
85  } else {
86  result += character;
87  }
88  }
89 
90  return result;
91 }
92 
93 /**
94  * Returns true if any of the characters in the string returned by get_wtext()
95  * are out of the range of an ASCII character (and, therefore, get_wtext()
96  * should be called in preference to get_text()).
97  */
99 is_wtext() const {
100  get_wtext();
101  wstring::const_iterator ti;
102  for (ti = _wtext.begin(); ti != _wtext.end(); ++ti) {
103  if (((*ti) & ~0x7f) != 0) {
104  return true;
105  }
106  }
107 
108  return false;
109 }
110 
111 /**
112  * Encodes a single Unicode character into a one-, two-, three-, or four-byte
113  * string, according to the given encoding system.
114  */
116 encode_wchar(char32_t ch, TextEncoder::Encoding encoding) {
117  switch (encoding) {
118  case E_iso8859:
119  if ((ch & ~0xff) == 0) {
120  return string(1, (char)ch);
121  } else {
122  // The character won't fit in the 8-bit ISO 8859. See if we can make it
123  // fit by reducing it to its ascii equivalent (essentially stripping off
124  // an unusual accent mark).
125  const UnicodeLatinMap::Entry *map_entry =
127  if (map_entry != nullptr && map_entry->_ascii_equiv != 0) {
128  // Yes, it has an ascii equivalent.
129  if (map_entry->_ascii_additional != 0) {
130  // In fact, it has two of them.
131  return
132  string(1, map_entry->_ascii_equiv) +
133  string(1, map_entry->_ascii_additional);
134  }
135  return string(1, map_entry->_ascii_equiv);
136  }
137  // Nope; return "." for lack of anything better.
138  return ".";
139  }
140 
141  case E_utf8:
142  if ((ch & ~0x7f) == 0) {
143  return string(1, (char)ch);
144  } else if ((ch & ~0x7ff) == 0) {
145  return
146  string(1, (char)((ch >> 6) | 0xc0)) +
147  string(1, (char)((ch & 0x3f) | 0x80));
148  } else if ((ch & ~0xffff) == 0) {
149  return
150  string(1, (char)((ch >> 12) | 0xe0)) +
151  string(1, (char)(((ch >> 6) & 0x3f) | 0x80)) +
152  string(1, (char)((ch & 0x3f) | 0x80));
153  } else {
154  return
155  string(1, (char)((ch >> 18) | 0xf0)) +
156  string(1, (char)(((ch >> 12) & 0x3f) | 0x80)) +
157  string(1, (char)(((ch >> 6) & 0x3f) | 0x80)) +
158  string(1, (char)((ch & 0x3f) | 0x80));
159  }
160 
161  case E_utf16be:
162  if ((ch & ~0xffff) == 0) {
163  // Note that this passes through surrogates and BOMs unharmed.
164  return
165  string(1, (char)(ch >> 8)) +
166  string(1, (char)(ch & 0xff));
167  } else {
168  // Use a surrogate pair.
169  uint32_t v = (uint32_t)ch - 0x10000u;
170  uint16_t hi = (v >> 10u) | 0xd800u;
171  uint16_t lo = (v & 0x3ffu) | 0xdc00u;
172  char encoded[4] = {
173  (char)(hi >> 8),
174  (char)(hi & 0xff),
175  (char)(lo >> 8),
176  (char)(lo & 0xff),
177  };
178  return string(encoded, 4);
179  }
180  }
181 
182  return "";
183 }
184 
185 /**
186  * Encodes a wide-text string into a single-char string, according to the
187  * given encoding.
188  */
190 encode_wtext(const wstring &wtext, TextEncoder::Encoding encoding) {
191  string result;
192 
193  for (size_t i = 0; i < wtext.size(); ++i) {
194  wchar_t ch = wtext[i];
195 
196  // On some systems, wstring may be UTF-16, and contain surrogate pairs.
197 #if WCHAR_MAX < 0x10FFFF
198  if (ch >= 0xd800 && ch < 0xdc00 && (i + 1) < wtext.size()) {
199  // This is a high surrogate. Look for a subsequent low surrogate.
200  wchar_t ch2 = wtext[i + 1];
201  if (ch2 >= 0xdc00 && ch2 < 0xe000) {
202  // Yes, this is a low surrogate.
203  char32_t code_point = 0x10000 + ((ch - 0xd800) << 10) + (ch2 - 0xdc00);
204  result += encode_wchar(code_point, encoding);
205  i++;
206  continue;
207  }
208  }
209 #endif
210 
211  result += encode_wchar(ch, encoding);
212  }
213 
214  return result;
215 }
216 
217 /**
218  * Returns the given wstring decoded to a single-byte string, via the given
219  * encoding system.
220  */
222 decode_text(const string &text, TextEncoder::Encoding encoding) {
223  switch (encoding) {
224  case E_utf8:
225  {
226  StringUtf8Decoder decoder(text);
227  return decode_text_impl(decoder);
228  }
229 
230  case E_utf16be:
231  {
232  StringUtf16Decoder decoder(text);
233  return decode_text_impl(decoder);
234  }
235 
236  case E_iso8859:
237  default:
238  {
239  StringDecoder decoder(text);
240  return decode_text_impl(decoder);
241  }
242  };
243 }
244 
245 /**
246  * Decodes the eight-bit stream from the indicated decoder, returning the
247  * decoded wide-char string.
248  */
249 wstring TextEncoder::
250 decode_text_impl(StringDecoder &decoder) {
251  wstring result;
252  // bool expand_amp = get_expand_amp();
253 
254  char32_t character = decoder.get_next_character();
255  while (!decoder.is_eof()) {
256  /*
257  if (character == '&' && expand_amp) {
258  // An ampersand in expand_amp mode is treated as an escape character.
259  character = expand_amp_sequence(decoder);
260  }
261  */
262  if (character <= WCHAR_MAX) {
263  result += character;
264  } else {
265  // We need to encode this as a surrogate pair.
266  uint32_t v = (uint32_t)character - 0x10000u;
267  result += (wchar_t)((v >> 10u) | 0xd800u);
268  result += (wchar_t)((v & 0x3ffu) | 0xdc00u);
269  }
270  character = decoder.get_next_character();
271  }
272 
273  return result;
274 }
275 
276 /**
277  * Given that we have just read an ampersand from the StringDecoder, and that
278  * we have expand_amp in effect and are therefore expected to expand the
279  * sequence that this ampersand begins into a single unicode character, do the
280  * expansion and return the character.
281  */
282 /*
283 int TextEncoder::
284 expand_amp_sequence(StringDecoder &decoder) const {
285  int result = 0;
286 
287  int character = decoder.get_next_character();
288  if (!decoder.is_eof() && character == '#') {
289  // An explicit numeric sequence: &#nnn;
290  result = 0;
291  character = decoder.get_next_character();
292  while (!decoder.is_eof() && character < 128 && isdigit((unsigned int)character)) {
293  result = (result * 10) + (character - '0');
294  character = decoder.get_next_character();
295  }
296  if (character != ';') {
297  // Invalid sequence.
298  return 0;
299  }
300 
301  return result;
302  }
303 
304  string sequence;
305 
306  // Some non-numeric sequence.
307  while (!decoder.is_eof() && character < 128 && isalpha((unsigned int)character)) {
308  sequence += character;
309  character = decoder.get_next_character();
310  }
311  if (character != ';') {
312  // Invalid sequence.
313  return 0;
314  }
315 
316  static const struct {
317  const char *name;
318  int code;
319  } tokens[] = {
320  { "amp", '&' }, { "lt", '<' }, { "gt", '>' }, { "quot", '"' },
321  { "nbsp", ' ' },
322 
323  { "iexcl", 161 }, { "cent", 162 }, { "pound", 163 }, { "curren", 164 },
324  { "yen", 165 }, { "brvbar", 166 }, { "brkbar", 166 }, { "sect", 167 },
325  { "uml", 168 }, { "die", 168 }, { "copy", 169 }, { "ordf", 170 },
326  { "laquo", 171 }, { "not", 172 }, { "shy", 173 }, { "reg", 174 },
327  { "macr", 175 }, { "hibar", 175 }, { "deg", 176 }, { "plusmn", 177 },
328  { "sup2", 178 }, { "sup3", 179 }, { "acute", 180 }, { "micro", 181 },
329  { "para", 182 }, { "middot", 183 }, { "cedil", 184 }, { "sup1", 185 },
330  { "ordm", 186 }, { "raquo", 187 }, { "frac14", 188 }, { "frac12", 189 },
331  { "frac34", 190 }, { "iquest", 191 }, { "Agrave", 192 }, { "Aacute", 193 },
332  { "Acirc", 194 }, { "Atilde", 195 }, { "Auml", 196 }, { "Aring", 197 },
333  { "AElig", 198 }, { "Ccedil", 199 }, { "Egrave", 200 }, { "Eacute", 201 },
334  { "Ecirc", 202 }, { "Euml", 203 }, { "Igrave", 204 }, { "Iacute", 205 },
335  { "Icirc", 206 }, { "Iuml", 207 }, { "ETH", 208 }, { "Dstrok", 208 },
336  { "Ntilde", 209 }, { "Ograve", 210 }, { "Oacute", 211 }, { "Ocirc", 212 },
337  { "Otilde", 213 }, { "Ouml", 214 }, { "times", 215 }, { "Oslash", 216 },
338  { "Ugrave", 217 }, { "Uacute", 218 }, { "Ucirc", 219 }, { "Uuml", 220 },
339  { "Yacute", 221 }, { "THORN", 222 }, { "szlig", 223 }, { "agrave", 224 },
340  { "aacute", 225 }, { "acirc", 226 }, { "atilde", 227 }, { "auml", 228 },
341  { "aring", 229 }, { "aelig", 230 }, { "ccedil", 231 }, { "egrave", 232 },
342  { "eacute", 233 }, { "ecirc", 234 }, { "euml", 235 }, { "igrave", 236 },
343  { "iacute", 237 }, { "icirc", 238 }, { "iuml", 239 }, { "eth", 240 },
344  { "ntilde", 241 }, { "ograve", 242 }, { "oacute", 243 }, { "ocirc", 244 },
345  { "otilde", 245 }, { "ouml", 246 }, { "divide", 247 }, { "oslash", 248 },
346  { "ugrave", 249 }, { "uacute", 250 }, { "ucirc", 251 }, { "uuml", 252 },
347  { "yacute", 253 }, { "thorn", 254 }, { "yuml", 255 },
348 
349  { NULL, 0 },
350  };
351 
352  for (int i = 0; tokens[i].name != NULL; i++) {
353  if (sequence == tokens[i].name) {
354  // Here's a match.
355  return tokens[i].code;
356  }
357  }
358 
359  // Some unrecognized sequence.
360  return 0;
361 }
362 */
363 
364 /**
365  * Called whenever the text has been changed.
366  */
367 void TextEncoder::
368 text_changed() {
369 }
370 
371 /**
372  *
373  */
374 ostream &
375 operator << (ostream &out, TextEncoder::Encoding encoding) {
376  switch (encoding) {
377  case TextEncoder::E_iso8859:
378  return out << "iso8859";
379 
380  case TextEncoder::E_utf8:
381  return out << "utf8";
382 
383  case TextEncoder::E_utf16be:
384  return out << "utf16be";
385  };
386 
387  return out << "**invalid TextEncoder::Encoding(" << (int)encoding << ")**";
388 }
389 
390 /**
391  *
392  */
393 istream &
394 operator >> (istream &in, TextEncoder::Encoding &encoding) {
395  string word;
396  in >> word;
397 
398  if (word == "iso8859") {
399  encoding = TextEncoder::E_iso8859;
400  } else if (word == "utf8" || word == "utf-8") {
401  encoding = TextEncoder::E_utf8;
402  } else if (word == "unicode" || word == "utf16be" || word == "utf-16be" ||
403  word == "utf16-be" || word == "utf-16-be") {
404  encoding = TextEncoder::E_utf16be;
405  } else {
406  ostream *notify_ptr = StringDecoder::get_notify_ptr();
407  if (notify_ptr != nullptr) {
408  (*notify_ptr)
409  << "Invalid TextEncoder::Encoding: " << word << "\n";
410  }
411  encoding = TextEncoder::E_iso8859;
412  }
413 
414  return in;
415 }
The base class to a family of classes that decode various kinds of encoded byte streams.
Definition: stringDecoder.h:24
bool is_eof()
Returns true if the decoder has returned the last character in the string, false if there are more to...
Definition: stringDecoder.I:28
virtual char32_t get_next_character()
Returns the next character in sequence.
static std::ostream * get_notify_ptr()
Returns the ostream that is used to write error messages to.
This decoder extracts characters two at a time to get a plain wide character sequence.
Definition: stringDecoder.h:58
This decoder extracts utf-8 sequences.
Definition: stringDecoder.h:47
std::wstring decode_text(const std::string &text) const
Returns the given single-byte string decoded to a wstring, via the current encoding system.
Definition: textEncoder.I:490
static std::string encode_wchar(char32_t ch, Encoding encoding)
Encodes a single Unicode character into a one-, two-, three-, or four-byte string,...
static int unicode_tolower(char32_t character)
Returns the lowercase equivalent of the given Unicode character.
Definition: textEncoder.I:385
bool is_wtext() const
Returns true if any of the characters in the string returned by get_wtext() are out of the range of a...
Definition: textEncoder.cxx:99
static int unicode_toupper(char32_t character)
Returns the uppercase equivalent of the given Unicode character.
Definition: textEncoder.I:372
std::wstring get_wtext_as_ascii() const
Returns the text associated with the node, converted as nearly as possible to a fully-ASCII represent...
Definition: textEncoder.cxx:70
const std::wstring & get_wtext() const
Returns the text associated with the TextEncoder, as a wide-character string.
Definition: textEncoder.I:456
void make_lower()
Adjusts the text stored within the encoder to all lowercase letters (preserving accent marks correctl...
Definition: textEncoder.cxx:46
void make_upper()
Adjusts the text stored within the encoder to all uppercase letters (preserving accent marks correctl...
Definition: textEncoder.cxx:31
std::string encode_wtext(const std::wstring &wtext) const
Encodes a wide-text string into a single-char string, according to the current encoding.
Definition: textEncoder.I:481
static const Entry * look_up(char32_t character)
Returns the Entry associated with the indicated character, if there is one.
PANDA 3D SOFTWARE Copyright (c) Carnegie Mellon University.
PANDA 3D SOFTWARE Copyright (c) Carnegie Mellon University.
PANDA 3D SOFTWARE Copyright (c) Carnegie Mellon University.
PANDA 3D SOFTWARE Copyright (c) Carnegie Mellon University.