Panda3D
 All Classes Functions Variables Enumerations
textEncoder.cxx
1 // Filename: textEncoder.cxx
2 // Created by: drose (26Mar03)
3 //
4 ////////////////////////////////////////////////////////////////////
5 //
6 // PANDA 3D SOFTWARE
7 // Copyright (c) Carnegie Mellon University. All rights reserved.
8 //
9 // All use of this software is subject to the terms of the revised BSD
10 // license. You should have received a copy of this license along
11 // with this source code in a file named "LICENSE."
12 //
13 ////////////////////////////////////////////////////////////////////
14 
15 #include "textEncoder.h"
16 #include "stringDecoder.h"
17 #include "unicodeLatinMap.h"
18 #include "config_dtoolutil.h"
19 
// Definition of the static member TextEncoder::_default_encoding; it
// is initialized to E_iso8859.
20 TextEncoder::Encoding TextEncoder::_default_encoding = TextEncoder::E_iso8859;
21 
22 ////////////////////////////////////////////////////////////////////
23 // Function: TextEncoder::make_upper
24 // Access: Published
25 // Description: Adjusts the text stored within the encoder to all
26 // uppercase letters (preserving accent marks
27 // correctly).
28 ////////////////////////////////////////////////////////////////////
29 void TextEncoder::
31  get_wtext();
32  wstring::iterator si;
33  for (si = _wtext.begin(); si != _wtext.end(); ++si) {
34  (*si) = unicode_toupper(*si);
35  }
36  _flags &= ~F_got_text;
37 }
38 
39 ////////////////////////////////////////////////////////////////////
40 // Function: TextEncoder::make_lower
41 // Access: Published
42 // Description: Adjusts the text stored within the encoder to all
43 // lowercase letters (preserving accent marks
44 // correctly).
45 ////////////////////////////////////////////////////////////////////
46 void TextEncoder::
48  get_wtext();
49  wstring::iterator si;
50  for (si = _wtext.begin(); si != _wtext.end(); ++si) {
51  (*si) = unicode_tolower(*si);
52  }
53  _flags &= ~F_got_text;
54 }
55 
56 ////////////////////////////////////////////////////////////////////
57 // Function: TextEncoder::get_wtext_as_ascii
58 // Access: Published
59 // Description: Returns the text associated with the node, converted
60 // as nearly as possible to a fully-ASCII
61 // representation. This means replacing accented
62 // letters with their unaccented ASCII equivalents.
63 //
64 // It is possible that some characters in the string
65 // cannot be converted to ASCII. (The string may
66 // involve symbols like the copyright symbol, for
67 // instance, or it might involve letters in some other
68 // alphabet such as Greek or Cyrillic, or even Latin
69 // letters like thorn or eth that are not part of the
70 // ASCII character set.) In this case, as much of the
71 // string as possible will be converted to ASCII, and
72 // the nonconvertible characters will remain in their
73 // original form.
74 ////////////////////////////////////////////////////////////////////
75 wstring TextEncoder::
77  get_wtext();
78  wstring result;
79  wstring::const_iterator si;
80  for (si = _wtext.begin(); si != _wtext.end(); ++si) {
81  wchar_t character = (*si);
82 
83  const UnicodeLatinMap::Entry *map_entry =
84  UnicodeLatinMap::look_up(character);
85  if (map_entry != NULL && map_entry->_ascii_equiv != 0) {
86  result += (wchar_t)map_entry->_ascii_equiv;
87  if (map_entry->_ascii_additional != 0) {
88  result += (wchar_t)map_entry->_ascii_additional;
89  }
90 
91  } else {
92  result += character;
93  }
94  }
95 
96  return result;
97 }
98 
99 ////////////////////////////////////////////////////////////////////
100 // Function: TextEncoder::is_wtext
101 // Access: Published
102 // Description: Returns true if any of the characters in the string
103 // returned by get_wtext() are out of the range of an
104 // ASCII character (and, therefore, get_wtext() should
105 // be called in preference to get_text()).
106 ////////////////////////////////////////////////////////////////////
107 bool TextEncoder::
108 is_wtext() const {
109  get_wtext();
110  wstring::const_iterator ti;
111  for (ti = _wtext.begin(); ti != _wtext.end(); ++ti) {
112  if (((*ti) & ~0x7f) != 0) {
113  return true;
114  }
115  }
116 
117  return false;
118 }
119 
120 ////////////////////////////////////////////////////////////////////
121 // Function: TextEncoder::encode_wchar
122 // Access: Published, Static
123 // Description: Encodes a single wide char into a one-, two-, or
124 // three-byte string, according to the given encoding
125 // system.
126 ////////////////////////////////////////////////////////////////////
127 string TextEncoder::
128 encode_wchar(wchar_t ch, TextEncoder::Encoding encoding) {
129  switch (encoding) {
130  case E_iso8859:
131  if ((ch & ~0xff) == 0) {
132  return string(1, (char)ch);
133  } else {
134  // The character won't fit in the 8-bit ISO 8859. See if we can
135  // make it fit by reducing it to its ascii equivalent
136  // (essentially stripping off an unusual accent mark).
137  const UnicodeLatinMap::Entry *map_entry =
139  if (map_entry != NULL && map_entry->_ascii_equiv != 0) {
140  // Yes, it has an ascii equivalent.
141  if (map_entry->_ascii_additional != 0) {
142  // In fact, it has two of them.
143  return
144  string(1, map_entry->_ascii_equiv) +
145  string(1, map_entry->_ascii_additional);
146  }
147  return string(1, map_entry->_ascii_equiv);
148  }
149  // Nope; return "." for lack of anything better.
150  return ".";
151  }
152 
153  case E_utf8:
154  if ((ch & ~0x7f) == 0) {
155  return string(1, (char)ch);
156  } else if ((ch & ~0x7ff) == 0) {
157  return
158  string(1, (char)((ch >> 6) | 0xc0)) +
159  string(1, (char)((ch & 0x3f) | 0x80));
160  } else {
161  return
162  string(1, (char)((ch >> 12) | 0xe0)) +
163  string(1, (char)(((ch >> 6) & 0x3f) | 0x80)) +
164  string(1, (char)((ch & 0x3f) | 0x80));
165  }
166 
167  case E_unicode:
168  return
169  string(1, (char)(ch >> 8)) +
170  string(1, (char)(ch & 0xff));
171  }
172 
173  return "";
174 }
175 
176 ////////////////////////////////////////////////////////////////////
177 // Function: TextEncoder::encode_wtext
178 // Access: Published, Static
179 // Description: Encodes a wide-text string into a single-char string,
180 // according to the given encoding.
181 ////////////////////////////////////////////////////////////////////
182 string TextEncoder::
183 encode_wtext(const wstring &wtext, TextEncoder::Encoding encoding) {
184  string result;
185 
186  for (wstring::const_iterator pi = wtext.begin(); pi != wtext.end(); ++pi) {
187  result += encode_wchar(*pi, encoding);
188  }
189 
190  return result;
191 }
192 
193 ////////////////////////////////////////////////////////////////////
194 // Function: TextEncoder::decode_text
195 // Access: Published, Static
196 // Description: Returns the given wstring decoded to a single-byte
197 // string, via the given encoding system.
198 ////////////////////////////////////////////////////////////////////
199 wstring TextEncoder::
200 decode_text(const string &text, TextEncoder::Encoding encoding) {
201  switch (encoding) {
202  case E_utf8:
203  {
204  StringUtf8Decoder decoder(text);
205  return decode_text_impl(decoder);
206  }
207 
208  case E_unicode:
209  {
210  StringUnicodeDecoder decoder(text);
211  return decode_text_impl(decoder);
212  }
213 
214  case E_iso8859:
215  default:
216  {
217  StringDecoder decoder(text);
218  return decode_text_impl(decoder);
219  }
220  };
221 }
222 
223 ////////////////////////////////////////////////////////////////////
224 // Function: TextEncoder::decode_text_impl
225 // Access: Private, Static
226 // Description: Decodes the eight-bit stream from the indicated
227 // decoder, returning the decoded wide-char string.
228 ////////////////////////////////////////////////////////////////////
229 wstring TextEncoder::
230 decode_text_impl(StringDecoder &decoder) {
231  wstring result;
232  // bool expand_amp = get_expand_amp();
233 
234  wchar_t character = decoder.get_next_character();
235  while (!decoder.is_eof()) {
236  /*
237  if (character == '&' && expand_amp) {
238  // An ampersand in expand_amp mode is treated as an escape
239  // character.
240  character = expand_amp_sequence(decoder);
241  }
242  */
243  result += character;
244  character = decoder.get_next_character();
245  }
246 
247  return result;
248 }
249 
250 /*
251 ////////////////////////////////////////////////////////////////////
252 // Function: TextEncoder::expand_amp_sequence
253 // Access: Private
254 // Description: Given that we have just read an ampersand from the
255 // StringDecoder, and that we have expand_amp in effect
256 // and are therefore expected to expand the sequence
257 // that this ampersand begins into a single unicode
258 // character, do the expansion and return the character.
259 ////////////////////////////////////////////////////////////////////
260 int TextEncoder::
261 expand_amp_sequence(StringDecoder &decoder) const {
262  int result = 0;
263 
264  int character = decoder.get_next_character();
265  if (!decoder.is_eof() && character == '#') {
266  // An explicit numeric sequence: &#nnn;
267  result = 0;
268  character = decoder.get_next_character();
269  while (!decoder.is_eof() && character < 128 && isdigit((unsigned int)character)) {
270  result = (result * 10) + (character - '0');
271  character = decoder.get_next_character();
272  }
273  if (character != ';') {
274  // Invalid sequence.
275  return 0;
276  }
277 
278  return result;
279  }
280 
281  string sequence;
282 
283  // Some non-numeric sequence.
284  while (!decoder.is_eof() && character < 128 && isalpha((unsigned int)character)) {
285  sequence += character;
286  character = decoder.get_next_character();
287  }
288  if (character != ';') {
289  // Invalid sequence.
290  return 0;
291  }
292 
293  static const struct {
294  const char *name;
295  int code;
296  } tokens[] = {
297  { "amp", '&' }, { "lt", '<' }, { "gt", '>' }, { "quot", '"' },
298  { "nbsp", ' ' },
299 
300  { "iexcl", 161 }, { "cent", 162 }, { "pound", 163 }, { "curren", 164 },
301  { "yen", 165 }, { "brvbar", 166 }, { "brkbar", 166 }, { "sect", 167 },
302  { "uml", 168 }, { "die", 168 }, { "copy", 169 }, { "ordf", 170 },
303  { "laquo", 171 }, { "not", 172 }, { "shy", 173 }, { "reg", 174 },
304  { "macr", 175 }, { "hibar", 175 }, { "deg", 176 }, { "plusmn", 177 },
305  { "sup2", 178 }, { "sup3", 179 }, { "acute", 180 }, { "micro", 181 },
306  { "para", 182 }, { "middot", 183 }, { "cedil", 184 }, { "sup1", 185 },
307  { "ordm", 186 }, { "raquo", 187 }, { "frac14", 188 }, { "frac12", 189 },
308  { "frac34", 190 }, { "iquest", 191 }, { "Agrave", 192 }, { "Aacute", 193 },
309  { "Acirc", 194 }, { "Atilde", 195 }, { "Auml", 196 }, { "Aring", 197 },
310  { "AElig", 198 }, { "Ccedil", 199 }, { "Egrave", 200 }, { "Eacute", 201 },
311  { "Ecirc", 202 }, { "Euml", 203 }, { "Igrave", 204 }, { "Iacute", 205 },
312  { "Icirc", 206 }, { "Iuml", 207 }, { "ETH", 208 }, { "Dstrok", 208 },
313  { "Ntilde", 209 }, { "Ograve", 210 }, { "Oacute", 211 }, { "Ocirc", 212 },
314  { "Otilde", 213 }, { "Ouml", 214 }, { "times", 215 }, { "Oslash", 216 },
315  { "Ugrave", 217 }, { "Uacute", 218 }, { "Ucirc", 219 }, { "Uuml", 220 },
316  { "Yacute", 221 }, { "THORN", 222 }, { "szlig", 223 }, { "agrave", 224 },
317  { "aacute", 225 }, { "acirc", 226 }, { "atilde", 227 }, { "auml", 228 },
318  { "aring", 229 }, { "aelig", 230 }, { "ccedil", 231 }, { "egrave", 232 },
319  { "eacute", 233 }, { "ecirc", 234 }, { "euml", 235 }, { "igrave", 236 },
320  { "iacute", 237 }, { "icirc", 238 }, { "iuml", 239 }, { "eth", 240 },
321  { "ntilde", 241 }, { "ograve", 242 }, { "oacute", 243 }, { "ocirc", 244 },
322  { "otilde", 245 }, { "ouml", 246 }, { "divide", 247 }, { "oslash", 248 },
323  { "ugrave", 249 }, { "uacute", 250 }, { "ucirc", 251 }, { "uuml", 252 },
324  { "yacute", 253 }, { "thorn", 254 }, { "yuml", 255 },
325 
326  { NULL, 0 },
327  };
328 
329  for (int i = 0; tokens[i].name != NULL; i++) {
330  if (sequence == tokens[i].name) {
331  // Here's a match.
332  return tokens[i].code;
333  }
334  }
335 
336  // Some unrecognized sequence.
337  return 0;
338 }
339 */
340 
341 
342 ////////////////////////////////////////////////////////////////////
343 // Function: TextEncoder::Encoding ostream operator
344 // Description:
345 ////////////////////////////////////////////////////////////////////
346 ostream &
347 operator << (ostream &out, TextEncoder::Encoding encoding) {
348  switch (encoding) {
349  case TextEncoder::E_iso8859:
350  return out << "iso8859";
351 
352  case TextEncoder::E_utf8:
353  return out << "utf8";
354 
355  case TextEncoder::E_unicode:
356  return out << "unicode";
357  };
358 
359  return out << "**invalid TextEncoder::Encoding(" << (int)encoding << ")**";
360 }
361 
362 ////////////////////////////////////////////////////////////////////
363 // Function: TextEncoder::Encoding istream operator
364 // Description:
365 ////////////////////////////////////////////////////////////////////
366 istream &
367 operator >> (istream &in, TextEncoder::Encoding &encoding) {
368  string word;
369  in >> word;
370 
371  if (word == "iso8859") {
372  encoding = TextEncoder::E_iso8859;
373  } else if (word == "utf8" || word == "utf-8") {
374  encoding = TextEncoder::E_utf8;
375  } else if (word == "unicode") {
376  encoding = TextEncoder::E_unicode;
377  } else {
378  ostream *notify_ptr = StringDecoder::get_notify_ptr();
379  if (notify_ptr != (ostream *)NULL) {
380  (*notify_ptr)
381  << "Invalid TextEncoder::Encoding: " << word << "\n";
382  }
383  encoding = TextEncoder::E_iso8859;
384  }
385 
386  return in;
387 }
const wstring & get_wtext() const
Returns the text associated with the TextEncoder, as a wide-character string.
Definition: textEncoder.I:530
This decoder extracts utf-8 sequences.
Definition: stringDecoder.h:52
string encode_wtext(const wstring &wtext) const
Encodes a wide-text string into a single-char string, according to the current encoding.
Definition: textEncoder.I:557
void make_lower()
Adjusts the text stored within the encoder to all lowercase letters (preserving accent marks correctl...
Definition: textEncoder.cxx:47
wstring get_wtext_as_ascii() const
Returns the text associated with the node, converted as nearly as possible to a fully-ASCII represent...
Definition: textEncoder.cxx:76
static int unicode_tolower(int character)
Returns the lowercase equivalent of the given Unicode character.
Definition: textEncoder.I:447
virtual int get_next_character()
Returns the next character in sequence.
static string encode_wchar(wchar_t ch, Encoding encoding)
Encodes a single wide char into a one-, two-, or three-byte string, according to the given encoding s...
This decoder extracts characters two at a time to get a plain wide character sequence.
Definition: stringDecoder.h:64
wstring decode_text(const string &text) const
Returns the given single-byte string decoded to a wide-character string, via the current encoding system...
Definition: textEncoder.I:568
The base class to a family of classes that decode various kinds of encoded byte streams.
Definition: stringDecoder.h:28
static ostream * get_notify_ptr()
Returns the ostream that is used to write error messages to.
bool is_eof()
Returns true if the decoder has returned the last character in the string, false if there are more to...
Definition: stringDecoder.I:35
static const Entry * look_up(wchar_t character)
Returns the Entry associated with the indicated character, if there is one.
bool is_wtext() const
Returns true if any of the characters in the string returned by get_wtext() are out of the range of a...
void make_upper()
Adjusts the text stored within the encoder to all uppercase letters (preserving accent marks correctl...
Definition: textEncoder.cxx:30
static int unicode_toupper(int character)
Returns the uppercase equivalent of the given Unicode character.
Definition: textEncoder.I:431