Panda3D
|
00001 // Filename: stringDecoder.cxx 00002 // Created by: drose (11Feb02) 00003 // 00004 //////////////////////////////////////////////////////////////////// 00005 // 00006 // PANDA 3D SOFTWARE 00007 // Copyright (c) Carnegie Mellon University. All rights reserved. 00008 // 00009 // All use of this software is subject to the terms of the revised BSD 00010 // license. You should have received a copy of this license along 00011 // with this source code in a file named "LICENSE." 00012 // 00013 //////////////////////////////////////////////////////////////////// 00014 00015 #include "stringDecoder.h" 00016 #include "config_dtoolutil.h" 00017 00018 ostream *StringDecoder::_notify_ptr = &cerr; 00019 00020 //////////////////////////////////////////////////////////////////// 00021 // Function: StringDecoder::Destructor 00022 // Access: Public, Virtual 00023 // Description: 00024 //////////////////////////////////////////////////////////////////// 00025 StringDecoder:: 00026 ~StringDecoder() { 00027 } 00028 00029 //////////////////////////////////////////////////////////////////// 00030 // Function: StringDecoder::get_next_character 00031 // Access: Public, Virtual 00032 // Description: Returns the next character in sequence. 00033 //////////////////////////////////////////////////////////////////// 00034 int StringDecoder:: 00035 get_next_character() { 00036 if (test_eof()) { 00037 return -1; 00038 } 00039 return (unsigned char)_input[_p++]; 00040 } 00041 00042 //////////////////////////////////////////////////////////////////// 00043 // Function: StringDecoder::set_notify_ptr 00044 // Access: Public, Static 00045 // Description: Sets the ostream that is used to write error messages 00046 // to. This is necessary because of the low-level 00047 // placement of this class, before the definition of the 00048 // NotifyCategory class, so it cannot specify its own 00049 // notify. 00050 //////////////////////////////////////////////////////////////////// 00051 void StringDecoder:: 00052 set_notify_ptr(ostream *notify_ptr) { 00053 _notify_ptr = notify_ptr; 00054 } 00055 00056 //////////////////////////////////////////////////////////////////// 00057 // Function: StringDecoder::get_notify_ptr 00058 // Access: Public, Static 00059 // Description: Returns the ostream that is used to write error messages 00060 // to. See set_notify_ptr(). 00061 //////////////////////////////////////////////////////////////////// 00062 ostream *StringDecoder:: 00063 get_notify_ptr() { 00064 return _notify_ptr; 00065 } 00066 00067 00068 /* 00069 In UTF-8, each 16-bit Unicode character is encoded as a sequence of 00070 one, two, or three 8-bit bytes, depending on the value of the 00071 character. The following table shows the format of such UTF-8 byte 00072 sequences (where the "free bits" shown by x's in the table are 00073 combined in the order shown, and interpreted from most significant to 00074 least significant): 00075 00076 Binary format of bytes in sequence: 00077 Number of Maximum expressible 00078 1st byte 2nd byte 3rd byte free bits: Unicode value: 00079 00080 0xxxxxxx 7 007F hex (127) 00081 110xxxxx 10xxxxxx (5+6)=11 07FF hex (2047) 00082 1110xxxx 10xxxxxx 10xxxxxx (4+6+6)=16 FFFF hex (65535) 00083 00084 The value of each individual byte indicates its UTF-8 function, as follows: 00085 00086 00 to 7F hex (0 to 127): first and only byte of a sequence. 00087 80 to BF hex (128 to 191): continuing byte in a multi-byte sequence. 00088 C2 to DF hex (194 to 223): first byte of a two-byte sequence. 00089 E0 to EF hex (224 to 239): first byte of a three-byte sequence. 00090 */ 00091 00092 //////////////////////////////////////////////////////////////////// 00093 // Function: StringUtf8Decoder::get_next_character 00094 // Access: Public, Virtual 00095 // Description: Returns the next character in sequence. 00096 //////////////////////////////////////////////////////////////////// 00097 int StringUtf8Decoder:: 00098 get_next_character() { 00099 unsigned int result; 00100 while (!test_eof()) { 00101 result = (unsigned char)_input[_p++]; 00102 if ((result & 0x80) == 0) { 00103 // A 7-bit ascii value in one byte. 00104 return result; 00105 00106 } if ((result & 0xe0) == 0xc0) { 00107 // First byte of two. 00108 unsigned int two = 0; 00109 if (test_eof()) { 00110 if (_notify_ptr != NULL) { 00111 (*_notify_ptr) 00112 << "utf-8 encoded string '" << _input << "' ends abruptly.\n"; 00113 } 00114 return -1; 00115 } 00116 two = (unsigned char)_input[_p++]; 00117 result = ((result & 0x1f) << 6) | (two & 0x3f); 00118 return result; 00119 00120 } else if ((result & 0xf0) == 0xe0) { 00121 // First byte of three. 00122 if (test_eof()) { 00123 if (_notify_ptr != NULL) { 00124 (*_notify_ptr) 00125 << "utf-8 encoded string '" << _input << "' ends abruptly.\n"; 00126 } 00127 return -1; 00128 } 00129 unsigned int two = (unsigned char)_input[_p++]; 00130 if (test_eof()) { 00131 if (_notify_ptr != NULL) { 00132 (*_notify_ptr) 00133 << "utf-8 encoded string '" << _input << "' ends abruptly.\n"; 00134 } 00135 return -1; 00136 } 00137 unsigned int three = (unsigned char)_input[_p++]; 00138 result = ((result & 0x0f) << 12) | ((two & 0x3f) << 6) | (three & 0x3f); 00139 return result; 00140 } 00141 00142 // Otherwise--the high bit is set but it is not one of the 00143 // introductory utf-8 bytes--we have an error. 00144 if (_notify_ptr != NULL) { 00145 (*_notify_ptr) 00146 << "Non utf-8 byte in string: 0x" << hex << result << dec 00147 << ", string is '" << _input << "'\n"; 00148 } 00149 return -1; 00150 } 00151 00152 // End of string reached. 00153 return -1; 00154 } 00155 00156 //////////////////////////////////////////////////////////////////// 00157 // Function: StringUnicodeDecoder::get_next_character 00158 // Access: Public, Virtual 00159 // Description: Returns the next character in sequence. 00160 //////////////////////////////////////////////////////////////////// 00161 int StringUnicodeDecoder:: 00162 get_next_character() { 00163 if (test_eof()) { 00164 return -1; 00165 } 00166 00167 unsigned int high = (unsigned char)_input[_p++]; 00168 if (test_eof()) { 00169 if (_notify_ptr != NULL) { 00170 (*_notify_ptr) 00171 << "Unicode-encoded string has odd number of bytes.\n"; 00172 } 00173 return -1; 00174 } 00175 unsigned int low = (unsigned char)_input[_p++]; 00176 return ((high << 8) | low); 00177 }