Panda3D
|
00001 // Filename: stringDecoder.cxx 00002 // Created by: drose (11Feb02) 00003 // 00004 //////////////////////////////////////////////////////////////////// 00005 // 00006 // PANDA 3D SOFTWARE 00007 // Copyright (c) Carnegie Mellon University. All rights reserved. 00008 // 00009 // All use of this software is subject to the terms of the revised BSD 00010 // license. You should have received a copy of this license along 00011 // with this source code in a file named "LICENSE." 00012 // 00013 //////////////////////////////////////////////////////////////////// 00014 00015 #include "stringDecoder.h" 00016 #include "config_express.h" 00017 00018 //////////////////////////////////////////////////////////////////// 00019 // Function: StringDecoder::Destructor 00020 // Access: Public, Virtual 00021 // Description: 00022 //////////////////////////////////////////////////////////////////// 00023 StringDecoder:: 00024 ~StringDecoder() { 00025 } 00026 00027 //////////////////////////////////////////////////////////////////// 00028 // Function: StringDecoder::get_next_character 00029 // Access: Public, Virtual 00030 // Description: Returns the next character in sequence. 00031 //////////////////////////////////////////////////////////////////// 00032 int StringDecoder:: 00033 get_next_character() { 00034 if (test_eof()) { 00035 return -1; 00036 } 00037 return (unsigned char)_input[_p++]; 00038 } 00039 00040 /* 00041 In UTF-8, each 16-bit Unicode character is encoded as a sequence of 00042 one, two, or three 8-bit bytes, depending on the value of the 00043 character. The following table shows the format of such UTF-8 byte 00044 sequences (where the "free bits" shown by x's in the table are 00045 combined in the order shown, and interpreted from most significant to 00046 least significant): 00047 00048 Binary format of bytes in sequence: 00049 Number of Maximum expressible 00050 1st byte 2nd byte 3rd byte free bits: Unicode value: 00051 00052 0xxxxxxx 7 007F hex (127) 00053 110xxxxx 10xxxxxx (5+6)=11 07FF hex (2047) 00054 1110xxxx 10xxxxxx 10xxxxxx (4+6+6)=16 FFFF hex (65535) 00055 00056 The value of each individual byte indicates its UTF-8 function, as follows: 00057 00058 00 to 7F hex (0 to 127): first and only byte of a sequence. 00059 80 to BF hex (128 to 191): continuing byte in a multi-byte sequence. 00060 C2 to DF hex (194 to 223): first byte of a two-byte sequence. 00061 E0 to EF hex (224 to 239): first byte of a three-byte sequence. 00062 */ 00063 00064 //////////////////////////////////////////////////////////////////// 00065 // Function: StringUtf8Decoder::get_next_character 00066 // Access: Public, Virtual 00067 // Description: Returns the next character in sequence. 00068 //////////////////////////////////////////////////////////////////// 00069 int StringUtf8Decoder:: 00070 get_next_character() { 00071 unsigned int result; 00072 while (!test_eof()) { 00073 result = (unsigned char)_input[_p++]; 00074 if ((result & 0x80) == 0) { 00075 // A 7-bit ascii value in one byte. 00076 return result; 00077 00078 } if ((result & 0xe0) == 0xc0) { 00079 // First byte of two. 00080 unsigned int two = 0; 00081 if (test_eof()) { 00082 express_cat.warning() 00083 << "utf-8 encoded string '" << _input << "' ends abruptly.\n"; 00084 return -1; 00085 } 00086 two = (unsigned char)_input[_p++]; 00087 result = ((result & 0x1f) << 6) | (two & 0x3f); 00088 return result; 00089 00090 } else if ((result & 0xf0) == 0xe0) { 00091 // First byte of three. 00092 if (test_eof()) { 00093 express_cat.warning() 00094 << "utf-8 encoded string '" << _input << "' ends abruptly.\n"; 00095 return -1; 00096 } 00097 unsigned int two = (unsigned char)_input[_p++]; 00098 if (test_eof()) { 00099 express_cat.warning() 00100 << "utf-8 encoded string '" << _input << "' ends abruptly.\n"; 00101 return -1; 00102 } 00103 unsigned int three = (unsigned char)_input[_p++]; 00104 result = ((result & 0x0f) << 12) | ((two & 0x3f) << 6) | (three & 0x3f); 00105 return result; 00106 } 00107 00108 // Otherwise--the high bit is set but it is not one of the 00109 // introductory utf-8 bytes--we have an error. 00110 express_cat.warning() 00111 << "Non utf-8 byte in string: 0x" << hex << result << dec 00112 << ", string is '" << _input << "'\n"; 00113 nassertr(false, -1); 00114 } 00115 00116 // End of string reached. 00117 return -1; 00118 } 00119 00120 //////////////////////////////////////////////////////////////////// 00121 // Function: StringUnicodeDecoder::get_next_character 00122 // Access: Public, Virtual 00123 // Description: Returns the next character in sequence. 00124 //////////////////////////////////////////////////////////////////// 00125 int StringUnicodeDecoder:: 00126 get_next_character() { 00127 if (test_eof()) { 00128 return -1; 00129 } 00130 00131 unsigned int high = (unsigned char)_input[_p++]; 00132 if (test_eof()) { 00133 express_cat.warning() 00134 << "Unicode-encoded string has odd number of bytes.\n"; 00135 return -1; 00136 } 00137 unsigned int low = (unsigned char)_input[_p++]; 00138 return ((high << 8) | low); 00139 }