19 _encoding = _default_encoding;
23 _flags = (F_got_text | F_got_wtext);
32 _encoding(copy._encoding),
70 _default_encoding = encoding;
79 return _default_encoding;
90 if (!has_text() || _text != text) {
92 _flags = (_flags | F_got_text) & ~F_got_wtext;
104set_text(
const std::string &text, TextEncoder::Encoding encoding) {
105 if (encoding == _encoding) {
117 _text = std::string();
118 _wtext = std::wstring();
119 _flags |= (F_got_text | F_got_wtext);
126INLINE
bool TextEncoder::
128 if (_flags & F_got_wtext) {
129 return !_wtext.empty();
131 return !_text.empty();
140 if ((_flags & F_got_text) == 0) {
151get_text(TextEncoder::Encoding encoding)
const {
162 _flags = (_flags | F_got_text) & ~F_got_wtext;
173#if WCHAR_MAX >= 0x10FFFF
175 _wtext =
get_wtext() + std::wstring(1, (
wchar_t)character);
177 if ((character & ~0xffff) == 0) {
178 _wtext =
get_wtext() + std::wstring(1, (
wchar_t)character);
181 uint32_t v = (uint32_t)character - 0x10000u;
183 (wchar_t)((v >> 10u) | 0xd800u),
184 (
wchar_t)((v & 0x3ffu) | 0xdc00u),
186 _wtext =
get_wtext() + std::wstring(wstr, 2);
189 _flags = (_flags | F_got_wtext) & ~F_got_text;
211 if (index < _wtext.length()) {
212 return _wtext[index];
225 if (index < _wtext.length()) {
226 _wtext[index] = character;
227 _flags &= ~F_got_text;
276reencode_text(
const std::string &text, TextEncoder::Encoding from,
277 TextEncoder::Encoding to) {
288 if (entry ==
nullptr) {
291 return entry->_char_type == UnicodeLatinMap::CT_upper ||
292 entry->_char_type == UnicodeLatinMap::CT_lower;
302 if (entry ==
nullptr) {
304 return (character >=
'0' && character <=
'9');
307 return (isdigit(entry->_ascii_equiv) != 0);
317 if (entry ==
nullptr) {
319 return (character < 128 && ispunct(character));
321 return entry->_char_type == UnicodeLatinMap::CT_punct;
331 if (entry ==
nullptr) {
334 return entry->_char_type == UnicodeLatinMap::CT_upper;
361 if (entry ==
nullptr) {
364 return entry->_char_type == UnicodeLatinMap::CT_lower;
374 if (entry ==
nullptr) {
377 return entry->_toupper_character;
387 if (entry ==
nullptr) {
390 return entry->_tolower_character;
398upper(
const std::string &source) {
407upper(
const std::string &source, TextEncoder::Encoding encoding) {
420lower(
const std::string &source) {
429lower(
const std::string &source, TextEncoder::Encoding encoding) {
444 if (!has_text() || _wtext != wtext) {
446 _flags = (_flags | F_got_wtext) & ~F_got_text;
457 if ((_flags & F_got_wtext) == 0) {
469 if (!wtext.empty()) {
471 _flags = (_flags | F_got_wtext) & ~F_got_text;
498operator << (std::ostream &out,
const std::wstring &str) {
This class can be used to convert text between multiple representations, e.g.
std::wstring decode_text(const std::string &text) const
Returns the given encoded string decoded to a wide-character string, via the current encoding system.
void append_text(const std::string &text)
Appends the indicated string to the end of the stored text.
set_text
Changes the text that is stored in the encoder.
static std::string upper(const std::string &source)
Converts the string to uppercase, assuming the string is encoded in the default encoding.
void append_wtext(const std::wstring &text)
Appends the indicated string to the end of the stored wide-character text.
void set_unicode_char(size_t index, char32_t character)
Sets the Unicode value of the nth character in the stored text.
static std::string lower(const std::string &source)
Converts the string to lowercase, assuming the string is encoded in the default encoding.
std::string get_text_as_ascii() const
Returns the text associated with the node, converted as nearly as possible to a fully-ASCII represent...
static std::string reencode_text(const std::string &text, Encoding from, Encoding to)
Given the indicated text string, which is assumed to be encoded via the encoding "from",...
static bool unicode_ispunct(char32_t character)
Returns true if the indicated character is a punctuation mark, false otherwise.
static int unicode_tolower(char32_t character)
Returns the lowercase equivalent of the given Unicode character.
static bool unicode_isupper(char32_t character)
Returns true if the indicated character is an uppercase letter, false otherwise.
get_default_encoding
Returns the default encoding to be used for all subsequently created TextEncoder objects.
static int unicode_toupper(char32_t character)
Returns the uppercase equivalent of the given Unicode character.
std::wstring get_wtext_as_ascii() const
Returns the text associated with the node, converted as nearly as possible to a fully-ASCII represent...
Encoding get_encoding() const
Returns the encoding by which the string set via set_text() is to be interpreted.
static bool unicode_isdigit(char32_t character)
Returns true if the indicated character is a numeric digit, false otherwise.
void set_encoding(Encoding encoding)
Specifies how the string set via set_text() is to be interpreted.
get_text
Returns the current text, as encoded via the current encoding system.
const std::wstring & get_wtext() const
Returns the text associated with the TextEncoder, as a wide-character string.
size_t get_num_chars() const
Returns the number of characters in the stored text.
void make_lower()
Adjusts the text stored within the encoder to all lowercase letters (preserving accent marks correctl...
void clear_text()
Removes the text from the TextEncoder.
void make_upper()
Adjusts the text stored within the encoder to all uppercase letters (preserving accent marks correctl...
std::string get_encoded_char(size_t index) const
Returns the nth char of the stored text, as a one-, two-, or three-byte encoded string.
static bool unicode_isspace(char32_t character)
Returns true if the indicated character is a whitespace character, false otherwise.
int get_unicode_char(size_t index) const
Returns the Unicode value of the nth character in the stored text.
static bool unicode_islower(char32_t character)
Returns true if the indicated character is a lowercase letter, false otherwise.
set_default_encoding
Specifies the default encoding to be used for all subsequently created TextEncoder objects.
std::string encode_wtext(const std::wstring &wtext) const
Encodes a wide-text string into a single-char string, according to the current encoding.
static bool unicode_isalpha(char32_t character)
Returns true if the indicated character is an alphabetic letter, false otherwise.
void append_unicode_char(char32_t character)
Appends a single character to the end of the stored text.
void set_wtext(const std::wstring &wtext)
Changes the text that is stored in the encoder.
static const Entry * look_up(char32_t character)
Returns the Entry associated with the indicated character, if there is one.
std::ostream & operator<<(std::ostream &out, const std::wstring &str)
Uses the current default encoding to output the wstring.