Skip to main content

The utf8.cpp File Reference

Included Headers

#include <cstdint> #include <sstream> #include "utf8.h" #include "caseconvert.h" #include "textstream.h"

Functions Index

uint8_t getUTF8CharNumBytes (char c)

Returns the number of bytes making up a single UTF8 character given the first byte in the sequence. More...

static uint32_t decode_utf8 (const char *data, int numBytes) noexcept
static uint32_t convertUTF8CharToUnicode (const char *s, size_t bytesLeft, int &len)
std::string getUTF8CharAt (const std::string &input, size_t pos)

Returns the UTF8 character found at byte position pos in the input string. More...

uint32_t getUnicodeForUTF8CharAt (const std::string &input, size_t pos)

Returns the 32bit Unicode value matching character at byte position pos in the UTF8 encoded input. More...

static char asciiToLower (uint32_t code)
static char asciiToUpper (uint32_t code)
static std::string caseConvert (const std::string &input, char(*asciiConversionFunc)(uint32_t code), const char *(*conversionFunc)(uint32_t code))
std::string convertUTF8ToLower (const std::string &input)

Converts the input string into a lower case version, also taking into account non-ASCII characters that have a lower case variant. More...

std::string convertUTF8ToUpper (const std::string &input)

Converts the input string into an upper case version, also taking into account non-ASCII characters that have an upper case variant. More...

const char *writeUTF8Char (TextStream &t, const char *s)

Writes the UTF8 character pointed to by s to stream t and returns a pointer to the next character. More...

bool lastUTF8CharIsMultibyte (const std::string &input)

Returns true iff the last character in input is a multibyte character. More...

bool isUTF8CharUpperCase (const std::string &input, size_t pos)

Returns true iff the input string at byte position pos holds an upper case character. More...

int isUTF8NonBreakableSpace (const char *input)

Check if the first character pointed at by input is a non-breakable whitespace character. More...

bool isUTF8PunctuationCharacter (uint32_t unicode)

Check if the given Unicode character represents a punctuation character. More...

Functions

asciiToLower()

char asciiToLower (uint32_t code)
inline static

Definition at line 142 of file utf8.cpp.

/** Maps an ASCII upper case letter to its lower case counterpart.
 *  @param code Code point to convert.
 *  @return The lower case letter when \a code is in 'A'..'Z'; otherwise
 *          \a code narrowed to char without modification.
 */
static inline char asciiToLower(uint32_t code)
{
  const bool isUpperCaseLetter = code>='A' && code<='Z';
  if (isUpperCaseLetter)
  {
    // shift by the fixed distance between the two ASCII letter ranges
    return static_cast<char>(code+'a'-'A');
  }
  return static_cast<char>(code);
}

asciiToUpper()

char asciiToUpper (uint32_t code)
inline static

Definition at line 147 of file utf8.cpp.

/** Maps an ASCII lower case letter to its upper case counterpart.
 *  @param code Code point to convert.
 *  @return The upper case letter when \a code is in 'a'..'z'; otherwise
 *          \a code narrowed to char without modification.
 */
static inline char asciiToUpper(uint32_t code)
{
  const bool isLowerCaseLetter = code>='a' && code<='z';
  if (isLowerCaseLetter)
  {
    // shift by the fixed distance between the two ASCII letter ranges
    return static_cast<char>(code+'A'-'a');
  }
  return static_cast<char>(code);
}

Referenced by convertUTF8ToUpper.

caseConvert()

std::string caseConvert (const std::string & input, char(*)(uint32_t code) asciiConversionFunc, const char *(*)(uint32_t code) conversionFunc)
inline static

Definition at line 152 of file utf8.cpp.

152static inline std::string caseConvert(const std::string &input,
153 char (*asciiConversionFunc)(uint32_t code),
154 const char *(*conversionFunc)(uint32_t code))
155{
156 uint32_t code=0;
157 std::string result;
158 result.reserve(input.length()); // assume all ASCII characters
159 int len=0;
160 size_t bytesLeft = input.length();
161 const char *p = input.c_str();
162 while ((code=convertUTF8CharToUnicode(p,bytesLeft,len)))
163 {
164 if (code<128) // ASCII case
165 {
166 char c = asciiConversionFunc(code);
167 result+=c;
168 }
169 else // generic case
170 {
171 const char *conv = conversionFunc(code);
172 if (conv==nullptr) // no difference between lower and upper case
173 {
174 result.append(p,len);
175 }
176 else // replace the input character with the conversion result
177 {
178 result.append(conv);
179 }
180 }
181 p+=len;
182 bytesLeft-=len;
183 }
184 return result;
185}

Reference convertUTF8CharToUnicode.

Referenced by convertUTF8ToLower and convertUTF8ToUpper.

convertUTF8CharToUnicode()

uint32_t convertUTF8CharToUnicode (const char * s, size_t bytesLeft, int & len)
inline static

Definition at line 69 of file utf8.cpp.

69static inline uint32_t convertUTF8CharToUnicode(const char *s,size_t bytesLeft,int &len)
70{
71 if (s==nullptr || bytesLeft==0)
72 {
73 len=0;
74 return 0;
75 }
76 unsigned char uc = static_cast<unsigned char>(*s);
77 if (uc<128) // ASCII case
78 {
79 len=1;
80 return uc;
81 }
82 switch (bytesLeft)
83 {
84 default:
85 if ((uc&0xFEu)==0xFCu)// 1111110X six bytes
86 {
87 len=6;
88 return decode_utf8(s,len);
89 }
90 // fall through
91 case 5:
92 if ((uc&0xFCu)==0xF8u) // 111110XX five bytes
93 {
94 len=5;
95 return decode_utf8(s,len);
96 }
97 // fall through
98 case 4:
99 if ((uc&0xF8u)==0xF0u) // 11110XXX four bytes
100 {
101 len=4;
102 return decode_utf8(s,len);
103 }
104 // fall through
105 case 3:
106 if ((uc&0xF0u)==0xE0u) // 1110XXXX three bytes
107 {
108 len=3;
109 return decode_utf8(s,len);
110 }
111 // fall through
112 case 2:
113 if ((uc&0xE0u)==0xC0u) // 110XXXXX two bytes
114 {
115 len=2;
116 return decode_utf8(s,len);
117 }
118 // fall through
119 case 1:
120 {
121 len=1;
122 return uc;
123 }
124 }
125}

Reference decode_utf8.

Referenced by caseConvert, getUnicodeForUTF8CharAt and isUTF8CharUpperCase.

convertUTF8ToLower()

std::string convertUTF8ToLower (const std::string & input)

Converts the input string into a lower case version, also taking into account non-ASCII characters that have a lower case variant.

Definition at line 187 of file utf8.cpp.

187std::string convertUTF8ToLower(const std::string &input)
188{
190}

References asciiToLower, caseConvert and convertUnicodeToLower.

Referenced by SearchIndexInfo::add, Index::addClassMemberNameToIndex, Index::addFileMemberNameToIndex, Index::addModuleMemberNameToIndex, Index::addNamespaceMemberNameToIndex, AnchorGenerator::generate, QCString::lower, FileNameFn::searchKey, SearchTerm::termEncoded and HtmlGenerator::writeLabel.

convertUTF8ToUpper()

std::string convertUTF8ToUpper (const std::string & input)

Converts the input string into an upper case version, also taking into account non-ASCII characters that have an upper case variant.

Definition at line 192 of file utf8.cpp.

192std::string convertUTF8ToUpper(const std::string &input)
193{
195}

References asciiToUpper, caseConvert and convertUnicodeToUpper.

Referenced by Translator::createNoun, QCString::upper and writeAlphabeticalClassList.

decode_utf8()

uint32_t decode_utf8 (const char * data, int numBytes)
inline noexcept static

Decodes a given input of UTF-8 data to a Unicode code point, given the number of bytes it is made of.

Definition at line 55 of file utf8.cpp.

/** Decodes a UTF8 byte sequence into a single code point.
 *
 *  No validation is done: the caller supplies the sequence length and must
 *  make sure that many bytes are readable at \a data.
 *
 *  @param data     Pointer to the first (lead) byte of the sequence.
 *  @param numBytes Number of bytes making up the sequence.
 *  @return The lead byte itself when \a numBytes is 1 or less; otherwise the
 *          value assembled from the lead byte's payload bits followed by the
 *          lower 6 bits of each subsequent byte.
 */
static inline uint32_t decode_utf8(const char *data, int numBytes) noexcept
{
  uint32_t codePoint = static_cast<unsigned char>(data[0]);
  if (numBytes<=1)
  {
    return codePoint;
  }
  // strip the length marker from the lead byte, keeping only its payload bits
  codePoint &= 127 >> numBytes;
  int index=1;
  while (index<numBytes)
  {
    const uint32_t payload = static_cast<unsigned char>(data[index]) & 0x3F;
    codePoint = (codePoint<<6) | payload;
    ++index;
  }
  return codePoint;
}

Referenced by convertUTF8CharToUnicode.

getUnicodeForUTF8CharAt()

uint32_t getUnicodeForUTF8CharAt (const std::string & input, size_t pos)

Returns the 32bit Unicode value matching character at byte position pos in the UTF8 encoded input.

Definition at line 135 of file utf8.cpp.

135uint32_t getUnicodeForUTF8CharAt(const std::string &input,size_t pos)
136{
137 std::string charS = getUTF8CharAt(input,pos);
138 int len=0;
139 return convertUTF8CharToUnicode(charS.c_str(),charS.length(),len);
140}

References convertUTF8CharToUnicode and getUTF8CharAt.

Referenced by AnchorGenerator::generate.

getUTF8CharAt()

std::string getUTF8CharAt (const std::string & input, size_t pos)

Returns the UTF8 character found at byte position pos in the input string.

The resulting string can be a multi byte sequence.

Definition at line 127 of file utf8.cpp.

127std::string getUTF8CharAt(const std::string &input,size_t pos)
128{
129 if (input.length()<=pos) return std::string();
130 int numBytes=getUTF8CharNumBytes(input[pos]);
131 if (input.length()<pos+numBytes) return std::string();
132 return input.substr(pos,numBytes);
133}

Reference getUTF8CharNumBytes.

Referenced by SearchIndexInfo::add, Index::addClassMemberNameToIndex, Index::addFileMemberNameToIndex, Index::addModuleMemberNameToIndex, Index::addNamespaceMemberNameToIndex, Translator::createNoun, AnchorGenerator::generate, getUnicodeForUTF8CharAt and writeAlphabeticalClassList.

getUTF8CharNumBytes()

uint8_t getUTF8CharNumBytes (char c)

Returns the number of bytes making up a single UTF8 character given the first byte in the sequence.

Definition at line 23 of file utf8.cpp.

/** Returns the number of bytes making up a single UTF8 character given the
 *  first byte in the sequence.
 *  @param c First byte of the character.
 *  @return 2..6 when \a c matches one of the multi-byte lead patterns;
 *          1 for every other byte value.
 */
uint8_t getUTF8CharNumBytes(char c)
{
  const unsigned char lead = static_cast<unsigned char>(c);
  if (lead<0x80u)           return 1; // 0xxx.xxxx: single byte character
  if ((lead&0xE0u)==0xC0u)  return 2; // 110x.xxxx: 2 byte character
  if ((lead&0xF0u)==0xE0u)  return 3; // 1110.xxxx: 3 byte character
  if ((lead&0xF8u)==0xF0u)  return 4; // 1111.0xxx: 4 byte character
  if ((lead&0xFCu)==0xF8u)  return 5; // 1111.10xx: 5 byte character
  if ((lead&0xFEu)==0xFCu)  return 6; // 1111.110x: 6 byte character
  return 1; // none of the lead patterns matched
}

Referenced by detab, escapeCharsInString, AnchorGenerator::generate, getUTF8CharAt, nextUTF8CharPosition, updateColumnCount and writeUTF8Char.

isUTF8CharUpperCase()

bool isUTF8CharUpperCase (const std::string & input, size_t pos)

Returns true iff the input string at byte position pos holds an upper case character.

Definition at line 218 of file utf8.cpp.

218bool isUTF8CharUpperCase(const std::string &input,size_t pos)
219{
220 if (input.length()<=pos) return false;
221 int len=0;
222 // turn the UTF8 character at position pos into a unicode value
223 uint32_t code = convertUTF8CharToUnicode(input.c_str()+pos,input.length()-pos,len);
224 // check if the character can be converted to lower case, if so it was an upper case character
225 return convertUnicodeToLower(code)!=nullptr;
226}

References convertUnicodeToLower and convertUTF8CharToUnicode.

Referenced by DefinitionImpl::_setBriefDescription.

isUTF8NonBreakableSpace()

int isUTF8NonBreakableSpace (const char * input)

Check if the first character pointed at by input is a non-breakable whitespace character.

Returns the byte size of the character if there is match or 0 if not.

Definition at line 228 of file utf8.cpp.

/** Checks if the first character pointed at by \a input is a non-breakable
 *  whitespace character (the two byte sequence 0xC2 0xA0).
 *  @param input Pointer to zero-terminated UTF8 text; nullptr is accepted
 *               and treated as "no match".
 *  @return The byte size of the character (2) if there is a match, 0 if not.
 */
int isUTF8NonBreakableSpace(const char *input)
{
  if (input==nullptr) return 0; // nothing to inspect
  // the second byte is only read after the first one matched, so the
  // terminating zero of a shorter string is never passed
  if (static_cast<unsigned char>(input[0])!=0xC2) return 0;
  return static_cast<unsigned char>(input[1])==0xA0 ? 2 : 0;
}

Referenced by detab.

isUTF8PunctuationCharacter()

bool isUTF8PunctuationCharacter (uint32_t unicode)

Check if the given Unicode character represents a punctuation character.

Definition at line 234 of file utf8.cpp.

234bool isUTF8PunctuationCharacter(uint32_t unicode)
235{
236 bool b = isPunctuationCharacter(unicode);
237 return b;
238}

Reference isPunctuationCharacter.

Referenced by AnchorGenerator::generate.

lastUTF8CharIsMultibyte()

bool lastUTF8CharIsMultibyte (const std::string & input)

Returns true iff the last character in input is a multibyte character.

Definition at line 212 of file utf8.cpp.

/** Returns true iff the last character in \a input is a multibyte character.
 *  @param input UTF8 encoded text.
 *  @return true when the final byte has the form 10xx.xxxx, i.e. its highest
 *          bit is set and the next one is clear; false otherwise, including
 *          for an empty string.
 */
bool lastUTF8CharIsMultibyte(const std::string &input)
{
  if (input.empty())
  {
    return false;
  }
  const unsigned char tail = static_cast<unsigned char>(input.back());
  return (tail&0xC0)==0x80;
}

Referenced by DefinitionImpl::_setBriefDescription.

writeUTF8Char()

const char * writeUTF8Char (TextStream & t, const char * s)

Writes the UTF8 character pointed to by s to stream t and returns a pointer to the next character.

Definition at line 197 of file utf8.cpp.

197const char *writeUTF8Char(TextStream &t,const char *s)
198{
199 if (s==nullptr) return nullptr;
200 uint8_t len = getUTF8CharNumBytes(*s);
201 for (uint8_t i=0;i<len;i++)
202 {
203 if (s[i]==0) // detect premature end of string (due to invalid UTF8 char)
204 {
205 len=i;
206 }
207 }
208 t.write(s,len);
209 return s+len;
210}

References getUTF8CharNumBytes and TextStream::write.

Referenced by HtmlCodeGenerator::codify, ManCodeGenerator::codify, RTFCodeGenerator::codify, HtmlDocVisitor::operator(), HtmlDocVisitor::writeObfuscatedMailAddress and writeXMLCodeString.


Generated via doxygen2docusaurus by Doxygen 1.14.0.