OpenVPN 3 Core Library
Loading...
Searching...
No Matches
unicode.hpp
Go to the documentation of this file.
1// OpenVPN -- An application to securely tunnel IP networks
2// over a single port, with support for SSL/TLS-based
3// session authentication and key exchange,
4// packet encryption, packet authentication, and
5// packet compression.
6//
7// Copyright (C) 2012- OpenVPN Inc.
8//
9// SPDX-License-Identifier: MPL-2.0 OR AGPL-3.0-only WITH openvpn3-openssl-exception
10//
11
12// General-purpose functions for dealing with Unicode.
13
14#ifndef OPENVPN_COMMON_UNICODE_H
15#define OPENVPN_COMMON_UNICODE_H
16
17#include <string>
18#include <cstring> // for std::memcpy
19#include <algorithm> // for std::min
20#include <memory>
21#include <cctype>
22
27
28namespace openvpn::Unicode {
29
// Exception types thrown by conversion_result_throw() in this file:
//   unicode_src_overflow  -- thrown for sourceExhausted
//   unicode_dest_overflow -- thrown for targetExhausted
//   unicode_malformed     -- thrown for sourceIllegal
30OPENVPN_SIMPLE_EXCEPTION(unicode_src_overflow);
31OPENVPN_SIMPLE_EXCEPTION(unicode_dest_overflow);
32OPENVPN_SIMPLE_EXCEPTION(unicode_malformed);
33
// is_valid_utf8_uchar_buf() / is_valid_utf8() return true if the given
// buffer is a valid UTF-8 string.
// Extra constraints that may be ORed into their max_len_flags argument
// (the bits below UTF8_NO_CTRL carry the maximum length, 0 = no limit):
//
// The values are built from unsigned literals on purpose: (1 << 31) does
// not fit in a signed int, and a negative enumerator would sign-extend into
// every bit from 31 upward when converted to a 64-bit size_t.
enum
{
    UTF8_NO_CTRL = (1u << 30),  // no control chars allowed
    UTF8_NO_SPACE = (1u << 31), // no space chars allowed
};
41inline bool is_valid_utf8_uchar_buf(const unsigned char *source,
42 size_t size,
43 const size_t max_len_flags = 0) // OR max length (or 0 to disable) with UTF8_x flags above
44{
45 const size_t max_len = max_len_flags & ((size_t)UTF8_NO_CTRL - 1); // NOTE -- use smallest flag value here
46 size_t unicode_len = 0;
47 while (size)
48 {
49 const unsigned char c = *source;
50 if (c == '\0')
51 return false;
52 const int length = trailingBytesForUTF8[c] + 1;
53 if ((size_t)length > size)
54 return false;
55 if (!isLegalUTF8(source, length))
56 return false;
57 if (length == 1)
58 {
59 if ((max_len_flags & UTF8_NO_CTRL) && std::iscntrl(c))
60 return false;
61 if ((max_len_flags & UTF8_NO_SPACE) && std::isspace(c))
62 return false;
63 }
64
65 source += length;
66 size -= length;
67 ++unicode_len;
68 if (max_len && unicode_len > max_len)
69 return false;
70 }
71 return true;
72}
73
// String-object front end for is_valid_utf8_uchar_buf(): validates the
// str.length() bytes at str.c_str() with the same max_len_flags semantics.
template <typename STRING>
inline bool is_valid_utf8(const STRING &str, const size_t max_len_flags = 0)
{
    const auto *bytes = reinterpret_cast<const unsigned char *>(str.c_str());
    return is_valid_utf8_uchar_buf(bytes, str.length(), max_len_flags);
}
79
// Status codes returned by utf8_index(), which maps a character index
// to the byte position in the string that corresponds with it.
enum
{
    UTF8_GOOD = 0,  // succeeded, result in index
    UTF8_BAD = 1,   // failed, string is not legal UTF8
    UTF8_RANGE = 2, // failed, index is beyond end of string
};
88template <typename STRING>
89inline int utf8_index(STRING &str, size_t &index)
90{
91 const size_t size = str.length();
92 size_t upos = 0;
93 size_t pos = 0;
94 while (pos < size)
95 {
96 const int len = trailingBytesForUTF8[(unsigned char)str[pos]] + 1;
97 if (pos + len > size || !isLegalUTF8((const unsigned char *)&str[pos], len))
98 return UTF8_BAD;
99 if (upos >= index)
100 {
101 index = pos;
102 return UTF8_GOOD;
103 }
104 pos += len;
105 ++upos;
106 }
107 return UTF8_RANGE;
108}
109
110// Truncate a UTF8 string if its length exceeds max_len
111template <typename STRING>
112inline void utf8_truncate(STRING &str, size_t max_len)
113{
114 const int status = utf8_index(str, max_len);
115 if (status == UTF8_GOOD || status == UTF8_BAD)
116 str = str.substr(0, max_len);
117}
118
// utf8_printable() returns a printable UTF-8 string, where bad UTF-8 chars
// and control chars are mapped to '?'.
// The bits of max_len_flags below UTF8_FILTER give the maximum number of
// chars to print (0 = no limit); longer input is cut and "..." is appended.
// If the UTF8_PASS_FMT flag is set in max_len_flags, pass through \r\n\t
// (otherwise they are replaced by a space).
// If the UTF8_FILTER flag is set, other bad/control chars are dropped
// instead of being mapped to '?'.
//
// The values are built from unsigned literals on purpose: (1 << 31) does
// not fit in a signed int, and a negative enumerator would sign-extend into
// every bit from 31 upward when converted to a 64-bit size_t.
enum
{
    UTF8_PASS_FMT = (1u << 31),
    UTF8_FILTER = (1u << 30),
};
128template <typename STRING>
129inline STRING utf8_printable(const STRING &str, size_t max_len_flags)
130{
131 STRING ret;
132 const size_t size = str.length();
133 const size_t max_len = max_len_flags & ((size_t)UTF8_FILTER - 1); // NOTE -- use smallest flag value here
134 size_t upos = 0;
135 size_t pos = 0;
136 ret.reserve(std::min(str.length(), max_len) + 3); // add 3 for "..."
137 while (pos < size)
138 {
139 if (!max_len || upos < max_len)
140 {
141 unsigned char c = str[pos];
142 int len = trailingBytesForUTF8[c] + 1;
143 if (pos + len <= size
144 && c >= 0x20 && c != 0x7F
145 && isLegalUTF8((const unsigned char *)&str[pos], len))
146 {
147 // non-control, legal UTF-8
148 ret.append(str, pos, len);
149 }
150 else
151 {
152 // control char or bad UTF-8 char
153 if (c == '\r' || c == '\n' || c == '\t')
154 {
155 if (!(max_len_flags & UTF8_PASS_FMT))
156 c = ' ';
157 }
158 else if (max_len_flags & UTF8_FILTER)
159 c = 0;
160 else
161 c = '?';
162 if (c)
163 ret += c;
164 len = 1;
165 }
166 pos += len;
167 ++upos;
168 }
169 else
170 {
171 ret.append("...");
172 break;
173 }
174 }
175 return ret;
176}
177
178template <typename STRING>
179inline size_t utf8_length(const STRING &str)
180{
181 const size_t size = str.length();
182 size_t upos = 0;
183 size_t pos = 0;
184 while (pos < size)
185 {
186 int len = std::min((int)trailingBytesForUTF8[(unsigned char)str[pos]] + 1,
187 (int)size);
188 if (!isLegalUTF8((const unsigned char *)&str[pos], len))
189 len = 1;
190 pos += len;
191 ++upos;
192 }
193 return upos;
194}
195
197{
198 switch (res)
199 {
200 case conversionOK:
201 return;
202 case sourceExhausted:
203 throw unicode_src_overflow();
204 case targetExhausted:
205 throw unicode_dest_overflow();
206 case sourceIllegal:
207 throw unicode_malformed();
208 }
209}
210
211// Convert a UTF-8 string to UTF-16 little endian (no null termination in return)
// Converts the bytes of `str` with ConvertUTF8toUTF16() into a temporary
// UTF16 array, then serialises the converted units into a newly created
// buffer, low byte first (little endian), and returns that buffer.
//
// NOTE(review): the ConvertUTF8toUTF16 call below is incomplete in this copy
// of the file -- original listing lines 222-223 are absent.  The declared
// signature takes a fifth `ConversionFlags flags` argument, and `res` is not
// used by any visible line (presumably it was passed to
// conversion_result_throw()).  Restore those lines from the upstream file
// before building.
212template <typename STRING>
213inline BufferPtr string_to_utf16(const STRING &str)
214{
    // scratch array sized at one UTF16 unit per input byte
215 std::unique_ptr<UTF16[]> utf16_dest(new UTF16[str.length()]);
216 const UTF8 *src = (UTF8 *)str.c_str();
217 UTF16 *dest = utf16_dest.get();
    // src/dest are passed by address; presumably the converter advances them
    // past what it consumed/produced -- the copy loop below relies on `dest`
    // marking the end of the converted units (TODO confirm)
218 const ConversionResult res = ConvertUTF8toUTF16(&src,
219 src + str.length(),
220 &dest,
221 dest + str.length(),
    // NOTE(review): remaining argument(s) and the closing of this call are missing here
    // result buffer holds two bytes per converted UTF16 unit
224 auto ret = BufferAllocatedRc::Create((dest - utf16_dest.get()) * 2, BufAllocFlags::ARRAY);
225 UTF8 *d = ret->data();
226 for (const UTF16 *s = utf16_dest.get(); s < dest; ++s)
227 {
    // low byte first, then high byte
228 *d++ = static_cast<UTF8>(*s & 0xFF);
229 *d++ = static_cast<UTF8>((*s >> 8) & 0xFF);
230 }
231 return ret;
232}
233
235{
236 public:
237 struct Char
238 {
239 unsigned int len;
240 unsigned char data[4];
241 bool valid;
242
243 bool is_valid() const
244 {
245 return valid && len >= 1 && len <= sizeof(data);
246 }
247
248 std::string str(const char *malformed)
249 {
250 if (is_valid())
251 return std::string((char *)data, len);
252 else
253 return malformed;
254 }
255 };
256
257 UTF8Iterator(const std::string &str_arg)
258 : str((unsigned char *)str_arg.c_str()),
259 size(str_arg.length())
260 {
261 }
262
263 bool get(Char &c)
264 {
265 if (size)
266 {
267 unsigned int len = std::min((unsigned int)trailingBytesForUTF8[*str] + 1,
268 (unsigned int)size);
269 if (isLegalUTF8(str, len))
270 {
271 c.valid = true;
272 c.len = std::min(len, (unsigned int)sizeof(c.data));
273 std::memcpy(c.data, str, c.len);
274 }
275 else
276 {
277 c.valid = false;
278 c.len = 1;
279 }
280 str += c.len;
281 size -= c.len;
282 return true;
283 }
284 else
285 return false;
286 }
287
288 private:
289 const unsigned char *str;
290 size_t size;
291};
292} // namespace openvpn::Unicode
293
294#endif
static Ptr Create(ArgsT &&...args)
Creates a new instance of RcEnable with the given arguments.
Definition make_rc.hpp:43
const unsigned char * str
Definition unicode.hpp:289
UTF8Iterator(const std::string &str_arg)
Definition unicode.hpp:257
#define OPENVPN_SIMPLE_EXCEPTION(C)
Definition exception.hpp:75
constexpr BufferFlags ARRAY(1u<< 3)
if enabled, use as array
STRING utf8_printable(const STRING &str, size_t max_len_flags)
Definition unicode.hpp:129
void conversion_result_throw(const ConversionResult res)
Definition unicode.hpp:196
const char trailingBytesForUTF8[256]
bool is_valid_utf8_uchar_buf(const unsigned char *source, size_t size, const size_t max_len_flags=0)
Definition unicode.hpp:41
bool is_valid_utf8(const STRING &str, const size_t max_len_flags=0)
Definition unicode.hpp:75
ConversionResult ConvertUTF8toUTF16(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF16 **targetStart, UTF16 *targetEnd, ConversionFlags flags)
int utf8_index(STRING &str, size_t &index)
Definition unicode.hpp:89
size_t utf8_length(const STRING &str)
Definition unicode.hpp:179
void utf8_truncate(STRING &str, size_t max_len)
Definition unicode.hpp:112
unsigned char UTF8
BufferPtr string_to_utf16(const STRING &str)
Definition unicode.hpp:213
unsigned short UTF16
bool isLegalUTF8(const UTF8 *source, int length)
std::string str(const char *malformed)
Definition unicode.hpp:248
os<< "Session Name: "<< tbc-> session_name<< '\n';os<< "Layer: "<< tbc-> layer str()<< '\n'
std::string ret