1 /* -*- Mode: C++; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /*
3 * Copyright 2015 Couchbase, Inc
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 #ifndef SUBDOC_UESCAPE_H
19 #define SUBDOC_UESCAPE_H
20
namespace Subdoc {

/**
 * Expands JSON-style `\uXXXX` escape sequences into UTF-8.
 *
 * Every input byte that is not part of a `\uXXXX` sequence is copied to the
 * output unchanged (other backslash escapes are NOT interpreted). UTF-16
 * surrogate pairs written as two consecutive escapes (`\uD834\uDD1E`) are
 * combined into a single code point before being encoded.
 *
 * The converter keeps a pointer to the input buffer and a reference to the
 * output string; both must outlive the converter. Output is appended to
 * `out` as it is produced, so on failure `out` holds whatever was converted
 * before the error was detected.
 */
class UescapeConverter {
public:
    /**
     * Outcome of a conversion. Evaluates to `true` in a boolean context only
     * when the code is SUCCESS; use code() to find out why a conversion
     * failed.
     */
    class Status {
    public:
        enum Code {
            SUCCESS,
            INCOMPLETE_SURROGATE, // End of string encountered with incomplete surrogate
            INVALID_SURROGATE, // High surrogate not directly followed by a low surrogate escape
            EMBEDDED_NUL, // Escape sequence decodes to U+0000
            INVALID_HEXCHARS, // "\u" not followed by exactly four hex digits
            INVALID_CODEPOINT // Low surrogate without a preceding high surrogate
        };

        /** @return true if and only if the status is SUCCESS */
        operator bool() const {
            return m_code == SUCCESS;
        }

        /** @return the detailed status code */
        Code code() const {
            return m_code;
        }

        // Deliberately implicit so that functions returning Status can
        // simply `return Status::SOME_CODE;`
        Status(Code code) : m_code(code) {}

    private:
        Code m_code;
    };

    /**
     * @param in  string containing the escaped text; its buffer is
     *            referenced, not copied
     * @param out string to which the converted text is appended
     */
    UescapeConverter(const std::string& in, std::string& out)
        : m_inbuf(in.c_str()), m_inlen(in.size()), m_out(out) {
    }

    /**
     * @param s   buffer containing the escaped text (need not be
     *            NUL-terminated)
     * @param n   number of bytes in `s`
     * @param out string to which the converted text is appended
     */
    UescapeConverter(const char *s, size_t n, std::string& out)
        : m_inbuf(s), m_inlen(n), m_out(out) {
    }

    /**
     * Run the conversion over the whole input.
     * @return SUCCESS, or the code describing the first error encountered
     */
    inline Status convert();

    /** Convenience wrapper: convert `n` bytes at `s`, appending to `out`. */
    static Status convert(const char *s, size_t n, std::string& out) {
        UescapeConverter conv(s, n, out);
        return conv.convert();
    }

    /** Convenience wrapper: convert `in`, appending to `out`. */
    static Status convert(const std::string& in, std::string &out) {
        UescapeConverter conv(in, out);
        return conv.convert();
    }

private:
    /**
     * Decode a single hexadecimal digit.
     * @return the digit's value (0-15), or -1 if `c` is not a hex digit
     */
    static int hex_value(char c) {
        if (c >= '0' && c <= '9') {
            return c - '0';
        }
        if (c >= 'a' && c <= 'f') {
            return c - 'a' + 10;
        }
        if (c >= 'A' && c <= 'F') {
            return c - 'A' + 10;
        }
        return -1;
    }

    inline bool is_uescape(size_t pos) const;
    inline void append_utf8(char32_t pt);
    inline Status handle_uescape(size_t pos);

    const char *m_inbuf;
    size_t m_inlen;
    std::string& m_out;

    // High surrogate seen in the previous escape and still waiting for its
    // low surrogate; 0 when nothing is pending.
    char16_t last_codepoint = 0;
};

inline UescapeConverter::Status
UescapeConverter::convert()
{
    // Always start without a pending surrogate, so that running the same
    // converter again after a failure does not see stale state.
    last_codepoint = 0;

    for (size_t ii = 0; ii < m_inlen; ii++) {
        if (is_uescape(ii)) {
            Status st = handle_uescape(ii);
            if (!st) {
                return st;
            }

            // Skip over the 6-1 characters of {\,u,x,x,x,x}
            ii += 5;
        } else if (last_codepoint) {
            // A high surrogate must be followed immediately by the escape
            // holding its low surrogate; anything else is malformed.
            return Status::INVALID_SURROGATE;
        } else {
            m_out += m_inbuf[ii];
        }
    }

    if (last_codepoint) {
        return Status::INCOMPLETE_SURROGATE;
    }
    return Status::SUCCESS;
}

/**
 * @param pos index into the input; must be less than the input length
 * @return true if the two bytes starting at `pos` are `\u`
 */
inline bool
UescapeConverter::is_uescape(size_t pos) const
{
    if (m_inbuf[pos] != '\\') {
        return false;
    }
    if (pos == m_inlen - 1) {
        // Backslash is the very last byte; nothing follows it
        return false;
    }
    return m_inbuf[pos + 1] == 'u';
}

/**
 * Append the UTF-8 encoding of the code point `pt` to the output.
 */
inline void
UescapeConverter::append_utf8(char32_t pt)
{
    if (pt < 0x80) {
        // 0xxxxxxx
        m_out += static_cast<char>(pt);
    } else if (pt < 0x800) {
        // 110xxxxx 10xxxxxx
        m_out += static_cast<char>((pt >> 6) | 0xC0);
        // Write the remaining 6 bits, prefixed with the continuation marker
        m_out += static_cast<char>((pt & 0x3F) | 0x80);
    } else if (pt < 0x10000) {
        // 1110xxxx 10xxxxxx 10xxxxxx
        m_out += static_cast<char>((pt >> 12) | 0xE0);
        m_out += static_cast<char>(((pt >> 6) & 0x3F) | 0x80);
        m_out += static_cast<char>((pt & 0x3F) | 0x80);
    } else {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        m_out += static_cast<char>((pt >> 18) | 0xF0);
        m_out += static_cast<char>(((pt >> 12) & 0x3F) | 0x80);
        m_out += static_cast<char>(((pt >> 6) & 0x3F) | 0x80);
        m_out += static_cast<char>((pt & 0x3F) | 0x80);
    }
}

/**
 * Decode the `\uXXXX` sequence starting at `pos` (the index of the
 * backslash) and either append the resulting character to the output or
 * remember it as a pending high surrogate.
 *
 * @return SUCCESS, or the code describing why the sequence was rejected
 */
inline UescapeConverter::Status
UescapeConverter::handle_uescape(size_t pos)
{
    pos += 2; // Swallow '\u'
    if (m_inlen - pos < 4) {
        return Status::INVALID_HEXCHARS; // too small
    }

    char16_t res = 0;
    for (size_t ii = pos; ii < pos + 4; ii++) {
        // Decoded by hand rather than with strtol(): strtol() reports an
        // empty string (i.e. a raw NUL byte here) as a successful parse of 0.
        const int digit = hex_value(m_inbuf[ii]);
        if (digit < 0) {
            return Status::INVALID_HEXCHARS;
        }
        res = static_cast<char16_t>((res << 4) | digit);
    }

    // From RFC 2781:
    // 2.2 Decoding UTF-16
    // 1) If W1 < 0xD800 or W1 > 0xDFFF, the character value U is the value
    //    of W1. Terminate.
    //
    // 2) Determine if W1 is between 0xD800 and 0xDBFF. If not, the sequence
    //    is in error and no valid character can be obtained using W1.
    //    Terminate.
    //
    // 3) If there is no W2 (that is, the sequence ends with W1), or if W2
    //    is not between 0xDC00 and 0xDFFF, the sequence is in error.
    //    Terminate.
    //
    // 4) Construct a 20-bit unsigned integer U', taking the 10 low-order
    //    bits of W1 as its 10 high-order bits and the 10 low-order bits of
    //    W2 as its 10 low-order bits.
    // 5) Add 0x10000 to U' to obtain the character value U. Terminate.
    if (res == 0x00) {
        return Status::EMBEDDED_NUL;
    }

    if (last_codepoint) {
        // Second half of a pair: it must be a low surrogate
        if (res < 0xDC00 || res > 0xDFFF) {
            return Status::INVALID_SURROGATE;
        }

        const char16_t w1 = last_codepoint;
        const char16_t w2 = res;

        // 10 low bits of w1 become the 10 high bits
        char32_t cp = static_cast<char32_t>(w1 & 0x3FF) << 10;
        // 10 low bits of w2 become the 10 low bits
        cp |= static_cast<char32_t>(w2 & 0x3FF);
        // Add 0x10000
        cp += 0x10000;

        append_utf8(cp);
        last_codepoint = 0;
        return Status::SUCCESS;
    }

    if (res < 0xD800 || res > 0xDFFF) {
        // Ordinary BMP character
        append_utf8(res);
        return Status::SUCCESS;
    }

    if (res < 0xDC00) {
        // High surrogate (0xD800-0xDBFF): wait for the low surrogate
        last_codepoint = res;
        return Status::SUCCESS;
    }

    // Low surrogate (0xDC00-0xDFFF) with no high surrogate before it
    return Status::INVALID_CODEPOINT;
}

}
218 #endif
219