1 /* -*- Mode: C++; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /*
3 *     Copyright 2015 Couchbase, Inc
4 *
5 *   Licensed under the Apache License, Version 2.0 (the "License");
6 *   you may not use this file except in compliance with the License.
7 *   You may obtain a copy of the License at
8 *
9 *       http://www.apache.org/licenses/LICENSE-2.0
10 *
11 *   Unless required by applicable law or agreed to in writing, software
12 *   distributed under the License is distributed on an "AS IS" BASIS,
13 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 *   See the License for the specific language governing permissions and
15 *   limitations under the License.
16 */
17 
18 #ifndef SUBDOC_UESCAPE_H
19 #define SUBDOC_UESCAPE_H
20 
21 namespace Subdoc {
22 
/**
 * Rewrites "\uXXXX" escape sequences found in an input buffer as UTF-8 and
 * appends the result to a caller-supplied string. Bytes that are not part
 * of a "\u" escape are appended unchanged.
 *
 * The converter only keeps a pointer to the input and a reference to the
 * output, so both must outlive the converter.
 */
class UescapeConverter {
public:
    /**
     * Outcome of a conversion. Evaluates to true only when the code is
     * SUCCESS; implicitly constructible from a Code so that functions can
     * simply `return Status::SUCCESS;`.
     */
    class Status {
    public:
        enum Code {
            SUCCESS,
            INCOMPLETE_SURROGATE, // End of string encountered with incomplete surrogate
            INVALID_SURROGATE, // Invalid surrogate pair
            EMBEDDED_NUL, // Escape decoded to 0x0000
            INVALID_HEXCHARS, // "\u" not followed by four hex digits
            INVALID_CODEPOINT // Low surrogate with no preceding high surrogate
        };

        Status(Code code) : m_code(code) {}

        /** @return the raw status code */
        Code code() const {
            return m_code;
        }

        /** @return true if and only if the code is SUCCESS */
        operator bool() const {
            return code() == SUCCESS;
        }

    private:
        Code m_code;
    };

    /**
     * @param s   start of the input buffer (not copied)
     * @param n   number of bytes readable at s
     * @param out string that receives the converted bytes (appended to)
     */
    UescapeConverter(const char *s, size_t n, std::string& out)
    : m_inbuf(s), m_inlen(n), m_out(out) {
    }

    /**
     * @param in  input string (its buffer is referenced, not copied)
     * @param out string that receives the converted bytes (appended to)
     */
    UescapeConverter(const std::string& in, std::string& out)
    : UescapeConverter(in.c_str(), in.size(), out) {
    }

    /** Run the conversion over the whole input buffer. */
    inline Status convert();

    /** One-shot helper: convert the n bytes at s, appending to out. */
    static Status convert(const char *s, size_t n, std::string& out) {
        return UescapeConverter(s, n, out).convert();
    }

    /** One-shot helper: convert the contents of in, appending to out. */
    static Status convert(const std::string& in, std::string &out) {
        return UescapeConverter(in, out).convert();
    }


private:
    // True if the bytes at pos begin a "\u" sequence
    inline bool is_uescape(size_t pos);
    // Append the UTF-8 encoding of pt to the output
    inline void append_utf8(char32_t pt);
    // Decode the escape whose backslash is at pos
    inline Status handle_uescape(size_t pos);

    const char *m_inbuf;
    size_t m_inlen;
    std::string& m_out;
    // High surrogate waiting for its low half; 0 when none is pending
    char16_t last_codepoint = 0;
};
81 
82 UescapeConverter::Status
convert()83 UescapeConverter::convert()
84 {
85     for (size_t ii = 0; ii < m_inlen; ii++) {
86         if (is_uescape(ii)) {
87             Status st = handle_uescape(ii);
88             if (!st) {
89                 return st;
90             }
91 
92             // Skip over the 6-1 characters of {\,u,x,x,x,x}
93             ii += 5;
94         } else {
95             m_out += m_inbuf[ii];
96         }
97     }
98     if (last_codepoint) {
99         return Status::INCOMPLETE_SURROGATE;
100     }
101     return Status::SUCCESS;
102 }
103 
104 bool
is_uescape(size_t pos)105 UescapeConverter::is_uescape(size_t pos)
106 {
107     if (m_inbuf[pos] != '\\') {
108         return false;
109     }
110     if (pos == m_inlen - 1) {
111         return false;
112     }
113     if (m_inbuf[pos+1] == 'u') {
114         return true;
115     }
116     return false;
117 }
118 
119 void
append_utf8(char32_t pt)120 UescapeConverter::append_utf8(char32_t pt)
121 {
122     if (pt < 0x80) {
123         m_out += static_cast<char>(pt);
124     } else if (pt < 0x800) {
125         // 110xxxxxx 10xxxxxx
126         m_out += static_cast<char>((pt >> 6) | 0xC0);
127         // Write the remaining 6 bytes, and set higher order 11
128         m_out += static_cast<char>((pt & 0x3F) | 0x80);
129     } else if (pt < 0x10000) {
130         // 1110xxxx 10xxxxxx 10xxxxxx
131         m_out += static_cast<char>((pt >> 12) | 0xE0);
132         m_out += static_cast<char>(((pt >> 6) & 0x3F) | 0x80);
133         m_out += static_cast<char>((pt & 0x3F) | 0x80);
134     } else {
135         // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
136         m_out += static_cast<char>((pt >> 18) | 0xF0);
137         m_out += static_cast<char>(((pt >> 12) & 0x3F) | 0x80);
138         m_out += static_cast<char>(((pt >> 6) & 0x3F) | 0x80);
139         m_out += static_cast<char>((pt & 0x3F) | 0x80);
140     }
141 }
142 
143 UescapeConverter::Status
handle_uescape(size_t pos)144 UescapeConverter::handle_uescape(size_t pos)
145 {
146     pos += 2; // Swallow '\u'
147     if (m_inlen - pos < 4) {
148         return Status::INVALID_HEXCHARS; // too small
149     }
150 
151     char16_t res = 0;
152 
153     for (size_t ii = pos; ii < pos+4; ii++) {
154         char numbuf[2] = { m_inbuf[ii], 0 };
155         char *endptr = NULL;
156 
157         long rv = strtol(numbuf, &endptr, 16);
158         if (endptr && *endptr != '\0') {
159             return Status::INVALID_HEXCHARS;
160         }
161         if (rv == 0 && res == 0) {
162             continue;
163         }
164         res <<= 4;
165         res |= rv;
166     }
167 
168     // From RFC 2781:
169     // 2.2 Decoding UTF-16
170     //    1) If W1 < 0xD800 or W1 > 0xDFFF, the character value U is the value
171     //       of W1. Terminate.
172     //
173     //    2) Determine if W1 is between 0xD800 and 0xDBFF. If not, the sequence
174     //       is in error and no valid character can be obtained using W1.
175     //       Terminate.
176     //
177     //    3) If there is no W2 (that is, the sequence ends with W1), or if W2
178     //       is not between 0xDC00 and 0xDFFF, the sequence is in error.
179     //       Terminate.
180     //
181     //    4) Construct a 20-bit unsigned integer U', taking the 10 low-order
182     //       bits of W1 as its 10 high-order bits and the 10 low-order bits of
183     //       W2 as its 10 low-order bits.
184     //    5) Add 0x10000 to U' to obtain the character value U. Terminate.
185     if (res == 0x00) {
186         return Status::EMBEDDED_NUL;
187     } else if (last_codepoint) {
188         if (res < 0xDC00 || res > 0xDFFF) {
189             return Status::INVALID_SURROGATE; // error
190         }
191 
192         char16_t w1 = last_codepoint;
193         char16_t w2 = res;
194 
195         // 10 low bits of w1 as its 10 high bits
196         char32_t cp;
197         cp = (w1 & 0x3FF) << 10;
198         // 10 low bits of w2 as its 20 low bits
199         cp |= (w2 & 0x3FF);
200 
201         // Add 0x10000
202         cp += 0x10000;
203         append_utf8(cp);
204         last_codepoint = 0;
205 
206     } else if (res < 0xD800 || res > 0xDFFF) {
207         append_utf8(res);
208     } else if (res > 0xD7FF && res < 0xDC00) {
209         last_codepoint = res;
210     } else {
211         return Status::INVALID_CODEPOINT;
212     }
213 
214     return Status::SUCCESS;
215 }
216 
217 }
218 #endif
219