1 /*
2 * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
3 *
4 * Permission to use, copy, modify, and/or distribute this software for any
5 * purpose with or without fee is hereby granted, provided that the above
6 * copyright notice and this permission notice appear in all copies.
7 *
8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15 */
16
17 #include "yajl_encode.h"
18
19 #include <assert.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #include <stdio.h>
23
/*
 * Render one byte as two uppercase hexadecimal digits.
 *
 * c      - the byte to render
 * hexBuf - receives exactly two characters (high nibble first); no NUL
 *          terminator is written
 */
static void CharToHex(unsigned char c, char * hexBuf)
{
    static const char digits[] = "0123456789ABCDEF";
    const unsigned int highNibble = c >> 4;
    const unsigned int lowNibble = c & 0x0F;

    hexBuf[0] = digits[highNibble];
    hexBuf[1] = digits[lowNibble];
}
30
31 void
yajl_string_encode(const yajl_print_t print, void * ctx, const unsigned char * str, size_t len, int escape_solidus)32 yajl_string_encode(const yajl_print_t print,
33 void * ctx,
34 const unsigned char * str,
35 size_t len,
36 int escape_solidus)
37 {
38 size_t beg = 0;
39 size_t end = 0;
40 size_t escaped_len;
41 char escBuf[3];
42 char hexBuf[7];
43 escBuf[0] = '\\';
44 escBuf[2] = 0;
45 hexBuf[0] = '\\'; hexBuf[1] = 'u'; hexBuf[2] = '0'; hexBuf[3] = '0';
46 hexBuf[6] = 0;
47
48 while (end < len) {
49 const unsigned char chr = str[end];
50 const char * escaped = NULL;
51 escBuf[1] = 0;
52 /* we're not going to be escaping most characters, so first
53 * check for this common case (of doing nothing). Doing so
54 * decreases the runtime of this function by around 30%.
55 */
56 if (chr > '/') {
57 /* it is not required to escape a solidus in JSON:
58 * read sec. 2.5: http://www.ietf.org/rfc/rfc4627.txt
59 * specifically, this production from the grammar:
60 * unescaped = %x20-21 / %x23-5B / %x5D-10FFFF
61 */
62 if (chr == '\\') {
63 escBuf[1] = '\\';
64 }
65 } else if (chr == ' ' || (chr < '/' && chr > ')')) {
66 /* skip these fairly common cases */
67 } else {
68 switch (chr) {
69 case '\r': escBuf[1] = 'r'; break;
70 case '\n': escBuf[1] = 'n'; break;
71 case '"': escBuf[1] = '"'; break;
72 case '\t': escBuf[1] = 't'; break;
73 case '\f': escBuf[1] = 'f'; break;
74 case '\b': escBuf[1] = 'b'; break;
75 case '/': if (escape_solidus) escBuf[1] = '/'; break;
76 default:
77 if (chr < 32) {
78 CharToHex(chr, hexBuf + 4);
79 escaped = hexBuf;
80 escaped_len = 6;
81 }
82 break;
83 }
84 }
85 if (escBuf[1] != 0) {
86 escaped = escBuf;
87 escaped_len = 2;
88 }
89 if (escaped != NULL) {
90 print(ctx, (const char *) (str + beg), end - beg);
91 print(ctx, escaped, escaped_len);
92 beg = ++end;
93 } else {
94 ++end;
95 }
96 }
97 print(ctx, (const char *) (str + beg), end - beg);
98 }
99
/*
 * Fold four hexadecimal characters into *val, most significant first.
 *
 * val - accumulator; each digit shifts the existing value left by four
 *       bits before being OR-ed in, so callers wanting a plain 16-bit
 *       result must start from 0
 * hex - at least four readable characters, each of which must be a hex
 *       digit in either case (checked with assert only)
 */
static void hexToDigit(unsigned int * val, const unsigned char * hex)
{
    const unsigned char * const stop = hex + 4;

    while (hex < stop) {
        unsigned char nibble = *hex++;

        /* letters: clear the lowercase bit, then close the gap between
         * '9' and 'A' so that 'A' lands directly after '9' */
        if (nibble >= 'A') {
            nibble = (unsigned char) ((nibble & ~0x20) - 7);
        }
        nibble = (unsigned char) (nibble - '0');
        assert((nibble & 0xF0) == 0);
        *val = (*val << 4) | nibble;
    }
}
111
/*
 * Encode one code point as a NUL-terminated UTF-8 sequence.
 *
 * codepoint - value to encode; anything at or above 0x200000 is replaced
 *             by a single '?'
 * utf8Buf   - receives one to four bytes followed by a NUL, so it must
 *             hold at least five chars
 */
static void Utf32toUtf8(unsigned int codepoint, char * utf8Buf)
{
    unsigned int trailing;   /* number of continuation bytes */
    unsigned int leadMark;   /* length marker bits of the first byte */
    unsigned int i;

    if (codepoint < 0x80) {
        trailing = 0;
        leadMark = 0x00;
    } else if (codepoint < 0x0800) {
        trailing = 1;
        leadMark = 0xC0;
    } else if (codepoint < 0x10000) {
        trailing = 2;
        leadMark = 0xE0;
    } else if (codepoint < 0x200000) {
        trailing = 3;
        leadMark = 0xF0;
    } else {
        utf8Buf[0] = '?';
        utf8Buf[1] = 0;
        return;
    }

    /* the lead byte carries the bits left over above the continuation
     * bytes; each continuation byte then carries six bits, high to low */
    utf8Buf[0] = (char) ((codepoint >> (6 * trailing)) | leadMark);
    for (i = 1; i <= trailing; i++) {
        const unsigned int shift = 6 * (trailing - i);
        utf8Buf[i] = (char) (((codepoint >> shift) & 0x3F) | 0x80);
    }
    utf8Buf[trailing + 1] = 0;
}
137
/*
 * Decode the JSON escape sequences in str[0..len) and append the decoded
 * bytes to buf.
 *
 * buf      - destination buffer; all output goes through yajl_buf_append
 * str, len - string contents (without the surrounding quotes)
 *
 * Runs of bytes that contain no backslash are appended unchanged.  \uXXXX
 * escapes are converted to UTF-8; a high surrogate combines with the
 * escape that follows it only if that escape exists within `len` and is a
 * low surrogate (\uDC00-\uDFFF).  An unpaired high surrogate is replaced
 * by '?' and whatever follows it is decoded normally.  \u0000 appends a
 * single NUL byte.
 *
 * NOTE(review): apart from surrogate pairing, the escapes are presumably
 * validated before this function is called (a backslash is always
 * followed by a known escape character, \u by four hex digits) -- the
 * default case below only asserts.  Confirm against the caller.
 */
void yajl_string_decode(yajl_buf buf, const unsigned char * str,
                        size_t len)
{
    size_t beg = 0;   /* start of the literal run not yet appended */
    size_t end = 0;   /* current scan position */

    while (end < len) {
        if (str[end] == '\\') {
            char utf8Buf[5];
            const char * unescaped = "?";
            /* flush the literal bytes that precede the backslash */
            yajl_buf_append(buf, str + beg, end - beg);
            switch (str[++end]) {
                case 'r': unescaped = "\r"; break;
                case 'n': unescaped = "\n"; break;
                case '\\': unescaped = "\\"; break;
                case '/': unescaped = "/"; break;
                case '"': unescaped = "\""; break;
                case 'f': unescaped = "\f"; break;
                case 'b': unescaped = "\b"; break;
                case 't': unescaped = "\t"; break;
                case 'u': {
                    unsigned int codepoint = 0;
                    hexToDigit(&codepoint, str + ++end);
                    /* end now indexes the last of the four hex digits */
                    end += 3;
                    /* check if this is a (high) surrogate */
                    if ((codepoint & 0xFC00) == 0xD800) {
                        unsigned int surrogate = 0;
                        /* The partner escape occupies the six bytes after
                         * `end`; only look at them when they all lie
                         * inside the input. */
                        if (len - end > 6 &&
                            str[end + 1] == '\\' && str[end + 2] == 'u') {
                            hexToDigit(&surrogate, str + end + 3);
                        }
                        if ((surrogate & 0xFC00) != 0xDC00) {
                            /* Unpaired: emit '?' and leave `end` on the
                             * last hex digit so that the bytes following
                             * this escape are decoded, not skipped. */
                            unescaped = "?";
                            break;
                        }
                        codepoint =
                            (((codepoint & 0x3F) << 10) |
                             ((((codepoint >> 6) & 0xF) + 1) << 16) |
                             (surrogate & 0x3FF));
                        /* step over the whole second escape */
                        end += 6;
                    }

                    Utf32toUtf8(codepoint, utf8Buf);
                    unescaped = utf8Buf;

                    /* U+0000 encodes to an empty C string, so strlen()
                     * below would drop it; append the NUL explicitly */
                    if (codepoint == 0) {
                        yajl_buf_append(buf, unescaped, 1);
                        beg = ++end;
                        continue;
                    }

                    break;
                }
                default:
                    /* with assertions disabled this falls through to
                     * appending the initial "?" */
                    assert("this should never happen" == NULL);
            }
            yajl_buf_append(buf, unescaped, (unsigned int)strlen(unescaped));
            beg = ++end;
        } else {
            end++;
        }
    }
    /* trailing literal run (possibly empty) */
    yajl_buf_append(buf, str + beg, end - beg);
}
201
/*
 * Step `s` to the next byte of the sequence being validated, returning 0
 * ("invalid") from the enclosing function when the input ends first.
 * Relies on locals named `s` and `len`, where `len` counts the bytes that
 * remain after the one `s` points at.  Wrapped in do/while(0) so that it
 * behaves as a single statement.
 */
#define ADV_PTR do { s++; if (!(len--)) return 0; } while (0)

/*
 * Check whether s[0..len) is well-formed UTF-8.
 *
 * Returns 1 when the input is valid (an empty input is valid even if `s`
 * is NULL) and 0 otherwise.  Besides truncated sequences and bad
 * continuation bytes, the following are rejected:
 *   - overlong encodings (lead bytes 0xC0/0xC1, 0xE0 followed by
 *     0x80-0x9F, 0xF0 followed by 0x80-0x8F)
 *   - encoded UTF-16 surrogates U+D800-U+DFFF (0xED followed by 0xA0-0xBF)
 *   - values above U+10FFFF (0xF4 followed by 0x90-0xBF, lead bytes
 *     0xF5 and up)
 */
int yajl_string_validate_utf8(const unsigned char * s, size_t len)
{
    if (!len) return 1;
    if (!s) return 0;

    while (len--) {
        const unsigned char lead = *s;

        if (lead <= 0x7f) {
            /* single byte */
        } else if (lead >= 0xc2 && lead <= 0xdf) {
            /* two byte */
            ADV_PTR;
            if ((*s >> 6) != 0x2) return 0;
        } else if ((lead >> 4) == 0x0e) {
            /* three byte */
            ADV_PTR;
            if ((*s >> 6) != 0x2) return 0;
            /* overlong form of a value below U+0800 */
            if (lead == 0xe0 && *s < 0xa0) return 0;
            /* U+D800-U+DFFF are surrogates, not characters */
            if (lead == 0xed && *s > 0x9f) return 0;
            ADV_PTR;
            if ((*s >> 6) != 0x2) return 0;
        } else if (lead >= 0xf0 && lead <= 0xf4) {
            /* four byte */
            ADV_PTR;
            if ((*s >> 6) != 0x2) return 0;
            /* overlong form of a value below U+10000 */
            if (lead == 0xf0 && *s < 0x90) return 0;
            /* beyond U+10FFFF */
            if (lead == 0xf4 && *s > 0x8f) return 0;
            ADV_PTR;
            if ((*s >> 6) != 0x2) return 0;
            ADV_PTR;
            if ((*s >> 6) != 0x2) return 0;
        } else {
            /* stray continuation byte or a lead byte that can never
             * start a valid sequence */
            return 0;
        }

        s++;
    }

    return 1;
}
243