1 /*
2  * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
3  *
4  * Permission to use, copy, modify, and/or distribute this software for any
5  * purpose with or without fee is hereby granted, provided that the above
6  * copyright notice and this permission notice appear in all copies.
7  *
8  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15  */
16 
17 #include "yajl_encode.h"
18 
19 #include <assert.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #include <stdio.h>
23 
/*
 * Render one byte as two upper-case hexadecimal digits.
 *
 * c      - the byte to render.
 * hexBuf - receives exactly two characters: the high nibble in hexBuf[0]
 *          and the low nibble in hexBuf[1].  No terminator is written.
 */
static void CharToHex(unsigned char c, char * hexBuf)
{
    static const char digits[] = "0123456789ABCDEF";
    const unsigned int high = c >> 4;
    const unsigned int low = c & 0x0F;

    hexBuf[0] = digits[high];
    hexBuf[1] = digits[low];
}
30 
31 void
yajl_string_encode(const yajl_print_t print, void * ctx, const unsigned char * str, size_t len, int escape_solidus)32 yajl_string_encode(const yajl_print_t print,
33                    void * ctx,
34                    const unsigned char * str,
35                    size_t len,
36                    int escape_solidus)
37 {
38     size_t beg = 0;
39     size_t end = 0;
40     size_t escaped_len;
41     char escBuf[3];
42     char hexBuf[7];
43     escBuf[0] = '\\';
44     escBuf[2] = 0;
45     hexBuf[0] = '\\'; hexBuf[1] = 'u'; hexBuf[2] = '0'; hexBuf[3] = '0';
46     hexBuf[6] = 0;
47 
48     while (end < len) {
49         const unsigned char chr = str[end];
50         const char * escaped = NULL;
51         escBuf[1] = 0;
52         /* we're not going to be escaping most characters, so first
53          * check for this common case (of doing nothing).  Doing so
54          * decreases the runtime of this function by around 30%.
55          */
56         if (chr > '/') {
57             /* it is not required to escape a solidus in JSON:
58              * read sec. 2.5: http://www.ietf.org/rfc/rfc4627.txt
59              * specifically, this production from the grammar:
60              *   unescaped = %x20-21 / %x23-5B / %x5D-10FFFF
61              */
62             if (chr == '\\') {
63                 escBuf[1] = '\\';
64             }
65         } else if (chr == ' ' || (chr <  '/' && chr > ')')) {
66             /* skip these fairly common cases */
67         } else {
68             switch (chr) {
69             case '\r': escBuf[1] = 'r'; break;
70             case '\n': escBuf[1] = 'n'; break;
71             case '"':  escBuf[1] = '"'; break;
72             case '\t': escBuf[1] = 't'; break;
73             case '\f': escBuf[1] = 'f'; break;
74             case '\b': escBuf[1] = 'b'; break;
75             case '/': if (escape_solidus) escBuf[1] = '/'; break;
76             default:
77                 if (chr < 32) {
78                     CharToHex(chr, hexBuf + 4);
79                     escaped = hexBuf;
80                     escaped_len = 6;
81                 }
82                 break;
83             }
84         }
85         if (escBuf[1] != 0) {
86                 escaped = escBuf;
87                 escaped_len = 2;
88         }
89         if (escaped != NULL) {
90             print(ctx, (const char *) (str + beg), end - beg);
91             print(ctx, escaped, escaped_len);
92             beg = ++end;
93         } else {
94             ++end;
95         }
96     }
97     print(ctx, (const char *) (str + beg), end - beg);
98 }
99 
/*
 * Fold four hexadecimal characters into *val.
 *
 * For each of hex[0..3], *val is shifted left by four bits and the digit's
 * value is OR-ed into the low nibble.  *val is NOT cleared first, so callers
 * wanting a plain 16-bit result must pass in zero.
 *
 * Both upper- and lower-case letters are accepted.  A character that is not
 * a hex digit trips the assert (and is silently folded in when asserts are
 * compiled out).
 */
static void hexToDigit(unsigned int * val, const unsigned char * hex)
{
    const unsigned char * const stop = hex + 4;

    while (hex < stop) {
        unsigned char digit = *hex++;

        if (digit >= 'A') {
            /* clear the case bit, then close the gap between '9' and 'A' */
            digit = (unsigned char) ((digit & ~0x20) - 7);
        }
        digit = (unsigned char) (digit - '0');
        assert(!(digit & 0xF0));
        *val = (*val << 4) | digit;
    }
}
111 
/*
 * Encode a single code point as a NUL-terminated UTF-8 sequence.
 *
 * codepoint - the value to encode.
 * utf8Buf   - output buffer; up to five bytes are written (four encoded
 *             bytes plus the terminator).
 *
 * Values below 0x80, 0x800, 0x10000 and 0x200000 produce one, two, three
 * and four bytes respectively.  Anything at or above 0x200000 cannot be
 * represented and yields the single character '?'.
 */
static void Utf32toUtf8(unsigned int codepoint, char * utf8Buf)
{
    char * out = utf8Buf;

    if (codepoint >= 0x200000) {
        *out++ = '?';
    } else if (codepoint >= 0x10000) {
        *out++ = (char) ((codepoint >> 18) | 0xF0);
        *out++ = (char) (((codepoint >> 12) & 0x3F) | 0x80);
        *out++ = (char) (((codepoint >> 6) & 0x3F) | 0x80);
        *out++ = (char) ((codepoint & 0x3F) | 0x80);
    } else if (codepoint >= 0x0800) {
        *out++ = (char) ((codepoint >> 12) | 0xE0);
        *out++ = (char) (((codepoint >> 6) & 0x3F) | 0x80);
        *out++ = (char) ((codepoint & 0x3F) | 0x80);
    } else if (codepoint >= 0x80) {
        *out++ = (char) ((codepoint >> 6) | 0xC0);
        *out++ = (char) ((codepoint & 0x3F) | 0x80);
    } else {
        *out++ = (char) codepoint;
    }

    *out = 0;
}
137 
/*
 * Decode the JSON escape sequences in the `len` bytes at `str` and append
 * the resulting bytes to `buf`.
 *
 * buf      - destination buffer; written only through yajl_buf_append.
 * str, len - the escaped string contents (without surrounding quotes).
 *
 * Unescaped runs are copied through verbatim.  The two-character escapes
 * (\r \n \\ \/ \" \f \b \t) become the byte they denote.  \uXXXX becomes the
 * UTF-8 encoding of the code point, with \u0000 appended as a single NUL
 * byte.
 *
 * A high surrogate (D800-DBFF) is combined with the escape that follows it
 * only when that escape lies entirely inside `len` and is a low surrogate
 * (DC00-DFFF).  Otherwise the high surrogate alone is replaced by '?' and
 * decoding resumes at the byte immediately after it, so whatever follows is
 * neither swallowed nor merged into a bogus code point.
 *
 * NOTE(review): every backslash is assumed to start a complete, well-formed
 * escape (presumably guaranteed by the lexer - confirm against callers); a
 * backslash at the very end of the input or a truncated \u escape is not
 * guarded against here.
 */
void yajl_string_decode(yajl_buf buf, const unsigned char * str,
                        size_t len)
{
    size_t beg = 0;
    size_t end = 0;

    while (end < len) {
        if (str[end] == '\\') {
            char utf8Buf[5];
            const char * unescaped = "?";
            /* flush the literal run that precedes the backslash */
            yajl_buf_append(buf, str + beg, end - beg);
            switch (str[++end]) {
                case 'r': unescaped = "\r"; break;
                case 'n': unescaped = "\n"; break;
                case '\\': unescaped = "\\"; break;
                case '/': unescaped = "/"; break;
                case '"': unescaped = "\""; break;
                case 'f': unescaped = "\f"; break;
                case 'b': unescaped = "\b"; break;
                case 't': unescaped = "\t"; break;
                case 'u': {
                    unsigned int codepoint = 0;
                    hexToDigit(&codepoint, str + ++end);
                    /* end now indexes the last of the four hex digits */
                    end += 3;
                    /* check if this is a (high) surrogate */
                    if ((codepoint & 0xFC00) == 0xD800) {
                        unsigned int surrogate = 0;
                        /* Peek, without consuming anything, for a full
                         * "\uXXXX" (six bytes) directly after this escape
                         * and still inside the input. */
                        if (end < len && len - end > 6 &&
                            str[end + 1] == '\\' && str[end + 2] == 'u') {
                            hexToDigit(&surrogate, str + end + 3);
                        }
                        if ((surrogate & 0xFC00) != 0xDC00) {
                            /* Unpaired high surrogate: emit '?' for it and
                             * leave `end` on its last hex digit so that the
                             * following bytes are decoded on their own. */
                            unescaped = "?";
                            break;
                        }
                        codepoint =
                            (((codepoint & 0x3F) << 10) |
                             ((((codepoint >> 6) & 0xF) + 1) << 16) |
                             (surrogate & 0x3FF));
                        /* skip to the last hex digit of the low surrogate */
                        end += 6;
                    }

                    Utf32toUtf8(codepoint, utf8Buf);
                    unescaped = utf8Buf;

                    /* strlen() below would see an empty string for NUL,
                     * so append that single byte explicitly */
                    if (codepoint == 0) {
                        yajl_buf_append(buf, unescaped, 1);
                        beg = ++end;
                        continue;
                    }

                    break;
                }
                default:
                    assert("this should never happen" == NULL);
            }
            yajl_buf_append(buf, unescaped, (unsigned int)strlen(unescaped));
            beg = ++end;
        } else {
            end++;
        }
    }
    /* trailing literal run */
    yajl_buf_append(buf, str + beg, end - beg);
}
201 
/*
 * Check whether the `len` bytes at `s` form well-formed UTF-8.
 *
 * s   - bytes to validate; may be NULL only when len is 0.
 * len - number of bytes.
 *
 * Returns 1 when the input is valid (an empty input is valid), 0 otherwise.
 *
 * Besides the lead/continuation byte patterns, each multi-byte sequence is
 * decoded and rejected when it is
 *   - truncated by the end of the input,
 *   - an overlong form (a value that fits in a shorter sequence),
 *   - a UTF-16 surrogate (U+D800..U+DFFF), or
 *   - above U+10FFFF,
 * none of which are permitted by RFC 3629.
 */
int yajl_string_validate_utf8(const unsigned char * s, size_t len)
{
    size_t i = 0;

    if (!len) return 1;
    if (!s) return 0;

    while (i < len) {
        const unsigned char lead = s[i];
        unsigned int codepoint;
        unsigned int smallest;  /* lowest value legal for this length */
        size_t extra;           /* number of continuation bytes expected */
        size_t k;

        /* single byte */
        if (lead <= 0x7f) {
            i++;
            continue;
        }

        if ((lead >> 5) == 0x6) {
            /* two byte */
            extra = 1;
            smallest = 0x80;
            codepoint = lead & 0x1Fu;
        } else if ((lead >> 4) == 0x0e) {
            /* three byte */
            extra = 2;
            smallest = 0x800;
            codepoint = lead & 0x0Fu;
        } else if ((lead >> 3) == 0x1e) {
            /* four byte */
            extra = 3;
            smallest = 0x10000;
            codepoint = lead & 0x07u;
        } else {
            /* stray continuation byte or invalid lead byte */
            return 0;
        }

        /* i < len here, so the subtraction cannot wrap */
        if (len - i <= extra) return 0;

        for (k = 1; k <= extra; k++) {
            const unsigned char cont = s[i + k];
            if ((cont >> 6) != 0x2) return 0;
            codepoint = (codepoint << 6) | (cont & 0x3Fu);
        }

        if (codepoint < smallest) return 0;                        /* overlong */
        if (codepoint > 0x10FFFF) return 0;                        /* out of range */
        if (codepoint >= 0xD800 && codepoint <= 0xDFFF) return 0;  /* surrogate */

        i += extra + 1;
    }

    return 1;
}
243