1 /**
2  * @copyright 2012 Couchbase, Inc.
3  *
4  * @author Jens Alfke    <jens@couchbase.com>   (algorithm, original implementation)
5  * @author Filipe Manana <filipe@couchbase.com> (Erlang NIF port)
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
8  * use this file except in compliance with the License. You may obtain a copy of
9  * the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
15  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
16  * License for the specific language governing permissions and limitations under
17  * the License.
18  */
19 
20 #include <platform/cbassert.h>
21 #include <stdlib.h>
22 #include <stdio.h>
23 #include <string.h>
24 #include <ctype.h>
25 #include <unicode/ucol.h>
26 #include <unicode/ucasemap.h>
27 #include "couch_ejson_compare.h"
28 
29 
30 #define CMP(a, b) ((a) > (b) ? 1 : ((a) < (b) ? -1 : 0))
31 
32 
33 /* Ordered by collation order. */
34 typedef enum {
35     kEndArray,
36     kEndObject,
37     kComma,
38     kColon,
39     kNull,
40     kFalse,
41     kTrue,
42     kNumber,
43     kString,
44     kArray,
45     kObject,
46     kIllegal
47 } ValueType;
48 
49 
50 static __inline ValueType valueTypeOf(char);
51 static int compareStringsUnicode(const char **, const char **, couch_ejson_ctx_t *);
52 static const char * createStringFromJSON(const char **, int *, int *, couch_ejson_ctx_t *);
53 static int compareUnicode(const char *, int, const char *, int, couch_ejson_ctx_t *);
54 static __inline char ConvertJSONEscape(const char **, couch_ejson_ctx_t *);
55 static __inline int digitToInt(int );
56 
57 
58 static const char *MALLOC_ERROR         = "Failed to allocate memory";
59 static const char *UNEXPECTED_CHARACTER = "Unexpected character in key";
60 static const char *ICU_ERROR            = "ICU collator error";
61 
62 
less_json(const char *str1, const char *str2, couch_ejson_ctx_t *ctx)63 int less_json(const char *str1, const char *str2, couch_ejson_ctx_t *ctx)
64 {
65     int depth = 0;
66 
67     do {
68         /* Get the types of the next token in each string: */
69         ValueType type1 = valueTypeOf(*str1);
70         ValueType type2 = valueTypeOf(*str2);
71 
72         if (type1 == kIllegal || type2 == kIllegal) {
73             ctx->error = 1;
74             ctx->errorMsg = UNEXPECTED_CHARACTER;
75             return -1;
76         }
77 
78         /* If types don't match, stop and return their relative ordering: */
79         if (type1 != type2) {
80             return CMP(type1, type2);
81         } else {
82             /* If types match, compare the actual token values: */
83             switch (type1) {
84             case kNull:
85             case kTrue:
86                 str1 += 4;
87                 str2 += 4;
88                 break;
89             case kFalse:
90                 str1 += 5;
91                 str2 += 5;
92                 break;
93             case kNumber: {
94                 char *next1, *next2;
95                 int diff = CMP( strtod(str1, &next1), strtod(str2, &next2) );
96                 if (diff != 0) {
97                     return diff;
98                 }
99                 str1 = next1;
100                 str2 = next2;
101                 break;
102             }
103             case kString: {
104                 int diff = compareStringsUnicode(&str1, &str2, ctx);
105 
106                 if (ctx->error != 0) {
107                     return -1;
108                 }
109                 if (diff != 0) {
110                     return diff;
111                 }
112                 break;
113             }
114             case kArray:
115             case kObject:
116                 ++str1;
117                 ++str2;
118                 ++depth;
119                 break;
120             case kEndArray:
121             case kEndObject:
122                 ++str1;
123                 ++str2;
124                 --depth;
125                 break;
126             case kComma:
127             case kColon:
128                 ++str1;
129                 ++str2;
130                 break;
131             case kIllegal:
132                 return 0;
133             }
134         }
135     } while (depth > 0); /* Keep going as long as we're inside an array or object */
136     return 0;
137 }
138 
139 
valueTypeOf(char c)140 static __inline ValueType valueTypeOf(char c)
141 {
142     switch (c) {
143     case 'n':           return kNull;
144     case 'f':           return kFalse;
145     case 't':           return kTrue;
146     case '0':
147     case '1':
148     case '2':
149     case '3':
150     case '4':
151     case '5':
152     case '6':
153     case '7':
154     case '8':
155     case '9':
156     case '-':           return kNumber;
157     case '"':           return kString;
158     case ']':           return kEndArray;
159     case '}':           return kEndObject;
160     case ',':           return kComma;
161     case ':':           return kColon;
162     case '[':           return kArray;
163     case '{':           return kObject;
164     default:
165         return kIllegal;
166     }
167 }
168 
169 
compareStringsUnicode(const char **in1, const char **in2, couch_ejson_ctx_t *ctx)170 static int compareStringsUnicode(const char **in1, const char **in2,
171                                  couch_ejson_ctx_t *ctx)
172 {
173     int len1, len2;
174     int free1 = 0, free2 = 0;
175     const char *str1 = NULL;
176     const char *str2 = NULL;
177     int result = 0;
178 
179     str1 = createStringFromJSON(in1, &len1, &free1, ctx);
180     if (ctx->error == 0) {
181         str2 = createStringFromJSON(in2, &len2, &free2, ctx);
182     }
183     if (ctx->error == 0) {
184         result = compareUnicode(str1, len1, str2, len2, ctx);
185     }
186 
187     if (free1) {
188         enif_free((void *) str1);
189     }
190     if (free2) {
191         enif_free((void *) str2);
192     }
193 
194     return result;
195 }
196 
197 
createStringFromJSON(const char **in, int *length, int *freeWhenDone, couch_ejson_ctx_t *ctx)198 static const char * createStringFromJSON(const char **in, int *length,
199                                          int *freeWhenDone,
200                                          couch_ejson_ctx_t *ctx)
201 {
202     /* Scan the JSON string to find its length and whether it contains escapes: */
203     const char* start = ++*in;
204     unsigned escapes = 0;
205     const char* str;
206 
207     for (str = start; *str != '"'; ++str) {
208         if (*str == '\\') {
209             ++str;
210             if (*str == 'u') {
211                 escapes += 5;  /* \uxxxx adds 5 bytes */
212                 str += 4;
213             } else {
214                 escapes += 1;
215             }
216         }
217     }
218     *in = str + 1;
219     *length = str - start;
220     *freeWhenDone = 0;
221 
222     if (escapes > 0) {
223         char* buf = NULL;
224         char* dst = NULL;
225         char c;
226 
227         *length -= escapes;
228         buf = (char *) enif_alloc(*length);
229 
230         if (buf == NULL) {
231             ctx->error = 1;
232             ctx->errorMsg = MALLOC_ERROR;
233             return NULL;
234         }
235 
236         dst = buf;
237         *freeWhenDone = 1;
238 
239         for (str = start; (c = *str) != '"'; ++str) {
240             if (c == '\\') {
241                 c = ConvertJSONEscape(&str, ctx);
242                 if (ctx->error != 0) {
243                     return buf;
244                 }
245             }
246             *dst++ = c;
247         }
248         cb_assert(dst - buf == (int) *length);
249         start = buf;
250     }
251 
252     return start;
253 }
254 
255 
compareUnicode(const char *str1, int len1, const char *str2, int len2, couch_ejson_ctx_t *ctx)256 static int compareUnicode(const char *str1, int len1,
257                           const char *str2, int len2,
258                           couch_ejson_ctx_t *ctx)
259 {
260     UCharIterator iterA, iterB;
261     UErrorCode status = U_ZERO_ERROR;
262     int result;
263 
264     reserve_coll(ctx);
265 
266     uiter_setUTF8(&iterA, str1, len1);
267     uiter_setUTF8(&iterB, str2, len2);
268 
269     result = ucol_strcollIter(ctx->coll, &iterA, &iterB, &status);
270 
271     if (U_FAILURE(status)) {
272         ctx->error = 1;
273         ctx->errorMsg = ICU_ERROR;
274         return -1;
275     }
276 
277     return result;
278 }
279 
280 
ConvertJSONEscape(const char **in, couch_ejson_ctx_t *ctx)281 static __inline char ConvertJSONEscape(const char **in, couch_ejson_ctx_t *ctx)
282 {
283     char c = *++(*in);
284     switch (c) {
285     case 'u': {
286         int uc;
287         /* \u is a Unicode escape; 4 hex digits follow. */
288         const char* digits = *in + 1;
289         *in += 4;
290         uc = (digitToInt(digits[0]) << 12) | (digitToInt(digits[1]) << 8) |
291             (digitToInt(digits[2]) <<  4) | (digitToInt(digits[3]));
292         if (uc > 127) {
293             ctx->error = 1;
294             ctx->errorMsg = UNEXPECTED_CHARACTER;
295         }
296         return (char) uc;
297     }
298     case 'b':   return '\b';
299     case 'n':   return '\n';
300     case 'r':   return '\r';
301     case 't':   return '\t';
302     default:    return c;
303     }
304 }
305 
306 
digitToInt(int c)307 static __inline int digitToInt(int c)
308 {
309     if (isdigit(c)) {
310         return c - '0';
311     } else if (isxdigit(c)) {
312         return 10 + tolower(c) - 'a';
313     } else {
314         return 0;
315     }
316 }
317