xref: /4.0.0/couchstore/src/views/collate_json.c (revision 4f26a200)
1/* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2
3/* Reference: http://wiki.apache.org/couchdb/View_collation */
4
5#include "config.h"
6#include "collate_json.h"
7#include <assert.h>
8#include <ctype.h>
9#include <stdbool.h>
10#include <stdint.h>
11#include <stdlib.h>
12#include <stdio.h>
13#include <string.h>
14#include <unicode/ucol.h>
15#include <unicode/ucasemap.h>
16#include <unicode/ucnv.h>
17
18
19static int cmp(int n1, int n2)
20{
21    return n1 > n2 ? 1 : (n1 < n2 ? -1 : 0);
22}
23
24static int dcmp(double n1, double n2)
25{
26    return n1 > n2 ? 1 : (n1 < n2 ? -1 : 0);
27}
28
29
30/* Types of values, ordered according to CouchDB collation order
31   (see view_collation.js tests) */
32typedef enum {
33    kEndArray,
34    kEndObject,
35    kComma,
36    kColon,
37    kNull,
38    kFalse,
39    kTrue,
40    kNumber,
41    kString,
42    kArray,
43    kObject,
44    kIllegal
45} ValueType;
46
47
48/* "Raw" ordering is:
49   0:number, 1:false, 2:null, 3:true, 4:object, 5:array, 6:string
50   (according to view_collation_raw.js) */
51static int8_t kRawOrderOfValueType[] = {
52    -4, -3, -2, -1,
53    2, 1, 3, 0, 6, 5, 4,
54    7
55};
56
57
58static ValueType valueTypeOf(char c)
59{
60    switch (c) {
61        case 'n':           return kNull;
62        case 'f':           return kFalse;
63        case 't':           return kTrue;
64        case '0': case '1': case '2': case '3': case '4':
65        case '5': case '6': case '7': case '8': case '9':
66        case '-':           return kNumber;
67        case '"':           return kString;
68        case ']':           return kEndArray;
69        case '}':           return kEndObject;
70        case ',':           return kComma;
71        case ':':           return kColon;
72        case '[':           return kArray;
73        case '{':           return kObject;
74        default:
75            fprintf(stderr, "CouchStore CollateJSON: Unexpected character '%c' (0x%02x)\n", c, (unsigned char)c);
76            return kIllegal;
77    }
78}
79
80
81static int digitToInt(int c)
82{
83    if (isdigit(c))
84        return c - '0';
85    else if (isxdigit(c))
86        return 10 + tolower(c) - 'a';
87    else
88        return 0;
89}
90
91
92char ConvertJSONEscape(const char **in)
93{
94    int uc;
95    char c = *++(*in);
96    switch (c) {
97        case 'u': {
98            /* \u is a Unicode escape; 4 hex digits follow. */
99            const char* digits = *in + 1;
100            *in += 4;
101            uc = (digitToInt(digits[0]) << 12) | (digitToInt(digits[1]) << 8) |
102                     (digitToInt(digits[2]) <<  4) | (digitToInt(digits[3]));
103            if (uc > 127)
104                fprintf(stderr, "CouchStore CollateJSON: Can't correctly compare \\u%.4s\n", digits);
105            return (char)uc;
106        }
107        case 'b':   return '\b';
108        case 'n':   return '\n';
109        case 'r':   return '\r';
110        case 't':   return '\t';
111        default:    return c;
112    }
113}
114
115
116static int compareStringsASCII(const char** in1, const char** in2)
117{
118    const char* str1 = *in1, *str2 = *in2;
119    int s;
120    while(true) {
121        char c1 = *++str1;
122        char c2 = *++str2;
123
124        /* If one string ends, the other is greater; if both end,
125           they're equal */
126        if (c1 == '"') {
127            if (c2 == '"')
128                break;
129            else
130                return -1;
131        } else if (c2 == '"')
132            return 1;
133
134        /* Handle escape sequences: */
135        if (c1 == '\\')
136            c1 = ConvertJSONEscape(&str1);
137        if (c2 == '\\')
138            c2 = ConvertJSONEscape(&str2);
139
140        /* Compare the next characters: */
141        s = cmp(c1, c2);
142        if (s)
143            return s;
144    }
145
146    /* Strings are equal, so update the positions: */
147    *in1 = str1 + 1;
148    *in2 = str2 + 1;
149    return 0;
150}
151
152
153static const char* createStringFromJSON(const char** in, size_t *length, bool *freeWhenDone)
154{
155    char* buf;
156    char* dst;
157    char c;
158    /* Scan the JSON string to find its length and whether it contains
159       escapes: */
160    const char* start = ++*in;
161    unsigned escapes = 0;
162    const char* str;
163    for (str = start; *str != '"'; ++str) {
164        if (*str == '\\') {
165            ++str;
166            if (*str == 'u') {
167                escapes += 5;  /* \uxxxx adds 5 bytes */
168                str += 4;
169            } else
170                escapes += 1;
171        }
172    }
173    *in = str + 1;
174    *length = str - start;
175
176    *freeWhenDone = false;
177    if (escapes > 0) {
178        *length -= escapes;
179        buf = malloc(*length);
180        dst = buf;
181        for (str = start; (c = *str) != '"'; ++str) {
182            if (c == '\\')
183                c = ConvertJSONEscape(&str);
184            *dst++ = c;
185        }
186        assert(dst - buf == (int)*length);
187        start = buf;
188        *freeWhenDone = true;
189    }
190
191    return start;
192}
193
194
195static int compareUnicodeSlow(const char* str1, size_t len1,
196                              const char* str2, size_t len2)
197{
198    static UCollator* coll = NULL;
199    UCharIterator iterA, iterB;
200    int result;
201
202    UErrorCode status = U_ZERO_ERROR;
203    if (!coll) {
204        coll = ucol_open("", &status);
205        if (U_FAILURE(status)) {
206            fprintf(stderr, "CouchStore CollateJSON: Couldn't initialize ICU (%d)\n", (int)status);
207            return -1;
208        }
209    }
210
211    uiter_setUTF8(&iterA, str1, (int)len1);
212    uiter_setUTF8(&iterB, str2, (int)len2);
213
214    result = ucol_strcollIter(coll, &iterA, &iterB, &status);
215
216    if (U_FAILURE(status)) {
217        fprintf(stderr, "CouchStore CollateJSON: ICU error %d\n", (int)status);
218        return -1;
219    }
220
221    if (result < 0) {
222        return -1;
223    } else if (result > 0) {
224        return 1;
225    }
226
227    return 0;
228}
229
230
231static int convertUTF8toUChar(const char *src, UChar *dst, int len)
232{
233    static UConverter *c;
234    UErrorCode status;
235    UChar *p = dst;
236    const char *s = src;
237
238    if (!c) {
239        status = U_ZERO_ERROR;
240        c = ucnv_open("UTF-8", &status);
241        if (!c) {
242            fprintf(stderr, "CouchStore CollateJSON: Couldn't initialize ICU (%d)\n", (int)status);
243            abort();
244        }
245    }
246
247    while (len) {
248        unsigned char ch = (unsigned char)(*s);
249        if ((ch & 0x80)) {
250            goto icu_conv;
251        }
252        *p++ = (UChar)(ch);
253        s++;
254        len--;
255    }
256
257    return p - dst;
258
259icu_conv:
260    status = U_ZERO_ERROR;
261    ucnv_toUnicode(c, &p, p + len, &s, s + len, NULL, TRUE, &status);
262
263    if (U_FAILURE(status)) {
264        return -1;
265    }
266
267    return p - dst;
268}
269
270static int compareUnicode(const char* str1, size_t len1,
271                          const char* str2, size_t len2)
272{
273    static UCollator* coll = NULL;
274    UChar *b1;
275    UChar *b2;
276    int ret1, ret2;
277    int result;
278
279    UErrorCode status = U_ZERO_ERROR;
280    if (!coll) {
281        coll = ucol_open("", &status);
282        if (U_FAILURE(status)) {
283            fprintf(stderr, "CouchStore CollateJSON: Couldn't initialize ICU (%d)\n", (int)status);
284            return -1;
285        }
286    }
287
288    if (len1 > 256 || len2 > 256) {
289        return compareUnicodeSlow(str1, len1, str2, len2);
290    }
291
292    b1 = malloc(len1 * sizeof(UChar));
293    b2 = malloc(len2 * sizeof(UChar));
294    if (b1 == NULL || b2 == NULL) {
295        free(b1);
296        free(b2);
297        fprintf(stderr, "CouchStore CollateJSON: Couldn't allocate memory\n");
298        return -2;
299    }
300
301    ret1 = convertUTF8toUChar(str1, b1, len1);
302    ret2 = convertUTF8toUChar(str2, b2, len2);
303
304    if (ret1 < 0 || ret2 < 0) {
305        /* something went wrong with utf8->utf32 conversion */
306        free(b1);
307        free(b2);
308        return compareUnicodeSlow(str1, len1, str2, len2);
309    }
310
311    result = ucol_strcoll(coll, b1, ret1, b2, ret2);
312    free(b1);
313    free(b2);
314
315    if (result < 0) {
316        return -1;
317    } else if (result > 0) {
318        return 1;
319    }
320
321    return 0;
322
323}
324
325
326static int compareStringsUnicode(const char** in1, const char** in2)
327{
328    size_t len1, len2;
329    bool free1, free2;
330    const char* str1 = createStringFromJSON(in1, &len1, &free1);
331    const char* str2 = createStringFromJSON(in2, &len2, &free2);
332
333    int result = compareUnicode(str1, len1, str2, len2);
334
335    if (free1) {
336        free((char*)str1);
337    }
338    if (free2) {
339        free((char*)str2);
340    }
341    return result;
342}
343
344
345static double readNumber(const char* start, const char* end, char** endOfNumber) {
346    /* First copy the string into a zero-terminated buffer so we can safely
347       call strtod: */
348    char buf[50];
349    char* endInStr;
350    double result;
351    size_t len;
352    char* str;
353
354    assert(end > start);
355    len = end - start;
356    str = (len < sizeof(buf)) ? buf : malloc(len + 1);
357    if (!str)
358        return 0.0;
359    memcpy(str, start, len);
360    str[len] = '\0';
361
362    result = strtod(str, &endInStr);
363    *endOfNumber = (char*)start + (endInStr - str);
364    if (len >= sizeof(buf))
365        free(str);
366    return result;
367}
368
369
370int CollateJSON(const sized_buf *buf1,
371                const sized_buf *buf2,
372                CollateJSONMode mode)
373{
374    const char* str1 = buf1->buf;
375    const char* str2 = buf2->buf;
376    int depth = 0;
377
378    do {
379        /* Get the types of the next token in each string: */
380        ValueType type1 = valueTypeOf(*str1);
381        ValueType type2 = valueTypeOf(*str2);
382        /* If types don't match, stop and return their relative ordering: */
383        if (type1 != type2) {
384            if (mode != kCollateJSON_Raw)
385                return cmp(type1, type2);
386            else
387                return cmp(kRawOrderOfValueType[type1], kRawOrderOfValueType[type2]);
388
389        /* If types match, compare the actual token values: */
390        } else switch (type1) {
391            case kNull:
392            case kTrue:
393                str1 += 4;
394                str2 += 4;
395                break;
396            case kFalse:
397                str1 += 5;
398                str2 += 5;
399                break;
400            case kNumber: {
401                char* next1, *next2;
402                int diff;
403                if (depth == 0) {
404                    /* At depth 0, be careful not to fall off the end of the
405                       input, because there won't be any delimiters (']' or
406                       '}') after the number! */
407                    diff = dcmp( readNumber(str1, buf1->buf + buf1->size, &next1),
408                                 readNumber(str2, buf2->buf + buf2->size, &next2) );
409                } else {
410                    diff = dcmp( strtod(str1, &next1), strtod(str2, &next2) );
411                }
412                if (diff)
413                    return diff; /* Numbers don't match */
414                str1 = next1;
415                str2 = next2;
416                break;
417            }
418            case kString: {
419                int diff;
420                if (mode == kCollateJSON_Unicode)
421                    diff = compareStringsUnicode(&str1, &str2);
422                else
423                    diff = compareStringsASCII(&str1, &str2);
424                if (diff)
425                    return diff; /* Strings don't match */
426                break;
427            }
428            case kArray:
429            case kObject:
430                ++str1;
431                ++str2;
432                ++depth;
433                break;
434            case kEndArray:
435            case kEndObject:
436                ++str1;
437                ++str2;
438                --depth;
439                break;
440            case kComma:
441            case kColon:
442                ++str1;
443                ++str2;
444                break;
445            case kIllegal:
446                return 0;
447        }
448    /* Keep going as long as we're inside an array or object */
449    } while (depth > 0);
450    return 0;
451}
452