1// Copyright 2010 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package json
6
7// JSON value parser state machine.
8// Just about at the limit of what is reasonable to write by hand.
9// Some parts are a bit tedious, but overall it nicely factors out the
10// otherwise common code from the multiple scanning functions
11// in this package (Compact, Indent, checkValid, nextValue, etc).
12//
13// This file starts with two simple examples using the scanner
14// before diving into the scanner itself.
15
16import (
17	"strconv"
18	"unicode/utf8"
19)
20
21// checkValid verifies that data is valid JSON-encoded data.
22// scan is passed in for use by checkValid to avoid an allocation.
23func checkValid(data []byte, scan *scanner) error {
24	scan.reset()
25	for scan.offset < len(scan.data) {
26		c := scan.data[scan.offset]
27		scan.offset++
28		if scan.step(scan, c) == scanError {
29			return scan.err
30		}
31	}
32	if scan.eof() == scanError {
33		return scan.err
34	}
35	return nil
36}
37
38// Validate some alleged JSON.  Return nil iff the JSON is valid.
39func Validate(data []byte) error {
40	return checkValid(data, newScanner(data))
41}
42
43// nextLiteral scans the data and grabs the next literal, in one pass
44func nextLiteral(scan *scanner) ([]byte, error) {
45	l := len(scan.data)
46
47	// first try and see if we can pass the literal straight from our slice
48	start := scan.offset
49	for {
50		if scan.offset >= l {
51			return nil, &SyntaxError{"unexpected end of JSON input", int64(scan.offset)}
52		}
53		c := scan.data[scan.offset]
54
55		// found the other side
56		if c == '"' {
57			scan.step = stateEndValue
58			oldOffset := scan.offset
59			scan.offset++
60			return scan.data[start:oldOffset], nil
61		}
62
63		// no control characters
64		if c < 0x20 {
65			_ = scan.error(c, "in string literal")
66			return nil, scan.err
67		}
68		if c == '\\' || c >= utf8.RuneSelf {
69			break
70		}
71		scan.offset++
72	}
73
74	usableCap := (scan.offset - start) * 2
75	if usableCap < 64 {
76		usableCap = 64
77	}
78	literal := make([]byte, usableCap)
79	copy(literal, scan.data[start:scan.offset])
80	out := scan.offset - start
81
82	// we always leave 8 bytes for a possible utf16
83	usableCap -= 8
84	for {
85		if scan.offset >= l {
86			return nil, &SyntaxError{"unexpected end of JSON input", int64(scan.offset)}
87		}
88		c := scan.data[scan.offset]
89
90		// found the other side
91		if c == '"' {
92			scan.step = stateEndValue
93			scan.offset++
94			return literal[:out], nil
95		}
96
97		// no control characters
98		if c < 0x20 {
99			_ = scan.error(c, "in string literal")
100			return nil, scan.err
101		}
102
103		// make more space
104		if out >= usableCap {
105			newCap := (usableCap + 8) * 2
106			newLiteral := make([]byte, newCap)
107			copy(newLiteral, literal[:out])
108			literal = newLiteral
109			usableCap = newCap - 8
110		}
111
112		// escape
113		if c == '\\' {
114			scan.offset++
115			c := scan.data[scan.offset]
116			scan.offset++
117			switch c {
118			case '"', '\\', '/', '\'':
119				literal[out] = c
120			case 'b':
121				literal[out] = '\b'
122			case 'f':
123				literal[out] = '\f'
124			case 'n':
125				literal[out] = '\n'
126			case 'r':
127				literal[out] = '\r'
128			case 't':
129				literal[out] = '\t'
130			case 'u':
131				oldOffset := scan.offset - 2
132				rr, size := getu4OrSurrogate(scan.data, oldOffset)
133				if rr < 0 {
134					_ = scan.error(c, "invalid unicode sequence")
135					return nil, scan.err
136				}
137				scan.offset = oldOffset + size
138				out += utf8.EncodeRune(literal[out:], rr)
139				continue
140			default:
141				_ = scan.error(c, "invalid escaped character")
142				return nil, scan.err
143			}
144			out++
145
146			// ascii
147		} else if c < utf8.RuneSelf {
148			literal[out] = c
149			out++
150			scan.offset++
151
152			// UTFs
153		} else {
154			rr, size := utf8.DecodeRune(scan.data[scan.offset:])
155			scan.offset += size
156			out += utf8.EncodeRune(literal[out:], rr)
157		}
158	}
159
160	// we should never get here
161	_ = scan.error(' ', "unxepected state")
162	return nil, scan.err
163
164}
165
166// nextValue splits data after the next whole JSON value,
167// returning that value and the bytes that follow it as separate slices.
168// scan is passed in for use by nextValue to avoid an allocation.
169func nextValue(data []byte, scan *scanner) (value, rest []byte, err error) {
170	start := scan.offset
171	scan.reset()
172	for scan.offset < len(scan.data) {
173		i := scan.offset
174		c := data[i]
175		scan.offset++
176		v := scan.step(scan, c)
177		if v >= scanEndObject {
178			switch v {
179			// probe the scanner with a space to determine whether we will
180			// get scanEnd on the next character. Otherwise, if the next character
181			// is not a space, scanEndTop allocates a needless error.
182			case scanEndObject, scanEndArray:
183				if scan.step(scan, ' ') == scanEnd {
184					return data[start : i+1], data[i+1:], nil
185				}
186			case scanError:
187				return nil, nil, scan.err
188			case scanEnd:
189				return data[start:i], data[i:], nil
190			}
191		}
192	}
193	if scan.eof() == scanError {
194		return nil, nil, scan.err
195	}
196	return data[start:], nil, nil
197}
198
199// nextScanValue behaves like nextValue, but preserves the scan, so as to be
200// useful in single scan environments
201// it does not need a data argument, and will not return rest
202func nextScanValue(baseScan *scanner) ([]byte, error) {
203	scan := newScanner(baseScan.data)
204	scan.reset()
205	scan.offset = baseScan.offset
206	start := scan.offset
207	for scan.offset < len(scan.data) {
208		i := scan.offset
209		c := scan.data[i]
210		scan.offset++
211		v := scan.step(scan, c)
212		if v >= scanEndObject {
213			switch v {
214			case scanEndObject, scanEndArray:
215
216				// since we are looking for a value within a scan, here we do not expect
217				// the scan to end, and there will be something after the value
218				// the next scan step therefore is stateEndValue
219				if scan.step(scan, ' ') == scanEnd {
220					baseScan.step = stateEndValue
221					baseScan.offset = scan.offset
222					return scan.data[start : i+1], nil
223				}
224			case scanError:
225				return nil, scan.err
226			case scanEnd:
227				return scan.data[start:i], nil
228			}
229		}
230	}
231	if scan.eof() == scanError {
232		return nil, scan.err
233	}
234	return scan.data[start:], nil
235}
236
237// A SyntaxError is a description of a JSON syntax error.
238type SyntaxError struct {
239	msg    string // description of error
240	Offset int64  // error occurred after reading Offset bytes
241}
242
243func (e *SyntaxError) Error() string { return e.msg }
244
245// A scanner is a JSON scanning state machine.
246// Callers call scan.reset() and then pass bytes in one at a time
247// by calling scan.step(&scan, c) for each byte.
248// The return value, referred to as an opcode, tells the
249// caller about significant parsing events like beginning
250// and ending literals, objects, and arrays, so that the
251// caller can follow along if it wishes.
252// The return value scanEnd indicates that a single top-level
253// JSON value has been completed, *before* the byte that
254// just got passed in.  (The indication must be delayed in order
255// to recognize the end of numbers: is 123 a whole value or
256// the beginning of 12345e+6?).
257type scanner struct {
258
259	// our scanner keeps track of the state of the scan, for fast loops
260	// skipping bytes and traversing strings
261	data   []byte
262	offset int
263
264	// The step is a func to be called to execute the next transition.
265	step func(*scanner, byte) int
266
267	// Reached end of top-level value.
268	endTop bool
269
270	// Stack of what we're in the middle of - array values, object keys, object values.
271	parseState []int
272
273	// Error that happened, if any.
274	err error
275
276	// 1-byte redo (see undo method)
277	redo      bool
278	redoCode  int
279	redoState func(*scanner, byte) int
280}
281
282// These values are returned by the state transition functions
283// assigned to scanner.state and the method scanner.eof.
284// They give details about the current state of the scan that
285// callers might be interested to know about.
286// It is okay to ignore the return value of any particular
287// call to scanner.state: if one call returns scanError,
288// every subsequent call will return scanError too.
289const (
290	// Continue.
291	scanContinue     = iota // uninteresting byte
292	scanBeginLiteral        // end implied by next result != scanContinue
293	scanBeginObject         // begin object
294	scanObjectKey           // just finished object key (string)
295	scanObjectValue         // just finished non-last object value
296	scanEndObject           // end object (implies scanObjectValue if possible)
297	scanBeginArray          // begin array
298	scanArrayValue          // just finished array value
299	scanEndArray            // end array (implies scanArrayValue if possible)
300	scanSkipSpace           // space byte; can skip; known to be last "continue" result
301
302	// Stop.
303	scanEnd   // top-level value ended *before* this byte; known to be first "stop" result
304	scanError // hit an error, scanner.err.
305)
306
307// These values are stored in the parseState stack.
308// They give the current state of a composite value
309// being scanned. If the parser is inside a nested value
310// the parseState describes the nested state, outermost at entry 0.
311const (
312	parseObjectKey   = iota // parsing object key (before colon)
313	parseObjectValue        // parsing object value (after colon)
314	parseArrayValue         // parsing array value
315)
316
317// reset prepares the scanner for use.
318// It must be called before calling s.step.
319func (s *scanner) reset() {
320	s.step = stateBeginValue
321	s.parseState = s.parseState[0:0]
322	s.err = nil
323	s.redo = false
324	s.endTop = false
325}
326
327// Sets up a scanner
328func newScanner(data []byte) *scanner {
329	return &scanner{offset: 0, data: data}
330}
331
332// eof tells the scanner that the end of input has been reached.
333// It returns a scan status just as s.step does.
334func (s *scanner) eof() int {
335	if s.err != nil {
336		return scanError
337	}
338	if s.endTop {
339		return scanEnd
340	}
341	s.step(s, ' ')
342	if s.endTop {
343		return scanEnd
344	}
345	if s.err == nil {
346		s.err = &SyntaxError{"unexpected end of JSON input", int64(s.offset)}
347	}
348	return scanError
349}
350
351// pushParseState pushes a new parse state p onto the parse stack.
352func (s *scanner) pushParseState(p int) {
353	s.parseState = append(s.parseState, p)
354}
355
356// popParseState pops a parse state (already obtained) off the stack
357// and updates s.step accordingly.
358func (s *scanner) popParseState() {
359	n := len(s.parseState) - 1
360	s.parseState = s.parseState[0:n]
361	s.redo = false
362	if n == 0 {
363		s.step = stateEndTop
364		s.endTop = true
365	} else {
366		s.step = stateEndValue
367	}
368}
369
370func isSpace(c byte) bool {
371	return c == ' ' || c == '\t' || c == '\r' || c == '\n'
372}
373
374// skip spaces
375// to be called at the beginning of a token, not at the end, so that
376// a token does not include extra spaces
377// they will be skipped in one go before the next token
378func (s *scanner) skipSpaces(c byte) bool {
379	if isSpace(c) {
380		l := len(s.data)
381
382		for {
383			if s.offset >= l {
384				break
385			}
386			c = s.data[s.offset]
387			if isSpace(c) {
388				s.offset++
389			} else {
390				break
391			}
392		}
393		return true
394	}
395	return false
396}
397
398// stateBeginValueOrEmpty is the state after reading `[`.
399func stateBeginValueOrEmpty(s *scanner, c byte) int {
400	if c <= ' ' && s.skipSpaces(c) {
401		return scanSkipSpace
402	}
403	if c == ']' {
404		return stateEndValue(s, c)
405	}
406	return stateBeginValue(s, c)
407}
408
409// stateBeginValue is the state at the beginning of the input.
410func stateBeginValue(s *scanner, c byte) int {
411	if c <= ' ' && s.skipSpaces(c) {
412		return scanSkipSpace
413	}
414	switch c {
415	case '{':
416		s.step = stateBeginStringOrEmpty
417		s.pushParseState(parseObjectKey)
418		return scanBeginObject
419	case '[':
420		s.step = stateBeginValueOrEmpty
421		s.pushParseState(parseArrayValue)
422		return scanBeginArray
423	case '"':
424		s.step = stateInString
425		return scanBeginLiteral
426	case '-':
427		s.step = stateNeg
428		return scanBeginLiteral
429	case '0': // beginning of 0.123
430		s.step = state0
431		return scanBeginLiteral
432	case 't': // beginning of true
433		s.step = stateT
434		return scanBeginLiteral
435	case 'f': // beginning of false
436		s.step = stateF
437		return scanBeginLiteral
438	case 'n': // beginning of null
439		s.step = stateN
440		return scanBeginLiteral
441	}
442	if '1' <= c && c <= '9' { // beginning of 1234.5
443		s.step = state1
444		return scanBeginLiteral
445	}
446	return s.error(c, "looking for beginning of value")
447}
448
449// stateBeginStringOrEmpty is the state after reading `{`.
450func stateBeginStringOrEmpty(s *scanner, c byte) int {
451	if c <= ' ' && s.skipSpaces(c) {
452		return scanSkipSpace
453	}
454	if c == '}' {
455		n := len(s.parseState)
456		s.parseState[n-1] = parseObjectValue
457		return stateEndValue(s, c)
458	}
459	return stateBeginString(s, c)
460}
461
462// stateBeginString is the state after reading `{"key": value,`.
463func stateBeginString(s *scanner, c byte) int {
464	if c <= ' ' && s.skipSpaces(c) {
465		return scanSkipSpace
466	}
467	if c == '"' {
468		s.step = stateInString
469		return scanBeginLiteral
470	}
471	return s.error(c, "looking for beginning of object key string")
472}
473
474// stateEndValue is the state after completing a value,
475// such as after reading `{}` or `true` or `["x"`.
476func stateEndValue(s *scanner, c byte) int {
477	n := len(s.parseState)
478	if n == 0 {
479		// Completed top-level before the current byte.
480		s.step = stateEndTop
481		s.endTop = true
482		return stateEndTop(s, c)
483	}
484	if c <= ' ' && isSpace(c) {
485		s.step = stateEndValue
486		return scanSkipSpace
487	}
488	ps := s.parseState[n-1]
489	switch ps {
490	case parseObjectKey:
491		if c == ':' {
492			s.parseState[n-1] = parseObjectValue
493			s.step = stateBeginValue
494			return scanObjectKey
495		}
496		return s.error(c, "after object key")
497	case parseObjectValue:
498		if c == ',' {
499			s.parseState[n-1] = parseObjectKey
500			s.step = stateBeginString
501			return scanObjectValue
502		}
503		if c == '}' {
504			s.popParseState()
505			return scanEndObject
506		}
507		return s.error(c, "after object key:value pair")
508	case parseArrayValue:
509		if c == ',' {
510			s.step = stateBeginValue
511			return scanArrayValue
512		}
513		if c == ']' {
514			s.popParseState()
515			return scanEndArray
516		}
517		return s.error(c, "after array element")
518	}
519	return s.error(c, "")
520}
521
522// stateEndTop is the state after finishing the top-level value,
523// such as after reading `{}` or `[1,2,3]`.
524// Only space characters should be seen now.
525func stateEndTop(s *scanner, c byte) int {
526	if c != ' ' && c != '\t' && c != '\r' && c != '\n' {
527		// Complain about non-space byte on next call.
528		s.error(c, "after top-level value")
529	}
530	return scanEnd
531}
532
533// stateInString is the state after reading `"`.
534func stateInString(s *scanner, c byte) int {
535	l := len(s.data)
536	for {
537		if c == '"' {
538			s.step = stateEndValue
539			return scanContinue
540		}
541		if c == '\\' {
542			s.step = stateInStringEsc
543			return scanContinue
544		}
545		if c < 0x20 {
546			return s.error(c, "in string literal")
547		}
548		if s.offset >= l {
549			break
550		}
551		c = s.data[s.offset]
552		s.offset++
553	}
554	return scanContinue
555}
556
557// stateInStringEsc is the state after reading `"\` during a quoted string.
558func stateInStringEsc(s *scanner, c byte) int {
559	switch c {
560	case 'b', 'f', 'n', 'r', 't', '\\', '/', '"':
561		s.step = stateInString
562		return scanContinue
563	case 'u':
564		s.step = stateInStringEscU
565		return scanContinue
566	}
567	return s.error(c, "in string escape code")
568}
569
570// stateInStringEscU is the state after reading `"\u` during a quoted string.
571func stateInStringEscU(s *scanner, c byte) int {
572	if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
573		s.step = stateInStringEscU1
574		return scanContinue
575	}
576	// numbers
577	return s.error(c, "in \\u hexadecimal character escape")
578}
579
580// stateInStringEscU1 is the state after reading `"\u1` during a quoted string.
581func stateInStringEscU1(s *scanner, c byte) int {
582	if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
583		s.step = stateInStringEscU12
584		return scanContinue
585	}
586	// numbers
587	return s.error(c, "in \\u hexadecimal character escape")
588}
589
590// stateInStringEscU12 is the state after reading `"\u12` during a quoted string.
591func stateInStringEscU12(s *scanner, c byte) int {
592	if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
593		s.step = stateInStringEscU123
594		return scanContinue
595	}
596	// numbers
597	return s.error(c, "in \\u hexadecimal character escape")
598}
599
600// stateInStringEscU123 is the state after reading `"\u123` during a quoted string.
601func stateInStringEscU123(s *scanner, c byte) int {
602	if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
603		s.step = stateInString
604		return scanContinue
605	}
606	// numbers
607	return s.error(c, "in \\u hexadecimal character escape")
608}
609
610// stateNeg is the state after reading `-` during a number.
611func stateNeg(s *scanner, c byte) int {
612	if c == '0' {
613		s.step = state0
614		return scanContinue
615	}
616	if '1' <= c && c <= '9' {
617		s.step = state1
618		return scanContinue
619	}
620	return s.error(c, "in numeric literal")
621}
622
623// state1 is the state after reading a non-zero integer during a number,
624// such as after reading `1` or `100` but not `0`.
625func state1(s *scanner, c byte) int {
626	if '0' <= c && c <= '9' {
627		s.step = state1
628		return scanContinue
629	}
630	return state0(s, c)
631}
632
633// state0 is the state after reading `0` during a number.
634func state0(s *scanner, c byte) int {
635	if c == '.' {
636		s.step = stateDot
637		return scanContinue
638	}
639	if c == 'e' || c == 'E' {
640		s.step = stateE
641		return scanContinue
642	}
643	return stateEndValue(s, c)
644}
645
646// stateDot is the state after reading the integer and decimal point in a number,
647// such as after reading `1.`.
648func stateDot(s *scanner, c byte) int {
649	if '0' <= c && c <= '9' {
650		s.step = stateDot0
651		return scanContinue
652	}
653	return s.error(c, "after decimal point in numeric literal")
654}
655
656// stateDot0 is the state after reading the integer, decimal point, and subsequent
657// digits of a number, such as after reading `3.14`.
658func stateDot0(s *scanner, c byte) int {
659	if '0' <= c && c <= '9' {
660		return scanContinue
661	}
662	if c == 'e' || c == 'E' {
663		s.step = stateE
664		return scanContinue
665	}
666	return stateEndValue(s, c)
667}
668
669// stateE is the state after reading the mantissa and e in a number,
670// such as after reading `314e` or `0.314e`.
671func stateE(s *scanner, c byte) int {
672	if c == '+' || c == '-' {
673		s.step = stateESign
674		return scanContinue
675	}
676	return stateESign(s, c)
677}
678
679// stateESign is the state after reading the mantissa, e, and sign in a number,
680// such as after reading `314e-` or `0.314e+`.
681func stateESign(s *scanner, c byte) int {
682	if '0' <= c && c <= '9' {
683		s.step = stateE0
684		return scanContinue
685	}
686	return s.error(c, "in exponent of numeric literal")
687}
688
689// stateE0 is the state after reading the mantissa, e, optional sign,
690// and at least one digit of the exponent in a number,
691// such as after reading `314e-2` or `0.314e+1` or `3.14e0`.
692func stateE0(s *scanner, c byte) int {
693	if '0' <= c && c <= '9' {
694		return scanContinue
695	}
696	return stateEndValue(s, c)
697}
698
699// stateT is the state after reading `t`.
700func stateT(s *scanner, c byte) int {
701	if c == 'r' {
702		s.step = stateTr
703		return scanContinue
704	}
705	return s.error(c, "in literal true (expecting 'r')")
706}
707
708// stateTr is the state after reading `tr`.
709func stateTr(s *scanner, c byte) int {
710	if c == 'u' {
711		s.step = stateTru
712		return scanContinue
713	}
714	return s.error(c, "in literal true (expecting 'u')")
715}
716
717// stateTru is the state after reading `tru`.
718func stateTru(s *scanner, c byte) int {
719	if c == 'e' {
720		s.step = stateEndValue
721		return scanContinue
722	}
723	return s.error(c, "in literal true (expecting 'e')")
724}
725
726// stateF is the state after reading `f`.
727func stateF(s *scanner, c byte) int {
728	if c == 'a' {
729		s.step = stateFa
730		return scanContinue
731	}
732	return s.error(c, "in literal false (expecting 'a')")
733}
734
735// stateFa is the state after reading `fa`.
736func stateFa(s *scanner, c byte) int {
737	if c == 'l' {
738		s.step = stateFal
739		return scanContinue
740	}
741	return s.error(c, "in literal false (expecting 'l')")
742}
743
744// stateFal is the state after reading `fal`.
745func stateFal(s *scanner, c byte) int {
746	if c == 's' {
747		s.step = stateFals
748		return scanContinue
749	}
750	return s.error(c, "in literal false (expecting 's')")
751}
752
753// stateFals is the state after reading `fals`.
754func stateFals(s *scanner, c byte) int {
755	if c == 'e' {
756		s.step = stateEndValue
757		return scanContinue
758	}
759	return s.error(c, "in literal false (expecting 'e')")
760}
761
762// stateN is the state after reading `n`.
763func stateN(s *scanner, c byte) int {
764	if c == 'u' {
765		s.step = stateNu
766		return scanContinue
767	}
768	return s.error(c, "in literal null (expecting 'u')")
769}
770
771// stateNu is the state after reading `nu`.
772func stateNu(s *scanner, c byte) int {
773	if c == 'l' {
774		s.step = stateNul
775		return scanContinue
776	}
777	return s.error(c, "in literal null (expecting 'l')")
778}
779
780// stateNul is the state after reading `nul`.
781func stateNul(s *scanner, c byte) int {
782	if c == 'l' {
783		s.step = stateEndValue
784		return scanContinue
785	}
786	return s.error(c, "in literal null (expecting 'l')")
787}
788
789// stateError is the state after reaching a syntax error,
790// such as after reading `[1}` or `5.1.2`.
791func stateError(s *scanner, c byte) int {
792	return scanError
793}
794
795// error records an error and switches to the error state.
796func (s *scanner) error(c byte, context string) int {
797	s.step = stateError
798	s.err = &SyntaxError{"invalid character " + quoteChar(c) + " " + context, int64(s.offset)}
799	return scanError
800}
801
802// quoteChar formats c as a quoted character literal
803func quoteChar(c byte) string {
804	// special cases - different from quoted strings
805	if c == '\'' {
806		return `'\''`
807	}
808	if c == '"' {
809		return `'"'`
810	}
811
812	// use quoted string with different quotation marks
813	s := strconv.Quote(string(c))
814	return "'" + s[1:len(s)-1] + "'"
815}
816
817// undo causes the scanner to return scanCode from the next state transition.
818// This gives callers a simple 1-byte undo mechanism.
819func (s *scanner) undo(scanCode int) {
820	if s.redo {
821		panic("json: invalid use of scanner")
822	}
823	s.redoCode = scanCode
824	s.redoState = s.step
825	s.step = stateRedo
826	s.redo = true
827	s.offset--
828}
829
830// stateRedo helps implement the scanner's 1-byte undo.
831func stateRedo(s *scanner, c byte) int {
832	s.redo = false
833	s.step = s.redoState
834	return s.redoCode
835}
836