scanner.go

     1  // Copyright 2010 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build !goexperiment.jsonv2
     6  
     7  package json
     8  
     9  // JSON value parser state machine.
    10  // Just about at the limit of what is reasonable to write by hand.
    11  // Some parts are a bit tedious, but overall it nicely factors out the
    12  // otherwise common code from the multiple scanning functions
    13  // in this package (Compact, Indent, checkValid, etc).
    14  //
    15  // This file starts with two simple examples using the scanner
    16  // before diving into the scanner itself.
    17  
    18  import (
    19  	"strconv"
    20  	"sync"
    21  )
    22  
    23  // Valid reports whether data is a valid JSON encoding.
    24  func Valid(data []byte) bool {
    25  	scan := newScanner()
    26  	defer freeScanner(scan)
    27  	return checkValid(data, scan) == nil
    28  }
    29  
    30  // checkValid verifies that data is valid JSON-encoded data.
    31  // scan is passed in for use by checkValid to avoid an allocation.
    32  // checkValid returns nil or a SyntaxError.
    33  func checkValid(data []byte, scan *scanner) error {
    34  	scan.reset()
    35  	for _, c := range data {
    36  		scan.bytes++
    37  		if scan.step(scan, c) == scanError {
    38  			return scan.err
    39  		}
    40  	}
    41  	if scan.eof() == scanError {
    42  		return scan.err
    43  	}
    44  	return nil
    45  }
    46  
    47  // A SyntaxError is a description of a JSON syntax error.
    48  // [Unmarshal] will return a SyntaxError if the JSON can't be parsed.
    49  type SyntaxError struct {
    50  	msg    string // description of error
    51  	Offset int64  // error occurred after reading Offset bytes
    52  }
    53  
    54  func (e *SyntaxError) Error() string { return e.msg }
    55  
// A scanner is a JSON scanning state machine.
// Callers call scan.reset and then pass bytes in one at a time
// by calling scan.step(&scan, c) for each byte.
// The return value, referred to as an opcode, tells the
// caller about significant parsing events like beginning
// and ending literals, objects, and arrays, so that the
// caller can follow along if it wishes.
// The return value scanEnd indicates that a single top-level
// JSON value has been completed, *before* the byte that
// just got passed in.  (The indication must be delayed in order
// to recognize the end of numbers: is 123 a whole value or
// the beginning of 12345e+6?).
// Once all input has been supplied, callers call scan.eof to learn
// whether the input ended on a complete top-level value.
type scanner struct {
	// The step is a func to be called to execute the next transition.
	// Each state function stores the follow-up state here before returning.
	// Also tried using an integer constant and a single func
	// with a switch, but using the func directly was 10% faster
	// on a 64-bit Mac Mini, and it's nicer to read.
	step func(*scanner, byte) int

	// Reached end of top-level value.
	// Set by stateEndValue and popParseState; cleared by reset.
	endTop bool

	// Stack of what we're in the middle of - array values, object keys, object values.
	// Entries are the parseObjectKey, parseObjectValue and parseArrayValue
	// constants, outermost first.
	parseState []int

	// Error that happened, if any.
	// Set by the error method and by eof; cleared by reset.
	err error

	// total bytes consumed, updated by decoder.Decode (and deliberately
	// not set to zero by scan.reset). checkValid also increments it once
	// per input byte, and newScanner zeroes it explicitly.
	bytes int64
}
    88  
// scannerPool holds scanner values for reuse.
// newScanner takes scanners from the pool and freeScanner puts them back;
// when the pool is empty, New supplies a zero-valued scanner.
var scannerPool = sync.Pool{
	New: func() any {
		return &scanner{}
	},
}
    94  
    95  func newScanner() *scanner {
    96  	scan := scannerPool.Get().(*scanner)
    97  	// scan.reset by design doesn't set bytes to zero
    98  	scan.bytes = 0
    99  	scan.reset()
   100  	return scan
   101  }
   102  
   103  func freeScanner(scan *scanner) {
   104  	// Avoid hanging on to too much memory in extreme cases.
   105  	if len(scan.parseState) > 1024 {
   106  		scan.parseState = nil
   107  	}
   108  	scannerPool.Put(scan)
   109  }
   110  
// These values are returned by the state transition functions
// assigned to scanner.step and the method scanner.eof.
// They give details about the current state of the scan that
// callers might be interested to know about.
// It is okay to ignore the return value of any particular
// call to scanner.step: if one call returns scanError,
// every subsequent call will return scanError too.
const (
	// Continue.
	scanContinue     = iota // uninteresting byte
	scanBeginLiteral        // end implied by next result != scanContinue
	scanBeginObject         // begin object
	scanObjectKey           // just finished object key (string)
	scanObjectValue         // just finished non-last object value
	scanEndObject           // end object (implies scanObjectValue if possible)
	scanBeginArray          // begin array
	scanArrayValue          // just finished array value
	scanEndArray            // end array (implies scanArrayValue if possible)
	scanSkipSpace           // space byte; can skip; known to be last "continue" result

	// Stop.
	scanEnd   // top-level value ended *before* this byte; known to be first "stop" result
	scanError // hit an error, scanner.err.
)
   135  
// These values are stored in the parseState stack.
// They give the current state of a composite value
// being scanned. If the parser is inside a nested value
// the parseState describes the nested state, outermost at entry 0.
// stateEndValue inspects the top entry to decide which bytes
// may legally follow a completed value.
const (
	parseObjectKey   = iota // parsing object key (before colon)
	parseObjectValue        // parsing object value (after colon)
	parseArrayValue         // parsing array value
)
   145  
// maxNestingDepth limits the max nesting depth to prevent stack overflow.
// pushParseState reports an "exceeded max depth" error once the parse
// stack grows beyond this many entries.
// This is permitted by https://tools.ietf.org/html/rfc7159#section-9
const maxNestingDepth = 10000
   149  
   150  // reset prepares the scanner for use.
   151  // It must be called before calling s.step.
   152  func (s *scanner) reset() {
   153  	s.step = stateBeginValue
   154  	s.parseState = s.parseState[0:0]
   155  	s.err = nil
   156  	s.endTop = false
   157  }
   158  
   159  // eof tells the scanner that the end of input has been reached.
   160  // It returns a scan status just as s.step does.
   161  func (s *scanner) eof() int {
   162  	if s.err != nil {
   163  		return scanError
   164  	}
   165  	if s.endTop {
   166  		return scanEnd
   167  	}
   168  	s.step(s, ' ')
   169  	if s.endTop {
   170  		return scanEnd
   171  	}
   172  	if s.err == nil {
   173  		s.err = &SyntaxError{"unexpected end of JSON input", s.bytes}
   174  	}
   175  	return scanError
   176  }
   177  
   178  // pushParseState pushes a new parse state newParseState onto the parse stack.
   179  // an error state is returned if maxNestingDepth was exceeded, otherwise successState is returned.
   180  func (s *scanner) pushParseState(c byte, newParseState int, successState int) int {
   181  	s.parseState = append(s.parseState, newParseState)
   182  	if len(s.parseState) <= maxNestingDepth {
   183  		return successState
   184  	}
   185  	return s.error(c, "exceeded max depth")
   186  }
   187  
   188  // popParseState pops a parse state (already obtained) off the stack
   189  // and updates s.step accordingly.
   190  func (s *scanner) popParseState() {
   191  	n := len(s.parseState) - 1
   192  	s.parseState = s.parseState[0:n]
   193  	if n == 0 {
   194  		s.step = stateEndTop
   195  		s.endTop = true
   196  	} else {
   197  		s.step = stateEndValue
   198  	}
   199  }
   200  
   201  func isSpace(c byte) bool {
   202  	return c <= ' ' && (c == ' ' || c == '\t' || c == '\r' || c == '\n')
   203  }
   204  
   205  // stateBeginValueOrEmpty is the state after reading `[`.
   206  func stateBeginValueOrEmpty(s *scanner, c byte) int {
   207  	if isSpace(c) {
   208  		return scanSkipSpace
   209  	}
   210  	if c == ']' {
   211  		return stateEndValue(s, c)
   212  	}
   213  	return stateBeginValue(s, c)
   214  }
   215  
   216  // stateBeginValue is the state at the beginning of the input.
   217  func stateBeginValue(s *scanner, c byte) int {
   218  	if isSpace(c) {
   219  		return scanSkipSpace
   220  	}
   221  	switch c {
   222  	case '{':
   223  		s.step = stateBeginStringOrEmpty
   224  		return s.pushParseState(c, parseObjectKey, scanBeginObject)
   225  	case '[':
   226  		s.step = stateBeginValueOrEmpty
   227  		return s.pushParseState(c, parseArrayValue, scanBeginArray)
   228  	case '"':
   229  		s.step = stateInString
   230  		return scanBeginLiteral
   231  	case '-':
   232  		s.step = stateNeg
   233  		return scanBeginLiteral
   234  	case '0': // beginning of 0.123
   235  		s.step = state0
   236  		return scanBeginLiteral
   237  	case 't': // beginning of true
   238  		s.step = stateT
   239  		return scanBeginLiteral
   240  	case 'f': // beginning of false
   241  		s.step = stateF
   242  		return scanBeginLiteral
   243  	case 'n': // beginning of null
   244  		s.step = stateN
   245  		return scanBeginLiteral
   246  	}
   247  	if '1' <= c && c <= '9' { // beginning of 1234.5
   248  		s.step = state1
   249  		return scanBeginLiteral
   250  	}
   251  	return s.error(c, "looking for beginning of value")
   252  }
   253  
   254  // stateBeginStringOrEmpty is the state after reading `{`.
   255  func stateBeginStringOrEmpty(s *scanner, c byte) int {
   256  	if isSpace(c) {
   257  		return scanSkipSpace
   258  	}
   259  	if c == '}' {
   260  		n := len(s.parseState)
   261  		s.parseState[n-1] = parseObjectValue
   262  		return stateEndValue(s, c)
   263  	}
   264  	return stateBeginString(s, c)
   265  }
   266  
   267  // stateBeginString is the state after reading `{"key": value,`.
   268  func stateBeginString(s *scanner, c byte) int {
   269  	if isSpace(c) {
   270  		return scanSkipSpace
   271  	}
   272  	if c == '"' {
   273  		s.step = stateInString
   274  		return scanBeginLiteral
   275  	}
   276  	return s.error(c, "looking for beginning of object key string")
   277  }
   278  
   279  // stateEndValue is the state after completing a value,
   280  // such as after reading `{}` or `true` or `["x"`.
   281  func stateEndValue(s *scanner, c byte) int {
   282  	n := len(s.parseState)
   283  	if n == 0 {
   284  		// Completed top-level before the current byte.
   285  		s.step = stateEndTop
   286  		s.endTop = true
   287  		return stateEndTop(s, c)
   288  	}
   289  	if isSpace(c) {
   290  		s.step = stateEndValue
   291  		return scanSkipSpace
   292  	}
   293  	ps := s.parseState[n-1]
   294  	switch ps {
   295  	case parseObjectKey:
   296  		if c == ':' {
   297  			s.parseState[n-1] = parseObjectValue
   298  			s.step = stateBeginValue
   299  			return scanObjectKey
   300  		}
   301  		return s.error(c, "after object key")
   302  	case parseObjectValue:
   303  		if c == ',' {
   304  			s.parseState[n-1] = parseObjectKey
   305  			s.step = stateBeginString
   306  			return scanObjectValue
   307  		}
   308  		if c == '}' {
   309  			s.popParseState()
   310  			return scanEndObject
   311  		}
   312  		return s.error(c, "after object key:value pair")
   313  	case parseArrayValue:
   314  		if c == ',' {
   315  			s.step = stateBeginValue
   316  			return scanArrayValue
   317  		}
   318  		if c == ']' {
   319  			s.popParseState()
   320  			return scanEndArray
   321  		}
   322  		return s.error(c, "after array element")
   323  	}
   324  	return s.error(c, "")
   325  }
   326  
   327  // stateEndTop is the state after finishing the top-level value,
   328  // such as after reading `{}` or `[1,2,3]`.
   329  // Only space characters should be seen now.
   330  func stateEndTop(s *scanner, c byte) int {
   331  	if !isSpace(c) {
   332  		// Complain about non-space byte on next call.
   333  		s.error(c, "after top-level value")
   334  	}
   335  	return scanEnd
   336  }
   337  
   338  // stateInString is the state after reading `"`.
   339  func stateInString(s *scanner, c byte) int {
   340  	if c == '"' {
   341  		s.step = stateEndValue
   342  		return scanContinue
   343  	}
   344  	if c == '\\' {
   345  		s.step = stateInStringEsc
   346  		return scanContinue
   347  	}
   348  	if c < 0x20 {
   349  		return s.error(c, "in string literal")
   350  	}
   351  	return scanContinue
   352  }
   353  
   354  // stateInStringEsc is the state after reading `"\` during a quoted string.
   355  func stateInStringEsc(s *scanner, c byte) int {
   356  	switch c {
   357  	case 'b', 'f', 'n', 'r', 't', '\\', '/', '"':
   358  		s.step = stateInString
   359  		return scanContinue
   360  	case 'u':
   361  		s.step = stateInStringEscU
   362  		return scanContinue
   363  	}
   364  	return s.error(c, "in string escape code")
   365  }
   366  
   367  // stateInStringEscU is the state after reading `"\u` during a quoted string.
   368  func stateInStringEscU(s *scanner, c byte) int {
   369  	if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
   370  		s.step = stateInStringEscU1
   371  		return scanContinue
   372  	}
   373  	// numbers
   374  	return s.error(c, "in \\u hexadecimal character escape")
   375  }
   376  
   377  // stateInStringEscU1 is the state after reading `"\u1` during a quoted string.
   378  func stateInStringEscU1(s *scanner, c byte) int {
   379  	if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
   380  		s.step = stateInStringEscU12
   381  		return scanContinue
   382  	}
   383  	// numbers
   384  	return s.error(c, "in \\u hexadecimal character escape")
   385  }
   386  
   387  // stateInStringEscU12 is the state after reading `"\u12` during a quoted string.
   388  func stateInStringEscU12(s *scanner, c byte) int {
   389  	if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
   390  		s.step = stateInStringEscU123
   391  		return scanContinue
   392  	}
   393  	// numbers
   394  	return s.error(c, "in \\u hexadecimal character escape")
   395  }
   396  
   397  // stateInStringEscU123 is the state after reading `"\u123` during a quoted string.
   398  func stateInStringEscU123(s *scanner, c byte) int {
   399  	if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
   400  		s.step = stateInString
   401  		return scanContinue
   402  	}
   403  	// numbers
   404  	return s.error(c, "in \\u hexadecimal character escape")
   405  }
   406  
   407  // stateNeg is the state after reading `-` during a number.
   408  func stateNeg(s *scanner, c byte) int {
   409  	if c == '0' {
   410  		s.step = state0
   411  		return scanContinue
   412  	}
   413  	if '1' <= c && c <= '9' {
   414  		s.step = state1
   415  		return scanContinue
   416  	}
   417  	return s.error(c, "in numeric literal")
   418  }
   419  
   420  // state1 is the state after reading a non-zero integer during a number,
   421  // such as after reading `1` or `100` but not `0`.
   422  func state1(s *scanner, c byte) int {
   423  	if '0' <= c && c <= '9' {
   424  		s.step = state1
   425  		return scanContinue
   426  	}
   427  	return state0(s, c)
   428  }
   429  
   430  // state0 is the state after reading `0` during a number.
   431  func state0(s *scanner, c byte) int {
   432  	if c == '.' {
   433  		s.step = stateDot
   434  		return scanContinue
   435  	}
   436  	if c == 'e' || c == 'E' {
   437  		s.step = stateE
   438  		return scanContinue
   439  	}
   440  	return stateEndValue(s, c)
   441  }
   442  
   443  // stateDot is the state after reading the integer and decimal point in a number,
   444  // such as after reading `1.`.
   445  func stateDot(s *scanner, c byte) int {
   446  	if '0' <= c && c <= '9' {
   447  		s.step = stateDot0
   448  		return scanContinue
   449  	}
   450  	return s.error(c, "after decimal point in numeric literal")
   451  }
   452  
   453  // stateDot0 is the state after reading the integer, decimal point, and subsequent
   454  // digits of a number, such as after reading `3.14`.
   455  func stateDot0(s *scanner, c byte) int {
   456  	if '0' <= c && c <= '9' {
   457  		return scanContinue
   458  	}
   459  	if c == 'e' || c == 'E' {
   460  		s.step = stateE
   461  		return scanContinue
   462  	}
   463  	return stateEndValue(s, c)
   464  }
   465  
   466  // stateE is the state after reading the mantissa and e in a number,
   467  // such as after reading `314e` or `0.314e`.
   468  func stateE(s *scanner, c byte) int {
   469  	if c == '+' || c == '-' {
   470  		s.step = stateESign
   471  		return scanContinue
   472  	}
   473  	return stateESign(s, c)
   474  }
   475  
   476  // stateESign is the state after reading the mantissa, e, and sign in a number,
   477  // such as after reading `314e-` or `0.314e+`.
   478  func stateESign(s *scanner, c byte) int {
   479  	if '0' <= c && c <= '9' {
   480  		s.step = stateE0
   481  		return scanContinue
   482  	}
   483  	return s.error(c, "in exponent of numeric literal")
   484  }
   485  
   486  // stateE0 is the state after reading the mantissa, e, optional sign,
   487  // and at least one digit of the exponent in a number,
   488  // such as after reading `314e-2` or `0.314e+1` or `3.14e0`.
   489  func stateE0(s *scanner, c byte) int {
   490  	if '0' <= c && c <= '9' {
   491  		return scanContinue
   492  	}
   493  	return stateEndValue(s, c)
   494  }
   495  
   496  // stateT is the state after reading `t`.
   497  func stateT(s *scanner, c byte) int {
   498  	if c == 'r' {
   499  		s.step = stateTr
   500  		return scanContinue
   501  	}
   502  	return s.error(c, "in literal true (expecting 'r')")
   503  }
   504  
   505  // stateTr is the state after reading `tr`.
   506  func stateTr(s *scanner, c byte) int {
   507  	if c == 'u' {
   508  		s.step = stateTru
   509  		return scanContinue
   510  	}
   511  	return s.error(c, "in literal true (expecting 'u')")
   512  }
   513  
   514  // stateTru is the state after reading `tru`.
   515  func stateTru(s *scanner, c byte) int {
   516  	if c == 'e' {
   517  		s.step = stateEndValue
   518  		return scanContinue
   519  	}
   520  	return s.error(c, "in literal true (expecting 'e')")
   521  }
   522  
   523  // stateF is the state after reading `f`.
   524  func stateF(s *scanner, c byte) int {
   525  	if c == 'a' {
   526  		s.step = stateFa
   527  		return scanContinue
   528  	}
   529  	return s.error(c, "in literal false (expecting 'a')")
   530  }
   531  
   532  // stateFa is the state after reading `fa`.
   533  func stateFa(s *scanner, c byte) int {
   534  	if c == 'l' {
   535  		s.step = stateFal
   536  		return scanContinue
   537  	}
   538  	return s.error(c, "in literal false (expecting 'l')")
   539  }
   540  
   541  // stateFal is the state after reading `fal`.
   542  func stateFal(s *scanner, c byte) int {
   543  	if c == 's' {
   544  		s.step = stateFals
   545  		return scanContinue
   546  	}
   547  	return s.error(c, "in literal false (expecting 's')")
   548  }
   549  
   550  // stateFals is the state after reading `fals`.
   551  func stateFals(s *scanner, c byte) int {
   552  	if c == 'e' {
   553  		s.step = stateEndValue
   554  		return scanContinue
   555  	}
   556  	return s.error(c, "in literal false (expecting 'e')")
   557  }
   558  
   559  // stateN is the state after reading `n`.
   560  func stateN(s *scanner, c byte) int {
   561  	if c == 'u' {
   562  		s.step = stateNu
   563  		return scanContinue
   564  	}
   565  	return s.error(c, "in literal null (expecting 'u')")
   566  }
   567  
   568  // stateNu is the state after reading `nu`.
   569  func stateNu(s *scanner, c byte) int {
   570  	if c == 'l' {
   571  		s.step = stateNul
   572  		return scanContinue
   573  	}
   574  	return s.error(c, "in literal null (expecting 'l')")
   575  }
   576  
   577  // stateNul is the state after reading `nul`.
   578  func stateNul(s *scanner, c byte) int {
   579  	if c == 'l' {
   580  		s.step = stateEndValue
   581  		return scanContinue
   582  	}
   583  	return s.error(c, "in literal null (expecting 'l')")
   584  }
   585  
// stateError is the state after reaching a syntax error,
// such as after reading `[1}` or `5.1.2`.
// It ignores its input and never changes s.step, so once the scanner
// enters this state every later step reports scanError.
func stateError(s *scanner, c byte) int {
	return scanError
}
   591  
   592  // error records an error and switches to the error state.
   593  func (s *scanner) error(c byte, context string) int {
   594  	s.step = stateError
   595  	s.err = &SyntaxError{"invalid character " + quoteChar(c) + " " + context, s.bytes}
   596  	return scanError
   597  }
   598  
   599  // quoteChar formats c as a quoted character literal.
   600  func quoteChar(c byte) string {
   601  	// special cases - different from quoted strings
   602  	if c == '\'' {
   603  		return `'\''`
   604  	}
   605  	if c == '"' {
   606  		return `'"'`
   607  	}
   608  
   609  	// use quoted string with different quotation marks
   610  	s := strconv.Quote(string(c))
   611  	return "'" + s[1:len(s)-1] + "'"
   612  }
   613
View as plain text