Source file src/go/scanner/scanner.go

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package scanner implements a scanner for Go source text.
     6  // It takes a []byte as source which can then be tokenized
     7  // through repeated calls to the Scan method.
     8  package scanner
     9  
    10  import (
    11  	"bytes"
    12  	"fmt"
    13  	"go/token"
    14  	"path/filepath"
    15  	"strconv"
    16  	"unicode"
    17  	"unicode/utf8"
    18  )
    19  
    20  // An ErrorHandler may be provided to Scanner.Init. If a syntax error is
    21  // encountered and a handler was installed, the handler is called with a
    22  // position and an error message. The position points to the beginning of
    23  // the offending token.
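        //
        // For instance, a handler might simply count errors and report them; this is
        // a minimal sketch (the use of fmt/os and the output format are illustrative
        // only):
        //
        //	var count int
        //	handler := func(pos token.Position, msg string) {
        //		count++
        //		fmt.Fprintf(os.Stderr, "%s: %s\n", pos, msg)
        //	}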
    24  type ErrorHandler func(pos token.Position, msg string)
    25  
    26  // A Scanner holds the scanner's internal state while processing
    27  // a given text. It can be allocated as part of another data
    28  // structure but must be initialized via Init before use.
    29  type Scanner struct {
    30  	// immutable state
    31  	file *token.File  // source file handle
    32  	dir  string       // directory portion of file.Name()
    33  	src  []byte       // source
    34  	err  ErrorHandler // error reporting; or nil
    35  	mode Mode         // scanning mode
    36  
    37  	// scanning state
    38  	ch         rune      // current character
    39  	offset     int       // character offset
    40  	rdOffset   int       // reading offset (position after current character)
    41  	lineOffset int       // current line offset
    42  	insertSemi bool      // insert a semicolon before next newline
    43  	nlPos      token.Pos // position of newline in preceding comment
    44  
    45  	// public state - ok to modify
    46  	ErrorCount int // number of errors encountered
    47  }
    48  
    49  const (
    50  	bom = 0xFEFF // byte order mark, only permitted as very first character
    51  	eof = -1     // end of file
    52  )
    53  
    54  // Read the next Unicode char into s.ch.
    55  // s.ch < 0 means end-of-file.
    56  //
    57  // For optimization, there is some overlap between this method and
    58  // s.scanIdentifier.
    59  func (s *Scanner) next() {
    60  	if s.rdOffset < len(s.src) {
    61  		s.offset = s.rdOffset
    62  		if s.ch == '\n' {
    63  			s.lineOffset = s.offset
    64  			s.file.AddLine(s.offset)
    65  		}
    66  		r, w := rune(s.src[s.rdOffset]), 1
    67  		switch {
    68  		case r == 0:
    69  			s.error(s.offset, "illegal character NUL")
    70  		case r >= utf8.RuneSelf:
    71  			// not ASCII
    72  			r, w = utf8.DecodeRune(s.src[s.rdOffset:])
    73  			if r == utf8.RuneError && w == 1 {
    74  				s.error(s.offset, "illegal UTF-8 encoding")
    75  			} else if r == bom && s.offset > 0 {
    76  				s.error(s.offset, "illegal byte order mark")
    77  			}
    78  		}
    79  		s.rdOffset += w
    80  		s.ch = r
    81  	} else {
    82  		s.offset = len(s.src)
    83  		if s.ch == '\n' {
    84  			s.lineOffset = s.offset
    85  			s.file.AddLine(s.offset)
    86  		}
    87  		s.ch = eof
    88  	}
    89  }
    90  
    91  // peek returns the byte following the most recently read character without
    92  // advancing the scanner. If the scanner is at EOF, peek returns 0.
    93  func (s *Scanner) peek() byte {
    94  	if s.rdOffset < len(s.src) {
    95  		return s.src[s.rdOffset]
    96  	}
    97  	return 0
    98  }
    99  
   100  // A mode value is a set of flags (or 0).
   101  // They control scanner behavior.
   102  type Mode uint
   103  
   104  const (
   105  	ScanComments    Mode = 1 << iota // return comments as COMMENT tokens
   106  	dontInsertSemis                  // do not automatically insert semicolons - for testing only
   107  )
   108  
   109  // Init prepares the scanner s to tokenize the text src by setting the
   110  // scanner at the beginning of src. The scanner uses the file set file
   111  // for position information and it adds line information for each line.
   112  // It is ok to re-use the same file when re-scanning the same source, as
   113  // line information which is already present is ignored. Init causes a
   114  // panic if the file size does not match the src size.
   115  //
   116  // Calls to Scan will invoke the error handler err if they encounter a
   117  // syntax error and err is not nil. Also, for each error encountered,
   118  // the Scanner field ErrorCount is incremented by one. The mode parameter
   119  // determines how comments are handled.
   120  //
   121  // Note that Init may call err if there is an error in the first character
   122  // of the file.
   123  func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) {
   124  	// Explicitly initialize all fields since a scanner may be reused.
   125  	if file.Size() != len(src) {
   126  		panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src)))
   127  	}
   128  	s.file = file
   129  	s.dir, _ = filepath.Split(file.Name())
   130  	s.src = src
   131  	s.err = err
   132  	s.mode = mode
   133  
   134  	s.ch = ' '
   135  	s.offset = 0
   136  	s.rdOffset = 0
   137  	s.lineOffset = 0
   138  	s.insertSemi = false
   139  	s.ErrorCount = 0
   140  
   141  	s.next()
   142  	if s.ch == bom {
   143  		s.next() // ignore BOM at file beginning
   144  	}
   145  }
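
        // A minimal end-to-end sketch of Init followed by a Scan loop, written as a
        // client package would write it (the file name, source text, and output
        // format below are illustrative only):
        //
        //	src := []byte("cos(x) + 1i*sin(x) // Euler")
        //
        //	fset := token.NewFileSet()
        //	file := fset.AddFile("example.go", fset.Base(), len(src))
        //
        //	var s scanner.Scanner
        //	s.Init(file, src, nil /* no error handler */, scanner.ScanComments)
        //
        //	for {
        //		pos, tok, lit := s.Scan()
        //		if tok == token.EOF {
        //			break
        //		}
        //		fmt.Printf("%s\t%s\t%q\n", fset.Position(pos), tok, lit)
        //	}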
   146  
   147  func (s *Scanner) error(offs int, msg string) {
   148  	if s.err != nil {
   149  		s.err(s.file.Position(s.file.Pos(offs)), msg)
   150  	}
   151  	s.ErrorCount++
   152  }
   153  
   154  func (s *Scanner) errorf(offs int, format string, args ...any) {
   155  	s.error(offs, fmt.Sprintf(format, args...))
   156  }
   157  
   158  // scanComment returns the text of the comment and (if nonzero)
   159  // the offset of the first newline within it, which implies a
   160  // /*...*/ comment.
   161  func (s *Scanner) scanComment() (string, int) {
   162  	// initial '/' already consumed; s.ch == '/' || s.ch == '*'
   163  	offs := s.offset - 1 // position of initial '/'
   164  	next := -1           // position immediately following the comment; < 0 means invalid comment
   165  	numCR := 0
   166  	nlOffset := 0 // offset of first newline within /*...*/ comment
   167  
   168  	if s.ch == '/' {
   169  		//-style comment
   170  		// (the final '\n' is not considered part of the comment)
   171  		s.next()
   172  		for s.ch != '\n' && s.ch >= 0 {
   173  			if s.ch == '\r' {
   174  				numCR++
   175  			}
   176  			s.next()
   177  		}
   178  		// if we are at '\n', the position following the comment is afterwards
   179  		next = s.offset
   180  		if s.ch == '\n' {
   181  			next++
   182  		}
   183  		goto exit
   184  	}
   185  
   186  	/*-style comment */
   187  	s.next()
   188  	for s.ch >= 0 {
   189  		ch := s.ch
   190  		if ch == '\r' {
   191  			numCR++
   192  		} else if ch == '\n' && nlOffset == 0 {
   193  			nlOffset = s.offset
   194  		}
   195  		s.next()
   196  		if ch == '*' && s.ch == '/' {
   197  			s.next()
   198  			next = s.offset
   199  			goto exit
   200  		}
   201  	}
   202  
   203  	s.error(offs, "comment not terminated")
   204  
   205  exit:
   206  	lit := s.src[offs:s.offset]
   207  
   208  	// On Windows, a (//-comment) line may end in "\r\n".
   209  	// Remove the final '\r' before analyzing the text for
   210  	// line directives (matching the compiler). Remove any
   211  	// other '\r' afterwards (matching the pre-existing
   212  	// behavior of the scanner).
   213  	if numCR > 0 && len(lit) >= 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' {
   214  		lit = lit[:len(lit)-1]
   215  		numCR--
   216  	}
   217  
   218  	// interpret line directives
   219  	// (//line directives must start at the beginning of the current line)
   220  	if next >= 0 /* implies valid comment */ && (lit[1] == '*' || offs == s.lineOffset) && bytes.HasPrefix(lit[2:], prefix) {
   221  		s.updateLineInfo(next, offs, lit)
   222  	}
   223  
   224  	if numCR > 0 {
   225  		lit = stripCR(lit, lit[1] == '*')
   226  	}
   227  
   228  	return string(lit), nlOffset
   229  }
   230  
   231  var prefix = []byte("line ")
   232  
   233  // updateLineInfo parses the incoming comment text at offset offs
   234  // as a line directive. If successful, it updates the line info table
   235  // for the position next per the line directive.
   236  func (s *Scanner) updateLineInfo(next, offs int, text []byte) {
   237  	// extract comment text
   238  	if text[1] == '*' {
   239  		text = text[:len(text)-2] // lop off trailing "*/"
   240  	}
   241  	text = text[7:] // lop off leading "//line " or "/*line "
   242  	offs += 7
   243  
   244  	i, n, ok := trailingDigits(text)
   245  	if i == 0 {
   246  		return // ignore (not a line directive)
   247  	}
   248  	// i > 0
   249  
   250  	if !ok {
   251  		// text has a suffix :xxx but xxx is not a number
   252  		s.error(offs+i, "invalid line number: "+string(text[i:]))
   253  		return
   254  	}
   255  
   256  	var line, col int
   257  	i2, n2, ok2 := trailingDigits(text[:i-1])
   258  	if ok2 {
   259  		//line filename:line:col
   260  		i, i2 = i2, i
   261  		line, col = n2, n
   262  		if col == 0 {
   263  			s.error(offs+i2, "invalid column number: "+string(text[i2:]))
   264  			return
   265  		}
   266  		text = text[:i2-1] // lop off ":col"
   267  	} else {
   268  		//line filename:line
   269  		line = n
   270  	}
   271  
   272  	if line == 0 {
   273  		s.error(offs+i, "invalid line number: "+string(text[i:]))
   274  		return
   275  	}
   276  
   277  	// If we have a column (//line filename:line:col form),
   278  	// an empty filename means to use the previous filename.
   279  	filename := string(text[:i-1]) // lop off ":line", and trim white space
   280  	if filename == "" && ok2 {
   281  		filename = s.file.Position(s.file.Pos(offs)).Filename
   282  	} else if filename != "" {
   283  		// Put a relative filename in the current directory.
   284  		// This is for compatibility with earlier releases.
   285  		// See issue 26671.
   286  		filename = filepath.Clean(filename)
   287  		if !filepath.IsAbs(filename) {
   288  			filename = filepath.Join(s.dir, filename)
   289  		}
   290  	}
   291  
   292  	s.file.AddLineColumnInfo(next, filename, line, col)
   293  }
   294  
   295  func trailingDigits(text []byte) (int, int, bool) {
   296  	i := bytes.LastIndexByte(text, ':') // look from right (Windows filenames may contain ':')
   297  	if i < 0 {
   298  		return 0, 0, false // no ":"
   299  	}
   300  	// i >= 0
   301  	n, err := strconv.ParseUint(string(text[i+1:]), 10, 0)
   302  	return i + 1, int(n), err == nil
   303  }
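
        // As a worked example of the directive parsing above: for the comment
        // "//line foo.go:10:5", updateLineInfo lops off the leading "//line ",
        // leaving "foo.go:10:5"; trailingDigits then splits off ":5" (the column)
        // and ":10" (the line), leaving "foo.go" as the filename. For
        // "//line foo.go:10" only the line number is set.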
   304  
   305  func isLetter(ch rune) bool {
   306  	return 'a' <= lower(ch) && lower(ch) <= 'z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch)
   307  }
   308  
   309  func isDigit(ch rune) bool {
   310  	return isDecimal(ch) || ch >= utf8.RuneSelf && unicode.IsDigit(ch)
   311  }
   312  
   313  // scanIdentifier reads the string of valid identifier characters at s.offset.
   314  // It must only be called when s.ch is known to be a valid letter.
   315  //
   316  // Be careful when making changes to this function: it is optimized and affects
   317  // scanning performance significantly.
   318  func (s *Scanner) scanIdentifier() string {
   319  	offs := s.offset
   320  
   321  	// Optimize for the common case of an ASCII identifier.
   322  	//
   323  	// Ranging over s.src[s.rdOffset:] lets us avoid some bounds checks, and
   324  	// avoids conversions to runes.
   325  	//
   326  	// In case we encounter a non-ASCII character, fall back on the slower path
   327  	// of calling into s.next().
   328  	for rdOffset, b := range s.src[s.rdOffset:] {
   329  		if 'a' <= b && b <= 'z' || 'A' <= b && b <= 'Z' || b == '_' || '0' <= b && b <= '9' {
   330  			// Avoid assigning a rune for the common case of an ASCII character.
   331  			continue
   332  		}
   333  		s.rdOffset += rdOffset
   334  		if 0 < b && b < utf8.RuneSelf {
   335  			// Optimization: we've encountered an ASCII character that's not a letter
   336  			// or number. Avoid the call into s.next() and corresponding set up.
   337  			//
   338  			// Note that s.next() does some line accounting if s.ch is '\n', so this
   339  			// shortcut is only possible because we know that the preceding character
   340  			// is not '\n'.
   341  			s.ch = rune(b)
   342  			s.offset = s.rdOffset
   343  			s.rdOffset++
   344  			goto exit
   345  		}
   346  		// We know that the preceding character is valid for an identifier because
   347  		// scanIdentifier is only called when s.ch is a letter, so calling s.next()
   348  		// at s.rdOffset resets the scanner state.
   349  		s.next()
   350  		for isLetter(s.ch) || isDigit(s.ch) {
   351  			s.next()
   352  		}
   353  		goto exit
   354  	}
   355  	s.offset = len(s.src)
   356  	s.rdOffset = len(s.src)
   357  	s.ch = eof
   358  
   359  exit:
   360  	return string(s.src[offs:s.offset])
   361  }
   362  
   363  func digitVal(ch rune) int {
   364  	switch {
   365  	case '0' <= ch && ch <= '9':
   366  		return int(ch - '0')
   367  	case 'a' <= lower(ch) && lower(ch) <= 'f':
   368  		return int(lower(ch) - 'a' + 10)
   369  	}
   370  	return 16 // larger than any legal digit val
   371  }
   372  
   373  func lower(ch rune) rune     { return ('a' - 'A') | ch } // returns lower-case ch iff ch is ASCII letter
   374  func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' }
   375  func isHex(ch rune) bool     { return '0' <= ch && ch <= '9' || 'a' <= lower(ch) && lower(ch) <= 'f' }
   376  
   377  // digits accepts the sequence { digit | '_' }.
   378  // If base <= 10, digits accepts any decimal digit but records
   379  // the offset (relative to the source start) of a digit >= base
   380  // in *invalid, if *invalid < 0.
   381  // digits returns a bitset describing whether the sequence contained
   382  // digits (bit 0 is set), or separators '_' (bit 1 is set).
   383  func (s *Scanner) digits(base int, invalid *int) (digsep int) {
   384  	if base <= 10 {
   385  		max := rune('0' + base)
   386  		for isDecimal(s.ch) || s.ch == '_' {
   387  			ds := 1
   388  			if s.ch == '_' {
   389  				ds = 2
   390  			} else if s.ch >= max && *invalid < 0 {
   391  				*invalid = s.offset // record invalid rune offset
   392  			}
   393  			digsep |= ds
   394  			s.next()
   395  		}
   396  	} else {
   397  		for isHex(s.ch) || s.ch == '_' {
   398  			ds := 1
   399  			if s.ch == '_' {
   400  				ds = 2
   401  			}
   402  			digsep |= ds
   403  			s.next()
   404  		}
   405  	}
   406  	return
   407  }
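
        // For example, scanning the digits of "1_000" sets both bits (digsep == 3:
        // digits and a '_' separator were seen), while "777" sets only bit 0
        // (digsep == 1).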
   408  
   409  func (s *Scanner) scanNumber() (token.Token, string) {
   410  	offs := s.offset
   411  	tok := token.ILLEGAL
   412  
   413  	base := 10        // number base
   414  	prefix := rune(0) // one of 0 (decimal), '0' (0-octal), 'x', 'o', or 'b'
   415  	digsep := 0       // bit 0: digit present, bit 1: '_' present
   416  	invalid := -1     // index of invalid digit in literal, or < 0
   417  
   418  	// integer part
   419  	if s.ch != '.' {
   420  		tok = token.INT
   421  		if s.ch == '0' {
   422  			s.next()
   423  			switch lower(s.ch) {
   424  			case 'x':
   425  				s.next()
   426  				base, prefix = 16, 'x'
   427  			case 'o':
   428  				s.next()
   429  				base, prefix = 8, 'o'
   430  			case 'b':
   431  				s.next()
   432  				base, prefix = 2, 'b'
   433  			default:
   434  				base, prefix = 8, '0'
   435  				digsep = 1 // leading 0
   436  			}
   437  		}
   438  		digsep |= s.digits(base, &invalid)
   439  	}
   440  
   441  	// fractional part
   442  	if s.ch == '.' {
   443  		tok = token.FLOAT
   444  		if prefix == 'o' || prefix == 'b' {
   445  			s.error(s.offset, "invalid radix point in "+litname(prefix))
   446  		}
   447  		s.next()
   448  		digsep |= s.digits(base, &invalid)
   449  	}
   450  
   451  	if digsep&1 == 0 {
   452  		s.error(s.offset, litname(prefix)+" has no digits")
   453  	}
   454  
   455  	// exponent
   456  	if e := lower(s.ch); e == 'e' || e == 'p' {
   457  		switch {
   458  		case e == 'e' && prefix != 0 && prefix != '0':
   459  			s.errorf(s.offset, "%q exponent requires decimal mantissa", s.ch)
   460  		case e == 'p' && prefix != 'x':
   461  			s.errorf(s.offset, "%q exponent requires hexadecimal mantissa", s.ch)
   462  		}
   463  		s.next()
   464  		tok = token.FLOAT
   465  		if s.ch == '+' || s.ch == '-' {
   466  			s.next()
   467  		}
   468  		ds := s.digits(10, nil)
   469  		digsep |= ds
   470  		if ds&1 == 0 {
   471  			s.error(s.offset, "exponent has no digits")
   472  		}
   473  	} else if prefix == 'x' && tok == token.FLOAT {
   474  		s.error(s.offset, "hexadecimal mantissa requires a 'p' exponent")
   475  	}
   476  
   477  	// suffix 'i'
   478  	if s.ch == 'i' {
   479  		tok = token.IMAG
   480  		s.next()
   481  	}
   482  
   483  	lit := string(s.src[offs:s.offset])
   484  	if tok == token.INT && invalid >= 0 {
   485  		s.errorf(invalid, "invalid digit %q in %s", lit[invalid-offs], litname(prefix))
   486  	}
   487  	if digsep&2 != 0 {
   488  		if i := invalidSep(lit); i >= 0 {
   489  			s.error(offs+i, "'_' must separate successive digits")
   490  		}
   491  	}
   492  
   493  	return tok, lit
   494  }
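
        // A few sample literals and the token scanNumber would report for them
        // (a sketch; offsets and exact error positions omitted):
        //
        //	"42"      token.INT
        //	"0x1p-2"  token.FLOAT
        //	"0b10_1"  token.INT
        //	"3.14i"   token.IMAG
        //	"089"     token.INT, with error "invalid digit '8' in octal literal"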
   495  
   496  func litname(prefix rune) string {
   497  	switch prefix {
   498  	case 'x':
   499  		return "hexadecimal literal"
   500  	case 'o', '0':
   501  		return "octal literal"
   502  	case 'b':
   503  		return "binary literal"
   504  	}
   505  	return "decimal literal"
   506  }
   507  
   508  // invalidSep returns the index of the first invalid separator in x, or -1.
   509  func invalidSep(x string) int {
   510  	x1 := ' ' // prefix char, we only care if it's 'x'
   511  	d := '.'  // digit, one of '_', '0' (a digit), or '.' (anything else)
   512  	i := 0
   513  
   514  	// a prefix counts as a digit
   515  	if len(x) >= 2 && x[0] == '0' {
   516  		x1 = lower(rune(x[1]))
   517  		if x1 == 'x' || x1 == 'o' || x1 == 'b' {
   518  			d = '0'
   519  			i = 2
   520  		}
   521  	}
   522  
   523  	// mantissa and exponent
   524  	for ; i < len(x); i++ {
   525  		p := d // previous digit
   526  		d = rune(x[i])
   527  		switch {
   528  		case d == '_':
   529  			if p != '0' {
   530  				return i
   531  			}
   532  		case isDecimal(d) || x1 == 'x' && isHex(d):
   533  			d = '0'
   534  		default:
   535  			if p == '_' {
   536  				return i - 1
   537  			}
   538  			d = '.'
   539  		}
   540  	}
   541  	if d == '_' {
   542  		return len(x) - 1
   543  	}
   544  
   545  	return -1
   546  }
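
        // For example, invalidSep("1__2") == 2 (a '_' not preceded by a digit),
        // invalidSep("1_") == 1 (trailing '_'), and invalidSep("0x_1") == -1
        // (the "0x" prefix counts as a digit, so the separator is legal).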
   547  
   548  // scanEscape parses an escape sequence where quote is the accepted
   549  // escaped quote. In case of a syntax error, it stops at the offending
   550  // character (without consuming it) and returns false. Otherwise
   551  // it returns true.
   552  func (s *Scanner) scanEscape(quote rune) bool {
   553  	offs := s.offset
   554  
   555  	var n int
   556  	var base, max uint32
   557  	switch s.ch {
   558  	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
   559  		s.next()
   560  		return true
   561  	case '0', '1', '2', '3', '4', '5', '6', '7':
   562  		n, base, max = 3, 8, 255
   563  	case 'x':
   564  		s.next()
   565  		n, base, max = 2, 16, 255
   566  	case 'u':
   567  		s.next()
   568  		n, base, max = 4, 16, unicode.MaxRune
   569  	case 'U':
   570  		s.next()
   571  		n, base, max = 8, 16, unicode.MaxRune
   572  	default:
   573  		msg := "unknown escape sequence"
   574  		if s.ch < 0 {
   575  			msg = "escape sequence not terminated"
   576  		}
   577  		s.error(offs, msg)
   578  		return false
   579  	}
   580  
   581  	var x uint32
   582  	for n > 0 {
   583  		d := uint32(digitVal(s.ch))
   584  		if d >= base {
   585  			msg := fmt.Sprintf("illegal character %#U in escape sequence", s.ch)
   586  			if s.ch < 0 {
   587  				msg = "escape sequence not terminated"
   588  			}
   589  			s.error(s.offset, msg)
   590  			return false
   591  		}
   592  		x = x*base + d
   593  		s.next()
   594  		n--
   595  	}
   596  
   597  	if x > max || 0xD800 <= x && x < 0xE000 {
   598  		s.error(offs, "escape sequence is invalid Unicode code point")
   599  		return false
   600  	}
   601  
   602  	return true
   603  }
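
        // For example, within a string literal "\x41" is accepted (two hex digits,
        // value 0x41 <= 255), "\xZZ" reports an illegal character in the escape
        // sequence, and "\U00110000" is rejected as an invalid Unicode code point.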
   604  
   605  func (s *Scanner) scanRune() string {
   606  	// '\'' opening already consumed
   607  	offs := s.offset - 1
   608  
   609  	valid := true
   610  	n := 0
   611  	for {
   612  		ch := s.ch
   613  		if ch == '\n' || ch < 0 {
   614  			// only report error if we don't have one already
   615  			if valid {
   616  				s.error(offs, "rune literal not terminated")
   617  				valid = false
   618  			}
   619  			break
   620  		}
   621  		s.next()
   622  		if ch == '\'' {
   623  			break
   624  		}
   625  		n++
   626  		if ch == '\\' {
   627  			if !s.scanEscape('\'') {
   628  				valid = false
   629  			}
   630  			// continue to read to closing quote
   631  		}
   632  	}
   633  
   634  	if valid && n != 1 {
   635  		s.error(offs, "illegal rune literal")
   636  	}
   637  
   638  	return string(s.src[offs:s.offset])
   639  }
   640  
   641  func (s *Scanner) scanString() string {
   642  	// '"' opening already consumed
   643  	offs := s.offset - 1
   644  
   645  	for {
   646  		ch := s.ch
   647  		if ch == '\n' || ch < 0 {
   648  			s.error(offs, "string literal not terminated")
   649  			break
   650  		}
   651  		s.next()
   652  		if ch == '"' {
   653  			break
   654  		}
   655  		if ch == '\\' {
   656  			s.scanEscape('"')
   657  		}
   658  	}
   659  
   660  	return string(s.src[offs:s.offset])
   661  }
   662  
   663  func stripCR(b []byte, comment bool) []byte {
   664  	c := make([]byte, len(b))
   665  	i := 0
   666  	for j, ch := range b {
   667  		// In a /*-style comment, don't strip \r from *\r/ (incl.
   668  		// sequences of \r from *\r\r...\r/) since the resulting
   669  		// */ would terminate the comment too early unless the \r
   670  		// is immediately following the opening /* in which case
   671  		// it's ok because /*/ is not closed yet (issue #11151).
   672  		if ch != '\r' || comment && i > len("/*") && c[i-1] == '*' && j+1 < len(b) && b[j+1] == '/' {
   673  			c[i] = ch
   674  			i++
   675  		}
   676  	}
   677  	return c[:i]
   678  }
   679  
   680  func (s *Scanner) scanRawString() string {
   681  	// '`' opening already consumed
   682  	offs := s.offset - 1
   683  
   684  	hasCR := false
   685  	for {
   686  		ch := s.ch
   687  		if ch < 0 {
   688  			s.error(offs, "raw string literal not terminated")
   689  			break
   690  		}
   691  		s.next()
   692  		if ch == '`' {
   693  			break
   694  		}
   695  		if ch == '\r' {
   696  			hasCR = true
   697  		}
   698  	}
   699  
   700  	lit := s.src[offs:s.offset]
   701  	if hasCR {
   702  		lit = stripCR(lit, false)
   703  	}
   704  
   705  	return string(lit)
   706  }
   707  
   708  func (s *Scanner) skipWhitespace() {
   709  	for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !s.insertSemi || s.ch == '\r' {
   710  		s.next()
   711  	}
   712  }
   713  
   714  // Helper functions for scanning multi-byte tokens such as >> += >>= .
   715  // Different routines recognize different length tok_i based on matches
   716  // of ch_i. If a token ends in '=', the result is tok1 or tok3
   717  // respectively. Otherwise, the result is tok0 if there was no other
   718  // matching character, or tok2 if the matching character was ch2.
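        //
        // For example, Scan handles '<' (after "<-" has been ruled out) with
        //
        //	s.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
        //
        // which yields LSS for "<", LEQ for "<=", SHL for "<<", and SHL_ASSIGN for "<<=".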
   719  
   720  func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token {
   721  	if s.ch == '=' {
   722  		s.next()
   723  		return tok1
   724  	}
   725  	return tok0
   726  }
   727  
   728  func (s *Scanner) switch3(tok0, tok1 token.Token, ch2 rune, tok2 token.Token) token.Token {
   729  	if s.ch == '=' {
   730  		s.next()
   731  		return tok1
   732  	}
   733  	if s.ch == ch2 {
   734  		s.next()
   735  		return tok2
   736  	}
   737  	return tok0
   738  }
   739  
   740  func (s *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Token) token.Token {
   741  	if s.ch == '=' {
   742  		s.next()
   743  		return tok1
   744  	}
   745  	if s.ch == ch2 {
   746  		s.next()
   747  		if s.ch == '=' {
   748  			s.next()
   749  			return tok3
   750  		}
   751  		return tok2
   752  	}
   753  	return tok0
   754  }
   755  
   756  // Scan scans the next token and returns the token position, the token,
   757  // and its literal string if applicable. The source end is indicated by
   758  // token.EOF.
   759  //
   760  // If the returned token is a literal (token.IDENT, token.INT, token.FLOAT,
   761  // token.IMAG, token.CHAR, token.STRING) or token.COMMENT, the literal string
   762  // has the corresponding value.
   763  //
   764  // If the returned token is a keyword, the literal string is the keyword.
   765  //
   766  // If the returned token is token.SEMICOLON, the corresponding
   767  // literal string is ";" if the semicolon was present in the source,
   768  // and "\n" if the semicolon was inserted because of a newline or
   769  // at EOF.
   770  //
   771  // If the returned token is token.ILLEGAL, the literal string is the
   772  // offending character.
   773  //
   774  // In all other cases, Scan returns an empty literal string.
   775  //
   776  // For more tolerant parsing, Scan will return a valid token if
   777  // possible even if a syntax error was encountered. Thus, even
   778  // if the resulting token sequence contains no illegal tokens,
   779  // a client may not assume that no error occurred. Instead it
   780  // must check the scanner's ErrorCount or the number of calls
   781  // of the error handler, if there was one installed.
   782  //
   783  // Scan adds line information to the file added to the file
   784  // set with Init. Token positions are relative to that file
   785  // and thus relative to the file set.
   786  func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
   787  scanAgain:
   788  	if s.nlPos.IsValid() {
   789  		// Return artificial ';' token after /*...*/ comment
   790  		// containing newline, at position of first newline.
   791  		pos, tok, lit = s.nlPos, token.SEMICOLON, "\n"
   792  		s.nlPos = token.NoPos
   793  		return
   794  	}
   795  
   796  	s.skipWhitespace()
   797  
   798  	// current token start
   799  	pos = s.file.Pos(s.offset)
   800  
   801  	// determine token value
   802  	insertSemi := false
   803  	switch ch := s.ch; {
   804  	case isLetter(ch):
   805  		lit = s.scanIdentifier()
   806  		if len(lit) > 1 {
   807  			// keywords are longer than one letter - avoid lookup otherwise
   808  			tok = token.Lookup(lit)
   809  			switch tok {
   810  			case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
   811  				insertSemi = true
   812  			}
   813  		} else {
   814  			insertSemi = true
   815  			tok = token.IDENT
   816  		}
   817  	case isDecimal(ch) || ch == '.' && isDecimal(rune(s.peek())):
   818  		insertSemi = true
   819  		tok, lit = s.scanNumber()
   820  	default:
   821  		s.next() // always make progress
   822  		switch ch {
   823  		case eof:
   824  			if s.insertSemi {
   825  				s.insertSemi = false // EOF consumed
   826  				return pos, token.SEMICOLON, "\n"
   827  			}
   828  			tok = token.EOF
   829  		case '\n':
   830  			// we only reach here if s.insertSemi was
   831  			// set in the first place and exited early
   832  			// from s.skipWhitespace()
   833  			s.insertSemi = false // newline consumed
   834  			return pos, token.SEMICOLON, "\n"
   835  		case '"':
   836  			insertSemi = true
   837  			tok = token.STRING
   838  			lit = s.scanString()
   839  		case '\'':
   840  			insertSemi = true
   841  			tok = token.CHAR
   842  			lit = s.scanRune()
   843  		case '`':
   844  			insertSemi = true
   845  			tok = token.STRING
   846  			lit = s.scanRawString()
   847  		case ':':
   848  			tok = s.switch2(token.COLON, token.DEFINE)
   849  		case '.':
   850  			// fractions starting with a '.' are handled by outer switch
   851  			tok = token.PERIOD
   852  			if s.ch == '.' && s.peek() == '.' {
   853  				s.next()
   854  				s.next() // consume last '.'
   855  				tok = token.ELLIPSIS
   856  			}
   857  		case ',':
   858  			tok = token.COMMA
   859  		case ';':
   860  			tok = token.SEMICOLON
   861  			lit = ";"
   862  		case '(':
   863  			tok = token.LPAREN
   864  		case ')':
   865  			insertSemi = true
   866  			tok = token.RPAREN
   867  		case '[':
   868  			tok = token.LBRACK
   869  		case ']':
   870  			insertSemi = true
   871  			tok = token.RBRACK
   872  		case '{':
   873  			tok = token.LBRACE
   874  		case '}':
   875  			insertSemi = true
   876  			tok = token.RBRACE
   877  		case '+':
   878  			tok = s.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC)
   879  			if tok == token.INC {
   880  				insertSemi = true
   881  			}
   882  		case '-':
   883  			tok = s.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC)
   884  			if tok == token.DEC {
   885  				insertSemi = true
   886  			}
   887  		case '*':
   888  			tok = s.switch2(token.MUL, token.MUL_ASSIGN)
   889  		case '/':
   890  			if s.ch == '/' || s.ch == '*' {
   891  				// comment
   892  				comment, nlOffset := s.scanComment()
   893  				if s.insertSemi && nlOffset != 0 {
   894  					// For /*...*/ containing \n, return
   895  					// COMMENT then artificial SEMICOLON.
   896  					s.nlPos = s.file.Pos(nlOffset)
   897  					s.insertSemi = false
   898  				} else {
   899  					insertSemi = s.insertSemi // preserve insertSemi info
   900  				}
   901  				if s.mode&ScanComments == 0 {
   902  					// skip comment
   903  					goto scanAgain
   904  				}
   905  				tok = token.COMMENT
   906  				lit = comment
   907  			} else {
   908  				// division
   909  				tok = s.switch2(token.QUO, token.QUO_ASSIGN)
   910  			}
   911  		case '%':
   912  			tok = s.switch2(token.REM, token.REM_ASSIGN)
   913  		case '^':
   914  			tok = s.switch2(token.XOR, token.XOR_ASSIGN)
   915  		case '<':
   916  			if s.ch == '-' {
   917  				s.next()
   918  				tok = token.ARROW
   919  			} else {
   920  				tok = s.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
   921  			}
   922  		case '>':
   923  			tok = s.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN)
   924  		case '=':
   925  			tok = s.switch2(token.ASSIGN, token.EQL)
   926  		case '!':
   927  			tok = s.switch2(token.NOT, token.NEQ)
   928  		case '&':
   929  			if s.ch == '^' {
   930  				s.next()
   931  				tok = s.switch2(token.AND_NOT, token.AND_NOT_ASSIGN)
   932  			} else {
   933  				tok = s.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND)
   934  			}
   935  		case '|':
   936  			tok = s.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
   937  		case '~':
   938  			tok = token.TILDE
   939  		default:
   940  			// next reports unexpected BOMs - don't repeat
   941  			if ch != bom {
   942  				s.errorf(s.file.Offset(pos), "illegal character %#U", ch)
   943  			}
   944  			insertSemi = s.insertSemi // preserve insertSemi info
   945  			tok = token.ILLEGAL
   946  			lit = string(ch)
   947  		}
   948  	}
   949  	if s.mode&dontInsertSemis == 0 {
   950  		s.insertSemi = insertSemi
   951  	}
   952  
   953  	return
   954  }
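
        // As a small illustration of the semicolon insertion described in Scan's
        // documentation, scanning the source "x + y\n" yields, in order
        // (positions omitted):
        //
        //	IDENT "x", ADD, IDENT "y", SEMICOLON "\n", EOF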
   955  
