Source file
src/go/scanner/scanner.go
1
2
3
4
5
6
7
8 package scanner
9
10 import (
11 "bytes"
12 "fmt"
13 "go/token"
14 "path/filepath"
15 "strconv"
16 "unicode"
17 "unicode/utf8"
18 )
19
20
21
22
23
// An ErrorHandler is a callback that receives the position of a problem in
// the source together with a message describing it. When a handler is
// installed on a Scanner, the scanner invokes it once for every error it
// reports.
type ErrorHandler func(pos token.Position, msg string)
25
26
27
28
29 type Scanner struct {
30
31 file *token.File
32 dir string
33 src []byte
34 err ErrorHandler
35 mode Mode
36
37
38 ch rune
39 offset int
40 rdOffset int
41 lineOffset int
42 insertSemi bool
43 nlPos token.Pos
44
45
46 ErrorCount int
47 }
48
// Special character values used by the scanner.
const (
	// bom is the Unicode byte order mark; next reports it as an error
	// anywhere but at offset 0.
	bom = 0xFEFF

	// eof is the value stored in Scanner.ch once the source is exhausted.
	eof = -1
)
53
54
55
56
57
58
59 func (s *Scanner) next() {
60 if s.rdOffset < len(s.src) {
61 s.offset = s.rdOffset
62 if s.ch == '\n' {
63 s.lineOffset = s.offset
64 s.file.AddLine(s.offset)
65 }
66 r, w := rune(s.src[s.rdOffset]), 1
67 switch {
68 case r == 0:
69 s.error(s.offset, "illegal character NUL")
70 case r >= utf8.RuneSelf:
71
72 r, w = utf8.DecodeRune(s.src[s.rdOffset:])
73 if r == utf8.RuneError && w == 1 {
74 s.error(s.offset, "illegal UTF-8 encoding")
75 } else if r == bom && s.offset > 0 {
76 s.error(s.offset, "illegal byte order mark")
77 }
78 }
79 s.rdOffset += w
80 s.ch = r
81 } else {
82 s.offset = len(s.src)
83 if s.ch == '\n' {
84 s.lineOffset = s.offset
85 s.file.AddLine(s.offset)
86 }
87 s.ch = eof
88 }
89 }
90
91
92
93 func (s *Scanner) peek() byte {
94 if s.rdOffset < len(s.src) {
95 return s.src[s.rdOffset]
96 }
97 return 0
98 }
99
100
101
// A Mode is a set of bit flags that controls scanner behavior.
type Mode uint

// Mode flags.
const (
	// ScanComments makes Scan return comments as COMMENT tokens
	// instead of skipping them.
	ScanComments Mode = 1 << 0

	// dontInsertSemis stops Scan from updating the automatic
	// semicolon insertion state.
	dontInsertSemis Mode = 1 << 1
)
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123 func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) {
124
125 if file.Size() != len(src) {
126 panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src)))
127 }
128 s.file = file
129 s.dir, _ = filepath.Split(file.Name())
130 s.src = src
131 s.err = err
132 s.mode = mode
133
134 s.ch = ' '
135 s.offset = 0
136 s.rdOffset = 0
137 s.lineOffset = 0
138 s.insertSemi = false
139 s.ErrorCount = 0
140
141 s.next()
142 if s.ch == bom {
143 s.next()
144 }
145 }
146
147 func (s *Scanner) error(offs int, msg string) {
148 if s.err != nil {
149 s.err(s.file.Position(s.file.Pos(offs)), msg)
150 }
151 s.ErrorCount++
152 }
153
154 func (s *Scanner) errorf(offs int, format string, args ...any) {
155 s.error(offs, fmt.Sprintf(format, args...))
156 }
157
158
159
160
161 func (s *Scanner) scanComment() (string, int) {
162
163 offs := s.offset - 1
164 next := -1
165 numCR := 0
166 nlOffset := 0
167
168 if s.ch == '/' {
169
170
171 s.next()
172 for s.ch != '\n' && s.ch >= 0 {
173 if s.ch == '\r' {
174 numCR++
175 }
176 s.next()
177 }
178
179 next = s.offset
180 if s.ch == '\n' {
181 next++
182 }
183 goto exit
184 }
185
186
187 s.next()
188 for s.ch >= 0 {
189 ch := s.ch
190 if ch == '\r' {
191 numCR++
192 } else if ch == '\n' && nlOffset == 0 {
193 nlOffset = s.offset
194 }
195 s.next()
196 if ch == '*' && s.ch == '/' {
197 s.next()
198 next = s.offset
199 goto exit
200 }
201 }
202
203 s.error(offs, "comment not terminated")
204
205 exit:
206 lit := s.src[offs:s.offset]
207
208
209
210
211
212
213 if numCR > 0 && len(lit) >= 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' {
214 lit = lit[:len(lit)-1]
215 numCR--
216 }
217
218
219
220 if next >= 0 && (lit[1] == '*' || offs == s.lineOffset) && bytes.HasPrefix(lit[2:], prefix) {
221 s.updateLineInfo(next, offs, lit)
222 }
223
224 if numCR > 0 {
225 lit = stripCR(lit, lit[1] == '*')
226 }
227
228 return string(lit), nlOffset
229 }
230
// prefix is the text that must directly follow the opening "//" or "/*" of
// a comment for scanComment to treat it as a line directive.
var prefix = []byte("line ")
232
233
234
235
236 func (s *Scanner) updateLineInfo(next, offs int, text []byte) {
237
238 if text[1] == '*' {
239 text = text[:len(text)-2]
240 }
241 text = text[7:]
242 offs += 7
243
244 i, n, ok := trailingDigits(text)
245 if i == 0 {
246 return
247 }
248
249
250 if !ok {
251
252 s.error(offs+i, "invalid line number: "+string(text[i:]))
253 return
254 }
255
256 var line, col int
257 i2, n2, ok2 := trailingDigits(text[:i-1])
258 if ok2 {
259
260 i, i2 = i2, i
261 line, col = n2, n
262 if col == 0 {
263 s.error(offs+i2, "invalid column number: "+string(text[i2:]))
264 return
265 }
266 text = text[:i2-1]
267 } else {
268
269 line = n
270 }
271
272 if line == 0 {
273 s.error(offs+i, "invalid line number: "+string(text[i:]))
274 return
275 }
276
277
278
279 filename := string(text[:i-1])
280 if filename == "" && ok2 {
281 filename = s.file.Position(s.file.Pos(offs)).Filename
282 } else if filename != "" {
283
284
285
286 filename = filepath.Clean(filename)
287 if !filepath.IsAbs(filename) {
288 filename = filepath.Join(s.dir, filename)
289 }
290 }
291
292 s.file.AddLineColumnInfo(next, filename, line, col)
293 }
294
// trailingDigits looks for the last ':' in text and parses what follows it
// as an unsigned decimal number.
//
// It returns the index just past that ':', the parsed number, and whether
// the suffix is a valid number that fits in a non-negative int. If text
// contains no ':', it returns (0, 0, false). If the suffix is not a valid
// number, or is too large for an int, the index is still returned but the
// number is 0 and the final result is false.
func trailingDigits(text []byte) (int, int, bool) {
	i := bytes.LastIndexByte(text, ':') // search from the right: the part before may itself contain ':'
	if i < 0 {
		return 0, 0, false // no ':'
	}
	// i >= 0

	n, err := strconv.ParseUint(string(text[i+1:]), 10, 0)
	// ParseUint accepts the full range of uint. Values above the largest
	// int would wrap to a negative number when converted, so reject them
	// rather than report a bogus number as valid.
	v := int(n)
	if err != nil || v < 0 {
		return i + 1, 0, false
	}
	return i + 1, v, true
}
304
305 func isLetter(ch rune) bool {
306 return 'a' <= lower(ch) && lower(ch) <= 'z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch)
307 }
308
309 func isDigit(ch rune) bool {
310 return isDecimal(ch) || ch >= utf8.RuneSelf && unicode.IsDigit(ch)
311 }
312
313
314
315
316
317
318 func (s *Scanner) scanIdentifier() string {
319 offs := s.offset
320
321
322
323
324
325
326
327
328 for rdOffset, b := range s.src[s.rdOffset:] {
329 if 'a' <= b && b <= 'z' || 'A' <= b && b <= 'Z' || b == '_' || '0' <= b && b <= '9' {
330
331 continue
332 }
333 s.rdOffset += rdOffset
334 if 0 < b && b < utf8.RuneSelf {
335
336
337
338
339
340
341 s.ch = rune(b)
342 s.offset = s.rdOffset
343 s.rdOffset++
344 goto exit
345 }
346
347
348
349 s.next()
350 for isLetter(s.ch) || isDigit(s.ch) {
351 s.next()
352 }
353 goto exit
354 }
355 s.offset = len(s.src)
356 s.rdOffset = len(s.src)
357 s.ch = eof
358
359 exit:
360 return string(s.src[offs:s.offset])
361 }
362
363 func digitVal(ch rune) int {
364 switch {
365 case '0' <= ch && ch <= '9':
366 return int(ch - '0')
367 case 'a' <= lower(ch) && lower(ch) <= 'f':
368 return int(lower(ch) - 'a' + 10)
369 }
370 return 16
371 }
372
// lower sets the bit that distinguishes ASCII lower case from upper case.
// For an ASCII letter the result is its lower-case form; other characters
// may be mapped to unrelated values, so the result is only meaningful when
// compared against lower-case letters.
func lower(ch rune) rune {
	const caseBit = 'a' - 'A'
	return ch | caseBit
}
// isDecimal reports whether ch is one of the ASCII digits '0' through '9'.
func isDecimal(ch rune) bool {
	return ch >= '0' && ch <= '9'
}
375 func isHex(ch rune) bool { return '0' <= ch && ch <= '9' || 'a' <= lower(ch) && lower(ch) <= 'f' }
376
377
378
379
380
381
382
383 func (s *Scanner) digits(base int, invalid *int) (digsep int) {
384 if base <= 10 {
385 max := rune('0' + base)
386 for isDecimal(s.ch) || s.ch == '_' {
387 ds := 1
388 if s.ch == '_' {
389 ds = 2
390 } else if s.ch >= max && *invalid < 0 {
391 *invalid = s.offset
392 }
393 digsep |= ds
394 s.next()
395 }
396 } else {
397 for isHex(s.ch) || s.ch == '_' {
398 ds := 1
399 if s.ch == '_' {
400 ds = 2
401 }
402 digsep |= ds
403 s.next()
404 }
405 }
406 return
407 }
408
409 func (s *Scanner) scanNumber() (token.Token, string) {
410 offs := s.offset
411 tok := token.ILLEGAL
412
413 base := 10
414 prefix := rune(0)
415 digsep := 0
416 invalid := -1
417
418
419 if s.ch != '.' {
420 tok = token.INT
421 if s.ch == '0' {
422 s.next()
423 switch lower(s.ch) {
424 case 'x':
425 s.next()
426 base, prefix = 16, 'x'
427 case 'o':
428 s.next()
429 base, prefix = 8, 'o'
430 case 'b':
431 s.next()
432 base, prefix = 2, 'b'
433 default:
434 base, prefix = 8, '0'
435 digsep = 1
436 }
437 }
438 digsep |= s.digits(base, &invalid)
439 }
440
441
442 if s.ch == '.' {
443 tok = token.FLOAT
444 if prefix == 'o' || prefix == 'b' {
445 s.error(s.offset, "invalid radix point in "+litname(prefix))
446 }
447 s.next()
448 digsep |= s.digits(base, &invalid)
449 }
450
451 if digsep&1 == 0 {
452 s.error(s.offset, litname(prefix)+" has no digits")
453 }
454
455
456 if e := lower(s.ch); e == 'e' || e == 'p' {
457 switch {
458 case e == 'e' && prefix != 0 && prefix != '0':
459 s.errorf(s.offset, "%q exponent requires decimal mantissa", s.ch)
460 case e == 'p' && prefix != 'x':
461 s.errorf(s.offset, "%q exponent requires hexadecimal mantissa", s.ch)
462 }
463 s.next()
464 tok = token.FLOAT
465 if s.ch == '+' || s.ch == '-' {
466 s.next()
467 }
468 ds := s.digits(10, nil)
469 digsep |= ds
470 if ds&1 == 0 {
471 s.error(s.offset, "exponent has no digits")
472 }
473 } else if prefix == 'x' && tok == token.FLOAT {
474 s.error(s.offset, "hexadecimal mantissa requires a 'p' exponent")
475 }
476
477
478 if s.ch == 'i' {
479 tok = token.IMAG
480 s.next()
481 }
482
483 lit := string(s.src[offs:s.offset])
484 if tok == token.INT && invalid >= 0 {
485 s.errorf(invalid, "invalid digit %q in %s", lit[invalid-offs], litname(prefix))
486 }
487 if digsep&2 != 0 {
488 if i := invalidSep(lit); i >= 0 {
489 s.error(offs+i, "'_' must separate successive digits")
490 }
491 }
492
493 return tok, lit
494 }
495
// litname returns the name used in error messages for a number literal
// with the given prefix: 'x' is hexadecimal, 'o' and '0' are octal, 'b' is
// binary, and anything else is decimal.
func litname(prefix rune) string {
	kind := "decimal"
	switch prefix {
	case 'x':
		kind = "hexadecimal"
	case 'o', '0':
		kind = "octal"
	case 'b':
		kind = "binary"
	}
	return kind + " literal"
}
507
508
509 func invalidSep(x string) int {
510 x1 := ' '
511 d := '.'
512 i := 0
513
514
515 if len(x) >= 2 && x[0] == '0' {
516 x1 = lower(rune(x[1]))
517 if x1 == 'x' || x1 == 'o' || x1 == 'b' {
518 d = '0'
519 i = 2
520 }
521 }
522
523
524 for ; i < len(x); i++ {
525 p := d
526 d = rune(x[i])
527 switch {
528 case d == '_':
529 if p != '0' {
530 return i
531 }
532 case isDecimal(d) || x1 == 'x' && isHex(d):
533 d = '0'
534 default:
535 if p == '_' {
536 return i - 1
537 }
538 d = '.'
539 }
540 }
541 if d == '_' {
542 return len(x) - 1
543 }
544
545 return -1
546 }
547
548
549
550
551
552 func (s *Scanner) scanEscape(quote rune) bool {
553 offs := s.offset
554
555 var n int
556 var base, max uint32
557 switch s.ch {
558 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
559 s.next()
560 return true
561 case '0', '1', '2', '3', '4', '5', '6', '7':
562 n, base, max = 3, 8, 255
563 case 'x':
564 s.next()
565 n, base, max = 2, 16, 255
566 case 'u':
567 s.next()
568 n, base, max = 4, 16, unicode.MaxRune
569 case 'U':
570 s.next()
571 n, base, max = 8, 16, unicode.MaxRune
572 default:
573 msg := "unknown escape sequence"
574 if s.ch < 0 {
575 msg = "escape sequence not terminated"
576 }
577 s.error(offs, msg)
578 return false
579 }
580
581 var x uint32
582 for n > 0 {
583 d := uint32(digitVal(s.ch))
584 if d >= base {
585 msg := fmt.Sprintf("illegal character %#U in escape sequence", s.ch)
586 if s.ch < 0 {
587 msg = "escape sequence not terminated"
588 }
589 s.error(s.offset, msg)
590 return false
591 }
592 x = x*base + d
593 s.next()
594 n--
595 }
596
597 if x > max || 0xD800 <= x && x < 0xE000 {
598 s.error(offs, "escape sequence is invalid Unicode code point")
599 return false
600 }
601
602 return true
603 }
604
605 func (s *Scanner) scanRune() string {
606
607 offs := s.offset - 1
608
609 valid := true
610 n := 0
611 for {
612 ch := s.ch
613 if ch == '\n' || ch < 0 {
614
615 if valid {
616 s.error(offs, "rune literal not terminated")
617 valid = false
618 }
619 break
620 }
621 s.next()
622 if ch == '\'' {
623 break
624 }
625 n++
626 if ch == '\\' {
627 if !s.scanEscape('\'') {
628 valid = false
629 }
630
631 }
632 }
633
634 if valid && n != 1 {
635 s.error(offs, "illegal rune literal")
636 }
637
638 return string(s.src[offs:s.offset])
639 }
640
641 func (s *Scanner) scanString() string {
642
643 offs := s.offset - 1
644
645 for {
646 ch := s.ch
647 if ch == '\n' || ch < 0 {
648 s.error(offs, "string literal not terminated")
649 break
650 }
651 s.next()
652 if ch == '"' {
653 break
654 }
655 if ch == '\\' {
656 s.scanEscape('"')
657 }
658 }
659
660 return string(s.src[offs:s.offset])
661 }
662
// stripCR returns a copy of b with carriage returns removed.
//
// When comment is set (b is a /*-style comment), a '\r' that sits between
// a '*' and a '/' is kept, provided it lies beyond the opening "/*":
// removing it would join the two into a "*/" and end the comment early.
func stripCR(b []byte, comment bool) []byte {
	out := make([]byte, 0, len(b))
	for j, ch := range b {
		if ch == '\r' {
			keep := comment &&
				len(out) > len("/*") && out[len(out)-1] == '*' &&
				j+1 < len(b) && b[j+1] == '/'
			if !keep {
				continue
			}
		}
		out = append(out, ch)
	}
	return out
}
679
680 func (s *Scanner) scanRawString() string {
681
682 offs := s.offset - 1
683
684 hasCR := false
685 for {
686 ch := s.ch
687 if ch < 0 {
688 s.error(offs, "raw string literal not terminated")
689 break
690 }
691 s.next()
692 if ch == '`' {
693 break
694 }
695 if ch == '\r' {
696 hasCR = true
697 }
698 }
699
700 lit := s.src[offs:s.offset]
701 if hasCR {
702 lit = stripCR(lit, false)
703 }
704
705 return string(lit)
706 }
707
708 func (s *Scanner) skipWhitespace() {
709 for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !s.insertSemi || s.ch == '\r' {
710 s.next()
711 }
712 }
713
714
715
716
717
718
719
720 func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token {
721 if s.ch == '=' {
722 s.next()
723 return tok1
724 }
725 return tok0
726 }
727
728 func (s *Scanner) switch3(tok0, tok1 token.Token, ch2 rune, tok2 token.Token) token.Token {
729 if s.ch == '=' {
730 s.next()
731 return tok1
732 }
733 if s.ch == ch2 {
734 s.next()
735 return tok2
736 }
737 return tok0
738 }
739
740 func (s *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Token) token.Token {
741 if s.ch == '=' {
742 s.next()
743 return tok1
744 }
745 if s.ch == ch2 {
746 s.next()
747 if s.ch == '=' {
748 s.next()
749 return tok3
750 }
751 return tok2
752 }
753 return tok0
754 }
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786 func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
787 scanAgain:
788 if s.nlPos.IsValid() {
789
790
791 pos, tok, lit = s.nlPos, token.SEMICOLON, "\n"
792 s.nlPos = token.NoPos
793 return
794 }
795
796 s.skipWhitespace()
797
798
799 pos = s.file.Pos(s.offset)
800
801
802 insertSemi := false
803 switch ch := s.ch; {
804 case isLetter(ch):
805 lit = s.scanIdentifier()
806 if len(lit) > 1 {
807
808 tok = token.Lookup(lit)
809 switch tok {
810 case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
811 insertSemi = true
812 }
813 } else {
814 insertSemi = true
815 tok = token.IDENT
816 }
817 case isDecimal(ch) || ch == '.' && isDecimal(rune(s.peek())):
818 insertSemi = true
819 tok, lit = s.scanNumber()
820 default:
821 s.next()
822 switch ch {
823 case eof:
824 if s.insertSemi {
825 s.insertSemi = false
826 return pos, token.SEMICOLON, "\n"
827 }
828 tok = token.EOF
829 case '\n':
830
831
832
833 s.insertSemi = false
834 return pos, token.SEMICOLON, "\n"
835 case '"':
836 insertSemi = true
837 tok = token.STRING
838 lit = s.scanString()
839 case '\'':
840 insertSemi = true
841 tok = token.CHAR
842 lit = s.scanRune()
843 case '`':
844 insertSemi = true
845 tok = token.STRING
846 lit = s.scanRawString()
847 case ':':
848 tok = s.switch2(token.COLON, token.DEFINE)
849 case '.':
850
851 tok = token.PERIOD
852 if s.ch == '.' && s.peek() == '.' {
853 s.next()
854 s.next()
855 tok = token.ELLIPSIS
856 }
857 case ',':
858 tok = token.COMMA
859 case ';':
860 tok = token.SEMICOLON
861 lit = ";"
862 case '(':
863 tok = token.LPAREN
864 case ')':
865 insertSemi = true
866 tok = token.RPAREN
867 case '[':
868 tok = token.LBRACK
869 case ']':
870 insertSemi = true
871 tok = token.RBRACK
872 case '{':
873 tok = token.LBRACE
874 case '}':
875 insertSemi = true
876 tok = token.RBRACE
877 case '+':
878 tok = s.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC)
879 if tok == token.INC {
880 insertSemi = true
881 }
882 case '-':
883 tok = s.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC)
884 if tok == token.DEC {
885 insertSemi = true
886 }
887 case '*':
888 tok = s.switch2(token.MUL, token.MUL_ASSIGN)
889 case '/':
890 if s.ch == '/' || s.ch == '*' {
891
892 comment, nlOffset := s.scanComment()
893 if s.insertSemi && nlOffset != 0 {
894
895
896 s.nlPos = s.file.Pos(nlOffset)
897 s.insertSemi = false
898 } else {
899 insertSemi = s.insertSemi
900 }
901 if s.mode&ScanComments == 0 {
902
903 goto scanAgain
904 }
905 tok = token.COMMENT
906 lit = comment
907 } else {
908
909 tok = s.switch2(token.QUO, token.QUO_ASSIGN)
910 }
911 case '%':
912 tok = s.switch2(token.REM, token.REM_ASSIGN)
913 case '^':
914 tok = s.switch2(token.XOR, token.XOR_ASSIGN)
915 case '<':
916 if s.ch == '-' {
917 s.next()
918 tok = token.ARROW
919 } else {
920 tok = s.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
921 }
922 case '>':
923 tok = s.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN)
924 case '=':
925 tok = s.switch2(token.ASSIGN, token.EQL)
926 case '!':
927 tok = s.switch2(token.NOT, token.NEQ)
928 case '&':
929 if s.ch == '^' {
930 s.next()
931 tok = s.switch2(token.AND_NOT, token.AND_NOT_ASSIGN)
932 } else {
933 tok = s.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND)
934 }
935 case '|':
936 tok = s.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
937 case '~':
938 tok = token.TILDE
939 default:
940
941 if ch != bom {
942 s.errorf(s.file.Offset(pos), "illegal character %#U", ch)
943 }
944 insertSemi = s.insertSemi
945 tok = token.ILLEGAL
946 lit = string(ch)
947 }
948 }
949 if s.mode&dontInsertSemis == 0 {
950 s.insertSemi = insertSemi
951 }
952
953 return
954 }
955
View as plain text