1
2
3
4
5
6
7
8
9
10
11
12
13
14
15 package scanner
16
17 import (
18 "bytes"
19 "fmt"
20 "io"
21 "os"
22 "unicode"
23 "unicode/utf8"
24 )
25
26
27
// Position is a source position.
// A position is valid if Line > 0.
type Position struct {
	Filename string // file name, if any; printed as "<input>" when empty
	Offset   int    // byte offset, starting at 0
	Line     int    // line number, starting at 1
	Column   int    // column number, starting at 1 (character count per line)
}

// IsValid reports whether the position is valid, i.e. whether Line > 0.
//
// It uses a value receiver, like String, so the method is also available on
// non-addressable Position values (for example the result of a function
// call). Calls through a *Position keep working unchanged.
func (pos Position) IsValid() bool { return pos.Line > 0 }

// String returns "file:line:column" for a valid position and only the file
// part otherwise. An empty Filename is rendered as "<input>".
func (pos Position) String() string {
	s := pos.Filename
	if s == "" {
		s = "<input>"
	}
	if pos.IsValid() {
		s += fmt.Sprintf(":%d:%d", pos.Line, pos.Column)
	}
	return s
}
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63 const (
64 ScanIdents = 1 << -Ident
65 ScanInts = 1 << -Int
66 ScanFloats = 1 << -Float
67 ScanChars = 1 << -Char
68 ScanStrings = 1 << -String
69 ScanRawStrings = 1 << -RawString
70 ScanComments = 1 << -Comment
71 SkipComments = 1 << -skipComment
72 GoTokens = ScanIdents | ScanFloats | ScanChars | ScanStrings | ScanRawStrings | ScanComments | SkipComments
73 )
74
75
// Token kinds returned by Scan. They are consecutive negative numbers
// starting at -1 so they can never collide with a Unicode character, which
// Scan returns as itself.
const (
	EOF = -1 - iota
	Ident
	Int
	Float
	Char
	String
	RawString
	Comment

	// skipComment is not a token; it only reserves the bit position used
	// by the SkipComments mode flag.
	skipComment
)
89
90 var tokenString = map[rune]string{
91 EOF: "EOF",
92 Ident: "Ident",
93 Int: "Int",
94 Float: "Float",
95 Char: "Char",
96 String: "String",
97 RawString: "RawString",
98 Comment: "Comment",
99 }
100
101
102 func TokenString(tok rune) string {
103 if s, found := tokenString[tok]; found {
104 return s
105 }
106 return fmt.Sprintf("%q", string(tok))
107 }
108
109
110
const (
	// GoWhitespace is the Whitespace value installed by Init. It has one
	// bit set for each of space, carriage return, newline and tab.
	GoWhitespace = 1<<' ' | 1<<'\r' | 1<<'\n' | 1<<'\t'

	// bufLen is the number of source bytes buffered at a time.
	bufLen = 1024
)
114
115
116 type Scanner struct {
117
118 src io.Reader
119
120
121 srcBuf [bufLen + 1]byte
122 srcPos int
123 srcEnd int
124
125
126 srcBufOffset int
127 line int
128 column int
129 lastLineLen int
130 lastCharLen int
131
132
133
134
135
136 tokBuf bytes.Buffer
137 tokPos int
138 tokEnd int
139
140
141 ch rune
142
143
144
145 Error func(s *Scanner, msg string)
146
147
148 ErrorCount int
149
150
151
152
153 Mode uint
154
155
156
157
158
159 Whitespace uint64
160
161
162
163
164
165
166 IsIdentRune func(ch rune, i int) bool
167
168
169
170
171
172
173
174
175 Position
176 }
177
178
179
180
181 func (s *Scanner) Init(src io.Reader) *Scanner {
182 s.src = src
183
184
185
186 s.srcBuf[0] = utf8.RuneSelf
187 s.srcPos = 0
188 s.srcEnd = 0
189
190
191 s.srcBufOffset = 0
192 s.line = 1
193 s.column = 0
194 s.lastLineLen = 0
195 s.lastCharLen = 0
196
197
198
199 s.tokPos = -1
200
201
202 s.ch = -2
203
204
205 s.Error = nil
206 s.ErrorCount = 0
207 s.Mode = GoTokens
208 s.Whitespace = GoWhitespace
209 s.Line = 0
210
211 return s
212 }
213
214
215
216
217
218 func (s *Scanner) next() rune {
219 ch, width := rune(s.srcBuf[s.srcPos]), 1
220
221 if ch >= utf8.RuneSelf {
222
223 for s.srcPos+utf8.UTFMax > s.srcEnd && !utf8.FullRune(s.srcBuf[s.srcPos:s.srcEnd]) {
224
225
226 if s.tokPos >= 0 {
227 s.tokBuf.Write(s.srcBuf[s.tokPos:s.srcPos])
228 s.tokPos = 0
229
230 }
231
232 copy(s.srcBuf[0:], s.srcBuf[s.srcPos:s.srcEnd])
233 s.srcBufOffset += s.srcPos
234
235
236
237
238
239 i := s.srcEnd - s.srcPos
240 n, err := s.src.Read(s.srcBuf[i:bufLen])
241 s.srcPos = 0
242 s.srcEnd = i + n
243 s.srcBuf[s.srcEnd] = utf8.RuneSelf
244 if err != nil {
245 if err != io.EOF {
246 s.error(err.Error())
247 }
248 if s.srcEnd == 0 {
249 if s.lastCharLen > 0 {
250
251 s.column++
252 }
253 s.lastCharLen = 0
254 return EOF
255 }
256
257
258
259
260 break
261 }
262 }
263
264 ch = rune(s.srcBuf[s.srcPos])
265 if ch >= utf8.RuneSelf {
266
267 ch, width = utf8.DecodeRune(s.srcBuf[s.srcPos:s.srcEnd])
268 if ch == utf8.RuneError && width == 1 {
269
270 s.srcPos += width
271 s.lastCharLen = width
272 s.column++
273 s.error("invalid UTF-8 encoding")
274 return ch
275 }
276 }
277 }
278
279
280 s.srcPos += width
281 s.lastCharLen = width
282 s.column++
283
284
285 switch ch {
286 case 0:
287
288 s.error("invalid character NUL")
289 case '\n':
290 s.line++
291 s.lastLineLen = s.column
292 s.column = 0
293 }
294
295 return ch
296 }
297
298
299
300
301
302
303
304 func (s *Scanner) Next() rune {
305 s.tokPos = -1
306 s.Line = 0
307 ch := s.Peek()
308 if ch != EOF {
309 s.ch = s.next()
310 }
311 return ch
312 }
313
314
315
316
317 func (s *Scanner) Peek() rune {
318 if s.ch == -2 {
319
320 s.ch = s.next()
321 if s.ch == '\uFEFF' {
322 s.ch = s.next()
323 }
324 }
325 return s.ch
326 }
327
328 func (s *Scanner) error(msg string) {
329 s.tokEnd = s.srcPos - s.lastCharLen
330 s.ErrorCount++
331 if s.Error != nil {
332 s.Error(s, msg)
333 return
334 }
335 pos := s.Position
336 if !pos.IsValid() {
337 pos = s.Pos()
338 }
339 fmt.Fprintf(os.Stderr, "%s: %s\n", pos, msg)
340 }
341
342 func (s *Scanner) errorf(format string, args ...any) {
343 s.error(fmt.Sprintf(format, args...))
344 }
345
346 func (s *Scanner) isIdentRune(ch rune, i int) bool {
347 if s.IsIdentRune != nil {
348 return ch != EOF && s.IsIdentRune(ch, i)
349 }
350 return ch == '_' || unicode.IsLetter(ch) || unicode.IsDigit(ch) && i > 0
351 }
352
353 func (s *Scanner) scanIdentifier() rune {
354
355 ch := s.next()
356 for i := 1; s.isIdentRune(ch, i); i++ {
357 ch = s.next()
358 }
359 return ch
360 }
361
// lower returns ch with the ASCII case bit ('a' - 'A') set. For an ASCII
// letter that is its lower-case form; other characters may change as well.
func lower(ch rune) rune { return ch | ('a' - 'A') }

// isDecimal reports whether ch is an ASCII decimal digit.
func isDecimal(ch rune) bool { return ch >= '0' && ch <= '9' }

// isHex reports whether ch is an ASCII hexadecimal digit of either case.
func isHex(ch rune) bool {
	if isDecimal(ch) {
		return true
	}
	l := lower(ch)
	return l >= 'a' && l <= 'f'
}
365
366
367
368
369
370
371
372 func (s *Scanner) digits(ch0 rune, base int, invalid *rune) (ch rune, digsep int) {
373 ch = ch0
374 if base <= 10 {
375 max := rune('0' + base)
376 for isDecimal(ch) || ch == '_' {
377 ds := 1
378 if ch == '_' {
379 ds = 2
380 } else if ch >= max && *invalid == 0 {
381 *invalid = ch
382 }
383 digsep |= ds
384 ch = s.next()
385 }
386 } else {
387 for isHex(ch) || ch == '_' {
388 ds := 1
389 if ch == '_' {
390 ds = 2
391 }
392 digsep |= ds
393 ch = s.next()
394 }
395 }
396 return
397 }
398
399 func (s *Scanner) scanNumber(ch rune, seenDot bool) (rune, rune) {
400 base := 10
401 prefix := rune(0)
402 digsep := 0
403 invalid := rune(0)
404
405
406 var tok rune
407 var ds int
408 if !seenDot {
409 tok = Int
410 if ch == '0' {
411 ch = s.next()
412 switch lower(ch) {
413 case 'x':
414 ch = s.next()
415 base, prefix = 16, 'x'
416 case 'o':
417 ch = s.next()
418 base, prefix = 8, 'o'
419 case 'b':
420 ch = s.next()
421 base, prefix = 2, 'b'
422 default:
423 base, prefix = 8, '0'
424 digsep = 1
425 }
426 }
427 ch, ds = s.digits(ch, base, &invalid)
428 digsep |= ds
429 if ch == '.' && s.Mode&ScanFloats != 0 {
430 ch = s.next()
431 seenDot = true
432 }
433 }
434
435
436 if seenDot {
437 tok = Float
438 if prefix == 'o' || prefix == 'b' {
439 s.error("invalid radix point in " + litname(prefix))
440 }
441 ch, ds = s.digits(ch, base, &invalid)
442 digsep |= ds
443 }
444
445 if digsep&1 == 0 {
446 s.error(litname(prefix) + " has no digits")
447 }
448
449
450 if e := lower(ch); (e == 'e' || e == 'p') && s.Mode&ScanFloats != 0 {
451 switch {
452 case e == 'e' && prefix != 0 && prefix != '0':
453 s.errorf("%q exponent requires decimal mantissa", ch)
454 case e == 'p' && prefix != 'x':
455 s.errorf("%q exponent requires hexadecimal mantissa", ch)
456 }
457 ch = s.next()
458 tok = Float
459 if ch == '+' || ch == '-' {
460 ch = s.next()
461 }
462 ch, ds = s.digits(ch, 10, nil)
463 digsep |= ds
464 if ds&1 == 0 {
465 s.error("exponent has no digits")
466 }
467 } else if prefix == 'x' && tok == Float {
468 s.error("hexadecimal mantissa requires a 'p' exponent")
469 }
470
471 if tok == Int && invalid != 0 {
472 s.errorf("invalid digit %q in %s", invalid, litname(prefix))
473 }
474
475 if digsep&2 != 0 {
476 s.tokEnd = s.srcPos - s.lastCharLen
477 if i := invalidSep(s.TokenText()); i >= 0 {
478 s.error("'_' must separate successive digits")
479 }
480 }
481
482 return tok, ch
483 }
484
// litname returns the name used in error messages for a number literal with
// the given prefix: 'x', 'o' or 'b' for the letter prefixes, '0' for a
// leading-zero octal. Any other value names a decimal literal.
func litname(prefix rune) string {
	switch prefix {
	case 'x':
		return "hexadecimal literal"
	case 'o', '0':
		return "octal literal"
	case 'b':
		return "binary literal"
	}
	return "decimal literal"
}
497
498
499 func invalidSep(x string) int {
500 x1 := ' '
501 d := '.'
502 i := 0
503
504
505 if len(x) >= 2 && x[0] == '0' {
506 x1 = lower(rune(x[1]))
507 if x1 == 'x' || x1 == 'o' || x1 == 'b' {
508 d = '0'
509 i = 2
510 }
511 }
512
513
514 for ; i < len(x); i++ {
515 p := d
516 d = rune(x[i])
517 switch {
518 case d == '_':
519 if p != '0' {
520 return i
521 }
522 case isDecimal(d) || x1 == 'x' && isHex(d):
523 d = '0'
524 default:
525 if p == '_' {
526 return i - 1
527 }
528 d = '.'
529 }
530 }
531 if d == '_' {
532 return len(x) - 1
533 }
534
535 return -1
536 }
537
// digitVal returns the numeric value of the hexadecimal digit ch (either
// case), or 16 if ch is not a hexadecimal digit. Since 16 is larger than any
// digit value it fails every "value < base" check for bases up to 16.
func digitVal(ch rune) int {
	switch {
	case ch >= '0' && ch <= '9':
		return int(ch - '0')
	case ch >= 'a' && ch <= 'f':
		return int(ch-'a') + 10
	case ch >= 'A' && ch <= 'F':
		return int(ch-'A') + 10
	default:
		return 16
	}
}
547
548 func (s *Scanner) scanDigits(ch rune, base, n int) rune {
549 for n > 0 && digitVal(ch) < base {
550 ch = s.next()
551 n--
552 }
553 if n > 0 {
554 s.error("invalid char escape")
555 }
556 return ch
557 }
558
559 func (s *Scanner) scanEscape(quote rune) rune {
560 ch := s.next()
561 switch ch {
562 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
563
564 ch = s.next()
565 case '0', '1', '2', '3', '4', '5', '6', '7':
566 ch = s.scanDigits(ch, 8, 3)
567 case 'x':
568 ch = s.scanDigits(s.next(), 16, 2)
569 case 'u':
570 ch = s.scanDigits(s.next(), 16, 4)
571 case 'U':
572 ch = s.scanDigits(s.next(), 16, 8)
573 default:
574 s.error("invalid char escape")
575 }
576 return ch
577 }
578
579 func (s *Scanner) scanString(quote rune) (n int) {
580 ch := s.next()
581 for ch != quote {
582 if ch == '\n' || ch < 0 {
583 s.error("literal not terminated")
584 return
585 }
586 if ch == '\\' {
587 ch = s.scanEscape(quote)
588 } else {
589 ch = s.next()
590 }
591 n++
592 }
593 return
594 }
595
596 func (s *Scanner) scanRawString() {
597 ch := s.next()
598 for ch != '`' {
599 if ch < 0 {
600 s.error("literal not terminated")
601 return
602 }
603 ch = s.next()
604 }
605 }
606
607 func (s *Scanner) scanChar() {
608 if s.scanString('\'') != 1 {
609 s.error("invalid char literal")
610 }
611 }
612
613 func (s *Scanner) scanComment(ch rune) rune {
614
615 if ch == '/' {
616
617 ch = s.next()
618 for ch != '\n' && ch >= 0 {
619 ch = s.next()
620 }
621 return ch
622 }
623
624
625 ch = s.next()
626 for {
627 if ch < 0 {
628 s.error("comment not terminated")
629 break
630 }
631 ch0 := ch
632 ch = s.next()
633 if ch0 == '*' && ch == '/' {
634 ch = s.next()
635 break
636 }
637 }
638 return ch
639 }
640
641
642
643
644
645
646 func (s *Scanner) Scan() rune {
647 ch := s.Peek()
648
649
650 s.tokPos = -1
651 s.Line = 0
652
653 redo:
654
655 for s.Whitespace&(1<<uint(ch)) != 0 {
656 ch = s.next()
657 }
658
659
660 s.tokBuf.Reset()
661 s.tokPos = s.srcPos - s.lastCharLen
662
663
664
665 s.Offset = s.srcBufOffset + s.tokPos
666 if s.column > 0 {
667
668 s.Line = s.line
669 s.Column = s.column
670 } else {
671
672
673
674 s.Line = s.line - 1
675 s.Column = s.lastLineLen
676 }
677
678
679 tok := ch
680 switch {
681 case s.isIdentRune(ch, 0):
682 if s.Mode&ScanIdents != 0 {
683 tok = Ident
684 ch = s.scanIdentifier()
685 } else {
686 ch = s.next()
687 }
688 case isDecimal(ch):
689 if s.Mode&(ScanInts|ScanFloats) != 0 {
690 tok, ch = s.scanNumber(ch, false)
691 } else {
692 ch = s.next()
693 }
694 default:
695 switch ch {
696 case EOF:
697 break
698 case '"':
699 if s.Mode&ScanStrings != 0 {
700 s.scanString('"')
701 tok = String
702 }
703 ch = s.next()
704 case '\'':
705 if s.Mode&ScanChars != 0 {
706 s.scanChar()
707 tok = Char
708 }
709 ch = s.next()
710 case '.':
711 ch = s.next()
712 if isDecimal(ch) && s.Mode&ScanFloats != 0 {
713 tok, ch = s.scanNumber(ch, true)
714 }
715 case '/':
716 ch = s.next()
717 if (ch == '/' || ch == '*') && s.Mode&ScanComments != 0 {
718 if s.Mode&SkipComments != 0 {
719 s.tokPos = -1
720 ch = s.scanComment(ch)
721 goto redo
722 }
723 ch = s.scanComment(ch)
724 tok = Comment
725 }
726 case '`':
727 if s.Mode&ScanRawStrings != 0 {
728 s.scanRawString()
729 tok = RawString
730 }
731 ch = s.next()
732 default:
733 ch = s.next()
734 }
735 }
736
737
738 s.tokEnd = s.srcPos - s.lastCharLen
739
740 s.ch = ch
741 return tok
742 }
743
744
745
746
747
748 func (s *Scanner) Pos() (pos Position) {
749 pos.Filename = s.Filename
750 pos.Offset = s.srcBufOffset + s.srcPos - s.lastCharLen
751 switch {
752 case s.column > 0:
753
754 pos.Line = s.line
755 pos.Column = s.column
756 case s.lastLineLen > 0:
757
758 pos.Line = s.line - 1
759 pos.Column = s.lastLineLen
760 default:
761
762 pos.Line = 1
763 pos.Column = 1
764 }
765 return
766 }
767
768
769
770 func (s *Scanner) TokenText() string {
771 if s.tokPos < 0 {
772
773 return ""
774 }
775
776 if s.tokEnd < s.tokPos {
777
778 s.tokEnd = s.tokPos
779 }
780
781
782 if s.tokBuf.Len() == 0 {
783
784 return string(s.srcBuf[s.tokPos:s.tokEnd])
785 }
786
787
788
789 s.tokBuf.Write(s.srcBuf[s.tokPos:s.tokEnd])
790 s.tokPos = s.tokEnd
791 return s.tokBuf.String()
792 }
793
View as plain text