Source file src/go/doc/comment/parse.go

     1  // Copyright 2022 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package comment
     6  
     7  import (
     8  	"slices"
     9  	"strings"
    10  	"unicode"
    11  	"unicode/utf8"
    12  )
    13  
// A Doc is a parsed Go doc comment.
// It is the result type of [Parser.Parse].
type Doc struct {
	// Content is the sequence of content blocks in the comment.
	Content []Block

	// Links is the link definitions in the comment.
	// The parser appends every link definition line it accepts,
	// in order of appearance, including repeated definitions
	// of the same link text.
	Links []*LinkDef
}
    22  
// A LinkDef is a single link definition,
// written in a comment as a line of the form
//
//	[Text]: URL
type LinkDef struct {
	Text string // the link text
	URL  string // the link URL
	Used bool   // whether the comment uses the definition
}
    29  
// A Block is block-level content in a doc comment,
// one of [*Code], [*Heading], [*List], or [*Paragraph].
type Block interface {
	// block is an unexported marker method: because it cannot be
	// named outside this package, only types declared here can
	// implement Block.
	block()
}
    35  
// A Heading is a doc comment heading.
type Heading struct {
	Text []Text // the heading text
}

// block marks *Heading as an implementation of Block.
func (*Heading) block() {}
    42  
// A List is a numbered or bullet list.
// Lists are always non-empty: len(Items) > 0.
// In a numbered list, every Items[i].Number is a non-empty string.
// In a bullet list, every Items[i].Number is an empty string.
type List struct {
	// Items is the list items.
	Items []*ListItem

	// ForceBlankBefore indicates that the list must be
	// preceded by a blank line when reformatting the comment,
	// overriding the usual conditions. See the BlankBefore method.
	//
	// The comment parser sets ForceBlankBefore for any list
	// that is preceded by a blank line, to make sure
	// the blank line is preserved when printing.
	ForceBlankBefore bool

	// ForceBlankBetween indicates that list items must be
	// separated by blank lines when reformatting the comment,
	// overriding the usual conditions. See the BlankBetween method.
	//
	// The comment parser sets ForceBlankBetween for any list
	// that contains a blank line anywhere within it
	// (between two items or inside one item), to make sure
	// the blank lines are preserved when printing.
	ForceBlankBetween bool
}

// block marks *List as an implementation of Block.
func (*List) block() {}
    71  
    72  // BlankBefore reports whether a reformatting of the comment
    73  // should include a blank line before the list.
    74  // The default rule is the same as for [BlankBetween]:
    75  // if the list item content contains any blank lines
    76  // (meaning at least one item has multiple paragraphs)
    77  // then the list itself must be preceded by a blank line.
    78  // A preceding blank line can be forced by setting [List].ForceBlankBefore.
    79  func (l *List) BlankBefore() bool {
    80  	return l.ForceBlankBefore || l.BlankBetween()
    81  }
    82  
    83  // BlankBetween reports whether a reformatting of the comment
    84  // should include a blank line between each pair of list items.
    85  // The default rule is that if the list item content contains any blank lines
    86  // (meaning at least one item has multiple paragraphs)
    87  // then list items must themselves be separated by blank lines.
    88  // Blank line separators can be forced by setting [List].ForceBlankBetween.
    89  func (l *List) BlankBetween() bool {
    90  	if l.ForceBlankBetween {
    91  		return true
    92  	}
    93  	for _, item := range l.Items {
    94  		if len(item.Content) != 1 {
    95  			// Unreachable for parsed comments today,
    96  			// since the only way to get multiple item.Content
    97  			// is multiple paragraphs, which must have been
    98  			// separated by a blank line.
    99  			return true
   100  		}
   101  	}
   102  	return false
   103  }
   104  
// A ListItem is a single item in a numbered or bullet list.
type ListItem struct {
	// Number is a decimal string in a numbered list
	// or an empty string in a bullet list.
	// The parser stores the digits exactly as written,
	// without the trailing "." or ")".
	Number string // "1", "2", ...; "" for bullet list

	// Content is the list content.
	// Currently, restrictions in the parser and printer
	// require every element of Content to be a *Paragraph.
	Content []Block // Content of this item.
}
   116  
// A Paragraph is a paragraph of text.
type Paragraph struct {
	Text []Text // the paragraph text
}

// block marks *Paragraph as an implementation of Block.
func (*Paragraph) block() {}
   123  
// A Code is a preformatted code block.
type Code struct {
	// Text is the preformatted text, ending with a newline character.
	// It may be multiple lines, each of which ends with a newline character.
	// It is never empty, nor does it start or end with a blank line.
	// The parser removes the indentation common to all lines of the block.
	Text string
}

// block marks *Code as an implementation of Block.
func (*Code) block() {}
   133  
// A Text is text-level content in a doc comment,
// one of [Plain], [Italic], [*Link], or [*DocLink].
type Text interface {
	// text is an unexported marker method: because it cannot be
	// named outside this package, only types declared here can
	// implement Text.
	text()
}
   139  
// A Plain is a string rendered as plain text (not italicized).
type Plain string

// text marks Plain as an implementation of Text.
func (Plain) text() {}
   144  
// An Italic is a string rendered as italicized text.
type Italic string

// text marks Italic as an implementation of Text.
func (Italic) text() {}
   149  
// A Link is a link to a specific URL.
type Link struct {
	Auto bool   // is this an automatic (implicit) link of a literal URL?
	Text []Text // text of link
	URL  string // target URL of link
}

// text marks *Link as an implementation of Text.
func (*Link) text() {}
   158  
// A DocLink is a link to documentation for a Go package or symbol.
type DocLink struct {
	Text []Text // text of link

	// ImportPath, Recv, and Name identify the Go package or symbol
	// that is the link target. The potential combinations of
	// non-empty fields are:
	//  - ImportPath: a link to another package
	//  - ImportPath, Name: a link to a const, func, type, or var in another package
	//  - ImportPath, Recv, Name: a link to a method in another package
	//  - Name: a link to a const, func, type, or var in this package
	//  - Recv, Name: a link to a method in this package
	ImportPath string // import path
	Recv       string // receiver type, without any pointer star, for methods
	Name       string // const, func, type, var, or method name
}

// text marks *DocLink as an implementation of Text.
func (*DocLink) text() {}
   177  
// A Parser is a doc comment parser.
// The fields in the struct can be filled in before calling [Parser.Parse]
// in order to customize the details of the parsing process.
type Parser struct {
	// Words is a map of Go identifier words that
	// should be italicized and potentially linked.
	// If Words[w] is the empty string, then the word w
	// is only italicized. Otherwise it is linked, using
	// Words[w] as the link target.
	// Words corresponds to the [go/doc.ToHTML] words parameter.
	Words map[string]string

	// LookupPackage resolves a package name to an import path.
	//
	// If LookupPackage(name) returns ok == true, then [name]
	// (or [name.Sym] or [name.Sym.Method])
	// is considered a documentation link to importPath's package docs.
	// It is valid to return "", true, in which case name is considered
	// to refer to the current package.
	//
	// If LookupPackage(name) returns ok == false,
	// then [name] (or [name.Sym] or [name.Sym.Method])
	// will not be considered a documentation link,
	// except in the case where name is the full (but single-element) import path
	// of a package in the standard library, such as in [math] or [io.Reader].
	// LookupPackage is still called for such names,
	// in order to permit references to imports of other packages
	// with the same package names.
	//
	// LookupPackage is not called for names that contain a slash;
	// those are treated as full import paths.
	//
	// Setting LookupPackage to nil is equivalent to setting it to
	// a function that always returns "", false.
	LookupPackage func(name string) (importPath string, ok bool)

	// LookupSym reports whether a symbol name or method name
	// exists in the current package.
	// It is consulted only for bracketed text that names no package.
	//
	// If LookupSym("", "Name") returns true, then [Name]
	// is considered a documentation link for a const, func, type, or var.
	//
	// Similarly, if LookupSym("Recv", "Name") returns true,
	// then [Recv.Name] is considered a documentation link for
	// type Recv's method Name.
	//
	// Setting LookupSym to nil is equivalent to setting it to a function
	// that always returns false.
	LookupSym func(recv, name string) (ok bool)
}
   225  
// parseDoc is parsing state for a single doc comment.
// It embeds the Parser being used and the Doc being built,
// so their fields are accessible directly on the parseDoc.
type parseDoc struct {
	*Parser
	*Doc
	links     map[string]*LinkDef          // link text -> first definition seen for that text
	lines     []string                     // the comment lines, after unindent
	lookupSym func(recv, name string) bool // Parser.LookupSym, or an always-false function if that is nil
}
   234  
   235  // lookupPkg is called to look up the pkg in [pkg], [pkg.Name], and [pkg.Name.Recv].
   236  // If pkg has a slash, it is assumed to be the full import path and is returned with ok = true.
   237  //
   238  // Otherwise, pkg is probably a simple package name like "rand" (not "crypto/rand" or "math/rand").
   239  // d.LookupPackage provides a way for the caller to allow resolving such names with reference
   240  // to the imports in the surrounding package.
   241  //
   242  // There is one collision between these two cases: single-element standard library names
   243  // like "math" are full import paths but don't contain slashes. We let d.LookupPackage have
   244  // the first chance to resolve it, in case there's a different package imported as math,
   245  // and otherwise we refer to a built-in list of single-element standard library package names.
   246  func (d *parseDoc) lookupPkg(pkg string) (importPath string, ok bool) {
   247  	if strings.Contains(pkg, "/") { // assume a full import path
   248  		if validImportPath(pkg) {
   249  			return pkg, true
   250  		}
   251  		return "", false
   252  	}
   253  	if d.LookupPackage != nil {
   254  		// Give LookupPackage a chance.
   255  		if path, ok := d.LookupPackage(pkg); ok {
   256  			return path, true
   257  		}
   258  	}
   259  	return DefaultLookupPackage(pkg)
   260  }
   261  
   262  func isStdPkg(path string) bool {
   263  	_, ok := slices.BinarySearch(stdPkgs, path)
   264  	return ok
   265  }
   266  
   267  // DefaultLookupPackage is the default package lookup
   268  // function, used when [Parser.LookupPackage] is nil.
   269  // It recognizes names of the packages from the standard
   270  // library with single-element import paths, such as math,
   271  // which would otherwise be impossible to name.
   272  //
   273  // Note that the go/doc package provides a more sophisticated
   274  // lookup based on the imports used in the current package.
   275  func DefaultLookupPackage(name string) (importPath string, ok bool) {
   276  	if isStdPkg(name) {
   277  		return name, true
   278  	}
   279  	return "", false
   280  }
   281  
// Parse parses the doc comment text and returns the *[Doc] form.
// Comment markers (/* // and */) in the text must have already been removed.
//
// Parsing happens in two passes. The first splits the text into blocks
// and records link definitions; the second interprets the text of
// paragraphs and list items, which needs the complete set of link
// definitions and therefore cannot be done in the first pass.
func (p *Parser) Parse(text string) *Doc {
	lines := unindent(strings.Split(text, "\n"))
	d := &parseDoc{
		Parser:    p,
		Doc:       new(Doc),
		links:     make(map[string]*LinkDef),
		lines:     lines,
		lookupSym: func(recv, name string) bool { return false },
	}
	// A nil LookupSym keeps the always-false default installed above.
	if p.LookupSym != nil {
		d.lookupSym = p.LookupSym
	}

	// First pass: break into block structure and collect known links.
	// The text is all recorded as Plain for now.
	var prev span
	for _, s := range parseSpans(lines) {
		var b Block
		switch s.kind {
		default:
			panic("go/doc/comment: internal error: unknown span kind")
		case spanList:
			// prev.end < s.start means lines were skipped between the
			// previous span and this one; parseSpans only skips blank
			// lines, so the list was preceded by a blank line.
			b = d.list(lines[s.start:s.end], prev.end < s.start)
		case spanCode:
			b = d.code(lines[s.start:s.end])
		case spanOldHeading:
			b = d.oldHeading(lines[s.start])
		case spanHeading:
			b = d.heading(lines[s.start])
		case spanPara:
			b = d.paragraph(lines[s.start:s.end])
		}
		// d.paragraph returns nil for a paragraph made up entirely of
		// link definitions; such a paragraph contributes no content block.
		if b != nil {
			d.Content = append(d.Content, b)
		}
		prev = s
	}

	// Second pass: interpret all the Plain text now that we know the links.
	// The type assertions rely on the first pass having stored each
	// paragraph's text as a single Plain element.
	for _, b := range d.Content {
		switch b := b.(type) {
		case *Paragraph:
			b.Text = d.parseLinkedText(string(b.Text[0].(Plain)))
		case *List:
			for _, i := range b.Items {
				for _, c := range i.Content {
					p := c.(*Paragraph)
					p.Text = d.parseLinkedText(string(p.Text[0].(Plain)))
				}
			}
		}
	}

	return d.Doc
}
   339  
// A span represents a single span of comment lines (lines[start:end])
// of an identified kind (code, heading, paragraph, and so on).
type span struct {
	start int      // index of the first line of the span
	end   int      // index just past the last line of the span
	kind  spanKind // what the lines were identified as
}
   347  
// A spanKind describes the kind of span.
type spanKind int

const (
	_              spanKind = iota // zero value is not a valid kind; Parse panics on it
	spanCode                       // indented lines not starting with a list marker
	spanHeading                    // single line accepted by isHeading ("# ...")
	spanList                       // indented lines starting with a list marker
	spanOldHeading                 // single line accepted by isOldHeading
	spanPara                       // any other run of unindented lines
)
   359  
// parseSpans partitions lines into consecutive spans and classifies
// each one as code, list, heading, old-style heading, or paragraph.
// Blank lines between spans belong to no span.
//
// Parse passes lines that have already been through unindent,
// so blank lines are exactly the empty strings.
//
// Besides the plain indented/unindented distinction, parseSpans applies
// a few heuristics that repair commonly mis-indented lists and code
// blocks; they are described where they are applied below.
func parseSpans(lines []string) []span {
	var spans []span

	// The loop may process a line twice: once as unindented
	// and again forced indented. So the maximum expected
	// number of iterations is 2*len(lines). The repeating logic
	// can be subtle, though, and to protect against introduction
	// of infinite loops in future changes, we watch to see that
	// we are not looping too much. A panic is better than a
	// quiet infinite loop.
	watchdog := 2 * len(lines)

	i := 0
	// Lines with index < forceIndent are treated as indented
	// even if they are not. It starts at 0, so nothing is forced.
	forceIndent := 0
Spans:
	for {
		// Skip blank lines.
		for i < len(lines) && lines[i] == "" {
			i++
		}
		if i >= len(lines) {
			break
		}
		if watchdog--; watchdog < 0 {
			panic("go/doc/comment: internal error: not making progress")
		}

		var kind spanKind
		start := i
		end := i
		if i < forceIndent || indented(lines[i]) {
			// Indented (or force indented).
			// Ends before next unindented. (Blank lines are OK.)
			// If this is an unindented list that we are heuristically treating as indented,
			// then accept unindented list item lines up to the first blank lines.
			// The heuristic is disabled at blank lines to contain its effect
			// to non-gofmt'ed sections of the comment.
			unindentedListOK := isList(lines[i]) && i < forceIndent
			i++
			for i < len(lines) && (lines[i] == "" || i < forceIndent || indented(lines[i]) || (unindentedListOK && isList(lines[i]))) {
				if lines[i] == "" {
					unindentedListOK = false
				}
				i++
			}

			// Drop trailing blank lines.
			end = i
			for end > start && lines[end-1] == "" {
				end--
			}

			// If indented lines are followed (without a blank line)
			// by a line beginning with a closing brace,
			// take that one line too. This fixes the common mistake
			// of pasting in something like
			//
			// func main() {
			//	fmt.Println("hello, world")
			// }
			//
			// and forgetting to indent it.
			// The heuristic will never trigger on a gofmt'ed comment,
			// because any gofmt'ed code block or list would be
			// followed by a blank line or end of comment.
			if end < len(lines) && strings.HasPrefix(lines[end], "}") {
				end++
			}

			// The first line alone decides between list and code.
			if isList(lines[start]) {
				kind = spanList
			} else {
				kind = spanCode
			}
		} else {
			// Unindented. Ends at next blank or indented line.
			i++
			for i < len(lines) && lines[i] != "" && !indented(lines[i]) {
				i++
			}
			end = i

			// If unindented lines are followed (without a blank line)
			// by an indented line that would start a code block,
			// check whether the final unindented lines
			// should be left for the indented section.
			// This can happen for the common mistakes of
			// unindented code or unindented lists.
			// The heuristic will never trigger on a gofmt'ed comment,
			// because any gofmt'ed code block would have a blank line
			// preceding it after the unindented lines.
			if i < len(lines) && lines[i] != "" && !isList(lines[i]) {
				switch {
				case isList(lines[i-1]):
					// If the final unindented line looks like a list item,
					// this may be the first indented line wrap of
					// a mistakenly unindented list.
					// Leave all the unindented list items.
					forceIndent = end
					end--
					for end > start && isList(lines[end-1]) {
						end--
					}

				case strings.HasSuffix(lines[i-1], "{") || strings.HasSuffix(lines[i-1], `\`):
					// If the final unindented line ended in { or \
					// it is probably the start of a misindented code block.
					// Give the user a single line fix.
					// Often that's enough; if not, the user can fix the others themselves.
					forceIndent = end
					end--
				}

				// If the heuristics gave away every line of this span,
				// there is nothing to emit here: restart at the same line,
				// which will now take the forced-indent branch above.
				if start == end && forceIndent > start {
					i = start
					continue Spans
				}
			}

			// Span is either paragraph or heading.
			// Only a span of exactly one line can be a heading.
			if end-start == 1 && isHeading(lines[start]) {
				kind = spanHeading
			} else if end-start == 1 && isOldHeading(lines[start], lines, start) {
				kind = spanOldHeading
			} else {
				kind = spanPara
			}
		}

		spans = append(spans, span{start, end, kind})
		// Resume at end rather than i: the heuristics above may have
		// moved end away from where the scan stopped.
		i = end
	}

	return spans
}
   495  
   496  // indented reports whether line is indented
   497  // (starts with a leading space or tab).
   498  func indented(line string) bool {
   499  	return line != "" && (line[0] == ' ' || line[0] == '\t')
   500  }
   501  
   502  // unindent removes any common space/tab prefix
   503  // from each line in lines, returning a copy of lines in which
   504  // those prefixes have been trimmed from each line.
   505  // It also replaces any lines containing only spaces with blank lines (empty strings).
   506  func unindent(lines []string) []string {
   507  	// Trim leading and trailing blank lines.
   508  	for len(lines) > 0 && isBlank(lines[0]) {
   509  		lines = lines[1:]
   510  	}
   511  	for len(lines) > 0 && isBlank(lines[len(lines)-1]) {
   512  		lines = lines[:len(lines)-1]
   513  	}
   514  	if len(lines) == 0 {
   515  		return nil
   516  	}
   517  
   518  	// Compute and remove common indentation.
   519  	prefix := leadingSpace(lines[0])
   520  	for _, line := range lines[1:] {
   521  		if !isBlank(line) {
   522  			prefix = commonPrefix(prefix, leadingSpace(line))
   523  		}
   524  	}
   525  
   526  	out := make([]string, len(lines))
   527  	for i, line := range lines {
   528  		line = strings.TrimPrefix(line, prefix)
   529  		if strings.TrimSpace(line) == "" {
   530  			line = ""
   531  		}
   532  		out[i] = line
   533  	}
   534  	for len(out) > 0 && out[0] == "" {
   535  		out = out[1:]
   536  	}
   537  	for len(out) > 0 && out[len(out)-1] == "" {
   538  		out = out[:len(out)-1]
   539  	}
   540  	return out
   541  }
   542  
   543  // isBlank reports whether s is a blank line.
   544  func isBlank(s string) bool {
   545  	return len(s) == 0 || (len(s) == 1 && s[0] == '\n')
   546  }
   547  
   548  // commonPrefix returns the longest common prefix of a and b.
   549  func commonPrefix(a, b string) string {
   550  	i := 0
   551  	for i < len(a) && i < len(b) && a[i] == b[i] {
   552  		i++
   553  	}
   554  	return a[0:i]
   555  }
   556  
   557  // leadingSpace returns the longest prefix of s consisting of spaces and tabs.
   558  func leadingSpace(s string) string {
   559  	i := 0
   560  	for i < len(s) && (s[i] == ' ' || s[i] == '\t') {
   561  		i++
   562  	}
   563  	return s[:i]
   564  }
   565  
   566  // isOldHeading reports whether line is an old-style section heading.
   567  // line is all[off].
   568  func isOldHeading(line string, all []string, off int) bool {
   569  	if off <= 0 || all[off-1] != "" || off+2 >= len(all) || all[off+1] != "" || leadingSpace(all[off+2]) != "" {
   570  		return false
   571  	}
   572  
   573  	line = strings.TrimSpace(line)
   574  
   575  	// a heading must start with an uppercase letter
   576  	r, _ := utf8.DecodeRuneInString(line)
   577  	if !unicode.IsLetter(r) || !unicode.IsUpper(r) {
   578  		return false
   579  	}
   580  
   581  	// it must end in a letter or digit:
   582  	r, _ = utf8.DecodeLastRuneInString(line)
   583  	if !unicode.IsLetter(r) && !unicode.IsDigit(r) {
   584  		return false
   585  	}
   586  
   587  	// exclude lines with illegal characters. we allow "(),"
   588  	if strings.ContainsAny(line, ";:!?+*/=[]{}_^°&§~%#@<\">\\") {
   589  		return false
   590  	}
   591  
   592  	// allow "'" for possessive "'s" only
   593  	for b := line; ; {
   594  		var ok bool
   595  		if _, b, ok = strings.Cut(b, "'"); !ok {
   596  			break
   597  		}
   598  		if b != "s" && !strings.HasPrefix(b, "s ") {
   599  			return false // ' not followed by s and then end-of-word
   600  		}
   601  	}
   602  
   603  	// allow "." when followed by non-space
   604  	for b := line; ; {
   605  		var ok bool
   606  		if _, b, ok = strings.Cut(b, "."); !ok {
   607  			break
   608  		}
   609  		if b == "" || strings.HasPrefix(b, " ") {
   610  			return false // not followed by non-space
   611  		}
   612  	}
   613  
   614  	return true
   615  }
   616  
   617  // oldHeading returns the *Heading for the given old-style section heading line.
   618  func (d *parseDoc) oldHeading(line string) Block {
   619  	return &Heading{Text: []Text{Plain(strings.TrimSpace(line))}}
   620  }
   621  
   622  // isHeading reports whether line is a new-style section heading.
   623  func isHeading(line string) bool {
   624  	return len(line) >= 2 &&
   625  		line[0] == '#' &&
   626  		(line[1] == ' ' || line[1] == '\t') &&
   627  		strings.TrimSpace(line) != "#"
   628  }
   629  
   630  // heading returns the *Heading for the given new-style section heading line.
   631  func (d *parseDoc) heading(line string) Block {
   632  	return &Heading{Text: []Text{Plain(strings.TrimSpace(line[1:]))}}
   633  }
   634  
   635  // code returns a code block built from the lines.
   636  func (d *parseDoc) code(lines []string) *Code {
   637  	body := unindent(lines)
   638  	body = append(body, "") // to get final \n from Join
   639  	return &Code{Text: strings.Join(body, "\n")}
   640  }
   641  
   642  // paragraph returns a paragraph block built from the lines.
   643  // If the lines are link definitions, paragraph adds them to d and returns nil.
   644  func (d *parseDoc) paragraph(lines []string) Block {
   645  	// Is this a block of known links? Handle.
   646  	var defs []*LinkDef
   647  	for _, line := range lines {
   648  		def, ok := parseLink(line)
   649  		if !ok {
   650  			goto NoDefs
   651  		}
   652  		defs = append(defs, def)
   653  	}
   654  	for _, def := range defs {
   655  		d.Links = append(d.Links, def)
   656  		if d.links[def.Text] == nil {
   657  			d.links[def.Text] = def
   658  		}
   659  	}
   660  	return nil
   661  NoDefs:
   662  
   663  	return &Paragraph{Text: []Text{Plain(strings.Join(lines, "\n"))}}
   664  }
   665  
   666  // parseLink parses a single link definition line:
   667  //
   668  //	[text]: url
   669  //
   670  // It returns the link definition and whether the line was well formed.
   671  func parseLink(line string) (*LinkDef, bool) {
   672  	if line == "" || line[0] != '[' {
   673  		return nil, false
   674  	}
   675  	i := strings.Index(line, "]:")
   676  	if i < 0 || i+3 >= len(line) || (line[i+2] != ' ' && line[i+2] != '\t') {
   677  		return nil, false
   678  	}
   679  
   680  	text := line[1:i]
   681  	url := strings.TrimSpace(line[i+3:])
   682  	j := strings.Index(url, "://")
   683  	if j < 0 || !isScheme(url[:j]) {
   684  		return nil, false
   685  	}
   686  
   687  	// Line has right form and has valid scheme://.
   688  	// That's good enough for us - we are not as picky
   689  	// about the characters beyond the :// as we are
   690  	// when extracting inline URLs from text.
   691  	return &LinkDef{Text: text, URL: url}, true
   692  }
   693  
   694  // list returns a list built from the indented lines,
   695  // using forceBlankBefore as the value of the List's ForceBlankBefore field.
   696  func (d *parseDoc) list(lines []string, forceBlankBefore bool) *List {
   697  	num, _, _ := listMarker(lines[0])
   698  	var (
   699  		list *List = &List{ForceBlankBefore: forceBlankBefore}
   700  		item *ListItem
   701  		text []string
   702  	)
   703  	flush := func() {
   704  		if item != nil {
   705  			if para := d.paragraph(text); para != nil {
   706  				item.Content = append(item.Content, para)
   707  			}
   708  		}
   709  		text = nil
   710  	}
   711  
   712  	for _, line := range lines {
   713  		if n, after, ok := listMarker(line); ok && (n != "") == (num != "") {
   714  			// start new list item
   715  			flush()
   716  
   717  			item = &ListItem{Number: n}
   718  			list.Items = append(list.Items, item)
   719  			line = after
   720  		}
   721  		line = strings.TrimSpace(line)
   722  		if line == "" {
   723  			list.ForceBlankBetween = true
   724  			flush()
   725  			continue
   726  		}
   727  		text = append(text, strings.TrimSpace(line))
   728  	}
   729  	flush()
   730  	return list
   731  }
   732  
   733  // listMarker parses the line as beginning with a list marker.
   734  // If it can do that, it returns the numeric marker ("" for a bullet list),
   735  // the rest of the line, and ok == true.
   736  // Otherwise, it returns "", "", false.
   737  func listMarker(line string) (num, rest string, ok bool) {
   738  	line = strings.TrimSpace(line)
   739  	if line == "" {
   740  		return "", "", false
   741  	}
   742  
   743  	// Can we find a marker?
   744  	if r, n := utf8.DecodeRuneInString(line); r == '•' || r == '*' || r == '+' || r == '-' {
   745  		num, rest = "", line[n:]
   746  	} else if '0' <= line[0] && line[0] <= '9' {
   747  		n := 1
   748  		for n < len(line) && '0' <= line[n] && line[n] <= '9' {
   749  			n++
   750  		}
   751  		if n >= len(line) || (line[n] != '.' && line[n] != ')') {
   752  			return "", "", false
   753  		}
   754  		num, rest = line[:n], line[n+1:]
   755  	} else {
   756  		return "", "", false
   757  	}
   758  
   759  	if !indented(rest) || strings.TrimSpace(rest) == "" {
   760  		return "", "", false
   761  	}
   762  
   763  	return num, rest, true
   764  }
   765  
   766  // isList reports whether the line is the first line of a list,
   767  // meaning starts with a list marker after any indentation.
   768  // (The caller is responsible for checking the line is indented, as appropriate.)
   769  func isList(line string) bool {
   770  	_, _, ok := listMarker(line)
   771  	return ok
   772  }
   773  
   774  // parseLinkedText parses text that is allowed to contain explicit links,
   775  // such as [math.Sin] or [Go home page], into a slice of Text items.
   776  //
   777  // A “pkg” is only assumed to be a full import path if it starts with
   778  // a domain name (a path element with a dot) or is one of the packages
   779  // from the standard library (“[os]”, “[encoding/json]”, and so on).
   780  // To avoid problems with maps, generics, and array types, doc links
   781  // must be both preceded and followed by punctuation, spaces, tabs,
   782  // or the start or end of a line. An example problem would be treating
   783  // map[ast.Expr]TypeAndValue as containing a link.
   784  func (d *parseDoc) parseLinkedText(text string) []Text {
   785  	var out []Text
   786  	wrote := 0
   787  	flush := func(i int) {
   788  		if wrote < i {
   789  			out = d.parseText(out, text[wrote:i], true)
   790  			wrote = i
   791  		}
   792  	}
   793  
   794  	start := -1
   795  	var buf []byte
   796  	for i := 0; i < len(text); i++ {
   797  		c := text[i]
   798  		if c == '\n' || c == '\t' {
   799  			c = ' '
   800  		}
   801  		switch c {
   802  		case '[':
   803  			start = i
   804  		case ']':
   805  			if start >= 0 {
   806  				if def, ok := d.links[string(buf)]; ok {
   807  					def.Used = true
   808  					flush(start)
   809  					out = append(out, &Link{
   810  						Text: d.parseText(nil, text[start+1:i], false),
   811  						URL:  def.URL,
   812  					})
   813  					wrote = i + 1
   814  				} else if link, ok := d.docLink(text[start+1:i], text[:start], text[i+1:]); ok {
   815  					flush(start)
   816  					link.Text = d.parseText(nil, text[start+1:i], false)
   817  					out = append(out, link)
   818  					wrote = i + 1
   819  				}
   820  			}
   821  			start = -1
   822  			buf = buf[:0]
   823  		}
   824  		if start >= 0 && i != start {
   825  			buf = append(buf, c)
   826  		}
   827  	}
   828  
   829  	flush(len(text))
   830  	return out
   831  }
   832  
   833  // docLink parses text, which was found inside [ ] brackets,
   834  // as a doc link if possible, returning the DocLink and ok == true
   835  // or else nil, false.
   836  // The before and after strings are the text before the [ and after the ]
   837  // on the same line. Doc links must be preceded and followed by
   838  // punctuation, spaces, tabs, or the start or end of a line.
   839  func (d *parseDoc) docLink(text, before, after string) (link *DocLink, ok bool) {
   840  	if before != "" {
   841  		r, _ := utf8.DecodeLastRuneInString(before)
   842  		if !unicode.IsPunct(r) && r != ' ' && r != '\t' && r != '\n' {
   843  			return nil, false
   844  		}
   845  	}
   846  	if after != "" {
   847  		r, _ := utf8.DecodeRuneInString(after)
   848  		if !unicode.IsPunct(r) && r != ' ' && r != '\t' && r != '\n' {
   849  			return nil, false
   850  		}
   851  	}
   852  	text = strings.TrimPrefix(text, "*")
   853  	pkg, name, ok := splitDocName(text)
   854  	var recv string
   855  	if ok {
   856  		pkg, recv, _ = splitDocName(pkg)
   857  	}
   858  	if pkg != "" {
   859  		if pkg, ok = d.lookupPkg(pkg); !ok {
   860  			return nil, false
   861  		}
   862  	} else {
   863  		if ok = d.lookupSym(recv, name); !ok {
   864  			return nil, false
   865  		}
   866  	}
   867  	link = &DocLink{
   868  		ImportPath: pkg,
   869  		Recv:       recv,
   870  		Name:       name,
   871  	}
   872  	return link, true
   873  }
   874  
   875  // If text is of the form before.Name, where Name is a capitalized Go identifier,
   876  // then splitDocName returns before, name, true.
   877  // Otherwise it returns text, "", false.
   878  func splitDocName(text string) (before, name string, foundDot bool) {
   879  	i := strings.LastIndex(text, ".")
   880  	name = text[i+1:]
   881  	if !isName(name) {
   882  		return text, "", false
   883  	}
   884  	if i >= 0 {
   885  		before = text[:i]
   886  	}
   887  	return before, name, true
   888  }
   889  
   890  // parseText parses s as text and returns the result of appending
   891  // those parsed Text elements to out.
   892  // parseText does not handle explicit links like [math.Sin] or [Go home page]:
   893  // those are handled by parseLinkedText.
   894  // If autoLink is true, then parseText recognizes URLs and words from d.Words
   895  // and converts those to links as appropriate.
   896  func (d *parseDoc) parseText(out []Text, s string, autoLink bool) []Text {
   897  	var w strings.Builder
   898  	wrote := 0
   899  	writeUntil := func(i int) {
   900  		w.WriteString(s[wrote:i])
   901  		wrote = i
   902  	}
   903  	flush := func(i int) {
   904  		writeUntil(i)
   905  		if w.Len() > 0 {
   906  			out = append(out, Plain(w.String()))
   907  			w.Reset()
   908  		}
   909  	}
   910  	for i := 0; i < len(s); {
   911  		t := s[i:]
   912  		if autoLink {
   913  			if url, ok := autoURL(t); ok {
   914  				flush(i)
   915  				// Note: The old comment parser would look up the URL in words
   916  				// and replace the target with words[URL] if it was non-empty.
   917  				// That would allow creating links that display as one URL but
   918  				// when clicked go to a different URL. Not sure what the point
   919  				// of that is, so we're not doing that lookup here.
   920  				out = append(out, &Link{Auto: true, Text: []Text{Plain(url)}, URL: url})
   921  				i += len(url)
   922  				wrote = i
   923  				continue
   924  			}
   925  			if id, ok := ident(t); ok {
   926  				url, italics := d.Words[id]
   927  				if !italics {
   928  					i += len(id)
   929  					continue
   930  				}
   931  				flush(i)
   932  				if url == "" {
   933  					out = append(out, Italic(id))
   934  				} else {
   935  					out = append(out, &Link{Auto: true, Text: []Text{Italic(id)}, URL: url})
   936  				}
   937  				i += len(id)
   938  				wrote = i
   939  				continue
   940  			}
   941  		}
   942  		switch {
   943  		case strings.HasPrefix(t, "``"):
   944  			if len(t) >= 3 && t[2] == '`' {
   945  				// Do not convert `` inside ```, in case people are mistakenly writing Markdown.
   946  				i += 3
   947  				for i < len(t) && t[i] == '`' {
   948  					i++
   949  				}
   950  				break
   951  			}
   952  			writeUntil(i)
   953  			w.WriteRune('“')
   954  			i += 2
   955  			wrote = i
   956  		case strings.HasPrefix(t, "''"):
   957  			writeUntil(i)
   958  			w.WriteRune('”')
   959  			i += 2
   960  			wrote = i
   961  		default:
   962  			i++
   963  		}
   964  	}
   965  	flush(len(s))
   966  	return out
   967  }
   968  
   969  // autoURL checks whether s begins with a URL that should be hyperlinked.
   970  // If so, it returns the URL, which is a prefix of s, and ok == true.
   971  // Otherwise it returns "", false.
   972  // The caller should skip over the first len(url) bytes of s
   973  // before further processing.
   974  func autoURL(s string) (url string, ok bool) {
   975  	// Find the ://. Fast path to pick off non-URL,
   976  	// since we call this at every position in the string.
   977  	// The shortest possible URL is ftp://x, 7 bytes.
   978  	var i int
   979  	switch {
   980  	case len(s) < 7:
   981  		return "", false
   982  	case s[3] == ':':
   983  		i = 3
   984  	case s[4] == ':':
   985  		i = 4
   986  	case s[5] == ':':
   987  		i = 5
   988  	case s[6] == ':':
   989  		i = 6
   990  	default:
   991  		return "", false
   992  	}
   993  	if i+3 > len(s) || s[i:i+3] != "://" {
   994  		return "", false
   995  	}
   996  
   997  	// Check valid scheme.
   998  	if !isScheme(s[:i]) {
   999  		return "", false
  1000  	}
  1001  
  1002  	// Scan host part. Must have at least one byte,
  1003  	// and must start and end in non-punctuation.
  1004  	i += 3
  1005  	if i >= len(s) || !isHost(s[i]) || isPunct(s[i]) {
  1006  		return "", false
  1007  	}
  1008  	i++
  1009  	end := i
  1010  	for i < len(s) && isHost(s[i]) {
  1011  		if !isPunct(s[i]) {
  1012  			end = i + 1
  1013  		}
  1014  		i++
  1015  	}
  1016  	i = end
  1017  
  1018  	// At this point we are definitely returning a URL (scheme://host).
  1019  	// We just have to find the longest path we can add to it.
  1020  	// Heuristics abound.
  1021  	// We allow parens, braces, and brackets,
  1022  	// but only if they match (#5043, #22285).
  1023  	// We allow .,:;?! in the path but not at the end,
  1024  	// to avoid end-of-sentence punctuation (#18139, #16565).
  1025  	stk := []byte{}
  1026  	end = i
  1027  Path:
  1028  	for ; i < len(s); i++ {
  1029  		if isPunct(s[i]) {
  1030  			continue
  1031  		}
  1032  		if !isPath(s[i]) {
  1033  			break
  1034  		}
  1035  		switch s[i] {
  1036  		case '(':
  1037  			stk = append(stk, ')')
  1038  		case '{':
  1039  			stk = append(stk, '}')
  1040  		case '[':
  1041  			stk = append(stk, ']')
  1042  		case ')', '}', ']':
  1043  			if len(stk) == 0 || stk[len(stk)-1] != s[i] {
  1044  				break Path
  1045  			}
  1046  			stk = stk[:len(stk)-1]
  1047  		}
  1048  		if len(stk) == 0 {
  1049  			end = i + 1
  1050  		}
  1051  	}
  1052  
  1053  	return s[:end], true
  1054  }
  1055  
  1056  // isScheme reports whether s is a recognized URL scheme.
  1057  // Note that if strings of new length (beyond 3-7)
  1058  // are added here, the fast path at the top of autoURL will need updating.
  1059  func isScheme(s string) bool {
  1060  	switch s {
  1061  	case "file",
  1062  		"ftp",
  1063  		"gopher",
  1064  		"http",
  1065  		"https",
  1066  		"mailto",
  1067  		"nntp":
  1068  		return true
  1069  	}
  1070  	return false
  1071  }
  1072  
  1073  // isHost reports whether c is a byte that can appear in a URL host,
  1074  // like www.example.com or user@[::1]:8080
  1075  func isHost(c byte) bool {
  1076  	// mask is a 128-bit bitmap with 1s for allowed bytes,
  1077  	// so that the byte c can be tested with a shift and an and.
  1078  	// If c > 128, then 1<<c and 1<<(c-64) will both be zero,
  1079  	// and this function will return false.
  1080  	const mask = 0 |
  1081  		(1<<26-1)<<'A' |
  1082  		(1<<26-1)<<'a' |
  1083  		(1<<10-1)<<'0' |
  1084  		1<<'_' |
  1085  		1<<'@' |
  1086  		1<<'-' |
  1087  		1<<'.' |
  1088  		1<<'[' |
  1089  		1<<']' |
  1090  		1<<':'
  1091  
  1092  	return ((uint64(1)<<c)&(mask&(1<<64-1)) |
  1093  		(uint64(1)<<(c-64))&(mask>>64)) != 0
  1094  }
  1095  
  1096  // isPunct reports whether c is a punctuation byte that can appear
  1097  // inside a path but not at the end.
  1098  func isPunct(c byte) bool {
  1099  	// mask is a 128-bit bitmap with 1s for allowed bytes,
  1100  	// so that the byte c can be tested with a shift and an and.
  1101  	// If c > 128, then 1<<c and 1<<(c-64) will both be zero,
  1102  	// and this function will return false.
  1103  	const mask = 0 |
  1104  		1<<'.' |
  1105  		1<<',' |
  1106  		1<<':' |
  1107  		1<<';' |
  1108  		1<<'?' |
  1109  		1<<'!'
  1110  
  1111  	return ((uint64(1)<<c)&(mask&(1<<64-1)) |
  1112  		(uint64(1)<<(c-64))&(mask>>64)) != 0
  1113  }
  1114  
  1115  // isPath reports whether c is a (non-punctuation) path byte.
  1116  func isPath(c byte) bool {
  1117  	// mask is a 128-bit bitmap with 1s for allowed bytes,
  1118  	// so that the byte c can be tested with a shift and an and.
  1119  	// If c > 128, then 1<<c and 1<<(c-64) will both be zero,
  1120  	// and this function will return false.
  1121  	const mask = 0 |
  1122  		(1<<26-1)<<'A' |
  1123  		(1<<26-1)<<'a' |
  1124  		(1<<10-1)<<'0' |
  1125  		1<<'$' |
  1126  		1<<'\'' |
  1127  		1<<'(' |
  1128  		1<<')' |
  1129  		1<<'*' |
  1130  		1<<'+' |
  1131  		1<<'&' |
  1132  		1<<'#' |
  1133  		1<<'=' |
  1134  		1<<'@' |
  1135  		1<<'~' |
  1136  		1<<'_' |
  1137  		1<<'/' |
  1138  		1<<'-' |
  1139  		1<<'[' |
  1140  		1<<']' |
  1141  		1<<'{' |
  1142  		1<<'}' |
  1143  		1<<'%'
  1144  
  1145  	return ((uint64(1)<<c)&(mask&(1<<64-1)) |
  1146  		(uint64(1)<<(c-64))&(mask>>64)) != 0
  1147  }
  1148  
  1149  // isName reports whether s is a capitalized Go identifier (like Name).
  1150  func isName(s string) bool {
  1151  	t, ok := ident(s)
  1152  	if !ok || t != s {
  1153  		return false
  1154  	}
  1155  	r, _ := utf8.DecodeRuneInString(s)
  1156  	return unicode.IsUpper(r)
  1157  }
  1158  
  1159  // ident checks whether s begins with a Go identifier.
  1160  // If so, it returns the identifier, which is a prefix of s, and ok == true.
  1161  // Otherwise it returns "", false.
  1162  // The caller should skip over the first len(id) bytes of s
  1163  // before further processing.
  1164  func ident(s string) (id string, ok bool) {
  1165  	// Scan [\pL_][\pL_0-9]*
  1166  	n := 0
  1167  	for n < len(s) {
  1168  		if c := s[n]; c < utf8.RuneSelf {
  1169  			if isIdentASCII(c) && (n > 0 || c < '0' || c > '9') {
  1170  				n++
  1171  				continue
  1172  			}
  1173  			break
  1174  		}
  1175  		r, nr := utf8.DecodeRuneInString(s[n:])
  1176  		if unicode.IsLetter(r) {
  1177  			n += nr
  1178  			continue
  1179  		}
  1180  		break
  1181  	}
  1182  	return s[:n], n > 0
  1183  }
  1184  
  1185  // isIdentASCII reports whether c is an ASCII identifier byte.
  1186  func isIdentASCII(c byte) bool {
  1187  	// mask is a 128-bit bitmap with 1s for allowed bytes,
  1188  	// so that the byte c can be tested with a shift and an and.
  1189  	// If c > 128, then 1<<c and 1<<(c-64) will both be zero,
  1190  	// and this function will return false.
  1191  	const mask = 0 |
  1192  		(1<<26-1)<<'A' |
  1193  		(1<<26-1)<<'a' |
  1194  		(1<<10-1)<<'0' |
  1195  		1<<'_'
  1196  
  1197  	return ((uint64(1)<<c)&(mask&(1<<64-1)) |
  1198  		(uint64(1)<<(c-64))&(mask>>64)) != 0
  1199  }
  1200  
  1201  // validImportPath reports whether path is a valid import path.
  1202  // It is a lightly edited copy of golang.org/x/mod/module.CheckImportPath.
  1203  func validImportPath(path string) bool {
  1204  	if !utf8.ValidString(path) {
  1205  		return false
  1206  	}
  1207  	if path == "" {
  1208  		return false
  1209  	}
  1210  	if path[0] == '-' {
  1211  		return false
  1212  	}
  1213  	if strings.Contains(path, "//") {
  1214  		return false
  1215  	}
  1216  	if path[len(path)-1] == '/' {
  1217  		return false
  1218  	}
  1219  	elemStart := 0
  1220  	for i, r := range path {
  1221  		if r == '/' {
  1222  			if !validImportPathElem(path[elemStart:i]) {
  1223  				return false
  1224  			}
  1225  			elemStart = i + 1
  1226  		}
  1227  	}
  1228  	return validImportPathElem(path[elemStart:])
  1229  }
  1230  
  1231  func validImportPathElem(elem string) bool {
  1232  	if elem == "" || elem[0] == '.' || elem[len(elem)-1] == '.' {
  1233  		return false
  1234  	}
  1235  	for i := 0; i < len(elem); i++ {
  1236  		if !importPathOK(elem[i]) {
  1237  			return false
  1238  		}
  1239  	}
  1240  	return true
  1241  }
  1242  
  1243  func importPathOK(c byte) bool {
  1244  	// mask is a 128-bit bitmap with 1s for allowed bytes,
  1245  	// so that the byte c can be tested with a shift and an and.
  1246  	// If c > 128, then 1<<c and 1<<(c-64) will both be zero,
  1247  	// and this function will return false.
  1248  	const mask = 0 |
  1249  		(1<<26-1)<<'A' |
  1250  		(1<<26-1)<<'a' |
  1251  		(1<<10-1)<<'0' |
  1252  		1<<'-' |
  1253  		1<<'.' |
  1254  		1<<'~' |
  1255  		1<<'_' |
  1256  		1<<'+'
  1257  
  1258  	return ((uint64(1)<<c)&(mask&(1<<64-1)) |
  1259  		(uint64(1)<<(c-64))&(mask>>64)) != 0
  1260  }
  1261  

View as plain text