// Copyright 2021 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package markdown

import (
	"bytes"
	"fmt"
	"strings"
	"unicode/utf8"

	"golang.org/x/text/cases"
)

func parseLinkRefDef(p buildState, s string) (int, bool) {
	// “A link reference definition consists of a link label,
	// optionally preceded by up to three spaces of indentation,
	// followed by a colon (:),
	// optional spaces or tabs (including up to one line ending),
	// a link destination,
	// optional spaces or tabs (including up to one line ending),
	// and an optional link title,
	// which if it is present must be separated from the link destination
	// by spaces or tabs. No further character may occur.”
	i := skipSpace(s, 0)
	label, i, ok := parseLinkLabel(p.(*parseState), s, i)
	if !ok || i >= len(s) || s[i] != ':' {
		return 0, false
	}
	i = skipSpace(s, i+1)
	suf := s[i:]
	dest, i, ok := parseLinkDest(s, i)
	if !ok {
		if suf != "" && suf[0] == '<' {
			// Goldmark treats <<> as a link definition.
			p.(*parseState).corner = true
		}
		return 0, false
	}
	moved := false
	for i < len(s) && (s[i] == ' ' || s[i] == '\t') {
		moved = true
		i++
	}

	// Take title if present and doesn't break parse.
	j := i
	if j >= len(s) || s[j] == '\n' {
		moved = true
		if j < len(s) {
			j++
		}
	}

	var title string
	var titleChar byte
	var corner bool
	if moved {
		for j < len(s) && (s[j] == ' ' || s[j] == '\t') {
			j++
		}
		if t, c, j, ok := parseLinkTitle(s, j); ok {
			for j < len(s) && (s[j] == ' ' || s[j] == '\t') {
				j++
			}
			if j >= len(s) || s[j] == '\n' {
				i = j
				if t == "" {
					// Goldmark adds title="" in this case.
					// We do not, nor does the Dingus.
					corner = true
				}
				title = t
				titleChar = c
			}
		}
	}

	// Must end line. Already trimmed spaces.
	if i < len(s) && s[i] != '\n' {
		return 0, false
	}
	if i < len(s) {
		i++
	}

	label = normalizeLabel(label)
	if p.link(label) == nil {
		p.defineLink(label, &Link{URL: dest, Title: title, TitleChar: titleChar, corner: corner})
	}
	return i, true
}

func parseLinkTitle(s string, i int) (title string, char byte, next int, found bool) {
	if i < len(s) && (s[i] == '"' || s[i] == '\'' || s[i] == '(') {
		want := s[i]
		if want == '(' {
			want = ')'
		}
		j := i + 1
		for ; j < len(s); j++ {
			if s[j] == want {
				title := s[i+1 : j]
				// TODO: Validate title?
				return mdUnescaper.Replace(title), want, j + 1, true
			}
			if s[j] == '(' && want == ')' {
				break
			}
			if s[j] == '\\' && j+1 < len(s) {
				j++
			}
		}
	}
	return "", 0, 0, false
}

func parseLinkLabel(p *parseState, s string, i int) (string, int, bool) {
	// “A link label begins with a left bracket ([) and ends with
	// the first right bracket (]) that is not backslash-escaped.
	// Between these brackets there must be at least one character
	// that is not a space, tab, or line ending.
	// Unescaped square bracket characters are not allowed
	// inside the opening and closing square brackets of link labels.
	// A link label can have at most 999 characters inside the square brackets.”
	if i >= len(s) || s[i] != '[' {
		return "", 0, false
	}
	j := i + 1
	for ; j < len(s); j++ {
		if s[j] == ']' {
			if j-(i+1) > 999 {
				// Goldmark does not apply 999 limit.
				p.corner = true
				break
			}
			if label := trimSpaceTabNewline(s[i+1 : j]); label != "" {
				// Note: CommonMark Dingus does not escape.
				return label, j + 1, true
			}
			break
		}
		if s[j] == '[' {
			break
		}
		if s[j] == '\\' && j+1 < len(s) {
			j++
		}
	}
	return "", 0, false
}

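// normalizeLabel returns the normalized form of the link label s,
// so that reference definitions and link uses can be matched
// case-insensitively, following the CommonMark rule quoted below.
// For example, "Foo \t Bar" and "foo bar" normalize to the same string.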
func normalizeLabel(s string) string {
	if strings.Contains(s, "[") || strings.Contains(s, "]") {
		// Labels cannot have [ ] so avoid the work of translating.
		// This is especially important for pathological cases like
		// [[[[[[[[[[a]]]]]]]]]] which would otherwise generate quadratic
		// amounts of garbage.
		return ""
	}

	// “To normalize a label, strip off the opening and closing brackets,
	// perform the Unicode case fold, strip leading and trailing spaces, tabs, and line endings,
	// and collapse consecutive internal spaces, tabs, and line endings to a single space.”
	s = trimSpaceTabNewline(s)
	var b strings.Builder
	space := false
	hi := false
	for i := 0; i < len(s); i++ {
		c := s[i]
		switch c {
		case ' ', '\t', '\n':
			space = true
			continue
		default:
			if space {
				b.WriteByte(' ')
				space = false
			}
			if 'A' <= c && c <= 'Z' {
				c += 'a' - 'A'
			}
			if c >= 0x80 {
				hi = true
			}
			b.WriteByte(c)
		}
	}
	s = b.String()
	if hi {
		s = cases.Fold().String(s)
	}
	return s
}

func parseLinkDest(s string, i int) (string, int, bool) {
	if i >= len(s) {
		return "", 0, false
	}

	// “A sequence of zero or more characters between an opening < and a closing >
	// that contains no line endings or unescaped < or > characters,”
	if s[i] == '<' {
		for j := i + 1; ; j++ {
			if j >= len(s) || s[j] == '\n' || s[j] == '<' {
				return "", 0, false
			}
			if s[j] == '>' {
				// TODO unescape?
				return mdUnescape(s[i+1 : j]), j + 1, true
			}
			if s[j] == '\\' {
				j++
			}
		}
	}

	// “or a nonempty sequence of characters that does not start with <,
	// does not include ASCII control characters or space character,
	// and includes parentheses only if (a) they are backslash-escaped
	// or (b) they are part of a balanced pair of unescaped parentheses.”
	depth := 0
	j := i
Loop:
	for ; j < len(s); j++ {
		switch s[j] {
		case '(':
			depth++
			if depth > 32 {
				// Avoid quadratic inputs by stopping if too deep.
				// This is the same depth that cmark-gfm uses.
				return "", 0, false
			}
		case ')':
			if depth == 0 {
				break Loop
			}
			depth--
		case '\\':
			if j+1 < len(s) {
				if s[j+1] == ' ' || s[j+1] == '\t' {
					return "", 0, false
				}
				j++
			}
		case ' ', '\t', '\n':
			break Loop
		}
	}
	dest := s[i:j]
	// TODO: Validate dest?
	// TODO: Unescape?
	// NOTE: CommonMark Dingus does not reject control characters.
	return mdUnescape(dest), j, true
}

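// parseAutoLinkURI parses a CommonMark URI autolink such as <https://example.com>,
// starting at the '<' at s[i]. It returns the AutoLink inline, the index just
// past the closing '>', and whether an autolink was found.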
func parseAutoLinkURI(s string, i int) (Inline, int, bool) {
	// CommonMark 0.30:
	//
	// For purposes of this spec, a scheme is any sequence of 2–32 characters
	// beginning with an ASCII letter and followed by any combination of
	// ASCII letters, digits, or the symbols plus (”+”), period (”.”), or
	// hyphen (”-”).
	//
	// An absolute URI, for these purposes, consists of a scheme followed by
	// a colon (:) followed by zero or more characters other than ASCII control
	// characters, space, <, and >. If the URI includes these characters,
	// they must be percent-encoded (e.g. %20 for a space).
	j := i
	if j+1 >= len(s) || s[j] != '<' || !isLetter(s[j+1]) {
		return nil, 0, false
	}
	j++
	for j < len(s) && isScheme(s[j]) && j-(i+1) <= 32 {
		j++
	}
	if j-(i+1) < 2 || j-(i+1) > 32 || j >= len(s) || s[j] != ':' {
		return nil, 0, false
	}
	j++
	for j < len(s) && isURL(s[j]) {
		j++
	}
	if j >= len(s) || s[j] != '>' {
		return nil, 0, false
	}
	link := s[i+1 : j]
	// link = mdUnescaper.Replace(link)
	return &AutoLink{link, link}, j + 1, true
}

func parseAutoLinkEmail(s string, i int) (Inline, int, bool) {
	// CommonMark 0.30:
	//
	// An email address, for these purposes, is anything that matches
	// the non-normative regex from the HTML5 spec:
	//
	// /^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$/
	j := i
	if j+1 >= len(s) || s[j] != '<' || !isUser(s[j+1]) {
		return nil, 0, false
	}
	j++
	for j < len(s) && isUser(s[j]) {
		j++
	}
	if j >= len(s) || s[j] != '@' {
		return nil, 0, false
	}
	for {
		j++
		n, ok := skipDomainElem(s[j:])
		if !ok {
			return nil, 0, false
		}
		j += n
		if j >= len(s) || s[j] != '.' && s[j] != '>' {
			return nil, 0, false
		}
		if s[j] == '>' {
			break
		}
	}
	email := s[i+1 : j]
	return &AutoLink{email, "mailto:" + email}, j + 1, true
}

func isUser(c byte) bool {
	if isLetterDigit(c) {
		return true
	}
	s := ".!#$%&'*+/=?^_`{|}~-"
	for i := 0; i < len(s); i++ {
		if c == s[i] {
			return true
		}
	}
	return false
}

func isHexDigit(c byte) bool {
	return 'A' <= c && c <= 'F' || 'a' <= c && c <= 'f' || '0' <= c && c <= '9'
}

func isDigit(c byte) bool {
	return '0' <= c && c <= '9'
}

func skipDomainElem(s string) (int, bool) {
	// String of LDH, up to 63 in length, with LetterDigit
	// at both ends (1-letter/digit names are OK).
	// Aka /[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?/.
	if len(s) < 1 || !isLetterDigit(s[0]) {
		return 0, false
	}
	i := 1
	for i < len(s) && isLDH(s[i]) && i <= 63 {
		i++
	}
	if i > 63 || !isLetterDigit(s[i-1]) {
		return 0, false
	}
	return i, true
}

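// isScheme reports whether c may appear in a URI scheme after the initial
// ASCII letter: a letter, a digit, '+', '.', or '-'.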
func isScheme(c byte) bool {
	return isLetterDigit(c) || c == '+' || c == '.' || c == '-'
}

func isURL(c byte) bool {
	return c > ' ' && c != '<' && c != '>'
}

type AutoLink struct {
	Text string
	URL  string
}

func (*AutoLink) Inline() {}

func (x *AutoLink) PrintHTML(buf *bytes.Buffer) {
	fmt.Fprintf(buf, "<a href=\"%s\">%s</a>", htmlLinkEscaper.Replace(x.URL), htmlEscaper.Replace(x.Text))
}

func (x *AutoLink) printMarkdown(buf *bytes.Buffer) {
	fmt.Fprintf(buf, "<%s>", x.Text)
}

func (x *AutoLink) PrintText(buf *bytes.Buffer) {
	fmt.Fprintf(buf, "%s", htmlEscaper.Replace(x.Text))
}

type Link struct {
	Inner     []Inline
	URL       string
	Title     string
	TitleChar byte // ', " or )
	corner    bool
}

func (*Link) Inline() {}

func (x *Link) PrintHTML(buf *bytes.Buffer) {
	fmt.Fprintf(buf, "<a href=\"%s\"", htmlLinkEscaper.Replace(x.URL))
	if x.Title != "" {
		fmt.Fprintf(buf, " title=\"%s\"", htmlEscaper.Replace(x.Title))
	}
	buf.WriteString(">")
	for _, c := range x.Inner {
		c.PrintHTML(buf)
	}
	buf.WriteString("</a>")
}

func (x *Link) printMarkdown(buf *bytes.Buffer) {
	buf.WriteByte('[')
	x.printRemainingMarkdown(buf)
}

func (x *Link) printRemainingMarkdown(buf *bytes.Buffer) {
	for _, c := range x.Inner {
		c.printMarkdown(buf)
	}
	buf.WriteString("](")
	buf.WriteString(x.URL)
	printLinkTitleMarkdown(buf, x.Title, x.TitleChar)
	buf.WriteByte(')')
}

func printLinkTitleMarkdown(buf *bytes.Buffer, title string, titleChar byte) {
	if title == "" {
		return
	}
	closeChar := titleChar
	openChar := closeChar
	if openChar == ')' {
		openChar = '('
	}
	fmt.Fprintf(buf, " %c%s%c", openChar, title /*TODO(jba): escape*/, closeChar)
}

func (x *Link) PrintText(buf *bytes.Buffer) {
	for _, c := range x.Inner {
		c.PrintText(buf)
	}
}

type Image struct {
	Inner     []Inline
	URL       string
	Title     string
	TitleChar byte
	corner    bool
}

func (*Image) Inline() {}

func (x *Image) PrintHTML(buf *bytes.Buffer) {
	fmt.Fprintf(buf, "<img src=\"%s\" alt=\"", htmlLinkEscaper.Replace(x.URL))
	for _, c := range x.Inner {
		c.PrintText(buf)
	}
	buf.WriteString("\"")
	if x.Title != "" {
		fmt.Fprintf(buf, " title=\"%s\"", htmlEscaper.Replace(x.Title))
	}
	buf.WriteString(" />")
}

func (x *Image) printMarkdown(buf *bytes.Buffer) {
	buf.WriteString("![")
	(*Link)(x).printRemainingMarkdown(buf)
}

func (x *Image) PrintText(buf *bytes.Buffer) {
	for _, c := range x.Inner {
		c.PrintText(buf)
	}
}

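// For example, using the printMarkdown methods above, a Link with inner text
// "docs", URL "https://example.com", Title "Example", and TitleChar '"'
// prints as [docs](https://example.com "Example"), and the corresponding
// Image prints as ![docs](https://example.com "Example").
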
// GitHub Flavored Markdown autolinks extension
// https://github.github.com/gfm/#autolinks-extension-

// autoLinkText rewrites any extended autolinks in the body
// and returns the result.
//
// body is a list of Plain, Emph, Strong, and Del nodes.
// Two Plains only appear consecutively when one is a
// potential emphasis marker that ended up being plain after all, like "_" or "**".
// There are no Link nodes.
//
// The GitHub “spec” declares that “autolinks can only come at the
// beginning of a line, after whitespace, or any of the delimiting
// characters *, _, ~, and (”. However, the GitHub web site does not
// enforce this rule: text like "$abc@def.ghi is my email" links the
// text following the $ as an email address. It appears the actual rule
// is that autolinks cannot come after ASCII letters, although they can
// come after numbers or Unicode letters.
// Since the only point of implementing GitHub Flavored Markdown
// is to match GitHub's behavior, we do what they do, not what they say,
// at least for now.
func (p *parseState) autoLinkText(list []Inline) []Inline {
	if !p.AutoLinkText {
		return list
	}

	var out []Inline // allocated lazily when we first change list
	for i, x := range list {
		switch x := x.(type) {
		case *Plain:
			if rewrite := p.autoLinkPlain(x.Text); rewrite != nil {
				if out == nil {
					out = append(out, list[:i]...)
				}
				out = append(out, rewrite...)
				continue
			}
		case *Strong:
			x.Inner = p.autoLinkText(x.Inner)
		case *Del:
			x.Inner = p.autoLinkText(x.Inner)
		case *Emph:
			x.Inner = p.autoLinkText(x.Inner)
		}
		if out != nil {
			out = append(out, x)
		}
	}
	if out == nil {
		return list
	}
	return out
}

func (p *parseState) autoLinkPlain(s string) []Inline {
	vd := &validDomainChecker{s: s}
	var out []Inline
Restart:
	for i := 0; i < len(s); i++ {
		c := s[i]
		if c == '@' {
			if before, link, after, ok := p.parseAutoEmail(s, i); ok {
				if before != "" {
					out = append(out, &Plain{Text: before})
				}
				out = append(out, link)
				vd.skip(len(s) - len(after))
				s = after
				goto Restart
			}
		}

		if (c == 'h' || c == 'm' || c == 'x' || c == 'w') && (i == 0 || !isLetter(s[i-1])) {
			if link, after, ok := p.parseAutoProto(s, i, vd); ok {
				if i > 0 {
					out = append(out, &Plain{Text: s[:i]})
				}
				out = append(out, link)
				vd.skip(len(s) - len(after))
				s = after
				goto Restart
			}
		}
	}
	if out == nil {
		return nil
	}
	out = append(out, &Plain{Text: s})
	return out
}

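// parseAutoProto parses an extended autolink beginning at s[i], which the
// caller has already checked is 'h', 'm', 'x', or 'w' (for the http://,
// https://, mailto:, xmpp:, and www. prefixes handled below). It returns
// the Link, the text remaining after it, and whether a link was found.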
func (p *parseState) parseAutoProto(s string, i int, vd *validDomainChecker) (link *Link, after string, found bool) {
	if s == "" {
		return
	}
	switch s[i] {
	case 'h':
		var n int
		if strings.HasPrefix(s[i:], "https://") {
			n = len("https://")
		} else if strings.HasPrefix(s[i:], "http://") {
			n = len("http://")
		} else {
			return
		}
		return p.parseAutoHTTP(s[i:i+n], s, i, i+n, i+n+1, vd)
	case 'w':
		if !strings.HasPrefix(s[i:], "www.") {
			return
		}
		// GitHub Flavored Markdown says to use http://,
		// but it's not 1985 anymore. We live in the https:// future
		// (unless the parser is explicitly configured otherwise).
		// People who really care in their docs can write http:// themselves.
		scheme := "https://"
		if p.AutoLinkAssumeHTTP {
			scheme = "http://"
		}
		return p.parseAutoHTTP(scheme, s, i, i, i+3, vd)
	case 'm':
		if !strings.HasPrefix(s[i:], "mailto:") {
			return
		}
		return p.parseAutoMailto(s, i)
	case 'x':
		if !strings.HasPrefix(s[i:], "xmpp:") {
			return
		}
		return p.parseAutoXmpp(s, i)
	}
	return
}

// parseAutoHTTP parses an extended www or URL autolink,
// using the scheme chosen by the caller.
// https://github.github.com/gfm/#extended-www-autolink
func (p *parseState) parseAutoHTTP(scheme, s string, textstart, start, min int, vd *validDomainChecker) (link *Link, after string, found bool) {
	n, ok := vd.parseValidDomain(start)
	if !ok {
		return
	}
	i := start + n
	domEnd := i

	// “After a valid domain, zero or more non-space non-< characters may follow.”
	paren := 0
	for i < len(s) {
		r, n := utf8.DecodeRuneInString(s[i:])
		if isUnicodeSpace(r) || r == '<' {
			break
		}
		if r == '(' {
			paren++
		}
		if r == ')' {
			paren--
		}
		i += n
	}

	// https://github.github.com/gfm/#extended-autolink-path-validation
Trim:
	for i > min {
		switch s[i-1] {
		case '?', '!', '.', ',', ':', '@', '_', '~':
			// Trim certain trailing punctuation.
			i--
			continue Trim

		case ')':
			// Trim trailing unmatched (by count only) parens.
			if paren < 0 {
				for s[i-1] == ')' && paren < 0 {
					paren++
					i--
				}
				continue Trim
			}

		case ';':
			// Trim entity reference.
			// After doing the work of the scan, we either cut that part off the string
			// or we stop the trimming entirely, so there's no chance of repeating
			// the scan on a future iteration and going accidentally quadratic.
			// Even though the Markdown spec already requires having a complete
			// list of all the HTML entities, the GitHub definition here just requires
			// "looks like" an entity, meaning it's an ampersand, letters/digits, and a semicolon.
			for j := i - 2; j > start; j-- {
				if j < i-2 && s[j] == '&' {
					i = j
					continue Trim
				}
				if !isLetterDigit(s[j]) {
					break Trim
				}
			}
		}
		break Trim
	}

	// According to the literal text of the GitHub Flavored Markdown spec
	// and the actual behavior on GitHub,
	// www.example.com$foo turns into a link whose text and URL include the trailing $foo,
	// but that makes the character restrictions in the valid-domain check
	// almost meaningless. So we insist that when all is said and done,
	// if the domain is followed by anything, that thing must be a slash,
	// even though GitHub is not that picky.
	// People might complain about www.example.com:1234 not working,
	// but if you want to get fancy with that kind of thing, just write http:// in front.
	if textstart == start && i > domEnd && s[domEnd] != '/' {
		i = domEnd
	}

	if i < min {
		return
	}

	link = &Link{
		Inner: []Inline{&Plain{Text: s[textstart:i]}},
		URL:   scheme + s[start:i],
	}
	return link, s[i:], true
}

type validDomainChecker struct {
	s   string
	cut int // before this index, no valid domains
}

func (v *validDomainChecker) skip(i int) {
	v.s = v.s[i:]
	v.cut -= i
}

// parseValidDomain parses a valid domain beginning at v.s[start:].
// https://github.github.com/gfm/#valid-domain
//
// If v.s[start:] starts with a valid domain, parseValidDomain returns
// the length of that domain and true. If it does not, parseValidDomain
// returns 0, false, after recording in v.cut a position before which
// no valid domain can start, so that future calls can return quickly.
//
// “A valid domain consists of segments of alphanumeric characters,
// underscores (_) and hyphens (-) separated by periods (.).
// There must be at least one period, and no underscores may be
// present in the last two segments of the domain.”
//
// The spec does not spell out whether segments can be empty.
// Empirically, in GitHub's implementation they can.
func (v *validDomainChecker) parseValidDomain(start int) (n int, found bool) {
	if start < v.cut {
		return 0, false
	}
	i := start
	dots := 0
	for ; i < len(v.s); i++ {
		c := v.s[i]
		if c == '_' {
			dots = -2
			continue
		}
		if c == '.' {
			dots++
			continue
		}
		if !isLDH(c) {
			break
		}
	}
	if dots >= 0 && i > start {
		return i - start, true
	}
	v.cut = i
	return 0, false
}

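// parseAutoEmail parses an extended email autolink around the '@' at s[i],
// scanning backward for the user part and forward for the domain.
// It returns the text before the address, a Link with a mailto: URL,
// the text after the address, and whether an address was found.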
func (p *parseState) parseAutoEmail(s string, i int) (before string, link *Link, after string, ok bool) {
	if s[i] != '@' {
		return
	}

	// “One or more characters which are alphanumeric, or ., -, _, or +.”
	j := i
	for j > 0 && (isLDH(s[j-1]) || s[j-1] == '_' || s[j-1] == '+' || s[j-1] == '.') {
		j--
	}
	if i-j < 1 {
		return
	}

	// “One or more characters which are alphanumeric, or - or _, separated by periods (.).
	// There must be at least one period. The last character must not be one of - or _.”
	dots := 0
	k := i + 1
	for k < len(s) && (isLDH(s[k]) || s[k] == '_' || s[k] == '.') {
		if s[k] == '.' {
			if s[k-1] == '.' {
				// Empirically, .. stops the scan but foo@.bar is fine.
				break
			}
			dots++
		}
		k++
	}

	// “., -, and _ can occur on both sides of the @, but only . may occur at the end
	// of the email address, in which case it will not be considered part of the address”
	if s[k-1] == '.' {
		dots--
		k--
	}

	if s[k-1] == '-' || s[k-1] == '_' {
		return
	}
	if k-(i+1)-dots < 2 || dots < 1 {
		return
	}

	link = &Link{
		Inner: []Inline{&Plain{Text: s[j:k]}},
		URL:   "mailto:" + s[j:k],
	}
	return s[:j], link, s[k:], true
}

func (p *parseState) parseAutoMailto(s string, i int) (link *Link, after string, ok bool) {
	j := i + len("mailto:")
	for j < len(s) && (isLDH(s[j]) || s[j] == '_' || s[j] == '+' || s[j] == '.') {
		j++
	}
	if j >= len(s) || s[j] != '@' {
		return
	}
	before, link, after, ok := p.parseAutoEmail(s[i:], j-i)
	if before != "mailto:" || !ok {
		return nil, "", false
	}
	link.Inner[0] = &Plain{Text: s[i : len(s)-len(after)]}
	return link, after, true
}

func (p *parseState) parseAutoXmpp(s string, i int) (link *Link, after string, ok bool) {
	j := i + len("xmpp:")
	for j < len(s) && (isLDH(s[j]) || s[j] == '_' || s[j] == '+' || s[j] == '.') {
		j++
	}
	if j >= len(s) || s[j] != '@' {
		return
	}
	before, link, after, ok := p.parseAutoEmail(s[i:], j-i)
	if before != "xmpp:" || !ok {
		return nil, "", false
	}
	if after != "" && after[0] == '/' {
		k := 1
		for k < len(after) && (isLetterDigit(after[k]) || after[k] == '@' || after[k] == '.') {
			k++
		}
		after = after[k:]
	}
	url := s[i : len(s)-len(after)]
	link.Inner[0] = &Plain{Text: url}
	link.URL = url
	return link, after, true
}