html.go

     1  // Copyright 2021 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package markdown
     6  
     7  import (
     8  	"bytes"
     9  	"strconv"
    10  	"strings"
    11  	"unicode"
    12  )
    13  
// An HTMLBlock is a block of raw HTML.
// Text holds the block's lines; PrintHTML emits each entry followed by
// a newline, so the entries themselves carry no line terminator.
type HTMLBlock struct {
	Position
	Text []string
}
    18  
    19  func (b *HTMLBlock) PrintHTML(buf *bytes.Buffer) {
    20  	for _, s := range b.Text {
    21  		buf.WriteString(s)
    22  		buf.WriteString("\n")
    23  	}
    24  }
    25  
    26  func (b *HTMLBlock) printMarkdown(buf *bytes.Buffer, s mdState) {
    27  	if s.prefix1 != "" {
    28  		buf.WriteString(s.prefix1)
    29  	} else {
    30  		buf.WriteString(s.prefix)
    31  	}
    32  	b.PrintHTML(buf)
    33  }
    34  
// An htmlBuilder accumulates the lines of an HTML block while it is open.
type htmlBuilder struct {
	endBlank bool              // if set, a blank line ends the block (the blank line is not consumed)
	text     []string          // lines collected so far
	endFunc  func(string) bool // if non-nil, reports whether a line is the last line of the block
}
    40  
    41  func (c *htmlBuilder) extend(p *parseState, s line) (line, bool) {
    42  	if c.endBlank && s.isBlank() {
    43  		return s, false
    44  	}
    45  	t := s.string()
    46  	c.text = append(c.text, t)
    47  	if c.endFunc != nil && c.endFunc(t) {
    48  		return line{}, false
    49  	}
    50  	return line{}, true
    51  }
    52  
    53  func (c *htmlBuilder) build(p buildState) Block {
    54  	return &HTMLBlock{
    55  		p.pos(),
    56  		c.text,
    57  	}
    58  }
    59  
    60  func newHTML(p *parseState, s line) (line, bool) {
    61  	peek := s
    62  	if p.startHTML(&peek) {
    63  		return line{}, true
    64  	}
    65  	return s, false
    66  }
    67  
// startHTML reports whether the line *s begins an HTML block.
// If it does, startHTML registers a new htmlBuilder with p.addBlock,
// records the line as the builder's first line of text, and, for the kinds
// whose end marker already appears on that same line, immediately calls
// p.closeBlock.
//
// The "case N" comments below appear to follow the numbered HTML block
// start conditions of the CommonMark spec. Setting p.corner marks inputs
// on which, per the inline comments, other implementations (Goldmark,
// the Dingus) disagree.
func (p *parseState) startHTML(s *line) bool {
	// Work on a copy so the trimming below does not affect *s.
	tt := *s
	tt.trimSpace(0, 3, false) // NOTE(review): presumably strips up to 3 leading spaces of indentation — confirm
	if tt.peek() != '<' {
		return false
	}
	t := tt.string()

	// Comments, processing instructions, CDATA sections and declarations:
	// each runs until the first line containing its end marker.
	var end string
	switch {
	case strings.HasPrefix(t, "<!--"):
		end = "-->"
	case strings.HasPrefix(t, "<?"):
		end = "?>"
	case strings.HasPrefix(t, "<![CDATA["):
		end = "]]>"
	case strings.HasPrefix(t, "<!") && len(t) >= 3 && isLetter(t[2]):
		if 'a' <= t[2] && t[2] <= 'z' {
			// Goldmark and the Dingus only accept <!UPPER> not <!lower>.
			p.corner = true
		}
		end = ">"
	}
	if end != "" {
		b := &htmlBuilder{endFunc: func(s string) bool { return strings.Contains(s, end) }}
		p.addBlock(b)
		b.text = append(b.text, s.string())
		// The end marker may already be present on the opening line.
		if b.endFunc(t) {
			p.closeBlock()
		}
		return true
	}

	// case 6
	// Scan the tag name after "<" or "</" into buf, lower-cased and
	// limited to 16 bytes of ASCII letters and digits. i is left at the
	// first byte after the scanned name and is reused at label Next.
	i := 1
	if i < len(t) && t[i] == '/' {
		i++
	}
	buf := make([]byte, 0, 16)
	for ; i < len(t) && len(buf) < 16; i++ {
		c := t[i]
		if 'A' <= c && c <= 'Z' {
			c += 'a' - 'A'
		}
		if !('a' <= c && c <= 'z') && !('0' <= c && c <= '9') {
			break
		}
		buf = append(buf, c)
	}
	// The name must be followed by end of line, space, tab, ">" or "/>".
	var sep byte
	if i < len(t) {
		switch t[i] {
		default:
			goto Next
		case ' ', '\t', '>':
			// ok
			sep = t[i]
		case '/':
			if i+1 >= len(t) || t[i+1] != '>' {
				goto Next
			}
		}
	}

	if len(buf) == 0 {
		goto Next
	}
	// The braces keep c and ok out of the scope that the goto statements
	// jump across.
	{
		c := buf[0]
		var ok bool
		for _, name := range htmlTags {
			if name[0] == c && len(name) == len(buf) && name == string(buf) {
				if sep == '\t' {
					// Goldmark recognizes space here but not tab.
					// testdata/extra.txt 143.md
					p.corner = true
				}
				ok = true
				break
			}
		}
		if !ok {
			goto Next
		}
	}

	// Known block-level tag name: the block runs until a blank line.
	{
		b := &htmlBuilder{endBlank: true}
		p.addBlock(b)
		b.text = append(b.text, s.string())
		return true
	}

Next:
	// case 1
	// An opening (not closing) pre, script, style or textarea tag whose
	// name is followed by end of line, space, tab or ">". The block runs
	// until a line for which hasEndPre reports true, which may be this
	// same line.
	if len(t) > 1 && t[1] != '/' && (i >= len(t) || t[i] == ' ' || t[i] == '\t' || t[i] == '>') {
		switch string(buf) {
		case "pre", "script", "style", "textarea":
			b := &htmlBuilder{endFunc: hasEndPre}
			p.addBlock(b)
			b.text = append(b.text, s.string())
			if hasEndPre(t) {
				p.closeBlock()
			}
			return true
		}
	}

	// case 7
	// A complete open or closing tag alone on the line (followed only by
	// what skipSpace skips), tried only when p.para() is nil.
	// NOTE(review): p.para() == nil presumably means no paragraph is
	// currently open, i.e. this case cannot interrupt a paragraph — confirm.
	if p.para() == nil {
		if _, e, ok := parseHTMLOpenTag(p, t, 0); ok && skipSpace(t, e) == len(t) {
			if e != len(t) {
				// Goldmark disallows trailing space
				p.corner = true
			}
			b := &htmlBuilder{endBlank: true}
			p.addBlock(b)
			b.text = append(b.text, s.string())
			return true
		}
		if _, e, ok := parseHTMLClosingTag(p, t, 0); ok && skipSpace(t, e) == len(t) {
			b := &htmlBuilder{endBlank: true}
			p.addBlock(b)
			b.text = append(b.text, s.string())
			return true
		}
	}

	return false
}
   198  
   199  func hasEndPre(s string) bool {
   200  	for i := 0; i < len(s); i++ {
   201  		if s[i] == '<' && i+1 < len(s) && s[i+1] == '/' {
   202  			buf := make([]byte, 0, 8)
   203  			for i += 2; i < len(s) && len(buf) < 8; i++ {
   204  				c := s[i]
   205  				if 'A' <= c && c <= 'Z' {
   206  					c += 'a' - 'A'
   207  				}
   208  				if c < 'a' || 'z' < c {
   209  					break
   210  				}
   211  				buf = append(buf, c)
   212  			}
   213  			if i < len(s) && s[i] == '>' {
   214  				switch string(buf) {
   215  				case "pre", "script", "style", "textarea":
   216  					return true
   217  				}
   218  			}
   219  		}
   220  	}
   221  	return false
   222  }
   223  
   224  func parseHTMLTag(p *parseState, s string, i int) (Inline, int, bool) {
   225  	// “An HTML tag consists of an open tag, a closing tag, an HTML comment,
   226  	// a processing instruction, a declaration, or a CDATA section.”
   227  	if i+3 <= len(s) && s[i] == '<' {
   228  		switch s[i+1] {
   229  		default:
   230  			return parseHTMLOpenTag(p, s, i)
   231  		case '/':
   232  			return parseHTMLClosingTag(p, s, i)
   233  		case '!':
   234  			switch s[i+2] {
   235  			case '-':
   236  				return parseHTMLComment(s, i)
   237  			case '[':
   238  				return parseHTMLCDATA(s, i)
   239  			default:
   240  				return parseHTMLDecl(p, s, i)
   241  			}
   242  		case '?':
   243  			return parseHTMLProcInst(s, i)
   244  		}
   245  	}
   246  	return nil, 0, false
   247  }
   248  
   249  func parseHTMLOpenTag(p *parseState, s string, i int) (Inline, int, bool) {
   250  	if i >= len(s) || s[i] != '<' {
   251  		return nil, 0, false
   252  	}
   253  	// “An open tag consists of a < character, a tag name, zero or more attributes,
   254  	// optional spaces, tabs, and up to one line ending, an optional / character, and a > character.”
   255  	if name, j, ok := parseTagName(s, i+1); ok {
   256  		switch name {
   257  		case "pre", "script", "style", "textarea":
   258  			// Goldmark treats these as starting a new HTMLBlock
   259  			// and ending the paragraph they appear in.
   260  			p.corner = true
   261  		}
   262  		for {
   263  			if j >= len(s) || s[j] != ' ' && s[j] != '\t' && s[j] != '\n' && s[j] != '/' && s[j] != '>' {
   264  				return nil, 0, false
   265  			}
   266  			_, k, ok := parseAttr(p, s, j)
   267  			if !ok {
   268  				break
   269  			}
   270  			j = k
   271  		}
   272  		k := skipSpace(s, j)
   273  		if k != j {
   274  			// Goldmark mishandles spaces before >.
   275  			p.corner = true
   276  		}
   277  		j = k
   278  		if j < len(s) && s[j] == '/' {
   279  			j++
   280  		}
   281  		if j < len(s) && s[j] == '>' {
   282  			return &HTMLTag{s[i : j+1]}, j + 1, true
   283  		}
   284  	}
   285  	return nil, 0, false
   286  }
   287  
   288  func parseHTMLClosingTag(p *parseState, s string, i int) (Inline, int, bool) {
   289  	// “A closing tag consists of the string </, a tag name,
   290  	// optional spaces, tabs, and up to one line ending, and the character >.”
   291  	if i+2 >= len(s) || s[i] != '<' || s[i+1] != '/' {
   292  		return nil, 0, false
   293  	}
   294  	if skipSpace(s, i+2) != i+2 {
   295  		// Goldmark allows spaces here but the spec and the Dingus do not.
   296  		p.corner = true
   297  	}
   298  
   299  	if _, j, ok := parseTagName(s, i+2); ok {
   300  		j = skipSpace(s, j)
   301  		if j < len(s) && s[j] == '>' {
   302  			return &HTMLTag{s[i : j+1]}, j + 1, true
   303  		}
   304  	}
   305  	return nil, 0, false
   306  }
   307  
   308  func parseTagName(s string, i int) (string, int, bool) {
   309  	// “A tag name consists of an ASCII letter followed by zero or more ASCII letters, digits, or hyphens (-).”
   310  	if i < len(s) && isLetter(s[i]) {
   311  		j := i + 1
   312  		for j < len(s) && isLDH(s[j]) {
   313  			j++
   314  		}
   315  		return s[i:j], j, true
   316  	}
   317  	return "", 0, false
   318  }
   319  
   320  func parseAttr(p *parseState, s string, i int) (string, int, bool) {
   321  	// “An attribute consists of spaces, tabs, and up to one line ending,
   322  	// an attribute name, and an optional attribute value specification.”
   323  	i = skipSpace(s, i)
   324  	if _, j, ok := parseAttrName(s, i); ok {
   325  		if _, k, ok := parseAttrValueSpec(p, s, j); ok {
   326  			j = k
   327  		}
   328  		return s[i:j], j, true
   329  	}
   330  	return "", 0, false
   331  }
   332  
   333  func parseAttrName(s string, i int) (string, int, bool) {
   334  	// “An attribute name consists of an ASCII letter, _, or :,
   335  	// followed by zero or more ASCII letters, digits, _, ., :, or -.”
   336  	if i+1 < len(s) && (isLetter(s[i]) || s[i] == '_' || s[i] == ':') {
   337  		j := i + 1
   338  		for j < len(s) && (isLDH(s[j]) || s[j] == '_' || s[j] == '.' || s[j] == ':') {
   339  			j++
   340  		}
   341  		return s[i:j], j, true
   342  	}
   343  	return "", 0, false
   344  }
   345  
   346  func parseAttrValueSpec(p *parseState, s string, i int) (string, int, bool) {
   347  	// “An attribute value specification consists of
   348  	// optional spaces, tabs, and up to one line ending,
   349  	// a = character,
   350  	// optional spaces, tabs, and up to one line ending,
   351  	// and an attribute value.”
   352  	i = skipSpace(s, i)
   353  	if i+1 < len(s) && s[i] == '=' {
   354  		i = skipSpace(s, i+1)
   355  		if _, j, ok := parseAttrValue(s, i); ok {
   356  			p.corner = p.corner || strings.Contains(s[i:j], "\ufffd")
   357  			return s[i:j], j, true
   358  		}
   359  	}
   360  	return "", 0, false
   361  }
   362  
   363  func parseAttrValue(s string, i int) (string, int, bool) {
   364  	// “An attribute value consists of
   365  	// an unquoted attribute value,
   366  	// a single-quoted attribute value,
   367  	// or a double-quoted attribute value.”
   368  	// TODO: No escaping???
   369  	if i < len(s) && (s[i] == '\'' || s[i] == '"') {
   370  		// “A single-quoted attribute value consists of ',
   371  		// zero or more characters not including ', and a final '.”
   372  		// “A double-quoted attribute value consists of ",
   373  		// zero or more characters not including ", and a final ".”
   374  		if j := strings.IndexByte(s[i+1:], s[i]); j >= 0 {
   375  			end := i + 1 + j + 1
   376  			return s[i:end], end, true
   377  		}
   378  	}
   379  
   380  	// “An unquoted attribute value is a nonempty string of characters
   381  	// not including spaces, tabs, line endings, ", ', =, <, >, or `.”
   382  	j := i
   383  	for j < len(s) && strings.IndexByte(" \t\n\"'=<>`", s[j]) < 0 {
   384  		j++
   385  	}
   386  	if j > i {
   387  		return s[i:j], j, true
   388  	}
   389  	return "", 0, false
   390  }
   391  
   392  func parseHTMLComment(s string, i int) (Inline, int, bool) {
   393  	// “An HTML comment consists of <!-- + text + -->,
   394  	// where text does not start with > or ->,
   395  	// does not end with -, and does not contain --.”
   396  	if !strings.HasPrefix(s[i:], "<!-->") &&
   397  		!strings.HasPrefix(s[i:], "<!--->") {
   398  		if x, end, ok := parseHTMLMarker(s, i, "<!--", "-->"); ok {
   399  			if t := x.(*HTMLTag).Text; !strings.Contains(t[len("<!--"):len(t)-len("->")], "--") {
   400  				return x, end, ok
   401  			}
   402  		}
   403  	}
   404  	return nil, 0, false
   405  }
   406  
   407  func parseHTMLCDATA(s string, i int) (Inline, int, bool) {
   408  	// “A CDATA section consists of the string <![CDATA[,
   409  	// a string of characters not including the string ]]>, and the string ]]>.”
   410  	return parseHTMLMarker(s, i, "<![CDATA[", "]]>")
   411  }
   412  
   413  func parseHTMLDecl(p *parseState, s string, i int) (Inline, int, bool) {
   414  	// “A declaration consists of the string <!, an ASCII letter,
   415  	// zero or more characters not including the character >, and the character >.”
   416  	if i+2 < len(s) && isLetter(s[i+2]) {
   417  		if 'a' <= s[i+2] && s[i+2] <= 'z' {
   418  			p.corner = true // goldmark requires uppercase
   419  		}
   420  		return parseHTMLMarker(s, i, "<!", ">")
   421  	}
   422  	return nil, 0, false
   423  }
   424  
   425  func parseHTMLProcInst(s string, i int) (Inline, int, bool) {
   426  	// “A processing instruction consists of the string <?,
   427  	// a string of characters not including the string ?>, and the string ?>.”
   428  	return parseHTMLMarker(s, i, "<?", "?>")
   429  }
   430  
   431  func parseHTMLMarker(s string, i int, prefix, suffix string) (Inline, int, bool) {
   432  	if strings.HasPrefix(s[i:], prefix) {
   433  		if j := strings.Index(s[i+len(prefix):], suffix); j >= 0 {
   434  			end := i + len(prefix) + j + len(suffix)
   435  			return &HTMLTag{s[i:end]}, end, true
   436  		}
   437  	}
   438  	return nil, 0, false
   439  }
   440  
   441  func parseHTMLEntity(_ *parseState, s string, i int) (Inline, int, int, bool) {
   442  	start := i
   443  	if i+1 < len(s) && s[i+1] == '#' {
   444  		i += 2
   445  		var r, end int
   446  		if i < len(s) && (s[i] == 'x' || s[i] == 'X') {
   447  			// hex
   448  			i++
   449  			j := i
   450  			for j < len(s) && isHexDigit(s[j]) {
   451  				j++
   452  			}
   453  			if j-i < 1 || j-i > 6 || j >= len(s) || s[j] != ';' {
   454  				return nil, 0, 0, false
   455  			}
   456  			r64, _ := strconv.ParseInt(s[i:j], 16, 0)
   457  			r = int(r64)
   458  			end = j + 1
   459  		} else {
   460  			// decimal
   461  			j := i
   462  			for j < len(s) && isDigit(s[j]) {
   463  				j++
   464  			}
   465  			if j-i < 1 || j-i > 7 || j >= len(s) || s[j] != ';' {
   466  				return nil, 0, 0, false
   467  			}
   468  			r, _ = strconv.Atoi(s[i:j])
   469  			end = j + 1
   470  		}
   471  		if r > unicode.MaxRune || r == 0 {
   472  			r = unicode.ReplacementChar
   473  		}
   474  		return &Plain{string(rune(r))}, start, end, true
   475  	}
   476  
   477  	// Max name in list is 32 bytes. Try for 64 for good measure.
   478  	for j := i + 1; j < len(s) && j-i < 64; j++ {
   479  		if s[j] == '&' { // Stop possible quadratic search on &&&&&&&.
   480  			break
   481  		}
   482  		if s[j] == ';' {
   483  			if r, ok := htmlEntity[s[i:j+1]]; ok {
   484  				return &Plain{r}, start, j + 1, true
   485  			}
   486  			break
   487  		}
   488  	}
   489  
   490  	return nil, 0, 0, false
   491  }
   492  
// An HTMLTag is a span of raw inline HTML.
// Text is the exact source text of the construct; the parsers in this
// file store the full span from the opening '<' through the end marker.
type HTMLTag struct {
	Text string
}
   496  
// Inline is an empty method with no behavior.
// NOTE(review): presumably a marker method required by the Inline
// interface, which *HTMLTag values are returned as — confirm against the
// interface definition.
func (*HTMLTag) Inline() {}
   498  
// PrintHTML writes the tag's raw text to buf unchanged (no escaping).
func (x *HTMLTag) PrintHTML(buf *bytes.Buffer) {
	buf.WriteString(x.Text)
}
   502  
// printMarkdown writes the tag as Markdown, which is identical to its
// HTML form: the raw text is emitted as is.
func (x *HTMLTag) printMarkdown(buf *bytes.Buffer) {
	x.PrintHTML(buf)
}
   506  
// PrintText writes nothing: an HTML tag contributes no plain text.
func (x *HTMLTag) PrintText(buf *bytes.Buffer) {}
   508
View as plain text