parse.go

     1  // Copyright 2021 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package markdown
     6  
     7  import (
     8  	"bytes"
     9  	"fmt"
    10  	"reflect"
    11  	"slices"
    12  	"strings"
    13  )
    14  
    15  /*
    16  
    17  list block itself does not appear on stack?
    18  item does
    19  end of item returns block,
    20  new item continues previous block if possible?
    21  
    22  if close leaves lines or blocks behind, panic
    23  
    24  close(b a list item, parent)
    25  	if b's parent's last block is list && item can be added to it, do so
    26  	else return new list
    27  
    28  or maybe not parent but just current list of blocks
    29  
    30  preserve LinkRefDefs?
    31  
    32  */
    33  
    34  // Block is implemented by:
    35  //
    36  //	CodeBLock
    37  //	Document
    38  //	Empty
    39  //	HTMLBlock
    40  //	Heading
    41  //	Item
    42  //	List
    43  //	Paragraph
    44  //	Quote
    45  //	Text
    46  //	ThematicBreak
    47  type Block interface {
    48  	Pos() Position
    49  	PrintHTML(buf *bytes.Buffer)
    50  	printMarkdown(buf *bytes.Buffer, s mdState)
    51  }
    52  
    53  type mdState struct {
    54  	prefix  string
    55  	prefix1 string // for first line only
    56  	bullet  rune   // for list items
    57  	num     int    // for numbered list items
    58  }
    59  
    60  type Position struct {
    61  	StartLine int
    62  	EndLine   int
    63  }
    64  
    65  func (p Position) Pos() Position {
    66  	return p
    67  }
    68  
    69  type buildState interface {
    70  	blocks() []Block
    71  	pos() Position
    72  	last() Block
    73  	deleteLast()
    74  
    75  	link(label string) *Link
    76  	defineLink(label string, link *Link)
    77  	newText(pos Position, text string) *Text
    78  }
    79  
    80  type blockBuilder interface {
    81  	extend(p *parseState, s line) (line, bool)
    82  	build(buildState) Block
    83  }
    84  
    85  type openBlock struct {
    86  	builder blockBuilder
    87  	inner   []Block
    88  	pos     Position
    89  }
    90  
    91  type itemBuilder struct {
    92  	list        *listBuilder
    93  	width       int
    94  	haveContent bool
    95  }
    96  
    97  func (p *parseState) last() Block {
    98  	ob := &p.stack[len(p.stack)-1]
    99  	return ob.inner[len(ob.inner)-1]
   100  }
   101  
   102  func (p *parseState) deleteLast() {
   103  	ob := &p.stack[len(p.stack)-1]
   104  	ob.inner = ob.inner[:len(ob.inner)-1]
   105  }
   106  
   107  type Text struct {
   108  	Position
   109  	Inline []Inline
   110  	raw    string
   111  }
   112  
   113  func (b *Text) PrintHTML(buf *bytes.Buffer) {
   114  	for _, x := range b.Inline {
   115  		x.PrintHTML(buf)
   116  	}
   117  }
   118  
   119  func (b *Text) printMarkdown(buf *bytes.Buffer, s mdState) {
   120  	if s.prefix1 != "" {
   121  		buf.WriteString(s.prefix1)
   122  	} else {
   123  		buf.WriteString(s.prefix)
   124  	}
   125  	var prev Inline
   126  	for _, x := range b.Inline {
   127  		switch prev.(type) {
   128  		case *SoftBreak, *HardBreak:
   129  			buf.WriteString(s.prefix)
   130  		}
   131  		x.printMarkdown(buf)
   132  		prev = x
   133  	}
   134  	buf.WriteByte('\n')
   135  }
   136  
   137  type rootBuilder struct{}
   138  
   139  func (b *rootBuilder) build(p buildState) Block {
   140  	return &Document{p.pos(), p.blocks(), p.(*parseState).links}
   141  }
   142  
   143  type Document struct {
   144  	Position
   145  	Blocks []Block
   146  	Links  map[string]*Link
   147  }
   148  
   149  // A Parser is a Markdown parser.
   150  // The exported fields in the struct can be filled in before calling
   151  // [Parser.Parse] in order to customize the details of the parsing process.
   152  // A Parser is safe for concurrent use by multiple goroutines.
   153  type Parser struct {
   154  	// HeadingIDs determines whether the parser accepts
   155  	// the {#hdr} syntax for an HTML id="hdr" attribute on headings.
   156  	// For example, if HeadingIDs is true then the Markdown
   157  	//    ## Overview {#overview}
   158  	// will render as the HTML
   159  	//    <h2 id="overview">Overview</h2>
   160  	HeadingIDs bool
   161  
   162  	// Strikethrough determines whether the parser accepts
   163  	// ~abc~ and ~~abc~~ as strikethrough syntax, producing
   164  	// <del>abc</del> in HTML.
   165  	Strikethrough bool
   166  
   167  	// TaskListItems determines whether the parser accepts
   168  	// “task list items” as defined in GitHub Flavored Markdown.
   169  	// When a list item begins with the plain text [ ] or [x]
   170  	// that turns into an unchecked or checked check box.
   171  	TaskListItems bool
   172  
   173  	// TODO
   174  	AutoLinkText       bool
   175  	AutoLinkAssumeHTTP bool
   176  
   177  	// TODO
   178  	Table bool
   179  
   180  	// TODO
   181  	Emoji bool
   182  
   183  	// TODO
   184  	SmartDot   bool
   185  	SmartDash  bool
   186  	SmartQuote bool
   187  }
   188  
   189  type parseState struct {
   190  	*Parser
   191  
   192  	root      *Document
   193  	links     map[string]*Link
   194  	lineno    int
   195  	stack     []openBlock
   196  	lineDepth int
   197  
   198  	corner bool // noticed corner case to ignore in cross-implementation testing
   199  
   200  	// inlines
   201  	s       string
   202  	emitted int // s[:emitted] has been emitted into list
   203  	list    []Inline
   204  
   205  	// for fixup at end
   206  	lists []*List
   207  	texts []*Text
   208  
   209  	backticks backtickParser
   210  }
   211  
   212  func (p *parseState) newText(pos Position, text string) *Text {
   213  	b := &Text{Position: pos, raw: text}
   214  	p.texts = append(p.texts, b)
   215  	return b
   216  }
   217  
   218  func (p *parseState) blocks() []Block {
   219  	b := &p.stack[len(p.stack)-1]
   220  	return b.inner
   221  }
   222  
   223  func (p *parseState) pos() Position {
   224  	b := &p.stack[len(p.stack)-1]
   225  	return b.pos
   226  }
   227  
   228  func (p *Parser) Parse(text string) *Document {
   229  	d, _ := p.parse(text)
   230  	return d
   231  }
   232  
   233  func (p *Parser) parse(text string) (d *Document, corner bool) {
   234  	var ps parseState
   235  	ps.Parser = p
   236  	if strings.Contains(text, "\x00") {
   237  		text = strings.ReplaceAll(text, "\x00", "\uFFFD")
   238  		ps.corner = true // goldmark does not replace NUL
   239  	}
   240  
   241  	ps.lineDepth = -1
   242  	ps.addBlock(&rootBuilder{})
   243  	for text != "" {
   244  		var ln string
   245  		i := strings.Index(text, "\n")
   246  		j := strings.Index(text, "\r")
   247  		var nl byte
   248  		switch {
   249  		case j >= 0 && (i < 0 || j < i): // have \r, maybe \r\n
   250  			ln = text[:j]
   251  			if i == j+1 {
   252  				text = text[j+2:]
   253  				nl = '\r' + '\n'
   254  			} else {
   255  				text = text[j+1:]
   256  				nl = '\r'
   257  			}
   258  		case i >= 0:
   259  			ln, text = text[:i], text[i+1:]
   260  			nl = '\n'
   261  		default:
   262  			ln, text = text, ""
   263  		}
   264  		ps.lineno++
   265  		ps.addLine(line{text: ln, nl: nl})
   266  	}
   267  	ps.trimStack(0)
   268  
   269  	for _, t := range ps.texts {
   270  		t.Inline = ps.inline(t.raw)
   271  	}
   272  
   273  	if p.TaskListItems {
   274  		for _, list := range ps.lists {
   275  			ps.taskList(list)
   276  		}
   277  	}
   278  
   279  	return ps.root, ps.corner
   280  }
   281  
   282  func (p *parseState) curB() blockBuilder {
   283  	if p.lineDepth < len(p.stack) {
   284  		return p.stack[p.lineDepth].builder
   285  	}
   286  	return nil
   287  }
   288  
   289  func (p *parseState) nextB() blockBuilder {
   290  	if p.lineDepth+1 < len(p.stack) {
   291  		return p.stack[p.lineDepth+1].builder
   292  	}
   293  	return nil
   294  }
   295  func (p *parseState) trimStack(depth int) {
   296  	if len(p.stack) < depth {
   297  		panic("trimStack")
   298  	}
   299  	for len(p.stack) > depth {
   300  		p.closeBlock()
   301  	}
   302  }
   303  
   304  func (p *parseState) addBlock(c blockBuilder) {
   305  	p.trimStack(p.lineDepth + 1)
   306  	p.stack = append(p.stack, openBlock{})
   307  	ob := &p.stack[len(p.stack)-1]
   308  	ob.builder = c
   309  	ob.pos.StartLine = p.lineno
   310  	ob.pos.EndLine = p.lineno
   311  }
   312  
   313  func (p *parseState) doneBlock(b Block) {
   314  	p.trimStack(p.lineDepth + 1)
   315  	ob := &p.stack[len(p.stack)-1]
   316  	ob.inner = append(ob.inner, b)
   317  }
   318  
   319  func (p *parseState) para() *paraBuilder {
   320  	if b, ok := p.stack[len(p.stack)-1].builder.(*paraBuilder); ok {
   321  		return b
   322  	}
   323  	return nil
   324  }
   325  
   326  func (p *parseState) closeBlock() Block {
   327  	b := &p.stack[len(p.stack)-1]
   328  	if b.builder == nil {
   329  		println("closeBlock", len(p.stack)-1)
   330  	}
   331  	blk := b.builder.build(p)
   332  	if list, ok := blk.(*List); ok {
   333  		p.corner = p.corner || listCorner(list)
   334  		if p.TaskListItems {
   335  			p.lists = append(p.lists, list)
   336  		}
   337  	}
   338  	p.stack = p.stack[:len(p.stack)-1]
   339  	if len(p.stack) > 0 {
   340  		b := &p.stack[len(p.stack)-1]
   341  		b.inner = append(b.inner, blk)
   342  		// _ = b
   343  	} else {
   344  		p.root = blk.(*Document)
   345  	}
   346  	return blk
   347  }
   348  
   349  func (p *parseState) link(label string) *Link {
   350  	return p.links[label]
   351  }
   352  
   353  func (p *parseState) defineLink(label string, link *Link) {
   354  	if p.links == nil {
   355  		p.links = make(map[string]*Link)
   356  	}
   357  	p.links[label] = link
   358  }
   359  
   360  type line struct {
   361  	spaces int
   362  	i      int
   363  	tab    int
   364  	text   string
   365  	nl     byte // newline character ending this line: \r or \n or zero for EOF
   366  }
   367  
   368  func (p *parseState) addLine(s line) {
   369  	// Process continued prefixes.
   370  	p.lineDepth = 0
   371  	for ; p.lineDepth+1 < len(p.stack); p.lineDepth++ {
   372  		old := s
   373  		var ok bool
   374  		s, ok = p.stack[p.lineDepth+1].builder.extend(p, s)
   375  		if !old.isBlank() && (ok || s != old) {
   376  			p.stack[p.lineDepth+1].pos.EndLine = p.lineno
   377  		}
   378  		if !ok {
   379  			break
   380  		}
   381  	}
   382  
   383  	if s.isBlank() {
   384  		p.trimStack(p.lineDepth + 1)
   385  		return
   386  	}
   387  
   388  	// Process new prefixes, if any.
   389  Prefixes:
   390  	// Start new block inside p.stack[depth].
   391  	for _, fn := range news {
   392  		if l, ok := fn(p, s); ok {
   393  			s = l
   394  			if s.isBlank() {
   395  				return
   396  			}
   397  			p.lineDepth++
   398  			goto Prefixes
   399  		}
   400  	}
   401  
   402  	newPara(p, s)
   403  }
   404  
   405  func (c *rootBuilder) extend(p *parseState, s line) (line, bool) {
   406  	panic("root extend")
   407  }
   408  
   409  var news = []func(*parseState, line) (line, bool){
   410  	newQuote,
   411  	newATXHeading,
   412  	newSetextHeading,
   413  	newHR,
   414  	newListItem,
   415  	newHTML,
   416  	newFence,
   417  	newPre,
   418  }
   419  
   420  func (s *line) peek() byte {
   421  	if s.spaces > 0 {
   422  		return ' '
   423  	}
   424  	if s.i >= len(s.text) {
   425  		return 0
   426  	}
   427  	return s.text[s.i]
   428  }
   429  
   430  func (s *line) skipSpace() {
   431  	s.spaces = 0
   432  	for s.i < len(s.text) && (s.text[s.i] == ' ' || s.text[s.i] == '\t') {
   433  		s.i++
   434  	}
   435  }
   436  
   437  func (s *line) trimSpace(min, max int, eolOK bool) bool {
   438  	t := *s
   439  	for n := 0; n < max; n++ {
   440  		if t.spaces > 0 {
   441  			t.spaces--
   442  			continue
   443  		}
   444  		if t.i >= len(t.text) && eolOK {
   445  			continue
   446  		}
   447  		if t.i < len(t.text) {
   448  			switch t.text[t.i] {
   449  			case '\t':
   450  				t.spaces = 4 - (t.i-t.tab)&3 - 1
   451  				t.i++
   452  				t.tab = t.i
   453  				continue
   454  			case ' ':
   455  				t.i++
   456  				continue
   457  			}
   458  		}
   459  		if n >= min {
   460  			break
   461  		}
   462  		return false
   463  	}
   464  	*s = t
   465  	return true
   466  }
   467  
   468  func (s *line) trim(c byte) bool {
   469  	if s.spaces > 0 {
   470  		if c == ' ' {
   471  			s.spaces--
   472  			return true
   473  		}
   474  		return false
   475  	}
   476  	if s.i < len(s.text) && s.text[s.i] == c {
   477  		s.i++
   478  		return true
   479  	}
   480  	return false
   481  }
   482  
   483  func (s *line) string() string {
   484  	switch s.spaces {
   485  	case 0:
   486  		return s.text[s.i:]
   487  	case 1:
   488  		return " " + s.text[s.i:]
   489  	case 2:
   490  		return "  " + s.text[s.i:]
   491  	case 3:
   492  		return "   " + s.text[s.i:]
   493  	}
   494  	panic("bad spaces")
   495  }
   496  
   497  func trimLeftSpaceTab(s string) string {
   498  	i := 0
   499  	for i < len(s) && (s[i] == ' ' || s[i] == '\t') {
   500  		i++
   501  	}
   502  	return s[i:]
   503  }
   504  
   505  func trimRightSpaceTab(s string) string {
   506  	j := len(s)
   507  	for j > 0 && (s[j-1] == ' ' || s[j-1] == '\t') {
   508  		j--
   509  	}
   510  	return s[:j]
   511  }
   512  
   513  func trimSpaceTab(s string) string {
   514  	i := 0
   515  	for i < len(s) && (s[i] == ' ' || s[i] == '\t') {
   516  		i++
   517  	}
   518  	s = s[i:]
   519  	j := len(s)
   520  	for j > 0 && (s[j-1] == ' ' || s[j-1] == '\t') {
   521  		j--
   522  	}
   523  	return s[:j]
   524  }
   525  
   526  func trimSpace(s string) string {
   527  	i := 0
   528  	for i < len(s) && (s[i] == ' ' || s[i] == '\t') {
   529  		i++
   530  	}
   531  	s = s[i:]
   532  	j := len(s)
   533  	for j > 0 && (s[j-1] == ' ' || s[j-1] == '\t') {
   534  		j--
   535  	}
   536  	return s[:j]
   537  }
   538  
   539  func trimSpaceTabNewline(s string) string {
   540  	i := 0
   541  	for i < len(s) && (s[i] == ' ' || s[i] == '\t' || s[i] == '\n') {
   542  		i++
   543  	}
   544  	s = s[i:]
   545  	j := len(s)
   546  	for j > 0 && (s[j-1] == ' ' || s[j-1] == '\t' || s[j-1] == '\n') {
   547  		j--
   548  	}
   549  	return s[:j]
   550  }
   551  
   552  func (s *line) isBlank() bool {
   553  	return trimLeftSpaceTab(s.text[s.i:]) == ""
   554  }
   555  
   556  func (s *line) eof() bool {
   557  	return s.i >= len(s.text)
   558  }
   559  
   560  func (s *line) trimSpaceString() string {
   561  	return trimLeftSpaceTab(s.text[s.i:])
   562  }
   563  
   564  func (s *line) trimString() string {
   565  	return trimSpaceTab(s.text[s.i:])
   566  }
   567  
   568  func ToHTML(b Block) string {
   569  	var buf bytes.Buffer
   570  	b.PrintHTML(&buf)
   571  	return buf.String()
   572  }
   573  
   574  func ToMarkdown(b Block) string {
   575  	var buf bytes.Buffer
   576  	b.printMarkdown(&buf, mdState{})
   577  	s := buf.String()
   578  	// Remove final extra newline.
   579  	if strings.HasSuffix(s, "\n\n") {
   580  		s = s[:len(s)-1]
   581  	}
   582  	return s
   583  }
   584  
   585  func (b *Document) PrintHTML(buf *bytes.Buffer) {
   586  	for _, c := range b.Blocks {
   587  		c.PrintHTML(buf)
   588  	}
   589  }
   590  
   591  func (b *Document) printMarkdown(buf *bytes.Buffer, s mdState) {
   592  	printMarkdownBlocks(b.Blocks, buf, s)
   593  	// Print links sorted by keys for deterministic output.
   594  	var keys []string
   595  	for k := range b.Links {
   596  		keys = append(keys, k)
   597  	}
   598  	slices.Sort(keys)
   599  	for _, k := range keys {
   600  		l := b.Links[k]
   601  		fmt.Fprintf(buf, "[%s]: %s", k, l.URL)
   602  		printLinkTitleMarkdown(buf, l.Title, l.TitleChar)
   603  		buf.WriteByte('\n')
   604  	}
   605  }
   606  
   607  func printMarkdownBlocks(bs []Block, buf *bytes.Buffer, s mdState) {
   608  	prevEnd := 0
   609  	for _, b := range bs {
   610  		// Preserve blank lines between blocks.
   611  		if prevEnd > 0 {
   612  			for i := prevEnd + 1; i < b.Pos().StartLine; i++ {
   613  				buf.WriteString(trimRightSpaceTab(s.prefix))
   614  				buf.WriteByte('\n')
   615  			}
   616  		}
   617  		b.printMarkdown(buf, s)
   618  		prevEnd = b.Pos().EndLine
   619  		s.prefix1 = "" // item prefix only for first block
   620  	}
   621  }
   622  
   623  var (
   624  	blockType   = reflect.TypeOf(new(Block)).Elem()
   625  	blocksType  = reflect.TypeOf(new([]Block)).Elem()
   626  	inlinesType = reflect.TypeOf(new([]Inline)).Elem()
   627  )
   628  
   629  func printb(buf *bytes.Buffer, b Block, prefix string) {
   630  	fmt.Fprintf(buf, "(%T", b)
   631  	v := reflect.ValueOf(b)
   632  	v = reflect.Indirect(v)
   633  	if v.Kind() != reflect.Struct {
   634  		fmt.Fprintf(buf, " %v", b)
   635  	}
   636  	t := v.Type()
   637  	for i := 0; i < t.NumField(); i++ {
   638  		tf := t.Field(i)
   639  		if !tf.IsExported() {
   640  			continue
   641  		}
   642  		if tf.Type == inlinesType {
   643  			printis(buf, v.Field(i).Interface().([]Inline))
   644  		} else if tf.Type.Kind() == reflect.Slice && tf.Type.Elem().Kind() == reflect.String {
   645  			fmt.Fprintf(buf, " %s:%q", tf.Name, v.Field(i))
   646  		} else if tf.Type != blocksType && !tf.Type.Implements(blockType) && tf.Type.Kind() != reflect.Slice {
   647  			fmt.Fprintf(buf, " %s:%v", tf.Name, v.Field(i))
   648  		}
   649  	}
   650  
   651  	prefix += "\t"
   652  	for i := 0; i < t.NumField(); i++ {
   653  		tf := t.Field(i)
   654  		if !tf.IsExported() {
   655  			continue
   656  		}
   657  		if tf.Type.Implements(blockType) {
   658  			fmt.Fprintf(buf, "\n%s", prefix)
   659  			printb(buf, v.Field(i).Interface().(Block), prefix)
   660  		} else if tf.Type == blocksType {
   661  			vf := v.Field(i)
   662  			for i := 0; i < vf.Len(); i++ {
   663  				fmt.Fprintf(buf, "\n%s", prefix)
   664  				printb(buf, vf.Index(i).Interface().(Block), prefix)
   665  			}
   666  		} else if tf.Type.Kind() == reflect.Slice && tf.Type != inlinesType && tf.Type.Elem().Kind() != reflect.String {
   667  			fmt.Fprintf(buf, "\n%s%s:", prefix, t.Field(i).Name)
   668  			printslice(buf, v.Field(i), prefix)
   669  		}
   670  	}
   671  	fmt.Fprintf(buf, ")")
   672  }
   673  
   674  func printslice(buf *bytes.Buffer, v reflect.Value, prefix string) {
   675  	if v.Type().Elem().Kind() == reflect.Slice {
   676  		for i := 0; i < v.Len(); i++ {
   677  			fmt.Fprintf(buf, "\n%s#%d:", prefix, i)
   678  			printslice(buf, v.Index(i), prefix+"\t")
   679  		}
   680  		return
   681  	}
   682  	for i := 0; i < v.Len(); i++ {
   683  		fmt.Fprintf(buf, " ")
   684  		printb(buf, v.Index(i).Interface().(Block), prefix+"\t")
   685  	}
   686  }
   687  
   688  func printi(buf *bytes.Buffer, in Inline) {
   689  	fmt.Fprintf(buf, "%T(", in)
   690  	v := reflect.ValueOf(in).Elem()
   691  	text := v.FieldByName("Text")
   692  	if text.IsValid() {
   693  		fmt.Fprintf(buf, "%q", text)
   694  	}
   695  	inner := v.FieldByName("Inner")
   696  	if inner.IsValid() {
   697  		printis(buf, inner.Interface().([]Inline))
   698  	}
   699  	buf.WriteString(")")
   700  }
   701  
   702  func printis(buf *bytes.Buffer, ins []Inline) {
   703  	for _, in := range ins {
   704  		buf.WriteByte(' ')
   705  		printi(buf, in)
   706  	}
   707  }
   708  
   709  func dump(b Block) string {
   710  	var buf bytes.Buffer
   711  	printb(&buf, b, "")
   712  	return buf.String()
   713  }
   714
View as plain text