Source file src/strconv/quote.go

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:generate go run makeisprint.go -output isprint.go
     6  
     7  package strconv
     8  
     9  import (
    10  	"unicode/utf8"
    11  )
    12  
// Hexadecimal digit alphabets, indexed by a nibble value in the range 0-15.
const (
	// lowerhex supplies the lower-case digits used when this file emits
	// \x, \u and \U escapes.
	lowerhex = "0123456789abcdef"
	// upperhex is the upper-case counterpart. It is not referenced by the
	// code visible in this file; presumably it is used elsewhere in the
	// package (TODO confirm).
	upperhex = "0123456789ABCDEF"
)
    17  
    18  // contains reports whether the string contains the byte c.
    19  func contains(s string, c byte) bool {
    20  	return index(s, c) != -1
    21  }
    22  
    23  func quoteWith(s string, quote byte, ASCIIonly, graphicOnly bool) string {
    24  	return string(appendQuotedWith(make([]byte, 0, 3*len(s)/2), s, quote, ASCIIonly, graphicOnly))
    25  }
    26  
    27  func quoteRuneWith(r rune, quote byte, ASCIIonly, graphicOnly bool) string {
    28  	return string(appendQuotedRuneWith(nil, r, quote, ASCIIonly, graphicOnly))
    29  }
    30  
    31  func appendQuotedWith(buf []byte, s string, quote byte, ASCIIonly, graphicOnly bool) []byte {
    32  	// Often called with big strings, so preallocate. If there's quoting,
    33  	// this is conservative but still helps a lot.
    34  	if cap(buf)-len(buf) < len(s) {
    35  		nBuf := make([]byte, len(buf), len(buf)+1+len(s)+1)
    36  		copy(nBuf, buf)
    37  		buf = nBuf
    38  	}
    39  	buf = append(buf, quote)
    40  	for width := 0; len(s) > 0; s = s[width:] {
    41  		r := rune(s[0])
    42  		width = 1
    43  		if r >= utf8.RuneSelf {
    44  			r, width = utf8.DecodeRuneInString(s)
    45  		}
    46  		if width == 1 && r == utf8.RuneError {
    47  			buf = append(buf, `\x`...)
    48  			buf = append(buf, lowerhex[s[0]>>4])
    49  			buf = append(buf, lowerhex[s[0]&0xF])
    50  			continue
    51  		}
    52  		buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly)
    53  	}
    54  	buf = append(buf, quote)
    55  	return buf
    56  }
    57  
    58  func appendQuotedRuneWith(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte {
    59  	buf = append(buf, quote)
    60  	if !utf8.ValidRune(r) {
    61  		r = utf8.RuneError
    62  	}
    63  	buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly)
    64  	buf = append(buf, quote)
    65  	return buf
    66  }
    67  
    68  func appendEscapedRune(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte {
    69  	if r == rune(quote) || r == '\\' { // always backslashed
    70  		buf = append(buf, '\\')
    71  		buf = append(buf, byte(r))
    72  		return buf
    73  	}
    74  	if ASCIIonly {
    75  		if r < utf8.RuneSelf && IsPrint(r) {
    76  			buf = append(buf, byte(r))
    77  			return buf
    78  		}
    79  	} else if IsPrint(r) || graphicOnly && isInGraphicList(r) {
    80  		return utf8.AppendRune(buf, r)
    81  	}
    82  	switch r {
    83  	case '\a':
    84  		buf = append(buf, `\a`...)
    85  	case '\b':
    86  		buf = append(buf, `\b`...)
    87  	case '\f':
    88  		buf = append(buf, `\f`...)
    89  	case '\n':
    90  		buf = append(buf, `\n`...)
    91  	case '\r':
    92  		buf = append(buf, `\r`...)
    93  	case '\t':
    94  		buf = append(buf, `\t`...)
    95  	case '\v':
    96  		buf = append(buf, `\v`...)
    97  	default:
    98  		switch {
    99  		case r < ' ' || r == 0x7f:
   100  			buf = append(buf, `\x`...)
   101  			buf = append(buf, lowerhex[byte(r)>>4])
   102  			buf = append(buf, lowerhex[byte(r)&0xF])
   103  		case !utf8.ValidRune(r):
   104  			r = 0xFFFD
   105  			fallthrough
   106  		case r < 0x10000:
   107  			buf = append(buf, `\u`...)
   108  			for s := 12; s >= 0; s -= 4 {
   109  				buf = append(buf, lowerhex[r>>uint(s)&0xF])
   110  			}
   111  		default:
   112  			buf = append(buf, `\U`...)
   113  			for s := 28; s >= 0; s -= 4 {
   114  				buf = append(buf, lowerhex[r>>uint(s)&0xF])
   115  			}
   116  		}
   117  	}
   118  	return buf
   119  }
   120  
   121  // Quote returns a double-quoted Go string literal representing s. The
   122  // returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
   123  // control characters and non-printable characters as defined by
   124  // [IsPrint].
   125  func Quote(s string) string {
   126  	return quoteWith(s, '"', false, false)
   127  }
   128  
   129  // AppendQuote appends a double-quoted Go string literal representing s,
   130  // as generated by [Quote], to dst and returns the extended buffer.
   131  func AppendQuote(dst []byte, s string) []byte {
   132  	return appendQuotedWith(dst, s, '"', false, false)
   133  }
   134  
   135  // QuoteToASCII returns a double-quoted Go string literal representing s.
   136  // The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
   137  // non-ASCII characters and non-printable characters as defined by [IsPrint].
   138  func QuoteToASCII(s string) string {
   139  	return quoteWith(s, '"', true, false)
   140  }
   141  
   142  // AppendQuoteToASCII appends a double-quoted Go string literal representing s,
   143  // as generated by [QuoteToASCII], to dst and returns the extended buffer.
   144  func AppendQuoteToASCII(dst []byte, s string) []byte {
   145  	return appendQuotedWith(dst, s, '"', true, false)
   146  }
   147  
   148  // QuoteToGraphic returns a double-quoted Go string literal representing s.
   149  // The returned string leaves Unicode graphic characters, as defined by
   150  // [IsGraphic], unchanged and uses Go escape sequences (\t, \n, \xFF, \u0100)
   151  // for non-graphic characters.
   152  func QuoteToGraphic(s string) string {
   153  	return quoteWith(s, '"', false, true)
   154  }
   155  
   156  // AppendQuoteToGraphic appends a double-quoted Go string literal representing s,
   157  // as generated by [QuoteToGraphic], to dst and returns the extended buffer.
   158  func AppendQuoteToGraphic(dst []byte, s string) []byte {
   159  	return appendQuotedWith(dst, s, '"', false, true)
   160  }
   161  
   162  // QuoteRune returns a single-quoted Go character literal representing the
   163  // rune. The returned string uses Go escape sequences (\t, \n, \xFF, \u0100)
   164  // for control characters and non-printable characters as defined by [IsPrint].
   165  // If r is not a valid Unicode code point, it is interpreted as the Unicode
   166  // replacement character U+FFFD.
   167  func QuoteRune(r rune) string {
   168  	return quoteRuneWith(r, '\'', false, false)
   169  }
   170  
   171  // AppendQuoteRune appends a single-quoted Go character literal representing the rune,
   172  // as generated by [QuoteRune], to dst and returns the extended buffer.
   173  func AppendQuoteRune(dst []byte, r rune) []byte {
   174  	return appendQuotedRuneWith(dst, r, '\'', false, false)
   175  }
   176  
   177  // QuoteRuneToASCII returns a single-quoted Go character literal representing
   178  // the rune. The returned string uses Go escape sequences (\t, \n, \xFF,
   179  // \u0100) for non-ASCII characters and non-printable characters as defined
   180  // by [IsPrint].
   181  // If r is not a valid Unicode code point, it is interpreted as the Unicode
   182  // replacement character U+FFFD.
   183  func QuoteRuneToASCII(r rune) string {
   184  	return quoteRuneWith(r, '\'', true, false)
   185  }
   186  
   187  // AppendQuoteRuneToASCII appends a single-quoted Go character literal representing the rune,
   188  // as generated by [QuoteRuneToASCII], to dst and returns the extended buffer.
   189  func AppendQuoteRuneToASCII(dst []byte, r rune) []byte {
   190  	return appendQuotedRuneWith(dst, r, '\'', true, false)
   191  }
   192  
   193  // QuoteRuneToGraphic returns a single-quoted Go character literal representing
   194  // the rune. If the rune is not a Unicode graphic character,
   195  // as defined by [IsGraphic], the returned string will use a Go escape sequence
   196  // (\t, \n, \xFF, \u0100).
   197  // If r is not a valid Unicode code point, it is interpreted as the Unicode
   198  // replacement character U+FFFD.
   199  func QuoteRuneToGraphic(r rune) string {
   200  	return quoteRuneWith(r, '\'', false, true)
   201  }
   202  
   203  // AppendQuoteRuneToGraphic appends a single-quoted Go character literal representing the rune,
   204  // as generated by [QuoteRuneToGraphic], to dst and returns the extended buffer.
   205  func AppendQuoteRuneToGraphic(dst []byte, r rune) []byte {
   206  	return appendQuotedRuneWith(dst, r, '\'', false, true)
   207  }
   208  
   209  // CanBackquote reports whether the string s can be represented
   210  // unchanged as a single-line backquoted string without control
   211  // characters other than tab.
   212  func CanBackquote(s string) bool {
   213  	for len(s) > 0 {
   214  		r, wid := utf8.DecodeRuneInString(s)
   215  		s = s[wid:]
   216  		if wid > 1 {
   217  			if r == '\ufeff' {
   218  				return false // BOMs are invisible and should not be quoted.
   219  			}
   220  			continue // All other multibyte runes are correctly encoded and assumed printable.
   221  		}
   222  		if r == utf8.RuneError {
   223  			return false
   224  		}
   225  		if (r < ' ' && r != '\t') || r == '`' || r == '\u007F' {
   226  			return false
   227  		}
   228  	}
   229  	return true
   230  }
   231  
   232  func unhex(b byte) (v rune, ok bool) {
   233  	c := rune(b)
   234  	switch {
   235  	case '0' <= c && c <= '9':
   236  		return c - '0', true
   237  	case 'a' <= c && c <= 'f':
   238  		return c - 'a' + 10, true
   239  	case 'A' <= c && c <= 'F':
   240  		return c - 'A' + 10, true
   241  	}
   242  	return
   243  }
   244  
   245  // UnquoteChar decodes the first character or byte in the escaped string
   246  // or character literal represented by the string s.
   247  // It returns four values:
   248  //
   249  //  1. value, the decoded Unicode code point or byte value;
   250  //  2. multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation;
   251  //  3. tail, the remainder of the string after the character; and
   252  //  4. an error that will be nil if the character is syntactically valid.
   253  //
   254  // The second argument, quote, specifies the type of literal being parsed
   255  // and therefore which escaped quote character is permitted.
   256  // If set to a single quote, it permits the sequence \' and disallows unescaped '.
   257  // If set to a double quote, it permits \" and disallows unescaped ".
   258  // If set to zero, it does not permit either escape and allows both quote characters to appear unescaped.
   259  func UnquoteChar(s string, quote byte) (value rune, multibyte bool, tail string, err error) {
   260  	// easy cases
   261  	if len(s) == 0 {
   262  		err = ErrSyntax
   263  		return
   264  	}
   265  	switch c := s[0]; {
   266  	case c == quote && (quote == '\'' || quote == '"'):
   267  		err = ErrSyntax
   268  		return
   269  	case c >= utf8.RuneSelf:
   270  		r, size := utf8.DecodeRuneInString(s)
   271  		return r, true, s[size:], nil
   272  	case c != '\\':
   273  		return rune(s[0]), false, s[1:], nil
   274  	}
   275  
   276  	// hard case: c is backslash
   277  	if len(s) <= 1 {
   278  		err = ErrSyntax
   279  		return
   280  	}
   281  	c := s[1]
   282  	s = s[2:]
   283  
   284  	switch c {
   285  	case 'a':
   286  		value = '\a'
   287  	case 'b':
   288  		value = '\b'
   289  	case 'f':
   290  		value = '\f'
   291  	case 'n':
   292  		value = '\n'
   293  	case 'r':
   294  		value = '\r'
   295  	case 't':
   296  		value = '\t'
   297  	case 'v':
   298  		value = '\v'
   299  	case 'x', 'u', 'U':
   300  		n := 0
   301  		switch c {
   302  		case 'x':
   303  			n = 2
   304  		case 'u':
   305  			n = 4
   306  		case 'U':
   307  			n = 8
   308  		}
   309  		var v rune
   310  		if len(s) < n {
   311  			err = ErrSyntax
   312  			return
   313  		}
   314  		for j := 0; j < n; j++ {
   315  			x, ok := unhex(s[j])
   316  			if !ok {
   317  				err = ErrSyntax
   318  				return
   319  			}
   320  			v = v<<4 | x
   321  		}
   322  		s = s[n:]
   323  		if c == 'x' {
   324  			// single-byte string, possibly not UTF-8
   325  			value = v
   326  			break
   327  		}
   328  		if !utf8.ValidRune(v) {
   329  			err = ErrSyntax
   330  			return
   331  		}
   332  		value = v
   333  		multibyte = true
   334  	case '0', '1', '2', '3', '4', '5', '6', '7':
   335  		v := rune(c) - '0'
   336  		if len(s) < 2 {
   337  			err = ErrSyntax
   338  			return
   339  		}
   340  		for j := 0; j < 2; j++ { // one digit already; two more
   341  			x := rune(s[j]) - '0'
   342  			if x < 0 || x > 7 {
   343  				err = ErrSyntax
   344  				return
   345  			}
   346  			v = (v << 3) | x
   347  		}
   348  		s = s[2:]
   349  		if v > 255 {
   350  			err = ErrSyntax
   351  			return
   352  		}
   353  		value = v
   354  	case '\\':
   355  		value = '\\'
   356  	case '\'', '"':
   357  		if c != quote {
   358  			err = ErrSyntax
   359  			return
   360  		}
   361  		value = rune(c)
   362  	default:
   363  		err = ErrSyntax
   364  		return
   365  	}
   366  	tail = s
   367  	return
   368  }
   369  
   370  // QuotedPrefix returns the quoted string (as understood by [Unquote]) at the prefix of s.
   371  // If s does not start with a valid quoted string, QuotedPrefix returns an error.
   372  func QuotedPrefix(s string) (string, error) {
   373  	out, _, err := unquote(s, false)
   374  	return out, err
   375  }
   376  
   377  // Unquote interprets s as a single-quoted, double-quoted,
   378  // or backquoted Go string literal, returning the string value
   379  // that s quotes.  (If s is single-quoted, it would be a Go
   380  // character literal; Unquote returns the corresponding
   381  // one-character string. For an empty character literal
   382  // Unquote returns the empty string.)
   383  func Unquote(s string) (string, error) {
   384  	out, rem, err := unquote(s, true)
   385  	if len(rem) > 0 {
   386  		return "", ErrSyntax
   387  	}
   388  	return out, err
   389  }
   390  
// unquote parses a quoted string at the start of the input,
// returning the parsed prefix, the remaining suffix, and any parse errors.
// If unescape is true, the parsed prefix is unescaped,
// otherwise the input prefix is provided verbatim.
//
// The first byte of in selects the literal form: a backquote starts a raw
// string, while a double or single quote starts an interpreted string or a
// character literal. Any other first byte is a syntax error. On every
// failure the results are "", the original input, and ErrSyntax.
func unquote(in string, unescape bool) (out, rem string, err error) {
	// Determine the quote form and optimistically find the terminating quote.
	// The shortest possible literal is an opening plus a closing quote.
	if len(in) < 2 {
		return "", in, ErrSyntax
	}
	quote := in[0]
	end := index(in[1:], quote)
	if end < 0 {
		return "", in, ErrSyntax
	}
	end += 2 // position after terminating quote; may be wrong if escape sequences are present

	switch quote {
	case '`':
		// Raw strings have no escapes, so end is exact here.
		switch {
		case !unescape:
			out = in[:end] // include quotes
		case !contains(in[:end], '\r'):
			out = in[len("`") : end-len("`")] // exclude quotes
		default:
			// Carriage return characters ('\r') inside raw string literals
			// are discarded from the raw string value.
			// At least one '\r' is present, so the capacity below is an
			// upper bound on the result length.
			buf := make([]byte, 0, end-len("`")-len("\r")-len("`"))
			for i := len("`"); i < end-len("`"); i++ {
				if in[i] != '\r' {
					buf = append(buf, in[i])
				}
			}
			out = string(buf)
		}
		// NOTE: Prior implementations did not verify that raw strings consist
		// of valid UTF-8 characters and we continue to not verify it as such.
		// The Go specification does not explicitly require valid UTF-8,
		// but only mention that it is implicitly valid for Go source code
		// (which must be valid UTF-8).
		return out, in[end:], nil
	case '"', '\'':
		// Handle quoted strings without any escape sequences.
		// With no backslash before the first matching quote, that quote
		// really is the terminator, so in[:end] is the whole literal.
		if !contains(in[:end], '\\') && !contains(in[:end], '\n') {
			var valid bool
			switch quote {
			case '"':
				valid = utf8.ValidString(in[len(`"`) : end-len(`"`)])
			case '\'':
				// The decoded rune must span everything between the
				// quotes, and must not be a one-byte RuneError
				// (an invalid UTF-8 byte).
				r, n := utf8.DecodeRuneInString(in[len("'") : end-len("'")])
				valid = len("'")+n+len("'") == end && (r != utf8.RuneError || n != 1)
			}
			if valid {
				out = in[:end]
				if unescape {
					out = out[1 : end-1] // exclude quotes
				}
				return out, in[end:], nil
			}
			// Otherwise fall through to the character-by-character path.
		}

		// Handle quoted strings with escape sequences.
		var buf []byte
		in0 := in   // original input, used for error results and the verbatim prefix
		in = in[1:] // skip starting quote
		if unescape {
			buf = make([]byte, 0, 3*end/2) // try to avoid more allocations
		}
		for len(in) > 0 && in[0] != quote {
			// Process the next character,
			// rejecting any unescaped newline characters which are invalid.
			// Note that rem and err declared here shadow the named results.
			r, multibyte, rem, err := UnquoteChar(in, quote)
			if in[0] == '\n' || err != nil {
				return "", in0, ErrSyntax
			}
			in = rem

			// Append the character if unescaping the input.
			if unescape {
				// Values that are ASCII or not flagged multibyte are
				// stored as a single raw byte; the rest are UTF-8 encoded.
				if r < utf8.RuneSelf || !multibyte {
					buf = append(buf, byte(r))
				} else {
					buf = utf8.AppendRune(buf, r)
				}
			}

			// Single quoted strings must be a single character.
			if quote == '\'' {
				break
			}
		}

		// Verify that the string ends with a terminating quote.
		if !(len(in) > 0 && in[0] == quote) {
			return "", in0, ErrSyntax
		}
		in = in[1:] // skip terminating quote

		if unescape {
			return string(buf), in, nil
		}
		// Verbatim mode: return exactly the bytes consumed from in0.
		return in0[:len(in0)-len(in)], in, nil
	default:
		return "", in, ErrSyntax
	}
}
   496  
   497  // bsearch is semantically the same as [slices.BinarySearch] (without NaN checks)
   498  // We copied this function because we can not import "slices" here.
   499  func bsearch[S ~[]E, E ~uint16 | ~uint32](s S, v E) (int, bool) {
   500  	n := len(s)
   501  	i, j := 0, n
   502  	for i < j {
   503  		h := i + (j-i)>>1
   504  		if s[h] < v {
   505  			i = h + 1
   506  		} else {
   507  			j = h
   508  		}
   509  	}
   510  	return i, i < n && s[i] == v
   511  }
   512  
   513  // TODO: IsPrint is a local implementation of unicode.IsPrint, verified by the tests
   514  // to give the same answer. It allows this package not to depend on unicode,
   515  // and therefore not pull in all the Unicode tables. If the linker were better
   516  // at tossing unused tables, we could get rid of this implementation.
   517  // That would be nice.
   518  
   519  // IsPrint reports whether the rune is defined as printable by Go, with
   520  // the same definition as [unicode.IsPrint]: letters, numbers, punctuation,
   521  // symbols and ASCII space.
   522  func IsPrint(r rune) bool {
   523  	// Fast check for Latin-1
   524  	if r <= 0xFF {
   525  		if 0x20 <= r && r <= 0x7E {
   526  			// All the ASCII is printable from space through DEL-1.
   527  			return true
   528  		}
   529  		if 0xA1 <= r && r <= 0xFF {
   530  			// Similarly for ¡ through ÿ...
   531  			return r != 0xAD // ...except for the bizarre soft hyphen.
   532  		}
   533  		return false
   534  	}
   535  
   536  	// Same algorithm, either on uint16 or uint32 value.
   537  	// First, find first i such that isPrint[i] >= x.
   538  	// This is the index of either the start or end of a pair that might span x.
   539  	// The start is even (isPrint[i&^1]) and the end is odd (isPrint[i|1]).
   540  	// If we find x in a range, make sure x is not in isNotPrint list.
   541  
   542  	if 0 <= r && r < 1<<16 {
   543  		rr, isPrint, isNotPrint := uint16(r), isPrint16, isNotPrint16
   544  		i, _ := bsearch(isPrint, rr)
   545  		if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
   546  			return false
   547  		}
   548  		_, found := bsearch(isNotPrint, rr)
   549  		return !found
   550  	}
   551  
   552  	rr, isPrint, isNotPrint := uint32(r), isPrint32, isNotPrint32
   553  	i, _ := bsearch(isPrint, rr)
   554  	if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
   555  		return false
   556  	}
   557  	if r >= 0x20000 {
   558  		return true
   559  	}
   560  	r -= 0x10000
   561  	_, found := bsearch(isNotPrint, uint16(r))
   562  	return !found
   563  }
   564  
   565  // IsGraphic reports whether the rune is defined as a Graphic by Unicode. Such
   566  // characters include letters, marks, numbers, punctuation, symbols, and
   567  // spaces, from categories L, M, N, P, S, and Zs.
   568  func IsGraphic(r rune) bool {
   569  	if IsPrint(r) {
   570  		return true
   571  	}
   572  	return isInGraphicList(r)
   573  }
   574  
   575  // isInGraphicList reports whether the rune is in the isGraphic list. This separation
   576  // from IsGraphic allows quoteWith to avoid two calls to IsPrint.
   577  // Should be called only if IsPrint fails.
   578  func isInGraphicList(r rune) bool {
   579  	// We know r must fit in 16 bits - see makeisprint.go.
   580  	if r > 0xFFFF {
   581  		return false
   582  	}
   583  	_, found := bsearch(isGraphic, uint16(r))
   584  	return found
   585  }
   586  

View as plain text