Source file src/syscall/wtf8_windows.go

     1  // Copyright 2023 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Windows UTF-16 strings can contain unpaired surrogates, which can't be
     6  // decoded into a valid UTF-8 string. This file defines a set of functions
     7  // that can be used to encode and decode potentially ill-formed UTF-16 strings
     8  // by using [the WTF-8 encoding](https://simonsapin.github.io/wtf-8/).
     9  //
    10  // WTF-8 is a strict superset of UTF-8, i.e. any string that is
    11  // well-formed in UTF-8 is also well-formed in WTF-8 and the content
    12  // is unchanged. Also, the conversion never fails and is lossless.
    13  //
    14  // The benefit of using WTF-8 instead of UTF-8 when decoding a UTF-16 string
    15  // is that the conversion is lossless even for ill-formed UTF-16 strings.
    16  // This property makes it possible to read an ill-formed UTF-16 string, convert it
    17  // to a Go string, and convert it back to the same original UTF-16 string.
    18  //
    19  // See go.dev/issues/59971 for more info.
    20  
    21  package syscall
    22  
    23  import (
    24  	"unicode/utf16"
    25  	"unicode/utf8"
    26  )
    27  
const (
	// Boundaries of the UTF-16 surrogate range as used by decodeWTF16:
	// [surr1, surr2) holds the first half of a surrogate pair,
	// [surr2, surr3) holds the second half, and surr3 is the first
	// value past the surrogate range.
	surr1 = 0xd800
	surr2 = 0xdc00
	surr3 = 0xe000

	// Bit patterns used to build and parse the 3-byte encoding of a
	// lone surrogate.
	tx    = 0b10000000 // marker bits of a continuation byte
	t3    = 0b11100000 // marker bits of the first byte of a 3-byte sequence
	maskx = 0b00111111 // payload bits (low 6) of a continuation byte
	mask3 = 0b00001111 // payload bits (low 4) of the first byte of a 3-byte sequence

	// Largest values that fit in 7 and 11 bits respectively.
	// NOTE(review): not referenced by the functions visible in this file;
	// presumably they mirror the 1- and 2-byte limits in unicode/utf8 —
	// confirm before relying on or removing them.
	rune1Max = 1<<7 - 1
	rune2Max = 1<<11 - 1
)
    41  
    42  // encodeWTF16 returns the potentially ill-formed
    43  // UTF-16 encoding of s.
    44  func encodeWTF16(s string, buf []uint16) []uint16 {
    45  	for i := 0; i < len(s); {
    46  		// Cannot use 'for range s' because it expects valid
    47  		// UTF-8 runes.
    48  		r, size := utf8.DecodeRuneInString(s[i:])
    49  		if r == utf8.RuneError {
    50  			// Check if s[i:] contains a valid WTF-8 encoded surrogate.
    51  			if sc := s[i:]; len(sc) >= 3 && sc[0] == 0xED && 0xA0 <= sc[1] && sc[1] <= 0xBF && 0x80 <= sc[2] && sc[2] <= 0xBF {
    52  				r = rune(sc[0]&mask3)<<12 + rune(sc[1]&maskx)<<6 + rune(sc[2]&maskx)
    53  				buf = append(buf, uint16(r))
    54  				i += 3
    55  				continue
    56  			}
    57  		}
    58  		i += size
    59  		buf = utf16.AppendRune(buf, r)
    60  	}
    61  	return buf
    62  }
    63  
    64  // decodeWTF16 returns the WTF-8 encoding of
    65  // the potentially ill-formed UTF-16 s.
    66  func decodeWTF16(s []uint16, buf []byte) []byte {
    67  	for i := 0; i < len(s); i++ {
    68  		var ar rune
    69  		switch r := s[i]; {
    70  		case r < surr1, surr3 <= r:
    71  			// normal rune
    72  			ar = rune(r)
    73  		case surr1 <= r && r < surr2 && i+1 < len(s) &&
    74  			surr2 <= s[i+1] && s[i+1] < surr3:
    75  			// valid surrogate sequence
    76  			ar = utf16.DecodeRune(rune(r), rune(s[i+1]))
    77  			i++
    78  		default:
    79  			// WTF-8 fallback.
    80  			// This only handles the 3-byte case of utf8.AppendRune,
    81  			// as surrogates always fall in that case.
    82  			ar = rune(r)
    83  			if ar > utf8.MaxRune {
    84  				ar = utf8.RuneError
    85  			}
    86  			buf = append(buf, t3|byte(ar>>12), tx|byte(ar>>6)&maskx, tx|byte(ar)&maskx)
    87  			continue
    88  		}
    89  		buf = utf8.AppendRune(buf, ar)
    90  	}
    91  	return buf
    92  }
    93  

View as plain text