// Copyright 2023 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // Windows UTF-16 strings can contain unpaired surrogates, which can't be // decoded into a valid UTF-8 string. This file defines a set of functions // that can be used to encode and decode potentially ill-formed UTF-16 strings // by using the [the WTF-8 encoding](https://simonsapin.github.io/wtf-8/). // // WTF-8 is a strict superset of UTF-8, i.e. any string that is // well-formed in UTF-8 is also well-formed in WTF-8 and the content // is unchanged. Also, the conversion never fails and is lossless. // // The benefit of using WTF-8 instead of UTF-8 when decoding a UTF-16 string // is that the conversion is lossless even for ill-formed UTF-16 strings. // This property allows to read an ill-formed UTF-16 string, convert it // to a Go string, and convert it back to the same original UTF-16 string. // // See go.dev/issues/59971 for more info. package syscall import ( "unicode/utf16" "unicode/utf8" ) const ( surr1 = 0xd800 surr2 = 0xdc00 surr3 = 0xe000 tx = 0b10000000 t3 = 0b11100000 maskx = 0b00111111 mask3 = 0b00001111 rune1Max = 1<<7 - 1 rune2Max = 1<<11 - 1 ) // encodeWTF16 returns the potentially ill-formed // UTF-16 encoding of s. func encodeWTF16(s string, buf []uint16) []uint16 { for i := 0; i < len(s); { // Cannot use 'for range s' because it expects valid // UTF-8 runes. r, size := utf8.DecodeRuneInString(s[i:]) if r == utf8.RuneError { // Check if s[i:] contains a valid WTF-8 encoded surrogate. if sc := s[i:]; len(sc) >= 3 && sc[0] == 0xED && 0xA0 <= sc[1] && sc[1] <= 0xBF && 0x80 <= sc[2] && sc[2] <= 0xBF { r = rune(sc[0]&mask3)<<12 + rune(sc[1]&maskx)<<6 + rune(sc[2]&maskx) buf = append(buf, uint16(r)) i += 3 continue } } i += size buf = utf16.AppendRune(buf, r) } return buf } // decodeWTF16 returns the WTF-8 encoding of // the potentially ill-formed UTF-16 s. func decodeWTF16(s []uint16, buf []byte) []byte { for i := 0; i < len(s); i++ { var ar rune switch r := s[i]; { case r < surr1, surr3 <= r: // normal rune ar = rune(r) case surr1 <= r && r < surr2 && i+1 < len(s) && surr2 <= s[i+1] && s[i+1] < surr3: // valid surrogate sequence ar = utf16.DecodeRune(rune(r), rune(s[i+1])) i++ default: // WTF-8 fallback. // This only handles the 3-byte case of utf8.AppendRune, // as surrogates always fall in that case. ar = rune(r) if ar > utf8.MaxRune { ar = utf8.RuneError } buf = append(buf, t3|byte(ar>>12), tx|byte(ar>>6)&maskx, tx|byte(ar)&maskx) continue } buf = utf8.AppendRune(buf, ar) } return buf }