p256_asm.go

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // This file contains the Go wrapper for the constant-time, 64-bit assembly
     6  // implementation of P256. The optimizations performed here are described in
     7  // detail in:
     8  // S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
     9  //                          256-bit primes"
    10  // https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
    11  // https://eprint.iacr.org/2013/816.pdf
    12  
    13  //go:build amd64 || arm64 || ppc64le || s390x
    14  
    15  package nistec
    16  
    17  import (
    18  	_ "embed"
    19  	"encoding/binary"
    20  	"errors"
    21  	"math/bits"
    22  	"runtime"
    23  	"unsafe"
    24  )
    25  
    26  // p256Element is a P-256 base field element in [0, P-1] in the Montgomery
    27  // domain (with R 2²⁵⁶) as four limbs in little-endian order value.
    28  type p256Element [4]uint64
    29  
    30  // p256One is one in the Montgomery domain.
    31  var p256One = p256Element{0x0000000000000001, 0xffffffff00000000,
    32  	0xffffffffffffffff, 0x00000000fffffffe}
    33  
    34  var p256Zero = p256Element{}
    35  
    36  // p256P is 2²⁵⁶ - 2²²⁴ + 2¹⁹² + 2⁹⁶ - 1 in the Montgomery domain.
    37  var p256P = p256Element{0xffffffffffffffff, 0x00000000ffffffff,
    38  	0x0000000000000000, 0xffffffff00000001}
    39  
    40  // P256Point is a P-256 point. The zero value should not be assumed to be valid
    41  // (although it is in this implementation).
    42  type P256Point struct {
    43  	// (X:Y:Z) are Jacobian coordinates where x = X/Z² and y = Y/Z³. The point
    44  	// at infinity can be represented by any set of coordinates with Z = 0.
    45  	x, y, z p256Element
    46  }
    47  
    48  // NewP256Point returns a new P256Point representing the point at infinity.
    49  func NewP256Point() *P256Point {
    50  	return &P256Point{
    51  		x: p256One, y: p256One, z: p256Zero,
    52  	}
    53  }
    54  
    55  // SetGenerator sets p to the canonical generator and returns p.
    56  func (p *P256Point) SetGenerator() *P256Point {
    57  	p.x = p256Element{0x79e730d418a9143c, 0x75ba95fc5fedb601,
    58  		0x79fb732b77622510, 0x18905f76a53755c6}
    59  	p.y = p256Element{0xddf25357ce95560a, 0x8b4ab8e4ba19e45c,
    60  		0xd2e88688dd21f325, 0x8571ff1825885d85}
    61  	p.z = p256One
    62  	return p
    63  }
    64  
    65  // Set sets p = q and returns p.
    66  func (p *P256Point) Set(q *P256Point) *P256Point {
    67  	p.x, p.y, p.z = q.x, q.y, q.z
    68  	return p
    69  }
    70  
    71  const p256ElementLength = 32
    72  const p256UncompressedLength = 1 + 2*p256ElementLength
    73  const p256CompressedLength = 1 + p256ElementLength
    74  
    75  // SetBytes sets p to the compressed, uncompressed, or infinity value encoded in
    76  // b, as specified in SEC 1, Version 2.0, Section 2.3.4. If the point is not on
    77  // the curve, it returns nil and an error, and the receiver is unchanged.
    78  // Otherwise, it returns p.
    79  func (p *P256Point) SetBytes(b []byte) (*P256Point, error) {
    80  	// p256Mul operates in the Montgomery domain with R = 2²⁵⁶ mod p. Thus rr
    81  	// here is R in the Montgomery domain, or R×R mod p. See comment in
    82  	// P256OrdInverse about how this is used.
    83  	rr := p256Element{0x0000000000000003, 0xfffffffbffffffff,
    84  		0xfffffffffffffffe, 0x00000004fffffffd}
    85  
    86  	switch {
    87  	// Point at infinity.
    88  	case len(b) == 1 && b[0] == 0:
    89  		return p.Set(NewP256Point()), nil
    90  
    91  	// Uncompressed form.
    92  	case len(b) == p256UncompressedLength && b[0] == 4:
    93  		var r P256Point
    94  		p256BigToLittle(&r.x, (*[32]byte)(b[1:33]))
    95  		p256BigToLittle(&r.y, (*[32]byte)(b[33:65]))
    96  		if p256LessThanP(&r.x) == 0 || p256LessThanP(&r.y) == 0 {
    97  			return nil, errors.New("invalid P256 element encoding")
    98  		}
    99  		p256Mul(&r.x, &r.x, &rr)
   100  		p256Mul(&r.y, &r.y, &rr)
   101  		if err := p256CheckOnCurve(&r.x, &r.y); err != nil {
   102  			return nil, err
   103  		}
   104  		r.z = p256One
   105  		return p.Set(&r), nil
   106  
   107  	// Compressed form.
   108  	case len(b) == p256CompressedLength && (b[0] == 2 || b[0] == 3):
   109  		var r P256Point
   110  		p256BigToLittle(&r.x, (*[32]byte)(b[1:33]))
   111  		if p256LessThanP(&r.x) == 0 {
   112  			return nil, errors.New("invalid P256 element encoding")
   113  		}
   114  		p256Mul(&r.x, &r.x, &rr)
   115  
   116  		// y² = x³ - 3x + b
   117  		p256Polynomial(&r.y, &r.x)
   118  		if !p256Sqrt(&r.y, &r.y) {
   119  			return nil, errors.New("invalid P256 compressed point encoding")
   120  		}
   121  
   122  		// Select the positive or negative root, as indicated by the least
   123  		// significant bit, based on the encoding type byte.
   124  		yy := new(p256Element)
   125  		p256FromMont(yy, &r.y)
   126  		cond := int(yy[0]&1) ^ int(b[0]&1)
   127  		p256NegCond(&r.y, cond)
   128  
   129  		r.z = p256One
   130  		return p.Set(&r), nil
   131  
   132  	default:
   133  		return nil, errors.New("invalid P256 point encoding")
   134  	}
   135  }
   136  
   137  // p256Polynomial sets y2 to x³ - 3x + b, and returns y2.
   138  func p256Polynomial(y2, x *p256Element) *p256Element {
   139  	x3 := new(p256Element)
   140  	p256Sqr(x3, x, 1)
   141  	p256Mul(x3, x3, x)
   142  
   143  	threeX := new(p256Element)
   144  	p256Add(threeX, x, x)
   145  	p256Add(threeX, threeX, x)
   146  	p256NegCond(threeX, 1)
   147  
   148  	p256B := &p256Element{0xd89cdf6229c4bddf, 0xacf005cd78843090,
   149  		0xe5a220abf7212ed6, 0xdc30061d04874834}
   150  
   151  	p256Add(x3, x3, threeX)
   152  	p256Add(x3, x3, p256B)
   153  
   154  	*y2 = *x3
   155  	return y2
   156  }
   157  
   158  func p256CheckOnCurve(x, y *p256Element) error {
   159  	// y² = x³ - 3x + b
   160  	rhs := p256Polynomial(new(p256Element), x)
   161  	lhs := new(p256Element)
   162  	p256Sqr(lhs, y, 1)
   163  	if p256Equal(lhs, rhs) != 1 {
   164  		return errors.New("P256 point not on curve")
   165  	}
   166  	return nil
   167  }
   168  
   169  // p256LessThanP returns 1 if x < p, and 0 otherwise. Note that a p256Element is
   170  // not allowed to be equal to or greater than p, so if this function returns 0
   171  // then x is invalid.
   172  func p256LessThanP(x *p256Element) int {
   173  	var b uint64
   174  	_, b = bits.Sub64(x[0], p256P[0], b)
   175  	_, b = bits.Sub64(x[1], p256P[1], b)
   176  	_, b = bits.Sub64(x[2], p256P[2], b)
   177  	_, b = bits.Sub64(x[3], p256P[3], b)
   178  	return int(b)
   179  }
   180  
   181  // p256Add sets res = x + y.
   182  func p256Add(res, x, y *p256Element) {
   183  	var c, b uint64
   184  	t1 := make([]uint64, 4)
   185  	t1[0], c = bits.Add64(x[0], y[0], 0)
   186  	t1[1], c = bits.Add64(x[1], y[1], c)
   187  	t1[2], c = bits.Add64(x[2], y[2], c)
   188  	t1[3], c = bits.Add64(x[3], y[3], c)
   189  	t2 := make([]uint64, 4)
   190  	t2[0], b = bits.Sub64(t1[0], p256P[0], 0)
   191  	t2[1], b = bits.Sub64(t1[1], p256P[1], b)
   192  	t2[2], b = bits.Sub64(t1[2], p256P[2], b)
   193  	t2[3], b = bits.Sub64(t1[3], p256P[3], b)
   194  	// Three options:
   195  	//   - a+b < p
   196  	//     then c is 0, b is 1, and t1 is correct
   197  	//   - p <= a+b < 2^256
   198  	//     then c is 0, b is 0, and t2 is correct
   199  	//   - 2^256 <= a+b
   200  	//     then c is 1, b is 1, and t2 is correct
   201  	t2Mask := (c ^ b) - 1
   202  	res[0] = (t1[0] & ^t2Mask) | (t2[0] & t2Mask)
   203  	res[1] = (t1[1] & ^t2Mask) | (t2[1] & t2Mask)
   204  	res[2] = (t1[2] & ^t2Mask) | (t2[2] & t2Mask)
   205  	res[3] = (t1[3] & ^t2Mask) | (t2[3] & t2Mask)
   206  }
   207  
   208  // p256Sqrt sets e to a square root of x. If x is not a square, p256Sqrt returns
   209  // false and e is unchanged. e and x can overlap.
   210  func p256Sqrt(e, x *p256Element) (isSquare bool) {
   211  	t0, t1 := new(p256Element), new(p256Element)
   212  
   213  	// Since p = 3 mod 4, exponentiation by (p + 1) / 4 yields a square root candidate.
   214  	//
   215  	// The sequence of 7 multiplications and 253 squarings is derived from the
   216  	// following addition chain generated with github.com/mmcloughlin/addchain v0.4.0.
   217  	//
   218  	//	_10       = 2*1
   219  	//	_11       = 1 + _10
   220  	//	_1100     = _11 << 2
   221  	//	_1111     = _11 + _1100
   222  	//	_11110000 = _1111 << 4
   223  	//	_11111111 = _1111 + _11110000
   224  	//	x16       = _11111111 << 8 + _11111111
   225  	//	x32       = x16 << 16 + x16
   226  	//	return      ((x32 << 32 + 1) << 96 + 1) << 94
   227  	//
   228  	p256Sqr(t0, x, 1)
   229  	p256Mul(t0, x, t0)
   230  	p256Sqr(t1, t0, 2)
   231  	p256Mul(t0, t0, t1)
   232  	p256Sqr(t1, t0, 4)
   233  	p256Mul(t0, t0, t1)
   234  	p256Sqr(t1, t0, 8)
   235  	p256Mul(t0, t0, t1)
   236  	p256Sqr(t1, t0, 16)
   237  	p256Mul(t0, t0, t1)
   238  	p256Sqr(t0, t0, 32)
   239  	p256Mul(t0, x, t0)
   240  	p256Sqr(t0, t0, 96)
   241  	p256Mul(t0, x, t0)
   242  	p256Sqr(t0, t0, 94)
   243  
   244  	p256Sqr(t1, t0, 1)
   245  	if p256Equal(t1, x) != 1 {
   246  		return false
   247  	}
   248  	*e = *t0
   249  	return true
   250  }
   251  
   252  // The following assembly functions are implemented in p256_asm_*.s
   253  
   254  // Montgomery multiplication. Sets res = in1 * in2 * R⁻¹ mod p.
   255  //
   256  //go:noescape
   257  func p256Mul(res, in1, in2 *p256Element)
   258  
   259  // Montgomery square, repeated n times (n >= 1).
   260  //
   261  //go:noescape
   262  func p256Sqr(res, in *p256Element, n int)
   263  
   264  // Montgomery multiplication by R⁻¹, or 1 outside the domain.
   265  // Sets res = in * R⁻¹, bringing res out of the Montgomery domain.
   266  //
   267  //go:noescape
   268  func p256FromMont(res, in *p256Element)
   269  
   270  // If cond is not 0, sets val = -val mod p.
   271  //
   272  //go:noescape
   273  func p256NegCond(val *p256Element, cond int)
   274  
   275  // If cond is 0, sets res = b, otherwise sets res = a.
   276  //
   277  //go:noescape
   278  func p256MovCond(res, a, b *P256Point, cond int)
   279  
   280  //go:noescape
   281  func p256BigToLittle(res *p256Element, in *[32]byte)
   282  
   283  //go:noescape
   284  func p256LittleToBig(res *[32]byte, in *p256Element)
   285  
   286  //go:noescape
   287  func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
   288  
   289  //go:noescape
   290  func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
   291  
   292  // p256Table is a table of the first 16 multiples of a point. Points are stored
   293  // at an index offset of -1 so [8]P is at index 7, P is at 0, and [16]P is at 15.
   294  // [0]P is the point at infinity and it's not stored.
   295  type p256Table [16]P256Point
   296  
   297  // p256Select sets res to the point at index idx in the table.
   298  // idx must be in [0, 15]. It executes in constant time.
   299  //
   300  //go:noescape
   301  func p256Select(res *P256Point, table *p256Table, idx int)
   302  
   303  // p256AffinePoint is a point in affine coordinates (x, y). x and y are still
   304  // Montgomery domain elements. The point can't be the point at infinity.
   305  type p256AffinePoint struct {
   306  	x, y p256Element
   307  }
   308  
   309  // p256AffineTable is a table of the first 32 multiples of a point. Points are
   310  // stored at an index offset of -1 like in p256Table, and [0]P is not stored.
   311  type p256AffineTable [32]p256AffinePoint
   312  
   313  // p256Precomputed is a series of precomputed multiples of G, the canonical
   314  // generator. The first p256AffineTable contains multiples of G. The second one
   315  // multiples of [2⁶]G, the third one of [2¹²]G, and so on, where each successive
   316  // table is the previous table doubled six times. Six is the width of the
   317  // sliding window used in p256ScalarMult, and having each table already
   318  // pre-doubled lets us avoid the doublings between windows entirely. This table
   319  // MUST NOT be modified, as it aliases into p256PrecomputedEmbed below.
   320  var p256Precomputed *[43]p256AffineTable
   321  
   322  //go:embed p256_asm_table.bin
   323  var p256PrecomputedEmbed string
   324  
   325  func init() {
   326  	p256PrecomputedPtr := (*unsafe.Pointer)(unsafe.Pointer(&p256PrecomputedEmbed))
   327  	if runtime.GOARCH == "s390x" {
   328  		var newTable [43 * 32 * 2 * 4]uint64
   329  		for i, x := range (*[43 * 32 * 2 * 4][8]byte)(*p256PrecomputedPtr) {
   330  			newTable[i] = binary.LittleEndian.Uint64(x[:])
   331  		}
   332  		newTablePtr := unsafe.Pointer(&newTable)
   333  		p256PrecomputedPtr = &newTablePtr
   334  	}
   335  	p256Precomputed = (*[43]p256AffineTable)(*p256PrecomputedPtr)
   336  }
   337  
   338  // p256SelectAffine sets res to the point at index idx in the table.
   339  // idx must be in [0, 31]. It executes in constant time.
   340  //
   341  //go:noescape
   342  func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
   343  
   344  // Point addition with an affine point and constant time conditions.
   345  // If zero is 0, sets res = in2. If sel is 0, sets res = in1.
   346  // If sign is not 0, sets res = in1 + -in2. Otherwise, sets res = in1 + in2
   347  //
   348  //go:noescape
   349  func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
   350  
   351  // Point addition. Sets res = in1 + in2. Returns one if the two input points
   352  // were equal and zero otherwise. If in1 or in2 are the point at infinity, res
   353  // and the return value are undefined.
   354  //
   355  //go:noescape
   356  func p256PointAddAsm(res, in1, in2 *P256Point) int
   357  
   358  // Point doubling. Sets res = in + in. in can be the point at infinity.
   359  //
   360  //go:noescape
   361  func p256PointDoubleAsm(res, in *P256Point)
   362  
   363  // p256OrdElement is a P-256 scalar field element in [0, ord(G)-1] in the
   364  // Montgomery domain (with R 2²⁵⁶) as four uint64 limbs in little-endian order.
   365  type p256OrdElement [4]uint64
   366  
   367  // p256OrdReduce ensures s is in the range [0, ord(G)-1].
   368  func p256OrdReduce(s *p256OrdElement) {
   369  	// Since 2 * ord(G) > 2²⁵⁶, we can just conditionally subtract ord(G),
   370  	// keeping the result if it doesn't underflow.
   371  	t0, b := bits.Sub64(s[0], 0xf3b9cac2fc632551, 0)
   372  	t1, b := bits.Sub64(s[1], 0xbce6faada7179e84, b)
   373  	t2, b := bits.Sub64(s[2], 0xffffffffffffffff, b)
   374  	t3, b := bits.Sub64(s[3], 0xffffffff00000000, b)
   375  	tMask := b - 1 // zero if subtraction underflowed
   376  	s[0] ^= (t0 ^ s[0]) & tMask
   377  	s[1] ^= (t1 ^ s[1]) & tMask
   378  	s[2] ^= (t2 ^ s[2]) & tMask
   379  	s[3] ^= (t3 ^ s[3]) & tMask
   380  }
   381  
   382  // Add sets q = p1 + p2, and returns q. The points may overlap.
   383  func (q *P256Point) Add(r1, r2 *P256Point) *P256Point {
   384  	var sum, double P256Point
   385  	r1IsInfinity := r1.isInfinity()
   386  	r2IsInfinity := r2.isInfinity()
   387  	pointsEqual := p256PointAddAsm(&sum, r1, r2)
   388  	p256PointDoubleAsm(&double, r1)
   389  	p256MovCond(&sum, &double, &sum, pointsEqual)
   390  	p256MovCond(&sum, r1, &sum, r2IsInfinity)
   391  	p256MovCond(&sum, r2, &sum, r1IsInfinity)
   392  	return q.Set(&sum)
   393  }
   394  
   395  // Double sets q = p + p, and returns q. The points may overlap.
   396  func (q *P256Point) Double(p *P256Point) *P256Point {
   397  	var double P256Point
   398  	p256PointDoubleAsm(&double, p)
   399  	return q.Set(&double)
   400  }
   401  
   402  // ScalarBaseMult sets r = scalar * generator, where scalar is a 32-byte big
   403  // endian value, and returns r. If scalar is not 32 bytes long, ScalarBaseMult
   404  // returns an error and the receiver is unchanged.
   405  func (r *P256Point) ScalarBaseMult(scalar []byte) (*P256Point, error) {
   406  	if len(scalar) != 32 {
   407  		return nil, errors.New("invalid scalar length")
   408  	}
   409  	scalarReversed := new(p256OrdElement)
   410  	p256OrdBigToLittle(scalarReversed, (*[32]byte)(scalar))
   411  	p256OrdReduce(scalarReversed)
   412  
   413  	r.p256BaseMult(scalarReversed)
   414  	return r, nil
   415  }
   416  
   417  // ScalarMult sets r = scalar * q, where scalar is a 32-byte big endian value,
   418  // and returns r. If scalar is not 32 bytes long, ScalarBaseMult returns an
   419  // error and the receiver is unchanged.
   420  func (r *P256Point) ScalarMult(q *P256Point, scalar []byte) (*P256Point, error) {
   421  	if len(scalar) != 32 {
   422  		return nil, errors.New("invalid scalar length")
   423  	}
   424  	scalarReversed := new(p256OrdElement)
   425  	p256OrdBigToLittle(scalarReversed, (*[32]byte)(scalar))
   426  	p256OrdReduce(scalarReversed)
   427  
   428  	r.Set(q).p256ScalarMult(scalarReversed)
   429  	return r, nil
   430  }
   431  
   432  // uint64IsZero returns 1 if x is zero and zero otherwise.
   433  func uint64IsZero(x uint64) int {
   434  	x = ^x
   435  	x &= x >> 32
   436  	x &= x >> 16
   437  	x &= x >> 8
   438  	x &= x >> 4
   439  	x &= x >> 2
   440  	x &= x >> 1
   441  	return int(x & 1)
   442  }
   443  
   444  // p256Equal returns 1 if a and b are equal and 0 otherwise.
   445  func p256Equal(a, b *p256Element) int {
   446  	var acc uint64
   447  	for i := range a {
   448  		acc |= a[i] ^ b[i]
   449  	}
   450  	return uint64IsZero(acc)
   451  }
   452  
   453  // isInfinity returns 1 if p is the point at infinity and 0 otherwise.
   454  func (p *P256Point) isInfinity() int {
   455  	return p256Equal(&p.z, &p256Zero)
   456  }
   457  
   458  // Bytes returns the uncompressed or infinity encoding of p, as specified in
   459  // SEC 1, Version 2.0, Section 2.3.3. Note that the encoding of the point at
   460  // infinity is shorter than all other encodings.
   461  func (p *P256Point) Bytes() []byte {
   462  	// This function is outlined to make the allocations inline in the caller
   463  	// rather than happen on the heap.
   464  	var out [p256UncompressedLength]byte
   465  	return p.bytes(&out)
   466  }
   467  
   468  func (p *P256Point) bytes(out *[p256UncompressedLength]byte) []byte {
   469  	// The proper representation of the point at infinity is a single zero byte.
   470  	if p.isInfinity() == 1 {
   471  		return append(out[:0], 0)
   472  	}
   473  
   474  	x, y := new(p256Element), new(p256Element)
   475  	p.affineFromMont(x, y)
   476  
   477  	out[0] = 4 // Uncompressed form.
   478  	p256LittleToBig((*[32]byte)(out[1:33]), x)
   479  	p256LittleToBig((*[32]byte)(out[33:65]), y)
   480  
   481  	return out[:]
   482  }
   483  
   484  // affineFromMont sets (x, y) to the affine coordinates of p, converted out of the
   485  // Montgomery domain.
   486  func (p *P256Point) affineFromMont(x, y *p256Element) {
   487  	p256Inverse(y, &p.z)
   488  	p256Sqr(x, y, 1)
   489  	p256Mul(y, y, x)
   490  
   491  	p256Mul(x, &p.x, x)
   492  	p256Mul(y, &p.y, y)
   493  
   494  	p256FromMont(x, x)
   495  	p256FromMont(y, y)
   496  }
   497  
   498  // BytesX returns the encoding of the x-coordinate of p, as specified in SEC 1,
   499  // Version 2.0, Section 2.3.5, or an error if p is the point at infinity.
   500  func (p *P256Point) BytesX() ([]byte, error) {
   501  	// This function is outlined to make the allocations inline in the caller
   502  	// rather than happen on the heap.
   503  	var out [p256ElementLength]byte
   504  	return p.bytesX(&out)
   505  }
   506  
   507  func (p *P256Point) bytesX(out *[p256ElementLength]byte) ([]byte, error) {
   508  	if p.isInfinity() == 1 {
   509  		return nil, errors.New("P256 point is the point at infinity")
   510  	}
   511  
   512  	x := new(p256Element)
   513  	p256Inverse(x, &p.z)
   514  	p256Sqr(x, x, 1)
   515  	p256Mul(x, &p.x, x)
   516  	p256FromMont(x, x)
   517  	p256LittleToBig((*[32]byte)(out[:]), x)
   518  
   519  	return out[:], nil
   520  }
   521  
   522  // BytesCompressed returns the compressed or infinity encoding of p, as
   523  // specified in SEC 1, Version 2.0, Section 2.3.3. Note that the encoding of the
   524  // point at infinity is shorter than all other encodings.
   525  func (p *P256Point) BytesCompressed() []byte {
   526  	// This function is outlined to make the allocations inline in the caller
   527  	// rather than happen on the heap.
   528  	var out [p256CompressedLength]byte
   529  	return p.bytesCompressed(&out)
   530  }
   531  
   532  func (p *P256Point) bytesCompressed(out *[p256CompressedLength]byte) []byte {
   533  	if p.isInfinity() == 1 {
   534  		return append(out[:0], 0)
   535  	}
   536  
   537  	x, y := new(p256Element), new(p256Element)
   538  	p.affineFromMont(x, y)
   539  
   540  	out[0] = 2 | byte(y[0]&1)
   541  	p256LittleToBig((*[32]byte)(out[1:33]), x)
   542  
   543  	return out[:]
   544  }
   545  
   546  // Select sets q to p1 if cond == 1, and to p2 if cond == 0.
   547  func (q *P256Point) Select(p1, p2 *P256Point, cond int) *P256Point {
   548  	p256MovCond(q, p1, p2, cond)
   549  	return q
   550  }
   551  
   552  // p256Inverse sets out to in⁻¹ mod p. If in is zero, out will be zero.
   553  func p256Inverse(out, in *p256Element) {
   554  	// Inversion is calculated through exponentiation by p - 2, per Fermat's
   555  	// little theorem.
   556  	//
   557  	// The sequence of 12 multiplications and 255 squarings is derived from the
   558  	// following addition chain generated with github.com/mmcloughlin/addchain
   559  	// v0.4.0.
   560  	//
   561  	//  _10     = 2*1
   562  	//  _11     = 1 + _10
   563  	//  _110    = 2*_11
   564  	//  _111    = 1 + _110
   565  	//  _111000 = _111 << 3
   566  	//  _111111 = _111 + _111000
   567  	//  x12     = _111111 << 6 + _111111
   568  	//  x15     = x12 << 3 + _111
   569  	//  x16     = 2*x15 + 1
   570  	//  x32     = x16 << 16 + x16
   571  	//  i53     = x32 << 15
   572  	//  x47     = x15 + i53
   573  	//  i263    = ((i53 << 17 + 1) << 143 + x47) << 47
   574  	//  return    (x47 + i263) << 2 + 1
   575  	//
   576  	var z = new(p256Element)
   577  	var t0 = new(p256Element)
   578  	var t1 = new(p256Element)
   579  
   580  	p256Sqr(z, in, 1)
   581  	p256Mul(z, in, z)
   582  	p256Sqr(z, z, 1)
   583  	p256Mul(z, in, z)
   584  	p256Sqr(t0, z, 3)
   585  	p256Mul(t0, z, t0)
   586  	p256Sqr(t1, t0, 6)
   587  	p256Mul(t0, t0, t1)
   588  	p256Sqr(t0, t0, 3)
   589  	p256Mul(z, z, t0)
   590  	p256Sqr(t0, z, 1)
   591  	p256Mul(t0, in, t0)
   592  	p256Sqr(t1, t0, 16)
   593  	p256Mul(t0, t0, t1)
   594  	p256Sqr(t0, t0, 15)
   595  	p256Mul(z, z, t0)
   596  	p256Sqr(t0, t0, 17)
   597  	p256Mul(t0, in, t0)
   598  	p256Sqr(t0, t0, 143)
   599  	p256Mul(t0, z, t0)
   600  	p256Sqr(t0, t0, 47)
   601  	p256Mul(z, z, t0)
   602  	p256Sqr(z, z, 2)
   603  	p256Mul(out, in, z)
   604  }
   605  
   606  func boothW5(in uint) (int, int) {
   607  	var s uint = ^((in >> 5) - 1)
   608  	var d uint = (1 << 6) - in - 1
   609  	d = (d & s) | (in & (^s))
   610  	d = (d >> 1) + (d & 1)
   611  	return int(d), int(s & 1)
   612  }
   613  
   614  func boothW6(in uint) (int, int) {
   615  	var s uint = ^((in >> 6) - 1)
   616  	var d uint = (1 << 7) - in - 1
   617  	d = (d & s) | (in & (^s))
   618  	d = (d >> 1) + (d & 1)
   619  	return int(d), int(s & 1)
   620  }
   621  
   622  func (p *P256Point) p256BaseMult(scalar *p256OrdElement) {
   623  	var t0 p256AffinePoint
   624  
   625  	wvalue := (scalar[0] << 1) & 0x7f
   626  	sel, sign := boothW6(uint(wvalue))
   627  	p256SelectAffine(&t0, &p256Precomputed[0], sel)
   628  	p.x, p.y, p.z = t0.x, t0.y, p256One
   629  	p256NegCond(&p.y, sign)
   630  
   631  	index := uint(5)
   632  	zero := sel
   633  
   634  	for i := 1; i < 43; i++ {
   635  		if index < 192 {
   636  			wvalue = ((scalar[index/64] >> (index % 64)) + (scalar[index/64+1] << (64 - (index % 64)))) & 0x7f
   637  		} else {
   638  			wvalue = (scalar[index/64] >> (index % 64)) & 0x7f
   639  		}
   640  		index += 6
   641  		sel, sign = boothW6(uint(wvalue))
   642  		p256SelectAffine(&t0, &p256Precomputed[i], sel)
   643  		p256PointAddAffineAsm(p, p, &t0, sign, sel, zero)
   644  		zero |= sel
   645  	}
   646  
   647  	// If the whole scalar was zero, set to the point at infinity.
   648  	p256MovCond(p, p, NewP256Point(), zero)
   649  }
   650  
   651  func (p *P256Point) p256ScalarMult(scalar *p256OrdElement) {
   652  	// precomp is a table of precomputed points that stores powers of p
   653  	// from p^1 to p^16.
   654  	var precomp p256Table
   655  	var t0, t1, t2, t3 P256Point
   656  
   657  	// Prepare the table
   658  	precomp[0] = *p // 1
   659  
   660  	p256PointDoubleAsm(&t0, p)
   661  	p256PointDoubleAsm(&t1, &t0)
   662  	p256PointDoubleAsm(&t2, &t1)
   663  	p256PointDoubleAsm(&t3, &t2)
   664  	precomp[1] = t0  // 2
   665  	precomp[3] = t1  // 4
   666  	precomp[7] = t2  // 8
   667  	precomp[15] = t3 // 16
   668  
   669  	p256PointAddAsm(&t0, &t0, p)
   670  	p256PointAddAsm(&t1, &t1, p)
   671  	p256PointAddAsm(&t2, &t2, p)
   672  	precomp[2] = t0 // 3
   673  	precomp[4] = t1 // 5
   674  	precomp[8] = t2 // 9
   675  
   676  	p256PointDoubleAsm(&t0, &t0)
   677  	p256PointDoubleAsm(&t1, &t1)
   678  	precomp[5] = t0 // 6
   679  	precomp[9] = t1 // 10
   680  
   681  	p256PointAddAsm(&t2, &t0, p)
   682  	p256PointAddAsm(&t1, &t1, p)
   683  	precomp[6] = t2  // 7
   684  	precomp[10] = t1 // 11
   685  
   686  	p256PointDoubleAsm(&t0, &t0)
   687  	p256PointDoubleAsm(&t2, &t2)
   688  	precomp[11] = t0 // 12
   689  	precomp[13] = t2 // 14
   690  
   691  	p256PointAddAsm(&t0, &t0, p)
   692  	p256PointAddAsm(&t2, &t2, p)
   693  	precomp[12] = t0 // 13
   694  	precomp[14] = t2 // 15
   695  
   696  	// Start scanning the window from top bit
   697  	index := uint(254)
   698  	var sel, sign int
   699  
   700  	wvalue := (scalar[index/64] >> (index % 64)) & 0x3f
   701  	sel, _ = boothW5(uint(wvalue))
   702  
   703  	p256Select(p, &precomp, sel)
   704  	zero := sel
   705  
   706  	for index > 4 {
   707  		index -= 5
   708  		p256PointDoubleAsm(p, p)
   709  		p256PointDoubleAsm(p, p)
   710  		p256PointDoubleAsm(p, p)
   711  		p256PointDoubleAsm(p, p)
   712  		p256PointDoubleAsm(p, p)
   713  
   714  		if index < 192 {
   715  			wvalue = ((scalar[index/64] >> (index % 64)) + (scalar[index/64+1] << (64 - (index % 64)))) & 0x3f
   716  		} else {
   717  			wvalue = (scalar[index/64] >> (index % 64)) & 0x3f
   718  		}
   719  
   720  		sel, sign = boothW5(uint(wvalue))
   721  
   722  		p256Select(&t0, &precomp, sel)
   723  		p256NegCond(&t0.y, sign)
   724  		p256PointAddAsm(&t1, p, &t0)
   725  		p256MovCond(&t1, &t1, p, sel)
   726  		p256MovCond(p, &t1, &t0, zero)
   727  		zero |= sel
   728  	}
   729  
   730  	p256PointDoubleAsm(p, p)
   731  	p256PointDoubleAsm(p, p)
   732  	p256PointDoubleAsm(p, p)
   733  	p256PointDoubleAsm(p, p)
   734  	p256PointDoubleAsm(p, p)
   735  
   736  	wvalue = (scalar[0] << 1) & 0x3f
   737  	sel, sign = boothW5(uint(wvalue))
   738  
   739  	p256Select(&t0, &precomp, sel)
   740  	p256NegCond(&t0.y, sign)
   741  	p256PointAddAsm(&t1, p, &t0)
   742  	p256MovCond(&t1, &t1, p, sel)
   743  	p256MovCond(p, &t1, &t0, zero)
   744  }
   745
View as plain text