Text file src/crypto/internal/nistec/p256_asm_amd64.s

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // This file contains constant-time, 64-bit assembly implementation of
     6  // P256. The optimizations performed here are described in detail in:
     7  // S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
     8  //                          256-bit primes"
     9  // https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
    10  // https://eprint.iacr.org/2013/816.pdf
    11  
    12  #include "textflag.h"
    13  
        // Register roles for the exported routines that follow. The same names are
        // #undef'd and re-mapped further down for the internal helpers.
        // res_ptr, x_ptr and y_ptr carry the result and operand pointers. Several
        // routines reuse x_ptr and y_ptr as ordinary 64-bit scratch registers once
        // the pointer they held is no longer needed.
    14  #define res_ptr DI
    15  #define x_ptr SI
    16  #define y_ptr CX
    17  
        // acc0..acc5 hold the limbs of the running result; t0 and t1 are temporaries.
        // AX, DX (MULQ operands/results) and BX (loop counter) are used directly.
    18  #define acc0 R8
    19  #define acc1 R9
    20  #define acc2 R10
    21  #define acc3 R11
    22  #define acc4 R12
    23  #define acc5 R13
    24  #define t0 R14
    25  #define t1 R15
    26  
        // p256const0 and p256const1 are limbs 1 and 3 of the field modulus:
        // p256NegCond builds "poly" as {-1, p256const0, 0, p256const1}, least
        // significant limb first. p256const1 is also the multiplier used by the
        // field reduction steps.
    27  DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff
    28  DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001
        // p256ordK0 is the per-step multiplier of the reductions in p256OrdMul and
        // p256OrdSqr: each step multiplies the lowest live limb by it and keeps the
        // low 64 bits of the product.
    29  DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f
        // p256ord is the 256-bit modulus used by p256OrdMul and p256OrdSqr, stored
        // as four 64-bit limbs, least significant first.
        // NOTE(review): presumably the order of the P-256 base point - confirm.
    30  DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551
    31  DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84
    32  DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
    33  DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000
        // NOTE(review): p256one is presumably the value 1 in Montgomery form
        // (2^256 mod p), four limbs, least significant first - confirm against
        // its users.
    34  DATA p256one<>+0x00(SB)/8, $0x0000000000000001
    35  DATA p256one<>+0x08(SB)/8, $0xffffffff00000000
    36  DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff
    37  DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe
        // GLOBL operands: symbol, flags, size in bytes.
        // NOTE(review): flag value 8 should correspond to RODATA in textflag.h -
        // confirm, and consider spelling it symbolically.
    38  GLOBL p256const0<>(SB), 8, $8
    39  GLOBL p256const1<>(SB), 8, $8
    40  GLOBL p256ordK0<>(SB), 8, $8
    41  GLOBL p256ord<>(SB), 8, $32
    42  GLOBL p256one<>(SB), 8, $32
    43  
    44  /* ---------------------------------------*/
    45  // func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
        // Tail-jumps to p256BigToLittle, which reads the same (res, in) argument
        // frame. Reversing the byte order of a 32-byte value is its own inverse, so
        // the little-to-big conversion shares that implementation.
    46  TEXT ·p256OrdLittleToBig(SB),NOSPLIT,$0
    47  	JMP ·p256BigToLittle(SB)
    48  /* ---------------------------------------*/
    49  // func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
        // Tail-jumps to p256BigToLittle, which reads the same (res, in) argument
        // frame and performs the 32-byte byte-order reversal.
    50  TEXT ·p256OrdBigToLittle(SB),NOSPLIT,$0
    51  	JMP ·p256BigToLittle(SB)
    52  /* ---------------------------------------*/
    53  // func p256LittleToBig(res *[32]byte, in *p256Element)
        // Tail-jumps to p256BigToLittle, which reads the same (res, in) argument
        // frame. Reversing the byte order of a 32-byte value is its own inverse, so
        // the little-to-big conversion shares that implementation.
    54  TEXT ·p256LittleToBig(SB),NOSPLIT,$0
    55  	JMP ·p256BigToLittle(SB)
    56  /* ---------------------------------------*/
    57  // func p256BigToLittle(res *p256Element, in *[32]byte)
    58  // Writes to res the 32 bytes found at in, in reversed byte order.
    59  // All four limbs are read before the first store, so res may alias in.
    60  TEXT ·p256BigToLittle(SB),NOSPLIT,$0
    61  	MOVQ in+8(FP), x_ptr
    62  	MOVQ res+0(FP), res_ptr
    63  	// Fetch each 64-bit limb and reverse the bytes inside it.
    64  	MOVQ (8*3)(x_ptr), acc3
    65  	BSWAPQ acc3
    66  	MOVQ (8*2)(x_ptr), acc2
    67  	BSWAPQ acc2
    68  	MOVQ (8*1)(x_ptr), acc1
    69  	BSWAPQ acc1
    70  	MOVQ (8*0)(x_ptr), acc0
    71  	BSWAPQ acc0
    72  	// Reverse the limb order as well: input limb i lands in output limb 3-i.
    73  	MOVQ acc0, (8*3)(res_ptr)
    74  	MOVQ acc1, (8*2)(res_ptr)
    75  	MOVQ acc2, (8*1)(res_ptr)
    76  	MOVQ acc3, (8*0)(res_ptr)
    77  	RET
    78  /* ---------------------------------------*/
    79  // func p256MovCond(res, a, b *P256Point, cond int)
    80  // Branch-free select over six 16-byte chunks (96 bytes): res receives a when
    81  // cond is non-zero and b when it is zero. Only the low 32 bits of cond count.
    82  TEXT ·p256MovCond(SB),NOSPLIT,$0
    83  	MOVQ cond+24(FP), X12
    84  	MOVQ b+16(FP), y_ptr
    85  	MOVQ a+8(FP), x_ptr
    86  	MOVQ res+0(FP), res_ptr
    87  	// X12 = all ones in every lane when cond == 0, all zeros otherwise.
    88  	PSHUFD $0, X12, X12
    89  	PXOR X13, X13
    90  	PCMPEQL X13, X12
    91  	// X0..X5 = a AND NOT mask, so a survives only when cond != 0.
    92  	MOVOU X12, X0
    93  	MOVOU X12, X1
    94  	MOVOU X12, X2
    95  	MOVOU X12, X3
    96  	MOVOU X12, X4
    97  	MOVOU X12, X5
    98  	MOVOU (16*0)(x_ptr), X6
    99  	MOVOU (16*1)(x_ptr), X7
   100  	MOVOU (16*2)(x_ptr), X8
   101  	MOVOU (16*3)(x_ptr), X9
   102  	MOVOU (16*4)(x_ptr), X10
   103  	MOVOU (16*5)(x_ptr), X11
   104  	PANDN X6, X0
   105  	PANDN X7, X1
   106  	PANDN X8, X2
   107  	PANDN X9, X3
   108  	PANDN X10, X4
   109  	PANDN X11, X5
   110  	// X6..X11 = b AND mask, so b survives only when cond == 0.
   111  	MOVOU (16*0)(y_ptr), X6
   112  	MOVOU (16*1)(y_ptr), X7
   113  	MOVOU (16*2)(y_ptr), X8
   114  	MOVOU (16*3)(y_ptr), X9
   115  	MOVOU (16*4)(y_ptr), X10
   116  	MOVOU (16*5)(y_ptr), X11
   117  	PAND X12, X6
   118  	PAND X12, X7
   119  	PAND X12, X8
   120  	PAND X12, X9
   121  	PAND X12, X10
   122  	PAND X12, X11
   123  
   124  	// Exactly one of the two masked halves is non-zero; merge and store.
   125  	PXOR X6, X0
   126  	PXOR X7, X1
   127  	PXOR X8, X2
   128  	PXOR X9, X3
   129  	PXOR X10, X4
   130  	PXOR X11, X5
   131  	MOVOU X0, (16*0)(res_ptr)
   132  	MOVOU X1, (16*1)(res_ptr)
   133  	MOVOU X2, (16*2)(res_ptr)
   134  	MOVOU X3, (16*3)(res_ptr)
   135  	MOVOU X4, (16*4)(res_ptr)
   136  	MOVOU X5, (16*5)(res_ptr)
   137  	RET
   138  /* ---------------------------------------*/
   139  // func p256NegCond(val *p256Element, cond int)
   140  // Branch-free conditional negation: val becomes (modulus - val) when cond is
   141  // non-zero and is written back unchanged when cond is zero.
   142  TEXT ·p256NegCond(SB),NOSPLIT,$0
   143  	MOVQ cond+8(FP), t0
   144  	MOVQ val+0(FP), res_ptr
   145  	// Current limbs of val (x_ptr and y_ptr are plain scratch registers here).
   146  	MOVQ (8*3)(res_ptr), t1
   147  	MOVQ (8*2)(res_ptr), y_ptr
   148  	MOVQ (8*1)(res_ptr), x_ptr
   149  	MOVQ (8*0)(res_ptr), acc5
   150  	// acc3:acc0 = poly, the field modulus.
   151  	MOVQ p256const1<>(SB), acc3
   152  	MOVQ $0, acc2
   153  	MOVQ p256const0<>(SB), acc1
   154  	MOVQ $-1, acc0
   155  	// The difference is always computed, whatever cond is.
   156  	SUBQ acc5, acc0
   157  	SBBQ x_ptr, acc1
   158  	SBBQ y_ptr, acc2
   159  	SBBQ t1, acc3
   160  	// cond == 0 puts the untouched limbs back before the store.
   161  	TESTQ t0, t0
   162  	CMOVQEQ t1, acc3
   163  	CMOVQEQ y_ptr, acc2
   164  	CMOVQEQ x_ptr, acc1
   165  	CMOVQEQ acc5, acc0
   166  	MOVQ acc0, (8*0)(res_ptr)
   167  	MOVQ acc1, (8*1)(res_ptr)
   168  	MOVQ acc2, (8*2)(res_ptr)
   169  	MOVQ acc3, (8*3)(res_ptr)
   170  	RET
   171  /* ---------------------------------------*/
   172  // func p256Sqr(res, in *p256Element, n int)
        // Squares the 4-limb value at in n times in a row, writing each reduced
        // result to res and using res as the input of the next round. Every round
        // forms the 512-bit square, runs four word-wise reduction steps and ends
        // with a conditional subtraction of the field modulus.
        // The loop body runs before the counter is tested (DECQ/JNE at the bottom),
        // so n must be at least 1.
   173  TEXT ·p256Sqr(SB),NOSPLIT,$0
   174  	MOVQ res+0(FP), res_ptr
   175  	MOVQ in+8(FP), x_ptr
   176  	MOVQ n+16(FP), BX
   177  
   178  sqrLoop:
   179  
        	// Off-diagonal products y[i]*y[j], i < j, accumulated in acc1..acc5
        	// and y_ptr. They are doubled below.
   180  	// y[1:] * y[0]
   181  	MOVQ (8*0)(x_ptr), t0
   182  
   183  	MOVQ (8*1)(x_ptr), AX
   184  	MULQ t0
   185  	MOVQ AX, acc1
   186  	MOVQ DX, acc2
   187  
   188  	MOVQ (8*2)(x_ptr), AX
   189  	MULQ t0
   190  	ADDQ AX, acc2
   191  	ADCQ $0, DX
   192  	MOVQ DX, acc3
   193  
   194  	MOVQ (8*3)(x_ptr), AX
   195  	MULQ t0
   196  	ADDQ AX, acc3
   197  	ADCQ $0, DX
   198  	MOVQ DX, acc4
   199  	// y[2:] * y[1]
   200  	MOVQ (8*1)(x_ptr), t0
   201  
   202  	MOVQ (8*2)(x_ptr), AX
   203  	MULQ t0
   204  	ADDQ AX, acc3
   205  	ADCQ $0, DX
   206  	MOVQ DX, t1
   207  
   208  	MOVQ (8*3)(x_ptr), AX
   209  	MULQ t0
   210  	ADDQ t1, acc4
   211  	ADCQ $0, DX
   212  	ADDQ AX, acc4
   213  	ADCQ $0, DX
   214  	MOVQ DX, acc5
   215  	// y[3] * y[2]
   216  	MOVQ (8*2)(x_ptr), t0
   217  
   218  	MOVQ (8*3)(x_ptr), AX
   219  	MULQ t0
   220  	ADDQ AX, acc5
   221  	ADCQ $0, DX
        	// y_ptr (CX) is used as a seventh accumulator limb from here on.
   222  	MOVQ DX, y_ptr
   223  	XORQ t1, t1
   224  	// *2
   225  	ADDQ acc1, acc1
   226  	ADCQ acc2, acc2
   227  	ADCQ acc3, acc3
   228  	ADCQ acc4, acc4
   229  	ADCQ acc5, acc5
   230  	ADCQ y_ptr, y_ptr
   231  	ADCQ $0, t1
        	// Add the diagonal squares y[i]*y[i], which the doubling did not cover.
   232  	// Missing products
   233  	MOVQ (8*0)(x_ptr), AX
   234  	MULQ AX
   235  	MOVQ AX, acc0
   236  	MOVQ DX, t0
   237  
   238  	MOVQ (8*1)(x_ptr), AX
   239  	MULQ AX
   240  	ADDQ t0, acc1
   241  	ADCQ AX, acc2
   242  	ADCQ $0, DX
   243  	MOVQ DX, t0
   244  
   245  	MOVQ (8*2)(x_ptr), AX
   246  	MULQ AX
   247  	ADDQ t0, acc3
   248  	ADCQ AX, acc4
   249  	ADCQ $0, DX
   250  	MOVQ DX, t0
   251  
   252  	MOVQ (8*3)(x_ptr), AX
   253  	MULQ AX
   254  	ADDQ t0, acc5
   255  	ADCQ AX, y_ptr
   256  	ADCQ DX, t1
        	// The input pointer is dead for this round: x_ptr now holds the top
        	// limb of the 512-bit square (acc0..acc5, y_ptr, x_ptr).
   257  	MOVQ t1, x_ptr
        	// Each reduction step folds the lowest live limb into the three limbs
        	// above it (shifted copies plus a product with p256const1) and then
        	// reuses the freed register as the new top limb.
   258  	// First reduction step
   259  	MOVQ acc0, AX
   260  	MOVQ acc0, t1
   261  	SHLQ $32, acc0
   262  	MULQ p256const1<>(SB)
   263  	SHRQ $32, t1
   264  	ADDQ acc0, acc1
   265  	ADCQ t1, acc2
   266  	ADCQ AX, acc3
   267  	ADCQ $0, DX
   268  	MOVQ DX, acc0
   269  	// Second reduction step
   270  	MOVQ acc1, AX
   271  	MOVQ acc1, t1
   272  	SHLQ $32, acc1
   273  	MULQ p256const1<>(SB)
   274  	SHRQ $32, t1
   275  	ADDQ acc1, acc2
   276  	ADCQ t1, acc3
   277  	ADCQ AX, acc0
   278  	ADCQ $0, DX
   279  	MOVQ DX, acc1
   280  	// Third reduction step
   281  	MOVQ acc2, AX
   282  	MOVQ acc2, t1
   283  	SHLQ $32, acc2
   284  	MULQ p256const1<>(SB)
   285  	SHRQ $32, t1
   286  	ADDQ acc2, acc3
   287  	ADCQ t1, acc0
   288  	ADCQ AX, acc1
   289  	ADCQ $0, DX
   290  	MOVQ DX, acc2
   291  	// Last reduction step
   292  	XORQ t0, t0
   293  	MOVQ acc3, AX
   294  	MOVQ acc3, t1
   295  	SHLQ $32, acc3
   296  	MULQ p256const1<>(SB)
   297  	SHRQ $32, t1
   298  	ADDQ acc3, acc0
   299  	ADCQ t1, acc1
   300  	ADCQ AX, acc2
   301  	ADCQ $0, DX
   302  	MOVQ DX, acc3
        	// NOTE(review): the first ADCQ below consumes the carry flag left by
        	// "ADCQ $0, DX" above (MOVQ leaves flags alone). That carry is expected
        	// to be clear - confirm.
   303  	// Add bits [511:256] of the sqr result
   304  	ADCQ acc4, acc0
   305  	ADCQ acc5, acc1
   306  	ADCQ y_ptr, acc2
   307  	ADCQ x_ptr, acc3
   308  	ADCQ $0, t0
   309  
        	// Keep a copy of the unreduced value for the conditional select below.
   310  	MOVQ acc0, acc4
   311  	MOVQ acc1, acc5
   312  	MOVQ acc2, y_ptr
   313  	MOVQ acc3, t1
   314  	// Subtract p256
   315  	SUBQ $-1, acc0
   316  	SBBQ p256const0<>(SB) ,acc1
   317  	SBBQ $0, acc2
   318  	SBBQ p256const1<>(SB), acc3
   319  	SBBQ $0, t0
   320  
        	// A final borrow means the value was already below the modulus:
        	// restore the saved copy.
   321  	CMOVQCS acc4, acc0
   322  	CMOVQCS acc5, acc1
   323  	CMOVQCS y_ptr, acc2
   324  	CMOVQCS t1, acc3
   325  
   326  	MOVQ acc0, (8*0)(res_ptr)
   327  	MOVQ acc1, (8*1)(res_ptr)
   328  	MOVQ acc2, (8*2)(res_ptr)
   329  	MOVQ acc3, (8*3)(res_ptr)
        	// The next round squares the value just stored.
   330  	MOVQ res_ptr, x_ptr
   331  	DECQ BX
   332  	JNE  sqrLoop
   333  
   334  	RET
   335  /* ---------------------------------------*/
   336  // func p256Mul(res, in1, in2 *p256Element)
        // Multiplies the 4-limb values at in1 and in2 and stores the reduced 4-limb
        // result at res. One limb of in2 is multiplied in at a time, and each
        // partial product is followed by one word-wise reduction step. A final
        // conditional subtraction of the field modulus completes the reduction.
        // Both inputs are only read, and res is written after the last read.
   337  TEXT ·p256Mul(SB),NOSPLIT,$0
   338  	MOVQ res+0(FP), res_ptr
   339  	MOVQ in1+8(FP), x_ptr
   340  	MOVQ in2+16(FP), y_ptr
   341  	// x * y[0]
   342  	MOVQ (8*0)(y_ptr), t0
   343  
   344  	MOVQ (8*0)(x_ptr), AX
   345  	MULQ t0
   346  	MOVQ AX, acc0
   347  	MOVQ DX, acc1
   348  
   349  	MOVQ (8*1)(x_ptr), AX
   350  	MULQ t0
   351  	ADDQ AX, acc1
   352  	ADCQ $0, DX
   353  	MOVQ DX, acc2
   354  
   355  	MOVQ (8*2)(x_ptr), AX
   356  	MULQ t0
   357  	ADDQ AX, acc2
   358  	ADCQ $0, DX
   359  	MOVQ DX, acc3
   360  
   361  	MOVQ (8*3)(x_ptr), AX
   362  	MULQ t0
   363  	ADDQ AX, acc3
   364  	ADCQ $0, DX
   365  	MOVQ DX, acc4
   366  	XORQ acc5, acc5
        	// Each reduction step folds the lowest live limb into the limbs above
        	// it, then zeroes that register so it can serve as the next top limb.
   367  	// First reduction step
   368  	MOVQ acc0, AX
   369  	MOVQ acc0, t1
   370  	SHLQ $32, acc0
   371  	MULQ p256const1<>(SB)
   372  	SHRQ $32, t1
   373  	ADDQ acc0, acc1
   374  	ADCQ t1, acc2
   375  	ADCQ AX, acc3
   376  	ADCQ DX, acc4
   377  	ADCQ $0, acc5
   378  	XORQ acc0, acc0
   379  	// x * y[1]
   380  	MOVQ (8*1)(y_ptr), t0
   381  
   382  	MOVQ (8*0)(x_ptr), AX
   383  	MULQ t0
   384  	ADDQ AX, acc1
   385  	ADCQ $0, DX
   386  	MOVQ DX, t1
   387  
   388  	MOVQ (8*1)(x_ptr), AX
   389  	MULQ t0
   390  	ADDQ t1, acc2
   391  	ADCQ $0, DX
   392  	ADDQ AX, acc2
   393  	ADCQ $0, DX
   394  	MOVQ DX, t1
   395  
   396  	MOVQ (8*2)(x_ptr), AX
   397  	MULQ t0
   398  	ADDQ t1, acc3
   399  	ADCQ $0, DX
   400  	ADDQ AX, acc3
   401  	ADCQ $0, DX
   402  	MOVQ DX, t1
   403  
   404  	MOVQ (8*3)(x_ptr), AX
   405  	MULQ t0
   406  	ADDQ t1, acc4
   407  	ADCQ $0, DX
   408  	ADDQ AX, acc4
   409  	ADCQ DX, acc5
   410  	ADCQ $0, acc0
   411  	// Second reduction step
   412  	MOVQ acc1, AX
   413  	MOVQ acc1, t1
   414  	SHLQ $32, acc1
   415  	MULQ p256const1<>(SB)
   416  	SHRQ $32, t1
   417  	ADDQ acc1, acc2
   418  	ADCQ t1, acc3
   419  	ADCQ AX, acc4
   420  	ADCQ DX, acc5
   421  	ADCQ $0, acc0
   422  	XORQ acc1, acc1
   423  	// x * y[2]
   424  	MOVQ (8*2)(y_ptr), t0
   425  
   426  	MOVQ (8*0)(x_ptr), AX
   427  	MULQ t0
   428  	ADDQ AX, acc2
   429  	ADCQ $0, DX
   430  	MOVQ DX, t1
   431  
   432  	MOVQ (8*1)(x_ptr), AX
   433  	MULQ t0
   434  	ADDQ t1, acc3
   435  	ADCQ $0, DX
   436  	ADDQ AX, acc3
   437  	ADCQ $0, DX
   438  	MOVQ DX, t1
   439  
   440  	MOVQ (8*2)(x_ptr), AX
   441  	MULQ t0
   442  	ADDQ t1, acc4
   443  	ADCQ $0, DX
   444  	ADDQ AX, acc4
   445  	ADCQ $0, DX
   446  	MOVQ DX, t1
   447  
   448  	MOVQ (8*3)(x_ptr), AX
   449  	MULQ t0
   450  	ADDQ t1, acc5
   451  	ADCQ $0, DX
   452  	ADDQ AX, acc5
   453  	ADCQ DX, acc0
   454  	ADCQ $0, acc1
   455  	// Third reduction step
   456  	MOVQ acc2, AX
   457  	MOVQ acc2, t1
   458  	SHLQ $32, acc2
   459  	MULQ p256const1<>(SB)
   460  	SHRQ $32, t1
   461  	ADDQ acc2, acc3
   462  	ADCQ t1, acc4
   463  	ADCQ AX, acc5
   464  	ADCQ DX, acc0
   465  	ADCQ $0, acc1
   466  	XORQ acc2, acc2
   467  	// x * y[3]
   468  	MOVQ (8*3)(y_ptr), t0
   469  
   470  	MOVQ (8*0)(x_ptr), AX
   471  	MULQ t0
   472  	ADDQ AX, acc3
   473  	ADCQ $0, DX
   474  	MOVQ DX, t1
   475  
   476  	MOVQ (8*1)(x_ptr), AX
   477  	MULQ t0
   478  	ADDQ t1, acc4
   479  	ADCQ $0, DX
   480  	ADDQ AX, acc4
   481  	ADCQ $0, DX
   482  	MOVQ DX, t1
   483  
   484  	MOVQ (8*2)(x_ptr), AX
   485  	MULQ t0
   486  	ADDQ t1, acc5
   487  	ADCQ $0, DX
   488  	ADDQ AX, acc5
   489  	ADCQ $0, DX
   490  	MOVQ DX, t1
   491  
   492  	MOVQ (8*3)(x_ptr), AX
   493  	MULQ t0
   494  	ADDQ t1, acc0
   495  	ADCQ $0, DX
   496  	ADDQ AX, acc0
   497  	ADCQ DX, acc1
   498  	ADCQ $0, acc2
   499  	// Last reduction step
   500  	MOVQ acc3, AX
   501  	MOVQ acc3, t1
   502  	SHLQ $32, acc3
   503  	MULQ p256const1<>(SB)
   504  	SHRQ $32, t1
   505  	ADDQ acc3, acc4
   506  	ADCQ t1, acc5
   507  	ADCQ AX, acc0
   508  	ADCQ DX, acc1
   509  	ADCQ $0, acc2
        	// The result now lives in acc4, acc5, acc0, acc1 (low to high) with
        	// acc2 as overflow limb. x_ptr is dead and is used as scratch.
   510  	// Copy result [255:0]
   511  	MOVQ acc4, x_ptr
   512  	MOVQ acc5, acc3
   513  	MOVQ acc0, t0
   514  	MOVQ acc1, t1
   515  	// Subtract p256
   516  	SUBQ $-1, acc4
   517  	SBBQ p256const0<>(SB) ,acc5
   518  	SBBQ $0, acc0
   519  	SBBQ p256const1<>(SB), acc1
   520  	SBBQ $0, acc2
   521  
        	// A final borrow means the value was already below the modulus:
        	// restore the saved copy.
   522  	CMOVQCS x_ptr, acc4
   523  	CMOVQCS acc3, acc5
   524  	CMOVQCS t0, acc0
   525  	CMOVQCS t1, acc1
   526  
   527  	MOVQ acc4, (8*0)(res_ptr)
   528  	MOVQ acc5, (8*1)(res_ptr)
   529  	MOVQ acc0, (8*2)(res_ptr)
   530  	MOVQ acc1, (8*3)(res_ptr)
   531  
   532  	RET
   533  /* ---------------------------------------*/
   534  // func p256FromMont(res, in *p256Element)
        // Runs the four word-wise reduction steps used by p256Mul on the 4-limb
        // value at in, with no multiplication by a second operand, then applies one
        // conditional subtraction of the field modulus and stores the result at res.
        // NOTE(review): by its name this converts a value out of Montgomery form -
        // confirm against the callers.
   535  TEXT ·p256FromMont(SB),NOSPLIT,$0
   536  	MOVQ res+0(FP), res_ptr
   537  	MOVQ in+8(FP), x_ptr
   538  
   539  	MOVQ (8*0)(x_ptr), acc0
   540  	MOVQ (8*1)(x_ptr), acc1
   541  	MOVQ (8*2)(x_ptr), acc2
   542  	MOVQ (8*3)(x_ptr), acc3
   543  	XORQ acc4, acc4
   544  
   545  	// Only reduce, no multiplications are needed
        	// Each stage folds the lowest live limb into the limbs above it. The
        	// register freed by the previous stage is zeroed and becomes the new
        	// top limb.
   546  	// First stage
   547  	MOVQ acc0, AX
   548  	MOVQ acc0, t1
   549  	SHLQ $32, acc0
   550  	MULQ p256const1<>(SB)
   551  	SHRQ $32, t1
   552  	ADDQ acc0, acc1
   553  	ADCQ t1, acc2
   554  	ADCQ AX, acc3
   555  	ADCQ DX, acc4
   556  	XORQ acc5, acc5
   557  	// Second stage
   558  	MOVQ acc1, AX
   559  	MOVQ acc1, t1
   560  	SHLQ $32, acc1
   561  	MULQ p256const1<>(SB)
   562  	SHRQ $32, t1
   563  	ADDQ acc1, acc2
   564  	ADCQ t1, acc3
   565  	ADCQ AX, acc4
   566  	ADCQ DX, acc5
   567  	XORQ acc0, acc0
   568  	// Third stage
   569  	MOVQ acc2, AX
   570  	MOVQ acc2, t1
   571  	SHLQ $32, acc2
   572  	MULQ p256const1<>(SB)
   573  	SHRQ $32, t1
   574  	ADDQ acc2, acc3
   575  	ADCQ t1, acc4
   576  	ADCQ AX, acc5
   577  	ADCQ DX, acc0
   578  	XORQ acc1, acc1
   579  	// Last stage
   580  	MOVQ acc3, AX
   581  	MOVQ acc3, t1
   582  	SHLQ $32, acc3
   583  	MULQ p256const1<>(SB)
   584  	SHRQ $32, t1
   585  	ADDQ acc3, acc4
   586  	ADCQ t1, acc5
   587  	ADCQ AX, acc0
   588  	ADCQ DX, acc1
   589  
        	// Result limbs, low to high: acc4, acc5, acc0, acc1. Save a copy
        	// (x_ptr is dead and serves as scratch) before the trial subtraction.
   590  	MOVQ acc4, x_ptr
   591  	MOVQ acc5, acc3
   592  	MOVQ acc0, t0
   593  	MOVQ acc1, t1
   594  
        	// Trial subtraction of the field modulus {-1, p256const0, 0, p256const1}.
   595  	SUBQ $-1, acc4
   596  	SBBQ p256const0<>(SB), acc5
   597  	SBBQ $0, acc0
   598  	SBBQ p256const1<>(SB), acc1
   599  
        	// A borrow means the value was already below the modulus: restore it.
   600  	CMOVQCS x_ptr, acc4
   601  	CMOVQCS acc3, acc5
   602  	CMOVQCS t0, acc0
   603  	CMOVQCS t1, acc1
   604  
   605  	MOVQ acc4, (8*0)(res_ptr)
   606  	MOVQ acc5, (8*1)(res_ptr)
   607  	MOVQ acc0, (8*2)(res_ptr)
   608  	MOVQ acc1, (8*3)(res_ptr)
   609  
   610  	RET
   611  /* ---------------------------------------*/
   612  // func p256Select(res *P256Point, table *p256Table, idx int)
   613  // Reads all 16 table entries (96 bytes each) and keeps only entry idx-1, so
   614  // the memory access pattern does not depend on idx. An idx outside 1..16
   615  // (including 0) yields all zeros. Only the low 32 bits of idx are compared.
   616  TEXT ·p256Select(SB),NOSPLIT,$0
   617  	MOVQ res+0(FP),DX
   618  	MOVQ table+8(FP),DI
   619  	MOVQ idx+16(FP),AX
   620  	// X14 = idx replicated into every 32-bit lane.
   621  	MOVL AX, X14
   622  	PSHUFD $0, X14, X14
   623  	// X15 = 1 per lane, built as 0 - (-1). X13 = current entry number, from 1.
   624  	PCMPEQL X13, X13
   625  	PXOR X15, X15
   626  	PSUBL X13, X15
   627  	MOVOU X15, X13
   628  	// X0..X5 accumulate the selected entry. AX counts the remaining entries.
   629  	PXOR X0, X0
   630  	PXOR X1, X1
   631  	PXOR X2, X2
   632  	PXOR X3, X3
   633  	PXOR X4, X4
   634  	PXOR X5, X5
   635  	MOVQ $16, AX
   636  
   637  loop_select:
   638  		// X12 = all ones if this entry's number equals idx, else zero.
   639  		MOVOU X13, X12
   640  		PCMPEQL X14, X12
   641  		PADDL X15, X13
   642  		// Load, mask and fold in each 16-byte chunk of the entry.
   643  		MOVOU (16*0)(DI), X6
   644  		PAND X12, X6
   645  		PXOR X6, X0
   646  		MOVOU (16*1)(DI), X7
   647  		PAND X12, X7
   648  		PXOR X7, X1
   649  		MOVOU (16*2)(DI), X8
   650  		PAND X12, X8
   651  		PXOR X8, X2
   652  		MOVOU (16*3)(DI), X9
   653  		PAND X12, X9
   654  		PXOR X9, X3
   655  		MOVOU (16*4)(DI), X10
   656  		PAND X12, X10
   657  		PXOR X10, X4
   658  		MOVOU (16*5)(DI), X11
   659  		PAND X12, X11
   660  		PXOR X11, X5
   661  		ADDQ $(16*6), DI
   662  		DECQ AX
   663  		JNE loop_select
   664  
   665  	// Write out the selected entry (or zeros when nothing matched).
   666  	MOVOU X0, (16*0)(DX)
   667  	MOVOU X1, (16*1)(DX)
   668  	MOVOU X2, (16*2)(DX)
   669  	MOVOU X3, (16*3)(DX)
   670  	MOVOU X4, (16*4)(DX)
   671  	MOVOU X5, (16*5)(DX)
   672  	RET
   673  /* ---------------------------------------*/
   674  // func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
   675  // Reads all 32 table entries (64 bytes each, two per loop pass) and keeps
   676  // only entry idx-1, so the memory access pattern does not depend on idx.
   677  // An idx outside 1..32 (including 0) yields all zeros. Only the low 32 bits
   678  // of idx are compared.
   679  TEXT ·p256SelectAffine(SB),NOSPLIT,$0
   680  	MOVQ res+0(FP),DX
   681  	MOVQ table+8(FP),DI
   682  	MOVQ idx+16(FP),AX
   683  
   684  	// X14 = idx replicated into every 32-bit lane.
   685  	MOVL AX, X14
   686  	PSHUFD $0, X14, X14
   687  	// X15 = 1 per lane, built as 0 - (-1). X13 = current entry number, from 1.
   688  	PCMPEQL X13, X13
   689  	PXOR X15, X15
   690  	PSUBL X13, X15
   691  	MOVOU X15, X13
   692  	// X0..X3 accumulate the selected entry. AX counts the remaining pairs.
   693  	PXOR X0, X0
   694  	PXOR X1, X1
   695  	PXOR X2, X2
   696  	PXOR X3, X3
   697  	MOVQ $16, AX
   698  
   699  loop_select_base:
   700  		// First entry of the pair: X12 = all ones if its number equals idx.
   701  		MOVOU X13, X12
   702  		PCMPEQL X14, X12
   703  		PADDL X15, X13
   704  		MOVOU (16*0)(DI), X4
   705  		PAND X12, X4
   706  		PXOR X4, X0
   707  		MOVOU (16*1)(DI), X5
   708  		PAND X12, X5
   709  		PXOR X5, X1
   710  		MOVOU (16*2)(DI), X6
   711  		PAND X12, X6
   712  		PXOR X6, X2
   713  		MOVOU (16*3)(DI), X7
   714  		PAND X12, X7
   715  		PXOR X7, X3
   716  
   717  		// Second entry of the pair, with a mask for the next entry number.
   718  		MOVOU X13, X12
   719  		PCMPEQL X14, X12
   720  		PADDL X15, X13
   721  		MOVOU (16*4)(DI), X8
   722  		PAND X12, X8
   723  		PXOR X8, X0
   724  		MOVOU (16*5)(DI), X9
   725  		PAND X12, X9
   726  		PXOR X9, X1
   727  		MOVOU (16*6)(DI), X10
   728  		PAND X12, X10
   729  		PXOR X10, X2
   730  		MOVOU (16*7)(DI), X11
   731  		PAND X12, X11
   732  		PXOR X11, X3
   733  
   734  		ADDQ $(16*8), DI
   735  		DECQ AX
   736  		JNE loop_select_base
   737  
   738  	// Write out the selected entry (or zeros when nothing matched).
   739  	MOVOU X0, (16*0)(DX)
   740  	MOVOU X1, (16*1)(DX)
   741  	MOVOU X2, (16*2)(DX)
   742  	MOVOU X3, (16*3)(DX)
   743  
   744  	RET
   745  /* ---------------------------------------*/
   746  // func p256OrdMul(res, in1, in2 *p256OrdElement)
        // Multiplies the 4-limb values at in1 and in2 modulo p256ord and stores the
        // reduced 4-limb result at res. One limb of in2 is multiplied in at a time,
        // and each partial product is followed by one reduction step. A final
        // conditional subtraction of p256ord completes the reduction.
        // Each reduction step computes t0 = (lowest live limb * p256ordK0) mod 2^64
        // and adds t0 * p256ord to the accumulator.
        // NOTE(review): unlike p256Mul, the register of the lowest limb is not
        // zeroed before it is reused as the new top limb. The code relies on the
        // reduction step leaving it zero - confirm.
   747  TEXT ·p256OrdMul(SB),NOSPLIT,$0
   748  	MOVQ res+0(FP), res_ptr
   749  	MOVQ in1+8(FP), x_ptr
   750  	MOVQ in2+16(FP), y_ptr
   751  	// x * y[0]
   752  	MOVQ (8*0)(y_ptr), t0
   753  
   754  	MOVQ (8*0)(x_ptr), AX
   755  	MULQ t0
   756  	MOVQ AX, acc0
   757  	MOVQ DX, acc1
   758  
   759  	MOVQ (8*1)(x_ptr), AX
   760  	MULQ t0
   761  	ADDQ AX, acc1
   762  	ADCQ $0, DX
   763  	MOVQ DX, acc2
   764  
   765  	MOVQ (8*2)(x_ptr), AX
   766  	MULQ t0
   767  	ADDQ AX, acc2
   768  	ADCQ $0, DX
   769  	MOVQ DX, acc3
   770  
   771  	MOVQ (8*3)(x_ptr), AX
   772  	MULQ t0
   773  	ADDQ AX, acc3
   774  	ADCQ $0, DX
   775  	MOVQ DX, acc4
   776  	XORQ acc5, acc5
   777  	// First reduction step
   778  	MOVQ acc0, AX
   779  	MULQ p256ordK0<>(SB)
   780  	MOVQ AX, t0
   781  
   782  	MOVQ p256ord<>+0x00(SB), AX
   783  	MULQ t0
   784  	ADDQ AX, acc0
   785  	ADCQ $0, DX
   786  	MOVQ DX, t1
   787  
   788  	MOVQ p256ord<>+0x08(SB), AX
   789  	MULQ t0
   790  	ADDQ t1, acc1
   791  	ADCQ $0, DX
   792  	ADDQ AX, acc1
   793  	ADCQ $0, DX
   794  	MOVQ DX, t1
   795  
   796  	MOVQ p256ord<>+0x10(SB), AX
   797  	MULQ t0
   798  	ADDQ t1, acc2
   799  	ADCQ $0, DX
   800  	ADDQ AX, acc2
   801  	ADCQ $0, DX
   802  	MOVQ DX, t1
   803  
   804  	MOVQ p256ord<>+0x18(SB), AX
   805  	MULQ t0
   806  	ADDQ t1, acc3
   807  	ADCQ $0, DX
   808  	ADDQ AX, acc3
   809  	ADCQ DX, acc4
   810  	ADCQ $0, acc5
   811  	// x * y[1]
   812  	MOVQ (8*1)(y_ptr), t0
   813  
   814  	MOVQ (8*0)(x_ptr), AX
   815  	MULQ t0
   816  	ADDQ AX, acc1
   817  	ADCQ $0, DX
   818  	MOVQ DX, t1
   819  
   820  	MOVQ (8*1)(x_ptr), AX
   821  	MULQ t0
   822  	ADDQ t1, acc2
   823  	ADCQ $0, DX
   824  	ADDQ AX, acc2
   825  	ADCQ $0, DX
   826  	MOVQ DX, t1
   827  
   828  	MOVQ (8*2)(x_ptr), AX
   829  	MULQ t0
   830  	ADDQ t1, acc3
   831  	ADCQ $0, DX
   832  	ADDQ AX, acc3
   833  	ADCQ $0, DX
   834  	MOVQ DX, t1
   835  
   836  	MOVQ (8*3)(x_ptr), AX
   837  	MULQ t0
   838  	ADDQ t1, acc4
   839  	ADCQ $0, DX
   840  	ADDQ AX, acc4
   841  	ADCQ DX, acc5
   842  	ADCQ $0, acc0
   843  	// Second reduction step
   844  	MOVQ acc1, AX
   845  	MULQ p256ordK0<>(SB)
   846  	MOVQ AX, t0
   847  
   848  	MOVQ p256ord<>+0x00(SB), AX
   849  	MULQ t0
   850  	ADDQ AX, acc1
   851  	ADCQ $0, DX
   852  	MOVQ DX, t1
   853  
   854  	MOVQ p256ord<>+0x08(SB), AX
   855  	MULQ t0
   856  	ADDQ t1, acc2
   857  	ADCQ $0, DX
   858  	ADDQ AX, acc2
   859  	ADCQ $0, DX
   860  	MOVQ DX, t1
   861  
   862  	MOVQ p256ord<>+0x10(SB), AX
   863  	MULQ t0
   864  	ADDQ t1, acc3
   865  	ADCQ $0, DX
   866  	ADDQ AX, acc3
   867  	ADCQ $0, DX
   868  	MOVQ DX, t1
   869  
   870  	MOVQ p256ord<>+0x18(SB), AX
   871  	MULQ t0
   872  	ADDQ t1, acc4
   873  	ADCQ $0, DX
   874  	ADDQ AX, acc4
   875  	ADCQ DX, acc5
   876  	ADCQ $0, acc0
   877  	// x * y[2]
   878  	MOVQ (8*2)(y_ptr), t0
   879  
   880  	MOVQ (8*0)(x_ptr), AX
   881  	MULQ t0
   882  	ADDQ AX, acc2
   883  	ADCQ $0, DX
   884  	MOVQ DX, t1
   885  
   886  	MOVQ (8*1)(x_ptr), AX
   887  	MULQ t0
   888  	ADDQ t1, acc3
   889  	ADCQ $0, DX
   890  	ADDQ AX, acc3
   891  	ADCQ $0, DX
   892  	MOVQ DX, t1
   893  
   894  	MOVQ (8*2)(x_ptr), AX
   895  	MULQ t0
   896  	ADDQ t1, acc4
   897  	ADCQ $0, DX
   898  	ADDQ AX, acc4
   899  	ADCQ $0, DX
   900  	MOVQ DX, t1
   901  
   902  	MOVQ (8*3)(x_ptr), AX
   903  	MULQ t0
   904  	ADDQ t1, acc5
   905  	ADCQ $0, DX
   906  	ADDQ AX, acc5
   907  	ADCQ DX, acc0
   908  	ADCQ $0, acc1
   909  	// Third reduction step
   910  	MOVQ acc2, AX
   911  	MULQ p256ordK0<>(SB)
   912  	MOVQ AX, t0
   913  
   914  	MOVQ p256ord<>+0x00(SB), AX
   915  	MULQ t0
   916  	ADDQ AX, acc2
   917  	ADCQ $0, DX
   918  	MOVQ DX, t1
   919  
   920  	MOVQ p256ord<>+0x08(SB), AX
   921  	MULQ t0
   922  	ADDQ t1, acc3
   923  	ADCQ $0, DX
   924  	ADDQ AX, acc3
   925  	ADCQ $0, DX
   926  	MOVQ DX, t1
   927  
   928  	MOVQ p256ord<>+0x10(SB), AX
   929  	MULQ t0
   930  	ADDQ t1, acc4
   931  	ADCQ $0, DX
   932  	ADDQ AX, acc4
   933  	ADCQ $0, DX
   934  	MOVQ DX, t1
   935  
   936  	MOVQ p256ord<>+0x18(SB), AX
   937  	MULQ t0
   938  	ADDQ t1, acc5
   939  	ADCQ $0, DX
   940  	ADDQ AX, acc5
   941  	ADCQ DX, acc0
   942  	ADCQ $0, acc1
   943  	// x * y[3]
   944  	MOVQ (8*3)(y_ptr), t0
   945  
   946  	MOVQ (8*0)(x_ptr), AX
   947  	MULQ t0
   948  	ADDQ AX, acc3
   949  	ADCQ $0, DX
   950  	MOVQ DX, t1
   951  
   952  	MOVQ (8*1)(x_ptr), AX
   953  	MULQ t0
   954  	ADDQ t1, acc4
   955  	ADCQ $0, DX
   956  	ADDQ AX, acc4
   957  	ADCQ $0, DX
   958  	MOVQ DX, t1
   959  
   960  	MOVQ (8*2)(x_ptr), AX
   961  	MULQ t0
   962  	ADDQ t1, acc5
   963  	ADCQ $0, DX
   964  	ADDQ AX, acc5
   965  	ADCQ $0, DX
   966  	MOVQ DX, t1
   967  
   968  	MOVQ (8*3)(x_ptr), AX
   969  	MULQ t0
   970  	ADDQ t1, acc0
   971  	ADCQ $0, DX
   972  	ADDQ AX, acc0
   973  	ADCQ DX, acc1
   974  	ADCQ $0, acc2
   975  	// Last reduction step
   976  	MOVQ acc3, AX
   977  	MULQ p256ordK0<>(SB)
   978  	MOVQ AX, t0
   979  
   980  	MOVQ p256ord<>+0x00(SB), AX
   981  	MULQ t0
   982  	ADDQ AX, acc3
   983  	ADCQ $0, DX
   984  	MOVQ DX, t1
   985  
   986  	MOVQ p256ord<>+0x08(SB), AX
   987  	MULQ t0
   988  	ADDQ t1, acc4
   989  	ADCQ $0, DX
   990  	ADDQ AX, acc4
   991  	ADCQ $0, DX
   992  	MOVQ DX, t1
   993  
   994  	MOVQ p256ord<>+0x10(SB), AX
   995  	MULQ t0
   996  	ADDQ t1, acc5
   997  	ADCQ $0, DX
   998  	ADDQ AX, acc5
   999  	ADCQ $0, DX
  1000  	MOVQ DX, t1
  1001  
  1002  	MOVQ p256ord<>+0x18(SB), AX
  1003  	MULQ t0
  1004  	ADDQ t1, acc0
  1005  	ADCQ $0, DX
  1006  	ADDQ AX, acc0
  1007  	ADCQ DX, acc1
  1008  	ADCQ $0, acc2
        	// The result now lives in acc4, acc5, acc0, acc1 (low to high) with
        	// acc2 as overflow limb. x_ptr is dead and is used as scratch.
  1009  	// Copy result [255:0]
  1010  	MOVQ acc4, x_ptr
  1011  	MOVQ acc5, acc3
  1012  	MOVQ acc0, t0
  1013  	MOVQ acc1, t1
  1014  	// Subtract the modulus p256ord
  1015  	SUBQ p256ord<>+0x00(SB), acc4
  1016  	SBBQ p256ord<>+0x08(SB) ,acc5
  1017  	SBBQ p256ord<>+0x10(SB), acc0
  1018  	SBBQ p256ord<>+0x18(SB), acc1
  1019  	SBBQ $0, acc2
  1020  
        	// A final borrow means the value was already below p256ord:
        	// restore the saved copy.
  1021  	CMOVQCS x_ptr, acc4
  1022  	CMOVQCS acc3, acc5
  1023  	CMOVQCS t0, acc0
  1024  	CMOVQCS t1, acc1
  1025  
  1026  	MOVQ acc4, (8*0)(res_ptr)
  1027  	MOVQ acc5, (8*1)(res_ptr)
  1028  	MOVQ acc0, (8*2)(res_ptr)
  1029  	MOVQ acc1, (8*3)(res_ptr)
  1030  
  1031  	RET
  1032  /* ---------------------------------------*/
  1033  // func p256OrdSqr(res, in *p256OrdElement, n int)
        // Squares the 4-limb value at in n times in a row modulo p256ord, writing
        // each reduced result to res and using res as the input of the next round.
        // Every round forms the 512-bit square, runs four reduction steps and ends
        // with a conditional subtraction of p256ord.
        // The loop body runs before the counter is tested (DECQ/JNE at the bottom),
        // so n must be at least 1.
        // In the reduction steps only the two low limbs of p256ord are multiplied
        // with MULQ. The two high limbs are 2^64-1 and 2^64-2^32, so their products
        // with t0 are formed from t0 itself with shifts and subtractions.
  1034  TEXT ·p256OrdSqr(SB),NOSPLIT,$0
  1035  	MOVQ res+0(FP), res_ptr
  1036  	MOVQ in+8(FP), x_ptr
  1037  	MOVQ n+16(FP), BX
  1038  
  1039  ordSqrLoop:
  1040  
        	// Off-diagonal products y[i]*y[j], i < j, accumulated in acc1..acc5
        	// and y_ptr. They are doubled below.
  1041  	// y[1:] * y[0]
  1042  	MOVQ (8*0)(x_ptr), t0
  1043  
  1044  	MOVQ (8*1)(x_ptr), AX
  1045  	MULQ t0
  1046  	MOVQ AX, acc1
  1047  	MOVQ DX, acc2
  1048  
  1049  	MOVQ (8*2)(x_ptr), AX
  1050  	MULQ t0
  1051  	ADDQ AX, acc2
  1052  	ADCQ $0, DX
  1053  	MOVQ DX, acc3
  1054  
  1055  	MOVQ (8*3)(x_ptr), AX
  1056  	MULQ t0
  1057  	ADDQ AX, acc3
  1058  	ADCQ $0, DX
  1059  	MOVQ DX, acc4
  1060  	// y[2:] * y[1]
  1061  	MOVQ (8*1)(x_ptr), t0
  1062  
  1063  	MOVQ (8*2)(x_ptr), AX
  1064  	MULQ t0
  1065  	ADDQ AX, acc3
  1066  	ADCQ $0, DX
  1067  	MOVQ DX, t1
  1068  
  1069  	MOVQ (8*3)(x_ptr), AX
  1070  	MULQ t0
  1071  	ADDQ t1, acc4
  1072  	ADCQ $0, DX
  1073  	ADDQ AX, acc4
  1074  	ADCQ $0, DX
  1075  	MOVQ DX, acc5
  1076  	// y[3] * y[2]
  1077  	MOVQ (8*2)(x_ptr), t0
  1078  
  1079  	MOVQ (8*3)(x_ptr), AX
  1080  	MULQ t0
  1081  	ADDQ AX, acc5
  1082  	ADCQ $0, DX
        	// y_ptr (CX) is used as a seventh accumulator limb from here on.
  1083  	MOVQ DX, y_ptr
  1084  	XORQ t1, t1
  1085  	// *2
  1086  	ADDQ acc1, acc1
  1087  	ADCQ acc2, acc2
  1088  	ADCQ acc3, acc3
  1089  	ADCQ acc4, acc4
  1090  	ADCQ acc5, acc5
  1091  	ADCQ y_ptr, y_ptr
  1092  	ADCQ $0, t1
        	// Add the diagonal squares y[i]*y[i], which the doubling did not cover.
  1093  	// Missing products
  1094  	MOVQ (8*0)(x_ptr), AX
  1095  	MULQ AX
  1096  	MOVQ AX, acc0
  1097  	MOVQ DX, t0
  1098  
  1099  	MOVQ (8*1)(x_ptr), AX
  1100  	MULQ AX
  1101  	ADDQ t0, acc1
  1102  	ADCQ AX, acc2
  1103  	ADCQ $0, DX
  1104  	MOVQ DX, t0
  1105  
  1106  	MOVQ (8*2)(x_ptr), AX
  1107  	MULQ AX
  1108  	ADDQ t0, acc3
  1109  	ADCQ AX, acc4
  1110  	ADCQ $0, DX
  1111  	MOVQ DX, t0
  1112  
  1113  	MOVQ (8*3)(x_ptr), AX
  1114  	MULQ AX
  1115  	ADDQ t0, acc5
  1116  	ADCQ AX, y_ptr
  1117  	ADCQ DX, t1
        	// The input pointer is dead for this round: x_ptr now holds the top
        	// limb of the 512-bit square (acc0..acc5, y_ptr, x_ptr).
  1118  	MOVQ t1, x_ptr
  1119  	// First reduction step
  1120  	MOVQ acc0, AX
  1121  	MULQ p256ordK0<>(SB)
  1122  	MOVQ AX, t0
  1123  
  1124  	MOVQ p256ord<>+0x00(SB), AX
  1125  	MULQ t0
  1126  	ADDQ AX, acc0
  1127  	ADCQ $0, DX
  1128  	MOVQ DX, t1
  1129  
  1130  	MOVQ p256ord<>+0x08(SB), AX
  1131  	MULQ t0
  1132  	ADDQ t1, acc1
  1133  	ADCQ $0, DX
  1134  	ADDQ AX, acc1
  1135  
        	// Limb 2 of p256ord is 2^64-1: add t0 one limb higher (t1 starts as
        	// t0) and subtract t0 at this limb.
  1136  	MOVQ t0, t1
  1137  	ADCQ DX, acc2
  1138  	ADCQ $0, t1
  1139  	SUBQ t0, acc2
  1140  	SBBQ $0, t1
  1141  
        	// Limb 3 of p256ord is 2^64-2^32: add t0 one limb higher (acc0
        	// becomes the new top limb) and subtract t0<<32, split into DX:AX.
  1142  	MOVQ t0, AX
  1143  	MOVQ t0, DX
  1144  	MOVQ t0, acc0
  1145  	SHLQ $32, AX
  1146  	SHRQ $32, DX
  1147  
  1148  	ADDQ t1, acc3
  1149  	ADCQ $0, acc0
  1150  	SUBQ AX, acc3
  1151  	SBBQ DX, acc0
  1152  	// Second reduction step
  1153  	MOVQ acc1, AX
  1154  	MULQ p256ordK0<>(SB)
  1155  	MOVQ AX, t0
  1156  
  1157  	MOVQ p256ord<>+0x00(SB), AX
  1158  	MULQ t0
  1159  	ADDQ AX, acc1
  1160  	ADCQ $0, DX
  1161  	MOVQ DX, t1
  1162  
  1163  	MOVQ p256ord<>+0x08(SB), AX
  1164  	MULQ t0
  1165  	ADDQ t1, acc2
  1166  	ADCQ $0, DX
  1167  	ADDQ AX, acc2
  1168  
  1169  	MOVQ t0, t1
  1170  	ADCQ DX, acc3
  1171  	ADCQ $0, t1
  1172  	SUBQ t0, acc3
  1173  	SBBQ $0, t1
  1174  
  1175  	MOVQ t0, AX
  1176  	MOVQ t0, DX
  1177  	MOVQ t0, acc1
  1178  	SHLQ $32, AX
  1179  	SHRQ $32, DX
  1180  
  1181  	ADDQ t1, acc0
  1182  	ADCQ $0, acc1
  1183  	SUBQ AX, acc0
  1184  	SBBQ DX, acc1
  1185  	// Third reduction step
  1186  	MOVQ acc2, AX
  1187  	MULQ p256ordK0<>(SB)
  1188  	MOVQ AX, t0
  1189  
  1190  	MOVQ p256ord<>+0x00(SB), AX
  1191  	MULQ t0
  1192  	ADDQ AX, acc2
  1193  	ADCQ $0, DX
  1194  	MOVQ DX, t1
  1195  
  1196  	MOVQ p256ord<>+0x08(SB), AX
  1197  	MULQ t0
  1198  	ADDQ t1, acc3
  1199  	ADCQ $0, DX
  1200  	ADDQ AX, acc3
  1201  
  1202  	MOVQ t0, t1
  1203  	ADCQ DX, acc0
  1204  	ADCQ $0, t1
  1205  	SUBQ t0, acc0
  1206  	SBBQ $0, t1
  1207  
  1208  	MOVQ t0, AX
  1209  	MOVQ t0, DX
  1210  	MOVQ t0, acc2
  1211  	SHLQ $32, AX
  1212  	SHRQ $32, DX
  1213  
  1214  	ADDQ t1, acc1
  1215  	ADCQ $0, acc2
  1216  	SUBQ AX, acc1
  1217  	SBBQ DX, acc2
  1218  	// Last reduction step
  1219  	MOVQ acc3, AX
  1220  	MULQ p256ordK0<>(SB)
  1221  	MOVQ AX, t0
  1222  
  1223  	MOVQ p256ord<>+0x00(SB), AX
  1224  	MULQ t0
  1225  	ADDQ AX, acc3
  1226  	ADCQ $0, DX
  1227  	MOVQ DX, t1
  1228  
  1229  	MOVQ p256ord<>+0x08(SB), AX
  1230  	MULQ t0
  1231  	ADDQ t1, acc0
  1232  	ADCQ $0, DX
  1233  	ADDQ AX, acc0
        	// NOTE(review): this step differs from the three above. The carry is
        	// folded into DX here, and the MOVQ DX, t1 that follows is a dead
        	// store, because t1 is overwritten by MOVQ t0, t1 before it is read.
        	// The ADCQ DX, acc1 below then relies on the carry flag being clear
        	// after "ADCQ $0, DX" - confirm.
  1234  	ADCQ $0, DX
  1235  	MOVQ DX, t1
  1236  
  1237  	MOVQ t0, t1
  1238  	ADCQ DX, acc1
  1239  	ADCQ $0, t1
  1240  	SUBQ t0, acc1
  1241  	SBBQ $0, t1
  1242  
  1243  	MOVQ t0, AX
  1244  	MOVQ t0, DX
  1245  	MOVQ t0, acc3
  1246  	SHLQ $32, AX
  1247  	SHRQ $32, DX
  1248  
  1249  	ADDQ t1, acc2
  1250  	ADCQ $0, acc3
  1251  	SUBQ AX, acc2
  1252  	SBBQ DX, acc3
        	// XORQ zeroes t0 and also clears the carry flag, so the first ADCQ
        	// below starts a fresh carry chain.
  1253  	XORQ t0, t0
  1254  	// Add bits [511:256] of the sqr result
  1255  	ADCQ acc4, acc0
  1256  	ADCQ acc5, acc1
  1257  	ADCQ y_ptr, acc2
  1258  	ADCQ x_ptr, acc3
  1259  	ADCQ $0, t0
  1260  
        	// Keep a copy of the unreduced value for the conditional select below.
  1261  	MOVQ acc0, acc4
  1262  	MOVQ acc1, acc5
  1263  	MOVQ acc2, y_ptr
  1264  	MOVQ acc3, t1
  1265  	// Subtract the modulus p256ord
  1266  	SUBQ p256ord<>+0x00(SB), acc0
  1267  	SBBQ p256ord<>+0x08(SB) ,acc1
  1268  	SBBQ p256ord<>+0x10(SB), acc2
  1269  	SBBQ p256ord<>+0x18(SB), acc3
  1270  	SBBQ $0, t0
  1271  
        	// A final borrow means the value was already below p256ord:
        	// restore the saved copy.
  1272  	CMOVQCS acc4, acc0
  1273  	CMOVQCS acc5, acc1
  1274  	CMOVQCS y_ptr, acc2
  1275  	CMOVQCS t1, acc3
  1276  
  1277  	MOVQ acc0, (8*0)(res_ptr)
  1278  	MOVQ acc1, (8*1)(res_ptr)
  1279  	MOVQ acc2, (8*2)(res_ptr)
  1280  	MOVQ acc3, (8*3)(res_ptr)
        	// The next round squares the value just stored.
  1281  	MOVQ res_ptr, x_ptr
  1282  	DECQ BX
  1283  	JNE ordSqrLoop
  1284  
  1285  	RET
  1286  /* ---------------------------------------*/
        // Drop the register map used by the exported routines above.
  1287  #undef res_ptr
  1288  #undef x_ptr
  1289  #undef y_ptr
  1290  
  1291  #undef acc0
  1292  #undef acc1
  1293  #undef acc2
  1294  #undef acc3
  1295  #undef acc4
  1296  #undef acc5
  1297  #undef t0
  1298  #undef t1
  1299  /* ---------------------------------------*/
        // Register map for the internal helpers below, which take their operands
        // and return their results in registers rather than through the FP frame.
        // mul0/mul1 are AX/DX, the implicit operand and result registers of MULQ.
        // acc0..acc7 form an eight-limb accumulator. Note that acc0..acc5 name
        // different registers than in the map above.
        // t0..t3 hold a four-limb operand. t2 and t3 reuse DI and SI, which served
        // as res_ptr and x_ptr above. hlp (BP) is an extra scratch register.
  1300  #define mul0 AX
  1301  #define mul1 DX
  1302  #define acc0 BX
  1303  #define acc1 CX
  1304  #define acc2 R8
  1305  #define acc3 R9
  1306  #define acc4 R10
  1307  #define acc5 R11
  1308  #define acc6 R12
  1309  #define acc7 R13
  1310  #define t0 R14
  1311  #define t1 R15
  1312  #define t2 DI
  1313  #define t3 SI
  1314  #define hlp BP
  1315  /* ---------------------------------------*/
        // p256SubInternal computes [acc4..acc7] = [acc4..acc7] - [t0..t3] and adds
        // p256 = {-1, p256const0, 0, p256const1} back when the subtraction
        // borrowed. The choice is made with CMOV, not a branch.
        // Inputs:   acc4..acc7 (minuend), t0..t3 (subtrahend), little-endian limbs.
        // Output:   acc4..acc7.
        // Clobbers: mul0, acc0..acc3 and the flags; t0..t3 are left unchanged.
  1316  TEXT p256SubInternal(SB),NOSPLIT,$0
        	// mul0 becomes 0 (no borrow) or all ones (borrow) after the chain.
  1317  	XORQ mul0, mul0
  1318  	SUBQ t0, acc4
  1319  	SBBQ t1, acc5
  1320  	SBBQ t2, acc6
  1321  	SBBQ t3, acc7
  1322  	SBBQ $0, mul0
  1323  
        	// Keep the raw difference in acc0..acc3.
  1324  	MOVQ acc4, acc0
  1325  	MOVQ acc5, acc1
  1326  	MOVQ acc6, acc2
  1327  	MOVQ acc7, acc3
  1328  
        	// Speculatively add p256 to the difference.
  1329  	ADDQ $-1, acc4
  1330  	ADCQ p256const0<>(SB), acc5
  1331  	ADCQ $0, acc6
  1332  	ADCQ p256const1<>(SB), acc7
        	// ANDQ replaces the flags from the add: ZF is set when there was no borrow.
  1333  	ANDQ $1, mul0
  1334  
        	// No borrow: discard the speculative add and restore the raw difference.
  1335  	CMOVQEQ acc0, acc4
  1336  	CMOVQEQ acc1, acc5
  1337  	CMOVQEQ acc2, acc6
  1338  	CMOVQEQ acc3, acc7
  1339  
  1340  	RET
  1341  /* ---------------------------------------*/
        // p256MulInternal multiplies [acc4..acc7] by [t0..t3], runs four reduction
        // steps that each cancel the lowest remaining limb, adds the upper half of
        // the product and finishes with one conditional subtraction of p256.
        // Each reduction step adds limb*p256 to the running value. Because
        // p256 + 1 = 2^96 + p256const1 * 2^192, that amounts to adding limb<<96
        // (the SHLQ/SHRQ by 32 pair) and limb*p256const1 two limbs higher, after
        // which the lowest limb is zero and is dropped. The net effect is the
        // product divided by 2^256 modulo p256 (a Montgomery-style product).
        // Inputs:   acc4..acc7 and t0..t3, little-endian 64-bit limbs.
        // Output:   acc4..acc7.
        // Clobbers: mul0, mul1, acc0..acc3, hlp and the flags; t0..t3 are only read.
  1342  TEXT p256MulInternal(SB),NOSPLIT,$8
        	// Row 0: acc4 * [t0..t3] -> acc0..acc3, top limb replaces acc4.
  1343  	MOVQ acc4, mul0
  1344  	MULQ t0
  1345  	MOVQ mul0, acc0
  1346  	MOVQ mul1, acc1
  1347  
  1348  	MOVQ acc4, mul0
  1349  	MULQ t1
  1350  	ADDQ mul0, acc1
  1351  	ADCQ $0, mul1
  1352  	MOVQ mul1, acc2
  1353  
  1354  	MOVQ acc4, mul0
  1355  	MULQ t2
  1356  	ADDQ mul0, acc2
  1357  	ADCQ $0, mul1
  1358  	MOVQ mul1, acc3
  1359  
  1360  	MOVQ acc4, mul0
  1361  	MULQ t3
  1362  	ADDQ mul0, acc3
  1363  	ADCQ $0, mul1
  1364  	MOVQ mul1, acc4
  1365  
        	// Row 1: acc5 * [t0..t3] accumulated into acc1..acc4, top limb replaces
        	// acc5. hlp carries the high word of each partial product to the next.
  1366  	MOVQ acc5, mul0
  1367  	MULQ t0
  1368  	ADDQ mul0, acc1
  1369  	ADCQ $0, mul1
  1370  	MOVQ mul1, hlp
  1371  
  1372  	MOVQ acc5, mul0
  1373  	MULQ t1
  1374  	ADDQ hlp, acc2
  1375  	ADCQ $0, mul1
  1376  	ADDQ mul0, acc2
  1377  	ADCQ $0, mul1
  1378  	MOVQ mul1, hlp
  1379  
  1380  	MOVQ acc5, mul0
  1381  	MULQ t2
  1382  	ADDQ hlp, acc3
  1383  	ADCQ $0, mul1
  1384  	ADDQ mul0, acc3
  1385  	ADCQ $0, mul1
  1386  	MOVQ mul1, hlp
  1387  
  1388  	MOVQ acc5, mul0
  1389  	MULQ t3
  1390  	ADDQ hlp, acc4
  1391  	ADCQ $0, mul1
  1392  	ADDQ mul0, acc4
  1393  	ADCQ $0, mul1
  1394  	MOVQ mul1, acc5
  1395  
        	// Row 2: acc6 * [t0..t3] accumulated into acc2..acc5, top limb replaces acc6.
  1396  	MOVQ acc6, mul0
  1397  	MULQ t0
  1398  	ADDQ mul0, acc2
  1399  	ADCQ $0, mul1
  1400  	MOVQ mul1, hlp
  1401  
  1402  	MOVQ acc6, mul0
  1403  	MULQ t1
  1404  	ADDQ hlp, acc3
  1405  	ADCQ $0, mul1
  1406  	ADDQ mul0, acc3
  1407  	ADCQ $0, mul1
  1408  	MOVQ mul1, hlp
  1409  
  1410  	MOVQ acc6, mul0
  1411  	MULQ t2
  1412  	ADDQ hlp, acc4
  1413  	ADCQ $0, mul1
  1414  	ADDQ mul0, acc4
  1415  	ADCQ $0, mul1
  1416  	MOVQ mul1, hlp
  1417  
  1418  	MOVQ acc6, mul0
  1419  	MULQ t3
  1420  	ADDQ hlp, acc5
  1421  	ADCQ $0, mul1
  1422  	ADDQ mul0, acc5
  1423  	ADCQ $0, mul1
  1424  	MOVQ mul1, acc6
  1425  
        	// Row 3: acc7 * [t0..t3] accumulated into acc3..acc6, top limb replaces
        	// acc7. Afterwards acc0..acc7 hold the full 512-bit product.
  1426  	MOVQ acc7, mul0
  1427  	MULQ t0
  1428  	ADDQ mul0, acc3
  1429  	ADCQ $0, mul1
  1430  	MOVQ mul1, hlp
  1431  
  1432  	MOVQ acc7, mul0
  1433  	MULQ t1
  1434  	ADDQ hlp, acc4
  1435  	ADCQ $0, mul1
  1436  	ADDQ mul0, acc4
  1437  	ADCQ $0, mul1
  1438  	MOVQ mul1, hlp
  1439  
  1440  	MOVQ acc7, mul0
  1441  	MULQ t2
  1442  	ADDQ hlp, acc5
  1443  	ADCQ $0, mul1
  1444  	ADDQ mul0, acc5
  1445  	ADCQ $0, mul1
  1446  	MOVQ mul1, hlp
  1447  
  1448  	MOVQ acc7, mul0
  1449  	MULQ t3
  1450  	ADDQ hlp, acc6
  1451  	ADCQ $0, mul1
  1452  	ADDQ mul0, acc6
  1453  	ADCQ $0, mul1
  1454  	MOVQ mul1, acc7
        	// First reduction step: cancel acc0. acc0<<32 goes into acc1, acc0>>32
        	// into acc2, and acc0*p256const1 into acc3 and the new top limb, which
        	// is stored back in acc0.
  1455  	// First reduction step
  1456  	MOVQ acc0, mul0
  1457  	MOVQ acc0, hlp
  1458  	SHLQ $32, acc0
  1459  	MULQ p256const1<>(SB)
  1460  	SHRQ $32, hlp
  1461  	ADDQ acc0, acc1
  1462  	ADCQ hlp, acc2
  1463  	ADCQ mul0, acc3
  1464  	ADCQ $0, mul1
  1465  	MOVQ mul1, acc0
        	// Same pattern with the limb roles rotated by one: cancel acc1.
  1466  	// Second reduction step
  1467  	MOVQ acc1, mul0
  1468  	MOVQ acc1, hlp
  1469  	SHLQ $32, acc1
  1470  	MULQ p256const1<>(SB)
  1471  	SHRQ $32, hlp
  1472  	ADDQ acc1, acc2
  1473  	ADCQ hlp, acc3
  1474  	ADCQ mul0, acc0
  1475  	ADCQ $0, mul1
  1476  	MOVQ mul1, acc1
        	// Cancel acc2.
  1477  	// Third reduction step
  1478  	MOVQ acc2, mul0
  1479  	MOVQ acc2, hlp
  1480  	SHLQ $32, acc2
  1481  	MULQ p256const1<>(SB)
  1482  	SHRQ $32, hlp
  1483  	ADDQ acc2, acc3
  1484  	ADCQ hlp, acc0
  1485  	ADCQ mul0, acc1
  1486  	ADCQ $0, mul1
  1487  	MOVQ mul1, acc2
        	// Cancel acc3; acc0..acc3 then hold the reduced low half in natural order.
  1488  	// Last reduction step
  1489  	MOVQ acc3, mul0
  1490  	MOVQ acc3, hlp
  1491  	SHLQ $32, acc3
  1492  	MULQ p256const1<>(SB)
  1493  	SHRQ $32, hlp
  1494  	ADDQ acc3, acc0
  1495  	ADCQ hlp, acc1
  1496  	ADCQ mul0, acc2
  1497  	ADCQ $0, mul1
  1498  	MOVQ mul1, acc3
        	// MOVQ leaves the flags alone, so the first ADCQ below consumes the carry
        	// left by "ADCQ $0, mul1" above.
        	// NOTE(review): that carry looks like it is always clear, since the high
        	// word of a 64x64 product plus one cannot overflow - confirm.
  1499  	MOVQ $0, BP
  1500  	// Add bits [511:256] of the result
  1501  	ADCQ acc0, acc4
  1502  	ADCQ acc1, acc5
  1503  	ADCQ acc2, acc6
  1504  	ADCQ acc3, acc7
        	// hlp (BP) receives the carry out of the 256-bit addition.
  1505  	ADCQ $0, hlp
  1506  	// Copy result
  1507  	MOVQ acc4, acc0
  1508  	MOVQ acc5, acc1
  1509  	MOVQ acc6, acc2
  1510  	MOVQ acc7, acc3
  1511  	// Subtract p256
  1512  	SUBQ $-1, acc4
  1513  	SBBQ p256const0<>(SB) ,acc5
  1514  	SBBQ $0, acc6
  1515  	SBBQ p256const1<>(SB), acc7
  1516  	SBBQ $0, hlp
  1517  	// If the result of the subtraction is negative, restore the previous result
  1518  	CMOVQCS acc0, acc4
  1519  	CMOVQCS acc1, acc5
  1520  	CMOVQCS acc2, acc6
  1521  	CMOVQCS acc3, acc7
  1522  
  1523  	RET
  1524  /* ---------------------------------------*/
        // p256SqrInternal squares [acc4..acc7] and applies the same four-step
        // reduction and conditional subtraction of p256 as p256MulInternal.
        // The six off-diagonal limb products are computed once and doubled, then
        // the four limb squares are added, giving a 512-bit square held in
        // acc0, acc1, acc2, acc3, t0, t1, t2, t3 (least significant first).
        // Input:    acc4..acc7, little-endian 64-bit limbs.
        // Output:   acc4..acc7.
        // Clobbers: mul0, mul1, acc0..acc3, t0..t3, hlp and the flags. On return
        //           t0..t3 hold the value from before the final subtraction.
  1525  TEXT p256SqrInternal(SB),NOSPLIT,$8
  1526  
        	// Off-diagonal products: acc4*acc5, acc4*acc6, acc4*acc7.
  1527  	MOVQ acc4, mul0
  1528  	MULQ acc5
  1529  	MOVQ mul0, acc1
  1530  	MOVQ mul1, acc2
  1531  
  1532  	MOVQ acc4, mul0
  1533  	MULQ acc6
  1534  	ADDQ mul0, acc2
  1535  	ADCQ $0, mul1
  1536  	MOVQ mul1, acc3
  1537  
  1538  	MOVQ acc4, mul0
  1539  	MULQ acc7
  1540  	ADDQ mul0, acc3
  1541  	ADCQ $0, mul1
  1542  	MOVQ mul1, t0
  1543  
        	// acc5*acc6 and acc5*acc7; hlp carries the high word between them.
  1544  	MOVQ acc5, mul0
  1545  	MULQ acc6
  1546  	ADDQ mul0, acc3
  1547  	ADCQ $0, mul1
  1548  	MOVQ mul1, hlp
  1549  
  1550  	MOVQ acc5, mul0
  1551  	MULQ acc7
  1552  	ADDQ hlp, t0
  1553  	ADCQ $0, mul1
  1554  	ADDQ mul0, t0
  1555  	ADCQ $0, mul1
  1556  	MOVQ mul1, t1
  1557  
        	// acc6*acc7.
  1558  	MOVQ acc6, mul0
  1559  	MULQ acc7
  1560  	ADDQ mul0, t1
  1561  	ADCQ $0, mul1
  1562  	MOVQ mul1, t2
  1563  	XORQ t3, t3
        	// Double the off-diagonal sum; the bit shifted out lands in t3.
  1564  	// *2
  1565  	ADDQ acc1, acc1
  1566  	ADCQ acc2, acc2
  1567  	ADCQ acc3, acc3
  1568  	ADCQ t0, t0
  1569  	ADCQ t1, t1
  1570  	ADCQ t2, t2
  1571  	ADCQ $0, t3
        	// Add the limb squares. acc4 is reused to carry the high word of each
        	// square into the next pair of limbs once its own square has been taken.
  1572  	// Missing products
  1573  	MOVQ acc4, mul0
  1574  	MULQ mul0
  1575  	MOVQ mul0, acc0
  1576  	MOVQ DX, acc4
  1577  
  1578  	MOVQ acc5, mul0
  1579  	MULQ mul0
  1580  	ADDQ acc4, acc1
  1581  	ADCQ mul0, acc2
  1582  	ADCQ $0, DX
  1583  	MOVQ DX, acc4
  1584  
  1585  	MOVQ acc6, mul0
  1586  	MULQ mul0
  1587  	ADDQ acc4, acc3
  1588  	ADCQ mul0, t0
  1589  	ADCQ $0, DX
  1590  	MOVQ DX, acc4
  1591  
  1592  	MOVQ acc7, mul0
  1593  	MULQ mul0
  1594  	ADDQ acc4, t1
  1595  	ADCQ mul0, t2
  1596  	ADCQ DX, t3
        	// Each step below adds limb*p256 so the lowest limb cancels:
        	// limb<<96 via the 32-bit shifts, limb*p256const1 two limbs higher.
  1597  	// First reduction step
  1598  	MOVQ acc0, mul0
  1599  	MOVQ acc0, hlp
  1600  	SHLQ $32, acc0
  1601  	MULQ p256const1<>(SB)
  1602  	SHRQ $32, hlp
  1603  	ADDQ acc0, acc1
  1604  	ADCQ hlp, acc2
  1605  	ADCQ mul0, acc3
  1606  	ADCQ $0, mul1
  1607  	MOVQ mul1, acc0
  1608  	// Second reduction step
  1609  	MOVQ acc1, mul0
  1610  	MOVQ acc1, hlp
  1611  	SHLQ $32, acc1
  1612  	MULQ p256const1<>(SB)
  1613  	SHRQ $32, hlp
  1614  	ADDQ acc1, acc2
  1615  	ADCQ hlp, acc3
  1616  	ADCQ mul0, acc0
  1617  	ADCQ $0, mul1
  1618  	MOVQ mul1, acc1
  1619  	// Third reduction step
  1620  	MOVQ acc2, mul0
  1621  	MOVQ acc2, hlp
  1622  	SHLQ $32, acc2
  1623  	MULQ p256const1<>(SB)
  1624  	SHRQ $32, hlp
  1625  	ADDQ acc2, acc3
  1626  	ADCQ hlp, acc0
  1627  	ADCQ mul0, acc1
  1628  	ADCQ $0, mul1
  1629  	MOVQ mul1, acc2
  1630  	// Last reduction step
  1631  	MOVQ acc3, mul0
  1632  	MOVQ acc3, hlp
  1633  	SHLQ $32, acc3
  1634  	MULQ p256const1<>(SB)
  1635  	SHRQ $32, hlp
  1636  	ADDQ acc3, acc0
  1637  	ADCQ hlp, acc1
  1638  	ADCQ mul0, acc2
  1639  	ADCQ $0, mul1
  1640  	MOVQ mul1, acc3
        	// MOVQ does not touch the flags; the first ADCQ below consumes the carry
        	// left by "ADCQ $0, mul1" above.
        	// NOTE(review): presumably always clear here - confirm.
  1641  	MOVQ $0, BP
  1642  	// Add bits [511:256] of the result
  1643  	ADCQ acc0, t0
  1644  	ADCQ acc1, t1
  1645  	ADCQ acc2, t2
  1646  	ADCQ acc3, t3
        	// hlp (BP) receives the carry out of the 256-bit addition.
  1647  	ADCQ $0, hlp
  1648  	// Copy result
  1649  	MOVQ t0, acc4
  1650  	MOVQ t1, acc5
  1651  	MOVQ t2, acc6
  1652  	MOVQ t3, acc7
  1653  	// Subtract p256
  1654  	SUBQ $-1, acc4
  1655  	SBBQ p256const0<>(SB) ,acc5
  1656  	SBBQ $0, acc6
  1657  	SBBQ p256const1<>(SB), acc7
  1658  	SBBQ $0, hlp
  1659  	// If the result of the subtraction is negative, restore the previous result
  1660  	CMOVQCS t0, acc4
  1661  	CMOVQCS t1, acc5
  1662  	CMOVQCS t2, acc6
  1663  	CMOVQCS t3, acc7
  1664  
  1665  	RET
  1666  /* ---------------------------------------*/
        // p256MulBy2Inline doubles [acc4..acc7] and leaves the result, after one
        // conditional subtraction of p256, in [t0..t3].
        // On exit acc4..acc7 hold the low 256 bits of the unreduced double and
        // mul0 is clobbered (it tracks the carry out and then the borrow).
        // When the subtraction borrows (CF set), the unreduced value is kept.
  1667  #define p256MulBy2Inline\
  1668  	XORQ mul0, mul0;\
  1669  	ADDQ acc4, acc4;\
  1670  	ADCQ acc5, acc5;\
  1671  	ADCQ acc6, acc6;\
  1672  	ADCQ acc7, acc7;\
  1673  	ADCQ $0, mul0;\
  1674  	MOVQ acc4, t0;\
  1675  	MOVQ acc5, t1;\
  1676  	MOVQ acc6, t2;\
  1677  	MOVQ acc7, t3;\
  1678  	SUBQ $-1, t0;\
  1679  	SBBQ p256const0<>(SB), t1;\
  1680  	SBBQ $0, t2;\
  1681  	SBBQ p256const1<>(SB), t3;\
  1682  	SBBQ $0, mul0;\
  1683  	CMOVQCS acc4, t0;\
  1684  	CMOVQCS acc5, t1;\
  1685  	CMOVQCS acc6, t2;\
  1686  	CMOVQCS acc7, t3;
  1687  /* ---------------------------------------*/
        // p256AddInline adds [t0..t3] to [acc4..acc7] and leaves the result, after
        // one conditional subtraction of p256, in [t0..t3].
        // On exit acc4..acc7 hold the low 256 bits of the unreduced sum and mul0
        // is clobbered (it tracks the carry out and then the borrow).
        // When the subtraction borrows (CF set), the unreduced value is kept.
  1688  #define p256AddInline \
  1689  	XORQ mul0, mul0;\
  1690  	ADDQ t0, acc4;\
  1691  	ADCQ t1, acc5;\
  1692  	ADCQ t2, acc6;\
  1693  	ADCQ t3, acc7;\
  1694  	ADCQ $0, mul0;\
  1695  	MOVQ acc4, t0;\
  1696  	MOVQ acc5, t1;\
  1697  	MOVQ acc6, t2;\
  1698  	MOVQ acc7, t3;\
  1699  	SUBQ $-1, t0;\
  1700  	SBBQ p256const0<>(SB), t1;\
  1701  	SBBQ $0, t2;\
  1702  	SBBQ p256const1<>(SB), t3;\
  1703  	SBBQ $0, mul0;\
  1704  	CMOVQCS acc4, t0;\
  1705  	CMOVQCS acc5, t1;\
  1706  	CMOVQCS acc6, t2;\
  1707  	CMOVQCS acc7, t3;
  1708  /* ---------------------------------------*/
        // Four-limb move helpers. src/dst is the parenthesised name of one of the
        // off-parameterised stack-slot macros defined per function below, e.g.
        // "LDacc (z1in)".
        //   LDacc(src)  load a slot into acc4..acc7
        //   LDt(src)    load a slot into t0..t3
        //   ST(dst)     store acc4..acc7 to a slot
        //   STt(dst)    store t0..t3 to a slot
        //   acc2t       copy acc4..acc7 into t0..t3
        //   t2acc       copy t0..t3 into acc4..acc7
  1709  #define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7
  1710  #define LDt(src)   MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3
  1711  #define ST(dst)    MOVQ acc4, dst(8*0); MOVQ acc5, dst(8*1); MOVQ acc6, dst(8*2); MOVQ acc7, dst(8*3)
  1712  #define STt(dst)   MOVQ t0, dst(8*0); MOVQ t1, dst(8*1); MOVQ t2, dst(8*2); MOVQ t3, dst(8*3)
  1713  #define acc2t      MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3
  1714  #define t2acc      MOVQ t0, acc4; MOVQ t1, acc5; MOVQ t2, acc6; MOVQ t3, acc7
  1715  /* ---------------------------------------*/
        // Stack frame layout for p256PointAddAffineAsm: fifteen 32-byte slots
        // addressed relative to SP, followed by the saved result pointer and two
        // 4-byte slots holding the low 32 bits of sel and zero.
  1716  #define x1in(off) (32*0 + off)(SP)
  1717  #define y1in(off) (32*1 + off)(SP)
  1718  #define z1in(off) (32*2 + off)(SP)
  1719  #define x2in(off) (32*3 + off)(SP)
  1720  #define y2in(off) (32*4 + off)(SP)
  1721  #define xout(off) (32*5 + off)(SP)
  1722  #define yout(off) (32*6 + off)(SP)
  1723  #define zout(off) (32*7 + off)(SP)
  1724  #define s2(off)   (32*8 + off)(SP)
  1725  #define z1sqr(off) (32*9 + off)(SP)
  1726  #define h(off)	  (32*10 + off)(SP)
  1727  #define r(off)	  (32*11 + off)(SP)
  1728  #define hsqr(off) (32*12 + off)(SP)
  1729  #define rsqr(off) (32*13 + off)(SP)
  1730  #define hcub(off) (32*14 + off)(SP)
  1731  #define rptr	  (32*15)(SP)
  1732  #define sel_save  (32*15 + 8)(SP)
  1733  #define zero_save (32*15 + 8 + 4)(SP)
  1734  
        // p256PointAddAffineAsm adds the point in1 (three 32-byte coordinates
        // x, y, z) and the affine point in2 (two 32-byte coordinates x, y) and
        // writes three 32-byte coordinates to res.
        //   sign: if non-zero, in2's y coordinate is replaced by p256 - y first.
        //   sel:  if its low 32 bits are zero, res receives in1 unchanged.
        //   zero: if its low 32 bits are zero, res receives (in2.x, in2.y after
        //         the optional negation, p256one); this overrides the sel case.
        // Both selections are done with SSE masks rather than branches. Inputs are
        // copied to the stack first, so all work happens on the local copies.
  1735  // func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
  1736  TEXT ·p256PointAddAffineAsm(SB),0,$512-48
  1737  	// Move input to stack in order to free registers
  1738  	MOVQ res+0(FP), AX
  1739  	MOVQ in1+8(FP), BX
  1740  	MOVQ in2+16(FP), CX
  1741  	MOVQ sign+24(FP), DX
  1742  	MOVQ sel+32(FP), t1
  1743  	MOVQ zero+40(FP), t2
  1744  
        	// Copy in1 (x, y, z: 96 bytes) to the stack.
  1745  	MOVOU (16*0)(BX), X0
  1746  	MOVOU (16*1)(BX), X1
  1747  	MOVOU (16*2)(BX), X2
  1748  	MOVOU (16*3)(BX), X3
  1749  	MOVOU (16*4)(BX), X4
  1750  	MOVOU (16*5)(BX), X5
  1751  
  1752  	MOVOU X0, x1in(16*0)
  1753  	MOVOU X1, x1in(16*1)
  1754  	MOVOU X2, y1in(16*0)
  1755  	MOVOU X3, y1in(16*1)
  1756  	MOVOU X4, z1in(16*0)
  1757  	MOVOU X5, z1in(16*1)
  1758  
        	// Copy in2's x coordinate; y is handled separately below.
  1759  	MOVOU (16*0)(CX), X0
  1760  	MOVOU (16*1)(CX), X1
  1761  
  1762  	MOVOU X0, x2in(16*0)
  1763  	MOVOU X1, x2in(16*1)
        	// mul0 is AX, which still holds res. Only the low 32 bits of sel and
        	// zero are saved (MOVL).
  1764  	// Store pointer to result
  1765  	MOVQ mul0, rptr
  1766  	MOVL t1, sel_save
  1767  	MOVL t2, zero_save
  1768  	// Negate y2in based on sign
  1769  	MOVQ (16*2 + 8*0)(CX), acc4
  1770  	MOVQ (16*2 + 8*1)(CX), acc5
  1771  	MOVQ (16*2 + 8*2)(CX), acc6
  1772  	MOVQ (16*2 + 8*3)(CX), acc7
        	// acc0..acc3 = p256; from here on CX (acc1) no longer points at in2.
  1773  	MOVQ $-1, acc0
  1774  	MOVQ p256const0<>(SB), acc1
  1775  	MOVQ $0, acc2
  1776  	MOVQ p256const1<>(SB), acc3
  1777  	XORQ mul0, mul0
  1778  	// Speculatively subtract
  1779  	SUBQ acc4, acc0
  1780  	SBBQ acc5, acc1
  1781  	SBBQ acc6, acc2
  1782  	SBBQ acc7, acc3
  1783  	SBBQ $0, mul0
        	// t0..t3 = p256 - y2 (raw difference).
  1784  	MOVQ acc0, t0
  1785  	MOVQ acc1, t1
  1786  	MOVQ acc2, t2
  1787  	MOVQ acc3, t3
  1788  	// Add in case the operand was > p256
  1789  	ADDQ $-1, acc0
  1790  	ADCQ p256const0<>(SB), acc1
  1791  	ADCQ $0, acc2
  1792  	ADCQ p256const1<>(SB), acc3
        	// ZF now reflects mul0 (borrow from the subtract plus carry from the
        	// add); when it is non-zero the raw difference in t0..t3 is kept.
  1793  	ADCQ $0, mul0
  1794  	CMOVQNE t0, acc0
  1795  	CMOVQNE t1, acc1
  1796  	CMOVQNE t2, acc2
  1797  	CMOVQNE t3, acc3
        	// DX holds sign.
  1798  	// If condition is 0, keep original value
  1799  	TESTQ DX, DX
  1800  	CMOVQEQ acc4, acc0
  1801  	CMOVQEQ acc5, acc1
  1802  	CMOVQEQ acc6, acc2
  1803  	CMOVQEQ acc7, acc3
  1804  	// Store result
  1805  	MOVQ acc0, y2in(8*0)
  1806  	MOVQ acc1, y2in(8*1)
  1807  	MOVQ acc2, y2in(8*2)
  1808  	MOVQ acc3, y2in(8*3)
        	// The helpers take their first operand in acc4..acc7 and their second
        	// in t0..t3 and return in acc4..acc7, so each step below only loads
        	// the operand that is not already in place.
  1809  	// Begin point add
  1810  	LDacc (z1in)
  1811  	CALL p256SqrInternal(SB)	// z1^2
  1812  	ST (z1sqr)
  1813  
  1814  	LDt (x2in)
  1815  	CALL p256MulInternal(SB)	// x2 * z1^2
  1816  
  1817  	LDt (x1in)
  1818  	CALL p256SubInternal(SB)	// h = u2 - u1
  1819  	ST (h)
  1820  
  1821  	LDt (z1in)
  1822  	CALL p256MulInternal(SB)	// z3 = h * z1
  1823  	ST (zout)
  1824  
        	// t0..t3 still hold z1 (p256MulInternal does not modify them).
  1825  	LDacc (z1sqr)
  1826  	CALL p256MulInternal(SB)	// z1^3
  1827  
  1828  	LDt (y2in)
  1829  	CALL p256MulInternal(SB)	// s2 = y2 * z1^3
  1830  	ST (s2)
  1831  
  1832  	LDt (y1in)
  1833  	CALL p256SubInternal(SB)	// r = s2 - s1
  1834  	ST (r)
  1835  
  1836  	CALL p256SqrInternal(SB)	// rsqr = r^2
  1837  	ST (rsqr)
  1838  
  1839  	LDacc (h)
  1840  	CALL p256SqrInternal(SB)	// hsqr = h^2
  1841  	ST (hsqr)
  1842  
  1843  	LDt (h)
  1844  	CALL p256MulInternal(SB)	// hcub = h^3
  1845  	ST (hcub)
  1846  
        	// The s2 slot is reused for y1 * h^3.
  1847  	LDt (y1in)
  1848  	CALL p256MulInternal(SB)	// y1 * h^3
  1849  	ST (s2)
  1850  
        	// The h slot is reused for u1 * h^2.
  1851  	LDacc (x1in)
  1852  	LDt (hsqr)
  1853  	CALL p256MulInternal(SB)	// u1 * h^2
  1854  	ST (h)
  1855  
  1856  	p256MulBy2Inline			// u1 * h^2 * 2, inline
  1857  	LDacc (rsqr)
  1858  	CALL p256SubInternal(SB)	// r^2 - u1 * h^2 * 2
  1859  
  1860  	LDt (hcub)
  1861  	CALL p256SubInternal(SB)	// x3 = r^2 - u1 * h^2 * 2 - h^3
  1862  	ST (xout)
  1863  
        	// t = x3, acc = u1 * h^2 (from the reused h slot).
  1864  	MOVQ acc4, t0
  1865  	MOVQ acc5, t1
  1866  	MOVQ acc6, t2
  1867  	MOVQ acc7, t3
  1868  	LDacc (h)
  1869  	CALL p256SubInternal(SB)	// u1 * h^2 - x3
  1870  
  1871  	LDt (r)
  1872  	CALL p256MulInternal(SB)	// r * (u1 * h^2 - x3)
  1873  
  1874  	LDt (s2)
  1875  	CALL p256SubInternal(SB)	// y3 = r * (u1 * h^2 - x3) - y1 * h^3
  1876  	ST (yout)
  1877  	// Load stored values from stack
  1878  	MOVQ rptr, AX
  1879  	MOVL sel_save, BX
  1880  	MOVL zero_save, CX
  1881  	// The result is not valid if (sel == 0), conditional choose
  1882  	MOVOU xout(16*0), X0
  1883  	MOVOU xout(16*1), X1
  1884  	MOVOU yout(16*0), X2
  1885  	MOVOU yout(16*1), X3
  1886  	MOVOU zout(16*0), X4
  1887  	MOVOU zout(16*1), X5
  1888  
  1889  	MOVL BX, X6
  1890  	MOVL CX, X7
  1891  
        	// X8 = all zeros, X9 = all ones.
  1892  	PXOR X8, X8
  1893  	PCMPEQL X9, X9
  1894  
        	// Broadcast sel and zero to every 32-bit lane.
  1895  	PSHUFD $0, X6, X6
  1896  	PSHUFD $0, X7, X7
  1897  
        	// X6 = all ones if sel == 0, X7 = all ones if zero == 0.
  1898  	PCMPEQL X8, X6
  1899  	PCMPEQL X8, X7
  1900  
        	// X15 = NOT X6: keeps the computed sum when sel != 0.
  1901  	MOVOU X6, X15
  1902  	PANDN X9, X15
  1903  
  1904  	MOVOU x1in(16*0), X9
  1905  	MOVOU x1in(16*1), X10
  1906  	MOVOU y1in(16*0), X11
  1907  	MOVOU y1in(16*1), X12
  1908  	MOVOU z1in(16*0), X13
  1909  	MOVOU z1in(16*1), X14
  1910  
  1911  	PAND X15, X0
  1912  	PAND X15, X1
  1913  	PAND X15, X2
  1914  	PAND X15, X3
  1915  	PAND X15, X4
  1916  	PAND X15, X5
  1917  
  1918  	PAND X6, X9
  1919  	PAND X6, X10
  1920  	PAND X6, X11
  1921  	PAND X6, X12
  1922  	PAND X6, X13
  1923  	PAND X6, X14
  1924  
        	// Exactly one side of each pair is non-zero, so XOR merges them.
  1925  	PXOR X9, X0
  1926  	PXOR X10, X1
  1927  	PXOR X11, X2
  1928  	PXOR X12, X3
  1929  	PXOR X13, X4
  1930  	PXOR X14, X5
        	// When zero == 0 the output becomes (x2, y2, p256one) instead.
  1931  	// Similarly if zero == 0
  1932  	PCMPEQL X9, X9
  1933  	MOVOU X7, X15
  1934  	PANDN X9, X15
  1935  
  1936  	MOVOU x2in(16*0), X9
  1937  	MOVOU x2in(16*1), X10
  1938  	MOVOU y2in(16*0), X11
  1939  	MOVOU y2in(16*1), X12
  1940  	MOVOU p256one<>+0x00(SB), X13
  1941  	MOVOU p256one<>+0x10(SB), X14
  1942  
  1943  	PAND X15, X0
  1944  	PAND X15, X1
  1945  	PAND X15, X2
  1946  	PAND X15, X3
  1947  	PAND X15, X4
  1948  	PAND X15, X5
  1949  
  1950  	PAND X7, X9
  1951  	PAND X7, X10
  1952  	PAND X7, X11
  1953  	PAND X7, X12
  1954  	PAND X7, X13
  1955  	PAND X7, X14
  1956  
  1957  	PXOR X9, X0
  1958  	PXOR X10, X1
  1959  	PXOR X11, X2
  1960  	PXOR X12, X3
  1961  	PXOR X13, X4
  1962  	PXOR X14, X5
  1963  	// Finally output the result
  1964  	MOVOU X0, (16*0)(AX)
  1965  	MOVOU X1, (16*1)(AX)
  1966  	MOVOU X2, (16*2)(AX)
  1967  	MOVOU X3, (16*3)(AX)
  1968  	MOVOU X4, (16*4)(AX)
  1969  	MOVOU X5, (16*5)(AX)
        	// Clear the saved result pointer.
        	// NOTE(review): presumably so no stale pointer is left in the frame - confirm.
  1970  	MOVQ $0, rptr
  1971  
  1972  	RET
  1973  #undef x1in
  1974  #undef y1in
  1975  #undef z1in
  1976  #undef x2in
  1977  #undef y2in
  1978  #undef xout
  1979  #undef yout
  1980  #undef zout
  1981  #undef s2
  1982  #undef z1sqr
  1983  #undef h
  1984  #undef r
  1985  #undef hsqr
  1986  #undef rsqr
  1987  #undef hcub
  1988  #undef rptr
  1989  #undef sel_save
  1990  #undef zero_save
  1991  
  1992  // p256IsZero returns 1 in AX if [acc4..acc7] represents zero and zero
  1993  // otherwise. Both the all-zero value and p256 itself count as zero.
        // It overwrites acc4, acc5, acc7, t0 and t1 (acc6 is only read), so the
        // caller must treat [acc4..acc7] as clobbered. The result is selected
        // with CMOV rather than a branch.
  1994  TEXT p256IsZero(SB),NOSPLIT,$0
  1995  	// AX is the result: it becomes 1 if the input is zero.
  1996  	XORQ AX, AX
  1997  	MOVQ $1, t1
  1998  
  1999  	// Check whether [acc4..acc7] are all zero.
  2000  	MOVQ acc4, t0
  2001  	ORQ acc5, t0
  2002  	ORQ acc6, t0
  2003  	ORQ acc7, t0
  2004  
  2005  	// Set AX to 1 if so (ZF comes from the last ORQ). (CMOV of a constant to
  2006  	// a register doesn't appear to be supported in Go. Thus t1 = 1.)
  2007  	CMOVQEQ t1, AX
  2008  
  2009  	// XOR [acc4..acc7] with P and compare with zero again. The acc6 limb of P
        	// is zero, so acc6 needs no XOR.
  2010  	XORQ $-1, acc4
  2011  	XORQ p256const0<>(SB), acc5
  2012  	XORQ p256const1<>(SB), acc7
  2013  	ORQ acc5, acc4
  2014  	ORQ acc6, acc4
  2015  	ORQ acc7, acc4
  2016  
  2017  	// Set AX to 1 if the input equalled P.
  2018  	CMOVQEQ t1, AX
  2019  	RET
  2020  
  2021  /* ---------------------------------------*/
        // Stack frame layout for p256PointAddAsm: twenty 32-byte slots addressed
        // relative to SP, followed by the saved result pointer and the 8-byte
        // points_eq flag.
  2022  #define x1in(off) (32*0 + off)(SP)
  2023  #define y1in(off) (32*1 + off)(SP)
  2024  #define z1in(off) (32*2 + off)(SP)
  2025  #define x2in(off) (32*3 + off)(SP)
  2026  #define y2in(off) (32*4 + off)(SP)
  2027  #define z2in(off) (32*5 + off)(SP)
  2028  
  2029  #define xout(off) (32*6 + off)(SP)
  2030  #define yout(off) (32*7 + off)(SP)
  2031  #define zout(off) (32*8 + off)(SP)
  2032  
  2033  #define u1(off)    (32*9 + off)(SP)
  2034  #define u2(off)    (32*10 + off)(SP)
  2035  #define s1(off)    (32*11 + off)(SP)
  2036  #define s2(off)    (32*12 + off)(SP)
  2037  #define z1sqr(off) (32*13 + off)(SP)
  2038  #define z2sqr(off) (32*14 + off)(SP)
  2039  #define h(off)     (32*15 + off)(SP)
  2040  #define r(off)     (32*16 + off)(SP)
  2041  #define hsqr(off)  (32*17 + off)(SP)
  2042  #define rsqr(off)  (32*18 + off)(SP)
  2043  #define hcub(off)  (32*19 + off)(SP)
  2044  #define rptr       (32*20)(SP)
  2045  #define points_eq  (32*20+8)(SP)
  2046  
        // p256PointAddAsm adds the points in1 and in2 (three 32-byte coordinates
        // x, y, z each) and writes three 32-byte coordinates to res.
        // It returns p256IsZero(r) AND p256IsZero(h), i.e. 1 when both
        // r = s2 - s1 and h = u2 - u1 are zero and 0 otherwise. The sum is
        // written to res in either case.
        // NOTE(review): a return of 1 presumably means the inputs were the same
        // point and the caller must use doubling instead - confirm against caller.
  2047  //func p256PointAddAsm(res, in1, in2 *P256Point) int
  2048  TEXT ·p256PointAddAsm(SB),0,$680-32
  2049  	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
  2050  	// Move input to stack in order to free registers
  2051  	MOVQ res+0(FP), AX
  2052  	MOVQ in1+8(FP), BX
  2053  	MOVQ in2+16(FP), CX
  2054  
  2055  	MOVOU (16*0)(BX), X0
  2056  	MOVOU (16*1)(BX), X1
  2057  	MOVOU (16*2)(BX), X2
  2058  	MOVOU (16*3)(BX), X3
  2059  	MOVOU (16*4)(BX), X4
  2060  	MOVOU (16*5)(BX), X5
  2061  
  2062  	MOVOU X0, x1in(16*0)
  2063  	MOVOU X1, x1in(16*1)
  2064  	MOVOU X2, y1in(16*0)
  2065  	MOVOU X3, y1in(16*1)
  2066  	MOVOU X4, z1in(16*0)
  2067  	MOVOU X5, z1in(16*1)
  2068  
  2069  	MOVOU (16*0)(CX), X0
  2070  	MOVOU (16*1)(CX), X1
  2071  	MOVOU (16*2)(CX), X2
  2072  	MOVOU (16*3)(CX), X3
  2073  	MOVOU (16*4)(CX), X4
  2074  	MOVOU (16*5)(CX), X5
  2075  
  2076  	MOVOU X0, x2in(16*0)
  2077  	MOVOU X1, x2in(16*1)
  2078  	MOVOU X2, y2in(16*0)
  2079  	MOVOU X3, y2in(16*1)
  2080  	MOVOU X4, z2in(16*0)
  2081  	MOVOU X5, z2in(16*1)
  2082  	// Store pointer to result
  2083  	MOVQ AX, rptr
        	// The helpers take their first operand in acc4..acc7 and their second
        	// in t0..t3 and return in acc4..acc7, so each step below only loads
        	// the operand that is not already in place.
  2084  	// Begin point add
  2085  	LDacc (z2in)
  2086  	CALL p256SqrInternal(SB)	// z2^2
  2087  	ST (z2sqr)
  2088  	LDt (z2in)
  2089  	CALL p256MulInternal(SB)	// z2^3
  2090  	LDt (y1in)
  2091  	CALL p256MulInternal(SB)	// s1 = z2^3*y1
  2092  	ST (s1)
  2093  
  2094  	LDacc (z1in)
  2095  	CALL p256SqrInternal(SB)	// z1^2
  2096  	ST (z1sqr)
  2097  	LDt (z1in)
  2098  	CALL p256MulInternal(SB)	// z1^3
  2099  	LDt (y2in)
  2100  	CALL p256MulInternal(SB)	// s2 = z1^3*y2
  2101  	ST (s2)
  2102  
  2103  	LDt (s1)
  2104  	CALL p256SubInternal(SB)	// r = s2 - s1
  2105  	ST (r)
        	// points_eq = (r is zero). p256IsZero clobbers acc4..acc7, which is
        	// fine because the next step reloads them.
  2106  	CALL p256IsZero(SB)
  2107  	MOVQ AX, points_eq
  2108  
  2109  	LDacc (z2sqr)
  2110  	LDt (x1in)
  2111  	CALL p256MulInternal(SB)	// u1 = x1 * z2^2
  2112  	ST (u1)
  2113  	LDacc (z1sqr)
  2114  	LDt (x2in)
  2115  	CALL p256MulInternal(SB)	// u2 = x2 * z1^2
  2116  	ST (u2)
  2117  
  2118  	LDt (u1)
  2119  	CALL p256SubInternal(SB)	// h = u2 - u1
  2120  	ST (h)
        	// points_eq &= (h is zero).
  2121  	CALL p256IsZero(SB)
  2122  	ANDQ points_eq, AX
  2123  	MOVQ AX, points_eq
  2124  
  2125  	LDacc (r)
  2126  	CALL p256SqrInternal(SB)	// rsqr = r^2
  2127  	ST (rsqr)
  2128  
  2129  	LDacc (h)
  2130  	CALL p256SqrInternal(SB)	// hsqr = h^2
  2131  	ST (hsqr)
  2132  
  2133  	LDt (h)
  2134  	CALL p256MulInternal(SB)	// hcub = h^3
  2135  	ST (hcub)
  2136  
        	// The s2 slot is reused for s1 * h^3.
  2137  	LDt (s1)
  2138  	CALL p256MulInternal(SB)	// s1 * h^3
  2139  	ST (s2)
  2140  
  2141  	LDacc (z1in)
  2142  	LDt (z2in)
  2143  	CALL p256MulInternal(SB)	// z1 * z2
  2144  	LDt (h)
  2145  	CALL p256MulInternal(SB)	// z1 * z2 * h
  2146  	ST (zout)
  2147  
        	// The u2 slot is reused for u1 * h^2.
  2148  	LDacc (hsqr)
  2149  	LDt (u1)
  2150  	CALL p256MulInternal(SB)	// h^2 * u1
  2151  	ST (u2)
  2152  
  2153  	p256MulBy2Inline	// u1 * h^2 * 2, inline
  2154  	LDacc (rsqr)
  2155  	CALL p256SubInternal(SB)	// r^2 - u1 * h^2 * 2
  2156  
  2157  	LDt (hcub)
  2158  	CALL p256SubInternal(SB)	// x3 = r^2 - u1 * h^2 * 2 - h^3
  2159  	ST (xout)
  2160  
        	// t = x3, acc = u1 * h^2 (from the reused u2 slot).
  2161  	MOVQ acc4, t0
  2162  	MOVQ acc5, t1
  2163  	MOVQ acc6, t2
  2164  	MOVQ acc7, t3
  2165  	LDacc (u2)
  2166  	CALL p256SubInternal(SB)	// u1 * h^2 - x3
  2167  
  2168  	LDt (r)
  2169  	CALL p256MulInternal(SB)	// r * (u1 * h^2 - x3)
  2170  
  2171  	LDt (s2)
  2172  	CALL p256SubInternal(SB)	// y3 = r * (u1 * h^2 - x3) - s1 * h^3
  2173  	ST (yout)
  2174  
  2175  	MOVOU xout(16*0), X0
  2176  	MOVOU xout(16*1), X1
  2177  	MOVOU yout(16*0), X2
  2178  	MOVOU yout(16*1), X3
  2179  	MOVOU zout(16*0), X4
  2180  	MOVOU zout(16*1), X5
  2181  	// Finally output the result
  2182  	MOVQ rptr, AX
        	// Clear the saved result pointer.
        	// NOTE(review): presumably so no stale pointer is left in the frame - confirm.
  2183  	MOVQ $0, rptr
  2184  	MOVOU X0, (16*0)(AX)
  2185  	MOVOU X1, (16*1)(AX)
  2186  	MOVOU X2, (16*2)(AX)
  2187  	MOVOU X3, (16*3)(AX)
  2188  	MOVOU X4, (16*4)(AX)
  2189  	MOVOU X5, (16*5)(AX)
  2190  
        	// Return the points_eq flag.
  2191  	MOVQ points_eq, AX
  2192  	MOVQ AX, ret+24(FP)
  2193  
  2194  	RET
  2195  #undef x1in
  2196  #undef y1in
  2197  #undef z1in
  2198  #undef x2in
  2199  #undef y2in
  2200  #undef z2in
  2201  #undef xout
  2202  #undef yout
  2203  #undef zout
  2204  #undef s1
  2205  #undef s2
  2206  #undef u1
  2207  #undef u2
  2208  #undef z1sqr
  2209  #undef z2sqr
  2210  #undef h
  2211  #undef r
  2212  #undef hsqr
  2213  #undef rsqr
  2214  #undef hcub
  2215  #undef rptr
        // points_eq is local to this function as well; release it with the other
        // stack-slot macros so it does not leak into the code that follows.
        #undef points_eq
  2216  /* ---------------------------------------*/
        // Stack frame layout for p256PointDoubleAsm: seven 32-byte slots addressed
        // relative to SP, followed by the saved result pointer.
  2217  #define x(off) (32*0 + off)(SP)
  2218  #define y(off) (32*1 + off)(SP)
  2219  #define z(off) (32*2 + off)(SP)
  2220  
  2221  #define s(off)	(32*3 + off)(SP)
  2222  #define m(off)	(32*4 + off)(SP)
  2223  #define zsqr(off) (32*5 + off)(SP)
  2224  #define tmp(off)  (32*6 + off)(SP)
  2225  #define rptr	  (32*7)(SP)
  2226  
        // p256PointDoubleAsm doubles the point in (three 32-byte coordinates
        // x, y, z) and writes three 32-byte coordinates to res. The input is
        // copied to the stack first; the output coordinates are stored to res as
        // soon as each is available (z first, then x, then y).
        // Values computed, in terms of the field helpers:
        //   z3 = 2*y*z
        //   m  = 3*(x - z^2)*(x + z^2)
        //   s  = x*(2*y)^2
        //   x3 = m^2 - 2*s
        //   y3 = m*(s - x3) - ((2*y)^4)/2
  2227  //func p256PointDoubleAsm(res, in *P256Point)
  2228  TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$256-16
  2229  	// Move input to stack in order to free registers
  2230  	MOVQ res+0(FP), AX
  2231  	MOVQ in+8(FP), BX
  2232  
  2233  	MOVOU (16*0)(BX), X0
  2234  	MOVOU (16*1)(BX), X1
  2235  	MOVOU (16*2)(BX), X2
  2236  	MOVOU (16*3)(BX), X3
  2237  	MOVOU (16*4)(BX), X4
  2238  	MOVOU (16*5)(BX), X5
  2239  
  2240  	MOVOU X0, x(16*0)
  2241  	MOVOU X1, x(16*1)
  2242  	MOVOU X2, y(16*0)
  2243  	MOVOU X3, y(16*1)
  2244  	MOVOU X4, z(16*0)
  2245  	MOVOU X5, z(16*1)
  2246  	// Store pointer to result
  2247  	MOVQ AX, rptr
  2248  	// Begin point double
        	// zsqr = z^2
  2249  	LDacc (z)
  2250  	CALL p256SqrInternal(SB)
  2251  	ST (zsqr)
  2252  
        	// m = x + z^2 (p256AddInline leaves its result in t0..t3)
  2253  	LDt (x)
  2254  	p256AddInline
  2255  	STt (m)
  2256  
        	// z3 = 2*y*z (p256MulBy2Inline leaves its result in t0..t3)
  2257  	LDacc (z)
  2258  	LDt (y)
  2259  	CALL p256MulInternal(SB)
  2260  	p256MulBy2Inline
  2261  	MOVQ rptr, AX
  2262  	// Store z
  2263  	MOVQ t0, (16*4 + 8*0)(AX)
  2264  	MOVQ t1, (16*4 + 8*1)(AX)
  2265  	MOVQ t2, (16*4 + 8*2)(AX)
  2266  	MOVQ t3, (16*4 + 8*3)(AX)
  2267  
        	// m = (x - z^2) * (x + z^2)
  2268  	LDacc (x)
  2269  	LDt (zsqr)
  2270  	CALL p256SubInternal(SB)
  2271  	LDt (m)
  2272  	CALL p256MulInternal(SB)
  2273  	ST (m)
        	// m = 2*m + m
  2274  	// Multiply by 3
  2275  	p256MulBy2Inline
  2276  	LDacc (m)
  2277  	p256AddInline
  2278  	STt (m)
  2279  	////////////////////////
        	// s = (2*y)^2, then acc = (2*y)^4
  2280  	LDacc (y)
  2281  	p256MulBy2Inline
  2282  	t2acc
  2283  	CALL p256SqrInternal(SB)
  2284  	ST (s)
  2285  	CALL p256SqrInternal(SB)
        	// Halve modulo p256: if the value is odd, add p256 first so the sum is
        	// even, then shift the 257-bit value (carry in mul0) right by one.
  2286  	// Divide by 2
  2287  	XORQ mul0, mul0
  2288  	MOVQ acc4, t0
  2289  	MOVQ acc5, t1
  2290  	MOVQ acc6, t2
  2291  	MOVQ acc7, t3
  2292  
  2293  	ADDQ $-1, acc4
  2294  	ADCQ p256const0<>(SB), acc5
  2295  	ADCQ $0, acc6
  2296  	ADCQ p256const1<>(SB), acc7
  2297  	ADCQ $0, mul0
        	// ZF is set when the original value (t0) is even.
  2298  	TESTQ $1, t0
  2299  
        	// Even: discard the speculative add.
  2300  	CMOVQEQ t0, acc4
  2301  	CMOVQEQ t1, acc5
  2302  	CMOVQEQ t2, acc6
  2303  	CMOVQEQ t3, acc7
        	// mul0 is 0 or 1, so this keeps the carry only when t0 is odd.
  2304  	ANDQ t0, mul0
  2305  
        	// Double-register shifts: each limb takes its new top bit from the
        	// next limb up; acc7 takes it from mul0.
  2306  	SHRQ $1, acc5, acc4
  2307  	SHRQ $1, acc6, acc5
  2308  	SHRQ $1, acc7, acc6
  2309  	SHRQ $1, mul0, acc7
        	// The y slot is reused for ((2*y)^4)/2.
  2310  	ST (y)
  2311  	/////////////////////////
        	// s = x * (2*y)^2; tmp = 2*s
  2312  	LDacc (x)
  2313  	LDt (s)
  2314  	CALL p256MulInternal(SB)
  2315  	ST (s)
  2316  	p256MulBy2Inline
  2317  	STt (tmp)
  2318  
        	// x3 = m^2 - 2*s
  2319  	LDacc (m)
  2320  	CALL p256SqrInternal(SB)
  2321  	LDt (tmp)
  2322  	CALL p256SubInternal(SB)
  2323  
  2324  	MOVQ rptr, AX
  2325  	// Store x
  2326  	MOVQ acc4, (16*0 + 8*0)(AX)
  2327  	MOVQ acc5, (16*0 + 8*1)(AX)
  2328  	MOVQ acc6, (16*0 + 8*2)(AX)
  2329  	MOVQ acc7, (16*0 + 8*3)(AX)
  2330  
        	// acc = s - x3
  2331  	acc2t
  2332  	LDacc (s)
  2333  	CALL p256SubInternal(SB)
  2334  
        	// acc = m * (s - x3)
  2335  	LDt (m)
  2336  	CALL p256MulInternal(SB)
  2337  
        	// y3 = m * (s - x3) - ((2*y)^4)/2
  2338  	LDt (y)
  2339  	CALL p256SubInternal(SB)
  2340  	MOVQ rptr, AX
  2341  	// Store y
  2342  	MOVQ acc4, (16*2 + 8*0)(AX)
  2343  	MOVQ acc5, (16*2 + 8*1)(AX)
  2344  	MOVQ acc6, (16*2 + 8*2)(AX)
  2345  	MOVQ acc7, (16*2 + 8*3)(AX)
  2346  	///////////////////////
        	// Clear the saved result pointer.
        	// NOTE(review): presumably so no stale pointer is left in the frame - confirm.
  2347  	MOVQ $0, rptr
  2348  
  2349  	RET
  2350  /* ---------------------------------------*/
  2351  

View as plain text