// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build ppc64 || ppc64le

// Portions based on CRYPTOGAMS code with the following comment:
// # ====================================================================
// # Written by Andy Polyakov for the OpenSSL
// # project. The module is, however, dual licensed under OpenSSL and
// # CRYPTOGAMS licenses depending on where you obtain it. For further
// # details see http://www.openssl.org/~appro/cryptogams/.
// # ====================================================================

// The implementations for gcmHash, gcmInit and gcmMul are based on the generated asm
// from the script https://github.com/dot-asm/cryptogams/blob/master/ppc/ghashp8-ppc.pl
// from commit d47afb3c.

// Changes were made due to differences in the ABI and some register usage.
// Some arguments were changed due to the way the Go code passes them.

// Portions that use the stitched AES-GCM approach in counterCryptASM
// are based on code found in
// https://github.com/IBM/ipcri/blob/main/aes/p10_aes_gcm.s

#include "textflag.h"

#define XIP    R3
#define HTBL   R4
#define INP    R5
#define LEN    R6

#define XL     V0
#define XM     V1
#define XH     V2
#define IN     V3
#define ZERO   V4
#define T0     V5
#define T1     V6
#define T2     V7
#define XC2    V8
#define H      V9
#define HH     V10
#define HL     V11
#define LEMASK V12
#define XL1    V13
#define XM1    V14
#define XH1    V15
#define IN1    V16
#define H2     V17
#define H2H    V18
#define H2L    V19
#define XL3    V20
#define XM2    V21
#define IN2    V22
#define H3L    V23
#define H3     V24
#define H3H    V25
#define XH3    V26
#define XM3    V27
#define IN3    V28
#define H4L    V29
#define H4     V30
#define H4H    V31

#define IN0    IN
#define H21L   HL
#define H21H   HH
#define LOPERM H2L
#define HIPERM H2H

#define VXL    VS32
#define VIN    VS35
#define VXC2   VS40
#define VH     VS41
#define VHH    VS42
#define VHL    VS43
#define VIN1   VS48
#define VH2    VS49
#define VH2H   VS50
#define VH2L   VS51
#define VIN2   VS54
#define VH3L   VS55
#define VH3    VS56
#define VH3H   VS57
#define VIN3   VS60
#define VH4L   VS61
#define VH4    VS62
#define VH4H   VS63

#define VIN0   VIN

#define ESPERM V10
#define TMP2   V11

// The following macros provide appropriate
// implementations for endianness as well as
// ISA-specific variants for power8 and power9.
#ifdef GOARCH_ppc64le
# ifdef GOPPC64_power9
#define P8_LXVB16X(RA,RB,VT)  LXVB16X (RA)(RB), VT
#define P8_STXVB16X(VS,RA,RB) STXVB16X VS, (RA)(RB)
# else
#define NEEDS_ESPERM
#define P8_LXVB16X(RA,RB,VT) \
	LXVD2X (RA+RB), VT; \
	VPERM VT, VT, ESPERM, VT
#define P8_STXVB16X(VS,RA,RB) \
	VPERM VS, VS, ESPERM, TMP2; \
	STXVD2X TMP2, (RA+RB)
# endif
#else
#define P8_LXVB16X(RA,RB,VT) \
	LXVD2X (RA+RB), VT
#define P8_STXVB16X(VS,RA,RB) \
	STXVD2X VS, (RA+RB)
#endif

#define MASK_PTR R8

#define MASKV   V0
#define INV     V1

// The following macros are used for
// the stitched implementation within
// counterCryptASM.

// Load the initial GCM counter value
// in V30 and set up the counter increment
// in V31.
#define SETUP_COUNTER \
	P8_LXVB16X(COUNTER, R0, V30); \
	VSPLTISB $1, V28; \
	VXOR V31, V31, V31; \
	VSLDOI $1, V31, V28, V31

// These macros set up the initial value
// for a single encryption, or 4 or 8
// stitched encryptions implemented
// with interleaved vcipher instructions.
//
// The input value for each encryption
// is generated by XORing the counter
// from V30 with the first key in VS0
// and incrementing the counter.
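//
// As a rough sketch (hypothetical Go-like pseudocode, not part of this
// file; ctr, rk0 and incBE32 are illustrative names only), the
// GEN_VCIPHER_*INPUT macros below behave like:
//
//	for i := 0; i < n; i++ {   // n = 1, 4 or 8 parallel streams
//		in[i] = ctr ^ rk0  // rk0 is the first round key (VS0)
//		ctr = incBE32(ctr) // VADDUWM with V31 = {0,0,0,1} bumps the low 32-bit word
//	}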
//
// Single encryption in V15
#define GEN_VCIPHER_INPUT \
	XXLOR VS0, VS0, V29; \
	VXOR V30, V29, V15; \
	VADDUWM V30, V31, V30

// 4 encryptions in V15 - V18
#define GEN_VCIPHER_4_INPUTS \
	XXLOR VS0, VS0, V29; \
	VXOR V30, V29, V15; \
	VADDUWM V30, V31, V30; \
	VXOR V30, V29, V16; \
	VADDUWM V30, V31, V30; \
	VXOR V30, V29, V17; \
	VADDUWM V30, V31, V30; \
	VXOR V30, V29, V18; \
	VADDUWM V30, V31, V30

// 8 encryptions in V15 - V22
#define GEN_VCIPHER_8_INPUTS \
	XXLOR VS0, VS0, V29; \
	VXOR V30, V29, V15; \
	VADDUWM V30, V31, V30; \
	VXOR V30, V29, V16; \
	VADDUWM V30, V31, V30; \
	VXOR V30, V29, V17; \
	VADDUWM V30, V31, V30; \
	VXOR V30, V29, V18; \
	VADDUWM V30, V31, V30; \
	VXOR V30, V29, V19; \
	VADDUWM V30, V31, V30; \
	VXOR V30, V29, V20; \
	VADDUWM V30, V31, V30; \
	VXOR V30, V29, V21; \
	VADDUWM V30, V31, V30; \
	VXOR V30, V29, V22; \
	VADDUWM V30, V31, V30

// Load the keys to be used for
// encryption based on key_len.
// Keys are in VS0 - VS14
// depending on key_len.
// Valid key sizes are verified
// here. CR2 is set and used
// throughout to check key_len.
#define LOAD_KEYS(blk_key, key_len) \
	MOVD $16, R16; \
	MOVD $32, R17; \
	MOVD $48, R18; \
	MOVD $64, R19; \
	LXVD2X (blk_key)(R0), VS0; \
	LXVD2X (blk_key)(R16), VS1; \
	LXVD2X (blk_key)(R17), VS2; \
	LXVD2X (blk_key)(R18), VS3; \
	LXVD2X (blk_key)(R19), VS4; \
	ADD $64, R16; \
	ADD $64, R17; \
	ADD $64, R18; \
	ADD $64, R19; \
	LXVD2X (blk_key)(R16), VS5; \
	LXVD2X (blk_key)(R17), VS6; \
	LXVD2X (blk_key)(R18), VS7; \
	LXVD2X (blk_key)(R19), VS8; \
	ADD $64, R16; \
	ADD $64, R17; \
	ADD $64, R18; \
	ADD $64, R19; \
	LXVD2X (blk_key)(R16), VS9; \
	LXVD2X (blk_key)(R17), VS10; \
	CMP key_len, $12, CR2; \
	CMP key_len, $10; \
	BEQ keysLoaded; \
	LXVD2X (blk_key)(R18), VS11; \
	LXVD2X (blk_key)(R19), VS12; \
	BEQ CR2, keysLoaded; \
	ADD $64, R16; \
	ADD $64, R17; \
	LXVD2X (blk_key)(R16), VS13; \
	LXVD2X (blk_key)(R17), VS14; \
	CMP key_len, $14; \
	BEQ keysLoaded; \
	MOVD R0, 0(R0); \
keysLoaded:

// Encrypt 1 value (vin) with the first 9
// keys from VS1 - VS9.
#define VCIPHER_1X9_KEYS(vin) \
	XXLOR VS1, VS1, V23; \
	XXLOR VS2, VS2, V24; \
	XXLOR VS3, VS3, V25; \
	XXLOR VS4, VS4, V26; \
	XXLOR VS5, VS5, V27; \
	VCIPHER vin, V23, vin; \
	VCIPHER vin, V24, vin; \
	VCIPHER vin, V25, vin; \
	VCIPHER vin, V26, vin; \
	VCIPHER vin, V27, vin; \
	XXLOR VS6, VS6, V23; \
	XXLOR VS7, VS7, V24; \
	XXLOR VS8, VS8, V25; \
	XXLOR VS9, VS9, V26; \
	VCIPHER vin, V23, vin; \
	VCIPHER vin, V24, vin; \
	VCIPHER vin, V25, vin; \
	VCIPHER vin, V26, vin

// Encrypt 1 value (vin) with
// 2 specified keys.
#define VCIPHER_1X2_KEYS(vin, key1, key2) \
	XXLOR key1, key1, V25; \
	XXLOR key2, key2, V26; \
	VCIPHER vin, V25, vin; \
	VCIPHER vin, V26, vin

// Encrypt 4 values in V15 - V18
// with the specified key from
// VS1 - VS9.
#define VCIPHER_4X1_KEY(key) \
	XXLOR key, key, V23; \
	VCIPHER V15, V23, V15; \
	VCIPHER V16, V23, V16; \
	VCIPHER V17, V23, V17; \
	VCIPHER V18, V23, V18

// Encrypt 8 values in V15 - V22
// with the specified key,
// assuming it is a VSreg.
#define VCIPHER_8X1_KEY(key) \
	XXLOR key, key, V23; \
	VCIPHER V15, V23, V15; \
	VCIPHER V16, V23, V16; \
	VCIPHER V17, V23, V17; \
	VCIPHER V18, V23, V18; \
	VCIPHER V19, V23, V19; \
	VCIPHER V20, V23, V20; \
	VCIPHER V21, V23, V21; \
	VCIPHER V22, V23, V22
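
// Note: the key_len value compared against 10/12/14 throughout is the AES
// round count, which follows from the key size:
//
//	AES-128 -> 10 rounds -> round keys in VS0 - VS10
//	AES-192 -> 12 rounds -> round keys in VS0 - VS12
//	AES-256 -> 14 rounds -> round keys in VS0 - VS14
//
// Any other value falls through to the deliberate store to address 0
// in LOAD_KEYS above.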

// Load input block into V1-V4
// in big endian order and
// update blk_inp by 64.
#define LOAD_INPUT_BLOCK64(blk_inp) \
	MOVD $16, R16; \
	MOVD $32, R17; \
	MOVD $48, R18; \
	P8_LXVB16X(blk_inp,R0,V1); \
	P8_LXVB16X(blk_inp,R16,V2); \
	P8_LXVB16X(blk_inp,R17,V3); \
	P8_LXVB16X(blk_inp,R18,V4); \
	ADD $64, blk_inp

// Load input block into V1-V8
// in big endian order and
// update blk_inp by 128.
#define LOAD_INPUT_BLOCK128(blk_inp) \
	MOVD $16, R16; \
	MOVD $32, R17; \
	MOVD $48, R18; \
	MOVD $64, R19; \
	MOVD $80, R20; \
	MOVD $96, R21; \
	MOVD $112, R22; \
	P8_LXVB16X(blk_inp,R0,V1); \
	P8_LXVB16X(blk_inp,R16,V2); \
	P8_LXVB16X(blk_inp,R17,V3); \
	P8_LXVB16X(blk_inp,R18,V4); \
	P8_LXVB16X(blk_inp,R19,V5); \
	P8_LXVB16X(blk_inp,R20,V6); \
	P8_LXVB16X(blk_inp,R21,V7); \
	P8_LXVB16X(blk_inp,R22,V8); \
	ADD $128, blk_inp

// Finish encryption on 8 streams and
// XOR with input block.
#define VCIPHERLAST8_XOR_INPUT \
	VCIPHERLAST V15, V23, V15; \
	VCIPHERLAST V16, V23, V16; \
	VCIPHERLAST V17, V23, V17; \
	VCIPHERLAST V18, V23, V18; \
	VCIPHERLAST V19, V23, V19; \
	VCIPHERLAST V20, V23, V20; \
	VCIPHERLAST V21, V23, V21; \
	VCIPHERLAST V22, V23, V22; \
	XXLXOR V1, V15, V1; \
	XXLXOR V2, V16, V2; \
	XXLXOR V3, V17, V3; \
	XXLXOR V4, V18, V4; \
	XXLXOR V5, V19, V5; \
	XXLXOR V6, V20, V6; \
	XXLXOR V7, V21, V7; \
	XXLXOR V8, V22, V8

// Finish encryption on 4 streams and
// XOR with input block.
#define VCIPHERLAST4_XOR_INPUT \
	VCIPHERLAST V15, V23, V15; \
	VCIPHERLAST V16, V23, V16; \
	VCIPHERLAST V17, V23, V17; \
	VCIPHERLAST V18, V23, V18; \
	XXLXOR V1, V15, V1; \
	XXLXOR V2, V16, V2; \
	XXLXOR V3, V17, V3; \
	XXLXOR V4, V18, V4

// Store output block from V1-V8
// in big endian order and
// update blk_out by 128.
#define STORE_OUTPUT_BLOCK128(blk_out) \
	P8_STXVB16X(V1,blk_out,R0); \
	P8_STXVB16X(V2,blk_out,R16); \
	P8_STXVB16X(V3,blk_out,R17); \
	P8_STXVB16X(V4,blk_out,R18); \
	P8_STXVB16X(V5,blk_out,R19); \
	P8_STXVB16X(V6,blk_out,R20); \
	P8_STXVB16X(V7,blk_out,R21); \
	P8_STXVB16X(V8,blk_out,R22); \
	ADD $128, blk_out

// Store output block from V1-V4
// in big endian order and
// update blk_out by 64.
#define STORE_OUTPUT_BLOCK64(blk_out) \
	P8_STXVB16X(V1,blk_out,R0); \
	P8_STXVB16X(V2,blk_out,R16); \
	P8_STXVB16X(V3,blk_out,R17); \
	P8_STXVB16X(V4,blk_out,R18); \
	ADD $64, blk_out

// func gcmInit(productTable *[256]byte, h []byte)
TEXT ·gcmInit(SB), NOSPLIT, $0-32
	MOVD productTable+0(FP), XIP
	MOVD h+8(FP), HTBL

	MOVD $0x10, R8
	MOVD $0x20, R9
	MOVD $0x30, R10
	LXVD2X (HTBL)(R0), VH // Load H

	VSPLTISB $-16, XC2 // 0xf0
	VSPLTISB $1, T0 // one
	VADDUBM XC2, XC2, XC2 // 0xe0
	VXOR ZERO, ZERO, ZERO
	VOR XC2, T0, XC2 // 0xe1
	VSLDOI $15, XC2, ZERO, XC2 // 0xe1...
	VSLDOI $1, ZERO, T0, T1 // ...1
	VADDUBM XC2, XC2, XC2 // 0xc2...
	VSPLTISB $7, T2
	VOR XC2, T1, XC2 // 0xc2....01
	VSPLTB $0, H, T1 // most significant byte
	VSL H, T0, H // H<<=1
	VSRAB T1, T2, T1 // broadcast carry bit
	VAND T1, XC2, T1
	VXOR H, T1, IN // twisted H

	VSLDOI $8, IN, IN, H // twist even more ...
	VSLDOI $8, ZERO, XC2, XC2 // 0xc2.0
	VSLDOI $8, ZERO, H, HL // ... and split
	VSLDOI $8, H, ZERO, HH

	STXVD2X VXC2, (XIP+R0) // save pre-computed table
	STXVD2X VHL, (XIP+R8)
	MOVD $0x40, R8
	STXVD2X VH, (XIP+R9)
	MOVD $0x50, R9
	STXVD2X VHH, (XIP+R10)
	MOVD $0x60, R10

	VPMSUMD IN, HL, XL // H.lo·H.lo
	VPMSUMD IN, H, XM // H.hi·H.lo+H.lo·H.hi
	VPMSUMD IN, HH, XH // H.hi·H.hi

	VPMSUMD XL, XC2, T2 // 1st reduction phase
	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR XL, T0, XL
	VXOR XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR XL, T2, XL

	VSLDOI $8, XL, XL, T1 // 2nd reduction phase
	VPMSUMD XL, XC2, XL
	VXOR T1, XH, T1
	VXOR XL, T1, IN1

	VSLDOI $8, IN1, IN1, H2
	VSLDOI $8, ZERO, H2, H2L
	VSLDOI $8, H2, ZERO, H2H

	STXVD2X VH2L, (XIP+R8) // save H^2
	MOVD $0x70, R8
	STXVD2X VH2, (XIP+R9)
	MOVD $0x80, R9
	STXVD2X VH2H, (XIP+R10)
	MOVD $0x90, R10

	VPMSUMD IN, H2L, XL // H.lo·H^2.lo
	VPMSUMD IN1, H2L, XL1 // H^2.lo·H^2.lo
	VPMSUMD IN, H2, XM // H.hi·H^2.lo+H.lo·H^2.hi
	VPMSUMD IN1, H2, XM1 // H^2.hi·H^2.lo+H^2.lo·H^2.hi
	VPMSUMD IN, H2H, XH // H.hi·H^2.hi
	VPMSUMD IN1, H2H, XH1 // H^2.hi·H^2.hi

	VPMSUMD XL, XC2, T2 // 1st reduction phase
	VPMSUMD XL1, XC2, HH // 1st reduction phase

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VSLDOI $8, XM1, ZERO, HL
	VSLDOI $8, ZERO, XM1, H
	VXOR XL, T0, XL
	VXOR XH, T1, XH
	VXOR XL1, HL, XL1
	VXOR XH1, H, XH1

	VSLDOI $8, XL, XL, XL
	VSLDOI $8, XL1, XL1, XL1
	VXOR XL, T2, XL
	VXOR XL1, HH, XL1

	VSLDOI $8, XL, XL, T1 // 2nd reduction phase
	VSLDOI $8, XL1, XL1, H // 2nd reduction phase
	VPMSUMD XL, XC2, XL
	VPMSUMD XL1, XC2, XL1
	VXOR T1, XH, T1
	VXOR H, XH1, H
	VXOR XL, T1, XL
	VXOR XL1, H, XL1

	VSLDOI $8, XL, XL, H
	VSLDOI $8, XL1, XL1, H2
	VSLDOI $8, ZERO, H, HL
	VSLDOI $8, H, ZERO, HH
	VSLDOI $8, ZERO, H2, H2L
	VSLDOI $8, H2, ZERO, H2H

	STXVD2X VHL, (XIP+R8) // save H^3
	MOVD $0xa0, R8
	STXVD2X VH, (XIP+R9)
	MOVD $0xb0, R9
	STXVD2X VHH, (XIP+R10)
	MOVD $0xc0, R10
	STXVD2X VH2L, (XIP+R8) // save H^4
	STXVD2X VH2, (XIP+R9)
	STXVD2X VH2H, (XIP+R10)

	RET

// func gcmHash(output []byte, productTable *[256]byte, inp []byte, len int)
TEXT ·gcmHash(SB), NOSPLIT, $0-64
	MOVD output+0(FP), XIP
	MOVD productTable+24(FP), HTBL
	MOVD inp+32(FP), INP
	MOVD len+56(FP), LEN

	MOVD $0x10, R8
	MOVD $0x20, R9
	MOVD $0x30, R10
	LXVD2X (XIP)(R0), VXL // load Xi

	LXVD2X (HTBL)(R8), VHL // load pre-computed table
	MOVD $0x40, R8
	LXVD2X (HTBL)(R9), VH
	MOVD $0x50, R9
	LXVD2X (HTBL)(R10), VHH
	MOVD $0x60, R10
	LXVD2X (HTBL)(R0), VXC2
#ifdef GOARCH_ppc64le
	LVSL (R0)(R0), LEMASK
	VSPLTISB $0x07, T0
	VXOR LEMASK, T0, LEMASK
	VPERM XL, XL, LEMASK, XL
#endif
	VXOR ZERO, ZERO, ZERO

	CMPU LEN, $64
	BGE gcm_ghash_p8_4x

	LXVD2X (INP)(R0), VIN
	ADD $16, INP, INP
	SUBCCC $16, LEN, LEN
#ifdef GOARCH_ppc64le
	VPERM IN, IN, LEMASK, IN
#endif
	VXOR IN, XL, IN
	BEQ short

	LXVD2X (HTBL)(R8), VH2L // load H^2
	MOVD $16, R8
	LXVD2X (HTBL)(R9), VH2
	ADD LEN, INP, R9 // end of input
	LXVD2X (HTBL)(R10), VH2H

loop_2x:
	LXVD2X (INP)(R0), VIN1
#ifdef GOARCH_ppc64le
	VPERM IN1, IN1, LEMASK, IN1
#endif

	SUBC $32, LEN, LEN
	VPMSUMD IN, H2L, XL // H^2.lo·Xi.lo
	VPMSUMD IN1, HL, XL1 // H.lo·Xi+1.lo
	SUBE R11, R11, R11 // borrow?-1:0
	VPMSUMD IN, H2, XM // H^2.hi·Xi.lo+H^2.lo·Xi.hi
	VPMSUMD IN1, H, XM1 // H.hi·Xi+1.lo+H.lo·Xi+1.hi
	AND LEN, R11, R11
	VPMSUMD IN, H2H, XH // H^2.hi·Xi.hi
	VPMSUMD IN1, HH, XH1 // H.hi·Xi+1.hi
	ADD R11, INP, INP

	VXOR XL, XL1, XL
	VXOR XM, XM1, XM

	VPMSUMD XL, XC2, T2 // 1st reduction phase
	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR XH, XH1, XH
	VXOR XL, T0, XL
	VXOR XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR XL, T2, XL

	LXVD2X (INP)(R8), VIN
	ADD $32, INP, INP

	VSLDOI $8, XL, XL, T1 // 2nd reduction phase
	VPMSUMD XL, XC2, XL
#ifdef GOARCH_ppc64le
	VPERM IN, IN, LEMASK, IN
#endif
	VXOR T1, XH, T1
	VXOR IN, T1, IN
	VXOR IN, XL, IN

	CMP R9, INP
	BGT loop_2x // done yet?

	CMPWU LEN, $0
	BNE even

short:
	VPMSUMD IN, HL, XL // H.lo·Xi.lo
	VPMSUMD IN, H, XM // H.hi·Xi.lo+H.lo·Xi.hi
	VPMSUMD IN, HH, XH // H.hi·Xi.hi

	VPMSUMD XL, XC2, T2 // 1st reduction phase
	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR XL, T0, XL
	VXOR XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR XL, T2, XL

	VSLDOI $8, XL, XL, T1 // 2nd reduction phase
	VPMSUMD XL, XC2, XL
	VXOR T1, XH, T1

even:
	VXOR XL, T1, XL
#ifdef GOARCH_ppc64le
	VPERM XL, XL, LEMASK, XL
#endif
	STXVD2X VXL, (XIP+R0)

	OR R12, R12, R12 // write out Xi
	RET

gcm_ghash_p8_4x:
	LVSL (R8)(R0), T0 // 0x0001..0e0f
	MOVD $0x70, R8
	LXVD2X (HTBL)(R9), VH2
	MOVD $0x80, R9
	VSPLTISB $8, T1 // 0x0808..0808
	MOVD $0x90, R10
	LXVD2X (HTBL)(R8), VH3L // load H^3
	MOVD $0xa0, R8
	LXVD2X (HTBL)(R9), VH3
	MOVD $0xb0, R9
	LXVD2X (HTBL)(R10), VH3H
	MOVD $0xc0, R10
	LXVD2X (HTBL)(R8), VH4L // load H^4
	MOVD $0x10, R8
	LXVD2X (HTBL)(R9), VH4
	MOVD $0x20, R9
	LXVD2X (HTBL)(R10), VH4H
	MOVD $0x30, R10

	VSLDOI $8, ZERO, T1, T2 // 0x0000..0808
	VADDUBM T0, T2, HIPERM // 0x0001..1617
	VADDUBM T1, HIPERM, LOPERM // 0x0809..1e1f

	SRD $4, LEN, LEN // this allows to use sign bit as carry

	LXVD2X (INP)(R0), VIN0 // load input
	LXVD2X (INP)(R8), VIN1
	SUBCCC $8, LEN, LEN
	LXVD2X (INP)(R9), VIN2
	LXVD2X (INP)(R10), VIN3
	ADD $0x40, INP, INP
#ifdef GOARCH_ppc64le
	VPERM IN0, IN0, LEMASK, IN0
	VPERM IN1, IN1, LEMASK, IN1
	VPERM IN2, IN2, LEMASK, IN2
	VPERM IN3, IN3, LEMASK, IN3
#endif

	VXOR IN0, XL, XH

	VPMSUMD IN1, H3L, XL1
	VPMSUMD IN1, H3, XM1
	VPMSUMD IN1, H3H, XH1

	VPERM H2, H, HIPERM, H21L
	VPERM IN2, IN3, LOPERM, T0
	VPERM H2, H, LOPERM, H21H
	VPERM IN2, IN3, HIPERM, T1
	VPMSUMD IN2, H2, XM2 // H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo
	VPMSUMD T0, H21L, XL3 // H^2.lo·Xi+2.lo+H.lo·Xi+3.lo
	VPMSUMD IN3, H, XM3 // H.hi·Xi+3.lo +H.lo·Xi+3.hi
	VPMSUMD T1, H21H, XH3 // H^2.hi·Xi+2.hi+H.hi·Xi+3.hi

	VXOR XM2, XM1, XM2
	VXOR XL3, XL1, XL3
	VXOR XM3, XM2, XM3
	VXOR XH3, XH1, XH3

	BLT tail_4x

loop_4x:
	LXVD2X (INP)(R0), VIN0
	LXVD2X (INP)(R8), VIN1
	SUBCCC $4, LEN, LEN
	LXVD2X (INP)(R9), VIN2
	LXVD2X (INP)(R10), VIN3
	ADD $0x40, INP, INP
#ifdef GOARCH_ppc64le
	VPERM IN1, IN1, LEMASK, IN1
	VPERM IN2, IN2, LEMASK, IN2
	VPERM IN3, IN3, LEMASK, IN3
	VPERM IN0, IN0, LEMASK, IN0
#endif

	VPMSUMD XH, H4L, XL // H^4.lo·Xi.lo
	VPMSUMD XH, H4, XM // H^4.hi·Xi.lo+H^4.lo·Xi.hi
	VPMSUMD XH, H4H, XH // H^4.hi·Xi.hi
	VPMSUMD IN1, H3L, XL1
	VPMSUMD IN1, H3, XM1
	VPMSUMD IN1, H3H, XH1

	VXOR XL, XL3, XL
	VXOR XM, XM3, XM
	VXOR XH, XH3, XH
	VPERM IN2, IN3, LOPERM, T0
	VPERM IN2, IN3, HIPERM, T1

	VPMSUMD XL, XC2, T2 // 1st reduction phase
	VPMSUMD T0, H21L, XL3 // H.lo·Xi+3.lo +H^2.lo·Xi+2.lo
	VPMSUMD T1, H21H, XH3 // H.hi·Xi+3.hi +H^2.hi·Xi+2.hi

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR XL, T0, XL
	VXOR XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR XL, T2, XL

	VSLDOI $8, XL, XL, T1 // 2nd reduction phase
	VPMSUMD IN2, H2, XM2 // H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi
	VPMSUMD IN3, H, XM3 // H.hi·Xi+3.lo +H.lo·Xi+3.hi
	VPMSUMD XL, XC2, XL

	VXOR XL3, XL1, XL3
	VXOR XH3, XH1, XH3
	VXOR XH, IN0, XH
	VXOR XM2, XM1, XM2

	VXOR XH, T1, XH
	VXOR XM3, XM2, XM3
	VXOR XH, XL, XH
	BGE loop_4x

tail_4x:
	VPMSUMD XH, H4L, XL // H^4.lo·Xi.lo
	VPMSUMD XH, H4, XM // H^4.hi·Xi.lo+H^4.lo·Xi.hi
	VPMSUMD XH, H4H, XH // H^4.hi·Xi.hi

	VXOR XL, XL3, XL
	VXOR XM, XM3, XM

	VPMSUMD XL, XC2, T2 // 1st reduction phase

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR XH, XH3, XH
	VXOR XL, T0, XL
	VXOR XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR XL, T2, XL

	VSLDOI $8, XL, XL, T1 // 2nd reduction phase
	VPMSUMD XL, XC2, XL
	VXOR T1, XH, T1
	VXOR XL, T1, XL

	ADDCCC $4, LEN, LEN
	BEQ done_4x

	LXVD2X (INP)(R0), VIN0
	CMPU LEN, $2
	MOVD $-4, LEN
	BLT one
	LXVD2X (INP)(R8), VIN1
	BEQ two

three:
	LXVD2X (INP)(R9), VIN2
#ifdef GOARCH_ppc64le
	VPERM IN0, IN0, LEMASK, IN0
	VPERM IN1, IN1, LEMASK, IN1
	VPERM IN2, IN2, LEMASK, IN2
#endif

	VXOR IN0, XL, XH
	VOR H3L, H3L, H4L
	VOR H3, H3, H4
	VOR H3H, H3H, H4H

	VPERM IN1, IN2, LOPERM, T0
	VPERM IN1, IN2, HIPERM, T1
	VPMSUMD IN1, H2, XM2 // H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo
	VPMSUMD IN2, H, XM3 // H.hi·Xi+2.lo +H.lo·Xi+2.hi
	VPMSUMD T0, H21L, XL3 // H^2.lo·Xi+1.lo+H.lo·Xi+2.lo
	VPMSUMD T1, H21H, XH3 // H^2.hi·Xi+1.hi+H.hi·Xi+2.hi

	VXOR XM3, XM2, XM3
	JMP tail_4x

two:
#ifdef GOARCH_ppc64le
	VPERM IN0, IN0, LEMASK, IN0
	VPERM IN1, IN1, LEMASK, IN1
#endif

	VXOR IN, XL, XH
	VPERM ZERO, IN1, LOPERM, T0
	VPERM ZERO, IN1, HIPERM, T1

	VSLDOI $8, ZERO, H2, H4L
	VOR H2, H2, H4
	VSLDOI $8, H2, ZERO, H4H

	VPMSUMD T0, H21L, XL3 // H.lo·Xi+1.lo
	VPMSUMD IN1, H, XM3 // H.hi·Xi+1.lo+H.lo·Xi+2.hi
	VPMSUMD T1, H21H, XH3 // H.hi·Xi+1.hi

	JMP tail_4x

one:
#ifdef GOARCH_ppc64le
	VPERM IN0, IN0, LEMASK, IN0
#endif

	VSLDOI $8, ZERO, H, H4L
	VOR H, H, H4
	VSLDOI $8, H, ZERO, H4H

	VXOR IN0, XL, XH
	VXOR XL3, XL3, XL3
	VXOR XM3, XM3, XM3
	VXOR XH3, XH3, XH3

	JMP tail_4x

done_4x:
#ifdef GOARCH_ppc64le
	VPERM XL, XL, LEMASK, XL
#endif
	STXVD2X VXL, (XIP+R0) // write out Xi
	RET

// func gcmMul(output []byte, productTable *[256]byte)
TEXT ·gcmMul(SB), NOSPLIT, $0-32
	MOVD output+0(FP), XIP
	MOVD productTable+24(FP), HTBL

	MOVD $0x10, R8
	MOVD $0x20, R9
	MOVD $0x30, R10
	LXVD2X (XIP)(R0), VIN // load Xi

	LXVD2X (HTBL)(R8), VHL // Load pre-computed table
	LXVD2X (HTBL)(R9), VH
	LXVD2X (HTBL)(R10), VHH
	LXVD2X (HTBL)(R0), VXC2
#ifdef GOARCH_ppc64le
	LVSL (R0)(R0), LEMASK
	VSPLTISB $0x07, T0
	VXOR LEMASK, T0, LEMASK
	VPERM IN, IN, LEMASK, IN
#endif
	VXOR ZERO, ZERO, ZERO

	VPMSUMD IN, HL, XL // H.lo·Xi.lo
	VPMSUMD IN, H, XM // H.hi·Xi.lo+H.lo·Xi.hi
	VPMSUMD IN, HH, XH // H.hi·Xi.hi

	VPMSUMD XL, XC2, T2 // 1st reduction phase
	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR XL, T0, XL
	VXOR XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR XL, T2, XL

	VSLDOI $8, XL, XL, T1 // 2nd reduction phase
	VPMSUMD XL, XC2, XL
	VXOR T1, XH, T1
	VXOR XL, T1, XL

#ifdef GOARCH_ppc64le
	VPERM XL, XL, LEMASK, XL
#endif
	STXVD2X VXL, (XIP+R0) // write out Xi
	RET

#define BLK_INP    R3
#define BLK_OUT    R4
#define BLK_KEY    R5
#define KEY_LEN    R6
#define BLK_IDX    R7
#define IDX        R8
#define IN_LEN     R9
#define COUNTER    R10
#define CONPTR     R14
#define MASK       V5

// Implementation of the counterCrypt function in assembler.
// The original loop is unrolled to allow multiple encryption
// streams to be done in parallel, which is achieved by interleaving
// vcipher instructions from each stream. This is also referred to as
// stitching, and provides significant performance improvements.
// Some macros are defined which enable execution for big or little
// endian as well as different ISA targets.
//func (g *gcmAsm) counterCrypt(out, in []byte, counter *[gcmBlockSize]byte, key [gcmBlockSize]uint32)
//func counterCryptASM(xr, out, in, counter, key)
TEXT ·counterCryptASM(SB), NOSPLIT, $16-72
	MOVD xr+0(FP), KEY_LEN
	MOVD out+8(FP), BLK_OUT
	MOVD out_len+16(FP), R8
	MOVD in+32(FP), BLK_INP
	MOVD in_len+40(FP), IN_LEN
	MOVD counter+56(FP), COUNTER
	MOVD key+64(FP), BLK_KEY

// Set up permute string when needed.
#ifdef NEEDS_ESPERM
	MOVD $·rcon(SB), R14
	LVX (R14), ESPERM // Permute value for P8_ macros.
#endif
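
	// Informal outline of the flow below (for reference only):
	//
	//	while in_len >= 128 { encrypt 8 counter blocks, stitched; XOR with input }
	//	if    in_len >= 64  { encrypt 4 counter blocks, stitched; XOR with input }
	//	while in_len >= 16  { encrypt 1 counter block; XOR with input }
	//	if    in_len > 0    { encrypt 1 counter block; XOR with the partial block }
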
	SETUP_COUNTER // V30 Counter V31 BE {0, 0, 0, 1}
	LOAD_KEYS(BLK_KEY, KEY_LEN) // VS1 - VS10/12/14 based on keysize
	CMP IN_LEN, $128
	BLT block64
block128_loop:
	// Do 8 encryptions in parallel by setting
	// input values in V15-V22 and executing
	// vcipher on the updated value and the keys.
	GEN_VCIPHER_8_INPUTS
	VCIPHER_8X1_KEY(VS1)
	VCIPHER_8X1_KEY(VS2)
	VCIPHER_8X1_KEY(VS3)
	VCIPHER_8X1_KEY(VS4)
	VCIPHER_8X1_KEY(VS5)
	VCIPHER_8X1_KEY(VS6)
	VCIPHER_8X1_KEY(VS7)
	VCIPHER_8X1_KEY(VS8)
	VCIPHER_8X1_KEY(VS9)

	// Additional encryptions are done based on
	// the key length, with the last key moved
	// to V23 for use with VCIPHERLAST.
	// CR2 = CMP key_len, $12
	XXLOR VS10, VS10, V23
	BLT CR2, block128_last // key_len = 10
	VCIPHER_8X1_KEY(VS10)
	VCIPHER_8X1_KEY(VS11)
	XXLOR VS12, VS12, V23
	BEQ CR2, block128_last // key_len = 12
	VCIPHER_8X1_KEY(VS12)
	VCIPHER_8X1_KEY(VS13)
	XXLOR VS14, VS14, V23 // key_len = 14
block128_last:
	// vcipher encryptions are in V15-V22 at this
	// point with vcipherlast remaining to be done.
	// Load input block into V1-V8, setting index offsets
	// in R16-R22 to use with the STORE.
	LOAD_INPUT_BLOCK128(BLK_INP)
	// Do VCIPHERLAST on the last key for each encryption
	// stream and XOR the result with the corresponding
	// value from the input block.
	VCIPHERLAST8_XOR_INPUT
	// Store the results (8*16) and update BLK_OUT by 128.
	STORE_OUTPUT_BLOCK128(BLK_OUT)
	ADD $-128, IN_LEN // decrement input size
	CMP IN_LEN, $128 // check if >= blocksize
	BGE block128_loop // next input block
	CMP IN_LEN, $0
	BEQ done
block64:
	CMP IN_LEN, $64 // Check if >= 64
	BLT block16_loop

	// Do 4 encryptions in parallel by setting
	// input values in V15-V18 and executing
	// vcipher on the updated value and the keys.
	GEN_VCIPHER_4_INPUTS
	VCIPHER_4X1_KEY(VS1)
	VCIPHER_4X1_KEY(VS2)
	VCIPHER_4X1_KEY(VS3)
	VCIPHER_4X1_KEY(VS4)
	VCIPHER_4X1_KEY(VS5)
	VCIPHER_4X1_KEY(VS6)
	VCIPHER_4X1_KEY(VS7)
	VCIPHER_4X1_KEY(VS8)
	VCIPHER_4X1_KEY(VS9)

	// Check key length based on CR2.
	// Move last key to V23 for use with later vcipherlast.
	XXLOR VS10, VS10, V23
	BLT CR2, block64_last // size = 10
	VCIPHER_4X1_KEY(VS10) // Encrypt with next 2 keys
	VCIPHER_4X1_KEY(VS11)
	XXLOR VS12, VS12, V23
	BEQ CR2, block64_last // size = 12
	VCIPHER_4X1_KEY(VS12) // Encrypt with last 2 keys
	VCIPHER_4X1_KEY(VS13)
	XXLOR VS14, VS14, V23 // size = 14
block64_last:
	LOAD_INPUT_BLOCK64(BLK_INP) // Load 64 bytes of input
	// Do VCIPHERLAST on the last key for each encryption
	// stream and XOR the result with the corresponding
	// value from the input block.
	VCIPHERLAST4_XOR_INPUT
	// Store the results (4*16) and update BLK_OUT by 64.
	STORE_OUTPUT_BLOCK64(BLK_OUT)
	ADD $-64, IN_LEN // decrement input block length
	CMP IN_LEN, $0 // check for remaining length
	BEQ done
block16_loop:
	CMP IN_LEN, $16 // More input
	BLT final_block // If not, then handle partial block

	// Single encryption, no stitching
	GEN_VCIPHER_INPUT // Generate input value for single encryption
	VCIPHER_1X9_KEYS(V15) // Encrypt V15 value with 9 keys
	XXLOR VS10, VS10, V23 // Last key -> V23 for later vcipherlast
	// Key length based on CR2. (LT=10, EQ=12, GT=14)
	BLT CR2, block16_last // Finish for key size 10
	VCIPHER_1X2_KEYS(V15, VS10, VS11) // Encrypt V15 with 2 more keys
	XXLOR VS12, VS12, V23 // Last key -> V23 for later vcipherlast
	BEQ CR2, block16_last // Finish for key size 12
	VCIPHER_1X2_KEYS(V15, VS12, VS13) // Encrypt V15 with last 2 keys
	XXLOR VS14, VS14, V23 // Last key -> V23 for vcipherlast with key size 14
block16_last:
	P8_LXVB16X(BLK_INP, R0, V1) // Load input
	VCIPHERLAST V15, V23, V15 // Encrypt last value in V23
	XXLXOR V15, V1, V1 // XOR with input
	P8_STXVB16X(V1,R0,BLK_OUT) // Store final encryption value to output

	ADD $16, BLK_INP // Increment input pointer
	ADD $16, BLK_OUT // Increment output pointer
	ADD $-16, IN_LEN // Decrement input length

	BR block16_loop // Check for next

final_block:
	CMP IN_LEN, $0
	BEQ done
	GEN_VCIPHER_INPUT // Generate input value for partial encryption
	VCIPHER_1X9_KEYS(V15) // Encrypt V15 with 9 keys
	XXLOR VS10, VS10, V23 // Save possible last key
	BLT CR2, final_block_last
	VCIPHER_1X2_KEYS(V15, VS10, VS11) // Encrypt V15 with next 2 keys
	XXLOR VS12, VS12, V23 // Save possible last key
	BEQ CR2, final_block_last
	VCIPHER_1X2_KEYS(V15, VS12, VS13) // Encrypt V15 with last 2 keys
	XXLOR VS14, VS14, V23 // Save last key
final_block_last:
	VCIPHERLAST V15, V23, V15 // Finish encryption
#ifdef GOPPC64_power10
	// set up length
	SLD $56, IN_LEN, R17
	LXVLL BLK_INP, R17, V25
	VXOR V25, V15, V25
	STXVLL V25, BLK_OUT, R17
#else
	ADD $32, R1, MASK_PTR
	MOVD $0, R16
	P8_STXVB16X(V15, MASK_PTR, R0)
	CMP IN_LEN, $8
	BLT next4
	MOVD 0(MASK_PTR), R14
	MOVD 0(BLK_INP), R15
	XOR R14, R15, R14
	MOVD R14, 0(BLK_OUT)
	ADD $8, R16
	ADD $-8, IN_LEN
next4:
	CMP IN_LEN, $4
	BLT next2
	MOVWZ (BLK_INP)(R16), R15
	MOVWZ (MASK_PTR)(R16), R14
	XOR R14, R15, R14
	MOVW R14, (R16)(BLK_OUT)
	ADD $4, R16
	ADD $-4, IN_LEN
next2:
	CMP IN_LEN, $2
	BLT next1
	MOVHZ (BLK_INP)(R16), R15
	MOVHZ (MASK_PTR)(R16), R14
	XOR R14, R15, R14
	MOVH R14, (R16)(BLK_OUT)
	ADD $2, R16
	ADD $-2, IN_LEN
next1:
	CMP IN_LEN, $1
	BLT done
	MOVBZ (MASK_PTR)(R16), R14
	MOVBZ (BLK_INP)(R16), R15
	XOR R14, R15, R14
	MOVB R14, (R16)(BLK_OUT)
#endif
done:
	// Save the updated counter value
	P8_STXVB16X(V30, COUNTER, R0)
	// Clear the keys
	XXLXOR VS0, VS0, VS0
	XXLXOR VS1, VS1, VS1
	XXLXOR VS2, VS2, VS2
	XXLXOR VS3, VS3, VS3
	XXLXOR VS4, VS4, VS4
	XXLXOR VS5, VS5, VS5
	XXLXOR VS6, VS6, VS6
	XXLXOR VS7, VS7, VS7
	XXLXOR VS8, VS8, VS8
	XXLXOR VS9, VS9, VS9
	XXLXOR VS10, VS10, VS10
	XXLXOR VS11, VS11, VS11
	XXLXOR VS12, VS12, VS12
	XXLXOR VS13, VS13, VS13
	XXLXOR VS14, VS14, VS14
	RET