// Copyright 2013 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. //go:build !purego #include "textflag.h" // func addMulVVW1024(z, x *uint, y uint) (c uint) TEXT ·addMulVVW1024(SB), $0-32 MOVD $16, R0 JMP addMulVVWx(SB) // func addMulVVW1536(z, x *uint, y uint) (c uint) TEXT ·addMulVVW1536(SB), $0-32 MOVD $24, R0 JMP addMulVVWx(SB) // func addMulVVW2048(z, x *uint, y uint) (c uint) TEXT ·addMulVVW2048(SB), $0-32 MOVD $32, R0 JMP addMulVVWx(SB) TEXT addMulVVWx(SB), NOFRAME|NOSPLIT, $0 MOVD z+0(FP), R1 MOVD x+8(FP), R2 MOVD y+16(FP), R3 MOVD $0, R4 // The main loop of this code operates on a block of 4 words every iteration // performing [R4:R12:R11:R10:R9] = R4 + R3 * [R8:R7:R6:R5] + [R12:R11:R10:R9] // where R4 is carried from the previous iteration, R8:R7:R6:R5 hold the next // 4 words of x, R3 is y and R12:R11:R10:R9 are part of the result z. loop: CBZ R0, done LDP.P 16(R2), (R5, R6) LDP.P 16(R2), (R7, R8) LDP (R1), (R9, R10) ADDS R4, R9 MUL R6, R3, R14 ADCS R14, R10 MUL R7, R3, R15 LDP 16(R1), (R11, R12) ADCS R15, R11 MUL R8, R3, R16 ADCS R16, R12 UMULH R8, R3, R20 ADC $0, R20 MUL R5, R3, R13 ADDS R13, R9 UMULH R5, R3, R17 ADCS R17, R10 UMULH R6, R3, R21 STP.P (R9, R10), 16(R1) ADCS R21, R11 UMULH R7, R3, R19 ADCS R19, R12 STP.P (R11, R12), 16(R1) ADC $0, R20, R4 SUB $4, R0 B loop done: MOVD R4, c+24(FP) RET