// Reference: www.ssrc.ucsc.edu/Papers/plank-fast13.pdf

#include "textflag.h"

#define low_tbl  Y0
#define high_tbl Y1
#define mask     Y2
#define in0      Y3
#define in1      Y4
#define in2      Y5
#define in3      Y6
#define in4      Y7
#define in5      Y8
#define in0_h    Y10
#define in1_h    Y11
#define in2_h    Y12
#define in3_h    Y13
#define in4_h    Y14
#define in5_h    Y15

#define in   BX
#define out  DI
#define len  R8
#define pos  R9
#define tmp0 R10

#define low_tblx  X0
#define high_tblx X1
#define maskx     X2
#define in0x      X3
#define in0_hx    X10
#define tmp0x     X9
#define tmp1x     X11
#define tmp2x     X12
#define tmp3x     X13

// func mulVectAVX2(tbl, d, p []byte)
TEXT ·mulVectAVX2(SB), NOSPLIT, $0
	MOVQ         i+24(FP), in
	MOVQ         o+48(FP), out
	MOVQ         tbl+0(FP), tmp0
	VMOVDQU      (tmp0), low_tblx
	VMOVDQU      16(tmp0), high_tblx
	MOVB         $0x0f, DX
	LONG         $0x2069e3c4; WORD $0x00d2 // VPINSRB $0x00, EDX, XMM2, XMM2
	VPBROADCASTB maskx, maskx
	MOVQ         in_len+32(FP), len
	TESTQ        $31, len
	JNZ          one16b

ymm:
	VINSERTI128 $1, low_tblx, low_tbl, low_tbl
	VINSERTI128 $1, high_tblx, high_tbl, high_tbl
	VINSERTI128 $1, maskx, mask, mask
	TESTQ       $255, len
	JNZ         not_aligned

// 256 bytes/loop
aligned:
	MOVQ $0, pos

loop256b:
	VMOVDQU (in)(pos*1), in0
	VPSRLQ  $4, in0, in0_h
	VPAND   mask, in0_h, in0_h
	VPAND   mask, in0, in0
	VPSHUFB in0_h, high_tbl, in0_h
	VPSHUFB in0, low_tbl, in0
	VPXOR   in0, in0_h, in0
	VMOVDQU in0, (out)(pos*1)

	VMOVDQU 32(in)(pos*1), in1
	VPSRLQ  $4, in1, in1_h
	VPAND   mask, in1_h, in1_h
	VPAND   mask, in1, in1
	VPSHUFB in1_h, high_tbl, in1_h
	VPSHUFB in1, low_tbl, in1
	VPXOR   in1, in1_h, in1
	VMOVDQU in1, 32(out)(pos*1)

	VMOVDQU 64(in)(pos*1), in2
	VPSRLQ  $4, in2, in2_h
	VPAND   mask, in2_h, in2_h
	VPAND   mask, in2, in2
	VPSHUFB in2_h, high_tbl, in2_h
	VPSHUFB in2, low_tbl, in2
	VPXOR   in2, in2_h, in2
	VMOVDQU in2, 64(out)(pos*1)

	VMOVDQU 96(in)(pos*1), in3
	VPSRLQ  $4, in3, in3_h
	VPAND   mask, in3_h, in3_h
	VPAND   mask, in3, in3
	VPSHUFB in3_h, high_tbl, in3_h
	VPSHUFB in3, low_tbl, in3
	VPXOR   in3, in3_h, in3
	VMOVDQU in3, 96(out)(pos*1)

	VMOVDQU 128(in)(pos*1), in4
	VPSRLQ  $4, in4, in4_h
	VPAND   mask, in4_h, in4_h
	VPAND   mask, in4, in4
	VPSHUFB in4_h, high_tbl, in4_h
	VPSHUFB in4, low_tbl, in4
	VPXOR   in4, in4_h, in4
	VMOVDQU in4, 128(out)(pos*1)

	VMOVDQU 160(in)(pos*1), in5
	VPSRLQ  $4, in5, in5_h
	VPAND   mask, in5_h, in5_h
	VPAND   mask, in5, in5
	VPSHUFB in5_h, high_tbl, in5_h
	VPSHUFB in5, low_tbl, in5
	VPXOR   in5, in5_h, in5
	VMOVDQU in5, 160(out)(pos*1)

	VMOVDQU 192(in)(pos*1), in0
	VPSRLQ  $4, in0, in0_h
	VPAND   mask, in0_h, in0_h
	VPAND   mask, in0, in0
	VPSHUFB in0_h, high_tbl, in0_h
	VPSHUFB in0, low_tbl, in0
	VPXOR   in0, in0_h, in0
	VMOVDQU in0, 192(out)(pos*1)

	VMOVDQU 224(in)(pos*1), in1
	VPSRLQ  $4, in1, in1_h
	VPAND   mask, in1_h, in1_h
	VPAND   mask, in1, in1
	VPSHUFB in1_h, high_tbl, in1_h
	VPSHUFB in1, low_tbl, in1
	VPXOR   in1, in1_h, in1
	VMOVDQU in1, 224(out)(pos*1)

	ADDQ $256, pos
	CMPQ len, pos
	JNE  loop256b
	VZEROUPPER
	RET

// handle the 32-byte blocks left over when len is not a multiple of 256
not_aligned:
	MOVQ len, tmp0
	ANDQ $255, tmp0

loop32b:
	VMOVDQU -32(in)(len*1), in0
	VPSRLQ  $4, in0, in0_h
	VPAND   mask, in0_h, in0_h
	VPAND   mask, in0, in0
	VPSHUFB in0_h, high_tbl, in0_h
	VPSHUFB in0, low_tbl, in0
	VPXOR   in0, in0_h, in0
	VMOVDQU in0, -32(out)(len*1)
	SUBQ    $32, len
	SUBQ    $32, tmp0
	JG      loop32b
	CMPQ    len, $256
	JGE     aligned
	VZEROUPPER
	RET

// handle the trailing 16 bytes when len is not a multiple of 32
one16b:
	VMOVDQU -16(in)(len*1), in0x
	VPSRLQ  $4, in0x, in0_hx
	VPAND   maskx, in0x, in0x
	VPAND   maskx, in0_hx, in0_hx
	VPSHUFB in0_hx, high_tblx, in0_hx
	VPSHUFB in0x, low_tblx, in0x
	VPXOR   in0x, in0_hx, in0x
	VMOVDQU in0x, -16(out)(len*1)
	SUBQ    $16, len
	CMPQ    len, $0
	JNE     ymm
	RET
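// For reference, a scalar Go sketch of what mulVectAVX2 computes per byte, as
// read from the code above: tbl's first 16 bytes hold the low-nibble products
// and the next 16 the high-nibble products (matching the two table loads),
// so for each input byte b the product is one low-table and one high-table
// lookup XORed together. This is only an illustrative sketch, not code from
// the package:
//
//	for i, b := range d {
//		p[i] = tbl[b&0x0f] ^ tbl[16+(b>>4)]
//	}
//
// mulVectAddAVX2 below is the accumulating variant: it XORs the same product
// into the existing contents of p instead of overwriting it
// (p[i] ^= tbl[b&0x0f] ^ tbl[16+(b>>4)]).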
// func mulVectAddAVX2(tbl, d, p []byte)
TEXT ·mulVectAddAVX2(SB), NOSPLIT, $0
	MOVQ         i+24(FP), in
	MOVQ         o+48(FP), out
	MOVQ         tbl+0(FP), tmp0
	VMOVDQU      (tmp0), low_tblx
	VMOVDQU      16(tmp0), high_tblx
	MOVB         $0x0f, DX
	LONG         $0x2069e3c4; WORD $0x00d2 // VPINSRB $0x00, EDX, XMM2, XMM2
	VPBROADCASTB maskx, maskx
	MOVQ         in_len+32(FP), len
	TESTQ        $31, len
	JNZ          one16b

ymm:
	VINSERTI128 $1, low_tblx, low_tbl, low_tbl
	VINSERTI128 $1, high_tblx, high_tbl, high_tbl
	VINSERTI128 $1, maskx, mask, mask
	TESTQ       $255, len
	JNZ         not_aligned

aligned:
	MOVQ $0, pos

loop256b:
	VMOVDQU (in)(pos*1), in0
	VPSRLQ  $4, in0, in0_h
	VPAND   mask, in0_h, in0_h
	VPAND   mask, in0, in0
	VPSHUFB in0_h, high_tbl, in0_h
	VPSHUFB in0, low_tbl, in0
	VPXOR   in0, in0_h, in0
	VPXOR   (out)(pos*1), in0, in0
	VMOVDQU in0, (out)(pos*1)

	VMOVDQU 32(in)(pos*1), in1
	VPSRLQ  $4, in1, in1_h
	VPAND   mask, in1_h, in1_h
	VPAND   mask, in1, in1
	VPSHUFB in1_h, high_tbl, in1_h
	VPSHUFB in1, low_tbl, in1
	VPXOR   in1, in1_h, in1
	VPXOR   32(out)(pos*1), in1, in1
	VMOVDQU in1, 32(out)(pos*1)

	VMOVDQU 64(in)(pos*1), in2
	VPSRLQ  $4, in2, in2_h
	VPAND   mask, in2_h, in2_h
	VPAND   mask, in2, in2
	VPSHUFB in2_h, high_tbl, in2_h
	VPSHUFB in2, low_tbl, in2
	VPXOR   in2, in2_h, in2
	VPXOR   64(out)(pos*1), in2, in2
	VMOVDQU in2, 64(out)(pos*1)

	VMOVDQU 96(in)(pos*1), in3
	VPSRLQ  $4, in3, in3_h
	VPAND   mask, in3_h, in3_h
	VPAND   mask, in3, in3
	VPSHUFB in3_h, high_tbl, in3_h
	VPSHUFB in3, low_tbl, in3
	VPXOR   in3, in3_h, in3
	VPXOR   96(out)(pos*1), in3, in3
	VMOVDQU in3, 96(out)(pos*1)

	VMOVDQU 128(in)(pos*1), in4
	VPSRLQ  $4, in4, in4_h
	VPAND   mask, in4_h, in4_h
	VPAND   mask, in4, in4
	VPSHUFB in4_h, high_tbl, in4_h
	VPSHUFB in4, low_tbl, in4
	VPXOR   in4, in4_h, in4
	VPXOR   128(out)(pos*1), in4, in4
	VMOVDQU in4, 128(out)(pos*1)

	VMOVDQU 160(in)(pos*1), in5
	VPSRLQ  $4, in5, in5_h
	VPAND   mask, in5_h, in5_h
	VPAND   mask, in5, in5
	VPSHUFB in5_h, high_tbl, in5_h
	VPSHUFB in5, low_tbl, in5
	VPXOR   in5, in5_h, in5
	VPXOR   160(out)(pos*1), in5, in5
	VMOVDQU in5, 160(out)(pos*1)

	VMOVDQU 192(in)(pos*1), in0
	VPSRLQ  $4, in0, in0_h
	VPAND   mask, in0_h, in0_h
	VPAND   mask, in0, in0
	VPSHUFB in0_h, high_tbl, in0_h
	VPSHUFB in0, low_tbl, in0
	VPXOR   in0, in0_h, in0
	VPXOR   192(out)(pos*1), in0, in0
	VMOVDQU in0, 192(out)(pos*1)

	VMOVDQU 224(in)(pos*1), in1
	VPSRLQ  $4, in1, in1_h
	VPAND   mask, in1_h, in1_h
	VPAND   mask, in1, in1
	VPSHUFB in1_h, high_tbl, in1_h
	VPSHUFB in1, low_tbl, in1
	VPXOR   in1, in1_h, in1
	VPXOR   224(out)(pos*1), in1, in1
	VMOVDQU in1, 224(out)(pos*1)

	ADDQ $256, pos
	CMPQ len, pos
	JNE  loop256b
	VZEROUPPER
	RET

not_aligned:
	MOVQ len, tmp0
	ANDQ $255, tmp0

loop32b:
	VMOVDQU -32(in)(len*1), in0
	VPSRLQ  $4, in0, in0_h
	VPAND   mask, in0_h, in0_h
	VPAND   mask, in0, in0
	VPSHUFB in0_h, high_tbl, in0_h
	VPSHUFB in0, low_tbl, in0
	VPXOR   in0, in0_h, in0
	VPXOR   -32(out)(len*1), in0, in0
	VMOVDQU in0, -32(out)(len*1)
	SUBQ    $32, len
	SUBQ    $32, tmp0
	JG      loop32b
	CMPQ    len, $256
	JGE     aligned
	VZEROUPPER
	RET

one16b:
	VMOVDQU -16(in)(len*1), in0x
	VPSRLQ  $4, in0x, in0_hx
	VPAND   maskx, in0x, in0x
	VPAND   maskx, in0_hx, in0_hx
	VPSHUFB in0_hx, high_tblx, in0_hx
	VPSHUFB in0x, low_tblx, in0x
	VPXOR   in0x, in0_hx, in0x
	VPXOR   -16(out)(len*1), in0x, in0x
	VMOVDQU in0x, -16(out)(len*1)
	SUBQ    $16, len
	CMPQ    len, $0
	JNE     ymm
	RET

// func mulVectSSSE3(tbl, d, p []byte)
TEXT ·mulVectSSSE3(SB), NOSPLIT, $0
	MOVQ   i+24(FP), in
	MOVQ   o+48(FP), out
	MOVQ   tbl+0(FP), tmp0
	MOVOU  (tmp0), low_tblx
	MOVOU  16(tmp0), high_tblx
	MOVB   $15, tmp0
	MOVQ   tmp0, maskx
	PXOR   tmp0x, tmp0x
	PSHUFB tmp0x, maskx
	MOVQ   in_len+32(FP), len
	SHRQ   $4, len

loop:
	MOVOU  (in), in0x
	MOVOU  in0x, in0_hx
	PSRLQ  $4, in0_hx
	PAND   maskx, in0x
	PAND   maskx, in0_hx
	MOVOU  low_tblx, tmp1x
	MOVOU  high_tblx, tmp2x
	PSHUFB in0x, tmp1x
	PSHUFB in0_hx, tmp2x
	PXOR   tmp1x, tmp2x
	MOVOU  tmp2x, (out)
	ADDQ   $16, in
	ADDQ   $16, out
	SUBQ   $1, len
	JNZ    loop
	RET
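// The SSSE3 routines (mulVectSSSE3 above, mulVectAddSSSE3 below) apply the
// same nibble-table scheme with 128-bit XMM registers, one 16-byte block per
// iteration; presumably they are the path taken when AVX2 is unavailable
// (the CPU-feature dispatch is not part of this file). As with the AVX2 pair,
// the Add variant XORs the product into the existing bytes of p.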
// func mulVectAddSSSE3(tbl, d, p []byte)
TEXT ·mulVectAddSSSE3(SB), NOSPLIT, $0
	MOVQ   i+24(FP), in
	MOVQ   o+48(FP), out
	MOVQ   tbl+0(FP), tmp0
	MOVOU  (tmp0), low_tblx
	MOVOU  16(tmp0), high_tblx
	MOVB   $15, tmp0
	MOVQ   tmp0, maskx
	PXOR   tmp0x, tmp0x
	PSHUFB tmp0x, maskx
	MOVQ   in_len+32(FP), len
	SHRQ   $4, len

loop:
	MOVOU  (in), in0x
	MOVOU  in0x, in0_hx
	PSRLQ  $4, in0_hx
	PAND   maskx, in0x
	PAND   maskx, in0_hx
	MOVOU  low_tblx, tmp1x
	MOVOU  high_tblx, tmp2x
	PSHUFB in0x, tmp1x
	PSHUFB in0_hx, tmp2x
	PXOR   tmp1x, tmp2x
	MOVOU  (out), tmp3x
	PXOR   tmp3x, tmp2x
	MOVOU  tmp2x, (out)
	ADDQ   $16, in
	ADDQ   $16, out
	SUBQ   $1, len
	JNZ    loop
	RET

// func copy32B(dst, src []byte)
TEXT ·copy32B(SB), NOSPLIT, $0
	MOVQ  dst+0(FP), SI
	MOVQ  src+24(FP), DX
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU X0, (SI)
	MOVOU X1, 16(SI)
	RET
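// Note: copy32B above copies exactly 32 bytes with two unaligned 16-byte
// loads and stores and performs no length check, so callers must guarantee
// that both dst and src hold at least 32 bytes. Each TEXT symbol in this file
// corresponds to a Go declaration with the signature given in its "// func"
// comment; the Go-side stubs live elsewhere in the package.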