!14676 [MS][LITE][Develop] Remove use of x18 on Apple devices

From: @lx0095
Reviewed-by: @hangangqiang,@zhang_xue_tong
Signed-off-by: @zhang_xue_tong
This commit is contained in:
mindspore-ci-bot 2021-04-07 17:18:07 +08:00 committed by Gitee
commit 2513ed1ba7
31 changed files with 572 additions and 521 deletions

View File

@ -28,11 +28,11 @@ asm_function AdderFloatNeon64
ldr x8, [sp]
mov x18, #48 // sizeof(float) * 12
mul x17, x5, x18 // block stride of lhs/rhs: sizeof(float) * 12 * depth
mov x20, #48 // sizeof(float) * 12
mul x17, x5, x20 // block stride of lhs/rhs: sizeof(float) * 12 * depth
mov x18, #4
mul x8, x8, x18
mov x20, #4
mul x8, x8, x20
LoopRowStart:
cmp x6, #4
@ -595,9 +595,9 @@ LoopRow4:
LoopColEnd:
add x0, x0, x17
mov x18, #4
mul x18, x18, x7
sub x11, x11, x18
mov x20, #4
mul x20, x20, x7
sub x11, x11, x20
mov x2, x11
subs x6, x6, #12
bgt LoopRowStart

View File

@ -33,12 +33,13 @@
// w16: per_channel
asm_function ConvDw3x3Int8Neon64
sub sp, sp, #176
sub sp, sp, #192
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
stp x19, x20, [sp], #16
stp x21, x22, [sp], #16
stp x23, x24, [sp], #16
stp x25, x26, [sp], #16
ldr x8, [sp]
ldr x9, [sp, #8]
@ -84,16 +85,16 @@ asm_function ConvDw3x3Int8Neon64
mov x16, x1
add x17, x16, x5
add x18, x17, x5
add x25, x17, x5
ld1 {v9.8b}, [x16], x4
ld1 {v10.8b}, [x16], x4
ld1 {v11.8b}, [x16], x4
ld1 {v13.8b}, [x17], x4
ld1 {v14.8b}, [x17], x4
ld1 {v15.8b}, [x17], x4
ld1 {v17.8b}, [x18], x4
ld1 {v18.8b}, [x18], x4
ld1 {v19.8b}, [x18], x4
ld1 {v17.8b}, [x25], x4
ld1 {v18.8b}, [x25], x4
ld1 {v19.8b}, [x25], x4
ld1 {v21.4s}, [x3]
ld1 {v22.4s}, [x19]
@ -123,13 +124,13 @@ HEIGHT1_LOOP:
ld1 {v16.8b}, [x17]
smlal v23.4s, v0.4h, v10.4h
smlal2 v24.4s, v0.8h, v10.8h
ld1 {v20.8b}, [x18]
ld1 {v20.8b}, [x25]
add x1, x1, x21
ssubl v12.8h, v12.8b, v25.8b
smlal v21.4s, v1.4h, v10.4h
mov x16, x1
add x17, x16, x5
add x18, x17, x5
add x25, x17, x5
smlal2 v22.4s, v1.8h, v10.8h
ld1 {v9.8b}, [x16], x4
ssubl v16.8h, v16.8b, v25.8b
@ -159,17 +160,17 @@ HEIGHT1_LOOP:
smlal2 v24.4s, v5.8h, v16.8h
smlal v21.4s, v6.4h, v17.4h
smlal2 v22.4s, v6.8h, v17.8h
ld1 {v17.8b}, [x18], x4
ld1 {v17.8b}, [x25], x4
smlal v23.4s, v6.4h, v18.4h
smlal2 v24.4s, v6.8h, v18.8h
smlal v21.4s, v7.4h, v18.4h
smlal2 v22.4s, v7.8h, v18.8h
ld1 {v18.8b}, [x18], x4
ld1 {v18.8b}, [x25], x4
smlal v23.4s, v7.4h, v19.4h
smlal2 v24.4s, v7.8h, v19.8h
smlal v21.4s, v8.4h, v19.4h
smlal2 v22.4s, v8.8h, v19.8h
ld1 {v19.8b}, [x18], x4
ld1 {v19.8b}, [x25], x4
smlal v23.4s, v8.4h, v20.4h
smlal2 v24.4s, v8.8h, v20.8h
@ -278,7 +279,7 @@ WIDTH2_LEFT:
smlal2 v24.4s, v1.8h, v11.8h
smlal v21.4s, v2.4h, v11.4h
smlal2 v22.4s, v2.8h, v11.8h
ld1 {v20.8b}, [x18]
ld1 {v20.8b}, [x25]
smlal v23.4s, v2.4h, v12.4h
smlal2 v24.4s, v2.8h, v12.8h
smlal v21.4s, v3.4h, v13.4h
@ -443,12 +444,13 @@ OUTZP3:
st1 {v21.8b}, [x0], x6
End:
sub sp, sp, #176
sub sp, sp, #192
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
ldp x19, x20, [sp], #16
ldp x21, x22, [sp], #16
ldp x23, x24, [sp], #16
ldp x25, x26, [sp], #16
ret
#endif

View File

@ -33,12 +33,13 @@
// w16: per_channel
asm_function ConvDw3x3Int8Stride2
sub sp, sp, #176
sub sp, sp, #192
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
stp x19, x20, [sp], #16
stp x21, x22, [sp], #16
stp x23, x24, [sp], #16
stp x25, x26, [sp], #16
ldr x8, [sp]
ldr x9, [sp, #8]
@ -71,7 +72,7 @@ asm_function ConvDw3x3Int8Stride2
mov x16, x1
add x17, x16, x5
add x18, x17, x5
add x25, x17, x5
ld1 {v9.8b}, [x16], x4
ld1 {v10.8b}, [x16], x4
ssubl v9.8h, v9.8b, v28.8b
@ -83,11 +84,11 @@ asm_function ConvDw3x3Int8Stride2
ssubl v14.8h, v14.8b, v28.8b
ld1 {v16.8b}, [x17], x4
ssubl v15.8h, v15.8b, v28.8b
ld1 {v19.8b}, [x18], x4
ld1 {v19.8b}, [x25], x4
ssubl v16.8h, v16.8b, v28.8b
ld1 {v20.8b}, [x18], x4
ld1 {v20.8b}, [x25], x4
ssubl v19.8h, v19.8b, v28.8b
ld1 {v21.8b}, [x18], x4
ld1 {v21.8b}, [x25], x4
ssubl v20.8h, v20.8b, v28.8b
ssubl v21.8h, v21.8b, v28.8b
@ -108,7 +109,7 @@ HEIGHT1_LOOP:
ld1 {v17.8b}, [x17], x4
ssubl v12.8h, v12.8b, v28.8b
smlal v26.4s, v0.4h, v11.4h
ld1 {v22.8b}, [x18], x4
ld1 {v22.8b}, [x25], x4
ssubl v17.8h, v17.8b, v28.8b
smlal2 v27.4s, v0.8h, v11.8h
ld1 {v13.8b}, [x16], x4
@ -117,7 +118,7 @@ HEIGHT1_LOOP:
ld1 {v18.8b}, [x17], x4
ssubl v13.8h, v13.8b, v28.8b
smlal2 v25.4s, v1.8h, v10.8h
ld1 {v23.8b}, [x18], x4
ld1 {v23.8b}, [x25], x4
ssubl v18.8h, v18.8b, v28.8b
smlal v26.4s, v1.4h, v12.4h
mov v9.16b, v13.16b
@ -157,12 +158,12 @@ HEIGHT1_LOOP:
smlal2 v27.4s, v6.8h, v21.8h
smlal v24.4s, v7.4h, v20.4h
smlal2 v25.4s, v7.8h, v20.8h
ld1 {v20.8b}, [x18], x4
ld1 {v20.8b}, [x25], x4
smlal v26.4s, v7.4h, v22.4h
smlal2 v27.4s, v7.8h, v22.8h
smlal v24.4s, v8.4h, v21.4h
smlal2 v25.4s, v8.8h, v21.8h
ld1 {v21.8b}, [x18], x4
ld1 {v21.8b}, [x25], x4
ssubl v20.8h, v20.8b, v28.8b
smlal v26.4s, v8.4h, v23.4h
ssubl v21.8h, v21.8b, v28.8b
@ -260,7 +261,7 @@ WIDTH2_LEFT:
ld1 {v17.8b}, [x17], x4
ssubl v12.8h, v12.8b, v28.8b
smlal v26.4s, v0.4h, v11.4h
ld1 {v22.8b}, [x18], x4
ld1 {v22.8b}, [x25], x4
ssubl v17.8h, v17.8b, v28.8b
smlal2 v27.4s, v0.8h, v11.8h
ld1 {v13.8b}, [x16], x4
@ -269,7 +270,7 @@ WIDTH2_LEFT:
ld1 {v18.8b}, [x17], x4
ssubl v13.8h, v13.8b, v28.8b
smlal2 v25.4s, v1.8h, v10.8h
ld1 {v23.8b}, [x18], x4
ld1 {v23.8b}, [x25], x4
ssubl v18.8h, v18.8b, v28.8b
smlal v26.4s, v1.4h, v12.4h
ssubl v23.8h, v23.8b, v28.8b
@ -452,11 +453,12 @@ OUTZP3:
st1 {v24.8b}, [x0], x6
End:
sub sp, sp, #176
sub sp, sp, #192
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
ldp x19, x20, [sp], #16
ldp x21, x22, [sp], #16
ldp x23, x24, [sp], #16
ldp x25, x26, [sp], #16
ret
#endif

View File

@ -19,12 +19,13 @@ asm_function ConvDwFp32Center
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved
// whereas our coding style do not permit such amount of parameters
sub sp, sp, #176
sub sp, sp, #192
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
stp x19, x20, [sp], #16
stp x21, x22, [sp], #16
stp x23, x24, [sp], #16
stp x25, x26, [sp], #16
ldr x8, [sp]
ldr x9, [sp, #8]
@ -72,7 +73,7 @@ asm_function ConvDwFp32Center
mov v14.16b, v24.16b
mov v15.16b, v24.16b
LoopKh16:
mov x18, x7
mov x25, x7
mov x21, x16
LoopKw16:
mov x22, x21
@ -109,7 +110,7 @@ asm_function ConvDwFp32Center
ld1 {v23.4s}, [x22], x11
fmla v14.4s, v22.4s, v25.4s
fmla v15.4s, v23.4s, v25.4s
subs x18, x18, #1
subs x25, x25, #1
add x21, x21, x13
bne LoopKw16
add x16, x16, x12
@ -192,7 +193,7 @@ asm_function ConvDwFp32Center
mov v6.16b, v24.16b
mov v7.16b, v24.16b
LoopKh8:
mov x18, x7
mov x25, x7
mov x21, x16
LoopKw8:
mov x22, x21
@ -213,7 +214,7 @@ asm_function ConvDwFp32Center
ld1 {v23.4s}, [x22], x11
fmla v6.4s, v22.4s, v25.4s
fmla v7.4s, v23.4s, v25.4s
subs x18, x18, #1
subs x25, x25, #1
add x21, x21, x13
bne LoopKw8
add x16, x16, x12
@ -261,13 +262,13 @@ asm_function ConvDwFp32Center
mov x20, x6
mov v0.16b, v24.16b
LoopKh:
mov x18, x7
mov x25, x7
mov x22, x16
LoopKw:
ld1 {v16.4s}, [x22], x13
ld1 {v25.4s}, [x17], #16
fmla v0.4s, v16.4s, v25.4s
subs x18, x18, #1
subs x25, x25, #1
bne LoopKw
add x16, x16, x12
subs x20, x20, #1
@ -290,11 +291,12 @@ asm_function ConvDwFp32Center
subs x4, x4, #1
bne LoopH
sub sp, sp, #176
sub sp, sp, #192
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
ldp x19, x20, [sp], #16
ldp x21, x22, [sp], #16
ldp x23, x24, [sp], #16
ldp x25, x26, [sp], #16
ret
#endif

View File

@ -13,8 +13,9 @@
// x0: output, x1: input, x2: weights, x3: bias, x4: channels, x5: output_width, x6: input_stride, x7: relu, x8: relu6
asm_function ConvDwFp32Indirect3x3
sub sp, sp, #16
sub sp, sp, #32
stp x19, x20, [sp], #16
stp x21, x22, [sp], #16
movi v31.4s, #6
scvtf v31.4s, v31.4s
@ -28,7 +29,7 @@ asm_function ConvDwFp32Indirect3x3
ldp x12, x13, [x1]
ldp x14, x15, [x1, #16]
ldp x16, x17, [x1, #32]
ldp x18, x19, [x1, #48]
ldp x21, x19, [x1, #48]
ldr x20, [x1, #64]
mov x9, x2
mov x10, x3
@ -56,7 +57,7 @@ asm_function ConvDwFp32Indirect3x3
ld1 {v5.4s}, [x17], #16
ld1 {v22.4s}, [x9], #16
fmla v29.4s, v3.4s, v20.4s
ld1 {v6.4s}, [x18], #16
ld1 {v6.4s}, [x21], #16
ld1 {v23.4s}, [x9], #16
fmla v29.4s, v4.4s, v21.4s
ld1 {v7.4s}, [x19], #16
@ -100,7 +101,7 @@ asm_function ConvDwFp32Indirect3x3
ld1 {v5.4s}, [x17], #16
ld1 {v22.4s}, [x9], #16
fmla v29.4s, v3.4s, v20.4s
ld1 {v6.4s}, [x18], #16
ld1 {v6.4s}, [x21], #16
ld1 {v23.4s}, [x9], #16
fmla v29.4s, v4.4s, v21.4s
ld1 {v7.4s}, [x19], #16
@ -141,7 +142,8 @@ asm_function ConvDwFp32Indirect3x3
cmp x5, #0
bgt LoopPixel
End:
sub sp, sp, #16
sub sp, sp, #32
ldp x19, x20, [sp], #16
ldp x21, x22, [sp], #16
ret
#endif

View File

@ -13,17 +13,18 @@
// x0: output, x1: input, x2: weights, x3: bias, x4: channels, x5: output_width, x6: input_stride, x7: relu, x8: relu6
asm_function ConvDwFp32Indirect5x5
sub sp, sp, #160
sub sp, sp, #176
stp x19, x20, [sp, #64]
stp x21, x22, [sp, #80]
stp x23, x24, [sp, #96]
stp x25, x26, [sp, #112]
stp x27, x28, [sp, #128]
stp x29, x30, [sp, #144]
ldrb w8, [sp, #160]
ldrb w8, [sp, #176]
stp x2, x3, [sp]
stp x4, x6, [sp, #16]
stp x7, x8, [sp, #32]
stp x0, x1, [sp, #160]
movi v31.4s, #6
scvtf v31.4s, v31.4s
@ -44,7 +45,7 @@ asm_function ConvDwFp32Indirect5x5
ldp x12, x13, [x1, #48]
ldp x14, x15, [x1, #64]
ldp x16, x17, [x1, #80]
ldp x18, x19, [x1, #96]
ldp x0, x19, [x1, #96]
ldp x20, x21, [x1, #112]
ldp x22, x23, [x1, #128]
ldp x24, x25, [x1, #144]
@ -93,7 +94,7 @@ asm_function ConvDwFp32Indirect5x5
ld1 {v1.4s}, [x17], #16
ld1 {v19.4s}, [x5], #16
fmla v29.4s, v7.4s, v25.4s
ld1 {v2.4s}, [x18], #16
ld1 {v2.4s}, [x0], #16
ld1 {v20.4s}, [x5], #16
fmla v29.4s, v16.4s, v26.4s
ld1 {v3.4s}, [x19], #16
@ -160,7 +161,9 @@ asm_function ConvDwFp32Indirect5x5
RELU:
fmax v29.4s, v29.4s, v30.4s
WRITE:
st1 {v29.4s}, [x0], #16
ldr x4, [sp, #160]
st1 {v29.4s}, [x4], #16
str x4, [sp, #160]
ldr x4, [sp, #56]
ld1 {v29.4s}, [x4], #16
@ -195,7 +198,7 @@ asm_function ConvDwFp32Indirect5x5
ld1 {v1.4s}, [x17], #16
ld1 {v19.4s}, [x5], #16
fmla v29.4s, v7.4s, v25.4s
ld1 {v2.4s}, [x18], #16
ld1 {v2.4s}, [x0], #16
ld1 {v20.4s}, [x5], #16
fmla v29.4s, v16.4s, v26.4s
ld1 {v3.4s}, [x19], #16
@ -253,18 +256,24 @@ asm_function ConvDwFp32Indirect5x5
LeftWrite:
cmp x2, #4
bne Write3
st1 {v29.4s}, [x0], #16
ldr x4, [sp, #160]
st1 {v29.4s}, [x4], #16
str x4, [sp, #160]
b NextPixel
Write3:
sxtw x2, w2
tbnz w2, #1, Write2
tbnz w2, #0, Write1
Write2:
st1 {v29.2s}, [x0], #8
ldr x4, [sp, #160]
st1 {v29.2s}, [x4], #8
str x4, [sp, #160]
ext v29.16b, v29.16b, v29.16b, #8
tbz w2, #0, NextPixel
Write1:
str s29, [x0], #4
ldr x4, [sp, #160]
str s29, [x4], #4
str x4, [sp, #160]
NextPixel:
ldr x2, [sp, #24]
@ -279,6 +288,6 @@ End:
ldp x25, x26, [sp, #112]
ldp x27, x28, [sp, #128]
ldp x29, x30, [sp, #144]
add sp, sp, #160
add sp, sp, #176
ret
#endif

View File

@ -22,12 +22,13 @@ asm_function ConvDwInt8Center
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved
// whereas our coding style do not permit such amount of parameters
sub sp, sp, #176
sub sp, sp, #192
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
stp x19, x20, [sp], #16
stp x21, x22, [sp], #16
stp x23, x24, [sp], #16
stp x25, x26, [sp], #16
ldr x8, [sp]
ldr x9, [sp, #8]
@ -51,9 +52,9 @@ asm_function ConvDwInt8Center
ld1 {v24.4s}, [x17], #16
ld1 {v25.4s}, [x17], #16
ldr x18, [sp, #80] // right shift
ld1 {v26.4s}, [x18], #16
ld1 {v27.4s}, [x18], #16
ldr x25, [sp, #80] // right shift
ld1 {v26.4s}, [x25], #16
ld1 {v27.4s}, [x25], #16
ldr x19, [sp, #88] // acc_min
ld1 {v28.4s}, [x19], #16
@ -90,7 +91,7 @@ asm_function ConvDwInt8Center
mov v6.16b, v17.16b
mov v7.16b, v18.16b
LoopKh4:
mov x18, x7
mov x25, x7
mov x21, x16
LoopKw4:
mov x22, x21
@ -116,7 +117,7 @@ asm_function ConvDwInt8Center
smlal v6.4s, v8.4h, v16.4h
smlal2 v7.4s, v8.8h, v16.8h
subs x18, x18, #1
subs x25, x25, #1
add x21, x21, x13
bne LoopKw4
add x16, x16, x12
@ -194,15 +195,15 @@ asm_function ConvDwInt8Center
mov x16, x3
add x17, x16, x9
add x18, x17, x9
add x21, x18, x9
add x25, x17, x9
add x21, x25, x9
st1 {v0.s}[0], [x16], #4
st1 {v1.s}[0], [x16], #4
st1 {v2.s}[0], [x17], #4
st1 {v3.s}[0], [x17], #4
st1 {v4.s}[0], [x18], #4
st1 {v5.s}[0], [x18], #4
st1 {v4.s}[0], [x25], #4
st1 {v5.s}[0], [x25], #4
st1 {v6.s}[0], [x21], #4
st1 {v7.s}[0], [x21], #4
@ -221,7 +222,7 @@ asm_function ConvDwInt8Center
mov v0.16b, v17.16b
mov v1.16b, v18.16b
LoopKh:
mov x18, x7
mov x25, x7
mov x22, x16
LoopKw:
ld1 {v15.8b}, [x22], x13
@ -229,7 +230,7 @@ asm_function ConvDwInt8Center
ld1 {v16.8h}, [x17], #16
smlal v0.4s, v14.4h, v16.4h
smlal2 v1.4s, v14.8h, v16.8h
subs x18, x18, #1
subs x25, x25, #1
bne LoopKw
add x16, x16, x12
subs x20, x20, #1
@ -271,11 +272,12 @@ asm_function ConvDwInt8Center
subs x4, x4, #1
bne LoopH
sub sp, sp, #176
sub sp, sp, #192
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
ldp x19, x20, [sp], #16
ldp x21, x22, [sp], #16
ldp x23, x24, [sp], #16
ldp x25, x26, [sp], #16
ret
#endif

View File

@ -47,11 +47,11 @@ asm_function ConvSwFp32Center
LoopH:
mov x17, x1
mov x18, x5
mov x28, x5
mov x3, x0
cmp x18, #8
cmp x28, #8
blt LoopW
cmp x18, #16
cmp x28, #16
blt LoopW8
LoopW16:
@ -244,12 +244,12 @@ asm_function ConvSwFp32Center
st1 {v14.4s}, [x3], x9
st1 {v15.4s}, [x3], x9
add x17, x17, x19
sub x18, x18, #16
cmp x18, #0
sub x28, x28, #16
cmp x28, #0
ble LoopWEnd
cmp x18, #8
cmp x28, #8
blt LoopW
cmp x18, #16
cmp x28, #16
bge LoopW16
LoopW8:
mov x19, #8
@ -369,10 +369,10 @@ asm_function ConvSwFp32Center
st1 {v6.4s}, [x3], x9
st1 {v7.4s}, [x3], x9
add x17, x17, x19
sub x18, x18, #8
cmp x18, #0
sub x28, x28, #8
cmp x28, #0
ble LoopWEnd
cmp x18, #8
cmp x28, #8
bge LoopW8
LoopW:
mov x20, x17
@ -427,7 +427,7 @@ asm_function ConvSwFp32Center
Write:
st1 {v0.4s}, [x3], x9
add x17, x17, x12
subs x18, x18, #1
subs x28, x28, #1
bne LoopW
LoopWEnd:
add x0, x0, x8

View File

@ -33,12 +33,12 @@ asm_function DeconvDwFp32Center
mov x16, x1
mov x17, x4
LoopW:
mov x18, x15
mov x22, x15
mov x19, x2
mov x20, x5
ld1 {v1.4s}, [x16], x8
LoopKh:
mov x21, x18
mov x21, x22
mov x13, x6
LoopKw:
ld1 {v0.4s}, [x21]
@ -47,7 +47,7 @@ asm_function DeconvDwFp32Center
st1 {v0.4s}, [x21], x12
subs x13, x13, #1
bne LoopKw
add x18, x18, x11
add x22, x22, x11
subs x20, x20, #1
bne LoopKh
add x15, x15, x10

View File

@ -21,30 +21,31 @@
// w13: c8_nhwc_c4
asm_function MatmulFloatNeon64
sub sp, sp, #128
sub sp, sp, #144
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
stp x19, x20, [sp], #16
ldr x9, [sp, #8]
ldr x14, [sp, #16]
mov w18, #32 // sizeof(float) * 8
mul w15, w5, w18 // block stride of lhs/rhs: sizeof(float) * 8 * depth
mov x18, #4
mov w19, #32 // sizeof(float) * 8
mul w15, w5, w19 // block stride of lhs/rhs: sizeof(float) * 8 * depth
mov x19, #4
ldr x17, [sp]
cbz x14, NoWinoSteps
mul x8, x7, x17
mov x11, #8
mul x11, x11, x17
mul x8, x8, x18
mul x11, x11, x18
mul x8, x8, x19
mul x11, x11, x19
NoWinoSteps:
mul x17, x17, x18
mul x17, x17, x19
L1:
mov w10, w6 // reload lhs row
mov x12, x0 // reload lhs ptr
mov x18, x2 // reload dst ptr
mov x19, x2 // reload dst ptr
L2:
mov x16, x1 // reload rhs ptr
@ -254,435 +255,435 @@ Write:
b Write8
Write1:
str s8, [x18]
str s8, [x19]
cmp w10, #1
beq WriteEnd
add x18, x18, x17
str s10, [x18]
add x19, x19, x17
str s10, [x19]
cmp w10, #2
beq WriteEnd
add x18, x18, x17
str s12, [x18]
add x19, x19, x17
str s12, [x19]
cmp w10, #3
beq WriteEnd
add x18, x18, x17
str s14, [x18]
add x19, x19, x17
str s14, [x19]
cmp w10, #4
beq WriteEnd
add x18, x18, x17
str s16, [x18]
add x19, x19, x17
str s16, [x19]
cmp w10, #5
beq WriteEnd
add x18, x18, x17
str s18, [x18]
add x19, x19, x17
str s18, [x19]
cmp w10, #6
beq WriteEnd
add x18, x18, x17
str s20, [x18]
add x19, x19, x17
str s20, [x19]
cmp w10, #7
beq WriteEnd
add x18, x18, x17
str s22, [x18]
add x19, x19, x17
str s22, [x19]
cmp w10, #8
beq WriteEnd
add x18, x18, x17
str s24, [x18]
add x19, x19, x17
str s24, [x19]
cmp w10, #9
beq WriteEnd
add x18, x18, x17
str s26, [x18]
add x19, x19, x17
str s26, [x19]
cmp w10, #10
beq WriteEnd
add x18, x18, x17
str s28, [x18]
add x19, x19, x17
str s28, [x19]
cmp w10, #11
beq WriteEnd
add x18, x18, x17
str s30, [x18]
add x18, x18, x17
add x19, x19, x17
str s30, [x19]
add x19, x19, x17
b WriteEnd
Write2:
dup s9, v8.s[1]
stp s8, s9, [x18]
stp s8, s9, [x19]
cmp w10, #1
beq WriteEnd
add x18, x18, x17
add x19, x19, x17
dup s11, v10.s[1]
stp s10, s11, [x18]
stp s10, s11, [x19]
cmp w10, #2
beq WriteEnd
add x18, x18, x17
add x19, x19, x17
dup s13, v12.s[1]
stp s12, s13, [x18]
stp s12, s13, [x19]
cmp w10, #3
beq WriteEnd
add x18, x18, x17
add x19, x19, x17
dup s15, v14.s[1]
stp s14, s15, [x18]
stp s14, s15, [x19]
cmp w10, #4
beq WriteEnd
add x18, x18, x17
add x19, x19, x17
dup s17, v16.s[1]
stp s16, s17, [x18]
stp s16, s17, [x19]
cmp w10, #5
beq WriteEnd
add x18, x18, x17
add x19, x19, x17
dup s19, v18.s[1]
stp s18, s19, [x18]
stp s18, s19, [x19]
cmp w10, #6
beq WriteEnd
add x18, x18, x17
add x19, x19, x17
dup s21, v20.s[1]
stp s20, s21, [x18]
stp s20, s21, [x19]
cmp w10, #7
beq WriteEnd
add x18, x18, x17
add x19, x19, x17
dup s23, v22.s[1]
stp s22, s23, [x18]
stp s22, s23, [x19]
cmp w10, #8
beq WriteEnd
add x18, x18, x17
add x19, x19, x17
dup s25, v24.s[1]
stp s24, s25, [x18]
stp s24, s25, [x19]
cmp w10, #9
beq WriteEnd
add x18, x18, x17
add x19, x19, x17
dup s27, v26.s[1]
stp s26, s27, [x18]
stp s26, s27, [x19]
cmp w10, #10
beq WriteEnd
add x18, x18, x17
add x19, x19, x17
dup s29, v28.s[1]
stp s28, s29, [x18]
stp s28, s29, [x19]
cmp w10, #11
beq WriteEnd
add x18, x18, x17
add x19, x19, x17
dup s31, v30.s[1]
stp s30, s31, [x18]
add x18, x18, x17
stp s30, s31, [x19]
add x19, x19, x17
b WriteEnd
Write3:
add x13, x18, #8
add x13, x19, #8
dup s9, v8.s[1]
stp s8, s9, [x18]
add x18, x18, x17
stp s8, s9, [x19]
add x19, x19, x17
st1 {v8.s}[2], [x13], x17
cmp w10, #1
beq WriteEnd
dup s11, v10.s[1]
stp s10, s11, [x18]
add x18, x18, x17
stp s10, s11, [x19]
add x19, x19, x17
st1 {v10.s}[2], [x13], x17
cmp w10, #2
beq WriteEnd
dup s13, v12.s[1]
stp s12, s13, [x18]
add x18, x18, x17
stp s12, s13, [x19]
add x19, x19, x17
st1 {v12.s}[2], [x13], x17
cmp w10, #3
beq WriteEnd
dup s15, v14.s[1]
stp s14, s15, [x18]
add x18, x18, x17
stp s14, s15, [x19]
add x19, x19, x17
st1 {v14.s}[2], [x13], x17
cmp w10, #4
beq WriteEnd
dup s17, v16.s[1]
stp s16, s17, [x18]
add x18, x18, x17
stp s16, s17, [x19]
add x19, x19, x17
st1 {v16.s}[2], [x13], x17
cmp w10, #5
beq WriteEnd
dup s19, v18.s[1]
stp s18, s19, [x18]
add x18, x18, x17
stp s18, s19, [x19]
add x19, x19, x17
st1 {v18.s}[2], [x13], x17
cmp w10, #6
beq WriteEnd
dup s21, v20.s[1]
stp s20, s21, [x18]
add x18, x18, x17
stp s20, s21, [x19]
add x19, x19, x17
st1 {v20.s}[2], [x13], x17
cmp w10, #7
beq WriteEnd
dup s23, v22.s[1]
stp s22, s23, [x18]
add x18, x18, x17
stp s22, s23, [x19]
add x19, x19, x17
st1 {v22.s}[2], [x13], x17
cmp w10, #8
beq WriteEnd
dup s25, v24.s[1]
stp s24, s25, [x18]
add x18, x18, x17
stp s24, s25, [x19]
add x19, x19, x17
st1 {v24.s}[2], [x13], x17
cmp w10, #9
beq WriteEnd
dup s27, v26.s[1]
stp s26, s27, [x18]
add x18, x18, x17
stp s26, s27, [x19]
add x19, x19, x17
st1 {v26.s}[2], [x13], x17
cmp w10, #10
beq WriteEnd
dup s29, v28.s[1]
stp s28, s29, [x18]
add x18, x18, x17
stp s28, s29, [x19]
add x19, x19, x17
st1 {v28.s}[2], [x13], x17
cmp w10, #11
beq WriteEnd
dup s31, v30.s[1]
stp s30, s31, [x18]
add x18, x18, x17
stp s30, s31, [x19]
add x19, x19, x17
st1 {v30.s}[2], [x13]
b WriteEnd
Write4:
st1 {v8.4s}, [x18], x17
st1 {v8.4s}, [x19], x17
cmp w10, #1
beq WriteEnd
st1 {v10.4s}, [x18], x17
st1 {v10.4s}, [x19], x17
cmp w10, #2
beq WriteEnd
st1 {v12.4s}, [x18], x17
st1 {v12.4s}, [x19], x17
cmp w10, #3
beq WriteEnd
st1 {v14.4s}, [x18], x17
st1 {v14.4s}, [x19], x17
cmp w10, #4
beq WriteEnd
st1 {v16.4s}, [x18], x17
st1 {v16.4s}, [x19], x17
cmp w10, #5
beq WriteEnd
st1 {v18.4s}, [x18], x17
st1 {v18.4s}, [x19], x17
cmp w10, #6
beq WriteEnd
st1 {v20.4s}, [x18], x17
st1 {v20.4s}, [x19], x17
cmp w10, #7
beq WriteEnd
st1 {v22.4s}, [x18], x17
st1 {v22.4s}, [x19], x17
cmp w10, #8
beq WriteEnd
st1 {v24.4s}, [x18], x17
st1 {v24.4s}, [x19], x17
cmp w10, #9
beq WriteEnd
st1 {v26.4s}, [x18], x17
st1 {v26.4s}, [x19], x17
cmp w10, #10
beq WriteEnd
st1 {v28.4s}, [x18], x17
st1 {v28.4s}, [x19], x17
cmp w10, #11
beq WriteEnd
st1 {v30.4s}, [x18], x17
st1 {v30.4s}, [x19], x17
b WriteEnd
Write5:
add x13, x18, #16
st1 {v8.4s}, [x18], x17
add x13, x19, #16
st1 {v8.4s}, [x19], x17
str s9, [x13]
cmp w10, #1
beq WriteEnd
add x13, x13, x17
st1 {v10.4s}, [x18], x17
st1 {v10.4s}, [x19], x17
str s11, [x13]
cmp w10, #2
beq WriteEnd
add x13, x13, x17
st1 {v12.4s}, [x18], x17
st1 {v12.4s}, [x19], x17
str s13, [x13]
cmp w10, #3
beq WriteEnd
add x13, x13, x17
st1 {v14.4s}, [x18], x17
st1 {v14.4s}, [x19], x17
str s15, [x13]
cmp w10, #4
beq WriteEnd
add x13, x13, x17
st1 {v16.4s}, [x18], x17
st1 {v16.4s}, [x19], x17
str s17, [x13]
cmp w10, #5
beq WriteEnd
add x13, x13, x17
st1 {v18.4s}, [x18], x17
st1 {v18.4s}, [x19], x17
str s19, [x13]
cmp w10, #6
beq WriteEnd
add x13, x13, x17
st1 {v20.4s}, [x18], x17
st1 {v20.4s}, [x19], x17
str s21, [x13]
cmp w10, #7
beq WriteEnd
add x13, x13, x17
st1 {v22.4s}, [x18], x17
st1 {v22.4s}, [x19], x17
str s23, [x13]
cmp w10, #8
beq WriteEnd
add x13, x13, x17
st1 {v24.4s}, [x18], x17
st1 {v24.4s}, [x19], x17
str s25, [x13]
cmp w10, #9
beq WriteEnd
add x13, x13, x17
st1 {v26.4s}, [x18], x17
st1 {v26.4s}, [x19], x17
str s27, [x13]
cmp w10, #10
beq WriteEnd
add x13, x13, x17
st1 {v28.4s}, [x18], x17
st1 {v28.4s}, [x19], x17
str s29, [x13]
cmp w10, #11
beq WriteEnd
add x13, x13, x17
st1 {v30.4s}, [x18], x17
st1 {v30.4s}, [x19], x17
str s31, [x13]
b WriteEnd
Write6:
add x13, x18, #16
st1 {v8.4s}, [x18], x17
add x13, x19, #16
st1 {v8.4s}, [x19], x17
dup s8, v9.s[1]
stp s9, s8, [x13]
cmp w10, #1
beq WriteEnd
add x13, x13, x17
st1 {v10.4s}, [x18], x17
st1 {v10.4s}, [x19], x17
dup s10, v11.s[1]
stp s11, s10, [x13]
cmp w10, #2
beq WriteEnd
add x13, x13, x17
st1 {v12.4s}, [x18], x17
st1 {v12.4s}, [x19], x17
dup s12, v13.s[1]
stp s13, s12, [x13]
cmp w10, #3
beq WriteEnd
add x13, x13, x17
st1 {v14.4s}, [x18], x17
st1 {v14.4s}, [x19], x17
dup s14, v15.s[1]
stp s15, s14, [x13]
cmp w10, #4
beq WriteEnd
add x13, x13, x17
st1 {v16.4s}, [x18], x17
st1 {v16.4s}, [x19], x17
dup s16, v17.s[1]
stp s17, s16, [x13]
cmp w10, #5
beq WriteEnd
add x13, x13, x17
st1 {v18.4s}, [x18], x17
st1 {v18.4s}, [x19], x17
dup s18, v19.s[1]
stp s19, s18, [x13]
cmp w10, #6
beq WriteEnd
add x13, x13, x17
st1 {v20.4s}, [x18], x17
st1 {v20.4s}, [x19], x17
dup s20, v21.s[1]
stp s21, s20, [x13]
cmp w10, #7
beq WriteEnd
add x13, x13, x17
st1 {v22.4s}, [x18], x17
st1 {v22.4s}, [x19], x17
dup s22, v23.s[1]
stp s23, s22, [x13]
cmp w10, #8
beq WriteEnd
add x13, x13, x17
st1 {v24.4s}, [x18], x17
st1 {v24.4s}, [x19], x17
dup s24, v25.s[1]
stp s25, s24, [x13]
cmp w10, #9
beq WriteEnd
add x13, x13, x17
st1 {v26.4s}, [x18], x17
st1 {v26.4s}, [x19], x17
dup s26, v27.s[1]
stp s27, s26, [x13]
cmp w10, #10
beq WriteEnd
add x13, x13, x17
st1 {v28.4s}, [x18], x17
st1 {v28.4s}, [x19], x17
dup s28, v29.s[1]
stp s29, s28, [x13]
cmp w10, #11
beq WriteEnd
add x13, x13, x17
st1 {v30.4s}, [x18], x17
st1 {v30.4s}, [x19], x17
dup s30, v31.s[1]
stp s31, s30, [x13]
b WriteEnd
Write7:
add x13, x18, #16
add x16, x18, #24
st1 {v8.4s}, [x18], x17
add x13, x19, #16
add x16, x19, #24
st1 {v8.4s}, [x19], x17
dup s8, v9.s[1]
stp s9, s8, [x13]
add x13, x13, x17
st1 {v9.s}[2], [x16], x17
cmp w10, #1
beq WriteEnd
st1 {v10.4s}, [x18], x17
st1 {v10.4s}, [x19], x17
dup s10, v11.s[1]
stp s11, s10, [x13]
add x13, x13, x17
st1 {v11.s}[2], [x16], x17
cmp w10, #2
beq WriteEnd
st1 {v12.4s}, [x18], x17
st1 {v12.4s}, [x19], x17
dup s12, v13.s[1]
stp s13, s12, [x13]
add x13, x13, x17
st1 {v13.s}[2], [x16], x17
cmp w10, #3
beq WriteEnd
st1 {v14.4s}, [x18], x17
st1 {v14.4s}, [x19], x17
dup s14, v15.s[1]
stp s15, s14, [x13]
add x13, x13, x17
st1 {v15.s}[2], [x16], x17
cmp w10, #4
beq WriteEnd
st1 {v16.4s}, [x18], x17
st1 {v16.4s}, [x19], x17
dup s16, v17.s[1]
stp s17, s16, [x13]
add x13, x13, x17
st1 {v17.s}[2], [x16], x17
cmp w10, #5
beq WriteEnd
st1 {v18.4s}, [x18], x17
st1 {v18.4s}, [x19], x17
dup s18, v19.s[1]
stp s19, s18, [x13]
add x13, x13, x17
st1 {v19.s}[2], [x16], x17
cmp w10, #6
beq WriteEnd
st1 {v20.4s}, [x18], x17
st1 {v20.4s}, [x19], x17
dup s20, v21.s[1]
stp s21, s20, [x13]
add x13, x13, x17
st1 {v21.s}[2], [x16], x17
cmp w10, #7
beq WriteEnd
st1 {v22.4s}, [x18], x17
st1 {v22.4s}, [x19], x17
dup s22, v23.s[1]
stp s23, s22, [x13]
add x13, x13, x17
st1 {v23.s}[2], [x16], x17
cmp w10, #8
beq WriteEnd
st1 {v24.4s}, [x18], x17
st1 {v24.4s}, [x19], x17
dup s24, v25.s[1]
stp s25, s24, [x13]
add x13, x13, x17
st1 {v25.s}[2], [x16], x17
cmp w10, #9
beq WriteEnd
st1 {v26.4s}, [x18], x17
st1 {v26.4s}, [x19], x17
dup s26, v27.s[1]
stp s27, s26, [x13]
add x13, x13, x17
st1 {v27.s}[2], [x16], x17
cmp w10, #10
beq WriteEnd
st1 {v28.4s}, [x18], x17
st1 {v28.4s}, [x19], x17
dup s28, v29.s[1]
stp s29, s28, [x13]
add x13, x13, x17
st1 {v29.s}[2], [x16], x17
cmp w10, #11
beq WriteEnd
st1 {v30.4s}, [x18], x17
st1 {v30.4s}, [x19], x17
dup s30, v31.s[1]
stp s31, s30, [x13]
add x13, x13, x17
@ -697,54 +698,54 @@ WriteC8:
st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x2], #64
b WriteEnd
WriteWino:
st1 {v8.4s, v9.4s}, [x18], x8
st1 {v10.4s, v11.4s}, [x18], x8
st1 {v12.4s, v13.4s}, [x18], x8
st1 {v14.4s, v15.4s}, [x18], x8
st1 {v16.4s, v17.4s}, [x18], x8
st1 {v18.4s, v19.4s}, [x18], x8
st1 {v20.4s, v21.4s}, [x18], x8
st1 {v22.4s, v23.4s}, [x18], x8
st1 {v24.4s, v25.4s}, [x18], x8
st1 {v26.4s, v27.4s}, [x18], x8
st1 {v28.4s, v29.4s}, [x18], x8
st1 {v30.4s, v31.4s}, [x18], x8
st1 {v8.4s, v9.4s}, [x19], x8
st1 {v10.4s, v11.4s}, [x19], x8
st1 {v12.4s, v13.4s}, [x19], x8
st1 {v14.4s, v15.4s}, [x19], x8
st1 {v16.4s, v17.4s}, [x19], x8
st1 {v18.4s, v19.4s}, [x19], x8
st1 {v20.4s, v21.4s}, [x19], x8
st1 {v22.4s, v23.4s}, [x19], x8
st1 {v24.4s, v25.4s}, [x19], x8
st1 {v26.4s, v27.4s}, [x19], x8
st1 {v28.4s, v29.4s}, [x19], x8
st1 {v30.4s, v31.4s}, [x19], x8
b WriteEnd
Write8:
st1 {v8.4s, v9.4s}, [x18], x17
st1 {v8.4s, v9.4s}, [x19], x17
cmp w10, #1
beq WriteEnd
st1 {v10.4s, v11.4s}, [x18], x17
st1 {v10.4s, v11.4s}, [x19], x17
cmp w10, #2
beq WriteEnd
st1 {v12.4s, v13.4s}, [x18], x17
st1 {v12.4s, v13.4s}, [x19], x17
cmp w10, #3
beq WriteEnd
st1 {v14.4s, v15.4s}, [x18], x17
st1 {v14.4s, v15.4s}, [x19], x17
cmp w10, #4
beq WriteEnd
st1 {v16.4s, v17.4s}, [x18], x17
st1 {v16.4s, v17.4s}, [x19], x17
cmp w10, #5
beq WriteEnd
st1 {v18.4s, v19.4s}, [x18], x17
st1 {v18.4s, v19.4s}, [x19], x17
cmp w10, #6
beq WriteEnd
st1 {v20.4s, v21.4s}, [x18], x17
st1 {v20.4s, v21.4s}, [x19], x17
cmp w10, #7
beq WriteEnd
st1 {v22.4s, v23.4s}, [x18], x17
st1 {v22.4s, v23.4s}, [x19], x17
cmp w10, #8
beq WriteEnd
st1 {v24.4s, v25.4s}, [x18], x17
st1 {v24.4s, v25.4s}, [x19], x17
cmp w10, #9
beq WriteEnd
st1 {v26.4s, v27.4s}, [x18], x17
st1 {v26.4s, v27.4s}, [x19], x17
cmp w10, #10
beq WriteEnd
st1 {v28.4s, v29.4s}, [x18], x17
st1 {v28.4s, v29.4s}, [x19], x17
cmp w10, #11
beq WriteEnd
st1 {v30.4s, v31.4s}, [x18], x17
st1 {v30.4s, v31.4s}, [x19], x17
WriteEnd:
subs w10, w10, #12 // lhs row - 12
@ -766,8 +767,9 @@ NoDstStep:
bgt L1
End1:
sub sp, sp, #128
sub sp, sp, #144
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
ldp x19, x20, [sp], #16
ret
#endif

View File

@ -21,31 +21,32 @@
// x9: writeMode
asm_function MatmulFloatNeon64Opt
sub sp, sp, #144
sub sp, sp, #160
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
stp x19, x20, [sp], #16
stp x21, x22, [sp], #16
ldr x8, [sp]
ldr x9, [sp, #8]
mov x18, #48 // sizeof(float) * 12
mul x17, x5, x18 // block stride of lhs/rhs: sizeof(float) * 12 * depth
mov x21, #48 // sizeof(float) * 12
mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float) * 12 * depth
cbnz x9, NoC8Steps
mov x11, x2
mov x18, #32
mul x16, x6, x18 // row * 8 * sizeof(float)
mov x21, #32
mul x16, x6, x21 // row * 8 * sizeof(float)
NoC8Steps:
cmp x9, #2
bne NoWinoSteps
mov x18, #4
mov x21, #4
mul x15, x7, x8
mul x15, x15, x18 // kernel_size * col *sizeof(float)
mov x18, #32
mul x16, x8, x18 // kernel_size * 8 * sizeof(float)
mul x15, x15, x21 // kernel_size * col *sizeof(float)
mov x21, #32
mul x16, x8, x21 // kernel_size * 8 * sizeof(float)
NoWinoSteps:
mov x18, #4
mul x8, x8, x18
mov x21, #4
mul x8, x8, x21
LoopRowStart:
cmp x6, #4
@ -1117,9 +1118,9 @@ LoopRow4:
LoopColEnd:
add x0, x0, x17
cbz x9, C8DstStep
mov x18, #4
mul x18, x18, x7
sub x11, x11, x18
mov x21, #4
mul x21, x21, x7
sub x11, x11, x21
mov x2, x11
b NoDstStep
C8DstStep:
@ -1129,9 +1130,10 @@ LoopColEnd:
subs x6, x6, #12
bgt LoopRowStart
sub sp, sp, #144
sub sp, sp, #160
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
ldp x19, x20, [sp], #16
ldp x21, x22, [sp], #16
ret
#endif

View File

@ -67,7 +67,7 @@ L2:
cmp w16, #0
beq End2
mov x18, x1 // reload b ptr
mov x28, x1 // reload b ptr
mov x19, x7 // reload bias ptr
mov w20, w5 // reload depth
dup v16.4s, wzr
@ -94,10 +94,10 @@ L3:
ld1 {v1.16b}, [x17], #16
ld1 {v2.16b}, [x17], #16
ld1 {v3.16b}, [x17], #16
ld1 {v4.16b}, [x18], #16
ld1 {v5.16b}, [x18], #16
ld1 {v6.16b}, [x18], #16
ld1 {v7.16b}, [x18], #16
ld1 {v4.16b}, [x28], #16
ld1 {v5.16b}, [x28], #16
ld1 {v6.16b}, [x28], #16
ld1 {v7.16b}, [x28], #16
smull v8.8h, v4.8b, v0.8b
smull v9.8h, v5.8b, v0.8b

View File

@ -30,7 +30,7 @@
// x28: filter_zp
asm_function MatmulInt8Opt
sub sp, sp, #208
sub sp, sp, #224
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
stp x19, x20, [sp], #16
@ -38,6 +38,7 @@ asm_function MatmulInt8Opt
stp x23, x24, [sp], #16
stp x25, x26, [sp], #16
stp x27, x28, [sp], #16
stp x29, x30, [sp], #16
ldr w8, [sp]
ldr w9, [sp, #8]
@ -55,7 +56,7 @@ asm_function MatmulInt8Opt
LoopRow:
mov x16, x1 // reload rhs ptr
mov x17, x4 // reload rhs col
mov x18, x7 // reload bias ptr
mov x29, x7 // reload bias ptr
mov x27, x2 // reload dst ptr
ldr x28, [sp, #64] // reload filter_zp
@ -158,7 +159,7 @@ LoopRow:
Bias:
cbz x7, NoBias
ld1 {v15.4s}, [x18], #16
ld1 {v15.4s}, [x29], #16
add v16.4s, v16.4s, v15.4s
add v17.4s, v17.4s, v15.4s
add v18.4s, v18.4s, v15.4s
@ -330,7 +331,7 @@ LoopColEnd:
b LoopRow
LoopRowEnd:
sub sp, sp, #208
sub sp, sp, #224
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
ldp x19, x20, [sp], #16
@ -338,5 +339,6 @@ LoopRowEnd:
ldp x23, x24, [sp], #16
ldp x25, x26, [sp], #16
ldp x27, x28, [sp], #16
ldp x29, x30, [sp], #16
ret
#endif

View File

@ -20,9 +20,10 @@
// x7: bias
asm_function MatMulR4Int8Neon64
sub sp, sp, #128
sub sp, sp, #144
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
stp x19, x20, [sp], #16
mov w15, #0 // b col index
mov w16, #0 // a row index
@ -40,7 +41,7 @@ L2:
cmp w16, w3
beq End2
mov x18, x1 // reload b ptr
mov x19, x1 // reload b ptr
mov x10, x7 // reload bias ptr
mov w11, w5 // reload depth
dup v16.4s, wzr
@ -67,10 +68,10 @@ L3:
ld1 {v1.16b}, [x17], #16
ld1 {v2.16b}, [x17], #16
ld1 {v3.16b}, [x17], #16
ld1 {v4.16b}, [x18], #16
ld1 {v5.16b}, [x18], #16
ld1 {v6.16b}, [x18], #16
ld1 {v7.16b}, [x18], #16
ld1 {v4.16b}, [x19], #16
ld1 {v5.16b}, [x19], #16
ld1 {v6.16b}, [x19], #16
ld1 {v7.16b}, [x19], #16
smull v8.8h, v4.8b, v0.8b
smull v9.8h, v5.8b, v0.8b
@ -172,8 +173,9 @@ End2:
b L1
End1:
sub sp, sp, #128
sub sp, sp, #144
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
ldp x19, x20, [sp], #16
ret
#endif

View File

@ -30,13 +30,13 @@ asm_function MatrixMultiplyWinograd
mov x14, x1 // mat_b
LoopN:
mov x16, x0 // mat_a_m
sub x18, x5, x15 // ni
sub x22, x5, x15 // ni
sub x19, x17, x3 // mi
mul x18, x18, x17 // ni * m
mul x22, x22, x17 // ni * m
mov x11, x6 // in_channel
add x18, x18, x19 // (ni * m) + mi
mul x18, x18, x7 // x18 * c4_channel
add x20, x2, x18 // dst + offset
add x22, x22, x19 // (ni * m) + mi
mul x22, x22, x7 // x22 * c4_channel
add x20, x2, x22 // dst + offset
cmp x11, #16
bge LoopC16
cmp x11, #8

View File

@ -1,6 +1,5 @@
#ifdef __aarch64__
#include "nnacl/assembly_global.h"
.text
.align 5
//.p2align 5,,15

View File

@ -55,16 +55,16 @@ LoopH:
ld1 {v0.s}[2], [x17], x10
ld1 {v0.s}[3], [x17], x10
mov x11, x6
mov x18, x17
add x18, x14, x7
add x16, x18, x7
mov x20, x17
add x20, x14, x7
add x16, x20, x7
add x19, x16, x7
LoopLength4:
ld1 {v16.4s}, [x2]
ld1 {v20.4s}, [x14], #16
fmla v16.4s, v20.4s, v0.s[0]
ld1 {v21.4s}, [x18], #16
ld1 {v21.4s}, [x20], #16
fmul v17.4s, v21.4s, v0.s[1]
ld1 {v20.4s}, [x16], #16
fmla v16.4s, v20.4s, v0.s[2]
@ -90,14 +90,14 @@ LoopH:
ld1 {v0.s}[1], [x17], x10
ld1 {v0.s}[2], [x17], x10
mov x11, x6
mov x18, x17
add x18, x14, x7
add x16, x18, x7
mov x20, x17
add x20, x14, x7
add x16, x20, x7
LoopLength3:
ld1 {v16.4s}, [x2]
ld1 {v20.4s}, [x14], #16
fmla v16.4s, v20.4s, v0.s[0]
ld1 {v21.4s}, [x18], #16
ld1 {v21.4s}, [x20], #16
fmul v17.4s, v21.4s, v0.s[1]
ld1 {v20.4s}, [x16], #16
fmla v16.4s, v20.4s, v0.s[2]

View File

@ -18,6 +18,9 @@ asm_function WinogradTransRight
//x5: k
//x6: length
sub sp, sp, #16
stp x19, x20, [sp], #16
mov x8, #16 // 4 * sizeof(float)
mul x8, x6, x8
mul x9, x5, x8 // step for S
@ -43,7 +46,7 @@ LoopH:
cmp x12, #4
blt LoopKStart3
mov x16, x15
mov x18, x4
mov x19, x4
LoopK4:
ld1 {v0.s}[0], [x13], x10
ld1 {v0.s}[1], [x13], x10
@ -54,7 +57,7 @@ LoopH:
add x14, x17, x8
add x16, x14, x8
add x18, x16, x8
add x19, x16, x8
LoopLength4:
ld1 {v16.4s}, [x2]
@ -64,7 +67,7 @@ LoopH:
fmul v17.4s, v21.4s, v0.s[1]
ld1 {v20.4s}, [x16], #16
fmla v16.4s, v20.4s, v0.s[2]
ld1 {v21.4s}, [x18], #16
ld1 {v21.4s}, [x19], #16
fmla v17.4s, v21.4s, v0.s[3]
fadd v17.4s, v16.4s, v17.4s
@ -73,7 +76,7 @@ LoopH:
bne LoopLength4
sub x2, x2, x8
sub x12, x12, #4
mov x17, x18
mov x17, x19
cmp x12, #4
bge LoopK4
@ -107,7 +110,7 @@ LoopH:
bne LoopLength3
sub x2, x2, x8
sub x12, x12, #3
mov x17, x18
mov x17, x19
cmp x12, #3
bge LoopK3
@ -141,5 +144,7 @@ LoopH:
subs x4, x4, #1
bne LoopH
sub sp, sp, #16
ldp x19, x20, [sp], #16
ret
#endif

View File

@ -1,4 +1,5 @@
#ifdef ENABLE_AVX
#include "nnacl/assembly_global.h"
.text
.align 4
.global ConvDwFp32Avx3x3
@ -31,7 +32,7 @@
// 56: input_stride
// 64: relu
// 72: relu6
ConvDwFp32Avx3x3:
asm_function ConvDwFp32Avx3x3
pushq %r15
pushq %r14
pushq %r13

View File

@ -1,4 +1,5 @@
#ifdef ENABLE_AVX
#include "nnacl/assembly_global.h"
.text
.align 4
.global MatmulFloatAvxOpt
@ -34,7 +35,7 @@
// 72: stride
// 80: writeMode
MatmulFloatAvxOpt:
asm_function MatmulFloatAvxOpt
// rbx, rsp, rbp, r12-r15 must be saved according to x86 calling convention
pushq %r15
pushq %r14

View File

@ -19,12 +19,13 @@ asm_function ConvDwFp16Center
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ x29 should be also preserved
// whereas our coding style do not permit such amount of parameters
sub sp, sp, #176
sub sp, sp, #192
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
stp x19, x20, [sp], #16
stp x21, x22, [sp], #16
stp x23, x24, [sp], #16
stp x25, x26, [sp], #16
ldr x8, [sp]
ldr x9, [sp, #8]
@ -71,7 +72,7 @@ asm_function ConvDwFp16Center
mov v14.16b, v24.16b
mov v15.16b, v24.16b
LoopKh16:
mov x18, x7
mov x25, x7
mov x21, x16
LoopKw16:
mov x22, x21
@ -108,7 +109,7 @@ asm_function ConvDwFp16Center
ld1 {v23.8h}, [x22], x11
fmla v14.8h, v22.8h, v25.8h
fmla v15.8h, v23.8h, v25.8h
subs x18, x18, #1
subs x25, x25, #1
add x21, x21, x13
bne LoopKw16
add x16, x16, x12
@ -191,7 +192,7 @@ asm_function ConvDwFp16Center
mov v6.16b, v24.16b
mov v7.16b, v24.16b
LoopKh8:
mov x18, x7
mov x25, x7
mov x21, x16
LoopKw8:
mov x22, x21
@ -212,7 +213,7 @@ asm_function ConvDwFp16Center
ld1 {v23.8h}, [x22], x11
fmla v6.8h, v22.8h, v25.8h
fmla v7.8h, v23.8h, v25.8h
subs x18, x18, #1
subs x25, x25, #1
add x21, x21, x13
bne LoopKw8
add x16, x16, x12
@ -260,13 +261,13 @@ asm_function ConvDwFp16Center
mov x20, x6
mov v0.16b, v24.16b
LoopKh:
mov x18, x7
mov x25, x7
mov x22, x16
LoopKw:
ld1 {v16.8h}, [x22], x13
ld1 {v25.8h}, [x17], #16
fmla v0.8h, v16.8h, v25.8h
subs x18, x18, #1
subs x25, x25, #1
bne LoopKw
add x16, x16, x12
subs x20, x20, #1
@ -289,11 +290,12 @@ asm_function ConvDwFp16Center
subs x4, x4, #1
bne LoopH
sub sp, sp, #176
sub sp, sp, #192
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
ldp x19, x20, [sp], #16
ldp x21, x22, [sp], #16
ldp x23, x24, [sp], #16
ldp x25, x26, [sp], #16
ret
#endif

View File

@ -33,12 +33,12 @@ asm_function DeconvDwFp16Center
mov x16, x1
mov x17, x4
LoopW:
mov x18, x15
mov x22, x15
mov x19, x2
mov x20, x5
ld1 {v1.8h}, [x16], x8
LoopKh:
mov x21, x18
mov x21, x22
mov x13, x6
LoopKw:
ld1 {v0.8h}, [x21]
@ -47,7 +47,7 @@ asm_function DeconvDwFp16Center
st1 {v0.8h}, [x21], x12
subs x13, x13, #1
bne LoopKw
add x18, x18, x11
add x22, x22, x11
subs x20, x20, #1
bne LoopKh
add x15, x15, x10

View File

@ -41,11 +41,12 @@ asm_function IndirectGemmFp16_16x8
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
// x19 ~ r29 should be also preserved
// whereas our coding style do not permit such amount of parameters
sub sp, sp, #128
sub sp, sp, #144
// performance between storing 4 registers at the same time and separately storing them on in-order cores
// is not tested yet
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
stp x19, x20, [sp], #16
ldr x8, [sp, #0]
ldr x9, [sp, #8]
@ -548,87 +549,87 @@ IndirectGemmStart:
b WriteEnd
Write7:
add x17, x15, #8
add x18, x15, #10
add x19, x15, #10
add x16, x15, #12
st1 {v16.4h}, [x15], x7
ins v0.s[0], v16.s[2]
st1 {v0.h}[0], [x17], x7
st1 {v0.h}[1], [x18], x7
st1 {v0.h}[1], [x19], x7
st1 {v16.h}[6], [x16], x7
st1 {v17.4h}, [x15], x7
ins v1.s[0], v17.s[2]
st1 {v1.h}[0], [x17], x7
st1 {v1.h}[1], [x18], x7
st1 {v1.h}[1], [x19], x7
st1 {v17.h}[6], [x16], x7
st1 {v18.4h}, [x15], x7
ins v2.s[0], v18.s[2]
st1 {v2.h}[0], [x17], x7
st1 {v2.h}[1], [x18], x7
st1 {v2.h}[1], [x19], x7
st1 {v18.h}[6], [x16], x7
st1 {v19.4h}, [x15], x7
ins v3.s[0], v19.s[2]
st1 {v3.h}[0], [x17], x7
st1 {v3.h}[1], [x18], x7
st1 {v3.h}[1], [x19], x7
st1 {v19.h}[6], [x16], x7
st1 {v20.4h}, [x15], x7
ins v4.s[0], v20.s[2]
st1 {v4.h}[0], [x17], x7
st1 {v4.h}[1], [x18], x7
st1 {v4.h}[1], [x19], x7
st1 {v20.h}[6], [x16], x7
st1 {v21.4h}, [x15], x7
ins v5.s[0], v21.s[2]
st1 {v5.h}[0], [x17], x7
st1 {v5.h}[1], [x18], x7
st1 {v5.h}[1], [x19], x7
st1 {v21.h}[6], [x16], x7
st1 {v22.4h}, [x15], x7
ins v6.s[0], v22.s[2]
st1 {v6.h}[0], [x17], x7
st1 {v6.h}[1], [x18], x7
st1 {v6.h}[1], [x19], x7
st1 {v22.h}[6], [x16], x7
st1 {v23.4h}, [x15], x7
ins v7.s[0], v23.s[2]
st1 {v7.h}[0], [x17], x7
st1 {v7.h}[1], [x18], x7
st1 {v7.h}[1], [x19], x7
st1 {v23.h}[6], [x16], x7
st1 {v24.4h}, [x15], x7
ins v8.s[0], v24.s[2]
st1 {v8.h}[0], [x17], x7
st1 {v8.h}[1], [x18], x7
st1 {v8.h}[1], [x19], x7
st1 {v24.h}[6], [x16], x7
st1 {v25.4h}, [x15], x7
ins v9.s[0], v25.s[2]
st1 {v9.h}[0], [x17], x7
st1 {v9.h}[1], [x18], x7
st1 {v9.h}[1], [x19], x7
st1 {v25.h}[6], [x16], x7
st1 {v26.4h}, [x15], x7
ins v10.s[0], v26.s[2]
st1 {v10.h}[0], [x17], x7
st1 {v10.h}[1], [x18], x7
st1 {v10.h}[1], [x19], x7
st1 {v26.h}[6], [x16], x7
st1 {v27.4h}, [x15], x7
ins v11.s[0], v27.s[2]
st1 {v11.h}[0], [x17], x7
st1 {v11.h}[1], [x18], x7
st1 {v11.h}[1], [x19], x7
st1 {v27.h}[6], [x16], x7
st1 {v28.4h}, [x15], x7
ins v12.s[0], v28.s[2]
st1 {v12.h}[0], [x17], x7
st1 {v12.h}[1], [x18], x7
st1 {v12.h}[1], [x19], x7
st1 {v28.h}[6], [x16], x7
st1 {v29.4h}, [x15], x7
ins v13.s[0], v29.s[2]
st1 {v13.h}[0], [x17], x7
st1 {v13.h}[1], [x18], x7
st1 {v13.h}[1], [x19], x7
st1 {v29.h}[6], [x16], x7
st1 {v30.4h}, [x15], x7
ins v14.s[0], v30.s[2]
st1 {v14.h}[0], [x17], x7
st1 {v14.h}[1], [x18], x7
st1 {v14.h}[1], [x19], x7
st1 {v30.h}[6], [x16], x7
st1 {v31.4h}, [x15]
ins v15.s[0], v31.s[2]
st1 {v15.h}[0], [x17]
st1 {v15.h}[1], [x18]
st1 {v15.h}[1], [x19]
st1 {v31.h}[6], [x16]
add x0, x0, #14
b WriteEnd
@ -661,9 +662,10 @@ IndirectGemmStart:
NoStepForward:
bgt LoopOc
sub sp, sp, #128
sub sp, sp, #144
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
ldp x19, x20, [sp], #16
ret
#endif

View File

@ -21,21 +21,22 @@
// w13: writeC8
asm_function MatmulFp16Neon64
sub sp, sp, #128
sub sp, sp, #144
st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
stp x19, x20, [sp], #16
mov w18, #16 // sizeof(float16) * 8
mul w15, w5, w18 // block stride of lhs/rhs: sizeof(float16) * 8 * depth
mov x11, x3 // bias flag
mov x18, #2
mov x19, #2
ldr x17, [sp]
mul x17, x17, x18
mul x17, x17, x19
L1:
mov w10, w6 // reload lhs row
mov x12, x0 // reload lhs ptr
mov x18, x2 // reload dst ptr
mov x19, x2 // reload dst ptr
L2:
mov x16, x1 // reload rhs ptr
@ -314,490 +315,490 @@ Write:
b Write8
Write1:
st1 {v16.h}[0], [x18], x17
st1 {v16.h}[0], [x19], x17
cmp w10, #1
beq WriteEnd
st1 {v17.h}[0], [x18], x17
st1 {v17.h}[0], [x19], x17
cmp w10, #2
beq WriteEnd
st1 {v18.h}[0], [x18], x17
st1 {v18.h}[0], [x19], x17
cmp w10, #3
beq WriteEnd
st1 {v19.h}[0], [x18], x17
st1 {v19.h}[0], [x19], x17
cmp w10, #4
beq WriteEnd
st1 {v20.h}[0], [x18], x17
st1 {v20.h}[0], [x19], x17
cmp w10, #5
beq WriteEnd
st1 {v21.h}[0], [x18], x17
st1 {v21.h}[0], [x19], x17
cmp w10, #6
beq WriteEnd
st1 {v22.h}[0], [x18], x17
st1 {v22.h}[0], [x19], x17
cmp w10, #7
beq WriteEnd
st1 {v23.h}[0], [x18], x17
st1 {v23.h}[0], [x19], x17
cmp w10, #8
beq WriteEnd
st1 {v24.h}[0], [x18], x17
st1 {v24.h}[0], [x19], x17
cmp w10, #9
beq WriteEnd
st1 {v25.h}[0], [x18], x17
st1 {v25.h}[0], [x19], x17
cmp w10, #10
beq WriteEnd
st1 {v26.h}[0], [x18], x17
st1 {v26.h}[0], [x19], x17
cmp w10, #11
beq WriteEnd
st1 {v27.h}[0], [x18], x17
st1 {v27.h}[0], [x19], x17
cmp w10, #12
beq WriteEnd
st1 {v28.h}[0], [x18], x17
st1 {v28.h}[0], [x19], x17
cmp w10, #13
beq WriteEnd
st1 {v29.h}[0], [x18], x17
st1 {v29.h}[0], [x19], x17
cmp w10, #14
beq WriteEnd
st1 {v30.h}[0], [x18], x17
st1 {v30.h}[0], [x19], x17
cmp w10, #15
beq WriteEnd
st1 {v31.h}[0], [x18], x17
st1 {v31.h}[0], [x19], x17
b WriteEnd
Write2:
add x13, x18, #2
st1 {v16.h}[0], [x18], x17
add x13, x19, #2
st1 {v16.h}[0], [x19], x17
st1 {v16.h}[1], [x13], x17
cmp w10, #1
beq WriteEnd
st1 {v17.h}[0], [x18], x17
st1 {v17.h}[0], [x19], x17
st1 {v17.h}[1], [x13], x17
cmp w10, #2
beq WriteEnd
st1 {v18.h}[0], [x18], x17
st1 {v18.h}[0], [x19], x17
st1 {v18.h}[1], [x13], x17
cmp w10, #3
beq WriteEnd
st1 {v19.h}[0], [x18], x17
st1 {v19.h}[0], [x19], x17
st1 {v19.h}[1], [x13], x17
cmp w10, #4
beq WriteEnd
st1 {v20.h}[0], [x18], x17
st1 {v20.h}[0], [x19], x17
st1 {v20.h}[1], [x13], x17
cmp w10, #5
beq WriteEnd
st1 {v21.h}[0], [x18], x17
st1 {v21.h}[0], [x19], x17
st1 {v21.h}[1], [x13], x17
cmp w10, #6
beq WriteEnd
st1 {v22.h}[0], [x18], x17
st1 {v22.h}[0], [x19], x17
st1 {v22.h}[1], [x13], x17
cmp w10, #7
beq WriteEnd
st1 {v23.h}[0], [x18], x17
st1 {v23.h}[0], [x19], x17
st1 {v23.h}[1], [x13], x17
cmp w10, #8
beq WriteEnd
st1 {v24.h}[0], [x18], x17
st1 {v24.h}[0], [x19], x17
st1 {v24.h}[1], [x13], x17
cmp w10, #9
beq WriteEnd
st1 {v25.h}[0], [x18], x17
st1 {v25.h}[0], [x19], x17
st1 {v25.h}[1], [x13], x17
cmp w10, #10
beq WriteEnd
st1 {v26.h}[0], [x18], x17
st1 {v26.h}[0], [x19], x17
st1 {v26.h}[1], [x13], x17
cmp w10, #11
beq WriteEnd
st1 {v27.h}[0], [x18], x17
st1 {v27.h}[0], [x19], x17
st1 {v27.h}[1], [x13], x17
cmp w10, #12
beq WriteEnd
st1 {v28.h}[0], [x18], x17
st1 {v28.h}[0], [x19], x17
st1 {v28.h}[1], [x13], x17
cmp w10, #13
beq WriteEnd
st1 {v29.h}[0], [x18], x17
st1 {v29.h}[0], [x19], x17
st1 {v29.h}[1], [x13], x17
cmp w10, #14
beq WriteEnd
st1 {v30.h}[0], [x18], x17
st1 {v30.h}[0], [x19], x17
st1 {v30.h}[1], [x13], x17
cmp w10, #15
beq WriteEnd
st1 {v31.h}[0], [x18], x17
st1 {v31.h}[0], [x19], x17
st1 {v31.h}[1], [x13], x17
b WriteEnd
Write3:
add x13, x18, #2
add x14, x18, #4
st1 {v16.h}[0], [x18], x17
add x13, x19, #2
add x14, x19, #4
st1 {v16.h}[0], [x19], x17
st1 {v16.h}[1], [x13], x17
st1 {v16.h}[2], [x14], x17
cmp w10, #1
beq WriteEnd
st1 {v17.h}[0], [x18], x17
st1 {v17.h}[0], [x19], x17
st1 {v17.h}[1], [x13], x17
st1 {v17.h}[2], [x14], x17
cmp w10, #2
beq WriteEnd
st1 {v18.h}[0], [x18], x17
st1 {v18.h}[0], [x19], x17
st1 {v18.h}[1], [x13], x17
st1 {v18.h}[2], [x14], x17
cmp w10, #3
beq WriteEnd
st1 {v19.h}[0], [x18], x17
st1 {v19.h}[0], [x19], x17
st1 {v19.h}[1], [x13], x17
st1 {v19.h}[2], [x14], x17
cmp w10, #4
beq WriteEnd
st1 {v20.h}[0], [x18], x17
st1 {v20.h}[0], [x19], x17
st1 {v20.h}[1], [x13], x17
st1 {v20.h}[2], [x14], x17
cmp w10, #5
beq WriteEnd
st1 {v21.h}[0], [x18], x17
st1 {v21.h}[0], [x19], x17
st1 {v21.h}[1], [x13], x17
st1 {v21.h}[2], [x14], x17
cmp w10, #6
beq WriteEnd
st1 {v22.h}[0], [x18], x17
st1 {v22.h}[0], [x19], x17
st1 {v22.h}[1], [x13], x17
st1 {v22.h}[2], [x14], x17
cmp w10, #7
beq WriteEnd
st1 {v23.h}[0], [x18], x17
st1 {v23.h}[0], [x19], x17
st1 {v23.h}[1], [x13], x17
st1 {v23.h}[2], [x14], x17
cmp w10, #8
beq WriteEnd
st1 {v24.h}[0], [x18], x17
st1 {v24.h}[0], [x19], x17
st1 {v24.h}[1], [x13], x17
st1 {v24.h}[2], [x14], x17
cmp w10, #9
beq WriteEnd
st1 {v25.h}[0], [x18], x17
st1 {v25.h}[0], [x19], x17
st1 {v25.h}[1], [x13], x17
st1 {v25.h}[2], [x14], x17
cmp w10, #10
beq WriteEnd
st1 {v26.h}[0], [x18], x17
st1 {v26.h}[0], [x19], x17
st1 {v26.h}[1], [x13], x17
st1 {v26.h}[2], [x14], x17
cmp w10, #11
beq WriteEnd
st1 {v27.h}[0], [x18], x17
st1 {v27.h}[0], [x19], x17
st1 {v27.h}[1], [x13], x17
st1 {v27.h}[2], [x14], x17
cmp w10, #12
beq WriteEnd
st1 {v28.h}[0], [x18], x17
st1 {v28.h}[0], [x19], x17
st1 {v28.h}[1], [x13], x17
st1 {v28.h}[2], [x14], x17
cmp w10, #13
beq WriteEnd
st1 {v29.h}[0], [x18], x17
st1 {v29.h}[0], [x19], x17
st1 {v29.h}[1], [x13], x17
st1 {v29.h}[2], [x14], x17
cmp w10, #14
beq WriteEnd
st1 {v30.h}[0], [x18], x17
st1 {v30.h}[0], [x19], x17
st1 {v30.h}[1], [x13], x17
st1 {v30.h}[2], [x14], x17
cmp w10, #15
beq WriteEnd
st1 {v31.h}[0], [x18], x17
st1 {v31.h}[0], [x19], x17
st1 {v31.h}[1], [x13], x17
st1 {v31.h}[2], [x14], x17
b WriteEnd
Write4:
st1 {v16.4h}, [x18], x17
st1 {v16.4h}, [x19], x17
cmp w10, #1
beq WriteEnd
st1 {v17.4h}, [x18], x17
st1 {v17.4h}, [x19], x17
cmp w10, #2
beq WriteEnd
st1 {v18.4h}, [x18], x17
st1 {v18.4h}, [x19], x17
cmp w10, #3
beq WriteEnd
st1 {v19.4h}, [x18], x17
st1 {v19.4h}, [x19], x17
cmp w10, #4
beq WriteEnd
st1 {v20.4h}, [x18], x17
st1 {v20.4h}, [x19], x17
cmp w10, #5
beq WriteEnd
st1 {v21.4h}, [x18], x17
st1 {v21.4h}, [x19], x17
cmp w10, #6
beq WriteEnd
st1 {v22.4h}, [x18], x17
st1 {v22.4h}, [x19], x17
cmp w10, #7
beq WriteEnd
st1 {v23.4h}, [x18], x17
st1 {v23.4h}, [x19], x17
cmp w10, #8
beq WriteEnd
st1 {v24.4h}, [x18], x17
st1 {v24.4h}, [x19], x17
cmp w10, #9
beq WriteEnd
st1 {v25.4h}, [x18], x17
st1 {v25.4h}, [x19], x17
cmp w10, #10
beq WriteEnd
st1 {v26.4h}, [x18], x17
st1 {v26.4h}, [x19], x17
cmp w10, #11
beq WriteEnd
st1 {v27.4h}, [x18], x17
st1 {v27.4h}, [x19], x17
cmp w10, #12
beq WriteEnd
st1 {v28.4h}, [x18], x17
st1 {v28.4h}, [x19], x17
cmp w10, #13
beq WriteEnd
st1 {v29.4h}, [x18], x17
st1 {v29.4h}, [x19], x17
cmp w10, #14
beq WriteEnd
st1 {v30.4h}, [x18], x17
st1 {v30.4h}, [x19], x17
cmp w10, #15
beq WriteEnd
st1 {v31.4h}, [x18], x17
st1 {v31.4h}, [x19], x17
b WriteEnd
Write5:
add x13, x18, #8
st1 {v16.4h}, [x18], x17
add x13, x19, #8
st1 {v16.4h}, [x19], x17
st1 {v16.h}[4], [x13], x17
cmp w10, #1
beq WriteEnd
st1 {v17.4h}, [x18], x17
st1 {v17.4h}, [x19], x17
st1 {v17.h}[4], [x13], x17
cmp w10, #2
beq WriteEnd
st1 {v18.4h}, [x18], x17
st1 {v18.4h}, [x19], x17
st1 {v18.h}[4], [x13], x17
cmp w10, #3
beq WriteEnd
st1 {v19.4h}, [x18], x17
st1 {v19.4h}, [x19], x17
st1 {v19.h}[4], [x13], x17
cmp w10, #4
beq WriteEnd
st1 {v20.4h}, [x18], x17
st1 {v20.4h}, [x19], x17
st1 {v20.h}[4], [x13], x17
cmp w10, #5
beq WriteEnd
st1 {v21.4h}, [x18], x17
st1 {v21.4h}, [x19], x17
st1 {v21.h}[4], [x13], x17
cmp w10, #6
beq WriteEnd
st1 {v22.4h}, [x18], x17
st1 {v22.4h}, [x19], x17
st1 {v22.h}[4], [x13], x17
cmp w10, #7
beq WriteEnd
st1 {v23.4h}, [x18], x17
st1 {v23.4h}, [x19], x17
st1 {v23.h}[4], [x13], x17
cmp w10, #8
beq WriteEnd
st1 {v24.4h}, [x18], x17
st1 {v24.4h}, [x19], x17
st1 {v24.h}[4], [x13], x17
cmp w10, #9
beq WriteEnd
st1 {v25.4h}, [x18], x17
st1 {v25.4h}, [x19], x17
st1 {v25.h}[4], [x13], x17
cmp w10, #10
beq WriteEnd
st1 {v26.4h}, [x18], x17
st1 {v26.4h}, [x19], x17
st1 {v26.h}[4], [x13], x17
cmp w10, #11
beq WriteEnd
st1 {v27.4h}, [x18], x17
st1 {v27.4h}, [x19], x17
st1 {v27.h}[4], [x13], x17
cmp w10, #12
beq WriteEnd
st1 {v28.4h}, [x18], x17
st1 {v28.4h}, [x19], x17
st1 {v28.h}[4], [x13], x17
cmp w10, #13
beq WriteEnd
st1 {v29.4h}, [x18], x17
st1 {v29.4h}, [x19], x17
st1 {v29.h}[4], [x13], x17
cmp w10, #14
beq WriteEnd
st1 {v30.4h}, [x18], x17
st1 {v30.4h}, [x19], x17
st1 {v30.h}[4], [x13], x17
cmp w10, #15
beq WriteEnd
st1 {v31.4h}, [x18], x17
st1 {v31.4h}, [x19], x17
st1 {v31.h}[4], [x13], x17
b WriteEnd
Write6:
add x13, x18, #8
add x14, x18, #10
st1 {v16.4h}, [x18], x17
add x13, x19, #8
add x14, x19, #10
st1 {v16.4h}, [x19], x17
st1 {v16.h}[4], [x13], x17
st1 {v16.h}[5], [x14], x17
cmp w10, #1
beq WriteEnd
st1 {v17.4h}, [x18], x17
st1 {v17.4h}, [x19], x17
st1 {v17.h}[4], [x13], x17
st1 {v17.h}[5], [x14], x17
cmp w10, #2
beq WriteEnd
st1 {v18.4h}, [x18], x17
st1 {v18.4h}, [x19], x17
st1 {v18.h}[4], [x13], x17
st1 {v18.h}[5], [x14], x17
cmp w10, #3
beq WriteEnd
st1 {v19.4h}, [x18], x17
st1 {v19.4h}, [x19], x17
st1 {v19.h}[4], [x13], x17
st1 {v19.h}[5], [x14], x17
cmp w10, #4
beq WriteEnd
st1 {v20.4h}, [x18], x17
st1 {v20.4h}, [x19], x17
st1 {v20.h}[4], [x13], x17
st1 {v20.h}[5], [x14], x17
cmp w10, #5
beq WriteEnd
st1 {v21.4h}, [x18], x17
st1 {v21.4h}, [x19], x17
st1 {v21.h}[4], [x13], x17
st1 {v21.h}[5], [x14], x17
cmp w10, #6
beq WriteEnd
st1 {v22.4h}, [x18], x17
st1 {v22.4h}, [x19], x17
st1 {v22.h}[4], [x13], x17
st1 {v22.h}[5], [x14], x17
cmp w10, #7
beq WriteEnd
st1 {v23.4h}, [x18], x17
st1 {v23.4h}, [x19], x17
st1 {v23.h}[4], [x13], x17
st1 {v23.h}[5], [x14], x17
cmp w10, #8
beq WriteEnd
st1 {v24.4h}, [x18], x17
st1 {v24.4h}, [x19], x17
st1 {v24.h}[4], [x13], x17
st1 {v24.h}[5], [x14], x17
cmp w10, #9
beq WriteEnd
st1 {v25.4h}, [x18], x17
st1 {v25.4h}, [x19], x17
st1 {v25.h}[4], [x13], x17
st1 {v25.h}[5], [x14], x17
cmp w10, #10
beq WriteEnd
st1 {v26.4h}, [x18], x17
st1 {v26.4h}, [x19], x17
st1 {v26.h}[4], [x13], x17
st1 {v26.h}[5], [x14], x17
cmp w10, #11
beq WriteEnd
st1 {v27.4h}, [x18], x17
st1 {v27.4h}, [x19], x17
st1 {v27.h}[4], [x13], x17
st1 {v27.h}[5], [x14], x17
cmp w10, #12
beq WriteEnd
st1 {v28.4h}, [x18], x17
st1 {v28.4h}, [x19], x17
st1 {v28.h}[4], [x13], x17
st1 {v28.h}[5], [x14], x17
cmp w10, #13
beq WriteEnd
st1 {v29.4h}, [x18], x17
st1 {v29.4h}, [x19], x17
st1 {v29.h}[4], [x13], x17
st1 {v29.h}[5], [x14], x17
cmp w10, #14
beq WriteEnd
st1 {v30.4h}, [x18], x17
st1 {v30.4h}, [x19], x17
st1 {v30.h}[4], [x13], x17
st1 {v30.h}[5], [x14], x17
cmp w10, #15
beq WriteEnd
st1 {v31.4h}, [x18], x17
st1 {v31.4h}, [x19], x17
st1 {v31.h}[4], [x13], x17
st1 {v31.h}[5], [x14], x17
b WriteEnd
Write7:
add x13, x18, #8
add x14, x18, #10
add x16, x18, #12
st1 {v16.4h}, [x18], x17
add x13, x19, #8
add x14, x19, #10
add x16, x19, #12
st1 {v16.4h}, [x19], x17
st1 {v16.h}[4], [x13], x17
st1 {v16.h}[5], [x14], x17
st1 {v16.h}[6], [x16], x17
cmp w10, #1
beq WriteEnd
st1 {v17.4h}, [x18], x17
st1 {v17.4h}, [x19], x17
st1 {v17.h}[4], [x13], x17
st1 {v17.h}[5], [x14], x17
st1 {v17.h}[6], [x16], x17
cmp w10, #2
beq WriteEnd
st1 {v18.4h}, [x18], x17
st1 {v18.4h}, [x19], x17
st1 {v18.h}[4], [x13], x17
st1 {v18.h}[5], [x14], x17
st1 {v18.h}[6], [x16], x17
cmp w10, #3
beq WriteEnd
st1 {v19.4h}, [x18], x17
st1 {v19.4h}, [x19], x17
st1 {v19.h}[4], [x13], x17
st1 {v19.h}[5], [x14], x17
st1 {v19.h}[6], [x16], x17
cmp w10, #4
beq WriteEnd
st1 {v20.4h}, [x18], x17
st1 {v20.4h}, [x19], x17
st1 {v20.h}[4], [x13], x17
st1 {v20.h}[5], [x14], x17
st1 {v20.h}[6], [x16], x17
cmp w10, #5
beq WriteEnd
st1 {v21.4h}, [x18], x17
st1 {v21.4h}, [x19], x17
st1 {v21.h}[4], [x13], x17
st1 {v21.h}[5], [x14], x17
st1 {v21.h}[6], [x16], x17
cmp w10, #6
beq WriteEnd
st1 {v22.4h}, [x18], x17
st1 {v22.4h}, [x19], x17
st1 {v22.h}[4], [x13], x17
st1 {v22.h}[5], [x14], x17
st1 {v22.h}[6], [x16], x17
cmp w10, #7
beq WriteEnd
st1 {v23.4h}, [x18], x17
st1 {v23.4h}, [x19], x17
st1 {v23.h}[4], [x13], x17
st1 {v23.h}[5], [x14], x17
st1 {v23.h}[6], [x16], x17
cmp w10, #8
beq WriteEnd
st1 {v24.4h}, [x18], x17
st1 {v24.4h}, [x19], x17
st1 {v24.h}[4], [x13], x17
st1 {v24.h}[5], [x14], x17
st1 {v24.h}[6], [x16], x17
cmp w10, #9
beq WriteEnd
st1 {v25.4h}, [x18], x17
st1 {v25.4h}, [x19], x17
st1 {v25.h}[4], [x13], x17
st1 {v25.h}[5], [x14], x17
st1 {v25.h}[6], [x16], x17
cmp w10, #10
beq WriteEnd
st1 {v26.4h}, [x18], x17
st1 {v26.4h}, [x19], x17
st1 {v26.h}[4], [x13], x17
st1 {v26.h}[5], [x14], x17
st1 {v26.h}[6], [x16], x17
cmp w10, #11
beq WriteEnd
st1 {v27.4h}, [x18], x17
st1 {v27.4h}, [x19], x17
st1 {v27.h}[4], [x13], x17
st1 {v27.h}[5], [x14], x17
st1 {v27.h}[6], [x16], x17
cmp w10, #12
beq WriteEnd
st1 {v28.4h}, [x18], x17
st1 {v28.4h}, [x19], x17
st1 {v28.h}[4], [x13], x17
st1 {v28.h}[5], [x14], x17
st1 {v28.h}[6], [x16], x17
cmp w10, #13
beq WriteEnd
st1 {v29.4h}, [x18], x17
st1 {v29.4h}, [x19], x17
st1 {v29.h}[4], [x13], x17
st1 {v29.h}[5], [x14], x17
st1 {v29.h}[6], [x16], x17
cmp w10, #14
beq WriteEnd
st1 {v30.4h}, [x18], x17
st1 {v30.4h}, [x19], x17
st1 {v30.h}[4], [x13], x17
st1 {v30.h}[5], [x14], x17
st1 {v30.h}[6], [x16], x17
cmp w10, #15
beq WriteEnd
st1 {v31.4h}, [x18], x17
st1 {v31.4h}, [x19], x17
st1 {v31.h}[4], [x13], x17
st1 {v31.h}[5], [x14], x17
st1 {v31.h}[6], [x16], x17
@ -809,52 +810,52 @@ WriteC8:
st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x2], #64
b WriteEnd
Write8:
st1 {v16.8h}, [x18], x17
st1 {v16.8h}, [x19], x17
cmp w10, #1
beq WriteEnd
st1 {v17.8h}, [x18], x17
st1 {v17.8h}, [x19], x17
cmp w10, #2
beq WriteEnd
st1 {v18.8h}, [x18], x17
st1 {v18.8h}, [x19], x17
cmp w10, #3
beq WriteEnd
st1 {v19.8h}, [x18], x17
st1 {v19.8h}, [x19], x17
cmp w10, #4
beq WriteEnd
st1 {v20.8h}, [x18], x17
st1 {v20.8h}, [x19], x17
cmp w10, #5
beq WriteEnd
st1 {v21.8h}, [x18], x17
st1 {v21.8h}, [x19], x17
cmp w10, #6
beq WriteEnd
st1 {v22.8h}, [x18], x17
st1 {v22.8h}, [x19], x17
cmp w10, #7
beq WriteEnd
st1 {v23.8h}, [x18], x17
st1 {v23.8h}, [x19], x17
cmp w10, #8
beq WriteEnd
st1 {v24.8h}, [x18], x17
st1 {v24.8h}, [x19], x17
cmp w10, #9
beq WriteEnd
st1 {v25.8h}, [x18], x17
st1 {v25.8h}, [x19], x17
cmp w10, #10
beq WriteEnd
st1 {v26.8h}, [x18], x17
st1 {v26.8h}, [x19], x17
cmp w10, #11
beq WriteEnd
st1 {v27.8h}, [x18], x17
st1 {v27.8h}, [x19], x17
cmp w10, #12
beq WriteEnd
st1 {v28.8h}, [x18], x17
st1 {v28.8h}, [x19], x17
cmp w10, #13
beq WriteEnd
st1 {v29.8h}, [x18], x17
st1 {v29.8h}, [x19], x17
cmp w10, #14
beq WriteEnd
st1 {v30.8h}, [x18], x17
st1 {v30.8h}, [x19], x17
cmp w10, #15
beq WriteEnd
st1 {v31.8h}, [x18], x17
st1 {v31.8h}, [x19], x17
WriteEnd:
subs w10, w10, #16 // lhs row - 8
@ -871,8 +872,9 @@ NoDstStep:
bgt L1
End1:
sub sp, sp, #128
sub sp, sp, #144
ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
ldp x19, x20, [sp], #16
ret
#endif

View File

@ -21,30 +21,31 @@
// x9: writeMode
asm_function MatmulFp16Neon64Opt
sub sp, sp, #80
sub sp, sp, #96
st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
stp x19, x20, [sp], #16
stp x21, x22, [sp], #16
ldr x8, [sp]
ldr x9, [sp, #8]
mov x18, #32 // sizeof(float16_t) * 16
mul x17, x5, x18 // block stride of lhs/rhs: sizeof(float16_t) * 16 * depth
mov x21, #32 // sizeof(float16_t) * 16
mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float16_t) * 16 * depth
cbnz x9, NoC8Steps
mov x11, x2
mov x18, #16
mul x16, x6, x18 // row * 8 * sizeof(float16_t)
mov x21, #16
mul x16, x6, x21 // row * 8 * sizeof(float16_t)
NoC8Steps:
cmp x9, #2
bne NoWinoSteps
mov x18, #2
mov x21, #2
mul x15, x7, x8
mul x15, x15, x18 // kernel_size * col *sizeof(float16_t)
mov x18, #16
mul x16, x8, x18 // kernel_size * 8 * sizeof(float16_t)
mul x15, x15, x21 // kernel_size * col *sizeof(float16_t)
mov x21, #16
mul x16, x8, x21 // kernel_size * 8 * sizeof(float16_t)
NoWinoSteps:
mov x18, #2
mul x8, x8, x18
mov x21, #2
mul x8, x8, x21
LoopRowStart:
cmp x6, #1
@ -1221,9 +1222,9 @@ LoopRow:
LoopColEnd:
add x0, x0, x17
cbz x9, C8DstStep
mov x18, #2
mul x18, x18, x7
sub x11, x11, x18
mov x21, #2
mul x21, x21, x7
sub x11, x11, x21
mov x2, x11
b NoDstStep
C8DstStep:
@ -1233,8 +1234,9 @@ LoopColEnd:
subs x6, x6, #16
bgt LoopRowStart
sub sp, sp, #80
sub sp, sp, #96
ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
ldp x19, x20, [sp], #16
ldp x21, x22, [sp], #16
ret
#endif

View File

@ -31,13 +31,13 @@ asm_function MatrixMultiplyWinogradFp16
mov x14, x1 // mat_b
LoopN:
mov x16, x0 // mat_a_m
sub x18, x5, x15 // ni
sub x22, x5, x15 // ni
sub x19, x17, x3 // mi
mul x18, x18, x17 // ni * m
mul x22, x22, x17 // ni * m
mov x11, x6 // in_channel
add x18, x18, x19 // (ni * m) + mi
mul x18, x18, x13 // x18 * channel_in * 2
add x20, x2, x18 // dst + offset
add x22, x22, x19 // (ni * m) + mi
mul x22, x22, x13 // x22 * channel_in * 2
add x20, x2, x22 // dst + offset
cmp x11, #32
bge LoopC32
cmp x11, #16

View File

@ -9,8 +9,8 @@
asm_function WinogradTransLeftFp16
sub sp, sp, #32
stp x19, x20, [sp], #32
sub sp, sp, #16
stp x19, x20, [sp], #16
mov x8, #8 // 4 * sizeof(float16)
mul x8, x6, x8
@ -46,16 +46,16 @@ LoopH:
ld1 {v0.h}[2], [x17], x10
ld1 {v0.h}[3], [x17], x10
mov x11, x6
mov x18, x17
add x18, x14, x7
add x16, x18, x7
mov x20, x17
add x20, x14, x7
add x16, x20, x7
add x19, x16, x7
LoopLength4:
ld1 {v16.4h}, [x2]
ld1 {v20.4h}, [x14], #8
fmla v16.4h, v20.4h, v0.h[0]
ld1 {v21.4h}, [x18], #8
ld1 {v21.4h}, [x20], #8
fmul v17.4h, v21.4h, v0.h[1]
ld1 {v20.4h}, [x16], #8
fmla v16.4h, v20.4h, v0.h[2]
@ -81,14 +81,14 @@ LoopH:
ld1 {v0.h}[1], [x17], x10
ld1 {v0.h}[2], [x17], x10
mov x11, x6
mov x18, x17
add x18, x14, x7
add x16, x18, x7
mov x20, x17
add x20, x14, x7
add x16, x20, x7
LoopLength3:
ld1 {v16.4h}, [x2]
ld1 {v20.4h}, [x14], #8
fmla v16.4h, v20.4h, v0.h[0]
ld1 {v21.4h}, [x18], #8
ld1 {v21.4h}, [x20], #8
fmul v17.4h, v21.4h, v0.h[1]
ld1 {v20.4h}, [x16], #8
fmla v16.4h, v20.4h, v0.h[2]
@ -132,6 +132,6 @@ LoopH:
subs x4, x4, #1
bne LoopH
sub sp, sp, #32
ldp x19, x20, [sp], #32
sub sp, sp, #16
ldp x19, x20, [sp], #16
ret

View File

@ -9,6 +9,9 @@
asm_function WinogradTransRightFp16
sub sp, sp, #16
stp x19, x20, [sp], #16
mov x8, #8 // 4 * sizeof(float16)
mul x8, x6, x8
mul x9, x5, x8 // step for S
@ -34,7 +37,7 @@ LoopH:
cmp x12, #4
blt LoopKStart3
mov x16, x15
mov x18, x4
mov x19, x4
LoopK4:
ld1 {v0.h}[0], [x13], x10
ld1 {v0.h}[1], [x13], x10
@ -45,7 +48,7 @@ LoopH:
add x14, x17, x8
add x16, x14, x8
add x18, x16, x8
add x19, x16, x8
LoopLength4:
ld1 {v16.4h}, [x2]
@ -55,7 +58,7 @@ LoopH:
fmul v17.4h, v21.4h, v0.h[1]
ld1 {v20.4h}, [x16], #8
fmla v16.4h, v20.4h, v0.h[2]
ld1 {v21.4h}, [x18], #8
ld1 {v21.4h}, [x19], #8
fmla v17.4h, v21.4h, v0.h[3]
fadd v17.4h, v16.4h, v17.4h
@ -64,7 +67,7 @@ LoopH:
bne LoopLength4
sub x2, x2, x8
sub x12, x12, #4
mov x17, x18
mov x17, x19
cmp x12, #4
bge LoopK4
@ -98,7 +101,7 @@ LoopH:
bne LoopLength3
sub x2, x2, x8
sub x12, x12, #3
mov x17, x18
mov x17, x19
cmp x12, #3
bge LoopK3
@ -132,4 +135,7 @@ LoopH:
subs x4, x4, #1
bne LoopH
sub sp, sp, #16
ldp x19, x20, [sp], #16
ret

View File

@ -66,7 +66,7 @@ L2:
cmp w16, #0
beq End2
mov x18, x1 // reload b ptr
mov x28, x1 // reload b ptr
mov x19, x7 // reload bias ptr
mov w20, w5 // reload depth
dup v16.4s, wzr
@ -91,7 +91,7 @@ L3:
LoopD16:
ld1 {v0.16b, v1.16b}, [x17], #32
ld1 {v2.16b, v3.16b}, [x18], #32
ld1 {v2.16b, v3.16b}, [x28], #32
sdot v16.4s, v2.16b, v0.4b[0]
sdot v18.4s, v2.16b, v0.4b[1]
@ -104,7 +104,7 @@ LoopD16:
sdot v28.4s, v2.16b, v1.4b[2]
sdot v30.4s, v2.16b, v1.4b[3]
ld1 {v6.16b, v7.16b}, [x18], #32
ld1 {v6.16b, v7.16b}, [x28], #32
sdot v17.4s, v3.16b, v0.4b[0]
sdot v19.4s, v3.16b, v0.4b[1]
sdot v21.4s, v3.16b, v0.4b[2]
@ -126,7 +126,7 @@ LoopD16:
sdot v28.4s, v6.16b, v5.4b[2]
sdot v30.4s, v6.16b, v5.4b[3]
ld1 {v10.16b, v11.16b}, [x18], #32
ld1 {v10.16b, v11.16b}, [x28], #32
sdot v17.4s, v7.16b, v4.4b[0]
sdot v19.4s, v7.16b, v4.4b[1]
sdot v21.4s, v7.16b, v4.4b[2]
@ -148,7 +148,7 @@ LoopD16:
sdot v28.4s, v10.16b, v9.4b[2]
sdot v30.4s, v10.16b, v9.4b[3]
ld1 {v14.16b, v15.16b}, [x18], #32
ld1 {v14.16b, v15.16b}, [x28], #32
sdot v17.4s, v11.16b, v8.4b[0]
sdot v19.4s, v11.16b, v8.4b[1]
sdot v21.4s, v11.16b, v8.4b[2]
@ -187,7 +187,7 @@ LoopD4:
beq End3
ld1 {v0.16b, v1.16b}, [x17], #32
ld1 {v2.16b, v3.16b}, [x18], #32
ld1 {v2.16b, v3.16b}, [x28], #32
sdot v16.4s, v2.16b, v0.4b[0]
sdot v18.4s, v2.16b, v0.4b[1]

View File

@ -30,7 +30,7 @@
// x28: filter_zp
asm_function MatmulInt8DpOpt
sub sp, sp, #208
sub sp, sp, #224
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
stp x19, x20, [sp], #16
@ -38,6 +38,7 @@ asm_function MatmulInt8DpOpt
stp x23, x24, [sp], #16
stp x25, x26, [sp], #16
stp x27, x28, [sp], #16
stp x29, x30, [sp], #16
ldr w8, [sp]
ldr w9, [sp, #8]
@ -56,7 +57,7 @@ asm_function MatmulInt8DpOpt
LoopRow:
mov x16, x1 // reload rhs ptr
mov x17, x4 // reload rhs col
mov x18, x7 // reload bias ptr
mov x29, x7 // reload bias ptr
mov x25, x6 // reload input_sum ptr
mov x27, x2 // reload dst ptr
ldr x28, [sp, #64] // reload filter_zp
@ -113,7 +114,7 @@ LoopRow:
Bias:
cbz x7, NoReadBias
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x18], #64
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x29], #64
add v16.4s, v16.4s, v0.4s
add v17.4s, v17.4s, v1.4s
add v18.4s, v18.4s, v2.4s
@ -423,8 +424,8 @@ LoopRow:
BiasHalf:
cbz x7, NoReadBiasHalf
ld1 {v0.4s, v1.4s}, [x18]
add x18, x18, #64
ld1 {v0.4s, v1.4s}, [x29]
add x29, x29, #64
add v16.4s, v16.4s, v0.4s
add v17.4s, v17.4s, v1.4s
add v20.4s, v20.4s, v0.4s
@ -612,8 +613,8 @@ LoopRow:
BiasQuarter:
cbz x7, NoReadBiasQuarter
ld1 {v0.4s}, [x18]
add x18, x18, #64
ld1 {v0.4s}, [x29]
add x29, x29, #64
add v16.4s, v16.4s, v0.4s
add v20.4s, v20.4s, v0.4s
add v24.4s, v24.4s, v0.4s
@ -1072,7 +1073,7 @@ LoopColEnd:
b LoopRow
LoopRowEnd:
sub sp, sp, #208
sub sp, sp, #224
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
ldp x19, x20, [sp], #16
@ -1080,5 +1081,6 @@ LoopRowEnd:
ldp x23, x24, [sp], #16
ldp x25, x26, [sp], #16
ldp x27, x28, [sp], #16
ldp x29, x30, [sp], #16
ret
#endif

View File

@ -20,9 +20,10 @@
// x7: bias
asm_function MatMulOptR4Int8Neon64
sub sp, sp, #128
sub sp, sp, #144
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
stp x19, x20, [sp], #16
mov w15, #0 // b col index
mov w16, #0 // a row index
@ -40,7 +41,7 @@ L2:
cmp w16, w3
beq End2
mov x18, x1 // reload b ptr
mov x19, x1 // reload b ptr
mov x10, x7 // reload bias ptr
mov w11, w5 // reload depth
dup v16.4s, wzr
@ -67,10 +68,10 @@ L3:
ld1 {v1.16b}, [x17], #16
ld1 {v2.16b}, [x17], #16
ld1 {v3.16b}, [x17], #16
ld1 {v4.16b}, [x18], #16
ld1 {v5.16b}, [x18], #16
ld1 {v6.16b}, [x18], #16
ld1 {v7.16b}, [x18], #16
ld1 {v4.16b}, [x19], #16
ld1 {v5.16b}, [x19], #16
ld1 {v6.16b}, [x19], #16
ld1 {v7.16b}, [x19], #16
sdot v16.4s, v4.16b, v0.16b
sdot v17.4s, v5.16b, v0.16b
@ -135,8 +136,9 @@ End2:
b L1
End1:
sub sp, sp, #128
sub sp, sp, #144
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
ldp x19, x20, [sp], #16
ret
#endif