forked from OSSInnovation/mindspore
Remove use of x18 on Apple devices (x18 is reserved by the platform ABI there)
This commit is contained in:
parent
31d4d40e20
commit
f1e1d054bf
|
@ -28,11 +28,11 @@ asm_function AdderFloatNeon64
|
|||
|
||||
ldr x8, [sp]
|
||||
|
||||
mov x18, #48 // sizeof(float) * 12
|
||||
mul x17, x5, x18 // block stride of lhs/rhs: sizeof(float) * 12 * depth
|
||||
mov x20, #48 // sizeof(float) * 12
|
||||
mul x17, x5, x20 // block stride of lhs/rhs: sizeof(float) * 12 * depth
|
||||
|
||||
mov x18, #4
|
||||
mul x8, x8, x18
|
||||
mov x20, #4
|
||||
mul x8, x8, x20
|
||||
|
||||
LoopRowStart:
|
||||
cmp x6, #4
|
||||
|
@ -595,9 +595,9 @@ LoopRow4:
|
|||
|
||||
LoopColEnd:
|
||||
add x0, x0, x17
|
||||
mov x18, #4
|
||||
mul x18, x18, x7
|
||||
sub x11, x11, x18
|
||||
mov x20, #4
|
||||
mul x20, x20, x7
|
||||
sub x11, x11, x20
|
||||
mov x2, x11
|
||||
subs x6, x6, #12
|
||||
bgt LoopRowStart
|
||||
|
|
|
@ -33,12 +33,13 @@
|
|||
// w16: per_channel
|
||||
|
||||
asm_function ConvDw3x3Int8Neon64
|
||||
sub sp, sp, #176
|
||||
sub sp, sp, #192
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
|
||||
stp x19, x20, [sp], #16
|
||||
stp x21, x22, [sp], #16
|
||||
stp x23, x24, [sp], #16
|
||||
stp x25, x26, [sp], #16
|
||||
|
||||
ldr x8, [sp]
|
||||
ldr x9, [sp, #8]
|
||||
|
@ -84,16 +85,16 @@ asm_function ConvDw3x3Int8Neon64
|
|||
|
||||
mov x16, x1
|
||||
add x17, x16, x5
|
||||
add x18, x17, x5
|
||||
add x25, x17, x5
|
||||
ld1 {v9.8b}, [x16], x4
|
||||
ld1 {v10.8b}, [x16], x4
|
||||
ld1 {v11.8b}, [x16], x4
|
||||
ld1 {v13.8b}, [x17], x4
|
||||
ld1 {v14.8b}, [x17], x4
|
||||
ld1 {v15.8b}, [x17], x4
|
||||
ld1 {v17.8b}, [x18], x4
|
||||
ld1 {v18.8b}, [x18], x4
|
||||
ld1 {v19.8b}, [x18], x4
|
||||
ld1 {v17.8b}, [x25], x4
|
||||
ld1 {v18.8b}, [x25], x4
|
||||
ld1 {v19.8b}, [x25], x4
|
||||
|
||||
ld1 {v21.4s}, [x3]
|
||||
ld1 {v22.4s}, [x19]
|
||||
|
@ -123,13 +124,13 @@ HEIGHT1_LOOP:
|
|||
ld1 {v16.8b}, [x17]
|
||||
smlal v23.4s, v0.4h, v10.4h
|
||||
smlal2 v24.4s, v0.8h, v10.8h
|
||||
ld1 {v20.8b}, [x18]
|
||||
ld1 {v20.8b}, [x25]
|
||||
add x1, x1, x21
|
||||
ssubl v12.8h, v12.8b, v25.8b
|
||||
smlal v21.4s, v1.4h, v10.4h
|
||||
mov x16, x1
|
||||
add x17, x16, x5
|
||||
add x18, x17, x5
|
||||
add x25, x17, x5
|
||||
smlal2 v22.4s, v1.8h, v10.8h
|
||||
ld1 {v9.8b}, [x16], x4
|
||||
ssubl v16.8h, v16.8b, v25.8b
|
||||
|
@ -159,17 +160,17 @@ HEIGHT1_LOOP:
|
|||
smlal2 v24.4s, v5.8h, v16.8h
|
||||
smlal v21.4s, v6.4h, v17.4h
|
||||
smlal2 v22.4s, v6.8h, v17.8h
|
||||
ld1 {v17.8b}, [x18], x4
|
||||
ld1 {v17.8b}, [x25], x4
|
||||
smlal v23.4s, v6.4h, v18.4h
|
||||
smlal2 v24.4s, v6.8h, v18.8h
|
||||
smlal v21.4s, v7.4h, v18.4h
|
||||
smlal2 v22.4s, v7.8h, v18.8h
|
||||
ld1 {v18.8b}, [x18], x4
|
||||
ld1 {v18.8b}, [x25], x4
|
||||
smlal v23.4s, v7.4h, v19.4h
|
||||
smlal2 v24.4s, v7.8h, v19.8h
|
||||
smlal v21.4s, v8.4h, v19.4h
|
||||
smlal2 v22.4s, v8.8h, v19.8h
|
||||
ld1 {v19.8b}, [x18], x4
|
||||
ld1 {v19.8b}, [x25], x4
|
||||
smlal v23.4s, v8.4h, v20.4h
|
||||
smlal2 v24.4s, v8.8h, v20.8h
|
||||
|
||||
|
@ -278,7 +279,7 @@ WIDTH2_LEFT:
|
|||
smlal2 v24.4s, v1.8h, v11.8h
|
||||
smlal v21.4s, v2.4h, v11.4h
|
||||
smlal2 v22.4s, v2.8h, v11.8h
|
||||
ld1 {v20.8b}, [x18]
|
||||
ld1 {v20.8b}, [x25]
|
||||
smlal v23.4s, v2.4h, v12.4h
|
||||
smlal2 v24.4s, v2.8h, v12.8h
|
||||
smlal v21.4s, v3.4h, v13.4h
|
||||
|
@ -443,12 +444,13 @@ OUTZP3:
|
|||
st1 {v21.8b}, [x0], x6
|
||||
|
||||
End:
|
||||
sub sp, sp, #176
|
||||
sub sp, sp, #192
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
|
||||
ldp x19, x20, [sp], #16
|
||||
ldp x21, x22, [sp], #16
|
||||
ldp x23, x24, [sp], #16
|
||||
ldp x25, x26, [sp], #16
|
||||
ret
|
||||
|
||||
#endif
|
||||
|
|
|
@ -33,12 +33,13 @@
|
|||
// w16: per_channel
|
||||
|
||||
asm_function ConvDw3x3Int8Stride2
|
||||
sub sp, sp, #176
|
||||
sub sp, sp, #192
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
|
||||
stp x19, x20, [sp], #16
|
||||
stp x21, x22, [sp], #16
|
||||
stp x23, x24, [sp], #16
|
||||
stp x25, x26, [sp], #16
|
||||
|
||||
ldr x8, [sp]
|
||||
ldr x9, [sp, #8]
|
||||
|
@ -71,7 +72,7 @@ asm_function ConvDw3x3Int8Stride2
|
|||
|
||||
mov x16, x1
|
||||
add x17, x16, x5
|
||||
add x18, x17, x5
|
||||
add x25, x17, x5
|
||||
ld1 {v9.8b}, [x16], x4
|
||||
ld1 {v10.8b}, [x16], x4
|
||||
ssubl v9.8h, v9.8b, v28.8b
|
||||
|
@ -83,11 +84,11 @@ asm_function ConvDw3x3Int8Stride2
|
|||
ssubl v14.8h, v14.8b, v28.8b
|
||||
ld1 {v16.8b}, [x17], x4
|
||||
ssubl v15.8h, v15.8b, v28.8b
|
||||
ld1 {v19.8b}, [x18], x4
|
||||
ld1 {v19.8b}, [x25], x4
|
||||
ssubl v16.8h, v16.8b, v28.8b
|
||||
ld1 {v20.8b}, [x18], x4
|
||||
ld1 {v20.8b}, [x25], x4
|
||||
ssubl v19.8h, v19.8b, v28.8b
|
||||
ld1 {v21.8b}, [x18], x4
|
||||
ld1 {v21.8b}, [x25], x4
|
||||
ssubl v20.8h, v20.8b, v28.8b
|
||||
ssubl v21.8h, v21.8b, v28.8b
|
||||
|
||||
|
@ -108,7 +109,7 @@ HEIGHT1_LOOP:
|
|||
ld1 {v17.8b}, [x17], x4
|
||||
ssubl v12.8h, v12.8b, v28.8b
|
||||
smlal v26.4s, v0.4h, v11.4h
|
||||
ld1 {v22.8b}, [x18], x4
|
||||
ld1 {v22.8b}, [x25], x4
|
||||
ssubl v17.8h, v17.8b, v28.8b
|
||||
smlal2 v27.4s, v0.8h, v11.8h
|
||||
ld1 {v13.8b}, [x16], x4
|
||||
|
@ -117,7 +118,7 @@ HEIGHT1_LOOP:
|
|||
ld1 {v18.8b}, [x17], x4
|
||||
ssubl v13.8h, v13.8b, v28.8b
|
||||
smlal2 v25.4s, v1.8h, v10.8h
|
||||
ld1 {v23.8b}, [x18], x4
|
||||
ld1 {v23.8b}, [x25], x4
|
||||
ssubl v18.8h, v18.8b, v28.8b
|
||||
smlal v26.4s, v1.4h, v12.4h
|
||||
mov v9.16b, v13.16b
|
||||
|
@ -157,12 +158,12 @@ HEIGHT1_LOOP:
|
|||
smlal2 v27.4s, v6.8h, v21.8h
|
||||
smlal v24.4s, v7.4h, v20.4h
|
||||
smlal2 v25.4s, v7.8h, v20.8h
|
||||
ld1 {v20.8b}, [x18], x4
|
||||
ld1 {v20.8b}, [x25], x4
|
||||
smlal v26.4s, v7.4h, v22.4h
|
||||
smlal2 v27.4s, v7.8h, v22.8h
|
||||
smlal v24.4s, v8.4h, v21.4h
|
||||
smlal2 v25.4s, v8.8h, v21.8h
|
||||
ld1 {v21.8b}, [x18], x4
|
||||
ld1 {v21.8b}, [x25], x4
|
||||
ssubl v20.8h, v20.8b, v28.8b
|
||||
smlal v26.4s, v8.4h, v23.4h
|
||||
ssubl v21.8h, v21.8b, v28.8b
|
||||
|
@ -260,7 +261,7 @@ WIDTH2_LEFT:
|
|||
ld1 {v17.8b}, [x17], x4
|
||||
ssubl v12.8h, v12.8b, v28.8b
|
||||
smlal v26.4s, v0.4h, v11.4h
|
||||
ld1 {v22.8b}, [x18], x4
|
||||
ld1 {v22.8b}, [x25], x4
|
||||
ssubl v17.8h, v17.8b, v28.8b
|
||||
smlal2 v27.4s, v0.8h, v11.8h
|
||||
ld1 {v13.8b}, [x16], x4
|
||||
|
@ -269,7 +270,7 @@ WIDTH2_LEFT:
|
|||
ld1 {v18.8b}, [x17], x4
|
||||
ssubl v13.8h, v13.8b, v28.8b
|
||||
smlal2 v25.4s, v1.8h, v10.8h
|
||||
ld1 {v23.8b}, [x18], x4
|
||||
ld1 {v23.8b}, [x25], x4
|
||||
ssubl v18.8h, v18.8b, v28.8b
|
||||
smlal v26.4s, v1.4h, v12.4h
|
||||
ssubl v23.8h, v23.8b, v28.8b
|
||||
|
@ -452,11 +453,12 @@ OUTZP3:
|
|||
st1 {v24.8b}, [x0], x6
|
||||
|
||||
End:
|
||||
sub sp, sp, #176
|
||||
sub sp, sp, #192
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
|
||||
ldp x19, x20, [sp], #16
|
||||
ldp x21, x22, [sp], #16
|
||||
ldp x23, x24, [sp], #16
|
||||
ldp x25, x26, [sp], #16
|
||||
ret
|
||||
#endif
|
||||
|
|
|
@ -19,12 +19,13 @@ asm_function ConvDwFp32Center
|
|||
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
|
||||
// x19 ~ x29 should also be preserved
|
||||
// whereas our coding style does not permit such a number of parameters
|
||||
sub sp, sp, #176
|
||||
sub sp, sp, #192
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
|
||||
stp x19, x20, [sp], #16
|
||||
stp x21, x22, [sp], #16
|
||||
stp x23, x24, [sp], #16
|
||||
stp x25, x26, [sp], #16
|
||||
|
||||
ldr x8, [sp]
|
||||
ldr x9, [sp, #8]
|
||||
|
@ -72,7 +73,7 @@ asm_function ConvDwFp32Center
|
|||
mov v14.16b, v24.16b
|
||||
mov v15.16b, v24.16b
|
||||
LoopKh16:
|
||||
mov x18, x7
|
||||
mov x25, x7
|
||||
mov x21, x16
|
||||
LoopKw16:
|
||||
mov x22, x21
|
||||
|
@ -109,7 +110,7 @@ asm_function ConvDwFp32Center
|
|||
ld1 {v23.4s}, [x22], x11
|
||||
fmla v14.4s, v22.4s, v25.4s
|
||||
fmla v15.4s, v23.4s, v25.4s
|
||||
subs x18, x18, #1
|
||||
subs x25, x25, #1
|
||||
add x21, x21, x13
|
||||
bne LoopKw16
|
||||
add x16, x16, x12
|
||||
|
@ -192,7 +193,7 @@ asm_function ConvDwFp32Center
|
|||
mov v6.16b, v24.16b
|
||||
mov v7.16b, v24.16b
|
||||
LoopKh8:
|
||||
mov x18, x7
|
||||
mov x25, x7
|
||||
mov x21, x16
|
||||
LoopKw8:
|
||||
mov x22, x21
|
||||
|
@ -213,7 +214,7 @@ asm_function ConvDwFp32Center
|
|||
ld1 {v23.4s}, [x22], x11
|
||||
fmla v6.4s, v22.4s, v25.4s
|
||||
fmla v7.4s, v23.4s, v25.4s
|
||||
subs x18, x18, #1
|
||||
subs x25, x25, #1
|
||||
add x21, x21, x13
|
||||
bne LoopKw8
|
||||
add x16, x16, x12
|
||||
|
@ -261,13 +262,13 @@ asm_function ConvDwFp32Center
|
|||
mov x20, x6
|
||||
mov v0.16b, v24.16b
|
||||
LoopKh:
|
||||
mov x18, x7
|
||||
mov x25, x7
|
||||
mov x22, x16
|
||||
LoopKw:
|
||||
ld1 {v16.4s}, [x22], x13
|
||||
ld1 {v25.4s}, [x17], #16
|
||||
fmla v0.4s, v16.4s, v25.4s
|
||||
subs x18, x18, #1
|
||||
subs x25, x25, #1
|
||||
bne LoopKw
|
||||
add x16, x16, x12
|
||||
subs x20, x20, #1
|
||||
|
@ -290,11 +291,12 @@ asm_function ConvDwFp32Center
|
|||
subs x4, x4, #1
|
||||
bne LoopH
|
||||
|
||||
sub sp, sp, #176
|
||||
sub sp, sp, #192
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
|
||||
ldp x19, x20, [sp], #16
|
||||
ldp x21, x22, [sp], #16
|
||||
ldp x23, x24, [sp], #16
|
||||
ldp x25, x26, [sp], #16
|
||||
ret
|
||||
#endif
|
||||
|
|
|
@ -13,8 +13,9 @@
|
|||
// x0: output, x1: input, x2: weights, x3: bias, x4: channels, x5: output_width, x6: input_stride, x7: relu, x8: relu6
|
||||
|
||||
asm_function ConvDwFp32Indirect3x3
|
||||
sub sp, sp, #16
|
||||
sub sp, sp, #32
|
||||
stp x19, x20, [sp], #16
|
||||
stp x21, x22, [sp], #16
|
||||
|
||||
movi v31.4s, #6
|
||||
scvtf v31.4s, v31.4s
|
||||
|
@ -28,7 +29,7 @@ asm_function ConvDwFp32Indirect3x3
|
|||
ldp x12, x13, [x1]
|
||||
ldp x14, x15, [x1, #16]
|
||||
ldp x16, x17, [x1, #32]
|
||||
ldp x18, x19, [x1, #48]
|
||||
ldp x21, x19, [x1, #48]
|
||||
ldr x20, [x1, #64]
|
||||
mov x9, x2
|
||||
mov x10, x3
|
||||
|
@ -56,7 +57,7 @@ asm_function ConvDwFp32Indirect3x3
|
|||
ld1 {v5.4s}, [x17], #16
|
||||
ld1 {v22.4s}, [x9], #16
|
||||
fmla v29.4s, v3.4s, v20.4s
|
||||
ld1 {v6.4s}, [x18], #16
|
||||
ld1 {v6.4s}, [x21], #16
|
||||
ld1 {v23.4s}, [x9], #16
|
||||
fmla v29.4s, v4.4s, v21.4s
|
||||
ld1 {v7.4s}, [x19], #16
|
||||
|
@ -100,7 +101,7 @@ asm_function ConvDwFp32Indirect3x3
|
|||
ld1 {v5.4s}, [x17], #16
|
||||
ld1 {v22.4s}, [x9], #16
|
||||
fmla v29.4s, v3.4s, v20.4s
|
||||
ld1 {v6.4s}, [x18], #16
|
||||
ld1 {v6.4s}, [x21], #16
|
||||
ld1 {v23.4s}, [x9], #16
|
||||
fmla v29.4s, v4.4s, v21.4s
|
||||
ld1 {v7.4s}, [x19], #16
|
||||
|
@ -141,7 +142,8 @@ asm_function ConvDwFp32Indirect3x3
|
|||
cmp x5, #0
|
||||
bgt LoopPixel
|
||||
End:
|
||||
sub sp, sp, #16
|
||||
sub sp, sp, #32
|
||||
ldp x19, x20, [sp], #16
|
||||
ldp x21, x22, [sp], #16
|
||||
ret
|
||||
#endif
|
||||
|
|
|
@ -13,17 +13,18 @@
|
|||
// x0: output, x1: input, x2: weights, x3: bias, x4: channels, x5: output_width, x6: input_stride, x7: relu, x8: relu6
|
||||
|
||||
asm_function ConvDwFp32Indirect5x5
|
||||
sub sp, sp, #160
|
||||
sub sp, sp, #176
|
||||
stp x19, x20, [sp, #64]
|
||||
stp x21, x22, [sp, #80]
|
||||
stp x23, x24, [sp, #96]
|
||||
stp x25, x26, [sp, #112]
|
||||
stp x27, x28, [sp, #128]
|
||||
stp x29, x30, [sp, #144]
|
||||
ldrb w8, [sp, #160]
|
||||
ldrb w8, [sp, #176]
|
||||
stp x2, x3, [sp]
|
||||
stp x4, x6, [sp, #16]
|
||||
stp x7, x8, [sp, #32]
|
||||
stp x0, x1, [sp, #160]
|
||||
|
||||
movi v31.4s, #6
|
||||
scvtf v31.4s, v31.4s
|
||||
|
@ -44,7 +45,7 @@ asm_function ConvDwFp32Indirect5x5
|
|||
ldp x12, x13, [x1, #48]
|
||||
ldp x14, x15, [x1, #64]
|
||||
ldp x16, x17, [x1, #80]
|
||||
ldp x18, x19, [x1, #96]
|
||||
ldp x0, x19, [x1, #96]
|
||||
ldp x20, x21, [x1, #112]
|
||||
ldp x22, x23, [x1, #128]
|
||||
ldp x24, x25, [x1, #144]
|
||||
|
@ -93,7 +94,7 @@ asm_function ConvDwFp32Indirect5x5
|
|||
ld1 {v1.4s}, [x17], #16
|
||||
ld1 {v19.4s}, [x5], #16
|
||||
fmla v29.4s, v7.4s, v25.4s
|
||||
ld1 {v2.4s}, [x18], #16
|
||||
ld1 {v2.4s}, [x0], #16
|
||||
ld1 {v20.4s}, [x5], #16
|
||||
fmla v29.4s, v16.4s, v26.4s
|
||||
ld1 {v3.4s}, [x19], #16
|
||||
|
@ -160,7 +161,9 @@ asm_function ConvDwFp32Indirect5x5
|
|||
RELU:
|
||||
fmax v29.4s, v29.4s, v30.4s
|
||||
WRITE:
|
||||
st1 {v29.4s}, [x0], #16
|
||||
ldr x4, [sp, #160]
|
||||
st1 {v29.4s}, [x4], #16
|
||||
str x4, [sp, #160]
|
||||
|
||||
ldr x4, [sp, #56]
|
||||
ld1 {v29.4s}, [x4], #16
|
||||
|
@ -195,7 +198,7 @@ asm_function ConvDwFp32Indirect5x5
|
|||
ld1 {v1.4s}, [x17], #16
|
||||
ld1 {v19.4s}, [x5], #16
|
||||
fmla v29.4s, v7.4s, v25.4s
|
||||
ld1 {v2.4s}, [x18], #16
|
||||
ld1 {v2.4s}, [x0], #16
|
||||
ld1 {v20.4s}, [x5], #16
|
||||
fmla v29.4s, v16.4s, v26.4s
|
||||
ld1 {v3.4s}, [x19], #16
|
||||
|
@ -253,18 +256,24 @@ asm_function ConvDwFp32Indirect5x5
|
|||
LeftWrite:
|
||||
cmp x2, #4
|
||||
bne Write3
|
||||
st1 {v29.4s}, [x0], #16
|
||||
ldr x4, [sp, #160]
|
||||
st1 {v29.4s}, [x4], #16
|
||||
str x4, [sp, #160]
|
||||
b NextPixel
|
||||
Write3:
|
||||
sxtw x2, w2
|
||||
tbnz w2, #1, Write2
|
||||
tbnz w2, #0, Write1
|
||||
Write2:
|
||||
st1 {v29.2s}, [x0], #8
|
||||
ldr x4, [sp, #160]
|
||||
st1 {v29.2s}, [x4], #8
|
||||
str x4, [sp, #160]
|
||||
ext v29.16b, v29.16b, v29.16b, #8
|
||||
tbz w2, #0, NextPixel
|
||||
Write1:
|
||||
str s29, [x0], #4
|
||||
ldr x4, [sp, #160]
|
||||
str s29, [x4], #4
|
||||
str x4, [sp, #160]
|
||||
|
||||
NextPixel:
|
||||
ldr x2, [sp, #24]
|
||||
|
@ -279,6 +288,6 @@ End:
|
|||
ldp x25, x26, [sp, #112]
|
||||
ldp x27, x28, [sp, #128]
|
||||
ldp x29, x30, [sp, #144]
|
||||
add sp, sp, #160
|
||||
add sp, sp, #176
|
||||
ret
|
||||
#endif
|
||||
|
|
|
@ -22,12 +22,13 @@ asm_function ConvDwInt8Center
|
|||
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
|
||||
// x19 ~ x29 should also be preserved
|
||||
// whereas our coding style does not permit such a number of parameters
|
||||
sub sp, sp, #176
|
||||
sub sp, sp, #192
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
|
||||
stp x19, x20, [sp], #16
|
||||
stp x21, x22, [sp], #16
|
||||
stp x23, x24, [sp], #16
|
||||
stp x25, x26, [sp], #16
|
||||
|
||||
ldr x8, [sp]
|
||||
ldr x9, [sp, #8]
|
||||
|
@ -51,9 +52,9 @@ asm_function ConvDwInt8Center
|
|||
ld1 {v24.4s}, [x17], #16
|
||||
ld1 {v25.4s}, [x17], #16
|
||||
|
||||
ldr x18, [sp, #80] // right shift
|
||||
ld1 {v26.4s}, [x18], #16
|
||||
ld1 {v27.4s}, [x18], #16
|
||||
ldr x25, [sp, #80] // right shift
|
||||
ld1 {v26.4s}, [x25], #16
|
||||
ld1 {v27.4s}, [x25], #16
|
||||
|
||||
ldr x19, [sp, #88] // acc_min
|
||||
ld1 {v28.4s}, [x19], #16
|
||||
|
@ -90,7 +91,7 @@ asm_function ConvDwInt8Center
|
|||
mov v6.16b, v17.16b
|
||||
mov v7.16b, v18.16b
|
||||
LoopKh4:
|
||||
mov x18, x7
|
||||
mov x25, x7
|
||||
mov x21, x16
|
||||
LoopKw4:
|
||||
mov x22, x21
|
||||
|
@ -116,7 +117,7 @@ asm_function ConvDwInt8Center
|
|||
smlal v6.4s, v8.4h, v16.4h
|
||||
smlal2 v7.4s, v8.8h, v16.8h
|
||||
|
||||
subs x18, x18, #1
|
||||
subs x25, x25, #1
|
||||
add x21, x21, x13
|
||||
bne LoopKw4
|
||||
add x16, x16, x12
|
||||
|
@ -194,15 +195,15 @@ asm_function ConvDwInt8Center
|
|||
|
||||
mov x16, x3
|
||||
add x17, x16, x9
|
||||
add x18, x17, x9
|
||||
add x21, x18, x9
|
||||
add x25, x17, x9
|
||||
add x21, x25, x9
|
||||
|
||||
st1 {v0.s}[0], [x16], #4
|
||||
st1 {v1.s}[0], [x16], #4
|
||||
st1 {v2.s}[0], [x17], #4
|
||||
st1 {v3.s}[0], [x17], #4
|
||||
st1 {v4.s}[0], [x18], #4
|
||||
st1 {v5.s}[0], [x18], #4
|
||||
st1 {v4.s}[0], [x25], #4
|
||||
st1 {v5.s}[0], [x25], #4
|
||||
st1 {v6.s}[0], [x21], #4
|
||||
st1 {v7.s}[0], [x21], #4
|
||||
|
||||
|
@ -221,7 +222,7 @@ asm_function ConvDwInt8Center
|
|||
mov v0.16b, v17.16b
|
||||
mov v1.16b, v18.16b
|
||||
LoopKh:
|
||||
mov x18, x7
|
||||
mov x25, x7
|
||||
mov x22, x16
|
||||
LoopKw:
|
||||
ld1 {v15.8b}, [x22], x13
|
||||
|
@ -229,7 +230,7 @@ asm_function ConvDwInt8Center
|
|||
ld1 {v16.8h}, [x17], #16
|
||||
smlal v0.4s, v14.4h, v16.4h
|
||||
smlal2 v1.4s, v14.8h, v16.8h
|
||||
subs x18, x18, #1
|
||||
subs x25, x25, #1
|
||||
bne LoopKw
|
||||
add x16, x16, x12
|
||||
subs x20, x20, #1
|
||||
|
@ -271,11 +272,12 @@ asm_function ConvDwInt8Center
|
|||
subs x4, x4, #1
|
||||
bne LoopH
|
||||
|
||||
sub sp, sp, #176
|
||||
sub sp, sp, #192
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
|
||||
ldp x19, x20, [sp], #16
|
||||
ldp x21, x22, [sp], #16
|
||||
ldp x23, x24, [sp], #16
|
||||
ldp x25, x26, [sp], #16
|
||||
ret
|
||||
#endif
|
||||
|
|
|
@ -47,11 +47,11 @@ asm_function ConvSwFp32Center
|
|||
|
||||
LoopH:
|
||||
mov x17, x1
|
||||
mov x18, x5
|
||||
mov x28, x5
|
||||
mov x3, x0
|
||||
cmp x18, #8
|
||||
cmp x28, #8
|
||||
blt LoopW
|
||||
cmp x18, #16
|
||||
cmp x28, #16
|
||||
blt LoopW8
|
||||
|
||||
LoopW16:
|
||||
|
@ -244,12 +244,12 @@ asm_function ConvSwFp32Center
|
|||
st1 {v14.4s}, [x3], x9
|
||||
st1 {v15.4s}, [x3], x9
|
||||
add x17, x17, x19
|
||||
sub x18, x18, #16
|
||||
cmp x18, #0
|
||||
sub x28, x28, #16
|
||||
cmp x28, #0
|
||||
ble LoopWEnd
|
||||
cmp x18, #8
|
||||
cmp x28, #8
|
||||
blt LoopW
|
||||
cmp x18, #16
|
||||
cmp x28, #16
|
||||
bge LoopW16
|
||||
LoopW8:
|
||||
mov x19, #8
|
||||
|
@ -369,10 +369,10 @@ asm_function ConvSwFp32Center
|
|||
st1 {v6.4s}, [x3], x9
|
||||
st1 {v7.4s}, [x3], x9
|
||||
add x17, x17, x19
|
||||
sub x18, x18, #8
|
||||
cmp x18, #0
|
||||
sub x28, x28, #8
|
||||
cmp x28, #0
|
||||
ble LoopWEnd
|
||||
cmp x18, #8
|
||||
cmp x28, #8
|
||||
bge LoopW8
|
||||
LoopW:
|
||||
mov x20, x17
|
||||
|
@ -427,7 +427,7 @@ asm_function ConvSwFp32Center
|
|||
Write:
|
||||
st1 {v0.4s}, [x3], x9
|
||||
add x17, x17, x12
|
||||
subs x18, x18, #1
|
||||
subs x28, x28, #1
|
||||
bne LoopW
|
||||
LoopWEnd:
|
||||
add x0, x0, x8
|
||||
|
|
|
@ -33,12 +33,12 @@ asm_function DeconvDwFp32Center
|
|||
mov x16, x1
|
||||
mov x17, x4
|
||||
LoopW:
|
||||
mov x18, x15
|
||||
mov x22, x15
|
||||
mov x19, x2
|
||||
mov x20, x5
|
||||
ld1 {v1.4s}, [x16], x8
|
||||
LoopKh:
|
||||
mov x21, x18
|
||||
mov x21, x22
|
||||
mov x13, x6
|
||||
LoopKw:
|
||||
ld1 {v0.4s}, [x21]
|
||||
|
@ -47,7 +47,7 @@ asm_function DeconvDwFp32Center
|
|||
st1 {v0.4s}, [x21], x12
|
||||
subs x13, x13, #1
|
||||
bne LoopKw
|
||||
add x18, x18, x11
|
||||
add x22, x22, x11
|
||||
subs x20, x20, #1
|
||||
bne LoopKh
|
||||
add x15, x15, x10
|
||||
|
|
|
@ -21,30 +21,31 @@
|
|||
// w13: c8_nhwc_c4
|
||||
|
||||
asm_function MatmulFloatNeon64
|
||||
sub sp, sp, #128
|
||||
sub sp, sp, #144
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
|
||||
stp x19, x20, [sp], #16
|
||||
|
||||
ldr x9, [sp, #8]
|
||||
ldr x14, [sp, #16]
|
||||
|
||||
mov w18, #32 // sizeof(float) * 8
|
||||
mul w15, w5, w18 // block stride of lhs/rhs: sizeof(float) * 8 * depth
|
||||
mov x18, #4
|
||||
mov w19, #32 // sizeof(float) * 8
|
||||
mul w15, w5, w19 // block stride of lhs/rhs: sizeof(float) * 8 * depth
|
||||
mov x19, #4
|
||||
ldr x17, [sp]
|
||||
cbz x14, NoWinoSteps
|
||||
mul x8, x7, x17
|
||||
mov x11, #8
|
||||
mul x11, x11, x17
|
||||
mul x8, x8, x18
|
||||
mul x11, x11, x18
|
||||
mul x8, x8, x19
|
||||
mul x11, x11, x19
|
||||
NoWinoSteps:
|
||||
mul x17, x17, x18
|
||||
mul x17, x17, x19
|
||||
|
||||
L1:
|
||||
mov w10, w6 // reload lhs row
|
||||
mov x12, x0 // reload lhs ptr
|
||||
mov x18, x2 // reload dst ptr
|
||||
mov x19, x2 // reload dst ptr
|
||||
|
||||
L2:
|
||||
mov x16, x1 // reload rhs ptr
|
||||
|
@ -254,435 +255,435 @@ Write:
|
|||
b Write8
|
||||
|
||||
Write1:
|
||||
str s8, [x18]
|
||||
str s8, [x19]
|
||||
cmp w10, #1
|
||||
beq WriteEnd
|
||||
add x18, x18, x17
|
||||
str s10, [x18]
|
||||
add x19, x19, x17
|
||||
str s10, [x19]
|
||||
cmp w10, #2
|
||||
beq WriteEnd
|
||||
add x18, x18, x17
|
||||
str s12, [x18]
|
||||
add x19, x19, x17
|
||||
str s12, [x19]
|
||||
cmp w10, #3
|
||||
beq WriteEnd
|
||||
add x18, x18, x17
|
||||
str s14, [x18]
|
||||
add x19, x19, x17
|
||||
str s14, [x19]
|
||||
cmp w10, #4
|
||||
beq WriteEnd
|
||||
add x18, x18, x17
|
||||
str s16, [x18]
|
||||
add x19, x19, x17
|
||||
str s16, [x19]
|
||||
cmp w10, #5
|
||||
beq WriteEnd
|
||||
add x18, x18, x17
|
||||
str s18, [x18]
|
||||
add x19, x19, x17
|
||||
str s18, [x19]
|
||||
cmp w10, #6
|
||||
beq WriteEnd
|
||||
add x18, x18, x17
|
||||
str s20, [x18]
|
||||
add x19, x19, x17
|
||||
str s20, [x19]
|
||||
cmp w10, #7
|
||||
beq WriteEnd
|
||||
add x18, x18, x17
|
||||
str s22, [x18]
|
||||
add x19, x19, x17
|
||||
str s22, [x19]
|
||||
cmp w10, #8
|
||||
beq WriteEnd
|
||||
add x18, x18, x17
|
||||
str s24, [x18]
|
||||
add x19, x19, x17
|
||||
str s24, [x19]
|
||||
cmp w10, #9
|
||||
beq WriteEnd
|
||||
add x18, x18, x17
|
||||
str s26, [x18]
|
||||
add x19, x19, x17
|
||||
str s26, [x19]
|
||||
cmp w10, #10
|
||||
beq WriteEnd
|
||||
add x18, x18, x17
|
||||
str s28, [x18]
|
||||
add x19, x19, x17
|
||||
str s28, [x19]
|
||||
cmp w10, #11
|
||||
beq WriteEnd
|
||||
add x18, x18, x17
|
||||
str s30, [x18]
|
||||
add x18, x18, x17
|
||||
add x19, x19, x17
|
||||
str s30, [x19]
|
||||
add x19, x19, x17
|
||||
b WriteEnd
|
||||
Write2:
|
||||
dup s9, v8.s[1]
|
||||
stp s8, s9, [x18]
|
||||
stp s8, s9, [x19]
|
||||
cmp w10, #1
|
||||
beq WriteEnd
|
||||
add x18, x18, x17
|
||||
add x19, x19, x17
|
||||
dup s11, v10.s[1]
|
||||
stp s10, s11, [x18]
|
||||
stp s10, s11, [x19]
|
||||
cmp w10, #2
|
||||
beq WriteEnd
|
||||
add x18, x18, x17
|
||||
add x19, x19, x17
|
||||
dup s13, v12.s[1]
|
||||
stp s12, s13, [x18]
|
||||
stp s12, s13, [x19]
|
||||
cmp w10, #3
|
||||
beq WriteEnd
|
||||
add x18, x18, x17
|
||||
add x19, x19, x17
|
||||
dup s15, v14.s[1]
|
||||
stp s14, s15, [x18]
|
||||
stp s14, s15, [x19]
|
||||
cmp w10, #4
|
||||
beq WriteEnd
|
||||
add x18, x18, x17
|
||||
add x19, x19, x17
|
||||
dup s17, v16.s[1]
|
||||
stp s16, s17, [x18]
|
||||
stp s16, s17, [x19]
|
||||
cmp w10, #5
|
||||
beq WriteEnd
|
||||
add x18, x18, x17
|
||||
add x19, x19, x17
|
||||
dup s19, v18.s[1]
|
||||
stp s18, s19, [x18]
|
||||
stp s18, s19, [x19]
|
||||
cmp w10, #6
|
||||
beq WriteEnd
|
||||
add x18, x18, x17
|
||||
add x19, x19, x17
|
||||
dup s21, v20.s[1]
|
||||
stp s20, s21, [x18]
|
||||
stp s20, s21, [x19]
|
||||
cmp w10, #7
|
||||
beq WriteEnd
|
||||
add x18, x18, x17
|
||||
add x19, x19, x17
|
||||
dup s23, v22.s[1]
|
||||
stp s22, s23, [x18]
|
||||
stp s22, s23, [x19]
|
||||
cmp w10, #8
|
||||
beq WriteEnd
|
||||
add x18, x18, x17
|
||||
add x19, x19, x17
|
||||
dup s25, v24.s[1]
|
||||
stp s24, s25, [x18]
|
||||
stp s24, s25, [x19]
|
||||
cmp w10, #9
|
||||
beq WriteEnd
|
||||
add x18, x18, x17
|
||||
add x19, x19, x17
|
||||
dup s27, v26.s[1]
|
||||
stp s26, s27, [x18]
|
||||
stp s26, s27, [x19]
|
||||
cmp w10, #10
|
||||
beq WriteEnd
|
||||
add x18, x18, x17
|
||||
add x19, x19, x17
|
||||
dup s29, v28.s[1]
|
||||
stp s28, s29, [x18]
|
||||
stp s28, s29, [x19]
|
||||
cmp w10, #11
|
||||
beq WriteEnd
|
||||
add x18, x18, x17
|
||||
add x19, x19, x17
|
||||
dup s31, v30.s[1]
|
||||
stp s30, s31, [x18]
|
||||
add x18, x18, x17
|
||||
stp s30, s31, [x19]
|
||||
add x19, x19, x17
|
||||
b WriteEnd
|
||||
Write3:
|
||||
add x13, x18, #8
|
||||
add x13, x19, #8
|
||||
dup s9, v8.s[1]
|
||||
stp s8, s9, [x18]
|
||||
add x18, x18, x17
|
||||
stp s8, s9, [x19]
|
||||
add x19, x19, x17
|
||||
st1 {v8.s}[2], [x13], x17
|
||||
cmp w10, #1
|
||||
beq WriteEnd
|
||||
dup s11, v10.s[1]
|
||||
stp s10, s11, [x18]
|
||||
add x18, x18, x17
|
||||
stp s10, s11, [x19]
|
||||
add x19, x19, x17
|
||||
st1 {v10.s}[2], [x13], x17
|
||||
cmp w10, #2
|
||||
beq WriteEnd
|
||||
dup s13, v12.s[1]
|
||||
stp s12, s13, [x18]
|
||||
add x18, x18, x17
|
||||
stp s12, s13, [x19]
|
||||
add x19, x19, x17
|
||||
st1 {v12.s}[2], [x13], x17
|
||||
cmp w10, #3
|
||||
beq WriteEnd
|
||||
dup s15, v14.s[1]
|
||||
stp s14, s15, [x18]
|
||||
add x18, x18, x17
|
||||
stp s14, s15, [x19]
|
||||
add x19, x19, x17
|
||||
st1 {v14.s}[2], [x13], x17
|
||||
cmp w10, #4
|
||||
beq WriteEnd
|
||||
dup s17, v16.s[1]
|
||||
stp s16, s17, [x18]
|
||||
add x18, x18, x17
|
||||
stp s16, s17, [x19]
|
||||
add x19, x19, x17
|
||||
st1 {v16.s}[2], [x13], x17
|
||||
cmp w10, #5
|
||||
beq WriteEnd
|
||||
dup s19, v18.s[1]
|
||||
stp s18, s19, [x18]
|
||||
add x18, x18, x17
|
||||
stp s18, s19, [x19]
|
||||
add x19, x19, x17
|
||||
st1 {v18.s}[2], [x13], x17
|
||||
cmp w10, #6
|
||||
beq WriteEnd
|
||||
dup s21, v20.s[1]
|
||||
stp s20, s21, [x18]
|
||||
add x18, x18, x17
|
||||
stp s20, s21, [x19]
|
||||
add x19, x19, x17
|
||||
st1 {v20.s}[2], [x13], x17
|
||||
cmp w10, #7
|
||||
beq WriteEnd
|
||||
dup s23, v22.s[1]
|
||||
stp s22, s23, [x18]
|
||||
add x18, x18, x17
|
||||
stp s22, s23, [x19]
|
||||
add x19, x19, x17
|
||||
st1 {v22.s}[2], [x13], x17
|
||||
cmp w10, #8
|
||||
beq WriteEnd
|
||||
dup s25, v24.s[1]
|
||||
stp s24, s25, [x18]
|
||||
add x18, x18, x17
|
||||
stp s24, s25, [x19]
|
||||
add x19, x19, x17
|
||||
st1 {v24.s}[2], [x13], x17
|
||||
cmp w10, #9
|
||||
beq WriteEnd
|
||||
dup s27, v26.s[1]
|
||||
stp s26, s27, [x18]
|
||||
add x18, x18, x17
|
||||
stp s26, s27, [x19]
|
||||
add x19, x19, x17
|
||||
st1 {v26.s}[2], [x13], x17
|
||||
cmp w10, #10
|
||||
beq WriteEnd
|
||||
dup s29, v28.s[1]
|
||||
stp s28, s29, [x18]
|
||||
add x18, x18, x17
|
||||
stp s28, s29, [x19]
|
||||
add x19, x19, x17
|
||||
st1 {v28.s}[2], [x13], x17
|
||||
cmp w10, #11
|
||||
beq WriteEnd
|
||||
dup s31, v30.s[1]
|
||||
stp s30, s31, [x18]
|
||||
add x18, x18, x17
|
||||
stp s30, s31, [x19]
|
||||
add x19, x19, x17
|
||||
st1 {v30.s}[2], [x13]
|
||||
b WriteEnd
|
||||
Write4:
|
||||
st1 {v8.4s}, [x18], x17
|
||||
st1 {v8.4s}, [x19], x17
|
||||
cmp w10, #1
|
||||
beq WriteEnd
|
||||
st1 {v10.4s}, [x18], x17
|
||||
st1 {v10.4s}, [x19], x17
|
||||
cmp w10, #2
|
||||
beq WriteEnd
|
||||
st1 {v12.4s}, [x18], x17
|
||||
st1 {v12.4s}, [x19], x17
|
||||
cmp w10, #3
|
||||
beq WriteEnd
|
||||
st1 {v14.4s}, [x18], x17
|
||||
st1 {v14.4s}, [x19], x17
|
||||
cmp w10, #4
|
||||
beq WriteEnd
|
||||
st1 {v16.4s}, [x18], x17
|
||||
st1 {v16.4s}, [x19], x17
|
||||
cmp w10, #5
|
||||
beq WriteEnd
|
||||
st1 {v18.4s}, [x18], x17
|
||||
st1 {v18.4s}, [x19], x17
|
||||
cmp w10, #6
|
||||
beq WriteEnd
|
||||
st1 {v20.4s}, [x18], x17
|
||||
st1 {v20.4s}, [x19], x17
|
||||
cmp w10, #7
|
||||
beq WriteEnd
|
||||
st1 {v22.4s}, [x18], x17
|
||||
st1 {v22.4s}, [x19], x17
|
||||
cmp w10, #8
|
||||
beq WriteEnd
|
||||
st1 {v24.4s}, [x18], x17
|
||||
st1 {v24.4s}, [x19], x17
|
||||
cmp w10, #9
|
||||
beq WriteEnd
|
||||
st1 {v26.4s}, [x18], x17
|
||||
st1 {v26.4s}, [x19], x17
|
||||
cmp w10, #10
|
||||
beq WriteEnd
|
||||
st1 {v28.4s}, [x18], x17
|
||||
st1 {v28.4s}, [x19], x17
|
||||
cmp w10, #11
|
||||
beq WriteEnd
|
||||
st1 {v30.4s}, [x18], x17
|
||||
st1 {v30.4s}, [x19], x17
|
||||
b WriteEnd
|
||||
Write5:
|
||||
add x13, x18, #16
|
||||
st1 {v8.4s}, [x18], x17
|
||||
add x13, x19, #16
|
||||
st1 {v8.4s}, [x19], x17
|
||||
str s9, [x13]
|
||||
cmp w10, #1
|
||||
beq WriteEnd
|
||||
add x13, x13, x17
|
||||
st1 {v10.4s}, [x18], x17
|
||||
st1 {v10.4s}, [x19], x17
|
||||
str s11, [x13]
|
||||
cmp w10, #2
|
||||
beq WriteEnd
|
||||
add x13, x13, x17
|
||||
st1 {v12.4s}, [x18], x17
|
||||
st1 {v12.4s}, [x19], x17
|
||||
str s13, [x13]
|
||||
cmp w10, #3
|
||||
beq WriteEnd
|
||||
add x13, x13, x17
|
||||
st1 {v14.4s}, [x18], x17
|
||||
st1 {v14.4s}, [x19], x17
|
||||
str s15, [x13]
|
||||
cmp w10, #4
|
||||
beq WriteEnd
|
||||
add x13, x13, x17
|
||||
st1 {v16.4s}, [x18], x17
|
||||
st1 {v16.4s}, [x19], x17
|
||||
str s17, [x13]
|
||||
cmp w10, #5
|
||||
beq WriteEnd
|
||||
add x13, x13, x17
|
||||
st1 {v18.4s}, [x18], x17
|
||||
st1 {v18.4s}, [x19], x17
|
||||
str s19, [x13]
|
||||
cmp w10, #6
|
||||
beq WriteEnd
|
||||
add x13, x13, x17
|
||||
st1 {v20.4s}, [x18], x17
|
||||
st1 {v20.4s}, [x19], x17
|
||||
str s21, [x13]
|
||||
cmp w10, #7
|
||||
beq WriteEnd
|
||||
add x13, x13, x17
|
||||
st1 {v22.4s}, [x18], x17
|
||||
st1 {v22.4s}, [x19], x17
|
||||
str s23, [x13]
|
||||
cmp w10, #8
|
||||
beq WriteEnd
|
||||
add x13, x13, x17
|
||||
st1 {v24.4s}, [x18], x17
|
||||
st1 {v24.4s}, [x19], x17
|
||||
str s25, [x13]
|
||||
cmp w10, #9
|
||||
beq WriteEnd
|
||||
add x13, x13, x17
|
||||
st1 {v26.4s}, [x18], x17
|
||||
st1 {v26.4s}, [x19], x17
|
||||
str s27, [x13]
|
||||
cmp w10, #10
|
||||
beq WriteEnd
|
||||
add x13, x13, x17
|
||||
st1 {v28.4s}, [x18], x17
|
||||
st1 {v28.4s}, [x19], x17
|
||||
str s29, [x13]
|
||||
cmp w10, #11
|
||||
beq WriteEnd
|
||||
add x13, x13, x17
|
||||
st1 {v30.4s}, [x18], x17
|
||||
st1 {v30.4s}, [x19], x17
|
||||
str s31, [x13]
|
||||
b WriteEnd
|
||||
Write6:
|
||||
add x13, x18, #16
|
||||
st1 {v8.4s}, [x18], x17
|
||||
add x13, x19, #16
|
||||
st1 {v8.4s}, [x19], x17
|
||||
dup s8, v9.s[1]
|
||||
stp s9, s8, [x13]
|
||||
cmp w10, #1
|
||||
beq WriteEnd
|
||||
add x13, x13, x17
|
||||
st1 {v10.4s}, [x18], x17
|
||||
st1 {v10.4s}, [x19], x17
|
||||
dup s10, v11.s[1]
|
||||
stp s11, s10, [x13]
|
||||
cmp w10, #2
|
||||
beq WriteEnd
|
||||
add x13, x13, x17
|
||||
st1 {v12.4s}, [x18], x17
|
||||
st1 {v12.4s}, [x19], x17
|
||||
dup s12, v13.s[1]
|
||||
stp s13, s12, [x13]
|
||||
cmp w10, #3
|
||||
beq WriteEnd
|
||||
add x13, x13, x17
|
||||
st1 {v14.4s}, [x18], x17
|
||||
st1 {v14.4s}, [x19], x17
|
||||
dup s14, v15.s[1]
|
||||
stp s15, s14, [x13]
|
||||
cmp w10, #4
|
||||
beq WriteEnd
|
||||
add x13, x13, x17
|
||||
st1 {v16.4s}, [x18], x17
|
||||
st1 {v16.4s}, [x19], x17
|
||||
dup s16, v17.s[1]
|
||||
stp s17, s16, [x13]
|
||||
cmp w10, #5
|
||||
beq WriteEnd
|
||||
add x13, x13, x17
|
||||
st1 {v18.4s}, [x18], x17
|
||||
st1 {v18.4s}, [x19], x17
|
||||
dup s18, v19.s[1]
|
||||
stp s19, s18, [x13]
|
||||
cmp w10, #6
|
||||
beq WriteEnd
|
||||
add x13, x13, x17
|
||||
st1 {v20.4s}, [x18], x17
|
||||
st1 {v20.4s}, [x19], x17
|
||||
dup s20, v21.s[1]
|
||||
stp s21, s20, [x13]
|
||||
cmp w10, #7
|
||||
beq WriteEnd
|
||||
add x13, x13, x17
|
||||
st1 {v22.4s}, [x18], x17
|
||||
st1 {v22.4s}, [x19], x17
|
||||
dup s22, v23.s[1]
|
||||
stp s23, s22, [x13]
|
||||
cmp w10, #8
|
||||
beq WriteEnd
|
||||
add x13, x13, x17
|
||||
st1 {v24.4s}, [x18], x17
|
||||
st1 {v24.4s}, [x19], x17
|
||||
dup s24, v25.s[1]
|
||||
stp s25, s24, [x13]
|
||||
cmp w10, #9
|
||||
beq WriteEnd
|
||||
add x13, x13, x17
|
||||
st1 {v26.4s}, [x18], x17
|
||||
st1 {v26.4s}, [x19], x17
|
||||
dup s26, v27.s[1]
|
||||
stp s27, s26, [x13]
|
||||
cmp w10, #10
|
||||
beq WriteEnd
|
||||
add x13, x13, x17
|
||||
st1 {v28.4s}, [x18], x17
|
||||
st1 {v28.4s}, [x19], x17
|
||||
dup s28, v29.s[1]
|
||||
stp s29, s28, [x13]
|
||||
cmp w10, #11
|
||||
beq WriteEnd
|
||||
add x13, x13, x17
|
||||
st1 {v30.4s}, [x18], x17
|
||||
st1 {v30.4s}, [x19], x17
|
||||
dup s30, v31.s[1]
|
||||
stp s31, s30, [x13]
|
||||
b WriteEnd
|
||||
Write7:
|
||||
add x13, x18, #16
|
||||
add x16, x18, #24
|
||||
st1 {v8.4s}, [x18], x17
|
||||
add x13, x19, #16
|
||||
add x16, x19, #24
|
||||
st1 {v8.4s}, [x19], x17
|
||||
dup s8, v9.s[1]
|
||||
stp s9, s8, [x13]
|
||||
add x13, x13, x17
|
||||
st1 {v9.s}[2], [x16], x17
|
||||
cmp w10, #1
|
||||
beq WriteEnd
|
||||
st1 {v10.4s}, [x18], x17
|
||||
st1 {v10.4s}, [x19], x17
|
||||
dup s10, v11.s[1]
|
||||
stp s11, s10, [x13]
|
||||
add x13, x13, x17
|
||||
st1 {v11.s}[2], [x16], x17
|
||||
cmp w10, #2
|
||||
beq WriteEnd
|
||||
st1 {v12.4s}, [x18], x17
|
||||
st1 {v12.4s}, [x19], x17
|
||||
dup s12, v13.s[1]
|
||||
stp s13, s12, [x13]
|
||||
add x13, x13, x17
|
||||
st1 {v13.s}[2], [x16], x17
|
||||
cmp w10, #3
|
||||
beq WriteEnd
|
||||
st1 {v14.4s}, [x18], x17
|
||||
st1 {v14.4s}, [x19], x17
|
||||
dup s14, v15.s[1]
|
||||
stp s15, s14, [x13]
|
||||
add x13, x13, x17
|
||||
st1 {v15.s}[2], [x16], x17
|
||||
cmp w10, #4
|
||||
beq WriteEnd
|
||||
st1 {v16.4s}, [x18], x17
|
||||
st1 {v16.4s}, [x19], x17
|
||||
dup s16, v17.s[1]
|
||||
stp s17, s16, [x13]
|
||||
add x13, x13, x17
|
||||
st1 {v17.s}[2], [x16], x17
|
||||
cmp w10, #5
|
||||
beq WriteEnd
|
||||
st1 {v18.4s}, [x18], x17
|
||||
st1 {v18.4s}, [x19], x17
|
||||
dup s18, v19.s[1]
|
||||
stp s19, s18, [x13]
|
||||
add x13, x13, x17
|
||||
st1 {v19.s}[2], [x16], x17
|
||||
cmp w10, #6
|
||||
beq WriteEnd
|
||||
st1 {v20.4s}, [x18], x17
|
||||
st1 {v20.4s}, [x19], x17
|
||||
dup s20, v21.s[1]
|
||||
stp s21, s20, [x13]
|
||||
add x13, x13, x17
|
||||
st1 {v21.s}[2], [x16], x17
|
||||
cmp w10, #7
|
||||
beq WriteEnd
|
||||
st1 {v22.4s}, [x18], x17
|
||||
st1 {v22.4s}, [x19], x17
|
||||
dup s22, v23.s[1]
|
||||
stp s23, s22, [x13]
|
||||
add x13, x13, x17
|
||||
st1 {v23.s}[2], [x16], x17
|
||||
cmp w10, #8
|
||||
beq WriteEnd
|
||||
st1 {v24.4s}, [x18], x17
|
||||
st1 {v24.4s}, [x19], x17
|
||||
dup s24, v25.s[1]
|
||||
stp s25, s24, [x13]
|
||||
add x13, x13, x17
|
||||
st1 {v25.s}[2], [x16], x17
|
||||
cmp w10, #9
|
||||
beq WriteEnd
|
||||
st1 {v26.4s}, [x18], x17
|
||||
st1 {v26.4s}, [x19], x17
|
||||
dup s26, v27.s[1]
|
||||
stp s27, s26, [x13]
|
||||
add x13, x13, x17
|
||||
st1 {v27.s}[2], [x16], x17
|
||||
cmp w10, #10
|
||||
beq WriteEnd
|
||||
st1 {v28.4s}, [x18], x17
|
||||
st1 {v28.4s}, [x19], x17
|
||||
dup s28, v29.s[1]
|
||||
stp s29, s28, [x13]
|
||||
add x13, x13, x17
|
||||
st1 {v29.s}[2], [x16], x17
|
||||
cmp w10, #11
|
||||
beq WriteEnd
|
||||
st1 {v30.4s}, [x18], x17
|
||||
st1 {v30.4s}, [x19], x17
|
||||
dup s30, v31.s[1]
|
||||
stp s31, s30, [x13]
|
||||
add x13, x13, x17
|
||||
|
@ -697,54 +698,54 @@ WriteC8:
|
|||
st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x2], #64
|
||||
b WriteEnd
|
||||
WriteWino:
|
||||
st1 {v8.4s, v9.4s}, [x18], x8
|
||||
st1 {v10.4s, v11.4s}, [x18], x8
|
||||
st1 {v12.4s, v13.4s}, [x18], x8
|
||||
st1 {v14.4s, v15.4s}, [x18], x8
|
||||
st1 {v16.4s, v17.4s}, [x18], x8
|
||||
st1 {v18.4s, v19.4s}, [x18], x8
|
||||
st1 {v20.4s, v21.4s}, [x18], x8
|
||||
st1 {v22.4s, v23.4s}, [x18], x8
|
||||
st1 {v24.4s, v25.4s}, [x18], x8
|
||||
st1 {v26.4s, v27.4s}, [x18], x8
|
||||
st1 {v28.4s, v29.4s}, [x18], x8
|
||||
st1 {v30.4s, v31.4s}, [x18], x8
|
||||
st1 {v8.4s, v9.4s}, [x19], x8
|
||||
st1 {v10.4s, v11.4s}, [x19], x8
|
||||
st1 {v12.4s, v13.4s}, [x19], x8
|
||||
st1 {v14.4s, v15.4s}, [x19], x8
|
||||
st1 {v16.4s, v17.4s}, [x19], x8
|
||||
st1 {v18.4s, v19.4s}, [x19], x8
|
||||
st1 {v20.4s, v21.4s}, [x19], x8
|
||||
st1 {v22.4s, v23.4s}, [x19], x8
|
||||
st1 {v24.4s, v25.4s}, [x19], x8
|
||||
st1 {v26.4s, v27.4s}, [x19], x8
|
||||
st1 {v28.4s, v29.4s}, [x19], x8
|
||||
st1 {v30.4s, v31.4s}, [x19], x8
|
||||
b WriteEnd
|
||||
Write8:
|
||||
st1 {v8.4s, v9.4s}, [x18], x17
|
||||
st1 {v8.4s, v9.4s}, [x19], x17
|
||||
cmp w10, #1
|
||||
beq WriteEnd
|
||||
st1 {v10.4s, v11.4s}, [x18], x17
|
||||
st1 {v10.4s, v11.4s}, [x19], x17
|
||||
cmp w10, #2
|
||||
beq WriteEnd
|
||||
st1 {v12.4s, v13.4s}, [x18], x17
|
||||
st1 {v12.4s, v13.4s}, [x19], x17
|
||||
cmp w10, #3
|
||||
beq WriteEnd
|
||||
st1 {v14.4s, v15.4s}, [x18], x17
|
||||
st1 {v14.4s, v15.4s}, [x19], x17
|
||||
cmp w10, #4
|
||||
beq WriteEnd
|
||||
st1 {v16.4s, v17.4s}, [x18], x17
|
||||
st1 {v16.4s, v17.4s}, [x19], x17
|
||||
cmp w10, #5
|
||||
beq WriteEnd
|
||||
st1 {v18.4s, v19.4s}, [x18], x17
|
||||
st1 {v18.4s, v19.4s}, [x19], x17
|
||||
cmp w10, #6
|
||||
beq WriteEnd
|
||||
st1 {v20.4s, v21.4s}, [x18], x17
|
||||
st1 {v20.4s, v21.4s}, [x19], x17
|
||||
cmp w10, #7
|
||||
beq WriteEnd
|
||||
st1 {v22.4s, v23.4s}, [x18], x17
|
||||
st1 {v22.4s, v23.4s}, [x19], x17
|
||||
cmp w10, #8
|
||||
beq WriteEnd
|
||||
st1 {v24.4s, v25.4s}, [x18], x17
|
||||
st1 {v24.4s, v25.4s}, [x19], x17
|
||||
cmp w10, #9
|
||||
beq WriteEnd
|
||||
st1 {v26.4s, v27.4s}, [x18], x17
|
||||
st1 {v26.4s, v27.4s}, [x19], x17
|
||||
cmp w10, #10
|
||||
beq WriteEnd
|
||||
st1 {v28.4s, v29.4s}, [x18], x17
|
||||
st1 {v28.4s, v29.4s}, [x19], x17
|
||||
cmp w10, #11
|
||||
beq WriteEnd
|
||||
st1 {v30.4s, v31.4s}, [x18], x17
|
||||
st1 {v30.4s, v31.4s}, [x19], x17
|
||||
|
||||
WriteEnd:
|
||||
subs w10, w10, #12 // lhs row - 12
|
||||
|
@ -766,8 +767,9 @@ NoDstStep:
|
|||
bgt L1
|
||||
|
||||
End1:
|
||||
sub sp, sp, #128
|
||||
sub sp, sp, #144
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
|
||||
ldp x19, x20, [sp], #16
|
||||
ret
|
||||
#endif
|
||||
|
|
|
@ -21,31 +21,32 @@
|
|||
// x9: writeMode
|
||||
|
||||
asm_function MatmulFloatNeon64Opt
|
||||
sub sp, sp, #144
|
||||
sub sp, sp, #160
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
|
||||
stp x19, x20, [sp], #16
|
||||
stp x21, x22, [sp], #16
|
||||
|
||||
ldr x8, [sp]
|
||||
ldr x9, [sp, #8]
|
||||
|
||||
mov x18, #48 // sizeof(float) * 12
|
||||
mul x17, x5, x18 // block stride of lhs/rhs: sizeof(float) * 12 * depth
|
||||
mov x21, #48 // sizeof(float) * 12
|
||||
mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float) * 12 * depth
|
||||
cbnz x9, NoC8Steps
|
||||
mov x11, x2
|
||||
mov x18, #32
|
||||
mul x16, x6, x18 // row * 8 * sizeof(float)
|
||||
mov x21, #32
|
||||
mul x16, x6, x21 // row * 8 * sizeof(float)
|
||||
NoC8Steps:
|
||||
cmp x9, #2
|
||||
bne NoWinoSteps
|
||||
mov x18, #4
|
||||
mov x21, #4
|
||||
mul x15, x7, x8
|
||||
mul x15, x15, x18 // kernel_size * col *sizeof(float)
|
||||
mov x18, #32
|
||||
mul x16, x8, x18 // kernel_size * 8 * sizeof(float)
|
||||
mul x15, x15, x21 // kernel_size * col *sizeof(float)
|
||||
mov x21, #32
|
||||
mul x16, x8, x21 // kernel_size * 8 * sizeof(float)
|
||||
NoWinoSteps:
|
||||
mov x18, #4
|
||||
mul x8, x8, x18
|
||||
mov x21, #4
|
||||
mul x8, x8, x21
|
||||
|
||||
LoopRowStart:
|
||||
cmp x6, #4
|
||||
|
@ -1117,9 +1118,9 @@ LoopRow4:
|
|||
LoopColEnd:
|
||||
add x0, x0, x17
|
||||
cbz x9, C8DstStep
|
||||
mov x18, #4
|
||||
mul x18, x18, x7
|
||||
sub x11, x11, x18
|
||||
mov x21, #4
|
||||
mul x21, x21, x7
|
||||
sub x11, x11, x21
|
||||
mov x2, x11
|
||||
b NoDstStep
|
||||
C8DstStep:
|
||||
|
@ -1129,9 +1130,10 @@ LoopColEnd:
|
|||
subs x6, x6, #12
|
||||
bgt LoopRowStart
|
||||
|
||||
sub sp, sp, #144
|
||||
sub sp, sp, #160
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
|
||||
ldp x19, x20, [sp], #16
|
||||
ldp x21, x22, [sp], #16
|
||||
ret
|
||||
#endif
|
||||
|
|
|
@ -67,7 +67,7 @@ L2:
|
|||
cmp w16, #0
|
||||
beq End2
|
||||
|
||||
mov x18, x1 // reload b ptr
|
||||
mov x28, x1 // reload b ptr
|
||||
mov x19, x7 // reload bias ptr
|
||||
mov w20, w5 // reload depth
|
||||
dup v16.4s, wzr
|
||||
|
@ -94,10 +94,10 @@ L3:
|
|||
ld1 {v1.16b}, [x17], #16
|
||||
ld1 {v2.16b}, [x17], #16
|
||||
ld1 {v3.16b}, [x17], #16
|
||||
ld1 {v4.16b}, [x18], #16
|
||||
ld1 {v5.16b}, [x18], #16
|
||||
ld1 {v6.16b}, [x18], #16
|
||||
ld1 {v7.16b}, [x18], #16
|
||||
ld1 {v4.16b}, [x28], #16
|
||||
ld1 {v5.16b}, [x28], #16
|
||||
ld1 {v6.16b}, [x28], #16
|
||||
ld1 {v7.16b}, [x28], #16
|
||||
|
||||
smull v8.8h, v4.8b, v0.8b
|
||||
smull v9.8h, v5.8b, v0.8b
|
||||
|
|
|
@ -30,7 +30,7 @@
|
|||
// x28: filter_zp
|
||||
|
||||
asm_function MatmulInt8Opt
|
||||
sub sp, sp, #208
|
||||
sub sp, sp, #224
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
|
||||
stp x19, x20, [sp], #16
|
||||
|
@ -38,6 +38,7 @@ asm_function MatmulInt8Opt
|
|||
stp x23, x24, [sp], #16
|
||||
stp x25, x26, [sp], #16
|
||||
stp x27, x28, [sp], #16
|
||||
stp x29, x30, [sp], #16
|
||||
|
||||
ldr w8, [sp]
|
||||
ldr w9, [sp, #8]
|
||||
|
@ -55,7 +56,7 @@ asm_function MatmulInt8Opt
|
|||
LoopRow:
|
||||
mov x16, x1 // reload rhs ptr
|
||||
mov x17, x4 // reload rhs col
|
||||
mov x18, x7 // reload bias ptr
|
||||
// NOTE(review): x29 is the AAPCS64 frame pointer. Apple's arm64 ABI asks that it
// always address a valid frame record, so keeping the bias pointer in it may
// confuse unwinders/crash reporters on the platforms this change targets.
// TODO confirm, or move the bias pointer to another saved register.
mov x29, x7 // reload bias ptr
|
||||
mov x27, x2 // reload dst ptr
|
||||
ldr x28, [sp, #64] // reload filter_zp
|
||||
|
||||
|
@ -158,7 +159,7 @@ LoopRow:
|
|||
|
||||
Bias:
|
||||
cbz x7, NoBias
|
||||
ld1 {v15.4s}, [x18], #16
|
||||
ld1 {v15.4s}, [x29], #16
|
||||
add v16.4s, v16.4s, v15.4s
|
||||
add v17.4s, v17.4s, v15.4s
|
||||
add v18.4s, v18.4s, v15.4s
|
||||
|
@ -330,7 +331,7 @@ LoopColEnd:
|
|||
b LoopRow
|
||||
|
||||
LoopRowEnd:
|
||||
sub sp, sp, #208
|
||||
sub sp, sp, #224
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
|
||||
ldp x19, x20, [sp], #16
|
||||
|
@ -338,5 +339,6 @@ LoopRowEnd:
|
|||
ldp x23, x24, [sp], #16
|
||||
ldp x25, x26, [sp], #16
|
||||
ldp x27, x28, [sp], #16
|
||||
ldp x29, x30, [sp], #16
|
||||
ret
|
||||
#endif
|
||||
|
|
|
@ -20,9 +20,10 @@
|
|||
// x7: bias
|
||||
|
||||
asm_function MatMulR4Int8Neon64
|
||||
sub sp, sp, #128
|
||||
sub sp, sp, #144
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
|
||||
stp x19, x20, [sp], #16
|
||||
|
||||
mov w15, #0 // b col index
|
||||
mov w16, #0 // a row index
|
||||
|
@ -40,7 +41,7 @@ L2:
|
|||
cmp w16, w3
|
||||
beq End2
|
||||
|
||||
mov x18, x1 // reload b ptr
|
||||
mov x19, x1 // reload b ptr
|
||||
mov x10, x7 // reload bias ptr
|
||||
mov w11, w5 // reload depth
|
||||
dup v16.4s, wzr
|
||||
|
@ -67,10 +68,10 @@ L3:
|
|||
ld1 {v1.16b}, [x17], #16
|
||||
ld1 {v2.16b}, [x17], #16
|
||||
ld1 {v3.16b}, [x17], #16
|
||||
ld1 {v4.16b}, [x18], #16
|
||||
ld1 {v5.16b}, [x18], #16
|
||||
ld1 {v6.16b}, [x18], #16
|
||||
ld1 {v7.16b}, [x18], #16
|
||||
ld1 {v4.16b}, [x19], #16
|
||||
ld1 {v5.16b}, [x19], #16
|
||||
ld1 {v6.16b}, [x19], #16
|
||||
ld1 {v7.16b}, [x19], #16
|
||||
|
||||
smull v8.8h, v4.8b, v0.8b
|
||||
smull v9.8h, v5.8b, v0.8b
|
||||
|
@ -172,8 +173,9 @@ End2:
|
|||
b L1
|
||||
|
||||
End1:
|
||||
sub sp, sp, #128
|
||||
sub sp, sp, #144
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
|
||||
ldp x19, x20, [sp], #16
|
||||
ret
|
||||
#endif
|
||||
|
|
|
@ -30,13 +30,13 @@ asm_function MatrixMultiplyWinograd
|
|||
mov x14, x1 // mat_b
|
||||
LoopN:
|
||||
mov x16, x0 // mat_a_m
|
||||
sub x18, x5, x15 // ni
|
||||
sub x22, x5, x15 // ni
|
||||
sub x19, x17, x3 // mi
|
||||
mul x18, x18, x17 // ni * m
|
||||
mul x22, x22, x17 // ni * m
|
||||
mov x11, x6 // in_channel
|
||||
add x18, x18, x19 // (ni * m) + mi
|
||||
mul x18, x18, x7 // x18 * c4_channel
|
||||
add x20, x2, x18 // dst + offset
|
||||
add x22, x22, x19 // (ni * m) + mi
|
||||
mul x22, x22, x7 // x22 * c4_channel
|
||||
add x20, x2, x22 // dst + offset
|
||||
cmp x11, #16
|
||||
bge LoopC16
|
||||
cmp x11, #8
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
#ifdef __aarch64__
|
||||
#include "nnacl/assembly_global.h"
|
||||
|
||||
.text
|
||||
.align 5
|
||||
//.p2align 5,,15
|
||||
|
|
|
@ -55,16 +55,16 @@ LoopH:
|
|||
ld1 {v0.s}[2], [x17], x10
|
||||
ld1 {v0.s}[3], [x17], x10
|
||||
mov x11, x6
|
||||
mov x18, x17
|
||||
add x18, x14, x7
|
||||
add x16, x18, x7
|
||||
mov x20, x17
|
||||
add x20, x14, x7
|
||||
add x16, x20, x7
|
||||
add x19, x16, x7
|
||||
|
||||
LoopLength4:
|
||||
ld1 {v16.4s}, [x2]
|
||||
ld1 {v20.4s}, [x14], #16
|
||||
fmla v16.4s, v20.4s, v0.s[0]
|
||||
ld1 {v21.4s}, [x18], #16
|
||||
ld1 {v21.4s}, [x20], #16
|
||||
fmul v17.4s, v21.4s, v0.s[1]
|
||||
ld1 {v20.4s}, [x16], #16
|
||||
fmla v16.4s, v20.4s, v0.s[2]
|
||||
|
@ -90,14 +90,14 @@ LoopH:
|
|||
ld1 {v0.s}[1], [x17], x10
|
||||
ld1 {v0.s}[2], [x17], x10
|
||||
mov x11, x6
|
||||
mov x18, x17
|
||||
add x18, x14, x7
|
||||
add x16, x18, x7
|
||||
mov x20, x17
|
||||
add x20, x14, x7
|
||||
add x16, x20, x7
|
||||
LoopLength3:
|
||||
ld1 {v16.4s}, [x2]
|
||||
ld1 {v20.4s}, [x14], #16
|
||||
fmla v16.4s, v20.4s, v0.s[0]
|
||||
ld1 {v21.4s}, [x18], #16
|
||||
ld1 {v21.4s}, [x20], #16
|
||||
fmul v17.4s, v21.4s, v0.s[1]
|
||||
ld1 {v20.4s}, [x16], #16
|
||||
fmla v16.4s, v20.4s, v0.s[2]
|
||||
|
|
|
@ -18,6 +18,9 @@ asm_function WinogradTransRight
|
|||
//x5: k
|
||||
//x6: length
|
||||
|
||||
// Save callee-saved x19/x20; x19 is used as a working pointer further down.
// The post-indexed stp moves sp back up by 16, so after these two instructions
// sp holds its entry value again and the saved pair sits just below sp.
// NOTE(review): AAPCS64 only guarantees memory at or above sp, so an asynchronous
// signal handler could presumably overwrite this save slot. Consider
// `stp x19, x20, [sp, #-16]!` here with a plain `ldp x19, x20, [sp], #16`
// in the epilogue. TODO confirm against the supported targets.
sub sp, sp, #16
|
||||
stp x19, x20, [sp], #16
|
||||
|
||||
mov x8, #16 // 4 * sizeof(float)
|
||||
mul x8, x6, x8 // x8 = length * 16
|
||||
mul x9, x5, x8 // step for S
|
||||
|
@ -43,7 +46,7 @@ LoopH:
|
|||
cmp x12, #4
|
||||
blt LoopKStart3
|
||||
mov x16, x15
|
||||
mov x18, x4
|
||||
mov x19, x4
|
||||
LoopK4:
|
||||
ld1 {v0.s}[0], [x13], x10
|
||||
ld1 {v0.s}[1], [x13], x10
|
||||
|
@ -54,7 +57,7 @@ LoopH:
|
|||
|
||||
add x14, x17, x8
|
||||
add x16, x14, x8
|
||||
add x18, x16, x8
|
||||
add x19, x16, x8
|
||||
|
||||
LoopLength4:
|
||||
ld1 {v16.4s}, [x2]
|
||||
|
@ -64,7 +67,7 @@ LoopH:
|
|||
fmul v17.4s, v21.4s, v0.s[1]
|
||||
ld1 {v20.4s}, [x16], #16
|
||||
fmla v16.4s, v20.4s, v0.s[2]
|
||||
ld1 {v21.4s}, [x18], #16
|
||||
ld1 {v21.4s}, [x19], #16
|
||||
fmla v17.4s, v21.4s, v0.s[3]
|
||||
|
||||
fadd v17.4s, v16.4s, v17.4s
|
||||
|
@ -73,7 +76,7 @@ LoopH:
|
|||
bne LoopLength4
|
||||
sub x2, x2, x8
|
||||
sub x12, x12, #4
|
||||
mov x17, x18
|
||||
mov x17, x19
|
||||
|
||||
cmp x12, #4
|
||||
bge LoopK4
|
||||
|
@ -107,7 +110,7 @@ LoopH:
|
|||
bne LoopLength3
|
||||
sub x2, x2, x8
|
||||
sub x12, x12, #3
|
||||
mov x17, x18
|
||||
mov x17, x19
|
||||
cmp x12, #3
|
||||
bge LoopK3
|
||||
|
||||
|
@ -141,5 +144,7 @@ LoopH:
|
|||
subs x4, x4, #1
|
||||
bne LoopH
|
||||
|
||||
sub sp, sp, #16
|
||||
ldp x19, x20, [sp], #16
|
||||
ret
|
||||
#endif
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
#ifdef ENABLE_AVX
|
||||
#include "nnacl/assembly_global.h"
|
||||
.text
|
||||
.align 4
|
||||
.global ConvDwFp32Avx3x3
|
||||
|
@ -31,7 +32,7 @@
|
|||
// 56: input_stride
|
||||
// 64: relu
|
||||
// 72: relu6
|
||||
ConvDwFp32Avx3x3:
|
||||
asm_function ConvDwFp32Avx3x3
|
||||
pushq %r15
|
||||
pushq %r14
|
||||
pushq %r13
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
#ifdef ENABLE_AVX
|
||||
#include "nnacl/assembly_global.h"
|
||||
.text
|
||||
.align 4
|
||||
.global MatmulFloatAvxOpt
|
||||
|
@ -34,7 +35,7 @@
|
|||
// 72: stride
|
||||
// 80: writeMode
|
||||
|
||||
MatmulFloatAvxOpt:
|
||||
asm_function MatmulFloatAvxOpt
|
||||
// rbx, rsp, rbp, r12-r15 must be saved according to x86 calling convention
|
||||
pushq %r15
|
||||
pushq %r14
|
||||
|
|
|
@ -19,12 +19,13 @@ asm_function ConvDwFp16Center
|
|||
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
|
||||
// x19 ~ x29 should also be preserved
|
||||
// whereas our coding style does not permit such a large number of parameters
|
||||
sub sp, sp, #176
|
||||
sub sp, sp, #192
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
|
||||
stp x19, x20, [sp], #16
|
||||
stp x21, x22, [sp], #16
|
||||
stp x23, x24, [sp], #16
|
||||
stp x25, x26, [sp], #16
|
||||
|
||||
ldr x8, [sp]
|
||||
ldr x9, [sp, #8]
|
||||
|
@ -71,7 +72,7 @@ asm_function ConvDwFp16Center
|
|||
mov v14.16b, v24.16b
|
||||
mov v15.16b, v24.16b
|
||||
LoopKh16:
|
||||
mov x18, x7
|
||||
mov x25, x7
|
||||
mov x21, x16
|
||||
LoopKw16:
|
||||
mov x22, x21
|
||||
|
@ -108,7 +109,7 @@ asm_function ConvDwFp16Center
|
|||
ld1 {v23.8h}, [x22], x11
|
||||
fmla v14.8h, v22.8h, v25.8h
|
||||
fmla v15.8h, v23.8h, v25.8h
|
||||
subs x18, x18, #1
|
||||
subs x25, x25, #1
|
||||
add x21, x21, x13
|
||||
bne LoopKw16
|
||||
add x16, x16, x12
|
||||
|
@ -191,7 +192,7 @@ asm_function ConvDwFp16Center
|
|||
mov v6.16b, v24.16b
|
||||
mov v7.16b, v24.16b
|
||||
LoopKh8:
|
||||
mov x18, x7
|
||||
mov x25, x7
|
||||
mov x21, x16
|
||||
LoopKw8:
|
||||
mov x22, x21
|
||||
|
@ -212,7 +213,7 @@ asm_function ConvDwFp16Center
|
|||
ld1 {v23.8h}, [x22], x11
|
||||
fmla v6.8h, v22.8h, v25.8h
|
||||
fmla v7.8h, v23.8h, v25.8h
|
||||
subs x18, x18, #1
|
||||
subs x25, x25, #1
|
||||
add x21, x21, x13
|
||||
bne LoopKw8
|
||||
add x16, x16, x12
|
||||
|
@ -260,13 +261,13 @@ asm_function ConvDwFp16Center
|
|||
mov x20, x6
|
||||
mov v0.16b, v24.16b
|
||||
LoopKh:
|
||||
mov x18, x7
|
||||
mov x25, x7
|
||||
mov x22, x16
|
||||
LoopKw:
|
||||
ld1 {v16.8h}, [x22], x13
|
||||
ld1 {v25.8h}, [x17], #16
|
||||
fmla v0.8h, v16.8h, v25.8h
|
||||
subs x18, x18, #1
|
||||
subs x25, x25, #1
|
||||
bne LoopKw
|
||||
add x16, x16, x12
|
||||
subs x20, x20, #1
|
||||
|
@ -289,11 +290,12 @@ asm_function ConvDwFp16Center
|
|||
subs x4, x4, #1
|
||||
bne LoopH
|
||||
|
||||
sub sp, sp, #176
|
||||
sub sp, sp, #192
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
|
||||
ldp x19, x20, [sp], #16
|
||||
ldp x21, x22, [sp], #16
|
||||
ldp x23, x24, [sp], #16
|
||||
ldp x25, x26, [sp], #16
|
||||
ret
|
||||
#endif
|
||||
|
|
|
@ -33,12 +33,12 @@ asm_function DeconvDwFp16Center
|
|||
mov x16, x1
|
||||
mov x17, x4
|
||||
LoopW:
|
||||
mov x18, x15
|
||||
mov x22, x15
|
||||
mov x19, x2
|
||||
mov x20, x5
|
||||
ld1 {v1.8h}, [x16], x8
|
||||
LoopKh:
|
||||
mov x21, x18
|
||||
mov x21, x22
|
||||
mov x13, x6
|
||||
LoopKw:
|
||||
ld1 {v0.8h}, [x21]
|
||||
|
@ -47,7 +47,7 @@ asm_function DeconvDwFp16Center
|
|||
st1 {v0.8h}, [x21], x12
|
||||
subs x13, x13, #1
|
||||
bne LoopKw
|
||||
add x18, x18, x11
|
||||
add x22, x22, x11
|
||||
subs x20, x20, #1
|
||||
bne LoopKh
|
||||
add x15, x15, x10
|
||||
|
|
|
@ -41,11 +41,12 @@ asm_function IndirectGemmFp16_16x8
|
|||
// https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
|
||||
// x19 ~ x29 should also be preserved
|
||||
// whereas our coding style does not permit such a large number of parameters
|
||||
sub sp, sp, #128
|
||||
sub sp, sp, #144
|
||||
// performance between storing 4 registers at the same time and separately storing them on in-order cores
|
||||
// is not tested yet
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
|
||||
stp x19, x20, [sp], #16
|
||||
|
||||
ldr x8, [sp, #0]
|
||||
ldr x9, [sp, #8]
|
||||
|
@ -548,87 +549,87 @@ IndirectGemmStart:
|
|||
b WriteEnd
|
||||
Write7:
|
||||
add x17, x15, #8
|
||||
add x18, x15, #10
|
||||
add x19, x15, #10
|
||||
add x16, x15, #12
|
||||
st1 {v16.4h}, [x15], x7
|
||||
ins v0.s[0], v16.s[2]
|
||||
st1 {v0.h}[0], [x17], x7
|
||||
st1 {v0.h}[1], [x18], x7
|
||||
st1 {v0.h}[1], [x19], x7
|
||||
st1 {v16.h}[6], [x16], x7
|
||||
st1 {v17.4h}, [x15], x7
|
||||
ins v1.s[0], v17.s[2]
|
||||
st1 {v1.h}[0], [x17], x7
|
||||
st1 {v1.h}[1], [x18], x7
|
||||
st1 {v1.h}[1], [x19], x7
|
||||
st1 {v17.h}[6], [x16], x7
|
||||
st1 {v18.4h}, [x15], x7
|
||||
ins v2.s[0], v18.s[2]
|
||||
st1 {v2.h}[0], [x17], x7
|
||||
st1 {v2.h}[1], [x18], x7
|
||||
st1 {v2.h}[1], [x19], x7
|
||||
st1 {v18.h}[6], [x16], x7
|
||||
st1 {v19.4h}, [x15], x7
|
||||
ins v3.s[0], v19.s[2]
|
||||
st1 {v3.h}[0], [x17], x7
|
||||
st1 {v3.h}[1], [x18], x7
|
||||
st1 {v3.h}[1], [x19], x7
|
||||
st1 {v19.h}[6], [x16], x7
|
||||
st1 {v20.4h}, [x15], x7
|
||||
ins v4.s[0], v20.s[2]
|
||||
st1 {v4.h}[0], [x17], x7
|
||||
st1 {v4.h}[1], [x18], x7
|
||||
st1 {v4.h}[1], [x19], x7
|
||||
st1 {v20.h}[6], [x16], x7
|
||||
st1 {v21.4h}, [x15], x7
|
||||
ins v5.s[0], v21.s[2]
|
||||
st1 {v5.h}[0], [x17], x7
|
||||
st1 {v5.h}[1], [x18], x7
|
||||
st1 {v5.h}[1], [x19], x7
|
||||
st1 {v21.h}[6], [x16], x7
|
||||
st1 {v22.4h}, [x15], x7
|
||||
ins v6.s[0], v22.s[2]
|
||||
st1 {v6.h}[0], [x17], x7
|
||||
st1 {v6.h}[1], [x18], x7
|
||||
st1 {v6.h}[1], [x19], x7
|
||||
st1 {v22.h}[6], [x16], x7
|
||||
st1 {v23.4h}, [x15], x7
|
||||
ins v7.s[0], v23.s[2]
|
||||
st1 {v7.h}[0], [x17], x7
|
||||
st1 {v7.h}[1], [x18], x7
|
||||
st1 {v7.h}[1], [x19], x7
|
||||
st1 {v23.h}[6], [x16], x7
|
||||
st1 {v24.4h}, [x15], x7
|
||||
ins v8.s[0], v24.s[2]
|
||||
st1 {v8.h}[0], [x17], x7
|
||||
st1 {v8.h}[1], [x18], x7
|
||||
st1 {v8.h}[1], [x19], x7
|
||||
st1 {v24.h}[6], [x16], x7
|
||||
st1 {v25.4h}, [x15], x7
|
||||
ins v9.s[0], v25.s[2]
|
||||
st1 {v9.h}[0], [x17], x7
|
||||
st1 {v9.h}[1], [x18], x7
|
||||
st1 {v9.h}[1], [x19], x7
|
||||
st1 {v25.h}[6], [x16], x7
|
||||
st1 {v26.4h}, [x15], x7
|
||||
ins v10.s[0], v26.s[2]
|
||||
st1 {v10.h}[0], [x17], x7
|
||||
st1 {v10.h}[1], [x18], x7
|
||||
st1 {v10.h}[1], [x19], x7
|
||||
st1 {v26.h}[6], [x16], x7
|
||||
st1 {v27.4h}, [x15], x7
|
||||
ins v11.s[0], v27.s[2]
|
||||
st1 {v11.h}[0], [x17], x7
|
||||
st1 {v11.h}[1], [x18], x7
|
||||
st1 {v11.h}[1], [x19], x7
|
||||
st1 {v27.h}[6], [x16], x7
|
||||
st1 {v28.4h}, [x15], x7
|
||||
ins v12.s[0], v28.s[2]
|
||||
st1 {v12.h}[0], [x17], x7
|
||||
st1 {v12.h}[1], [x18], x7
|
||||
st1 {v12.h}[1], [x19], x7
|
||||
st1 {v28.h}[6], [x16], x7
|
||||
st1 {v29.4h}, [x15], x7
|
||||
ins v13.s[0], v29.s[2]
|
||||
st1 {v13.h}[0], [x17], x7
|
||||
st1 {v13.h}[1], [x18], x7
|
||||
st1 {v13.h}[1], [x19], x7
|
||||
st1 {v29.h}[6], [x16], x7
|
||||
st1 {v30.4h}, [x15], x7
|
||||
ins v14.s[0], v30.s[2]
|
||||
st1 {v14.h}[0], [x17], x7
|
||||
st1 {v14.h}[1], [x18], x7
|
||||
st1 {v14.h}[1], [x19], x7
|
||||
st1 {v30.h}[6], [x16], x7
|
||||
st1 {v31.4h}, [x15]
|
||||
ins v15.s[0], v31.s[2]
|
||||
st1 {v15.h}[0], [x17]
|
||||
st1 {v15.h}[1], [x18]
|
||||
st1 {v15.h}[1], [x19]
|
||||
st1 {v31.h}[6], [x16]
|
||||
add x0, x0, #14
|
||||
b WriteEnd
|
||||
|
@ -661,9 +662,10 @@ IndirectGemmStart:
|
|||
NoStepForward:
|
||||
bgt LoopOc
|
||||
|
||||
sub sp, sp, #128
|
||||
sub sp, sp, #144
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
|
||||
ldp x19, x20, [sp], #16
|
||||
ret
|
||||
#endif
|
||||
|
||||
|
|
|
@ -21,21 +21,22 @@
|
|||
// w13: writeC8
|
||||
|
||||
asm_function MatmulFp16Neon64
|
||||
sub sp, sp, #128
|
||||
sub sp, sp, #144
|
||||
st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
|
||||
st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
|
||||
stp x19, x20, [sp], #16
|
||||
|
||||
mov w15, #16 // sizeof(float16) * 8; built in w15 itself so the platform-reserved x18/w18 is never written
|
||||
mul w15, w5, w15 // block stride of lhs/rhs: sizeof(float16) * 8 * depth
|
||||
mov x11, x3 // bias flag
|
||||
mov x18, #2
|
||||
mov x19, #2
|
||||
ldr x17, [sp]
|
||||
mul x17, x17, x18
|
||||
mul x17, x17, x19
|
||||
|
||||
L1:
|
||||
mov w10, w6 // reload lhs row
|
||||
mov x12, x0 // reload lhs ptr
|
||||
mov x18, x2 // reload dst ptr
|
||||
mov x19, x2 // reload dst ptr
|
||||
|
||||
L2:
|
||||
mov x16, x1 // reload rhs ptr
|
||||
|
@ -314,490 +315,490 @@ Write:
|
|||
b Write8
|
||||
|
||||
Write1:
|
||||
st1 {v16.h}[0], [x18], x17
|
||||
st1 {v16.h}[0], [x19], x17
|
||||
cmp w10, #1
|
||||
beq WriteEnd
|
||||
st1 {v17.h}[0], [x18], x17
|
||||
st1 {v17.h}[0], [x19], x17
|
||||
cmp w10, #2
|
||||
beq WriteEnd
|
||||
st1 {v18.h}[0], [x18], x17
|
||||
st1 {v18.h}[0], [x19], x17
|
||||
cmp w10, #3
|
||||
beq WriteEnd
|
||||
st1 {v19.h}[0], [x18], x17
|
||||
st1 {v19.h}[0], [x19], x17
|
||||
cmp w10, #4
|
||||
beq WriteEnd
|
||||
st1 {v20.h}[0], [x18], x17
|
||||
st1 {v20.h}[0], [x19], x17
|
||||
cmp w10, #5
|
||||
beq WriteEnd
|
||||
st1 {v21.h}[0], [x18], x17
|
||||
st1 {v21.h}[0], [x19], x17
|
||||
cmp w10, #6
|
||||
beq WriteEnd
|
||||
st1 {v22.h}[0], [x18], x17
|
||||
st1 {v22.h}[0], [x19], x17
|
||||
cmp w10, #7
|
||||
beq WriteEnd
|
||||
st1 {v23.h}[0], [x18], x17
|
||||
st1 {v23.h}[0], [x19], x17
|
||||
cmp w10, #8
|
||||
beq WriteEnd
|
||||
st1 {v24.h}[0], [x18], x17
|
||||
st1 {v24.h}[0], [x19], x17
|
||||
cmp w10, #9
|
||||
beq WriteEnd
|
||||
st1 {v25.h}[0], [x18], x17
|
||||
st1 {v25.h}[0], [x19], x17
|
||||
cmp w10, #10
|
||||
beq WriteEnd
|
||||
st1 {v26.h}[0], [x18], x17
|
||||
st1 {v26.h}[0], [x19], x17
|
||||
cmp w10, #11
|
||||
beq WriteEnd
|
||||
st1 {v27.h}[0], [x18], x17
|
||||
st1 {v27.h}[0], [x19], x17
|
||||
cmp w10, #12
|
||||
beq WriteEnd
|
||||
st1 {v28.h}[0], [x18], x17
|
||||
st1 {v28.h}[0], [x19], x17
|
||||
cmp w10, #13
|
||||
beq WriteEnd
|
||||
st1 {v29.h}[0], [x18], x17
|
||||
st1 {v29.h}[0], [x19], x17
|
||||
cmp w10, #14
|
||||
beq WriteEnd
|
||||
st1 {v30.h}[0], [x18], x17
|
||||
st1 {v30.h}[0], [x19], x17
|
||||
cmp w10, #15
|
||||
beq WriteEnd
|
||||
st1 {v31.h}[0], [x18], x17
|
||||
st1 {v31.h}[0], [x19], x17
|
||||
b WriteEnd
|
||||
Write2:
|
||||
add x13, x18, #2
|
||||
st1 {v16.h}[0], [x18], x17
|
||||
add x13, x19, #2
|
||||
st1 {v16.h}[0], [x19], x17
|
||||
st1 {v16.h}[1], [x13], x17
|
||||
cmp w10, #1
|
||||
beq WriteEnd
|
||||
st1 {v17.h}[0], [x18], x17
|
||||
st1 {v17.h}[0], [x19], x17
|
||||
st1 {v17.h}[1], [x13], x17
|
||||
cmp w10, #2
|
||||
beq WriteEnd
|
||||
st1 {v18.h}[0], [x18], x17
|
||||
st1 {v18.h}[0], [x19], x17
|
||||
st1 {v18.h}[1], [x13], x17
|
||||
cmp w10, #3
|
||||
beq WriteEnd
|
||||
st1 {v19.h}[0], [x18], x17
|
||||
st1 {v19.h}[0], [x19], x17
|
||||
st1 {v19.h}[1], [x13], x17
|
||||
cmp w10, #4
|
||||
beq WriteEnd
|
||||
st1 {v20.h}[0], [x18], x17
|
||||
st1 {v20.h}[0], [x19], x17
|
||||
st1 {v20.h}[1], [x13], x17
|
||||
cmp w10, #5
|
||||
beq WriteEnd
|
||||
st1 {v21.h}[0], [x18], x17
|
||||
st1 {v21.h}[0], [x19], x17
|
||||
st1 {v21.h}[1], [x13], x17
|
||||
cmp w10, #6
|
||||
beq WriteEnd
|
||||
st1 {v22.h}[0], [x18], x17
|
||||
st1 {v22.h}[0], [x19], x17
|
||||
st1 {v22.h}[1], [x13], x17
|
||||
cmp w10, #7
|
||||
beq WriteEnd
|
||||
st1 {v23.h}[0], [x18], x17
|
||||
st1 {v23.h}[0], [x19], x17
|
||||
st1 {v23.h}[1], [x13], x17
|
||||
cmp w10, #8
|
||||
beq WriteEnd
|
||||
st1 {v24.h}[0], [x18], x17
|
||||
st1 {v24.h}[0], [x19], x17
|
||||
st1 {v24.h}[1], [x13], x17
|
||||
cmp w10, #9
|
||||
beq WriteEnd
|
||||
st1 {v25.h}[0], [x18], x17
|
||||
st1 {v25.h}[0], [x19], x17
|
||||
st1 {v25.h}[1], [x13], x17
|
||||
cmp w10, #10
|
||||
beq WriteEnd
|
||||
st1 {v26.h}[0], [x18], x17
|
||||
st1 {v26.h}[0], [x19], x17
|
||||
st1 {v26.h}[1], [x13], x17
|
||||
cmp w10, #11
|
||||
beq WriteEnd
|
||||
st1 {v27.h}[0], [x18], x17
|
||||
st1 {v27.h}[0], [x19], x17
|
||||
st1 {v27.h}[1], [x13], x17
|
||||
cmp w10, #12
|
||||
beq WriteEnd
|
||||
st1 {v28.h}[0], [x18], x17
|
||||
st1 {v28.h}[0], [x19], x17
|
||||
st1 {v28.h}[1], [x13], x17
|
||||
cmp w10, #13
|
||||
beq WriteEnd
|
||||
st1 {v29.h}[0], [x18], x17
|
||||
st1 {v29.h}[0], [x19], x17
|
||||
st1 {v29.h}[1], [x13], x17
|
||||
cmp w10, #14
|
||||
beq WriteEnd
|
||||
st1 {v30.h}[0], [x18], x17
|
||||
st1 {v30.h}[0], [x19], x17
|
||||
st1 {v30.h}[1], [x13], x17
|
||||
cmp w10, #15
|
||||
beq WriteEnd
|
||||
st1 {v31.h}[0], [x18], x17
|
||||
st1 {v31.h}[0], [x19], x17
|
||||
st1 {v31.h}[1], [x13], x17
|
||||
b WriteEnd
|
||||
Write3:
|
||||
add x13, x18, #2
|
||||
add x14, x18, #4
|
||||
st1 {v16.h}[0], [x18], x17
|
||||
add x13, x19, #2
|
||||
add x14, x19, #4
|
||||
st1 {v16.h}[0], [x19], x17
|
||||
st1 {v16.h}[1], [x13], x17
|
||||
st1 {v16.h}[2], [x14], x17
|
||||
cmp w10, #1
|
||||
beq WriteEnd
|
||||
st1 {v17.h}[0], [x18], x17
|
||||
st1 {v17.h}[0], [x19], x17
|
||||
st1 {v17.h}[1], [x13], x17
|
||||
st1 {v17.h}[2], [x14], x17
|
||||
cmp w10, #2
|
||||
beq WriteEnd
|
||||
st1 {v18.h}[0], [x18], x17
|
||||
st1 {v18.h}[0], [x19], x17
|
||||
st1 {v18.h}[1], [x13], x17
|
||||
st1 {v18.h}[2], [x14], x17
|
||||
cmp w10, #3
|
||||
beq WriteEnd
|
||||
st1 {v19.h}[0], [x18], x17
|
||||
st1 {v19.h}[0], [x19], x17
|
||||
st1 {v19.h}[1], [x13], x17
|
||||
st1 {v19.h}[2], [x14], x17
|
||||
cmp w10, #4
|
||||
beq WriteEnd
|
||||
st1 {v20.h}[0], [x18], x17
|
||||
st1 {v20.h}[0], [x19], x17
|
||||
st1 {v20.h}[1], [x13], x17
|
||||
st1 {v20.h}[2], [x14], x17
|
||||
cmp w10, #5
|
||||
beq WriteEnd
|
||||
st1 {v21.h}[0], [x18], x17
|
||||
st1 {v21.h}[0], [x19], x17
|
||||
st1 {v21.h}[1], [x13], x17
|
||||
st1 {v21.h}[2], [x14], x17
|
||||
cmp w10, #6
|
||||
beq WriteEnd
|
||||
st1 {v22.h}[0], [x18], x17
|
||||
st1 {v22.h}[0], [x19], x17
|
||||
st1 {v22.h}[1], [x13], x17
|
||||
st1 {v22.h}[2], [x14], x17
|
||||
cmp w10, #7
|
||||
beq WriteEnd
|
||||
st1 {v23.h}[0], [x18], x17
|
||||
st1 {v23.h}[0], [x19], x17
|
||||
st1 {v23.h}[1], [x13], x17
|
||||
st1 {v23.h}[2], [x14], x17
|
||||
cmp w10, #8
|
||||
beq WriteEnd
|
||||
st1 {v24.h}[0], [x18], x17
|
||||
st1 {v24.h}[0], [x19], x17
|
||||
st1 {v24.h}[1], [x13], x17
|
||||
st1 {v24.h}[2], [x14], x17
|
||||
cmp w10, #9
|
||||
beq WriteEnd
|
||||
st1 {v25.h}[0], [x18], x17
|
||||
st1 {v25.h}[0], [x19], x17
|
||||
st1 {v25.h}[1], [x13], x17
|
||||
st1 {v25.h}[2], [x14], x17
|
||||
cmp w10, #10
|
||||
beq WriteEnd
|
||||
st1 {v26.h}[0], [x18], x17
|
||||
st1 {v26.h}[0], [x19], x17
|
||||
st1 {v26.h}[1], [x13], x17
|
||||
st1 {v26.h}[2], [x14], x17
|
||||
cmp w10, #11
|
||||
beq WriteEnd
|
||||
st1 {v27.h}[0], [x18], x17
|
||||
st1 {v27.h}[0], [x19], x17
|
||||
st1 {v27.h}[1], [x13], x17
|
||||
st1 {v27.h}[2], [x14], x17
|
||||
cmp w10, #12
|
||||
beq WriteEnd
|
||||
st1 {v28.h}[0], [x18], x17
|
||||
st1 {v28.h}[0], [x19], x17
|
||||
st1 {v28.h}[1], [x13], x17
|
||||
st1 {v28.h}[2], [x14], x17
|
||||
cmp w10, #13
|
||||
beq WriteEnd
|
||||
st1 {v29.h}[0], [x18], x17
|
||||
st1 {v29.h}[0], [x19], x17
|
||||
st1 {v29.h}[1], [x13], x17
|
||||
st1 {v29.h}[2], [x14], x17
|
||||
cmp w10, #14
|
||||
beq WriteEnd
|
||||
st1 {v30.h}[0], [x18], x17
|
||||
st1 {v30.h}[0], [x19], x17
|
||||
st1 {v30.h}[1], [x13], x17
|
||||
st1 {v30.h}[2], [x14], x17
|
||||
cmp w10, #15
|
||||
beq WriteEnd
|
||||
st1 {v31.h}[0], [x18], x17
|
||||
st1 {v31.h}[0], [x19], x17
|
||||
st1 {v31.h}[1], [x13], x17
|
||||
st1 {v31.h}[2], [x14], x17
|
||||
b WriteEnd
|
||||
Write4:
|
||||
st1 {v16.4h}, [x18], x17
|
||||
st1 {v16.4h}, [x19], x17
|
||||
cmp w10, #1
|
||||
beq WriteEnd
|
||||
st1 {v17.4h}, [x18], x17
|
||||
st1 {v17.4h}, [x19], x17
|
||||
cmp w10, #2
|
||||
beq WriteEnd
|
||||
st1 {v18.4h}, [x18], x17
|
||||
st1 {v18.4h}, [x19], x17
|
||||
cmp w10, #3
|
||||
beq WriteEnd
|
||||
st1 {v19.4h}, [x18], x17
|
||||
st1 {v19.4h}, [x19], x17
|
||||
cmp w10, #4
|
||||
beq WriteEnd
|
||||
st1 {v20.4h}, [x18], x17
|
||||
st1 {v20.4h}, [x19], x17
|
||||
cmp w10, #5
|
||||
beq WriteEnd
|
||||
st1 {v21.4h}, [x18], x17
|
||||
st1 {v21.4h}, [x19], x17
|
||||
cmp w10, #6
|
||||
beq WriteEnd
|
||||
st1 {v22.4h}, [x18], x17
|
||||
st1 {v22.4h}, [x19], x17
|
||||
cmp w10, #7
|
||||
beq WriteEnd
|
||||
st1 {v23.4h}, [x18], x17
|
||||
st1 {v23.4h}, [x19], x17
|
||||
cmp w10, #8
|
||||
beq WriteEnd
|
||||
st1 {v24.4h}, [x18], x17
|
||||
st1 {v24.4h}, [x19], x17
|
||||
cmp w10, #9
|
||||
beq WriteEnd
|
||||
st1 {v25.4h}, [x18], x17
|
||||
st1 {v25.4h}, [x19], x17
|
||||
cmp w10, #10
|
||||
beq WriteEnd
|
||||
st1 {v26.4h}, [x18], x17
|
||||
st1 {v26.4h}, [x19], x17
|
||||
cmp w10, #11
|
||||
beq WriteEnd
|
||||
st1 {v27.4h}, [x18], x17
|
||||
st1 {v27.4h}, [x19], x17
|
||||
cmp w10, #12
|
||||
beq WriteEnd
|
||||
st1 {v28.4h}, [x18], x17
|
||||
st1 {v28.4h}, [x19], x17
|
||||
cmp w10, #13
|
||||
beq WriteEnd
|
||||
st1 {v29.4h}, [x18], x17
|
||||
st1 {v29.4h}, [x19], x17
|
||||
cmp w10, #14
|
||||
beq WriteEnd
|
||||
st1 {v30.4h}, [x18], x17
|
||||
st1 {v30.4h}, [x19], x17
|
||||
cmp w10, #15
|
||||
beq WriteEnd
|
||||
st1 {v31.4h}, [x18], x17
|
||||
st1 {v31.4h}, [x19], x17
|
||||
b WriteEnd
|
||||
Write5:
|
||||
add x13, x18, #8
|
||||
st1 {v16.4h}, [x18], x17
|
||||
add x13, x19, #8
|
||||
st1 {v16.4h}, [x19], x17
|
||||
st1 {v16.h}[4], [x13], x17
|
||||
cmp w10, #1
|
||||
beq WriteEnd
|
||||
st1 {v17.4h}, [x18], x17
|
||||
st1 {v17.4h}, [x19], x17
|
||||
st1 {v17.h}[4], [x13], x17
|
||||
cmp w10, #2
|
||||
beq WriteEnd
|
||||
st1 {v18.4h}, [x18], x17
|
||||
st1 {v18.4h}, [x19], x17
|
||||
st1 {v18.h}[4], [x13], x17
|
||||
cmp w10, #3
|
||||
beq WriteEnd
|
||||
st1 {v19.4h}, [x18], x17
|
||||
st1 {v19.4h}, [x19], x17
|
||||
st1 {v19.h}[4], [x13], x17
|
||||
cmp w10, #4
|
||||
beq WriteEnd
|
||||
st1 {v20.4h}, [x18], x17
|
||||
st1 {v20.4h}, [x19], x17
|
||||
st1 {v20.h}[4], [x13], x17
|
||||
cmp w10, #5
|
||||
beq WriteEnd
|
||||
st1 {v21.4h}, [x18], x17
|
||||
st1 {v21.4h}, [x19], x17
|
||||
st1 {v21.h}[4], [x13], x17
|
||||
cmp w10, #6
|
||||
beq WriteEnd
|
||||
st1 {v22.4h}, [x18], x17
|
||||
st1 {v22.4h}, [x19], x17
|
||||
st1 {v22.h}[4], [x13], x17
|
||||
cmp w10, #7
|
||||
beq WriteEnd
|
||||
st1 {v23.4h}, [x18], x17
|
||||
st1 {v23.4h}, [x19], x17
|
||||
st1 {v23.h}[4], [x13], x17
|
||||
cmp w10, #8
|
||||
beq WriteEnd
|
||||
st1 {v24.4h}, [x18], x17
|
||||
st1 {v24.4h}, [x19], x17
|
||||
st1 {v24.h}[4], [x13], x17
|
||||
cmp w10, #9
|
||||
beq WriteEnd
|
||||
st1 {v25.4h}, [x18], x17
|
||||
st1 {v25.4h}, [x19], x17
|
||||
st1 {v25.h}[4], [x13], x17
|
||||
cmp w10, #10
|
||||
beq WriteEnd
|
||||
st1 {v26.4h}, [x18], x17
|
||||
st1 {v26.4h}, [x19], x17
|
||||
st1 {v26.h}[4], [x13], x17
|
||||
cmp w10, #11
|
||||
beq WriteEnd
|
||||
st1 {v27.4h}, [x18], x17
|
||||
st1 {v27.4h}, [x19], x17
|
||||
st1 {v27.h}[4], [x13], x17
|
||||
cmp w10, #12
|
||||
beq WriteEnd
|
||||
st1 {v28.4h}, [x18], x17
|
||||
st1 {v28.4h}, [x19], x17
|
||||
st1 {v28.h}[4], [x13], x17
|
||||
cmp w10, #13
|
||||
beq WriteEnd
|
||||
st1 {v29.4h}, [x18], x17
|
||||
st1 {v29.4h}, [x19], x17
|
||||
st1 {v29.h}[4], [x13], x17
|
||||
cmp w10, #14
|
||||
beq WriteEnd
|
||||
st1 {v30.4h}, [x18], x17
|
||||
st1 {v30.4h}, [x19], x17
|
||||
st1 {v30.h}[4], [x13], x17
|
||||
cmp w10, #15
|
||||
beq WriteEnd
|
||||
st1 {v31.4h}, [x18], x17
|
||||
st1 {v31.4h}, [x19], x17
|
||||
st1 {v31.h}[4], [x13], x17
|
||||
b WriteEnd
|
||||
Write6:
|
||||
add x13, x18, #8
|
||||
add x14, x18, #10
|
||||
st1 {v16.4h}, [x18], x17
|
||||
add x13, x19, #8
|
||||
add x14, x19, #10
|
||||
st1 {v16.4h}, [x19], x17
|
||||
st1 {v16.h}[4], [x13], x17
|
||||
st1 {v16.h}[5], [x14], x17
|
||||
cmp w10, #1
|
||||
beq WriteEnd
|
||||
st1 {v17.4h}, [x18], x17
|
||||
st1 {v17.4h}, [x19], x17
|
||||
st1 {v17.h}[4], [x13], x17
|
||||
st1 {v17.h}[5], [x14], x17
|
||||
cmp w10, #2
|
||||
beq WriteEnd
|
||||
st1 {v18.4h}, [x18], x17
|
||||
st1 {v18.4h}, [x19], x17
|
||||
st1 {v18.h}[4], [x13], x17
|
||||
st1 {v18.h}[5], [x14], x17
|
||||
cmp w10, #3
|
||||
beq WriteEnd
|
||||
st1 {v19.4h}, [x18], x17
|
||||
st1 {v19.4h}, [x19], x17
|
||||
st1 {v19.h}[4], [x13], x17
|
||||
st1 {v19.h}[5], [x14], x17
|
||||
cmp w10, #4
|
||||
beq WriteEnd
|
||||
st1 {v20.4h}, [x18], x17
|
||||
st1 {v20.4h}, [x19], x17
|
||||
st1 {v20.h}[4], [x13], x17
|
||||
st1 {v20.h}[5], [x14], x17
|
||||
cmp w10, #5
|
||||
beq WriteEnd
|
||||
st1 {v21.4h}, [x18], x17
|
||||
st1 {v21.4h}, [x19], x17
|
||||
st1 {v21.h}[4], [x13], x17
|
||||
st1 {v21.h}[5], [x14], x17
|
||||
cmp w10, #6
|
||||
beq WriteEnd
|
||||
st1 {v22.4h}, [x18], x17
|
||||
st1 {v22.4h}, [x19], x17
|
||||
st1 {v22.h}[4], [x13], x17
|
||||
st1 {v22.h}[5], [x14], x17
|
||||
cmp w10, #7
|
||||
beq WriteEnd
|
||||
st1 {v23.4h}, [x18], x17
|
||||
st1 {v23.4h}, [x19], x17
|
||||
st1 {v23.h}[4], [x13], x17
|
||||
st1 {v23.h}[5], [x14], x17
|
||||
cmp w10, #8
|
||||
beq WriteEnd
|
||||
st1 {v24.4h}, [x18], x17
|
||||
st1 {v24.4h}, [x19], x17
|
||||
st1 {v24.h}[4], [x13], x17
|
||||
st1 {v24.h}[5], [x14], x17
|
||||
cmp w10, #9
|
||||
beq WriteEnd
|
||||
st1 {v25.4h}, [x18], x17
|
||||
st1 {v25.4h}, [x19], x17
|
||||
st1 {v25.h}[4], [x13], x17
|
||||
st1 {v25.h}[5], [x14], x17
|
||||
cmp w10, #10
|
||||
beq WriteEnd
|
||||
st1 {v26.4h}, [x18], x17
|
||||
st1 {v26.4h}, [x19], x17
|
||||
st1 {v26.h}[4], [x13], x17
|
||||
st1 {v26.h}[5], [x14], x17
|
||||
cmp w10, #11
|
||||
beq WriteEnd
|
||||
st1 {v27.4h}, [x18], x17
|
||||
st1 {v27.4h}, [x19], x17
|
||||
st1 {v27.h}[4], [x13], x17
|
||||
st1 {v27.h}[5], [x14], x17
|
||||
cmp w10, #12
|
||||
beq WriteEnd
|
||||
st1 {v28.4h}, [x18], x17
|
||||
st1 {v28.4h}, [x19], x17
|
||||
st1 {v28.h}[4], [x13], x17
|
||||
st1 {v28.h}[5], [x14], x17
|
||||
cmp w10, #13
|
||||
beq WriteEnd
|
||||
st1 {v29.4h}, [x18], x17
|
||||
st1 {v29.4h}, [x19], x17
|
||||
st1 {v29.h}[4], [x13], x17
|
||||
st1 {v29.h}[5], [x14], x17
|
||||
cmp w10, #14
|
||||
beq WriteEnd
|
||||
st1 {v30.4h}, [x18], x17
|
||||
st1 {v30.4h}, [x19], x17
|
||||
st1 {v30.h}[4], [x13], x17
|
||||
st1 {v30.h}[5], [x14], x17
|
||||
cmp w10, #15
|
||||
beq WriteEnd
|
||||
st1 {v31.4h}, [x18], x17
|
||||
st1 {v31.4h}, [x19], x17
|
||||
st1 {v31.h}[4], [x13], x17
|
||||
st1 {v31.h}[5], [x14], x17
|
||||
b WriteEnd
|
||||
Write7:
|
||||
add x13, x18, #8
|
||||
add x14, x18, #10
|
||||
add x16, x18, #12
|
||||
st1 {v16.4h}, [x18], x17
|
||||
add x13, x19, #8
|
||||
add x14, x19, #10
|
||||
add x16, x19, #12
|
||||
st1 {v16.4h}, [x19], x17
|
||||
st1 {v16.h}[4], [x13], x17
|
||||
st1 {v16.h}[5], [x14], x17
|
||||
st1 {v16.h}[6], [x16], x17
|
||||
cmp w10, #1
|
||||
beq WriteEnd
|
||||
st1 {v17.4h}, [x18], x17
|
||||
st1 {v17.4h}, [x19], x17
|
||||
st1 {v17.h}[4], [x13], x17
|
||||
st1 {v17.h}[5], [x14], x17
|
||||
st1 {v17.h}[6], [x16], x17
|
||||
cmp w10, #2
|
||||
beq WriteEnd
|
||||
st1 {v18.4h}, [x18], x17
|
||||
st1 {v18.4h}, [x19], x17
|
||||
st1 {v18.h}[4], [x13], x17
|
||||
st1 {v18.h}[5], [x14], x17
|
||||
st1 {v18.h}[6], [x16], x17
|
||||
cmp w10, #3
|
||||
beq WriteEnd
|
||||
st1 {v19.4h}, [x18], x17
|
||||
st1 {v19.4h}, [x19], x17
|
||||
st1 {v19.h}[4], [x13], x17
|
||||
st1 {v19.h}[5], [x14], x17
|
||||
st1 {v19.h}[6], [x16], x17
|
||||
cmp w10, #4
|
||||
beq WriteEnd
|
||||
st1 {v20.4h}, [x18], x17
|
||||
st1 {v20.4h}, [x19], x17
|
||||
st1 {v20.h}[4], [x13], x17
|
||||
st1 {v20.h}[5], [x14], x17
|
||||
st1 {v20.h}[6], [x16], x17
|
||||
cmp w10, #5
|
||||
beq WriteEnd
|
||||
st1 {v21.4h}, [x18], x17
|
||||
st1 {v21.4h}, [x19], x17
|
||||
st1 {v21.h}[4], [x13], x17
|
||||
st1 {v21.h}[5], [x14], x17
|
||||
st1 {v21.h}[6], [x16], x17
|
||||
cmp w10, #6
|
||||
beq WriteEnd
|
||||
st1 {v22.4h}, [x18], x17
|
||||
st1 {v22.4h}, [x19], x17
|
||||
st1 {v22.h}[4], [x13], x17
|
||||
st1 {v22.h}[5], [x14], x17
|
||||
st1 {v22.h}[6], [x16], x17
|
||||
cmp w10, #7
|
||||
beq WriteEnd
|
||||
st1 {v23.4h}, [x18], x17
|
||||
st1 {v23.4h}, [x19], x17
|
||||
st1 {v23.h}[4], [x13], x17
|
||||
st1 {v23.h}[5], [x14], x17
|
||||
st1 {v23.h}[6], [x16], x17
|
||||
cmp w10, #8
|
||||
beq WriteEnd
|
||||
st1 {v24.4h}, [x18], x17
|
||||
st1 {v24.4h}, [x19], x17
|
||||
st1 {v24.h}[4], [x13], x17
|
||||
st1 {v24.h}[5], [x14], x17
|
||||
st1 {v24.h}[6], [x16], x17
|
||||
cmp w10, #9
|
||||
beq WriteEnd
|
||||
st1 {v25.4h}, [x18], x17
|
||||
st1 {v25.4h}, [x19], x17
|
||||
st1 {v25.h}[4], [x13], x17
|
||||
st1 {v25.h}[5], [x14], x17
|
||||
st1 {v25.h}[6], [x16], x17
|
||||
cmp w10, #10
|
||||
beq WriteEnd
|
||||
st1 {v26.4h}, [x18], x17
|
||||
st1 {v26.4h}, [x19], x17
|
||||
st1 {v26.h}[4], [x13], x17
|
||||
st1 {v26.h}[5], [x14], x17
|
||||
st1 {v26.h}[6], [x16], x17
|
||||
cmp w10, #11
|
||||
beq WriteEnd
|
||||
st1 {v27.4h}, [x18], x17
|
||||
st1 {v27.4h}, [x19], x17
|
||||
st1 {v27.h}[4], [x13], x17
|
||||
st1 {v27.h}[5], [x14], x17
|
||||
st1 {v27.h}[6], [x16], x17
|
||||
cmp w10, #12
|
||||
beq WriteEnd
|
||||
st1 {v28.4h}, [x18], x17
|
||||
st1 {v28.4h}, [x19], x17
|
||||
st1 {v28.h}[4], [x13], x17
|
||||
st1 {v28.h}[5], [x14], x17
|
||||
st1 {v28.h}[6], [x16], x17
|
||||
cmp w10, #13
|
||||
beq WriteEnd
|
||||
st1 {v29.4h}, [x18], x17
|
||||
st1 {v29.4h}, [x19], x17
|
||||
st1 {v29.h}[4], [x13], x17
|
||||
st1 {v29.h}[5], [x14], x17
|
||||
st1 {v29.h}[6], [x16], x17
|
||||
cmp w10, #14
|
||||
beq WriteEnd
|
||||
st1 {v30.4h}, [x18], x17
|
||||
st1 {v30.4h}, [x19], x17
|
||||
st1 {v30.h}[4], [x13], x17
|
||||
st1 {v30.h}[5], [x14], x17
|
||||
st1 {v30.h}[6], [x16], x17
|
||||
cmp w10, #15
|
||||
beq WriteEnd
|
||||
st1 {v31.4h}, [x18], x17
|
||||
st1 {v31.4h}, [x19], x17
|
||||
st1 {v31.h}[4], [x13], x17
|
||||
st1 {v31.h}[5], [x14], x17
|
||||
st1 {v31.h}[6], [x16], x17
|
||||
|
@ -809,52 +810,52 @@ WriteC8:
|
|||
st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x2], #64
|
||||
b WriteEnd
|
||||
Write8:
|
||||
st1 {v16.8h}, [x18], x17
|
||||
st1 {v16.8h}, [x19], x17
|
||||
cmp w10, #1
|
||||
beq WriteEnd
|
||||
st1 {v17.8h}, [x18], x17
|
||||
st1 {v17.8h}, [x19], x17
|
||||
cmp w10, #2
|
||||
beq WriteEnd
|
||||
st1 {v18.8h}, [x18], x17
|
||||
st1 {v18.8h}, [x19], x17
|
||||
cmp w10, #3
|
||||
beq WriteEnd
|
||||
st1 {v19.8h}, [x18], x17
|
||||
st1 {v19.8h}, [x19], x17
|
||||
cmp w10, #4
|
||||
beq WriteEnd
|
||||
st1 {v20.8h}, [x18], x17
|
||||
st1 {v20.8h}, [x19], x17
|
||||
cmp w10, #5
|
||||
beq WriteEnd
|
||||
st1 {v21.8h}, [x18], x17
|
||||
st1 {v21.8h}, [x19], x17
|
||||
cmp w10, #6
|
||||
beq WriteEnd
|
||||
st1 {v22.8h}, [x18], x17
|
||||
st1 {v22.8h}, [x19], x17
|
||||
cmp w10, #7
|
||||
beq WriteEnd
|
||||
st1 {v23.8h}, [x18], x17
|
||||
st1 {v23.8h}, [x19], x17
|
||||
cmp w10, #8
|
||||
beq WriteEnd
|
||||
st1 {v24.8h}, [x18], x17
|
||||
st1 {v24.8h}, [x19], x17
|
||||
cmp w10, #9
|
||||
beq WriteEnd
|
||||
st1 {v25.8h}, [x18], x17
|
||||
st1 {v25.8h}, [x19], x17
|
||||
cmp w10, #10
|
||||
beq WriteEnd
|
||||
st1 {v26.8h}, [x18], x17
|
||||
st1 {v26.8h}, [x19], x17
|
||||
cmp w10, #11
|
||||
beq WriteEnd
|
||||
st1 {v27.8h}, [x18], x17
|
||||
st1 {v27.8h}, [x19], x17
|
||||
cmp w10, #12
|
||||
beq WriteEnd
|
||||
st1 {v28.8h}, [x18], x17
|
||||
st1 {v28.8h}, [x19], x17
|
||||
cmp w10, #13
|
||||
beq WriteEnd
|
||||
st1 {v29.8h}, [x18], x17
|
||||
st1 {v29.8h}, [x19], x17
|
||||
cmp w10, #14
|
||||
beq WriteEnd
|
||||
st1 {v30.8h}, [x18], x17
|
||||
st1 {v30.8h}, [x19], x17
|
||||
cmp w10, #15
|
||||
beq WriteEnd
|
||||
st1 {v31.8h}, [x18], x17
|
||||
st1 {v31.8h}, [x19], x17
|
||||
|
||||
WriteEnd:
|
||||
subs w10, w10, #16 // lhs row - 16 (the store ladders above write up to 16 rows, v16-v31)
|
||||
|
@ -871,8 +872,9 @@ NoDstStep:
|
|||
bgt L1
|
||||
|
||||
End1:
|
||||
sub sp, sp, #128
|
||||
sub sp, sp, #144
|
||||
ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
|
||||
ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
|
||||
ldp x19, x20, [sp], #16
|
||||
ret
|
||||
#endif
|
||||
|
|
|
@ -21,30 +21,31 @@
|
|||
// x9: writeMode
|
||||
|
||||
asm_function MatmulFp16Neon64Opt
|
||||
sub sp, sp, #80
|
||||
sub sp, sp, #96
|
||||
st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
|
||||
stp x19, x20, [sp], #16
|
||||
stp x21, x22, [sp], #16
|
||||
|
||||
ldr x8, [sp]
|
||||
ldr x9, [sp, #8]
|
||||
|
||||
mov x18, #32 // sizeof(float16_t) * 16
|
||||
mul x17, x5, x18 // block stride of lhs/rhs: sizeof(float16_t) * 16 * depth
|
||||
mov x21, #32 // sizeof(float16_t) * 16
|
||||
mul x17, x5, x21 // block stride of lhs/rhs: sizeof(float16_t) * 16 * depth
|
||||
cbnz x9, NoC8Steps
|
||||
mov x11, x2
|
||||
mov x18, #16
|
||||
mul x16, x6, x18 // row * 8 * sizeof(float16_t)
|
||||
mov x21, #16
|
||||
mul x16, x6, x21 // row * 8 * sizeof(float16_t)
|
||||
NoC8Steps:
|
||||
cmp x9, #2
|
||||
bne NoWinoSteps
|
||||
mov x18, #2
|
||||
mov x21, #2
|
||||
mul x15, x7, x8
|
||||
mul x15, x15, x18 // kernel_size * col *sizeof(float16_t)
|
||||
mov x18, #16
|
||||
mul x16, x8, x18 // kernel_size * 8 * sizeof(float16_t)
|
||||
mul x15, x15, x21 // kernel_size * col *sizeof(float16_t)
|
||||
mov x21, #16
|
||||
mul x16, x8, x21 // kernel_size * 8 * sizeof(float16_t)
|
||||
NoWinoSteps:
|
||||
mov x18, #2
|
||||
mul x8, x8, x18
|
||||
mov x21, #2
|
||||
mul x8, x8, x21
|
||||
|
||||
LoopRowStart:
|
||||
cmp x6, #1
|
||||
|
@ -1221,9 +1222,9 @@ LoopRow:
|
|||
LoopColEnd:
|
||||
add x0, x0, x17
|
||||
cbz x9, C8DstStep
|
||||
mov x18, #2
|
||||
mul x18, x18, x7
|
||||
sub x11, x11, x18
|
||||
mov x21, #2
|
||||
mul x21, x21, x7
|
||||
sub x11, x11, x21
|
||||
mov x2, x11
|
||||
b NoDstStep
|
||||
C8DstStep:
|
||||
|
@ -1233,8 +1234,9 @@ LoopColEnd:
|
|||
subs x6, x6, #16
|
||||
bgt LoopRowStart
|
||||
|
||||
sub sp, sp, #80
|
||||
sub sp, sp, #96
|
||||
ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
|
||||
ldp x19, x20, [sp], #16
|
||||
ldp x21, x22, [sp], #16
|
||||
ret
|
||||
#endif
|
||||
|
|
|
@ -31,13 +31,13 @@ asm_function MatrixMultiplyWinogradFp16
|
|||
mov x14, x1 // mat_b
|
||||
LoopN:
|
||||
mov x16, x0 // mat_a_m
|
||||
sub x18, x5, x15 // ni
|
||||
// NOTE(review): x22 is in the AAPCS64 callee-saved range (x19-x28) and this function's prologue
// is outside the visible hunk - confirm x22 is saved and restored there.
sub x22, x5, x15 // ni
|
||||
sub x19, x17, x3 // mi
|
||||
mul x18, x18, x17 // ni * m
|
||||
mul x22, x22, x17 // ni * m
|
||||
mov x11, x6 // in_channel
|
||||
add x18, x18, x19 // (ni * m) + mi
|
||||
mul x18, x18, x13 // x18 * channel_in * 2
|
||||
add x20, x2, x18 // dst + offset
|
||||
add x22, x22, x19 // (ni * m) + mi
|
||||
mul x22, x22, x13 // x22 * channel_in * 2
|
||||
add x20, x2, x22 // dst + offset
|
||||
cmp x11, #32
|
||||
bge LoopC32
|
||||
cmp x11, #16
|
||||
|
|
|
@ -9,8 +9,8 @@
|
|||
|
||||
asm_function WinogradTransLeftFp16
|
||||
|
||||
sub sp, sp, #32
|
||||
stp x19, x20, [sp], #32
|
||||
sub sp, sp, #16
|
||||
stp x19, x20, [sp], #16
|
||||
|
||||
mov x8, #8 // 4 * sizeof(float16)
|
||||
mul x8, x6, x8
|
||||
|
@ -46,16 +46,16 @@ LoopH:
|
|||
ld1 {v0.h}[2], [x17], x10
|
||||
ld1 {v0.h}[3], [x17], x10
|
||||
mov x11, x6
|
||||
mov x18, x17
|
||||
add x18, x14, x7
|
||||
add x16, x18, x7
|
||||
mov x20, x17
|
||||
add x20, x14, x7
|
||||
add x16, x20, x7
|
||||
add x19, x16, x7
|
||||
|
||||
LoopLength4:
|
||||
ld1 {v16.4h}, [x2]
|
||||
ld1 {v20.4h}, [x14], #8
|
||||
fmla v16.4h, v20.4h, v0.h[0]
|
||||
ld1 {v21.4h}, [x18], #8
|
||||
ld1 {v21.4h}, [x20], #8
|
||||
fmul v17.4h, v21.4h, v0.h[1]
|
||||
ld1 {v20.4h}, [x16], #8
|
||||
fmla v16.4h, v20.4h, v0.h[2]
|
||||
|
@ -81,14 +81,14 @@ LoopH:
|
|||
ld1 {v0.h}[1], [x17], x10
|
||||
ld1 {v0.h}[2], [x17], x10
|
||||
mov x11, x6
|
||||
mov x18, x17
|
||||
add x18, x14, x7
|
||||
add x16, x18, x7
|
||||
mov x20, x17
|
||||
add x20, x14, x7
|
||||
add x16, x20, x7
|
||||
LoopLength3:
|
||||
ld1 {v16.4h}, [x2]
|
||||
ld1 {v20.4h}, [x14], #8
|
||||
fmla v16.4h, v20.4h, v0.h[0]
|
||||
ld1 {v21.4h}, [x18], #8
|
||||
ld1 {v21.4h}, [x20], #8
|
||||
fmul v17.4h, v21.4h, v0.h[1]
|
||||
ld1 {v20.4h}, [x16], #8
|
||||
fmla v16.4h, v20.4h, v0.h[2]
|
||||
|
@ -132,6 +132,6 @@ LoopH:
|
|||
subs x4, x4, #1
|
||||
bne LoopH
|
||||
|
||||
sub sp, sp, #32
|
||||
ldp x19, x20, [sp], #32
|
||||
sub sp, sp, #16
|
||||
ldp x19, x20, [sp], #16
|
||||
ret
|
||||
|
|
|
@ -9,6 +9,9 @@
|
|||
|
||||
asm_function WinogradTransRightFp16
|
||||
|
||||
// Save callee-saved x19/x20 (x19 replaces x18 as a scratch pointer in this routine).
// NOTE(review): the post-indexed stp adds the 16 bytes back, so sp returns to its entry value
// and the saved pair sits below sp. AAPCS64 defines no red zone (Apple allows 128 bytes), so
// confirm nothing asynchronous (e.g. a signal handler) can overwrite it before the matching ldp.
sub sp, sp, #16
|
||||
stp x19, x20, [sp], #16
|
||||
|
||||
mov x8, #8 // 4 * sizeof(float16)
|
||||
mul x8, x6, x8
|
||||
mul x9, x5, x8 // step for S
|
||||
|
@ -34,7 +37,7 @@ LoopH:
|
|||
cmp x12, #4
|
||||
blt LoopKStart3
|
||||
mov x16, x15
|
||||
mov x18, x4
|
||||
mov x19, x4
|
||||
LoopK4:
|
||||
ld1 {v0.h}[0], [x13], x10
|
||||
ld1 {v0.h}[1], [x13], x10
|
||||
|
@ -45,7 +48,7 @@ LoopH:
|
|||
|
||||
add x14, x17, x8
|
||||
add x16, x14, x8
|
||||
add x18, x16, x8
|
||||
add x19, x16, x8
|
||||
|
||||
LoopLength4:
|
||||
ld1 {v16.4h}, [x2]
|
||||
|
@ -55,7 +58,7 @@ LoopH:
|
|||
fmul v17.4h, v21.4h, v0.h[1]
|
||||
ld1 {v20.4h}, [x16], #8
|
||||
fmla v16.4h, v20.4h, v0.h[2]
|
||||
ld1 {v21.4h}, [x18], #8
|
||||
ld1 {v21.4h}, [x19], #8
|
||||
fmla v17.4h, v21.4h, v0.h[3]
|
||||
|
||||
fadd v17.4h, v16.4h, v17.4h
|
||||
|
@ -64,7 +67,7 @@ LoopH:
|
|||
bne LoopLength4
|
||||
sub x2, x2, x8
|
||||
sub x12, x12, #4
|
||||
mov x17, x18
|
||||
mov x17, x19
|
||||
|
||||
cmp x12, #4
|
||||
bge LoopK4
|
||||
|
@ -98,7 +101,7 @@ LoopH:
|
|||
bne LoopLength3
|
||||
sub x2, x2, x8
|
||||
sub x12, x12, #3
|
||||
mov x17, x18
|
||||
mov x17, x19
|
||||
cmp x12, #3
|
||||
bge LoopK3
|
||||
|
||||
|
@ -132,4 +135,7 @@ LoopH:
|
|||
subs x4, x4, #1
|
||||
bne LoopH
|
||||
|
||||
sub sp, sp, #16
|
||||
ldp x19, x20, [sp], #16
|
||||
|
||||
ret
|
|
@ -66,7 +66,7 @@ L2:
|
|||
cmp w16, #0
|
||||
beq End2
|
||||
|
||||
mov x18, x1 // reload b ptr
|
||||
mov x28, x1 // reload b ptr
|
||||
mov x19, x7 // reload bias ptr
|
||||
mov w20, w5 // reload depth
|
||||
dup v16.4s, wzr
|
||||
|
@ -91,7 +91,7 @@ L3:
|
|||
|
||||
LoopD16:
|
||||
ld1 {v0.16b, v1.16b}, [x17], #32
|
||||
ld1 {v2.16b, v3.16b}, [x18], #32
|
||||
ld1 {v2.16b, v3.16b}, [x28], #32
|
||||
|
||||
sdot v16.4s, v2.16b, v0.4b[0]
|
||||
sdot v18.4s, v2.16b, v0.4b[1]
|
||||
|
@ -104,7 +104,7 @@ LoopD16:
|
|||
sdot v28.4s, v2.16b, v1.4b[2]
|
||||
sdot v30.4s, v2.16b, v1.4b[3]
|
||||
|
||||
ld1 {v6.16b, v7.16b}, [x18], #32
|
||||
ld1 {v6.16b, v7.16b}, [x28], #32
|
||||
sdot v17.4s, v3.16b, v0.4b[0]
|
||||
sdot v19.4s, v3.16b, v0.4b[1]
|
||||
sdot v21.4s, v3.16b, v0.4b[2]
|
||||
|
@ -126,7 +126,7 @@ LoopD16:
|
|||
sdot v28.4s, v6.16b, v5.4b[2]
|
||||
sdot v30.4s, v6.16b, v5.4b[3]
|
||||
|
||||
ld1 {v10.16b, v11.16b}, [x18], #32
|
||||
ld1 {v10.16b, v11.16b}, [x28], #32
|
||||
sdot v17.4s, v7.16b, v4.4b[0]
|
||||
sdot v19.4s, v7.16b, v4.4b[1]
|
||||
sdot v21.4s, v7.16b, v4.4b[2]
|
||||
|
@ -148,7 +148,7 @@ LoopD16:
|
|||
sdot v28.4s, v10.16b, v9.4b[2]
|
||||
sdot v30.4s, v10.16b, v9.4b[3]
|
||||
|
||||
ld1 {v14.16b, v15.16b}, [x18], #32
|
||||
ld1 {v14.16b, v15.16b}, [x28], #32
|
||||
sdot v17.4s, v11.16b, v8.4b[0]
|
||||
sdot v19.4s, v11.16b, v8.4b[1]
|
||||
sdot v21.4s, v11.16b, v8.4b[2]
|
||||
|
@ -187,7 +187,7 @@ LoopD4:
|
|||
beq End3
|
||||
|
||||
ld1 {v0.16b, v1.16b}, [x17], #32
|
||||
ld1 {v2.16b, v3.16b}, [x18], #32
|
||||
ld1 {v2.16b, v3.16b}, [x28], #32
|
||||
|
||||
sdot v16.4s, v2.16b, v0.4b[0]
|
||||
sdot v18.4s, v2.16b, v0.4b[1]
|
||||
|
|
|
@ -30,7 +30,7 @@
|
|||
// x28: filter_zp
|
||||
|
||||
asm_function MatmulInt8DpOpt
|
||||
sub sp, sp, #208
|
||||
sub sp, sp, #224
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
|
||||
stp x19, x20, [sp], #16
|
||||
|
@ -38,6 +38,7 @@ asm_function MatmulInt8DpOpt
|
|||
stp x23, x24, [sp], #16
|
||||
stp x25, x26, [sp], #16
|
||||
stp x27, x28, [sp], #16
|
||||
stp x29, x30, [sp], #16
|
||||
|
||||
ldr w8, [sp]
|
||||
ldr w9, [sp, #8]
|
||||
|
@ -56,7 +57,7 @@ asm_function MatmulInt8DpOpt
|
|||
LoopRow:
|
||||
mov x16, x1 // reload rhs ptr
|
||||
mov x17, x4 // reload rhs col
|
||||
mov x18, x7 // reload bias ptr
|
||||
// NOTE(review): x29 is the AAPCS64 frame pointer; Apple's arm64 ABI expects it to always address
// a valid frame record, so using it as the bias pointer may break unwinding on the platforms
// this change targets - confirm, or pick another saved register.
mov x29, x7 // reload bias ptr
|
||||
mov x25, x6 // reload input_sum ptr
|
||||
mov x27, x2 // reload dst ptr
|
||||
ldr x28, [sp, #64] // reload filter_zp
|
||||
|
@ -113,7 +114,7 @@ LoopRow:
|
|||
|
||||
Bias:
|
||||
cbz x7, NoReadBias
|
||||
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x18], #64
|
||||
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x29], #64
|
||||
add v16.4s, v16.4s, v0.4s
|
||||
add v17.4s, v17.4s, v1.4s
|
||||
add v18.4s, v18.4s, v2.4s
|
||||
|
@ -423,8 +424,8 @@ LoopRow:
|
|||
|
||||
BiasHalf:
|
||||
cbz x7, NoReadBiasHalf
|
||||
ld1 {v0.4s, v1.4s}, [x18]
|
||||
add x18, x18, #64
|
||||
ld1 {v0.4s, v1.4s}, [x29]
|
||||
add x29, x29, #64
|
||||
add v16.4s, v16.4s, v0.4s
|
||||
add v17.4s, v17.4s, v1.4s
|
||||
add v20.4s, v20.4s, v0.4s
|
||||
|
@ -612,8 +613,8 @@ LoopRow:
|
|||
|
||||
BiasQuarter:
|
||||
cbz x7, NoReadBiasQuarter
|
||||
ld1 {v0.4s}, [x18]
|
||||
add x18, x18, #64
|
||||
ld1 {v0.4s}, [x29]
|
||||
add x29, x29, #64
|
||||
add v16.4s, v16.4s, v0.4s
|
||||
add v20.4s, v20.4s, v0.4s
|
||||
add v24.4s, v24.4s, v0.4s
|
||||
|
@ -1072,7 +1073,7 @@ LoopColEnd:
|
|||
b LoopRow
|
||||
|
||||
LoopRowEnd:
|
||||
sub sp, sp, #208
|
||||
sub sp, sp, #224
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
|
||||
ldp x19, x20, [sp], #16
|
||||
|
@ -1080,5 +1081,6 @@ LoopRowEnd:
|
|||
ldp x23, x24, [sp], #16
|
||||
ldp x25, x26, [sp], #16
|
||||
ldp x27, x28, [sp], #16
|
||||
ldp x29, x30, [sp], #16
|
||||
ret
|
||||
#endif
|
||||
|
|
|
@ -20,9 +20,10 @@
|
|||
// x7: bias
|
||||
|
||||
asm_function MatMulOptR4Int8Neon64
|
||||
sub sp, sp, #128
|
||||
sub sp, sp, #144
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
|
||||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
|
||||
stp x19, x20, [sp], #16
|
||||
|
||||
mov w15, #0 // b col index
|
||||
mov w16, #0 // a row index
|
||||
|
@ -40,7 +41,7 @@ L2:
|
|||
cmp w16, w3
|
||||
beq End2
|
||||
|
||||
mov x18, x1 // reload b ptr
|
||||
mov x19, x1 // reload b ptr
|
||||
mov x10, x7 // reload bias ptr
|
||||
mov w11, w5 // reload depth
|
||||
dup v16.4s, wzr
|
||||
|
@ -67,10 +68,10 @@ L3:
|
|||
ld1 {v1.16b}, [x17], #16
|
||||
ld1 {v2.16b}, [x17], #16
|
||||
ld1 {v3.16b}, [x17], #16
|
||||
ld1 {v4.16b}, [x18], #16
|
||||
ld1 {v5.16b}, [x18], #16
|
||||
ld1 {v6.16b}, [x18], #16
|
||||
ld1 {v7.16b}, [x18], #16
|
||||
ld1 {v4.16b}, [x19], #16
|
||||
ld1 {v5.16b}, [x19], #16
|
||||
ld1 {v6.16b}, [x19], #16
|
||||
ld1 {v7.16b}, [x19], #16
|
||||
|
||||
sdot v16.4s, v4.16b, v0.16b
|
||||
sdot v17.4s, v5.16b, v0.16b
|
||||
|
@ -135,8 +136,9 @@ End2:
|
|||
b L1
|
||||
|
||||
End1:
|
||||
sub sp, sp, #128
|
||||
sub sp, sp, #144
|
||||
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
|
||||
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
|
||||
ldp x19, x20, [sp], #16
|
||||
ret
|
||||
#endif
|
||||
|
|
Loading…
Reference in New Issue