Skip to content

Commit 3b1aef1

Browse files
committed
Use LMUL2 loads in main block.
1 parent daa3215 commit 3b1aef1

File tree

2 files changed

+14
-10
lines changed

2 files changed

+14
-10
lines changed

kernel/riscv64/dgemm_kernel_8x8_zvl256b.c

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1653,11 +1653,12 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT* A, FLOAT* B, F
16531653
FLOAT B7 = B[7];
16541654
B += 8;
16551655

1656-
// LMUL = 2 does worst here
1657-
vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[0*4], 4 );
1658-
vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[1*4], 4 );
1656+
vfloat64m2_t A00 = __riscv_vle64_v_f64m2( A, 8 );
1657+
vfloat64m1_t A0 = __riscv_vget_v_f64m2_f64m1(A00, 0);
1658+
vfloat64m1_t A1 = __riscv_vget_v_f64m2_f64m1(A00, 1);
16591659
A += 8;
16601660

1661+
// LMUL = 2 performs worse for the arithmetic below, so it stays at LMUL = 1
16611662
vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, 4 );
16621663
vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, 4 );
16631664
vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B1, 4 );
@@ -1686,8 +1687,9 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT* A, FLOAT* B, F
16861687
B7 = B[7];
16871688
B += 8;
16881689

1689-
A0 = __riscv_vle64_v_f64m1( &A[0*4], 4 );
1690-
A1 = __riscv_vle64_v_f64m1( &A[1*4], 4 );
1690+
A00 = __riscv_vle64_v_f64m2( A, 8 );
1691+
A0 = __riscv_vget_v_f64m2_f64m1(A00, 0);
1692+
A1 = __riscv_vget_v_f64m2_f64m1(A00, 1);
16911693
A += 8;
16921694

16931695
result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, 4 );

kernel/riscv64/sgemm_kernel_16x8_zvl256b.c

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2158,11 +2158,12 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT* A, FLOAT* B, F
21582158
FLOAT B7 = B[7];
21592159
B += 8;
21602160

2161-
// LMUL = 2 does worst here
2162-
vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[0*8], 8 );
2163-
vfloat32m1_t A1 = __riscv_vle32_v_f32m1( &A[1*8], 8 );
2161+
vfloat32m2_t A00 = __riscv_vle32_v_f32m2( A, 16 );
2162+
vfloat32m1_t A0 = __riscv_vget_v_f32m2_f32m1(A00, 0);
2163+
vfloat32m1_t A1 = __riscv_vget_v_f32m2_f32m1(A00, 1);
21642164
A += 16;
21652165

2166+
// LMUL = 2 performs worse for the arithmetic below, so it stays at LMUL = 1
21662167
vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, 8 );
21672168
vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A1, B0, 8 );
21682169
vfloat32m1_t result2 = __riscv_vfmul_vf_f32m1( A0, B1, 8 );
@@ -2191,8 +2192,9 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT* A, FLOAT* B, F
21912192
B7 = B[7];
21922193
B += 8;
21932194

2194-
A0 = __riscv_vle32_v_f32m1( &A[0*8], 8 );
2195-
A1 = __riscv_vle32_v_f32m1( &A[1*8], 8 );
2195+
A00 = __riscv_vle32_v_f32m2( A, 16 );
2196+
A0 = __riscv_vget_v_f32m2_f32m1(A00, 0);
2197+
A1 = __riscv_vget_v_f32m2_f32m1(A00, 1);
21962198
A += 16;
21972199

21982200
result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, 8 );

0 commit comments

Comments
 (0)