@@ -47,6 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
4747#define VX4 $vr21
4848#define res1 $vr19
4949#define res2 $vr20
50+ #define RCP $f2
51+ #define VALPHA $vr3
5052
5153 PROLOGUE
5254
@@ -55,10 +57,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
5557 LDINT INCX, 0 (INCX)
5658#endif
5759
58- vxor.v res1, res1, res1
59- vxor.v res2, res2, res2
6060 bge $r0, N, .L999
6161 beq $r0, INCX, .L999
62+ addi.d $sp, $sp, -32
63+ st.d $ra, $sp, 0
64+ st.d N, $sp, 8
65+ st.d X, $sp, 16
66+ st.d INCX, $sp, 24
67+ #ifdef DYNAMIC_ARCH
68+ bl camax_k_LA264
69+ #else
70+ bl camax_k
71+ #endif
72+ ld.d $ra, $sp, 0
73+ ld.d N, $sp, 8
74+ ld.d X, $sp, 16
75+ ld.d INCX, $sp, 24
76+ addi.d $sp, $sp, 32
77+
78+ frecip.s RCP, $f0
79+ vreplvei.w VALPHA, $vr2, 0
80+ vxor.v res1, res1, res1
81+ vxor.v res2, res2, res2
82+ fcmp.ceq.s $fcc0, $f0, $f19
83+ bcnez $fcc0, .L999
6284 li.d TEMP, 1
6385 slli.d TEMP, TEMP, ZBASE_SHIFT
6486 slli.d INCX, INCX, ZBASE_SHIFT
@@ -69,16 +91,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
6991
7092.L10:
7193 vld VX0, X, 0 * SIZE
72- vfcvtl.d.s VX1, VX0
73- vfcvth.d.s VX2, VX0
74- vfmadd.d res1, VX1, VX1, res1
75- vfmadd.d res2, VX2, VX2, res2
76- vld VX0, X, 4 * SIZE
77- vfcvtl.d.s VX3, VX0
78- vfcvth.d.s VX4, VX0
79- vfmadd.d res1, VX3, VX3, res1
80- vfmadd.d res2, VX4, VX4, res2
8194 addi.d I, I, -1
95+ vld VX0, X, 0 * SIZE
96+ vld VX1, X, 4 * SIZE
97+ vfmul.s VX0, VX0, VALPHA
98+ vfmul.s VX1, VX1, VALPHA
99+
100+ vfmadd.s res1, VX0, VX0, res1
101+ vfmadd.s res2, VX1, VX1, res2
102+
82103 addi.d X, X, 8 * SIZE
83104 blt $r0, I, .L10
84105 b .L996
@@ -99,10 +120,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
99120 vinsgr2vr.w VX0, t3, 2
100121 vinsgr2vr.w VX0, t4, 3
101122 add .d X, X, INCX
102- vfcvtl.d.s VX1, VX0
103- vfcvth.d.s VX2, VX0
104- vfmadd.d res1, VX1, VX1, res1
105- vfmadd.d res2, VX2, VX2, res2
123+ vfmul.s VX0, VX0, VALPHA
124+ vfmadd.s res1, VX0, VX0, res1
125+
106126 ld.w t1, X, 0 * SIZE
107127 ld.w t2, X, 1 * SIZE
108128 add .d X, X, INCX
@@ -113,19 +133,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
113133 vinsgr2vr.w VX0, t3, 2
114134 vinsgr2vr.w VX0, t4, 3
115135 add .d X, X, INCX
116- vfcvtl.d.s VX3, VX0
117- vfcvth.d.s VX4, VX0
118- vfmadd.d res1, VX3, VX3, res1
119- vfmadd.d res2, VX4, VX4, res2
136+ vfmul.s VX0, VX0, VALPHA
137+ vfmadd.s res2, VX0, VX0, res2
138+
120139 addi.d I, I, -1
121140 blt $r0, I, .L21
122141 b .L996
123142 .align 3
124143
125144.L996:
126- vfadd.d res1, res1, res2
127- vreplvei.d VX1, res1, 1
128- vfadd.d res1, VX1, res1
145+ vfadd.s res1, res1, res2
146+ vreplvei.w VX1, res1, 1
147+ vreplvei.w VX2, res1, 2
148+ vreplvei.w VX3, res1, 3
149+ vfadd.s res1, VX1, res1
150+ vfadd.s res1, VX2, res1
151+ vfadd.s res1, VX3, res1
129152 .align 3
130153
131154.L997:
@@ -137,18 +160,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
137160 fld .s a1, X, 0 * SIZE
138161 fld .s a2, X, 1 * SIZE
139162 addi.d I, I, -1
140- fcvt.d. s a1, a1
141- fcvt.d. s a2, a2
142- fmadd.d res, a1, a1, res
143- fmadd.d res, a2, a2, res
163+ fmul . s a1, a1, RCP
164+ fmul . s a2, a2, RCP
165+ fmadd.s res, a1, a1, res
166+ fmadd.s res, a2, a2, res
144167 add .d X, X, INCX
145168 blt $r0, I, .L998
146169 .align 3
147170
148171.L999:
149- fsqrt .d res, res
172+ fsqrt .s res, res
173+ fmul .s $f0, res, $f0
150174 move $r4, $r17
151- fcvt.s.d $f0, $f19
152175 jirl $r0, $r1, 0x0
153176 .align 3
154177
0 commit comments