@@ -52,17 +52,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 /* Don't change following FR unless you know the effects. */
 #define res1 $vr19
 #define res2 $vr20
+#define RCP $f2
+#define VALPHA $vr3
+
+// The snrm2 kernel cannot be optimized simply by widening each float
+// to double and summing the squares: LAPACK tests show that approach
+// can still overflow. Instead, find the maximum absolute value of the
+// whole array first and scale every element by it before squaring.
+// This avoids overflow without widening the data type.
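+//
+// In short:
+//   amax = max_i |x_i|             (computed by samax_k below)
+//   sum  = sum_i (x_i / amax)^2    (accumulated in res1/res2)
+//   nrm2 = amax * sqrt(sum)        (final scaling at .L999)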

     PROLOGUE

 #ifdef F_INTERFACE
     LDINT N, 0(N)
     LDINT INCX, 0(INCX)
 #endif
-    vxor.v res1, res1, res1
-    vxor.v res2, res2, res2
     bge $r0, N, .L999
     beq $r0, INCX, .L999
+
+    // Save the caller context and call samax_k to get amax = max|x[i]|.
+    addi.d $sp, $sp, -32
+    st.d $ra, $sp, 0
+    st.d N, $sp, 8
+    st.d X, $sp, 16
+    st.d INCX, $sp, 24
+#ifdef DYNAMIC_ARCH
+    bl samax_k_LA264
+#else
+    bl samax_k
+#endif
+    ld.d $ra, $sp, 0
+    ld.d N, $sp, 8
+    ld.d X, $sp, 16
+    ld.d INCX, $sp, 24
+    addi.d $sp, $sp, 32
+
+    frecip.s RCP, $f0           // RCP = 1 / amax
+    vreplvei.w VALPHA, $vr2, 0  // broadcast RCP to all four lanes
+    vxor.v res1, res1, res1
+    vxor.v res2, res2, res2
+    fcmp.ceq.s $fcc0, $f0, $f19 // amax == 0: the norm is 0, exit early
+    bcnez $fcc0, .L999
     li.d TEMP, SIZE
     slli.d INCX, INCX, BASE_SHIFT
     srai.d I, N, 3
@@ -75,14 +106,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vld VX5, X, 4 * SIZE
     addi.d I, I, -1
     addi.d X, X, 8 * SIZE
-    vfcvtl.d.s VX1, VX0
-    vfcvth.d.s VX2, VX0
-    vfcvtl.d.s VX3, VX5
-    vfcvth.d.s VX4, VX5
-    vfmadd.d res1, VX1, VX1, res1
-    vfmadd.d res2, VX2, VX2, res2
-    vfmadd.d res1, VX3, VX3, res1
-    vfmadd.d res2, VX4, VX4, res2
+
+    // Scale by 1/amax, then accumulate the squares in two accumulators.
+    vfmul.s VX0, VX0, VALPHA
+    vfmul.s VX5, VX5, VALPHA
+
+    vfmadd.s res1, VX0, VX0, res1
+    vfmadd.s res2, VX5, VX5, res2
     blt $r0, I, .L10
     b .L996
     .align 3
@@ -104,10 +133,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vinsgr2vr.w VX0, t2, 1
     vinsgr2vr.w VX0, t3, 2
     vinsgr2vr.w VX0, t4, 3
-    vfcvtl.d.s VX1, VX0
-    vfcvth.d.s VX2, VX0
-    vfmadd.d res1, VX1, VX1, res1
-    vfmadd.d res2, VX2, VX2, res2
+    vfmul.s VX0, VX0, VALPHA
+    vfmadd.s res1, VX0, VX0, res1
+
     ld.w t1, X, 0
     add.d X, X, INCX
     ld.w t2, X, 0
@@ -120,19 +148,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vinsgr2vr.w VX0, t2, 1
     vinsgr2vr.w VX0, t3, 2
     vinsgr2vr.w VX0, t4, 3
-    vfcvtl.d.s VX3, VX0
-    vfcvth.d.s VX4, VX0
-    vfmadd.d res1, VX3, VX3, res1
-    vfmadd.d res2, VX4, VX4, res2
+    vfmul.s VX0, VX0, VALPHA
+    vfmadd.s res2, VX0, VX0, res2
     addi.d I, I, -1
     blt $r0, I, .L21
-    b .L996
     .align 3

 .L996:
-    vfadd.d res1, res1, res2
-    vreplvei.d VX1, res1, 1
-    vfadd.d res1, VX1, res1
+    // Horizontal reduction: sum the four float lanes into lane 0.
+    vfadd.s res1, res1, res2
+    vreplvei.w VX1, res1, 1
+    vreplvei.w VX2, res1, 2
+    vreplvei.w VX3, res1, 3
+    vfadd.s res1, VX1, res1
+    vfadd.s res1, VX2, res1
+    vfadd.s res1, VX3, res1
     .align 3

 .L997:
@@ -143,16 +172,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .L998:
     fld.s $f15, X, 0
     addi.d I, I, -1
-    fcvt.d.s $f15, $f15
-    fmadd.d $f19, $f15, $f15, $f19
+    fmul.s $f15, $f15, RCP
+    fmadd.s $f19, $f15, $f15, $f19
     add.d X, X, INCX
     blt $r0, I, .L998
     .align 3

 .L999:
-    fsqrt.d $f19, $f19
+    fsqrt.s $f19, $f19
+    fmul.s $f0, $f19, $f0    // nrm2 = amax * sqrt(sum)
     move $r4, $r17
-    fcvt.s.d $f0, $f19
     jirl $r0, $r1, 0x0
     .align 3

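For reference, a minimal C sketch of the scaling scheme this kernel implements (illustrative only; the function name and signature here are not the OpenBLAS API):

    #include <math.h>

    /* Two-pass scaled nrm2: each squared term is at most 1, so the
     * partial sums stay bounded by n, and the true magnitude is
     * restored by the final multiply. */
    static float snrm2_scaled(long n, const float *x, long incx)
    {
        float amax = 0.0f, sum = 0.0f;

        /* Pass 1: largest magnitude (what samax_k computes). */
        for (long i = 0; i < n; i++) {
            float a = fabsf(x[i * incx]);
            if (a > amax) amax = a;
        }
        if (amax == 0.0f)
            return 0.0f;            /* matches the fcmp/bcnez early exit */

        /* Pass 2: sum of squares of the scaled elements. */
        float rcp = 1.0f / amax;    /* the frecip.s above */
        for (long i = 0; i < n; i++) {
            float s = x[i * incx] * rcp;
            sum += s * s;
        }
        return amax * sqrtf(sum);   /* undo the scaling, as at .L999 */
    }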