@@ -59,6 +59,10 @@ blake3_hash_many_avx512:
59
59
sub rsp , 144
60
60
and rsp , 0xFFFFFFFFFFFFFFC0
61
61
neg r9
62
+ #ifdef _ILP32
63
+ mov esi , esi
64
+ mov edx , edx
65
+ #endif
62
66
kmovw k1 , r9d
63
67
vmovd xmm0 , r8d
64
68
vpbroadcastd ymm0 , xmm0
@@ -107,6 +111,7 @@ blake3_hash_many_avx512:
107
111
cmp rdx , qword ptr [ rsp + 0x80 ]
108
112
cmove eax , ebx
109
113
mov dword ptr [ rsp + 0x88 ], eax
114
+ #ifndef _ILP32
110
115
mov r8 , qword ptr [ rdi ]
111
116
mov r9 , qword ptr [ rdi + 0x8 ]
112
117
mov r10 , qword ptr [ rdi + 0x10 ]
@@ -115,6 +120,16 @@ blake3_hash_many_avx512:
115
120
mov r13 , qword ptr [ rdi + 0x48 ]
116
121
mov r14 , qword ptr [ rdi + 0x50 ]
117
122
mov r15 , qword ptr [ rdi + 0x58 ]
123
+ #else
124
+ mov r8d , dword ptr [ rdi ]
125
+ mov r9d , dword ptr [ rdi + 0x4 ]
126
+ mov r10d , dword ptr [ rdi + 0x8 ]
127
+ mov r11d , dword ptr [ rdi + 0xc ]
128
+ mov r12d , dword ptr [ rdi + 0x20 ]
129
+ mov r13d , dword ptr [ rdi + 0x24 ]
130
+ mov r14d , dword ptr [ rdi + 0x28 ]
131
+ mov r15d , dword ptr [ rdi + 0x2c ]
132
+ #endif
118
133
vmovdqu32 ymm16 , ymmword ptr [ rdx + r8 - 0x2 * 0x20 ]
119
134
vinserti64x4 zmm16 , zmm16 , ymmword ptr [ rdx + r12 - 0x2 * 0x20 ], 0x01
120
135
vmovdqu32 ymm17 , ymmword ptr [ rdx + r9 - 0x2 * 0x20 ]
@@ -127,6 +142,7 @@ blake3_hash_many_avx512:
127
142
vinserti64x4 zmm19 , zmm19 , ymmword ptr [ rdx + r15 - 0x2 * 0x20 ], 0x01
128
143
vpunpcklqdq zmm10 , zmm18 , zmm19
129
144
vpunpckhqdq zmm11 , zmm18 , zmm19
145
+ #ifndef _ILP32
130
146
mov r8 , qword ptr [ rdi + 0x20 ]
131
147
mov r9 , qword ptr [ rdi + 0x28 ]
132
148
mov r10 , qword ptr [ rdi + 0x30 ]
@@ -135,6 +151,16 @@ blake3_hash_many_avx512:
135
151
mov r13 , qword ptr [ rdi + 0x68 ]
136
152
mov r14 , qword ptr [ rdi + 0x70 ]
137
153
mov r15 , qword ptr [ rdi + 0x78 ]
154
+ #else
155
+ mov r8d , dword ptr [ rdi + 0x10 ]
156
+ mov r9d , dword ptr [ rdi + 0x14 ]
157
+ mov r10d , dword ptr [ rdi + 0x18 ]
158
+ mov r11d , dword ptr [ rdi + 0x1c ]
159
+ mov r12d , dword ptr [ rdi + 0x30 ]
160
+ mov r13d , dword ptr [ rdi + 0x34 ]
161
+ mov r14d , dword ptr [ rdi + 0x38 ]
162
+ mov r15d , dword ptr [ rdi + 0x3c ]
163
+ #endif
138
164
vmovdqu32 ymm16 , ymmword ptr [ rdx + r8 - 0x2 * 0x20 ]
139
165
vinserti64x4 zmm16 , zmm16 , ymmword ptr [ rdx + r12 - 0x2 * 0x20 ], 0x01
140
166
vmovdqu32 ymm17 , ymmword ptr [ rdx + r9 - 0x2 * 0x20 ]
@@ -169,6 +195,7 @@ blake3_hash_many_avx512:
169
195
vmovdqa32 zmm23 , zmm19
170
196
vpermt2d zmm19 , zmm27 , zmm8
171
197
vpermt2d zmm23 , zmm31 , zmm8
198
+ #ifndef _ILP32
172
199
mov r8 , qword ptr [ rdi ]
173
200
mov r9 , qword ptr [ rdi + 0x8 ]
174
201
mov r10 , qword ptr [ rdi + 0x10 ]
@@ -177,6 +204,16 @@ blake3_hash_many_avx512:
177
204
mov r13 , qword ptr [ rdi + 0x48 ]
178
205
mov r14 , qword ptr [ rdi + 0x50 ]
179
206
mov r15 , qword ptr [ rdi + 0x58 ]
207
+ #else
208
+ mov r8d , dword ptr [ rdi ]
209
+ mov r9d , dword ptr [ rdi + 0x4 ]
210
+ mov r10d , dword ptr [ rdi + 0x8 ]
211
+ mov r11d , dword ptr [ rdi + 0xc ]
212
+ mov r12d , dword ptr [ rdi + 0x20 ]
213
+ mov r13d , dword ptr [ rdi + 0x24 ]
214
+ mov r14d , dword ptr [ rdi + 0x28 ]
215
+ mov r15d , dword ptr [ rdi + 0x2c ]
216
+ #endif
180
217
vmovdqu32 ymm24 , ymmword ptr [ r8 + rdx - 0x1 * 0x20 ]
181
218
vinserti64x4 zmm24 , zmm24 , ymmword ptr [ r12 + rdx - 0x1 * 0x20 ], 0x01
182
219
vmovdqu32 ymm25 , ymmword ptr [ r9 + rdx - 0x1 * 0x20 ]
@@ -197,6 +234,7 @@ blake3_hash_many_avx512:
197
234
prefetcht0 [ r14 + rdx + 0x80 ]
198
235
prefetcht0 [ r11 + rdx + 0x80 ]
199
236
prefetcht0 [ r15 + rdx + 0x80 ]
237
+ #ifndef _ILP32
200
238
mov r8 , qword ptr [ rdi + 0x20 ]
201
239
mov r9 , qword ptr [ rdi + 0x28 ]
202
240
mov r10 , qword ptr [ rdi + 0x30 ]
@@ -205,6 +243,16 @@ blake3_hash_many_avx512:
205
243
mov r13 , qword ptr [ rdi + 0x68 ]
206
244
mov r14 , qword ptr [ rdi + 0x70 ]
207
245
mov r15 , qword ptr [ rdi + 0x78 ]
246
+ #else
247
+ mov r8d , dword ptr [ rdi + 0x10 ]
248
+ mov r9d , dword ptr [ rdi + 0x14 ]
249
+ mov r10d , dword ptr [ rdi + 0x18 ]
250
+ mov r11d , dword ptr [ rdi + 0x1c ]
251
+ mov r12d , dword ptr [ rdi + 0x30 ]
252
+ mov r13d , dword ptr [ rdi + 0x34 ]
253
+ mov r14d , dword ptr [ rdi + 0x38 ]
254
+ mov r15d , dword ptr [ rdi + 0x3c ]
255
+ #endif
208
256
vmovdqu32 ymm24 , ymmword ptr [ r8 + rdx - 0x1 * 0x20 ]
209
257
vinserti64x4 zmm24 , zmm24 , ymmword ptr [ r12 + rdx - 0x1 * 0x20 ], 0x01
210
258
vmovdqu32 ymm25 , ymmword ptr [ r9 + rdx - 0x1 * 0x20 ]
@@ -1095,7 +1143,11 @@ blake3_hash_many_avx512:
1095
1143
vpaddd zmm1 {k2} , zmm1 , dword ptr [ ADD1 + rip ] {1to16}
1096
1144
vmovdqa32 zmmword ptr [ rsp ], zmm2
1097
1145
vmovdqa32 zmmword ptr [ rsp + 0x1 * 0x40 ], zmm1
1146
+ #ifndef _ILP32
1098
1147
add rdi , 128
1148
+ #else
1149
+ add rdi , 64
1150
+ #endif
1099
1151
add rbx , 512
1100
1152
mov qword ptr [ rbp + 0x50 ], rbx
1101
1153
sub rsi , 16
@@ -1125,6 +1177,7 @@ blake3_hash_many_avx512:
1125
1177
vpbroadcastd ymm5 , dword ptr [ rcx + 0x14 ]
1126
1178
vpbroadcastd ymm6 , dword ptr [ rcx + 0x18 ]
1127
1179
vpbroadcastd ymm7 , dword ptr [ rcx + 0x1C ]
1180
+ #ifndef _ILP32
1128
1181
mov r8 , qword ptr [ rdi ]
1129
1182
mov r9 , qword ptr [ rdi + 0x8 ]
1130
1183
mov r10 , qword ptr [ rdi + 0x10 ]
@@ -1133,6 +1186,16 @@ blake3_hash_many_avx512:
1133
1186
mov r13 , qword ptr [ rdi + 0x28 ]
1134
1187
mov r14 , qword ptr [ rdi + 0x30 ]
1135
1188
mov r15 , qword ptr [ rdi + 0x38 ]
1189
+ #else
1190
+ mov r8d , dword ptr [ rdi ]
1191
+ mov r9d , dword ptr [ rdi + 0x4 ]
1192
+ mov r10d , dword ptr [ rdi + 0x8 ]
1193
+ mov r11d , dword ptr [ rdi + 0xc ]
1194
+ mov r12d , dword ptr [ rdi + 0x10 ]
1195
+ mov r13d , dword ptr [ rdi + 0x14 ]
1196
+ mov r14d , dword ptr [ rdi + 0x18 ]
1197
+ mov r15d , dword ptr [ rdi + 0x1c ]
1198
+ #endif
1136
1199
movzx eax , byte ptr [ rbp + 0x38 ]
1137
1200
movzx ebx , byte ptr [ rbp + 0x40 ]
1138
1201
or eax , ebx
@@ -2055,7 +2118,11 @@ blake3_hash_many_avx512:
2055
2118
vmovdqa ymmword ptr [ rsp + 0x2 * 0x20 ], ymm2
2056
2119
add rbx , 256
2057
2120
mov qword ptr [ rbp + 0x50 ], rbx
2121
+ #ifndef _ILP32
2058
2122
add rdi , 64
2123
+ #else
2124
+ add rdi , 32
2125
+ #endif
2059
2126
sub rsi , 8
2060
2127
3 :
2061
2128
mov rbx , qword ptr [ rbp + 0x50 ]
@@ -2078,10 +2145,17 @@ blake3_hash_many_avx512:
2078
2145
kmovw k2 , eax
2079
2146
vpblendmd zmm13 {k2} , zmm13 , zmm12
2080
2147
vbroadcasti32x4 zmm15 , xmmword ptr [ BLAKE3_IV + rip ]
2148
+ #ifndef _ILP32
2081
2149
mov r8 , qword ptr [ rdi ]
2082
2150
mov r9 , qword ptr [ rdi + 0x8 ]
2083
2151
mov r10 , qword ptr [ rdi + 0x10 ]
2084
2152
mov r11 , qword ptr [ rdi + 0x18 ]
2153
+ #else
2154
+ mov r8d , dword ptr [ rdi ]
2155
+ mov r9d , dword ptr [ rdi + 0x4 ]
2156
+ mov r10d , dword ptr [ rdi + 0x8 ]
2157
+ mov r11d , dword ptr [ rdi + 0xc ]
2158
+ #endif
2085
2159
mov eax , 43690
2086
2160
kmovw k3 , eax
2087
2161
mov eax , 34952
@@ -2195,7 +2269,11 @@ blake3_hash_many_avx512:
2195
2269
vmovdqa xmmword ptr [ rsp ], xmm0
2196
2270
vmovdqa xmmword ptr [ rsp + 0x40 ], xmm2
2197
2271
add rbx , 128
2272
+ #ifndef _ILP32
2198
2273
add rdi , 32
2274
+ #else
2275
+ add rdi , 16
2276
+ #endif
2199
2277
sub rsi , 4
2200
2278
3 :
2201
2279
test esi , 0x2
@@ -2209,8 +2287,13 @@ blake3_hash_many_avx512:
2209
2287
vpinsrd xmm14 , xmm14 , dword ptr [ rsp + 0x44 ], 1
2210
2288
vpinsrd xmm14 , xmm14 , dword ptr [ BLAKE3_BLOCK_LEN + rip ], 2
2211
2289
vinserti128 ymm13 , ymm13 , xmm14 , 0x01
2290
+ #ifndef _ILP32
2212
2291
mov r8 , qword ptr [ rdi ]
2213
2292
mov r9 , qword ptr [ rdi + 0x8 ]
2293
+ #else
2294
+ mov r8d , dword ptr [ rdi ]
2295
+ mov r9d , dword ptr [ rdi + 0x4 ]
2296
+ #endif
2214
2297
movzx eax , byte ptr [ rbp + 0x40 ]
2215
2298
or eax , r13d
2216
2299
xor edx , edx
@@ -2308,7 +2391,11 @@ blake3_hash_many_avx512:
2308
2391
vmovdqa xmmword ptr [ rsp ], xmm0
2309
2392
vmovdqa xmmword ptr [ rsp + 0x4 * 0x10 ], xmm2
2310
2393
add rbx , 64
2394
+ #ifndef _ILP32
2311
2395
add rdi , 16
2396
+ #else
2397
+ add rdi , 8
2398
+ #endif
2312
2399
sub rsi , 2
2313
2400
3 :
2314
2401
test esi , 0x1
@@ -2319,7 +2406,11 @@ blake3_hash_many_avx512:
2319
2406
vpinsrd xmm14 , xmm14 , dword ptr [ rsp + 0x40 ], 1
2320
2407
vpinsrd xmm14 , xmm14 , dword ptr [ BLAKE3_BLOCK_LEN + rip ], 2
2321
2408
vmovdqa xmm15 , xmmword ptr [ BLAKE3_IV + rip ]
2409
+ #ifndef _ILP32
2322
2410
mov r8 , qword ptr [ rdi ]
2411
+ #else
2412
+ mov r8d , dword ptr [ rdi ]
2413
+ #endif
2323
2414
movzx eax , byte ptr [ rbp + 0x40 ]
2324
2415
or eax , r13d
2325
2416
xor edx , edx
0 commit comments