Skip to content

Commit da1d1c2

Browse files
committed
[x32] Fix BLAKE3 assembly
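The x86-64 assembly implementations of BLAKE3 are built for both 64-bit and 32-bit pointer mode (x32), but previously worked only in 64-bit pointer mode. This PR makes them work in 32-bit pointer mode as well: when `_ILP32` is defined, the 32-bit values passed in esi and edx are zero-extended on entry, and the input-pointer array is read as 4-byte entries (with the corresponding strides halved) instead of 8-byte entries.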
1 parent 68fd102 commit da1d1c2

File tree

4 files changed

+190
-0
lines changed

4 files changed

+190
-0
lines changed

llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_unix.S

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,10 @@ blake3_hash_many_avx2:
4545
mov rbp, rsp
4646
sub rsp, 680
4747
and rsp, 0xFFFFFFFFFFFFFFC0
48+
#ifdef _ILP32
49+
mov esi, esi
50+
mov edx, edx
51+
#endif
4852
neg r9d
4953
vmovd xmm0, r9d
5054
vpbroadcastd ymm0, xmm0
@@ -77,6 +81,7 @@ blake3_hash_many_avx2:
7781
vpbroadcastd ymm5, dword ptr [rcx+0x14]
7882
vpbroadcastd ymm6, dword ptr [rcx+0x18]
7983
vpbroadcastd ymm7, dword ptr [rcx+0x1C]
84+
#ifndef _ILP32
8085
mov r8, qword ptr [rdi]
8186
mov r9, qword ptr [rdi+0x8]
8287
mov r10, qword ptr [rdi+0x10]
@@ -85,6 +90,16 @@ blake3_hash_many_avx2:
8590
mov r13, qword ptr [rdi+0x28]
8691
mov r14, qword ptr [rdi+0x30]
8792
mov r15, qword ptr [rdi+0x38]
93+
#else
94+
mov r8d, dword ptr [rdi]
95+
mov r9d, dword ptr [rdi+0x4]
96+
mov r10d, dword ptr [rdi+0x8]
97+
mov r11d, dword ptr [rdi+0xc]
98+
mov r12d, dword ptr [rdi+0x10]
99+
mov r13d, dword ptr [rdi+0x14]
100+
mov r14d, dword ptr [rdi+0x18]
101+
mov r15d, dword ptr [rdi+0x1c]
102+
#endif
88103
movzx eax, byte ptr [rbp+0x38]
89104
movzx ebx, byte ptr [rbp+0x40]
90105
or eax, ebx
@@ -1305,7 +1320,11 @@ blake3_hash_many_avx2:
13051320
vmovdqa ymm0, ymmword ptr [rsp+0x260]
13061321
vpsubd ymm2, ymm0, ymm2
13071322
vmovdqa ymmword ptr [rsp+0x260], ymm2
1323+
#ifndef _ILP32
13081324
add rdi, 64
1325+
#else
1326+
add rdi, 32
1327+
#endif
13091328
add rbx, 256
13101329
mov qword ptr [rbp+0x50], rbx
13111330
sub rsi, 8
@@ -1346,10 +1365,17 @@ blake3_hash_many_avx2:
13461365
vpblendd ymm15, ymm15, ymm12, 0x44
13471366
vmovdqa ymmword ptr [rsp], ymm14
13481367
vmovdqa ymmword ptr [rsp+0x20], ymm15
1368+
#ifndef _ILP32
13491369
mov r8, qword ptr [rdi]
13501370
mov r9, qword ptr [rdi+0x8]
13511371
mov r10, qword ptr [rdi+0x10]
13521372
mov r11, qword ptr [rdi+0x18]
1373+
#else
1374+
mov r8d, dword ptr [rdi]
1375+
mov r9d, dword ptr [rdi+0x4]
1376+
mov r10d, dword ptr [rdi+0x8]
1377+
mov r11d, dword ptr [rdi+0xc]
1378+
#endif
13531379
movzx eax, byte ptr [rbp+0x40]
13541380
or eax, r13d
13551381
xor edx, edx
@@ -1557,7 +1583,11 @@ blake3_hash_many_avx2:
15571583
vmovaps xmmword ptr [rsp+0x240], xmm0
15581584
vmovaps xmmword ptr [rsp+0x260], xmm2
15591585
add rbx, 128
1586+
#ifndef _ILP32
15601587
add rdi, 32
1588+
#else
1589+
add rdi, 16
1590+
#endif
15611591
sub rsi, 4
15621592
3:
15631593
test rsi, 0x2
@@ -1573,8 +1603,13 @@ blake3_hash_many_avx2:
15731603
vinserti128 ymm13, ymm13, xmm14, 0x01
15741604
vbroadcasti128 ymm14, xmmword ptr [ROT16+rip]
15751605
vbroadcasti128 ymm15, xmmword ptr [ROT8+rip]
1606+
#ifndef _ILP32
15761607
mov r8, qword ptr [rdi]
15771608
mov r9, qword ptr [rdi+0x8]
1609+
#else
1610+
mov r8d, dword ptr [rdi]
1611+
mov r9d, dword ptr [rdi+0x4]
1612+
#endif
15781613
movzx eax, byte ptr [rbp+0x40]
15791614
or eax, r13d
15801615
xor edx, edx
@@ -1683,7 +1718,11 @@ blake3_hash_many_avx2:
16831718
vmovaps ymmword ptr [rsp+0x240], ymm0
16841719
vmovaps ymmword ptr [rsp+0x260], ymm2
16851720
add rbx, 64
1721+
#ifndef _ILP32
16861722
add rdi, 16
1723+
#else
1724+
add rdi, 8
1725+
#endif
16871726
sub rsi, 2
16881727
3:
16891728
test rsi, 0x1
@@ -1695,7 +1734,11 @@ blake3_hash_many_avx2:
16951734
vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
16961735
vmovdqa xmm14, xmmword ptr [ROT16+rip]
16971736
vmovdqa xmm15, xmmword ptr [ROT8+rip]
1737+
#ifndef _ILP32
16981738
mov r8, qword ptr [rdi]
1739+
#else
1740+
mov r8d, dword ptr [rdi]
1741+
#endif
16991742
movzx eax, byte ptr [rbp+0x40]
17001743
or eax, r13d
17011744
xor edx, edx

llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_unix.S

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,10 @@ blake3_hash_many_avx512:
5959
sub rsp, 144
6060
and rsp, 0xFFFFFFFFFFFFFFC0
6161
neg r9
62+
#ifdef _ILP32
63+
mov esi, esi
64+
mov edx, edx
65+
#endif
6266
kmovw k1, r9d
6367
vmovd xmm0, r8d
6468
vpbroadcastd ymm0, xmm0
@@ -107,6 +111,7 @@ blake3_hash_many_avx512:
107111
cmp rdx, qword ptr [rsp+0x80]
108112
cmove eax, ebx
109113
mov dword ptr [rsp+0x88], eax
114+
#ifndef _ILP32
110115
mov r8, qword ptr [rdi]
111116
mov r9, qword ptr [rdi+0x8]
112117
mov r10, qword ptr [rdi+0x10]
@@ -115,6 +120,16 @@ blake3_hash_many_avx512:
115120
mov r13, qword ptr [rdi+0x48]
116121
mov r14, qword ptr [rdi+0x50]
117122
mov r15, qword ptr [rdi+0x58]
123+
#else
124+
mov r8d, dword ptr [rdi]
125+
mov r9d, dword ptr [rdi+0x4]
126+
mov r10d, dword ptr [rdi+0x8]
127+
mov r11d, dword ptr [rdi+0xc]
128+
mov r12d, dword ptr [rdi+0x20]
129+
mov r13d, dword ptr [rdi+0x24]
130+
mov r14d, dword ptr [rdi+0x28]
131+
mov r15d, dword ptr [rdi+0x2c]
132+
#endif
118133
vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
119134
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
120135
vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
@@ -127,6 +142,7 @@ blake3_hash_many_avx512:
127142
vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
128143
vpunpcklqdq zmm10, zmm18, zmm19
129144
vpunpckhqdq zmm11, zmm18, zmm19
145+
#ifndef _ILP32
130146
mov r8, qword ptr [rdi+0x20]
131147
mov r9, qword ptr [rdi+0x28]
132148
mov r10, qword ptr [rdi+0x30]
@@ -135,6 +151,16 @@ blake3_hash_many_avx512:
135151
mov r13, qword ptr [rdi+0x68]
136152
mov r14, qword ptr [rdi+0x70]
137153
mov r15, qword ptr [rdi+0x78]
154+
#else
155+
mov r8d, dword ptr [rdi+0x10]
156+
mov r9d, dword ptr [rdi+0x14]
157+
mov r10d, dword ptr [rdi+0x18]
158+
mov r11d, dword ptr [rdi+0x1c]
159+
mov r12d, dword ptr [rdi+0x30]
160+
mov r13d, dword ptr [rdi+0x34]
161+
mov r14d, dword ptr [rdi+0x38]
162+
mov r15d, dword ptr [rdi+0x3c]
163+
#endif
138164
vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
139165
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
140166
vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
@@ -169,6 +195,7 @@ blake3_hash_many_avx512:
169195
vmovdqa32 zmm23, zmm19
170196
vpermt2d zmm19, zmm27, zmm8
171197
vpermt2d zmm23, zmm31, zmm8
198+
#ifndef _ILP32
172199
mov r8, qword ptr [rdi]
173200
mov r9, qword ptr [rdi+0x8]
174201
mov r10, qword ptr [rdi+0x10]
@@ -177,6 +204,16 @@ blake3_hash_many_avx512:
177204
mov r13, qword ptr [rdi+0x48]
178205
mov r14, qword ptr [rdi+0x50]
179206
mov r15, qword ptr [rdi+0x58]
207+
#else
208+
mov r8d, dword ptr [rdi]
209+
mov r9d, dword ptr [rdi+0x4]
210+
mov r10d, dword ptr [rdi+0x8]
211+
mov r11d, dword ptr [rdi+0xc]
212+
mov r12d, dword ptr [rdi+0x20]
213+
mov r13d, dword ptr [rdi+0x24]
214+
mov r14d, dword ptr [rdi+0x28]
215+
mov r15d, dword ptr [rdi+0x2c]
216+
#endif
180217
vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
181218
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
182219
vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
@@ -197,6 +234,7 @@ blake3_hash_many_avx512:
197234
prefetcht0 [r14+rdx+0x80]
198235
prefetcht0 [r11+rdx+0x80]
199236
prefetcht0 [r15+rdx+0x80]
237+
#ifndef _ILP32
200238
mov r8, qword ptr [rdi+0x20]
201239
mov r9, qword ptr [rdi+0x28]
202240
mov r10, qword ptr [rdi+0x30]
@@ -205,6 +243,16 @@ blake3_hash_many_avx512:
205243
mov r13, qword ptr [rdi+0x68]
206244
mov r14, qword ptr [rdi+0x70]
207245
mov r15, qword ptr [rdi+0x78]
246+
#else
247+
mov r8d, dword ptr [rdi+0x10]
248+
mov r9d, dword ptr [rdi+0x14]
249+
mov r10d, dword ptr [rdi+0x18]
250+
mov r11d, dword ptr [rdi+0x1c]
251+
mov r12d, dword ptr [rdi+0x30]
252+
mov r13d, dword ptr [rdi+0x34]
253+
mov r14d, dword ptr [rdi+0x38]
254+
mov r15d, dword ptr [rdi+0x3c]
255+
#endif
208256
vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
209257
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
210258
vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
@@ -1095,7 +1143,11 @@ blake3_hash_many_avx512:
10951143
vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1+rip] {1to16}
10961144
vmovdqa32 zmmword ptr [rsp], zmm2
10971145
vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1
1146+
#ifndef _ILP32
10981147
add rdi, 128
1148+
#else
1149+
add rdi, 64
1150+
#endif
10991151
add rbx, 512
11001152
mov qword ptr [rbp+0x50], rbx
11011153
sub rsi, 16
@@ -1125,6 +1177,7 @@ blake3_hash_many_avx512:
11251177
vpbroadcastd ymm5, dword ptr [rcx+0x14]
11261178
vpbroadcastd ymm6, dword ptr [rcx+0x18]
11271179
vpbroadcastd ymm7, dword ptr [rcx+0x1C]
1180+
#ifndef _ILP32
11281181
mov r8, qword ptr [rdi]
11291182
mov r9, qword ptr [rdi+0x8]
11301183
mov r10, qword ptr [rdi+0x10]
@@ -1133,6 +1186,16 @@ blake3_hash_many_avx512:
11331186
mov r13, qword ptr [rdi+0x28]
11341187
mov r14, qword ptr [rdi+0x30]
11351188
mov r15, qword ptr [rdi+0x38]
1189+
#else
1190+
mov r8d, dword ptr [rdi]
1191+
mov r9d, dword ptr [rdi+0x4]
1192+
mov r10d, dword ptr [rdi+0x8]
1193+
mov r11d, dword ptr [rdi+0xc]
1194+
mov r12d, dword ptr [rdi+0x10]
1195+
mov r13d, dword ptr [rdi+0x14]
1196+
mov r14d, dword ptr [rdi+0x18]
1197+
mov r15d, dword ptr [rdi+0x1c]
1198+
#endif
11361199
movzx eax, byte ptr [rbp+0x38]
11371200
movzx ebx, byte ptr [rbp+0x40]
11381201
or eax, ebx
@@ -2055,7 +2118,11 @@ blake3_hash_many_avx512:
20552118
vmovdqa ymmword ptr [rsp+0x2*0x20], ymm2
20562119
add rbx, 256
20572120
mov qword ptr [rbp+0x50], rbx
2121+
#ifndef _ILP32
20582122
add rdi, 64
2123+
#else
2124+
add rdi, 32
2125+
#endif
20592126
sub rsi, 8
20602127
3:
20612128
mov rbx, qword ptr [rbp+0x50]
@@ -2078,10 +2145,17 @@ blake3_hash_many_avx512:
20782145
kmovw k2, eax
20792146
vpblendmd zmm13 {k2}, zmm13, zmm12
20802147
vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV+rip]
2148+
#ifndef _ILP32
20812149
mov r8, qword ptr [rdi]
20822150
mov r9, qword ptr [rdi+0x8]
20832151
mov r10, qword ptr [rdi+0x10]
20842152
mov r11, qword ptr [rdi+0x18]
2153+
#else
2154+
mov r8d, dword ptr [rdi]
2155+
mov r9d, dword ptr [rdi+0x4]
2156+
mov r10d, dword ptr [rdi+0x8]
2157+
mov r11d, dword ptr [rdi+0xc]
2158+
#endif
20852159
mov eax, 43690
20862160
kmovw k3, eax
20872161
mov eax, 34952
@@ -2195,7 +2269,11 @@ blake3_hash_many_avx512:
21952269
vmovdqa xmmword ptr [rsp], xmm0
21962270
vmovdqa xmmword ptr [rsp+0x40], xmm2
21972271
add rbx, 128
2272+
#ifndef _ILP32
21982273
add rdi, 32
2274+
#else
2275+
add rdi, 16
2276+
#endif
21992277
sub rsi, 4
22002278
3:
22012279
test esi, 0x2
@@ -2209,8 +2287,13 @@ blake3_hash_many_avx512:
22092287
vpinsrd xmm14, xmm14, dword ptr [rsp+0x44], 1
22102288
vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
22112289
vinserti128 ymm13, ymm13, xmm14, 0x01
2290+
#ifndef _ILP32
22122291
mov r8, qword ptr [rdi]
22132292
mov r9, qword ptr [rdi+0x8]
2293+
#else
2294+
mov r8d, dword ptr [rdi]
2295+
mov r9d, dword ptr [rdi+0x4]
2296+
#endif
22142297
movzx eax, byte ptr [rbp+0x40]
22152298
or eax, r13d
22162299
xor edx, edx
@@ -2308,7 +2391,11 @@ blake3_hash_many_avx512:
23082391
vmovdqa xmmword ptr [rsp], xmm0
23092392
vmovdqa xmmword ptr [rsp+0x4*0x10], xmm2
23102393
add rbx, 64
2394+
#ifndef _ILP32
23112395
add rdi, 16
2396+
#else
2397+
add rdi, 8
2398+
#endif
23122399
sub rsi, 2
23132400
3:
23142401
test esi, 0x1
@@ -2319,7 +2406,11 @@ blake3_hash_many_avx512:
23192406
vpinsrd xmm14, xmm14, dword ptr [rsp+0x40], 1
23202407
vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
23212408
vmovdqa xmm15, xmmword ptr [BLAKE3_IV+rip]
2409+
#ifndef _ILP32
23222410
mov r8, qword ptr [rdi]
2411+
#else
2412+
mov r8d, dword ptr [rdi]
2413+
#endif
23232414
movzx eax, byte ptr [rbp+0x40]
23242415
or eax, r13d
23252416
xor edx, edx

0 commit comments

Comments
 (0)