Skip to content

Commit cc25135

Browse files
committed
[CGP]: Optimize mul.overflow.
- Detect cases where neither the LHS nor the RHS value can cause overflow (when both Hi parts are zero).
- Detect cases where either the LHS or the RHS value alone cannot cause overflow (when one of the Hi parts is zero).
1 parent 49a24b3 commit cc25135

20 files changed

+13143
-3000
lines changed

llvm/lib/CodeGen/CodeGenPrepare.cpp

Lines changed: 573 additions & 0 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AArch64/i128-math.ll

Lines changed: 434 additions & 70 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AArch64/i128_with_overflow.ll

Lines changed: 158 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -223,22 +223,49 @@ cleanup:
223223

224224
define i128 @test_umul_i128(i128 noundef %x, i128 noundef %y) {
225225
; CHECK-LABEL: test_umul_i128:
226-
; CHECK: // %bb.0: // %entry
226+
; CHECK: // %bb.0: // %overflow.entry
227+
; CHECK-NEXT: cbz x1, .LBB4_3
228+
; CHECK-NEXT: // %bb.1: // %overflow.lhs
229+
; CHECK-NEXT: cbz x3, .LBB4_5
230+
; CHECK-NEXT: // %bb.2: // %overflow
227231
; CHECK-NEXT: mul x9, x3, x0
228232
; CHECK-NEXT: cmp x1, #0
229233
; CHECK-NEXT: ccmp x3, #0, #4, ne
230-
; CHECK-NEXT: umulh x8, x1, x2
231-
; CHECK-NEXT: umulh x10, x3, x0
234+
; CHECK-NEXT: umulh x10, x1, x2
235+
; CHECK-NEXT: umulh x8, x3, x0
232236
; CHECK-NEXT: madd x9, x1, x2, x9
233-
; CHECK-NEXT: ccmp xzr, x8, #0, eq
234-
; CHECK-NEXT: umulh x11, x0, x2
235237
; CHECK-NEXT: ccmp xzr, x10, #0, eq
238+
; CHECK-NEXT: umulh x11, x0, x2
239+
; CHECK-NEXT: ccmp xzr, x8, #0, eq
240+
; CHECK-NEXT: mul x0, x0, x2
236241
; CHECK-NEXT: cset w8, ne
237242
; CHECK-NEXT: adds x1, x11, x9
238243
; CHECK-NEXT: csinc w8, w8, wzr, lo
239-
; CHECK-NEXT: cmp w8, #1
240-
; CHECK-NEXT: b.ne .LBB4_2
241-
; CHECK-NEXT: // %bb.1: // %if.then
244+
; CHECK-NEXT: tbnz w8, #0, .LBB4_7
245+
; CHECK-NEXT: b .LBB4_8
246+
; CHECK-NEXT: .LBB4_3: // %overflow.no.lhs
247+
; CHECK-NEXT: umulh x8, x0, x2
248+
; CHECK-NEXT: cbz x3, .LBB4_9
249+
; CHECK-NEXT: // %bb.4: // %overflow.no.lhs.only
250+
; CHECK-NEXT: madd x8, x1, x2, x8
251+
; CHECK-NEXT: umulh x9, x0, x3
252+
; CHECK-NEXT: mul x10, x0, x3
253+
; CHECK-NEXT: mul x11, x1, x3
254+
; CHECK-NEXT: mul x0, x0, x2
255+
; CHECK-NEXT: b .LBB4_6
256+
; CHECK-NEXT: .LBB4_5: // %overflow.no.rhs.only
257+
; CHECK-NEXT: umulh x8, x2, x0
258+
; CHECK-NEXT: umulh x9, x2, x1
259+
; CHECK-NEXT: madd x8, x3, x0, x8
260+
; CHECK-NEXT: mul x10, x2, x1
261+
; CHECK-NEXT: mul x11, x3, x1
262+
; CHECK-NEXT: mul x0, x2, x0
263+
; CHECK-NEXT: .LBB4_6: // %overflow.res
264+
; CHECK-NEXT: adds x1, x8, x10
265+
; CHECK-NEXT: adcs xzr, x9, x11
266+
; CHECK-NEXT: cset w8, ne
267+
; CHECK-NEXT: tbz w8, #0, .LBB4_8
268+
; CHECK-NEXT: .LBB4_7: // %if.then
242269
; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
243270
; CHECK-NEXT: .cfi_def_cfa_offset 16
244271
; CHECK-NEXT: .cfi_offset w30, -16
@@ -247,10 +274,15 @@ define i128 @test_umul_i128(i128 noundef %x, i128 noundef %y) {
247274
; CHECK-NEXT: sxtw x0, w0
248275
; CHECK-NEXT: asr x1, x0, #63
249276
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
277+
; CHECK-NEXT: .LBB4_8: // %cleanup
250278
; CHECK-NEXT: ret
251-
; CHECK-NEXT: .LBB4_2: // %if.end
279+
; CHECK-NEXT: .LBB4_9: // %overflow.no
280+
; CHECK-NEXT: madd x8, x0, x3, x8
252281
; CHECK-NEXT: mul x0, x0, x2
253-
; CHECK-NEXT: ret
282+
; CHECK-NEXT: madd x1, x1, x2, x8
283+
; CHECK-NEXT: mov w8, wzr
284+
; CHECK-NEXT: tbnz w8, #0, .LBB4_7
285+
; CHECK-NEXT: b .LBB4_8
254286
entry:
255287
%0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y)
256288
%1 = extractvalue { i128, i1 } %0, 1
@@ -272,35 +304,115 @@ cleanup:
272304

273305
define i128 @test_smul_i128(i128 noundef %x, i128 noundef %y) {
274306
; CHECK-LABEL: test_smul_i128:
275-
; CHECK: // %bb.0: // %entry
307+
; CHECK: // %bb.0: // %overflow.entry
308+
; CHECK-NEXT: asr x8, x2, #63
309+
; CHECK-NEXT: cmp x1, x0, asr #63
310+
; CHECK-NEXT: b.eq .LBB5_3
311+
; CHECK-NEXT: // %bb.1: // %overflow.lhs
312+
; CHECK-NEXT: cmp x3, x8
313+
; CHECK-NEXT: b.eq .LBB5_5
314+
; CHECK-NEXT: // %bb.2: // %overflow
315+
; CHECK-NEXT: asr x9, x1, #63
316+
; CHECK-NEXT: umulh x10, x0, x2
317+
; CHECK-NEXT: asr x13, x3, #63
318+
; CHECK-NEXT: mul x11, x1, x2
319+
; CHECK-NEXT: umulh x8, x1, x2
320+
; CHECK-NEXT: mul x9, x9, x2
321+
; CHECK-NEXT: adds x10, x11, x10
322+
; CHECK-NEXT: mul x14, x0, x3
323+
; CHECK-NEXT: umulh x12, x0, x3
324+
; CHECK-NEXT: adc x8, x8, x9
325+
; CHECK-NEXT: mul x13, x0, x13
326+
; CHECK-NEXT: asr x11, x8, #63
327+
; CHECK-NEXT: adds x9, x14, x10
328+
; CHECK-NEXT: mul x15, x1, x3
329+
; CHECK-NEXT: smulh x10, x1, x3
330+
; CHECK-NEXT: mov x1, x9
331+
; CHECK-NEXT: adc x9, x12, x13
332+
; CHECK-NEXT: asr x12, x9, #63
333+
; CHECK-NEXT: mul x0, x0, x2
334+
; CHECK-NEXT: adds x8, x8, x9
335+
; CHECK-NEXT: asr x9, x1, #63
336+
; CHECK-NEXT: adc x11, x11, x12
337+
; CHECK-NEXT: adds x8, x15, x8
338+
; CHECK-NEXT: adc x10, x10, x11
339+
; CHECK-NEXT: cmp x8, x9
340+
; CHECK-NEXT: ccmp x10, x9, #0, eq
341+
; CHECK-NEXT: b .LBB5_7
342+
; CHECK-NEXT: .LBB5_3: // %overflow.no.lhs
343+
; CHECK-NEXT: cmp x3, x8
344+
; CHECK-NEXT: b.eq .LBB5_10
345+
; CHECK-NEXT: // %bb.4: // %overflow.no.lhs.only
346+
; CHECK-NEXT: asr x8, x1, #63
347+
; CHECK-NEXT: asr x10, x3, #63
348+
; CHECK-NEXT: eor x9, x0, x8
349+
; CHECK-NEXT: eor x11, x1, x8
350+
; CHECK-NEXT: eor x12, x2, x10
351+
; CHECK-NEXT: subs x9, x9, x8
352+
; CHECK-NEXT: sbc x8, x11, x8
353+
; CHECK-NEXT: cmp x1, #0
354+
; CHECK-NEXT: eor x11, x3, x10
355+
; CHECK-NEXT: csel x8, x8, x1, lt
356+
; CHECK-NEXT: csel x9, x9, x0, lt
357+
; CHECK-NEXT: cset w13, lt
358+
; CHECK-NEXT: subs x12, x12, x10
359+
; CHECK-NEXT: sbc x10, x11, x10
360+
; CHECK-NEXT: cmp x3, #0
361+
; CHECK-NEXT: csel x11, x12, x2, lt
362+
; CHECK-NEXT: csel x10, x10, x3, lt
363+
; CHECK-NEXT: umulh x12, x9, x11
364+
; CHECK-NEXT: mul x15, x8, x10
365+
; CHECK-NEXT: madd x8, x8, x11, x12
366+
; CHECK-NEXT: cset w12, lt
367+
; CHECK-NEXT: mul x14, x9, x11
368+
; CHECK-NEXT: mul x11, x9, x10
369+
; CHECK-NEXT: umulh x9, x9, x10
370+
; CHECK-NEXT: eor w10, w12, w13
371+
; CHECK-NEXT: b .LBB5_6
372+
; CHECK-NEXT: .LBB5_5: // %overflow.no.rhs.only
373+
; CHECK-NEXT: asr x8, x3, #63
276374
; CHECK-NEXT: asr x10, x1, #63
277-
; CHECK-NEXT: umulh x11, x0, x2
278-
; CHECK-NEXT: asr x14, x3, #63
279-
; CHECK-NEXT: mov x8, x1
280-
; CHECK-NEXT: mul x12, x1, x2
281-
; CHECK-NEXT: umulh x9, x1, x2
282-
; CHECK-NEXT: mul x10, x10, x2
283-
; CHECK-NEXT: adds x11, x12, x11
284-
; CHECK-NEXT: mul x15, x0, x3
285-
; CHECK-NEXT: umulh x13, x0, x3
286-
; CHECK-NEXT: adc x9, x9, x10
287-
; CHECK-NEXT: mul x14, x0, x14
288-
; CHECK-NEXT: mul x16, x1, x3
289-
; CHECK-NEXT: adds x1, x15, x11
290-
; CHECK-NEXT: asr x11, x9, #63
291-
; CHECK-NEXT: smulh x8, x8, x3
292-
; CHECK-NEXT: adc x10, x13, x14
293-
; CHECK-NEXT: asr x12, x10, #63
294-
; CHECK-NEXT: adds x9, x9, x10
295-
; CHECK-NEXT: adc x10, x11, x12
296-
; CHECK-NEXT: adds x9, x16, x9
297-
; CHECK-NEXT: asr x11, x1, #63
298-
; CHECK-NEXT: adc x8, x8, x10
299-
; CHECK-NEXT: eor x8, x8, x11
300-
; CHECK-NEXT: eor x9, x9, x11
301-
; CHECK-NEXT: orr x8, x9, x8
302-
; CHECK-NEXT: cbz x8, .LBB5_2
303-
; CHECK-NEXT: // %bb.1: // %if.then
375+
; CHECK-NEXT: eor x9, x2, x8
376+
; CHECK-NEXT: eor x11, x3, x8
377+
; CHECK-NEXT: eor x12, x0, x10
378+
; CHECK-NEXT: subs x9, x9, x8
379+
; CHECK-NEXT: sbc x8, x11, x8
380+
; CHECK-NEXT: cmp x3, #0
381+
; CHECK-NEXT: eor x11, x1, x10
382+
; CHECK-NEXT: csel x8, x8, x3, lt
383+
; CHECK-NEXT: csel x9, x9, x2, lt
384+
; CHECK-NEXT: cset w13, lt
385+
; CHECK-NEXT: subs x12, x12, x10
386+
; CHECK-NEXT: sbc x10, x11, x10
387+
; CHECK-NEXT: cmp x1, #0
388+
; CHECK-NEXT: csel x11, x12, x0, lt
389+
; CHECK-NEXT: csel x10, x10, x1, lt
390+
; CHECK-NEXT: umulh x12, x9, x11
391+
; CHECK-NEXT: mul x14, x9, x11
392+
; CHECK-NEXT: mul x15, x8, x10
393+
; CHECK-NEXT: madd x8, x8, x11, x12
394+
; CHECK-NEXT: cset w12, lt
395+
; CHECK-NEXT: mul x11, x9, x10
396+
; CHECK-NEXT: umulh x9, x9, x10
397+
; CHECK-NEXT: eor w10, w13, w12
398+
; CHECK-NEXT: .LBB5_6: // %overflow.res
399+
; CHECK-NEXT: sbfx x12, x10, #0, #1
400+
; CHECK-NEXT: adds x8, x8, x11
401+
; CHECK-NEXT: adc x9, x9, x15
402+
; CHECK-NEXT: eor x13, x14, x12
403+
; CHECK-NEXT: eor x8, x8, x12
404+
; CHECK-NEXT: add x0, x13, x10
405+
; CHECK-NEXT: cmp x0, x10
406+
; CHECK-NEXT: cset w10, lo
407+
; CHECK-NEXT: cinc x1, x8, lo
408+
; CHECK-NEXT: eor x8, x9, x12
409+
; CHECK-NEXT: cmp x1, x10
410+
; CHECK-NEXT: cinc x8, x8, lo
411+
; CHECK-NEXT: cmp x8, #0
412+
; CHECK-NEXT: .LBB5_7: // %overflow.res
413+
; CHECK-NEXT: cset w8, ne
414+
; CHECK-NEXT: tbz w8, #0, .LBB5_9
415+
; CHECK-NEXT: .LBB5_8: // %if.then
304416
; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
305417
; CHECK-NEXT: .cfi_def_cfa_offset 16
306418
; CHECK-NEXT: .cfi_offset w30, -16
@@ -309,10 +421,16 @@ define i128 @test_smul_i128(i128 noundef %x, i128 noundef %y) {
309421
; CHECK-NEXT: sxtw x0, w0
310422
; CHECK-NEXT: asr x1, x0, #63
311423
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
424+
; CHECK-NEXT: .LBB5_9: // %cleanup
312425
; CHECK-NEXT: ret
313-
; CHECK-NEXT: .LBB5_2: // %if.end
426+
; CHECK-NEXT: .LBB5_10: // %overflow.no
427+
; CHECK-NEXT: umulh x8, x0, x2
428+
; CHECK-NEXT: madd x8, x0, x3, x8
314429
; CHECK-NEXT: mul x0, x0, x2
315-
; CHECK-NEXT: ret
430+
; CHECK-NEXT: madd x1, x1, x2, x8
431+
; CHECK-NEXT: mov w8, wzr
432+
; CHECK-NEXT: tbnz w8, #0, .LBB5_8
433+
; CHECK-NEXT: b .LBB5_9
316434
entry:
317435
%0 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y)
318436
%1 = extractvalue { i128, i1 } %0, 1

0 commit comments

Comments
 (0)