From f288bce4b2b4205e8b40056317c7cf0525e9033b Mon Sep 17 00:00:00 2001 From: TAKAI Kousuke <62541129+t-a-k@users.noreply.github.com> Date: Sat, 26 Apr 2025 02:15:40 +0900 Subject: [PATCH 1/3] pp_{add,subtract,multiply}: use __builtin_{add,sub,mul}_overflow if available This will hopefully make the code faster and smaller, and make more cases to be handled as "simple common cases". Note that this change uses HAS_BUILTIN_{ADD,SUB,MUL}_OVERFLOW macros which have already been defined in config.h but seem not to have been used by existing code. t/op/64bitint.t: Add tests to exercise "simple common cases". Note that these tests should pass even before this change. --- inline.h | 176 ++++++++++++++++++++++++++++++++++++++++++++++ pod/perldelta.pod | 6 ++ pp.c | 84 +++------------------- pp_hot.c | 12 +--- t/op/64bitint.t | 26 +++++++ 5 files changed, 219 insertions(+), 85 deletions(-) diff --git a/inline.h b/inline.h index bd428e011ac6..cace7315547d 100644 --- a/inline.h +++ b/inline.h @@ -3405,6 +3405,182 @@ S_lossless_NV_to_IV(const NV nv, IV *ivp) return FALSE; } +/* + * S_iv_{add,sub,mul}_may_overflow(a, b, p) virtually compute "a b" + * (where is +, -, or *) in infinite precision, and, if the result + * is (or may be) not representable with IV, return true. + * Otherwise (no overflow), store the result to *p and return false. + * These functions allow false positives (so their names contain "may") + * to speed up simple common cases. + */ + +/* Define IV_*_OVERFLOW_IS_EXPENSIVE below to nonzero value + * if strict overflow checks are too expensive + * (for example, for CPUs that has no hardware overflow detection flag). + * If these macro has nonzero value, or overflow-checking compiler intrinsics + * are not available, good-old heuristics (with some false positives) + * will be used. */ +# ifndef IV_ADD_SUB_OVERFLOW_IS_EXPENSIVE +# define IV_ADD_SUB_OVERFLOW_IS_EXPENSIVE 0 +# endif +# ifndef IV_MUL_OVERFLOW_IS_EXPENSIVE +/* Strict overflow check for IV multiplication is generally expensive + * when IV is a multi-word integer. */ +# define IV_MUL_OVERFLOW_IS_EXPENSIVE (IVSIZE > LONGSIZE) +# endif + +# if defined(I_STDCKDINT) && !IV_ADD_SUB_OVERFLOW_IS_EXPENSIVE +/* XXX Preparation for upcoming C23, but I_STDCKDINT is not yet tested */ +# define S_iv_add_may_overflow(il, ir, result) ckd_add(result, il, ir) +# elif defined(HAS_BUILTIN_ADD_OVERFLOW) && !IV_ADD_SUB_OVERFLOW_IS_EXPENSIVE +# define S_iv_add_may_overflow __builtin_add_overflow +# else +PERL_STATIC_INLINE bool +S_iv_add_may_overflow (IV il, IV ir, IV *const result) +{ + /* topl and topr hold only 2 bits */ + PERL_UINT_FAST8_T const topl = ((UV)il) >> (UVSIZE * 8 - 2); + PERL_UINT_FAST8_T const topr = ((UV)ir) >> (UVSIZE * 8 - 2); + + /* if both are in a range that can't under/overflow, do a simple integer + * add: if the top of both numbers are 00 or 11, then it's safe */ + if (!( ((topl+1) | (topr+1)) & 2)) { + *result = il + ir; + return false; + } + return true; /* addition may overflow */ +} +# endif + +/* + * S_uv_{add,sub,mul}_overflow(a, b, p) are similar, but the results are UV + * and they should perform strict overflow check (no false positives). 
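+ *
+ * For instance, assuming 64-bit UVs, the intended behaviour is:
+ *
+ *     UV r;
+ *     S_uv_add_overflow(2, 3, &r)          => false, r set to 5
+ *     S_uv_add_overflow(UV_MAX, 1, &r)     => true  (2**64 does not fit)
+ *     S_uv_mul_overflow((UV)1 << 32, (UV)1 << 32, &r)
+ *                                          => true  (2**64 does not fit)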
+ */ + +# if defined(I_STDCKDINT) +/* XXX Preparation for upcoming C23, but I_STDCKDINT is not yet tested */ +# define S_uv_add_overflow(auv, buv, result) ckd_add(result, auv, buv) +# elif defined(HAS_BUILTIN_ADD_OVERFLOW) +# define S_uv_add_overflow __builtin_add_overflow +# else +PERL_STATIC_INLINE bool +S_uv_add_overflow (UV auv, UV buv, UV *const result) +{ + return (*result = auv + buv) < auv; +} +# endif + +# if defined(I_STDCKDINT) && !IV_ADD_SUB_OVERFLOW_IS_EXPENSIVE +/* XXX Preparation for upcoming C23, but I_STDCKDINT is not yet tested */ +# define S_iv_sub_may_overflow(il, ir, result) ckd_sub(result, il, ir) +# elif defined(HAS_BUILTIN_SUB_OVERFLOW) && !IV_ADD_SUB_OVERFLOW_IS_EXPENSIVE +# define S_iv_sub_may_overflow __builtin_sub_overflow +# else +PERL_STATIC_INLINE bool +S_iv_sub_may_overflow (IV il, IV ir, IV *const result) +{ + PERL_UINT_FAST8_T const topl = ((UV)il) >> (UVSIZE * 8 - 2); + PERL_UINT_FAST8_T const topr = ((UV)ir) >> (UVSIZE * 8 - 2); + + /* if both are in a range that can't under/overflow, do a simple integer + * subtract: if the top of both numbers are 00 or 11, then it's safe */ + if (!( ((topl+1) | (topr+1)) & 2)) { + *result = il - ir; + return false; + } + return true; /* subtraction may overflow */ +} +# endif + +# if defined(I_STDCKDINT) +/* XXX Preparation for upcoming C23, but I_STDCKDINT is not yet tested */ +# define S_uv_sub_overflow(auv, buv, result) ckd_sub(result, auv, buv) +# elif defined(HAS_BUILTIN_SUB_OVERFLOW) +# define S_uv_sub_overflow __builtin_sub_overflow +# else +PERL_STATIC_INLINE bool +S_uv_sub_overflow (UV auv, UV buv, UV *const result) +{ + return (*result = auv - buv) > auv; +} +# endif + +# if defined(I_STDCKDINT) && !IV_MUL_OVERFLOW_IS_EXPENSIVE +/* XXX Preparation for upcoming C23, but I_STDCKDINT is not yet tested */ +# define S_iv_mul_may_overflow(il, ir, result) ckd_mul(result, il, ir) +# elif defined(HAS_BUILTIN_MUL_OVERFLOW) && !IV_MUL_OVERFLOW_IS_EXPENSIVE +# define S_iv_mul_may_overflow __builtin_mul_overflow +# else +PERL_STATIC_INLINE bool +S_iv_mul_may_overflow (IV il, IV ir, IV *const result) +{ + UV const topl = ((UV)il) >> (UVSIZE * 4 - 1); + UV const topr = ((UV)ir) >> (UVSIZE * 4 - 1); + + /* if both are in a range that can't under/overflow, do a simple integer + * multiply: if the top halves(*) of both numbers are 00...00 or 11...11, + * then it's safe. + * (*) for 32-bits, the "top half" is the top 17 bits, + * for 64-bits, its 33 bits */ + if (!( + ((topl+1) | (topr+1)) + & ( (((UV)1) << (UVSIZE * 4 + 1)) - 2) /* 11..110 */ + )) { + *result = il * ir; + return false; + } + return true; /* multiplication may overflow */ +} +# endif + +# if defined(I_STDCKDINT) +/* XXX Preparation for upcoming C23, but I_STDCKDINT is not yet tested */ +# define S_uv_mul_overflow(auv, buv, result) ckd_mul(result, auv, buv) +# elif defined(HAS_BUILTIN_MUL_OVERFLOW) +# define S_uv_mul_overflow __builtin_mul_overflow +# else +PERL_STATIC_INLINE bool +S_uv_mul_overflow (UV auv, UV buv, UV *const result) +{ + const UV topmask = (~ (UV)0) << (4 * sizeof (UV)); + const UV botmask = ~topmask; + +# if UVSIZE > LONGSIZE && UVSIZE <= 2 * LONGSIZE + unsigned long alow, ahigh, blow, bhigh; +# else + UV alow, ahigh, blow, bhigh; +# endif + + /* If this does sign extension on unsigned it's time for plan B */ + ahigh = auv >> (4 * sizeof (UV)); + alow = auv & botmask; + bhigh = buv >> (4 * sizeof (UV)); + blow = buv & botmask; + + if (ahigh && bhigh) + /* eg 32 bit is at least 0x10000 * 0x10000 == 0x100000000 + which is overflow. 
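+           In general, with H = UVSIZE * 4 (half the bits of a UV) and
+           auv == ahigh * 2**H + alow (likewise for buv):
+               auv * buv == ahigh * bhigh * 2**(2*H)
+                            + (ahigh * blow + alow * bhigh) * 2**H
+                            + alow * blow
+           and the first term alone is >= 2**(2*H) > UV_MAX whenever
+           both high halves are nonzero.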
*/ + return true; + + UV product_middle = 0; + if (ahigh || bhigh) { + /* One operand is large, 1 small */ + /* Either ahigh or bhigh is zero here, so the addition below + can't overflow. */ + product_middle = ahigh * blow + alow * bhigh; + if (product_middle & topmask) + return true; + /* OK, product_middle won't lose bits when we shift it. */ + product_middle <<= 4 * sizeof (UV); + } + /* else: eg 32 bit is at most 0xFFFF * 0xFFFF == 0xFFFE0001 + so the unsigned multiply cannot overflow. */ + + UV product_low = alow * blow; + return S_uv_add_overflow(product_middle, product_low, result); +} +# endif + #endif /* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */ diff --git a/pod/perldelta.pod b/pod/perldelta.pod index 495b14061d77..b0bbf9433623 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -89,6 +89,12 @@ There may well be none in a stable release. =item * +Simple (non-overflowing) addition (C<+>), subtraction (C<->) and +multiplication (C<*>) of IVs are slightly sped up, as long as +sufficient underlying C compiler support is available. + +=item * + XXX =back diff --git a/pp.c b/pp.c index 189b2ddbdd34..1e39cbf437b2 100644 --- a/pp.c +++ b/pp.c @@ -1336,23 +1336,12 @@ PP(pp_multiply) U32 flags = (svl->sv_flags & svr->sv_flags); if (flags & SVf_IOK) { /* both args are simple IVs */ - UV topl, topr; + IV result; il = SvIVX(svl); ir = SvIVX(svr); do_iv: - topl = ((UV)il) >> (UVSIZE * 4 - 1); - topr = ((UV)ir) >> (UVSIZE * 4 - 1); - - /* if both are in a range that can't under/overflow, do a - * simple integer multiply: if the top halves(*) of both numbers - * are 00...00 or 11...11, then it's safe. - * (*) for 32-bits, the "top half" is the top 17 bits, - * for 64-bits, its 33 bits */ - if (!( - ((topl+1) | (topr+1)) - & ( (((UV)1) << (UVSIZE * 4 + 1)) - 2) /* 11..110 */ - )) { - TARGi(il * ir, 0); /* args not GMG, so can't be tainted */ + if (!S_iv_mul_may_overflow(il, ir, &result)) { + TARGi(result, 0); /* args not GMG, so can't be tainted */ goto ret; } goto generic; @@ -1388,12 +1377,9 @@ PP(pp_multiply) if (SvIV_please_nomg(svl)) { bool auvok = SvUOK(svl); bool buvok = SvUOK(svr); - const UV topmask = (~ (UV)0) << (4 * sizeof (UV)); - const UV botmask = ~((~ (UV)0) << (4 * sizeof (UV))); UV alow; - UV ahigh; UV blow; - UV bhigh; + UV product; if (auvok) { alow = SvUVX(svl); @@ -1420,19 +1406,7 @@ PP(pp_multiply) } } - /* If this does sign extension on unsigned it's time for plan B */ - ahigh = alow >> (4 * sizeof (UV)); - alow &= botmask; - bhigh = blow >> (4 * sizeof (UV)); - blow &= botmask; - if (ahigh && bhigh) { - NOOP; - /* eg 32 bit is at least 0x10000 * 0x10000 == 0x100000000 - which is overflow. Drop to NVs below. */ - } else if (!ahigh && !bhigh) { - /* eg 32 bit is at most 0xFFFF * 0xFFFF == 0xFFFE0001 - so the unsigned multiply cannot overflow. */ - const UV product = alow * blow; + if (!S_uv_mul_overflow(alow, blow, &product)) { if (auvok == buvok) { /* -ve * -ve or +ve * +ve gives a +ve result. */ TARGu(product, 1); @@ -1442,42 +1416,6 @@ PP(pp_multiply) TARGi(NEGATE_2IV(product), 1); goto ret; } /* else drop to NVs below. */ - } else { - /* One operand is large, 1 small */ - UV product_middle; - if (bhigh) { - /* swap the operands */ - ahigh = bhigh; - bhigh = blow; /* bhigh now the temp var for the swap */ - blow = alow; - alow = bhigh; - } - /* now, ((ahigh * blow) << half_UV_len) + (alow * blow) - multiplies can't overflow. shift can, add can, -ve can. 
*/ - product_middle = ahigh * blow; - if (!(product_middle & topmask)) { - /* OK, (ahigh * blow) won't lose bits when we shift it. */ - UV product_low; - product_middle <<= (4 * sizeof (UV)); - product_low = alow * blow; - - /* as for pp_add, UV + something mustn't get smaller. - IIRC ANSI mandates this wrapping *behaviour* for - unsigned whatever the actual representation*/ - product_low += product_middle; - if (product_low >= product_middle) { - /* didn't overflow */ - if (auvok == buvok) { - /* -ve * -ve or +ve * +ve gives a +ve result. */ - TARGu(product_low, 1); - goto ret; - } else if (product_low <= ABS_IV_MIN) { - /* -ve result, which could overflow an IV */ - TARGi(NEGATE_2IV(product_low), 1); - goto ret; - } /* else drop to NVs below. */ - } - } /* product_middle too large */ } /* ahigh && bhigh */ } /* SvIOK(svl) */ } /* SvIOK(svr) */ @@ -1929,18 +1867,12 @@ PP(pp_subtract) U32 flags = (svl->sv_flags & svr->sv_flags); if (flags & SVf_IOK) { /* both args are simple IVs */ - UV topl, topr; + IV result; il = SvIVX(svl); ir = SvIVX(svr); do_iv: - topl = ((UV)il) >> (UVSIZE * 8 - 2); - topr = ((UV)ir) >> (UVSIZE * 8 - 2); - - /* if both are in a range that can't under/overflow, do a - * simple integer subtract: if the top of both numbers - * are 00 or 11, then it's safe */ - if (!( ((topl+1) | (topr+1)) & 2)) { - TARGi(il - ir, 0); /* args not GMG, so can't be tainted */ + if (!S_iv_sub_may_overflow(il, ir, &result)) { + TARGi(result, 0); /* args not GMG, so can't be tainted */ goto ret; } goto generic; diff --git a/pp_hot.c b/pp_hot.c index 3f6cd29611bc..b0d5861c4f2f 100644 --- a/pp_hot.c +++ b/pp_hot.c @@ -1827,18 +1827,12 @@ PP(pp_add) U32 flags = (svl->sv_flags & svr->sv_flags); if (flags & SVf_IOK) { /* both args are simple IVs */ - UV topl, topr; + IV result; il = SvIVX(svl); ir = SvIVX(svr); do_iv: - topl = ((UV)il) >> (UVSIZE * 8 - 2); - topr = ((UV)ir) >> (UVSIZE * 8 - 2); - - /* if both are in a range that can't under/overflow, do a - * simple integer add: if the top of both numbers - * are 00 or 11, then it's safe */ - if (!( ((topl+1) | (topr+1)) & 2)) { - TARGi(il + ir, 0); /* args not GMG, so can't be tainted */ + if (!S_iv_add_may_overflow(il, ir, &result)) { + TARGi(result, 0); /* args not GMG, so can't be tainted */ goto ret; } goto generic; diff --git a/t/op/64bitint.t b/t/op/64bitint.t index dcaa94b7bbab..bbf52f92bdf6 100644 --- a/t/op/64bitint.t +++ b/t/op/64bitint.t @@ -469,4 +469,30 @@ cmp_ok 0x3ffffffffffffffe % -0xc000000000000000, '==', -0x8000000000000002, 'mo cmp_ok 0x3fffffffffffffff % -0xc000000000000000, '==', -0x8000000000000001, 'modulo is (IV_MIN-1)'; cmp_ok 0x4000000000000000 % -0xc000000000000000, '==', -0x8000000000000000, 'modulo is IV_MIN'; +# Arithmetic close to IV overflow + +# These had been handled in generic (slower) code, but now in fast path +# (as "simple common case"). Either way, these tests should pass. 
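+# For example, sums and differences that land exactly on IV_MAX or IV_MIN:
+cmp_ok 9223372036854775806 + 1, '==', 9223372036854775807, 'sum is exactly IV_MAX';
+cmp_ok -9223372036854775807 - 1, '==', -0x8000000000000000, 'difference is exactly IV_MIN';
+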
+$q = 9223372036854775800; +cmp_ok 5 + $q, '==', 9223372036854775805, "5 + $q"; +cmp_ok $q - -5, '==', 9223372036854775805, "$q - -5"; +$q = 1111111111111111111; +cmp_ok $q * 5, '==', 5555555555555555555, "$q * 5"; + +# IV IV -> UV/NV promotion + +$q = 7777777777777777777; +$r = 2222222222222222223; +# Note 10000000000000000000 can be represented accurately in both +# IEEE double (binary64; 0x1.158e460913dp+63) and decimal format (1e+19) +cmp_ok $q + $r, '==', 10000000000000000000, 'IV + IV promotes to UV'; +cmp_ok -$q + -$r, '==', -10000000000000000000, 'IV + IV promotes to NV'; +cmp_ok $q - -$r, '==', 10000000000000000000, 'IV - IV promotes to UV'; +cmp_ok -$q - $r, '==', -10000000000000000000, 'IV - IV promotes to NV'; +$q = 3000000000; +$r = 4000000000; +cmp_ok $q * $r, '==', 12000000000000000000, 'IV * IV promotes to UV'; +cmp_ok $q * -$r, '==', -12000000000000000000, 'IV * IV promotes to UV then NV'; +cmp_ok +($q * 2) * $r, '==', 24000000000000000000, 'IV * IV promotes to NV'; + done_testing(); From 4f3d212d416088ba0352529c24a0b0e3c7cd66c9 Mon Sep 17 00:00:00 2001 From: TAKAI Kousuke <62541129+t-a-k@users.noreply.github.com> Date: Thu, 31 Jul 2025 17:11:46 +0900 Subject: [PATCH 2/3] S_uv_mul_overflow in inline.h: Add casts to make multiplications done in UV If C compiler doesn't know __builtin_mul_overflow, S_uv_mul_overflow will be implemented with fallback "long multiplication" algorithm, but it had a bug that elemental multiplications were done in unsigned long precision instead of UV precision. It will lead wrong result when unsigned long is narrower than UV (for example -Duse64bitint on 32-bit platform). --- inline.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/inline.h b/inline.h index cace7315547d..4b1fccbf1d3a 100644 --- a/inline.h +++ b/inline.h @@ -3546,6 +3546,8 @@ S_uv_mul_overflow (UV auv, UV buv, UV *const result) const UV botmask = ~topmask; # if UVSIZE > LONGSIZE && UVSIZE <= 2 * LONGSIZE + /* If UV is double-word integer, declare these variables as single-word + integers to help compiler to avoid double-word multiplication. */ unsigned long alow, ahigh, blow, bhigh; # else UV alow, ahigh, blow, bhigh; @@ -3567,7 +3569,7 @@ S_uv_mul_overflow (UV auv, UV buv, UV *const result) /* One operand is large, 1 small */ /* Either ahigh or bhigh is zero here, so the addition below can't overflow. */ - product_middle = ahigh * blow + alow * bhigh; + product_middle = (UV)ahigh * blow + (UV)alow * bhigh; if (product_middle & topmask) return true; /* OK, product_middle won't lose bits when we shift it. */ @@ -3576,7 +3578,7 @@ S_uv_mul_overflow (UV auv, UV buv, UV *const result) /* else: eg 32 bit is at most 0xFFFF * 0xFFFF == 0xFFFE0001 so the unsigned multiply cannot overflow. 
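+       Even so, the (UV) cast below still matters: if UV is wider than
+       unsigned long, an uncast alow * blow would be evaluated in unsigned
+       long precision and truncated before being widened to UV.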
*/ - UV product_low = alow * blow; + UV product_low = (UV)alow * blow; return S_uv_add_overflow(product_middle, product_low, result); } # endif From 1e6ea20e26b84d313d555b6f7d4f0b3ee08da640 Mon Sep 17 00:00:00 2001 From: TAKAI Kousuke <62541129+t-a-k@users.noreply.github.com> Date: Wed, 6 Aug 2025 02:13:09 +0900 Subject: [PATCH 3/3] inline.h: Comments fixed and added (intended to be squashed before merge) --- inline.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/inline.h b/inline.h index 4b1fccbf1d3a..104a44adc596 100644 --- a/inline.h +++ b/inline.h @@ -3416,8 +3416,8 @@ S_lossless_NV_to_IV(const NV nv, IV *ivp) /* Define IV_*_OVERFLOW_IS_EXPENSIVE below to nonzero value * if strict overflow checks are too expensive - * (for example, for CPUs that has no hardware overflow detection flag). - * If these macro has nonzero value, or overflow-checking compiler intrinsics + * (for example, for CPUs that have no hardware overflow detection flags). + * If these macros have nonzero value, or overflow-checking compiler intrinsics * are not available, good-old heuristics (with some false positives) * will be used. */ # ifndef IV_ADD_SUB_OVERFLOW_IS_EXPENSIVE @@ -3466,6 +3466,10 @@ S_iv_add_may_overflow (IV il, IV ir, IV *const result) PERL_STATIC_INLINE bool S_uv_add_overflow (UV auv, UV buv, UV *const result) { + /* (auv + buv) < auv means that the addition wrapped around, + i.e. overflowed. Note that unsigned integer overflow is well-defined + in standard C to wrap around, in constrast to signed integer overflow + whose behaviour is undefined. */ return (*result = auv + buv) < auv; } # endif
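
As a self-contained illustration of the check described in the comment above
(plain C, no perl headers; uint64_t stands in for a 64-bit UV, and
uv_add_overflow mirrors the patch's S_uv_add_overflow), the wrap-around test
agrees with __builtin_add_overflow wherever that intrinsic is available:

    #include <stdio.h>
    #include <stdbool.h>
    #include <stdint.h>

    typedef uint64_t UV;    /* stand-in for Perl's UV on a 64-bit build */

    #ifndef __has_builtin
    # define __has_builtin(x) 0
    #endif

    static bool uv_add_overflow(UV a, UV b, UV *r)
    {
    #if __has_builtin(__builtin_add_overflow)
        return __builtin_add_overflow(a, b, r);
    #else
        /* Unsigned overflow wraps around, so a sum smaller than either
           operand means the true result did not fit in a UV. */
        return (*r = a + b) < a;
    #endif
    }

    int main(void)
    {
        UV r;
        printf("%d\n", (int)uv_add_overflow(2, 3, &r));          /* 0, r == 5 */
        printf("%d\n", (int)uv_add_overflow(UINT64_MAX, 1, &r)); /* 1: 2**64 wraps to 0 */
        return 0;
    }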