feat(byte_array): add slice

Gilad Chase · Gilad Chase · commit d4f7adeb755f · 2025-09-14T14:06:30.000+03:00
1. When a non-trivial slice is under 31 bytes, it is held entirely in the remainder word.
Rationale: there are enough cases where this is essential for it to be the default for all cases.
For example, for byte-arrays with empty data arrays and spans from short-strings (to be implemented)
it is essential, and for slices inside the data array that don't end on a word boundary, moving
the slice into the remainder simplifies handling.

2. For slices over 31 bytes that end inside the remainder word or end at a data-word boundary
   (meaning the last byte is at index 30 of one of the cells of `data`), we slice the data
   accordingly and adjust the start offset.

3. For slices over 31 bytes that end in the data span before the word boundary (meaning they end at
   an index k < 30), we pop the last word from the data span and move it into the remainder word,
   which creates a similar representation as in (2) above.

Invariants: In all slices above, the following hold:
- The data array always contains full bytes31 words (like ByteArray), with the exception of
  the first word, which can have a start-offset (the remainder of the offset is masked in `into`
  before passing into ByteArray).
- The optional remainder word always starts at the word boundary: in other words, the remainder word
  never has a start-offset, only a negative offset (`remainder_word_len`). This simplifies the logic
  and is a more consistent representation of a slice.

Possible optimizations:
- Perform the slice_bytes31 calls lazily in `into` instead of in `slice`: this improves slices at
  the cost of `into`, which is worth it performance-wise. However, this makes the logic more
  complicated and the byte-span representation less strict. For example, we can relax the
  restriction on remainder-word not having start-offset for <= 31 sized words, and only slice it
  at `into`.
- Optimize slice_bytes31: split both ends at once instead of calling split_bytes31 twice, without
  having to duplicate `split_bytes31` logic.
diff --git a/corelib/src/byte_array.cairo b/corelib/src/byte_array.cairo
@@ -55,6 +55,7 @@ use crate::cmp::min;
 use crate::integer::{U32TryIntoNonZero, u128_safe_divmod};
 #[feature("bounded-int-utils")]
 use crate::internal::bounded_int::{BoundedInt, downcast, upcast};
+use crate::num::traits::CheckedAdd;
 #[allow(unused_imports)]
 use crate::serde::Serde;
 use crate::traits::{Into, TryInto};
@@ -623,6 +624,57 @@ pub impl ByteArraySpanImpl of ByteSpanTrait {
     fn is_empty(self: @ByteSpan) -> bool {
         self.len() == 0
     }
+
+    /// Returns the `len` bytes starting at `start`; `None` if a non-empty range exceeds the span.
+    fn slice(self: @ByteSpan, start: usize, len: usize) -> Option<ByteSpan> {
+        if len == 0 {
+            return Some(Default::default());
+        }
+        if start.checked_add(len)? > self.len() {
+            return None;
+        }
+
+        let abs_start = start.checked_add(upcast(*self.first_char_start_offset))?;
+        let (start_word, start_offset) = DivRem::div_rem(abs_start, BYTES_IN_BYTES31_NONZERO);
+        let (end_word, end_offset) = DivRem::div_rem(
+            abs_start.checked_add(len)?, BYTES_IN_BYTES31_NONZERO,
+        );
+        let data_len = self.data.len();
+        let remainder_len = upcast(*self.remainder_len);
+
+        // Slice contained in one word: extract it and hold it entirely in the remainder word.
+        if start_word == end_word {
+            let word = if start_word < data_len {
+                slice_bytes31((*self.data[start_word]).into(), BYTES_IN_BYTES31, start_offset, len)
+            } else {
+                slice_bytes31(*self.remainder_word, remainder_len, start_offset, len)
+            };
+            return Some(
+                ByteSpan {
+                    data: [].span(),
+                    first_char_start_offset: 0,
+                    remainder_word: word,
+                    remainder_len: downcast(len).unwrap(),
+                },
+            );
+        }
+
+        // Multi-word slice: covered data words, plus any trailing partial word as the remainder.
+        let remainder = if end_word < data_len {
+            slice_bytes31((*self.data[end_word]).into(), BYTES_IN_BYTES31, 0, end_offset)
+        } else {
+            slice_bytes31(*self.remainder_word, remainder_len, 0, end_offset)
+        };
+
+        Some(
+            ByteSpan {
+                data: self.data.slice(start_word, min(end_word, data_len) - start_word),
+                first_char_start_offset: downcast(start_offset).unwrap(),
+                remainder_word: remainder,
+                remainder_len: downcast(end_offset).unwrap(),
+            },
+        )
+    }
 }
 
 impl ByteSpanDefault of Default<ByteSpan> {
@@ -675,3 +727,24 @@ impl ByteSpanIntoByteArray of Into<ByteSpan, ByteArray> {
         ba
     }
 }
+
+/// Extracts `len` bytes starting at byte `start` from a word holding `word_len` bytes.
+/// Returns bytes [start, start+len) where byte 0 is the leftmost (most significant) byte.
+/// The input `bytes31` and the output `bytes31` are both represented as `felt252`s to improve
+/// performance. A zero `len` yields 0.
+///
+/// Note: this function assumes that:
+/// 1. `word` is validly convertible to a `bytes31` which has no more than `word_len` bytes of data.
+/// 2. `start + len <= word_len`.
+/// 3. `word_len <= BYTES_IN_BYTES31`.
+/// If these assumptions are not met, the result can be corrupted, so this function must stay
+/// private. Masking/assertions could be added, but they would make it more expensive.
+fn slice_bytes31(word: felt252, word_len: usize, start: usize, len: usize) -> felt252 {
+    if len == 0 {
+        return 0;
+    }
+    // Drop the suffix (keeping bytes [0, start+len)), then drop the `start`-byte prefix.
+    let (_, without_suffix) = split_bytes31(word, word_len, word_len - (start + len));
+    let (without_prefix_and_suffix, _) = split_bytes31(without_suffix, start + len, len);
+    without_prefix_and_suffix
+}
diff --git a/corelib/src/test/byte_array_test.cairo b/corelib/src/test/byte_array_test.cairo
@@ -1,4 +1,5 @@
 use crate::byte_array::{ByteSpanTrait, ToByteSpanTrait};
+use crate::num::traits::Bounded;
 use crate::test::test_utils::{assert_eq, assert_ne};
 
 #[test]
@@ -516,25 +517,28 @@ fn test_span_len() {
     assert_eq!(span.len(), 1);
     assert!(!span.is_empty());
 
+    let ba_31: ByteArray = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcde";
+    let span = ba_31.span();
+    assert_eq!(span.len(), 31, "wrong span len");
+    assert!(!span.is_empty());
+
     // Test empty.
     let empty_ba: ByteArray = "";
     let empty_span = empty_ba.span();
     assert_eq!(empty_span.len(), 0);
     assert!(empty_span.is_empty());
 
-    // TODO(giladchase): Add start-offset using slice once supported.
     // First word in the array, second in last word.
     let two_byte31: ByteArray = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefg";
-    let mut single_span = two_byte31.span();
-    assert_eq!(single_span.len(), 33, "len error with start offset");
+    let mut single_span = two_byte31.span().slice(1, 32).unwrap();
+    assert_eq!(single_span.len(), 32, "len error with start offset");
     assert!(!single_span.is_empty());
 
-    // TODO(giladchase): Add start-offset using slice once supported.
     // First word in the array, second in the array, third in last word.
     let three_bytes31: ByteArray =
         "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789#$"; // 64 chars.
-    let mut three_span = three_bytes31.span();
-    assert_eq!(three_span.len(), 64, "len error with size-3 bytearray");
+    let mut three_span = three_bytes31.span().slice(1, 63).unwrap();
+    assert_eq!(three_span.len(), 63, "len error with size-3 bytearray");
     assert!(!three_span.is_empty());
 }
 
@@ -549,6 +553,101 @@ fn test_span_copy() {
     assert_eq!(ba, span.into());
 }
 
+#[test]
+fn test_span_slice_empty() {
+    let ba: ByteArray = "hello";
+    let span = ba.span();
+
+    let empty = span.slice(2, 0).unwrap();
+    assert_eq!(empty.len(), 0);
+    assert!(empty.is_empty());
+
+    let empty_string: ByteArray = "";
+    assert_eq!(empty_string, empty.into());
+}
+
+// TODO(giladchase): replace assert+is_none with assert_eq when we have PartialEq.
+#[test]
+fn test_span_slice_out_of_bounds() {
+    let ba: ByteArray = "hello";
+    let span = ba.span();
+
+    assert!(span.slice(3, 5).is_none(), "end out of bounds");
+    assert!(span.slice(6, 1).is_none(), "start out of bounds");
+
+    assert!(
+        span.slice(1, 3).unwrap().slice(Bounded::<usize>::MAX, 1).is_none(),
+        "start offset overflow",
+    );
+    assert!(span.slice(Bounded::<usize>::MAX, 1).is_none());
+    assert!(span.slice(1, Bounded::<usize>::MAX).is_none());
+
+    let empty_string: ByteArray = "";
+    assert!(empty_string.span().slice(0, 2).is_none(), "empty slice is sliceable");
+}
+
+#[test]
+fn test_span_slice_under_31_bytes() {
+    // Word entirely in remainder word.
+    let ba: ByteArray = "abcde";
+    let span = ba.span();
+
+    let mut slice: ByteArray = span.slice(0, 3).unwrap().into();
+    assert_eq!(slice, "abc", "first 3 bytes");
+
+    slice = span.slice(2, 2).unwrap().into();
+    assert_eq!(slice, "cd", "middle 2 bytes");
+
+    slice = span.slice(4, 1).unwrap().into();
+    assert_eq!(slice, "e", "last byte");
+}
+
+#[test]
+fn test_span_slice_exactly_31_bytes() {
+    // One full data word and an empty remainder word.
+    let ba_31: ByteArray = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcde"; // 31 bytes
+    let span31 = ba_31.span();
+    assert_eq!(span31.len(), 31);
+
+    let ba: ByteArray = span31.slice(0, 31).unwrap().into();
+    assert_eq!(ba, ba_31);
+
+    // Sanity check: bytes 10 and 11 are the first two bytes of the partial slice below.
+    assert_eq!(ba_31.at(10), Some('K'));
+    assert_eq!(ba_31.at(11), Some('L'));
+
+    // Partial slice strictly inside the single data word.
+    let ba: ByteArray = span31.slice(10, 10).unwrap().into();
+    assert_eq!(ba, "KLMNOPQRST", "middle 10 bytes");
+}
+
+#[test]
+fn test_span_slice_positions() {
+    // Two full bytes31 + remainder with 2 bytes.
+    let ba: ByteArray =
+        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789#$"; // 64 bytes
+    let span = ba.span();
+
+    // Slice from middle of first word to middle of second word.
+    let short_slice_across_data_words = span.slice(10, 30).unwrap();
+    let mut ba_from_span: ByteArray = short_slice_across_data_words.into();
+    assert_eq!(ba_from_span, "KLMNOPQRSTUVWXYZabcdefghijklmn", "multi-word short slice failed");
+
+    // Longer slice that also starts in the first data word and ends in the second.
+    let long_slice_across_data_words = span.slice(5, 50).unwrap();
+    ba_from_span = long_slice_across_data_words.into();
+    assert_eq!(
+        ba_from_span,
+        "FGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz012",
+        "multi-word long slice failed",
+    );
+
+    // Slice from the end of the first word into the second (tail kept in the remainder word).
+    let short_slice_into_remainder_word = span.slice(29, 20).unwrap();
+    ba_from_span = short_slice_into_remainder_word.into();
+    assert_eq!(ba_from_span, "defghijklmnopqrstuvw", "short slice into remainder word failed");
+}
+
 #[test]
 fn test_span_into_bytearray() {
     let empty_ba: ByteArray = "";
@@ -561,5 +660,8 @@ fn test_span_into_bytearray() {
     // Data word and remainder.
     let large_ba: ByteArray = "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVW"; // 40 bytes
     assert_eq!(large_ba.span().into(), large_ba);
-    // TODO(giladchase): test with slice.
+
+    // Test sliced span with offset.
+    let sliced: ByteArray = large_ba.span().slice(10, 25).unwrap().into();
+    assert_eq!(sliced, ":;<=>?@ABCDEFGHIJKLMNOPQR");
 }