Skip to content

Commit d4f7ade

Browse files
author
Gilad Chase
committed
feat(byte_array): add slice
1. When a non-trivial slice is under 31 bytes, it is held entirely in the remainder word. Rationale: there are enough cases where this is essential for it to be the default for all cases. For example, for byte-arrays with empty data arrays and spans from short-strings (to be implemented) it is essential, and for slices inside the data array that don't end at a word boundary, moving the slice into the remainder simplifies handling.
2. For slices over 31 bytes that end inside the remainder word or end at a data-word boundary (meaning the last byte is at index 30 of one of the cells of `data`), we slice the data accordingly and adjust the start offset.
3. For slices over 31 bytes that end in the data span before the word boundary (meaning they end at an index k < 30), we pop the last word from the data span and move it into the remainder word, which creates a representation similar to the one in (2) above.

Invariants: in all slices above, the following hold:
- The data array always contains full bytes31 words (like ByteArray), with the exception of the first word, which can have a start-offset (the remainder of the offset is masked in `into` before passing into ByteArray).
- The optional remainder word always starts at the word boundary: in other words, the remainder word never has a start-offset, only a negative offset (`remainder_word_len`). This simplifies the logic and is a more consistent representation of a slice.

Possible optimizations:
- Perform the slice_bytes31 calls lazily in `into` instead of in `slice`: this improves slices at the cost of `into`, which is worth it performance-wise. However, this makes the logic more complicated and the byte-span representation less strict. For example, we can relax the restriction on the remainder word not having a start-offset for <= 31 sized words, and only slice it at `into`.
- Optimize slice_bytes31: split both ends at once instead of calling split_bytes31 twice, without having to duplicate `split_bytes31` logic.
1 parent 7b0a77a commit d4f7ade

File tree

2 files changed

+182
-7
lines changed

2 files changed

+182
-7
lines changed

corelib/src/byte_array.cairo

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ use crate::cmp::min;
5555
use crate::integer::{U32TryIntoNonZero, u128_safe_divmod};
5656
#[feature("bounded-int-utils")]
5757
use crate::internal::bounded_int::{BoundedInt, downcast, upcast};
58+
use crate::num::traits::CheckedAdd;
5859
#[allow(unused_imports)]
5960
use crate::serde::Serde;
6061
use crate::traits::{Into, TryInto};
@@ -623,6 +624,57 @@ pub impl ByteArraySpanImpl of ByteSpanTrait {
623624
fn is_empty(self: @ByteSpan) -> bool {
624625
self.len() == 0
625626
}
627+
628+
/// Returns a slice of the ByteSpan from the given start position with the given length.
///
/// Returns `None` if `start + len` overflows or exceeds `self.len()`, or if the absolute
/// start/end positions (after adding `first_char_start_offset`) overflow.
/// A zero `len` returns the default span before `start` is bounds-checked.
///
/// When the start position and the exclusive end position fall in the same word, the
/// result holds all of its bytes in `remainder_word`, with empty `data` and a zero start
/// offset. Otherwise the result keeps the covered `data` words, the start offset inside the
/// first of them, and the leading `end_offset` bytes of the end word as its remainder.
629+
fn slice(self: @ByteSpan, start: usize, len: usize) -> Option<ByteSpan> {
630+
if len == 0 {
631+
return Some(Default::default());
632+
}
633+
if start.checked_add(len)? > self.len() {
634+
return None;
635+
}
636+
637+
// Shift by this span's own start offset so positions are relative to the first data word.
let abs_start = start.checked_add(upcast(*self.first_char_start_offset))?;
638+
let (start_word, start_offset) = DivRem::div_rem(abs_start, BYTES_IN_BYTES31_NONZERO);
639+
// `end_word`/`end_offset` locate the exclusive end position of the slice.
let (end_word, end_offset) = DivRem::div_rem(
640+
abs_start.checked_add(len)?, BYTES_IN_BYTES31_NONZERO,
641+
);
642+
let data_len = self.data.len();
643+
let remainder_len = upcast(*self.remainder_len);
644+
645+
// Single word slice - extract from that word only.
// Start and end share a word, so `len` is smaller than a full word; the `downcast(len)`
// below presumably cannot fail - confirm against the bounds of `remainder_len`.
646+
if start_word == end_word {
647+
// Word indices below `data_len` refer to `data`; otherwise the remainder word is used.
let word = if start_word < data_len {
648+
slice_bytes31((*self.data[start_word]).into(), BYTES_IN_BYTES31, start_offset, len)
649+
} else {
650+
slice_bytes31(*self.remainder_word, remainder_len, start_offset, len)
651+
};
652+
return Some(
653+
ByteSpan {
654+
data: [].span(),
655+
first_char_start_offset: 0,
656+
remainder_word: word,
657+
remainder_len: downcast(len).unwrap(),
658+
},
659+
);
660+
}
661+
662+
// Multi-word slice - data words plus optional remainder.
// The new remainder is the first `end_offset` bytes of the word containing the end position.
663+
let remainder = if end_word < data_len {
664+
slice_bytes31((*self.data[end_word]).into(), BYTES_IN_BYTES31, 0, end_offset)
665+
} else {
666+
slice_bytes31(*self.remainder_word, remainder_len, 0, end_offset)
667+
};
668+
669+
Some(
670+
ByteSpan {
671+
// NOTE(review): after the length check above `end_word` looks bounded by `data_len`,
// so `min` may be purely defensive - confirm against `len()`.
data: self.data.slice(start_word, min(end_word, data_len) - start_word),
672+
first_char_start_offset: downcast(start_offset).unwrap(),
673+
remainder_word: remainder,
674+
remainder_len: downcast(end_offset).unwrap(),
675+
},
676+
)
677+
}
626678
}
627679

628680
impl ByteSpanDefault of Default<ByteSpan> {
@@ -675,3 +727,24 @@ impl ByteSpanIntoByteArray of Into<ByteSpan, ByteArray> {
675727
ba
676728
}
677729
}
730+
731+
/// Extracts a slice of bytes from a word.
732+
/// Returns bytes [start, start+len) where byte 0 is the leftmost (most significant) byte.
733+
/// The input `bytes31` and the output `bytes31` are represented using `felt252`s to improve
734+
/// performance.
735+
///
/// Returns `0` when `len` is zero.
///
736+
/// Note: this function assumes that:
737+
/// 1. `word` is validly convertible to a `bytes31` which has no more than `word_len` bytes of data.
738+
/// 2. `start + len <= word_len`.
739+
/// 3. `word_len <= BYTES_IN_BYTES31`.
740+
/// If these assumptions are not met, it can corrupt the result. Thus, this should be a
741+
/// private function. We could add masking/assertions but it would be more expensive.
742+
fn slice_bytes31(word: felt252, word_len: usize, start: usize, len: usize) -> felt252 {
743+
// An empty slice needs no splitting.
if len == 0 {
744+
return 0;
745+
}
746+
// Remove suffix: keep only bytes [0, start+len).
747+
let (_, without_suffix) = split_bytes31(word, word_len, word_len - (start + len));
748+
// Remove prefix: of the remaining `start + len` bytes, keep only the `len` requested ones.
let (without_prefix_and_suffix, _) = split_bytes31(without_suffix, start + len, len);
749+
without_prefix_and_suffix
750+
}

corelib/src/test/byte_array_test.cairo

Lines changed: 109 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
use crate::byte_array::{ByteSpanTrait, ToByteSpanTrait};
2+
use crate::num::traits::Bounded;
23
use crate::test::test_utils::{assert_eq, assert_ne};
34

45
#[test]
@@ -516,25 +517,28 @@ fn test_span_len() {
516517
assert_eq!(span.len(), 1);
517518
assert!(!span.is_empty());
518519

520+
let ba_31: ByteArray = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcde";
521+
let span = ba_31.span();
522+
assert_eq!(span.len(), 31, "wrong span len");
523+
assert!(!span.is_empty());
524+
519525
// Test empty.
520526
let empty_ba: ByteArray = "";
521527
let empty_span = empty_ba.span();
522528
assert_eq!(empty_span.len(), 0);
523529
assert!(empty_span.is_empty());
524530

525-
// TODO(giladchase): Add start-offset using slice once supported.
526531
// First word in the array, second in last word.
527532
let two_byte31: ByteArray = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefg";
528-
let mut single_span = two_byte31.span();
529-
assert_eq!(single_span.len(), 33, "len error with start offset");
533+
let mut single_span = two_byte31.span().slice(1, 32).unwrap();
534+
assert_eq!(single_span.len(), 32, "len error with start offset");
530535
assert!(!single_span.is_empty());
531536

532-
// TODO(giladchase): Add start-offset using slice once supported.
533537
// First word in the array, second in the array, third in last word.
534538
let three_bytes31: ByteArray =
535539
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789#$"; // 64 chars.
536-
let mut three_span = three_bytes31.span();
537-
assert_eq!(three_span.len(), 64, "len error with size-3 bytearray");
540+
let mut three_span = three_bytes31.span().slice(1, 63).unwrap();
541+
assert_eq!(three_span.len(), 63, "len error with size-3 bytearray");
538542
assert!(!three_span.is_empty());
539543
}
540544

@@ -549,6 +553,101 @@ fn test_span_copy() {
549553
assert_eq!(ba, span.into());
550554
}
551555

556+
// A zero-length slice yields an empty span that converts into an empty ByteArray.
#[test]
557+
fn test_span_slice_empty() {
558+
let ba: ByteArray = "hello";
559+
let span = ba.span();
560+
561+
let empty = span.slice(2, 0).unwrap();
562+
assert_eq!(empty.len(), 0);
563+
assert!(empty.is_empty());
564+
565+
let empty_string: ByteArray = "";
566+
assert_eq!(empty_string, empty.into());
567+
}
568+
569+
// Out-of-range and overflowing `start`/`len` combinations must yield `None`.
// TODO(giladchase): replace assert+is_none with assert_eq when we have PartialEq.
570+
#[test]
571+
fn test_span_slice_out_of_bounds() {
572+
let ba: ByteArray = "hello";
573+
let span = ba.span();
574+
575+
assert!(span.slice(3, 5).is_none(), "end out of bounds");
576+
assert!(span.slice(6, 1).is_none(), "start out of bounds");
577+
578+
// Re-slicing a span that already has a start offset with the maximal `start`.
assert!(
579+
span.slice(1, 3).unwrap().slice(Bounded::<usize>::MAX, 1).is_none(),
580+
"start offset overflow",
581+
);
582+
assert!(span.slice(Bounded::<usize>::MAX, 1).is_none());
583+
assert!(span.slice(1, Bounded::<usize>::MAX).is_none());
584+
585+
// A non-empty slice of an empty span is out of bounds.
let empty_string: ByteArray = "";
586+
assert!(empty_string.span().slice(0, 2).is_none(), "empty slice is sliceable");
587+
}
588+
589+
// Slices taken from the start, middle, and end of a 5-byte ByteArray.
#[test]
590+
fn test_span_slice_under_31_bytes() {
591+
// Word entirely in remainder word.
592+
let ba: ByteArray = "abcde";
593+
let span = ba.span();
594+
595+
let mut slice: ByteArray = span.slice(0, 3).unwrap().into();
596+
assert_eq!(slice, "abc", "first 3 bytes");
597+
598+
slice = span.slice(2, 2).unwrap().into();
599+
assert_eq!(slice, "cd", "middle 2 bytes");
600+
601+
slice = span.slice(4, 1).unwrap().into();
602+
assert_eq!(slice, "e", "last byte");
603+
}
604+
605+
// Full-length and partial slices of a ByteArray that is exactly 31 bytes long.
#[test]
606+
fn test_span_slice_exactly_31_bytes() {
607+
// 1 full data word, empty last_word.
608+
let ba_31: ByteArray = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcde"; // 31 bytes
609+
let span31 = ba_31.span();
610+
assert_eq!(span31.len(), 31);
611+
612+
// Slicing the whole span round-trips to the original ByteArray.
let ba: ByteArray = span31.slice(0, 31).unwrap().into();
613+
assert_eq!(ba, ba_31);
614+
615+
// Sanity-check the source bytes at positions 10 and 11 before slicing from position 10.
616+
assert_eq!(ba_31.at(10), Some('K'));
617+
assert_eq!(ba_31.at(11), Some('L'));
618+
619+
// Partial slice
620+
let ba: ByteArray = span31.slice(10, 10).unwrap().into();
621+
assert_eq!(ba, "KLMNOPQRST", "middle 10 bytes");
622+
}
623+
624+
// Slices of a 64-byte ByteArray that start and end at different positions across words.
#[test]
625+
fn test_span_slice_positions() {
626+
// Two full bytes31 + remainder with 2 bytes.
627+
let ba: ByteArray =
628+
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789#$"; // 64 bytes
629+
let span = ba.span();
630+
631+
// Slice from middle of first word to middle of second word.
632+
let short_slice_across_data_words = span.slice(10, 30).unwrap();
633+
let mut ba_from_span: ByteArray = short_slice_across_data_words.into();
634+
assert_eq!(ba_from_span, "KLMNOPQRSTUVWXYZabcdefghijklmn", "multi-word short slice failed");
635+
636+
// Slice spanning multiple words.
637+
let long_slice_across_data_words = span.slice(5, 50).unwrap();
638+
ba_from_span = long_slice_across_data_words.into();
639+
assert_eq!(
640+
ba_from_span,
641+
"FGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz012",
642+
"multi-word long slice failed",
643+
);
644+
645+
// Slice from the end of the first word into the second word (bytes 29..49).
// NOTE(review): the source remainder word (bytes 62..64) is not reached here, although the
// variable name and failure message suggest it was meant to be covered - confirm.
646+
let short_slice_into_remainder_word = span.slice(29, 20).unwrap();
647+
ba_from_span = short_slice_into_remainder_word.into();
648+
assert_eq!(ba_from_span, "defghijklmnopqrstuvw", "short slice into remainder word failed");
649+
}
650+
552651
#[test]
553652
fn test_span_into_bytearray() {
554653
let empty_ba: ByteArray = "";
@@ -561,5 +660,8 @@ fn test_span_into_bytearray() {
561660
// Data word and remainder.
562661
let large_ba: ByteArray = "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVW"; // 40 bytes
563662
assert_eq!(large_ba.span().into(), large_ba);
564-
// TODO(giladchase): test with slice.
663+
664+
// Test sliced span with offset.
665+
let sliced: ByteArray = large_ba.span().slice(10, 25).unwrap().into();
666+
assert_eq!(sliced, ":;<=>?@ABCDEFGHIJKLMNOPQR");
565667
}

0 commit comments

Comments
 (0)