From fdbbd60aaf9191822cbf7622cab3459ea62a4a50 Mon Sep 17 00:00:00 2001 From: Perceval Wajsburt Date: Wed, 4 Jun 2025 23:53:35 +0200 Subject: [PATCH] Add pad_value option to refold --- README.md | 14 +++++++++++++- foldedtensor/__init__.py | 33 +++++++++++++++++++++++++++------ foldedtensor/functions.cpp | 19 ++++++++++++++++--- tests/test_folded_tensor.py | 37 +++++++++++++++++++++++++++++++++++++ 4 files changed, 93 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index d6bf614..040d8d6 100644 --- a/README.md +++ b/README.md @@ -47,11 +47,14 @@ ft = as_folded_tensor( [0, 1, 2], [3], ], + pad_value=-1, ) # FoldedTensor([[0, 1, 2], -# [3, 0, 0]]) +# [3, -1, -1]]) ``` +`pad_value` allows changing the value used to pad the nested sequences. + You can also specify names and flattened/unflattened dimensions at the time of creation: ```python @@ -87,6 +90,15 @@ print(ft.refold(("lines", "words"))) # [2, 3], # [4, 3]]) +# Use a custom value for padding when refolding +print(ft.refold(("lines", "words"), pad_value=-1)) +# FoldedTensor([[ 1, -1], +# [-1, -1], +# [-1, -1], +# [-1, -1], +# [ 2, 3], +# [ 4, 3]]) + # Refold on the words dim only: flatten everything print(ft.refold(("words",))) # FoldedTensor([1, 2, 3, 4, 3]) diff --git a/foldedtensor/__init__.py b/foldedtensor/__init__.py index 5499ab8..224a1ee 100644 --- a/foldedtensor/__init__.py +++ b/foldedtensor/__init__.py @@ -65,6 +65,7 @@ def forward( ctx, self: "FoldedTensor", dims: Tuple[int], + pad_value: Union[int, float, bool] = 0, ) -> "FoldedTensor": ctx.set_materialize_grads(False) ctx.lengths = self.lengths @@ -82,8 +83,11 @@ def forward( indexer = torch.from_numpy(np_new_indexer).to(device) ctx.output_indexer = indexer shape_suffix = data.shape[len(self.data_dims) :] - refolded_data = torch.zeros( - (*shape_prefix, *shape_suffix), dtype=data.dtype, device=device + refolded_data = torch.full( + (*shape_prefix, *shape_suffix), + pad_value, + dtype=data.dtype, + device=device, ) refolded_data.view(-1, *shape_suffix)[indexer] = data.view( -1, *shape_suffix @@ -112,7 +116,7 @@ def backward(ctx, grad_output): grad_input.view(-1, *shape_suffix)[ctx.input_indexer] = grad_output.reshape( -1, *shape_suffix ).index_select(0, ctx.output_indexer) - return grad_input, None + return grad_input, None, None type_to_dtype_dict = { @@ -148,6 +152,7 @@ def as_folded_tensor( dtype: Optional[torch.dtype] = None, lengths: Optional[List[List[int]]] = None, device: Optional[Union[str, torch.device]] = None, + pad_value: Union[int, float, bool] = 0, ): """ Converts a tensor or nested sequence into a FoldedTensor. @@ -168,6 +173,8 @@ def as_folded_tensor( must be provided. If `data` is a sequence, this argument must be `None`. device: Optional[Unit[str, torch.device]] The device of the output tensor + pad_value: Union[int, float, bool] + Value used to pad the nested sequences. Defaults to ``0``. """ if full_names is not None: if data_dims is not None: @@ -212,6 +219,7 @@ def as_folded_tensor( data, data_dims, np.dtype(dtype), + pad_value, ) indexer = torch.from_numpy(indexer) padded = torch.from_numpy(padded) @@ -396,7 +404,20 @@ def clone(self): cloned._mask = self._mask.clone() return cloned - def refold(self, *dims: Union[Sequence[Union[int, str]], int, str]): + def refold( + self, + *dims: Union[Sequence[Union[int, str]], int, str], + pad_value: Union[int, float, bool] = 0, + ): + """Change which dimensions are padded. + + Parameters + ---------- + *dims: Union[Sequence[Union[int, str]], int, str] + Dimensions to keep padded. + pad_value: Union[int, float, bool] + Value used to pad the folded dimensions. Defaults to ``0``. + """ if not isinstance(dims[0], (int, str)): assert len(dims) == 1, ( "Expected the first only argument to be a " @@ -414,10 +435,10 @@ def refold(self, *dims: Union[Sequence[Union[int, str]], int, str]): f"could not be refolded with dimensions {list(dims)}" ) - if dims == self.data_dims: + if dims == self.data_dims and pad_value == 0: return self - return Refold.apply(self, dims) + return Refold.apply(self, dims, pad_value) def reduce_foldedtensor(self: FoldedTensor): diff --git a/foldedtensor/functions.cpp b/foldedtensor/functions.cpp index 718c69b..0bbf832 100644 --- a/foldedtensor/functions.cpp +++ b/foldedtensor/functions.cpp @@ -200,7 +200,8 @@ std::tuple< nested_py_list_to_padded_np_array( const py::list &nested_list, std::vector data_dims, - py::dtype &dtype) { + py::dtype &dtype, + py::object pad_value) { // Will contain the variable lengths of the nested lists // One sequence per dimension, containing the lengths of the lists at that dimension std::vector> lengths; @@ -236,7 +237,11 @@ nested_py_list_to_padded_np_array( // Create the padded array from the shape inferred during `flatten_py_list` py::array padded_array = py::array(py::dtype(dtype), shape); - padded_array[py::make_tuple(py::ellipsis())] = 0; + if (PyArray_FillWithScalar( + reinterpret_cast(padded_array.ptr()), + pad_value.ptr()) < 0) { + throw py::error_already_set(); + } // Get the strides of the array const py::ssize_t *array_strides = padded_array.strides(); @@ -311,7 +316,15 @@ PYBIND11_MODULE(_C, m) { init_numpy(); m.def("make_refolding_indexer", &make_refolding_indexer, "Build an indexer to refold data into a different shape"); - m.def("nested_py_list_to_padded_array", &nested_py_list_to_padded_np_array, "Converts a nested Python list to a padded array"); + m.def( + "nested_py_list_to_padded_array", + &nested_py_list_to_padded_np_array, + py::arg("nested_list"), + py::arg("data_dims"), + py::arg("dtype"), + py::arg("pad_value") = 0, + "Converts a nested Python list to a padded array" + ); } #pragma clang diagnostic pop diff --git a/tests/test_folded_tensor.py b/tests/test_folded_tensor.py index a4d2e24..260aac4 100644 --- a/tests/test_folded_tensor.py +++ b/tests/test_folded_tensor.py @@ -213,6 +213,23 @@ def test_refold_lines(ft): ).all() +def test_refold_custom_pad_value(ft): + ft2 = ft.refold("lines", "words", pad_value=-1) + assert ( + ft2.data + == torch.tensor( + [ + [1, -1], + [-1, -1], + [-1, -1], + [-1, -1], + [2, 3], + [4, 3], + ] + ) + ).all() + + def test_embedding(ft): embedder = torch.nn.Embedding(10, 16) embedding = embedder(ft.refold("words")) @@ -299,6 +316,26 @@ def test_pad_embedding(): ).all() +def test_custom_pad_value(): + ft = as_folded_tensor( + [ + [0, 1, 2], + [3, 4], + ], + pad_value=-1, + dtype=torch.long, + ) + assert ( + ft.data + == torch.tensor( + [ + [0, 1, 2], + [3, 4, -1], + ] + ) + ).all() + + def test_empty_args(): ft = as_folded_tensor( [