Skip to content

feat: add tree to virtual array conversion #1393

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 26 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
ca5d453
add tree to virtual array conversion
pfackeldey Feb 26, 2025
5ee61eb
Merge branch 'main' into pfackeldey/uproot_lazy
ianna Apr 1, 2025
9b57eba
Merge branch 'main' into pfackeldey/uproot_lazy
ianna Apr 11, 2025
96df355
chore: update pre-commit hooks (#1418)
pre-commit-ci[bot] Apr 14, 2025
b9fbbe9
ci: pin Chrome version for Pyodide tests (#1422)
ariostas Apr 18, 2025
92bef17
chore: update pre-commit hooks (#1423)
pre-commit-ci[bot] Apr 21, 2025
f48de18
fix: issue with empty big_endian array (#1420)
pfackeldey Apr 22, 2025
0ce4d86
fix: safer branch title access (#1421)
pfackeldey Apr 22, 2025
4ddc7c5
docs: add contributing guide (#1425)
ianna Apr 22, 2025
40e596d
improve virtual array loading
pfackeldey Apr 23, 2025
7e178b9
style: pre-commit fixes
pre-commit-ci[bot] Apr 23, 2025
3ee4b52
Merge branch 'main' into pfackeldey/uproot_lazy
pfackeldey Apr 23, 2025
bc79a48
add 'access_log' kwarg
pfackeldey Apr 23, 2025
4e20125
style: pre-commit fixes
pre-commit-ci[bot] Apr 23, 2025
664550a
don't re-define 'Accessed'
pfackeldey Apr 23, 2025
01d7238
add doc string
pfackeldey Apr 24, 2025
fe275cd
minor doc string fixes
pfackeldey Apr 24, 2025
56eebf6
Merge branch 'main' into pfackeldey/uproot_lazy
ianna Apr 25, 2025
4993678
use form_with_unique_keys from awkward
pfackeldey Apr 25, 2025
4402665
rm accidentally committed file...
pfackeldey Apr 29, 2025
1d925ba
refactor virtual buffer loading to better work with form mappings
pfackeldey Apr 29, 2025
92df7c6
require awkward v2.8.2
pfackeldey May 7, 2025
8929615
Merge branch 'main' into pfackeldey/uproot_lazy
pfackeldey May 7, 2025
f8a3461
Merge branch 'main' into pfackeldey/uproot_lazy
pfackeldey Jul 3, 2025
f9788f7
Merge branch 'main' into pfackeldey/uproot_lazy
pfackeldey Jul 29, 2025
6ec4589
Merge branch 'main' into pfackeldey/uproot_lazy
pfackeldey Aug 4, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ classifiers = [
"Topic :: Utilities"
]
dependencies = [
"awkward>=2.4.6",
"awkward>=2.8.2",
"cramjam>=2.5.0",
"xxhash",
"numpy",
Expand Down
157 changes: 104 additions & 53 deletions src/uproot/_dask.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from uproot.source.chunk import SourcePerformanceCounters

try:
from typing import TYPE_CHECKING, Final
from typing import TYPE_CHECKING, Final, NamedTuple

from typing_extensions import Any, Protocol, TypeVar
except ImportError:
Expand All @@ -19,7 +19,7 @@
import numpy

import uproot
from uproot._util import no_filter, unset
from uproot._util import get_ttree_form, no_filter, unset
from uproot.behaviors.RNTuple import HasFields
from uproot.behaviors.RNTuple import (
_regularize_step_size as _RNTuple_regularize_step_size,
Expand Down Expand Up @@ -972,13 +972,91 @@ def load_buffers(
return container


class FormMappingInfoWithVirtualArrays(TrivialFormMappingInfo):
    """
    Form-mapping info whose buffer replacements are deferred branch reads.

    For every buffer key expected by the stored form,
    :meth:`buffer_replacements` provides a zero-argument callable. The owning
    branch is only read from ``tree`` when that callable is invoked.
    """

    def buffer_replacements(
        self,
        tree: HasBranches,
        keys: frozenset[str],
        start: int,
        stop: int,
        decompression_executor,
        interpretation_executor,
        options: Any,
    ) -> Mapping[str, AwkArray]:
        """
        Args:
            tree: Object whose branches are looked up by key (``tree[key]``).
            keys: Not used by this implementation; accepted so that the call
                signature stays the same for callers.
            start: Passed to ``branch.array`` as ``entry_start``.
            stop: Passed to ``branch.array`` as ``entry_stop``.
            decompression_executor: Forwarded to ``branch.array``.
            interpretation_executor: Forwarded to ``branch.array``.
            options: Mapping of interpretation options. The entries
                ``"ak_add_doc"`` and ``"access_log"`` are read from it.

        Returns a dict with one entry per buffer key of the stored form. Each
        value is a zero-argument callable that reads the branch and returns
        the buffer belonging to that key.

        The callables raise ``ValueError`` if ``options["access_log"]`` is
        given but has no ``__iadd__``, or if the buffer key cannot be found in
        the projected form, and ``TypeError`` if the dtype read from the tree
        differs from the dtype expected by the form.
        """
        awkward = uproot.extras.awkward()

        def generator(tree, buffer_key):
            # Resolve the branch eagerly; only the read itself is deferred.
            form_key, _ = self.parse_buffer_key(buffer_key)
            key = self._form_key_to_key[form_key]
            branch = tree[key]

            def _generator():
                # Validate the access log before doing any I/O so that a
                # misconfigured log fails without reading the branch first.
                access_log = options.get("access_log")
                if access_log is not None and not hasattr(access_log, "__iadd__"):
                    raise ValueError(f"{access_log=} needs to implement '__iadd__'.")

                # Every invocation reads the branch for [start, stop), even
                # though only a single buffer of it is returned below.
                array = branch.array(
                    entry_start=start,
                    entry_stop=stop,
                    interpretation_executor=interpretation_executor,
                    decompression_executor=decompression_executor,
                    library="ak",
                    ak_add_doc=options.get("ak_add_doc"),
                )

                # Record the access only after the read has succeeded.
                if access_log is not None:
                    access_log += [Accessed(branch=key, buffer_key=buffer_key)]

                # Convert the sub-array into buffers
                ttree_subform, _, ttree_container = awkward.to_buffers(array)

                # Load the associated projection subform
                projection_subform = self._form.content(key)

                # Correlate each TTree form key with the projection form key;
                # the two listings are paired up positionally.
                for (src, src_dtype), (dst, dst_dtype) in zip(
                    ttree_subform.expected_from_buffers().items(),
                    projection_subform.expected_from_buffers(self.buffer_key).items(),
                ):
                    # Return the corresponding array from the TTree if buffer key matches
                    if buffer_key == dst:
                        if src_dtype != dst_dtype:
                            raise TypeError(
                                f"Data type mismatch: {src_dtype} != {dst_dtype}"
                            )
                        return ttree_container[src]

                # Raise an error if the buffer key is not found
                raise ValueError(
                    f"Buffer key {buffer_key} not found in form {self._form}"
                )

            return _generator

        # Only the keys of the expected-buffers mapping are needed here.
        return {
            buffer_key: generator(tree, buffer_key)
            for buffer_key in self._form.expected_from_buffers()
        }


class TrivialFormMapping(ImplementsFormMapping):
def __call__(self, form: Form) -> tuple[Form, TrivialFormMappingInfo]:
dask_awkward = uproot.extras.dask_awkward()
new_form = dask_awkward.lib.utils.form_with_unique_keys(form, "<root>")
awkward = uproot.extras.awkward()
new_form = awkward.forms.form_with_unique_keys(form, ("<root>",))
return new_form, TrivialFormMappingInfo(new_form)


class FormMappingWithVirtualArrays(ImplementsFormMapping):
    """
    Form mapping that pairs a form with a
    :class:`FormMappingInfoWithVirtualArrays`.
    """

    def __call__(self, form: Form) -> tuple[Form, FormMappingInfoWithVirtualArrays]:
        """
        Args:
            form: The form to pass through
                ``awkward.forms.form_with_unique_keys``.

        Returns the form produced by ``form_with_unique_keys(form, ("<root>",))``
        together with a :class:`FormMappingInfoWithVirtualArrays` built from
        that same form.
        """
        forms = uproot.extras.awkward().forms
        keyed_form = forms.form_with_unique_keys(form, ("<root>",))
        mapping_info = FormMappingInfoWithVirtualArrays(keyed_form)
        return keyed_form, mapping_info


T = TypeVar("T")


Expand Down Expand Up @@ -1039,7 +1117,18 @@ def read_tree(
# but not two of the keys required for buffer B
if all(k in self.common_keys for k in keys_for_buffer):
container[buffer_key] = mapping[buffer_key]
# Otherwise, introduce a placeholder
# If the form mapping info provides buffer replacements, use them
elif hasattr(self.form_mapping_info, "buffer_replacements"):
container[buffer_key] = self.form_mapping_info.buffer_replacements(
tree,
keys_for_buffer,
start,
stop,
self.decompression_executor,
self.interpretation_executor,
self.interp_options,
)[buffer_key]
# Otherwise, introduce a placeholder (default replacement)
else:
container[buffer_key] = awkward.typetracer.PlaceholderArray(
nplike=nplike,
Expand Down Expand Up @@ -1397,43 +1486,6 @@ def project_keys(self: T, keys: frozenset[str]) -> T:
)


def _get_ttree_form(
awkward,
ttree,
common_keys,
ak_add_doc,
):
contents = []
for key in common_keys:
branch = ttree[key]
content_form = branch.interpretation.awkward_form(ttree.file)
content_parameters = {}
if isinstance(ak_add_doc, bool):
if ak_add_doc:
content_parameters["__doc__"] = branch.title
elif isinstance(ak_add_doc, dict):
content_parameters.update(
{
key: branch.__getattribute__(value)
for key, value in ak_add_doc.items()
}
)
if len(content_parameters.keys()) != 0:
content_form = content_form.copy(parameters=content_parameters)
contents.append(content_form)

if isinstance(ak_add_doc, bool):
parameters = {"__doc__": ttree.title} if ak_add_doc else None
elif isinstance(ak_add_doc, dict):
parameters = (
{"__doc__": ttree.title} if "__doc__" in ak_add_doc.keys() else None
)
else:
parameters = None

return awkward.forms.RecordForm(contents, common_keys, parameters=parameters)


def _get_dak_array(
files,
filter_name,
Expand Down Expand Up @@ -1595,18 +1647,16 @@ def real_filter_branch(branch):
if isinstance(ttrees[0], HasFields):
base_form, _ = ttrees[0].to_akform(filter_name=common_keys)
else:
base_form = _get_ttree_form(
awkward, ttrees[0], common_keys, interp_options.get("ak_add_doc")
base_form = get_ttree_form(
ttrees[0], common_keys, interp_options.get("ak_add_doc")
)

if len(partition_args) == 0:
divisions.append(0)
partition_args.append((0, 0, 0))

if form_mapping is None:
expected_form = dask_awkward.lib.utils.form_with_unique_keys(
base_form, "<root>"
)
expected_form = awkward.forms.form_with_unique_keys(base_form, ("<root>",))
form_mapping_info = TrivialFormMappingInfo(expected_form)
else:
expected_form, form_mapping_info = form_mapping(base_form)
Expand Down Expand Up @@ -1673,9 +1723,7 @@ def _get_dak_array_delay_open(
full_paths=full_paths,
ignore_duplicates=True,
)
base_form = _get_ttree_form(
awkward, obj, common_keys, interp_options.get("ak_add_doc")
)
base_form = get_ttree_form(obj, common_keys, interp_options.get("ak_add_doc"))

divisions = [0]
partition_args = []
Expand Down Expand Up @@ -1711,9 +1759,7 @@ def _get_dak_array_delay_open(
)

if form_mapping is None:
expected_form = dask_awkward.lib.utils.form_with_unique_keys(
base_form, "<root>"
)
expected_form = awkward.forms.form_with_unique_keys(base_form, ("<root>",))
form_mapping_info = TrivialFormMappingInfo(expected_form)
else:
expected_form, form_mapping_info = form_mapping(base_form)
Expand All @@ -1738,3 +1784,8 @@ def _get_dak_array_delay_open(
divisions=None if divisions is None else tuple(divisions),
label="from-uproot",
)


class Accessed(NamedTuple):
    """
    Entry appended to an ``access_log`` when a virtual buffer is loaded.
    """

    # Key of the branch that was read to produce the buffer.
    branch: str
    # Key of the buffer that was requested.
    buffer_key: str
39 changes: 39 additions & 0 deletions src/uproot/_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -691,6 +691,45 @@ def awkward_form_of_iter(awkward, form):
raise RuntimeError(f"unrecognized form: {type(form)}")


def get_ttree_form(
    ttree,
    common_keys,
    ak_add_doc,
):
    """
    Build an Awkward ``RecordForm`` with one field per key in ``common_keys``.

    Args:
        ttree: Object providing ``ttree[key]`` branch lookup as well as the
            ``file`` and ``title`` attributes.
        common_keys: Branch keys; they become the record's field names, in
            the given order.
        ak_add_doc (bool, dict or anything else): If True, each content form
            gets a ``"__doc__"`` parameter holding the branch title. If a
            dict, each item maps a parameter name to the name of the branch
            attribute whose value is stored under it. Any other value adds no
            parameters.

    The record itself gets ``{"__doc__": ttree.title}`` as parameters when
    ``ak_add_doc`` is True or is a dict containing the key ``"__doc__"``;
    otherwise its parameters are None.
    """
    # Imported here rather than at module level, as in the original block.
    import uproot

    awkward = uproot.extras.awkward()

    contents = []
    for key in common_keys:
        branch = ttree[key]
        content_form = branch.interpretation.awkward_form(ttree.file)

        if isinstance(ak_add_doc, bool):
            content_parameters = {"__doc__": branch.title} if ak_add_doc else {}
        elif isinstance(ak_add_doc, dict):
            # getattr (rather than calling __getattribute__ directly) also
            # honours a __getattr__ fallback on the branch object.
            content_parameters = {
                parameter: getattr(branch, attribute)
                for parameter, attribute in ak_add_doc.items()
            }
        else:
            content_parameters = {}

        if content_parameters:
            # NOTE(review): only the new parameters are handed to copy();
            # presumably this replaces parameters already present on the
            # interpretation's form -- confirm that this is intended.
            content_form = content_form.copy(parameters=content_parameters)
        contents.append(content_form)

    if isinstance(ak_add_doc, bool):
        add_tree_doc = ak_add_doc
    elif isinstance(ak_add_doc, dict):
        add_tree_doc = "__doc__" in ak_add_doc
    else:
        add_tree_doc = False
    parameters = {"__doc__": ttree.title} if add_tree_doc else None

    return awkward.forms.RecordForm(contents, common_keys, parameters=parameters)


def damerau_levenshtein(a, b, ratio=False):
"""
Calculates the Damerau-Levenshtein distance of two strings.
Expand Down
Loading
Loading