diff --git a/pyproject.toml b/pyproject.toml index 4ea1edecb..41d85260f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -77,7 +77,7 @@ classifiers = [ "Topic :: Utilities" ] dependencies = [ - "awkward>=2.4.6", + "awkward>=2.8.2", "cramjam>=2.5.0", "xxhash", "numpy", diff --git a/src/uproot/_dask.py b/src/uproot/_dask.py index f097864f9..dd0a4c8b5 100644 --- a/src/uproot/_dask.py +++ b/src/uproot/_dask.py @@ -10,7 +10,7 @@ from uproot.source.chunk import SourcePerformanceCounters try: - from typing import TYPE_CHECKING, Final + from typing import TYPE_CHECKING, Final, NamedTuple from typing_extensions import Any, Protocol, TypeVar except ImportError: @@ -19,7 +19,7 @@ import numpy import uproot -from uproot._util import no_filter, unset +from uproot._util import get_ttree_form, no_filter, unset from uproot.behaviors.RNTuple import HasFields from uproot.behaviors.RNTuple import ( _regularize_step_size as _RNTuple_regularize_step_size, @@ -972,13 +972,91 @@ def load_buffers( return container +class FormMappingInfoWithVirtualArrays(TrivialFormMappingInfo): + def buffer_replacements( + self, + tree: HasBranches, + keys: frozenset[str], + start: int, + stop: int, + decompression_executor, + interpretation_executor, + options: Any, + ) -> Mapping[str, AwkArray]: + awkward = uproot.extras.awkward() + + def generator(tree, buffer_key): + form_key, attribute = self.parse_buffer_key(buffer_key) + key = self._form_key_to_key[form_key] + branch = tree[key] + + def _generator(): + array = branch.array( + entry_start=start, + entry_stop=stop, + interpretation_executor=interpretation_executor, + decompression_executor=decompression_executor, + library="ak", + ak_add_doc=options.get("ak_add_doc"), + ) + + # add to access_log + access_log = options.get("access_log") + if access_log is not None: + if not hasattr(access_log, "__iadd__"): + raise ValueError( + f"{access_log=} needs to implement '__iadd__'." 
+ ) + else: + access_log += [Accessed(branch=key, buffer_key=buffer_key)] + + # Convert the sub-array into buffers + ttree_subform, _, ttree_container = awkward.to_buffers(array) + + # Load the associated projection subform + projection_subform = self._form.content(key) + + # Correlate each TTree form key with the projection form key + for (src, src_dtype), (dst, dst_dtype) in zip( + ttree_subform.expected_from_buffers().items(), + projection_subform.expected_from_buffers(self.buffer_key).items(), + ): + # Return the corresponding array from the TTree if buffer key matches + if buffer_key == dst: + if src_dtype != dst_dtype: + raise TypeError( + f"Data type mismatch: {src_dtype} != {dst_dtype}" + ) + return ttree_container[src] + + # Raise an error if the buffer key is not found + raise ValueError( + f"Buffer key {buffer_key} not found in form {self._form}" + ) + + return _generator + + container = {} + for buffer_key, _ in self._form.expected_from_buffers().items(): + container[buffer_key] = generator(tree, buffer_key) + + return container + + class TrivialFormMapping(ImplementsFormMapping): def __call__(self, form: Form) -> tuple[Form, TrivialFormMappingInfo]: - dask_awkward = uproot.extras.dask_awkward() - new_form = dask_awkward.lib.utils.form_with_unique_keys(form, "") + awkward = uproot.extras.awkward() + new_form = awkward.forms.form_with_unique_keys(form, ("",)) return new_form, TrivialFormMappingInfo(new_form) +class FormMappingWithVirtualArrays(ImplementsFormMapping): + def __call__(self, form: Form) -> tuple[Form, FormMappingInfoWithVirtualArrays]: + awkward = uproot.extras.awkward() + new_form = awkward.forms.form_with_unique_keys(form, ("",)) + return new_form, FormMappingInfoWithVirtualArrays(new_form) + + T = TypeVar("T") @@ -1039,7 +1117,18 @@ def read_tree( # but not two of the keys required for buffer B if all(k in self.common_keys for k in keys_for_buffer): container[buffer_key] = mapping[buffer_key] - # Otherwise, introduce a placeholder + # if 
the form mapping info provides a replacements, use it + elif hasattr(self.form_mapping_info, "buffer_replacements"): + container[buffer_key] = self.form_mapping_info.buffer_replacements( + tree, + keys_for_buffer, + start, + stop, + self.decompression_executor, + self.interpretation_executor, + self.interp_options, + )[buffer_key] + # Otherwise, introduce a placeholder (default replacement) else: container[buffer_key] = awkward.typetracer.PlaceholderArray( nplike=nplike, @@ -1397,43 +1486,6 @@ def project_keys(self: T, keys: frozenset[str]) -> T: ) -def _get_ttree_form( - awkward, - ttree, - common_keys, - ak_add_doc, -): - contents = [] - for key in common_keys: - branch = ttree[key] - content_form = branch.interpretation.awkward_form(ttree.file) - content_parameters = {} - if isinstance(ak_add_doc, bool): - if ak_add_doc: - content_parameters["__doc__"] = branch.title - elif isinstance(ak_add_doc, dict): - content_parameters.update( - { - key: branch.__getattribute__(value) - for key, value in ak_add_doc.items() - } - ) - if len(content_parameters.keys()) != 0: - content_form = content_form.copy(parameters=content_parameters) - contents.append(content_form) - - if isinstance(ak_add_doc, bool): - parameters = {"__doc__": ttree.title} if ak_add_doc else None - elif isinstance(ak_add_doc, dict): - parameters = ( - {"__doc__": ttree.title} if "__doc__" in ak_add_doc.keys() else None - ) - else: - parameters = None - - return awkward.forms.RecordForm(contents, common_keys, parameters=parameters) - - def _get_dak_array( files, filter_name, @@ -1595,8 +1647,8 @@ def real_filter_branch(branch): if isinstance(ttrees[0], HasFields): base_form, _ = ttrees[0].to_akform(filter_name=common_keys) else: - base_form = _get_ttree_form( - awkward, ttrees[0], common_keys, interp_options.get("ak_add_doc") + base_form = get_ttree_form( + ttrees[0], common_keys, interp_options.get("ak_add_doc") ) if len(partition_args) == 0: @@ -1604,9 +1656,7 @@ def real_filter_branch(branch): 
partition_args.append((0, 0, 0)) if form_mapping is None: - expected_form = dask_awkward.lib.utils.form_with_unique_keys( - base_form, "" - ) + expected_form = awkward.forms.form_with_unique_keys(base_form, ("",)) form_mapping_info = TrivialFormMappingInfo(expected_form) else: expected_form, form_mapping_info = form_mapping(base_form) @@ -1673,9 +1723,7 @@ def _get_dak_array_delay_open( full_paths=full_paths, ignore_duplicates=True, ) - base_form = _get_ttree_form( - awkward, obj, common_keys, interp_options.get("ak_add_doc") - ) + base_form = get_ttree_form(obj, common_keys, interp_options.get("ak_add_doc")) divisions = [0] partition_args = [] @@ -1711,9 +1759,7 @@ def _get_dak_array_delay_open( ) if form_mapping is None: - expected_form = dask_awkward.lib.utils.form_with_unique_keys( - base_form, "" - ) + expected_form = awkward.forms.form_with_unique_keys(base_form, ("",)) form_mapping_info = TrivialFormMappingInfo(expected_form) else: expected_form, form_mapping_info = form_mapping(base_form) @@ -1738,3 +1784,8 @@ def _get_dak_array_delay_open( divisions=None if divisions is None else tuple(divisions), label="from-uproot", ) + + +class Accessed(NamedTuple): + branch: str + buffer_key: str diff --git a/src/uproot/_util.py b/src/uproot/_util.py index 9e73cb7b4..37804d9d6 100644 --- a/src/uproot/_util.py +++ b/src/uproot/_util.py @@ -691,6 +691,45 @@ def awkward_form_of_iter(awkward, form): raise RuntimeError(f"unrecognized form: {type(form)}") +def get_ttree_form( + ttree, + common_keys, + ak_add_doc, +): + import uproot + + awkward = uproot.extras.awkward() + contents = [] + for key in common_keys: + branch = ttree[key] + content_form = branch.interpretation.awkward_form(ttree.file) + content_parameters = {} + if isinstance(ak_add_doc, bool): + if ak_add_doc: + content_parameters["__doc__"] = branch.title + elif isinstance(ak_add_doc, dict): + content_parameters.update( + { + key: branch.__getattribute__(value) + for key, value in ak_add_doc.items() + } + ) + if 
len(content_parameters.keys()) != 0: + content_form = content_form.copy(parameters=content_parameters) + contents.append(content_form) + + if isinstance(ak_add_doc, bool): + parameters = {"__doc__": ttree.title} if ak_add_doc else None + elif isinstance(ak_add_doc, dict): + parameters = ( + {"__doc__": ttree.title} if "__doc__" in ak_add_doc.keys() else None + ) + else: + parameters = None + + return awkward.forms.RecordForm(contents, common_keys, parameters=parameters) + + def damerau_levenshtein(a, b, ratio=False): """ Calculates the Damerau-Levenshtein distance of two strings. diff --git a/src/uproot/behaviors/TBranch.py b/src/uproot/behaviors/TBranch.py index d3fb71018..9385a7256 100644 --- a/src/uproot/behaviors/TBranch.py +++ b/src/uproot/behaviors/TBranch.py @@ -25,7 +25,7 @@ import uproot.interpretation.grouped import uproot.language.python import uproot.source.chunk -from uproot._util import no_filter +from uproot._util import get_ttree_form, no_filter np_uint8 = numpy.dtype("u1") @@ -728,6 +728,143 @@ def show( stream.write(formatter.format(name, typename, interp).rstrip(" ") + "\n") + def virtual_arrays( + self, + *, + filter_name=no_filter, + filter_typename=no_filter, + filter_branch=no_filter, + aliases=None, + recursive=True, + full_paths=True, + ignore_duplicates=False, + language=uproot.language.python.python_language, + entry_start=None, + entry_stop=None, + decompression_executor=None, + interpretation_executor=None, + array_cache="inherit", + ak_add_doc=False, + access_log=None, + ): + """ + Args: + filter_name (None, glob string, regex string in ``"/pattern/i"`` syntax, function of str \u2192 bool, or iterable of the above): A + filter to select ``TBranches`` by name. + filter_typename (None, glob string, regex string in ``"/pattern/i"`` syntax, function of str \u2192 bool, or iterable of the above): A + filter to select ``TBranches`` by type. 
+ filter_branch (None or function of :doc:`uproot.behaviors.TBranch.TBranch` \u2192 bool, :doc:`uproot.interpretation.Interpretation`, or None): A + filter to select ``TBranches`` using the full + :doc:`uproot.behaviors.TBranch.TBranch` object. If the function + returns False or None, the ``TBranch`` is excluded; if the function + returns True, it is included with its standard + :ref:`uproot.behaviors.TBranch.TBranch.interpretation`; if an + :doc:`uproot.interpretation.Interpretation`, this interpretation + overrules the standard one. + aliases (None or dict of str \u2192 str): Mathematical expressions that + can be used in ``expressions`` or other aliases (without cycles). + Uses the ``language`` engine to evaluate. If None, only the + :ref:`uproot.behaviors.TBranch.TBranch.aliases` are available. + recursive (bool): If True, descend into any nested subbranches. + If False, only return the names of branches directly accessible + under this object. + full_paths (bool): If True, include the full path to each subbranch + with slashes (``/``); otherwise, use the descendant's name as + the output name. + ignore_duplicates (bool): If True, return a set of the keys; otherwise, return the full list of keys. + language (:doc:`uproot.language.Language`): Language used to interpret + the ``expressions`` and ``aliases``. + entry_start (None or int): The first entry to include. If None, start + at zero. If negative, count from the end, like a Python slice. + entry_stop (None or int): The first entry to exclude (i.e. one greater + than the last entry to include). If None, stop at + :ref:`uproot.behaviors.TTree.TTree.num_entries`. If negative, + count from the end, like a Python slice. + decompression_executor (None or Executor with a ``submit`` method): The + executor that is used to decompress ``TBaskets``; if None, the + file's :ref:`uproot.reading.ReadOnlyFile.decompression_executor` + is used. 
+ interpretation_executor (None or Executor with a ``submit`` method): The + executor that is used to interpret uncompressed ``TBasket`` data as + arrays; if None, the file's :ref:`uproot.reading.ReadOnlyFile.interpretation_executor` + is used. + array_cache ("inherit", None, MutableMapping, or memory size): Cache of arrays; + if "inherit", use the file's cache; if None, do not use a cache; + if a memory size, create a new cache of this size. + ak_add_doc (bool | dict): If True, add the TBranch ``title`` + to the Awkward ``__doc__`` parameter of the array. + If a dict ``{key: value}``, add the TBranch attribute named ``value`` to the + Awkward ``key`` parameter of the array. + access_log (None or object with a ``__iadd__`` method): If an access_log is + provided, e.g. a list, every materialization of a virtual array buffer is + added to it (via ``+=``) as an ``Accessed(branch, buffer_key)`` record. + + + Returns a group of virtual arrays from the ``TTree``. This method can only return Awkward Arrays. + + For example: + + .. code-block:: python + + >>> my_tree.virtual_arrays() + + >>> access_log = [] + >>> array = my_tree.virtual_arrays(access_log=access_log) + >>> ak.materialize(array.Jet_pt) + >>> print(access_log) + [Accessed(branch='Jet_pt', buffer_key='.Jet_pt-offsets'), Accessed(branch='Jet_pt', buffer_key='.Jet_pt.content-data')] + + + See also :ref:`uproot.behaviors.TBranch.HasBranches.arrays` to read the + same branches as ordinary (non-virtual) arrays. 
+ """ + from uproot._dask import FormMappingWithVirtualArrays + + awkward = uproot.extras.awkward() + + entry_start, entry_stop = _regularize_entries_start_stop( + self.num_entries, entry_start, entry_stop + ) + decompression_executor, interpretation_executor = _regularize_executors( + decompression_executor, interpretation_executor, self._file + ) + array_cache = _regularize_array_cache(array_cache, self._file) + + keys = self.keys( + filter_name=filter_name, + filter_typename=filter_typename, + filter_branch=filter_branch, + recursive=recursive, + full_paths=full_paths, + ignore_duplicates=ignore_duplicates, + ) + + base_form = get_ttree_form( + self, + keys, + ak_add_doc, + ) + + expected_form, form_mapping_info = FormMappingWithVirtualArrays()(base_form) + + # The buffer replacements of FormMappingInfoWithVirtualArrays are VirtualArrays + container = form_mapping_info.buffer_replacements( + self, + keys, + entry_start, + entry_stop, + decompression_executor, + interpretation_executor, + {"ak_add_doc": ak_add_doc, "access_log": access_log}, + ) + return awkward.from_buffers( + expected_form, + entry_stop - entry_start, + container, + behavior=form_mapping_info.behavior, + buffer_key=form_mapping_info.buffer_key, + ) + def arrays( self, expressions=None,