From ca5d453c8715518329455b353a2242365aece165 Mon Sep 17 00:00:00 2001 From: pfackeldey Date: Wed, 26 Feb 2025 15:34:48 -0500 Subject: [PATCH 01/18] add tree to virtual array conversion --- src/uproot/_dask.py | 94 ++++++++++++++++++++++++++++++--- src/uproot/behaviors/TBranch.py | 74 ++++++++++++++++++++++++++ 2 files changed, 160 insertions(+), 8 deletions(-) diff --git a/src/uproot/_dask.py b/src/uproot/_dask.py index a33abb23f..e810d135f 100644 --- a/src/uproot/_dask.py +++ b/src/uproot/_dask.py @@ -950,11 +950,93 @@ def load_buffers( return container + def load_virtual_arrays( + self, + tree: HasBranches, + keys: frozenset[str], + start: int, + stop: int, + decompression_executor, + interpretation_executor, + options: Any, + ) -> Mapping[str, AwkArray]: + awkward = uproot.extras.awkward() + + class Generator: + def __init__(self, branch, attribute): + self.branch = branch + self.attribute = attribute + + def __repr__(self): + return f"Generator({self.branch}, {self.attribute})" + + def __call__(self): + layout = self.branch.array( + entry_start=start, + entry_stop=stop, + interpretation_executor=interpretation_executor, + decompression_executor=decompression_executor, + library="ak", + ak_add_doc=options.get("ak_add_doc"), + ).layout + # this is a bit of a hack, but it works for now + if isinstance(layout, awkward.contents.NumpyArray): + return layout.data + elif isinstance(layout, awkward.contents.ListOffsetArray): + if self.attribute == "data": + return layout.content.data + elif self.attribute == "offsets": + return layout.offsets.data + else: + raise NotImplementedError() + else: + raise NotImplementedError() + + container = {} + for buffer_key, _ in self._form.expected_from_buffers().items(): + form_key, attribute = self.parse_buffer_key(buffer_key) + branch_name = self._form_key_to_key[form_key] + branch = tree[branch_name] + container[buffer_key] = Generator(branch, attribute) + + return container + + +def form_with_unique_keys(form: Form, key: str) -> 
Form: + awkward = uproot.extras.awkward() + + def impl(form: Form, key: str) -> None: + # Set form key + form.form_key = key + + # If the form is a record we need to loop over all fields in the + # record and set form that include the field name; this will keep + # recursing as well. + if form.is_record: + for field in form.fields: + impl(form.content(field), f"{key}.{field}") + + elif form.is_union: + for i, entry in enumerate(form.contents): + impl(entry, f"{key}#{i}") + + # NumPy like array is easy + elif form.is_numpy or form.is_unknown: + pass + + # Anything else grab the content and keep recursing + else: + impl(form.content, f"{key}.content") + + # Perform a "deep" copy without preserving references + form = awkward.forms.from_dict(form.to_dict()) + impl(form, key) + return form + class TrivialFormMapping(ImplementsFormMapping): def __call__(self, form: Form) -> tuple[Form, TrivialFormMappingInfo]: - dask_awkward = uproot.extras.dask_awkward() - new_form = dask_awkward.lib.utils.form_with_unique_keys(form, "") + new_form = form_with_unique_keys(form, "") return new_form, TrivialFormMappingInfo(new_form) @@ -1548,9 +1630,7 @@ def real_filter_branch(branch): partition_args.append((0, 0, 0)) if form_mapping is None: - expected_form = dask_awkward.lib.utils.form_with_unique_keys( - base_form, "" - ) + expected_form = form_with_unique_keys(base_form, "") form_mapping_info = TrivialFormMappingInfo(expected_form) else: expected_form, form_mapping_info = form_mapping(base_form) @@ -1651,9 +1731,7 @@ def _get_dak_array_delay_open( ) if form_mapping is None: - expected_form = dask_awkward.lib.utils.form_with_unique_keys( - base_form, "" - ) + expected_form = form_with_unique_keys(base_form, "") form_mapping_info = TrivialFormMappingInfo(expected_form) else: expected_form, form_mapping_info = form_mapping(base_form) diff --git a/src/uproot/behaviors/TBranch.py b/src/uproot/behaviors/TBranch.py index 3fb57ab7e..44de73186 100644 --- a/src/uproot/behaviors/TBranch.py +++ 
b/src/uproot/behaviors/TBranch.py @@ -673,6 +673,80 @@ def show( stream.write(formatter.format(name, typename, interp).rstrip(" ") + "\n") + def virtual_arrays( + self, + *, + filter_name=no_filter, + filter_typename=no_filter, + filter_branch=no_filter, + aliases=None, + recursive=True, + full_paths=True, + ignore_duplicates=False, + language=uproot.language.python.python_language, + form_mapping=None, + entry_start=None, + entry_stop=None, + decompression_executor=None, + interpretation_executor=None, + array_cache="inherit", + ak_add_doc=False, + ): + from uproot._dask import ( + TrivialFormMappingInfo, + _get_ttree_form, + form_with_unique_keys, + ) + + awkward = uproot.extras.awkward() + + entry_start, entry_stop = _regularize_entries_start_stop( + self.num_entries, entry_start, entry_stop + ) + decompression_executor, interpretation_executor = _regularize_executors( + decompression_executor, interpretation_executor, self._file + ) + array_cache = _regularize_array_cache(array_cache, self._file) + + keys = self.keys( + filter_name=filter_name, + filter_typename=filter_typename, + filter_branch=filter_branch, + recursive=recursive, + full_paths=full_paths, + ignore_duplicates=ignore_duplicates, + ) + + base_form = _get_ttree_form( + awkward, + self, + keys, + ak_add_doc, + ) + + if form_mapping is None: + expected_form = form_with_unique_keys(base_form, "") + form_mapping_info = TrivialFormMappingInfo(expected_form) + else: + expected_form, form_mapping_info = form_mapping(base_form) + + container = form_mapping_info.load_virtual_arrays( + self, + keys, + entry_start, + entry_stop, + decompression_executor, + interpretation_executor, + {"ak_add_doc": ak_add_doc}, + ) + return awkward.from_buffers( + expected_form, + entry_stop - entry_start, + container, + behavior=form_mapping_info.behavior, + buffer_key=form_mapping_info.buffer_key, + ) + def arrays( self, expressions=None, From 96df355a635f832f49d7f4be682ebcc866a9cdfe Mon Sep 17 00:00:00 2001 From: 
"pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 14 Apr 2025 19:35:41 +0000 Subject: [PATCH 02/18] chore: update pre-commit hooks (#1418) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.11.4 → v0.11.5](https://github.com/astral-sh/ruff-pre-commit/compare/v0.11.4...v0.11.5) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f165a8336..b1dbcbb0c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -23,7 +23,7 @@ repos: - id: black - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.11.4 + rev: v0.11.5 hooks: - id: ruff args: [--fix, --show-fixes] From b9fbbe966d8952f6a01365fa37929659850aa2ec Mon Sep 17 00:00:00 2001 From: Andres Rios Tascon Date: Fri, 18 Apr 2025 11:51:40 -0400 Subject: [PATCH 03/18] ci: pin Chrome version for Pyodide tests (#1422) * Updated Pyodide version * Pinned chrome version * Changed chrome version * Try using node instead of chrome * Remove chrome-specific setup * Actually use Node * Go back to chrome --- .github/workflows/build-test.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index c28b50045..9b8939256 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -127,8 +127,8 @@ jobs: runs-on: ubuntu-latest timeout-minutes: 30 env: - PYODIDE_VERSION: 0.27.3 - PYODIDE_BUILD_VERSION: 0.29.3 + PYODIDE_VERSION: 0.27.5 + PYODIDE_BUILD_VERSION: 0.30.0 AWKWARD_VERSION: v2.7.4 steps: @@ -176,7 +176,7 @@ jobs: with: runner: selenium browser: chrome - browser-version: latest + browser-version: 1446681 github-token: ${{ secrets.GITHUB_TOKEN }} - name: Install dependencies From 
92bef17c1a49a3b81af037948f6fad4d2ac29b34 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 21 Apr 2025 19:43:45 +0000 Subject: [PATCH 04/18] chore: update pre-commit hooks (#1423) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.11.5 → v0.11.6](https://github.com/astral-sh/ruff-pre-commit/compare/v0.11.5...v0.11.6) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b1dbcbb0c..3c710df54 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -23,7 +23,7 @@ repos: - id: black - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.11.5 + rev: v0.11.6 hooks: - id: ruff args: [--fix, --show-fixes] From f48de18b04e6a9f6209e962c4b858c905de83492 Mon Sep 17 00:00:00 2001 From: Peter Fackeldey Date: Tue, 22 Apr 2025 02:00:30 -0400 Subject: [PATCH 05/18] fix: issue with empty big_endian array (#1420) fix issue with empty big_endian array Co-authored-by: Ianna Osborne --- src/uproot/writing/_cascadetree.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/uproot/writing/_cascadetree.py b/src/uproot/writing/_cascadetree.py index 7df7a85e7..35962e485 100644 --- a/src/uproot/writing/_cascadetree.py +++ b/src/uproot/writing/_cascadetree.py @@ -735,7 +735,7 @@ def extend(self, file, sink, data): ) ) tofill.append((branch_name, datum["compression"], big_endian, None)) - if datum["kind"] == "counter": + if datum["kind"] == "counter" and big_endian.size > 0: datum["tleaf_maximum_value"] = max( big_endian.max(), datum["tleaf_maximum_value"] ) From 0ce4d86cf7e891a5ce95c356a19217f867c4c04a Mon Sep 17 00:00:00 2001 From: Peter Fackeldey Date: Tue, 22 Apr 2025 11:56:39 -0400 Subject: [PATCH 06/18] fix: 
safer branch title access (#1421) * safer branch title access * empty str -> None --------- Co-authored-by: Ianna Osborne --- src/uproot/interpretation/library.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/uproot/interpretation/library.py b/src/uproot/interpretation/library.py index c79c2f2c3..c62517e92 100644 --- a/src/uproot/interpretation/library.py +++ b/src/uproot/interpretation/library.py @@ -460,7 +460,7 @@ def _awkward_json_to_array(awkward, form, array): def _awkward_add_doc(awkward, array, branch, ak_add_doc): if ak_add_doc: - return awkward.with_parameter(array, "__doc__", branch.title) + return awkward.with_parameter(array, "__doc__", getattr(branch, "title", None)) else: return array From 4ddc7c5db0f7f5e23846179e4444e2e71a3c718a Mon Sep 17 00:00:00 2001 From: Ianna Osborne Date: Tue, 22 Apr 2025 22:50:39 +0200 Subject: [PATCH 07/18] docs: add contributing guide (#1425) * docs: add contributing guide * style: pre-commit fixes * Update CONTRIBUTING.md Co-authored-by: Andres Rios Tascon * Update CONTRIBUTING.md Co-authored-by: Andres Rios Tascon * Update CONTRIBUTING.md Co-authored-by: Andres Rios Tascon * use pre-commit * build local documentation howto --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Andres Rios Tascon --- CONTRIBUTING.md | 154 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 154 insertions(+) create mode 100644 CONTRIBUTING.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 000000000..edddcbe5c --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,154 @@ +# Contributing to uproot + +Thank you for your interest in contributing to **uproot**! uproot is a community-driven project, and we welcome contributions of all kinds, including bug reports, feature requests, documentation improvements, and code contributions. + +This guide will help you get started with contributing. + +--- + +## 🚀 Quick Start + +1. 
**Fork** the repository on GitHub. +2. **Clone** your fork locally: + ```bash + git clone git@github.com:YOUR_USERNAME/uproot5.git + cd uproot + ``` + +3. **Set up the development environment**: + + We recommend using a conda environment. You can reproduce the full developer environment with: + + ```bash + conda create -n uproot-py313 python=3.13 + conda activate uproot-py313 + + # Add conda-forge channel and prioritize it + conda config --add channels conda-forge + conda config --set channel_priority strict + + # Install dependencies + conda install xrootd + conda install root + conda install pandas + conda install dask + conda install minio + + # pip-only dependencies + pip install scikit-hep-testdata + pip install rangehttpserver + pip install boost_histogram + pip install hist + pip install dask_awkward + pip install awkward-pandas + pip install pytest-timeout + pip install fsspec-xrootd + + # Run local HTTP server (if needed for test data) + python -m RangeHTTPServer + ``` + +4. **Install Uproot in editable mode**: + ```bash + pip install -e ."[dev,test]" + ``` + +--- + +## 🌿 Branching and Naming Conventions + +- Always work on a **feature branch**: + ```bash + git checkout -b YOUR_USERNAME/my-cool-feature + ``` +- Use descriptive names, such as: + - `fix_streamer-parsing` + - `feature_custom-interpretation-api` + - `docs_improve-tutorials` + +--- + +## 🎨 Code Style + +- Use **pre-commit** for automatic linting, style issues, and formatting: + ```bash + pre-commit run --all + +--- + +## ✅ Running Tests + +Uproot uses **pytest** for testing. + +To run the full test suite: +```bash +python -m pytest tests -ss -vv +``` + +To run a specific test module: +```bash +pytest tests/test_my_module.py +``` + +Some tests may depend on having ROOT or XRootD installed. These are covered in the environment setup. + +--- + +## 🔃 Submitting a Pull Request + +1. Make sure all tests pass and your code is cleanly formatted. +2. 
Push your changes to your fork: + ```bash + git push origin YOUR_USERNAME/my-cool-feature + ``` +3. Open a pull request (PR) from your fork to the `main` branch of [scikit-hep/uproot](https://github.com/scikit-hep/uproot). +4. Fill in the PR template and explain what you did and why. +5. Be responsive to feedback from reviewers—we’re all here to help! + +--- + +## 🐛 Reporting Bugs + +1. Check if the bug is already reported on the [issue tracker](https://github.com/scikit-hep/uproot/issues). +2. If not, [open a new issue](https://github.com/scikit-hep/uproot/issues/new/choose) and provide: + - A minimal reproducible example + - Expected vs. actual behavior + - Version info (`python -m uproot --version`) + - Any relevant stack trace or logs + +--- + +## 💡 Requesting Features + +1. Search the issues to see if a similar feature has been discussed. +2. If not, open a **Feature Request** issue and describe: + - What problem the feature solves + - A suggested implementation or interface (if applicable) + - Any related prior art in other libraries or experiments + +--- + +## 📚 Improving Documentation + +Documentation lives in the `docs-sphinx/` folder and is built using **Sphinx**. To build locally +make sure you have Sphinx and the ReadTheDocs theme installed in your virtualenv: +```bash +pip install sphinx sphinx-rtd-theme +``` +Navigate to your docs folder and invoke the Sphinx builder to produce HTML in the _build/html directory: +``` +cd docs-sphinx +sphinx-build -b html . _build/html +``` +Once it finishes, open: +``` +open _build/html/index.html +``` + +You can also suggest improvements to examples, tutorials, and API references. + +--- + +## 🙌 Thanks! + +uproot thrives on its community. Whether you're fixing a typo, contributing a feature, or suggesting a design—you're making a difference! 
From 40e596de8bd8fb3d5f28271766e26357d6bcecec Mon Sep 17 00:00:00 2001 From: pfackeldey Date: Wed, 23 Apr 2025 14:37:12 -0400 Subject: [PATCH 08/18] improve virtual array loading --- src/uproot/_dask.py | 58 ++++++++++++++++++--------------- src/uproot/behaviors/TBranch.py | 2 +- 2 files changed, 32 insertions(+), 28 deletions(-) diff --git a/src/uproot/_dask.py b/src/uproot/_dask.py index b49b17310..10caaccc6 100644 --- a/src/uproot/_dask.py +++ b/src/uproot/_dask.py @@ -971,7 +971,7 @@ def load_buffers( return container - def load_virtual_arrays( + def load_virtual_buffers( self, tree: HasBranches, keys: frozenset[str], @@ -983,42 +983,46 @@ def load_virtual_arrays( ) -> Mapping[str, AwkArray]: awkward = uproot.extras.awkward() - class Generator: - def __init__(self, branch, attribute): - self.branch = branch - self.attribute = attribute - - def __repr__(self): - return f"Generator({self.branch}, {self.attribute})" + def generator(tree, buffer_key): + form_key, attribute = self.parse_buffer_key(buffer_key) + key = self._form_key_to_key[form_key] + branch = tree[key] - def __call__(self): - layout = self.branch.array( + def _generator(): + array = branch.array( entry_start=start, entry_stop=stop, interpretation_executor=interpretation_executor, decompression_executor=decompression_executor, library="ak", ak_add_doc=options.get("ak_add_doc"), - ).layout - # this is a bit of a hack, but it works for now - if isinstance(layout, awkward.contents.NumpyArray): - return layout.data - elif isinstance(layout, awkward.contents.ListOffsetArray): - if self.attribute == "data": - return layout.content.data - elif self.attribute == "offsets": - return layout.offsets.data - else: - raise NotImplementedError() - else: - raise NotImplementedError() + ) + + # Convert the sub-array into buffers + ttree_subform, _, ttree_container = awkward.to_buffers(array) + + # Load the associated projection subform + projection_subform = self._form.content(key) + + # Correlate each TTree form key 
with the projection form key + for (src, src_dtype), (dst, dst_dtype) in zip( + ttree_subform.expected_from_buffers().items(), + projection_subform.expected_from_buffers(self.buffer_key).items(), + ): + # Return the corresponding array from the TTree if buffer key matches + if buffer_key == dst: + if src_dtype != dst_dtype: + raise TypeError(f"Data type mismatch: {src_dtype} != {dst_dtype}") + return ttree_container[src] + + # Raise an error if the buffer key is not found + raise ValueError(f"Buffer key {buffer_key} not found in form {self._form}") + return _generator + container = {} for buffer_key, _ in self._form.expected_from_buffers().items(): - form_key, attribute = self.parse_buffer_key(buffer_key) - branch_name = self._form_key_to_key[form_key] - branch = tree[branch_name] - container[buffer_key] = Generator(branch, attribute) + container[buffer_key] = generator(tree, buffer_key) return container diff --git a/src/uproot/behaviors/TBranch.py b/src/uproot/behaviors/TBranch.py index 510dc62c3..037845e53 100644 --- a/src/uproot/behaviors/TBranch.py +++ b/src/uproot/behaviors/TBranch.py @@ -784,7 +784,7 @@ def virtual_arrays( else: expected_form, form_mapping_info = form_mapping(base_form) - container = form_mapping_info.load_virtual_arrays( + container = form_mapping_info.load_virtual_buffers( self, keys, entry_start, From 7e178b9f73d2d276613b2a8598ca79c7743514e0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 23 Apr 2025 18:37:55 +0000 Subject: [PATCH 09/18] style: pre-commit fixes --- src/uproot/_dask.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/uproot/_dask.py b/src/uproot/_dask.py index 10caaccc6..d90ac6960 100644 --- a/src/uproot/_dask.py +++ b/src/uproot/_dask.py @@ -1012,13 +1012,17 @@ def _generator(): # Return the corresponding array from the TTree if buffer key matches if buffer_key == dst: if src_dtype != dst_dtype: - raise TypeError(f"Data 
type mismatch: {src_dtype} != {dst_dtype}") + raise TypeError( + f"Data type mismatch: {src_dtype} != {dst_dtype}" + ) return ttree_container[src] # Raise an error if the buffer key is not found - raise ValueError(f"Buffer key {buffer_key} not found in form {self._form}") - return _generator + raise ValueError( + f"Buffer key {buffer_key} not found in form {self._form}" + ) + return _generator container = {} for buffer_key, _ in self._form.expected_from_buffers().items(): From bc79a486b3f943e93777b8227b1b3e52b42379d2 Mon Sep 17 00:00:00 2001 From: pfackeldey Date: Wed, 23 Apr 2025 19:04:27 -0400 Subject: [PATCH 10/18] add 'access_log' kwarg --- src/uproot/_dask.py | 14 +++++++++++++- src/uproot/behaviors/TBranch.py | 3 ++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/uproot/_dask.py b/src/uproot/_dask.py index d90ac6960..6098ca4d1 100644 --- a/src/uproot/_dask.py +++ b/src/uproot/_dask.py @@ -10,7 +10,7 @@ from uproot.source.chunk import SourcePerformanceCounters try: - from typing import TYPE_CHECKING, Final + from typing import TYPE_CHECKING, Final, NamedTuple from typing_extensions import Any, Protocol, TypeVar except ImportError: @@ -998,6 +998,18 @@ def _generator(): ak_add_doc=options.get("ak_add_doc"), ) + # add to access_log + access_log = options.get("access_log") + if access_log is not None: + if not hasattr(access_log, "__iadd__"): + raise ValueError(f"{access_log=} needs to implement '__iadd__'.") + else: + class Accessed(NamedTuple): + branch: str + buffer_key: str + + access_log += [Accessed(branch=key, buffer_key=buffer_key)] + # Convert the sub-array into buffers ttree_subform, _, ttree_container = awkward.to_buffers(array) diff --git a/src/uproot/behaviors/TBranch.py b/src/uproot/behaviors/TBranch.py index 037845e53..3d392e545 100644 --- a/src/uproot/behaviors/TBranch.py +++ b/src/uproot/behaviors/TBranch.py @@ -745,6 +745,7 @@ def virtual_arrays( interpretation_executor=None, array_cache="inherit", ak_add_doc=False, + 
access_log=None, ): from uproot._dask import ( TrivialFormMappingInfo, @@ -791,7 +792,7 @@ def virtual_arrays( entry_stop, decompression_executor, interpretation_executor, - {"ak_add_doc": ak_add_doc}, + {"ak_add_doc": ak_add_doc, "access_log": access_log}, ) return awkward.from_buffers( expected_form, From 4e201254256c14e0a688fa1488e3c3847e3f908e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 23 Apr 2025 23:04:59 +0000 Subject: [PATCH 11/18] style: pre-commit fixes --- src/uproot/_dask.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/uproot/_dask.py b/src/uproot/_dask.py index 6098ca4d1..0370fb524 100644 --- a/src/uproot/_dask.py +++ b/src/uproot/_dask.py @@ -1002,8 +1002,11 @@ def _generator(): access_log = options.get("access_log") if access_log is not None: if not hasattr(access_log, "__iadd__"): - raise ValueError(f"{access_log=} needs to implement '__iadd__'.") + raise ValueError( + f"{access_log=} needs to implement '__iadd__'." + ) else: + class Accessed(NamedTuple): branch: str buffer_key: str From 664550a3ce7525b0898053a798d667d8a122ea60 Mon Sep 17 00:00:00 2001 From: pfackeldey Date: Wed, 23 Apr 2025 19:17:38 -0400 Subject: [PATCH 12/18] don't re-define 'Accessed' --- src/uproot/_dask.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/uproot/_dask.py b/src/uproot/_dask.py index 0370fb524..cd2578d3c 100644 --- a/src/uproot/_dask.py +++ b/src/uproot/_dask.py @@ -1006,11 +1006,6 @@ def _generator(): f"{access_log=} needs to implement '__iadd__'." 
) else: - - class Accessed(NamedTuple): - branch: str - buffer_key: str - access_log += [Accessed(branch=key, buffer_key=buffer_key)] # Convert the sub-array into buffers @@ -1839,3 +1834,8 @@ def _get_dak_array_delay_open( divisions=None if divisions is None else tuple(divisions), label="from-uproot", ) + + +class Accessed(NamedTuple): + branch: str + buffer_key: str From 01d723802e0d7be3a5962104f60074edb0665601 Mon Sep 17 00:00:00 2001 From: pfackeldey Date: Thu, 24 Apr 2025 08:51:41 -0400 Subject: [PATCH 13/18] add doc string --- src/uproot/behaviors/TBranch.py | 71 +++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/src/uproot/behaviors/TBranch.py b/src/uproot/behaviors/TBranch.py index 3d392e545..8aaa6c938 100644 --- a/src/uproot/behaviors/TBranch.py +++ b/src/uproot/behaviors/TBranch.py @@ -747,6 +747,77 @@ def virtual_arrays( ak_add_doc=False, access_log=None, ): + """ + Args: + filter_name (None, glob string, regex string in ``"/pattern/i"`` syntax, function of str \u2192 bool, or iterable of the above): A + filter to select ``TBranches`` by name. + filter_typename (None, glob string, regex string in ``"/pattern/i"`` syntax, function of str \u2192 bool, or iterable of the above): A + filter to select ``TBranches`` by type. + filter_branch (None or function of :doc:`uproot.behaviors.TBranch.TBranch` \u2192 bool, :doc:`uproot.interpretation.Interpretation`, or None): A + filter to select ``TBranches`` using the full + :doc:`uproot.behaviors.TBranch.TBranch` object. If the function + returns False or None, the ``TBranch`` is excluded; if the function + returns True, it is included with its standard + :ref:`uproot.behaviors.TBranch.TBranch.interpretation`; if an + :doc:`uproot.interpretation.Interpretation`, this interpretation + overrules the standard one. + aliases (None or dict of str \u2192 str): Mathematical expressions that + can be used in ``expressions`` or other aliases (without cycles). 
+ Uses the ``language`` engine to evaluate. If None, only the + :ref:`uproot.behaviors.TBranch.TBranch.aliases` are available. + recursive (bool): If True, descend into any nested subbranches. + If False, only return the names of branches directly accessible + under this object. + full_paths (bool): If True, include the full path to each subbranch + with slashes (``/``); otherwise, use the descendant's name as + the output name. + ignore_duplicates (bool): If True, return a set of the keys; otherwise, return the full list of keys. + language (:doc:`uproot.language.Language`): Language used to interpret + the ``expressions`` and ``aliases``. + entry_start (None or int): The first entry to include. If None, start + at zero. If negative, count from the end, like a Python slice. + entry_stop (None or int): The first entry to exclude (i.e. one greater + than the last entry to include). If None, stop at + :ref:`uproot.behaviors.TTree.TTree.num_entries`. If negative, + count from the end, like a Python slice. + decompression_executor (None or Executor with a ``submit`` method): The + executor that is used to decompress ``TBaskets``; if None, the + file's :ref:`uproot.reading.ReadOnlyFile.decompression_executor` + is used. + interpretation_executor (None or Executor with a ``submit`` method): The + executor that is used to interpret uncompressed ``TBasket`` data as + arrays; if None, the file's :ref:`uproot.reading.ReadOnlyFile.interpretation_executor` + is used. + array_cache ("inherit", None, MutableMapping, or memory size): Cache of arrays; + if "inherit", use the file's cache; if None, do not use a cache; + if a memory size, create a new cache of this size. + ak_add_doc (bool | dict ): If True and ``library="ak"``, add the TBranch ``title`` + to the Awkward ``__doc__`` parameter of the array. + if dict = {key:value} and ``library="ak"``, add the TBranch ``value`` to the + Awkward ``key`` parameter of the array. 
+ access_log (None or object with a ``__iadd__`` method): If an access_log is + provided, e.g. a list, all materializations of the virtual arrays are + tracked inside this reference. + + + Returns a group of virtual arrays from the ``TTree``. This method can only return Awkward Arrays. + + For example: + + .. code-block:: python + + >>> my_tree.virtual_arrays() + + >>> access_log = [] + >>> array = my_tree.virtual_arrays(access_log=access_log) + >>> ak.materialize(array.Jet_pt) + >>> print(access_log) + [Accessed(branch='Jet_pt', buffer_key='.Jet_pt-offsets'), Accessed(branch='Jet_pt', buffer_key='.Jet_pt.content-data')] + + + See also :ref:`uproot.behaviors.TBranch.HasBranches.arrays` to iterate over + the array in contiguous ranges of entries. + """ from uproot._dask import ( TrivialFormMappingInfo, _get_ttree_form, From fe275cd09557ef344c65a685935c37b16400470e Mon Sep 17 00:00:00 2001 From: pfackeldey Date: Thu, 24 Apr 2025 08:52:08 -0400 Subject: [PATCH 14/18] minor doc string fixes --- src/uproot/behaviors/TBranch.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/uproot/behaviors/TBranch.py b/src/uproot/behaviors/TBranch.py index 8aaa6c938..23fc99725 100644 --- a/src/uproot/behaviors/TBranch.py +++ b/src/uproot/behaviors/TBranch.py @@ -791,9 +791,9 @@ def virtual_arrays( array_cache ("inherit", None, MutableMapping, or memory size): Cache of arrays; if "inherit", use the file's cache; if None, do not use a cache; if a memory size, create a new cache of this size. - ak_add_doc (bool | dict ): If True and ``library="ak"``, add the TBranch ``title`` + ak_add_doc (bool | dict ): If True, add the TBranch ``title`` to the Awkward ``__doc__`` parameter of the array. - if dict = {key:value} and ``library="ak"``, add the TBranch ``value`` to the + if dict = {key:value}, add the TBranch ``value`` to the Awkward ``key`` parameter of the array. access_log (None or object with a ``__iadd__`` method): If an access_log is provided, e.g. 
a list, all materializations of the virtual arrays are From 499367845909f07aa69c6133374f13afebe619f0 Mon Sep 17 00:00:00 2001 From: pfackeldey Date: Fri, 25 Apr 2025 13:23:15 -0400 Subject: [PATCH 15/18] use form_with_unique_keys from awkward --- foo | 0 src/uproot/_dask.py | 86 +++------------------------------ src/uproot/_util.py | 39 +++++++++++++++ src/uproot/behaviors/TBranch.py | 15 ++---- 4 files changed, 52 insertions(+), 88 deletions(-) create mode 100644 foo diff --git a/foo b/foo new file mode 100644 index 000000000..e69de29bb diff --git a/src/uproot/_dask.py b/src/uproot/_dask.py index cd2578d3c..1576857bf 100644 --- a/src/uproot/_dask.py +++ b/src/uproot/_dask.py @@ -19,7 +19,7 @@ import numpy import uproot -from uproot._util import no_filter, unset +from uproot._util import get_ttree_form, no_filter, unset from uproot.behaviors.RNTuple import HasFields from uproot.behaviors.RNTuple import ( _regularize_step_size as _RNTuple_regularize_step_size, @@ -1041,41 +1041,10 @@ def _generator(): return container -def form_with_unique_keys(form: Form, key: str) -> Form: - awkward = uproot.extras.awkward() - - def impl(form: Form, key: str) -> None: - # Set form key - form.form_key = key - - # If the form is a record we need to loop over all fields in the - # record and set form that include the field name; this will keep - # recursing as well. 
- if form.is_record: - for field in form.fields: - impl(form.content(field), f"{key}.{field}") - - elif form.is_union: - for i, entry in enumerate(form.contents): - impl(entry, f"{key}#{i}") - - # NumPy like array is easy - elif form.is_numpy or form.is_unknown: - pass - - # Anything else grab the content and keep recursing - else: - impl(form.content, f"{key}.content") - - # Perform a "deep" copy without preserving references - form = awkward.forms.from_dict(form.to_dict()) - impl(form, key) - return form - - class TrivialFormMapping(ImplementsFormMapping): def __call__(self, form: Form) -> tuple[Form, TrivialFormMappingInfo]: - new_form = form_with_unique_keys(form, "") + awkward = uproot.extras.awkward() + new_form = awkward.forms.form_with_unique_keys(form, ("",)) return new_form, TrivialFormMappingInfo(new_form) @@ -1497,43 +1466,6 @@ def project_keys(self: T, keys: frozenset[str]) -> T: ) -def _get_ttree_form( - awkward, - ttree, - common_keys, - ak_add_doc, -): - contents = [] - for key in common_keys: - branch = ttree[key] - content_form = branch.interpretation.awkward_form(ttree.file) - content_parameters = {} - if isinstance(ak_add_doc, bool): - if ak_add_doc: - content_parameters["__doc__"] = branch.title - elif isinstance(ak_add_doc, dict): - content_parameters.update( - { - key: branch.__getattribute__(value) - for key, value in ak_add_doc.items() - } - ) - if len(content_parameters.keys()) != 0: - content_form = content_form.copy(parameters=content_parameters) - contents.append(content_form) - - if isinstance(ak_add_doc, bool): - parameters = {"__doc__": ttree.title} if ak_add_doc else None - elif isinstance(ak_add_doc, dict): - parameters = ( - {"__doc__": ttree.title} if "__doc__" in ak_add_doc.keys() else None - ) - else: - parameters = None - - return awkward.forms.RecordForm(contents, common_keys, parameters=parameters) - - def _get_dak_array( files, filter_name, @@ -1695,8 +1627,8 @@ def real_filter_branch(branch): if isinstance(ttrees[0], 
HasFields): base_form = ttrees[0].to_akform(filter_name=common_keys) else: - base_form = _get_ttree_form( - awkward, ttrees[0], common_keys, interp_options.get("ak_add_doc") + base_form = get_ttree_form( + ttrees[0], common_keys, interp_options.get("ak_add_doc") ) if len(partition_args) == 0: @@ -1704,7 +1636,7 @@ def real_filter_branch(branch): partition_args.append((0, 0, 0)) if form_mapping is None: - expected_form = form_with_unique_keys(base_form, "") + expected_form = awkward.forms.form_with_unique_keys(base_form, ("",)) form_mapping_info = TrivialFormMappingInfo(expected_form) else: expected_form, form_mapping_info = form_mapping(base_form) @@ -1771,9 +1703,7 @@ def _get_dak_array_delay_open( full_paths=full_paths, ignore_duplicates=True, ) - base_form = _get_ttree_form( - awkward, obj, common_keys, interp_options.get("ak_add_doc") - ) + base_form = get_ttree_form(obj, common_keys, interp_options.get("ak_add_doc")) divisions = [0] partition_args = [] @@ -1809,7 +1739,7 @@ def _get_dak_array_delay_open( ) if form_mapping is None: - expected_form = form_with_unique_keys(base_form, "") + expected_form = awkward.forms.form_with_unique_keys(base_form, ("",)) form_mapping_info = TrivialFormMappingInfo(expected_form) else: expected_form, form_mapping_info = form_mapping(base_form) diff --git a/src/uproot/_util.py b/src/uproot/_util.py index 23e707440..c42edfc33 100644 --- a/src/uproot/_util.py +++ b/src/uproot/_util.py @@ -691,6 +691,45 @@ def awkward_form_of_iter(awkward, form): raise RuntimeError(f"unrecognized form: {type(form)}") +def get_ttree_form( + ttree, + common_keys, + ak_add_doc, +): + import uproot + + awkward = uproot.extras.awkward() + contents = [] + for key in common_keys: + branch = ttree[key] + content_form = branch.interpretation.awkward_form(ttree.file) + content_parameters = {} + if isinstance(ak_add_doc, bool): + if ak_add_doc: + content_parameters["__doc__"] = branch.title + elif isinstance(ak_add_doc, dict): + content_parameters.update( + { 
+ key: branch.__getattribute__(value) + for key, value in ak_add_doc.items() + } + ) + if len(content_parameters.keys()) != 0: + content_form = content_form.copy(parameters=content_parameters) + contents.append(content_form) + + if isinstance(ak_add_doc, bool): + parameters = {"__doc__": ttree.title} if ak_add_doc else None + elif isinstance(ak_add_doc, dict): + parameters = ( + {"__doc__": ttree.title} if "__doc__" in ak_add_doc.keys() else None + ) + else: + parameters = None + + return awkward.forms.RecordForm(contents, common_keys, parameters=parameters) + + def damerau_levenshtein(a, b, ratio=False): """ Calculates the Damerau-Levenshtein distance of two strings. diff --git a/src/uproot/behaviors/TBranch.py b/src/uproot/behaviors/TBranch.py index 23fc99725..027ed2e7e 100644 --- a/src/uproot/behaviors/TBranch.py +++ b/src/uproot/behaviors/TBranch.py @@ -24,7 +24,7 @@ import uproot.interpretation.grouped import uproot.language.python import uproot.source.chunk -from uproot._util import no_filter +from uproot._util import get_ttree_form, no_filter np_uint8 = numpy.dtype("u1") @@ -818,12 +818,6 @@ def virtual_arrays( See also :ref:`uproot.behaviors.TBranch.HasBranches.arrays` to iterate over the array in contiguous ranges of entries. 
""" - from uproot._dask import ( - TrivialFormMappingInfo, - _get_ttree_form, - form_with_unique_keys, - ) - awkward = uproot.extras.awkward() entry_start, entry_stop = _regularize_entries_start_stop( @@ -843,15 +837,16 @@ def virtual_arrays( ignore_duplicates=ignore_duplicates, ) - base_form = _get_ttree_form( - awkward, + base_form = get_ttree_form( self, keys, ak_add_doc, ) if form_mapping is None: - expected_form = form_with_unique_keys(base_form, "") + from uproot._dask import TrivialFormMappingInfo + + expected_form = awkward.forms.form_with_unique_keys(base_form, ("",)) form_mapping_info = TrivialFormMappingInfo(expected_form) else: expected_form, form_mapping_info = form_mapping(base_form) From 440266529a88eacb9fe6f11a3ca0cdcda46ddeae Mon Sep 17 00:00:00 2001 From: pfackeldey Date: Tue, 29 Apr 2025 11:23:42 -0400 Subject: [PATCH 16/18] rm accidentally committed file... --- foo | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 foo diff --git a/foo b/foo deleted file mode 100644 index e69de29bb..000000000 From 1d925bab13953ae49b6f24aaa5f25de4c736c81d Mon Sep 17 00:00:00 2001 From: pfackeldey Date: Tue, 29 Apr 2025 12:27:03 -0400 Subject: [PATCH 17/18] refactor virtual buffer loading to better work with form mappings --- src/uproot/_dask.py | 24 ++++++++++++++++++++++-- src/uproot/behaviors/TBranch.py | 14 +++++--------- 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/src/uproot/_dask.py b/src/uproot/_dask.py index 1576857bf..e04ed6d8f 100644 --- a/src/uproot/_dask.py +++ b/src/uproot/_dask.py @@ -971,7 +971,9 @@ def load_buffers( return container - def load_virtual_buffers( + +class FormMappingInfoWithVirtualArrays(TrivialFormMappingInfo): + def buffer_replacements( self, tree: HasBranches, keys: frozenset[str], @@ -1048,6 +1050,13 @@ def __call__(self, form: Form) -> tuple[Form, TrivialFormMappingInfo]: return new_form, TrivialFormMappingInfo(new_form) +class FormMappingWithVirtualArrays(ImplementsFormMapping): + def 
__call__(self, form: Form) -> tuple[Form, FormMappingInfoWithVirtualArrays]: + awkward = uproot.extras.awkward() + new_form = awkward.forms.form_with_unique_keys(form, ("",)) + return new_form, FormMappingInfoWithVirtualArrays(new_form) + + T = TypeVar("T") @@ -1108,7 +1117,18 @@ def read_tree( # but not two of the keys required for buffer B if all(k in self.common_keys for k in keys_for_buffer): container[buffer_key] = mapping[buffer_key] - # Otherwise, introduce a placeholder + # if the form mapping info provides a replacements, use it + elif hasattr(self.form_mapping_info, "buffer_replacements"): + container[buffer_key] = self.form_mapping_info.buffer_replacements( + tree, + keys_for_buffer, + start, + stop, + self.decompression_executor, + self.interpretation_executor, + self.interp_options, + )[buffer_key] + # Otherwise, introduce a placeholder (default replacement) else: container[buffer_key] = awkward.typetracer.PlaceholderArray( nplike=nplike, diff --git a/src/uproot/behaviors/TBranch.py b/src/uproot/behaviors/TBranch.py index 027ed2e7e..8ea5be49c 100644 --- a/src/uproot/behaviors/TBranch.py +++ b/src/uproot/behaviors/TBranch.py @@ -738,7 +738,6 @@ def virtual_arrays( full_paths=True, ignore_duplicates=False, language=uproot.language.python.python_language, - form_mapping=None, entry_start=None, entry_stop=None, decompression_executor=None, @@ -818,6 +817,8 @@ def virtual_arrays( See also :ref:`uproot.behaviors.TBranch.HasBranches.arrays` to iterate over the array in contiguous ranges of entries. 
""" + from uproot._dask import FormMappingWithVirtualArrays + awkward = uproot.extras.awkward() entry_start, entry_stop = _regularize_entries_start_stop( @@ -843,15 +844,10 @@ def virtual_arrays( ak_add_doc, ) - if form_mapping is None: - from uproot._dask import TrivialFormMappingInfo - - expected_form = awkward.forms.form_with_unique_keys(base_form, ("",)) - form_mapping_info = TrivialFormMappingInfo(expected_form) - else: - expected_form, form_mapping_info = form_mapping(base_form) + expected_form, form_mapping_info = FormMappingWithVirtualArrays()(base_form) - container = form_mapping_info.load_virtual_buffers( + # The buffer replacements of FormMappingInfoWithVirtualArrays are VirtualArrays + container = form_mapping_info.buffer_replacements( self, keys, entry_start, From 92df7c6d3984d7a82571229b9f331921738342c8 Mon Sep 17 00:00:00 2001 From: pfackeldey Date: Wed, 7 May 2025 09:00:17 -0400 Subject: [PATCH 18/18] require awkward v2.8.2 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index a1024afec..7c6cb8996 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,7 +73,7 @@ classifiers = [ "Topic :: Utilities" ] dependencies = [ - "awkward>=2.4.6", + "awkward>=2.8.2", "cramjam>=2.5.0", "xxhash", "numpy",