From a8a1f2859560fb3e7f0520aa56311ae435f9aab3 Mon Sep 17 00:00:00 2001 From: Zach Marshall Date: Tue, 22 Jul 2025 00:44:36 +0200 Subject: [PATCH 1/7] Allowing ATLAS-style file names for ROOT files The ATLAS production system produces ROOT files with names like "something.root.1" (or "something.root.2" in case the job is the second attempt at producing the file, and so on). The suffix is always "." followed by an integer. In order to ensure that these files are also correctly treated by uproot when they're opened using the file:object style of calling open(), we need either a more general regex or an alternative regex pattern. Here I'm adding an alternative to be as specific as possible about what the alternative should be. There are other ways to solve this problem, but I don't think naming a bajillion files in our production system, or changing the behavior of PanDA, is going to be a viable solution, so better to parse things this way. Very fundamentally, I don't quite understand why this regex is necessary. One could simply split on the colon and then try/except based on the first part being a file name and the second part being an object in that file. Since you look for the magic in the file anyway, I'm not sure what this function provides for you except perhaps tidier error messages, at the cost of this sort of difficulty in case file naming is abnormal (to be clear, I recognize that this file naming of PanDA's is really, really not nice, and ROOT itself has broken opening files on occasion because of it). --- src/uproot/_util.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/uproot/_util.py b/src/uproot/_util.py index 3196bda76..e3635ba7e 100644 --- a/src/uproot/_util.py +++ b/src/uproot/_util.py @@ -314,12 +314,19 @@ def file_object_path_split(urlpath: str) -> tuple[str, str | None]: separator = "::" parts = urlpath.split(separator) object_regex = re.compile(r"(.+\.root):(.*$)", re.IGNORECASE) + # Some files like "something.root.1" need to also be treated as ROOT files + object_regex_with_extra = re.compile(r"(.+\.root\.+[0-9]+):(.*$)", re.IGNORECASE) for i, part in enumerate(reversed(parts)): match = object_regex.match(part) if match: obj = re.sub(r"/+", "/", match.group(2).strip().lstrip("/")).rstrip("/") parts[-i - 1] = match.group(1) break + match_with_extra = object_regex_with_extra.match(part) + if match_with_extra: + obj = re.sub(r"/+", "/", match_with_extra.group(2).strip().lstrip("/")).rstrip("/") + parts[-i - 1] = match_with_extra.group(1) + break urlpath = separator.join(parts) return urlpath, obj From 0d4a4f059cedbfbba311f5382d1d5704857c51ae Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 21 Jul 2025 22:46:55 +0000 Subject: [PATCH 2/7] style: pre-commit fixes --- src/uproot/_util.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/uproot/_util.py b/src/uproot/_util.py index e3635ba7e..7742c7616 100644 --- a/src/uproot/_util.py +++ b/src/uproot/_util.py @@ -324,7 +324,9 @@ def file_object_path_split(urlpath: str) -> tuple[str, str | None]: break match_with_extra = object_regex_with_extra.match(part) if match_with_extra: - obj = re.sub(r"/+", "/", match_with_extra.group(2).strip().lstrip("/")).rstrip("/") + obj = re.sub( + r"/+", "/", match_with_extra.group(2).strip().lstrip("/") + ).rstrip("/") parts[-i - 1] = match_with_extra.group(1) break From 3d85769cf592e07394dcf8c1319b3469d4b3d283 Mon Sep 17 00:00:00 2001 From: Zach Marshall Date: Tue, 22 Jul 2025 16:43:13 +0200 Subject: [PATCH 3/7] Removing file extension requirements altogether following discussions --- src/uproot/_util.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/src/uproot/_util.py b/src/uproot/_util.py index 7742c7616..4312812ed 100644 --- a/src/uproot/_util.py +++ b/src/uproot/_util.py @@ -313,22 +313,13 @@ def file_object_path_split(urlpath: str) -> tuple[str, str | None]: separator = "::" parts = urlpath.split(separator) - object_regex = re.compile(r"(.+\.root):(.*$)", re.IGNORECASE) - # Some files like "something.root.1" need to also be treated as ROOT files - object_regex_with_extra = re.compile(r"(.+\.root\.+[0-9]+):(.*$)", re.IGNORECASE) + object_regex = re.compile(r"(^.*):(.*$)", re.IGNORECASE) for i, part in enumerate(reversed(parts)): match = object_regex.match(part) if match: obj = re.sub(r"/+", "/", match.group(2).strip().lstrip("/")).rstrip("/") parts[-i - 1] = match.group(1) break - match_with_extra = object_regex_with_extra.match(part) - if match_with_extra: - obj = re.sub( - r"/+", "/", match_with_extra.group(2).strip().lstrip("/") - ).rstrip("/") - parts[-i - 1] = match_with_extra.group(1) - break urlpath = separator.join(parts) return urlpath, obj From c97e5ca619518a135ba3c20be272b1b0345504ec Mon Sep 17 00:00:00 2001 From: Zach Marshall Date: Tue, 22 Jul 2025 18:11:07 +0200 Subject: [PATCH 4/7] Update src/uproot/_util.py Suggested change (including ".root" somewhere in the file name) works for me! Co-authored-by: Andres Rios Tascon --- src/uproot/_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/uproot/_util.py b/src/uproot/_util.py index 4312812ed..de5d4f7ad 100644 --- a/src/uproot/_util.py +++ b/src/uproot/_util.py @@ -313,7 +313,7 @@ def file_object_path_split(urlpath: str) -> tuple[str, str | None]: separator = "::" parts = urlpath.split(separator) - object_regex = re.compile(r"(^.*):(.*$)", re.IGNORECASE) + object_regex = re.compile(r"(.+\.root.*):(.*$)", re.IGNORECASE) for i, part in enumerate(reversed(parts)): match = object_regex.match(part) if match: From 639807629bac1f56605f66182b915a970c7a5f62 Mon Sep 17 00:00:00 2001 From: Zach Marshall Date: Thu, 24 Jul 2025 23:11:20 +0200 Subject: [PATCH 5/7] Removing formerly invalid file that is now valid from tests --- tests/test_0976_path_object_split.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_0976_path_object_split.py b/tests/test_0976_path_object_split.py index a5617fa18..d657e7f8f 100644 --- a/tests/test_0976_path_object_split.py +++ b/tests/test_0976_path_object_split.py @@ -202,7 +202,6 @@ def test_url_no_split(input_value): "local/file.root.zip://Events", "local/file.roo://Events", "local/file://Events", - "http://xcacheserver:8762//https://originserver:4212/path/file.root.1:CollectionTree", ], ) def test_url_split_invalid(input_value): From 7831bced0d429f6830699f2bc8085b379ad40ea4 Mon Sep 17 00:00:00 2001 From: Andres Rios Tascon Date: Fri, 25 Jul 2025 08:21:35 -0400 Subject: [PATCH 6/7] Changed regex --- src/uproot/_util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/uproot/_util.py b/src/uproot/_util.py index de5d4f7ad..9e73cb7b4 100644 --- a/src/uproot/_util.py +++ b/src/uproot/_util.py @@ -313,11 +313,11 @@ def file_object_path_split(urlpath: str) -> tuple[str, str | None]: separator = "::" parts = urlpath.split(separator) - object_regex = re.compile(r"(.+\.root.*):(.*$)", re.IGNORECASE) + object_regex = re.compile(r"(.+\.root(\.[0-9]+)?):(.*$)", re.IGNORECASE) for i, part in enumerate(reversed(parts)): match = object_regex.match(part) if match: - obj = re.sub(r"/+", "/", match.group(2).strip().lstrip("/")).rstrip("/") + obj = re.sub(r"/+", "/", match.group(3).strip().lstrip("/")).rstrip("/") parts[-i - 1] = match.group(1) break From 2f003933b0a52c111c806ec500efc957406db696 Mon Sep 17 00:00:00 2001 From: Andres Rios Tascon Date: Mon, 28 Jul 2025 09:30:07 -0400 Subject: [PATCH 7/7] Added validity test --- tests/test_0976_path_object_split.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/test_0976_path_object_split.py b/tests/test_0976_path_object_split.py index d657e7f8f..28830478d 100644 --- a/tests/test_0976_path_object_split.py +++ b/tests/test_0976_path_object_split.py @@ -163,6 +163,13 @@ "Dir/Test", ), ), + ( + "http://xcacheserver:8762//https://originserver:4212/path/file.root.1:CollectionTree", + ( + "http://xcacheserver:8762//https://originserver:4212/path/file.root.1", + "CollectionTree", + ), + ), ], ) def test_url_split(input_value, expected_output):