[pydap backend] enables downloading/processing multiple arrays within single http request #10629
```diff
@@ -1,5 +1,6 @@
 from __future__ import annotations

+import warnings
 from collections.abc import Iterable
 from typing import TYPE_CHECKING, Any

@@ -35,8 +36,11 @@


 class PydapArrayWrapper(BackendArray):
-    def __init__(self, array):
+    def __init__(self, array, batch=False, cache=None, checksums=True):
         self.array = array
+        self._batch = batch
+        self._cache = cache
+        self._checksums = checksums

     @property
     def shape(self) -> tuple[int, ...]:

@@ -52,13 +56,27 @@ def __getitem__(self, key):
         )

     def _getitem(self, key):
-        result = robust_getitem(self.array, key, catch=ValueError)
-        # in some cases, pydap doesn't squeeze axes automatically like numpy
-        result = np.asarray(result)
+        if self.array.id in self._cache.keys():
+            # safely avoid re-downloading some coordinates
+            result = self._cache[self.array.id]
+        elif self._batch and hasattr(self.array, "dataset"):
+            # these are both True only for pydap>3.5.5
+            from pydap.lib import resolve_batch_for_all_variables
+
+            dataset = self.array.dataset
+            resolve_batch_for_all_variables(self.array, key, checksums=self._checksums)
+            result = np.asarray(
+                dataset._current_batch_promise.wait_for_result(self.array.id)
+            )
+        else:
+            result = robust_getitem(self.array, key, catch=ValueError)
+        try:
+            result = np.asarray(result.data)
+        except AttributeError:
+            result = np.asarray(result)
         axis = tuple(n for n, k in enumerate(key) if isinstance(k, integer_types))
         if result.ndim + len(axis) != self.array.ndim and axis:
             result = np.squeeze(result, axis)

         return result
```

Review comment (on `dataset._current_batch_promise.wait_for_result(...)`): Is it possible to avoid private APIs here?

Review comment (on lines +73 to +76): This is worth some explanation. Did a change in pydap break …
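For readers skimming the hunk above, the control flow in `_getitem` reduces to a cache-first, batch-second, fallback-last lookup. Below is a minimal runnable sketch of that resolution order; the fetcher callables are stubs standing in for pydap's network calls, not pydap's real API.

```python
import numpy as np


def resolve(array_id, key, cache, batch_supported, fetch_batched, fetch_single):
    """Stub version of `_getitem`: cache hit -> batched request -> single request."""
    if array_id in cache:
        # reuse data an earlier batch already downloaded
        return np.asarray(cache[array_id])
    if batch_supported:
        # one HTTP request resolves `key` for all variables in the batch
        return np.asarray(fetch_batched(array_id, key))
    # per-variable fallback, mirroring robust_getitem
    return np.asarray(fetch_single(array_id, key))


# toy usage with in-memory stand-ins for the server
cache = {"time": np.arange(10)}
fetch_batched = lambda i, k: np.zeros(3)[k]
fetch_single = lambda i, k: np.ones(3)[k]

print(resolve("time", slice(None), cache, True, fetch_batched, fetch_single))  # cache hit
print(resolve("temp", slice(0, 2), {}, True, fetch_batched, fetch_single))     # batched
print(resolve("temp", slice(0, 2), {}, False, fetch_batched, fetch_single))    # fallback
```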
```diff
@@ -81,7 +99,15 @@ class PydapDataStore(AbstractDataStore):
     be useful if the netCDF4 library is not available.
     """

-    def __init__(self, dataset, group=None):
+    def __init__(
+        self,
+        dataset,
+        group=None,
+        session=None,
+        batch=False,
+        protocol=None,
+        checksums=True,
+    ):
         """
         Parameters
         ----------

@@ -91,6 +117,11 @@ def __init__(self, dataset, group=None):
         """
         self.dataset = dataset
         self.group = group
+        self._batch = batch
+        self._batch_done = False
+        self._array_cache = {}  # holds 1D dimension data
+        self._protocol = protocol
+        self._checksums = checksums  # true by default

     @classmethod
     def open(
```
```diff
@@ -103,6 +134,8 @@ def open(
         timeout=None,
         verify=None,
         user_charset=None,
+        batch=False,
+        checksums=True,
     ):
         from pydap.client import open_url
         from pydap.net import DEFAULT_TIMEOUT
```

Review comment (on `batch=False,`): Would it make sense to have the default be …
```diff
@@ -117,6 +150,7 @@ def open(
                 DeprecationWarning,
             )
             output_grid = False  # new default behavior
+
         kwargs = {
             "url": url,
             "application": application,

@@ -133,13 +167,28 @@ def open(
             # pydap dataset
             dataset = url.ds
         args = {"dataset": dataset}
+        args["checksums"] = checksums
         if group:
             # only then, change the default
             args["group"] = group
+        if url.startswith(("http", "dap2")):
+            args["protocol"] = "dap2"
+        elif url.startswith("dap4"):
+            args["protocol"] = "dap4"
+        if batch:
+            if args["protocol"] == "dap2":
+                warnings.warn(
+                    f"`batch={batch}` is currently only compatible with the `DAP4` "
+                    "protocol. Make sure the OPeNDAP server implements the `DAP4` "
+                    "protocol and then replace the scheme of the url with `dap4` "
+                    "to make use of it. Setting `batch=False`.",
+                    stacklevel=2,
+                )
```

Review comment (on lines 169 to +170, `args["checksums"] = checksums`): Please specify …

Review comment (on lines +179 to +185): Generally, if a user explicitly specifies an invalid argument, the preferred pattern is to raise an exception, rather than warning and ignoring what the user asked for. Likely pydap already does this? Generally, we re-raise errors from Xarray only when we can add Xarray-specific details that are helpful to users.
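A sketch of the pattern the reviewer describes, assuming the check stays in `PydapDataStore.open`; the error wording is hypothetical:

```python
def validate_batch(batch: bool, protocol: str) -> None:
    """Raise on an invalid combination instead of warning and overriding it."""
    if batch and protocol == "dap2":
        raise ValueError(
            "`batch=True` is only compatible with the DAP4 protocol; "
            "replace the URL scheme with `dap4://` to enable batching."
        )
```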
```diff
+            else:
+                # only update if dap4
+                args["batch"] = batch
         return cls(**args)
```
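Putting the new keywords together as they are wired through the backend entrypoints later in this diff, a user would opt in to batching roughly as follows; the URL is a placeholder for any DAP4-capable OPeNDAP endpoint.

```python
import xarray as xr

# `batch` and `checksums` are the keywords added in this PR; the URL scheme
# must be `dap4` for batching to take effect (placeholder server shown).
ds = xr.open_dataset(
    "dap4://server.example/opendap/data.nc",
    engine="pydap",
    batch=True,
    checksums=True,  # default
)
```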
```diff
     def open_store_variable(self, var):
-        data = indexing.LazilyIndexedArray(PydapArrayWrapper(var))
         try:
             dimensions = [
                 dim.split("/")[-1] if dim.startswith("/") else dim for dim in var.dims

@@ -148,6 +197,25 @@ def open_store_variable(self, var):
             # GridType does not have a dims attribute - instead get `dimensions`
             # see https://github.com/pydap/pydap/issues/485
             dimensions = var.dimensions
+        if (
+            self._protocol == "dap4"
+            and var.name in dimensions
+            and hasattr(var, "dataset")  # only True for pydap>3.5.5
+        ):
+            if not var.dataset._batch_mode:
+                # for dap4, always batch all dimensions at once
+                var.dataset.enable_batch_mode()
+            data_array = self._get_data_array(var)
+            data = indexing.LazilyIndexedArray(data_array)
+            if not self._batch and var.dataset._batch_mode:
+                # if `batch=False`, restore it for all other variables
+                var.dataset.disable_batch_mode()
+        else:
+            # all non-dimension variables
+            data = indexing.LazilyIndexedArray(
+                PydapArrayWrapper(var, self._batch, self._array_cache, self._checksums)
+            )

         return Variable(dimensions, data, var.attributes)
```
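One possible way to keep the enable/disable pairing above from leaking state between variables would be a context manager. This is purely a design sketch: pydap exposes the `enable_batch_mode`/`disable_batch_mode` pair used in the diff, but the `batch_mode` helper below does not exist in either library.

```python
from contextlib import contextmanager


@contextmanager
def batch_mode(dataset):
    """Hypothetical helper: enable pydap's batch mode, then restore the prior state."""
    was_enabled = dataset._batch_mode
    if not was_enabled:
        dataset.enable_batch_mode()
    try:
        yield dataset
    finally:
        if not was_enabled:
            dataset.disable_batch_mode()
```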
```diff
     def get_variables(self):

@@ -165,6 +233,7 @@ def get_variables(self):
             # check the key is not a BaseType or GridType
             if not isinstance(self.ds[var], GroupType)
         ]
+
         return FrozenDict((k, self.open_store_variable(self.ds[k])) for k in _vars)

     def get_attrs(self):
```
```diff
@@ -176,18 +245,35 @@ def get_attrs(self):
             "libdap",
             "invocation",
             "dimensions",
+            "path",
+            "Maps",
         )
-        attrs = self.ds.attributes
-        list(map(attrs.pop, opendap_attrs, [None] * 6))
+        attrs = dict(self.ds.attributes)
+        list(map(attrs.pop, opendap_attrs, [None] * 8))
         return Frozen(attrs)

     def get_dimensions(self):
-        return Frozen(self.ds.dimensions)
+        return Frozen(sorted(self.ds.dimensions))
```

Review comment (on `return Frozen(sorted(self.ds.dimensions))`): To potentially address the issues with dimensions in Datatree, and the …

Review comment (reply): This is only dataset-level dimensions, not variable-level dimensions. At the dataset level, dimension order doesn't really matter, so I doubt this is going to fix the issue, unfortunately.
```diff
     @property
     def ds(self):
         return get_group(self.dataset, self.group)

+    def _get_data_array(self, var):
+        """gets dimension data all at once, storing the numpy
+        arrays within a cached dictionary
+        """
+        from pydap.lib import get_batch_data
+
+        if not self._batch_done or var.id not in self._array_cache:
+            # store all dim data into a dict for reuse
+            self._array_cache = get_batch_data(
+                var.parent, self._array_cache, self._checksums
+            )
+            self._batch_done = True
+
+        return self._array_cache[var.id]
+

 class PydapBackendEntrypoint(BackendEntrypoint):
     """
```
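The caching strategy in `_get_data_array` pays for one bulk download and then serves every dimension from memory. Here is that pattern reduced to a runnable illustration, with a stub in place of `pydap.lib.get_batch_data`:

```python
import numpy as np


class DimensionCache:
    """Download all dimension arrays once, then serve them from a dict."""

    def __init__(self, download_all_dims):
        self._download_all_dims = download_all_dims  # stub for get_batch_data
        self._cache = {}
        self._done = False

    def get(self, dim_id):
        if not self._done or dim_id not in self._cache:
            # a single request populates every dimension at once
            self._cache = self._download_all_dims()
            self._done = True
        return self._cache[dim_id]


calls = []

def fetch_all():
    calls.append(1)
    return {"x": np.arange(3), "y": np.arange(4)}

dims = DimensionCache(fetch_all)
dims.get("x")
dims.get("y")
assert len(calls) == 1  # both dimensions came from one download
```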
```diff
@@ -231,6 +317,8 @@ def open_dataset(
         timeout=None,
         verify=None,
         user_charset=None,
+        batch=False,
+        checksums=True,
     ) -> Dataset:
         store = PydapDataStore.open(
             url=filename_or_obj,

@@ -241,6 +329,8 @@ def open_dataset(
             timeout=timeout,
             verify=verify,
             user_charset=user_charset,
+            batch=batch,
+            checksums=checksums,
         )
         store_entrypoint = StoreBackendEntrypoint()
         with close_on_error(store):

@@ -273,6 +363,8 @@ def open_datatree(
         timeout=None,
         verify=None,
         user_charset=None,
+        batch=False,
+        checksums=True,
     ) -> DataTree:
         groups_dict = self.open_groups_as_dict(
             filename_or_obj,

@@ -285,10 +377,12 @@ def open_datatree(
             decode_timedelta=decode_timedelta,
             group=group,
             application=None,
-            session=None,
-            timeout=None,
-            verify=None,
-            user_charset=None,
+            session=session,
+            timeout=timeout,
+            verify=verify,
+            user_charset=user_charset,
+            batch=batch,
+            checksums=checksums,
         )

         return datatree_from_dict_with_io_cleanup(groups_dict)
```
```diff
@@ -310,6 +404,8 @@ def open_groups_as_dict(
         timeout=None,
         verify=None,
         user_charset=None,
+        batch=False,
+        checksums=True,
     ) -> dict[str, Dataset]:
         from xarray.core.treenode import NodePath

@@ -321,6 +417,8 @@ def open_groups_as_dict(
             timeout=timeout,
             verify=verify,
             user_charset=user_charset,
+            batch=batch,
+            checksums=checksums,
        )

         # Check for a group and make it a parent if it exists
```
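The same two keywords now thread through the `DataTree` path as well. A usage sketch against a hypothetical hierarchical DAP4 endpoint:

```python
import xarray as xr

tree = xr.open_datatree(
    "dap4://server.example/opendap/hierarchical_data.nc",  # placeholder URL
    engine="pydap",
    batch=True,
    checksums=True,
)
```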
Review comment: Xarray should already avoid downloading 1D coordinates multiple times, because coordinates are saved in memory as NumPy arrays and pandas indexes. If this is not the case, please file a bug report to discuss :)

Review comment (reply): Ah, I see this was discussed earlier, and seems to be the same issue as #10560. I would prefer a more general solution rather than something specifically for pydap, which will be harder to maintain.