You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Currently, input fields for each ensemble member are downloaded using the ECMWF Open Data client, which takes ~5 minutes or more per member when downloading during heavy-traffic periods, as the retry log below shows.
============================================================
Processing member 2 (2/50)
============================================================
Creating input state for ensemble member 2
Getting surface fields...
Retrieving ['10u', '10v', '2d', '2t', 'msl', 'skt', 'sp', 'tcw'] data for member 2
Recovering from HTTP error [429 Too Many Requests], attempt 1 of 500
Retrying in 120 seconds
This becomes expensive when running inference on a GPU system. A possible workaround is to download and store inputs separately before passing them to the GPU.
import os
import datetime
from collections import defaultdict
import time
import numpy as np
import earthkit.data as ekd
import earthkit.regrid as ekr
from ecmwf.opendata import Client as OpendataClient
# Surface (single-level) parameters, retrieved per ensemble member
PARAM_SFC = ["10u", "10v", "2d", "2t", "msl", "skt", "sp", "tcw"]
# Constant surface fields (land-sea mask, orography metrics) — member-independent
PARAM_SFC_FC = ["lsm", "z", "slor", "sdor"]
# Soil temperature; renamed to stl1/stl2 in get_input_fields
PARAM_SOIL = ["sot"]
# Upper-air parameters on pressure levels; gh is converted to z downstream
PARAM_PL = ["gh", "t", "u", "v", "w", "q"]
# Pressure levels (hPa)
LEVELS = [1000, 925, 850, 700, 600, 500, 400, 300, 250, 200, 150, 100, 50]
SOIL_LEVELS = [1, 2]
# Configuration for multi-run
ENSEMBLE_MEMBERS = list(range(1, 51)) # Members 1-50
LEAD_TIME = 72 # Hours  # NOTE(review): not referenced in this chunk — presumably used by the inference step; confirm
OUTPUT_DIR = "ensemble_outputs"
# Latest available forecast cycle from ECMWF open data (this is a network call)
DATE = OpendataClient("ecmwf").latest()
print("Initial date is", DATE)
# Create output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Timestamp for output filenames — not referenced in this chunk; verify downstream use
date_str = DATE.strftime("%Y%m%d_%H%M")
def get_open_data(param, levelist=None, number=None):
    """Download fields from ECMWF open data for DATE-6h and DATE.

    Parameters
    ----------
    param : list of str
        Parameter short names to retrieve (e.g. ``["2t", "msl"]``).
    levelist : list of int, optional
        Level values to retrieve; ``None``/empty means single-level fields.
        (Default changed from ``[]`` to ``None`` to avoid the mutable
        default-argument pitfall; behavior is unchanged.)
    number : int, optional
        Ensemble member number; ``None`` retrieves the default
        (non-ensemble) stream.

    Returns
    -------
    dict
        Maps ``"<param>"`` (or ``"<param>_<level>"`` when ``levelist`` is
        given) to an array stacked over the two input dates.
    """
    levelist = [] if levelist is None else levelist
    fields = defaultdict(list)
    # The model needs two consecutive 6-hourly input states: DATE-6h and DATE.
    for date in [DATE - datetime.timedelta(hours=6), DATE]:
        if number is None:
            data = ekd.from_source("ecmwf-open-data", date=date, param=param, levelist=levelist)
        else:
            # Perturbed ensemble members are published in the 'enfo' stream.
            data = ekd.from_source("ecmwf-open-data", date=date, param=param, levelist=levelist,
                                   number=[number], stream='enfo')
        for f in data:
            # Decode the GRIB field once (was decoded twice: assert + roll).
            values = f.to_numpy()
            assert values.shape == (721, 1440)
            # Open data is between -180 and 180, we need to shift it to 0-360
            values = np.roll(values, -f.shape[1] // 2, axis=1)
            # Interpolate the data from the 0.25-degree grid to N320
            values = ekr.interpolate(values, {"grid": (0.25, 0.25)}, {"grid": "N320"})
            # Add the values to the list
            name = f"{f.metadata('param')}_{f.metadata('levelist')}" if levelist else f.metadata("param")
            fields[name].append(values)
    # Create a single matrix for each parameter; iterate over a snapshot of
    # the keys so we can replace values while looping. (The loop variable no
    # longer shadows the `param` argument, as it did before.)
    for key in list(fields):
        fields[key] = np.stack(fields[key])
    return fields
def get_input_fields(number):
    """Assemble the complete input-field dictionary for one ensemble member."""
    fields = {}
    # Member-specific single-level fields, then the constant surface fields
    # (land-sea mask, orography) which carry no member number.
    fields.update(get_open_data(param=PARAM_SFC, number=number))
    fields.update(get_open_data(param=PARAM_SFC_FC))
    # Soil fields arrive under open-data short names; translate them to the
    # names the model expects before merging.
    rename = {'sot_1': 'stl1', 'sot_2': 'stl2',
              'vsw_1': 'swvl1', 'vsw_2': 'swvl2'}
    soil = get_open_data(param=PARAM_SOIL, levelist=SOIL_LEVELS, number=number)
    fields.update((rename[key], values) for key, values in soil.items())
    # Upper-air fields on all pressure levels.
    fields.update(get_open_data(param=PARAM_PL, levelist=LEVELS, number=number))
    # The model wants geopotential z rather than geopotential height gh:
    # z = gh * g0, with g0 the standard gravity constant.
    for lvl in LEVELS:
        fields[f"z_{lvl}"] = fields.pop(f"gh_{lvl}") * 9.80665
    return fields
%%timeit
# NOTE(review): %%timeit is an IPython cell magic — it re-executes this whole
# cell several times to time it, re-downloading every member on each pass, and
# it is a syntax error outside a notebook. %%time (single execution) is likely
# what was intended — confirm.
for member in ENSEMBLE_MEMBERS:
    start_time = time.time()  # per-member wall-clock start; not printed in this chunk
    print(f"\n{'='*60}")
    print(f"Processing ensemble member {member}/{len(ENSEMBLE_MEMBERS)}")
    print(f"{'='*60}")
    # Get input fields for this member
    print(f"Retrieving initial conditions for member {member}...")
    fields = get_input_fields(member)
    # Input state in the shape the AIFS runner expects: date + field dict
    input_state = dict(date=DATE, fields=fields)
    print(fields)
reacted with thumbs up emoji reacted with thumbs down emoji reacted with laugh emoji reacted with hooray emoji reacted with confused emoji reacted with heart emoji reacted with rocket emoji reacted with eyes emoji
Uh oh!
There was an error while loading. Please reload this page.
Uh oh!
There was an error while loading. Please reload this page.
-
Working with ECMWF open data to collect initial condition inputs for AIFS, using a modified version of the notebook at:
https://huggingface.co/ecmwf/aifs-ens-1.0/blob/main/run_AIFS_ENS_v1.ipynb
Currently, input fields for each ensemble member are downloaded using the ECMWF Open Data client, which takes ~5 minutes or more per member when downloading during heavy-traffic periods, as the retry log above shows.
This becomes expensive when running inference on a GPU system. A possible workaround is to download and store inputs separately before passing them to the GPU.
Exploring alternatives like:
the `s3source` in [earthkit.data](https://github.com/ecmwf/earthkit-data/blob/develop/src/earthkit/data/sources/s3.py). Is there a recommended approach, or interest in a kerchunk-based solution (e.g., [kerchunk issue #546](fsspec/kerchunk#546)), similar to what was done for GEFS S3 data ([discussion #572](fsspec/kerchunk#572))?
Beta Was this translation helpful? Give feedback.
All reactions