Skip to content

Commit e450d5a

Browse files
committed
Provide location for DuckDB extensions if HOME not set
Why these changes are being introduced: In the AWS Lambda context, the HOME env var is an empty string ''. DuckDB has a canned error response for this, suggesting: "Specify a home directory using the SET home_directory='/path/to/dir' option". How this addresses that need: If HOME is unset or an empty string, set explicit secret and extension directories at `/tmp/.duckdb/*` locations. Side effects of this change: * None Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/TIMX-541
1 parent 5b56965 commit e450d5a

File tree

3 files changed

+99
-10
lines changed

3 files changed

+99
-10
lines changed

tests/test_metadata.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
# ruff: noqa: S105, S108
2+
13
import glob
24
import os
35
from pathlib import Path
@@ -262,3 +264,72 @@ def test_tdm_current_records_most_recent_version(timdex_metadata_with_deltas):
262264
== most_recent.iloc[0]["run_timestamp"]
263265
)
264266
assert current_version.iloc[0]["run_id"] == most_recent.iloc[0]["run_id"]
267+
268+
269+
def test_tdm_prepare_duckdb_secret_and_extensions_home_env_var_set_and_valid(
    monkeypatch, tmp_path_factory, timdex_dataset_with_runs
):
    """Check DuckDB directory settings when HOME points at an existing directory.

    HOME is set to a freshly created temporary directory whose name contains
    "my-account".  The test then asserts that the connection's secret directory
    contains that name and that the extension directory setting is an empty
    string.
    """
    home_dir = tmp_path_factory.mktemp("my-account")
    monkeypatch.setenv("HOME", str(home_dir))

    tdm = TIMDEXDatasetMetadata(timdex_dataset_with_runs.location)

    settings_query = """
        select
            current_setting('secret_directory') as secret_directory,
            current_setting('extension_directory') as extension_directory
        ;
        """
    # single-row result: one column per DuckDB setting
    settings = tdm.conn.query(settings_query).to_df().iloc[0]

    assert "my-account" in settings.secret_directory
    assert settings.extension_directory == ""  # expected and okay when HOME set
290+
291+
292+
def test_tdm_prepare_duckdb_secret_and_extensions_home_env_var_unset(
    monkeypatch, timdex_dataset_with_runs
):
    """Check DuckDB directory settings when HOME is absent from the environment.

    With HOME deleted, the test asserts that the connection's secret and
    extension directory settings equal the explicit /tmp/.duckdb locations.
    """
    # raising=False: do not fail if HOME was already missing
    monkeypatch.delenv("HOME", raising=False)

    tdm = TIMDEXDatasetMetadata(timdex_dataset_with_runs.location)

    settings_query = """
        select
            current_setting('secret_directory') as secret_directory,
            current_setting('extension_directory') as extension_directory
        ;
        """
    settings_frame = tdm.conn.query(settings_query).to_df()
    settings = settings_frame.iloc[0]

    assert settings.secret_directory == "/tmp/.duckdb/secrets"
    assert settings.extension_directory == "/tmp/.duckdb/extensions"
313+
314+
315+
def test_tdm_prepare_duckdb_secret_and_extensions_home_env_var_set_but_empty(
    monkeypatch, timdex_dataset_with_runs
):
    """Check DuckDB directory settings when HOME is set to an empty string.

    With HOME set to '', the test asserts that the connection's secret and
    extension directory settings equal the explicit /tmp/.duckdb locations.
    """
    monkeypatch.setenv("HOME", "")  # simulate AWS Lambda environment

    tdm = TIMDEXDatasetMetadata(timdex_dataset_with_runs.location)

    relation = tdm.conn.query(
        """
        select
            current_setting('secret_directory') as secret_directory,
            current_setting('extension_directory') as extension_directory
        ;
        """
    )
    # single-row result: one column per DuckDB setting
    settings = relation.to_df().iloc[0]

    assert settings.secret_directory == "/tmp/.duckdb/secrets"
    assert settings.extension_directory == "/tmp/.duckdb/extensions"

timdex_dataset_api/dataset.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -226,8 +226,8 @@ def get_s3_filesystem() -> fs.FileSystem:
226226
def setup_duckdb_context(self) -> DuckDBPyConnection:
227227
"""Create a DuckDB connection that supports metadata and data query and retrieval.
228228
229-
This relies on TIMDEXDatasetMetadata.setup_duckdb_context() to produce a DuckDB
230-
connection that has all metadata already created.
229+
This method extends TIMDEXDatasetMetadata's pre-existing DuckDB connection, adding
230+
a 'data' schema and any other configurations needed.
231231
"""
232232
start_time = time.perf_counter()
233233

timdex_dataset_api/metadata.py

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -148,9 +148,35 @@ def configure_duckdb_connection(self, conn: DuckDBPyConnection) -> None:
148148
149149
These configurations include things like memory settings, AWS authentication, etc.
150150
"""
151+
self._install_duckdb_extensions(conn)
151152
self._configure_duckdb_s3_secret(conn)
152153
self._configure_duckdb_memory_profile(conn)
153154

155+
def _install_duckdb_extensions(self, conn: DuckDBPyConnection) -> None:
    """Ensure DuckDB can install extensions, then install and load httpfs.

    If the HOME environment variable is unset, empty, or does not name an
    existing directory, 'secrets' and 'extensions' directories are created
    under /tmp/.duckdb and the connection's 'secret_directory' and
    'extension_directory' settings are pointed at them.  In every case the
    httpfs extension is then installed and loaded on the connection.

    Args:
        conn: DuckDB connection to configure.
    """
    home_env = os.getenv("HOME")

    if not (home_env and Path(home_env).is_dir()):
        # no usable HOME: fall back to explicit locations under /tmp
        fallback_root = Path("/tmp/.duckdb")  # noqa: S108
        directories_by_setting = {
            "secret_directory": fallback_root / "secrets",
            "extension_directory": fallback_root / "extensions",
        }

        # create both directories before touching any DuckDB setting
        for directory in directories_by_setting.values():
            directory.mkdir(parents=True, exist_ok=True)

        for setting, directory in directories_by_setting.items():
            conn.execute(f"set {setting}='{directory.as_posix()}';")

    # install HTTPFS extension
    httpfs_sql = """
        install httpfs;
        load httpfs;
        """
    conn.execute(httpfs_sql)
179+
154180
def _configure_duckdb_s3_secret(
155181
self,
156182
conn: DuckDBPyConnection,
@@ -161,14 +187,6 @@ def _configure_duckdb_s3_secret(
161187
If a scope is provided, e.g. an S3 URI prefix like 's3://timdex', set a scope
162188
parameter in the config. Else, leave it blank.
163189
"""
164-
# install httpfs extension
165-
conn.execute(
166-
"""
167-
install httpfs;
168-
load httpfs;
169-
"""
170-
)
171-
172190
# establish scope string
173191
scope_str = f", scope '{scope}'" if scope else ""
174192

0 commit comments

Comments
 (0)