From 33f822f65270f14aa91ca11844dc4dd0e89e69e7 Mon Sep 17 00:00:00 2001 From: Troy Raen Date: Mon, 24 Mar 2025 13:04:45 -0700 Subject: [PATCH 01/17] Drop obsolete .gitkeep --- tutorials/euclid_access/.gitkeep | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 tutorials/euclid_access/.gitkeep diff --git a/tutorials/euclid_access/.gitkeep b/tutorials/euclid_access/.gitkeep deleted file mode 100644 index e69de29b..00000000 From 52c966b95bdff1374721c426e796f32cb93b4aee Mon Sep 17 00:00:00 2001 From: Troy Raen Date: Mon, 24 Mar 2025 17:47:00 -0700 Subject: [PATCH 02/17] Add euclid-hats-parquet notebook --- .../euclid-hats-parquet.md | 235 ++++++++++++++++++ 1 file changed, 235 insertions(+) create mode 100644 tutorials/parquet-catalog-demos/euclid-hats-parquet.md diff --git a/tutorials/parquet-catalog-demos/euclid-hats-parquet.md b/tutorials/parquet-catalog-demos/euclid-hats-parquet.md new file mode 100644 index 00000000..37712b78 --- /dev/null +++ b/tutorials/parquet-catalog-demos/euclid-hats-parquet.md @@ -0,0 +1,235 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.16.1 +kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +--- + +# Euclid Quick Release 1: MER Catalogs in HATS Parquet + ++++ + +## Learning Goals +By the end of this tutorial, you will: + +- # [TODO] + ++++ + +## Introduction + ++++ + +This notebook explores the HATS version of the [Euclid Q1](https://irsa.ipac.caltech.edu/data/Euclid/docs/overview_q1.html) MER Catalogs that has been created by IRSA. +In this version, the three MER Catalogs (MER, MER Morphology, and MER Cutouts) have been joined by Object ID into a single Parquet dataset. +It contains 29,953,430 rows and 601 columns. The total size is 32 GB. +The dataset is partitioned following [HATS](https://hats.readthedocs.io/) (Hierarchical Adaptive Tiling Scheme). + ++++ + +## Installs and imports + +```{code-cell} +# !pip install 'hats>=0.5' 'lsdb>=0.5' matplotlib numpy s3fs +``` + +```{code-cell} +import os + +import dask.distributed +import hats +import lsdb +import matplotlib.colors +import matplotlib.pyplot as plt +import numpy as np +``` + +## 1. Setup + +```{code-cell} +# Need UPath for the testing bucket. Otherwise hats will ignore the credentials that Fornax +# provides under the hood. Will be unnecessary after the dataset is released in a public bucket. +from upath import UPath + +# AWS S3 path where this dataset is stored. +s3_bucket = "irsa-fornax-testdata" +s3_key = "EUCLID/q1/mer_catalogue/hats" +euclid_s3_path = UPath(f"s3://{s3_bucket}/{s3_key}") +``` + +We will use the [`hats`](https://hats.readthedocs.io/) library to visualize the catalog and access the schema. + +```{code-cell} +# Load the parquet dataset using hats. +euclid_hats = hats.read_hats(euclid_s3_path) +``` + +## 2. Visualize the on-sky density of Q1 Objects and HATS partitions + ++++ + +Euclid Q1 covers four non-contiguous fields: Euclid Deep Field North (22.9 sq deg), Euclid Deep Field Fornax (12.1 sq deg), Euclid Deep Field South (28.1 sq deg), and LDN1641. +We can visualize the Object density in the four fields using `hats`. + +```{code-cell} +# Visualize the on-sky distribution of objects in the Q1 MER Catalog. +hats.inspection.plot_density(euclid_hats) +``` + +HATS (Hierarchical Adaptive Tiling Scheme) is a spatial partitioning based on HEALPix that aims to +produce partitions (files) of roughly equal size. 
This makes them more efficient to work with, +especially for large-scale analyses and/or parallel processing. +HATS does this by adjusting the partitioning order (i.e., HEALPix order at which data is partitioned) +according to the on-sky density of the objects or sources (rows) in the dataset. +In other words, dense regions are partitioned at a +higher HEALPix order (smaller pixel size) to reduce the number of objects in those partitions towards the mean; +vice versa for sparse regions. + +We can see this by plotting the partitioning orders. + +```{code-cell} +# Visualize the HEALPix order of each partition. +hats.inspection.plot_pixels(euclid_hats) +``` + +## 3. CMD of stars in Euclid Q1 + ++++ + +In this section, we query the Euclid Q1 MER catalogs for likely stars and create a color-magnitude diagram (CMD), following +[Introduction to Euclid Q1 MER catalog](https://caltech-ipac.github.io/irsa-tutorials/tutorials/euclid_access/2_Euclid_intro_MER_catalog.html). +Here, we'll use [`lsdb`](https://docs.lsdb.io/) to query the parquet files that are sitting in an S3 bucket (the intro notebook uses `pyvo` to query the TAP service). +`lsdb` enables efficient, large-scale queries on HATS catalogs, so let's look at *all* likely stars in Euclid Q1 instead of limiting to 10,000. + ++++ + +`lsdb` uses Dask for parallelization. Set up the workers. + +```{code-cell} +client = dask.distributed.Client( + n_workers=os.cpu_count(), threads_per_worker=2, memory_limit="auto" +) +``` + +The data will be lazy-loaded. This means that commands like `query` are not executed until the data is actually required. + +```{code-cell} +# Load the parquet dataset using lsdb. +columns = [ + "TILEID", + "FLUX_VIS_PSF", + "FLUX_Y_TEMPLFIT", + "FLUX_J_TEMPLFIT", + "FLUX_H_TEMPLFIT", + "POINT_LIKE_FLAG", +] +euclid_lsdb = lsdb.read_hats(UPath(f"s3://{s3_bucket}/{s3_key}"), columns=columns) + +# Set up the query for likely stars. +star_cuts = "FLUX_VIS_PSF > 0 & FLUX_Y_TEMPLFIT > 0 & FLUX_J_TEMPLFIT > 0 & FLUX_H_TEMPLFIT > 0 & POINT_LIKE_FLAG == 1" +euclid_stars = euclid_lsdb.query(star_cuts) +``` + +```{code-cell} +# Peek at the data. +euclid_stars.head(10) +``` + +We peeked at the data but we haven't loaded all of it yet. +What we really need in order to create a CMD is the magnitudes, so let's calculate those now. +Appending `.compute()` to the commands will trigger Dask to actually load this data into memory. +It is not strictly necessary, but will allow us to look at the data repeatedly without having to re-load it each time. + +```{code-cell} +# Calculate magnitudes. Appending `.compute()` triggers Dask to load this data now. +mag_y = (-2.5 * np.log10(euclid_stars["FLUX_Y_TEMPLFIT"]) + 23.9).compute() +mag_h = (-2.5 * np.log10(euclid_stars["FLUX_H_TEMPLFIT"]) + 23.9).compute() + +print(f"Loaded magnitudes of {len(mag_y):,} likely stars in Euclid Q1.") +``` + +Create the CMD + +```{code-cell} +hb = plt.hexbin(mag_y - mag_h, mag_y, norm=matplotlib.colors.LogNorm(vmin=1, vmax=50_000)) +plt.colorbar(hb) +plt.xlabel("Y-H") +plt.ylabel("Y") +plt.xlim(-10, 10) +plt.ylim(10, 35) +plt.title("Stars in Euclid Q1 MER Catalog") +plt.show() +``` + +```{code-cell} +# Close the Dask client. +client.close() +``` + +## 4. Schema + +IRSA's +[Cloud Access](https://caltech-ipac.github.io/irsa-tutorials/tutorials/cloud_access/cloud-access-intro.html#navigate-a-catalog-and-perform-a-basic-query) +notebook shows how to work with parquet schemas. +`hats` will return the same pyarrow schema object shown in that notebook, so let's use it. 
+ +```{code-cell} +# Fetch the pyarrow schema from hats. +schema = euclid_hats.schema +print(f"{len(schema)} columns in the combined Euclid Q1 MER Catalogs") +``` + +The three catalogs MER, MER Morphology, and MER Cutouts have been joined together in this parquet version. +You can see their original schemas at +[Euclid Final Catalog description](http://st-dm.pages.euclid-sgs.uk/data-product-doc/dmq1/merdpd/dpcards/mer_finalcatalog.html). + +Two columns have been added to the top of the schema. +'_healpix_29' is the pixel index at HEALPix order 29. +It is used by `hats` and is generally useful for spatial queries. +'TILEID' is the Euclid MER tile index, added for convenience. + +```{code-cell} +schema.names[:5] +``` + +Three columns have been added to the bottom: 'Norder', 'Dir', and 'Npix'. These are the HATS partitioning columns. + +```{code-cell} +schema.names[-5:] +``` + +In the original schemas, a handful of column names appear in both the main MER catalog and one of its secondaries (MER Cutouts). +To ensure that column names are unique in the parquet, the name of the secondary catalog was appended +to the affected column names, separated by "-". + +```{code-cell} +# Find the column names that were affected in the secondary catalogs. +mer_secondaries = ["MORPH", "CUTOUTS"] +[name for name in euclid_hats.schema.names if name.split("-")[-1] in mer_secondaries] +``` + +For each of these, there is another column without the catalog name appended, which belongs to the main catalog. + +```{code-cell} +print("-- MER column --") +print(schema.field("RIGHT_ASCENSION")) +print(schema.field("RIGHT_ASCENSION").metadata) + +print("-- MER Cutouts column --") +print(schema.field("RIGHT_ASCENSION-CUTOUTS")) +print(schema.field("RIGHT_ASCENSION-CUTOUTS").metadata) +``` + +## About this notebook + +**Authors:** Troy Raen (Developer; Caltech/IPAC-IRSA) and the IRSA Data Science Team. + +**Contact:** [IRSA Helpdesk](https://irsa.ipac.caltech.edu/docs/help_desk.html) with questions or problems. + +**Updated:** 2025-03-24 From f90fc39555d76cbcbeb9b94d87ea00e2e053ed21 Mon Sep 17 00:00:00 2001 From: Troy Raen Date: Mon, 24 Mar 2025 17:50:08 -0700 Subject: [PATCH 03/17] Update index.md --- index.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/index.md b/index.md index 58af2e96..583be73b 100644 --- a/index.md +++ b/index.md @@ -32,12 +32,13 @@ caption: Cloud data access --- tutorials/cloud_access/cloud-access-intro +tutorials/cloud_access/euclid-cloud-access +tutorials/parquet-catalog-demos/euclid-hats-parquet tutorials/parquet-catalog-demos/wise-allwise-catalog-demo tutorials/parquet-catalog-demos/neowise-source-table-strategies tutorials/parquet-catalog-demos/neowise-source-table-lightcurves tutorials/openuniversesims/openuniverse2024_roman_simulated_timedomainsurvey tutorials/openuniversesims/openuniverse2024_roman_simulated_wideareasurvey -tutorials/cloud_access/euclid-cloud-access ``` @@ -69,6 +70,7 @@ tutorials/euclid_access/3_Euclid_intro_1D_spectra tutorials/euclid_access/4_Euclid_intro_PHZ_catalog tutorials/euclid_access/5_Euclid_intro_SPE_catalog tutorials/cloud_access/euclid-cloud-access +tutorials/parquet-catalog-demos/euclid-hats-parquet ``` @@ -112,4 +114,4 @@ tutorials/parallelize/Parallelize_Convolution **Authors:** IRSA Scientists and Developers wrote and maintain these notebooks. -**Contact:** [the IRSA Helpdesk](https://irsa.ipac.caltech.edu/docs/help_desk.html) with questions or reporting problems. 
\ No newline at end of file +**Contact:** [the IRSA Helpdesk](https://irsa.ipac.caltech.edu/docs/help_desk.html) with questions or reporting problems. From 1a3eef171bfbc2983d324a4c22e4c8e6c1933045 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Brigitta=20Sip=C5=91cz?= Date: Mon, 24 Mar 2025 18:43:57 -0700 Subject: [PATCH 04/17] TMP: ignore euclid-hats-parquet notebook until data is fully public --- conf.py | 2 ++ ignore_testing | 1 + 2 files changed, 3 insertions(+) diff --git a/conf.py b/conf.py index 6eb97f78..2a2befaa 100644 --- a/conf.py +++ b/conf.py @@ -58,6 +58,8 @@ # Both NEOWISE parquet notebooks work with large data that doesn't work within CircleCI or GHA resource limits nb_execution_excludepatterns += ['neowise-source-table-strategies.md', 'neowise-source-table-lightcurves.md',] + # Data is not yet public + nb_execution_excludepatterns += ['euclid-hats-parquet.md', ] if platform.platform().startswith("mac") or platform.platform().startswith("win"): # The way the notebooks use the multiprocessing module is known to not work on non-Linux diff --git a/ignore_testing b/ignore_testing index e69de29b..e0419716 100644 --- a/ignore_testing +++ b/ignore_testing @@ -0,0 +1 @@ +euclid-hats-parquet From 9335d49b021f99b4c83e9ef5075e36f9e273241b Mon Sep 17 00:00:00 2001 From: Troy Raen Date: Mon, 24 Mar 2025 23:20:27 -0700 Subject: [PATCH 05/17] Apply review feedback from @bsipocz --- .../euclid-hats-parquet.md | 41 ++++++++++++------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/tutorials/parquet-catalog-demos/euclid-hats-parquet.md b/tutorials/parquet-catalog-demos/euclid-hats-parquet.md index 37712b78..dae2bddc 100644 --- a/tutorials/parquet-catalog-demos/euclid-hats-parquet.md +++ b/tutorials/parquet-catalog-demos/euclid-hats-parquet.md @@ -18,7 +18,8 @@ kernelspec: ## Learning Goals By the end of this tutorial, you will: -- # [TODO] +- Understand the format, partitioning, and schema of this dataset. +- Be able to query this dataset for likely stars. +++ @@ -26,10 +27,23 @@ By the end of this tutorial, you will: +++ -This notebook explores the HATS version of the [Euclid Q1](https://irsa.ipac.caltech.edu/data/Euclid/docs/overview_q1.html) MER Catalogs that has been created by IRSA. -In this version, the three MER Catalogs (MER, MER Morphology, and MER Cutouts) have been joined by Object ID into a single Parquet dataset. -It contains 29,953,430 rows and 601 columns. The total size is 32 GB. -The dataset is partitioned following [HATS](https://hats.readthedocs.io/) (Hierarchical Adaptive Tiling Scheme). +This notebook demonstrates accesses to a copy of the +[Euclid Q1](https://irsa.ipac.caltech.edu/data/Euclid/docs/overview_q1.html) MER Catalogs +that is in Apache Parquet format, partitioned according to the +Hierarchical Adaptive Tiling Scheme (HATS), and stored in an AWS S3 bucket. + +This is a single parquet dataset which comprises all three MER Catalogs +-- MER, MER Morphology, and MER Cutouts -- which have been joined by Object ID. +Their schemas (pre-join) can be seen at +[Euclid Final Catalog description](http://st-dm.pages.euclid-sgs.uk/data-product-doc/dmq1/merdpd/dpcards/mer_finalcatalog.html). +Minor modifications were made to the parquet schema to accommodate the join (de-duplicating column names) +and for the HATS standard. These differences are shown below. + +HATS is a spatial partitioning scheme based on HEALPix that aims to +produce partitions (files) of roughly equal size. 
+This makes them more efficient to work with, +especially for large-scale analyses and/or parallel processing. +This notebook demonstrates the basics. +++ @@ -63,7 +77,7 @@ s3_key = "EUCLID/q1/mer_catalogue/hats" euclid_s3_path = UPath(f"s3://{s3_bucket}/{s3_key}") ``` -We will use the [`hats`](https://hats.readthedocs.io/) library to visualize the catalog and access the schema. +We will use [`hats`](https://hats.readthedocs.io/) to visualize the catalog and access the schema. ```{code-cell} # Load the parquet dataset using hats. @@ -82,9 +96,6 @@ We can visualize the Object density in the four fields using `hats`. hats.inspection.plot_density(euclid_hats) ``` -HATS (Hierarchical Adaptive Tiling Scheme) is a spatial partitioning based on HEALPix that aims to -produce partitions (files) of roughly equal size. This makes them more efficient to work with, -especially for large-scale analyses and/or parallel processing. HATS does this by adjusting the partitioning order (i.e., HEALPix order at which data is partitioned) according to the on-sky density of the objects or sources (rows) in the dataset. In other words, dense regions are partitioned at a @@ -174,6 +185,10 @@ client.close() ## 4. Schema ++++ + +The three catalogs MER, MER Morphology, and MER Cutouts have been joined together in this parquet version. + IRSA's [Cloud Access](https://caltech-ipac.github.io/irsa-tutorials/tutorials/cloud_access/cloud-access-intro.html#navigate-a-catalog-and-perform-a-basic-query) notebook shows how to work with parquet schemas. @@ -185,10 +200,6 @@ schema = euclid_hats.schema print(f"{len(schema)} columns in the combined Euclid Q1 MER Catalogs") ``` -The three catalogs MER, MER Morphology, and MER Cutouts have been joined together in this parquet version. -You can see their original schemas at -[Euclid Final Catalog description](http://st-dm.pages.euclid-sgs.uk/data-product-doc/dmq1/merdpd/dpcards/mer_finalcatalog.html). - Two columns have been added to the top of the schema. '_healpix_29' is the pixel index at HEALPix order 29. It is used by `hats` and is generally useful for spatial queries. @@ -230,6 +241,6 @@ print(schema.field("RIGHT_ASCENSION-CUTOUTS").metadata) **Authors:** Troy Raen (Developer; Caltech/IPAC-IRSA) and the IRSA Data Science Team. -**Contact:** [IRSA Helpdesk](https://irsa.ipac.caltech.edu/docs/help_desk.html) with questions or problems. - **Updated:** 2025-03-24 + +**Contact:** [IRSA Helpdesk](https://irsa.ipac.caltech.edu/docs/help_desk.html) with questions or problems. From 8bc87e4c40056f46e30dd545aa8881ba5b35a6cd Mon Sep 17 00:00:00 2001 From: Troy Raen Date: Tue, 25 Mar 2025 17:43:41 -0700 Subject: [PATCH 06/17] Add a sentence about Parquet --- tutorials/parquet-catalog-demos/euclid-hats-parquet.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tutorials/parquet-catalog-demos/euclid-hats-parquet.md b/tutorials/parquet-catalog-demos/euclid-hats-parquet.md index dae2bddc..8e74e321 100644 --- a/tutorials/parquet-catalog-demos/euclid-hats-parquet.md +++ b/tutorials/parquet-catalog-demos/euclid-hats-parquet.md @@ -31,6 +31,9 @@ This notebook demonstrates accesses to a copy of the [Euclid Q1](https://irsa.ipac.caltech.edu/data/Euclid/docs/overview_q1.html) MER Catalogs that is in Apache Parquet format, partitioned according to the Hierarchical Adaptive Tiling Scheme (HATS), and stored in an AWS S3 bucket. 
+Parquet is a file format that enables flexible and efficient data access by, among other things, +supporting the application of both column and row filters when reading the data (very similar to a SQL query) +so that only the desired data is loaded into memory. This is a single parquet dataset which comprises all three MER Catalogs -- MER, MER Morphology, and MER Cutouts -- which have been joined by Object ID. @@ -241,6 +244,6 @@ print(schema.field("RIGHT_ASCENSION-CUTOUTS").metadata) **Authors:** Troy Raen (Developer; Caltech/IPAC-IRSA) and the IRSA Data Science Team. -**Updated:** 2025-03-24 +**Updated:** 2025-03-25 **Contact:** [IRSA Helpdesk](https://irsa.ipac.caltech.edu/docs/help_desk.html) with questions or problems. From a43f395518efde1de5f9a68b26c254c6bc379ea6 Mon Sep 17 00:00:00 2001 From: Troy Raen Date: Wed, 26 Mar 2025 15:46:00 -0700 Subject: [PATCH 07/17] Add the common Euclid abbreviations "ERO" and "Q1" index headers --- index.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/index.md b/index.md index 583be73b..a5aca3c7 100644 --- a/index.md +++ b/index.md @@ -44,7 +44,7 @@ tutorials/openuniversesims/openuniverse2024_roman_simulated_wideareasurvey ## Accessing Euclid data -### Euclid Early Release Observation +### Euclid Early Release Observation (ERO) ```{toctree} --- @@ -56,7 +56,7 @@ tutorials/euclid_access/Euclid_ERO ``` -### Euclid Quick Release 1 +### Euclid Quick Release 1 (Q1) ```{toctree} --- From af6b5d7a38ea42cc6e4e5dce44d173ed466813bb Mon Sep 17 00:00:00 2001 From: Troy Raen Date: Fri, 28 Mar 2025 04:03:36 -0700 Subject: [PATCH 08/17] Apply @afaisst feedback. Use euclid_s3_path. --- tutorials/parquet-catalog-demos/euclid-hats-parquet.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/parquet-catalog-demos/euclid-hats-parquet.md b/tutorials/parquet-catalog-demos/euclid-hats-parquet.md index 8e74e321..e880b133 100644 --- a/tutorials/parquet-catalog-demos/euclid-hats-parquet.md +++ b/tutorials/parquet-catalog-demos/euclid-hats-parquet.md @@ -143,7 +143,7 @@ columns = [ "FLUX_H_TEMPLFIT", "POINT_LIKE_FLAG", ] -euclid_lsdb = lsdb.read_hats(UPath(f"s3://{s3_bucket}/{s3_key}"), columns=columns) +euclid_lsdb = lsdb.read_hats(euclid_s3_path, columns=columns) # Set up the query for likely stars. star_cuts = "FLUX_VIS_PSF > 0 & FLUX_Y_TEMPLFIT > 0 & FLUX_J_TEMPLFIT > 0 & FLUX_H_TEMPLFIT > 0 & POINT_LIKE_FLAG == 1" From d4e8b48c4a3b317a37cb8596eb82c4b1c969f237 Mon Sep 17 00:00:00 2001 From: Troy Raen Date: Fri, 28 Mar 2025 04:37:44 -0700 Subject: [PATCH 09/17] Temp fix. Uninstall numpy and pyerfa before installs. --- tutorials/parquet-catalog-demos/euclid-hats-parquet.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tutorials/parquet-catalog-demos/euclid-hats-parquet.md b/tutorials/parquet-catalog-demos/euclid-hats-parquet.md index e880b133..bdeb47cf 100644 --- a/tutorials/parquet-catalog-demos/euclid-hats-parquet.md +++ b/tutorials/parquet-catalog-demos/euclid-hats-parquet.md @@ -53,6 +53,7 @@ This notebook demonstrates the basics. ## Installs and imports ```{code-cell} +# !pip uninstall -y numpy pyerfa # !pip install 'hats>=0.5' 'lsdb>=0.5' matplotlib numpy s3fs ``` @@ -65,6 +66,9 @@ import lsdb import matplotlib.colors import matplotlib.pyplot as plt import numpy as np +# NOTE: If you run into an error that starts with, +# "A module that was compiled using NumPy 1.x cannot be run in NumPy 2.1.3 as it may crash.", +# make sure you have restarted the kernel since doing `pip install`. Then re-run the cell. 
``` ## 1. Setup From 6dcd3e12e84e35aae753d2ce8940177da2d38a90 Mon Sep 17 00:00:00 2001 From: Troy Raen Date: Fri, 28 Mar 2025 04:44:13 -0700 Subject: [PATCH 10/17] Add anon=True option for IPAC --- tutorials/parquet-catalog-demos/euclid-hats-parquet.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tutorials/parquet-catalog-demos/euclid-hats-parquet.md b/tutorials/parquet-catalog-demos/euclid-hats-parquet.md index bdeb47cf..1cd187ad 100644 --- a/tutorials/parquet-catalog-demos/euclid-hats-parquet.md +++ b/tutorials/parquet-catalog-demos/euclid-hats-parquet.md @@ -53,7 +53,7 @@ This notebook demonstrates the basics. ## Installs and imports ```{code-cell} -# !pip uninstall -y numpy pyerfa +# !pip uninstall -y numpy pyerfa # Helps resolve numpy>=2.0 dependency issues. # !pip install 'hats>=0.5' 'lsdb>=0.5' matplotlib numpy s3fs ``` @@ -82,6 +82,9 @@ from upath import UPath s3_bucket = "irsa-fornax-testdata" s3_key = "EUCLID/q1/mer_catalogue/hats" euclid_s3_path = UPath(f"s3://{s3_bucket}/{s3_key}") + +# Note: If running from IPAC, you need an anonymous connection. Uncomment the next line. +# euclid_s3_path = UPath(f"s3://{s3_bucket}/{s3_key}", anon=True) ``` We will use [`hats`](https://hats.readthedocs.io/) to visualize the catalog and access the schema. From c4c46dd11db6ab9e5a255893ed158f808bac8a89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Brigitta=20Sip=C5=91cz?= Date: Fri, 28 Mar 2025 07:54:07 -0700 Subject: [PATCH 11/17] Adding new dependencies to the central requirements file, too --- .binder/requirements.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.binder/requirements.txt b/.binder/requirements.txt index 64e7a775..45f0d74d 100644 --- a/.binder/requirements.txt +++ b/.binder/requirements.txt @@ -1,6 +1,6 @@ # For the content of the tutorials tqdm -numpy>=1.24 +numpy>=2 matplotlib>=3.7 astropy>=5.3 pyvo>=1.5 @@ -19,6 +19,8 @@ reproject photutils>=2.0 fsspec sep>=1.4 +hats>=0.5 +lsdb>=0.5 # For supporting myst-based notebooks jupytext # Making admonotions look nice for the myst notebooks From 1dceeddd0a58a602c26f8f1611c648edcfd88a2c Mon Sep 17 00:00:00 2001 From: Troy Raen Date: Fri, 28 Mar 2025 19:20:05 -0700 Subject: [PATCH 12/17] Apply suggestions from @bsipocz code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Brigitta Sipőcz --- tutorials/parquet-catalog-demos/euclid-hats-parquet.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tutorials/parquet-catalog-demos/euclid-hats-parquet.md b/tutorials/parquet-catalog-demos/euclid-hats-parquet.md index 1cd187ad..c2070294 100644 --- a/tutorials/parquet-catalog-demos/euclid-hats-parquet.md +++ b/tutorials/parquet-catalog-demos/euclid-hats-parquet.md @@ -66,9 +66,12 @@ import lsdb import matplotlib.colors import matplotlib.pyplot as plt import numpy as np -# NOTE: If you run into an error that starts with, -# "A module that was compiled using NumPy 1.x cannot be run in NumPy 2.1.3 as it may crash.", -# make sure you have restarted the kernel since doing `pip install`. Then re-run the cell. +``` + +```{tip} +If you run into an error that starts with, +"A module that was compiled using NumPy 1.x cannot be run in NumPy 2.1.3 as it may crash.", +make sure you have restarted the kernel since doing `pip install`. Then re-run the cell. ``` ## 1. 
Setup From 31a55cae5a37a697f5f88e92642fd9b4c2b97c2c Mon Sep 17 00:00:00 2001 From: Troy Raen Date: Fri, 28 Mar 2025 20:06:52 -0700 Subject: [PATCH 13/17] Apply suggestions from @bsipocz code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Brigitta Sipőcz --- tutorials/parquet-catalog-demos/euclid-hats-parquet.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/parquet-catalog-demos/euclid-hats-parquet.md b/tutorials/parquet-catalog-demos/euclid-hats-parquet.md index c2070294..f53ae56c 100644 --- a/tutorials/parquet-catalog-demos/euclid-hats-parquet.md +++ b/tutorials/parquet-catalog-demos/euclid-hats-parquet.md @@ -11,7 +11,7 @@ kernelspec: name: python3 --- -# Euclid Quick Release 1: MER Catalogs in HATS Parquet +# Euclid Q1: MER Catalogs in HATS Parquet +++ From 7eb2cc126ce545707554add877481bcc5d184de7 Mon Sep 17 00:00:00 2001 From: Troy Raen Date: Sat, 29 Mar 2025 03:22:07 -0700 Subject: [PATCH 14/17] Add pyerfa>=2.0.1.3 to binder requirements.txt. Needed for numpy>=2.0. --- .binder/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/.binder/requirements.txt b/.binder/requirements.txt index 45f0d74d..662cac6e 100644 --- a/.binder/requirements.txt +++ b/.binder/requirements.txt @@ -21,6 +21,7 @@ fsspec sep>=1.4 hats>=0.5 lsdb>=0.5 +pyerfa>=2.0.1.3 # For supporting myst-based notebooks jupytext # Making admonotions look nice for the myst notebooks From f1c5686fbe3308b1220e0afb79f7c6754e72eb15 Mon Sep 17 00:00:00 2001 From: Troy Raen Date: Sat, 29 Mar 2025 03:25:23 -0700 Subject: [PATCH 15/17] Apply suggestions from @afaisst and @bsipocz code reviews. --- .../euclid-hats-parquet.md | 88 ++++++++++--------- 1 file changed, 48 insertions(+), 40 deletions(-) diff --git a/tutorials/parquet-catalog-demos/euclid-hats-parquet.md b/tutorials/parquet-catalog-demos/euclid-hats-parquet.md index f53ae56c..4b326d01 100644 --- a/tutorials/parquet-catalog-demos/euclid-hats-parquet.md +++ b/tutorials/parquet-catalog-demos/euclid-hats-parquet.md @@ -18,8 +18,11 @@ kernelspec: ## Learning Goals By the end of this tutorial, you will: -- Understand the format, partitioning, and schema of this dataset. -- Be able to query this dataset for likely stars. +- Access basic metadata to understand the format and schema of this unified HATS Parquet dataset. +- Visualize the HATS partitioning of this dataset. +- Query this dataset for likely stars and create a color-magnitude diagram. (Recreate the figure from + [Introduction to Euclid Q1 MER catalog](https://caltech-ipac.github.io/irsa-tutorials/tutorials/euclid_access/2_Euclid_intro_MER_catalog.html), + this time with *all* likely stars.) +++ @@ -27,34 +30,41 @@ By the end of this tutorial, you will: +++ -This notebook demonstrates accesses to a copy of the +This notebook demonstrates accesses to a version of the [Euclid Q1](https://irsa.ipac.caltech.edu/data/Euclid/docs/overview_q1.html) MER Catalogs that is in Apache Parquet format, partitioned according to the Hierarchical Adaptive Tiling Scheme (HATS), and stored in an AWS S3 bucket. -Parquet is a file format that enables flexible and efficient data access by, among other things, -supporting the application of both column and row filters when reading the data (very similar to a SQL query) -so that only the desired data is loaded into memory. 
-This is a single parquet dataset which comprises all three MER Catalogs +The catalog version accessed here is a single dataset which comprises all three MER Catalogs -- MER, MER Morphology, and MER Cutouts -- which have been joined by Object ID. Their schemas (pre-join) can be seen at [Euclid Final Catalog description](http://st-dm.pages.euclid-sgs.uk/data-product-doc/dmq1/merdpd/dpcards/mer_finalcatalog.html). Minor modifications were made to the parquet schema to accommodate the join (de-duplicating column names) and for the HATS standard. These differences are shown below. +Parquet is a file format that enables flexible and efficient data access by, among other things, +supporting the application of both column and row filters when reading the data (very similar to a SQL query) +so that only the desired data is loaded into memory. + HATS is a spatial partitioning scheme based on HEALPix that aims to produce partitions (files) of roughly equal size. -This makes them more efficient to work with, +This makes the files more efficient to work with, especially for large-scale analyses and/or parallel processing. -This notebook demonstrates the basics. +It does this by adapting the HEALPix order at which data is partitioned in a given catalog based +on the on-sky density of the rows it contains. +In other words, data from dense regions of sky will be partitioned at a higher order +(i.e., higher resolution; smaller pixel size) than data in sparse regions. +HATS-aware python packages are being developed to take full advantage of the partitioning. +In this notebook, we will use the [hats](https://hats.readthedocs.io/) library to visualize the +catalog and access the schema, and [lsdb](https://docs.lsdb.io/) to do a query for all likely stars. +++ ## Installs and imports ```{code-cell} -# !pip uninstall -y numpy pyerfa # Helps resolve numpy>=2.0 dependency issues. -# !pip install 'hats>=0.5' 'lsdb>=0.5' matplotlib numpy s3fs +# # Uncomment the next line to install dependencies if needed. +# !pip install 'hats>=0.5' 'lsdb>=0.5' matplotlib 'numpy>=2.0' 'pyerfa>=2.0.1.3' s3fs ``` ```{code-cell} @@ -74,27 +84,28 @@ If you run into an error that starts with, make sure you have restarted the kernel since doing `pip install`. Then re-run the cell. ``` ++++ + ## 1. Setup ```{code-cell} -# Need UPath for the testing bucket. Otherwise hats will ignore the credentials that Fornax -# provides under the hood. Will be unnecessary after the dataset is released in a public bucket. -from upath import UPath - # AWS S3 path where this dataset is stored. s3_bucket = "irsa-fornax-testdata" s3_key = "EUCLID/q1/mer_catalogue/hats" -euclid_s3_path = UPath(f"s3://{s3_bucket}/{s3_key}") - -# Note: If running from IPAC, you need an anonymous connection. Uncomment the next line. -# euclid_s3_path = UPath(f"s3://{s3_bucket}/{s3_key}", anon=True) -``` - -We will use [`hats`](https://hats.readthedocs.io/) to visualize the catalog and access the schema. - -```{code-cell} -# Load the parquet dataset using hats. -euclid_hats = hats.read_hats(euclid_s3_path) +euclid_s3_path = f"s3://{s3_bucket}/{s3_key}" + +# Temporary try/except to handle credentials in different environments before public release. +try: + # If running from within IPAC's network (maybe VPN'd in with "tunnel-all"), + # your IP address acts as your credentials and this should just work. 
+ hats.read_hats(euclid_s3_path) +except FileNotFoundError: + # If running from Fornax, credentials are provided automatically under the hood, but + # hats ignores them in the call above and raises a FileNotFoundError. + # Construct a UPath which will pick up the credentials. + from upath import UPath + + euclid_s3_path = UPath(f"s3://{s3_bucket}/{s3_key}") ``` ## 2. Visualize the on-sky density of Q1 Objects and HATS partitions @@ -105,20 +116,17 @@ Euclid Q1 covers four non-contiguous fields: Euclid Deep Field North (22.9 sq de We can visualize the Object density in the four fields using `hats`. ```{code-cell} +# Load the dataset. +euclid_hats = hats.read_hats(euclid_s3_path) + # Visualize the on-sky distribution of objects in the Q1 MER Catalog. hats.inspection.plot_density(euclid_hats) ``` -HATS does this by adjusting the partitioning order (i.e., HEALPix order at which data is partitioned) -according to the on-sky density of the objects or sources (rows) in the dataset. -In other words, dense regions are partitioned at a -higher HEALPix order (smaller pixel size) to reduce the number of objects in those partitions towards the mean; -vice versa for sparse regions. - -We can see this by plotting the partitioning orders. +We can see how the on-sky density maps to the HATS partitions by calling `plot_pixels`. ```{code-cell} -# Visualize the HEALPix order of each partition. +# Visualize the HEALPix orders of the dataset partitions. hats.inspection.plot_pixels(euclid_hats) ``` @@ -128,12 +136,10 @@ hats.inspection.plot_pixels(euclid_hats) In this section, we query the Euclid Q1 MER catalogs for likely stars and create a color-magnitude diagram (CMD), following [Introduction to Euclid Q1 MER catalog](https://caltech-ipac.github.io/irsa-tutorials/tutorials/euclid_access/2_Euclid_intro_MER_catalog.html). -Here, we'll use [`lsdb`](https://docs.lsdb.io/) to query the parquet files that are sitting in an S3 bucket (the intro notebook uses `pyvo` to query the TAP service). +Here, we use `lsdb` to query the parquet files that are sitting in an S3 bucket (the intro notebook uses `pyvo` to query the TAP service). `lsdb` enables efficient, large-scale queries on HATS catalogs, so let's look at *all* likely stars in Euclid Q1 instead of limiting to 10,000. -+++ - -`lsdb` uses Dask for parallelization. Set up the workers. +`lsdb` uses Dask for parallelization. So first, set up the workers. ```{code-cell} client = dask.distributed.Client( @@ -144,7 +150,7 @@ client = dask.distributed.Client( The data will be lazy-loaded. This means that commands like `query` are not executed until the data is actually required. ```{code-cell} -# Load the parquet dataset using lsdb. +# Load the dataset. columns = [ "TILEID", "FLUX_VIS_PSF", @@ -209,7 +215,9 @@ notebook shows how to work with parquet schemas. ```{code-cell} # Fetch the pyarrow schema from hats. +euclid_hats = hats.read_hats(euclid_s3_path) schema = euclid_hats.schema + print(f"{len(schema)} columns in the combined Euclid Q1 MER Catalogs") ``` @@ -254,6 +262,6 @@ print(schema.field("RIGHT_ASCENSION-CUTOUTS").metadata) **Authors:** Troy Raen (Developer; Caltech/IPAC-IRSA) and the IRSA Data Science Team. -**Updated:** 2025-03-25 +**Updated:** 2025-03-29 **Contact:** [IRSA Helpdesk](https://irsa.ipac.caltech.edu/docs/help_desk.html) with questions or problems. 
From 13251509edd5c4cb7aae172770c3eabe012a10d9 Mon Sep 17 00:00:00 2001 From: Troy Raen Date: Mon, 5 May 2025 22:26:38 -0700 Subject: [PATCH 16/17] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jaladh Singhal Co-authored-by: Brigitta Sipőcz --- .../parquet-catalog-demos/euclid-hats-parquet.md | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tutorials/parquet-catalog-demos/euclid-hats-parquet.md b/tutorials/parquet-catalog-demos/euclid-hats-parquet.md index 4b326d01..0e3a6496 100644 --- a/tutorials/parquet-catalog-demos/euclid-hats-parquet.md +++ b/tutorials/parquet-catalog-demos/euclid-hats-parquet.md @@ -53,7 +53,7 @@ especially for large-scale analyses and/or parallel processing. It does this by adapting the HEALPix order at which data is partitioned in a given catalog based on the on-sky density of the rows it contains. In other words, data from dense regions of sky will be partitioned at a higher order -(i.e., higher resolution; smaller pixel size) than data in sparse regions. +(i.e., higher resolution; more pixels/tiles with smaller area) than data in sparse regions. HATS-aware python packages are being developed to take full advantage of the partitioning. In this notebook, we will use the [hats](https://hats.readthedocs.io/) library to visualize the catalog and access the schema, and [lsdb](https://docs.lsdb.io/) to do a query for all likely stars. @@ -62,6 +62,10 @@ catalog and access the schema, and [lsdb](https://docs.lsdb.io/) to do a query f ## Installs and imports +```{important} +We rely on ``hast``, ``lsdb``, ``numpy``, and ``pyerfa`` features that have been recently added, so please make sure you have the respective versions v0.5, v0.5, v2.0, and v2.0.1.3 or newer installed. +``` + ```{code-cell} # # Uncomment the next line to install dependencies if needed. # !pip install 'hats>=0.5' 'lsdb>=0.5' matplotlib 'numpy>=2.0' 'pyerfa>=2.0.1.3' s3fs @@ -130,7 +134,7 @@ We can see how the on-sky density maps to the HATS partitions by calling `plot_p hats.inspection.plot_pixels(euclid_hats) ``` -## 3. CMD of stars in Euclid Q1 +## 3. CMD of ALL stars in Euclid Q1 +++ @@ -164,6 +168,7 @@ euclid_lsdb = lsdb.read_hats(euclid_s3_path, columns=columns) # Set up the query for likely stars. star_cuts = "FLUX_VIS_PSF > 0 & FLUX_Y_TEMPLFIT > 0 & FLUX_J_TEMPLFIT > 0 & FLUX_H_TEMPLFIT > 0 & POINT_LIKE_FLAG == 1" euclid_stars = euclid_lsdb.query(star_cuts) +euclid_stars ``` ```{code-cell} @@ -202,7 +207,7 @@ plt.show() client.close() ``` -## 4. Schema +## 4. Inspecting MER Catalog's Parquet Schema +++ From 8b71974d0e3ec866cdd3f0221b224cd23d11faf0 Mon Sep 17 00:00:00 2001 From: Troy Raen Date: Mon, 5 May 2025 23:16:02 -0700 Subject: [PATCH 17/17] Apply suggestions from @jaladh-singhal code review --- .../parquet-catalog-demos/euclid-hats-parquet.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tutorials/parquet-catalog-demos/euclid-hats-parquet.md b/tutorials/parquet-catalog-demos/euclid-hats-parquet.md index 0e3a6496..da419417 100644 --- a/tutorials/parquet-catalog-demos/euclid-hats-parquet.md +++ b/tutorials/parquet-catalog-demos/euclid-hats-parquet.md @@ -46,8 +46,9 @@ Parquet is a file format that enables flexible and efficient data access by, amo supporting the application of both column and row filters when reading the data (very similar to a SQL query) so that only the desired data is loaded into memory. 
-HATS is a spatial partitioning scheme based on HEALPix that aims to -produce partitions (files) of roughly equal size. +[HATS](https://hats.readthedocs.io/) is a spatial partitioning scheme based on +[HEALPix](https://healpix.jpl.nasa.gov/) +that aims to produce partitions (files) of roughly equal size. This makes the files more efficient to work with, especially for large-scale analyses and/or parallel processing. It does this by adapting the HEALPix order at which data is partitioned in a given catalog based @@ -143,9 +144,10 @@ In this section, we query the Euclid Q1 MER catalogs for likely stars and create Here, we use `lsdb` to query the parquet files that are sitting in an S3 bucket (the intro notebook uses `pyvo` to query the TAP service). `lsdb` enables efficient, large-scale queries on HATS catalogs, so let's look at *all* likely stars in Euclid Q1 instead of limiting to 10,000. -`lsdb` uses Dask for parallelization. So first, set up the workers. +`lsdb` uses Dask for parallelization. Set up the client and workers. ```{code-cell} +# This client will be used *implicitly* by all subsequent calls that require it. client = dask.distributed.Client( n_workers=os.cpu_count(), threads_per_worker=2, memory_limit="auto" ) @@ -172,7 +174,7 @@ euclid_stars ``` ```{code-cell} -# Peek at the data. +# Peek at the data. This must execute the query to load at least some data, so may take some time. euclid_stars.head(10) ``` @@ -267,6 +269,6 @@ print(schema.field("RIGHT_ASCENSION-CUTOUTS").metadata) **Authors:** Troy Raen (Developer; Caltech/IPAC-IRSA) and the IRSA Data Science Team. -**Updated:** 2025-03-29 +**Updated:** 2025-05-05 **Contact:** [IRSA Helpdesk](https://irsa.ipac.caltech.edu/docs/help_desk.html) with questions or problems.
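
For reference, the column- and row-filtered Parquet read described in the notebook's introduction (reading only the desired columns and rows into memory, much like a SQL query) looks roughly like the sketch below using plain `pyarrow`. This is only an illustration, not part of the patches above: the local path is a placeholder, the filter values are example cuts, and only the column names (`FLUX_VIS_PSF`, `POINT_LIKE_FLAG`) come from the catalog schema discussed in the notebook. In the notebook itself, `hats` and `lsdb` issue this kind of selective read on the reader's behalf.

```python
import pyarrow.dataset as ds

# Hypothetical sketch: point at a local copy of the Parquet dataset (placeholder path)
# and read only two columns, keeping just the rows that pass the filter expression,
# so that only the matching data is loaded into memory.
dataset = ds.dataset("path/to/euclid_q1_mer_hats", format="parquet")
table = dataset.to_table(
    columns=["FLUX_VIS_PSF", "POINT_LIKE_FLAG"],
    filter=(ds.field("POINT_LIKE_FLAG") == 1) & (ds.field("FLUX_VIS_PSF") > 0),
)
print(table.num_rows)
```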