From 17f99e9159ee4fb26be16a3cabccce7f9a7c84c0 Mon Sep 17 00:00:00 2001 From: thommodin Date: Fri, 25 Jul 2025 15:26:02 +1000 Subject: [PATCH 1/4] added nrmn_m13_survey json --- .../config/dataset/nrmn_m13_survey.json | 218 ++++++++++++++++++ 1 file changed, 218 insertions(+) create mode 100644 aodn_cloud_optimised/config/dataset/nrmn_m13_survey.json diff --git a/aodn_cloud_optimised/config/dataset/nrmn_m13_survey.json b/aodn_cloud_optimised/config/dataset/nrmn_m13_survey.json new file mode 100644 index 00000000..15d85e66 --- /dev/null +++ b/aodn_cloud_optimised/config/dataset/nrmn_m13_survey.json @@ -0,0 +1,218 @@ +{ + "dataset_name": "nrmn_m13_survey_qc", + "logger_name": "nrmn_m13_survey_qc", + "cloud_optimised_format": "parquet", + "metadata_uuid": "", + "pandas_read_csv_config": { + "delimiter": ",", + "header": 0, + "index_col": null, + "parse_dates": false, + "na_values": null, + "encoding": "utf-8" + }, + "schema": { + "survey_id": { + "type": "int64", + "nullable": false, + "long_name": "Survey Identifier", + "standard_name": "survey_id" + }, + "dataset_id": { + "type": "string", + "nullable": false, + "long_name": "Dataset Identifier", + "standard_name": "dataset_id" + }, + "country": { + "type": "string", + "nullable": true, + "long_name": "Country of the survey", + "standard_name": "country" + }, + "area": { + "type": "string", + "nullable": true, + "long_name": "Survey Area Description", + "standard_name": "area" + }, + "location": { + "type": "string", + "nullable": true, + "long_name": "Location Description", + "standard_name": "location" + }, + "site_code": { + "type": "string", + "nullable": true, + "long_name": "Site Code Identifier", + "standard_name": "site_code" + }, + "site_name": { + "type": "string", + "nullable": true, + "long_name": "Site Name Description", + "standard_name": "site_name" + }, + "lat": { + "type": "float64", + "nullable": false, + "long_name": "Latitude of the survey site", + "standard_name": "latitude", + "units": "degrees" 
+ }, + "lon": { + "type": "float64", + "nullable": false, + "long_name": "Longitude of the survey site", + "standard_name": "longitude", + "units": "degrees" + }, + "timestamp": { + "type": "string", + "nullable": false, + "long_name": "Survey Date", + "standard_name": "date" + }, + "depth": { + "type": "float64", + "nullable": false, + "long_name": "Depth of survey location", + "standard_name": "depth", + "units": "meters" + }, + "program": { + "type": "string", + "nullable": false, + "long_name": "Program Name", + "standard_name": "program" + }, + "visibility": { + "type": "float64", + "nullable": true, + "long_name": "Visibility during survey", + "standard_name": "visibility", + "units": "meters" + }, + "original_label_scheme": { + "type": "string", + "nullable": false, + "long_name": "Original Labeling Scheme", + "standard_name": "original_label_scheme" + }, + "original_category": { + "type": "string", + "nullable": false, + "long_name": "Original Category Label", + "standard_name": "original_category" + }, + "rls_category": { + "type": "string", + "nullable": false, + "long_name": "RLS Category", + "standard_name": "rls_category" + }, + "catami_category": { + "type": "string", + "nullable": false, + "long_name": "CATAMI Category", + "standard_name": "catami_category" + }, + "catami_lineage": { + "type": "string", + "nullable": false, + "long_name": "CATAMI Lineage", + "standard_name": "catami_lineage" + }, + "dead": { + "type": "bool", + "nullable": false, + "long_name": "Is the organism dead", + "standard_name": "dead" + }, + "bleached": { + "type": "bool", + "nullable": false, + "long_name": "Is the organism bleached", + "standard_name": "bleached" + }, + "count": { + "type": "float64", + "nullable": true, + "long_name": "Count of organisms", + "standard_name": "count" + }, + "label_count": { + "type": "float64", + "nullable": true, + "long_name": "Label Count", + "standard_name": "label_count" + }, + "coverage": { + "type": "float64", + "nullable": false, + 
"long_name": "Coverage Percentage", + "standard_name": "coverage", + "units": "%" + }, + "source": { + "type": "string", + "nullable": false, + "long_name": "Data Source", + "standard_name": "source" + }, + "rls_lineage": { + "type": "string", + "nullable": false, + "long_name": "RLS Lineage", + "standard_name": "rls_lineage" + }, + "deployment_id": { + "type": "float64", + "nullable": true, + "long_name": "Deployment Identifier", + "standard_name": "deployment_id" + }, + "campaign_id": { + "type": "float64", + "nullable": true, + "long_name": "Campaign Identifier", + "standard_name": "campaign_id" + }, + "annotation_method": { + "type": "string", + "nullable": true, + "long_name": "Annotation Method", + "standard_name": "annotation_method" + }, + "sq_annotation_set_id": { + "type": "float64", + "nullable": true, + "long_name": "SQ Annotation Set Identifier", + "standard_name": "sq_annotation_set_id" + }, + "sq_annotation_set_owner": { + "type": "string", + "nullable": true, + "long_name": "SQ Annotation Set Owner", + "standard_name": "sq_annotation_set_owner" + } + }, + "aws_opendata_registry": null, + "run_settings": { + "batch_size": 1, + "cluster": { + "mode": "coiled", + "restart_every_path": false + }, + "paths": [ + { + "s3_uri": "s3://aodn-dataflow-dev/thomas.galindo/processing/stored/transformed_surveys.csv", + "filter": [] + } + ], + "clear_existing_data": true, + "raise_error": true, + "force_previous_parquet_deletion": true + } +} \ No newline at end of file From 7fdcdeda90da081abcefa7f42f71ae361f18fc2d Mon Sep 17 00:00:00 2001 From: "lbesnard (Loz)" Date: Mon, 28 Jul 2025 13:10:00 +1000 Subject: [PATCH 2/4] Update common.json --- aodn_cloud_optimised/config/common.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aodn_cloud_optimised/config/common.json b/aodn_cloud_optimised/config/common.json index 77286380..8c3990a1 100644 --- a/aodn_cloud_optimised/config/common.json +++ b/aodn_cloud_optimised/config/common.json @@ -1,6 +1,6 @@ 
{ "BUCKET_RAW_DEFAULT": "imos-data", - "BUCKET_OPTIMISED_DEFAULT": "aodn-cloud-optimised", + "BUCKET_OPTIMISED_DEFAULT": "imos-data-lab-optimised", "ROOT_PREFIX_CLOUD_OPTIMISED_PATH": "", "BUCKET_INTEGRATION_TESTING_RAW_DEFAULT": "imos-data", "BUCKET_INTEGRATION_TESTING_OPTIMISED_DEFAULT": "imos-data-lab-optimised", From 6ce75df4890338cb650656287fb9ba7586b2a0a3 Mon Sep 17 00:00:00 2001 From: Tom Galindo <98626996+thommodin@users.noreply.github.com> Date: Wed, 30 Jul 2025 15:21:19 +1000 Subject: [PATCH 3/4] Update nrmn_m13_survey.json Running into pre-commit issues locally, manual committing using GitHub gui --- .../config/dataset/nrmn_m13_survey.json | 315 ++++++++++-------- 1 file changed, 173 insertions(+), 142 deletions(-) diff --git a/aodn_cloud_optimised/config/dataset/nrmn_m13_survey.json b/aodn_cloud_optimised/config/dataset/nrmn_m13_survey.json index 15d85e66..2b24072b 100644 --- a/aodn_cloud_optimised/config/dataset/nrmn_m13_survey.json +++ b/aodn_cloud_optimised/config/dataset/nrmn_m13_survey.json @@ -2,217 +2,248 @@ "dataset_name": "nrmn_m13_survey_qc", "logger_name": "nrmn_m13_survey_qc", "cloud_optimised_format": "parquet", + "gattrs_to_variables": [], + "partition_keys": [ + "timestamp", + "polygon" + ], + "time_extent": { + "time": "date", + "partition_timestamp_period": "Y" + }, + "spatial_extent": { + "lat": "lat", + "lon": "lon", + "spatial_resolution": 5 + }, "metadata_uuid": "", "pandas_read_csv_config": { "delimiter": ",", "header": 0, "index_col": null, - "parse_dates": false, "na_values": null, - "encoding": "utf-8" + "encoding": "utf-8", + "parse_dates": [ + "date" + ], + "date_format": "ISO8601", + "low_memory": false }, "schema": { + "index": { + "type": "int64", + "nullable": false + }, "survey_id": { - "type": "int64", - "nullable": false, - "long_name": "Survey Identifier", - "standard_name": "survey_id" + "name": "survey_id", + "type": "int64", + "nullable": false, + "long_name": "Survey Identifier" }, "dataset_id": { - "type": 
"string", - "nullable": false, - "long_name": "Dataset Identifier", - "standard_name": "dataset_id" + "type": "string", + "nullable": false, + "long_name": "Dataset Identifier" }, "country": { - "type": "string", - "nullable": true, - "long_name": "Country of the survey", - "standard_name": "country" + "type": "string", + "nullable": true, + "long_name": "Country of the survey" }, "area": { - "type": "string", - "nullable": true, - "long_name": "Survey Area Description", - "standard_name": "area" + "type": "string", + "nullable": true, + "long_name": "Area of the survey" }, "location": { - "type": "string", - "nullable": true, - "long_name": "Location Description", - "standard_name": "location" + "type": "string", + "nullable": true, + "long_name": "Location of the survey" }, "site_code": { - "type": "string", - "nullable": true, - "long_name": "Site Code Identifier", - "standard_name": "site_code" + "type": "string", + "nullable": true, + "long_name": "NRMN Site Code identifier" }, "site_name": { - "type": "string", - "nullable": true, - "long_name": "Site Name Description", - "standard_name": "site_name" + "type": "string", + "nullable": true, + "long_name": "NRMN Site Name identifier" + }, + "date": { + "type": "string", + "nullable": false, + "long_name": "Survey Date" }, "lat": { - "type": "float64", - "nullable": false, - "long_name": "Latitude of the survey site", - "standard_name": "latitude", - "units": "degrees" + "type": "float", + "nullable": false, + "long_name": "Latitude of the survey site", + "standard_name": "latitude", + "units": "degree_north" }, "lon": { - "type": "float64", - "nullable": false, - "long_name": "Longitude of the survey site", - "standard_name": "longitude", - "units": "degrees" - }, - "timestamp": { - "type": "string", - "nullable": false, - "long_name": "Survey Date", - "standard_name": "date" + "type": "float", + "nullable": false, + "long_name": "Longitude of the survey site", + "standard_name": "longitude", + "units":
"degree_east" }, "depth": { - "type": "float64", - "nullable": false, - "long_name": "Depth of survey location", - "standard_name": "depth", - "units": "meters" + "type": "float", + "nullable": false, + "description": "Depth below sea level", + "standard_name": "sea_floor_depth_below_sea_level", + "units": "m" }, "program": { - "type": "string", - "nullable": false, - "long_name": "Program Name", - "standard_name": "program" + "type": "string", + "nullable": false, + "long_name": "Program Name" }, "visibility": { - "type": "float64", - "nullable": true, - "long_name": "Visibility during survey", - "standard_name": "visibility", - "units": "meters" + "type": "float", + "nullable": true, + "long_name": "Visibility during survey", + "units": "meters" }, "original_label_scheme": { - "type": "string", - "nullable": false, - "long_name": "Original Labeling Scheme", - "standard_name": "original_label_scheme" - }, - "original_category": { - "type": "string", - "nullable": false, - "long_name": "Original Category Label", - "standard_name": "original_category" + "type": "string", + "nullable": false, + "description": "The original labeling scheme before translation to RLS and CATAMI", + "long_name": "Original Labeling Scheme" }, "rls_category": { - "type": "string", - "nullable": false, - "long_name": "RLS Category", - "standard_name": "rls_category" + "type": "string", + "nullable": false, + "long_name": "RLS Category" + }, + "rls_lineage": { + "type": "string", + "nullable": false, + "long_name": "RLS Lineage", + "standard_name": "rls_lineage" }, "catami_category": { - "type": "string", - "nullable": false, - "long_name": "CATAMI Category", - "standard_name": "catami_category" + "type": "string", + "nullable": false, + "long_name": "CATAMI Category" }, "catami_lineage": { - "type": "string", - "nullable": false, - "long_name": "CATAMI Lineage", - "standard_name": "catami_lineage" + "type": "string", + "nullable": false, + "long_name": "CATAMI Lineage" }, "dead": { -
"type": "bool", - "nullable": false, - "long_name": "Is the organism dead", - "standard_name": "dead" + "type": "bool", + "nullable": false, + "long_name": "Is the organism dead" }, "bleached": { - "type": "bool", - "nullable": false, - "long_name": "Is the organism bleached", - "standard_name": "bleached" - }, - "count": { - "type": "float64", - "nullable": true, - "long_name": "Count of organisms", - "standard_name": "count" - }, - "label_count": { - "type": "float64", - "nullable": true, - "long_name": "Label Count", - "standard_name": "label_count" + "type": "bool", + "nullable": false, + "long_name": "Is the organism bleached" + }, + "category_annotation_count": { + "type": "int64", + "nullable": true, + "long_name": "Count of organisms", + "standard_name": "count" + }, + "survey_annotation_count": { + "type": "int64", + "nullable": true, + "long_name": "Label Count" }, "coverage": { - "type": "float64", - "nullable": false, - "long_name": "Coverage Percentage", - "standard_name": "coverage", - "units": "%" + "type": "float", + "nullable": false, + "long_name": "Coverage Percentage", + "units": "%" }, "source": { - "type": "string", - "nullable": false, - "long_name": "Data Source", - "standard_name": "source" - }, - "rls_lineage": { - "type": "string", - "nullable": false, - "long_name": "RLS Lineage", - "standard_name": "rls_lineage" + "type": "string", + "nullable": false, + "description": "The data source of the row. 
Can be SQ+ or NRMN", + "long_name": "Data Source" }, "deployment_id": { - "type": "float64", - "nullable": true, - "long_name": "Deployment Identifier", - "standard_name": "deployment_id" + "type": "float", + "nullable": true, + "long_name": "Deployment Identifier" }, "campaign_id": { - "type": "float64", - "nullable": true, - "long_name": "Campaign Identifier", - "standard_name": "campaign_id" + "type": "float", + "nullable": true, + "long_name": "Campaign Identifier" }, "annotation_method": { - "type": "string", - "nullable": true, - "long_name": "Annotation Method", - "standard_name": "annotation_method" + "type": "string", + "nullable": true, + "description": "The annotation method applied to the survey", + "long_name": "Annotation Method" }, "sq_annotation_set_id": { - "type": "float64", - "nullable": true, - "long_name": "SQ Annotation Set Identifier", - "standard_name": "sq_annotation_set_id" + "type": "float", + "nullable": true, + "long_name": "SQ Annotation Set Identifier" }, "sq_annotation_set_owner": { - "type": "string", - "nullable": true, - "long_name": "SQ Annotation Set Owner", - "standard_name": "sq_annotation_set_owner" + "type": "string", + "nullable": true, + "long_name": "SQ Annotation Set Owner" + }, + "filename": { + "type": "string" + }, + "timestamp": { + "type": "int64" + }, + "polygon": { + "type": "string" } }, - "aws_opendata_registry": null, + "aws_opendata_registry": { + "Name": "test", + "Description": "test", + "Documentation": "test", + "Contact": "info@aodn.org.au", + "ManagedBy": "AODN", + "UpdateFrequency": "never", + "Tags": [ + "coral", + "macroalgae" + ], + "License": "http://creativecommons.org/licenses/by/4.0/", + "Resources": [ + { + "Description": "test", + "ARN": "test", + "Region": "test", + "Type": "S3 Bucket" + } + ], + "DataAtWork": { + "Tutorials": [] + }, + "Citation": "test" + }, "run_settings": { "batch_size": 1, "cluster": { - "mode": "coiled", + "mode": "local", "restart_every_path": false }, "paths": [ { - 
"s3_uri": "s3://aodn-dataflow-dev/thomas.galindo/processing/stored/transformed_surveys.csv", - "filter": [] + "s3_uri": "s3://aodn-dataflow-dev/thomas.galindo/processing/stored/", + "filter": [ + "transformed_surveys.csv" + ] } ], "clear_existing_data": true, "raise_error": true, "force_previous_parquet_deletion": true } -} \ No newline at end of file +} From 39fdef4f549cb0c18c48f5aa3809b24f71232f43 Mon Sep 17 00:00:00 2001 From: Tom Galindo <98626996+thommodin@users.noreply.github.com> Date: Wed, 30 Jul 2025 15:22:41 +1000 Subject: [PATCH 4/4] Update common.json update back to regular default --- aodn_cloud_optimised/config/common.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aodn_cloud_optimised/config/common.json b/aodn_cloud_optimised/config/common.json index 8c3990a1..77286380 100644 --- a/aodn_cloud_optimised/config/common.json +++ b/aodn_cloud_optimised/config/common.json @@ -1,6 +1,6 @@ { "BUCKET_RAW_DEFAULT": "imos-data", - "BUCKET_OPTIMISED_DEFAULT": "imos-data-lab-optimised", + "BUCKET_OPTIMISED_DEFAULT": "aodn-cloud-optimised", "ROOT_PREFIX_CLOUD_OPTIMISED_PATH": "", "BUCKET_INTEGRATION_TESTING_RAW_DEFAULT": "imos-data", "BUCKET_INTEGRATION_TESTING_OPTIMISED_DEFAULT": "imos-data-lab-optimised",