From ccae8ff9b682dfcef5faf56882c540cf0ae3cc2c Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Tue, 30 Sep 2025 18:33:34 -0700 Subject: [PATCH 01/41] Implement st_haszm using WKBBytesExecutor instead (missing one last edge case) --- .../tests/functions/test_functions.py | 12 ++ rust/sedona-functions/src/st_haszm.rs | 114 ++++++++++++++---- 2 files changed, 103 insertions(+), 23 deletions(-) diff --git a/python/sedonadb/tests/functions/test_functions.py b/python/sedonadb/tests/functions/test_functions.py index 8811f703..015526ae 100644 --- a/python/sedonadb/tests/functions/test_functions.py +++ b/python/sedonadb/tests/functions/test_functions.py @@ -473,6 +473,8 @@ def test_st_geomfromwkb(eng, geom): ("POINT Z EMPTY", True), ("POINT M EMPTY", False), ("POINT ZM EMPTY", True), + # SedonaDB can't parse this yet: https://github.com/apache/sedona-db/issues/162 + # ("POINT (0 0 0)", True), ("POINT Z (0 0 0)", True), ("POINT M (0 0 0)", False), ("POINT ZM (0 0 0 0)", True), @@ -481,8 +483,18 @@ def test_st_geomfromwkb(eng, geom): ("LINESTRING Z (0 0 0, 1 1 1)", True), ("POLYGON EMPTY", False), ("MULTIPOINT ((0 0), (1 1))", False), + ("MULTIPOINT Z ((0 0 0))", True), + # SedonaDB can't parse this yet: https://github.com/apache/sedona-db/issues/162 + # ("MULTIPOINT ((0 0 0))", True), + ("MULTIPOINT ZM ((0 0 0 0))", True), ("GEOMETRYCOLLECTION EMPTY", False), + # Z-dim specified only in the nested geometry ("GEOMETRYCOLLECTION (POINT Z (0 0 0))", True), + # SedonaDB can't parse this yet: https://github.com/apache/sedona-db/issues/162 + # Z-dim specified only on the geom collection level but not the nested geometry level + # ("GEOMETRYCOLLECTION Z (POINT (0 0 0))", True), + # Z-dim specified on both levels + ("GEOMETRYCOLLECTION Z (POINT Z (0 0 0))", True), ("GEOMETRYCOLLECTION (GEOMETRYCOLLECTION (POINT Z (0 0 0)))", True), ], ) diff --git a/rust/sedona-functions/src/st_haszm.rs b/rust/sedona-functions/src/st_haszm.rs index eeae58f5..1e3111f7 100644 --- a/rust/sedona-functions/src/st_haszm.rs +++ b/rust/sedona-functions/src/st_haszm.rs @@ -16,7 +16,7 @@ // under the License. use std::sync::Arc; -use crate::executor::WkbExecutor; +use crate::executor::WkbBytesExecutor; use arrow_array::builder::BooleanBuilder; use arrow_schema::DataType; use datafusion_common::error::Result; @@ -90,13 +90,13 @@ impl SedonaScalarKernel for STHasZm { _ => unreachable!(), }; - let executor = WkbExecutor::new(arg_types, args); + let executor = WkbBytesExecutor::new(arg_types, args); let mut builder = BooleanBuilder::with_capacity(executor.num_iterations()); executor.execute_wkb_void(|maybe_item| { match maybe_item { Some(item) => { - builder.append_option(invoke_scalar(&item, dim_index)?); + builder.append_option(infer_haszm(&item, dim_index)?); } None => builder.append_null(), } @@ -107,28 +107,59 @@ impl SedonaScalarKernel for STHasZm { } } -fn invoke_scalar(item: &Wkb, dim_index: usize) -> Result> { - match item.as_type() { - geo_traits::GeometryType::GeometryCollection(collection) => { - use geo_traits::GeometryCollectionTrait; - if collection.num_geometries() == 0 { - Ok(Some(false)) - } else { - // PostGIS doesn't allow creating a GeometryCollection with geometries of different dimensions - // so we can just check the dimension of the first one - let first_geom = unsafe { collection.geometry_unchecked(0) }; - invoke_scalar(first_geom, dim_index) - } - } - _ => { - let geom_dim = item.dim(); - match dim_index { - 2 => Ok(Some(matches!(geom_dim, Dimensions::Xyz | Dimensions::Xyzm))), - 3 => Ok(Some(matches!(geom_dim, Dimensions::Xym | Dimensions::Xyzm))), - _ => sedona_internal_err!("unexpected dim_index"), - } +/// Fast-path inference of geometry type name from raw WKB bytes +/// An error will be thrown for invalid WKB bytes input +/// +/// Spec: https://libgeos.org/specifications/wkb/ +fn infer_haszm(buf: &[u8], dim_index: usize) -> Result> { + if buf.len() < 5 { + return sedona_internal_err!("Invalid WKB: buffer too small ({} bytes)", buf.len()); + } + + let byte_order = buf[0]; + let code = match byte_order { + 0 => u32::from_be_bytes([buf[1], buf[2], buf[3], buf[4]]), + 1 => u32::from_le_bytes([buf[1], buf[2], buf[3], buf[4]]), + other => return sedona_internal_err!("Unexpected byte order: {other}"), + }; + + // 0000 -> xy or unspecified + // 1000 -> xyz + // 2000 -> xym + // 3000 -> xyzm + match code / 1000 { + // If xy, it's possible we need to infer the dimension + 0 => {}, + 1 => return Ok(Some(dim_index == 2)), + 2 => return Ok(Some(dim_index == 3)), + 3 => return Ok(Some(true)), + _ => return sedona_internal_err!("Unexpected code: {code}"), + }; + + // If GeometryCollection (7), we need to check the dimension of the next geometry + if code & 0x7 == 7 { + // The next 4 bytes are the number of geometries in the collection + let num_geometries = match byte_order { + 0 => u32::from_be_bytes([buf[5], buf[6], buf[7], buf[8]]), + 1 => u32::from_le_bytes([buf[5], buf[6], buf[7], buf[8]]), + other => return sedona_internal_err!("Unexpected byte order: {other}"), + }; + // Check the dimension of the first geometry since they all have to be the same dimension + // Note: Attempting to create the following geometries error and are thus not possible to create: + // - Nested geometry dimension doesn't match the **specified** geom collection z-dimension + // - GEOMETRYCOLLECTION M (POINT Z (1 1 1)) + // - Nested geometry doesn't have the specified dimension + // - GEOMETRYCOLLECTION Z (POINT (1 1)) + // - Nested geometries have different dimensions + // - GEOMETRYCOLLECTION (POINT Z (1 1 1), POINT (1 1)) + if num_geometries >= 1 { + return infer_haszm(&buf[9..], dim_index); } + // If empty geometry (num_geometries == 0), fallback to below logic to check the geom collection's dimension + // GEOMETRY COLLECTION Z EMPTY hasz -> true } + // If code was unspecified / xy and we couldn't infer the dimension, it must be xy + Ok(Some(false)) } #[cfg(test)] @@ -172,6 +203,16 @@ mod tests { let result = m_tester.invoke_scalar("POINT (1 2)").unwrap(); m_tester.assert_scalar_result_equals(result, false); + // SedonaDB can't parse this yet: https://github.com/apache/sedona-db/issues/162 + // Infer z-dimension + // let result = z_tester.invoke_scalar("POINT (1 2 3)").unwrap(); + // z_tester.assert_scalar_result_equals(result, true); + + // SedonaDB can't parse this yet: https://github.com/apache/sedona-db/issues/162 + // Infer z-dimension + // let result = m_tester.invoke_scalar("POINT (1 2 3)").unwrap(); + // m_tester.assert_scalar_result_equals(result, false); + let result = z_tester.invoke_scalar("POINT M (1 2 3)").unwrap(); z_tester.assert_scalar_result_equals(result, false); @@ -184,11 +225,27 @@ mod tests { let result = m_tester.invoke_wkb_scalar(None).unwrap(); m_tester.assert_scalar_result_equals(result, ScalarValue::Null); + // Z-dimension specified only in the nested geometry, but not the geom collection level let result = z_tester .invoke_wkb_scalar(Some("GEOMETRYCOLLECTION (POINT Z (1 2 3))")) .unwrap(); z_tester.assert_scalar_result_equals(result, ScalarValue::Boolean(Some(true))); + // Z-dimension specified on both the geom collection and nested geometry level + // Geometry collection with Z dimension both on the geom collection and nested geometry level + let result = z_tester + .invoke_wkb_scalar(Some("GEOMETRYCOLLECTION Z (POINT Z (1 2 3))")) + .unwrap(); + z_tester.assert_scalar_result_equals(result, ScalarValue::Boolean(Some(true))); + + // SedonaDB can't parse this yet: https://github.com/apache/sedona-db/issues/162 + // Z-dimension specified only on the geom collection level but not the nested geometry level + // Geometry collection with Z dimension both on the geom collection and nested geometry level + // let result = z_tester + // .invoke_wkb_scalar(Some("GEOMETRYCOLLECTION Z (POINT (1 2 3))")) + // .unwrap(); + // z_tester.assert_scalar_result_equals(result, ScalarValue::Boolean(Some(true))); + let result = m_tester .invoke_wkb_scalar(Some("GEOMETRYCOLLECTION (POINT M (1 2 3))")) .unwrap(); @@ -203,5 +260,16 @@ mod tests { .invoke_wkb_scalar(Some("GEOMETRYCOLLECTION EMPTY")) .unwrap(); m_tester.assert_scalar_result_equals(result, ScalarValue::Boolean(Some(false))); + + // Empty geometry collections with Z or M dimensions + let result = z_tester + .invoke_wkb_scalar(Some("GEOMETRYCOLLECTION Z EMPTY")) + .unwrap(); + z_tester.assert_scalar_result_equals(result, ScalarValue::Boolean(Some(true))); + + let result = m_tester + .invoke_wkb_scalar(Some("GEOMETRYCOLLECTION M EMPTY")) + .unwrap(); + m_tester.assert_scalar_result_equals(result, ScalarValue::Boolean(Some(true))); } } From 07aa206ca321cd3195df99c0c687de1f96ed3a33 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Tue, 30 Sep 2025 18:34:30 -0700 Subject: [PATCH 02/41] Add note about handling last edge case --- rust/sedona-functions/src/st_haszm.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/rust/sedona-functions/src/st_haszm.rs b/rust/sedona-functions/src/st_haszm.rs index 1e3111f7..b8d1a924 100644 --- a/rust/sedona-functions/src/st_haszm.rs +++ b/rust/sedona-functions/src/st_haszm.rs @@ -158,6 +158,10 @@ fn infer_haszm(buf: &[u8], dim_index: usize) -> Result> { // If empty geometry (num_geometries == 0), fallback to below logic to check the geom collection's dimension // GEOMETRY COLLECTION Z EMPTY hasz -> true } + + // TODO: Last check: check how many dimensions the 1st coordinate has (all other coordinates must have the same) + // e.g handle this case: POINT (0 0 0) -> xyz dimension + // If code was unspecified / xy and we couldn't infer the dimension, it must be xy Ok(Some(false)) } From cf031fbcaf5f1633551972079ddf1f0ba8046213 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Tue, 30 Sep 2025 18:41:04 -0700 Subject: [PATCH 03/41] Minor fix to the comments --- rust/sedona-functions/src/st_haszm.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rust/sedona-functions/src/st_haszm.rs b/rust/sedona-functions/src/st_haszm.rs index b8d1a924..32c6bd7a 100644 --- a/rust/sedona-functions/src/st_haszm.rs +++ b/rust/sedona-functions/src/st_haszm.rs @@ -136,7 +136,7 @@ fn infer_haszm(buf: &[u8], dim_index: usize) -> Result> { _ => return sedona_internal_err!("Unexpected code: {code}"), }; - // If GeometryCollection (7), we need to check the dimension of the next geometry + // If GeometryCollection (7), we need to check the dimension of the first geometry if code & 0x7 == 7 { // The next 4 bytes are the number of geometries in the collection let num_geometries = match byte_order { @@ -160,9 +160,9 @@ fn infer_haszm(buf: &[u8], dim_index: usize) -> Result> { } // TODO: Last check: check how many dimensions the 1st coordinate has (all other coordinates must have the same) - // e.g handle this case: POINT (0 0 0) -> xyz dimension + // e.g handle this case: POINT (0 0 0) -> xyz dimension, POINT (0 0 0 0) -> xyzm dimension - // If code was unspecified / xy and we couldn't infer the dimension, it must be xy + // If code was unspecified and we couldn't infer the dimension, it must be xy Ok(Some(false)) } From 526fc3b40a4d198084a410eac255017a90ce167a Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Tue, 30 Sep 2025 18:53:01 -0700 Subject: [PATCH 04/41] Fix pre-commit --- rust/sedona-functions/src/st_haszm.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/sedona-functions/src/st_haszm.rs b/rust/sedona-functions/src/st_haszm.rs index 32c6bd7a..62a823be 100644 --- a/rust/sedona-functions/src/st_haszm.rs +++ b/rust/sedona-functions/src/st_haszm.rs @@ -129,7 +129,7 @@ fn infer_haszm(buf: &[u8], dim_index: usize) -> Result> { // 3000 -> xyzm match code / 1000 { // If xy, it's possible we need to infer the dimension - 0 => {}, + 0 => {} 1 => return Ok(Some(dim_index == 2)), 2 => return Ok(Some(dim_index == 3)), 3 => return Ok(Some(true)), From a491d91a6a7cbdc3446c8973504edc79929d6ff4 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Thu, 2 Oct 2025 08:55:51 -0700 Subject: [PATCH 05/41] Fix cargo clippy --- rust/sedona-functions/src/st_haszm.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/rust/sedona-functions/src/st_haszm.rs b/rust/sedona-functions/src/st_haszm.rs index 62a823be..42f346bb 100644 --- a/rust/sedona-functions/src/st_haszm.rs +++ b/rust/sedona-functions/src/st_haszm.rs @@ -23,11 +23,9 @@ use datafusion_common::error::Result; use datafusion_expr::{ scalar_doc_sections::DOC_SECTION_OTHER, ColumnarValue, Documentation, Volatility, }; -use geo_traits::{Dimensions, GeometryTrait}; use sedona_common::sedona_internal_err; use sedona_expr::scalar_udf::{SedonaScalarKernel, SedonaScalarUDF}; use sedona_schema::{datatypes::SedonaType, matchers::ArgMatcher}; -use wkb::reader::Wkb; pub fn st_hasz_udf() -> SedonaScalarUDF { SedonaScalarUDF::new( @@ -96,7 +94,7 @@ impl SedonaScalarKernel for STHasZm { executor.execute_wkb_void(|maybe_item| { match maybe_item { Some(item) => { - builder.append_option(infer_haszm(&item, dim_index)?); + builder.append_option(infer_haszm(item, dim_index)?); } None => builder.append_null(), } From cf90bcf07369a144616a1708f9cb88f5f7d468f9 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Thu, 2 Oct 2025 21:36:17 -0700 Subject: [PATCH 06/41] Save progress --- Cargo.lock | 2 ++ rust/sedona-functions/src/st_haszm.rs | 14 ++++++++++++++ rust/sedona-geometry/Cargo.toml | 2 ++ rust/sedona-geometry/src/lib.rs | 1 + 4 files changed, 19 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 4a0603cb..1c42ef72 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4914,10 +4914,12 @@ dependencies = [ name = "sedona-geometry" version = "0.2.0" dependencies = [ + "datafusion-common", "geo-traits 0.2.0", "geo-types", "lru", "rstest", + "sedona-common", "serde", "serde_json", "serde_with", diff --git a/rust/sedona-functions/src/st_haszm.rs b/rust/sedona-functions/src/st_haszm.rs index 42f346bb..b178c3cf 100644 --- a/rust/sedona-functions/src/st_haszm.rs +++ b/rust/sedona-functions/src/st_haszm.rs @@ -110,6 +110,20 @@ impl SedonaScalarKernel for STHasZm { /// /// Spec: https://libgeos.org/specifications/wkb/ fn infer_haszm(buf: &[u8], dim_index: usize) -> Result> { + + use sedona_geometry::wkb_header::WkbHeader; + use geo_traits::Dimensions; + let header = WkbHeader::new(buf); + let dimension = header.dimension(); + if matches!(dimension, Dimensions::Xyz | Dimensions::Xyzm) { + return Ok(Some(dim_index == 2)); + } + if matches!(dimension, Dimensions::Xym | Dimensions::Xyzm) { + return Ok(Some(dim_index == 3)); + } + return Ok(Some(false)); + + if buf.len() < 5 { return sedona_internal_err!("Invalid WKB: buffer too small ({} bytes)", buf.len()); } diff --git a/rust/sedona-geometry/Cargo.toml b/rust/sedona-geometry/Cargo.toml index 8f127589..c3019f76 100644 --- a/rust/sedona-geometry/Cargo.toml +++ b/rust/sedona-geometry/Cargo.toml @@ -34,8 +34,10 @@ serde_json = { workspace = true } wkt = { workspace = true } [dependencies] +datafusion-common = { workspace = true } geo-traits = { workspace = true } lru = { workspace = true } +sedona-common = { path = "../sedona-common" } serde = { workspace = true } serde_with = { workspace = true } thiserror = { workspace = true } diff --git a/rust/sedona-geometry/src/lib.rs b/rust/sedona-geometry/src/lib.rs index 65cc5936..f189ec7b 100644 --- a/rust/sedona-geometry/src/lib.rs +++ b/rust/sedona-geometry/src/lib.rs @@ -24,3 +24,4 @@ pub mod point_count; pub mod transform; pub mod types; pub mod wkb_factory; +pub mod wkb_header; From 22a60875d3a9cc8d5e46829d4d12568d25e0615d Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Fri, 3 Oct 2025 21:46:09 -0700 Subject: [PATCH 07/41] Pull out dimension calculation logic into new wkb_header.rs --- rust/sedona-functions/src/st_haszm.rs | 79 +++----------- rust/sedona-geometry/src/wkb_header.rs | 144 +++++++++++++++++++++++++ 2 files changed, 159 insertions(+), 64 deletions(-) create mode 100644 rust/sedona-geometry/src/wkb_header.rs diff --git a/rust/sedona-functions/src/st_haszm.rs b/rust/sedona-functions/src/st_haszm.rs index b178c3cf..8bcd8233 100644 --- a/rust/sedona-functions/src/st_haszm.rs +++ b/rust/sedona-functions/src/st_haszm.rs @@ -23,8 +23,9 @@ use datafusion_common::error::Result; use datafusion_expr::{ scalar_doc_sections::DOC_SECTION_OTHER, ColumnarValue, Documentation, Volatility, }; -use sedona_common::sedona_internal_err; +use geo_traits::Dimensions; use sedona_expr::scalar_udf::{SedonaScalarKernel, SedonaScalarUDF}; +use sedona_geometry::wkb_header::WkbHeader; use sedona_schema::{datatypes::SedonaType, matchers::ArgMatcher}; pub fn st_hasz_udf() -> SedonaScalarUDF { @@ -110,72 +111,22 @@ impl SedonaScalarKernel for STHasZm { /// /// Spec: https://libgeos.org/specifications/wkb/ fn infer_haszm(buf: &[u8], dim_index: usize) -> Result> { - - use sedona_geometry::wkb_header::WkbHeader; - use geo_traits::Dimensions; - let header = WkbHeader::new(buf); - let dimension = header.dimension(); - if matches!(dimension, Dimensions::Xyz | Dimensions::Xyzm) { - return Ok(Some(dim_index == 2)); + let header = WkbHeader::new(buf)?; + let dimension = header.dimension()?; + + if dim_index == 2 { + return Ok(Some(matches!( + dimension, + Dimensions::Xyz | Dimensions::Xyzm + ))); } - if matches!(dimension, Dimensions::Xym | Dimensions::Xyzm) { - return Ok(Some(dim_index == 3)); + if dim_index == 3 { + return Ok(Some(matches!( + dimension, + Dimensions::Xym | Dimensions::Xyzm + ))); } return Ok(Some(false)); - - - if buf.len() < 5 { - return sedona_internal_err!("Invalid WKB: buffer too small ({} bytes)", buf.len()); - } - - let byte_order = buf[0]; - let code = match byte_order { - 0 => u32::from_be_bytes([buf[1], buf[2], buf[3], buf[4]]), - 1 => u32::from_le_bytes([buf[1], buf[2], buf[3], buf[4]]), - other => return sedona_internal_err!("Unexpected byte order: {other}"), - }; - - // 0000 -> xy or unspecified - // 1000 -> xyz - // 2000 -> xym - // 3000 -> xyzm - match code / 1000 { - // If xy, it's possible we need to infer the dimension - 0 => {} - 1 => return Ok(Some(dim_index == 2)), - 2 => return Ok(Some(dim_index == 3)), - 3 => return Ok(Some(true)), - _ => return sedona_internal_err!("Unexpected code: {code}"), - }; - - // If GeometryCollection (7), we need to check the dimension of the first geometry - if code & 0x7 == 7 { - // The next 4 bytes are the number of geometries in the collection - let num_geometries = match byte_order { - 0 => u32::from_be_bytes([buf[5], buf[6], buf[7], buf[8]]), - 1 => u32::from_le_bytes([buf[5], buf[6], buf[7], buf[8]]), - other => return sedona_internal_err!("Unexpected byte order: {other}"), - }; - // Check the dimension of the first geometry since they all have to be the same dimension - // Note: Attempting to create the following geometries error and are thus not possible to create: - // - Nested geometry dimension doesn't match the **specified** geom collection z-dimension - // - GEOMETRYCOLLECTION M (POINT Z (1 1 1)) - // - Nested geometry doesn't have the specified dimension - // - GEOMETRYCOLLECTION Z (POINT (1 1)) - // - Nested geometries have different dimensions - // - GEOMETRYCOLLECTION (POINT Z (1 1 1), POINT (1 1)) - if num_geometries >= 1 { - return infer_haszm(&buf[9..], dim_index); - } - // If empty geometry (num_geometries == 0), fallback to below logic to check the geom collection's dimension - // GEOMETRY COLLECTION Z EMPTY hasz -> true - } - - // TODO: Last check: check how many dimensions the 1st coordinate has (all other coordinates must have the same) - // e.g handle this case: POINT (0 0 0) -> xyz dimension, POINT (0 0 0 0) -> xyzm dimension - - // If code was unspecified and we couldn't infer the dimension, it must be xy - Ok(Some(false)) } #[cfg(test)] diff --git a/rust/sedona-geometry/src/wkb_header.rs b/rust/sedona-geometry/src/wkb_header.rs new file mode 100644 index 00000000..2995a247 --- /dev/null +++ b/rust/sedona-geometry/src/wkb_header.rs @@ -0,0 +1,144 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::types::GeometryTypeId; +use datafusion_common::error::{DataFusionError, Result}; +use geo_traits::Dimensions; +use sedona_common::sedona_internal_err; + +pub struct WkbHeader<'a> { + buf: &'a [u8], + geometry_type_id: GeometryTypeId, + dimensions: Option, +} + +impl<'a> WkbHeader<'a> { + /// Creates a new [WkbHeader] from a buffer + pub fn new(buf: &'a [u8]) -> Result { + if buf.len() < 5 { + return sedona_internal_err!("Invalid WKB: buffer too small ({} bytes)", buf.len()); + }; + + let byte_order = buf[0]; + + // Parse geometry type + let code = match byte_order { + 0 => u32::from_be_bytes([buf[1], buf[2], buf[3], buf[4]]), + 1 => u32::from_le_bytes([buf[1], buf[2], buf[3], buf[4]]), + other => return sedona_internal_err!("Unexpected byte order: {other}"), + }; + + let geometry_type_id = GeometryTypeId::try_from_wkb_id(code & 0x7) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + + Ok(Self { + buf, + geometry_type_id, + // Compute the following fields lazily since they require a bit more effort (recursion) + dimensions: None, + }) + } + + /// Returns the byte order of the WKB + pub fn byte_order(&self) -> u8 { + self.buf[0] + } + + /// Returns the geometry type id of the WKB by only parsing the header instead of the entire WKB + /// 1 -> Point + /// 2 -> LineString + /// 3 -> Polygon + /// 4 -> MultiPoint + /// 5 -> MultiLineString + /// 6 -> MultiPolygon + /// 7 -> GeometryCollection + /// + /// Spec: https://libgeos.org/specifications/wkb/ + pub fn geometry_type_id(&self) -> GeometryTypeId { + self.geometry_type_id + } + + /// Returns the dimension of the WKB by only parsing what's minimally necessary instead of the entire WKB + /// 0 -> XY + /// 1 -> XYZ + /// 2 -> XYM + /// 3 -> XYZM + /// + /// Spec: https://libgeos.org/specifications/wkb/ + pub fn dimension(mut self) -> Result { + // Calculate the dimension if we haven't already + if self.dimensions.is_none() { + self.dimensions = Some(parse_dimension(&self.buf)?); + } + self.dimensions.ok_or_else(|| { + DataFusionError::External("Unexpected internal state in WkbHeader".into()) + }) + } +} + +fn parse_dimension(buf: &[u8]) -> Result { + if buf.len() < 5 { + return sedona_internal_err!("Invalid WKB: buffer too small ({} bytes)", buf.len()); + } + + let byte_order = buf[0]; + + let code = match byte_order { + 0 => u32::from_be_bytes([buf[1], buf[2], buf[3], buf[4]]), + 1 => u32::from_le_bytes([buf[1], buf[2], buf[3], buf[4]]), + other => return sedona_internal_err!("Unexpected byte order: {other}"), + }; + + // 0000 -> xy or unspecified + // 1000 -> xyz + // 2000 -> xym + // 3000 -> xyzm + match code / 1000 { + // If xy, it's possible we need to infer the dimension + 0 => {} + 1 => return Ok(Dimensions::Xyz), + 2 => return Ok(Dimensions::Xym), + 3 => return Ok(Dimensions::Xyzm), + _ => return sedona_internal_err!("Unexpected code: {code}"), + }; + + // Try to infer dimension + // If GeometryCollection (7), we need to check the dimension of the first geometry + if code & 0x7 == 7 { + // The next 4 bytes are the number of geometries in the collection + let num_geometries = match byte_order { + 0 => u32::from_be_bytes([buf[5], buf[6], buf[7], buf[8]]), + 1 => u32::from_le_bytes([buf[5], buf[6], buf[7], buf[8]]), + other => return sedona_internal_err!("Unexpected byte order: {other}"), + }; + // Check the dimension of the first geometry since they all have to be the same dimension + // Note: Attempting to create the following geometries error and are thus not possible to create: + // - Nested geometry dimension doesn't match the **specified** geom collection z-dimension + // - GEOMETRYCOLLECTION M (POINT Z (1 1 1)) + // - Nested geometry doesn't have the specified dimension + // - GEOMETRYCOLLECTION Z (POINT (1 1)) + // - Nested geometries have different dimensions + // - GEOMETRYCOLLECTION (POINT Z (1 1 1), POINT (1 1)) + if num_geometries >= 1 { + return parse_dimension(&buf[9..]); + } + // If empty geometry (num_geometries == 0), fallback to below logic to check the geom collection's dimension + // GEOMETRY COLLECTION Z EMPTY hasz -> true + } + + Ok(Dimensions::Xy) +} From 5c616af81c115aaf7ff40cee460a7eb3d88f9efe Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Fri, 3 Oct 2025 22:17:55 -0700 Subject: [PATCH 08/41] Add MULTIPOINT_WITH_INFERRED_Z_DIMENSION_WKB fixture --- rust/sedona-testing/src/fixtures.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/rust/sedona-testing/src/fixtures.rs b/rust/sedona-testing/src/fixtures.rs index 502bfaec..7232673e 100644 --- a/rust/sedona-testing/src/fixtures.rs +++ b/rust/sedona-testing/src/fixtures.rs @@ -22,3 +22,17 @@ pub const MULTIPOINT_WITH_EMPTY_CHILD_WKB: [u8; 30] = [ 0x01, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf8, 0x7f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf8, 0x7f, ]; + +/// A well-known binary blob of MULTIPOINT ((0 0 0)) where outer dimension is specified for xy +/// while inner point's dimension is actually xyz +pub const MULTIPOINT_WITH_INFERRED_Z_DIMENSION_WKB: [u8; 38] = [ + 0x01, // byte-order + 0x04, 0x00, 0x00, 0x00, // multipoint with xy-dimension specified + 0x01, 0x00, 0x00, 0x00, // 1 point + // nested point geom + 0x01, // byte-order + 0xe9, 0x03, 0x00, 0x00, // point with xyz-dimension specified + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // x-coordinate of point + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // y-coordinate of point + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // z-coordinate of point +]; From 207ecb114ed8c64f2b91c18ec17fccc7804275e3 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Fri, 3 Oct 2025 22:18:58 -0700 Subject: [PATCH 09/41] Fix dimension calculation to support all collection types and add fixture as test --- rust/sedona-functions/src/st_haszm.rs | 20 +++++++++++++++++++- rust/sedona-geometry/src/wkb_header.rs | 4 ++-- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/rust/sedona-functions/src/st_haszm.rs b/rust/sedona-functions/src/st_haszm.rs index 8bcd8233..a0ded033 100644 --- a/rust/sedona-functions/src/st_haszm.rs +++ b/rust/sedona-functions/src/st_haszm.rs @@ -135,7 +135,9 @@ mod tests { use datafusion_expr::ScalarUDF; use rstest::rstest; use sedona_schema::datatypes::{WKB_GEOMETRY, WKB_VIEW_GEOMETRY}; - use sedona_testing::testers::ScalarUdfTester; + use sedona_testing::{ + fixtures::MULTIPOINT_WITH_INFERRED_Z_DIMENSION_WKB, testers::ScalarUdfTester, + }; use super::*; @@ -239,4 +241,20 @@ mod tests { .unwrap(); m_tester.assert_scalar_result_equals(result, ScalarValue::Boolean(Some(true))); } + + #[test] + fn multipoint_with_inferred_z_dimension() { + let z_tester = ScalarUdfTester::new(st_hasz_udf().into(), vec![WKB_GEOMETRY]); + let m_tester = ScalarUdfTester::new(st_hasm_udf().into(), vec![WKB_GEOMETRY]); + + let scalar = ScalarValue::Binary(Some(MULTIPOINT_WITH_INFERRED_Z_DIMENSION_WKB.to_vec())); + assert_eq!( + z_tester.invoke_scalar(scalar.clone()).unwrap(), + ScalarValue::Boolean(Some(true)) + ); + assert_eq!( + m_tester.invoke_scalar(scalar.clone()).unwrap(), + ScalarValue::Boolean(Some(false)) + ); + } } diff --git a/rust/sedona-geometry/src/wkb_header.rs b/rust/sedona-geometry/src/wkb_header.rs index 2995a247..20e0f627 100644 --- a/rust/sedona-geometry/src/wkb_header.rs +++ b/rust/sedona-geometry/src/wkb_header.rs @@ -117,8 +117,8 @@ fn parse_dimension(buf: &[u8]) -> Result { }; // Try to infer dimension - // If GeometryCollection (7), we need to check the dimension of the first geometry - if code & 0x7 == 7 { + // If geometry is a collection (MULTIPOINT, ... GEOMETRYCOLLECTION, code 4-7), we need to check the dimension of the first geometry + if code & 0x7 >= 4 { // The next 4 bytes are the number of geometries in the collection let num_geometries = match byte_order { 0 => u32::from_be_bytes([buf[5], buf[6], buf[7], buf[8]]), From 43009f80af3d7c539aa02282495fb2144c14c72d Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Fri, 3 Oct 2025 22:21:38 -0700 Subject: [PATCH 10/41] Fix clippy and clean up --- rust/sedona-functions/src/st_haszm.rs | 6 +++--- rust/sedona-geometry/src/wkb_header.rs | 8 +------- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/rust/sedona-functions/src/st_haszm.rs b/rust/sedona-functions/src/st_haszm.rs index a0ded033..70fade4d 100644 --- a/rust/sedona-functions/src/st_haszm.rs +++ b/rust/sedona-functions/src/st_haszm.rs @@ -95,7 +95,7 @@ impl SedonaScalarKernel for STHasZm { executor.execute_wkb_void(|maybe_item| { match maybe_item { Some(item) => { - builder.append_option(infer_haszm(item, dim_index)?); + builder.append_option(invoke_scalar(item, dim_index)?); } None => builder.append_null(), } @@ -110,7 +110,7 @@ impl SedonaScalarKernel for STHasZm { /// An error will be thrown for invalid WKB bytes input /// /// Spec: https://libgeos.org/specifications/wkb/ -fn infer_haszm(buf: &[u8], dim_index: usize) -> Result> { +fn invoke_scalar(buf: &[u8], dim_index: usize) -> Result> { let header = WkbHeader::new(buf)?; let dimension = header.dimension()?; @@ -126,7 +126,7 @@ fn infer_haszm(buf: &[u8], dim_index: usize) -> Result> { Dimensions::Xym | Dimensions::Xyzm ))); } - return Ok(Some(false)); + Ok(Some(false)) } #[cfg(test)] diff --git a/rust/sedona-geometry/src/wkb_header.rs b/rust/sedona-geometry/src/wkb_header.rs index 20e0f627..7c2104ec 100644 --- a/rust/sedona-geometry/src/wkb_header.rs +++ b/rust/sedona-geometry/src/wkb_header.rs @@ -73,16 +73,10 @@ impl<'a> WkbHeader<'a> { } /// Returns the dimension of the WKB by only parsing what's minimally necessary instead of the entire WKB - /// 0 -> XY - /// 1 -> XYZ - /// 2 -> XYM - /// 3 -> XYZM - /// - /// Spec: https://libgeos.org/specifications/wkb/ pub fn dimension(mut self) -> Result { // Calculate the dimension if we haven't already if self.dimensions.is_none() { - self.dimensions = Some(parse_dimension(&self.buf)?); + self.dimensions = Some(parse_dimension(self.buf)?); } self.dimensions.ok_or_else(|| { DataFusionError::External("Unexpected internal state in WkbHeader".into()) From 1078bdd98b3e5ab8fbf733efe33a214fcae7209d Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Fri, 3 Oct 2025 22:24:12 -0700 Subject: [PATCH 11/41] Remove public byte_order method since it's not needed atm --- rust/sedona-geometry/src/wkb_header.rs | 5 ----- 1 file changed, 5 deletions(-) diff --git a/rust/sedona-geometry/src/wkb_header.rs b/rust/sedona-geometry/src/wkb_header.rs index 7c2104ec..84615389 100644 --- a/rust/sedona-geometry/src/wkb_header.rs +++ b/rust/sedona-geometry/src/wkb_header.rs @@ -53,11 +53,6 @@ impl<'a> WkbHeader<'a> { }) } - /// Returns the byte order of the WKB - pub fn byte_order(&self) -> u8 { - self.buf[0] - } - /// Returns the geometry type id of the WKB by only parsing the header instead of the entire WKB /// 1 -> Point /// 2 -> LineString From 4d4e7e0e92ae3b8e21272a828e46d8fa1acd6c26 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Fri, 3 Oct 2025 22:38:09 -0700 Subject: [PATCH 12/41] Perform all wkb_header operations lazily and cache the values as Option fields --- rust/sedona-geometry/src/wkb_header.rs | 55 ++++++++++++++++---------- 1 file changed, 34 insertions(+), 21 deletions(-) diff --git a/rust/sedona-geometry/src/wkb_header.rs b/rust/sedona-geometry/src/wkb_header.rs index 84615389..8ebbf6e0 100644 --- a/rust/sedona-geometry/src/wkb_header.rs +++ b/rust/sedona-geometry/src/wkb_header.rs @@ -20,35 +20,20 @@ use datafusion_common::error::{DataFusionError, Result}; use geo_traits::Dimensions; use sedona_common::sedona_internal_err; +/// Fast-path WKB header parser +/// Performs operations lazily and caches them after the first computation pub struct WkbHeader<'a> { buf: &'a [u8], - geometry_type_id: GeometryTypeId, + geometry_type_id: Option, dimensions: Option, } impl<'a> WkbHeader<'a> { /// Creates a new [WkbHeader] from a buffer pub fn new(buf: &'a [u8]) -> Result { - if buf.len() < 5 { - return sedona_internal_err!("Invalid WKB: buffer too small ({} bytes)", buf.len()); - }; - - let byte_order = buf[0]; - - // Parse geometry type - let code = match byte_order { - 0 => u32::from_be_bytes([buf[1], buf[2], buf[3], buf[4]]), - 1 => u32::from_le_bytes([buf[1], buf[2], buf[3], buf[4]]), - other => return sedona_internal_err!("Unexpected byte order: {other}"), - }; - - let geometry_type_id = GeometryTypeId::try_from_wkb_id(code & 0x7) - .map_err(|e| DataFusionError::External(Box::new(e)))?; - Ok(Self { buf, - geometry_type_id, - // Compute the following fields lazily since they require a bit more effort (recursion) + geometry_type_id: None, dimensions: None, }) } @@ -63,8 +48,13 @@ impl<'a> WkbHeader<'a> { /// 7 -> GeometryCollection /// /// Spec: https://libgeos.org/specifications/wkb/ - pub fn geometry_type_id(&self) -> GeometryTypeId { - self.geometry_type_id + pub fn geometry_type_id(mut self) -> Result { + if self.geometry_type_id.is_none() { + self.geometry_type_id = Some(parse_geometry_type_id(self.buf)?); + } + self.geometry_type_id.ok_or_else(|| { + DataFusionError::External("Unexpected internal state in WkbHeader".into()) + }) } /// Returns the dimension of the WKB by only parsing what's minimally necessary instead of the entire WKB @@ -131,3 +121,26 @@ fn parse_dimension(buf: &[u8]) -> Result { Ok(Dimensions::Xy) } + +fn parse_geometry_type_id(buf: &[u8]) -> Result { + if buf.len() < 5 { + return sedona_internal_err!("Invalid WKB: buffer too small"); + }; + + let byte_order = buf[0]; + + // Parse geometry type + let code = match byte_order { + 0 => u32::from_be_bytes([buf[1], buf[2], buf[3], buf[4]]), + 1 => u32::from_le_bytes([buf[1], buf[2], buf[3], buf[4]]), + other => return sedona_internal_err!("Unexpected byte order: {other}"), + }; + + // Only low 3 bits is for the base type, high bits include additional info + let code = code & 0x7; + + let geometry_type_id = GeometryTypeId::try_from_wkb_id(code) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + + Ok(geometry_type_id) +} From dfd6c1addf584907bd013a3d0772c28f4803d617 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Fri, 3 Oct 2025 23:09:02 -0700 Subject: [PATCH 13/41] Add python integration test benches --- benchmarks/test_functions.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/benchmarks/test_functions.py b/benchmarks/test_functions.py index 6d5c33b8..14460440 100644 --- a/benchmarks/test_functions.py +++ b/benchmarks/test_functions.py @@ -132,6 +132,38 @@ def queries(): benchmark(queries) + @pytest.mark.parametrize("eng", [SedonaDB, PostGIS, DuckDB]) + @pytest.mark.parametrize( + "table", + [ + "collections_simple", + "collections_complex", + ], + ) + def test_st_hasm(self, benchmark, eng, table): + eng = self._get_eng(eng) + + def queries(): + eng.execute_and_collect(f"SELECT ST_HasM(geom1) from {table}") + + benchmark(queries) + + @pytest.mark.parametrize("eng", [SedonaDB, PostGIS, DuckDB]) + @pytest.mark.parametrize( + "table", + [ + "collections_simple", + "collections_complex", + ], + ) + def test_st_hasz(self, benchmark, eng, table): + eng = self._get_eng(eng) + + def queries(): + eng.execute_and_collect(f"SELECT ST_HasZ(geom1) from {table}") + + benchmark(queries) + @pytest.mark.parametrize("eng", [SedonaDB, PostGIS, DuckDB]) @pytest.mark.parametrize( "table", From 0ef812d2d511af339d57f4566dac047a52b02267 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Sat, 4 Oct 2025 08:58:01 -0700 Subject: [PATCH 14/41] Add tests for wkb_header --- rust/sedona-geometry/src/wkb_header.rs | 217 +++++++++++++++++++++++++ 1 file changed, 217 insertions(+) diff --git a/rust/sedona-geometry/src/wkb_header.rs b/rust/sedona-geometry/src/wkb_header.rs index 8ebbf6e0..99e7fd98 100644 --- a/rust/sedona-geometry/src/wkb_header.rs +++ b/rust/sedona-geometry/src/wkb_header.rs @@ -144,3 +144,220 @@ fn parse_geometry_type_id(buf: &[u8]) -> Result { Ok(geometry_type_id) } + +#[cfg(test)] +mod tests { + use super::*; + use std::str::FromStr; + use wkt::Wkt; + + fn make_wkb(wkt_value: &'static str) -> Vec { + let geom = Wkt::::from_str(wkt_value).unwrap(); + let mut buf: Vec = vec![]; + wkb::writer::write_geometry(&mut buf, &geom, Default::default()).unwrap(); + buf + } + + #[test] + fn geometry_type_id() { + let wkb = make_wkb("POINT (1 2)"); + let header = WkbHeader::new(&wkb).unwrap(); + assert_eq!(header.geometry_type_id().unwrap(), GeometryTypeId::Point); + + let wkb = make_wkb("LINESTRING (1 2, 3 4)"); + let header = WkbHeader::new(&wkb).unwrap(); + assert_eq!( + header.geometry_type_id().unwrap(), + GeometryTypeId::LineString + ); + + let wkb = make_wkb("POLYGON ((0 0, 0 1, 1 0, 0 0))"); + let header = WkbHeader::new(&wkb).unwrap(); + assert_eq!(header.geometry_type_id().unwrap(), GeometryTypeId::Polygon); + + let wkb = make_wkb("MULTIPOINT ((1 2), (3 4))"); + let header = WkbHeader::new(&wkb).unwrap(); + assert_eq!( + header.geometry_type_id().unwrap(), + GeometryTypeId::MultiPoint + ); + + let wkb = make_wkb("MULTILINESTRING ((1 2, 3 4))"); + let header = WkbHeader::new(&wkb).unwrap(); + assert_eq!( + header.geometry_type_id().unwrap(), + GeometryTypeId::MultiLineString + ); + + let wkb = make_wkb("MULTIPOLYGON (((0 0, 0 1, 1 0, 0 0)))"); + let header = WkbHeader::new(&wkb).unwrap(); + assert_eq!( + header.geometry_type_id().unwrap(), + GeometryTypeId::MultiPolygon + ); + + let wkb = make_wkb("GEOMETRYCOLLECTION (POINT (1 2))"); + let header = WkbHeader::new(&wkb).unwrap(); + assert_eq!( + header.geometry_type_id().unwrap(), + GeometryTypeId::GeometryCollection + ); + + // Some cases with z and m dimensions + let wkb = make_wkb("POINT Z (1 2 3)"); + let header = WkbHeader::new(&wkb).unwrap(); + assert_eq!(header.geometry_type_id().unwrap(), GeometryTypeId::Point); + + let wkb = make_wkb("LINESTRING Z (1 2 3, 4 5 6)"); + let header = WkbHeader::new(&wkb).unwrap(); + assert_eq!( + header.geometry_type_id().unwrap(), + GeometryTypeId::LineString + ); + + let wkb = make_wkb("POLYGON M ((0 0 0, 0 1 0, 1 0 0, 0 0 0))"); + let header = WkbHeader::new(&wkb).unwrap(); + assert_eq!(header.geometry_type_id().unwrap(), GeometryTypeId::Polygon); + } + + #[test] + fn empty_geometry_type_id() { + let wkb = make_wkb("POINT EMPTY"); + let header = WkbHeader::new(&wkb).unwrap(); + assert_eq!(header.geometry_type_id().unwrap(), GeometryTypeId::Point); + + let wkb = make_wkb("LINESTRING EMPTY"); + let header = WkbHeader::new(&wkb).unwrap(); + assert_eq!( + header.geometry_type_id().unwrap(), + GeometryTypeId::LineString + ); + + let wkb = make_wkb("POLYGON EMPTY"); + let header = WkbHeader::new(&wkb).unwrap(); + assert_eq!(header.geometry_type_id().unwrap(), GeometryTypeId::Polygon); + + let wkb = make_wkb("MULTIPOINT EMPTY"); + let header = WkbHeader::new(&wkb).unwrap(); + assert_eq!( + header.geometry_type_id().unwrap(), + GeometryTypeId::MultiPoint + ); + + let wkb = make_wkb("MULTILINESTRING EMPTY"); + let header = WkbHeader::new(&wkb).unwrap(); + assert_eq!( + header.geometry_type_id().unwrap(), + GeometryTypeId::MultiLineString + ); + + let wkb = make_wkb("MULTIPOLYGON EMPTY"); + let header = WkbHeader::new(&wkb).unwrap(); + assert_eq!( + header.geometry_type_id().unwrap(), + GeometryTypeId::MultiPolygon + ); + + let wkb = make_wkb("GEOMETRYCOLLECTION EMPTY"); + let header = WkbHeader::new(&wkb).unwrap(); + assert_eq!( + header.geometry_type_id().unwrap(), + GeometryTypeId::GeometryCollection + ); + + // z, m cases + let wkb = make_wkb("POINT Z EMPTY"); + let header = WkbHeader::new(&wkb).unwrap(); + assert_eq!(header.geometry_type_id().unwrap(), GeometryTypeId::Point); + + let wkb = make_wkb("POINT M EMPTY"); + let header = WkbHeader::new(&wkb).unwrap(); + assert_eq!(header.geometry_type_id().unwrap(), GeometryTypeId::Point); + + let wkb = make_wkb("LINESTRING ZM EMPTY"); + let header = WkbHeader::new(&wkb).unwrap(); + assert_eq!( + header.geometry_type_id().unwrap(), + GeometryTypeId::LineString + ); + } + + #[test] + fn dimension() { + let wkb = make_wkb("POINT (1 2)"); + let header = WkbHeader::new(&wkb).unwrap(); + assert_eq!(header.dimension().unwrap(), Dimensions::Xy); + + let wkb = make_wkb("POINT Z (1 2 3)"); + let header = WkbHeader::new(&wkb).unwrap(); + assert_eq!(header.dimension().unwrap(), Dimensions::Xyz); + + let wkb = make_wkb("POINT M (1 2 3)"); + let header = WkbHeader::new(&wkb).unwrap(); + assert_eq!(header.dimension().unwrap(), Dimensions::Xym); + + let wkb = make_wkb("POINT ZM (1 2 3 4)"); + let header = WkbHeader::new(&wkb).unwrap(); + assert_eq!(header.dimension().unwrap(), Dimensions::Xyzm); + } + + #[test] + fn inferred_collections_dimension() { + let wkb = make_wkb("GEOMETRYCOLLECTION (POINT Z (1 2 3))"); + let header = WkbHeader::new(&wkb).unwrap(); + assert_eq!(header.dimension().unwrap(), Dimensions::Xyz); + + let wkb = make_wkb("GEOMETRYCOLLECTION (POINT ZM (1 2 3 4))"); + let header = WkbHeader::new(&wkb).unwrap(); + assert_eq!(header.dimension().unwrap(), Dimensions::Xyzm); + + let wkb = make_wkb("GEOMETRYCOLLECTION (POINT M (1 2 3))"); + let header = WkbHeader::new(&wkb).unwrap(); + assert_eq!(header.dimension().unwrap(), Dimensions::Xym); + + let wkb = make_wkb("GEOMETRYCOLLECTION (POINT ZM (1 2 3 4))"); + let header = WkbHeader::new(&wkb).unwrap(); + assert_eq!(header.dimension().unwrap(), Dimensions::Xyzm); + } + + #[test] + fn empty_geometry_dimension() { + // POINTs + let wkb = make_wkb("POINT EMPTY"); + let header = WkbHeader::new(&wkb).unwrap(); + assert_eq!(header.dimension().unwrap(), Dimensions::Xy); + + let wkb = make_wkb("POINT Z EMPTY"); + let header = WkbHeader::new(&wkb).unwrap(); + assert_eq!(header.dimension().unwrap(), Dimensions::Xyz); + + let wkb = make_wkb("POINT M EMPTY"); + let header = WkbHeader::new(&wkb).unwrap(); + assert_eq!(header.dimension().unwrap(), Dimensions::Xym); + + let wkb = make_wkb("POINT ZM EMPTY"); + let header = WkbHeader::new(&wkb).unwrap(); + assert_eq!(header.dimension().unwrap(), Dimensions::Xyzm); + + // GEOMETRYCOLLECTIONs + let wkb = make_wkb("GEOMETRYCOLLECTION EMPTY"); + let header = WkbHeader::new(&wkb).unwrap(); + assert_eq!(header.dimension().unwrap(), Dimensions::Xy); + + let wkb = make_wkb("GEOMETRYCOLLECTION Z EMPTY"); + let header = WkbHeader::new(&wkb).unwrap(); + assert_eq!(header.dimension().unwrap(), Dimensions::Xyz); + + let wkb = make_wkb("GEOMETRYCOLLECTION M EMPTY"); + let header = WkbHeader::new(&wkb).unwrap(); + assert_eq!(header.dimension().unwrap(), Dimensions::Xym); + + let wkb = make_wkb("GEOMETRYCOLLECTION ZM EMPTY"); + let header = WkbHeader::new(&wkb).unwrap(); + assert_eq!(header.dimension().unwrap(), Dimensions::Xyzm); + + let wkb = make_wkb("GEOMETRYCOLLECTION (POINT Z EMPTY)"); + let header = WkbHeader::new(&wkb).unwrap(); + assert_eq!(header.dimension().unwrap(), Dimensions::Xyz); + } +} From 075d6e6f8bb97ac17969ebb35e0a8efa35f7e8dd Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Sat, 4 Oct 2025 21:04:26 -0700 Subject: [PATCH 15/41] Apply suggestion from @paleolimbot Co-authored-by: Dewey Dunnington --- python/sedonadb/tests/functions/test_functions.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/sedonadb/tests/functions/test_functions.py b/python/sedonadb/tests/functions/test_functions.py index 015526ae..5529647e 100644 --- a/python/sedonadb/tests/functions/test_functions.py +++ b/python/sedonadb/tests/functions/test_functions.py @@ -473,8 +473,6 @@ def test_st_geomfromwkb(eng, geom): ("POINT Z EMPTY", True), ("POINT M EMPTY", False), ("POINT ZM EMPTY", True), - # SedonaDB can't parse this yet: https://github.com/apache/sedona-db/issues/162 - # ("POINT (0 0 0)", True), ("POINT Z (0 0 0)", True), ("POINT M (0 0 0)", False), ("POINT ZM (0 0 0 0)", True), From 491b3c7dc373ec6e25f5036a523e6e841edd006a Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Sat, 4 Oct 2025 21:07:26 -0700 Subject: [PATCH 16/41] Remaining clean up --- .../tests/functions/test_functions.py | 5 ----- rust/sedona-functions/src/st_haszm.rs | 21 ------------------- 2 files changed, 26 deletions(-) diff --git a/python/sedonadb/tests/functions/test_functions.py b/python/sedonadb/tests/functions/test_functions.py index 5529647e..81c0e56a 100644 --- a/python/sedonadb/tests/functions/test_functions.py +++ b/python/sedonadb/tests/functions/test_functions.py @@ -482,15 +482,10 @@ def test_st_geomfromwkb(eng, geom): ("POLYGON EMPTY", False), ("MULTIPOINT ((0 0), (1 1))", False), ("MULTIPOINT Z ((0 0 0))", True), - # SedonaDB can't parse this yet: https://github.com/apache/sedona-db/issues/162 - # ("MULTIPOINT ((0 0 0))", True), ("MULTIPOINT ZM ((0 0 0 0))", True), ("GEOMETRYCOLLECTION EMPTY", False), # Z-dim specified only in the nested geometry ("GEOMETRYCOLLECTION (POINT Z (0 0 0))", True), - # SedonaDB can't parse this yet: https://github.com/apache/sedona-db/issues/162 - # Z-dim specified only on the geom collection level but not the nested geometry level - # ("GEOMETRYCOLLECTION Z (POINT (0 0 0))", True), # Z-dim specified on both levels ("GEOMETRYCOLLECTION Z (POINT Z (0 0 0))", True), ("GEOMETRYCOLLECTION (GEOMETRYCOLLECTION (POINT Z (0 0 0)))", True), diff --git a/rust/sedona-functions/src/st_haszm.rs b/rust/sedona-functions/src/st_haszm.rs index 70fade4d..17729026 100644 --- a/rust/sedona-functions/src/st_haszm.rs +++ b/rust/sedona-functions/src/st_haszm.rs @@ -107,9 +107,6 @@ impl SedonaScalarKernel for STHasZm { } /// Fast-path inference of geometry type name from raw WKB bytes -/// An error will be thrown for invalid WKB bytes input -/// -/// Spec: https://libgeos.org/specifications/wkb/ fn invoke_scalar(buf: &[u8], dim_index: usize) -> Result> { let header = WkbHeader::new(buf)?; let dimension = header.dimension()?; @@ -172,16 +169,6 @@ mod tests { let result = m_tester.invoke_scalar("POINT (1 2)").unwrap(); m_tester.assert_scalar_result_equals(result, false); - // SedonaDB can't parse this yet: https://github.com/apache/sedona-db/issues/162 - // Infer z-dimension - // let result = z_tester.invoke_scalar("POINT (1 2 3)").unwrap(); - // z_tester.assert_scalar_result_equals(result, true); - - // SedonaDB can't parse this yet: https://github.com/apache/sedona-db/issues/162 - // Infer z-dimension - // let result = m_tester.invoke_scalar("POINT (1 2 3)").unwrap(); - // m_tester.assert_scalar_result_equals(result, false); - let result = z_tester.invoke_scalar("POINT M (1 2 3)").unwrap(); z_tester.assert_scalar_result_equals(result, false); @@ -207,14 +194,6 @@ mod tests { .unwrap(); z_tester.assert_scalar_result_equals(result, ScalarValue::Boolean(Some(true))); - // SedonaDB can't parse this yet: https://github.com/apache/sedona-db/issues/162 - // Z-dimension specified only on the geom collection level but not the nested geometry level - // Geometry collection with Z dimension both on the geom collection and nested geometry level - // let result = z_tester - // .invoke_wkb_scalar(Some("GEOMETRYCOLLECTION Z (POINT (1 2 3))")) - // .unwrap(); - // z_tester.assert_scalar_result_equals(result, ScalarValue::Boolean(Some(true))); - let result = m_tester .invoke_wkb_scalar(Some("GEOMETRYCOLLECTION (POINT M (1 2 3))")) .unwrap(); From 7efccc004b446d8917f70992693af4a7542aa7f9 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Sat, 4 Oct 2025 21:10:12 -0700 Subject: [PATCH 17/41] Update to method to dimensions plural --- rust/sedona-functions/src/st_haszm.rs | 2 +- rust/sedona-geometry/src/wkb_header.rs | 48 +++++++++++++------------- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/rust/sedona-functions/src/st_haszm.rs b/rust/sedona-functions/src/st_haszm.rs index 17729026..e6bf21df 100644 --- a/rust/sedona-functions/src/st_haszm.rs +++ b/rust/sedona-functions/src/st_haszm.rs @@ -109,7 +109,7 @@ impl SedonaScalarKernel for STHasZm { /// Fast-path inference of geometry type name from raw WKB bytes fn invoke_scalar(buf: &[u8], dim_index: usize) -> Result> { let header = WkbHeader::new(buf)?; - let dimension = header.dimension()?; + let dimension = header.dimensions()?; if dim_index == 2 { return Ok(Some(matches!( diff --git a/rust/sedona-geometry/src/wkb_header.rs b/rust/sedona-geometry/src/wkb_header.rs index 99e7fd98..fc5e8dc4 100644 --- a/rust/sedona-geometry/src/wkb_header.rs +++ b/rust/sedona-geometry/src/wkb_header.rs @@ -58,10 +58,10 @@ impl<'a> WkbHeader<'a> { } /// Returns the dimension of the WKB by only parsing what's minimally necessary instead of the entire WKB - pub fn dimension(mut self) -> Result { + pub fn dimensions(mut self) -> Result { // Calculate the dimension if we haven't already if self.dimensions.is_none() { - self.dimensions = Some(parse_dimension(self.buf)?); + self.dimensions = Some(parse_dimensions(self.buf)?); } self.dimensions.ok_or_else(|| { DataFusionError::External("Unexpected internal state in WkbHeader".into()) @@ -69,7 +69,7 @@ impl<'a> WkbHeader<'a> { } } -fn parse_dimension(buf: &[u8]) -> Result { +fn parse_dimensions(buf: &[u8]) -> Result { if buf.len() < 5 { return sedona_internal_err!("Invalid WKB: buffer too small ({} bytes)", buf.len()); } @@ -113,7 +113,7 @@ fn parse_dimension(buf: &[u8]) -> Result { // - Nested geometries have different dimensions // - GEOMETRYCOLLECTION (POINT Z (1 1 1), POINT (1 1)) if num_geometries >= 1 { - return parse_dimension(&buf[9..]); + return parse_dimensions(&buf[9..]); } // If empty geometry (num_geometries == 0), fallback to below logic to check the geom collection's dimension // GEOMETRY COLLECTION Z EMPTY hasz -> true @@ -283,81 +283,81 @@ mod tests { } #[test] - fn dimension() { + fn dimensions() { let wkb = make_wkb("POINT (1 2)"); let header = WkbHeader::new(&wkb).unwrap(); - assert_eq!(header.dimension().unwrap(), Dimensions::Xy); + assert_eq!(header.dimensions().unwrap(), Dimensions::Xy); let wkb = make_wkb("POINT Z (1 2 3)"); let header = WkbHeader::new(&wkb).unwrap(); - assert_eq!(header.dimension().unwrap(), Dimensions::Xyz); + assert_eq!(header.dimensions().unwrap(), Dimensions::Xyz); let wkb = make_wkb("POINT M (1 2 3)"); let header = WkbHeader::new(&wkb).unwrap(); - assert_eq!(header.dimension().unwrap(), Dimensions::Xym); + assert_eq!(header.dimensions().unwrap(), Dimensions::Xym); let wkb = make_wkb("POINT ZM (1 2 3 4)"); let header = WkbHeader::new(&wkb).unwrap(); - assert_eq!(header.dimension().unwrap(), Dimensions::Xyzm); + assert_eq!(header.dimensions().unwrap(), Dimensions::Xyzm); } #[test] - fn inferred_collections_dimension() { + fn inferred_collections_dimensions() { let wkb = make_wkb("GEOMETRYCOLLECTION (POINT Z (1 2 3))"); let header = WkbHeader::new(&wkb).unwrap(); - assert_eq!(header.dimension().unwrap(), Dimensions::Xyz); + assert_eq!(header.dimensions().unwrap(), Dimensions::Xyz); let wkb = make_wkb("GEOMETRYCOLLECTION (POINT ZM (1 2 3 4))"); let header = WkbHeader::new(&wkb).unwrap(); - assert_eq!(header.dimension().unwrap(), Dimensions::Xyzm); + assert_eq!(header.dimensions().unwrap(), Dimensions::Xyzm); let wkb = make_wkb("GEOMETRYCOLLECTION (POINT M (1 2 3))"); let header = WkbHeader::new(&wkb).unwrap(); - assert_eq!(header.dimension().unwrap(), Dimensions::Xym); + assert_eq!(header.dimensions().unwrap(), Dimensions::Xym); let wkb = make_wkb("GEOMETRYCOLLECTION (POINT ZM (1 2 3 4))"); let header = WkbHeader::new(&wkb).unwrap(); - assert_eq!(header.dimension().unwrap(), Dimensions::Xyzm); + assert_eq!(header.dimensions().unwrap(), Dimensions::Xyzm); } #[test] - fn empty_geometry_dimension() { + fn empty_geometry_dimensions() { // POINTs let wkb = make_wkb("POINT EMPTY"); let header = WkbHeader::new(&wkb).unwrap(); - assert_eq!(header.dimension().unwrap(), Dimensions::Xy); + assert_eq!(header.dimensions().unwrap(), Dimensions::Xy); let wkb = make_wkb("POINT Z EMPTY"); let header = WkbHeader::new(&wkb).unwrap(); - assert_eq!(header.dimension().unwrap(), Dimensions::Xyz); + assert_eq!(header.dimensions().unwrap(), Dimensions::Xyz); let wkb = make_wkb("POINT M EMPTY"); let header = WkbHeader::new(&wkb).unwrap(); - assert_eq!(header.dimension().unwrap(), Dimensions::Xym); + assert_eq!(header.dimensions().unwrap(), Dimensions::Xym); let wkb = make_wkb("POINT ZM EMPTY"); let header = WkbHeader::new(&wkb).unwrap(); - assert_eq!(header.dimension().unwrap(), Dimensions::Xyzm); + assert_eq!(header.dimensions().unwrap(), Dimensions::Xyzm); // GEOMETRYCOLLECTIONs let wkb = make_wkb("GEOMETRYCOLLECTION EMPTY"); let header = WkbHeader::new(&wkb).unwrap(); - assert_eq!(header.dimension().unwrap(), Dimensions::Xy); + assert_eq!(header.dimensions().unwrap(), Dimensions::Xy); let wkb = make_wkb("GEOMETRYCOLLECTION Z EMPTY"); let header = WkbHeader::new(&wkb).unwrap(); - assert_eq!(header.dimension().unwrap(), Dimensions::Xyz); + assert_eq!(header.dimensions().unwrap(), Dimensions::Xyz); let wkb = make_wkb("GEOMETRYCOLLECTION M EMPTY"); let header = WkbHeader::new(&wkb).unwrap(); - assert_eq!(header.dimension().unwrap(), Dimensions::Xym); + assert_eq!(header.dimensions().unwrap(), Dimensions::Xym); let wkb = make_wkb("GEOMETRYCOLLECTION ZM EMPTY"); let header = WkbHeader::new(&wkb).unwrap(); - assert_eq!(header.dimension().unwrap(), Dimensions::Xyzm); + assert_eq!(header.dimensions().unwrap(), Dimensions::Xyzm); let wkb = make_wkb("GEOMETRYCOLLECTION (POINT Z EMPTY)"); let header = WkbHeader::new(&wkb).unwrap(); - assert_eq!(header.dimension().unwrap(), Dimensions::Xyz); + assert_eq!(header.dimensions().unwrap(), Dimensions::Xyz); } } From 06501e58d663e8d7812cafd8fa30c008e1a5fe45 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Sat, 4 Oct 2025 21:11:17 -0700 Subject: [PATCH 18/41] Rename method to try_new --- rust/sedona-functions/src/st_haszm.rs | 2 +- rust/sedona-geometry/src/wkb_header.rs | 76 +++++++++++++------------- 2 files changed, 39 insertions(+), 39 deletions(-) diff --git a/rust/sedona-functions/src/st_haszm.rs b/rust/sedona-functions/src/st_haszm.rs index e6bf21df..459719c3 100644 --- a/rust/sedona-functions/src/st_haszm.rs +++ b/rust/sedona-functions/src/st_haszm.rs @@ -108,7 +108,7 @@ impl SedonaScalarKernel for STHasZm { /// Fast-path inference of geometry type name from raw WKB bytes fn invoke_scalar(buf: &[u8], dim_index: usize) -> Result> { - let header = WkbHeader::new(buf)?; + let header = WkbHeader::try_new(buf)?; let dimension = header.dimensions()?; if dim_index == 2 { diff --git a/rust/sedona-geometry/src/wkb_header.rs b/rust/sedona-geometry/src/wkb_header.rs index fc5e8dc4..12f5b653 100644 --- a/rust/sedona-geometry/src/wkb_header.rs +++ b/rust/sedona-geometry/src/wkb_header.rs @@ -30,7 +30,7 @@ pub struct WkbHeader<'a> { impl<'a> WkbHeader<'a> { /// Creates a new [WkbHeader] from a buffer - pub fn new(buf: &'a [u8]) -> Result { + pub fn try_new(buf: &'a [u8]) -> Result { Ok(Self { buf, geometry_type_id: None, @@ -161,43 +161,43 @@ mod tests { #[test] fn geometry_type_id() { let wkb = make_wkb("POINT (1 2)"); - let header = WkbHeader::new(&wkb).unwrap(); + let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!(header.geometry_type_id().unwrap(), GeometryTypeId::Point); let wkb = make_wkb("LINESTRING (1 2, 3 4)"); - let header = WkbHeader::new(&wkb).unwrap(); + let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!( header.geometry_type_id().unwrap(), GeometryTypeId::LineString ); let wkb = make_wkb("POLYGON ((0 0, 0 1, 1 0, 0 0))"); - let header = WkbHeader::new(&wkb).unwrap(); + let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!(header.geometry_type_id().unwrap(), GeometryTypeId::Polygon); let wkb = make_wkb("MULTIPOINT ((1 2), (3 4))"); - let header = WkbHeader::new(&wkb).unwrap(); + let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!( header.geometry_type_id().unwrap(), GeometryTypeId::MultiPoint ); let wkb = make_wkb("MULTILINESTRING ((1 2, 3 4))"); - let header = WkbHeader::new(&wkb).unwrap(); + let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!( header.geometry_type_id().unwrap(), GeometryTypeId::MultiLineString ); let wkb = make_wkb("MULTIPOLYGON (((0 0, 0 1, 1 0, 0 0)))"); - let header = WkbHeader::new(&wkb).unwrap(); + let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!( header.geometry_type_id().unwrap(), GeometryTypeId::MultiPolygon ); let wkb = make_wkb("GEOMETRYCOLLECTION (POINT (1 2))"); - let header = WkbHeader::new(&wkb).unwrap(); + let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!( header.geometry_type_id().unwrap(), GeometryTypeId::GeometryCollection @@ -205,61 +205,61 @@ mod tests { // Some cases with z and m dimensions let wkb = make_wkb("POINT Z (1 2 3)"); - let header = WkbHeader::new(&wkb).unwrap(); + let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!(header.geometry_type_id().unwrap(), GeometryTypeId::Point); let wkb = make_wkb("LINESTRING Z (1 2 3, 4 5 6)"); - let header = WkbHeader::new(&wkb).unwrap(); + let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!( header.geometry_type_id().unwrap(), GeometryTypeId::LineString ); let wkb = make_wkb("POLYGON M ((0 0 0, 0 1 0, 1 0 0, 0 0 0))"); - let header = WkbHeader::new(&wkb).unwrap(); + let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!(header.geometry_type_id().unwrap(), GeometryTypeId::Polygon); } #[test] fn empty_geometry_type_id() { let wkb = make_wkb("POINT EMPTY"); - let header = WkbHeader::new(&wkb).unwrap(); + let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!(header.geometry_type_id().unwrap(), GeometryTypeId::Point); let wkb = make_wkb("LINESTRING EMPTY"); - let header = WkbHeader::new(&wkb).unwrap(); + let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!( header.geometry_type_id().unwrap(), GeometryTypeId::LineString ); let wkb = make_wkb("POLYGON EMPTY"); - let header = WkbHeader::new(&wkb).unwrap(); + let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!(header.geometry_type_id().unwrap(), GeometryTypeId::Polygon); let wkb = make_wkb("MULTIPOINT EMPTY"); - let header = WkbHeader::new(&wkb).unwrap(); + let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!( header.geometry_type_id().unwrap(), GeometryTypeId::MultiPoint ); let wkb = make_wkb("MULTILINESTRING EMPTY"); - let header = WkbHeader::new(&wkb).unwrap(); + let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!( header.geometry_type_id().unwrap(), GeometryTypeId::MultiLineString ); let wkb = make_wkb("MULTIPOLYGON EMPTY"); - let header = WkbHeader::new(&wkb).unwrap(); + let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!( header.geometry_type_id().unwrap(), GeometryTypeId::MultiPolygon ); let wkb = make_wkb("GEOMETRYCOLLECTION EMPTY"); - let header = WkbHeader::new(&wkb).unwrap(); + let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!( header.geometry_type_id().unwrap(), GeometryTypeId::GeometryCollection @@ -267,15 +267,15 @@ mod tests { // z, m cases let wkb = make_wkb("POINT Z EMPTY"); - let header = WkbHeader::new(&wkb).unwrap(); + let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!(header.geometry_type_id().unwrap(), GeometryTypeId::Point); let wkb = make_wkb("POINT M EMPTY"); - let header = WkbHeader::new(&wkb).unwrap(); + let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!(header.geometry_type_id().unwrap(), GeometryTypeId::Point); let wkb = make_wkb("LINESTRING ZM EMPTY"); - let header = WkbHeader::new(&wkb).unwrap(); + let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!( header.geometry_type_id().unwrap(), GeometryTypeId::LineString @@ -285,38 +285,38 @@ mod tests { #[test] fn dimensions() { let wkb = make_wkb("POINT (1 2)"); - let header = WkbHeader::new(&wkb).unwrap(); + let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!(header.dimensions().unwrap(), Dimensions::Xy); let wkb = make_wkb("POINT Z (1 2 3)"); - let header = WkbHeader::new(&wkb).unwrap(); + let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!(header.dimensions().unwrap(), Dimensions::Xyz); let wkb = make_wkb("POINT M (1 2 3)"); - let header = WkbHeader::new(&wkb).unwrap(); + let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!(header.dimensions().unwrap(), Dimensions::Xym); let wkb = make_wkb("POINT ZM (1 2 3 4)"); - let header = WkbHeader::new(&wkb).unwrap(); + let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!(header.dimensions().unwrap(), Dimensions::Xyzm); } #[test] fn inferred_collections_dimensions() { let wkb = make_wkb("GEOMETRYCOLLECTION (POINT Z (1 2 3))"); - let header = WkbHeader::new(&wkb).unwrap(); + let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!(header.dimensions().unwrap(), Dimensions::Xyz); let wkb = make_wkb("GEOMETRYCOLLECTION (POINT ZM (1 2 3 4))"); - let header = WkbHeader::new(&wkb).unwrap(); + let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!(header.dimensions().unwrap(), Dimensions::Xyzm); let wkb = make_wkb("GEOMETRYCOLLECTION (POINT M (1 2 3))"); - let header = WkbHeader::new(&wkb).unwrap(); + let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!(header.dimensions().unwrap(), Dimensions::Xym); let wkb = make_wkb("GEOMETRYCOLLECTION (POINT ZM (1 2 3 4))"); - let header = WkbHeader::new(&wkb).unwrap(); + let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!(header.dimensions().unwrap(), Dimensions::Xyzm); } @@ -324,40 +324,40 @@ mod tests { fn empty_geometry_dimensions() { // POINTs let wkb = make_wkb("POINT EMPTY"); - let header = WkbHeader::new(&wkb).unwrap(); + let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!(header.dimensions().unwrap(), Dimensions::Xy); let wkb = make_wkb("POINT Z EMPTY"); - let header = WkbHeader::new(&wkb).unwrap(); + let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!(header.dimensions().unwrap(), Dimensions::Xyz); let wkb = make_wkb("POINT M EMPTY"); - let header = WkbHeader::new(&wkb).unwrap(); + let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!(header.dimensions().unwrap(), Dimensions::Xym); let wkb = make_wkb("POINT ZM EMPTY"); - let header = WkbHeader::new(&wkb).unwrap(); + let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!(header.dimensions().unwrap(), Dimensions::Xyzm); // GEOMETRYCOLLECTIONs let wkb = make_wkb("GEOMETRYCOLLECTION EMPTY"); - let header = WkbHeader::new(&wkb).unwrap(); + let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!(header.dimensions().unwrap(), Dimensions::Xy); let wkb = make_wkb("GEOMETRYCOLLECTION Z EMPTY"); - let header = WkbHeader::new(&wkb).unwrap(); + let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!(header.dimensions().unwrap(), Dimensions::Xyz); let wkb = make_wkb("GEOMETRYCOLLECTION M EMPTY"); - let header = WkbHeader::new(&wkb).unwrap(); + let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!(header.dimensions().unwrap(), Dimensions::Xym); let wkb = make_wkb("GEOMETRYCOLLECTION ZM EMPTY"); - let header = WkbHeader::new(&wkb).unwrap(); + let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!(header.dimensions().unwrap(), Dimensions::Xyzm); let wkb = make_wkb("GEOMETRYCOLLECTION (POINT Z EMPTY)"); - let header = WkbHeader::new(&wkb).unwrap(); + let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!(header.dimensions().unwrap(), Dimensions::Xyz); } } From 1b397fdaa06d9597ca795c83a5d9389a1e944076 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Sat, 4 Oct 2025 21:30:40 -0700 Subject: [PATCH 19/41] Update fixture to be multipoint ((1 2 3)) instead of all zeros --- rust/sedona-testing/src/fixtures.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/rust/sedona-testing/src/fixtures.rs b/rust/sedona-testing/src/fixtures.rs index 7232673e..a5588295 100644 --- a/rust/sedona-testing/src/fixtures.rs +++ b/rust/sedona-testing/src/fixtures.rs @@ -23,7 +23,7 @@ pub const MULTIPOINT_WITH_EMPTY_CHILD_WKB: [u8; 30] = [ 0x00, 0x00, 0x00, 0x00, 0xf8, 0x7f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf8, 0x7f, ]; -/// A well-known binary blob of MULTIPOINT ((0 0 0)) where outer dimension is specified for xy +/// A well-known binary blob of MULTIPOINT ((1 2 3)) where outer dimension is specified for xy /// while inner point's dimension is actually xyz pub const MULTIPOINT_WITH_INFERRED_Z_DIMENSION_WKB: [u8; 38] = [ 0x01, // byte-order @@ -32,7 +32,7 @@ pub const MULTIPOINT_WITH_INFERRED_Z_DIMENSION_WKB: [u8; 38] = [ // nested point geom 0x01, // byte-order 0xe9, 0x03, 0x00, 0x00, // point with xyz-dimension specified - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // x-coordinate of point - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // y-coordinate of point - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // z-coordinate of point + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x3f, // x-coordinate of point + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, // y-coordinate of point + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x40, // z-coordinate of point ]; From 9ce9f08c4fb670d53ad0a14e2a446c05666d956e Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Mon, 6 Oct 2025 23:22:34 -0700 Subject: [PATCH 20/41] Implement refactor --- rust/sedona-geometry/src/wkb_header.rs | 495 +++++++++++++++++++++---- 1 file changed, 415 insertions(+), 80 deletions(-) diff --git a/rust/sedona-geometry/src/wkb_header.rs b/rust/sedona-geometry/src/wkb_header.rs index 12f5b653..614afa39 100644 --- a/rust/sedona-geometry/src/wkb_header.rs +++ b/rust/sedona-geometry/src/wkb_header.rs @@ -16,25 +16,117 @@ // under the License. use crate::types::GeometryTypeId; -use datafusion_common::error::{DataFusionError, Result}; +use datafusion_common::{ + error::{DataFusionError, Result}, + exec_err, +}; use geo_traits::Dimensions; use sedona_common::sedona_internal_err; +const SRID_FLAG_BIT: u32 = 0x20000000; + /// Fast-path WKB header parser /// Performs operations lazily and caches them after the first computation -pub struct WkbHeader<'a> { - buf: &'a [u8], - geometry_type_id: Option, - dimensions: Option, +// pub struct WkbHeader<'a> { +pub struct WkbHeader { + geometry_type: u32, + // Not applicable for a point + // number of points for a linestring + // number of rings for a polygon + // number of geometries for a MULTIPOINT, MULTILINESTRING, MULTIPOLYGON, or GEOMETRYCOLLECTION + size: u32, + // SRID if given buffer was EWKB. Otherwise, 0. + srid: u32, + // First x,y coordinates for a point. Otherwise (f64::NAN, f64::NAN) if empty + first_xy: (f64, f64), + // Dimensions of the geometry: xy, xyz, xym, or xyzm + // dimensions: Dimensions, + first_coord_dimensions: Option, + // buf: &'a [u8], + // geometry_type_id: Option, + // dimensions: Dimensions, } -impl<'a> WkbHeader<'a> { +// impl<'a> WkbHeader<'a> { +impl WkbHeader { /// Creates a new [WkbHeader] from a buffer - pub fn try_new(buf: &'a [u8]) -> Result { + // pub fn try_new(buf: &'a [u8]) -> Result { + pub fn try_new(buf: &[u8]) -> Result { + if buf.len() < 5 { + return sedona_internal_err!("Invalid WKB: buffer too small -> try_new"); + }; + + let byte_order = buf[0]; + + // Parse geometry type + let geometry_type = match byte_order { + 0 => u32::from_be_bytes([buf[1], buf[2], buf[3], buf[4]]), + 1 => u32::from_le_bytes([buf[1], buf[2], buf[3], buf[4]]), + other => return sedona_internal_err!("Unexpected byte order: {other}"), + }; + + let geometry_type_id = GeometryTypeId::try_from_wkb_id(geometry_type & 0x7) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + + println!("top geometry_type_id: {:?}", geometry_type_id); + + let i; + let srid; + // if EWKB + if geometry_type & SRID_FLAG_BIT != 0 { + panic!("EWKB was detected"); + // srid = match byte_order { + // 0 => u32::from_be_bytes([buf[5], buf[6], buf[7], buf[8]]), + // 1 => u32::from_le_bytes([buf[5], buf[6], buf[7], buf[8]]), + // other => return sedona_internal_err!("Unexpected byte order: {other}"), + // }; + // i = 9; + } else { + srid = 0; + i = 5; + } + + let size; + if geometry_type_id == GeometryTypeId::Point { + // Dummy value for a point + size = 1; + } else { + size = match byte_order { + 0 => u32::from_be_bytes([buf[i + 0], buf[i + 1], buf[i + 2], buf[i + 3]]), + 1 => u32::from_le_bytes([buf[i + 0], buf[i + 1], buf[i + 2], buf[i + 3]]), + other => return sedona_internal_err!("Unexpected byte order: {other}"), + }; + } + + // Default values for empty geometries + let first_x; + let first_y; + // TODO: rename to first_geom_dimensions + let first_coord_dimensions: Option; + println!("top level buf len: {}", buf.len()); + + let first_geom_idx = first_geom_idx(buf)?; + if let Some(i) = first_geom_idx { + println!("first_geom_idx: {}", i); + first_coord_dimensions = Some(parse_dimensions(&buf[i..])?); + println!("first_coord_dimensions: {:?}", first_coord_dimensions); + (first_x, first_y) = first_xy(&buf[i..])?; + println!("first_x: {}, first_y: {}", first_x, first_y); + } else { + first_coord_dimensions = None; + first_x = f64::NAN; + first_y = f64::NAN; + } + println!(""); + Ok(Self { - buf, - geometry_type_id: None, - dimensions: None, + // buf, + // geometry_type_id: None, + geometry_type, + srid, + size, + first_xy: (first_x, first_y), + first_coord_dimensions, }) } @@ -48,101 +140,342 @@ impl<'a> WkbHeader<'a> { /// 7 -> GeometryCollection /// /// Spec: https://libgeos.org/specifications/wkb/ - pub fn geometry_type_id(mut self) -> Result { - if self.geometry_type_id.is_none() { - self.geometry_type_id = Some(parse_geometry_type_id(self.buf)?); - } - self.geometry_type_id.ok_or_else(|| { - DataFusionError::External("Unexpected internal state in WkbHeader".into()) - }) + pub fn geometry_type_id(self) -> Result { + // Only low 3 bits is for the base type, high bits include additional info + let code = self.geometry_type & 0x7; + + let geometry_type_id = GeometryTypeId::try_from_wkb_id(code) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + + Ok(geometry_type_id) } - /// Returns the dimension of the WKB by only parsing what's minimally necessary instead of the entire WKB - pub fn dimensions(mut self) -> Result { - // Calculate the dimension if we haven't already - if self.dimensions.is_none() { - self.dimensions = Some(parse_dimensions(self.buf)?); - } - self.dimensions.ok_or_else(|| { - DataFusionError::External("Unexpected internal state in WkbHeader".into()) - }) + // Not applicable for a point + // Number of points for a linestring + // Number of rings for a polygon + // Number of geometries for a MULTIPOINT, MULTILINESTRING, MULTIPOLYGON, or GEOMETRYCOLLECTION + pub fn size(self) -> u32 { + self.size + } + + // SRID if given buffer was EWKB. Otherwise, 0. + pub fn srid(self) -> u32 { + self.srid + } + + pub fn first_xy(self) -> (f64, f64) { + self.first_xy + } + + /// Returns the top-level dimension of the WKB + pub fn dimensions(self) -> Result { + let dimensions = match self.geometry_type / 1000 { + 0 => Dimensions::Xy, + 1 => Dimensions::Xyz, + 2 => Dimensions::Xym, + 3 => Dimensions::Xyzm, + _ => sedona_internal_err!("Unexpected code: {}", self.geometry_type)?, + }; + Ok(dimensions) + + // TODO: move this to st_haszm + // // 0000 -> xy + // // 1000 -> xyz + // // 2000 -> xym + // // 3000 -> xyzm + // let top_level_dimension = match self.geometry_type / 1000 { + // 0 => Dimensions::Xy, + // 1 => Dimensions::Xyz, + // 2 => Dimensions::Xym, + // 3 => Dimensions::Xyzm, + // _ => sedona_internal_err!("Unexpected code: {}", self.geometry_type)?, + // }; + + // // Infer dimension based on first coordinate dimension for cases where it differs from top-level + // // e.g GEOMETRYCOLLECTION (POINT Z (1 2 3)) + // if let Some(first_coord_dimensions) = self.first_coord_dimensions { + // return Ok(first_coord_dimensions); + // } + + // Ok(top_level_dimension) + } + + pub fn first_coord_dimensions(&self) -> Option { + self.first_coord_dimensions } } -fn parse_dimensions(buf: &[u8]) -> Result { +// For MULITPOINT, MULTILINESTRING, MULTIPOLYGON, or GEOMETRYCOLLECTION, returns the index to the first nested +// non-collection geometry (POINT, LINESTRING, or POLYGON), or None if empty +// For POINT, LINESTRING, POLYGON, returns 0 as it already is a non-collection geometry +fn first_geom_idx(buf: &[u8]) -> Result> { if buf.len() < 5 { - return sedona_internal_err!("Invalid WKB: buffer too small ({} bytes)", buf.len()); + return exec_err!("Invalid WKB: buffer too small -> first_geom_idx"); } let byte_order = buf[0]; - - let code = match byte_order { + let geometry_type = match byte_order { 0 => u32::from_be_bytes([buf[1], buf[2], buf[3], buf[4]]), 1 => u32::from_le_bytes([buf[1], buf[2], buf[3], buf[4]]), other => return sedona_internal_err!("Unexpected byte order: {other}"), }; + let geometry_type_id = GeometryTypeId::try_from_wkb_id(geometry_type & 0x7) + .map_err(|e| DataFusionError::External(Box::new(e)))?; - // 0000 -> xy or unspecified - // 1000 -> xyz - // 2000 -> xym - // 3000 -> xyzm - match code / 1000 { - // If xy, it's possible we need to infer the dimension - 0 => {} - 1 => return Ok(Dimensions::Xyz), - 2 => return Ok(Dimensions::Xym), - 3 => return Ok(Dimensions::Xyzm), - _ => return sedona_internal_err!("Unexpected code: {code}"), + println!("first_geom_idx: geometry_type_id: {:?}", geometry_type_id); + + match geometry_type_id { + GeometryTypeId::Point | GeometryTypeId::LineString | GeometryTypeId::Polygon => { + return Ok(Some(0)) + } + GeometryTypeId::MultiPoint + | GeometryTypeId::MultiLineString + | GeometryTypeId::MultiPolygon + | GeometryTypeId::GeometryCollection => { + if buf.len() < 9 { + return exec_err!("Invalid WKB: buffer too small"); + } + let num_geometries = match byte_order { + 0 => u32::from_be_bytes([buf[5], buf[6], buf[7], buf[8]]), + 1 => u32::from_le_bytes([buf[5], buf[6], buf[7], buf[8]]), + other => return sedona_internal_err!("Unexpected byte order: {other}"), + }; + + if num_geometries == 0 { + return Ok(None); + } + + let mut i = 9; + if geometry_type & SRID_FLAG_BIT != 0 { + i += 4; + panic!("EWKB was detected"); + } + + // Recursive call to get the first geom of the first nested geometry + // Add to current offset of i + let off = first_geom_idx(&buf[i..]); + if let Ok(Some(off)) = off { + return Ok(Some(i + off)); + } else { + return Ok(None); + } + } + _ => return sedona_internal_err!("Unexpected geometry type: {geometry_type_id:?}"), + } +} + +// Given a point, linestring, or polygon, return the first xy coordinate +// If the geometry, is empty, (NaN, NaN) is returned +fn first_xy(buf: &[u8]) -> Result<(f64, f64)> { + if buf.len() < 5 { + return exec_err!("Invalid WKB: buffer too small -> first_xy"); + } + + let byte_order = buf[0]; + let geometry_type = match byte_order { + 0 => u32::from_be_bytes([buf[1], buf[2], buf[3], buf[4]]), + 1 => u32::from_le_bytes([buf[1], buf[2], buf[3], buf[4]]), + other => return sedona_internal_err!("Unexpected byte order: {other}"), }; - // Try to infer dimension - // If geometry is a collection (MULTIPOINT, ... GEOMETRYCOLLECTION, code 4-7), we need to check the dimension of the first geometry - if code & 0x7 >= 4 { - // The next 4 bytes are the number of geometries in the collection - let num_geometries = match byte_order { - 0 => u32::from_be_bytes([buf[5], buf[6], buf[7], buf[8]]), - 1 => u32::from_le_bytes([buf[5], buf[6], buf[7], buf[8]]), + let geometry_type_id = GeometryTypeId::try_from_wkb_id(geometry_type & 0x7) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + println!("first_xy: geometry_type_id: {:?}", geometry_type_id); + + // 1 (byte_order) + 4 (geometry_type) = 5 + let mut i = 5; + + // Skip the SRID if it's present + if geometry_type & SRID_FLAG_BIT != 0 { + i += 4; + panic!("EWKB was detected"); + } + + if matches!( + geometry_type_id, + GeometryTypeId::LineString | GeometryTypeId::Polygon + ) { + if buf.len() < i + 4 { + return exec_err!( + "Invalid WKB: buffer too small -> first_xy3 {} is not < {}", + buf.len(), + i + 4 + ); + } + let size = match byte_order { + 0 => u32::from_be_bytes([buf[i + 0], buf[i + 1], buf[i + 2], buf[i + 3]]), + 1 => u32::from_le_bytes([buf[i + 0], buf[i + 1], buf[i + 2], buf[i + 3]]), other => return sedona_internal_err!("Unexpected byte order: {other}"), }; - // Check the dimension of the first geometry since they all have to be the same dimension - // Note: Attempting to create the following geometries error and are thus not possible to create: - // - Nested geometry dimension doesn't match the **specified** geom collection z-dimension - // - GEOMETRYCOLLECTION M (POINT Z (1 1 1)) - // - Nested geometry doesn't have the specified dimension - // - GEOMETRYCOLLECTION Z (POINT (1 1)) - // - Nested geometries have different dimensions - // - GEOMETRYCOLLECTION (POINT Z (1 1 1), POINT (1 1)) - if num_geometries >= 1 { - return parse_dimensions(&buf[9..]); + + // (NaN, NaN) for empty geometries + if size == 0 { + return Ok((f64::NAN, f64::NAN)); } - // If empty geometry (num_geometries == 0), fallback to below logic to check the geom collection's dimension - // GEOMETRY COLLECTION Z EMPTY hasz -> true + // + 4 for size + i += 4; } - Ok(Dimensions::Xy) + if buf.len() < i + 8 { + return exec_err!( + "Invalid WKB: buffer too small -> first_xy4 {} is not < {}", + i + 8, + buf.len() + ); + } + let x = parse_coord(&buf[i + 0..], byte_order)?; + let y = parse_coord(&buf[i + 8..], byte_order)?; + return Ok((x, y)); + + // 9 + 8 (x) + 8 (y) + // if buf.len() < 9 + 8 + 8 { + // return sedona_internal_err!("Invalid WKB: buffer too small ({} bytes)", buf.len()); + // } + + // let x = parse_coord(&buf[9..], byte_order)?; + // let y = parse_coord(&buf[17..], byte_order)?; + // return Ok((x, y)); + + // 1 (byte_order) + 4 (geometry_type) + 4 (size) = 9 + // let i = 9; + // let srid_offset = 0; + + // if matches!( + // geometry_type_id, + // GeometryTypeId::LineString | GeometryTypeId::Polygon + // ) { + // // i is index of the first coordinate + // return Ok(parse_xy(&buf[i..], byte_order)?); + // } else if matches!( + // geometry_type_id, + // GeometryTypeId::MultiPoint + // | GeometryTypeId::MultiLineString + // | GeometryTypeId::MultiPolygon + // | GeometryTypeId::GeometryCollection + // ) { + // // i is the index of the first nested geometry + // let first_nested_geom_buf = &buf[i..]; + // // Recursive call to get the first xy coord of the first nested geometry + // return Ok(first_xy(&first_nested_geom_buf, geometry_type_id)?); + // } else { + // return sedona_internal_err!("Unexpected geometry type: {geometry_type_id:?}"); + // } + + // match geometry_type_id { + // // 1 (byte_order) + 4 (geometry_type) = 5 (no size) + // GeometryTypeId::Point => return Ok(parse_xy(&buf[5..], byte_order)?), + // // 1 (byte_order) + 4 (geometry_type) + 4 (size) = 9 + // GeometryTypeId::LineString | GeometryTypeId::Polygon => return Ok(parse_xy(&buf[9..], byte_order)?), + // // 1 (byte_order) + 4 (geometry_type) + 4 (size) = 9 + // GeometryTypeId::MultiPoint + // | GeometryTypeId::MultiLineString + // | GeometryTypeId::MultiPolygon + // | GeometryTypeId::GeometryCollection => { + // let size = match byte_order { + // 0 => u32::from_be_bytes([buf[5], buf[6], buf[7], buf[8]]), + // 1 => u32::from_le_bytes([buf[5], buf[6], buf[7], buf[8]]), + // other => return sedona_internal_err!("Unexpected byte order: {other}"), + // }; + // // (NaN, NaN) for empty geometries + // if size == 0 { + // return Ok((f64::NAN, f64::NAN)); + // } else { + // let first_nested_geom_buf = &buf[9..]; + // // Recursive call to get the first xy coord of the first nested geometry + // return Ok(first_xy(&first_nested_geom_buf, geometry_type_id)?); + // } + // }, + // _ => return sedona_internal_err!("Unexpected geometry type: {geometry_type_id:?}"), + // } } -fn parse_geometry_type_id(buf: &[u8]) -> Result { - if buf.len() < 5 { - return sedona_internal_err!("Invalid WKB: buffer too small"); +// Given a buffer starting at the coordinate itself, parse the x and y coordinates +fn parse_coord(buf: &[u8], byte_order: u8) -> Result { + if buf.len() < 8 { + return sedona_internal_err!("Invalid WKB: buffer too small -> parse_coord"); + } + + let coord: f64 = match byte_order { + 0 => f64::from_be_bytes([ + buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7], + ]), + 1 => f64::from_le_bytes([ + buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7], + ]), + other => return sedona_internal_err!("Unexpected byte order: {other}")?, }; + Ok(coord) +} + +// Parses the top-level dimension of the geometry +fn parse_dimensions(buf: &[u8]) -> Result { + if buf.len() < 9 { + return sedona_internal_err!("Invalid WKB: buffer too small -> parse_dimensions"); + } + let byte_order = buf[0]; - // Parse geometry type let code = match byte_order { 0 => u32::from_be_bytes([buf[1], buf[2], buf[3], buf[4]]), 1 => u32::from_le_bytes([buf[1], buf[2], buf[3], buf[4]]), - other => return sedona_internal_err!("Unexpected byte order: {other}"), + other => sedona_internal_err!("Unexpected byte order: {other}")?, }; - // Only low 3 bits is for the base type, high bits include additional info - let code = code & 0x7; - - let geometry_type_id = GeometryTypeId::try_from_wkb_id(code) - .map_err(|e| DataFusionError::External(Box::new(e)))?; + match code / 1000 { + 0 => Ok(Dimensions::Xy), + 1 => Ok(Dimensions::Xyz), + 2 => Ok(Dimensions::Xym), + 3 => Ok(Dimensions::Xyzm), + _ => sedona_internal_err!("Unexpected code: {code:?}"), + } - Ok(geometry_type_id) + // if buf.len() < 5 { + // return sedona_internal_err!("Invalid WKB: buffer too small ({} bytes)", buf.len()); + // } + + // let byte_order = buf[0]; + + // let code = match byte_order { + // 0 => u32::from_be_bytes([buf[1], buf[2], buf[3], buf[4]]), + // 1 => u32::from_le_bytes([buf[1], buf[2], buf[3], buf[4]]), + // other => return sedona_internal_err!("Unexpected byte order: {other}"), + // }; + + // let geometry_type_id = GeometryTypeId::try_from_wkb_id(code) + // .map_err(|e| DataFusionError::External(Box::new(e)))?; + + // match geometry_type_id { + // GeometryTypeId::Point | GeometryTypeId::LineString | GeometryTypeId::Polygon => { + // // 0000 -> xy or unspecified + // // 1000 -> xyz + // // 2000 -> xym + // // 3000 -> xyzm + // match code / 1000 { + // // If xy, it's possible we need to infer the dimension + // 0 => return Ok(None), + // 1 => return Ok(Some(Dimensions::Xyz)), + // 2 => return Ok(Some(Dimensions::Xym)), + // 3 => return Ok(Some(Dimensions::Xyzm)), + // _ => return sedona_internal_err!("Unexpected code: {code}"), + // }; + // } + // GeometryTypeId::MultiPoint | GeometryTypeId::MultiLineString | GeometryTypeId::MultiPolygon | GeometryTypeId::GeometryCollection => { + // // If nested geometry, then recursive call + // let num_geometries = match byte_order { + // 0 => u32::from_be_bytes([buf[5], buf[6], buf[7], buf[8]]), + // 1 => u32::from_le_bytes([buf[5], buf[6], buf[7], buf[8]]), + // other => return sedona_internal_err!("Unexpected byte order: {other}"), + // }; + + // if num_geometries == 0 { + // return Ok(None); + // } + // // Recursive call to get the dimension of the first nested geometry + // return parse_dimensions(&buf[9..]); + // } + // _ => return sedona_internal_err!("Unexpected geometry type: {geometry_type_id:?}"), + // } } #[cfg(test)] @@ -302,22 +635,24 @@ mod tests { } #[test] - fn inferred_collections_dimensions() { + fn first_coord_dimensions() { + // Top-level dimension is xy, while nested geometry is xyz let wkb = make_wkb("GEOMETRYCOLLECTION (POINT Z (1 2 3))"); let header = WkbHeader::try_new(&wkb).unwrap(); - assert_eq!(header.dimensions().unwrap(), Dimensions::Xyz); + assert_eq!(header.first_coord_dimensions().unwrap(), Dimensions::Xyz); + assert_eq!(header.dimensions().unwrap(), Dimensions::Xy); let wkb = make_wkb("GEOMETRYCOLLECTION (POINT ZM (1 2 3 4))"); let header = WkbHeader::try_new(&wkb).unwrap(); - assert_eq!(header.dimensions().unwrap(), Dimensions::Xyzm); + assert_eq!(header.first_coord_dimensions().unwrap(), Dimensions::Xyzm); let wkb = make_wkb("GEOMETRYCOLLECTION (POINT M (1 2 3))"); let header = WkbHeader::try_new(&wkb).unwrap(); - assert_eq!(header.dimensions().unwrap(), Dimensions::Xym); + assert_eq!(header.first_coord_dimensions().unwrap(), Dimensions::Xym); let wkb = make_wkb("GEOMETRYCOLLECTION (POINT ZM (1 2 3 4))"); let header = WkbHeader::try_new(&wkb).unwrap(); - assert_eq!(header.dimensions().unwrap(), Dimensions::Xyzm); + assert_eq!(header.first_coord_dimensions().unwrap(), Dimensions::Xyzm); } #[test] @@ -356,8 +691,8 @@ mod tests { let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!(header.dimensions().unwrap(), Dimensions::Xyzm); - let wkb = make_wkb("GEOMETRYCOLLECTION (POINT Z EMPTY)"); - let header = WkbHeader::try_new(&wkb).unwrap(); - assert_eq!(header.dimensions().unwrap(), Dimensions::Xyz); + // let wkb = make_wkb("GEOMETRYCOLLECTION (POINT Z EMPTY)"); + // let header = WkbHeader::try_new(&wkb).unwrap(); + // assert_eq!(header.dimensions().unwrap(), Dimensions::Xyz); } } From 617f9b8a26b58c110f51b63a9a5649e4bf8a7b23 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Mon, 6 Oct 2025 23:23:14 -0700 Subject: [PATCH 21/41] Remove inferred dimension case --- rust/sedona-geometry/src/wkb_header.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/rust/sedona-geometry/src/wkb_header.rs b/rust/sedona-geometry/src/wkb_header.rs index 614afa39..5fa08b2a 100644 --- a/rust/sedona-geometry/src/wkb_header.rs +++ b/rust/sedona-geometry/src/wkb_header.rs @@ -690,9 +690,5 @@ mod tests { let wkb = make_wkb("GEOMETRYCOLLECTION ZM EMPTY"); let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!(header.dimensions().unwrap(), Dimensions::Xyzm); - - // let wkb = make_wkb("GEOMETRYCOLLECTION (POINT Z EMPTY)"); - // let header = WkbHeader::try_new(&wkb).unwrap(); - // assert_eq!(header.dimensions().unwrap(), Dimensions::Xyz); } } From 96311fa8ef5f071dc844d8a9bb58a27d99dfcfcf Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Mon, 6 Oct 2025 23:29:56 -0700 Subject: [PATCH 22/41] Move logic to st_haszm --- rust/sedona-functions/src/st_haszm.rs | 16 +++++++++++++--- rust/sedona-geometry/src/wkb_header.rs | 23 +---------------------- 2 files changed, 14 insertions(+), 25 deletions(-) diff --git a/rust/sedona-functions/src/st_haszm.rs b/rust/sedona-functions/src/st_haszm.rs index 459719c3..c60373e5 100644 --- a/rust/sedona-functions/src/st_haszm.rs +++ b/rust/sedona-functions/src/st_haszm.rs @@ -24,6 +24,7 @@ use datafusion_expr::{ scalar_doc_sections::DOC_SECTION_OTHER, ColumnarValue, Documentation, Volatility, }; use geo_traits::Dimensions; +use sedona_common::sedona_internal_err; use sedona_expr::scalar_udf::{SedonaScalarKernel, SedonaScalarUDF}; use sedona_geometry::wkb_header::WkbHeader; use sedona_schema::{datatypes::SedonaType, matchers::ArgMatcher}; @@ -109,17 +110,26 @@ impl SedonaScalarKernel for STHasZm { /// Fast-path inference of geometry type name from raw WKB bytes fn invoke_scalar(buf: &[u8], dim_index: usize) -> Result> { let header = WkbHeader::try_new(buf)?; - let dimension = header.dimensions()?; + let top_level_dimensions = header.dimensions()?; + + // Infer dimension based on first coordinate dimension for cases where it differs from top-level + // e.g GEOMETRYCOLLECTION (POINT Z (1 2 3)) + let dimensions; + if let Some(first_coord_dimensions) = header.first_coord_dimensions() { + dimensions = first_coord_dimensions; + } else { + dimensions = top_level_dimensions; + } if dim_index == 2 { return Ok(Some(matches!( - dimension, + dimensions, Dimensions::Xyz | Dimensions::Xyzm ))); } if dim_index == 3 { return Ok(Some(matches!( - dimension, + dimensions, Dimensions::Xym | Dimensions::Xyzm ))); } diff --git a/rust/sedona-geometry/src/wkb_header.rs b/rust/sedona-geometry/src/wkb_header.rs index 5fa08b2a..9f751b21 100644 --- a/rust/sedona-geometry/src/wkb_header.rs +++ b/rust/sedona-geometry/src/wkb_header.rs @@ -168,7 +168,7 @@ impl WkbHeader { } /// Returns the top-level dimension of the WKB - pub fn dimensions(self) -> Result { + pub fn dimensions(&self) -> Result { let dimensions = match self.geometry_type / 1000 { 0 => Dimensions::Xy, 1 => Dimensions::Xyz, @@ -177,27 +177,6 @@ impl WkbHeader { _ => sedona_internal_err!("Unexpected code: {}", self.geometry_type)?, }; Ok(dimensions) - - // TODO: move this to st_haszm - // // 0000 -> xy - // // 1000 -> xyz - // // 2000 -> xym - // // 3000 -> xyzm - // let top_level_dimension = match self.geometry_type / 1000 { - // 0 => Dimensions::Xy, - // 1 => Dimensions::Xyz, - // 2 => Dimensions::Xym, - // 3 => Dimensions::Xyzm, - // _ => sedona_internal_err!("Unexpected code: {}", self.geometry_type)?, - // }; - - // // Infer dimension based on first coordinate dimension for cases where it differs from top-level - // // e.g GEOMETRYCOLLECTION (POINT Z (1 2 3)) - // if let Some(first_coord_dimensions) = self.first_coord_dimensions { - // return Ok(first_coord_dimensions); - // } - - // Ok(top_level_dimension) } pub fn first_coord_dimensions(&self) -> Option { From d1d4fc8d0f51ed577a5877d4430991926b564f1e Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Mon, 6 Oct 2025 23:40:36 -0700 Subject: [PATCH 23/41] Add empty_geometry_first_coord_dimensions test --- rust/sedona-geometry/src/wkb_header.rs | 106 +++++++++++++++---------- 1 file changed, 62 insertions(+), 44 deletions(-) diff --git a/rust/sedona-geometry/src/wkb_header.rs b/rust/sedona-geometry/src/wkb_header.rs index 9f751b21..88219583 100644 --- a/rust/sedona-geometry/src/wkb_header.rs +++ b/rust/sedona-geometry/src/wkb_header.rs @@ -70,20 +70,16 @@ impl WkbHeader { println!("top geometry_type_id: {:?}", geometry_type_id); - let i; - let srid; + let mut i = 5; + let mut srid = 0; // if EWKB if geometry_type & SRID_FLAG_BIT != 0 { - panic!("EWKB was detected"); - // srid = match byte_order { - // 0 => u32::from_be_bytes([buf[5], buf[6], buf[7], buf[8]]), - // 1 => u32::from_le_bytes([buf[5], buf[6], buf[7], buf[8]]), - // other => return sedona_internal_err!("Unexpected byte order: {other}"), - // }; - // i = 9; - } else { - srid = 0; - i = 5; + srid = match byte_order { + 0 => u32::from_be_bytes([buf[5], buf[6], buf[7], buf[8]]), + 1 => u32::from_le_bytes([buf[5], buf[6], buf[7], buf[8]]), + other => return sedona_internal_err!("Unexpected byte order: {other}"), + }; + i = 9; } let size; @@ -120,8 +116,6 @@ impl WkbHeader { println!(""); Ok(Self { - // buf, - // geometry_type_id: None, geometry_type, srid, size, @@ -150,19 +144,21 @@ impl WkbHeader { Ok(geometry_type_id) } - // Not applicable for a point - // Number of points for a linestring - // Number of rings for a polygon - // Number of geometries for a MULTIPOINT, MULTILINESTRING, MULTIPOLYGON, or GEOMETRYCOLLECTION + /// Returns the size of the geometry + /// Not applicable for a point + /// Number of points for a linestring + /// Number of rings for a polygon + /// Number of geometries for a MULTIPOINT, MULTILINESTRING, MULTIPOLYGON, or GEOMETRYCOLLECTION pub fn size(self) -> u32 { self.size } - // SRID if given buffer was EWKB. Otherwise, 0. + /// Returns the SRID if given buffer was EWKB. Otherwise, 0. pub fn srid(self) -> u32 { self.srid } + /// Returns the first x, y coordinates for a point. Otherwise (f64::NAN, f64::NAN) if empty pub fn first_xy(self) -> (f64, f64) { self.first_xy } @@ -179,6 +175,7 @@ impl WkbHeader { Ok(dimensions) } + /// Returns the dimensions of the first coordinate of the geometry pub fn first_coord_dimensions(&self) -> Option { self.first_coord_dimensions } @@ -201,8 +198,6 @@ fn first_geom_idx(buf: &[u8]) -> Result> { let geometry_type_id = GeometryTypeId::try_from_wkb_id(geometry_type & 0x7) .map_err(|e| DataFusionError::External(Box::new(e)))?; - println!("first_geom_idx: geometry_type_id: {:?}", geometry_type_id); - match geometry_type_id { GeometryTypeId::Point | GeometryTypeId::LineString | GeometryTypeId::Polygon => { return Ok(Some(0)) @@ -227,7 +222,6 @@ fn first_geom_idx(buf: &[u8]) -> Result> { let mut i = 9; if geometry_type & SRID_FLAG_BIT != 0 { i += 4; - panic!("EWKB was detected"); } // Recursive call to get the first geom of the first nested geometry @@ -267,7 +261,7 @@ fn first_xy(buf: &[u8]) -> Result<(f64, f64)> { // Skip the SRID if it's present if geometry_type & SRID_FLAG_BIT != 0 { i += 4; - panic!("EWKB was detected"); + // panic!("EWKB was detected"); } if matches!( @@ -613,27 +607,6 @@ mod tests { assert_eq!(header.dimensions().unwrap(), Dimensions::Xyzm); } - #[test] - fn first_coord_dimensions() { - // Top-level dimension is xy, while nested geometry is xyz - let wkb = make_wkb("GEOMETRYCOLLECTION (POINT Z (1 2 3))"); - let header = WkbHeader::try_new(&wkb).unwrap(); - assert_eq!(header.first_coord_dimensions().unwrap(), Dimensions::Xyz); - assert_eq!(header.dimensions().unwrap(), Dimensions::Xy); - - let wkb = make_wkb("GEOMETRYCOLLECTION (POINT ZM (1 2 3 4))"); - let header = WkbHeader::try_new(&wkb).unwrap(); - assert_eq!(header.first_coord_dimensions().unwrap(), Dimensions::Xyzm); - - let wkb = make_wkb("GEOMETRYCOLLECTION (POINT M (1 2 3))"); - let header = WkbHeader::try_new(&wkb).unwrap(); - assert_eq!(header.first_coord_dimensions().unwrap(), Dimensions::Xym); - - let wkb = make_wkb("GEOMETRYCOLLECTION (POINT ZM (1 2 3 4))"); - let header = WkbHeader::try_new(&wkb).unwrap(); - assert_eq!(header.first_coord_dimensions().unwrap(), Dimensions::Xyzm); - } - #[test] fn empty_geometry_dimensions() { // POINTs @@ -670,4 +643,49 @@ mod tests { let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!(header.dimensions().unwrap(), Dimensions::Xyzm); } + + #[test] + fn first_coord_dimensions() { + // Top-level dimension is xy, while nested geometry is xyz + let wkb = make_wkb("GEOMETRYCOLLECTION (POINT Z (1 2 3))"); + let header = WkbHeader::try_new(&wkb).unwrap(); + assert_eq!(header.first_coord_dimensions().unwrap(), Dimensions::Xyz); + assert_eq!(header.dimensions().unwrap(), Dimensions::Xy); + + let wkb = make_wkb("GEOMETRYCOLLECTION (POINT ZM (1 2 3 4))"); + let header = WkbHeader::try_new(&wkb).unwrap(); + assert_eq!(header.first_coord_dimensions().unwrap(), Dimensions::Xyzm); + + let wkb = make_wkb("GEOMETRYCOLLECTION (POINT M (1 2 3))"); + let header = WkbHeader::try_new(&wkb).unwrap(); + assert_eq!(header.first_coord_dimensions().unwrap(), Dimensions::Xym); + + let wkb = make_wkb("GEOMETRYCOLLECTION (POINT ZM (1 2 3 4))"); + let header = WkbHeader::try_new(&wkb).unwrap(); + assert_eq!(header.first_coord_dimensions().unwrap(), Dimensions::Xyzm); + } + + #[test] + fn empty_geometry_first_coord_dimensions() { + // Should return None for all of these since there are no coordinates + let wkb = make_wkb("POINT EMPTY"); + let header = WkbHeader::try_new(&wkb).unwrap(); + assert_eq!(header.first_coord_dimensions(), None); + + let wkb = make_wkb("LINESTRING EMPTY"); + let header = WkbHeader::try_new(&wkb).unwrap(); + assert_eq!(header.first_coord_dimensions(), None); + + let wkb = make_wkb("POLYGON EMPTY"); + let header = WkbHeader::try_new(&wkb).unwrap(); + assert_eq!(header.first_coord_dimensions(), None); + + let wkb = make_wkb("GEOMETRYCOLLECTION EMPTY"); + let header = WkbHeader::try_new(&wkb).unwrap(); + assert_eq!(header.first_coord_dimensions(), None); + + let wkb = make_wkb("GEOMETRYCOLLECTION Z EMPTY"); + let header = WkbHeader::try_new(&wkb).unwrap(); + assert_eq!(header.first_coord_dimensions(), None); + } } From 573f7c8ce52c1459b523be6fa112096a87239f58 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Mon, 6 Oct 2025 23:47:38 -0700 Subject: [PATCH 24/41] Add test for size --- rust/sedona-geometry/src/wkb_header.rs | 77 ++++++++++++++++++++++++-- 1 file changed, 73 insertions(+), 4 deletions(-) diff --git a/rust/sedona-geometry/src/wkb_header.rs b/rust/sedona-geometry/src/wkb_header.rs index 88219583..8df0d1c9 100644 --- a/rust/sedona-geometry/src/wkb_header.rs +++ b/rust/sedona-geometry/src/wkb_header.rs @@ -134,7 +134,7 @@ impl WkbHeader { /// 7 -> GeometryCollection /// /// Spec: https://libgeos.org/specifications/wkb/ - pub fn geometry_type_id(self) -> Result { + pub fn geometry_type_id(&self) -> Result { // Only low 3 bits is for the base type, high bits include additional info let code = self.geometry_type & 0x7; @@ -149,17 +149,17 @@ impl WkbHeader { /// Number of points for a linestring /// Number of rings for a polygon /// Number of geometries for a MULTIPOINT, MULTILINESTRING, MULTIPOLYGON, or GEOMETRYCOLLECTION - pub fn size(self) -> u32 { + pub fn size(&self) -> u32 { self.size } /// Returns the SRID if given buffer was EWKB. Otherwise, 0. - pub fn srid(self) -> u32 { + pub fn srid(&self) -> u32 { self.srid } /// Returns the first x, y coordinates for a point. Otherwise (f64::NAN, f64::NAN) if empty - pub fn first_xy(self) -> (f64, f64) { + pub fn first_xy(&self) -> (f64, f64) { self.first_xy } @@ -526,6 +526,75 @@ mod tests { assert_eq!(header.geometry_type_id().unwrap(), GeometryTypeId::Polygon); } + #[test] + fn size() { + let wkb = make_wkb("LINESTRING (1 2, 3 4)"); + let header = WkbHeader::try_new(&wkb).unwrap(); + assert_eq!(header.size(), 2); + + let wkb = make_wkb("POLYGON ((0 0, 0 1, 1 0, 0 0), (1 1, 1 2, 2 1, 1 1))"); + let header = WkbHeader::try_new(&wkb).unwrap(); + assert_eq!(header.size(), 2); + + let wkb = make_wkb("MULTIPOINT ((1 2), (3 4))"); + let header = WkbHeader::try_new(&wkb).unwrap(); + assert_eq!(header.size(), 2); + + let wkb = make_wkb("MULTILINESTRING ((1 2, 3 4, 5 6), (7 8, 9 10, 11 12))"); + let header = WkbHeader::try_new(&wkb).unwrap(); + assert_eq!(header.size(), 2); + + let wkb = make_wkb("MULTIPOLYGON (((0 0, 0 1, 1 0, 0 0)), ((1 1, 1 2, 2 1, 1 1)))"); + let header = WkbHeader::try_new(&wkb).unwrap(); + assert_eq!(header.size(), 2); + + let wkb = make_wkb("GEOMETRYCOLLECTION (POINT (1 2))"); + let header = WkbHeader::try_new(&wkb).unwrap(); + assert_eq!(header.size(), 1); + + let wkb = make_wkb("GEOMETRYCOLLECTION (POINT (1 2), LINESTRING (1 2, 3 4), POLYGON ((0 0, 0 1, 1 0, 0 0)))"); + let header = WkbHeader::try_new(&wkb).unwrap(); + assert_eq!(header.size(), 3); + } + + #[test] + fn empty_size() { + let wkb = make_wkb("LINESTRING EMPTY"); + let header = WkbHeader::try_new(&wkb).unwrap(); + assert_eq!(header.size(), 0); + + let wkb = make_wkb("POLYGON EMPTY"); + let header = WkbHeader::try_new(&wkb).unwrap(); + assert_eq!(header.size(), 0); + + let wkb = make_wkb("MULTIPOINT EMPTY"); + let header = WkbHeader::try_new(&wkb).unwrap(); + assert_eq!(header.size(), 0); + + let wkb = make_wkb("MULTILINESTRING EMPTY"); + let header = WkbHeader::try_new(&wkb).unwrap(); + assert_eq!(header.size(), 0); + + let wkb = make_wkb("MULTIPOLYGON EMPTY"); + let header = WkbHeader::try_new(&wkb).unwrap(); + assert_eq!(header.size(), 0); + + let wkb = make_wkb("GEOMETRYCOLLECTION EMPTY"); + let header = WkbHeader::try_new(&wkb).unwrap(); + assert_eq!(header.size(), 0); + + let wkb = make_wkb("GEOMETRYCOLLECTION Z EMPTY"); + let header = WkbHeader::try_new(&wkb).unwrap(); + assert_eq!(header.size(), 0); + } + + #[test] + fn srid() { + let wkb = make_wkb("SRID=4326;POINT (1 2)"); + let header = WkbHeader::try_new(&wkb).unwrap(); + assert_eq!(header.srid(), 4326); + } + #[test] fn empty_geometry_type_id() { let wkb = make_wkb("POINT EMPTY"); From b5984dc2035783c52272522641e76c7ba0c48204 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Tue, 7 Oct 2025 00:03:59 -0700 Subject: [PATCH 25/41] Add tests --- rust/sedona-geometry/src/wkb_header.rs | 66 ++++++++++++++++++++++++-- 1 file changed, 62 insertions(+), 4 deletions(-) diff --git a/rust/sedona-geometry/src/wkb_header.rs b/rust/sedona-geometry/src/wkb_header.rs index 8df0d1c9..ca813391 100644 --- a/rust/sedona-geometry/src/wkb_header.rs +++ b/rust/sedona-geometry/src/wkb_header.rs @@ -261,7 +261,6 @@ fn first_xy(buf: &[u8]) -> Result<(f64, f64)> { // Skip the SRID if it's present if geometry_type & SRID_FLAG_BIT != 0 { i += 4; - // panic!("EWKB was detected"); } if matches!( @@ -588,11 +587,70 @@ mod tests { assert_eq!(header.size(), 0); } + // #[test] + // fn srid() { + // let wkb = make_wkb("SRID=4326;POINT (1 2)"); + // println!("wkb: {:?}", wkb); + // let header = WkbHeader::try_new(&wkb).unwrap(); + // assert_eq!(header.srid(), 4326); + // } + + #[test] + fn first_xy() { + let wkb = make_wkb("POINT (-5 -2)"); + let header = WkbHeader::try_new(&wkb).unwrap(); + assert_eq!(header.first_xy(), (-5.0, -2.0)); + + let wkb = make_wkb("LINESTRING (1 2, 3 4)"); + let header = WkbHeader::try_new(&wkb).unwrap(); + assert_eq!(header.first_xy(), (1.0, 2.0)); + + let wkb = make_wkb("POLYGON ((0 0, 0 1, 1 0, 0 0))"); + let header = WkbHeader::try_new(&wkb).unwrap(); + let (x, y) = header.first_xy(); + println!("x: {}, y: {}", x, y); + assert_eq!(header.first_xy(), (0.0, 0.0)); + + let wkb = make_wkb("MULTIPOINT ((1 2), (3 4))"); + let header = WkbHeader::try_new(&wkb).unwrap(); + assert_eq!(header.first_xy(), (1.0, 2.0)); + + let wkb = make_wkb("MULTILINESTRING ((3 4, 1 2), (5 6, 7 8))"); + let header = WkbHeader::try_new(&wkb).unwrap(); + assert_eq!(header.first_xy(), (3.0, 4.0)); + + let wkb = make_wkb("MULTIPOLYGON (((-1 -1, 0 1, 1 -1, -1 -1)))"); + let header = WkbHeader::try_new(&wkb).unwrap(); + assert_eq!(header.first_xy(), (-1.0, -1.0)); + + let wkb = make_wkb("GEOMETRYCOLLECTION (POINT (1 2))"); + let header = WkbHeader::try_new(&wkb).unwrap(); + assert_eq!(header.first_xy(), (1.0, 2.0)); + + let wkb = make_wkb("GEOMETRYCOLLECTION (POINT (1 2), LINESTRING (1 2, 3 4), POLYGON ((0 0, 0 1, 1 0, 0 0)))"); + let header = WkbHeader::try_new(&wkb).unwrap(); + assert_eq!(header.first_xy(), (1.0, 2.0)); + } + #[test] - fn srid() { - let wkb = make_wkb("SRID=4326;POINT (1 2)"); + fn empty_first_xy() { + let wkb = make_wkb("POINT EMPTY"); + let header = WkbHeader::try_new(&wkb).unwrap(); + let (x, y) = header.first_xy(); + assert!(x.is_nan()); + assert!(y.is_nan()); + + let wkb = make_wkb("LINESTRING EMPTY"); + let header = WkbHeader::try_new(&wkb).unwrap(); + let (x, y) = header.first_xy(); + assert!(x.is_nan()); + assert!(y.is_nan()); + + let wkb = make_wkb("GEOMETRYCOLLECTION Z EMPTY"); let header = WkbHeader::try_new(&wkb).unwrap(); - assert_eq!(header.srid(), 4326); + let (x, y) = header.first_xy(); + assert!(x.is_nan()); + assert!(y.is_nan()); } #[test] From 8c1d2f08f87d523348cbea8a52507672408a4f78 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Tue, 7 Oct 2025 00:09:28 -0700 Subject: [PATCH 26/41] Implement fix for first_xy POLYGON logic --- rust/sedona-geometry/src/wkb_header.rs | 32 ++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/rust/sedona-geometry/src/wkb_header.rs b/rust/sedona-geometry/src/wkb_header.rs index ca813391..f6fb337f 100644 --- a/rust/sedona-geometry/src/wkb_header.rs +++ b/rust/sedona-geometry/src/wkb_header.rs @@ -286,6 +286,30 @@ fn first_xy(buf: &[u8]) -> Result<(f64, f64)> { } // + 4 for size i += 4; + + // For POLYGON, after the number of rings, the next 4 bytes are the + // number of points in the exterior ring. We must skip that count to + // land on the first coordinate's x value. + if geometry_type_id == GeometryTypeId::Polygon { + if buf.len() < i + 4 { + return exec_err!( + "Invalid WKB: buffer too small -> polygon first ring size {} is not < {}", + buf.len(), + i + 4 + ); + } + let ring0_num_points = match byte_order { + 0 => u32::from_be_bytes([buf[i + 0], buf[i + 1], buf[i + 2], buf[i + 3]]), + 1 => u32::from_le_bytes([buf[i + 0], buf[i + 1], buf[i + 2], buf[i + 3]]), + other => return sedona_internal_err!("Unexpected byte order: {other}"), + }; + + // (NaN, NaN) for empty first ring + if ring0_num_points == 0 { + return Ok((f64::NAN, f64::NAN)); + } + i += 4; + } } if buf.len() < i + 8 { @@ -608,9 +632,13 @@ mod tests { let wkb = make_wkb("POLYGON ((0 0, 0 1, 1 0, 0 0))"); let header = WkbHeader::try_new(&wkb).unwrap(); let (x, y) = header.first_xy(); - println!("x: {}, y: {}", x, y); assert_eq!(header.first_xy(), (0.0, 0.0)); + // Another polygon test since that logic is more complicated + let wkb = make_wkb("POLYGON ((1.5 0.5, 1.5 1.5, 1.5 0.5), (0 0, 0 1, 1 0, 0 0))"); + let header = WkbHeader::try_new(&wkb).unwrap(); + assert_eq!(header.first_xy(), (1.5, 0.5)); + let wkb = make_wkb("MULTIPOINT ((1 2), (3 4))"); let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!(header.first_xy(), (1.0, 2.0)); @@ -619,7 +647,7 @@ mod tests { let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!(header.first_xy(), (3.0, 4.0)); - let wkb = make_wkb("MULTIPOLYGON (((-1 -1, 0 1, 1 -1, -1 -1)))"); + let wkb = make_wkb("MULTIPOLYGON (((-1 -1, 0 1, 1 -1, -1 -1)), ((0 0, 0 1, 1 0, 0 0)))"); let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!(header.first_xy(), (-1.0, -1.0)); From dc49aac65304130266dfff6eab9bac52894d81ee Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Tue, 7 Oct 2025 00:16:37 -0700 Subject: [PATCH 27/41] clean up --- rust/sedona-functions/src/st_haszm.rs | 1 - rust/sedona-geometry/src/wkb_header.rs | 166 +++---------------------- 2 files changed, 20 insertions(+), 147 deletions(-) diff --git a/rust/sedona-functions/src/st_haszm.rs b/rust/sedona-functions/src/st_haszm.rs index c60373e5..3c81a180 100644 --- a/rust/sedona-functions/src/st_haszm.rs +++ b/rust/sedona-functions/src/st_haszm.rs @@ -24,7 +24,6 @@ use datafusion_expr::{ scalar_doc_sections::DOC_SECTION_OTHER, ColumnarValue, Documentation, Volatility, }; use geo_traits::Dimensions; -use sedona_common::sedona_internal_err; use sedona_expr::scalar_udf::{SedonaScalarKernel, SedonaScalarUDF}; use sedona_geometry::wkb_header::WkbHeader; use sedona_schema::{datatypes::SedonaType, matchers::ArgMatcher}; diff --git a/rust/sedona-geometry/src/wkb_header.rs b/rust/sedona-geometry/src/wkb_header.rs index f6fb337f..b8a85524 100644 --- a/rust/sedona-geometry/src/wkb_header.rs +++ b/rust/sedona-geometry/src/wkb_header.rs @@ -27,7 +27,6 @@ const SRID_FLAG_BIT: u32 = 0x20000000; /// Fast-path WKB header parser /// Performs operations lazily and caches them after the first computation -// pub struct WkbHeader<'a> { pub struct WkbHeader { geometry_type: u32, // Not applicable for a point @@ -39,18 +38,12 @@ pub struct WkbHeader { srid: u32, // First x,y coordinates for a point. Otherwise (f64::NAN, f64::NAN) if empty first_xy: (f64, f64), - // Dimensions of the geometry: xy, xyz, xym, or xyzm - // dimensions: Dimensions, + // Dimensions of the coordinate: xy, xyz, xym, or xyzm first_coord_dimensions: Option, - // buf: &'a [u8], - // geometry_type_id: Option, - // dimensions: Dimensions, } -// impl<'a> WkbHeader<'a> { impl WkbHeader { /// Creates a new [WkbHeader] from a buffer - // pub fn try_new(buf: &'a [u8]) -> Result { pub fn try_new(buf: &[u8]) -> Result { if buf.len() < 5 { return sedona_internal_err!("Invalid WKB: buffer too small -> try_new"); @@ -68,8 +61,6 @@ impl WkbHeader { let geometry_type_id = GeometryTypeId::try_from_wkb_id(geometry_type & 0x7) .map_err(|e| DataFusionError::External(Box::new(e)))?; - println!("top geometry_type_id: {:?}", geometry_type_id); - let mut i = 5; let mut srid = 0; // if EWKB @@ -82,38 +73,31 @@ impl WkbHeader { i = 9; } - let size; - if geometry_type_id == GeometryTypeId::Point { + let size = if geometry_type_id == GeometryTypeId::Point { // Dummy value for a point - size = 1; + 1 } else { - size = match byte_order { - 0 => u32::from_be_bytes([buf[i + 0], buf[i + 1], buf[i + 2], buf[i + 3]]), - 1 => u32::from_le_bytes([buf[i + 0], buf[i + 1], buf[i + 2], buf[i + 3]]), + match byte_order { + 0 => u32::from_be_bytes([buf[i], buf[i + 1], buf[i + 2], buf[i + 3]]), + 1 => u32::from_le_bytes([buf[i], buf[i + 1], buf[i + 2], buf[i + 3]]), other => return sedona_internal_err!("Unexpected byte order: {other}"), - }; - } + } + }; // Default values for empty geometries let first_x; let first_y; - // TODO: rename to first_geom_dimensions let first_coord_dimensions: Option; - println!("top level buf len: {}", buf.len()); let first_geom_idx = first_geom_idx(buf)?; if let Some(i) = first_geom_idx { - println!("first_geom_idx: {}", i); first_coord_dimensions = Some(parse_dimensions(&buf[i..])?); - println!("first_coord_dimensions: {:?}", first_coord_dimensions); (first_x, first_y) = first_xy(&buf[i..])?; - println!("first_x: {}, first_y: {}", first_x, first_y); } else { first_coord_dimensions = None; first_x = f64::NAN; first_y = f64::NAN; } - println!(""); Ok(Self { geometry_type, @@ -199,15 +183,13 @@ fn first_geom_idx(buf: &[u8]) -> Result> { .map_err(|e| DataFusionError::External(Box::new(e)))?; match geometry_type_id { - GeometryTypeId::Point | GeometryTypeId::LineString | GeometryTypeId::Polygon => { - return Ok(Some(0)) - } + GeometryTypeId::Point | GeometryTypeId::LineString | GeometryTypeId::Polygon => Ok(Some(0)), GeometryTypeId::MultiPoint | GeometryTypeId::MultiLineString | GeometryTypeId::MultiPolygon | GeometryTypeId::GeometryCollection => { if buf.len() < 9 { - return exec_err!("Invalid WKB: buffer too small"); + exec_err!("Invalid WKB: buffer too small")? } let num_geometries = match byte_order { 0 => u32::from_be_bytes([buf[5], buf[6], buf[7], buf[8]]), @@ -228,12 +210,12 @@ fn first_geom_idx(buf: &[u8]) -> Result> { // Add to current offset of i let off = first_geom_idx(&buf[i..]); if let Ok(Some(off)) = off { - return Ok(Some(i + off)); + Ok(Some(i + off)) } else { - return Ok(None); + Ok(None) } } - _ => return sedona_internal_err!("Unexpected geometry type: {geometry_type_id:?}"), + _ => sedona_internal_err!("Unexpected geometry type: {geometry_type_id:?}"), } } @@ -253,7 +235,6 @@ fn first_xy(buf: &[u8]) -> Result<(f64, f64)> { let geometry_type_id = GeometryTypeId::try_from_wkb_id(geometry_type & 0x7) .map_err(|e| DataFusionError::External(Box::new(e)))?; - println!("first_xy: geometry_type_id: {:?}", geometry_type_id); // 1 (byte_order) + 4 (geometry_type) = 5 let mut i = 5; @@ -275,8 +256,8 @@ fn first_xy(buf: &[u8]) -> Result<(f64, f64)> { ); } let size = match byte_order { - 0 => u32::from_be_bytes([buf[i + 0], buf[i + 1], buf[i + 2], buf[i + 3]]), - 1 => u32::from_le_bytes([buf[i + 0], buf[i + 1], buf[i + 2], buf[i + 3]]), + 0 => u32::from_be_bytes([buf[i], buf[i + 1], buf[i + 2], buf[i + 3]]), + 1 => u32::from_le_bytes([buf[i], buf[i + 1], buf[i + 2], buf[i + 3]]), other => return sedona_internal_err!("Unexpected byte order: {other}"), }; @@ -299,8 +280,8 @@ fn first_xy(buf: &[u8]) -> Result<(f64, f64)> { ); } let ring0_num_points = match byte_order { - 0 => u32::from_be_bytes([buf[i + 0], buf[i + 1], buf[i + 2], buf[i + 3]]), - 1 => u32::from_le_bytes([buf[i + 0], buf[i + 1], buf[i + 2], buf[i + 3]]), + 0 => u32::from_be_bytes([buf[i], buf[i + 1], buf[i + 2], buf[i + 3]]), + 1 => u32::from_le_bytes([buf[i], buf[i + 1], buf[i + 2], buf[i + 3]]), other => return sedona_internal_err!("Unexpected byte order: {other}"), }; @@ -319,70 +300,9 @@ fn first_xy(buf: &[u8]) -> Result<(f64, f64)> { buf.len() ); } - let x = parse_coord(&buf[i + 0..], byte_order)?; + let x = parse_coord(&buf[i..], byte_order)?; let y = parse_coord(&buf[i + 8..], byte_order)?; - return Ok((x, y)); - - // 9 + 8 (x) + 8 (y) - // if buf.len() < 9 + 8 + 8 { - // return sedona_internal_err!("Invalid WKB: buffer too small ({} bytes)", buf.len()); - // } - - // let x = parse_coord(&buf[9..], byte_order)?; - // let y = parse_coord(&buf[17..], byte_order)?; - // return Ok((x, y)); - - // 1 (byte_order) + 4 (geometry_type) + 4 (size) = 9 - // let i = 9; - // let srid_offset = 0; - - // if matches!( - // geometry_type_id, - // GeometryTypeId::LineString | GeometryTypeId::Polygon - // ) { - // // i is index of the first coordinate - // return Ok(parse_xy(&buf[i..], byte_order)?); - // } else if matches!( - // geometry_type_id, - // GeometryTypeId::MultiPoint - // | GeometryTypeId::MultiLineString - // | GeometryTypeId::MultiPolygon - // | GeometryTypeId::GeometryCollection - // ) { - // // i is the index of the first nested geometry - // let first_nested_geom_buf = &buf[i..]; - // // Recursive call to get the first xy coord of the first nested geometry - // return Ok(first_xy(&first_nested_geom_buf, geometry_type_id)?); - // } else { - // return sedona_internal_err!("Unexpected geometry type: {geometry_type_id:?}"); - // } - - // match geometry_type_id { - // // 1 (byte_order) + 4 (geometry_type) = 5 (no size) - // GeometryTypeId::Point => return Ok(parse_xy(&buf[5..], byte_order)?), - // // 1 (byte_order) + 4 (geometry_type) + 4 (size) = 9 - // GeometryTypeId::LineString | GeometryTypeId::Polygon => return Ok(parse_xy(&buf[9..], byte_order)?), - // // 1 (byte_order) + 4 (geometry_type) + 4 (size) = 9 - // GeometryTypeId::MultiPoint - // | GeometryTypeId::MultiLineString - // | GeometryTypeId::MultiPolygon - // | GeometryTypeId::GeometryCollection => { - // let size = match byte_order { - // 0 => u32::from_be_bytes([buf[5], buf[6], buf[7], buf[8]]), - // 1 => u32::from_le_bytes([buf[5], buf[6], buf[7], buf[8]]), - // other => return sedona_internal_err!("Unexpected byte order: {other}"), - // }; - // // (NaN, NaN) for empty geometries - // if size == 0 { - // return Ok((f64::NAN, f64::NAN)); - // } else { - // let first_nested_geom_buf = &buf[9..]; - // // Recursive call to get the first xy coord of the first nested geometry - // return Ok(first_xy(&first_nested_geom_buf, geometry_type_id)?); - // } - // }, - // _ => return sedona_internal_err!("Unexpected geometry type: {geometry_type_id:?}"), - // } + Ok((x, y)) } // Given a buffer starting at the coordinate itself, parse the x and y coordinates @@ -425,53 +345,6 @@ fn parse_dimensions(buf: &[u8]) -> Result { 3 => Ok(Dimensions::Xyzm), _ => sedona_internal_err!("Unexpected code: {code:?}"), } - - // if buf.len() < 5 { - // return sedona_internal_err!("Invalid WKB: buffer too small ({} bytes)", buf.len()); - // } - - // let byte_order = buf[0]; - - // let code = match byte_order { - // 0 => u32::from_be_bytes([buf[1], buf[2], buf[3], buf[4]]), - // 1 => u32::from_le_bytes([buf[1], buf[2], buf[3], buf[4]]), - // other => return sedona_internal_err!("Unexpected byte order: {other}"), - // }; - - // let geometry_type_id = GeometryTypeId::try_from_wkb_id(code) - // .map_err(|e| DataFusionError::External(Box::new(e)))?; - - // match geometry_type_id { - // GeometryTypeId::Point | GeometryTypeId::LineString | GeometryTypeId::Polygon => { - // // 0000 -> xy or unspecified - // // 1000 -> xyz - // // 2000 -> xym - // // 3000 -> xyzm - // match code / 1000 { - // // If xy, it's possible we need to infer the dimension - // 0 => return Ok(None), - // 1 => return Ok(Some(Dimensions::Xyz)), - // 2 => return Ok(Some(Dimensions::Xym)), - // 3 => return Ok(Some(Dimensions::Xyzm)), - // _ => return sedona_internal_err!("Unexpected code: {code}"), - // }; - // } - // GeometryTypeId::MultiPoint | GeometryTypeId::MultiLineString | GeometryTypeId::MultiPolygon | GeometryTypeId::GeometryCollection => { - // // If nested geometry, then recursive call - // let num_geometries = match byte_order { - // 0 => u32::from_be_bytes([buf[5], buf[6], buf[7], buf[8]]), - // 1 => u32::from_le_bytes([buf[5], buf[6], buf[7], buf[8]]), - // other => return sedona_internal_err!("Unexpected byte order: {other}"), - // }; - - // if num_geometries == 0 { - // return Ok(None); - // } - // // Recursive call to get the dimension of the first nested geometry - // return parse_dimensions(&buf[9..]); - // } - // _ => return sedona_internal_err!("Unexpected geometry type: {geometry_type_id:?}"), - // } } #[cfg(test)] @@ -613,6 +486,7 @@ mod tests { // #[test] // fn srid() { + // // This doesn't work // let wkb = make_wkb("SRID=4326;POINT (1 2)"); // println!("wkb: {:?}", wkb); // let header = WkbHeader::try_new(&wkb).unwrap(); From 89ebb4cd91fae57cdb97510403d96e3e8fce4798 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Tue, 7 Oct 2025 00:26:54 -0700 Subject: [PATCH 28/41] Rename from first_coord_dimensions to first_geom_dimensions and adjust expected test results --- rust/sedona-geometry/src/wkb_header.rs | 56 +++++++++++++++----------- 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/rust/sedona-geometry/src/wkb_header.rs b/rust/sedona-geometry/src/wkb_header.rs index b8a85524..8f809323 100644 --- a/rust/sedona-geometry/src/wkb_header.rs +++ b/rust/sedona-geometry/src/wkb_header.rs @@ -38,8 +38,9 @@ pub struct WkbHeader { srid: u32, // First x,y coordinates for a point. Otherwise (f64::NAN, f64::NAN) if empty first_xy: (f64, f64), - // Dimensions of the coordinate: xy, xyz, xym, or xyzm - first_coord_dimensions: Option, + // Dimensions of the first nested geometry of a collection or None if empty + // For POINT, LINESTRING, POLYGON, returns the dimensions of the geometry + first_geom_dimensions: Option, } impl WkbHeader { @@ -87,14 +88,14 @@ impl WkbHeader { // Default values for empty geometries let first_x; let first_y; - let first_coord_dimensions: Option; + let first_geom_dimensions: Option; let first_geom_idx = first_geom_idx(buf)?; if let Some(i) = first_geom_idx { - first_coord_dimensions = Some(parse_dimensions(&buf[i..])?); + first_geom_dimensions = Some(parse_dimensions(&buf[i..])?); (first_x, first_y) = first_xy(&buf[i..])?; } else { - first_coord_dimensions = None; + first_geom_dimensions = None; first_x = f64::NAN; first_y = f64::NAN; } @@ -104,7 +105,7 @@ impl WkbHeader { srid, size, first_xy: (first_x, first_y), - first_coord_dimensions, + first_geom_dimensions, }) } @@ -160,8 +161,8 @@ impl WkbHeader { } /// Returns the dimensions of the first coordinate of the geometry - pub fn first_coord_dimensions(&self) -> Option { - self.first_coord_dimensions + pub fn first_geom_dimensions(&self) -> Option { + self.first_geom_dimensions } } @@ -505,7 +506,6 @@ mod tests { let wkb = make_wkb("POLYGON ((0 0, 0 1, 1 0, 0 0))"); let header = WkbHeader::try_new(&wkb).unwrap(); - let (x, y) = header.first_xy(); assert_eq!(header.first_xy(), (0.0, 0.0)); // Another polygon test since that logic is more complicated @@ -674,47 +674,55 @@ mod tests { } #[test] - fn first_coord_dimensions() { + fn first_geom_dimensions() { // Top-level dimension is xy, while nested geometry is xyz let wkb = make_wkb("GEOMETRYCOLLECTION (POINT Z (1 2 3))"); let header = WkbHeader::try_new(&wkb).unwrap(); - assert_eq!(header.first_coord_dimensions().unwrap(), Dimensions::Xyz); + assert_eq!(header.first_geom_dimensions().unwrap(), Dimensions::Xyz); assert_eq!(header.dimensions().unwrap(), Dimensions::Xy); let wkb = make_wkb("GEOMETRYCOLLECTION (POINT ZM (1 2 3 4))"); let header = WkbHeader::try_new(&wkb).unwrap(); - assert_eq!(header.first_coord_dimensions().unwrap(), Dimensions::Xyzm); + assert_eq!(header.first_geom_dimensions().unwrap(), Dimensions::Xyzm); let wkb = make_wkb("GEOMETRYCOLLECTION (POINT M (1 2 3))"); let header = WkbHeader::try_new(&wkb).unwrap(); - assert_eq!(header.first_coord_dimensions().unwrap(), Dimensions::Xym); + assert_eq!(header.first_geom_dimensions().unwrap(), Dimensions::Xym); let wkb = make_wkb("GEOMETRYCOLLECTION (POINT ZM (1 2 3 4))"); let header = WkbHeader::try_new(&wkb).unwrap(); - assert_eq!(header.first_coord_dimensions().unwrap(), Dimensions::Xyzm); + assert_eq!(header.first_geom_dimensions().unwrap(), Dimensions::Xyzm); } #[test] - fn empty_geometry_first_coord_dimensions() { - // Should return None for all of these since there are no coordinates + fn empty_geometry_first_geom_dimensions() { let wkb = make_wkb("POINT EMPTY"); let header = WkbHeader::try_new(&wkb).unwrap(); - assert_eq!(header.first_coord_dimensions(), None); + assert_eq!(header.first_geom_dimensions(), Some(Dimensions::Xy)); let wkb = make_wkb("LINESTRING EMPTY"); let header = WkbHeader::try_new(&wkb).unwrap(); - assert_eq!(header.first_coord_dimensions(), None); + assert_eq!(header.first_geom_dimensions(), Some(Dimensions::Xy)); - let wkb = make_wkb("POLYGON EMPTY"); + let wkb = make_wkb("POLYGON Z EMPTY"); let header = WkbHeader::try_new(&wkb).unwrap(); - assert_eq!(header.first_coord_dimensions(), None); + assert_eq!(header.first_geom_dimensions(), Some(Dimensions::Xyz)); - let wkb = make_wkb("GEOMETRYCOLLECTION EMPTY"); + // Empty collections should return None + let wkb = make_wkb("MULTIPOINT EMPTY"); let header = WkbHeader::try_new(&wkb).unwrap(); - assert_eq!(header.first_coord_dimensions(), None); + assert_eq!(header.first_geom_dimensions(), None); - let wkb = make_wkb("GEOMETRYCOLLECTION Z EMPTY"); + let wkb = make_wkb("MULTILINESTRING Z EMPTY"); + let header = WkbHeader::try_new(&wkb).unwrap(); + assert_eq!(header.first_geom_dimensions(), None); + + let wkb = make_wkb("MULTIPOLYGON M EMPTY"); + let header = WkbHeader::try_new(&wkb).unwrap(); + assert_eq!(header.first_geom_dimensions(), None); + + let wkb = make_wkb("GEOMETRYCOLLECTION ZM EMPTY"); let header = WkbHeader::try_new(&wkb).unwrap(); - assert_eq!(header.first_coord_dimensions(), None); + assert_eq!(header.first_geom_dimensions(), None); } } From bdc0fae7d4e193d7a32f33cda70c45627645a50f Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Tue, 7 Oct 2025 00:44:01 -0700 Subject: [PATCH 29/41] Update name of method in st_haszm and update some sedona_internal_err to exec_err --- rust/sedona-functions/src/st_haszm.rs | 4 ++-- rust/sedona-geometry/src/wkb_header.rs | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/rust/sedona-functions/src/st_haszm.rs b/rust/sedona-functions/src/st_haszm.rs index 3c81a180..c279ecef 100644 --- a/rust/sedona-functions/src/st_haszm.rs +++ b/rust/sedona-functions/src/st_haszm.rs @@ -114,8 +114,8 @@ fn invoke_scalar(buf: &[u8], dim_index: usize) -> Result> { // Infer dimension based on first coordinate dimension for cases where it differs from top-level // e.g GEOMETRYCOLLECTION (POINT Z (1 2 3)) let dimensions; - if let Some(first_coord_dimensions) = header.first_coord_dimensions() { - dimensions = first_coord_dimensions; + if let Some(first_geom_dimensions) = header.first_geom_dimensions() { + dimensions = first_geom_dimensions; } else { dimensions = top_level_dimensions; } diff --git a/rust/sedona-geometry/src/wkb_header.rs b/rust/sedona-geometry/src/wkb_header.rs index 8f809323..cf02aa8a 100644 --- a/rust/sedona-geometry/src/wkb_header.rs +++ b/rust/sedona-geometry/src/wkb_header.rs @@ -47,7 +47,7 @@ impl WkbHeader { /// Creates a new [WkbHeader] from a buffer pub fn try_new(buf: &[u8]) -> Result { if buf.len() < 5 { - return sedona_internal_err!("Invalid WKB: buffer too small -> try_new"); + return exec_err!("Invalid WKB: buffer too small -> try_new"); }; let byte_order = buf[0]; @@ -155,7 +155,7 @@ impl WkbHeader { 1 => Dimensions::Xyz, 2 => Dimensions::Xym, 3 => Dimensions::Xyzm, - _ => sedona_internal_err!("Unexpected code: {}", self.geometry_type)?, + _ => exec_err!("Unexpected code: {}", self.geometry_type)?, }; Ok(dimensions) } @@ -309,7 +309,7 @@ fn first_xy(buf: &[u8]) -> Result<(f64, f64)> { // Given a buffer starting at the coordinate itself, parse the x and y coordinates fn parse_coord(buf: &[u8], byte_order: u8) -> Result { if buf.len() < 8 { - return sedona_internal_err!("Invalid WKB: buffer too small -> parse_coord"); + return exec_err!("Invalid WKB: buffer too small -> parse_coord"); } let coord: f64 = match byte_order { @@ -328,7 +328,7 @@ fn parse_coord(buf: &[u8], byte_order: u8) -> Result { // Parses the top-level dimension of the geometry fn parse_dimensions(buf: &[u8]) -> Result { if buf.len() < 9 { - return sedona_internal_err!("Invalid WKB: buffer too small -> parse_dimensions"); + return exec_err!("Invalid WKB: buffer too small -> parse_dimensions"); } let byte_order = buf[0]; @@ -344,7 +344,7 @@ fn parse_dimensions(buf: &[u8]) -> Result { 1 => Ok(Dimensions::Xyz), 2 => Ok(Dimensions::Xym), 3 => Ok(Dimensions::Xyzm), - _ => sedona_internal_err!("Unexpected code: {code:?}"), + _ => exec_err!("Unexpected code: {code:?}"), } } From 3c83ff9e8108e595c2c338905af34b757895cf9f Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Wed, 22 Oct 2025 20:29:18 -0700 Subject: [PATCH 30/41] Use SedonaGeometryError and remove sedona and datafusion common dependencies from sedona-geometry crate --- Cargo.lock | 2 - rust/sedona-geometry/Cargo.toml | 2 - rust/sedona-geometry/src/wkb_header.rs | 150 +++++++++++++++++-------- 3 files changed, 104 insertions(+), 50 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9e7102dc..94550ec3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4971,12 +4971,10 @@ dependencies = [ name = "sedona-geometry" version = "0.2.0" dependencies = [ - "datafusion-common", "geo-traits", "geo-types", "lru", "rstest", - "sedona-common", "serde", "serde_json", "serde_with", diff --git a/rust/sedona-geometry/Cargo.toml b/rust/sedona-geometry/Cargo.toml index c3019f76..8f127589 100644 --- a/rust/sedona-geometry/Cargo.toml +++ b/rust/sedona-geometry/Cargo.toml @@ -34,10 +34,8 @@ serde_json = { workspace = true } wkt = { workspace = true } [dependencies] -datafusion-common = { workspace = true } geo-traits = { workspace = true } lru = { workspace = true } -sedona-common = { path = "../sedona-common" } serde = { workspace = true } serde_with = { workspace = true } thiserror = { workspace = true } diff --git a/rust/sedona-geometry/src/wkb_header.rs b/rust/sedona-geometry/src/wkb_header.rs index cf02aa8a..8eb780e2 100644 --- a/rust/sedona-geometry/src/wkb_header.rs +++ b/rust/sedona-geometry/src/wkb_header.rs @@ -15,13 +15,10 @@ // specific language governing permissions and limitations // under the License. -use crate::types::GeometryTypeId; -use datafusion_common::{ - error::{DataFusionError, Result}, - exec_err, -}; use geo_traits::Dimensions; -use sedona_common::sedona_internal_err; + +use crate::error::SedonaGeometryError; +use crate::types::GeometryTypeId; const SRID_FLAG_BIT: u32 = 0x20000000; @@ -45,9 +42,11 @@ pub struct WkbHeader { impl WkbHeader { /// Creates a new [WkbHeader] from a buffer - pub fn try_new(buf: &[u8]) -> Result { + pub fn try_new(buf: &[u8]) -> Result { if buf.len() < 5 { - return exec_err!("Invalid WKB: buffer too small -> try_new"); + return Err(SedonaGeometryError::Invalid( + "Invalid WKB: buffer too small -> try_new".to_string(), + )); }; let byte_order = buf[0]; @@ -56,11 +55,14 @@ impl WkbHeader { let geometry_type = match byte_order { 0 => u32::from_be_bytes([buf[1], buf[2], buf[3], buf[4]]), 1 => u32::from_le_bytes([buf[1], buf[2], buf[3], buf[4]]), - other => return sedona_internal_err!("Unexpected byte order: {other}"), + other => { + return Err(SedonaGeometryError::Invalid(format!( + "Unexpected byte order: {other}" + ))) + } }; - let geometry_type_id = GeometryTypeId::try_from_wkb_id(geometry_type & 0x7) - .map_err(|e| DataFusionError::External(Box::new(e)))?; + let geometry_type_id = GeometryTypeId::try_from_wkb_id(geometry_type & 0x7)?; let mut i = 5; let mut srid = 0; @@ -69,7 +71,11 @@ impl WkbHeader { srid = match byte_order { 0 => u32::from_be_bytes([buf[5], buf[6], buf[7], buf[8]]), 1 => u32::from_le_bytes([buf[5], buf[6], buf[7], buf[8]]), - other => return sedona_internal_err!("Unexpected byte order: {other}"), + other => { + return Err(SedonaGeometryError::Invalid(format!( + "Unexpected byte order: {other}" + ))) + } }; i = 9; } @@ -81,7 +87,11 @@ impl WkbHeader { match byte_order { 0 => u32::from_be_bytes([buf[i], buf[i + 1], buf[i + 2], buf[i + 3]]), 1 => u32::from_le_bytes([buf[i], buf[i + 1], buf[i + 2], buf[i + 3]]), - other => return sedona_internal_err!("Unexpected byte order: {other}"), + other => { + return Err(SedonaGeometryError::Invalid(format!( + "Unexpected byte order: {other}" + ))) + } } }; @@ -119,12 +129,11 @@ impl WkbHeader { /// 7 -> GeometryCollection /// /// Spec: https://libgeos.org/specifications/wkb/ - pub fn geometry_type_id(&self) -> Result { + pub fn geometry_type_id(&self) -> Result { // Only low 3 bits is for the base type, high bits include additional info let code = self.geometry_type & 0x7; - let geometry_type_id = GeometryTypeId::try_from_wkb_id(code) - .map_err(|e| DataFusionError::External(Box::new(e)))?; + let geometry_type_id = GeometryTypeId::try_from_wkb_id(code)?; Ok(geometry_type_id) } @@ -149,13 +158,18 @@ impl WkbHeader { } /// Returns the top-level dimension of the WKB - pub fn dimensions(&self) -> Result { + pub fn dimensions(&self) -> Result { let dimensions = match self.geometry_type / 1000 { 0 => Dimensions::Xy, 1 => Dimensions::Xyz, 2 => Dimensions::Xym, 3 => Dimensions::Xyzm, - _ => exec_err!("Unexpected code: {}", self.geometry_type)?, + _ => { + return Err(SedonaGeometryError::Invalid(format!( + "Unexpected code: {}", + self.geometry_type + ))) + } }; Ok(dimensions) } @@ -169,19 +183,24 @@ impl WkbHeader { // For MULITPOINT, MULTILINESTRING, MULTIPOLYGON, or GEOMETRYCOLLECTION, returns the index to the first nested // non-collection geometry (POINT, LINESTRING, or POLYGON), or None if empty // For POINT, LINESTRING, POLYGON, returns 0 as it already is a non-collection geometry -fn first_geom_idx(buf: &[u8]) -> Result> { +fn first_geom_idx(buf: &[u8]) -> Result, SedonaGeometryError> { if buf.len() < 5 { - return exec_err!("Invalid WKB: buffer too small -> first_geom_idx"); + return Err(SedonaGeometryError::Invalid( + "Invalid WKB: buffer too small -> first_geom_idx".to_string(), + )); } let byte_order = buf[0]; let geometry_type = match byte_order { 0 => u32::from_be_bytes([buf[1], buf[2], buf[3], buf[4]]), 1 => u32::from_le_bytes([buf[1], buf[2], buf[3], buf[4]]), - other => return sedona_internal_err!("Unexpected byte order: {other}"), + other => { + return Err(SedonaGeometryError::Invalid(format!( + "Unexpected byte order: {other}" + ))) + } }; - let geometry_type_id = GeometryTypeId::try_from_wkb_id(geometry_type & 0x7) - .map_err(|e| DataFusionError::External(Box::new(e)))?; + let geometry_type_id = GeometryTypeId::try_from_wkb_id(geometry_type & 0x7)?; match geometry_type_id { GeometryTypeId::Point | GeometryTypeId::LineString | GeometryTypeId::Polygon => Ok(Some(0)), @@ -190,12 +209,18 @@ fn first_geom_idx(buf: &[u8]) -> Result> { | GeometryTypeId::MultiPolygon | GeometryTypeId::GeometryCollection => { if buf.len() < 9 { - exec_err!("Invalid WKB: buffer too small")? + return Err(SedonaGeometryError::Invalid( + "Invalid WKB: buffer too small".to_string(), + )); } let num_geometries = match byte_order { 0 => u32::from_be_bytes([buf[5], buf[6], buf[7], buf[8]]), 1 => u32::from_le_bytes([buf[5], buf[6], buf[7], buf[8]]), - other => return sedona_internal_err!("Unexpected byte order: {other}"), + other => { + return Err(SedonaGeometryError::Invalid(format!( + "Unexpected byte order: {other}" + ))) + } }; if num_geometries == 0 { @@ -216,26 +241,35 @@ fn first_geom_idx(buf: &[u8]) -> Result> { Ok(None) } } - _ => sedona_internal_err!("Unexpected geometry type: {geometry_type_id:?}"), + _ => { + return Err(SedonaGeometryError::Invalid(format!( + "Unexpected geometry type: {geometry_type_id:?}" + ))) + } } } // Given a point, linestring, or polygon, return the first xy coordinate // If the geometry, is empty, (NaN, NaN) is returned -fn first_xy(buf: &[u8]) -> Result<(f64, f64)> { +fn first_xy(buf: &[u8]) -> Result<(f64, f64), SedonaGeometryError> { if buf.len() < 5 { - return exec_err!("Invalid WKB: buffer too small -> first_xy"); + return Err(SedonaGeometryError::Invalid( + "Invalid WKB: buffer too small -> first_xy".to_string(), + )); } let byte_order = buf[0]; let geometry_type = match byte_order { 0 => u32::from_be_bytes([buf[1], buf[2], buf[3], buf[4]]), 1 => u32::from_le_bytes([buf[1], buf[2], buf[3], buf[4]]), - other => return sedona_internal_err!("Unexpected byte order: {other}"), + other => { + return Err(SedonaGeometryError::Invalid(format!( + "Unexpected byte order: {other}" + ))) + } }; - let geometry_type_id = GeometryTypeId::try_from_wkb_id(geometry_type & 0x7) - .map_err(|e| DataFusionError::External(Box::new(e)))?; + let geometry_type_id = GeometryTypeId::try_from_wkb_id(geometry_type & 0x7)?; // 1 (byte_order) + 4 (geometry_type) = 5 let mut i = 5; @@ -250,16 +284,20 @@ fn first_xy(buf: &[u8]) -> Result<(f64, f64)> { GeometryTypeId::LineString | GeometryTypeId::Polygon ) { if buf.len() < i + 4 { - return exec_err!( + return Err(SedonaGeometryError::Invalid(format!( "Invalid WKB: buffer too small -> first_xy3 {} is not < {}", buf.len(), i + 4 - ); + ))); } let size = match byte_order { 0 => u32::from_be_bytes([buf[i], buf[i + 1], buf[i + 2], buf[i + 3]]), 1 => u32::from_le_bytes([buf[i], buf[i + 1], buf[i + 2], buf[i + 3]]), - other => return sedona_internal_err!("Unexpected byte order: {other}"), + other => { + return Err(SedonaGeometryError::Invalid(format!( + "Unexpected byte order: {other}" + ))) + } }; // (NaN, NaN) for empty geometries @@ -274,16 +312,20 @@ fn first_xy(buf: &[u8]) -> Result<(f64, f64)> { // land on the first coordinate's x value. if geometry_type_id == GeometryTypeId::Polygon { if buf.len() < i + 4 { - return exec_err!( + return Err(SedonaGeometryError::Invalid(format!( "Invalid WKB: buffer too small -> polygon first ring size {} is not < {}", buf.len(), i + 4 - ); + ))); } let ring0_num_points = match byte_order { 0 => u32::from_be_bytes([buf[i], buf[i + 1], buf[i + 2], buf[i + 3]]), 1 => u32::from_le_bytes([buf[i], buf[i + 1], buf[i + 2], buf[i + 3]]), - other => return sedona_internal_err!("Unexpected byte order: {other}"), + other => { + return Err(SedonaGeometryError::Invalid(format!( + "Unexpected byte order: {other}" + ))) + } }; // (NaN, NaN) for empty first ring @@ -295,11 +337,11 @@ fn first_xy(buf: &[u8]) -> Result<(f64, f64)> { } if buf.len() < i + 8 { - return exec_err!( + return Err(SedonaGeometryError::Invalid(format!( "Invalid WKB: buffer too small -> first_xy4 {} is not < {}", i + 8, buf.len() - ); + ))); } let x = parse_coord(&buf[i..], byte_order)?; let y = parse_coord(&buf[i + 8..], byte_order)?; @@ -307,9 +349,11 @@ fn first_xy(buf: &[u8]) -> Result<(f64, f64)> { } // Given a buffer starting at the coordinate itself, parse the x and y coordinates -fn parse_coord(buf: &[u8], byte_order: u8) -> Result { +fn parse_coord(buf: &[u8], byte_order: u8) -> Result { if buf.len() < 8 { - return exec_err!("Invalid WKB: buffer too small -> parse_coord"); + return Err(SedonaGeometryError::Invalid( + "Invalid WKB: buffer too small -> parse_coord".to_string(), + )); } let coord: f64 = match byte_order { @@ -319,16 +363,22 @@ fn parse_coord(buf: &[u8], byte_order: u8) -> Result { 1 => f64::from_le_bytes([ buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7], ]), - other => return sedona_internal_err!("Unexpected byte order: {other}")?, + other => { + return Err(SedonaGeometryError::Invalid(format!( + "Unexpected byte order: {other}" + ))) + } }; Ok(coord) } // Parses the top-level dimension of the geometry -fn parse_dimensions(buf: &[u8]) -> Result { +fn parse_dimensions(buf: &[u8]) -> Result { if buf.len() < 9 { - return exec_err!("Invalid WKB: buffer too small -> parse_dimensions"); + return Err(SedonaGeometryError::Invalid( + "Invalid WKB: buffer too small -> parse_dimensions".to_string(), + )); } let byte_order = buf[0]; @@ -336,7 +386,11 @@ fn parse_dimensions(buf: &[u8]) -> Result { let code = match byte_order { 0 => u32::from_be_bytes([buf[1], buf[2], buf[3], buf[4]]), 1 => u32::from_le_bytes([buf[1], buf[2], buf[3], buf[4]]), - other => sedona_internal_err!("Unexpected byte order: {other}")?, + other => { + return Err(SedonaGeometryError::Invalid(format!( + "Unexpected byte order: {other}" + ))) + } }; match code / 1000 { @@ -344,7 +398,11 @@ fn parse_dimensions(buf: &[u8]) -> Result { 1 => Ok(Dimensions::Xyz), 2 => Ok(Dimensions::Xym), 3 => Ok(Dimensions::Xyzm), - _ => exec_err!("Unexpected code: {code:?}"), + _ => { + return Err(SedonaGeometryError::Invalid(format!( + "Unexpected code: {code:?}" + ))) + } } } From 597c22e4585c6b72bc8c32b81da6da68805adf3f Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Wed, 22 Oct 2025 20:31:13 -0700 Subject: [PATCH 31/41] Fix write_geometry arg after merge --- rust/sedona-geometry/src/wkb_header.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rust/sedona-geometry/src/wkb_header.rs b/rust/sedona-geometry/src/wkb_header.rs index 8eb780e2..4cc0226f 100644 --- a/rust/sedona-geometry/src/wkb_header.rs +++ b/rust/sedona-geometry/src/wkb_header.rs @@ -410,12 +410,13 @@ fn parse_dimensions(buf: &[u8]) -> Result { mod tests { use super::*; use std::str::FromStr; + use wkb::writer::{write_geometry, WriteOptions}; use wkt::Wkt; fn make_wkb(wkt_value: &'static str) -> Vec { let geom = Wkt::::from_str(wkt_value).unwrap(); let mut buf: Vec = vec![]; - wkb::writer::write_geometry(&mut buf, &geom, Default::default()).unwrap(); + write_geometry(&mut buf, &geom, &WriteOptions::default()).unwrap(); buf } From 55b94ef90b81d6f144cbf00805e1f5d58dcb5d01 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Wed, 22 Oct 2025 21:09:09 -0700 Subject: [PATCH 32/41] Create and use read_u32() helper function --- rust/sedona-geometry/src/wkb_header.rs | 108 +++++++------------------ 1 file changed, 27 insertions(+), 81 deletions(-) diff --git a/rust/sedona-geometry/src/wkb_header.rs b/rust/sedona-geometry/src/wkb_header.rs index 4cc0226f..d426ce1e 100644 --- a/rust/sedona-geometry/src/wkb_header.rs +++ b/rust/sedona-geometry/src/wkb_header.rs @@ -52,15 +52,7 @@ impl WkbHeader { let byte_order = buf[0]; // Parse geometry type - let geometry_type = match byte_order { - 0 => u32::from_be_bytes([buf[1], buf[2], buf[3], buf[4]]), - 1 => u32::from_le_bytes([buf[1], buf[2], buf[3], buf[4]]), - other => { - return Err(SedonaGeometryError::Invalid(format!( - "Unexpected byte order: {other}" - ))) - } - }; + let geometry_type = read_u32(&buf[1..5], byte_order)?; let geometry_type_id = GeometryTypeId::try_from_wkb_id(geometry_type & 0x7)?; @@ -68,15 +60,7 @@ impl WkbHeader { let mut srid = 0; // if EWKB if geometry_type & SRID_FLAG_BIT != 0 { - srid = match byte_order { - 0 => u32::from_be_bytes([buf[5], buf[6], buf[7], buf[8]]), - 1 => u32::from_le_bytes([buf[5], buf[6], buf[7], buf[8]]), - other => { - return Err(SedonaGeometryError::Invalid(format!( - "Unexpected byte order: {other}" - ))) - } - }; + srid = read_u32(&buf[5..9], byte_order)?; i = 9; } @@ -84,15 +68,7 @@ impl WkbHeader { // Dummy value for a point 1 } else { - match byte_order { - 0 => u32::from_be_bytes([buf[i], buf[i + 1], buf[i + 2], buf[i + 3]]), - 1 => u32::from_le_bytes([buf[i], buf[i + 1], buf[i + 2], buf[i + 3]]), - other => { - return Err(SedonaGeometryError::Invalid(format!( - "Unexpected byte order: {other}" - ))) - } - } + read_u32(&buf[i..i + 4], byte_order)? }; // Default values for empty geometries @@ -191,15 +167,7 @@ fn first_geom_idx(buf: &[u8]) -> Result, SedonaGeometryError> { } let byte_order = buf[0]; - let geometry_type = match byte_order { - 0 => u32::from_be_bytes([buf[1], buf[2], buf[3], buf[4]]), - 1 => u32::from_le_bytes([buf[1], buf[2], buf[3], buf[4]]), - other => { - return Err(SedonaGeometryError::Invalid(format!( - "Unexpected byte order: {other}" - ))) - } - }; + let geometry_type = read_u32(&buf[1..5], byte_order)?; let geometry_type_id = GeometryTypeId::try_from_wkb_id(geometry_type & 0x7)?; match geometry_type_id { @@ -213,15 +181,7 @@ fn first_geom_idx(buf: &[u8]) -> Result, SedonaGeometryError> { "Invalid WKB: buffer too small".to_string(), )); } - let num_geometries = match byte_order { - 0 => u32::from_be_bytes([buf[5], buf[6], buf[7], buf[8]]), - 1 => u32::from_le_bytes([buf[5], buf[6], buf[7], buf[8]]), - other => { - return Err(SedonaGeometryError::Invalid(format!( - "Unexpected byte order: {other}" - ))) - } - }; + let num_geometries = read_u32(&buf[5..9], byte_order)?; if num_geometries == 0 { return Ok(None); @@ -259,15 +219,7 @@ fn first_xy(buf: &[u8]) -> Result<(f64, f64), SedonaGeometryError> { } let byte_order = buf[0]; - let geometry_type = match byte_order { - 0 => u32::from_be_bytes([buf[1], buf[2], buf[3], buf[4]]), - 1 => u32::from_le_bytes([buf[1], buf[2], buf[3], buf[4]]), - other => { - return Err(SedonaGeometryError::Invalid(format!( - "Unexpected byte order: {other}" - ))) - } - }; + let geometry_type = read_u32(&buf[1..5], byte_order)?; let geometry_type_id = GeometryTypeId::try_from_wkb_id(geometry_type & 0x7)?; @@ -290,15 +242,7 @@ fn first_xy(buf: &[u8]) -> Result<(f64, f64), SedonaGeometryError> { i + 4 ))); } - let size = match byte_order { - 0 => u32::from_be_bytes([buf[i], buf[i + 1], buf[i + 2], buf[i + 3]]), - 1 => u32::from_le_bytes([buf[i], buf[i + 1], buf[i + 2], buf[i + 3]]), - other => { - return Err(SedonaGeometryError::Invalid(format!( - "Unexpected byte order: {other}" - ))) - } - }; + let size = read_u32(&buf[i..i + 4], byte_order)?; // (NaN, NaN) for empty geometries if size == 0 { @@ -318,15 +262,7 @@ fn first_xy(buf: &[u8]) -> Result<(f64, f64), SedonaGeometryError> { i + 4 ))); } - let ring0_num_points = match byte_order { - 0 => u32::from_be_bytes([buf[i], buf[i + 1], buf[i + 2], buf[i + 3]]), - 1 => u32::from_le_bytes([buf[i], buf[i + 1], buf[i + 2], buf[i + 3]]), - other => { - return Err(SedonaGeometryError::Invalid(format!( - "Unexpected byte order: {other}" - ))) - } - }; + let ring0_num_points = read_u32(&buf[i..i + 4], byte_order)?; // (NaN, NaN) for empty first ring if ring0_num_points == 0 { @@ -373,6 +309,24 @@ fn parse_coord(buf: &[u8], byte_order: u8) -> Result { Ok(coord) } +fn read_u32(buf: &[u8], byte_order: u8) -> Result { + if buf.len() < 4 { + return Err(SedonaGeometryError::Invalid( + "Invalid WKB: buffer too small -> read_u32".to_string(), + )); + } + + match byte_order { + 0 => Ok(u32::from_be_bytes([buf[0], buf[1], buf[2], buf[3]])), + 1 => Ok(u32::from_le_bytes([buf[0], buf[1], buf[2], buf[3]])), + other => { + return Err(SedonaGeometryError::Invalid(format!( + "Unexpected byte order: {other}" + ))) + } + } +} + // Parses the top-level dimension of the geometry fn parse_dimensions(buf: &[u8]) -> Result { if buf.len() < 9 { @@ -383,15 +337,7 @@ fn parse_dimensions(buf: &[u8]) -> Result { let byte_order = buf[0]; - let code = match byte_order { - 0 => u32::from_be_bytes([buf[1], buf[2], buf[3], buf[4]]), - 1 => u32::from_le_bytes([buf[1], buf[2], buf[3], buf[4]]), - other => { - return Err(SedonaGeometryError::Invalid(format!( - "Unexpected byte order: {other}" - ))) - } - }; + let code = read_u32(&buf[1..5], byte_order)?; match code / 1000 { 0 => Ok(Dimensions::Xy), From b77a762df1f3087f1c901d6effc2f139915d8f11 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Wed, 22 Oct 2025 21:15:11 -0700 Subject: [PATCH 33/41] Move all functions into WKbHeader as methods also rename helper to first_xy_coord --- rust/sedona-geometry/src/wkb_header.rs | 322 +++++++++++++------------ 1 file changed, 162 insertions(+), 160 deletions(-) diff --git a/rust/sedona-geometry/src/wkb_header.rs b/rust/sedona-geometry/src/wkb_header.rs index d426ce1e..8d78a216 100644 --- a/rust/sedona-geometry/src/wkb_header.rs +++ b/rust/sedona-geometry/src/wkb_header.rs @@ -52,7 +52,7 @@ impl WkbHeader { let byte_order = buf[0]; // Parse geometry type - let geometry_type = read_u32(&buf[1..5], byte_order)?; + let geometry_type = Self::read_u32(&buf[1..5], byte_order)?; let geometry_type_id = GeometryTypeId::try_from_wkb_id(geometry_type & 0x7)?; @@ -60,7 +60,7 @@ impl WkbHeader { let mut srid = 0; // if EWKB if geometry_type & SRID_FLAG_BIT != 0 { - srid = read_u32(&buf[5..9], byte_order)?; + srid = Self::read_u32(&buf[5..9], byte_order)?; i = 9; } @@ -68,7 +68,7 @@ impl WkbHeader { // Dummy value for a point 1 } else { - read_u32(&buf[i..i + 4], byte_order)? + Self::read_u32(&buf[i..i + 4], byte_order)? }; // Default values for empty geometries @@ -76,10 +76,10 @@ impl WkbHeader { let first_y; let first_geom_dimensions: Option; - let first_geom_idx = first_geom_idx(buf)?; + let first_geom_idx = Self::first_geom_idx(buf)?; if let Some(i) = first_geom_idx { - first_geom_dimensions = Some(parse_dimensions(&buf[i..])?); - (first_x, first_y) = first_xy(&buf[i..])?; + first_geom_dimensions = Some(Self::parse_dimensions(&buf[i..])?); + (first_x, first_y) = Self::first_xy_coord(&buf[i..])?; } else { first_geom_dimensions = None; first_x = f64::NAN; @@ -154,200 +154,202 @@ impl WkbHeader { pub fn first_geom_dimensions(&self) -> Option { self.first_geom_dimensions } -} -// For MULITPOINT, MULTILINESTRING, MULTIPOLYGON, or GEOMETRYCOLLECTION, returns the index to the first nested -// non-collection geometry (POINT, LINESTRING, or POLYGON), or None if empty -// For POINT, LINESTRING, POLYGON, returns 0 as it already is a non-collection geometry -fn first_geom_idx(buf: &[u8]) -> Result, SedonaGeometryError> { - if buf.len() < 5 { - return Err(SedonaGeometryError::Invalid( - "Invalid WKB: buffer too small -> first_geom_idx".to_string(), - )); - } + // For MULITPOINT, MULTILINESTRING, MULTIPOLYGON, or GEOMETRYCOLLECTION, returns the index to the first nested + // non-collection geometry (POINT, LINESTRING, or POLYGON), or None if empty + // For POINT, LINESTRING, POLYGON, returns 0 as it already is a non-collection geometry + fn first_geom_idx(buf: &[u8]) -> Result, SedonaGeometryError> { + if buf.len() < 5 { + return Err(SedonaGeometryError::Invalid( + "Invalid WKB: buffer too small -> first_geom_idx".to_string(), + )); + } - let byte_order = buf[0]; - let geometry_type = read_u32(&buf[1..5], byte_order)?; - let geometry_type_id = GeometryTypeId::try_from_wkb_id(geometry_type & 0x7)?; - - match geometry_type_id { - GeometryTypeId::Point | GeometryTypeId::LineString | GeometryTypeId::Polygon => Ok(Some(0)), - GeometryTypeId::MultiPoint - | GeometryTypeId::MultiLineString - | GeometryTypeId::MultiPolygon - | GeometryTypeId::GeometryCollection => { - if buf.len() < 9 { - return Err(SedonaGeometryError::Invalid( - "Invalid WKB: buffer too small".to_string(), - )); - } - let num_geometries = read_u32(&buf[5..9], byte_order)?; + let byte_order = buf[0]; + let geometry_type = Self::read_u32(&buf[1..5], byte_order)?; + let geometry_type_id = GeometryTypeId::try_from_wkb_id(geometry_type & 0x7)?; - if num_geometries == 0 { - return Ok(None); + match geometry_type_id { + GeometryTypeId::Point | GeometryTypeId::LineString | GeometryTypeId::Polygon => { + Ok(Some(0)) } - - let mut i = 9; - if geometry_type & SRID_FLAG_BIT != 0 { - i += 4; + GeometryTypeId::MultiPoint + | GeometryTypeId::MultiLineString + | GeometryTypeId::MultiPolygon + | GeometryTypeId::GeometryCollection => { + if buf.len() < 9 { + return Err(SedonaGeometryError::Invalid( + "Invalid WKB: buffer too small".to_string(), + )); + } + let num_geometries = Self::read_u32(&buf[5..9], byte_order)?; + + if num_geometries == 0 { + return Ok(None); + } + + let mut i = 9; + if geometry_type & SRID_FLAG_BIT != 0 { + i += 4; + } + + // Recursive call to get the first geom of the first nested geometry + // Add to current offset of i + let off = Self::first_geom_idx(&buf[i..]); + if let Ok(Some(off)) = off { + Ok(Some(i + off)) + } else { + Ok(None) + } } - - // Recursive call to get the first geom of the first nested geometry - // Add to current offset of i - let off = first_geom_idx(&buf[i..]); - if let Ok(Some(off)) = off { - Ok(Some(i + off)) - } else { - Ok(None) + _ => { + return Err(SedonaGeometryError::Invalid(format!( + "Unexpected geometry type: {geometry_type_id:?}" + ))) } } - _ => { - return Err(SedonaGeometryError::Invalid(format!( - "Unexpected geometry type: {geometry_type_id:?}" - ))) - } - } -} - -// Given a point, linestring, or polygon, return the first xy coordinate -// If the geometry, is empty, (NaN, NaN) is returned -fn first_xy(buf: &[u8]) -> Result<(f64, f64), SedonaGeometryError> { - if buf.len() < 5 { - return Err(SedonaGeometryError::Invalid( - "Invalid WKB: buffer too small -> first_xy".to_string(), - )); } - let byte_order = buf[0]; - let geometry_type = read_u32(&buf[1..5], byte_order)?; - - let geometry_type_id = GeometryTypeId::try_from_wkb_id(geometry_type & 0x7)?; + // Given a point, linestring, or polygon, return the first xy coordinate + // If the geometry, is empty, (NaN, NaN) is returned + fn first_xy_coord(buf: &[u8]) -> Result<(f64, f64), SedonaGeometryError> { + if buf.len() < 5 { + return Err(SedonaGeometryError::Invalid( + "Invalid WKB: buffer too small -> first_xy".to_string(), + )); + } - // 1 (byte_order) + 4 (geometry_type) = 5 - let mut i = 5; + let byte_order = buf[0]; + let geometry_type = Self::read_u32(&buf[1..5], byte_order)?; - // Skip the SRID if it's present - if geometry_type & SRID_FLAG_BIT != 0 { - i += 4; - } + let geometry_type_id = GeometryTypeId::try_from_wkb_id(geometry_type & 0x7)?; - if matches!( - geometry_type_id, - GeometryTypeId::LineString | GeometryTypeId::Polygon - ) { - if buf.len() < i + 4 { - return Err(SedonaGeometryError::Invalid(format!( - "Invalid WKB: buffer too small -> first_xy3 {} is not < {}", - buf.len(), - i + 4 - ))); - } - let size = read_u32(&buf[i..i + 4], byte_order)?; + // 1 (byte_order) + 4 (geometry_type) = 5 + let mut i = 5; - // (NaN, NaN) for empty geometries - if size == 0 { - return Ok((f64::NAN, f64::NAN)); + // Skip the SRID if it's present + if geometry_type & SRID_FLAG_BIT != 0 { + i += 4; } - // + 4 for size - i += 4; - // For POLYGON, after the number of rings, the next 4 bytes are the - // number of points in the exterior ring. We must skip that count to - // land on the first coordinate's x value. - if geometry_type_id == GeometryTypeId::Polygon { + if matches!( + geometry_type_id, + GeometryTypeId::LineString | GeometryTypeId::Polygon + ) { if buf.len() < i + 4 { return Err(SedonaGeometryError::Invalid(format!( - "Invalid WKB: buffer too small -> polygon first ring size {} is not < {}", + "Invalid WKB: buffer too small -> first_xy3 {} is not < {}", buf.len(), i + 4 ))); } - let ring0_num_points = read_u32(&buf[i..i + 4], byte_order)?; + let size = Self::read_u32(&buf[i..i + 4], byte_order)?; - // (NaN, NaN) for empty first ring - if ring0_num_points == 0 { + // (NaN, NaN) for empty geometries + if size == 0 { return Ok((f64::NAN, f64::NAN)); } + // + 4 for size i += 4; - } - } - if buf.len() < i + 8 { - return Err(SedonaGeometryError::Invalid(format!( - "Invalid WKB: buffer too small -> first_xy4 {} is not < {}", - i + 8, - buf.len() - ))); - } - let x = parse_coord(&buf[i..], byte_order)?; - let y = parse_coord(&buf[i + 8..], byte_order)?; - Ok((x, y)) -} + // For POLYGON, after the number of rings, the next 4 bytes are the + // number of points in the exterior ring. We must skip that count to + // land on the first coordinate's x value. + if geometry_type_id == GeometryTypeId::Polygon { + if buf.len() < i + 4 { + return Err(SedonaGeometryError::Invalid(format!( + "Invalid WKB: buffer too small -> polygon first ring size {} is not < {}", + buf.len(), + i + 4 + ))); + } + let ring0_num_points = Self::read_u32(&buf[i..i + 4], byte_order)?; + + // (NaN, NaN) for empty first ring + if ring0_num_points == 0 { + return Ok((f64::NAN, f64::NAN)); + } + i += 4; + } + } -// Given a buffer starting at the coordinate itself, parse the x and y coordinates -fn parse_coord(buf: &[u8], byte_order: u8) -> Result { - if buf.len() < 8 { - return Err(SedonaGeometryError::Invalid( - "Invalid WKB: buffer too small -> parse_coord".to_string(), - )); + if buf.len() < i + 8 { + return Err(SedonaGeometryError::Invalid(format!( + "Invalid WKB: buffer too small -> first_xy4 {} is not < {}", + i + 8, + buf.len() + ))); + } + let x = Self::parse_coord(&buf[i..], byte_order)?; + let y = Self::parse_coord(&buf[i + 8..], byte_order)?; + Ok((x, y)) } - let coord: f64 = match byte_order { - 0 => f64::from_be_bytes([ - buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7], - ]), - 1 => f64::from_le_bytes([ - buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7], - ]), - other => { - return Err(SedonaGeometryError::Invalid(format!( - "Unexpected byte order: {other}" - ))) + // Given a buffer starting at the coordinate itself, parse the x and y coordinates + fn parse_coord(buf: &[u8], byte_order: u8) -> Result { + if buf.len() < 8 { + return Err(SedonaGeometryError::Invalid( + "Invalid WKB: buffer too small -> parse_coord".to_string(), + )); } - }; - Ok(coord) -} + let coord: f64 = match byte_order { + 0 => f64::from_be_bytes([ + buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7], + ]), + 1 => f64::from_le_bytes([ + buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7], + ]), + other => { + return Err(SedonaGeometryError::Invalid(format!( + "Unexpected byte order: {other}" + ))) + } + }; -fn read_u32(buf: &[u8], byte_order: u8) -> Result { - if buf.len() < 4 { - return Err(SedonaGeometryError::Invalid( - "Invalid WKB: buffer too small -> read_u32".to_string(), - )); + Ok(coord) } - match byte_order { - 0 => Ok(u32::from_be_bytes([buf[0], buf[1], buf[2], buf[3]])), - 1 => Ok(u32::from_le_bytes([buf[0], buf[1], buf[2], buf[3]])), - other => { - return Err(SedonaGeometryError::Invalid(format!( - "Unexpected byte order: {other}" - ))) + fn read_u32(buf: &[u8], byte_order: u8) -> Result { + if buf.len() < 4 { + return Err(SedonaGeometryError::Invalid( + "Invalid WKB: buffer too small".to_string(), + )); } - } -} -// Parses the top-level dimension of the geometry -fn parse_dimensions(buf: &[u8]) -> Result { - if buf.len() < 9 { - return Err(SedonaGeometryError::Invalid( - "Invalid WKB: buffer too small -> parse_dimensions".to_string(), - )); + match byte_order { + 0 => Ok(u32::from_be_bytes([buf[0], buf[1], buf[2], buf[3]])), + 1 => Ok(u32::from_le_bytes([buf[0], buf[1], buf[2], buf[3]])), + other => { + return Err(SedonaGeometryError::Invalid(format!( + "Unexpected byte order: {other}" + ))) + } + } } - let byte_order = buf[0]; + // Parses the top-level dimension of the geometry + fn parse_dimensions(buf: &[u8]) -> Result { + if buf.len() < 9 { + return Err(SedonaGeometryError::Invalid( + "Invalid WKB: buffer too small -> parse_dimensions".to_string(), + )); + } + + let byte_order = buf[0]; - let code = read_u32(&buf[1..5], byte_order)?; + let code = Self::read_u32(&buf[1..5], byte_order)?; - match code / 1000 { - 0 => Ok(Dimensions::Xy), - 1 => Ok(Dimensions::Xyz), - 2 => Ok(Dimensions::Xym), - 3 => Ok(Dimensions::Xyzm), - _ => { - return Err(SedonaGeometryError::Invalid(format!( - "Unexpected code: {code:?}" - ))) + match code / 1000 { + 0 => Ok(Dimensions::Xy), + 1 => Ok(Dimensions::Xyz), + 2 => Ok(Dimensions::Xym), + 3 => Ok(Dimensions::Xyzm), + _ => { + return Err(SedonaGeometryError::Invalid(format!( + "Unexpected code: {code:?}" + ))) + } } } } From a2218a9e26b037780ee5ee99e28ccb83bd23379d Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Wed, 22 Oct 2025 21:16:10 -0700 Subject: [PATCH 34/41] Catch the error instead of hiding it in first_geom_idx --- rust/sedona-geometry/src/wkb_header.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rust/sedona-geometry/src/wkb_header.rs b/rust/sedona-geometry/src/wkb_header.rs index 8d78a216..9af101f6 100644 --- a/rust/sedona-geometry/src/wkb_header.rs +++ b/rust/sedona-geometry/src/wkb_header.rs @@ -195,8 +195,8 @@ impl WkbHeader { // Recursive call to get the first geom of the first nested geometry // Add to current offset of i - let off = Self::first_geom_idx(&buf[i..]); - if let Ok(Some(off)) = off { + let off = Self::first_geom_idx(&buf[i..])?; + if let Some(off) = off { Ok(Some(i + off)) } else { Ok(None) From b8f1cc36df54c70d2acd2b87f9eb5dacecc54978 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Wed, 22 Oct 2025 23:00:42 -0700 Subject: [PATCH 35/41] Use new WkbBuffer that calculates values by consuming bytes and keeping state --- rust/sedona-geometry/src/wkb_header.rs | 360 +++++++++++++++++-------- 1 file changed, 255 insertions(+), 105 deletions(-) diff --git a/rust/sedona-geometry/src/wkb_header.rs b/rust/sedona-geometry/src/wkb_header.rs index 9af101f6..48204d55 100644 --- a/rust/sedona-geometry/src/wkb_header.rs +++ b/rust/sedona-geometry/src/wkb_header.rs @@ -43,49 +43,103 @@ pub struct WkbHeader { impl WkbHeader { /// Creates a new [WkbHeader] from a buffer pub fn try_new(buf: &[u8]) -> Result { - if buf.len() < 5 { - return Err(SedonaGeometryError::Invalid( - "Invalid WKB: buffer too small -> try_new".to_string(), - )); - }; + let mut wkb_buffer = WkbBuffer::new(buf); - let byte_order = buf[0]; + wkb_buffer.read_endian()?; - // Parse geometry type - let geometry_type = Self::read_u32(&buf[1..5], byte_order)?; + let geometry_type = wkb_buffer.read_u32()?; let geometry_type_id = GeometryTypeId::try_from_wkb_id(geometry_type & 0x7)?; - let mut i = 5; + // let mut i = 5; let mut srid = 0; // if EWKB if geometry_type & SRID_FLAG_BIT != 0 { - srid = Self::read_u32(&buf[5..9], byte_order)?; - i = 9; + srid = wkb_buffer.read_u32()?; + // i = 9; } let size = if geometry_type_id == GeometryTypeId::Point { // Dummy value for a point 1 } else { - Self::read_u32(&buf[i..i + 4], byte_order)? + wkb_buffer.read_u32()? }; + // println!("geometry_type_id: {geometry_type_id:?}"); + // println!("size: {size}"); + // println!("buf: {:?}", &wkb_buffer.buf[wkb_buffer.offset..]); // Default values for empty geometries let first_x; let first_y; let first_geom_dimensions: Option; - let first_geom_idx = Self::first_geom_idx(buf)?; + let mut wkb_buffer = WkbBuffer::new(buf); // Reset: TODO: clean up later + + let first_geom_idx = wkb_buffer.first_geom_idx()?; // ERROR HERE if let Some(i) = first_geom_idx { - first_geom_dimensions = Some(Self::parse_dimensions(&buf[i..])?); - (first_x, first_y) = Self::first_xy_coord(&buf[i..])?; + // first_geom_dimensions = Some(self.parse_dimensions(&buf[i..])?); + + println!("starting parse_dimensions buf: {:?}", &buf[i..]); + let mut wkb_buffer = WkbBuffer::new(&buf[i..]); // Reset: TODO: clean up later + first_geom_dimensions = Some(wkb_buffer.parse_dimensions()?); + // (first_x, first_y) = self.first_xy_coord(&buf[i..])?; + + let mut wkb_buffer = WkbBuffer::new(&buf[i..]); // Reset: TODO: clean up later + println!( + "first_xy_coord: buf: {:?}", + &wkb_buffer.buf[wkb_buffer.offset..] + ); + (first_x, first_y) = wkb_buffer.first_xy_coord()?; } else { first_geom_dimensions = None; first_x = f64::NAN; first_y = f64::NAN; } + // if buf.len() < 5 { + // return Err(SedonaGeometryError::Invalid( + // "Invalid WKB: buffer too small".to_string(), + // )); + // }; + + // let byte_order = buf[0]; + + // // Parse geometry type + // let geometry_type = self.read_u32(&buf[1..5], byte_order)?; + + // let geometry_type_id = GeometryTypeId::try_from_wkb_id(geometry_type & 0x7)?; + + // let mut i = 5; + // let mut srid = 0; + // // if EWKB + // if geometry_type & SRID_FLAG_BIT != 0 { + // srid = self.read_u32(&buf[5..9], byte_order)?; + // i = 9; + // } + + // let size = if geometry_type_id == GeometryTypeId::Point { + // // Dummy value for a point + // 1 + // } else { + // self.read_u32(&buf[i..i + 4], byte_order)? + // }; + + // // Default values for empty geometries + // let first_x; + // let first_y; + // let first_geom_dimensions: Option; + + // let first_geom_idx = self.first_geom_idx(buf)?; + // if let Some(i) = first_geom_idx { + // first_geom_dimensions = Some(self.parse_dimensions(&buf[i..])?); + // (first_x, first_y) = self.first_xy_coord(&buf[i..])?; + // } else { + // first_geom_dimensions = None; + // first_x = f64::NAN; + // first_y = f64::NAN; + // } + Ok(Self { geometry_type, srid, @@ -154,20 +208,43 @@ impl WkbHeader { pub fn first_geom_dimensions(&self) -> Option { self.first_geom_dimensions } +} + +// A helper struct for calculating the WKBHeader +struct WkbBuffer<'a> { + buf: &'a [u8], + offset: usize, + remaining: usize, + last_endian: u8, +} + +impl<'a> WkbBuffer<'a> { + fn new(buf: &'a [u8]) -> Self { + Self { + buf, + offset: 0, + remaining: buf.len(), + last_endian: 0, + } + } // For MULITPOINT, MULTILINESTRING, MULTIPOLYGON, or GEOMETRYCOLLECTION, returns the index to the first nested // non-collection geometry (POINT, LINESTRING, or POLYGON), or None if empty // For POINT, LINESTRING, POLYGON, returns 0 as it already is a non-collection geometry - fn first_geom_idx(buf: &[u8]) -> Result, SedonaGeometryError> { - if buf.len() < 5 { - return Err(SedonaGeometryError::Invalid( - "Invalid WKB: buffer too small -> first_geom_idx".to_string(), - )); - } - - let byte_order = buf[0]; - let geometry_type = Self::read_u32(&buf[1..5], byte_order)?; + fn first_geom_idx(&mut self) -> Result, SedonaGeometryError> { + // if buf.len() < 5 { + // return Err(SedonaGeometryError::Invalid( + // "Invalid WKB: buffer too small".to_string(), + // )); + // } + + // let byte_order = buf[0]; + // let geometry_type = self.read_u32(&buf[1..5], byte_order)?; + println!("first_geom_idx: buf: {:?}", &self.buf[self.offset..]); + self.read_endian()?; + let geometry_type = self.read_u32()?; let geometry_type_id = GeometryTypeId::try_from_wkb_id(geometry_type & 0x7)?; + println!("geometry_type_id: {geometry_type_id:?}"); match geometry_type_id { GeometryTypeId::Point | GeometryTypeId::LineString | GeometryTypeId::Polygon => { @@ -177,27 +254,34 @@ impl WkbHeader { | GeometryTypeId::MultiLineString | GeometryTypeId::MultiPolygon | GeometryTypeId::GeometryCollection => { - if buf.len() < 9 { - return Err(SedonaGeometryError::Invalid( - "Invalid WKB: buffer too small".to_string(), - )); - } - let num_geometries = Self::read_u32(&buf[5..9], byte_order)?; + // if buf.len() < 9 { + // return Err(SedonaGeometryError::Invalid( + // "Invalid WKB: buffer too small".to_string(), + // )); + // } + // let num_geometries = self.read_u32(&buf[5..9], byte_order)?; + let num_geometries = self.read_u32()?; if num_geometries == 0 { return Ok(None); + // return Ok(Some(0)) } - let mut i = 9; + // let mut i = 9; if geometry_type & SRID_FLAG_BIT != 0 { - i += 4; + // i += 4; + // Skip the SRID + self.read_u32()?; } // Recursive call to get the first geom of the first nested geometry // Add to current offset of i - let off = Self::first_geom_idx(&buf[i..])?; + // let off = self.first_geom_idx(&buf[i..])?; + let mut nested_buffer = WkbBuffer::new(&self.buf[self.offset..]); + let off = nested_buffer.first_geom_idx()?; + // let off = self.first_geom_idx()?; if let Some(off) = off { - Ok(Some(i + off)) + Ok(Some(self.offset + off)) } else { Ok(None) } @@ -212,133 +296,199 @@ impl WkbHeader { // Given a point, linestring, or polygon, return the first xy coordinate // If the geometry, is empty, (NaN, NaN) is returned - fn first_xy_coord(buf: &[u8]) -> Result<(f64, f64), SedonaGeometryError> { - if buf.len() < 5 { - return Err(SedonaGeometryError::Invalid( - "Invalid WKB: buffer too small -> first_xy".to_string(), - )); - } - - let byte_order = buf[0]; - let geometry_type = Self::read_u32(&buf[1..5], byte_order)?; + fn first_xy_coord(&mut self) -> Result<(f64, f64), SedonaGeometryError> { + // if buf.len() < 5 { + // return Err(SedonaGeometryError::Invalid( + // "Invalid WKB: buffer too small -> first_xy".to_string(), + // )); + // } + + // let byte_order = buf[0]; + self.read_endian()?; + // let geometry_type = self.read_u32(&buf[1..5], byte_order)?; + let geometry_type = self.read_u32()?; let geometry_type_id = GeometryTypeId::try_from_wkb_id(geometry_type & 0x7)?; // 1 (byte_order) + 4 (geometry_type) = 5 - let mut i = 5; + // let mut i = 5; // Skip the SRID if it's present if geometry_type & SRID_FLAG_BIT != 0 { - i += 4; + // i += 4; + self.read_u32()?; } if matches!( geometry_type_id, GeometryTypeId::LineString | GeometryTypeId::Polygon ) { - if buf.len() < i + 4 { - return Err(SedonaGeometryError::Invalid(format!( - "Invalid WKB: buffer too small -> first_xy3 {} is not < {}", - buf.len(), - i + 4 - ))); - } - let size = Self::read_u32(&buf[i..i + 4], byte_order)?; + // if buf.len() < i + 4 { + // return Err(SedonaGeometryError::Invalid(format!( + // "Invalid WKB: buffer too small -> first_xy3 {} is not < {}", + // buf.len(), + // i + 4 + // ))); + // } + // let size = self.read_u32(&buf[i..i + 4], byte_order)?; + let size = self.read_u32()?; // (NaN, NaN) for empty geometries if size == 0 { return Ok((f64::NAN, f64::NAN)); } // + 4 for size - i += 4; + // i += 4; // For POLYGON, after the number of rings, the next 4 bytes are the // number of points in the exterior ring. We must skip that count to // land on the first coordinate's x value. if geometry_type_id == GeometryTypeId::Polygon { - if buf.len() < i + 4 { - return Err(SedonaGeometryError::Invalid(format!( - "Invalid WKB: buffer too small -> polygon first ring size {} is not < {}", - buf.len(), - i + 4 - ))); - } - let ring0_num_points = Self::read_u32(&buf[i..i + 4], byte_order)?; + // if buf.len() < i + 4 { + // return Err(SedonaGeometryError::Invalid(format!( + // "Invalid WKB: buffer too small -> polygon first ring size {} is not < {}", + // buf.len(), + // i + 4 + // ))); + // } + // let ring0_num_points = self.read_u32(&buf[i..i + 4], byte_order)?; + let ring0_num_points = self.read_u32()?; // (NaN, NaN) for empty first ring if ring0_num_points == 0 { return Ok((f64::NAN, f64::NAN)); } - i += 4; + // i += 4; } } - if buf.len() < i + 8 { + // if buf.len() < i + 8 { + // return Err(SedonaGeometryError::Invalid(format!( + // "Invalid WKB: buffer too small -> first_xy4 {} is not < {}", + // i + 8, + // buf.len() + // ))); + // } + // let x = self.parse_coord(&buf[i..], byte_order)?; + // let y = self.parse_coord(&buf[i + 8..], byte_order)?; + let x = self.parse_coord()?; + let y = self.parse_coord()?; + Ok((x, y)) + } + + fn read_endian(&mut self) -> Result<(), SedonaGeometryError> { + if self.remaining < 1 { return Err(SedonaGeometryError::Invalid(format!( - "Invalid WKB: buffer too small -> first_xy4 {} is not < {}", - i + 8, - buf.len() + "Invalid WKB: buffer too small. At offset: {}. Need 1 byte.", + self.offset ))); } - let x = Self::parse_coord(&buf[i..], byte_order)?; - let y = Self::parse_coord(&buf[i + 8..], byte_order)?; - Ok((x, y)) + self.last_endian = self.buf[self.offset]; + self.remaining -= 1; + self.offset += 1; + Ok(()) } - // Given a buffer starting at the coordinate itself, parse the x and y coordinates - fn parse_coord(buf: &[u8], byte_order: u8) -> Result { - if buf.len() < 8 { - return Err(SedonaGeometryError::Invalid( - "Invalid WKB: buffer too small -> parse_coord".to_string(), - )); + fn read_u32(&mut self) -> Result { + if self.remaining < 4 { + return Err(SedonaGeometryError::Invalid(format!( + "Invalid WKB: buffer too small. At offset: {}. Need 4 bytes.", + self.offset + ))); } - - let coord: f64 = match byte_order { - 0 => f64::from_be_bytes([ - buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7], + // if buf.len() < 4 { + // return Err(SedonaGeometryError::Invalid( + // "Invalid WKB: buffer too small".to_string(), + // )); + // } + + let off = self.offset; + let num = match self.last_endian { + 0 => u32::from_be_bytes([ + self.buf[off], + self.buf[off + 1], + self.buf[off + 2], + self.buf[off + 3], ]), - 1 => f64::from_le_bytes([ - buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7], + 1 => u32::from_le_bytes([ + self.buf[off], + self.buf[off + 1], + self.buf[off + 2], + self.buf[off + 3], ]), other => { return Err(SedonaGeometryError::Invalid(format!( - "Unexpected byte order: {other}" + "Unexpected byte order: {other:?}" ))) } }; - - Ok(coord) + self.remaining -= 4; + self.offset += 4; + Ok(num) } - fn read_u32(buf: &[u8], byte_order: u8) -> Result { - if buf.len() < 4 { - return Err(SedonaGeometryError::Invalid( - "Invalid WKB: buffer too small".to_string(), - )); + // Given a buffer starting at the coordinate itself, parse the x and y coordinates + fn parse_coord(&mut self) -> Result { + // if buf.len() < 8 { + // return Err(SedonaGeometryError::Invalid( + // "Invalid WKB: buffer too small -> parse_coord".to_string(), + // )); + // } + if self.remaining < 8 { + return Err(SedonaGeometryError::Invalid(format!( + "Invalid WKB: buffer too small. At offset: {}. Need 8 bytes.", + self.offset + ))); } - match byte_order { - 0 => Ok(u32::from_be_bytes([buf[0], buf[1], buf[2], buf[3]])), - 1 => Ok(u32::from_le_bytes([buf[0], buf[1], buf[2], buf[3]])), + let buf = &self.buf; + let off = self.offset; + let coord: f64 = match self.last_endian { + 0 => f64::from_be_bytes([ + buf[off], + buf[off + 1], + buf[off + 2], + buf[off + 3], + buf[off + 4], + buf[off + 5], + buf[off + 6], + buf[off + 7], + ]), + 1 => f64::from_le_bytes([ + buf[off], + buf[off + 1], + buf[off + 2], + buf[off + 3], + buf[off + 4], + buf[off + 5], + buf[off + 6], + buf[off + 7], + ]), other => { return Err(SedonaGeometryError::Invalid(format!( - "Unexpected byte order: {other}" + "Unexpected byte order: {other:?}" ))) } - } + }; + self.remaining -= 8; + self.offset += 8; + + Ok(coord) } // Parses the top-level dimension of the geometry - fn parse_dimensions(buf: &[u8]) -> Result { - if buf.len() < 9 { - return Err(SedonaGeometryError::Invalid( - "Invalid WKB: buffer too small -> parse_dimensions".to_string(), - )); - } - - let byte_order = buf[0]; - - let code = Self::read_u32(&buf[1..5], byte_order)?; + fn parse_dimensions(&mut self) -> Result { + // if buf.len() < 9 { + // return Err(SedonaGeometryError::Invalid( + // "Invalid WKB: buffer too small -> parse_dimensions".to_string(), + // )); + // } + + // let byte_order = buf[0]; + self.read_endian()?; + + // let code = self.read_u32(&buf[1..5], byte_order)?; + let code = self.read_u32()?; match code / 1000 { 0 => Ok(Dimensions::Xy), From 4e3e52568c7e176f92107d4c2f71385074aa3437 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Fri, 24 Oct 2025 22:31:22 -0700 Subject: [PATCH 36/41] Fix st_haszm to map sedona errors --- rust/sedona-functions/src/st_haszm.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/rust/sedona-functions/src/st_haszm.rs b/rust/sedona-functions/src/st_haszm.rs index c279ecef..6cd7c6ba 100644 --- a/rust/sedona-functions/src/st_haszm.rs +++ b/rust/sedona-functions/src/st_haszm.rs @@ -19,7 +19,7 @@ use std::sync::Arc; use crate::executor::WkbBytesExecutor; use arrow_array::builder::BooleanBuilder; use arrow_schema::DataType; -use datafusion_common::error::Result; +use datafusion_common::{error::Result, DataFusionError}; use datafusion_expr::{ scalar_doc_sections::DOC_SECTION_OTHER, ColumnarValue, Documentation, Volatility, }; @@ -108,8 +108,10 @@ impl SedonaScalarKernel for STHasZm { /// Fast-path inference of geometry type name from raw WKB bytes fn invoke_scalar(buf: &[u8], dim_index: usize) -> Result> { - let header = WkbHeader::try_new(buf)?; - let top_level_dimensions = header.dimensions()?; + let header = WkbHeader::try_new(buf).map_err(|e| DataFusionError::External(Box::new(e)))?; + let top_level_dimensions = header + .dimensions() + .map_err(|e| DataFusionError::External(Box::new(e)))?; // Infer dimension based on first coordinate dimension for cases where it differs from top-level // e.g GEOMETRYCOLLECTION (POINT Z (1 2 3)) From 92dd670aed688e0f94dc0bec3b0dd4135b03039d Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Mon, 27 Oct 2025 23:59:37 -0700 Subject: [PATCH 37/41] Fix bug, so dimensions supports EWKB and ISO WKB --- Cargo.lock | 1 + rust/sedona-geometry/Cargo.toml | 1 + rust/sedona-geometry/src/wkb_header.rs | 380 ++++++++++++++++++------- rust/sedona-testing/src/fixtures.rs | 129 +++++++++ 4 files changed, 406 insertions(+), 105 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 94550ec3..4ce2405e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4975,6 +4975,7 @@ dependencies = [ "geo-types", "lru", "rstest", + "sedona-testing", "serde", "serde_json", "serde_with", diff --git a/rust/sedona-geometry/Cargo.toml b/rust/sedona-geometry/Cargo.toml index 8f127589..7d3a45c1 100644 --- a/rust/sedona-geometry/Cargo.toml +++ b/rust/sedona-geometry/Cargo.toml @@ -30,6 +30,7 @@ result_large_err = "allow" [dev-dependencies] geo-types = { workspace = true } rstest = { workspace = true } +sedona-testing = { path = "../sedona-testing" } serde_json = { workspace = true } wkt = { workspace = true } diff --git a/rust/sedona-geometry/src/wkb_header.rs b/rust/sedona-geometry/src/wkb_header.rs index 48204d55..84cbce0f 100644 --- a/rust/sedona-geometry/src/wkb_header.rs +++ b/rust/sedona-geometry/src/wkb_header.rs @@ -20,10 +20,13 @@ use geo_traits::Dimensions; use crate::error::SedonaGeometryError; use crate::types::GeometryTypeId; +const Z_FLAG_BIT: u32 = 0x80000000; +const M_FLAG_BIT: u32 = 0x40000000; const SRID_FLAG_BIT: u32 = 0x20000000; /// Fast-path WKB header parser /// Performs operations lazily and caches them after the first computation +#[derive(Debug)] pub struct WkbHeader { geometry_type: u32, // Not applicable for a point @@ -78,13 +81,24 @@ impl WkbHeader { let first_geom_idx = wkb_buffer.first_geom_idx()?; // ERROR HERE if let Some(i) = first_geom_idx { - // first_geom_dimensions = Some(self.parse_dimensions(&buf[i..])?); + // For simple geometries (POINT, LINESTRING, POLYGON), first_geom_idx returns 0 + // But we need to skip the SRID if it's present + let mut buffer_start = i; + if geometry_type & SRID_FLAG_BIT != 0 { + // Skip endian (1) + geometry_type (4) + SRID (4) = 9 bytes + buffer_start = 9; + } else { + // Skip endian (1) + geometry_type (4) = 5 bytes + buffer_start = 5; + } + // For parse_dimensions, we need to pass the buffer starting from the geometry header println!("starting parse_dimensions buf: {:?}", &buf[i..]); let mut wkb_buffer = WkbBuffer::new(&buf[i..]); // Reset: TODO: clean up later first_geom_dimensions = Some(wkb_buffer.parse_dimensions()?); // (first_x, first_y) = self.first_xy_coord(&buf[i..])?; + // For first_xy_coord, we need to pass the buffer starting from the geometry header let mut wkb_buffer = WkbBuffer::new(&buf[i..]); // Reset: TODO: clean up later println!( "first_xy_coord: buf: {:?}", @@ -189,19 +203,7 @@ impl WkbHeader { /// Returns the top-level dimension of the WKB pub fn dimensions(&self) -> Result { - let dimensions = match self.geometry_type / 1000 { - 0 => Dimensions::Xy, - 1 => Dimensions::Xyz, - 2 => Dimensions::Xym, - 3 => Dimensions::Xyzm, - _ => { - return Err(SedonaGeometryError::Invalid(format!( - "Unexpected code: {}", - self.geometry_type - ))) - } - }; - Ok(dimensions) + calc_dimensions(self.geometry_type) } /// Returns the dimensions of the first coordinate of the geometry @@ -232,19 +234,9 @@ impl<'a> WkbBuffer<'a> { // non-collection geometry (POINT, LINESTRING, or POLYGON), or None if empty // For POINT, LINESTRING, POLYGON, returns 0 as it already is a non-collection geometry fn first_geom_idx(&mut self) -> Result, SedonaGeometryError> { - // if buf.len() < 5 { - // return Err(SedonaGeometryError::Invalid( - // "Invalid WKB: buffer too small".to_string(), - // )); - // } - - // let byte_order = buf[0]; - // let geometry_type = self.read_u32(&buf[1..5], byte_order)?; - println!("first_geom_idx: buf: {:?}", &self.buf[self.offset..]); self.read_endian()?; let geometry_type = self.read_u32()?; let geometry_type_id = GeometryTypeId::try_from_wkb_id(geometry_type & 0x7)?; - println!("geometry_type_id: {geometry_type_id:?}"); match geometry_type_id { GeometryTypeId::Point | GeometryTypeId::LineString | GeometryTypeId::Polygon => { @@ -254,19 +246,6 @@ impl<'a> WkbBuffer<'a> { | GeometryTypeId::MultiLineString | GeometryTypeId::MultiPolygon | GeometryTypeId::GeometryCollection => { - // if buf.len() < 9 { - // return Err(SedonaGeometryError::Invalid( - // "Invalid WKB: buffer too small".to_string(), - // )); - // } - // let num_geometries = self.read_u32(&buf[5..9], byte_order)?; - let num_geometries = self.read_u32()?; - - if num_geometries == 0 { - return Ok(None); - // return Ok(Some(0)) - } - // let mut i = 9; if geometry_type & SRID_FLAG_BIT != 0 { // i += 4; @@ -274,9 +253,19 @@ impl<'a> WkbBuffer<'a> { self.read_u32()?; } + // let num_geometries = self.read_u32(&buf[5..9], byte_order)?; + let num_geometries = self.read_u32()?; + + if num_geometries == 0 { + return Ok(None); + } + // Recursive call to get the first geom of the first nested geometry // Add to current offset of i // let off = self.first_geom_idx(&buf[i..])?; + // if self.offset >= self.buf.len() { + // return Ok(None); + // } let mut nested_buffer = WkbBuffer::new(&self.buf[self.offset..]); let off = nested_buffer.first_geom_idx()?; // let off = self.first_geom_idx()?; @@ -297,15 +286,7 @@ impl<'a> WkbBuffer<'a> { // Given a point, linestring, or polygon, return the first xy coordinate // If the geometry, is empty, (NaN, NaN) is returned fn first_xy_coord(&mut self) -> Result<(f64, f64), SedonaGeometryError> { - // if buf.len() < 5 { - // return Err(SedonaGeometryError::Invalid( - // "Invalid WKB: buffer too small -> first_xy".to_string(), - // )); - // } - - // let byte_order = buf[0]; self.read_endian()?; - // let geometry_type = self.read_u32(&buf[1..5], byte_order)?; let geometry_type = self.read_u32()?; let geometry_type_id = GeometryTypeId::try_from_wkb_id(geometry_type & 0x7)?; @@ -323,34 +304,17 @@ impl<'a> WkbBuffer<'a> { geometry_type_id, GeometryTypeId::LineString | GeometryTypeId::Polygon ) { - // if buf.len() < i + 4 { - // return Err(SedonaGeometryError::Invalid(format!( - // "Invalid WKB: buffer too small -> first_xy3 {} is not < {}", - // buf.len(), - // i + 4 - // ))); - // } - // let size = self.read_u32(&buf[i..i + 4], byte_order)?; let size = self.read_u32()?; // (NaN, NaN) for empty geometries if size == 0 { return Ok((f64::NAN, f64::NAN)); } - // + 4 for size - // i += 4; // For POLYGON, after the number of rings, the next 4 bytes are the // number of points in the exterior ring. We must skip that count to // land on the first coordinate's x value. if geometry_type_id == GeometryTypeId::Polygon { - // if buf.len() < i + 4 { - // return Err(SedonaGeometryError::Invalid(format!( - // "Invalid WKB: buffer too small -> polygon first ring size {} is not < {}", - // buf.len(), - // i + 4 - // ))); - // } // let ring0_num_points = self.read_u32(&buf[i..i + 4], byte_order)?; let ring0_num_points = self.read_u32()?; @@ -362,15 +326,6 @@ impl<'a> WkbBuffer<'a> { } } - // if buf.len() < i + 8 { - // return Err(SedonaGeometryError::Invalid(format!( - // "Invalid WKB: buffer too small -> first_xy4 {} is not < {}", - // i + 8, - // buf.len() - // ))); - // } - // let x = self.parse_coord(&buf[i..], byte_order)?; - // let y = self.parse_coord(&buf[i + 8..], byte_order)?; let x = self.parse_coord()?; let y = self.parse_coord()?; Ok((x, y)) @@ -396,11 +351,6 @@ impl<'a> WkbBuffer<'a> { self.offset ))); } - // if buf.len() < 4 { - // return Err(SedonaGeometryError::Invalid( - // "Invalid WKB: buffer too small".to_string(), - // )); - // } let off = self.offset; let num = match self.last_endian { @@ -429,11 +379,6 @@ impl<'a> WkbBuffer<'a> { // Given a buffer starting at the coordinate itself, parse the x and y coordinates fn parse_coord(&mut self) -> Result { - // if buf.len() < 8 { - // return Err(SedonaGeometryError::Invalid( - // "Invalid WKB: buffer too small -> parse_coord".to_string(), - // )); - // } if self.remaining < 8 { return Err(SedonaGeometryError::Invalid(format!( "Invalid WKB: buffer too small. At offset: {}. Need 8 bytes.", @@ -478,28 +423,49 @@ impl<'a> WkbBuffer<'a> { // Parses the top-level dimension of the geometry fn parse_dimensions(&mut self) -> Result { - // if buf.len() < 9 { - // return Err(SedonaGeometryError::Invalid( - // "Invalid WKB: buffer too small -> parse_dimensions".to_string(), - // )); - // } - // let byte_order = buf[0]; self.read_endian()?; + println!("here: endian: {:?}", self.last_endian); // let code = self.read_u32(&buf[1..5], byte_order)?; + println!( + "here: code: {:?}", + &self.buf[self.offset..self.offset + 4].to_vec() + ); let code = self.read_u32()?; + calc_dimensions(code) + } +} - match code / 1000 { - 0 => Ok(Dimensions::Xy), - 1 => Ok(Dimensions::Xyz), - 2 => Ok(Dimensions::Xym), - 3 => Ok(Dimensions::Xyzm), - _ => { - return Err(SedonaGeometryError::Invalid(format!( - "Unexpected code: {code:?}" - ))) - } +fn calc_dimensions(code: u32) -> Result { + // Check for EWKB Z and M flags + let hasz = (code & Z_FLAG_BIT) != 0; + let hasm = (code & M_FLAG_BIT) != 0; + + match (hasz, hasm) { + (false, false) => {} + // If either flag is set, this must be EWKB (and not ISO WKB) + (true, false) => return Ok(Dimensions::Xyz), + (false, true) => return Ok(Dimensions::Xym), + (true, true) => return Ok(Dimensions::Xyzm), + } + + // if SRID flag is set, then it must be EWKB with no z or m + if code & SRID_FLAG_BIT != 0 { + return Ok(Dimensions::Xy); + } + + // Interpret as ISO WKB + match code / 1000 { + 0 => Ok(Dimensions::Xy), + 1 => Ok(Dimensions::Xyz), + 2 => Ok(Dimensions::Xym), + 3 => Ok(Dimensions::Xyzm), + _ => { + return Err(SedonaGeometryError::Invalid(format!( + "Unexpected code parse_dimensions: {:?}", + code + ))) } } } @@ -642,14 +608,126 @@ mod tests { assert_eq!(header.size(), 0); } - // #[test] - // fn srid() { - // // This doesn't work - // let wkb = make_wkb("SRID=4326;POINT (1 2)"); - // println!("wkb: {:?}", wkb); - // let header = WkbHeader::try_new(&wkb).unwrap(); - // assert_eq!(header.srid(), 4326); - // } + #[test] + fn ewkb_basic() { + use sedona_testing::fixtures::*; + + // Test POINT with SRID 4326 + let header = WkbHeader::try_new(&POINT_WITH_SRID_4326_EWKB).unwrap(); + assert_eq!(header.srid(), 4326); + assert_eq!(header.geometry_type_id().unwrap(), GeometryTypeId::Point); + assert_eq!(header.first_xy(), (1.0, 2.0)); + assert_eq!(header.dimensions().unwrap(), Dimensions::Xy); + + // Test POINT Z with SRID 3857 + let header = WkbHeader::try_new(&POINT_Z_WITH_SRID_3857_EWKB).unwrap(); + assert_eq!(header.srid(), 3857); + assert_eq!(header.geometry_type_id().unwrap(), GeometryTypeId::Point); + assert_eq!(header.first_xy(), (1.0, 2.0)); + assert_eq!(header.dimensions().unwrap(), Dimensions::Xyz); + + // Test POINT M with SRID 4326 + let header = WkbHeader::try_new(&POINT_M_WITH_SRID_4326_EWKB).unwrap(); + assert_eq!(header.srid(), 4326); + assert_eq!(header.geometry_type_id().unwrap(), GeometryTypeId::Point); + assert_eq!(header.first_xy(), (1.0, 2.0)); + assert_eq!(header.dimensions().unwrap(), Dimensions::Xym); + + // Test POINT ZM with SRID 4326 + let header = WkbHeader::try_new(&POINT_ZM_WITH_SRID_4326_EWKB).unwrap(); + assert_eq!(header.srid(), 4326); + assert_eq!(header.geometry_type_id().unwrap(), GeometryTypeId::Point); + assert_eq!(header.first_xy(), (1.0, 2.0)); + assert_eq!(header.dimensions().unwrap(), Dimensions::Xyzm); + } + + #[test] + fn srid_linestring() { + use sedona_testing::fixtures::*; + + let header = WkbHeader::try_new(&LINESTRING_WITH_SRID_4326_EWKB).unwrap(); + assert_eq!(header.srid(), 4326); + assert_eq!( + header.geometry_type_id().unwrap(), + GeometryTypeId::LineString + ); + assert_eq!(header.size(), 2); + assert_eq!(header.first_xy(), (1.0, 2.0)); + assert_eq!(header.dimensions().unwrap(), Dimensions::Xy); + } + + #[test] + fn srid_polygon() { + use sedona_testing::fixtures::*; + + let header = WkbHeader::try_new(&POLYGON_WITH_SRID_4326_EWKB).unwrap(); + assert_eq!(header.srid(), 4326); + assert_eq!(header.geometry_type_id().unwrap(), GeometryTypeId::Polygon); + assert_eq!(header.size(), 1); + assert_eq!(header.first_xy(), (0.0, 0.0)); + assert_eq!(header.dimensions().unwrap(), Dimensions::Xy); + } + + #[test] + fn multipoint_with_srid() { + use sedona_testing::fixtures::*; + + let header = WkbHeader::try_new(&MULTIPOINT_WITH_SRID_4326_EWKB).unwrap(); + assert_eq!(header.srid(), 4326); + assert_eq!( + header.geometry_type_id().unwrap(), + GeometryTypeId::MultiPoint + ); + assert_eq!(header.size(), 2); + assert_eq!(header.first_xy(), (1.0, 2.0)); + assert_eq!(header.dimensions().unwrap(), Dimensions::Xy); + } + + #[test] + fn geometrycollection_with_srid() { + use sedona_testing::fixtures::*; + + let header = WkbHeader::try_new(&GEOMETRYCOLLECTION_WITH_SRID_4326_EWKB).unwrap(); + assert_eq!(header.srid(), 4326); + assert_eq!( + header.geometry_type_id().unwrap(), + GeometryTypeId::GeometryCollection + ); + assert_eq!(header.size(), 1); + assert_eq!(header.first_xy(), (1.0, 2.0)); + assert_eq!(header.dimensions().unwrap(), Dimensions::Xy); + assert_eq!(header.first_geom_dimensions().unwrap(), Dimensions::Xy); + } + + #[test] + fn srid_empty_geometries_with_srid() { + use sedona_testing::fixtures::*; + + // Test POINT EMPTY with SRID + let header = WkbHeader::try_new(&POINT_EMPTY_WITH_SRID_4326_EWKB).unwrap(); + assert_eq!(header.srid(), 4326); + assert_eq!(header.geometry_type_id().unwrap(), GeometryTypeId::Point); + assert_eq!(header.dimensions().unwrap(), Dimensions::Xy); + + // Test GEOMETRYCOLLECTION EMPTY with SRID + let header = WkbHeader::try_new(&GEOMETRYCOLLECTION_EMPTY_WITH_SRID_4326_EWKB).unwrap(); + assert_eq!(header.srid(), 4326); + assert_eq!( + header.geometry_type_id().unwrap(), + GeometryTypeId::GeometryCollection + ); + assert_eq!(header.size(), 0); + assert_eq!(header.dimensions().unwrap(), Dimensions::Xy); + assert_eq!(header.first_geom_dimensions(), None); + } + + #[test] + fn srid_no_srid_flag() { + // Test that regular WKB (without SRID flag) returns 0 for SRID + let wkb = make_wkb("POINT (1 2)"); + let header = WkbHeader::try_new(&wkb).unwrap(); + assert_eq!(header.srid(), 0); + } #[test] fn first_xy() { @@ -882,4 +960,96 @@ mod tests { let header = WkbHeader::try_new(&wkb).unwrap(); assert_eq!(header.first_geom_dimensions(), None); } + + #[test] + fn incomplete_buffers() { + // Test various incomplete buffer scenarios to ensure proper error handling + + // Empty buffer + let result = WkbHeader::try_new(&[]); + assert!(result.is_err()); + + // Endian Byte only, missing geometry type + let result = WkbHeader::try_new(&[0x01]); + assert!(result.is_err()); + + // Partial geometry type + let result = WkbHeader::try_new(&[0x01, 0x01, 0x00]); + assert!(result.is_err()); + + // Point with only a endian and geometry type + let result = WkbHeader::try_new(&[0x01, 0x01, 0x00, 0x00, 0x20]); + assert!(result.is_err()); + + // Point with only half a coordinate + let result = WkbHeader::try_new(&[0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]); + assert!(result.is_err()); + + // Point with exactly one coordinate (i.e. x, but no y) + let result = WkbHeader::try_new(&[ + 0x01, // endian + 0x01, 0x00, 0x00, 0x00, // geometry type + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // coord + ]); + assert!(result.is_err()); + + // GeometryCollection with an incomplete point + let result = WkbHeader::try_new(&[ + 0x01, 0x07, 0x00, 0x00, 0x00, + // 0xe6, 0x10, 0x00, 0x00, // SRID 4326 (little endian) + 0x01, 0x00, 0x00, 0x00, // number of geometries (1) + // Nested (incomplete) POINT + 0x01, // endian + 0x01, 0x00, 0x00, 0x00, // geometry type + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x3f, // coord + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ]); + assert!(result.is_err()); + } + + #[test] + fn incomplete_ewkb_buffers() { + // Test incomplete EWKB buffers specifically + + // EWKB with SRID flag but missing the extra 4 bytes for the SRID + let result = WkbHeader::try_new(&[ + 0x01, // endian + 0x01, 0x00, 0x00, 0x20, // SRID flag is set + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // coord + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // coord + ]); + assert!(result.is_err()); + + // Nested geometry (MultiPoint) has SRID flag set, but missing the extra 4 bytes for it + let result = WkbHeader::try_new(&[ + 0x01, // endian + 0x04, 0x00, 0x00, 0x20, // geometry type with SRID flag is set + 0xe6, 0x10, 0x00, 0x00, // SRID 4326 + // Nested point + 0x01, // endian + 0x01, 0x00, 0x00, 0x20, // geometry type withSRID flag is set + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // coord + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // coord + ]); + assert!(result.is_err()); + + // GeometryCollection with an incomplete linestring + let result = WkbHeader::try_new(&[ + 0x01, 0x07, 0x00, 0x00, 0x20, 0xe6, 0x10, 0x00, 0x00, // SRID 4326 + 0x01, 0x00, 0x00, 0x00, // number of geometries (1) + 0x01, 0x02, 0x00, 0x00, 0x20, 0xe6, 0x10, 0x00, 0x00, // SRID 4326 + 0x01, 0x00, 0x00, 0x00, // size + ]); + assert!(result.is_err()); + } + + #[test] + fn invalid_byte_order() { + // Test invalid byte order values + let result = WkbHeader::try_new(&[0x02, 0x01, 0x00, 0x00, 0x00]); + assert!(result.is_err()); + + let result = WkbHeader::try_new(&[0xff, 0x01, 0x00, 0x00, 0x00]); + assert!(result.is_err()); + } } diff --git a/rust/sedona-testing/src/fixtures.rs b/rust/sedona-testing/src/fixtures.rs index 7ee5b84e..c30a17d3 100644 --- a/rust/sedona-testing/src/fixtures.rs +++ b/rust/sedona-testing/src/fixtures.rs @@ -42,6 +42,135 @@ pub const MULTIPOINT_WITH_INFERRED_Z_DIMENSION_WKB: [u8; 38] = [ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x40, // z-coordinate of point ]; +/// EWKB (Extended WKB) fixtures with SRID information +/// These are hard-coded EWKB byte arrays for testing SRID functionality + +/// EWKB for POINT (1 2) with SRID 4326 +/// Little endian, geometry type 1 (POINT) with SRID flag (0x20000000) +pub const POINT_WITH_SRID_4326_EWKB: [u8; 25] = [ + 0x01, // byte-order + 0x01, 0x00, 0x00, 0x20, // geometry type 1 (POINT) with SRID flag (0x20000000) + 0xe6, 0x10, 0x00, 0x00, // SRID 4326 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x3f, // x-coordinate 1.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, // y-coordinate 2.0 +]; + +/// EWKB for POINT Z (1 2 3) with SRID 3857 +/// Little endian, geometry type 1001 (POINT Z) with SRID flag +pub const POINT_Z_WITH_SRID_3857_EWKB: [u8; 33] = [ + 0x01, // byte-order + 0x01, 0x00, 0x00, 0xa0, // geometry type + // 0xe9, 0x03, 0x00, 0x20, // geometry type 1001 (POINT Z) with SRID flag + 0x11, 0x0f, 0x00, 0x00, // SRID 3857 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x3f, // x-coordinate 1.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, // y-coordinate 2.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x40, // z-coordinate 3.0 +]; + +pub const POINT_M_WITH_SRID_4326_EWKB: [u8; 33] = [ + 0x01, // byte-order + 0x01, 0x00, 0x00, 0x60, // geometry type + 0xe6, 0x10, 0x00, 0x00, // SRID + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x3f, // x-coordinate 1.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, // y-coordinate 2.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x40, // m-coordinate 3.0 +]; + +/// EWKB for POINT ZM (1 2 3 4) with SRID 4326 +pub const POINT_ZM_WITH_SRID_4326_EWKB: [u8; 41] = [ + 0x01, // byte-order + 0x01, 0x00, 0x00, 0xe0, // geometry type + // 0xb9, 0x0b, 0x00, 0x20, // geometry type 3001 (POINT ZM) with SRID flag + 0xe6, 0x10, 0x00, 0x00, // SRID 4326 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x3f, // x = 1.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, // y = 2.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x40, // z = 3.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x40, // m = 4.0 +]; + +/// EWKB for LINESTRING (1 2, 3 4) with SRID 4326 +/// Little endian, geometry type 2 (LINESTRING) with SRID flag +pub const LINESTRING_WITH_SRID_4326_EWKB: [u8; 45] = [ + 0x01, // byte-order + 0x02, 0x00, 0x00, 0x20, // geometry type + 0xe6, 0x10, 0x00, 0x00, // SRID 4326 + 0x02, 0x00, 0x00, 0x00, // number of points (2) + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x3f, // x1 = 1.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, // y1 = 2.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x40, // x2 = 3.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x40, // y2 = 4.0 +]; + +/// EWKB for POLYGON ((0 0, 0 1, 1 0, 0 0)) with SRID 4326 +/// Little endian, geometry type 3 (POLYGON) with SRID flag +pub const POLYGON_WITH_SRID_4326_EWKB: [u8; 81] = [ + 0x01, // byte-order + 0x03, 0x00, 0x00, 0x20, // geometry type 3 (POLYGON) with SRID flag + 0xe6, 0x10, 0x00, 0x00, // SRID 4326 + 0x01, 0x00, 0x00, 0x00, // number of rings (1) + 0x04, 0x00, 0x00, 0x00, // number of points in exterior ring (4) + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // x1 = 0.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // y1 = 0.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // x2 = 0.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x3f, // y2 = 1.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x3f, // x3 = 1.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // y3 = 0.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // x4 = 0.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // y4 = 0.0 +]; + +/// EWKB for MULTIPOINT ((1 2), (3 4)) with SRID 4326 +/// Little endian, geometry type 4 (MULTIPOINT) with SRID flag +pub const MULTIPOINT_WITH_SRID_4326_EWKB: [u8; 55] = [ + 0x01, // byte-order + 0x04, 0x00, 0x00, 0x20, // geometry type 4 (MULTIPOINT) with SRID flag + 0xe6, 0x10, 0x00, 0x00, // SRID 4326 + 0x02, 0x00, 0x00, 0x00, // number of points (2) + // First point + 0x01, // byte-order + 0x01, 0x00, 0x00, 0x00, // geometry type 1 (POINT) - no SRID flag + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x3f, // x1 = 1.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, // y1 = 2.0 + // Second point + 0x01, // byte-order + 0x01, 0x00, 0x00, 0x00, // geometry type 1 (POINT) - no SRID flag + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x40, // x2 = 3.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x40, // y2 = 4.0 +]; + +/// EWKB for GEOMETRYCOLLECTION (POINT (1 2)) with SRID 4326 +/// Little endian, geometry type 7 (GEOMETRYCOLLECTION) with SRID flag +pub const GEOMETRYCOLLECTION_WITH_SRID_4326_EWKB: [u8; 34] = [ + 0x01, // byte-order + 0x07, 0x00, 0x00, 0x20, // geometry type 7 (GEOMETRYCOLLECTION) with SRID flag + 0xe6, 0x10, 0x00, 0x00, // SRID 4326 + 0x01, 0x00, 0x00, 0x00, // number of geometries (1) + // Nested POINT + 0x01, // byte-order + 0x01, 0x00, 0x00, 0x00, // geometry type 1 (POINT) - no SRID flag + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x3f, // x = 1.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, // y = 2.0 +]; + +/// EWKB for POINT EMPTY with SRID 4326 +/// Little endian, geometry type 1 (POINT) with SRID flag +pub const POINT_EMPTY_WITH_SRID_4326_EWKB: [u8; 25] = [ + 0x01, // byte-order + 0x01, 0x00, 0x00, 0x20, // geometry type 1 (POINT) with SRID flag + 0xe6, 0x10, 0x00, 0x00, // SRID 4326 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf8, 0x7f, // x = NaN + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf8, 0x7f, // y = NaN +]; + +/// EWKB for GEOMETRYCOLLECTION EMPTY with SRID 4326 +/// Little endian, geometry type 7 (GEOMETRYCOLLECTION) with SRID flag +pub const GEOMETRYCOLLECTION_EMPTY_WITH_SRID_4326_EWKB: [u8; 13] = [ + 0x01, // byte-order + 0x07, 0x00, 0x00, 0x20, // geometry type 7 (GEOMETRYCOLLECTION) with SRID flag + 0xe6, 0x10, 0x00, 0x00, // SRID 4326 + 0x00, 0x00, 0x00, 0x00, // number of geometries (0) +]; + pub fn louisiana() -> LineString where T: WktFloat + Default + FromStr, From e4d269a166678e3cea79963d8030b7eddce2613a Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Tue, 28 Oct 2025 00:05:19 -0700 Subject: [PATCH 38/41] Clean up --- rust/sedona-geometry/src/wkb_header.rs | 104 ++----------------------- 1 file changed, 7 insertions(+), 97 deletions(-) diff --git a/rust/sedona-geometry/src/wkb_header.rs b/rust/sedona-geometry/src/wkb_header.rs index 84cbce0f..74698e8f 100644 --- a/rust/sedona-geometry/src/wkb_header.rs +++ b/rust/sedona-geometry/src/wkb_header.rs @@ -54,12 +54,10 @@ impl WkbHeader { let geometry_type_id = GeometryTypeId::try_from_wkb_id(geometry_type & 0x7)?; - // let mut i = 5; let mut srid = 0; // if EWKB if geometry_type & SRID_FLAG_BIT != 0 { srid = wkb_buffer.read_u32()?; - // i = 9; } let size = if geometry_type_id == GeometryTypeId::Point { @@ -68,9 +66,6 @@ impl WkbHeader { } else { wkb_buffer.read_u32()? }; - // println!("geometry_type_id: {geometry_type_id:?}"); - // println!("size: {size}"); - // println!("buf: {:?}", &wkb_buffer.buf[wkb_buffer.offset..]); // Default values for empty geometries let first_x; @@ -81,29 +76,12 @@ impl WkbHeader { let first_geom_idx = wkb_buffer.first_geom_idx()?; // ERROR HERE if let Some(i) = first_geom_idx { - // For simple geometries (POINT, LINESTRING, POLYGON), first_geom_idx returns 0 - // But we need to skip the SRID if it's present - let mut buffer_start = i; - if geometry_type & SRID_FLAG_BIT != 0 { - // Skip endian (1) + geometry_type (4) + SRID (4) = 9 bytes - buffer_start = 9; - } else { - // Skip endian (1) + geometry_type (4) = 5 bytes - buffer_start = 5; - } - // For parse_dimensions, we need to pass the buffer starting from the geometry header - println!("starting parse_dimensions buf: {:?}", &buf[i..]); let mut wkb_buffer = WkbBuffer::new(&buf[i..]); // Reset: TODO: clean up later first_geom_dimensions = Some(wkb_buffer.parse_dimensions()?); - // (first_x, first_y) = self.first_xy_coord(&buf[i..])?; // For first_xy_coord, we need to pass the buffer starting from the geometry header let mut wkb_buffer = WkbBuffer::new(&buf[i..]); // Reset: TODO: clean up later - println!( - "first_xy_coord: buf: {:?}", - &wkb_buffer.buf[wkb_buffer.offset..] - ); (first_x, first_y) = wkb_buffer.first_xy_coord()?; } else { first_geom_dimensions = None; @@ -111,49 +89,6 @@ impl WkbHeader { first_y = f64::NAN; } - // if buf.len() < 5 { - // return Err(SedonaGeometryError::Invalid( - // "Invalid WKB: buffer too small".to_string(), - // )); - // }; - - // let byte_order = buf[0]; - - // // Parse geometry type - // let geometry_type = self.read_u32(&buf[1..5], byte_order)?; - - // let geometry_type_id = GeometryTypeId::try_from_wkb_id(geometry_type & 0x7)?; - - // let mut i = 5; - // let mut srid = 0; - // // if EWKB - // if geometry_type & SRID_FLAG_BIT != 0 { - // srid = self.read_u32(&buf[5..9], byte_order)?; - // i = 9; - // } - - // let size = if geometry_type_id == GeometryTypeId::Point { - // // Dummy value for a point - // 1 - // } else { - // self.read_u32(&buf[i..i + 4], byte_order)? - // }; - - // // Default values for empty geometries - // let first_x; - // let first_y; - // let first_geom_dimensions: Option; - - // let first_geom_idx = self.first_geom_idx(buf)?; - // if let Some(i) = first_geom_idx { - // first_geom_dimensions = Some(self.parse_dimensions(&buf[i..])?); - // (first_x, first_y) = self.first_xy_coord(&buf[i..])?; - // } else { - // first_geom_dimensions = None; - // first_x = f64::NAN; - // first_y = f64::NAN; - // } - Ok(Self { geometry_type, srid, @@ -246,14 +181,11 @@ impl<'a> WkbBuffer<'a> { | GeometryTypeId::MultiLineString | GeometryTypeId::MultiPolygon | GeometryTypeId::GeometryCollection => { - // let mut i = 9; if geometry_type & SRID_FLAG_BIT != 0 { - // i += 4; // Skip the SRID self.read_u32()?; } - // let num_geometries = self.read_u32(&buf[5..9], byte_order)?; let num_geometries = self.read_u32()?; if num_geometries == 0 { @@ -262,24 +194,17 @@ impl<'a> WkbBuffer<'a> { // Recursive call to get the first geom of the first nested geometry // Add to current offset of i - // let off = self.first_geom_idx(&buf[i..])?; - // if self.offset >= self.buf.len() { - // return Ok(None); - // } let mut nested_buffer = WkbBuffer::new(&self.buf[self.offset..]); let off = nested_buffer.first_geom_idx()?; - // let off = self.first_geom_idx()?; if let Some(off) = off { Ok(Some(self.offset + off)) } else { Ok(None) } } - _ => { - return Err(SedonaGeometryError::Invalid(format!( - "Unexpected geometry type: {geometry_type_id:?}" - ))) - } + _ => Err(SedonaGeometryError::Invalid(format!( + "Unexpected geometry type: {geometry_type_id:?}" + ))), } } @@ -291,12 +216,8 @@ impl<'a> WkbBuffer<'a> { let geometry_type_id = GeometryTypeId::try_from_wkb_id(geometry_type & 0x7)?; - // 1 (byte_order) + 4 (geometry_type) = 5 - // let mut i = 5; - // Skip the SRID if it's present if geometry_type & SRID_FLAG_BIT != 0 { - // i += 4; self.read_u32()?; } @@ -315,14 +236,12 @@ impl<'a> WkbBuffer<'a> { // number of points in the exterior ring. We must skip that count to // land on the first coordinate's x value. if geometry_type_id == GeometryTypeId::Polygon { - // let ring0_num_points = self.read_u32(&buf[i..i + 4], byte_order)?; let ring0_num_points = self.read_u32()?; // (NaN, NaN) for empty first ring if ring0_num_points == 0 { return Ok((f64::NAN, f64::NAN)); } - // i += 4; } } @@ -423,15 +342,8 @@ impl<'a> WkbBuffer<'a> { // Parses the top-level dimension of the geometry fn parse_dimensions(&mut self) -> Result { - // let byte_order = buf[0]; self.read_endian()?; - println!("here: endian: {:?}", self.last_endian); - // let code = self.read_u32(&buf[1..5], byte_order)?; - println!( - "here: code: {:?}", - &self.buf[self.offset..self.offset + 4].to_vec() - ); let code = self.read_u32()?; calc_dimensions(code) } @@ -461,12 +373,10 @@ fn calc_dimensions(code: u32) -> Result { 1 => Ok(Dimensions::Xyz), 2 => Ok(Dimensions::Xym), 3 => Ok(Dimensions::Xyzm), - _ => { - return Err(SedonaGeometryError::Invalid(format!( - "Unexpected code parse_dimensions: {:?}", - code - ))) - } + _ => Err(SedonaGeometryError::Invalid(format!( + "Unexpected code parse_dimensions: {:?}", + code + ))), } } From c25ac66d4aaa692692c3431d7a835fe76d7c0c97 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Tue, 28 Oct 2025 00:10:56 -0700 Subject: [PATCH 39/41] Remove parse_dimensions function and rename to read_coord --- rust/sedona-geometry/src/wkb_header.rs | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/rust/sedona-geometry/src/wkb_header.rs b/rust/sedona-geometry/src/wkb_header.rs index 74698e8f..00f7d3ae 100644 --- a/rust/sedona-geometry/src/wkb_header.rs +++ b/rust/sedona-geometry/src/wkb_header.rs @@ -74,11 +74,14 @@ impl WkbHeader { let mut wkb_buffer = WkbBuffer::new(buf); // Reset: TODO: clean up later - let first_geom_idx = wkb_buffer.first_geom_idx()?; // ERROR HERE + let first_geom_idx = wkb_buffer.first_geom_idx()?; if let Some(i) = first_geom_idx { // For parse_dimensions, we need to pass the buffer starting from the geometry header let mut wkb_buffer = WkbBuffer::new(&buf[i..]); // Reset: TODO: clean up later - first_geom_dimensions = Some(wkb_buffer.parse_dimensions()?); + // Parse dimension + wkb_buffer.read_endian()?; + let code = wkb_buffer.read_u32()?; + first_geom_dimensions = Some(calc_dimensions(code)?); // For first_xy_coord, we need to pass the buffer starting from the geometry header let mut wkb_buffer = WkbBuffer::new(&buf[i..]); // Reset: TODO: clean up later @@ -245,8 +248,8 @@ impl<'a> WkbBuffer<'a> { } } - let x = self.parse_coord()?; - let y = self.parse_coord()?; + let x = self.read_coord()?; + let y = self.read_coord()?; Ok((x, y)) } @@ -297,7 +300,7 @@ impl<'a> WkbBuffer<'a> { } // Given a buffer starting at the coordinate itself, parse the x and y coordinates - fn parse_coord(&mut self) -> Result { + fn read_coord(&mut self) -> Result { if self.remaining < 8 { return Err(SedonaGeometryError::Invalid(format!( "Invalid WKB: buffer too small. At offset: {}. Need 8 bytes.", @@ -339,14 +342,6 @@ impl<'a> WkbBuffer<'a> { Ok(coord) } - - // Parses the top-level dimension of the geometry - fn parse_dimensions(&mut self) -> Result { - self.read_endian()?; - - let code = self.read_u32()?; - calc_dimensions(code) - } } fn calc_dimensions(code: u32) -> Result { @@ -374,7 +369,7 @@ fn calc_dimensions(code: u32) -> Result { 2 => Ok(Dimensions::Xym), 3 => Ok(Dimensions::Xyzm), _ => Err(SedonaGeometryError::Invalid(format!( - "Unexpected code parse_dimensions: {:?}", + "Unexpected code: {:?}", code ))), } From 86aeb3cbdcb034e6ff7f33386ce65af20dbe6573 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Tue, 28 Oct 2025 00:13:56 -0700 Subject: [PATCH 40/41] Create set_offset() function to avoid creating new WkbBuffers --- rust/sedona-geometry/src/wkb_header.rs | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/rust/sedona-geometry/src/wkb_header.rs b/rust/sedona-geometry/src/wkb_header.rs index 00f7d3ae..715395e1 100644 --- a/rust/sedona-geometry/src/wkb_header.rs +++ b/rust/sedona-geometry/src/wkb_header.rs @@ -72,19 +72,19 @@ impl WkbHeader { let first_y; let first_geom_dimensions: Option; - let mut wkb_buffer = WkbBuffer::new(buf); // Reset: TODO: clean up later + wkb_buffer.set_offset(0); let first_geom_idx = wkb_buffer.first_geom_idx()?; if let Some(i) = first_geom_idx { - // For parse_dimensions, we need to pass the buffer starting from the geometry header - let mut wkb_buffer = WkbBuffer::new(&buf[i..]); // Reset: TODO: clean up later - // Parse dimension + // Reset to first_geom_idx and parse the dimensions + wkb_buffer.set_offset(i); + // Parse dimension wkb_buffer.read_endian()?; let code = wkb_buffer.read_u32()?; first_geom_dimensions = Some(calc_dimensions(code)?); // For first_xy_coord, we need to pass the buffer starting from the geometry header - let mut wkb_buffer = WkbBuffer::new(&buf[i..]); // Reset: TODO: clean up later + wkb_buffer.set_offset(i); (first_x, first_y) = wkb_buffer.first_xy_coord()?; } else { first_geom_dimensions = None; @@ -342,6 +342,11 @@ impl<'a> WkbBuffer<'a> { Ok(coord) } + + fn set_offset(&mut self, offset: usize) { + self.offset = offset; + self.remaining = self.buf.len() - offset; + } } fn calc_dimensions(code: u32) -> Result { From 92759f217e0d58fff3025f99149fc710059658d8 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Tue, 28 Oct 2025 00:26:03 -0700 Subject: [PATCH 41/41] Add EWKB GEOMETRYCOLLECTION w/ Z, M tests --- rust/sedona-geometry/src/wkb_header.rs | 83 +++++++++++++++++++++----- rust/sedona-testing/src/fixtures.rs | 51 ++++++++++++++-- 2 files changed, 114 insertions(+), 20 deletions(-) diff --git a/rust/sedona-geometry/src/wkb_header.rs b/rust/sedona-geometry/src/wkb_header.rs index 715395e1..c0cf226c 100644 --- a/rust/sedona-geometry/src/wkb_header.rs +++ b/rust/sedona-geometry/src/wkb_header.rs @@ -519,7 +519,7 @@ mod tests { } #[test] - fn ewkb_basic() { + fn ewkb() { use sedona_testing::fixtures::*; // Test POINT with SRID 4326 @@ -549,6 +549,57 @@ mod tests { assert_eq!(header.geometry_type_id().unwrap(), GeometryTypeId::Point); assert_eq!(header.first_xy(), (1.0, 2.0)); assert_eq!(header.dimensions().unwrap(), Dimensions::Xyzm); + + // Test GEOMETRYCOLLECTION with SRID 4326 + let header = WkbHeader::try_new(&GEOMETRYCOLLECTION_POINT_WITH_SRID_4326_EWKB).unwrap(); + assert_eq!(header.srid(), 4326); + assert_eq!( + header.geometry_type_id().unwrap(), + GeometryTypeId::GeometryCollection + ); + assert_eq!(header.size(), 1); + assert_eq!(header.first_xy(), (1.0, 2.0)); + assert_eq!(header.dimensions().unwrap(), Dimensions::Xy); + assert_eq!(header.first_geom_dimensions().unwrap(), Dimensions::Xy); + + // Test GEOMETRYCOLLECTION Z with SRID 4326 + let header = WkbHeader::try_new(&GEOMETRYCOLLECTION_POINT_Z_WITH_SRID_4326_EWKB).unwrap(); + assert_eq!(header.srid(), 4326); + assert_eq!( + header.geometry_type_id().unwrap(), + GeometryTypeId::GeometryCollection + ); + assert_eq!(header.size(), 1); + assert_eq!(header.first_xy(), (1.0, 2.0)); + // Outer dimension specified as Xy, but inner dimension is Xyz + assert_eq!(header.dimensions().unwrap(), Dimensions::Xy); + assert_eq!(header.first_geom_dimensions().unwrap(), Dimensions::Xyz); + + // Test GEOMETRYCOLLECTION M with SRID 4326 + let header = WkbHeader::try_new(&GEOMETRYCOLLECTION_POINT_M_WITH_SRID_4326_EWKB).unwrap(); + assert_eq!(header.srid(), 4326); + assert_eq!( + header.geometry_type_id().unwrap(), + GeometryTypeId::GeometryCollection + ); + assert_eq!(header.size(), 1); + assert_eq!(header.first_xy(), (1.0, 2.0)); + // Outer dimension specified as Xy, but inner dimension is Xym + assert_eq!(header.dimensions().unwrap(), Dimensions::Xy); + assert_eq!(header.first_geom_dimensions().unwrap(), Dimensions::Xym); + + // Test GEOMETRYCOLLECTION ZM with SRID 4326 + let header = WkbHeader::try_new(&GEOMETRYCOLLECTION_POINT_ZM_WITH_SRID_4326_EWKB).unwrap(); + assert_eq!(header.srid(), 4326); + assert_eq!( + header.geometry_type_id().unwrap(), + GeometryTypeId::GeometryCollection + ); + assert_eq!(header.size(), 1); + assert_eq!(header.first_xy(), (1.0, 2.0)); + // Outer dimension specified as Xy, but inner dimension is Xyzm + assert_eq!(header.dimensions().unwrap(), Dimensions::Xy); + assert_eq!(header.first_geom_dimensions().unwrap(), Dimensions::Xyzm); } #[test] @@ -593,21 +644,21 @@ mod tests { assert_eq!(header.dimensions().unwrap(), Dimensions::Xy); } - #[test] - fn geometrycollection_with_srid() { - use sedona_testing::fixtures::*; - - let header = WkbHeader::try_new(&GEOMETRYCOLLECTION_WITH_SRID_4326_EWKB).unwrap(); - assert_eq!(header.srid(), 4326); - assert_eq!( - header.geometry_type_id().unwrap(), - GeometryTypeId::GeometryCollection - ); - assert_eq!(header.size(), 1); - assert_eq!(header.first_xy(), (1.0, 2.0)); - assert_eq!(header.dimensions().unwrap(), Dimensions::Xy); - assert_eq!(header.first_geom_dimensions().unwrap(), Dimensions::Xy); - } + // #[test] + // fn geometrycollection_with_srid() { + // use sedona_testing::fixtures::*; + + // let header = WkbHeader::try_new(&GEOMETRYCOLLECTION_WITH_SRID_4326_EWKB).unwrap(); + // assert_eq!(header.srid(), 4326); + // assert_eq!( + // header.geometry_type_id().unwrap(), + // GeometryTypeId::GeometryCollection + // ); + // assert_eq!(header.size(), 1); + // assert_eq!(header.first_xy(), (1.0, 2.0)); + // assert_eq!(header.dimensions().unwrap(), Dimensions::Xy); + // assert_eq!(header.first_geom_dimensions().unwrap(), Dimensions::Xy); + // } #[test] fn srid_empty_geometries_with_srid() { diff --git a/rust/sedona-testing/src/fixtures.rs b/rust/sedona-testing/src/fixtures.rs index c30a17d3..b4c1bb15 100644 --- a/rust/sedona-testing/src/fixtures.rs +++ b/rust/sedona-testing/src/fixtures.rs @@ -42,9 +42,6 @@ pub const MULTIPOINT_WITH_INFERRED_Z_DIMENSION_WKB: [u8; 38] = [ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x40, // z-coordinate of point ]; -/// EWKB (Extended WKB) fixtures with SRID information -/// These are hard-coded EWKB byte arrays for testing SRID functionality - /// EWKB for POINT (1 2) with SRID 4326 /// Little endian, geometry type 1 (POINT) with SRID flag (0x20000000) pub const POINT_WITH_SRID_4326_EWKB: [u8; 25] = [ @@ -140,7 +137,7 @@ pub const MULTIPOINT_WITH_SRID_4326_EWKB: [u8; 55] = [ /// EWKB for GEOMETRYCOLLECTION (POINT (1 2)) with SRID 4326 /// Little endian, geometry type 7 (GEOMETRYCOLLECTION) with SRID flag -pub const GEOMETRYCOLLECTION_WITH_SRID_4326_EWKB: [u8; 34] = [ +pub const GEOMETRYCOLLECTION_POINT_WITH_SRID_4326_EWKB: [u8; 34] = [ 0x01, // byte-order 0x07, 0x00, 0x00, 0x20, // geometry type 7 (GEOMETRYCOLLECTION) with SRID flag 0xe6, 0x10, 0x00, 0x00, // SRID 4326 @@ -152,6 +149,52 @@ pub const GEOMETRYCOLLECTION_WITH_SRID_4326_EWKB: [u8; 34] = [ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, // y = 2.0 ]; +/// EWKB for GEOMETRYCOLLECTION (POINT Z (1 2 3)) with SRID 4326 +/// Little endian, geometry type 7 (GEOMETRYCOLLECTION) with SRID flag; nested POINT Z (Z flag set) +pub const GEOMETRYCOLLECTION_POINT_Z_WITH_SRID_4326_EWKB: [u8; 42] = [ + 0x01, // byte-order + 0x07, 0x00, 0x00, 0x20, // geometry type 7 (GEOMETRYCOLLECTION) with SRID flag + 0xe6, 0x10, 0x00, 0x00, // SRID 4326 + 0x01, 0x00, 0x00, 0x00, // number of geometries (1) + // Nested POINT Z + 0x01, // byte-order + 0x01, 0x00, 0x00, 0x80, // geometry type 1 (POINT) with Z flag + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x3f, // x = 1.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, // y = 2.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x40, // z = 3.0 +]; + +/// EWKB for GEOMETRYCOLLECTION (POINT M (1 2 4)) with SRID 4326 +/// Little endian, geometry type 7 (GEOMETRYCOLLECTION) with SRID flag; nested POINT M (M flag set) +pub const GEOMETRYCOLLECTION_POINT_M_WITH_SRID_4326_EWKB: [u8; 42] = [ + 0x01, // byte-order + 0x07, 0x00, 0x00, 0x20, // geometry type 7 (GEOMETRYCOLLECTION) with SRID flag + 0xe6, 0x10, 0x00, 0x00, // SRID 4326 + 0x01, 0x00, 0x00, 0x00, // number of geometries (1) + // Nested POINT M + 0x01, // byte-order + 0x01, 0x00, 0x00, 0x40, // geometry type 1 (POINT) with M flag + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x3f, // x = 1.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, // y = 2.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x40, // m = 4.0 +]; + +/// EWKB for GEOMETRYCOLLECTION (POINT ZM (1 2 3 4)) with SRID 4326 +/// Little endian, geometry type 7 (GEOMETRYCOLLECTION) with SRID flag; nested POINT ZM (Z and M flags set) +pub const GEOMETRYCOLLECTION_POINT_ZM_WITH_SRID_4326_EWKB: [u8; 50] = [ + 0x01, // byte-order + 0x07, 0x00, 0x00, 0x20, // geometry type 7 (GEOMETRYCOLLECTION) with SRID flag + 0xe6, 0x10, 0x00, 0x00, // SRID 4326 + 0x01, 0x00, 0x00, 0x00, // number of geometries (1) + // Nested POINT ZM + 0x01, // byte-order + 0x01, 0x00, 0x00, 0xc0, // geometry type 1 (POINT) with Z and M flags + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x3f, // x = 1.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, // y = 2.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x40, // z = 3.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x40, // m = 4.0 +]; + /// EWKB for POINT EMPTY with SRID 4326 /// Little endian, geometry type 1 (POINT) with SRID flag pub const POINT_EMPTY_WITH_SRID_4326_EWKB: [u8; 25] = [