diff --git a/pyiceberg/avro/reader.py b/pyiceberg/avro/reader.py index bccc772022..6ceeff2118 100644 --- a/pyiceberg/avro/reader.py +++ b/pyiceberg/avro/reader.py @@ -260,6 +260,16 @@ def skip(self, decoder: BinaryDecoder) -> None: decoder.skip_bytes() +class GeographyReader(BinaryReader): + """Reads a geography from the stream. + """ + + +class GeometryReader(BinaryReader): + """Reads a geometry from the stream. + """ + + @dataclass(frozen=True, init=False) class DecimalReader(Reader): """Reads a value as a decimal. diff --git a/pyiceberg/avro/resolver.py b/pyiceberg/avro/resolver.py index c4ec393513..3a7e9ee4af 100644 --- a/pyiceberg/avro/resolver.py +++ b/pyiceberg/avro/resolver.py @@ -35,6 +35,8 @@ DoubleReader, FixedReader, FloatReader, + GeographyReader, + GeometryReader, IntegerReader, ListReader, MapReader, @@ -60,6 +62,8 @@ DoubleWriter, FixedWriter, FloatWriter, + GeographyWriter, + GeometryWriter, IntegerWriter, ListWriter, MapWriter, @@ -96,6 +100,8 @@ FloatType, IcebergType, IntegerType, + GeographyType, + GeometryType, ListType, LongType, MapType, @@ -181,6 +187,12 @@ def visit_float(self, float_type: FloatType) -> Writer: def visit_double(self, double_type: DoubleType) -> Writer: return DoubleWriter() + def visit_geography(self, geography_type: GeographyType) -> Writer: + return GeographyWriter() + + def visit_geometry(self, geometry_type: GeometryType) -> Writer: + return GeometryWriter() + def visit_date(self, date_type: DateType) -> Writer: return DateWriter() @@ -338,6 +350,12 @@ def visit_decimal(self, decimal_type: DecimalType, partner: Optional[IcebergType def visit_date(self, date_type: DateType, partner: Optional[IcebergType]) -> Writer: return DateWriter() + def visit_geography(self, geography_type: GeographyType, partner: Optional[IcebergType]) -> Writer: + return GeographyWriter() + + def visit_geometry(self, geometry_type: GeometryType, partner: Optional[IcebergType]) -> Writer: + return GeometryWriter() + def visit_time(self, time_type: TimeType, partner: Optional[IcebergType]) -> Writer: return TimeWriter() @@ -504,6 +522,12 @@ def visit_fixed(self, fixed_type: FixedType, partner: Optional[IcebergType]) -> def visit_binary(self, binary_type: BinaryType, partner: Optional[IcebergType]) -> Reader: return BinaryReader() + def visit_geography(self, geography_type: GeographyType, partner: Optional[IcebergType]) -> Reader: + return GeographyReader() + + def visit_geometry(self, geometry_type: GeometryType, partner: Optional[IcebergType]) -> Reader: + return GeometryReader() + def visit_unknown(self, unknown_type: UnknownType, partner: Optional[IcebergType]) -> Reader: return UnknownReader() diff --git a/pyiceberg/avro/writer.py b/pyiceberg/avro/writer.py index 6fa485f21a..de280cafbf 100644 --- a/pyiceberg/avro/writer.py +++ b/pyiceberg/avro/writer.py @@ -161,6 +161,18 @@ def write(self, encoder: BinaryEncoder, val: Any) -> None: encoder.write_bytes(val) +@dataclass(frozen=True) +class GeographyWriter(Writer): + def write(self, encoder: BinaryEncoder, val: Any) -> None: + encoder.write_bytes(val) + + +@dataclass(frozen=True) +class GeometryWriter(Writer): + def write(self, encoder: BinaryEncoder, val: Any) -> None: + encoder.write_bytes(val) + + @dataclass(frozen=True) class DecimalWriter(Writer): precision: int = dataclassfield() diff --git a/pyiceberg/conversions.py b/pyiceberg/conversions.py index 7bf7b462e2..1f9c1aec98 100644 --- a/pyiceberg/conversions.py +++ b/pyiceberg/conversions.py @@ -51,6 +51,8 @@ DoubleType, FixedType, FloatType, + GeometryType, + GeographyType, IntegerType, LongType, PrimitiveType, @@ -167,6 +169,8 @@ def _(_: UUIDType, value_str: str) -> uuid.UUID: @partition_to_py.register(FixedType) @partition_to_py.register(BinaryType) +@partition_to_py.register(GeographyType) +@partition_to_py.register(GeometryType) @handle_none def _(_: PrimitiveType, value_str: str) -> bytes: return bytes(value_str, UTF8) @@ -275,6 +279,8 @@ def _(_: UUIDType, value: Union[uuid.UUID, bytes]) -> bytes: @to_bytes.register(BinaryType) @to_bytes.register(FixedType) +@to_bytes.register(GeographyType) +@to_bytes.register(GeometryType) def _(_: PrimitiveType, value: bytes) -> bytes: return value @@ -355,6 +361,8 @@ def _(_: StringType, b: bytes) -> str: @from_bytes.register(BinaryType) @from_bytes.register(FixedType) +@from_bytes.register(GeographyType) +@from_bytes.register(GeometryType) @from_bytes.register(UUIDType) def _(_: PrimitiveType, b: bytes) -> bytes: return b @@ -453,6 +461,8 @@ def _(t: FixedType, b: bytes) -> str: @to_json.register(BinaryType) +@to_json.register(GeographyType) +@to_json.register(GeometryType) def _(_: BinaryType, b: bytes) -> str: """Python bytes serializes into hexadecimal encoded string.""" return codecs.encode(b, "hex").decode(UTF8) @@ -580,6 +590,16 @@ def _(_: BinaryType, val: Union[bytes, str]) -> bytes: return val +@from_json.register(GeographyType) +def _(_: GeographyType, val: bytes) -> bytes: + return val + + +@from_json.register(GeometryType) +def _(_: GeometryType, val: bytes) -> bytes: + return val + + @from_json.register(DecimalType) def _(_: DecimalType, val: str) -> Decimal: """Convert JSON string into a Python Decimal.""" diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index 2797371028..53bd34329e 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -160,6 +160,8 @@ DoubleType, FixedType, FloatType, + GeographyType, + GeometryType, IcebergType, IntegerType, ListType, @@ -754,6 +756,12 @@ def visit_unknown(self, _: UnknownType) -> pa.DataType: def visit_binary(self, _: BinaryType) -> pa.DataType: return pa.large_binary() + def visit_geography(self, _: GeographyType) -> pa.DataType: + return pa.large_binary() + + def visit_geometry(self, _: GeometryType) -> pa.DataType: + return pa.large_binary() + def _convert_scalar(value: Any, iceberg_type: IcebergType) -> pa.scalar: if not isinstance(iceberg_type, PrimitiveType): @@ -1977,6 +1985,12 @@ def visit_uuid(self, uuid_type: UUIDType) -> str: def visit_binary(self, binary_type: BinaryType) -> str: return "BYTE_ARRAY" + def visit_geography(self, geography_type: GeographyType) -> str: + return "BYTE_ARRAY" + + def visit_geometry(self, geometry_type: GeometryType) -> str: + return "BYTE_ARRAY" + def visit_unknown(self, unknown_type: UnknownType) -> str: return "UNKNOWN" diff --git a/pyiceberg/schema.py b/pyiceberg/schema.py index 6333ace6e2..ada2a4f921 100644 --- a/pyiceberg/schema.py +++ b/pyiceberg/schema.py @@ -48,6 +48,8 @@ DoubleType, FixedType, FloatType, + GeographyType, + GeometryType, IcebergType, IntegerType, ListType, @@ -553,6 +555,10 @@ def primitive(self, primitive: PrimitiveType, primitive_partner: Optional[P]) -> return self.visit_fixed(primitive, primitive_partner) elif isinstance(primitive, BinaryType): return self.visit_binary(primitive, primitive_partner) + elif isinstance(primitive, GeographyType): + return self.visit_geography(primitive, primitive_partner) + elif isinstance(primitive, GeometryType): + return self.visit_geometry(primitive, primitive_partner) elif isinstance(primitive, UnknownType): return self.visit_unknown(primitive, primitive_partner) else: @@ -622,6 +628,14 @@ def visit_fixed(self, fixed_type: FixedType, partner: Optional[P]) -> T: def visit_binary(self, binary_type: BinaryType, partner: Optional[P]) -> T: """Visit a BinaryType.""" + @abstractmethod + def visit_geography(self, date_type: GeographyType, partner: Optional[P]) -> T: + """Visit a GeographyType.""" + + @abstractmethod + def visit_geometry(self, date_type: GeometryType, partner: Optional[P]) -> T: + """Visit a GeometryType.""" + @abstractmethod def visit_unknown(self, unknown_type: UnknownType, partner: Optional[P]) -> T: """Visit a UnknownType.""" @@ -747,6 +761,10 @@ def primitive(self, primitive: PrimitiveType) -> T: return self.visit_uuid(primitive) elif isinstance(primitive, BinaryType): return self.visit_binary(primitive) + elif isinstance(primitive, GeographyType): + return self.visit_geography(primitive) + elif isinstance(primitive, GeometryType): + return self.visit_geometry(primitive) elif isinstance(primitive, UnknownType): return self.visit_unknown(primitive) else: @@ -816,6 +834,14 @@ def visit_uuid(self, uuid_type: UUIDType) -> T: def visit_binary(self, binary_type: BinaryType) -> T: """Visit a BinaryType.""" + @abstractmethod + def visit_geography(self, geography_type: GeographyType) -> T: + """Visit a GeographyType.""" + + @abstractmethod + def visit_geometry(self, geometry_type: GeometryType) -> T: + """Visit a GeometryType.""" + @abstractmethod def visit_unknown(self, unknown_type: UnknownType) -> T: """Visit a UnknownType.""" @@ -1645,9 +1671,13 @@ def _(file_type: StringType, read_type: IcebergType) -> IcebergType: @promote.register(BinaryType) +@promote.register(GeographyType) +@promote.register(GeometryType) def _(file_type: BinaryType, read_type: IcebergType) -> IcebergType: if isinstance(read_type, StringType): return read_type + elif isinstance(read_type, (GeographyType, GeometryType)): + return read_type else: raise ResolveError(f"Cannot promote an binary to {read_type}") diff --git a/pyiceberg/transforms.py b/pyiceberg/transforms.py index 3f5a8d8998..ded75b827a 100644 --- a/pyiceberg/transforms.py +++ b/pyiceberg/transforms.py @@ -72,6 +72,8 @@ DateType, DecimalType, FixedType, + GeographyType, + GeometryType, IcebergType, IntegerType, LongType, @@ -372,7 +374,7 @@ def hash_func(v: Any) -> int: def hash_func(v: Any) -> int: return mmh3.hash(decimal_to_bytes(v)) - elif isinstance(source, (StringType, FixedType, BinaryType)): + elif isinstance(source, (StringType, FixedType, BinaryType, GeographyType, GeometryType)): def hash_func(v: Any) -> int: return mmh3.hash(v) diff --git a/pyiceberg/types.py b/pyiceberg/types.py index 5822597952..9a7d10c243 100644 --- a/pyiceberg/types.py +++ b/pyiceberg/types.py @@ -33,6 +33,7 @@ from __future__ import annotations import re +from enum import Enum from functools import cached_property from typing import ( Annotated, @@ -61,6 +62,8 @@ from pyiceberg.utils.singleton import Singleton DECIMAL_REGEX = re.compile(r"decimal\((\d+),\s*(\d+)\)") +GEOGRAPHY_REGEX = re.compile(r"""geography\s*(?:\(\s*([^,]*?)\s*(?:,\s*(\w*)\s*)?\))?""", re.IGNORECASE) +GEOMETRY_REGEX = re.compile(r"""geometry\s*(?:\(\s*([^)]*?)\s*\))?""", re.IGNORECASE) FIXED = "fixed" FIXED_PARSER = ParseNumberFromBrackets(FIXED) @@ -81,6 +84,7 @@ def _parse_decimal_type(decimal: Any) -> Tuple[int, int]: else: raise ValidationError(f"Could not parse {decimal} into a DecimalType") elif isinstance(decimal, dict): + _raise_if_any_missing_dictionary_key(decimal, "DecimalType", "precision", "scale") return decimal["precision"], decimal["scale"] else: return decimal @@ -90,11 +94,59 @@ def _parse_fixed_type(fixed: Any) -> int: if isinstance(fixed, str): return FIXED_PARSER.match(fixed) elif isinstance(fixed, dict): + _raise_if_any_missing_dictionary_key(fixed, "FixedType", "length") return fixed["length"] else: return fixed +def _parse_geography_type(geography: Any) -> Tuple[str, GeographyType.EdgeAlgorithm]: + if isinstance(geography, str): + matches = GEOGRAPHY_REGEX.search(geography) + if matches: + crs = None + edge_algorithm = None + if matches.group(1): + crs = matches.group(1) + if matches.group(2): + edge_algorithm = GeographyType.EdgeAlgorithm(matches.group(2).lower()) + return crs, edge_algorithm + else: + raise ValidationError(f"Could not parse {geography} into a GeographyType") + elif isinstance(geography, dict): + _raise_if_any_missing_dictionary_key(geography, "GeographyType", "crs", "edge_algorithm") + return geography["crs"], geography["edge_algorithm"] + else: + return geography + + +def _parse_geometry_type(geometry: Any) -> str: + if isinstance(geometry, str): + matches = GEOMETRY_REGEX.search(geometry) + if matches: + crs = None + if matches.group(1): + crs = matches.group(1) + return crs + else: + raise ValidationError(f"Could not parse {geometry} into a GeometryType") + elif isinstance(geometry, dict): + _raise_if_any_missing_dictionary_key(geometry, "GeometryType", "crs") + return geometry["crs"] + else: + return geometry + + +def _raise_if_any_missing_dictionary_key(d: Dict, expected_type: str, *keys: str): + missing_keys = [] + for key in keys: + if key not in d: + missing_keys.append(key) + if len(missing_keys) == 0: + return + raise ValidationError(f"Missing required key(s): {', '.join(missing_keys)} for {expected_type}") + + def strtobool(val: str) -> bool: """Convert a string representation of truth to true (1) or false (0). @@ -162,6 +214,12 @@ def handle_primitive_type(cls, v: Any, handler: ValidatorFunctionWrapHandler) -> if v.startswith("decimal"): precision, scale = _parse_decimal_type(v) return DecimalType(precision, scale) + if v.startswith("geography"): + crs, edge_algorithm = _parse_geography_type(v) + return GeographyType(crs, edge_algorithm) + if v.startswith("geometry"): + crs = _parse_geometry_type(v) + return GeometryType(crs) else: raise ValueError(f"Type not recognized: {v}") if isinstance(v, dict) and cls == IcebergType: @@ -850,6 +908,134 @@ class BinaryType(PrimitiveType): root: Literal["binary"] = Field(default="binary") +class GeographyType(PrimitiveType): + """A geography data type in Iceberg. + + Example: + >>> GeographyType() + GeographyType(crs="OGC:CRS84") + """ + + class EdgeAlgorithm(Enum): + SPHERICAL = 'spherical' + VINCENTY = 'vincenty' + THOMAS = 'thomas' + ANDOYER = 'andoyer' + KARNEY = 'karney' + + root: Tuple[Optional[str], Optional[EdgeAlgorithm]] + + default_crs: ClassVar[str] = 'OGC:CRS84' + + def __init__(self, crs: Optional[str]=None, edge_algorithm: Optional[EdgeAlgorithm]=None) -> None: + super().__init__(root=( + None if crs is None or crs == GeographyType.default_crs else crs, + edge_algorithm + )) + + @model_serializer + def ser_model(self) -> str: + """Serialize the model to a string.""" + if self.edge_algorithm is None: + if self.crs is None: + return "geography" + else: + return f"geography({self.crs})" + else: + return f"geography({self.crs or GeographyType.default_crs}, {self.edge_algorithm.value})" + + @property + def crs(self) -> str: + """Return the crs of the geography.""" + return self.root[0] + + @property + def edge_algorithm(self) -> EdgeAlgorithm: + """Return the algorithm of the geography.""" + return self.root[1] + + def __repr__(self) -> str: + """Return the string representation of the GeographyType class.""" + return f"GeographyType(crs={self.crs or GeographyType.default_crs}, edge_algorithm={self.edge_algorithm or GeographyType.EdgeAlgorithm.SPHERICAL.value})" + + def __str__(self) -> str: + """Return the string representation.""" + if self.edge_algorithm is None: + if self.crs is None: + return f"geography" + else: + return f"geometry({self.crs})" + return f"geography({self.crs or GeographyType.default_crs}, {self.edge_algorithm.value})" + + def __hash__(self) -> int: + """Return the hash of the crs.""" + return hash(self.root) + + def __getnewargs__(self) -> Tuple[str, EdgeAlgorithm]: + """Pickle the GeographyType class.""" + return self.crs, self.edge_algorithm + + def __eq__(self, other: Any) -> bool: + """Compare to root to another object.""" + return self.root == other.root if isinstance(other, GeographyType) else False + + def minimum_format_version(self) -> TableVersion: + return 3 + + +class GeometryType(PrimitiveType): + """A geometry data type in Iceberg. + + Example: + >>> GeometryType() + GeometryType(crs="OGC:CRS84") + """ + + root: Tuple[Optional[str]] + + default_crs: ClassVar[str] = 'OGC:CRS84' + + def __init__(self, crs: Optional[str]=None) -> None: + super().__init__(root=(None if crs is None or crs == GeometryType.default_crs else crs, )) + + @model_serializer + def ser_model(self) -> str: + """Serialize the model to a string.""" + if self.crs is None: + return "geometry" + return f"geometry({self.crs})" + + @property + def crs(self) -> str: + """Return the crs of the geometry.""" + return self.root[0] + + def __repr__(self) -> str: + """Return the string representation of the GeometryType class.""" + return f"GeometryType(crs={self.crs or GeometryType.default_crs})" + + def __str__(self) -> str: + """Return the string representation.""" + if self.crs is None: + return "geometry" + return f"geometry({self.crs})" + + def __hash__(self) -> int: + """Return the hash of the crs.""" + return hash(self.root) + + def __getnewargs__(self) -> str: + """Pickle the GeometryType class.""" + return (self.crs, ) + + def __eq__(self, other: Any) -> bool: + """Compare to root to another object.""" + return self.root == other.root if isinstance(other, GeometryType) else False + + def minimum_format_version(self) -> TableVersion: + return 3 + + class UnknownType(PrimitiveType): """An unknown data type in Iceberg can be represented using an instance of this class. diff --git a/pyiceberg/utils/schema_conversion.py b/pyiceberg/utils/schema_conversion.py index 232b8f0094..65888b77fe 100644 --- a/pyiceberg/utils/schema_conversion.py +++ b/pyiceberg/utils/schema_conversion.py @@ -35,6 +35,8 @@ DoubleType, FixedType, FloatType, + GeographyType, + GeometryType, IcebergType, IntegerType, ListType, @@ -370,6 +372,10 @@ def _convert_logical_type(self, avro_logical_type: Dict[str, Any]) -> IcebergTyp physical_type = avro_logical_type["type"] if logical_type == "decimal": return self._convert_logical_decimal_type(avro_logical_type) + elif logical_type == "geography": + return self._convert_logical_geography_type(avro_logical_type) + elif logical_type == "geometry": + return self._convert_logical_geometry_type(avro_logical_type) elif logical_type == "map": return self._convert_logical_map_type(avro_logical_type) elif logical_type == "timestamp-micros": @@ -409,6 +415,58 @@ def _convert_logical_decimal_type(self, avro_type: Dict[str, Any]) -> DecimalTyp """ return DecimalType(precision=avro_type["precision"], scale=avro_type["scale"]) + def _convert_logical_geography_type(self, avro_type: Dict[str, Any]) -> GeographyType: + """Convert an avro type to an Iceberg GeographyType. + + Args: + avro_type: The Avro type. + + Examples: + >>> from pyiceberg.utils.schema_conversion import AvroSchemaConversion + >>> avro_geography_type = { + ... "type": "bytes", + ... "logicalType": "geography", + ... "crs": 'OGC:CRS84', + ... "edge_algorithm": 'spherical' + ... } + >>> actual = AvroSchemaConversion()._convert_logical_decimal_type(geography) + >>> expected = GeographyType( + ... crs='OGC:CRS84', + ... edge_algorithm=EdgeAlgorithm.SPHERICAL + ... ) + >>> actual == expected + True + + Returns: + A Iceberg GeographyType. + """ + return GeographyType(crs=avro_type["crs"], edge_algorithm=avro_type["edge_algorithm"]) + + def _convert_logical_geometry_type(self, avro_type: Dict[str, Any]) -> GeometryType: + """Convert an avro type to an Iceberg GeometryType. + + Args: + avro_type: The Avro type. + + Examples: + >>> from pyiceberg.utils.schema_conversion import AvroSchemaConversion + >>> avro_geometry_type = { + ... "type": "bytes", + ... "logicalType": "geometry", + ... "crs": "OGC:CRS84 + ... } + >>> actual = AvroSchemaConversion()._convert_logical_decimal_type(avro_geometry_type) + >>> expected = GeometryType( + ... crs="OGC:CRS84" + ... ) + >>> actual == expected + True + + Returns: + A Iceberg GeometryType. + """ + return GeometryType(crs=avro_type["crs"]) + def _convert_logical_map_type(self, avro_type: Dict[str, Any]) -> MapType: """Convert an avro map type to an Iceberg MapType. @@ -626,5 +684,20 @@ def visit_uuid(self, uuid_type: UUIDType) -> AvroType: def visit_binary(self, binary_type: BinaryType) -> AvroType: return "bytes" + def visit_geography(self, geography_type: GeographyType) -> AvroType: + return { + "type": "bytes", + "logicalType": "geography", + "crs": geography_type.crs, + "edge_algorithm": geography_type.edge_algorithm, + } + + def visit_geometry(self, geometry_type: GeometryType) -> AvroType: + return { + "type": "bytes", + "logicalType": "geometry", + "crs": geometry_type.crs, + } + def visit_unknown(self, unknown_type: UnknownType) -> AvroType: return "null" diff --git a/tests/avro/test_reader.py b/tests/avro/test_reader.py index 82473d11d1..f0d1f5b57f 100644 --- a/tests/avro/test_reader.py +++ b/tests/avro/test_reader.py @@ -31,6 +31,8 @@ DoubleReader, FixedReader, FloatReader, + GeographyReader, + GeometryReader, IntegerReader, StringReader, StructReader, @@ -55,6 +57,8 @@ DoubleType, FixedType, FloatType, + GeographyType, + GeometryType, IntegerType, LongType, NestedField, @@ -337,6 +341,14 @@ def test_binary_reader() -> None: assert construct_reader(BinaryType()) == BinaryReader() +def test_geography_reader() -> None: + assert construct_reader(GeographyType()) == GeographyReader() + + +def test_geometry_reader() -> None: + assert construct_reader(GeometryType()) == GeometryReader() + + def test_unknown_type() -> None: assert construct_reader(UnknownType()) == UnknownReader() diff --git a/tests/avro/test_writer.py b/tests/avro/test_writer.py index c655156c2a..3d48875632 100644 --- a/tests/avro/test_writer.py +++ b/tests/avro/test_writer.py @@ -30,6 +30,8 @@ DoubleWriter, FixedWriter, FloatWriter, + GeographyWriter, + GeometryWriter, IntegerWriter, StringWriter, TimestampNanoWriter, @@ -49,6 +51,8 @@ DoubleType, FixedType, FloatType, + GeographyType, + GeometryType, IntegerType, ListType, LongType, @@ -136,6 +140,14 @@ def test_binary_writer() -> None: assert construct_writer(BinaryType()) == BinaryWriter() +def test_geography_writer() -> None: + assert construct_writer(GeographyType()) == GeographyWriter() + + +def test_geometry_writer() -> None: + assert construct_writer(GeometryType()) == GeometryWriter() + + def test_unknown_type() -> None: assert construct_writer(UnknownType()) == UnknownWriter() diff --git a/tests/conftest.py b/tests/conftest.py index 584b6c633a..0e56304f08 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -79,6 +79,8 @@ DoubleType, FixedType, FloatType, + GeographyType, + GeometryType, IntegerType, ListType, LongType, @@ -349,6 +351,8 @@ def table_schema_with_all_types() -> Schema: NestedField(field_id=22, name="inner_int", field_type=IntegerType(), required=True), ), ), + NestedField(field_id=23, name="geography", field_type=GeographyType(), required=True), + NestedField(field_id=24, name="geometry", field_type=GeometryType(), required=True), schema_id=1, identifier_field_ids=[2], ) diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py index 4f121ba3bc..1956a6ef61 100644 --- a/tests/io/test_pyarrow.py +++ b/tests/io/test_pyarrow.py @@ -94,6 +94,8 @@ DoubleType, FixedType, FloatType, + GeographyType, + GeometryType, IntegerType, ListType, LongType, @@ -562,6 +564,16 @@ def test_binary_type_to_pyarrow() -> None: assert visit(iceberg_type, _ConvertToArrowSchema()) == pa.large_binary() +def test_geography_type_to_pyarrow() -> None: + iceberg_type = GeographyType() + assert visit(iceberg_type, _ConvertToArrowSchema()) == pa.large_binary() + + +def test_geometry_type_to_pyarrow() -> None: + iceberg_type = GeometryType() + assert visit(iceberg_type, _ConvertToArrowSchema()) == pa.large_binary() + + def test_struct_type_to_pyarrow(table_schema_simple: Schema) -> None: expected = pa.struct( [ diff --git a/tests/table/test_metadata.py b/tests/table/test_metadata.py index 9141189ec5..3f87e6d3bd 100644 --- a/tests/table/test_metadata.py +++ b/tests/table/test_metadata.py @@ -43,6 +43,8 @@ from pyiceberg.types import ( BooleanType, FloatType, + GeographyType, + GeometryType, IntegerType, ListType, LongType, @@ -778,6 +780,8 @@ def test_new_table_metadata_with_v3_schema() -> None: NestedField(field_id=33, name="baz", field_type=BooleanType(), required=False), NestedField(field_id=34, name="qux", field_type=TimestampNanoType(), required=False), NestedField(field_id=35, name="quux", field_type=TimestamptzNanoType(), required=False), + NestedField(field_id=46, name="corge", field_type=GeographyType(), required=False), + NestedField(field_id=47, name="grault", field_type=GeometryType(), required=False), schema_id=10, identifier_field_ids=[22], ) @@ -805,6 +809,8 @@ def test_new_table_metadata_with_v3_schema() -> None: NestedField(field_id=3, name="baz", field_type=BooleanType(), required=False), NestedField(field_id=4, name="qux", field_type=TimestampNanoType(), required=False), NestedField(field_id=5, name="quux", field_type=TimestamptzNanoType(), required=False), + NestedField(field_id=6, name="corge", field_type=GeographyType(), required=False), + NestedField(field_id=7, name="grault", field_type=GeometryType(), required=False), schema_id=0, identifier_field_ids=[2], ) @@ -820,7 +826,7 @@ def test_new_table_metadata_with_v3_schema() -> None: location="s3://some_v1_location/", table_uuid=actual.table_uuid, last_updated_ms=actual.last_updated_ms, - last_column_id=5, + last_column_id=7, schemas=[expected_schema], schema_=expected_schema, current_schema_id=0, diff --git a/tests/test_transforms.py b/tests/test_transforms.py index 7a7d4a6d8e..11cb56b617 100644 --- a/tests/test_transforms.py +++ b/tests/test_transforms.py @@ -100,6 +100,8 @@ DoubleType, FixedType, FloatType, + GeographyType, + GeometryType, IntegerType, LongType, NestedField, @@ -168,6 +170,16 @@ TimestamptzNanoType(), -1207196810, ), + ( + b"\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?", # POINT (1 1) in WKB + GeographyType(), + -1769831280 + ), + ( + b"\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?", # POINT (1 1) in WKB + GeometryType(), + -1769831280 + ) ], ) def test_bucket_hash_values(test_input: Any, test_type: PrimitiveType, expected: Any) -> None: diff --git a/tests/test_types.py b/tests/test_types.py index 2527f0e822..8db995ac6f 100644 --- a/tests/test_types.py +++ b/tests/test_types.py @@ -30,6 +30,8 @@ DoubleType, FixedType, FloatType, + GeographyType, + GeometryType, IcebergType, IntegerType, ListType, @@ -499,6 +501,41 @@ def test_repr_binary() -> None: assert repr(BinaryType()) == "BinaryType()" +def test_serialization_geography() -> None: + assert GeographyType().model_dump_json() == '"geography"' + + +def test_deserialization_geography() -> None: + assert GeographyType.model_validate_json('"geography"') == GeographyType() + assert GeographyType.model_validate_json('"geography(OGC:CRS84, thomas)"') == GeographyType(crs="OGC:CRS84", edge_algorithm=GeographyType.EdgeAlgorithm.THOMAS) + assert GeographyType.model_validate_json('"geography(OGC:CRS84)"') == GeographyType(crs="OGC:CRS84", edge_algorithm=None) + + +def test_str_geography() -> None: + assert str(GeographyType()) == "geography" + + +def test_repr_geography() -> None: + assert repr(GeographyType()) == "GeographyType(crs=OGC:CRS84, edge_algorithm=spherical)" + + +def test_serialization_geometry() -> None: + assert GeometryType().model_dump_json() == '"geometry"' + + +def test_deserialization_geometry() -> None: + assert GeometryType.model_validate_json('"geometry"') == GeometryType() + assert GeometryType.model_validate_json('"geometry(OGC:CRS84)"') == GeometryType(crs="OGC:CRS84") + + +def test_str_geometry() -> None: + assert str(GeometryType()) == "geometry" + + +def test_repr_geometry() -> None: + assert repr(GeometryType()) == "GeometryType(crs=OGC:CRS84)" + + def test_serialization_decimal() -> None: assert DecimalType(19, 25).model_dump_json() == '"decimal(19, 25)"' diff --git a/tests/utils/test_schema_conversion.py b/tests/utils/test_schema_conversion.py index eb44dcdff3..546975f684 100644 --- a/tests/utils/test_schema_conversion.py +++ b/tests/utils/test_schema_conversion.py @@ -26,6 +26,8 @@ DateType, DecimalType, FixedType, + GeographyType, + GeometryType, IntegerType, ListType, LongType, @@ -347,6 +349,18 @@ def test_convert_timestamp_micros_type() -> None: assert actual == TimestampType() +def test_convert_geography_type() -> None: + avro_logical_type = {"type": "bytes", "logicalType": "geography", "crs": "OGC:CRS84", "edge_algorithm": None} + actual = AvroSchemaConversion()._convert_logical_type(avro_logical_type) + assert actual == GeographyType() + + +def test_convert_geometry_type() -> None: + avro_logical_type = {"type": "string", "logicalType": "geometry", "crs": "OGC:CRS84"} + actual = AvroSchemaConversion()._convert_logical_type(avro_logical_type) + assert actual == GeometryType() + + def test_unknown_logical_type() -> None: """Test raising a ValueError when converting an unknown logical type as part of an Avro schema conversion""" avro_logical_type = {"type": "bytes", "logicalType": "date"}