diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/Rendering.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/Rendering.kt index 222a6da249..3a755ef104 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/Rendering.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/Rendering.kt @@ -16,6 +16,7 @@ import java.net.URL import java.time.LocalDateTime import java.time.LocalTime import kotlin.reflect.KType +import kotlin.reflect.KVariance import kotlin.reflect.full.isSubtypeOf import kotlin.reflect.jvm.jvmErasure import kotlin.reflect.typeOf @@ -89,7 +90,12 @@ internal fun renderType(type: KType?): String { append(name) if (type.arguments.isNotEmpty()) { val arguments = type.arguments.joinToString { - renderType(it.type) + when (it.variance) { + null -> "*" + KVariance.INVARIANT -> renderType(it.type) + KVariance.IN -> "in ${renderType(it.type)}" + KVariance.OUT -> "out ${renderType(it.type)}" + } } append("<$arguments>") } diff --git a/dataframe-jdbc/api/dataframe-jdbc.api b/dataframe-jdbc/api/dataframe-jdbc.api index ff54fb1b24..0017068d90 100644 --- a/dataframe-jdbc/api/dataframe-jdbc.api +++ b/dataframe-jdbc/api/dataframe-jdbc.api @@ -128,6 +128,15 @@ public abstract class org/jetbrains/kotlinx/dataframe/io/db/DbType { public static synthetic fun sqlQueryLimit$default (Lorg/jetbrains/kotlinx/dataframe/io/db/DbType;Ljava/lang/String;IILjava/lang/Object;)Ljava/lang/String; } +public final class org/jetbrains/kotlinx/dataframe/io/db/DuckDb : org/jetbrains/kotlinx/dataframe/io/db/DbType { + public static final field INSTANCE Lorg/jetbrains/kotlinx/dataframe/io/db/DuckDb; + public fun buildTableMetadata (Ljava/sql/ResultSet;)Lorg/jetbrains/kotlinx/dataframe/io/TableMetadata; + public fun convertSqlTypeToColumnSchemaValue (Lorg/jetbrains/kotlinx/dataframe/io/TableColumnMetadata;)Lorg/jetbrains/kotlinx/dataframe/schema/ColumnSchema; + public fun convertSqlTypeToKType (Lorg/jetbrains/kotlinx/dataframe/io/TableColumnMetadata;)Lkotlin/reflect/KType; + public fun getDriverClassName ()Ljava/lang/String; + public fun isSystemTable (Lorg/jetbrains/kotlinx/dataframe/io/TableMetadata;)Z +} + public class org/jetbrains/kotlinx/dataframe/io/db/H2 : org/jetbrains/kotlinx/dataframe/io/db/DbType { public static final field Companion Lorg/jetbrains/kotlinx/dataframe/io/db/H2$Companion; public static final field MODE_MARIADB Ljava/lang/String; diff --git a/dataframe-jdbc/build.gradle.kts b/dataframe-jdbc/build.gradle.kts index f616d61891..591bb331ed 100644 --- a/dataframe-jdbc/build.gradle.kts +++ b/dataframe-jdbc/build.gradle.kts @@ -16,6 +16,7 @@ repositories { dependencies { api(projects.core) + compileOnly(libs.duckdb.jdbc) implementation(libs.kotlinLogging) testImplementation(libs.mariadb) testImplementation(libs.sqlite) @@ -26,6 +27,8 @@ dependencies { testImplementation(libs.junit) testImplementation(libs.sl4jsimple) testImplementation(libs.jts) + testImplementation(libs.duckdb.jdbc) + testImplementation(projects.dataframeJson) testImplementation(libs.kotestAssertions) { exclude("org.jetbrains.kotlin", "kotlin-stdlib-jdk8") } diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt new file mode 100644 index 0000000000..3bf53c2576 --- /dev/null +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt @@ -0,0 +1,219 @@ +package org.jetbrains.kotlinx.dataframe.io.db + +import 
org.duckdb.DuckDBColumnType
+import org.duckdb.DuckDBColumnType.ARRAY
+import org.duckdb.DuckDBColumnType.BIGINT
+import org.duckdb.DuckDBColumnType.BIT
+import org.duckdb.DuckDBColumnType.BLOB
+import org.duckdb.DuckDBColumnType.BOOLEAN
+import org.duckdb.DuckDBColumnType.DATE
+import org.duckdb.DuckDBColumnType.DECIMAL
+import org.duckdb.DuckDBColumnType.DOUBLE
+import org.duckdb.DuckDBColumnType.ENUM
+import org.duckdb.DuckDBColumnType.FLOAT
+import org.duckdb.DuckDBColumnType.HUGEINT
+import org.duckdb.DuckDBColumnType.INTEGER
+import org.duckdb.DuckDBColumnType.INTERVAL
+import org.duckdb.DuckDBColumnType.JSON
+import org.duckdb.DuckDBColumnType.LIST
+import org.duckdb.DuckDBColumnType.MAP
+import org.duckdb.DuckDBColumnType.SMALLINT
+import org.duckdb.DuckDBColumnType.STRUCT
+import org.duckdb.DuckDBColumnType.TIME
+import org.duckdb.DuckDBColumnType.TIMESTAMP
+import org.duckdb.DuckDBColumnType.TIMESTAMP_MS
+import org.duckdb.DuckDBColumnType.TIMESTAMP_NS
+import org.duckdb.DuckDBColumnType.TIMESTAMP_S
+import org.duckdb.DuckDBColumnType.TIMESTAMP_WITH_TIME_ZONE
+import org.duckdb.DuckDBColumnType.TIME_WITH_TIME_ZONE
+import org.duckdb.DuckDBColumnType.TINYINT
+import org.duckdb.DuckDBColumnType.UBIGINT
+import org.duckdb.DuckDBColumnType.UHUGEINT
+import org.duckdb.DuckDBColumnType.UINTEGER
+import org.duckdb.DuckDBColumnType.UNION
+import org.duckdb.DuckDBColumnType.UNKNOWN
+import org.duckdb.DuckDBColumnType.USMALLINT
+import org.duckdb.DuckDBColumnType.UTINYINT
+import org.duckdb.DuckDBColumnType.UUID
+import org.duckdb.DuckDBColumnType.VARCHAR
+import org.duckdb.DuckDBResultSetMetaData
+import org.duckdb.JsonNode
+import org.jetbrains.kotlinx.dataframe.DataFrame
+import org.jetbrains.kotlinx.dataframe.io.TableColumnMetadata
+import org.jetbrains.kotlinx.dataframe.io.TableMetadata
+import org.jetbrains.kotlinx.dataframe.io.db.DuckDb.convertSqlTypeToKType
+import org.jetbrains.kotlinx.dataframe.io.getSchemaForAllSqlTables
+import org.jetbrains.kotlinx.dataframe.io.readAllSqlTables
+import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema
+import java.math.BigDecimal
+import java.math.BigInteger
+import java.sql.Array
+import java.sql.Blob
+import java.sql.DatabaseMetaData
+import java.sql.ResultSet
+import java.sql.Struct
+import java.sql.Timestamp
+import java.time.LocalDate
+import java.time.LocalTime
+import java.time.OffsetDateTime
+import java.time.OffsetTime
+import java.util.UUID
+import kotlin.reflect.KType
+import kotlin.reflect.KTypeProjection
+import kotlin.reflect.full.createType
+import kotlin.reflect.full.withNullability
+import kotlin.reflect.typeOf
+
+/**
+ * Represents the [DuckDB](https://duckdb.org/) database type.
+ *
+ * This class provides methods to convert data from a [ResultSet] to the appropriate type for DuckDB,
+ * and to generate the corresponding [column schema][ColumnSchema].
+ */
+public object DuckDb : DbType("duckdb") {
+
+    /** the name of the class of the DuckDB JDBC driver */
+    override val driverClassName: String = "org.duckdb.DuckDBDriver"
+
+    /**
+     * How a column type from JDBC, [tableColumnMetadata], is read in Java/Kotlin.
+     * The returned type must exactly follow [ResultSet.getObject] of your specific database's JDBC driver.
+     * Returning `null` defers the implementation to the default one (which may not always be correct).
+     *
+     * Following [org.duckdb.DuckDBVector.getObject].
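+     * For example, DuckDB's `HUGEINT` is read as a [BigInteger], and `TIME WITH TIME ZONE` as an [OffsetTime].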
+     */
+    override fun convertSqlTypeToKType(tableColumnMetadata: TableColumnMetadata): KType =
+        tableColumnMetadata.sqlTypeName.toKType(tableColumnMetadata.isNullable)
+
+    /**
+     * How a column from JDBC should be represented as a DataFrame (value) column.
+     * See [convertSqlTypeToKType].
+     */
+    override fun convertSqlTypeToColumnSchemaValue(tableColumnMetadata: TableColumnMetadata): ColumnSchema {
+        val type = convertSqlTypeToKType(tableColumnMetadata)
+        return ColumnSchema.Value(type)
+    }
+
+    /**
+     * Follows exactly [org.duckdb.DuckDBVector.getObject].
+     *
+     * "// dataframe-jdbc" is added for all types that are already covered correctly by
+     * [org.jetbrains.kotlinx.dataframe.io.makeCommonSqlToKTypeMapping] at the moment; however, to cover
+     * all nested types as well, we use a full type map over all [DuckDB types][DuckDBColumnType] exactly.
+     */
+    @Suppress("ktlint:standard:blank-line-between-when-conditions")
+    internal fun String.toKType(isNullable: Boolean): KType {
+        val sqlTypeName = this
+        return when (DuckDBResultSetMetaData.TypeNameToType(sqlTypeName)) {
+            BOOLEAN -> typeOf<Boolean>() // dataframe-jdbc
+            TINYINT -> typeOf<Byte>()
+            SMALLINT -> typeOf<Short>()
+            INTEGER -> typeOf<Int>() // dataframe-jdbc
+            BIGINT -> typeOf<Long>() // dataframe-jdbc
+            HUGEINT -> typeOf<BigInteger>()
+            UHUGEINT -> typeOf<BigInteger>()
+            UTINYINT -> typeOf<Short>()
+            USMALLINT -> typeOf<Int>()
+            UINTEGER -> typeOf<Long>()
+            UBIGINT -> typeOf<BigInteger>()
+            FLOAT -> typeOf<Float>() // dataframe-jdbc
+            DOUBLE -> typeOf<Double>() // dataframe-jdbc
+            DECIMAL -> typeOf<BigDecimal>() // dataframe-jdbc
+            TIME -> typeOf<LocalTime>()
+            TIME_WITH_TIME_ZONE -> typeOf<OffsetTime>() // dataframe-jdbc
+            DATE -> typeOf<LocalDate>()
+            TIMESTAMP, TIMESTAMP_MS, TIMESTAMP_NS, TIMESTAMP_S -> typeOf<Timestamp>() // dataframe-jdbc
+            TIMESTAMP_WITH_TIME_ZONE -> typeOf<OffsetDateTime>() // dataframe-jdbc
+            JSON -> typeOf<JsonNode>()
+            BLOB -> typeOf<Blob>()
+            UUID -> typeOf<java.util.UUID>()
+            MAP -> {
+                val (key, value) = parseMapTypes(sqlTypeName)
+                Map::class.createType(
+                    listOf(
+                        KTypeProjection.invariant(key.toKType(false)),
+                        KTypeProjection.invariant(value.toKType(true)),
+                    ),
+                )
+            }
+
+            LIST, ARRAY -> {
+                // TODO requires #1266 and #1273 for specific types
+                // val listType = parseListType(sqlTypeName)
+                // Array::class.createType(
+                //     listOf(KTypeProjection.invariant(listType.toKType(true))),
+                // )
+                typeOf<java.sql.Array>()
+            }
+
+            STRUCT -> typeOf<Struct>() // TODO requires #1266 for specific types
+            UNION -> typeOf<Any>() // Cannot handle this in Kotlin
+            VARCHAR -> typeOf<String>()
+            UNKNOWN, BIT, INTERVAL, ENUM -> typeOf<String>()
+        }.withNullability(isNullable)
+    }
+
+    /** Parses "MAP(X, Y)" into "X" and "Y", taking parentheses into account */
+    internal fun parseMapTypes(typeString: String): Pair<String, String> {
+        if (!typeString.startsWith("MAP(") || !typeString.endsWith(")")) {
+            error("invalid MAP type: $typeString")
+        }
+
+        val content = typeString.removeSurrounding("MAP(", ")")
+
+        // Find the comma that separates key and value types
+        var parenCount = 0
+        var commaIndex = -1
+        for (i in content.indices) {
+            when (content[i]) {
+                '(' -> parenCount++
+
+                ')' -> parenCount--
+
+                ',' -> if (parenCount == 0) {
+                    commaIndex = i
+                    break
+                }
+            }
+        }
+
+        if (commaIndex == -1) error("invalid MAP type: $typeString")
+        val keyType = content.take(commaIndex).trim()
+        val valueType = content.substring(commaIndex + 1).trim()
+        return Pair(keyType, valueType)
+    }
+
+    /** Parses "X[]" and "X[123]" into "X", and "X[][]" into "X[]" */
+    internal fun parseListType(typeString: String): String {
+        if (!typeString.endsWith("]")) {
+            error("invalid LIST/ARRAY type: $typeString")
+        }
+
+        return typeString.take(typeString.indexOfLast { it == '[' })
+    }
+
+    /**
+     * How to filter out system tables from
user-created ones when using
+     * [DataFrame.readAllSqlTables][DataFrame.Companion.readAllSqlTables] and
+     * [DataFrame.getSchemaForAllSqlTables][DataFrame.Companion.getSchemaForAllSqlTables].
+     *
+     * The names of these can sometimes be found in the specific JDBC integration.
+     */
+    override fun isSystemTable(tableMetadata: TableMetadata): Boolean =
+        tableMetadata.schemaName?.lowercase()?.contains("information_schema") == true ||
+            tableMetadata.schemaName?.lowercase()?.contains("system") == true ||
+            tableMetadata.name.lowercase().contains("system_")
+
+    /**
+     * How to retrieve the correct table metadata when using
+     * [DataFrame.readAllSqlTables][DataFrame.Companion.readAllSqlTables] and
+     * [DataFrame.getSchemaForAllSqlTables][DataFrame.Companion.getSchemaForAllSqlTables].
+     * The names of these can be found in the [DatabaseMetaData] implementation of the DuckDB JDBC integration.
+     */
+    override fun buildTableMetadata(tables: ResultSet): TableMetadata =
+        TableMetadata(
+            tables.getString("TABLE_NAME"),
+            tables.getString("TABLE_SCHEM"),
+            tables.getString("TABLE_CAT"),
+        )
+}
diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/util.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/util.kt
index 45487b5586..1d6752cf90 100644
--- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/util.kt
+++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/util.kt
@@ -84,9 +84,11 @@ public fun extractDBTypeFromUrl(url: String?): DbType {
             MsSql.dbTypeInJdbcUrl in url -> MsSql
 
+            DuckDb.dbTypeInJdbcUrl in url -> DuckDb
+
             else -> throw IllegalArgumentException(
                 "Unsupported database type in the url: $url. " +
-                    "Only H2, MariaDB, MySQL, MSSQL, SQLite and PostgreSQL are supported!",
+                    "Only H2, MariaDB, MySQL, MSSQL, SQLite, PostgreSQL, and DuckDB are supported!",
             )
         }
     } else {
diff --git a/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/commonTestScenarios.kt b/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/commonTestScenarios.kt
index f6de7b3d5b..35435cf664 100644
--- a/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/commonTestScenarios.kt
+++ b/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/commonTestScenarios.kt
@@ -1,8 +1,11 @@
 package org.jetbrains.kotlinx.dataframe.io
 
+import io.kotest.assertions.withClue
 import io.kotest.matchers.shouldBe
 import org.intellij.lang.annotations.Language
+import org.jetbrains.kotlinx.dataframe.AnyFrame
 import org.jetbrains.kotlinx.dataframe.DataFrame
+import org.jetbrains.kotlinx.dataframe.api.inferType
 import org.jetbrains.kotlinx.dataframe.api.schema
 import org.jetbrains.kotlinx.dataframe.io.db.MsSql
 import java.sql.Connection
@@ -128,3 +131,26 @@ internal fun inferNullability(connection: Connection) {
 
     connection.createStatement().execute("DROP TABLE TestTable1")
 }
+
+/**
+ * Helper to check whether the provided schema matches the inferred schema.
+ *
+ * It must hold that all types in the provided schema are equal to, or supertypes of,
+ * the corresponding types in the inferred schema.
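+ * For example, a column provided as `Number?` may be inferred as `Int`, but not the other way around.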
+ */ +@Suppress("INVISIBLE_REFERENCE") +fun AnyFrame.assertInferredTypesMatchSchema() { + withClue({ + """ + |Inferred schema must be <: Provided schema + | + |Inferred Schema: + |${inferType().schema().toString().lines().joinToString("\n|")} + | + |Provided Schema: + |${schema().toString().lines().joinToString("\n|")} + """.trimMargin() + }) { + schema().compare(inferType().schema()).isSuperOrEqual() shouldBe true + } +} diff --git a/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/local/duckDbTest.kt b/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/local/duckDbTest.kt new file mode 100644 index 0000000000..70f723acee --- /dev/null +++ b/dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/local/duckDbTest.kt @@ -0,0 +1,635 @@ +@file:Suppress("SqlDialectInspection") + +package org.jetbrains.kotlinx.dataframe.io.local + +import io.kotest.matchers.shouldBe +import org.duckdb.DuckDBConnection +import org.duckdb.DuckDBResultSet +import org.duckdb.JsonNode +import org.jetbrains.kotlinx.dataframe.AnyFrame +import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.annotations.ColumnName +import org.jetbrains.kotlinx.dataframe.annotations.DataSchema +import org.jetbrains.kotlinx.dataframe.api.cast +import org.jetbrains.kotlinx.dataframe.api.colsOf +import org.jetbrains.kotlinx.dataframe.api.convert +import org.jetbrains.kotlinx.dataframe.api.reorderColumnsByName +import org.jetbrains.kotlinx.dataframe.api.schema +import org.jetbrains.kotlinx.dataframe.api.single +import org.jetbrains.kotlinx.dataframe.api.toDataFrame +import org.jetbrains.kotlinx.dataframe.api.values +import org.jetbrains.kotlinx.dataframe.api.with +import org.jetbrains.kotlinx.dataframe.io.DbConnectionConfig +import org.jetbrains.kotlinx.dataframe.io.assertInferredTypesMatchSchema +import org.jetbrains.kotlinx.dataframe.io.db.DuckDb +import org.jetbrains.kotlinx.dataframe.io.getSchemaForAllSqlTables +import org.jetbrains.kotlinx.dataframe.io.getSchemaForResultSet +import org.jetbrains.kotlinx.dataframe.io.getSchemaForSqlTable +import org.jetbrains.kotlinx.dataframe.io.readAllSqlTables +import org.jetbrains.kotlinx.dataframe.io.readDataFrame +import org.jetbrains.kotlinx.dataframe.io.readResultSet +import org.jetbrains.kotlinx.dataframe.io.readSqlQuery +import org.jetbrains.kotlinx.dataframe.io.readSqlTable +import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema +import org.junit.Ignore +import org.junit.Test +import java.math.BigDecimal +import java.math.BigInteger +import java.nio.ByteBuffer +import java.sql.Blob +import java.sql.DriverManager +import java.sql.Timestamp +import java.time.LocalDate +import java.time.LocalTime +import java.time.OffsetDateTime +import java.util.UUID + +private const val URL = "jdbc:duckdb:" + +class DuckDbTest { + + // region expected schemas + @DataSchema + data class Person( + val id: Int, + val name: String, + val age: Int, + val salary: Double, + @ColumnName("hire_date") + val hireDate: LocalDate, + ) { + companion object { + val expected = listOf( + Person(1, "John Doe", 30, 50000.0, LocalDate.of(2020, 1, 15)), + Person(2, "Jane Smith", 28, 55000.0, LocalDate.of(2021, 3, 20)), + Person(3, "Bob Johnson", 35, 65000.0, LocalDate.of(2019, 11, 10)), + Person(4, "Alice Brown", 32, 60000.0, LocalDate.of(2020, 7, 1)), + ).toDataFrame() + } + } + + @DataSchema + data class GeneralPurposeTypes( + @ColumnName("bigint_col") + val bigintCol: Long, + @ColumnName("binary_col") + val binaryCol: Blob, + 
@ColumnName("bit_col") + val bitCol: String, + @ColumnName("bitstring_col") + val bitstringCol: String, + @ColumnName("blob_col") + val blobCol: Blob, + @ColumnName("bool_col") + val boolCol: Boolean, + @ColumnName("boolean_col") + val booleanCol: Boolean, + @ColumnName("bpchar_col") + val bpcharCol: String, + @ColumnName("bytea_col") + val byteaCol: Blob, + @ColumnName("char_col") + val charCol: String, + @ColumnName("date_col") + val dateCol: LocalDate, + @ColumnName("datetime_col") + val datetimeCol: Timestamp, + @ColumnName("decimal_col") + val decimalCol: BigDecimal, + @ColumnName("double_col") + val doubleCol: Double, + @ColumnName("enum_col") + val enumCol: String, + @ColumnName("float4_col") + val float4Col: Float, + @ColumnName("float8_col") + val float8Col: Double, + @ColumnName("float_col") + val floatCol: Float, + @ColumnName("hugeint_col") + val hugeintCol: BigInteger, + @ColumnName("int128_col") + val int128Col: BigInteger, + @ColumnName("int16_col") + val int16Col: Short, + @ColumnName("int1_col") + val int1Col: Byte, + @ColumnName("int2_col") + val int2Col: Short, + @ColumnName("int32_col") + val int32Col: Int, + @ColumnName("int4_col") + val int4Col: Int, + @ColumnName("int64_col") + val int64Col: Long, + @ColumnName("int8_col") + val int8Col: Long, + @ColumnName("int_col") + val intCol: Int, + @ColumnName("integer_col") + val integerCol: Int, + @ColumnName("interval_col") + val intervalCol: String, + @ColumnName("json_col") + val jsonCol: JsonNode, + @ColumnName("logical_col") + val logicalCol: Boolean, + @ColumnName("long_col") + val longCol: Long, + @ColumnName("numeric_col") + val numericCol: BigDecimal, + @ColumnName("real_col") + val realCol: Float, + @ColumnName("short_col") + val shortCol: Short, + @ColumnName("signed_col") + val signedCol: Int, + @ColumnName("smallint_col") + val smallintCol: Short, + @ColumnName("string_col") + val stringCol: String, + @ColumnName("text_col") + val textCol: String, + @ColumnName("time_col") + val timeCol: LocalTime, + @ColumnName("timestamp_col") + val timestampCol: Timestamp, + @ColumnName("timestamptz_col") + val timestamptzCol: OffsetDateTime, + @ColumnName("timestampwtz_col") + val timestampwtzCol: OffsetDateTime, + @ColumnName("tinyint_col") + val tinyintCol: Byte, + @ColumnName("ubigint_col") + val ubigintCol: BigInteger, + @ColumnName("uhugeint_col") + val uhugeintCol: BigInteger, + @ColumnName("uint128_col") + val uint128Col: BigInteger, + @ColumnName("uint16_col") + val uint16Col: Int, + @ColumnName("uint32_col") + val uint32Col: Long, + @ColumnName("uint64_col") + val uint64Col: BigInteger, + @ColumnName("uint8_col") + val uint8Col: Short, + @ColumnName("uint_col") + val uintCol: Long, + @ColumnName("usmallint_col") + val usmallintCol: Int, + @ColumnName("utinyint_col") + val utinyintCol: Short, + @ColumnName("uuid_col") + val uuidCol: UUID, + @ColumnName("varbinary_col") + val varbinaryCol: Blob, + @ColumnName("varchar_col") + val varcharCol: String, + ) { + companion object { + val expected = listOf( + GeneralPurposeTypes( + bigintCol = 9223372036854775807L, + binaryCol = DuckDBResultSet.DuckDBBlobResult(ByteBuffer.wrap("DEADBEEF".toByteArray())), + bitCol = "1010", + bitstringCol = "1010", + blobCol = DuckDBResultSet.DuckDBBlobResult(ByteBuffer.wrap("DEADBEEF".toByteArray())), + boolCol = true, + booleanCol = true, + bpcharCol = "test", + byteaCol = DuckDBResultSet.DuckDBBlobResult(ByteBuffer.wrap("DEADBEEF".toByteArray())), + charCol = "test", + dateCol = LocalDate.parse("2025-06-19"), + datetimeCol = 
Timestamp.valueOf("2025-06-19 12:34:56"),
+                    decimalCol = BigDecimal("123.45"),
+                    doubleCol = 3.14159,
+                    enumCol = "female",
+                    float4Col = 3.14f,
+                    float8Col = 3.14159,
+                    floatCol = 3.14f,
+                    hugeintCol = BigInteger("170141183460469231731687303715884105727"),
+                    int128Col = BigInteger("170141183460469231731687303715884105727"),
+                    int16Col = 32767,
+                    int1Col = 127,
+                    int2Col = 32767,
+                    int32Col = 2147483647,
+                    int4Col = 2147483647,
+                    int64Col = 9223372036854775807L,
+                    int8Col = 9223372036854775807L,
+                    intCol = 2147483647,
+                    integerCol = 2147483647,
+                    intervalCol = "1 year",
+                    jsonCol = JsonNode("{\"key\": \"value\"}"),
+                    logicalCol = true,
+                    longCol = 9223372036854775807L,
+                    numericCol = BigDecimal("123.45"),
+                    realCol = 3.14f,
+                    shortCol = 32767,
+                    signedCol = 2147483647,
+                    smallintCol = 32767,
+                    stringCol = "test string",
+                    textCol = "test text",
+                    timeCol = LocalTime.parse("12:34:56"),
+                    timestampCol = Timestamp.valueOf("2025-06-19 12:34:56"),
+                    timestamptzCol = OffsetDateTime.parse("2025-06-19T12:34:56+02:00"),
+                    timestampwtzCol = OffsetDateTime.parse("2025-06-19T12:34:56+02:00"),
+                    tinyintCol = 127,
+                    ubigintCol = BigInteger("18446744073709551615"),
+                    uhugeintCol = BigInteger("340282366920938463463374607431768211455"),
+                    uint128Col = BigInteger("340282366920938463463374607431768211455"),
+                    uint16Col = 65535,
+                    uint32Col = 4294967295L,
+                    uint64Col = BigInteger("18446744073709551615"),
+                    uint8Col = 255,
+                    uintCol = 4294967295L,
+                    usmallintCol = 65535,
+                    utinyintCol = 255,
+                    uuidCol = UUID.fromString("a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a11"),
+                    varbinaryCol = DuckDBResultSet.DuckDBBlobResult(ByteBuffer.wrap("DEADBEEF".toByteArray())),
+                    varcharCol = "test string",
+                ),
+            ).toDataFrame()
+        }
+    }
+
+    @DataSchema
+    data class NestedTypes(
+        @ColumnName("ijstruct_col")
+        val ijstructCol: java.sql.Struct,
+        @ColumnName("intarray_col")
+        val intarrayCol: java.sql.Array,
+        @ColumnName("intlist_col")
+        val intlistCol: java.sql.Array,
+        @ColumnName("intstringmap_col")
+        val intstringmapCol: Map<Int, String?>,
+        @ColumnName("intstrinstinggmap_col")
+        val intstrinstinggmapCol: Map<Int, Map<String, String?>?>,
+        @ColumnName("stringarray_col")
+        val stringarrayCol: java.sql.Array,
+        @ColumnName("stringlist_col")
+        val stringlistCol: java.sql.Array,
+        @ColumnName("stringlistlist_col")
+        val stringlistlistCol: java.sql.Array,
+        @ColumnName("union_col")
+        val unionCol: Any,
+    )
+
+    // endregion
+
+    @Test
+    fun `read simple dataframe from DuckDB`() {
+        val df: AnyFrame
+        val schema: DataFrameSchema
+        val subset: AnyFrame
+        DriverManager.getConnection(URL).use { connection ->
+            connection.prepareStatement(
+                """
+                CREATE TABLE IF NOT EXISTS test_table (
+                    id INTEGER PRIMARY KEY,
+                    name VARCHAR,
+                    age INTEGER,
+                    salary DOUBLE,
+                    hire_date DATE
+                )
+                """.trimIndent(),
+            ).executeUpdate()
+
+            connection.prepareStatement(
+                """
+                INSERT INTO test_table (id, name, age, salary, hire_date)
+                VALUES
+                (1, 'John Doe', 30, 50000.00, '2020-01-15'),
+                (2, 'Jane Smith', 28, 55000.00, '2021-03-20'),
+                (3, 'Bob Johnson', 35, 65000.00, '2019-11-10'),
+                (4, 'Alice Brown', 32, 60000.00, '2020-07-01')
+                """.trimIndent(),
+            ).executeUpdate()
+
+            df = DataFrame.readSqlTable(connection, "test_table")
+            schema = DataFrame.getSchemaForSqlTable(connection, "test_table")
+
+            subset = DataFrame.readSqlQuery(connection, """SELECT test_table.name, test_table.age FROM test_table""")
+        }
+
+        schema.compare(Person.expected.schema()).isSuperOrEqual() shouldBe true
+
+        df.cast<Person>(verify = true) shouldBe Person.expected
+        df.assertInferredTypesMatchSchema()
+
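+        // the subset returned by the SQL query should match the corresponding columns of the full table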
+        subset.assertInferredTypesMatchSchema()
+        subset["name"] shouldBe df["name"]
+        subset["age"] shouldBe df["age"]
+        subset.columnsCount() shouldBe 2
+    }
+
+    @Test
+    fun `read simple dataframe from DuckDB ResultSet`() {
+        val df: AnyFrame
+        val schema: DataFrameSchema
+        DriverManager.getConnection(URL).use { connection ->
+            connection.prepareStatement(
+                """
+                CREATE TABLE IF NOT EXISTS test_table (
+                    id INTEGER PRIMARY KEY,
+                    name VARCHAR,
+                    age INTEGER,
+                    salary DOUBLE,
+                    hire_date DATE
+                )
+                """.trimIndent(),
+            ).executeUpdate()
+
+            connection.prepareStatement(
+                """
+                INSERT INTO test_table (id, name, age, salary, hire_date)
+                VALUES
+                (1, 'John Doe', 30, 50000.00, '2020-01-15'),
+                (2, 'Jane Smith', 28, 55000.00, '2021-03-20'),
+                (3, 'Bob Johnson', 35, 65000.00, '2019-11-10'),
+                (4, 'Alice Brown', 32, 60000.00, '2020-07-01')
+                """.trimIndent(),
+            ).executeUpdate()
+
+            connection.prepareStatement("SELECT * FROM test_table").executeQuery().use { rs ->
+                df = DataFrame.readResultSet(rs, DuckDb)
+                schema = DataFrame.getSchemaForResultSet(rs, DuckDb)
+            }
+        }
+
+        schema.compare(Person.expected.schema()).isSuperOrEqual() shouldBe true
+
+        df.cast<Person>(verify = true) shouldBe Person.expected
+        df.assertInferredTypesMatchSchema()
+    }
+
+    @Test
+    fun `read all tables`() {
+        val dfs: Map<String, AnyFrame>
+        val schemas: Map<String, DataFrameSchema>
+        DriverManager.getConnection(URL).use { connection ->
+            connection.prepareStatement(
+                """
+                CREATE TABLE IF NOT EXISTS test_table (
+                    id INTEGER PRIMARY KEY,
+                    name VARCHAR,
+                    age INTEGER,
+                    salary DOUBLE,
+                    hire_date DATE
+                )
+                """.trimIndent(),
+            ).executeUpdate()
+
+            connection.prepareStatement(
+                """
+                INSERT INTO test_table (id, name, age, salary, hire_date)
+                VALUES
+                (1, 'John Doe', 30, 50000.00, '2020-01-15'),
+                (2, 'Jane Smith', 28, 55000.00, '2021-03-20'),
+                (3, 'Bob Johnson', 35, 65000.00, '2019-11-10'),
+                (4, 'Alice Brown', 32, 60000.00, '2020-07-01')
+                """.trimIndent(),
+            ).executeUpdate()
+
+            dfs = DataFrame.readAllSqlTables(connection = connection)
+            schemas = DataFrame.getSchemaForAllSqlTables(connection = connection)
+        }
+
+        val df = dfs["test_table"]!!
+        val schema = schemas["test_table"]!!
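+        // system tables were filtered out via DuckDb.isSystemTable, so only the table we created remains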
+
+        schema.compare(Person.expected.schema()).isSuperOrEqual() shouldBe true
+
+        df.cast<Person>(verify = true) shouldBe Person.expected
+        df.assertInferredTypesMatchSchema()
+    }
+
+    /**
+     * https://duckdb.org/docs/stable/sql/data_types/overview.html
+     */
+    @Test
+    fun `read each general-purpose DuckDB type`() {
+        val df: AnyFrame
+        val schema: DataFrameSchema
+        DriverManager.getConnection(URL).use { connection ->
+            connection as DuckDBConnection
+            connection.prepareStatement(
+                """
+                CREATE TABLE IF NOT EXISTS table1 (
+                    bigint_col BIGINT,
+                    int8_col INT8,
+                    int64_col INT64,
+                    long_col LONG,
+                    bit_col BIT,
+                    bitstring_col BITSTRING,
+                    blob_col BLOB,
+                    bytea_col BYTEA,
+                    binary_col BINARY,
+                    varbinary_col VARBINARY,
+                    boolean_col BOOLEAN,
+                    bool_col BOOL,
+                    logical_col LOGICAL,
+                    date_col DATE,
+                    decimal_col DECIMAL(10,2),
+                    numeric_col NUMERIC(10,2),
+                    double_col DOUBLE,
+                    float8_col FLOAT8,
+                    float_col FLOAT,
+                    float4_col FLOAT4,
+                    real_col REAL,
+                    hugeint_col HUGEINT,
+                    int128_col INT128,
+                    integer_col INTEGER,
+                    int4_col INT4,
+                    int32_col INT32,
+                    int_col INT,
+                    signed_col SIGNED,
+                    interval_col INTERVAL,
+                    json_col JSON,
+                    smallint_col SMALLINT,
+                    int2_col INT2,
+                    int16_col INT16,
+                    short_col SHORT,
+                    time_col TIME,
+                    timestampwtz_col TIMESTAMP WITH TIME ZONE,
+                    timestamptz_col TIMESTAMPTZ,
+                    timestamp_col TIMESTAMP,
+                    datetime_col DATETIME,
+                    tinyint_col TINYINT,
+                    int1_col INT1,
+                    ubigint_col UBIGINT,
+                    uint64_col UINT64,
+                    uhugeint_col UHUGEINT,
+                    uint128_col UINT128,
+                    uint_col UINTEGER,
+                    uint32_col UINT32,
+                    usmallint_col USMALLINT,
+                    uint16_col UINT16,
+                    utinyint_col UTINYINT,
+                    uint8_col UINT8,
+                    uuid_col UUID,
+                    varchar_col VARCHAR,
+                    char_col CHAR(10),
+                    bpchar_col BPCHAR(10),
+                    text_col TEXT,
+                    string_col STRING,
+                    enum_col ENUM('male', 'female', 'other')
+                )
+                """.trimIndent(),
+            ).executeUpdate()
+
+            connection.prepareStatement(
+                """
+                INSERT INTO table1 VALUES (
+                    9223372036854775807, -- bigint
+                    9223372036854775807, -- int8
+                    9223372036854775807, -- int64
+                    9223372036854775807, -- long
+                    '1010', -- bit
+                    '1010', -- bitstring
+                    'DEADBEEF'::BLOB, -- blob
+                    'DEADBEEF'::BLOB, -- bytea
+                    'DEADBEEF'::BLOB, -- binary
+                    'DEADBEEF'::BLOB, -- varbinary
+                    true, -- boolean
+                    true, -- bool
+                    true, -- logical
+                    '2025-06-19', -- date
+                    123.45, -- decimal
+                    123.45, -- numeric
+                    3.14159, -- double
+                    3.14159, -- float8
+                    3.14, -- float
+                    3.14, -- float4
+                    3.14, -- real
+                    '170141183460469231731687303715884105727', -- hugeint
+                    '170141183460469231731687303715884105727', -- int128
+                    2147483647, -- integer
+                    2147483647, -- int4
+                    2147483647, -- int32
+                    2147483647, -- int
+                    2147483647, -- signed
+                    INTERVAL '1' YEAR, -- interval
+                    '{"key": "value"}'::JSON, -- json
+                    32767, -- smallint
+                    32767, -- int2
+                    32767, -- int16
+                    32767, -- short
+                    '12:34:56', -- time
+                    '2025-06-19 12:34:56+02', -- timestampwtz
+                    '2025-06-19 12:34:56+02', -- timestamptz
+                    '2025-06-19 12:34:56', -- timestamp
+                    '2025-06-19 12:34:56', -- datetime
+                    127, -- tinyint
+                    127, -- int1
+                    18446744073709551615, -- ubigint
+                    18446744073709551615, -- uint64
+                    '340282366920938463463374607431768211455', -- uhugeint
+                    '340282366920938463463374607431768211455', -- uint128
+                    4294967295, -- uinteger
+                    4294967295, -- uint32
+                    65535, -- usmallint
+                    65535, -- uint16
+                    255, -- utinyint
+                    255, -- uint8
+                    'a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a11', -- uuid
+                    'test string', -- varchar
+                    'test', -- char
+                    'test', -- bpchar
+                    'test text', -- text
+                    'test string', -- string
+                    'female' -- enum
+                )
+                """.trimIndent(),
+            ).executeUpdate()
+
+            schema = DataFrame.getSchemaForSqlTable(connection, "table1")
+            df = DataFrame.readSqlTable(connection, "table1").reorderColumnsByName()
+        }
+
+        schema.compare(GeneralPurposeTypes.expected.schema()).isSuperOrEqual() shouldBe true
+
+        // on some systems OffsetDateTime's get converted to UTC sometimes, let's compare them as Instant instead
+        fun AnyFrame.fixOffsetDateTime() = convert { colsOf<OffsetDateTime>() }.with { it.toInstant() }
+
+        df.cast<GeneralPurposeTypes>(verify = true).fixOffsetDateTime() shouldBe
+            GeneralPurposeTypes.expected.fixOffsetDateTime()
+        df.assertInferredTypesMatchSchema()
+    }
+
+    /**
+     * https://duckdb.org/docs/stable/sql/data_types/overview.html
+     */
+    @Test
+    fun `read each nested DuckDB type`() {
+        val df: AnyFrame
+        val schema: DataFrameSchema
+        DriverManager.getConnection(URL).use { connection ->
+            connection as DuckDBConnection
+            connection.prepareStatement(
+                """
+                CREATE TABLE IF NOT EXISTS table2 (
+                    intarray_col INTEGER[3],
+                    stringarray_col VARCHAR[3],
+                    intlist_col INTEGER[],
+                    stringlist_col VARCHAR[],
+                    stringlistlist_col VARCHAR[][],
+                    intstringmap_col MAP(INTEGER, VARCHAR),
+                    intstrinstinggmap_col MAP(INTEGER, MAP(VARCHAR, VARCHAR)),
+                    ijstruct_col STRUCT(i INTEGER, j VARCHAR),
+                    union_col UNION(num INTEGER, text VARCHAR),
+                )
+                """.trimIndent(),
+            ).executeUpdate()
+
+            connection.prepareStatement(
+                """
+                INSERT INTO table2 VALUES (
+                    array_value(1, 2, NULL), -- int array
+                    array_value('a', 'ab', 'abc'), -- string array
+                    list_value(1, 2, 3), -- int list
+                    list_value('a', 'ab', 'abc'), -- string list
+                    list_value(list_value('a', 'ab'), list_value('abc'), NULL), -- string list list
+                    MAP { 1: 'value1', 200: 'value2' }, -- int string map
+                    MAP { 1: MAP { 'value1': 'a', 'value2': 'b' }, 200: MAP { 'value1': 'c', 'value2': 'd' } }, -- int string string map
+                    { 'i': 42, 'j': 'answer' }, -- struct
+                    union_value(num := 2), -- union
+                )
+                """.trimIndent(),
+            ).executeUpdate()
+
+            schema = DataFrame.getSchemaForSqlTable(connection, "table2")
+            df = DataFrame.readSqlTable(connection, "table2")
+        }
+
+        df.assertInferredTypesMatchSchema()
+
+        df.cast<NestedTypes>(verify = true)
+        df as DataFrame<NestedTypes>
+
+        df.single().let {
+            it[{ "intarray_col"<java.sql.Array>() }].array shouldBe arrayOf(1, 2, null)
+            it[{ "stringarray_col"<java.sql.Array>() }].array shouldBe arrayOf("a", "ab", "abc")
+            it[{ "intlist_col"<java.sql.Array>() }].array shouldBe arrayOf(1, 2, 3)
+            it[{ "stringlist_col"<java.sql.Array>() }].array shouldBe arrayOf("a", "ab", "abc")
+            (it[{ "stringlistlist_col"<java.sql.Array>() }].array as Array<*>)
+                .map { (it as java.sql.Array?)?.array } shouldBe listOf(arrayOf("a", "ab"), arrayOf("abc"), null)
+            it[{ "intstringmap_col"<Map<Int, String?>>() }] shouldBe mapOf(1 to "value1", 200 to "value2")
+            it[{ "intstrinstinggmap_col"<Map<Int, Map<String, String?>>>() }] shouldBe mapOf(
+                1 to mapOf("value1" to "a", "value2" to "b"),
+                200 to mapOf("value1" to "c", "value2" to "d"),
+            )
+            it[{ "ijstruct_col"<java.sql.Struct>() }].attributes shouldBe arrayOf(42, "answer")
+            it[{ "union_col"<Any>() }] shouldBe 2
+        }
+    }
+
+    // TODO Issue #1365
+    @Ignore
+    @Test
+    fun `change read mode`() {
+        val config = DbConnectionConfig("jdbc:duckdb:", readOnly = true)
+        val df = config.readDataFrame("SELECT 1, 2, 3")
+        df.values().toList() shouldBe listOf(1, 2, 3)
+    }
+}
diff --git a/docs/StardustDocs/d.tree b/docs/StardustDocs/d.tree
index 5ac4efb935..0eb814ea85 100644
--- a/docs/StardustDocs/d.tree
+++ b/docs/StardustDocs/d.tree
@@ -202,6 +202,7 @@
+                    <toc-element topic="DuckDB.md"/>
diff --git a/docs/StardustDocs/topics/dataSources/Data-Sources.md b/docs/StardustDocs/topics/dataSources/Data-Sources.md
index 1e83f48821..bb310cadf1 100644
--- a/docs/StardustDocs/topics/dataSources/Data-Sources.md
+++ b/docs/StardustDocs/topics/dataSources/Data-Sources.md
@@ -28,6 +28,7 @@ Below you'll find a list of supported sources along with instructions on how to
   - [SQLite](SQLite.md)
   - [H2](H2.md)
   - [MariaDB](MariaDB.md)
+  - [DuckDB](DuckDB.md)
 - [Custom SQL Source](Custom-SQL-Source.md)
 - [Custom integrations with unsupported data sources](Integrations.md)
diff --git a/docs/StardustDocs/topics/dataSources/sql/DuckDB.md b/docs/StardustDocs/topics/dataSources/sql/DuckDB.md
new file mode 100644
index 0000000000..d2a9501396
--- /dev/null
+++ b/docs/StardustDocs/topics/dataSources/sql/DuckDB.md
@@ -0,0 +1,107 @@
+# DuckDB
+
+<web-summary>
+Work with DuckDB databases in Kotlin — read tables and queries into DataFrames using JDBC.
+</web-summary>
+
+<card-summary>
+Use Kotlin DataFrame to query and transform DuckDB data directly via JDBC.
+</card-summary>
+
+<link-summary>
+Read DuckDB data into Kotlin DataFrame with JDBC support.
+</link-summary>
+
+Kotlin DataFrame supports reading from [DuckDB](https://duckdb.org/) databases using JDBC.
+
+This requires the [`dataframe-jdbc` module](Modules.md#dataframe-jdbc),
+which is included by default in the general [`dataframe` artifact](Modules.md#dataframe-general)
+and in [`%use dataframe`](SetupKotlinNotebook.md#integrate-kotlin-dataframe) for Kotlin Notebook.
+
+You’ll also need [the official DuckDB JDBC driver](https://duckdb.org/docs/stable/clients/java):
+
+<tabs>
+<tab title="Gradle">
+
+```kotlin
+dependencies {
+    implementation("org.duckdb:duckdb_jdbc:$version")
+}
+```
+
+</tab>
+<tab title="Kotlin Notebook">
+
+```kotlin
+USE {
+    dependencies("org.duckdb:duckdb_jdbc:$version")
+}
+```
+
+</tab>
+</tabs>
+
+The actual Maven Central driver version can be found
+[here](https://mvnrepository.com/artifact/org.duckdb/duckdb_jdbc).
+
+## Read
+
+A [`DataFrame`](DataFrame.md) instance can be loaded from a database in several ways:
+a user can read data from a SQL table by a given name ([`readSqlTable`](readSqlDatabases.md)),
+as the result of a user-defined SQL query ([`readSqlQuery`](readSqlDatabases.md)),
+or from a given `ResultSet` ([`readResultSet`](readSqlDatabases.md)).
+It is also possible to load all data from non-system tables, each into a separate
+`DataFrame` ([`readAllSqlTables`](readSqlDatabases.md)).
+
+See [](readSqlDatabases.md) for more details.
+
+<!---FUN readSqlTable-->
+
+```kotlin
+val url = "jdbc:duckdb:/testDatabase"
+val username = "duckdb"
+val password = "password"
+
+val dbConfig = DbConnectionConfig(url, username, password)
+
+val tableName = "Customer"
+
+val df = DataFrame.readSqlTable(dbConfig, tableName)
+```
+
+<!---END-->
+
+### Extensions
+
+DuckDB has a special trick up its sleeve: it has support
+for [extensions](https://duckdb.org/docs/stable/extensions/overview).
+These can be installed, loaded, and used to connect to a different database via DuckDB.
+See [Core Extensions](https://duckdb.org/docs/stable/core_extensions/overview) for a list of available extensions.
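+
+In general (a minimal sketch; `httpfs` here is just an example extension name), installing and loading
+an extension is plain SQL executed over the same JDBC connection:
+
+```kotlin
+DriverManager.getConnection("jdbc:duckdb:").use { connection ->
+    // INSTALL fetches the extension once; LOAD activates it for this connection
+    connection.createStatement().execute("INSTALL httpfs; LOAD httpfs;")
+}
+```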
+
+For example, let's load a dataframe
+from [Apache Iceberg via DuckDB](https://duckdb.org/docs/stable/core_extensions/iceberg/overview.html),
+as Iceberg is an unsupported data source in DataFrame at the moment:
+
+<!---FUN readIcebergExtension-->
+
+```kotlin
+// Creating an in-memory DuckDB database
+val connection = DriverManager.getConnection("jdbc:duckdb:")
+val df = connection.use { connection ->
+    // install and load Iceberg
+    connection.createStatement().execute("INSTALL iceberg; LOAD iceberg;")
+
+    // query a table from Iceberg using a specific SQL query
+    DataFrame.readSqlQuery(
+        connection = connection,
+        sqlQuery = "SELECT * FROM iceberg_scan('data/iceberg/lineitem_iceberg', allow_moved_paths = true);",
+    )
+}
+```
+
+<!---END-->
+
+As you can see, the process is very similar to reading from any other JDBC database,
+just without needing explicit DataFrame support.
diff --git a/docs/StardustDocs/topics/dataSources/sql/SQL.md b/docs/StardustDocs/topics/dataSources/sql/SQL.md
index c612e16283..bc4f99fa74 100644
--- a/docs/StardustDocs/topics/dataSources/sql/SQL.md
+++ b/docs/StardustDocs/topics/dataSources/sql/SQL.md
@@ -29,6 +29,7 @@ Kotlin DataFrame provides out-of-the-box support for the most common SQL databas
 - [SQLite](SQLite.md)
 - [H2](H2.md)
 - [MariaDB](MariaDB.md)
+- [DuckDB](DuckDB.md)
 
 You can also define a [Custom SQL Source](Custom-SQL-Source.md) to work with any other JDBC-compatible database.
diff --git a/docs/StardustDocs/topics/readSqlDatabases.md b/docs/StardustDocs/topics/readSqlDatabases.md
index b539e51069..dd6e613177 100644
--- a/docs/StardustDocs/topics/readSqlDatabases.md
+++ b/docs/StardustDocs/topics/readSqlDatabases.md
@@ -44,22 +44,22 @@ Also, there are a few **extension functions** available on `Connection`,
 
 **NOTE:** This is an experimental module, and for now,
-we only support four databases: MS SQL, MariaDB, MySQL, PostgreSQL, and SQLite.
+we only support these databases: MS SQL, MariaDB, MySQL, PostgreSQL, SQLite, and DuckDB.
 
 Moreover, since release 0.15 we support the possibility to register custom SQL database,
 read more in our [guide](readSqlFromCustomDatabase.md).
 
 Additionally, support for JSON and date-time types is limited.
 Please take this into consideration when using these functions.
 
-## Getting started with reading from SQL database in Gradle Project
+## Getting started with reading from SQL database in a Gradle Project
 
-In the first, you need to add a dependency
+First, you need to add a dependency
 
 ```kotlin
 implementation("org.jetbrains.kotlinx:dataframe-jdbc:$dataframe_version")
 ```
 
-after that, you need to add a dependency for a JDBC driver for the used database, for example
+after that, you need to add the dependency for the database's JDBC driver, for example
 
 For **MariaDB**:
 
@@ -67,7 +67,7 @@
 implementation("org.mariadb.jdbc:mariadb-java-client:$version")
 ```
 
-Maven Central version could be found [here](https://mvnrepository.com/artifact/org.mariadb.jdbc/mariadb-java-client).
+The Maven Central version can be found [here](https://mvnrepository.com/artifact/org.mariadb.jdbc/mariadb-java-client).
 
 For **PostgreSQL**:
 
@@ -75,7 +75,7 @@
 implementation("org.postgresql:postgresql:$version")
 ```
 
-Maven Central version could be found [here](https://mvnrepository.com/artifact/org.postgresql/postgresql).
+The Maven Central version can be found [here](https://mvnrepository.com/artifact/org.postgresql/postgresql).
 
 For **MySQL**:
 
@@ -83,7 +83,7 @@
 implementation("com.mysql:mysql-connector-j:$version")
 ```
 
-Maven Central version could be found [here](https://mvnrepository.com/artifact/com.mysql/mysql-connector-j).
+The Maven Central version can be found [here](https://mvnrepository.com/artifact/com.mysql/mysql-connector-j).
 
 For **SQLite**:
 
@@ -91,7 +91,7 @@
 implementation("org.xerial:sqlite-jdbc:$version")
 ```
 
-Maven Central version could be found [here](https://mvnrepository.com/artifact/org.xerial/sqlite-jdbc).
+The Maven Central version can be found [here](https://mvnrepository.com/artifact/org.xerial/sqlite-jdbc).
 
 For **MS SQL**:
 
@@ -99,16 +99,24 @@
 implementation("com.microsoft.sqlserver:mssql-jdbc:$version")
 ```
 
-Maven Central version could be found [here](https://mvnrepository.com/artifact/com.microsoft.sqlserver/mssql-jdbc).
+The Maven Central version can be found [here](https://mvnrepository.com/artifact/com.microsoft.sqlserver/mssql-jdbc).
 
-In the second, be sure that you can establish a connection to the database.
+For **DuckDB**:
 
-For this, usually, you need to have three things: a URL to a database, a username, and a password.
+```kotlin
+implementation("org.duckdb:duckdb_jdbc:$version")
+```
+
+The Maven Central version can be found [here](https://mvnrepository.com/artifact/org.duckdb/duckdb_jdbc).
+
+Next, be sure that you can establish a connection to the database.
+
+For this, usually, you need to have three things: a URL to the database, a username, and a password.
 
-Call one of the following functions to collect data from a database and transform it to the dataframe.
+Call one of the following functions to collect data from the database and transform it to a dataframe.
 
-For example, if you have a local PostgreSQL database named as `testDatabase` with table `Customer`,
-you could read first 100 rows and print the data just copying the code below:
+For example, if you have a local PostgreSQL database named `testDatabase` with a table `Customer`,
+you can read the first 100 rows and print the data by just copying the code below:
 
 ```kotlin
 import org.jetbrains.kotlinx.dataframe.io.DbConnectionConfig
@@ -127,7 +135,7 @@
 val df = DataFrame.readSqlTable(dbConfig, tableName, 100)
 
 df.print()
 ```
 
-Find a full example project [here](https://github.com/zaleslaw/KotlinDataFrame-SQL-Examples/).
+You can find a full example project [here](https://github.com/zaleslaw/KotlinDataFrame-SQL-Examples/).
 
 ## Getting Started with Notebooks
 
@@ -317,7 +325,7 @@ Note that reading from the `ResultSet` could potentially change its state.
 
 The `dbType: DbType` parameter specifies the type of our database (e.g., PostgreSQL, MySQL, etc.),
 supported by a library.
-Currently, the following classes are available: `H2, MsSql, MariaDb, MySql, PostgreSql, Sqlite`.
+Currently, the following classes are available: `H2, MsSql, MariaDb, MySql, PostgreSql, Sqlite, DuckDb`.
 
 Also, users have an ability to pass objects, describing their custom databases,
 more information in [guide](readSqlFromCustomDatabase.md).
 
@@ -525,7 +533,7 @@ This function reads the schema from a `ResultSet` object provided by the user.
 
 The `dbType: DbType` parameter specifies the type of our database (e.g., PostgreSQL, MySQL, etc.),
 supported by a library.
-Currently, the following classes are available: `H2, MariaDb, MySql, PostgreSql, Sqlite`.
+Currently, the following classes are available: `H2, MsSql, MariaDb, MySql, PostgreSql, Sqlite, DuckDb`.
Also, users have an ability to pass objects, describing their custom databases, more information in [guide](readSqlFromCustomDatabase.md). diff --git a/docs/StardustDocs/topics/schemas/gradle/Gradle-Plugin.md b/docs/StardustDocs/topics/schemas/gradle/Gradle-Plugin.md index e3e9b25651..e59a8f4ae3 100644 --- a/docs/StardustDocs/topics/schemas/gradle/Gradle-Plugin.md +++ b/docs/StardustDocs/topics/schemas/gradle/Gradle-Plugin.md @@ -184,7 +184,7 @@ dataframes { Find full example code [here](https://github.com/zaleslaw/KotlinDataFrame-SQL-Examples/blob/master/src/main/kotlin/Example_3_Import_schema_via_Gradle.kt). **NOTE:** This is an experimental functionality and, for now, -we only support four databases: MariaDB, MySQL, PostgreSQL, and SQLite. +we only support these databases: MariaDB, MySQL, PostgreSQL, SQLite, MS SQL, and DuckDB. Additionally, support for JSON and date-time types is limited. Please take this into consideration when using these functions. diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 1e1c9361a7..1377def238 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -54,7 +54,7 @@ shadow = "8.3.5" android-gradle-api = "7.3.1" # need to revise our tests to update ktor = "3.0.1" # needs jupyter compatibility with Kotlin 2.1 to update kotlin-compile-testing = "0.7.1" -duckdb = "1.2.2.0" +duckdb = "1.3.1.0" buildconfig = "5.6.7" benchmark = "0.4.12" diff --git a/tests/build.gradle.kts b/tests/build.gradle.kts index f39386869b..b6173d4cce 100644 --- a/tests/build.gradle.kts +++ b/tests/build.gradle.kts @@ -66,12 +66,14 @@ korro { include("docs/StardustDocs/topics/write.md") include("docs/StardustDocs/topics/rename.md") include("docs/StardustDocs/topics/guides/*.md") + include("docs/StardustDocs/topics/dataSources/sql/*.md") } samples = fileTree(project.projectDir) { include("src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/*.kt") include("src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/*.kt") include("src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/guides/*.kt") + include("src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/io/*.kt") } groupSamples { diff --git a/tests/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/io/DuckDb.kt b/tests/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/io/DuckDb.kt new file mode 100644 index 0000000000..d75d5b24c0 --- /dev/null +++ b/tests/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/io/DuckDb.kt @@ -0,0 +1,48 @@ +package org.jetbrains.kotlinx.dataframe.samples.io + +import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.io.DbConnectionConfig +import org.jetbrains.kotlinx.dataframe.io.readSqlQuery +import org.jetbrains.kotlinx.dataframe.io.readSqlTable +import org.junit.Ignore +import org.junit.Test +import java.sql.DriverManager + +class DuckDb { + + @Ignore + @Test + fun readSqlTable() { + // SampleStart + val url = "jdbc:duckdb:/testDatabase" + val username = "duckdb" + val password = "password" + + val dbConfig = DbConnectionConfig(url, username, password) + + val tableName = "Customer" + + val df = DataFrame.readSqlTable(dbConfig, tableName) + // SampleEnd + } + + // source: https://duckdb.org/docs/stable/core_extensions/iceberg/overview.html + @Ignore + @Test + fun readIcebergExtension() { + // SampleStart + // Creating an in-memory DuckDB database + val connection = DriverManager.getConnection("jdbc:duckdb:") + val df = connection.use { connection -> + // install and load Iceberg + 
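+            // note: INSTALL persists the extension in the local DuckDB installation; LOAD enables it per connection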
connection.createStatement().execute("INSTALL iceberg; LOAD iceberg;") + + // query a table from Iceberg using a specific SQL query + DataFrame.readSqlQuery( + connection = connection, + sqlQuery = "SELECT * FROM iceberg_scan('data/iceberg/lineitem_iceberg', allow_moved_paths = true);", + ) + } + // SampleEnd + } +}