Skip to content

duckdb support #1366

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import java.net.URL
import java.time.LocalDateTime
import java.time.LocalTime
import kotlin.reflect.KType
import kotlin.reflect.KVariance
import kotlin.reflect.full.isSubtypeOf
import kotlin.reflect.jvm.jvmErasure
import kotlin.reflect.typeOf
Expand Down Expand Up @@ -89,7 +90,12 @@ internal fun renderType(type: KType?): String {
append(name)
if (type.arguments.isNotEmpty()) {
val arguments = type.arguments.joinToString {
renderType(it.type)
when (it.variance) {
null -> "*"
KVariance.INVARIANT -> renderType(it.type)
KVariance.IN -> "in ${renderType(it.type)}"
KVariance.OUT -> "out ${renderType(it.type)}"
}
}
append("<$arguments>")
}
Expand Down
9 changes: 9 additions & 0 deletions dataframe-jdbc/api/dataframe-jdbc.api
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,15 @@ public abstract class org/jetbrains/kotlinx/dataframe/io/db/DbType {
public static synthetic fun sqlQueryLimit$default (Lorg/jetbrains/kotlinx/dataframe/io/db/DbType;Ljava/lang/String;IILjava/lang/Object;)Ljava/lang/String;
}

public final class org/jetbrains/kotlinx/dataframe/io/db/DuckDb : org/jetbrains/kotlinx/dataframe/io/db/DbType {
public static final field INSTANCE Lorg/jetbrains/kotlinx/dataframe/io/db/DuckDb;
public fun buildTableMetadata (Ljava/sql/ResultSet;)Lorg/jetbrains/kotlinx/dataframe/io/TableMetadata;
public fun convertSqlTypeToColumnSchemaValue (Lorg/jetbrains/kotlinx/dataframe/io/TableColumnMetadata;)Lorg/jetbrains/kotlinx/dataframe/schema/ColumnSchema;
public fun convertSqlTypeToKType (Lorg/jetbrains/kotlinx/dataframe/io/TableColumnMetadata;)Lkotlin/reflect/KType;
public fun getDriverClassName ()Ljava/lang/String;
public fun isSystemTable (Lorg/jetbrains/kotlinx/dataframe/io/TableMetadata;)Z
}

public class org/jetbrains/kotlinx/dataframe/io/db/H2 : org/jetbrains/kotlinx/dataframe/io/db/DbType {
public static final field Companion Lorg/jetbrains/kotlinx/dataframe/io/db/H2$Companion;
public static final field MODE_MARIADB Ljava/lang/String;
Expand Down
3 changes: 3 additions & 0 deletions dataframe-jdbc/build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ repositories {

dependencies {
api(projects.core)
compileOnly(libs.duckdb.jdbc)
implementation(libs.kotlinLogging)
testImplementation(libs.mariadb)
testImplementation(libs.sqlite)
Expand All @@ -26,6 +27,8 @@ dependencies {
testImplementation(libs.junit)
testImplementation(libs.sl4jsimple)
testImplementation(libs.jts)
testImplementation(libs.duckdb.jdbc)
testImplementation(projects.dataframeJson)
testImplementation(libs.kotestAssertions) {
exclude("org.jetbrains.kotlin", "kotlin-stdlib-jdk8")
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,219 @@
package org.jetbrains.kotlinx.dataframe.io.db

import org.duckdb.DuckDBColumnType
import org.duckdb.DuckDBColumnType.ARRAY
import org.duckdb.DuckDBColumnType.BIGINT
import org.duckdb.DuckDBColumnType.BIT
import org.duckdb.DuckDBColumnType.BLOB
import org.duckdb.DuckDBColumnType.BOOLEAN
import org.duckdb.DuckDBColumnType.DATE
import org.duckdb.DuckDBColumnType.DECIMAL
import org.duckdb.DuckDBColumnType.DOUBLE
import org.duckdb.DuckDBColumnType.ENUM
import org.duckdb.DuckDBColumnType.FLOAT
import org.duckdb.DuckDBColumnType.HUGEINT
import org.duckdb.DuckDBColumnType.INTEGER
import org.duckdb.DuckDBColumnType.INTERVAL
import org.duckdb.DuckDBColumnType.JSON
import org.duckdb.DuckDBColumnType.LIST
import org.duckdb.DuckDBColumnType.MAP
import org.duckdb.DuckDBColumnType.SMALLINT
import org.duckdb.DuckDBColumnType.STRUCT
import org.duckdb.DuckDBColumnType.TIME
import org.duckdb.DuckDBColumnType.TIMESTAMP
import org.duckdb.DuckDBColumnType.TIMESTAMP_MS
import org.duckdb.DuckDBColumnType.TIMESTAMP_NS
import org.duckdb.DuckDBColumnType.TIMESTAMP_S
import org.duckdb.DuckDBColumnType.TIMESTAMP_WITH_TIME_ZONE
import org.duckdb.DuckDBColumnType.TIME_WITH_TIME_ZONE
import org.duckdb.DuckDBColumnType.TINYINT
import org.duckdb.DuckDBColumnType.UBIGINT
import org.duckdb.DuckDBColumnType.UHUGEINT
import org.duckdb.DuckDBColumnType.UINTEGER
import org.duckdb.DuckDBColumnType.UNION
import org.duckdb.DuckDBColumnType.UNKNOWN
import org.duckdb.DuckDBColumnType.USMALLINT
import org.duckdb.DuckDBColumnType.UTINYINT
import org.duckdb.DuckDBColumnType.UUID
import org.duckdb.DuckDBColumnType.VARCHAR
import org.duckdb.DuckDBResultSetMetaData
import org.duckdb.JsonNode
import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.io.TableColumnMetadata
import org.jetbrains.kotlinx.dataframe.io.TableMetadata
import org.jetbrains.kotlinx.dataframe.io.db.DuckDb.convertSqlTypeToKType
import org.jetbrains.kotlinx.dataframe.io.getSchemaForAllSqlTables
import org.jetbrains.kotlinx.dataframe.io.readAllSqlTables
import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema
import java.math.BigDecimal
import java.math.BigInteger
import java.sql.Array
import java.sql.Blob
import java.sql.DatabaseMetaData
import java.sql.ResultSet
import java.sql.Struct
import java.sql.Timestamp
import java.time.LocalDate
import java.time.LocalTime
import java.time.OffsetDateTime
import java.time.OffsetTime
import java.util.UUID
import kotlin.reflect.KType
import kotlin.reflect.KTypeProjection
import kotlin.reflect.full.createType
import kotlin.reflect.full.withNullability
import kotlin.reflect.typeOf

/**
* Represents the [DuckDB](http://duckdb.org/) database type.
*
* This class provides methods to convert data from a [ResultSet] to the appropriate type for DuckDB,
* and to generate the corresponding [column schema][ColumnSchema].
*/
public object DuckDb : DbType("duckdb") {

/** the name of the class of the DuckDB JDBC driver */
override val driverClassName: String = "org.duckdb.DuckDBDriver"

/**
* How a column type from JDBC, [tableColumnMetadata], is read in Java/Kotlin.
* The returned type must exactly follow [ResultSet.getObject] of your specific database's JDBC driver.
* Returning `null` defer the implementation to the default one (which may not always be correct).
*
* Following [org.duckdb.DuckDBVector.getObject].
*/
override fun convertSqlTypeToKType(tableColumnMetadata: TableColumnMetadata): KType =
tableColumnMetadata.sqlTypeName.toKType(tableColumnMetadata.isNullable)

/**
* How a column from JDBC should be represented as DataFrame (value) column
* See [convertSqlTypeToKType].
*/
override fun convertSqlTypeToColumnSchemaValue(tableColumnMetadata: TableColumnMetadata): ColumnSchema {
val type = convertSqlTypeToKType(tableColumnMetadata)
return ColumnSchema.Value(type)
}

/**
* Follows exactly [org.duckdb.DuckDBVector.getObject].
*
* "// dataframe-jdbc" is added for all types that are covered correctly by
* [org.jetbrains.kotlinx.dataframe.io.makeCommonSqlToKTypeMapping] at the moment, however, to cover
* all nested types, we'll use a full type-map for all [DuckDB types][DuckDBColumnType] exactly.
*/
@Suppress("ktlint:standard:blank-line-between-when-conditions")
internal fun String.toKType(isNullable: Boolean): KType {
val sqlTypeName = this
return when (DuckDBResultSetMetaData.TypeNameToType(sqlTypeName)) {
BOOLEAN -> typeOf<Boolean>() // dataframe-jdbc
TINYINT -> typeOf<Byte>()
SMALLINT -> typeOf<Short>()
INTEGER -> typeOf<Int>() // dataframe-jdbc
BIGINT -> typeOf<Long>() // dataframe-jdbc
HUGEINT -> typeOf<BigInteger>()
UHUGEINT -> typeOf<BigInteger>()
UTINYINT -> typeOf<Short>()
USMALLINT -> typeOf<Int>()
UINTEGER -> typeOf<Long>()
UBIGINT -> typeOf<BigInteger>()
FLOAT -> typeOf<Float>() // dataframe-jdbc
DOUBLE -> typeOf<Double>() // dataframe-jdbc
DECIMAL -> typeOf<BigDecimal>() // dataframe-jdbc
TIME -> typeOf<LocalTime>()
TIME_WITH_TIME_ZONE -> typeOf<OffsetTime>() // dataframe-jdbc
DATE -> typeOf<LocalDate>()
TIMESTAMP, TIMESTAMP_MS, TIMESTAMP_NS, TIMESTAMP_S -> typeOf<Timestamp>() // dataframe-jdbc
TIMESTAMP_WITH_TIME_ZONE -> typeOf<OffsetDateTime>() // dataframe-jdbc
JSON -> typeOf<JsonNode>()
BLOB -> typeOf<Blob>()
UUID -> typeOf<UUID>()
MAP -> {
val (key, value) = parseMapTypes(sqlTypeName)
Map::class.createType(
listOf(
KTypeProjection.invariant(key.toKType(false)),
KTypeProjection.invariant(value.toKType(true)),
),
)
}

LIST, ARRAY -> {
// TODO requires #1266 and #1273 for specific types
// val listType = parseListType(sqlTypeName)
// Array::class.createType(
// listOf(KTypeProjection.invariant(listType.toKType(true))),
// )
typeOf<Array>()
}

STRUCT -> typeOf<Struct>() // TODO requires #1266 for specific types
UNION -> typeOf<Any>() // Cannot handle this in Kotlin
VARCHAR -> typeOf<String>()
UNKNOWN, BIT, INTERVAL, ENUM -> typeOf<String>()
}.withNullability(isNullable)
}

/** Parses "MAP(X, Y)" into "X" and "Y", taking parentheses into account */
internal fun parseMapTypes(typeString: String): Pair<String, String> {
if (!typeString.startsWith("MAP(") || !typeString.endsWith(")")) {
error("invalid MAP type: $typeString")
}

val content = typeString.removeSurrounding("MAP(", ")")

// Find the comma that separates key and value types
var parenCount = 0
var commaIndex = -1
for (i in content.indices) {
when (content[i]) {
'(' -> parenCount++

')' -> parenCount--

',' -> if (parenCount == 0) {
commaIndex = i
break
}
}
}

if (commaIndex == -1) error("invalid MAP type: $typeString")
val keyType = content.take(commaIndex).trim()
val valueType = content.substring(commaIndex + 1).trim()
return Pair(keyType, valueType)
}

/** Parses "X[]" and "X[123]" into "X", and "X[][]" into "X[]" */
internal fun parseListType(typeString: String): String {
if (!typeString.endsWith("]")) {
error("invalid LIST/ARRAY type: $typeString")
}

return typeString.take(typeString.indexOfLast { it == '[' })
}

/**
* How to filter out system tables from user-created ones when using
* [DataFrame.readAllSqlTables][DataFrame.Companion.readAllSqlTables] and
* [DataFrame.getSchemaForAllSqlTables][DataFrame.Companion.getSchemaForAllSqlTables].
*
* The names of these can sometimes be found in the specific JDBC integration.
*/
override fun isSystemTable(tableMetadata: TableMetadata): Boolean =
tableMetadata.schemaName?.lowercase()?.contains("information_schema") == true ||
tableMetadata.schemaName?.lowercase()?.contains("system") == true ||
tableMetadata.name.lowercase().contains("system_")

/**
* How to retrieve the correct table metadata when using
* [DataFrame.readAllSqlTables][DataFrame.Companion.readAllSqlTables] and
* [DataFrame.getSchemaForAllSqlTables][DataFrame.Companion.getSchemaForAllSqlTables].
* The names of these can be found in the [DatabaseMetaData] implementation of the DuckDB JDBC integration.
*/
override fun buildTableMetadata(tables: ResultSet): TableMetadata =
TableMetadata(
tables.getString("TABLE_NAME"),
tables.getString("TABLE_SCHEM"),
tables.getString("TABLE_CAT"),
)
}
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,11 @@ public fun extractDBTypeFromUrl(url: String?): DbType {

MsSql.dbTypeInJdbcUrl in url -> MsSql

DuckDb.dbTypeInJdbcUrl in url -> DuckDb

else -> throw IllegalArgumentException(
"Unsupported database type in the url: $url. " +
"Only H2, MariaDB, MySQL, MSSQL, SQLite and PostgreSQL are supported!",
"Only H2, MariaDB, MySQL, MSSQL, SQLite, PostgreSQL, and DuckDB are supported!",
)
}
} else {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,15 @@
package org.jetbrains.kotlinx.dataframe.io

import io.kotest.assertions.withClue
import io.kotest.matchers.collections.shouldBeIn
import io.kotest.matchers.shouldBe
import org.intellij.lang.annotations.Language
import org.jetbrains.kotlinx.dataframe.AnyFrame
import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.api.inferType
import org.jetbrains.kotlinx.dataframe.api.schema
import org.jetbrains.kotlinx.dataframe.io.db.MsSql
import org.jetbrains.kotlinx.dataframe.schema.CompareResult
import java.sql.Connection
import java.sql.ResultSet
import kotlin.reflect.typeOf
Expand Down Expand Up @@ -128,3 +133,26 @@ internal fun inferNullability(connection: Connection) {

connection.createStatement().execute("DROP TABLE TestTable1")
}

/**
* Helper to check whether the provided schema matches the inferred schema.
*
* It must hold that all types in the provided schema are equal or super to
* the corresponding types in the inferred schema.
*/
@Suppress("INVISIBLE_REFERENCE")
fun AnyFrame.assertInferredTypesMatchSchema() {
withClue({
"""
|Inferred schema must be <: Provided schema
|
|Inferred Schema:
|${inferType().schema().toString().lines().joinToString("\n|")}
|
|Provided Schema:
|${schema().toString().lines().joinToString("\n|")}
""".trimMargin()
}) {
schema().compare(inferType().schema()).isSuperOrEqual() shouldBe true
}
}
Loading