diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0be9db5f85..64fd5b2672 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -119,11 +119,11 @@ jobs: - name: Make target directories if: github.event_name != 'pull_request' && (startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main') - run: mkdir -p scio-bom/target scio-tensorflow/target site/target scio-cassandra/cassandra3/target scio-elasticsearch/es8/target scio-jdbc/target scio-macros/target scio-grpc/target scio-elasticsearch/common/target scio-test/target scio-avro/target scio-elasticsearch/es7/target scio-snowflake/target scio-redis/target scio-extra/target scio-test/parquet/target scio-test/core/target scio-google-cloud-platform/target scio-smb/target scio-test/google-cloud-platform/target scio-neo4j/target scio-parquet/target scio-core/target scio-repl/target project/target + run: mkdir -p scio-bom/target scio-tensorflow/target site/target scio-cassandra/cassandra3/target scio-elasticsearch/es8/target scio-jdbc/target scio-macros/target scio-grpc/target scio-elasticsearch/common/target scio-test/target scio-avro/target scio-elasticsearch/es7/target scio-snowflake/target scio-redis/target scio-extra/target scio-test/parquet/target scio-test/core/target scio-google-cloud-platform/target scio-smb/target scio-test/google-cloud-platform/target scio-neo4j/target scio-parquet/target scio-core/target scio-repl/target scio-managed/target project/target - name: Compress target directories if: github.event_name != 'pull_request' && (startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main') - run: tar cf targets.tar scio-bom/target scio-tensorflow/target site/target scio-cassandra/cassandra3/target scio-elasticsearch/es8/target scio-jdbc/target scio-macros/target scio-grpc/target scio-elasticsearch/common/target scio-test/target scio-avro/target scio-elasticsearch/es7/target scio-snowflake/target scio-redis/target scio-extra/target scio-test/parquet/target scio-test/core/target scio-google-cloud-platform/target scio-smb/target scio-test/google-cloud-platform/target scio-neo4j/target scio-parquet/target scio-core/target scio-repl/target project/target + run: tar cf targets.tar scio-bom/target scio-tensorflow/target site/target scio-cassandra/cassandra3/target scio-elasticsearch/es8/target scio-jdbc/target scio-macros/target scio-grpc/target scio-elasticsearch/common/target scio-test/target scio-avro/target scio-elasticsearch/es7/target scio-snowflake/target scio-redis/target scio-extra/target scio-test/parquet/target scio-test/core/target scio-google-cloud-platform/target scio-smb/target scio-test/google-cloud-platform/target scio-neo4j/target scio-parquet/target scio-core/target scio-repl/target scio-managed/target project/target - name: Upload target directories if: github.event_name != 'pull_request' && (startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main') diff --git a/.scala-steward.conf b/.scala-steward.conf index f13e3930a7..91e2bc2dd7 100644 --- a/.scala-steward.conf +++ b/.scala-steward.conf @@ -57,17 +57,10 @@ updates.pin = [ # neo4j 5+ driver are released for java 17+ { groupId = "org.neo4j.driver", version = "4." }, - # tensorflow-core-api 5+ and ndarray 4+ driver are released for java 11+ - { groupId = "org.tensorflow", artifactId = "tensorflow-core-api", version = "4." }, - { groupId = "org.tensorflow", artifactId = "ndarray", version = "3." 
}, - # Do not update major version of elasticsearch { groupId = "co.elastic.clients", version = "8."}, # Do not update major version of cassandra { groupId = "com.datastax.cassandra", version = "3." }, { groupId = "org.apache.cassandra", version = "3." }, - - # caffeine v3 requires Java >= 11 - { groupId = "com.github.ben-manes.caffeine", version = "2." } ] diff --git a/README.md b/README.md index 14f32f499a..53bb2af10c 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,8 @@ Scio is a Scala API for [Apache Beam](http://beam.incubator.apache.org/) and [Go # Quick Start -Download and install the [Java Development Kit (JDK)](https://adoptopenjdk.net/index.html) version 8. +Download and install the Java Development Kit (JDK) version 11 or higher, +eg. [adoptium](https://adoptium.net/index.html) or [corretto](https://aws.amazon.com/corretto/). Install [sbt](https://www.scala-sbt.org/1.x/docs/Setup.html). @@ -79,6 +80,7 @@ Scio includes the following artifacts: - `scio-google-cloud-platform`: add-on for Google Cloud IO's: BigQuery, Bigtable, Pub/Sub, Datastore, Spanner - `scio-grpc`: add-on for gRPC service calls - `scio-jdbc`: add-on for JDBC IO +- `scio-managed`: add-on for Beam's managed transforms. Includes Iceberg - `scio-neo4j`: add-on for Neo4J IO - `scio-parquet`: add-on for Parquet - `scio-redis`: add-on for Redis diff --git a/build.sbt b/build.sbt index c458be80f2..f22204f7ed 100644 --- a/build.sbt +++ b/build.sbt @@ -79,7 +79,7 @@ val algebirdVersion = "0.13.10" val annoy4sVersion = "0.10.0" val annoyVersion = "0.2.6" val breezeVersion = "2.1.0" -val caffeineVersion = "2.9.3" +val caffeineVersion = "3.2.0" val cassandraDriverVersion = "3.11.5" val cassandraVersion = "3.11.19" val catsVersion = "2.13.0" @@ -91,6 +91,7 @@ val elasticsearch8Version = "8.19.7" val fansiVersion = "0.5.1" val featranVersion = "0.8.0" val httpAsyncClientVersion = "4.1.5" +val icebergVersion = "1.4.2" val jakartaJsonVersion = "2.1.3" val javaLshVersion = "0.12" val jedisVersion = "7.0.0" @@ -101,7 +102,7 @@ val kantanCodecsVersion = "0.6.0" val kantanCsvVersion = "0.8.0" val kryoVersion = "4.0.3" val magnoliaVersion = "1.1.10" -val magnolifyVersion = "0.7.4" +val magnolifyVersion = "0.9.0" val metricsVersion = "4.2.37" val munitVersion = "1.2.1" val neo4jDriverVersion = "4.4.21" @@ -116,7 +117,7 @@ val scalaMacrosVersion = "2.1.1" val scalatestVersion = "3.2.19" val shapelessVersion = "2.3.13" val sparkeyVersion = "3.2.5" -val tensorFlowVersion = "0.4.2" +val tensorFlowVersion = "1.0.0" val tensorFlowMetadataVersion = "1.16.1" val testContainersVersion = "0.43.6" val voyagerVersion = "2.1.0" @@ -137,73 +138,11 @@ lazy val nettyBom = Bom("io.netty" % "netty-bom" % nettyVersion) val NothingFilter: explicitdeps.ModuleFilter = { _ => false } // project -ThisBuild / tlBaseVersion := "0.14" +ThisBuild / tlBaseVersion := "0.15" ThisBuild / organization := "com.spotify" ThisBuild / organizationName := "Spotify AB" ThisBuild / startYear := Some(2016) ThisBuild / licenses := Seq(License.Apache2) -ThisBuild / developers := List( - Developer( - id = "sinisa_lyh", - name = "Neville Li", - email = "neville.lyh@gmail.com", - url = url("https://twitter.com/sinisa_lyh") - ), - Developer( - id = "ravwojdyla", - name = "Rafal Wojdyla", - email = "ravwojdyla@gmail.com", - url = url("https://twitter.com/ravwojdyla") - ), - Developer( - id = "andrewsmartin", - name = "Andrew Martin", - email = "andrewsmartin.mg@gmail.com", - url = url("https://twitter.com/andrew_martin92") - ), - Developer( - id = "fallonfofallon", - name = 
"Fallon Chen", - email = "fallon@spotify.com", - url = url("https://twitter.com/fallonfofallon") - ), - Developer( - id = "regadas", - name = "Filipe Regadas", - email = "filiperegadas@gmail.com", - url = url("https://twitter.com/regadas") - ), - Developer( - id = "jto", - name = "Julien Tournay", - email = "julient@spotify.com", - url = url("https://twitter.com/skaalf") - ), - Developer( - id = "clairemcginty", - name = "Claire McGinty", - email = "clairem@spotify.com", - url = url("http://github.com/clairemcginty") - ), - Developer( - id = "syodage", - name = "Shameera Rathnayaka", - email = "shameerayodage@gmail.com", - url = url("http://github.com/syodage") - ), - Developer( - id = "kellen", - name = "Kellen Dye", - email = "dye.kellen@gmail.com", - url = url("http://github.com/kellen") - ), - Developer( - id = "farzad-sedghi", - name = "farzad sedghi", - email = "farzadsedghi2@gmail.com", - url = url("http://github.com/farzad-sedghi") - ) -) // scala versions val scala213 = "2.13.17" @@ -211,7 +150,7 @@ val scala212 = "2.12.20" val scalaDefault = scala213 // compiler settings -ThisBuild / tlJdkRelease := Some(8) +ThisBuild / tlJdkRelease := Some(11) ThisBuild / tlFatalWarnings := false ThisBuild / scalaVersion := scalaDefault ThisBuild / crossScalaVersions := Seq(scalaDefault, scala212) @@ -382,140 +321,7 @@ ThisBuild / githubWorkflowAddedJobs ++= Seq( ) // mima -ThisBuild / mimaBinaryIssueFilters ++= Seq( - ProblemFilters.exclude[ReversedMissingMethodProblem]( - "com.spotify.scio.options.ScioOptions.setZstdDictionary" - ), - ProblemFilters.exclude[ReversedMissingMethodProblem]( - "com.spotify.scio.options.ScioOptions.getZstdDictionary" - ), - // removal of private classes - ProblemFilters.exclude[MissingClassProblem]( - "com.spotify.scio.coders.instances.kryo.GaxApiExceptionSerializer" - ), - ProblemFilters.exclude[MissingClassProblem]( - "com.spotify.scio.coders.instances.kryo.StatusRuntimeExceptionSerializer" - ), - ProblemFilters.exclude[MissingClassProblem]( - "com.spotify.scio.coders.instances.kryo.BigtableRetriesExhaustedExceptionSerializer" - ), - // added new Cache.get method - ProblemFilters.exclude[ReversedMissingMethodProblem]( - "com.spotify.scio.util.Cache.get" - ), - // added SortedMapCoder - ProblemFilters.exclude[DirectMissingMethodProblem]( - "com.spotify.scio.coders.instances.MutableMapCoder.*" - ), - ProblemFilters.exclude[DirectAbstractMethodProblem]( - "org.apache.beam.sdk.coders.Coder.verifyDeterministic" - ), - ProblemFilters.exclude[DirectAbstractMethodProblem]( - "org.apache.beam.sdk.coders.Coder.getCoderArguments" - ), - // added BQ Json object - ProblemFilters.exclude[MissingTypesProblem]( - "com.spotify.scio.bigquery.types.package$Json$" - ), - // tf-metadata upgrade - ProblemFilters.exclude[Problem]( - "org.tensorflow.metadata.v0.*" - ), - // relax type hierarchy for batch stream - ProblemFilters.exclude[IncompatibleMethTypeProblem]( - "com.spotify.scio.grpc.GrpcBatchDoFn.asyncLookup" - ), - // added TableRow syntax - ProblemFilters.exclude[DirectMissingMethodProblem]( - "com.spotify.scio.bigquery.syntax.TableRowOps.*" - ), - // narrow return type from Map to TableRow - ProblemFilters.exclude[IncompatibleResultTypeProblem]( - "com.spotify.scio.bigquery.syntax.TableRowOps.getRecord$extension" - ), - ProblemFilters.exclude[IncompatibleResultTypeProblem]( - "com.spotify.scio.bigquery.syntax.TableRowOps.getRecord" - ), - // narrow return type from Seq to List - ProblemFilters.exclude[IncompatibleResultTypeProblem]( - 
"com.spotify.scio.bigquery.syntax.TableRowOps.getRepeated$extension" - ), - ProblemFilters.exclude[IncompatibleResultTypeProblem]( - "com.spotify.scio.bigquery.syntax.TableRowOps.getRepeated" - ), - // BQ api v1 update - ProblemFilters.exclude[IncompatibleResultTypeProblem]( - "com.spotify.scio.bigquery.BigQueryStorageTap.*" - ), - ProblemFilters.exclude[IncompatibleMethTypeProblem]( - "com.spotify.scio.bigquery.BigQueryStorageTap.*" - ), - ProblemFilters.exclude[IncompatibleMethTypeProblem]( - "com.spotify.scio.bigquery.BigQueryTaps.*" - ), - ProblemFilters.exclude[IncompatibleResultTypeProblem]( - "com.spotify.scio.bigquery.StorageUtil.tableReadOptions" - ), - ProblemFilters.exclude[MissingClassProblem]("com.spotify.scio.tensorflow.PredictDoFn"), - ProblemFilters.exclude[MissingClassProblem]("com.spotify.scio.tensorflow.PredictDoFn$"), - ProblemFilters.exclude[MissingClassProblem]("com.spotify.scio.tensorflow.SavedBundlePredictDoFn"), - ProblemFilters.exclude[MissingClassProblem]( - "com.spotify.scio.tensorflow.SavedBundlePredictDoFn$" - ), - ProblemFilters.exclude[DirectMissingMethodProblem]( - "com.spotify.scio.tensorflow.package.tensorFlowPredictSCollectionOps" - ), - ProblemFilters.exclude[MissingClassProblem]( - "com.spotify.scio.tensorflow.syntax.PredictSCollectionOps" - ), - ProblemFilters.exclude[MissingClassProblem]( - "com.spotify.scio.tensorflow.syntax.PredictSCollectionOps$" - ), - ProblemFilters.exclude[DirectMissingMethodProblem]( - "com.spotify.scio.tensorflow.syntax.SCollectionSyntax.tensorFlowPredictSCollectionOps" - ), - // dropped custom BigQueryAvroUtilsWrapper - ProblemFilters.exclude[MissingClassProblem]( - "org.apache.beam.sdk.io.gcp.bigquery.BigQueryAvroUtilsWrapper" - ), - // Changes in avro SlowGenericRecordCoder - ProblemFilters.exclude[Problem]( - "com.spotify.scio.coders.avro.SlowGenericRecordCoder*" - ), - // tablerow json fix - ProblemFilters.exclude[DirectMissingMethodProblem]( - "com.spotify.scio.bigquery.types.package#Json.apply" - ), - ProblemFilters.exclude[IncompatibleResultTypeProblem]( - "com.spotify.scio.bigquery.types.package#Json.parse" - ), - // Adding BigQuery Format API in 0.14 patch - ProblemFilters.exclude[DirectMissingMethodProblem]( - "com.spotify.scio.bigquery.BigQueryTyped#Table.copy" - ), - ProblemFilters.exclude[DirectMissingMethodProblem]( - "com.spotify.scio.bigquery.BigQueryTyped#Table.this" - ), - ProblemFilters.exclude[ReversedMissingMethodProblem]( - "com.spotify.scio.bigquery.Writes#WriteParamDefaults.com$spotify$scio$bigquery$Writes$WriteParamDefaults$_setter_$DefaultFormat_=" - ), - ProblemFilters.exclude[ReversedMissingMethodProblem]( - "com.spotify.scio.bigquery.Writes#WriteParamDefaults.DefaultFormat" - ), - ProblemFilters.exclude[DirectMissingMethodProblem]( - "com.spotify.scio.bigquery.syntax.SCollectionTypedOps.saveAsTypedBigQueryTable$extension" - ), - ProblemFilters.exclude[DirectMissingMethodProblem]( - "com.spotify.scio.bigquery.syntax.SCollectionTypedOps.saveAsTypedBigQueryTable" - ), - ProblemFilters.exclude[DirectMissingMethodProblem]( - "com.spotify.scio.bigquery.syntax.SCollectionTypedOps.saveAsTypedBigQueryTable$extension" - ), - ProblemFilters.exclude[MissingTypesProblem]("com.spotify.scio.bigtable.BigtableWrite$WriteParam"), - ProblemFilters.exclude[MissingTypesProblem]("com.spotify.scio.bigtable.BigtableWrite$Default$"), - // Only used for testing, shouldn't be public - ProblemFilters.exclude[Problem]("com.spotify.scio.bigquery.validation.Country.*") -) +ThisBuild / mimaBinaryIssueFilters ++= Seq() // 
headers lazy val currentYear = java.time.LocalDate.now().getYear @@ -827,6 +633,7 @@ lazy val `scio-bom` = project `scio-grpc`, `scio-jdbc`, `scio-macros`, + `scio-managed`, `scio-neo4j`, `scio-parquet`, `scio-redis`, @@ -934,10 +741,7 @@ lazy val `scio-test` = project ) .settings(commonSettings) .settings( - description := "Scio helpers for ScalaTest", - // only releases after 0.14.4 - tlMimaPreviousVersions := tlMimaPreviousVersions.value - .filter(v => VersionNumber(v).numbers.last >= 4) + description := "Scio helpers for ScalaTest" ) lazy val `scio-test-core` = project @@ -968,10 +772,7 @@ lazy val `scio-test-core` = project "org.apache.beam" % "beam-runners-direct-java" % beamVersion % Runtime, // test "org.slf4j" % "slf4j-simple" % slf4jVersion % Test - ), - // only releases after 0.14.4 - tlMimaPreviousVersions := tlMimaPreviousVersions.value - .filter(v => VersionNumber(v).numbers.last >= 4) + ) ) lazy val `scio-test-google-cloud-platform` = project @@ -992,10 +793,7 @@ lazy val `scio-test-google-cloud-platform` = project "org.typelevel" %% "cats-kernel" % catsVersion, // test "org.slf4j" % "slf4j-simple" % slf4jVersion % Test - ), - // only releases after 0.14.4 - tlMimaPreviousVersions := tlMimaPreviousVersions.value - .filter(v => VersionNumber(v).numbers.last >= 4) + ) ) lazy val `scio-test-parquet` = project @@ -1010,9 +808,6 @@ lazy val `scio-test-parquet` = project .settings(commonSettings) .settings( description := "Scio helpers for ScalaTest", - // only releases after 0.14.4 - tlMimaPreviousVersions := tlMimaPreviousVersions.value - .filter(v => VersionNumber(v).numbers.last >= 4), libraryDependencies ++= Seq( "com.spotify" %% "magnolify-parquet" % magnolifyVersion, "org.apache.avro" % "avro" % avroVersion, @@ -1021,7 +816,7 @@ lazy val `scio-test-parquet` = project "org.apache.parquet" % "parquet-column" % parquetVersion, "org.apache.parquet" % "parquet-common" % parquetVersion, "org.apache.parquet" % "parquet-hadoop" % parquetVersion, - "org.tensorflow" % "tensorflow-core-api" % tensorFlowVersion % Provided + "org.tensorflow" % "tensorflow-core-native" % tensorFlowVersion % Provided ) ) @@ -1052,6 +847,9 @@ lazy val `scio-avro` = project // compile "com.esotericsoftware" % "kryo-shaded" % kryoVersion, "com.google.protobuf" % "protobuf-java" % protobufVersion, + "com.spotify" %% "magnolify-avro" % magnolifyVersion, + "com.spotify" %% "magnolify-protobuf" % magnolifyVersion, + "com.spotify" %% "magnolify-shared" % magnolifyVersion, "com.twitter" %% "chill" % chillVersion, "com.twitter" % "chill-java" % chillVersion, "me.lyh" %% "protobuf-generic" % protobufGenericVersion, @@ -1122,6 +920,10 @@ lazy val `scio-google-cloud-platform` = project "com.google.http-client" % "google-http-client" % gcpBom.key.value, "com.google.http-client" % "google-http-client-gson" % gcpBom.key.value, "com.google.protobuf" % "protobuf-java" % protobufVersion, + "com.spotify" %% "magnolify-bigquery" % magnolifyVersion, + "com.spotify" %% "magnolify-bigtable" % magnolifyVersion, + "com.spotify" %% "magnolify-datastore" % magnolifyVersion, + "com.spotify" %% "magnolify-shared" % magnolifyVersion, "com.twitter" %% "chill" % chillVersion, "com.twitter" % "chill-java" % chillVersion, "commons-io" % "commons-io" % commonsIoVersion, @@ -1310,6 +1112,23 @@ lazy val `scio-grpc` = project ) ) +lazy val `scio-managed` = project + .in(file("scio-managed")) + .dependsOn( + `scio-core` % "compile;test->test" + ) + .settings(commonSettings) + .settings( + description := "Scio add-on for Beam's managed 
transforms", + libraryDependencies ++= Seq( + // compile + "org.apache.beam" % "beam-sdks-java-core" % beamVersion, + "org.apache.beam" % "beam-sdks-java-managed" % beamVersion, + "com.spotify" %% "magnolify-beam" % magnolifyVersion + // test + ) + ) + lazy val `scio-jdbc` = project .in(file("scio-jdbc")) .dependsOn( @@ -1396,7 +1215,7 @@ lazy val `scio-parquet` = project "org.slf4j" % "log4j-over-slf4j" % slf4jVersion, // log4j is excluded from hadoop "org.slf4j" % "slf4j-api" % slf4jVersion, // provided - "org.tensorflow" % "tensorflow-core-api" % tensorFlowVersion % Provided, + "org.tensorflow" % "tensorflow-core-native" % tensorFlowVersion % Provided, "com.google.cloud.bigdataoss" % "gcs-connector" % s"hadoop2-$bigdataossVersion" % Provided, // runtime "org.apache.hadoop" % "hadoop-client" % hadoopVersion % Runtime excludeAll (Exclude.metricsCore), @@ -1421,8 +1240,7 @@ lazy val `scio-snowflake` = project "joda-time" % "joda-time" % jodaTimeVersion, "org.apache.beam" % "beam-sdks-java-core" % beamVersion, "org.apache.beam" % "beam-sdks-java-io-snowflake" % beamVersion - ), - tlMimaPreviousVersions := Set.empty // TODO: remove once released + ) ) val tensorFlowMetadataSourcesDir = @@ -1444,15 +1262,15 @@ lazy val `scio-tensorflow` = project ).reduce(_ | _), libraryDependencies ++= Seq( // compile + "com.spotify" %% "magnolify-tensorflow" % magnolifyVersion, "org.apache.beam" % "beam-sdks-java-core" % beamVersion, "org.apache.beam" % "beam-vendor-guava-32_1_2-jre" % beamVendorVersion, "org.apache.commons" % "commons-compress" % commonsCompressVersion, - "org.tensorflow" % "tensorflow-core-api" % tensorFlowVersion, + "org.tensorflow" % "tensorflow-core-native" % tensorFlowVersion, // test "com.spotify" %% "featran-core" % featranVersion % Test, "com.spotify" %% "featran-scio" % featranVersion % Test, "com.spotify" %% "featran-tensorflow" % featranVersion % Test, - "com.spotify" %% "magnolify-tensorflow" % magnolifyVersion % Test, "org.slf4j" % "slf4j-simple" % slf4jVersion % Test ), Compile / tensorFlowMetadataSourcesDir := target.value / s"metadata-$tensorFlowMetadataVersion", @@ -1507,17 +1325,18 @@ lazy val `scio-examples` = project .enablePlugins(NoPublishPlugin) .disablePlugins(ScalafixPlugin) .dependsOn( - `scio-core` % "compile->test", `scio-avro` % "compile->test", + `scio-core` % "compile->test", + `scio-elasticsearch8`, + `scio-extra`, `scio-google-cloud-platform`, `scio-jdbc`, - `scio-extra`, - `scio-elasticsearch8`, + `scio-managed`, `scio-neo4j`, - `scio-tensorflow`, - `scio-smb`, - `scio-redis`, `scio-parquet`, + `scio-redis`, + `scio-smb`, + `scio-tensorflow`, `socco-plugin` ) .settings(commonSettings) @@ -1575,12 +1394,13 @@ lazy val `scio-examples` = project "com.mysql" % "mysql-connector-j" % "9.5.0", "com.softwaremill.magnolia1_2" %% "magnolia" % magnoliaVersion, "com.spotify" %% "magnolify-avro" % magnolifyVersion, + "com.spotify" %% "magnolify-beam" % magnolifyVersion, "com.spotify" %% "magnolify-bigtable" % magnolifyVersion, + "com.spotify" %% "magnolify-bigquery" % magnolifyVersion, "com.spotify" %% "magnolify-datastore" % magnolifyVersion, "com.spotify" %% "magnolify-guava" % magnolifyVersion, "com.spotify" %% "magnolify-neo4j" % magnolifyVersion, "com.spotify" %% "magnolify-parquet" % magnolifyVersion, - "com.spotify" %% "magnolify-shared" % magnolifyVersion, "com.spotify" %% "magnolify-tensorflow" % magnolifyVersion, "com.twitter" %% "algebird-core" % algebirdVersion, "joda-time" % "joda-time" % jodaTimeVersion, @@ -1592,6 +1412,7 @@ lazy val `scio-examples` = 
project "org.apache.beam" % "beam-sdks-java-extensions-sql" % beamVersion, "org.apache.beam" % "beam-sdks-java-io-google-cloud-platform" % beamVersion, "org.apache.beam" % "beam-sdks-java-io-jdbc" % beamVersion, + "org.apache.beam" % "beam-sdks-java-managed" % beamVersion, "org.apache.hadoop" % "hadoop-common" % hadoopVersion, "org.apache.httpcomponents" % "httpcore" % httpCoreVersion, "org.apache.parquet" % "parquet-column" % parquetVersion, @@ -1599,7 +1420,7 @@ lazy val `scio-examples` = project "org.apache.parquet" % "parquet-hadoop" % parquetVersion, "org.neo4j.driver" % "neo4j-java-driver" % neo4jDriverVersion, "org.slf4j" % "slf4j-api" % slf4jVersion, - "org.tensorflow" % "tensorflow-core-api" % tensorFlowVersion, + "org.tensorflow" % "tensorflow-core-native" % tensorFlowVersion, "redis.clients" % "jedis" % jedisVersion, // runtime "com.google.cloud.bigdataoss" % "gcs-connector" % s"hadoop2-$bigdataossVersion" % Runtime, @@ -1833,7 +1654,7 @@ lazy val `scio-smb` = project "org.apache.parquet" % "parquet-column" % parquetVersion % Provided, // scio-parquet "org.apache.parquet" % "parquet-common" % parquetVersion % Provided, // scio-parquet "org.apache.parquet" % "parquet-hadoop" % parquetVersion % Provided, // scio-parquet - "org.tensorflow" % "tensorflow-core-api" % tensorFlowVersion % Provided, // scio-tensorflow + "org.tensorflow" % "tensorflow-core-native" % tensorFlowVersion % Provided, // scio-tensorflow // test "org.apache.beam" % "beam-sdks-java-core" % beamVersion % Test classifier "tests", "org.hamcrest" % "hamcrest" % hamcrestVersion % Test, @@ -1879,6 +1700,7 @@ lazy val integration = project `scio-extra` % "test->test", `scio-google-cloud-platform` % "compile;test->test", `scio-jdbc` % "compile;test->test", + `scio-managed` % "test->test", `scio-neo4j` % "test->test", `scio-smb` % "test->provided,test" ) @@ -1927,7 +1749,13 @@ lazy val integration = project "com.fasterxml.jackson.module" %% "jackson-module-scala" % jacksonVersion % Test, "com.spotify" %% "magnolify-bigquery" % magnolifyVersion % Test, "com.spotify" %% "magnolify-datastore" % magnolifyVersion % Test, - "org.apache.beam" % "beam-runners-google-cloud-dataflow-java" % beamVersion % Test + "org.apache.beam" % "beam-runners-google-cloud-dataflow-java" % beamVersion % Test, + "org.apache.beam" % "beam-sdks-java-io-iceberg" % beamVersion % Test, + "org.apache.iceberg" % "iceberg-common" % icebergVersion % Test, + "org.apache.iceberg" % "iceberg-core" % icebergVersion % Test, + "org.apache.iceberg" % "iceberg-parquet" % icebergVersion % Test, + "org.apache.parquet" % "parquet-common" % parquetVersion % Test, + "org.apache.parquet" % "parquet-column" % parquetVersion % Test ) ) @@ -1955,6 +1783,7 @@ lazy val site = project `scio-grpc` % "compile->test", `scio-jdbc`, `scio-macros`, + `scio-managed`, `scio-neo4j`, `scio-parquet`, `scio-redis`, diff --git a/integration/src/test/java/org/apache/beam/sdk/extensions/smb/SmbPublicAPITest.java b/integration/src/test/java/org/apache/beam/sdk/extensions/smb/SmbPublicAPITest.java index b33c1c7084..6ed2b314be 100644 --- a/integration/src/test/java/org/apache/beam/sdk/extensions/smb/SmbPublicAPITest.java +++ b/integration/src/test/java/org/apache/beam/sdk/extensions/smb/SmbPublicAPITest.java @@ -30,7 +30,7 @@ import org.apache.beam.sdk.io.FileSystems; import org.apache.beam.sdk.util.MimeTypes; import org.apache.beam.sdk.values.TupleTag; -import org.tensorflow.proto.example.Example; +import org.tensorflow.proto.Example; /** Test public API access level. 
Passes by successfully compiling. */ public class SmbPublicAPITest { diff --git a/integration/src/test/scala/com/spotify/scio/bigtable/BigtableIT.scala b/integration/src/test/scala/com/spotify/scio/bigtable/BigtableIT.scala index 29e0407d3c..625fbd51a8 100644 --- a/integration/src/test/scala/com/spotify/scio/bigtable/BigtableIT.scala +++ b/integration/src/test/scala/com/spotify/scio/bigtable/BigtableIT.scala @@ -41,11 +41,7 @@ object BigtableIT { def testData(id: String): Seq[(String, Long)] = Seq((s"$id-key1", 1L), (s"$id-key2", 2L), (s"$id-key3", 3L)) - val bigtableOptions: BigtableOptions = BigtableOptions - .builder() - .setProjectId(projectId) - .setInstanceId(instanceId) - .build + val bigtableOptions: BigtableOptions = BTOptions(projectId, instanceId) val FAMILY_NAME: String = "count" val COLUMN_QUALIFIER: ByteString = ByteString.copyFromUtf8("long") @@ -110,7 +106,7 @@ class BigtableIT extends PipelineSpec { .build() runWithRealContext() { sc => sc - .bigtable(projectId, instanceId, tableId, rowFilter = rowFilter) + .bigtable(BTOptions(projectId, instanceId), tableId, rowFilter = rowFilter) .map(fromRow) should containInAnyOrder(data) }.waitUntilDone() } catch { @@ -151,7 +147,7 @@ class BigtableIT extends PipelineSpec { .build() runWithRealContext() { sc => sc - .bigtable(projectId, instanceId, tableId, rowFilter = rowFilter) + .bigtable(BTOptions(projectId, instanceId), tableId, rowFilter = rowFilter) .map(fromRow) should containInAnyOrder(data) }.waitUntilDone() } catch { diff --git a/integration/src/test/scala/com/spotify/scio/iceberg/IcebergIOIT.scala b/integration/src/test/scala/com/spotify/scio/iceberg/IcebergIOIT.scala new file mode 100644 index 0000000000..f2f7fa2c0f --- /dev/null +++ b/integration/src/test/scala/com/spotify/scio/iceberg/IcebergIOIT.scala @@ -0,0 +1,95 @@ +/* + * Copyright 2024 Spotify AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.spotify.scio.iceberg + +import com.dimafeng.testcontainers.{ForAllTestContainer, GenericContainer} +import com.spotify.scio.testing.PipelineSpec +import magnolify.beam._ +import org.apache.iceberg.catalog.{Namespace, TableIdentifier} +import org.apache.iceberg.rest.RESTCatalog +import org.apache.iceberg.types.Types.{IntegerType, NestedField, StringType} +import org.apache.iceberg.{CatalogProperties, CatalogUtil, PartitionSpec, Schema} +import org.testcontainers.containers.wait.strategy.HostPortWaitStrategy + +import java.time.Duration +import java.io.File +import java.nio.file.Files +import scala.jdk.CollectionConverters._ + +case class IcebergIOITRecord(a: Int, b: String) +object IcebergIOITRecord { + implicit val icebergIOITRecordRowType: RowType[IcebergIOITRecord] = RowType[IcebergIOITRecord] +} + +class IcebergIOIT extends PipelineSpec with ForAllTestContainer { + val ContainerPort = 8181 + val CatalogName = "iceberg_it" + val NamespaceName = "iceberg_it_ns" + val TableName = s"${NamespaceName}.iceberg_records" + + lazy val tempDir: File = { + val t = Files.createTempDirectory("iceberg-it").toFile + t.deleteOnExit() + t + } + + override val container: GenericContainer = + GenericContainer( + GenericContainer.stringToDockerImage("tabulario/iceberg-rest:1.6.0"), + exposedPorts = Seq(ContainerPort), + waitStrategy = new HostPortWaitStrategy() + .forPorts(ContainerPort) + .withStartupTimeout(Duration.ofSeconds(180)) + ) + + lazy val uri = s"http://${container.containerIpAddress}:${container.mappedPort(ContainerPort)}" + + override def afterStart(): Unit = { + val cat = new RESTCatalog() + cat.initialize(CatalogName, Map("uri" -> uri).asJava) + + cat.createNamespace(Namespace.of(NamespaceName)) + cat.createTable( + TableIdentifier.parse(TableName), + new Schema( + NestedField.required(0, "a", IntegerType.get()), + NestedField.required(1, "b", StringType.get()) + ), + PartitionSpec.unpartitioned() + ) + } + + "IcebergIO" should "work" in { + val catalogProperties = Map( + CatalogUtil.ICEBERG_CATALOG_TYPE -> CatalogUtil.ICEBERG_CATALOG_TYPE_REST, + CatalogProperties.URI -> uri + ) + val elements = 1.to(10).map(i => IcebergIOITRecord(i, s"$i")) + + runWithRealContext() { sc => + sc.parallelize(elements) + .saveAsIceberg(TableName, catalogProperties = catalogProperties) + } + + runWithRealContext() { sc => + sc.iceberg[IcebergIOITRecord]( + TableName, + catalogProperties = catalogProperties + ) should containInAnyOrder(elements) + } + } +} diff --git a/scio-avro/src/main/scala/com/spotify/scio/avro/AvroTypedIO.scala b/scio-avro/src/main/scala/com/spotify/scio/avro/AvroTypedIO.scala index d82ca8df4c..bfaed96d9b 100644 --- a/scio-avro/src/main/scala/com/spotify/scio/avro/AvroTypedIO.scala +++ b/scio-avro/src/main/scala/com/spotify/scio/avro/AvroTypedIO.scala @@ -21,6 +21,7 @@ import com.spotify.scio.avro.types.AvroType.HasAvroAnnotation import com.spotify.scio.coders.Coder import com.spotify.scio.io.{ScioIO, Tap, TapOf, TapT} import com.spotify.scio.values.SCollection +import magnolify.avro.{AvroType => AvroMagnolifyType} import org.apache.avro.generic.GenericRecord import scala.reflect.runtime.universe._ @@ -38,13 +39,18 @@ final case class AvroTypedIO[T <: HasAvroAnnotation: TypeTag: Coder](path: Strin private lazy val underlying: GenericRecordIO = GenericRecordIO(path, schema) override protected def read(sc: ScioContext, params: ReadP): SCollection[T] = - sc.read(underlying)(params).map(avroT.fromGenericRecord) + 
sc.transform(_.read(underlying)(params).map(avroT.fromGenericRecord)) override protected def write(data: SCollection[T], params: WriteP): Tap[T] = { val datumFactory = Option(params.datumFactory).getOrElse(GenericRecordDatumFactory) implicit val coder: Coder[GenericRecord] = avroCoder(datumFactory, schema) - data.map(avroT.toGenericRecord).write(underlying)(params) - tap(AvroIO.ReadParam(params)) + underlying + .writeWithContext( + data.transform(_.map(avroT.toGenericRecord)), + params + ) + .underlying + .map(avroT.fromGenericRecord) } override def tap(read: ReadP): Tap[T] = @@ -61,6 +67,35 @@ object AvroTypedIO { @deprecated("Use AvroTypedIO instead", "0.14.0") object AvroTyped { type AvroIO[T <: HasAvroAnnotation] = AvroTypedIO[T] - def AvroIO[T <: HasAvroAnnotation: TypeTag: Coder](path: String): AvroIO[T] = - AvroTypedIO[T](path) + def AvroIO[T <: HasAvroAnnotation: TypeTag: Coder](path: String): AvroIO[T] = AvroTypedIO[T](path) +} + +final case class AvroMagnolifyTypedIO[T: AvroMagnolifyType: Coder](path: String) extends ScioIO[T] { + override type ReadP = AvroMagnolifyTypedIO.ReadParam + override type WriteP = AvroMagnolifyTypedIO.WriteParam + override val tapT: TapT.Aux[T, T] = TapOf[T] + + override def testId: String = s"AvroIO($path)" + + private lazy val avroT: AvroMagnolifyType[T] = implicitly + private lazy val schema = avroT.schema + private lazy val underlying: GenericRecordIO = GenericRecordIO(path, schema) + + override protected def read(sc: ScioContext, params: ReadP): SCollection[T] = + sc.transform(_.read(underlying)(params).map(avroT.from)) + + override protected def write(data: SCollection[T], params: WriteP): Tap[T] = { + val datumFactory = Option(params.datumFactory).getOrElse(GenericRecordDatumFactory) + implicit val coder: Coder[GenericRecord] = avroCoder(datumFactory, schema) + underlying.writeWithContext(data.transform(_.map(avroT.to)), params).underlying.map(avroT.from) + } + + override def tap(read: ReadP): Tap[T] = underlying.tap(read).map(avroT.from) +} + +object AvroMagnolifyTypedIO { + type ReadParam = GenericRecordIO.ReadParam + val ReadParam = GenericRecordIO.ReadParam + type WriteParam = GenericRecordIO.WriteParam + val WriteParam = GenericRecordIO.WriteParam } diff --git a/scio-avro/src/main/scala/com/spotify/scio/avro/ObjectFileIO.scala b/scio-avro/src/main/scala/com/spotify/scio/avro/ObjectFileIO.scala index 3671b207ac..6f285427e7 100644 --- a/scio-avro/src/main/scala/com/spotify/scio/avro/ObjectFileIO.scala +++ b/scio-avro/src/main/scala/com/spotify/scio/avro/ObjectFileIO.scala @@ -39,7 +39,11 @@ final case class ObjectFileIO[T: Coder](path: String) extends ScioIO[T] { */ override protected def read(sc: ScioContext, params: ReadP): SCollection[T] = { val objectCoder = CoderMaterializer.beamWithDefault(Coder[T]) - sc.read(underlying)(params).map(record => AvroBytesUtil.decode(objectCoder, record)) + sc.transform { self => + self + .read(underlying)(params) + .map(record => AvroBytesUtil.decode(objectCoder, record)) + } } /** diff --git a/scio-avro/src/main/scala/com/spotify/scio/avro/ProtobufIO.scala b/scio-avro/src/main/scala/com/spotify/scio/avro/ProtobufIO.scala index e2dae68683..2853fcdf78 100644 --- a/scio-avro/src/main/scala/com/spotify/scio/avro/ProtobufIO.scala +++ b/scio-avro/src/main/scala/com/spotify/scio/avro/ProtobufIO.scala @@ -18,17 +18,35 @@ package com.spotify.scio.avro import com.google.protobuf.Message import com.spotify.scio.ScioContext -import com.spotify.scio.io.{ScioIO, Tap, TapOf, TapT} +import com.spotify.scio.coders.Coder 
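// A minimal usage sketch (illustrative only, hypothetical argument names) of the magnolify-derived
// typed Avro IO introduced in this change (AvroMagnolifyTypedIO / typedAvroFileMagnolify /
// saveAsAvroFile on a plain case class), mirroring the MagnolifyAvroExample rewrite further below.
import com.spotify.scio.ContextAndArgs
import com.spotify.scio.avro._
import magnolify.avro.AvroType

object TypedAvroMagnolifySketch {
  case class WordCount(word: String, count: Long)
  // magnolify derives the Avro schema and GenericRecord conversions for the case class
  implicit val wordCountType: AvroType[WordCount] = AvroType[WordCount]

  def main(cmdlineArgs: Array[String]): Unit = {
    val (sc, args) = ContextAndArgs(cmdlineArgs)
    sc.typedAvroFileMagnolify[WordCount](args("input")) // read GenericRecords, convert to WordCount
      .filter(_.count > 1)
      .saveAsAvroFile(args("output")) // write via the implicitly available AvroType[WordCount]
    sc.run()
  }
}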
+import com.spotify.scio.io.{ScioIO, Tap, TapOf, TapT, TestIO} import com.spotify.scio.protobuf.util.ProtobufUtil import com.spotify.scio.values.SCollection +import magnolify.protobuf.ProtobufType import scala.reflect.ClassTag -final case class ProtobufIO[T <: Message: ClassTag](path: String) extends ScioIO[T] { - override type ReadP = ProtobufIO.ReadParam - override type WriteP = ProtobufIO.WriteParam - override val tapT: TapT.Aux[T, T] = TapOf[T] +sealed trait ProtobufIO[T] extends ScioIO[T] { + final override val tapT: TapT.Aux[T, T] = TapOf[T] +} + +object ProtobufIO { + final def apply[T](path: String): ProtobufIO[T] = + new ProtobufIO[T] with TestIO[T] { + override def testId: String = s"ProtobufIO($path)" + } +} + +object ProtobufObjectFileIO { + type ReadParam = GenericRecordIO.ReadParam + val ReadParam = GenericRecordIO.ReadParam + type WriteParam = GenericRecordIO.WriteParam + val WriteParam = GenericRecordIO.WriteParam +} +final case class ProtobufObjectFileIO[T <: Message: ClassTag](path: String) extends ProtobufIO[T] { + override type ReadP = ProtobufObjectFileIO.ReadParam + override type WriteP = ProtobufObjectFileIO.WriteParam override def testId: String = s"ProtobufIO($path)" private lazy val underlying: ObjectFileIO[T] = ObjectFileIO(path) @@ -53,13 +71,51 @@ final case class ProtobufIO[T <: Message: ClassTag](path: String) extends ScioIO data.write(underlying)(params.copy(metadata = metadata)).underlying } - override def tap(read: ReadP): Tap[T] = - ProtobufFileTap(path, read) + override def tap(read: ReadP): Tap[T] = ProtobufFileTap(path, read) } -object ProtobufIO { - type ReadParam = GenericRecordIO.ReadParam - val ReadParam = GenericRecordIO.ReadParam - type WriteParam = GenericRecordIO.WriteParam - val WriteParam = GenericRecordIO.WriteParam +final case class ProtobufTypedObjectFileIO[T: Coder, U <: Message: ClassTag]( + path: String +)(implicit pt: ProtobufType[T, U]) + extends ProtobufIO[T] { + override type ReadP = ProtobufTypedObjectFileIO.ReadParam + override type WriteP = ProtobufTypedObjectFileIO.WriteParam + override def testId: String = s"ProtobufIO($path)" + + private lazy val underlying: ObjectFileIO[U] = ObjectFileIO(path) + + /** + * Get an SCollection for a Protobuf file. + * + * Protobuf messages are serialized into `Array[Byte]` and stored in Avro files to leverage Avro's + * block file format. + */ + override protected def read(sc: ScioContext, params: ReadP): SCollection[T] = + sc.transform(_.read(underlying)(params).map(pt.from)) + + /** + * Save this SCollection as a Protobuf file. + * + * Protobuf messages are serialized into `Array[Byte]` and stored in Avro files to leverage Avro's + * block file format. 
+ */ + override protected def write(data: SCollection[T], params: WriteP): Tap[T] = { + val metadata = params.metadata ++ ProtobufUtil.schemaMetadataOf[U] + underlying + .writeWithContext( + data.transform(_.map(pt.to)), + params.copy(metadata = metadata) + ) + .underlying + .map(pt.from) + } + + override def tap(read: ReadP): Tap[T] = ProtobufFileTap[U](path, read).map(pt.from) +} + +object ProtobufTypedObjectFileIO { + type ReadParam = ProtobufObjectFileIO.ReadParam + val ReadParam = ProtobufObjectFileIO.ReadParam + type WriteParam = ProtobufObjectFileIO.WriteParam + val WriteParam = ProtobufObjectFileIO.WriteParam } diff --git a/scio-avro/src/main/scala/com/spotify/scio/avro/syntax/SCollectionSyntax.scala b/scio-avro/src/main/scala/com/spotify/scio/avro/syntax/SCollectionSyntax.scala index 8a09bf241d..e559d032b5 100644 --- a/scio-avro/src/main/scala/com/spotify/scio/avro/syntax/SCollectionSyntax.scala +++ b/scio-avro/src/main/scala/com/spotify/scio/avro/syntax/SCollectionSyntax.scala @@ -24,6 +24,8 @@ import com.spotify.scio.coders.Coder import com.spotify.scio.io.ClosedTap import com.spotify.scio.util.{FilenamePolicySupplier, ScioUtil} import com.spotify.scio.values._ +import magnolify.avro.{AvroType => AvroMagnolifyType} +import magnolify.protobuf.ProtobufType import org.apache.avro.Schema import org.apache.avro.file.CodecFactory import org.apache.avro.specific.{SpecificData, SpecificRecord} @@ -183,17 +185,17 @@ final class ProtobufSCollectionOps[T <: Message](private val self: SCollection[T */ def saveAsProtobufFile( path: String, - numShards: Int = ProtobufIO.WriteParam.DefaultNumShards, - suffix: String = ProtobufIO.WriteParam.DefaultSuffixProtobuf, - codec: CodecFactory = ProtobufIO.WriteParam.DefaultCodec, - metadata: Map[String, AnyRef] = ProtobufIO.WriteParam.DefaultMetadata, - shardNameTemplate: String = ProtobufIO.WriteParam.DefaultShardNameTemplate, - tempDirectory: String = ProtobufIO.WriteParam.DefaultTempDirectory, + numShards: Int = ProtobufObjectFileIO.WriteParam.DefaultNumShards, + suffix: String = ProtobufObjectFileIO.WriteParam.DefaultSuffixProtobuf, + codec: CodecFactory = ProtobufObjectFileIO.WriteParam.DefaultCodec, + metadata: Map[String, AnyRef] = ProtobufObjectFileIO.WriteParam.DefaultMetadata, + shardNameTemplate: String = ProtobufObjectFileIO.WriteParam.DefaultShardNameTemplate, + tempDirectory: String = ProtobufObjectFileIO.WriteParam.DefaultTempDirectory, filenamePolicySupplier: FilenamePolicySupplier = - ProtobufIO.WriteParam.DefaultFilenamePolicySupplier, - prefix: String = ProtobufIO.WriteParam.DefaultPrefix + ProtobufObjectFileIO.WriteParam.DefaultFilenamePolicySupplier, + prefix: String = ProtobufObjectFileIO.WriteParam.DefaultPrefix )(implicit ct: ClassTag[T]): ClosedTap[T] = { - val param = ProtobufIO.WriteParam[GenericRecord]( + val param = ProtobufObjectFileIO.WriteParam[GenericRecord]( numShards, suffix, codec, @@ -203,7 +205,73 @@ final class ProtobufSCollectionOps[T <: Message](private val self: SCollection[T shardNameTemplate, tempDirectory ) - self.write(ProtobufIO[T](path))(param) + self.write(ProtobufObjectFileIO[T](path))(param) + } +} + +final class TypedMagnolifyProtobufSCollectionOps[T](private val self: SCollection[T]) + extends AnyVal { + + /** + * Save this SCollection as a Protobuf file. + * + * Protobuf messages are serialized into `Array[Byte]` and stored in Avro files to leverage Avro's + * block file format. 
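+   * For example (names are illustrative: `Track` is any case class and `TrackPB` a
+   * protoc-generated `Message` with matching fields; a `magnolify.protobuf.ProtobufType[Track, TrackPB]`
+   * is expected to be derivable implicitly, as in the tests added in this change):
+   * {{{
+   * val tracks: SCollection[Track] = ???
+   * tracks.saveAsProtobufFile[TrackPB]("gs://bucket/tracks")
+   * }}}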
+ */ + def saveAsProtobufFile[U <: Message: ClassTag]( + path: String, + numShards: Int = ProtobufTypedObjectFileIO.WriteParam.DefaultNumShards, + suffix: String = ProtobufTypedObjectFileIO.WriteParam.DefaultSuffixProtobuf, + codec: CodecFactory = ProtobufTypedObjectFileIO.WriteParam.DefaultCodec, + metadata: Map[String, AnyRef] = ProtobufTypedObjectFileIO.WriteParam.DefaultMetadata, + shardNameTemplate: String = ProtobufTypedObjectFileIO.WriteParam.DefaultShardNameTemplate, + tempDirectory: String = ProtobufTypedObjectFileIO.WriteParam.DefaultTempDirectory, + filenamePolicySupplier: FilenamePolicySupplier = + ProtobufTypedObjectFileIO.WriteParam.DefaultFilenamePolicySupplier, + prefix: String = ProtobufTypedObjectFileIO.WriteParam.DefaultPrefix + )(implicit pt: ProtobufType[T, U]): ClosedTap[T] = { + implicit val tCoder: Coder[T] = self.coder + val param = ProtobufTypedObjectFileIO.WriteParam[GenericRecord]( + numShards, + suffix, + codec, + metadata, + filenamePolicySupplier, + prefix, + shardNameTemplate, + tempDirectory + ) + self.write(ProtobufTypedObjectFileIO[T, U](path))(param) + } +} + +final class TypedMagnolifyAvroSCollectionOps[T](private val self: SCollection[T]) { + + def saveAsAvroFile( + path: String, + numShards: Int = AvroTypedIO.WriteParam.DefaultNumShards, + suffix: String = AvroTypedIO.WriteParam.DefaultSuffix, + codec: CodecFactory = AvroTypedIO.WriteParam.DefaultCodec, + metadata: Map[String, AnyRef] = AvroTypedIO.WriteParam.DefaultMetadata, + shardNameTemplate: String = AvroTypedIO.WriteParam.DefaultShardNameTemplate, + tempDirectory: String = AvroTypedIO.WriteParam.DefaultTempDirectory, + filenamePolicySupplier: FilenamePolicySupplier = + AvroTypedIO.WriteParam.DefaultFilenamePolicySupplier, + prefix: String = AvroTypedIO.WriteParam.DefaultPrefix, + datumFactory: AvroDatumFactory[GenericRecord] = AvroTypedIO.WriteParam.DefaultDatumFactory + )(implicit coder: Coder[T], at: AvroMagnolifyType[T]): ClosedTap[T] = { + val param = AvroMagnolifyTypedIO.WriteParam( + numShards, + suffix, + codec, + metadata, + filenamePolicySupplier, + prefix, + shardNameTemplate, + tempDirectory, + datumFactory + ) + self.write(AvroMagnolifyTypedIO[T](path))(param) } } @@ -290,4 +358,12 @@ trait SCollectionSyntax { c: SCollection[T] )(implicit ev: T <:< String): FilesSCollectionOps = new FilesSCollectionOps(c.covary_) + + implicit def typedAvroProtobufSCollectionOps[T]( + c: SCollection[T] + ): TypedMagnolifyProtobufSCollectionOps[T] = new TypedMagnolifyProtobufSCollectionOps[T](c) + + implicit def typedMagnolifyAvroSCollectionOps[T]( + c: SCollection[T] + ): TypedMagnolifyAvroSCollectionOps[T] = new TypedMagnolifyAvroSCollectionOps(c) } diff --git a/scio-avro/src/main/scala/com/spotify/scio/avro/syntax/ScioContextSyntax.scala b/scio-avro/src/main/scala/com/spotify/scio/avro/syntax/ScioContextSyntax.scala index c79a068ddc..bacbfda8aa 100644 --- a/scio-avro/src/main/scala/com/spotify/scio/avro/syntax/ScioContextSyntax.scala +++ b/scio-avro/src/main/scala/com/spotify/scio/avro/syntax/ScioContextSyntax.scala @@ -24,6 +24,8 @@ import com.spotify.scio.avro._ import com.spotify.scio.avro.types.AvroType.HasAvroAnnotation import com.spotify.scio.coders.Coder import com.spotify.scio.values._ +import magnolify.protobuf.ProtobufType +import magnolify.avro.{AvroType => AvroMagnolifyType} import org.apache.avro.Schema import org.apache.avro.generic.GenericRecord import org.apache.avro.specific.SpecificRecord @@ -167,6 +169,16 @@ final class ScioContextOps(private val self: ScioContext) extends AnyVal 
{ ): SCollection[T] = self.read(AvroTypedIO[T](path))(AvroTypedIO.ReadParam(suffix)) + /** + * Read avro data from `path` as `GenericRecord` and convert to `T` via the implicitly-available + * `magnolify.avro.AvroType[T]` + */ + def typedAvroFileMagnolify[T: AvroMagnolifyType: Coder]( + path: String, + suffix: String = AvroMagnolifyTypedIO.ReadParam.DefaultSuffix + ): SCollection[T] = + self.read(AvroMagnolifyTypedIO[T](path))(AvroMagnolifyTypedIO.ReadParam(suffix)) + /** * Get an SCollection for a Protobuf file. * @@ -175,9 +187,19 @@ final class ScioContextOps(private val self: ScioContext) extends AnyVal { */ def protobufFile[T <: Message: ClassTag]( path: String, - suffix: String = ProtobufIO.ReadParam.DefaultSuffix + suffix: String = ProtobufObjectFileIO.ReadParam.DefaultSuffix ): SCollection[T] = - self.read(ProtobufIO[T](path))(ProtobufIO.ReadParam(suffix)) + self.read(ProtobufObjectFileIO[T](path))(ProtobufObjectFileIO.ReadParam(suffix)) + + /** + * Read back protobuf messages serialized to `Array[Byte]` and stored in Avro files then map them + * automatically to type `T` via the implicit [[magnolify.protobuf.ProtobufType]] + */ + def typedProtobufFile[T: Coder, U <: Message: ClassTag]( + path: String, + suffix: String = ProtobufObjectFileIO.ReadParam.DefaultSuffix + )(implicit pt: ProtobufType[T, U]): SCollection[T] = + self.read(ProtobufTypedObjectFileIO[T, U](path))(ProtobufObjectFileIO.ReadParam(suffix)) } /** Enhanced with Avro methods. */ diff --git a/scio-avro/src/main/scala/com/spotify/scio/avro/taps.scala b/scio-avro/src/main/scala/com/spotify/scio/avro/taps.scala index 6a54921472..ad86792780 100644 --- a/scio-avro/src/main/scala/com/spotify/scio/avro/taps.scala +++ b/scio-avro/src/main/scala/com/spotify/scio/avro/taps.scala @@ -79,7 +79,7 @@ object ObjectFileTap { } object ProtobufFileTap { - def apply[T <: Message: ClassTag](path: String, params: ProtobufIO.ReadParam): Tap[T] = + def apply[T <: Message: ClassTag](path: String, params: ProtobufObjectFileIO.ReadParam): Tap[T] = ObjectFileTap(path, params)(Coder.protoMessageCoder[T]) } @@ -99,7 +99,7 @@ final case class AvroTaps(self: Taps) { /** Get a `Future[Tap[T]]` of a Protobuf file. 
*/ def protobufFile[T <: Message: ClassTag]( path: String, - params: ProtobufIO.ReadParam = ProtobufIO.ReadParam() + params: ProtobufObjectFileIO.ReadParam = ProtobufObjectFileIO.ReadParam() ): Future[Tap[T]] = self.mkTap( s"Protobuf: $path", diff --git a/scio-avro/src/main/scala/com/spotify/scio/avro/types/AvroType.scala b/scio-avro/src/main/scala/com/spotify/scio/avro/types/AvroType.scala index 20b1a32ec4..e3430b27a4 100644 --- a/scio-avro/src/main/scala/com/spotify/scio/avro/types/AvroType.scala +++ b/scio-avro/src/main/scala/com/spotify/scio/avro/types/AvroType.scala @@ -51,6 +51,7 @@ import scala.reflect.runtime.universe._ * @groupname Ungrouped * Other Members */ +@deprecated("Use magnolify API instead.", "0.15.0") object AvroType { /** @@ -86,6 +87,7 @@ object AvroType { @compileTimeOnly( "enable macro paradise (2.12) or -Ymacro-annotations (2.13) to expand macro annotations" ) + @deprecated("Use magnolify API instead.", "0.15.0") class fromSchema(schema: String) extends StaticAnnotation { def macroTransform(annottees: Any*): Any = macro TypeProvider.schemaImpl } @@ -129,6 +131,7 @@ object AvroType { @compileTimeOnly( "enable macro paradise (2.12) or -Ymacro-annotations (2.13) to expand macro annotations" ) + @deprecated("Use magnolify API instead.", "0.15.0") class fromPath(folderGlob: String) extends StaticAnnotation { def macroTransform(annottees: Any*): Any = macro TypeProvider.pathImpl } @@ -158,6 +161,7 @@ object AvroType { @compileTimeOnly( "enable macro paradise (2.12) or -Ymacro-annotations (2.13) to expand macro annotations" ) + @deprecated("Use magnolify API instead.", "0.15.0") class fromSchemaFile(schemaFile: String) extends StaticAnnotation { def macroTransform(annottees: Any*): Any = macro TypeProvider.schemaFileImpl } @@ -185,6 +189,7 @@ object AvroType { @compileTimeOnly( "enable macro paradise (2.12) or -Ymacro-annotations (2.13) to expand macro annotations" ) + @deprecated("Use magnolify API instead.", "0.15.0") class toSchema extends StaticAnnotation { def macroTransform(annottees: Any*): Any = macro TypeProvider.toSchemaImpl } @@ -215,9 +220,11 @@ object AvroType { * Trait for case classes with generated companion objects. * @group trait */ + @deprecated("Use magnolify API instead.", "0.15.0") trait HasAvroAnnotation /** Generate [[org.apache.avro.Schema Schema]] for a case class. */ + @deprecated("Use magnolify API instead.", "0.15.0") def schemaOf[T: TypeTag]: Schema = SchemaProvider.schemaOf[T] /** @@ -225,6 +232,7 @@ object AvroType { * the given case class `T`. * @group converters */ + @deprecated("Use magnolify API instead.", "0.15.0") def fromGenericRecord[T]: GenericRecord => T = macro ConverterProvider.fromGenericRecordImpl[T] @@ -233,10 +241,12 @@ object AvroType { * [[org.apache.avro.generic.GenericRecord GenericRecord]]. * @group converters */ + @deprecated("Use magnolify API instead.", "0.15.0") def toGenericRecord[T]: T => GenericRecord = macro ConverterProvider.toGenericRecordImpl[T] /** Create a new AvroType instance. */ + @deprecated("Use magnolify API instead.", "0.15.0") def apply[T: TypeTag]: AvroType[T] = new AvroType[T] } @@ -245,6 +255,7 @@ object AvroType { * * This decouples generated fields and methods from macro expansion to keep core macro free. 
*/ +@deprecated("Use magnolify API instead.", "0.15.0") class AvroType[T: TypeTag] extends Serializable { private val instance = runtimeMirror(getClass.getClassLoader) .reflectModule(typeOf[T].typeSymbol.companion.asModule) diff --git a/scio-avro/src/test/scala/com/spotify/scio/avro/AvroIOTest.scala b/scio-avro/src/test/scala/com/spotify/scio/avro/AvroIOTest.scala index 7d53ffd272..962c9a0dc3 100644 --- a/scio-avro/src/test/scala/com/spotify/scio/avro/AvroIOTest.scala +++ b/scio-avro/src/test/scala/com/spotify/scio/avro/AvroIOTest.scala @@ -33,6 +33,9 @@ import java.io.File object AvroIOTest { @AvroType.toSchema case class AvroRecord(i: Int, s: String, r: List[String]) + case class Track(trackId: String) + + case class Record(i: Int, s: String, r: List[String]) } class AvroIOFileNamePolicyTest extends FileNamePolicySpec[TestRecord] { @@ -58,7 +61,7 @@ class AvroIOFileNamePolicyTest extends FileNamePolicySpec[TestRecord] { _.map(AvroUtils.newSpecificRecord).saveAsAvroFile( "nonsense", shardNameTemplate = "SSS-of-NNN", - filenamePolicySupplier = testFilenamePolicySupplier + filenamePolicySupplier = testFilenamePolicySupplier(_, _) ) ) } @@ -88,7 +91,7 @@ class ObjectIOFileNamePolicyTest extends FileNamePolicySpec[AvroIOTest.AvroRecor _.map(x => AvroRecord(x, x.toString, (1 to x).map(_.toString).toList)).saveAsObjectFile( "nonsense", shardNameTemplate = "SSS-of-NNN", - filenamePolicySupplier = testFilenamePolicySupplier + filenamePolicySupplier = testFilenamePolicySupplier(_, _) ) ) } @@ -116,7 +119,7 @@ class ProtobufIOFileNamePolicyTest extends FileNamePolicySpec[TrackPB] { _.map(x => TrackPB.newBuilder().setTrackId(x.toString).build()).saveAsProtobufFile( "nonsense", shardNameTemplate = "SSS-of-NNN", - filenamePolicySupplier = testFilenamePolicySupplier + filenamePolicySupplier = testFilenamePolicySupplier(_, _) ) ) } @@ -200,6 +203,12 @@ class AvroIOTest extends ScioIOSpec { testJobTest(xs)(io)(_.typedAvroFile[AvroRecord](_))(_.saveAsTypedAvroFile(_)) } + it should "work with typed Avro with magnolify AvroType" in { + val xs = (1 to 100).map(x => Record(x, x.toString, (1 to x).map(_.toString).toList)) + testTap(xs)(_.saveAsAvroFile(_))(".avro") + testJobTest(xs)(AvroIO[Record])(_.typedAvroFileMagnolify[Record](_))(_.saveAsAvroFile(_)) + } + "ObjectFileIO" should "work" in { val xs = (1 to 100).map(x => AvroRecord(x, x.toString, (1 to x).map(_.toString).toList)) testTap(xs)(_.saveAsObjectFile(_))(".obj.avro") @@ -216,4 +225,12 @@ class AvroIOTest extends ScioIOSpec { testJobTest(xs)(ProtobufIO(_))(_.protobufFile[TrackPB](_))(_.saveAsProtobufFile(_)) } + "TypedProtobufIO" should "work" in { + val xs = (1 to 100).map(x => Track(x.toString)) + val suffix = ".protobuf.avro" + testTap(xs)(_.saveAsProtobufFile[TrackPB](_))(suffix) + testJobTest(xs)(ProtobufIO(_))(_.typedProtobufFile[Track, TrackPB](_))( + _.saveAsProtobufFile[TrackPB](_) + ) + } } diff --git a/scio-avro/src/test/scala/com/spotify/scio/avro/types/ConverterProviderTest.scala b/scio-avro/src/test/scala/com/spotify/scio/avro/types/ConverterProviderTest.scala index e8c7b98778..cc7b81036a 100644 --- a/scio-avro/src/test/scala/com/spotify/scio/avro/types/ConverterProviderTest.scala +++ b/scio-avro/src/test/scala/com/spotify/scio/avro/types/ConverterProviderTest.scala @@ -18,7 +18,6 @@ package com.spotify.scio.avro.types import java.nio.file.Files - import com.spotify.scio._ import com.spotify.scio.avro._ import org.apache.commons.io.FileUtils diff --git a/scio-core/src/main/java/com/spotify/scio/transforms/BaseAsyncBatchLookupDoFn.java 
b/scio-core/src/main/java/com/spotify/scio/transforms/BaseAsyncBatchLookupDoFn.java index 38ce932377..a598effc4d 100644 --- a/scio-core/src/main/java/com/spotify/scio/transforms/BaseAsyncBatchLookupDoFn.java +++ b/scio-core/src/main/java/com/spotify/scio/transforms/BaseAsyncBatchLookupDoFn.java @@ -160,17 +160,6 @@ public void startBundle(StartBundleContext context) { semaphore.release(maxPendingRequests); } - // kept for binary compatibility. Must not be used - // TODO: remove in 0.15.0 - @Deprecated - public void processElement( - Input input, - Instant timestamp, - OutputReceiver> out, - BoundedWindow window) { - processElement(input, timestamp, window, null, out); - } - @ProcessElement public void processElement( @Element Input input, diff --git a/scio-core/src/main/java/com/spotify/scio/transforms/BaseAsyncDoFn.java b/scio-core/src/main/java/com/spotify/scio/transforms/BaseAsyncDoFn.java index f32efdc0aa..dc21c37c46 100644 --- a/scio-core/src/main/java/com/spotify/scio/transforms/BaseAsyncDoFn.java +++ b/scio-core/src/main/java/com/spotify/scio/transforms/BaseAsyncDoFn.java @@ -73,14 +73,6 @@ public void finishBundle(FinishBundleContext context) { flush(r -> context.output(r.getValue(), r.getTimestamp(), r.getWindow())); } - // kept for binary compatibility. Must not be used - // TODO: remove in 0.15.0 - @Deprecated - public void processElement( - Input input, Instant timestamp, OutputReceiver out, BoundedWindow window) { - processElement(input, timestamp, window, null, out); - } - @ProcessElement public void processElement( @Element Input element, diff --git a/scio-core/src/main/java/com/spotify/scio/transforms/BaseAsyncLookupDoFn.java b/scio-core/src/main/java/com/spotify/scio/transforms/BaseAsyncLookupDoFn.java index f8b2766826..6ee274c27e 100644 --- a/scio-core/src/main/java/com/spotify/scio/transforms/BaseAsyncLookupDoFn.java +++ b/scio-core/src/main/java/com/spotify/scio/transforms/BaseAsyncLookupDoFn.java @@ -161,17 +161,6 @@ public void startBundle(StartBundleContext context) { semaphore.release(maxPendingRequests); } - // kept for binary compatibility. Must not be used - // TODO: remove in 0.15.0 - @Deprecated - public void processElement( - Input input, - Instant timestamp, - OutputReceiver> out, - BoundedWindow window) { - processElement(input, timestamp, window, null, out); - } - @SuppressWarnings("unchecked") @ProcessElement public void processElement( diff --git a/scio-core/src/main/java/com/spotify/scio/transforms/FileDownloadDoFn.java b/scio-core/src/main/java/com/spotify/scio/transforms/FileDownloadDoFn.java index 1066dae7d2..c0169e11ea 100644 --- a/scio-core/src/main/java/com/spotify/scio/transforms/FileDownloadDoFn.java +++ b/scio-core/src/main/java/com/spotify/scio/transforms/FileDownloadDoFn.java @@ -79,14 +79,6 @@ public void startBundle(StartBundleContext context) { this.batch.clear(); } - // kept for binary compatibility. 
Must not be used - // TODO: remove in 0.15.0 - @Deprecated - public void processElement( - URI element, Instant timestamp, OutputReceiver out, BoundedWindow window) { - processElement(element, timestamp, window, null, out); - } - @ProcessElement public void processElement( @DoFn.Element URI element, diff --git a/scio-core/src/main/scala/com/spotify/scio/ScioContext.scala b/scio-core/src/main/scala/com/spotify/scio/ScioContext.scala index 743c466dd7..4c87d5b11c 100644 --- a/scio-core/src/main/scala/com/spotify/scio/ScioContext.scala +++ b/scio-core/src/main/scala/com/spotify/scio/ScioContext.scala @@ -568,7 +568,6 @@ class ScioContext private[scio] ( } private[scio] def prepare(): Unit = { - // TODO: make sure this works for other PipelineOptions RunnerContext.prepareOptions(options, artifacts) ScioContext.validateOptions(options) } diff --git a/scio-core/src/main/scala/com/spotify/scio/coders/instances/ScalaCoders.scala b/scio-core/src/main/scala/com/spotify/scio/coders/instances/ScalaCoders.scala index ca5b17bca0..3f2787bc0a 100644 --- a/scio-core/src/main/scala/com/spotify/scio/coders/instances/ScalaCoders.scala +++ b/scio-core/src/main/scala/com/spotify/scio/coders/instances/ScalaCoders.scala @@ -444,7 +444,6 @@ trait ScalaCoders extends CoderGrammar with CoderDerivation { implicit def seqCoder[T: Coder]: Coder[Seq[T]] = transform(Coder[T])(bc => beam(new SeqCoder[T](bc))) - // TODO: proper chunking implementation implicit def iterableCoder[T: Coder]: Coder[Iterable[T]] = transform(Coder[T])(bc => beam(new IterableCoder[T](bc))) diff --git a/scio-core/src/main/scala/com/spotify/scio/testing/TestDataManager.scala b/scio-core/src/main/scala/com/spotify/scio/testing/TestDataManager.scala index b4504873a3..eb92cbae6d 100644 --- a/scio-core/src/main/scala/com/spotify/scio/testing/TestDataManager.scala +++ b/scio-core/src/main/scala/com/spotify/scio/testing/TestDataManager.scala @@ -87,7 +87,6 @@ private[scio] class TestOutput(val m: Map[String, SCollection[_] => Any]) { java.util.concurrent.ConcurrentHashMap.newKeySet[String]().asScala def apply[T](io: ScioIO[T]): SCollection[T] => Any = apply(io.testId) def apply[T](key: String): SCollection[T] => Any = { - // TODO: support Materialize outputs, maybe Materialized[T]? 
require( m.contains(key), s"Missing test output: $key, available: ${m.keys.mkString("[", ", ", "]")}" diff --git a/scio-core/src/main/scala/com/spotify/scio/util/FilenamePolicySupplier.scala b/scio-core/src/main/scala/com/spotify/scio/util/FilenamePolicySupplier.scala index d66e1f3ec2..decf98bf41 100644 --- a/scio-core/src/main/scala/com/spotify/scio/util/FilenamePolicySupplier.scala +++ b/scio-core/src/main/scala/com/spotify/scio/util/FilenamePolicySupplier.scala @@ -23,6 +23,7 @@ import org.apache.beam.sdk.io.fs.ResolveOptions.StandardResolveOptions import org.apache.beam.sdk.io.fs.ResourceId import org.apache.beam.sdk.transforms.windowing.{BoundedWindow, PaneInfo} +@FunctionalInterface trait FilenamePolicySupplier { def apply(path: String, suffix: String): FilenamePolicy } diff --git a/scio-core/src/main/scala/com/spotify/scio/values/SCollection.scala b/scio-core/src/main/scala/com/spotify/scio/values/SCollection.scala index 1d1b39b264..19e5e83af0 100644 --- a/scio-core/src/main/scala/com/spotify/scio/values/SCollection.scala +++ b/scio-core/src/main/scala/com/spotify/scio/values/SCollection.scala @@ -1757,15 +1757,15 @@ sealed trait SCollection[T] extends PCollectionWrapper[T] { } /** - * Generic write method for all `ScioIO[T]` implementations, if it is test pipeline this will + * Generic write method for all `ScioIO[T]` implementations, if it is a test pipeline this will * evaluate pre-registered output IO implementation which match for the passing `ScioIO[T]` - * implementation. if not this will invoke [[com.spotify.scio.io.ScioIO[T]#write]] method along - * with write configurations passed by. + * implementation. If not, this will invoke [[com.spotify.scio.io.ScioIO[T]#write]] with the + * provided write configuration. * * @param io * an implementation of `ScioIO[T]` trait * @param params - * configurations need to pass to perform underline write implementation + * configurations need to pass to perform underlying write implementation */ def write(io: ScioIO[T])(params: io.WriteP): ClosedTap[io.tapT.T] = io.writeWithContext(this, params) diff --git a/scio-core/src/test/scala/com/spotify/scio/coders/instances/kryo/JodaSerializerTest.scala b/scio-core/src/test/scala/com/spotify/scio/coders/instances/kryo/JodaSerializerTest.scala index 13525a27b7..acd6783f53 100644 --- a/scio-core/src/test/scala/com/spotify/scio/coders/instances/kryo/JodaSerializerTest.scala +++ b/scio-core/src/test/scala/com/spotify/scio/coders/instances/kryo/JodaSerializerTest.scala @@ -27,7 +27,6 @@ import scala.jdk.CollectionConverters._ import scala.util.Try class JodaSerializerTest extends AnyFlatSpec with Checkers { - // TODO: remove this once https://github.com/scalatest/scalatest/issues/1090 is addressed implicit override val generatorDrivenConfig: PropertyCheckConfiguration = PropertyCheckConfiguration(minSuccessful = 100) diff --git a/scio-examples/src/main/scala/com/spotify/scio/examples/extra/IcebergExample.scala b/scio-examples/src/main/scala/com/spotify/scio/examples/extra/IcebergExample.scala new file mode 100644 index 0000000000..b3ffc386e5 --- /dev/null +++ b/scio-examples/src/main/scala/com/spotify/scio/examples/extra/IcebergExample.scala @@ -0,0 +1,66 @@ +/* + * Copyright 2024 Spotify AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.spotify.scio.examples.extra + +import com.spotify.scio.ContextAndArgs +import com.spotify.scio.iceberg._ +import magnolify.beam._ + +// Example: Apache Iceberg read/write Example + +// Usage: + +// `sbt "runMain com.spotify.scio.examples.extra.IcebergExample +// --project=[PROJECT] --runner=DataflowRunner --region=[REGION NAME] +// --inputTable=[INPUT TABLE] --catalogName=[CATALOG NAME] +// --catalogType=[CATALOG TYPE] --catalogUri=[CATALOG URI] +// --catalogWarehouse=[CATALOG WAREHOUSE] --outputTable=[OUTPUT TABLE]"` +object IcebergExample { + + case class Record(a: Int, b: String) + + def main(cmdlineArgs: Array[String]): Unit = { + val (sc, args) = ContextAndArgs(cmdlineArgs) + + // Catalog configuration + val catalogConfig = Map( + "type" -> args("catalogType"), + "uri" -> args("catalogUri"), + "warehouse" -> args("catalogWarehouse") + ) + + // Derive a conversion between Record and Beam Row + implicit val rt: RowType[Record] = RowType[Record] + + sc + // Read Records from Iceberg + .iceberg[Record]( + args("inputTable"), + args.optional("catalogName").orNull, + catalogConfig + ) + .map(r => r.copy(a = r.a + 1)) + // Write Records to Iceberg + .saveAsIceberg( + args("outputTable"), + args.optional("catalogName").orNull, + catalogConfig + ) + + sc.run() + } +} diff --git a/scio-examples/src/main/scala/com/spotify/scio/examples/extra/MagnolifyAvroExample.scala b/scio-examples/src/main/scala/com/spotify/scio/examples/extra/MagnolifyAvroExample.scala index f4cb43ee7a..b7120890a5 100644 --- a/scio-examples/src/main/scala/com/spotify/scio/examples/extra/MagnolifyAvroExample.scala +++ b/scio-examples/src/main/scala/com/spotify/scio/examples/extra/MagnolifyAvroExample.scala @@ -23,16 +23,9 @@ package com.spotify.scio.examples.extra import com.spotify.scio._ import com.spotify.scio.avro._ -import com.spotify.scio.coders.Coder import com.spotify.scio.examples.common.ExampleData -import com.spotify.scio.examples.extra.MagnolifyAvroExample.wordCountType -import org.apache.avro.generic.GenericRecord object MagnolifyAvroExample { - // limit import scope to avoid polluting namespace - import magnolify.avro._ - - val wordCountType: AvroType[WordCount] = AvroType[WordCount] case class WordCount(word: String, count: Long) } @@ -47,9 +40,6 @@ object MagnolifyAvroExample { // --output=gs://[BUCKET]/[PATH]/wordcount-avro"` object MagnolifyAvroWriteExample { - implicit val genericCoder: Coder[GenericRecord] = - avroGenericRecordCoder(wordCountType.schema) - def main(cmdlineArgs: Array[String]): Unit = { import MagnolifyAvroExample._ @@ -57,8 +47,9 @@ object MagnolifyAvroWriteExample { sc.textFile(args.getOrElse("input", ExampleData.KING_LEAR)) .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty)) .countByValue - .map(t => wordCountType(WordCount.tupled(t))) - .saveAsAvroFile(args("output"), schema = wordCountType.schema) + .map { case (word, count) => WordCount(word, count) } + // uses implicitly-derived magnolify.avro.AvroType[WordCount] to save to avro + .saveAsAvroFile(args("output")) sc.run() () } @@ -78,8 +69,8 @@ object MagnolifyAvroReadExample { import 
MagnolifyAvroExample._ val (sc, args) = ContextAndArgs(cmdlineArgs) - sc.avroFile(args("input"), wordCountType.schema) - .map(e => wordCountType(e)) + // uses implicitly-derived magnolify.avro.AvroType[WordCount] to read from avro + sc.typedAvroFileMagnolify[WordCount](args("input")) .map(wc => wc.word + ": " + wc.count) .saveAsTextFile(args("output")) sc.run() diff --git a/scio-examples/src/main/scala/com/spotify/scio/examples/extra/MagnolifyBigtableExample.scala b/scio-examples/src/main/scala/com/spotify/scio/examples/extra/MagnolifyBigtableExample.scala index fd52360ea7..7e8e974183 100644 --- a/scio-examples/src/main/scala/com/spotify/scio/examples/extra/MagnolifyBigtableExample.scala +++ b/scio-examples/src/main/scala/com/spotify/scio/examples/extra/MagnolifyBigtableExample.scala @@ -34,11 +34,8 @@ import magnolify.bigtable._ import scala.collection.compat._ object MagnolifyBigtableExample { - // Define case class representation of TensorFlow `Example` case class WordCount(cnt: Long) - // `BigtableType` provides mapping between case classes and `Seq[Mutation]`/`Row` - // for writing/reading. - val WordCountType: BigtableType[WordCount] = BigtableType[WordCount] + val ColumnFamily = "counts" } // ## Magnolify Bigtable Write Example @@ -65,14 +62,10 @@ object MagnolifyBigtableWriteExample { sc.textFile(args.getOrElse("input", ExampleData.KING_LEAR)) .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty)) .countByValue - // Convert case class to `Seq[Mutation]` and lift it into a key-value pair - // for saving to Bigtable table. - .map { case (word, count) => - val mutations = - WordCountType(WordCount(count), columnFamily = "counts").iterator.to(Iterable) - ByteString.copyFromUtf8(word) -> mutations - } - .saveAsBigtable(btProjectId, btInstanceId, btTableId) + .mapValues(cnt => WordCount(cnt)) + // `keyFn` converts word to a ByteString, while the value is converted via an + // implicitly derived BigtableType[WordCount] + .saveAsBigtable(btProjectId, btInstanceId, btTableId, ColumnFamily, ByteString.copyFromUtf8 _) sc.run() () @@ -99,12 +92,15 @@ object MagnolifyBigtableReadExample { val btInstanceId = args("bigtableInstanceId") val btTableId = args("bigtableTableId") - sc.bigtable(btProjectId, btInstanceId, btTableId) - .map { row => - // Convert Bigtable `Row` to the case class and lift it into a key-value pair. - row.getKey.toStringUtf8 -> WordCountType(row, columnFamily = "counts").cnt - } - .saveAsTextFile(args("output")) + // Internally converts Bigtable `Row` to `(String, WordCount)` via implicit + // BigtableType[WordCount] and the provided `keyFn` + sc.typedBigtable[String, WordCount]( + btProjectId, + btInstanceId, + btTableId, + ColumnFamily, + (bs: ByteString) => bs.toStringUtf8 + ).saveAsTextFile(args("output")) sc.run() () diff --git a/scio-examples/src/main/scala/com/spotify/scio/examples/extra/MagnolifyDatastoreExample.scala b/scio-examples/src/main/scala/com/spotify/scio/examples/extra/MagnolifyDatastoreExample.scala index 6e7037daf5..2564bb6156 100644 --- a/scio-examples/src/main/scala/com/spotify/scio/examples/extra/MagnolifyDatastoreExample.scala +++ b/scio-examples/src/main/scala/com/spotify/scio/examples/extra/MagnolifyDatastoreExample.scala @@ -22,19 +22,14 @@ // convert between case classes and Datastore `Entity` types. 
package com.spotify.scio.examples.extra -import com.google.datastore.v1.client.DatastoreHelper.makeKey import com.google.datastore.v1.Query import com.spotify.scio._ import com.spotify.scio.datastore._ import com.spotify.scio.examples.common.ExampleData -import magnolify.datastore._ object MagnolifyDatastoreExample { - val kind = "magnolify" // Define case class representation of Datastore entities case class WordCount(word: String, count: Long) - // `DatastoreType` provides mapping between case classes and Datatore entities - val wordCountType: EntityType[WordCount] = EntityType[WordCount] } // ## Magnolify Datastore Write Example @@ -54,14 +49,7 @@ object MagnolifyDatastoreWriteExample { sc.textFile(args.getOrElse("input", ExampleData.KING_LEAR)) .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty)) .countByValue - .map { t => - // Convert case class to `Entity.Builder` - wordCountType - .to(WordCount.tupled(t)) - // Set entity key - .setKey(makeKey(kind, t._1)) - .build() - } + .map { case (word, count) => WordCount(word, count) } .saveAsDatastore(args("output")) sc.run() () @@ -82,9 +70,7 @@ object MagnolifyDatastoreReadExample { import MagnolifyDatastoreExample._ val (sc, args) = ContextAndArgs(cmdlineArgs) - sc.datastore(args("input"), Query.getDefaultInstance) - // Convert `Entity` to case class - .map(e => wordCountType(e)) + sc.typedDatastore[WordCount](args("input"), Query.getDefaultInstance) .map(wc => wc.word + ": " + wc.count) .saveAsTextFile(args("output")) sc.run() diff --git a/scio-examples/src/main/scala/com/spotify/scio/examples/extra/MagnolifyTensorFlowExample.scala b/scio-examples/src/main/scala/com/spotify/scio/examples/extra/MagnolifyTensorFlowExample.scala index f52421c46e..8c603d77ff 100644 --- a/scio-examples/src/main/scala/com/spotify/scio/examples/extra/MagnolifyTensorFlowExample.scala +++ b/scio-examples/src/main/scala/com/spotify/scio/examples/extra/MagnolifyTensorFlowExample.scala @@ -26,7 +26,7 @@ import com.google.protobuf.ByteString import com.spotify.scio._ import com.spotify.scio.examples.common.ExampleData import com.spotify.scio.tensorflow._ -import org.tensorflow.proto.example.Example +import org.tensorflow.proto.Example import magnolify.tensorflow._ object MagnolifyTensorFlowExample { @@ -35,8 +35,6 @@ object MagnolifyTensorFlowExample { // `Example` type doesn't support `String` natively, derive one from `ByteString` implicit val efString: ExampleField.Primitive[String] = ExampleField.from[ByteString](_.toStringUtf8)(ByteString.copyFromUtf8) - // `TensorFlowType` provides mapping between case classes and TensorFlow `Example` - val wordCountType: ExampleType[WordCount] = ExampleType[WordCount] } // ## Magnolify Tensorflow Write Example @@ -56,8 +54,8 @@ object MagnolifyTensorFlowWriteExample { sc.textFile(args.getOrElse("input", ExampleData.KING_LEAR)) .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty)) .countByValue - // Convert case class to `Example` and then serialize as `Array[Byte]` - .map(t => wordCountType(WordCount.tupled(t)).toByteArray) + .map { case (word, count) => WordCount(word, count) } + // converts WordCount to Example with the implicitly-derived ExampleType[WordCount] .saveAsTfRecordFile(args("output")) sc.run() () @@ -78,12 +76,8 @@ object MagnolifyTensorFlowReadExample { import MagnolifyTensorFlowExample._ val (sc, args) = ContextAndArgs(cmdlineArgs) - sc.tfRecordFile(args("input")) - .map { b => - // Deserialize `Array[Byte]` as `Example` and then convert to case class - wordCountType(Example.parseFrom(b)) - } - .map(wc => wc.word 
+ ": " + wc.count) + // reads TF Examples and converts to WordCount via the implicitly-derived ExampleType[WordCount] + sc.typedTfRecordFile[WordCount](args("input")) .saveAsTextFile(args("output")) sc.run() () diff --git a/scio-examples/src/main/scala/com/spotify/scio/examples/extra/MagnolifyTypedBigQueryTornadoes.scala b/scio-examples/src/main/scala/com/spotify/scio/examples/extra/MagnolifyTypedBigQueryTornadoes.scala new file mode 100644 index 0000000000..6649a6b75c --- /dev/null +++ b/scio-examples/src/main/scala/com/spotify/scio/examples/extra/MagnolifyTypedBigQueryTornadoes.scala @@ -0,0 +1,72 @@ +/* + * Copyright 2019 Spotify AB. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// Example: Read and write using the Magnolify-typed BigQuery API with case classes +// Usage: + +// `sbt "runMain com.spotify.scio.examples.extra.MagnolifyTypedBigQueryTornadoes +// --project=[PROJECT] --runner=DataflowRunner --region=[REGION NAME] +// --output=[PROJECT]:[DATASET].[TABLE]"` +package com.spotify.scio.examples.extra + +import com.spotify.scio.bigquery._ +import com.spotify.scio.{ContextAndArgs, ScioContext} + +object MagnolifyTypedBigQueryTornadoes { + val query: String = "SELECT tornado, month FROM [bigquery-public-data:samples.gsod]" + case class Row(tornado: Option[Boolean], month: Long) + case class Result(month: Long, tornado_count: Long) + + def pipeline(cmdlineArgs: Array[String]): ScioContext = { + val (sc, args) = ContextAndArgs(cmdlineArgs) + + val resultTap = sc + // Get input from BigQuery and convert elements from `TableRow` to `Row` with the + // implicitly-available `TableRowType[Row]` + .typedBigQuerySelect[Row](Query(query)) + .flatMap(r => if (r.tornado.getOrElse(false)) Seq(r.month) else Nil) + .countByValue + .map(kv => Result(kv._1, kv._2)) + // Save output to BigQuery, convert elements from `Result` to `TableRow` with the + // implicitly-available `TableRowType[Result]` + .saveAsBigQueryTable( + Table.Spec(args("output")), + writeDisposition = WRITE_TRUNCATE, + createDisposition = CREATE_IF_NEEDED + ) + + // Access the loaded tables + resultTap + .output(BigQueryIO.SuccessfulTableLoads) + .map(_.getTableSpec) + .debug(prefix = "Loaded table: ") + + // Access the failed records + resultTap + .output(BigQueryIO.FailedInserts) + .count + .debug(prefix = "Failed inserts: ") + + sc + } + + def main(cmdlineArgs: Array[String]): Unit = { + val sc = pipeline(cmdlineArgs) + sc.run().waitUntilDone() + () + } +} diff --git a/scio-examples/src/main/scala/com/spotify/scio/examples/extra/MagnolifyTypedStorageBigQueryTornadoes.scala b/scio-examples/src/main/scala/com/spotify/scio/examples/extra/MagnolifyTypedStorageBigQueryTornadoes.scala new file mode 100644 index 0000000000..16a435f004 --- /dev/null +++ b/scio-examples/src/main/scala/com/spotify/scio/examples/extra/MagnolifyTypedStorageBigQueryTornadoes.scala @@ -0,0 +1,79 @@ +/* + * Copyright 2019 Spotify AB.
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// Example: Read using the Magnolify-typed BigQuery Storage API with case classes +// Usage: + +// `sbt "runMain com.spotify.scio.examples.extra.MagnolifyTypedStorageBigQueryTornadoes +// --project=[PROJECT] --runner=DataflowRunner --region=[REGION NAME] +// --output=[PROJECT]:[DATASET].[TABLE]"` +package com.spotify.scio.examples.extra + +import com.spotify.scio.bigquery._ +import com.spotify.scio.{ContextAndArgs, ScioContext} +import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.Method + +object MagnolifyTypedStorageBigQueryTornadoes { + val table: String = "bigquery-public-data:samples.gsod" + case class Row(month: Long, tornado: Option[Boolean]) + case class Result(month: Long, tornado_count: Long) + + def pipeline(cmdlineArgs: Array[String]): ScioContext = { + val (sc, args) = ContextAndArgs(cmdlineArgs) + + val resultTap = sc + // Get input from BigQuery and convert elements from `TableRow` to `Row` with the + // implicitly-available `TableRowType[Row]` + .typedBigQueryStorageMagnolify[Row]( + Table.Spec(table), + selectedFields = List("tornado", "month"), + rowRestriction = "tornado = true" + ) + .map(_.month) + .countByValue + .map(kv => Result(kv._1, kv._2)) + // Save output to BigQuery, convert elements from `Result` to `TableRow` with the + // implicitly-available `TableRowType[Result]` + .saveAsBigQueryTable( + Table.Spec(args("output")), + method = Method.STORAGE_WRITE_API, + writeDisposition = WRITE_TRUNCATE, + createDisposition = CREATE_IF_NEEDED, + successfulInsertsPropagation = true + ) + + // Access the inserted records + resultTap + .output(BigQueryIO.SuccessfulStorageApiInserts) + .count + .debug(prefix = "Successful inserts: ") + + // Access the failed records + resultTap + .output(BigQueryIO.FailedStorageApiInserts) + .count + .debug(prefix = "Failed inserts: ") + + sc + } + + def main(cmdlineArgs: Array[String]): Unit = { + val sc = pipeline(cmdlineArgs) + sc.run().waitUntilDone() + () + } +} diff --git a/scio-examples/src/main/scala/com/spotify/scio/examples/extra/ManagedExample.scala b/scio-examples/src/main/scala/com/spotify/scio/examples/extra/ManagedExample.scala new file mode 100644 index 0000000000..efe825e617 --- /dev/null +++ b/scio-examples/src/main/scala/com/spotify/scio/examples/extra/ManagedExample.scala @@ -0,0 +1,78 @@ +/* + * Copyright 2024 Spotify AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package com.spotify.scio.examples.extra + +import com.spotify.scio.ContextAndArgs +import com.spotify.scio.coders.Coder +import com.spotify.scio.managed._ +import com.spotify.scio.values.SCollection +import magnolify.beam._ +import org.apache.beam.sdk.managed.Managed +import org.apache.beam.sdk.values.Row + +// Example: Beam's Managed IO + +// Usage: + +// `sbt "runMain com.spotify.scio.examples.extra.ManagedExample +// --project=[PROJECT] --runner=DataflowRunner --region=[REGION NAME] +// --table=[TABLE] --catalogName=[CATALOG] --catalogType=[CATALOG TYPE] +// --catalogUri=[CATALOG URI] --catalogWarehouse=[CATALOG WAREHOUSE] +// --output=[OUTPUT PATH]"` +object ManagedExample { + + case class Record(a: Int, b: String) + + def main(cmdlineArgs: Array[String]): Unit = { + val (sc, args) = ContextAndArgs(cmdlineArgs) + + val config: Map[String, Object] = Map( + "table" -> args("table"), + "catalog_name" -> args("catalogName"), + "catalog_properties" -> + Map( + "type" -> args("catalogType"), + "uri" -> args("catalogUri"), + "warehouse" -> args("catalogWarehouse") + ) + ) + + val rt = RowType[Record] + // Provide an implicit coder for Row with the schema derived from Record case class + implicit val recordRowCoder: Coder[Row] = Coder.row(rt.schema) + + // Read beam Row instances from iceberg + val records: SCollection[Record] = sc + .managed( + Managed.ICEBERG, + // Schema derived from the Record case class + rt.schema, + config + ) + // Convert the Row instance to a Record + .map(rt.apply) + + records + .map(r => r.copy(a = r.a + 1)) + // Convert the Record to a Row + .map(rt.apply) + // Save Row instances to Iceberg + .saveAsManaged(Managed.ICEBERG, config) + + sc.run() + } +} diff --git a/scio-examples/src/main/scala/com/spotify/scio/examples/extra/ParquetExample.scala b/scio-examples/src/main/scala/com/spotify/scio/examples/extra/ParquetExample.scala index 4ea605a82a..77aac93a00 100644 --- a/scio-examples/src/main/scala/com/spotify/scio/examples/extra/ParquetExample.scala +++ b/scio-examples/src/main/scala/com/spotify/scio/examples/extra/ParquetExample.scala @@ -32,7 +32,7 @@ import com.spotify.scio.coders.Coder import com.spotify.scio.io.ClosedTap import com.spotify.scio.parquet.ParquetConfiguration import org.apache.avro.generic.GenericRecord -import org.tensorflow.proto.example.{BytesList, Example, Feature, Features, FloatList} +import org.tensorflow.proto.{BytesList, Example, Feature, Features, FloatList} import org.tensorflow.metadata.{v0 => tfmd} object ParquetExample { diff --git a/scio-examples/src/main/scala/com/spotify/scio/examples/extra/RedisExamples.scala b/scio-examples/src/main/scala/com/spotify/scio/examples/extra/RedisExamples.scala index bbc712b7ae..a1c476903b 100644 --- a/scio-examples/src/main/scala/com/spotify/scio/examples/extra/RedisExamples.scala +++ b/scio-examples/src/main/scala/com/spotify/scio/examples/extra/RedisExamples.scala @@ -70,7 +70,7 @@ object RedisReadStringsExample { // `sbt "runMain com.spotify.scio.examples.extra.RedisWriteBatchExample // --project=[PROJECT] --runner=DataflowRunner --region=[REGION NAME] // --redisHost=[REDIS_HOST] -// --redisPort=[REDIS_PORT]` +// --redisPort=[REDIS_PORT]"` object RedisWriteBatchExample { def main(cmdlineArgs: Array[String]): Unit = { @@ -102,7 +102,7 @@ object RedisWriteBatchExample { // --project=[PROJECT] --runner=DataflowRunner --region=[REGION NAME] // --subscription=[PUBSUB_SUBSCRIPTION] // --redisHost=[REDIS_HOST] -// --redisPort=[REDIS_PORT]` +// --redisPort=[REDIS_PORT]"` object 
RedisWriteStreamingExample { def main(cmdlineArgs: Array[String]): Unit = { @@ -139,7 +139,7 @@ object RedisWriteStreamingExample { // `sbt "runMain com.spotify.scio.examples.extra.RedisLookUpStringsExample // --project=[PROJECT] --runner=DataflowRunner --region=[REGION NAME] // --redisHost=[REDIS_HOST] -// --redisPort=[REDIS_PORT]` +// --redisPort=[REDIS_PORT]"` object RedisLookUpStringsExample { def main(cmdlineArgs: Array[String]): Unit = { diff --git a/scio-examples/src/test/scala/com/spotify/scio/examples/extra/MagnolifyAvroExampleTest.scala b/scio-examples/src/test/scala/com/spotify/scio/examples/extra/MagnolifyAvroExampleTest.scala index 478d8bed46..7566495777 100644 --- a/scio-examples/src/test/scala/com/spotify/scio/examples/extra/MagnolifyAvroExampleTest.scala +++ b/scio-examples/src/test/scala/com/spotify/scio/examples/extra/MagnolifyAvroExampleTest.scala @@ -20,35 +20,27 @@ package com.spotify.scio.examples.extra import com.spotify.scio.avro.AvroIO import com.spotify.scio.io._ import com.spotify.scio.testing._ -import org.apache.avro.generic.{GenericRecord, GenericRecordBuilder} class MagnolifyAvroExampleTest extends PipelineSpec { import MagnolifyAvroExample._ val textIn: Seq[String] = Seq("a b c d e", "a b a b") val wordCount: Seq[(String, Long)] = Seq(("a", 3L), ("b", 3L), ("c", 1L), ("d", 1L), ("e", 1L)) - val records: Seq[GenericRecord] = wordCount.map { kv => - new GenericRecordBuilder(wordCountType.schema) - .set("word", kv._1) - .set("count", kv._2) - .build() - } + val records: Seq[WordCount] = wordCount.map { case (word, count) => WordCount(word, count) } val textOut: Seq[String] = wordCount.map(kv => kv._1 + ": " + kv._2) "MagnolifyAvroWriteExample" should "work" in { - import MagnolifyAvroWriteExample.genericCoder JobTest[com.spotify.scio.examples.extra.MagnolifyAvroWriteExample.type] .args("--input=in.txt", "--output=wc.avro") .input(TextIO("in.txt"), textIn) - .output(AvroIO[GenericRecord]("wc.avro"))(coll => coll should containInAnyOrder(records)) + .output(AvroIO[WordCount]("wc.avro"))(coll => coll should containInAnyOrder(records)) .run() } "MagnolifyAvroReadExample" should "work" in { - import MagnolifyAvroWriteExample.genericCoder JobTest[com.spotify.scio.examples.extra.MagnolifyAvroReadExample.type] .args("--input=wc.avro", "--output=out.txt") - .input(AvroIO[GenericRecord]("wc.avro"), records) + .input(AvroIO[WordCount]("wc.avro"), records) .output(TextIO("out.txt"))(coll => coll should containInAnyOrder(textOut)) .run() } diff --git a/scio-examples/src/test/scala/com/spotify/scio/examples/extra/MagnolifyBigtableExampleTest.scala b/scio-examples/src/test/scala/com/spotify/scio/examples/extra/MagnolifyBigtableExampleTest.scala new file mode 100644 index 0000000000..eb6c13b653 --- /dev/null +++ b/scio-examples/src/test/scala/com/spotify/scio/examples/extra/MagnolifyBigtableExampleTest.scala @@ -0,0 +1,58 @@ +/* + * Copyright 2024 Spotify AB. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package com.spotify.scio.examples.extra + +import com.spotify.scio.bigtable.BigtableIO +import com.spotify.scio.io._ +import com.spotify.scio.testing._ + +class MagnolifyBigtableExampleTest extends PipelineSpec { + import MagnolifyBigtableExample._ + + val project = "my-project" + val instance = "my-instance" + val table = "my-table" + val bigtableOptions: Seq[String] = Seq( + s"--bigtableProjectId=$project", + s"--bigtableInstanceId=$instance", + s"--bigtableTableId=$table" + ) + + val textIn: Seq[String] = Seq("a b c d e", "a b a b") + val wordCount: Seq[(String, Long)] = Seq(("a", 3L), ("b", 3L), ("c", 1L), ("d", 1L), ("e", 1L)) + val expected: Seq[(String, WordCount)] = wordCount.map { case (k, v) => (k, WordCount(v)) } + val expectedText: Seq[String] = expected.map(_.toString) + + "MagnolifyBigtableWriteExample" should "work" in { + JobTest[MagnolifyBigtableWriteExample.type] + .args(bigtableOptions :+ "--input=in.txt": _*) + .input(TextIO("in.txt"), textIn) + .output(BigtableIO[(String, WordCount)](project, instance, table))(coll => + coll should containInAnyOrder(expected) + ) + .run() + } + + "MagnolifyBigtableReadExample" should "work" in { + JobTest[MagnolifyBigtableReadExample.type] + .args(bigtableOptions :+ "--output=out.txt": _*) + .input(BigtableIO[(String, WordCount)](project, instance, table), expected) + .output(TextIO("out.txt"))(coll => coll should containInAnyOrder(expectedText)) + .run() + } +} diff --git a/scio-examples/src/test/scala/com/spotify/scio/examples/extra/MagnolifyDatastoreExampleTest.scala b/scio-examples/src/test/scala/com/spotify/scio/examples/extra/MagnolifyDatastoreExampleTest.scala index ab8514b90b..50cdfc84d0 100644 --- a/scio-examples/src/test/scala/com/spotify/scio/examples/extra/MagnolifyDatastoreExampleTest.scala +++ b/scio-examples/src/test/scala/com/spotify/scio/examples/extra/MagnolifyDatastoreExampleTest.scala @@ -17,37 +17,30 @@ package com.spotify.scio.examples.extra -import com.google.datastore.v1.client.DatastoreHelper.{makeKey, makeValue} -import com.google.datastore.v1.Entity import com.spotify.scio.io._ import com.spotify.scio.datastore._ import com.spotify.scio.testing._ class MagnolifyDatastoreExampleTest extends PipelineSpec { + import MagnolifyDatastoreExample._ + val textIn: Seq[String] = Seq("a b c d e", "a b a b") val wordCount: Seq[(String, Long)] = Seq(("a", 3L), ("b", 3L), ("c", 1L), ("d", 1L), ("e", 1L)) - val entities: Seq[Entity] = wordCount.map { kv => - Entity - .newBuilder() - .setKey(makeKey(MagnolifyDatastoreExample.kind, kv._1)) - .putProperties("word", makeValue(kv._1).build()) - .putProperties("count", makeValue(kv._2).build()) - .build() - } + val entities: Seq[WordCount] = wordCount.map { case (word, count) => WordCount(word, count) } val textOut: Seq[String] = wordCount.map(kv => kv._1 + ": " + kv._2) "MagnolifyDatastoreWriteExample" should "work" in { JobTest[com.spotify.scio.examples.extra.MagnolifyDatastoreWriteExample.type] .args("--input=in.txt", "--output=project") .input(TextIO("in.txt"), textIn) - .output(DatastoreIO("project"))(coll => coll should containInAnyOrder(entities)) + .output(DatastoreIO[WordCount]("project"))(_ should containInAnyOrder(entities)) .run() } "MagnolifyDatastoreReadExample" should "work" in { JobTest[com.spotify.scio.examples.extra.MagnolifyDatastoreReadExample.type] .args("--input=project", "--output=out.txt") - .input(DatastoreIO("project"), entities) + .input(DatastoreIO[WordCount]("project"), entities) .output(TextIO("out.txt"))(coll => coll should 
containInAnyOrder(textOut)) .run() } diff --git a/scio-examples/src/test/scala/com/spotify/scio/examples/extra/MagnolifyTensorFlowExampleTest.scala b/scio-examples/src/test/scala/com/spotify/scio/examples/extra/MagnolifyTensorFlowExampleTest.scala index 916d764463..94aed60d6b 100644 --- a/scio-examples/src/test/scala/com/spotify/scio/examples/extra/MagnolifyTensorFlowExampleTest.scala +++ b/scio-examples/src/test/scala/com/spotify/scio/examples/extra/MagnolifyTensorFlowExampleTest.scala @@ -17,54 +17,32 @@ package com.spotify.scio.examples.extra -import com.google.protobuf.ByteString import com.spotify.scio.io._ -import com.spotify.scio.tensorflow.TFRecordIO +import com.spotify.scio.tensorflow.TFExampleTypedIO import com.spotify.scio.testing._ -import org.tensorflow.proto.example._ class MagnolifyTensorFlowExampleTest extends PipelineSpec { + import MagnolifyTensorFlowExample._ + val textIn: Seq[String] = Seq("a b c d e", "a b a b") - val wordCount: Seq[(String, Long)] = Seq(("a", 3L), ("b", 3L), ("c", 1L), ("d", 1L), ("e", 1L)) - val examples: Seq[Example] = wordCount.map { kv => - Example - .newBuilder() - .setFeatures( - Features - .newBuilder() - .putFeature( - "word", - Feature - .newBuilder() - .setBytesList(BytesList.newBuilder().addValue(ByteString.copyFromUtf8(kv._1))) - .build() - ) - .putFeature( - "count", - Feature - .newBuilder() - .setInt64List(Int64List.newBuilder().addValue(kv._2)) - .build() - ) - ) - .build() - } - val textOut: Seq[String] = wordCount.map(kv => kv._1 + ": " + kv._2) + val wordCount: Seq[WordCount] = Seq(("a", 3L), ("b", 3L), ("c", 1L), ("d", 1L), ("e", 1L)) + .map { case (word, count) => WordCount(word, count) } + val textOut: Seq[String] = wordCount.map(_.toString()) "MagnolifyTensorFlowWriteExample" should "work" in { - JobTest[com.spotify.scio.examples.extra.MagnolifyTensorFlowWriteExample.type] + JobTest[MagnolifyTensorFlowWriteExample.type] .args("--input=in.txt", "--output=wc.tfrecords") .input(TextIO("in.txt"), textIn) - .output(TFRecordIO("wc.tfrecords")) { - _.map(Example.parseFrom) should containInAnyOrder(examples) + .output(TFExampleTypedIO[WordCount]("wc.tfrecords")) { + _ should containInAnyOrder(wordCount) } .run() } "MagnolifyTensorFlowReadExample" should "work" in { - JobTest[com.spotify.scio.examples.extra.MagnolifyTensorFlowReadExample.type] + JobTest[MagnolifyTensorFlowReadExample.type] .args("--input=wc.tfrecords", "--output=out.txt") - .input(TFRecordIO("wc.tfrecords"), examples.map(_.toByteArray)) + .input(TFExampleTypedIO[WordCount]("wc.tfrecords"), wordCount) .output(TextIO("out.txt"))(coll => coll should containInAnyOrder(textOut)) .run() } diff --git a/scio-examples/src/test/scala/com/spotify/scio/examples/extra/MagnolifyTypedBigQueryTornadoesTest.scala b/scio-examples/src/test/scala/com/spotify/scio/examples/extra/MagnolifyTypedBigQueryTornadoesTest.scala new file mode 100644 index 0000000000..abb58bcb83 --- /dev/null +++ b/scio-examples/src/test/scala/com/spotify/scio/examples/extra/MagnolifyTypedBigQueryTornadoesTest.scala @@ -0,0 +1,46 @@ +/* + * Copyright 2019 Spotify AB. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package com.spotify.scio.examples.extra + +import com.spotify.scio.bigquery._ +import com.spotify.scio.testing._ + +class MagnolifyTypedBigQueryTornadoesTest extends PipelineSpec { + import MagnolifyTypedBigQueryTornadoes.{Result, Row} + + val inData: Seq[Row] = Seq( + Row(Some(true), 1), + Row(Some(false), 1), + Row(Some(false), 2), + Row(Some(true), 3), + Row(Some(true), 4), + Row(Some(true), 4) + ) + + val expected: Seq[Result] = Seq(Result(1, 1), Result(3, 1), Result(4, 2)) + + "MagnolifyTypedBigQueryTornadoes" should "work" in { + JobTest[com.spotify.scio.examples.extra.MagnolifyTypedBigQueryTornadoes.type] + .args("--output=dataset.table") + .input(BigQueryIO(MagnolifyTypedBigQueryTornadoes.query), inData) + .output(BigQueryIO[Result]("dataset.table")) { coll => + coll should containInAnyOrder(expected) + } + .run() + } +} diff --git a/scio-examples/src/test/scala/com/spotify/scio/examples/extra/MagnolifyTypedStorageBigQueryTornadoesTest.scala b/scio-examples/src/test/scala/com/spotify/scio/examples/extra/MagnolifyTypedStorageBigQueryTornadoesTest.scala new file mode 100644 index 0000000000..9482c88215 --- /dev/null +++ b/scio-examples/src/test/scala/com/spotify/scio/examples/extra/MagnolifyTypedStorageBigQueryTornadoesTest.scala @@ -0,0 +1,51 @@ +/* + * Copyright 2019 Spotify AB. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package com.spotify.scio.examples.extra + +import com.spotify.scio.bigquery._ +import com.spotify.scio.testing._ + +final class MagnolifyTypedStorageBigQueryTornadoesTest extends PipelineSpec { + import MagnolifyTypedStorageBigQueryTornadoes.{Result, Row} + + val inData: Seq[Row] = Seq( + Row(1, Some(true)), + Row(3, Some(true)), + Row(4, Some(true)), + Row(4, Some(true)) + ) + + val expected: Seq[Result] = Seq(Result(1, 1), Result(3, 1), Result(4, 2)) + + "MagnolifyStorageTypedBigQueryTornadoes" should "work" in { + JobTest[com.spotify.scio.examples.extra.MagnolifyTypedStorageBigQueryTornadoes.type] + .args("--output=dataset.table") + .input( + BigQueryIO( + MagnolifyTypedStorageBigQueryTornadoes.table, + List("tornado", "month"), + Some("tornado = true") + ), + inData + ) + .output(BigQueryIO[Result]("dataset.table")) { coll => + coll should containInAnyOrder(expected) + } + .run() + } +} diff --git a/scio-examples/src/test/scala/com/spotify/scio/examples/extra/ParquetExampleTest.scala b/scio-examples/src/test/scala/com/spotify/scio/examples/extra/ParquetExampleTest.scala index 554a3a0095..b52b4a6c55 100644 --- a/scio-examples/src/test/scala/com/spotify/scio/examples/extra/ParquetExampleTest.scala +++ b/scio-examples/src/test/scala/com/spotify/scio/examples/extra/ParquetExampleTest.scala @@ -40,7 +40,7 @@ import com.spotify.scio.parquet.avro._ import com.spotify.scio.parquet.types._ import com.spotify.scio.parquet.tensorflow._ import com.spotify.scio.io.TextIO -import org.tensorflow.proto.example.{Example, Feature, Features, FloatList} +import org.tensorflow.proto.{Example, Feature, Features, FloatList} class ParquetExampleTest extends PipelineSpec { diff --git a/scio-extra/src/main/scala/com/spotify/scio/extra/sparkey/package.scala b/scio-extra/src/main/scala/com/spotify/scio/extra/sparkey/package.scala index 7396a74871..f14b27dcd5 100644 --- a/scio-extra/src/main/scala/com/spotify/scio/extra/sparkey/package.scala +++ b/scio-extra/src/main/scala/com/spotify/scio/extra/sparkey/package.scala @@ -218,7 +218,6 @@ package object sparkey extends SparkeyReaderInstances with SparkeyCoders { ClosedTap[Nothing](EmptyTap) } - // TODO for 0.15: make private, remove deprecation /** * Write the key-value pairs of this SCollection as a Sparkey file to a specific location. * @@ -233,8 +232,7 @@ package object sparkey extends SparkeyReaderInstances with SparkeyCoders { * @return * A singleton SCollection containing the [[SparkeyUri]] of the saved files. */ - @deprecated("Use saveAsSparkey instead", since = "0.14.0") - def asSparkey( + private[scio] def asSparkey( path: String = null, maxMemoryUsage: Long = -1, numShards: Short = SparkeyIO.DefaultNumShards, @@ -261,15 +259,14 @@ package object sparkey extends SparkeyReaderInstances with SparkeyCoders { ) } - // TODO for 0.15: make private, remove deprecation /** * Write the key-value pairs of this SCollection as a Sparkey file to a temporary location. * * @return * A singleton SCollection containing the [[SparkeyUri]] of the saved files. 
*/ - @deprecated("Use saveAsSparkey instead", since = "0.14.0") - def asSparkey(implicit w: SparkeyWritable[K, V]): SCollection[SparkeyUri] = this.asSparkey() + private[scio] def asSparkey(implicit w: SparkeyWritable[K, V]): SCollection[SparkeyUri] = + this.asSparkey() /** * Convert this SCollection to a SideInput, mapping key-value pairs of each window to a diff --git a/scio-extra/src/test/scala/com/spotify/scio/extra/PropertySpec.scala b/scio-extra/src/test/scala/com/spotify/scio/extra/PropertySpec.scala index c32e32e9cc..54ac6154a3 100644 --- a/scio-extra/src/test/scala/com/spotify/scio/extra/PropertySpec.scala +++ b/scio-extra/src/test/scala/com/spotify/scio/extra/PropertySpec.scala @@ -22,7 +22,6 @@ import org.scalatest.matchers.should.Matchers import org.scalatestplus.scalacheck.ScalaCheckDrivenPropertyChecks trait PropertySpec extends AnyPropSpec with ScalaCheckDrivenPropertyChecks with Matchers { - // TODO: remove this once https://github.com/scalatest/scalatest/issues/1090 is addressed implicit override val generatorDrivenConfig: PropertyCheckConfiguration = PropertyCheckConfiguration(minSuccessful = 100) } diff --git a/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigquery/BigQueryIO.scala b/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigquery/BigQueryIO.scala index 16948f619e..ea22e011a3 100644 --- a/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigquery/BigQueryIO.scala +++ b/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigquery/BigQueryIO.scala @@ -26,6 +26,7 @@ import com.spotify.scio.coders._ import com.spotify.scio.io._ import com.spotify.scio.util.{FilenamePolicySupplier, Functions, ScioUtil} import com.spotify.scio.values.{SCollection, SideOutput, SideOutputCollections} +import magnolify.bigquery.TableRowType import org.apache.avro.generic.GenericRecord import org.apache.beam.sdk.extensions.gcp.options.GcpOptions import org.apache.beam.sdk.io.Compression @@ -281,7 +282,7 @@ final case class BigQueryTypedSelect[T: Coder]( } override protected def write(data: SCollection[T], params: WriteP): Tap[T] = - throw new UnsupportedOperationException("BigQuerySelect is read-only") + throw new UnsupportedOperationException("BigQueryTypedSelect is read-only") override def tap(params: ReadP): Tap[T] = { val tableReference = BigQuery @@ -572,7 +573,7 @@ final case class BigQueryStorageSelect(sqlQuery: Query) extends BigQueryIO[Table sc.read(underlying)(BigQueryTypedSelect.ReadParam()) override protected def write(data: SCollection[TableRow], params: WriteP): Tap[TableRow] = - throw new UnsupportedOperationException("BigQuerySelect is read-only") + throw new UnsupportedOperationException("BigQueryStorageSelect is read-only") override def tap(params: ReadP): Tap[TableRow] = underlying.tap(BigQueryTypedSelect.ReadParam()) } @@ -953,3 +954,126 @@ object BigQueryTyped { } } } + +// SELECT + +object BigQueryMagnolifyTypedSelectIO { + type ReadParam = BigQueryTypedSelect.ReadParam + val ReadParam = BigQueryTypedSelect.ReadParam +} + +final case class BigQueryMagnolifyTypedSelectIO[T: TableRowType: Coder]( + query: Query +) extends BigQueryIO[T] { + override type ReadP = BigQuerySelect.ReadParam + override type WriteP = Nothing // ReadOnly + + private lazy val tableRowType: TableRowType[T] = implicitly + private[this] lazy val underlying = + BigQueryTypedSelect(beam.BigQueryIO.readTableRows(), query, identity)(coders.tableRowCoder) + + override def testId: String = s"BigQueryIO(${query.underlying})" + + override protected def read(sc: 
ScioContext, params: ReadP): SCollection[T] = + sc.transform(_.read(underlying)(params).map(row => tableRowType(row))) + + override protected def write(data: SCollection[T], params: WriteP): Tap[T] = + throw new UnsupportedOperationException("MagnolifyBigQuerySelect is read-only") + + override def tap(params: ReadP): Tap[T] = underlying.tap(params).map(row => tableRowType(row)) +} + +// TABLE + +final case class BigQueryMagnolifyTypedTable[T: TableRowType: Coder]( + table: Table +) extends BigQueryIO[T] + with WriteResultIO[T] { + override type ReadP = Unit + override type WriteP = BigQueryTypedTable.WriteParam[T] + + override def testId: String = s"BigQueryIO(${table.spec})" + + private val tableRowType: TableRowType[T] = implicitly + private val readFn = Functions.serializableFn[SchemaAndRecord, T](sar => + tableRowType(BigQueryUtils.convertGenericRecordToTableRow(sar.getRecord, sar.getTableSchema)) + ) + private val writeFn = Functions.serializableFn[T, TableRow](t => tableRowType(t)) + + private lazy val underlying = { + BigQueryTypedTable( + beam.BigQueryIO.read(readFn), + beam.BigQueryIO.write().withFormatFunction(writeFn), + table, + (gr, ts) => tableRowType(BigQueryUtils.convertGenericRecordToTableRow(gr, ts)) + ) + } + + override protected def read(sc: ScioContext, params: ReadP): SCollection[T] = + sc.read(underlying) + + override protected def writeWithResult( + data: SCollection[T], + params: WriteP + ): (Tap[T], SideOutputCollections) = { + val outputs = data + .write(underlying)(params) + .outputs + .get + + (tap(()), outputs) + } + + override def tap(read: ReadP): Tap[T] = + BigQueryTableRowTypedTap[T](table, tableRowType.apply) +} + +// STORAGE + +final case class BigQueryMagnolifyTypedStorage[T: TableRowType: Coder]( + table: Table, + selectedFields: List[String], + rowRestriction: Option[String] +) extends BigQueryIO[T] { + override type ReadP = Unit + override type WriteP = Nothing // ReadOnly + + override def testId: String = + s"BigQueryIO(${table.spec}, List(${selectedFields.mkString(",")}), $rowRestriction)" + + private lazy val tableRowType: TableRowType[T] = implicitly + private lazy val underlying = BigQueryStorage(table, selectedFields, rowRestriction) + + override protected def read(sc: ScioContext, params: ReadP): SCollection[T] = + sc.transform(_.read(underlying).map(tr => tableRowType(tr))) + + override protected def write(data: SCollection[T], params: WriteP): Tap[T] = + throw new UnsupportedOperationException("MagnolifyBigQueryStorage is read-only") + + override def tap(read: ReadP): Tap[T] = + underlying.tap(read).map(tr => tableRowType(tr)) +} + +object BigQueryMagnolifyTypedStorage { + val ReadParam = BigQueryStorage.ReadParam +} + +final case class BigQueryMagnolifyTypedStorageSelect[T: TableRowType: Coder](sqlQuery: Query) + extends BigQueryIO[T] { + override type ReadP = Unit + override type WriteP = Nothing // ReadOnly + + private[this] lazy val underlying = BigQueryStorageSelect(sqlQuery) + private lazy val tableRowType: TableRowType[T] = implicitly + + override def testId: String = s"BigQueryIO(${sqlQuery.underlying})" + + override protected def read(sc: ScioContext, params: ReadP): SCollection[T] = + sc.transform(_.read(underlying).map(tr => tableRowType(tr))) + + override protected def write(data: SCollection[T], params: WriteP): Tap[T] = + throw new UnsupportedOperationException("MagnolifyBigQueryStorageSelect is read-only") + + override def tap(params: ReadP): Tap[T] = + underlying.tap(params).map(tr => tableRowType(tr)) +} diff --git 
a/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigquery/dynamic/syntax/SCollectionSyntax.scala b/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigquery/dynamic/syntax/SCollectionSyntax.scala index b17542f267..de32a6191a 100644 --- a/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigquery/dynamic/syntax/SCollectionSyntax.scala +++ b/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigquery/dynamic/syntax/SCollectionSyntax.scala @@ -25,6 +25,7 @@ import com.spotify.scio.bigquery.{TableRow, Writes} import com.spotify.scio.io.{ClosedTap, EmptyTap} import com.spotify.scio.util.Functions import com.spotify.scio.values.SCollection +import magnolify.bigquery.TableRowType import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.{ CreateDisposition, Method, @@ -37,12 +38,37 @@ import org.apache.beam.sdk.values.ValueInSingleWindow import scala.reflect.runtime.universe._ import scala.util.chaining._ +object DynamicWriteParam extends Writes.WriteParamDefaults + /** * Enhanced version of [[com.spotify.scio.values.SCollection SCollection]] with dynamic destinations * methods. */ final class DynamicBigQueryOps[T](private val self: SCollection[T]) extends AnyVal { + /** + * Save this SCollection to dynamic BigQuery tables specified by `tableFn`, converting elements of + * type `T` to `TableRow` via the implicitly-available `TableRowType[T]` + */ + def saveAsBigQuery( + writeDisposition: WriteDisposition = DynamicWriteParam.DefaultWriteDisposition, + createDisposition: CreateDisposition = DynamicWriteParam.DefaultCreateDisposition, + extendedErrorInfo: Boolean = DynamicWriteParam.DefaultExtendedErrorInfo + )( + tableFn: ValueInSingleWindow[T] => TableDestination + )(implicit tableRowType: TableRowType[T]): ClosedTap[Nothing] = { + val destinations = DynamicDestinationsUtil.tableFn(tableFn, tableRowType.schema) + + new DynamicBigQueryOps(self).saveAsBigQuery( + destinations, + tableRowType.to, + writeDisposition, + createDisposition, + false, + extendedErrorInfo + ) + } + /** * Save this SCollection to dynamic BigQuery tables using the table and schema specified by the * [[org.apache.beam.sdk.io.gcp.bigquery.DynamicDestinations DynamicDestinations]]. 
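The new magnolify-based dynamic `saveAsBigQuery` above derives the `TableRow` conversion from an implicit `TableRowType[T]` and routes each element to a table chosen per element by `tableFn`. A minimal usage sketch follows (illustrative only, not part of this patch; the `com.spotify.scio.bigquery.dynamic._` import, the `DynamicMonthlyCounts` object, its `Result` case class, and the per-month table naming are assumptions of the example):

import com.spotify.scio.ContextAndArgs
import com.spotify.scio.bigquery.dynamic._
import magnolify.bigquery.TableRowType
import org.apache.beam.sdk.io.gcp.bigquery.TableDestination

object DynamicMonthlyCounts {
  case class Result(month: Long, tornado_count: Long)

  // Derive the TableRow mapping once; it is picked up implicitly by saveAsBigQuery
  implicit val resultType: TableRowType[Result] = TableRowType[Result]

  def main(cmdlineArgs: Array[String]): Unit = {
    val (sc, args) = ContextAndArgs(cmdlineArgs)
    sc.parallelize(Seq(Result(1L, 10L), Result(2L, 3L)))
      // Route each element to a month-specific table; the write/create dispositions
      // fall back to the defaults declared in the first parameter list
      .saveAsBigQuery() { v =>
        new TableDestination(s"${args("output")}_${v.getValue.month}", null)
      }
    sc.run()
  }
}

As with the existing dynamic-destinations write it delegates to, this path throws NotImplementedError when the context is a test pipeline, so it is exercised on a real runner rather than in JobTest.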
@@ -52,8 +78,8 @@ final class DynamicBigQueryOps[T](private val self: SCollection[T]) extends AnyV formatFn: T => TableRow, writeDisposition: WriteDisposition, createDisposition: CreateDisposition, - successfulInsertsPropagation: Boolean = false, - extendedErrorInfo: Boolean = false + successfulInsertsPropagation: Boolean, + extendedErrorInfo: Boolean ): ClosedTap[Nothing] = { if (self.context.isTest) { throw new NotImplementedError( @@ -91,15 +117,16 @@ final class DynamicTableRowBigQueryOps[T <: TableRow](private val self: SCollect */ def saveAsBigQuery( schema: TableSchema, - writeDisposition: WriteDisposition = null, - createDisposition: CreateDisposition = null, - extendedErrorInfo: Boolean = false + writeDisposition: WriteDisposition = DynamicWriteParam.DefaultWriteDisposition, + createDisposition: CreateDisposition = DynamicWriteParam.DefaultCreateDisposition, + extendedErrorInfo: Boolean = DynamicWriteParam.DefaultExtendedErrorInfo )(tableFn: ValueInSingleWindow[T] => TableDestination): ClosedTap[Nothing] = new DynamicBigQueryOps(self).saveAsBigQuery( DynamicDestinationsUtil.tableFn(tableFn, schema), identity, writeDisposition, createDisposition, + false, extendedErrorInfo ) } @@ -117,9 +144,9 @@ final class DynamicTypedBigQueryOps[T <: HasAnnotation](private val self: SColle * [[com.spotify.scio.bigquery.types.BigQueryType BigQueryType]]. */ def saveAsTypedBigQuery( - writeDisposition: WriteDisposition = null, - createDisposition: CreateDisposition = null, - extendedErrorInfo: Boolean = false + writeDisposition: WriteDisposition = DynamicWriteParam.DefaultWriteDisposition, + createDisposition: CreateDisposition = DynamicWriteParam.DefaultCreateDisposition, + extendedErrorInfo: Boolean = DynamicWriteParam.DefaultExtendedErrorInfo )( tableFn: ValueInSingleWindow[T] => TableDestination )(implicit tt: TypeTag[T]): ClosedTap[Nothing] = { @@ -131,6 +158,7 @@ final class DynamicTypedBigQueryOps[T <: HasAnnotation](private val self: SColle bqt.toTableRow, writeDisposition, createDisposition, + false, extendedErrorInfo ) } diff --git a/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigquery/package.scala b/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigquery/package.scala index 357b2f7b51..62a377f5b1 100644 --- a/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigquery/package.scala +++ b/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigquery/package.scala @@ -21,6 +21,7 @@ import com.google.api.services.bigquery.model.{TableRow => GTableRow} import com.spotify.scio.bigquery.instances.CoderInstances import com.spotify.scio.bigquery.syntax.{ FileStorageSyntax, + MagnolifySyntax, SCollectionSyntax, ScioContextSyntax, TableReferenceSyntax, @@ -51,6 +52,7 @@ package object bigquery with TableRowSyntax with TableReferenceSyntax with FileStorageSyntax + with MagnolifySyntax with CoderInstances { /** Alias for BigQuery `CreateDisposition`. */ diff --git a/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigquery/syntax/MagnolifySyntax.scala b/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigquery/syntax/MagnolifySyntax.scala new file mode 100644 index 0000000000..ef6cca096d --- /dev/null +++ b/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigquery/syntax/MagnolifySyntax.scala @@ -0,0 +1,148 @@ +/* + * Copyright 2024 Spotify AB. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package com.spotify.scio.bigquery.syntax + +import com.spotify.scio.ScioContext +import com.spotify.scio.bigquery.{ + BigQueryMagnolifyTypedSelectIO, + BigQueryMagnolifyTypedStorage, + BigQueryMagnolifyTypedStorageSelect, + BigQueryMagnolifyTypedTable, + BigQueryTypedTable, + Clustering, + Query, + Sharding, + Table, + TimePartitioning +} +import com.spotify.scio.coders.Coder +import com.spotify.scio.io.ClosedTap +import com.spotify.scio.values.SCollection +import magnolify.bigquery.TableRowType +import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.{ + CreateDisposition, + Method, + WriteDisposition +} +import org.apache.beam.sdk.io.gcp.bigquery.InsertRetryPolicy +import org.joda.time.Duration + +final class MagnolifyBigQueryScioContextOps(private val self: ScioContext) extends AnyVal { + + /** + * Get an SCollection for a BigQuery SELECT query. Both + * [[https://cloud.google.com/bigquery/docs/reference/legacy-sql Legacy SQL]] and + * [[https://cloud.google.com/bigquery/docs/reference/standard-sql/ Standard SQL]] dialects are + * supported. By default the query dialect will be automatically detected. To override this + * behavior, start the query string with `#legacysql` or `#standardsql`. + */ + def typedBigQuerySelect[T: TableRowType: Coder]( + sqlQuery: Query, + flattenResults: Boolean = BigQueryMagnolifyTypedSelectIO.ReadParam.DefaultFlattenResults + ): SCollection[T] = + self.read(BigQueryMagnolifyTypedSelectIO(sqlQuery))( + BigQueryMagnolifyTypedSelectIO.ReadParam(flattenResults) + ) + + /** Get an SCollection for a BigQuery table. */ + def typedBigQueryTable[T: TableRowType: Coder](table: Table): SCollection[T] = + self.read(BigQueryMagnolifyTypedTable(table)) + + /** + * Get an SCollection for a BigQuery table using the storage API. + * + * @param selectedFields + * names of the fields in the table that should be read. If empty, all fields will be read. If + * the specified field is a nested field, all the sub-fields in the field will be selected. + * Fields will always appear in the generated class in the same order as they appear in the + * table, regardless of the order specified in selectedFields. + * @param rowRestriction + * SQL text filtering statement, similar to a WHERE clause in a query. Currently, we support + * combinations of predicates that are a comparison between a column and a constant value in a SQL + * statement. Aggregates are not supported. For example: + * + * {{{ + * "a > DATE '2014-09-27' AND (b > 5 AND c LIKE 'date')" + * }}} + */ + def typedBigQueryStorageMagnolify[T: TableRowType: Coder]( + table: Table, + selectedFields: List[String] = BigQueryMagnolifyTypedStorage.ReadParam.DefaultSelectFields, + rowRestriction: String = null + ): SCollection[T] = + self.read(BigQueryMagnolifyTypedStorage(table, selectedFields, Option(rowRestriction))) + + /** + * Get an SCollection for a BigQuery SELECT query using the storage API.
+ * + * @param query + * SQL query + */ + def typedBigQueryStorageMagnolify[T: TableRowType: Coder](query: Query): SCollection[T] = + self.read(BigQueryMagnolifyTypedStorageSelect(query)) + +} + +final class MagnolifyBigQuerySCollectionOps[T](private val self: SCollection[T]) { + + def saveAsBigQueryTable( + table: Table, + timePartitioning: TimePartitioning = BigQueryTypedTable.WriteParam.DefaultTimePartitioning, + writeDisposition: WriteDisposition = BigQueryTypedTable.WriteParam.DefaultWriteDisposition, + createDisposition: CreateDisposition = BigQueryTypedTable.WriteParam.DefaultCreateDisposition, + clustering: Clustering = BigQueryTypedTable.WriteParam.DefaultClustering, + method: Method = BigQueryTypedTable.WriteParam.DefaultMethod, + triggeringFrequency: Duration = BigQueryTypedTable.WriteParam.DefaultTriggeringFrequency, + sharding: Sharding = BigQueryTypedTable.WriteParam.DefaultSharding, + failedInsertRetryPolicy: InsertRetryPolicy = + BigQueryTypedTable.WriteParam.DefaultFailedInsertRetryPolicy, + successfulInsertsPropagation: Boolean = + BigQueryTypedTable.WriteParam.DefaultSuccessfulInsertsPropagation, + extendedErrorInfo: Boolean = BigQueryTypedTable.WriteParam.DefaultExtendedErrorInfo, + configOverride: BigQueryTypedTable.WriteParam.ConfigOverride[T] = + BigQueryTypedTable.WriteParam.DefaultConfigOverride + )(implicit coder: Coder[T], tableRowType: TableRowType[T]): ClosedTap[T] = { + val param = BigQueryTypedTable.WriteParam[T]( + method, + tableRowType.schema, + writeDisposition, + createDisposition, + tableRowType.description, + timePartitioning, + clustering, + triggeringFrequency, + sharding, + failedInsertRetryPolicy, + successfulInsertsPropagation, + extendedErrorInfo, + configOverride + ) + self.write(BigQueryMagnolifyTypedTable[T](table))(param) + } + +} + +trait MagnolifySyntax { + implicit def magnolifyBigQueryScioContextOps(sc: ScioContext): MagnolifyBigQueryScioContextOps = + new MagnolifyBigQueryScioContextOps(sc) + + implicit def magnolifyBigQuerySCollectionOps[T]( + scoll: SCollection[T] + ): MagnolifyBigQuerySCollectionOps[T] = + new MagnolifyBigQuerySCollectionOps(scoll) +} diff --git a/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigquery/taps.scala b/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigquery/taps.scala index acdeb28868..63f9608fa3 100644 --- a/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigquery/taps.scala +++ b/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigquery/taps.scala @@ -42,6 +42,16 @@ final case class TableRowJsonTap(path: String, params: TableRowJsonIO.ReadParam) sc.read(TableRowJsonIO(path))(params) } +final case class BigQueryTableRowTypedTap[T: Coder](table: Table, fn: TableRow => T) + extends Tap[T] { + lazy val client: BigQuery = BigQuery.defaultInstance() + + override def value: Iterator[T] = client.tables.rows(table).map(fn) + + override def open(sc: ScioContext): SCollection[T] = + sc.read(BigQueryTypedTable(table, Format.TableRow)(tableRowCoder)).map(fn) +} + final case class BigQueryTypedTap[T: Coder](table: Table, fn: (GenericRecord, TableSchema) => T) extends Tap[T] { lazy val client: BigQuery = BigQuery.defaultInstance() diff --git a/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigquery/types/BigQueryType.scala b/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigquery/types/BigQueryType.scala index 4992c5fa6b..fbacad7ef5 100644 --- a/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigquery/types/BigQueryType.scala +++ 
b/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigquery/types/BigQueryType.scala @@ -53,6 +53,7 @@ import scala.util.Try * @groupname Ungrouped * Other Members */ +@deprecated("Use magnolify API instead.", "0.15.0") object BigQueryType { /** @@ -193,6 +194,7 @@ object BigQueryType { @compileTimeOnly( "enable macro paradise (2.12) or -Ymacro-annotations (2.13) to expand macro annotations" ) + @deprecated("Use magnolify API instead.", "0.15.0") class fromTable(tableSpec: String, args: String*) extends StaticAnnotation { def macroTransform(annottees: Any*): Any = macro TypeProvider.tableImpl } @@ -223,6 +225,7 @@ object BigQueryType { @compileTimeOnly( "enable macro paradise (2.12) or -Ymacro-annotations (2.13) to expand macro annotations" ) + @deprecated("Use magnolify API instead.", "0.15.0") class fromSchema(schema: String) extends StaticAnnotation { def macroTransform(annottees: Any*): Any = macro TypeProvider.schemaImpl } @@ -272,6 +275,7 @@ object BigQueryType { @compileTimeOnly( "enable macro paradise (2.12) or -Ymacro-annotations (2.13) to expand macro annotations" ) + @deprecated("Use magnolify API instead.", "0.15.0") class fromStorage( tableSpec: String, args: List[Any] = Nil, @@ -318,6 +322,7 @@ object BigQueryType { @compileTimeOnly( "enable macro paradise (2.12) or -Ymacro-annotations (2.13) to expand macro annotations" ) + @deprecated("Use magnolify API instead.", "0.15.0") class fromQuery(query: String, args: Any*) extends StaticAnnotation { def macroTransform(annottees: Any*): Any = macro TypeProvider.queryImpl } @@ -338,32 +343,38 @@ object BigQueryType { @compileTimeOnly( "enable macro paradise (2.12) or -Ymacro-annotations (2.13) to expand macro annotations" ) + @deprecated("Use magnolify API instead.", "0.15.0") class toTable extends StaticAnnotation { def macroTransform(annottees: Any*): Any = macro TypeProvider.toTableImpl } /** Generate [[org.apache.avro.Schema Schema]] for a case class. */ + @deprecated("Use magnolify API instead.", "0.15.0") def avroSchemaOf[T: TypeTag]: Schema = SchemaProvider.avroSchemaOf[T] /** Generate [[com.google.api.services.bigquery.model.TableSchema TableSchema]] for a case class. */ + @deprecated("Use magnolify API instead.", "0.15.0") def schemaOf[T: TypeTag]: TableSchema = SchemaProvider.schemaOf[T] /** * Generate a converter function from Avro [[GenericRecord]] to the given case class `T`. * @group converters */ + @deprecated("Use magnolify API instead.", "0.15.0") def fromAvro[T]: GenericRecord => T = macro ConverterProvider.fromAvroImpl[T] /** * Generate a converter function from the given case class `T` to [[GenericRecord]]. * @group converters */ + @deprecated("Use magnolify API instead.", "0.15.0") def toAvro[T]: T => GenericRecord = macro ConverterProvider.toAvroImpl[T] /** * Generate a converter function from [[TableRow]] to the given case class `T`. * @group converters */ + @deprecated("Use magnolify API instead.", "0.15.0") def fromTableRow[T]: TableRow => T = macro ConverterProvider.fromTableRowImpl[T] @@ -371,9 +382,11 @@ object BigQueryType { * Generate a converter function from the given case class `T` to [[TableRow]]. * @group converters */ + @deprecated("Use magnolify API instead.", "0.15.0") def toTableRow[T]: T => TableRow = macro ConverterProvider.toTableRowImpl[T] /** Create a new BigQueryType instance. 
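For code that previously relied on the now-deprecated `@BigQueryType.toTable` or `@BigQueryType.fromTable` annotations, the magnolify replacement is a plain case class whose BigQuery mapping is derived at compile time; a sketch with a hypothetical `Result` type:

{{{
import magnolify.bigquery.TableRowType

// no macro annotation needed; the mapping is derived for the case class
case class Result(user: String, score: Long)

val resultType = TableRowType[Result]
resultType.schema // the derived TableSchema; the typed read/write syntax earlier in this diff uses it implicitly
}}}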
*/ + @deprecated("Use magnolify API instead.", "0.15.0") @inline final def apply[T: TypeTag]: BigQueryType[T] = new BigQueryType[T] } @@ -382,6 +395,7 @@ object BigQueryType { * * This decouples generated fields and methods from macro expansion to keep core macro free. */ +@deprecated("Use magnolify API instead.", "0.15.0") class BigQueryType[T: TypeTag] { private[this] val bases = typeOf[T].companion.baseClasses diff --git a/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigtable/BTOptions.scala b/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigtable/BTOptions.scala new file mode 100644 index 0000000000..f4d1f4a5df --- /dev/null +++ b/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigtable/BTOptions.scala @@ -0,0 +1,24 @@ +/* + * Copyright 2025 Spotify AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.spotify.scio.bigtable + +import com.google.cloud.bigtable.config.BigtableOptions + +object BTOptions { + def apply(projectId: String, instanceId: String): BigtableOptions = + BigtableOptions.builder().setProjectId(projectId).setInstanceId(instanceId).build +} diff --git a/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigtable/BigTableIO.scala b/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigtable/BigTableIO.scala index 965a1777dc..9261bdfb01 100644 --- a/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigtable/BigTableIO.scala +++ b/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigtable/BigTableIO.scala @@ -25,6 +25,7 @@ import com.spotify.scio.coders.{Coder, CoderMaterializer} import com.spotify.scio.io.{EmptyTap, EmptyTapOf, ScioIO, Tap, TapT, TestIO} import com.spotify.scio.util.Functions import com.spotify.scio.values.SCollection +import magnolify.bigtable.BigtableType import org.apache.beam.sdk.io.gcp.{bigtable => beam} import org.apache.beam.sdk.io.range.ByteKeyRange import org.apache.beam.sdk.values.KV @@ -56,17 +57,13 @@ final case class BigtableRead(bigtableOptions: BigtableOptions, tableId: String) override protected def read(sc: ScioContext, params: ReadP): SCollection[Row] = { val coder = CoderMaterializer.beam(sc, Coder.protoMessageCoder[Row]) - val opts = bigtableOptions // defeat closure - val read = beam.BigtableIO - .read() - .withProjectId(bigtableOptions.getProjectId) - .withInstanceId(bigtableOptions.getInstanceId) - .withTableId(tableId) - .withBigtableOptionsConfigurator(Functions.serializableFn(_ => opts.toBuilder)) - .withMaxBufferElementCount(params.maxBufferElementCount.map(Int.box).orNull) - .pipe(r => if (params.keyRanges.isEmpty) r else r.withKeyRanges(params.keyRanges.asJava)) - .pipe(r => Option(params.rowFilter).fold(r)(r.withRowFilter)): @nowarn("cat=deprecation") - + val read = BigtableRead.read( + bigtableOptions, + tableId, + params.maxBufferElementCount, + params.keyRanges, + params.rowFilter + ) sc.applyTransform(read).setCoder(coder) } @@ -97,14 +94,133 @@ object BigtableRead { maxBufferElementCount: Option[Int] = 
ReadParam.DefaultMaxBufferElementCount ) - final def apply(projectId: String, instanceId: String, tableId: String): BigtableRead = { - val bigtableOptions = BigtableOptions - .builder() - .setProjectId(projectId) - .setInstanceId(instanceId) - .build - BigtableRead(bigtableOptions, tableId) + private[scio] def read( + bigtableOptions: BigtableOptions, + tableId: String, + maxBufferElementCount: Option[Int], + keyRanges: Seq[ByteKeyRange], + rowFilter: RowFilter + ): beam.BigtableIO.Read = { + val opts = bigtableOptions // defeat closure + beam.BigtableIO + .read() + .withProjectId(bigtableOptions.getProjectId) + .withInstanceId(bigtableOptions.getInstanceId) + .withTableId(tableId) + .withBigtableOptionsConfigurator(Functions.serializableFn(_ => opts.toBuilder)) + .withMaxBufferElementCount(maxBufferElementCount.map(Int.box).orNull) + .pipe(r => if (keyRanges.isEmpty) r else r.withKeyRanges(keyRanges.asJava)) + .pipe(r => Option(rowFilter).fold(r)(r.withRowFilter)): @nowarn("cat=deprecation") + } +} + +final case class BigtableTypedIO[K: Coder, T: BigtableType: Coder]( + bigtableOptions: BigtableOptions, + tableId: String +) extends BigtableIO[(K, T)] { + override type ReadP = BigtableTypedIO.ReadParam[K] + override type WriteP = BigtableTypedIO.WriteParam[K] + + override def testId: String = + s"BigtableIO(${bigtableOptions.getProjectId}\t${bigtableOptions.getInstanceId}\t$tableId)" + + override protected def read( + sc: ScioContext, + params: ReadP + ): SCollection[(K, T)] = { + val coder = CoderMaterializer.beam(sc, Coder.protoMessageCoder[Row]) + val read = BigtableRead.read( + bigtableOptions, + tableId, + params.maxBufferElementCount, + params.keyRanges, + params.rowFilter + ) + + val bigtableType: BigtableType[T] = implicitly + val cf = params.columnFamily + val keyFn = params.keyFn + sc.transform( + _.applyTransform(read) + .setCoder(coder) + .map(row => keyFn(row.getKey) -> bigtableType(row, cf)) + ) + } + + override protected def write( + data: SCollection[(K, T)], + params: WriteP + ): Tap[Nothing] = { + val bigtableType: BigtableType[T] = implicitly + val btParams = params match { + case d: BigtableTypedIO.Default[_] => + BigtableWrite.Default(d.flowControlEnabled) + case b: BigtableTypedIO.Bulk[_] => + BigtableWrite.Bulk( + b.numOfShards, + Option(b.flushInterval).getOrElse(BigtableWrite.Bulk.DefaultFlushInterval) + ) + } + val cf = params.columnFamily + val ts = params.timestamp + val keyFn = params.keyFn + data.transform_("Bigtable write") { coll => + coll + .map { case (key, t) => + val mutations = Iterable(bigtableType.apply(t, cf, ts)).asJava + .asInstanceOf[java.lang.Iterable[Mutation]] + KV.of(keyFn(key), mutations) + } + .applyInternal(BigtableWrite.sink(tableId, bigtableOptions, btParams)) + } + EmptyTap + } + + override def tap(params: ReadP): Tap[Nothing] = + throw new NotImplementedError("Bigtable tap not implemented") +} + +object BigtableTypedIO { + object ReadParam { + val DefaultKeyRanges: Seq[ByteKeyRange] = Seq.empty[ByteKeyRange] + val DefaultRowFilter: RowFilter = null + val DefaultMaxBufferElementCount: Option[Int] = None } + + final case class ReadParam[K] private ( + columnFamily: String, + keyFn: ByteString => K, + keyRanges: Seq[ByteKeyRange] = ReadParam.DefaultKeyRanges, + rowFilter: RowFilter = ReadParam.DefaultRowFilter, + maxBufferElementCount: Option[Int] = ReadParam.DefaultMaxBufferElementCount + ) + + sealed trait WriteParam[K] { + val columnFamily: String + val keyFn: K => ByteString + val timestamp: Long + } + object WriteParam { + val 
DefaultTimestamp: Long = 0L + val DefaultNumOfShards: Option[Int] = None + val DefaultFlushInterval: Duration = null + val DefaultFlowControlEnabled = false + } + + final case class Default[K] private ( + columnFamily: String, + keyFn: K => ByteString, + timestamp: Long = WriteParam.DefaultTimestamp, + flowControlEnabled: Boolean = WriteParam.DefaultFlowControlEnabled + ) extends WriteParam[K] + + final case class Bulk[K] private ( + columnFamily: String, + keyFn: K => ByteString, + timestamp: Long, + numOfShards: Int, + flushInterval: Duration + ) extends WriteParam[K] } final case class BigtableWrite[T <: Mutation](bigtableOptions: BigtableOptions, tableId: String) @@ -127,28 +243,12 @@ final case class BigtableWrite[T <: Mutation](bigtableOptions: BigtableOptions, data: SCollection[(ByteString, Iterable[T])], params: WriteP ): Tap[Nothing] = { - val sink = - params match { - case BigtableWrite.Default(flowControlEnabled) => - val opts = bigtableOptions // defeat closure - beam.BigtableIO - .write() - .withProjectId(bigtableOptions.getProjectId) - .withInstanceId(bigtableOptions.getInstanceId) - .withTableId(tableId) - .withFlowControl(flowControlEnabled) - .withBigtableOptionsConfigurator( - Functions.serializableFn(_ => opts.toBuilder) - ): @nowarn("cat=deprecation") - case BigtableWrite.Bulk(numOfShards, flushInterval) => - new BigtableBulkWriter(tableId, bigtableOptions, numOfShards, flushInterval) - } data.transform_("Bigtable write") { coll => coll .map { case (key, value) => KV.of(key, value.asJava.asInstanceOf[java.lang.Iterable[Mutation]]) } - .applyInternal(sink) + .applyInternal(BigtableWrite.sink(tableId, bigtableOptions, params)) } EmptyTap } @@ -187,4 +287,22 @@ object BigtableWrite { .build BigtableWrite[T](bigtableOptions, tableId) } + + private[scio] def sink(tableId: String, bigtableOptions: BigtableOptions, params: WriteParam) = { + params match { + case BigtableWrite.Default(flowControlEnabled) => + val opts = bigtableOptions // defeat closure + beam.BigtableIO + .write() + .withProjectId(bigtableOptions.getProjectId) + .withInstanceId(bigtableOptions.getInstanceId) + .withTableId(tableId) + .withFlowControl(flowControlEnabled) + .withBigtableOptionsConfigurator( + Functions.serializableFn(_ => opts.toBuilder) + ): @nowarn("cat=deprecation") + case BigtableWrite.Bulk(numOfShards, flushInterval) => + new BigtableBulkWriter(tableId, bigtableOptions, numOfShards, flushInterval) + } + } } diff --git a/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigtable/syntax/SCollectionSyntax.scala b/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigtable/syntax/SCollectionSyntax.scala index 6757dec44e..96bca18280 100644 --- a/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigtable/syntax/SCollectionSyntax.scala +++ b/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigtable/syntax/SCollectionSyntax.scala @@ -23,8 +23,9 @@ import com.google.protobuf.ByteString import com.spotify.scio.io.ClosedTap import com.spotify.scio.values.SCollection import org.joda.time.Duration - -import com.spotify.scio.bigtable.BigtableWrite +import com.spotify.scio.bigtable.{BTOptions, BigtableTypedIO, BigtableWrite} +import com.spotify.scio.coders.Coder +import magnolify.bigtable.BigtableType /** Enhanced version of [[com.spotify.scio.values.SCollection SCollection]] with Bigtable methods. 
*/ final class SCollectionMutationOps[T <: Mutation]( @@ -75,8 +76,81 @@ final class SCollectionMutationOps[T <: Mutation]( ) } +final class BigtableTypedOps[K: Coder, T: BigtableType: Coder]( + private val self: SCollection[(K, T)] +) { + + def saveAsBigtable( + projectId: String, + instanceId: String, + tableId: String, + columnFamily: String, + keyFn: K => ByteString + ): ClosedTap[Nothing] = { + val params = BigtableTypedIO.Default[K](columnFamily, keyFn) + self.write(BigtableTypedIO[K, T](BTOptions(projectId, instanceId), tableId))(params) + } + + def saveAsBigtable( + projectId: String, + instanceId: String, + tableId: String, + columnFamily: String, + keyFn: K => ByteString, + flowControlEnabled: Boolean + ): ClosedTap[Nothing] = { + val params = + BigtableTypedIO.Default[K](columnFamily, keyFn, flowControlEnabled = flowControlEnabled) + self.write(BigtableTypedIO[K, T](BTOptions(projectId, instanceId), tableId))(params) + } + + def saveAsBigtable( + projectId: String, + instanceId: String, + tableId: String, + columnFamily: String, + keyFn: K => ByteString, + timestamp: Long + ): ClosedTap[Nothing] = { + val params = BigtableTypedIO.Default[K](columnFamily, keyFn, timestamp) + self.write(BigtableTypedIO[K, T](BTOptions(projectId, instanceId), tableId))(params) + } + + def saveAsBigtable( + projectId: String, + instanceId: String, + tableId: String, + columnFamily: String, + keyFn: K => ByteString, + timestamp: Long, + flowControlEnabled: Boolean + ): ClosedTap[Nothing] = { + val params = BigtableTypedIO.Default[K](columnFamily, keyFn, timestamp, flowControlEnabled) + self.write(BigtableTypedIO[K, T](BTOptions(projectId, instanceId), tableId))(params) + } + + def saveAsBigtable( + bigtableOptions: BigtableOptions, + tableId: String, + columnFamily: String, + keyFn: K => ByteString, + timestamp: Long = BigtableTypedIO.WriteParam.DefaultTimestamp, + numOfShards: Int, + flushInterval: Duration = BigtableTypedIO.WriteParam.DefaultFlushInterval + ): ClosedTap[Nothing] = { + val params = + BigtableTypedIO + .Bulk[K](columnFamily, keyFn, timestamp, numOfShards, flushInterval) + self.write(BigtableTypedIO[K, T](bigtableOptions, tableId))(params) + } +} + trait SCollectionSyntax { implicit def bigtableMutationOps[T <: Mutation]( sc: SCollection[(ByteString, Iterable[T])] ): SCollectionMutationOps[T] = new SCollectionMutationOps[T](sc) + + implicit def bigtableTypedOps[K: Coder, T: BigtableType: Coder]( + sc: SCollection[(K, T)] + ): BigtableTypedOps[K, T] = new BigtableTypedOps[K, T](sc) } diff --git a/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigtable/syntax/ScioContextSyntax.scala b/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigtable/syntax/ScioContextSyntax.scala index cd21ee3f90..782a58633b 100644 --- a/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigtable/syntax/ScioContextSyntax.scala +++ b/scio-google-cloud-platform/src/main/scala/com/spotify/scio/bigtable/syntax/ScioContextSyntax.scala @@ -20,11 +20,18 @@ package com.spotify.scio.bigtable.syntax import com.google.bigtable.admin.v2.GcRule import com.google.bigtable.v2._ import com.google.cloud.bigtable.config.BigtableOptions +import com.google.protobuf.ByteString import com.spotify.scio.ScioContext -import com.spotify.scio.bigtable.BigtableRead -import com.spotify.scio.bigtable.BigtableUtil -import com.spotify.scio.bigtable.TableAdmin +import com.spotify.scio.bigtable.{ + BTOptions, + BigtableRead, + BigtableTypedIO, + BigtableUtil, + TableAdmin +} +import com.spotify.scio.coders.Coder 
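A write-side sketch of the typed Bigtable syntax defined above, assuming an illustrative value type for which magnolify can derive a `BigtableType`, and placeholder project, instance and table names:

{{{
import com.google.protobuf.ByteString
import com.spotify.scio.bigtable._
import com.spotify.scio.values.SCollection

case class Visit(count: Long, lastSeen: String) // BigtableType[Visit] is derived by magnolify

val visits: SCollection[(String, Visit)] = ??? // keyed by row key

visits.saveAsBigtable(
  "my-project",
  "my-instance",
  "visits",
  columnFamily = "cf",
  keyFn = (k: String) => ByteString.copyFromUtf8(k)
)
}}}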
import com.spotify.scio.values.SCollection +import magnolify.bigtable.BigtableType import org.apache.beam.sdk.io.range.ByteKeyRange import org.joda.time.Duration @@ -39,6 +46,77 @@ object ScioContextOps { final class ScioContextOps(private val self: ScioContext) extends AnyVal { import ScioContextOps._ + def typedBigtable[K: Coder, T: BigtableType: Coder]( + projectId: String, + instanceId: String, + tableId: String, + columnFamily: String, + keyFn: ByteString => K + ): SCollection[(K, T)] = + typedBigtable(BTOptions(projectId, instanceId), tableId, columnFamily, keyFn) + + def typedBigtable[K: Coder, T: BigtableType: Coder]( + projectId: String, + instanceId: String, + tableId: String, + columnFamily: String, + keyFn: ByteString => K, + keyRanges: Seq[ByteKeyRange] + ): SCollection[(K, T)] = + typedBigtable(BTOptions(projectId, instanceId), tableId, columnFamily, keyFn, keyRanges) + + def typedBigtable[K: Coder, T: BigtableType: Coder]( + projectId: String, + instanceId: String, + tableId: String, + columnFamily: String, + keyFn: ByteString => K, + keyRanges: Seq[ByteKeyRange], + rowFilter: RowFilter + ): SCollection[(K, T)] = + typedBigtable( + BTOptions(projectId, instanceId), + tableId, + columnFamily, + keyFn, + keyRanges, + rowFilter + ) + + def typedBigtable[K: Coder, T: BigtableType: Coder]( + projectId: String, + instanceId: String, + tableId: String, + columnFamily: String, + keyFn: ByteString => K, + keyRanges: Seq[ByteKeyRange], + rowFilter: RowFilter, + maxBufferElementCount: Option[Int] + ): SCollection[(K, T)] = + typedBigtable( + BTOptions(projectId, instanceId), + tableId, + columnFamily, + keyFn, + keyRanges, + rowFilter, + maxBufferElementCount + ) + + def typedBigtable[K: Coder, T: BigtableType: Coder]( + bigtableOptions: BigtableOptions, + tableId: String, + columnFamily: String, + keyFn: ByteString => K, + keyRanges: Seq[ByteKeyRange] = BigtableRead.ReadParam.DefaultKeyRanges, + rowFilter: RowFilter = BigtableRead.ReadParam.DefaultRowFilter, + maxBufferElementCount: Option[Int] = BigtableRead.ReadParam.DefaultMaxBufferElementCount + ): SCollection[(K, T)] = { + val params = + BigtableTypedIO.ReadParam(columnFamily, keyFn, keyRanges, rowFilter, maxBufferElementCount) + self.read(BigtableTypedIO[K, T](bigtableOptions, tableId))(params) + } + /** Get an SCollection for a Bigtable table. */ def bigtable( projectId: String, @@ -47,7 +125,7 @@ final class ScioContextOps(private val self: ScioContext) extends AnyVal { keyRange: ByteKeyRange, rowFilter: RowFilter ): SCollection[Row] = - bigtable(projectId, instanceId, tableId, Seq(keyRange), rowFilter) + bigtable(BTOptions(projectId, instanceId), tableId, Seq(keyRange), rowFilter) /** Get an SCollection for a Bigtable table. */ def bigtable( @@ -58,20 +136,32 @@ final class ScioContextOps(private val self: ScioContext) extends AnyVal { rowFilter: RowFilter, maxBufferElementCount: Option[Int] ): SCollection[Row] = - bigtable(projectId, instanceId, tableId, Seq(keyRange), rowFilter, maxBufferElementCount) + bigtable( + BTOptions(projectId, instanceId), + tableId, + Seq(keyRange), + rowFilter, + maxBufferElementCount + ) + + /** Get an SCollection for a Bigtable table. */ + def bigtable( + projectId: String, + instanceId: String, + tableId: String + ): SCollection[Row] = + bigtable(BTOptions(projectId, instanceId), tableId) /** Get an SCollection for a Bigtable table. 
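And the matching read side, using the `typedBigtable` overloads added above; the key type is decoded from the Bigtable row key with the supplied `keyFn`. Same illustrative names as before:

{{{
import com.google.protobuf.ByteString
import com.spotify.scio.bigtable._

case class Visit(count: Long, lastSeen: String)

val visits = sc.typedBigtable[String, Visit](
  "my-project",
  "my-instance",
  "visits",
  columnFamily = "cf",
  keyFn = (bs: ByteString) => bs.toStringUtf8
) // SCollection[(String, Visit)]
}}}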
*/ def bigtable( projectId: String, instanceId: String, tableId: String, - keyRanges: Seq[ByteKeyRange] = BigtableRead.ReadParam.DefaultKeyRanges, - rowFilter: RowFilter = BigtableRead.ReadParam.DefaultRowFilter, - maxBufferElementCount: Option[Int] = BigtableRead.ReadParam.DefaultMaxBufferElementCount - ): SCollection[Row] = { - val parameters = BigtableRead.ReadParam(keyRanges, rowFilter, maxBufferElementCount) - self.read(BigtableRead(projectId, instanceId, tableId))(parameters) - } + keyRanges: Seq[ByteKeyRange], + rowFilter: RowFilter, + maxBufferElementCount: Option[Int] + ): SCollection[Row] = + bigtable(BTOptions(projectId, instanceId), tableId, keyRanges, rowFilter, maxBufferElementCount) /** Get an SCollection for a Bigtable table. */ def bigtable( @@ -96,20 +186,9 @@ final class ScioContextOps(private val self: ScioContext) extends AnyVal { def bigtable( bigtableOptions: BigtableOptions, tableId: String, - keyRanges: Seq[ByteKeyRange], - rowFilter: RowFilter - ): SCollection[Row] = { - val parameters = BigtableRead.ReadParam(keyRanges, rowFilter) - self.read(BigtableRead(bigtableOptions, tableId))(parameters) - } - - /** Get an SCollection for a Bigtable table. */ - def bigtable( - bigtableOptions: BigtableOptions, - tableId: String, - keyRanges: Seq[ByteKeyRange], - rowFilter: RowFilter, - maxBufferElementCount: Option[Int] + keyRanges: Seq[ByteKeyRange] = BigtableRead.ReadParam.DefaultKeyRanges, + rowFilter: RowFilter = BigtableRead.ReadParam.DefaultRowFilter, + maxBufferElementCount: Option[Int] = BigtableRead.ReadParam.DefaultMaxBufferElementCount ): SCollection[Row] = { val parameters = BigtableRead.ReadParam(keyRanges, rowFilter, maxBufferElementCount) self.read(BigtableRead(bigtableOptions, tableId))(parameters) diff --git a/scio-google-cloud-platform/src/main/scala/com/spotify/scio/datastore/DatastoreIO.scala b/scio-google-cloud-platform/src/main/scala/com/spotify/scio/datastore/DatastoreIO.scala index 2977d464d8..a37b7da1d6 100644 --- a/scio-google-cloud-platform/src/main/scala/com/spotify/scio/datastore/DatastoreIO.scala +++ b/scio-google-cloud-platform/src/main/scala/com/spotify/scio/datastore/DatastoreIO.scala @@ -19,30 +19,69 @@ package com.spotify.scio.datastore import com.spotify.scio.ScioContext import com.spotify.scio.values.SCollection -import com.spotify.scio.io.{EmptyTap, EmptyTapOf, ScioIO, Tap, TapT} +import com.spotify.scio.io.{EmptyTap, EmptyTapOf, ScioIO, Tap, TapT, TestIO} import com.google.datastore.v1.{Entity, Query} import com.spotify.scio.coders.{Coder, CoderMaterializer} +import com.spotify.scio.datastore.DatastoreTypedIO.{ReadParam, WriteParam} +import magnolify.datastore.EntityType import org.apache.beam.sdk.io.gcp.datastore.{DatastoreIO => BDatastoreIO, DatastoreV1 => BDatastore} -final case class DatastoreIO(projectId: String) extends ScioIO[Entity] { - override type ReadP = DatastoreIO.ReadParam - override type WriteP = DatastoreIO.WriteParam +sealed trait DatastoreIO[T] extends ScioIO[T] { + final override val tapT: TapT.Aux[T, Nothing] = EmptyTapOf[T] +} - override val tapT: TapT.Aux[Entity, Nothing] = EmptyTapOf[Entity] +object DatastoreIO { + final def apply[T](projectId: String): DatastoreIO[T] = + new DatastoreIO[T] with TestIO[T] { + override def testId: String = s"DatastoreIO($projectId)" + } +} - override protected def read(sc: ScioContext, params: ReadP): SCollection[Entity] = { - val coder = CoderMaterializer.beam(sc, Coder.protoMessageCoder[Entity]) - val read = BDatastoreIO - .v1() - .read() - .withProjectId(projectId) - 
.withNamespace(params.namespace) - .withQuery(params.query) - sc.applyTransform( - Option(params.configOverride).map(_(read)).getOrElse(read) - ).setCoder(coder) +final case class DatastoreTypedIO[T: EntityType: Coder](projectId: String) extends DatastoreIO[T] { + override type ReadP = DatastoreTypedIO.ReadParam + override type WriteP = DatastoreTypedIO.WriteParam + override def testId: String = s"DatastoreIO($projectId)" + + override protected def read(sc: ScioContext, params: ReadParam): SCollection[T] = { + val entityType: EntityType[T] = implicitly + sc.transform { ctx => + DatastoreEntityIO + .read(ctx, projectId, params.namespace, params.query, params.configOverride) + .map(e => entityType(e)) + } + } + + override protected def write(data: SCollection[T], params: WriteParam): Tap[Nothing] = { + val entityType: EntityType[T] = implicitly + val write = BDatastoreIO.v1.write.withProjectId(projectId) + data.transform_ { scoll => + scoll + .map(t => entityType(t)) + .applyInternal( + Option(params.configOverride).map(_(write)).getOrElse(write) + ) + } + EmptyTap } + override def tap(read: ReadParam): Tap[Nothing] = EmptyTap +} + +object DatastoreTypedIO { + type ReadParam = DatastoreEntityIO.ReadParam + val ReadParam = DatastoreEntityIO.ReadParam + type WriteParam = DatastoreEntityIO.WriteParam + val WriteParam = DatastoreEntityIO.WriteParam +} + +final case class DatastoreEntityIO(projectId: String) extends DatastoreIO[Entity] { + override type ReadP = DatastoreEntityIO.ReadParam + override type WriteP = DatastoreEntityIO.WriteParam + override def testId: String = s"DatastoreIO($projectId)" + + override protected def read(sc: ScioContext, params: ReadP): SCollection[Entity] = + DatastoreEntityIO.read(sc, projectId, params.namespace, params.query, params.configOverride) + override protected def write(data: SCollection[Entity], params: WriteP): Tap[Nothing] = { val write = BDatastoreIO.v1.write.withProjectId(projectId) data.applyInternal( @@ -51,10 +90,10 @@ final case class DatastoreIO(projectId: String) extends ScioIO[Entity] { EmptyTap } - override def tap(read: DatastoreIO.ReadParam): Tap[Nothing] = EmptyTap + override def tap(read: DatastoreEntityIO.ReadParam): Tap[Nothing] = EmptyTap } -object DatastoreIO { +object DatastoreEntityIO { object ReadParam { val DefaultNamespace: String = null @@ -74,4 +113,23 @@ object DatastoreIO { final case class WriteParam private ( configOverride: BDatastore.Write => BDatastore.Write = WriteParam.DefaultConfigOverride ) + + private[scio] def read( + sc: ScioContext, + projectId: String, + namespace: String, + query: Query, + configOverride: BDatastore.Read => BDatastore.Read + ): SCollection[Entity] = { + val coder = CoderMaterializer.beam(sc, Coder.protoMessageCoder[Entity]) + val read = BDatastoreIO + .v1() + .read() + .withProjectId(projectId) + .withNamespace(namespace) + .withQuery(query) + sc.applyTransform( + Option(configOverride).map(_(read)).getOrElse(read) + ).setCoder(coder) + } } diff --git a/scio-google-cloud-platform/src/main/scala/com/spotify/scio/datastore/syntax/SCollectionSyntax.scala b/scio-google-cloud-platform/src/main/scala/com/spotify/scio/datastore/syntax/SCollectionSyntax.scala index 8ed685b3bf..bfc420cba9 100644 --- a/scio-google-cloud-platform/src/main/scala/com/spotify/scio/datastore/syntax/SCollectionSyntax.scala +++ b/scio-google-cloud-platform/src/main/scala/com/spotify/scio/datastore/syntax/SCollectionSyntax.scala @@ -19,10 +19,11 @@ import com.google.datastore.v1.Entity package com.spotify.scio.datastore.syntax 
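The typed Datastore path mirrors this split: `DatastoreTypedIO` above, plus the `typedDatastore` and typed `saveAsDatastore` syntax added in the files below. A minimal sketch, assuming an illustrative case class and project id; a real job would also set a kind or filter on the query:

{{{
import com.google.datastore.v1.Query
import com.spotify.scio.datastore._

case class User(id: Long, name: String) // EntityType[User] is derived by magnolify

sc.typedDatastore[User]("my-project", Query.getDefaultInstance)
  .filter(_.id > 0)
  .saveAsDatastore("my-project")
}}}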
import com.spotify.scio.values.SCollection -import com.spotify.scio.datastore.DatastoreIO +import com.spotify.scio.datastore.{DatastoreEntityIO, DatastoreTypedIO} import com.spotify.scio.io.ClosedTap import com.google.datastore.v1.Entity -import com.spotify.scio.datastore.DatastoreIO.WriteParam +import com.spotify.scio.coders.Coder +import magnolify.datastore.EntityType import org.apache.beam.sdk.io.gcp.datastore.{DatastoreV1 => BDatastore} final class SCollectionEntityOps[T <: Entity](private val coll: SCollection[T]) extends AnyVal { @@ -33,13 +34,29 @@ final class SCollectionEntityOps[T <: Entity](private val coll: SCollection[T]) */ def saveAsDatastore( projectId: String, - configOverride: BDatastore.Write => BDatastore.Write = WriteParam.DefaultConfigOverride + configOverride: BDatastore.Write => BDatastore.Write = + DatastoreEntityIO.WriteParam.DefaultConfigOverride ): ClosedTap[Nothing] = - coll.covary_[Entity].write(DatastoreIO(projectId))(WriteParam(configOverride)) + coll + .covary_[Entity] + .write(DatastoreEntityIO(projectId))(DatastoreEntityIO.WriteParam(configOverride)) +} + +final class TypedEntitySCollectionOps[T: EntityType: Coder](private val coll: SCollection[T]) { + def saveAsDatastore( + projectId: String, + configOverride: BDatastore.Write => BDatastore.Write = + DatastoreTypedIO.WriteParam.DefaultConfigOverride + ): ClosedTap[Nothing] = + coll.write(DatastoreTypedIO(projectId))(DatastoreTypedIO.WriteParam(configOverride)) } trait SCollectionSyntax { implicit def datastoreEntitySCollectionOps[T <: Entity]( coll: SCollection[T] ): SCollectionEntityOps[T] = new SCollectionEntityOps(coll) + + implicit def typedDatastoreEntitySCollectionOps[T: EntityType: Coder]( + coll: SCollection[T] + ): TypedEntitySCollectionOps[T] = new TypedEntitySCollectionOps(coll) } diff --git a/scio-google-cloud-platform/src/main/scala/com/spotify/scio/datastore/syntax/ScioContextSyntax.scala b/scio-google-cloud-platform/src/main/scala/com/spotify/scio/datastore/syntax/ScioContextSyntax.scala index bb50a920a1..83c82c7165 100644 --- a/scio-google-cloud-platform/src/main/scala/com/spotify/scio/datastore/syntax/ScioContextSyntax.scala +++ b/scio-google-cloud-platform/src/main/scala/com/spotify/scio/datastore/syntax/ScioContextSyntax.scala @@ -19,9 +19,10 @@ package com.spotify.scio.datastore.syntax import com.spotify.scio.ScioContext import com.spotify.scio.values.SCollection -import com.spotify.scio.datastore.DatastoreIO +import com.spotify.scio.datastore.{DatastoreEntityIO, DatastoreTypedIO} import com.google.datastore.v1.{Entity, Query} -import com.spotify.scio.datastore.DatastoreIO.ReadParam +import com.spotify.scio.coders.Coder +import magnolify.datastore.EntityType import org.apache.beam.sdk.io.gcp.datastore.{DatastoreV1 => BDatastore} final class ScioContextOps(private val sc: ScioContext) extends AnyVal { @@ -33,10 +34,24 @@ final class ScioContextOps(private val sc: ScioContext) extends AnyVal { def datastore( projectId: String, query: Query, - namespace: String = ReadParam.DefaultNamespace, - configOverride: BDatastore.Read => BDatastore.Read = ReadParam.DefaultConfigOverride + namespace: String = DatastoreEntityIO.ReadParam.DefaultNamespace, + configOverride: BDatastore.Read => BDatastore.Read = + DatastoreEntityIO.ReadParam.DefaultConfigOverride ): SCollection[Entity] = - sc.read(DatastoreIO(projectId))(ReadParam(query, namespace, configOverride)) + sc.read(DatastoreEntityIO(projectId))( + DatastoreEntityIO.ReadParam(query, namespace, configOverride) + ) + + def typedDatastore[T: 
EntityType: Coder]( + projectId: String, + query: Query, + namespace: String = DatastoreTypedIO.ReadParam.DefaultNamespace, + configOverride: BDatastore.Read => BDatastore.Read = + DatastoreTypedIO.ReadParam.DefaultConfigOverride + ): SCollection[T] = + sc.read(DatastoreTypedIO(projectId))( + DatastoreTypedIO.ReadParam(query, namespace, configOverride) + ) } trait ScioContextSyntax { diff --git a/scio-google-cloud-platform/src/test/scala/com/spotify/scio/bigquery/BigQueryIOTest.scala b/scio-google-cloud-platform/src/test/scala/com/spotify/scio/bigquery/BigQueryIOTest.scala index 45fbe21966..30a1575509 100644 --- a/scio-google-cloud-platform/src/test/scala/com/spotify/scio/bigquery/BigQueryIOTest.scala +++ b/scio-google-cloud-platform/src/test/scala/com/spotify/scio/bigquery/BigQueryIOTest.scala @@ -38,6 +38,8 @@ object BigQueryIOTest { @BigQueryType.toTable case class BQRecord(i: Int, s: String, r: List[String]) + case class MagnolifyRecord(i: Int, s: String) + // BQ Write transform display id data for tableDescription private val TableDescriptionId = DisplayData.Identifier.of( DisplayData.Path.root(), @@ -191,6 +193,58 @@ final class BigQueryIOTest extends ScioIOSpec { testJobTest(xs)(TableRowJsonIO(_))(_.tableRowJsonFile(_))(_.saveAsTableRowJsonFile(_)) } + "MagnolifyBigQuerySelect" should "work" in { + // unsafe implicits must be explicitly imported for TableRowType[MagnolifyRecord] to be derived + import magnolify.bigquery.unsafe._ + val xs = (1 to 100).map(x => MagnolifyRecord(x, x.toString)) + testJobTest(xs, in = "select * from x", out = "project:dataset.out_table") { + BigQueryIO(_) + } { (coll, s) => + coll.typedBigQuerySelect[MagnolifyRecord](Query(s)) + } { (coll, s) => + coll.saveAsBigQueryTable(Table.Spec(s)) + } + } + + "MagnolifyBigQueryTable" should "work" in { + // unsafe implicits must be explicitly imported for TableRowType[MagnolifyRecord] to be derived + import magnolify.bigquery.unsafe._ + val xs = (1 to 100).map(x => MagnolifyRecord(x, x.toString)) + testJobTest(xs, in = "project:dataset.in_table", out = "project:dataset.out_table") { + BigQueryIO(_) + } { (coll, s) => + coll.typedBigQueryTable[MagnolifyRecord](Table.Spec(s)) + } { (coll, s) => + coll.saveAsBigQueryTable(Table.Spec(s)) + } + } + + "MagnolifyBigQueryStorage" should "work with Table" in { + // unsafe implicits must be explicitly imported for TableRowType[MagnolifyRecord] to be derived + import magnolify.bigquery.unsafe._ + val xs = (1 to 100).map(x => MagnolifyRecord(x, x.toString)) + testJobTest(xs, in = "project:dataset.in_table", out = "project:dataset.out_table")( + BigQueryIO(_, List(), None), + Some(BigQueryIO(_)) + ) { (coll, s) => + coll.typedBigQueryStorageMagnolify[MagnolifyRecord](Table.Spec(s)) + } { (coll, s) => + coll.saveAsBigQueryTable(Table.Spec(s)) + } + } + + it should "work with Query" in { + // unsafe implicits must be explicitly imported for TableRowType[MagnolifyRecord] to be derived + import magnolify.bigquery.unsafe._ + val xs = (1 to 100).map(x => MagnolifyRecord(x, x.toString)) + testJobTest(xs, in = "select x, y from z", out = "project:dataset.out_table") { + BigQueryIO(_) + } { (coll, s) => + coll.typedBigQueryStorageMagnolify[MagnolifyRecord](Query(s)) + } { (coll, s) => + coll.saveAsBigQueryTable(Table.Spec(s)) + } + } } object JobWithDuplicateInput { diff --git a/scio-google-cloud-platform/src/test/scala/com/spotify/scio/bigtable/BigtableIOTest.scala b/scio-google-cloud-platform/src/test/scala/com/spotify/scio/bigtable/BigtableIOTest.scala index eb47617917..95564b34eb 
100644 --- a/scio-google-cloud-platform/src/test/scala/com/spotify/scio/bigtable/BigtableIOTest.scala +++ b/scio-google-cloud-platform/src/test/scala/com/spotify/scio/bigtable/BigtableIOTest.scala @@ -22,9 +22,13 @@ import com.google.bigtable.v2.{Mutation, Row} import com.google.protobuf.ByteString import com.spotify.scio.testing._ +// must be defined outside the test class or test job will hang +case class Foo(i: Int, s: String) + class BigtableIOTest extends ScioIOSpec { val projectId = "project" val instanceId = "instance" + val columnFamily = "columnFamily" "BigtableIO" should "work with input" in { val xs = (1 to 100).map { x => @@ -46,4 +50,24 @@ class BigtableIOTest extends ScioIOSpec { _.saveAsBigtable(projectId, instanceId, _) ) } + + it should "work with typed input" in { + val xs = (1 to 100).map(x => x.toString -> Foo(x, x.toString)) + testJobTestInput(xs)(BigtableIO[(String, Foo)](projectId, instanceId, _))( + _.typedBigtable[String, Foo]( + projectId, + instanceId, + _, + columnFamily, + (bs: ByteString) => bs.toStringUtf8 + ) + ) + } + + it should "work with typed output" in { + val xs = (1 to 100).map(x => (x.toString, Foo(x, x.toString))) + testJobTestOutput(xs)(BigtableIO(projectId, instanceId, _))( + _.saveAsBigtable(projectId, instanceId, _, columnFamily, ByteString.copyFromUtf8 _) + ) + } } diff --git a/scio-google-cloud-platform/src/test/scala/com/spotify/scio/datastore/DatastoreIOTest.scala b/scio-google-cloud-platform/src/test/scala/com/spotify/scio/datastore/DatastoreIOTest.scala index 9461a9c793..785035d2ec 100644 --- a/scio-google-cloud-platform/src/test/scala/com/spotify/scio/datastore/DatastoreIOTest.scala +++ b/scio-google-cloud-platform/src/test/scala/com/spotify/scio/datastore/DatastoreIOTest.scala @@ -34,7 +34,20 @@ object DatastoreJob { } } +object TypedDatastoreJob { + case class MyEntity(int: Long) + + def main(cmdlineArgs: Array[String]): Unit = { + val (sc, args) = ContextAndArgs(cmdlineArgs) + sc.datastore(args("input"), null, null) + .saveAsDatastore(args("output")) + sc.run() + () + } +} + class DatastoreIOTest extends PipelineSpec with ScioIOSpec { + import TypedDatastoreJob.MyEntity "DatastoreIO" should "work" in { val xs = (1L to 100L).map { x => @@ -57,7 +70,7 @@ class DatastoreIOTest extends PipelineSpec with ScioIOSpec { JobTest[DatastoreJob.type] .args("--input=store.in", "--output=store.out") .input(DatastoreIO("store.in"), (1L to 3L).map(newEntity)) - .output(DatastoreIO("store.out"))(coll => coll should containInAnyOrder(xs)) + .output(DatastoreIO[Entity]("store.out"))(coll => coll should containInAnyOrder(xs)) .run() it should "pass correct DatastoreJob" in { @@ -73,4 +86,31 @@ class DatastoreIOTest extends PipelineSpec with ScioIOSpec { } } + it should "work with typed data" in { + val xs = (1L to 100L).map(x => MyEntity(x)) + testJobTest(xs)(DatastoreIO(_))(_.typedDatastore[MyEntity](_, null))(_.saveAsDatastore(_)) + } + + def testTypedDatastore(xs: Seq[MyEntity]): Unit = { + val in = (1L to 3L).map(MyEntity) + JobTest[TypedDatastoreJob.type] + .args("--input=store.in", "--output=store.out") + .input(DatastoreIO[MyEntity]("store.in"), in) + .output(DatastoreIO[MyEntity]("store.out"))(coll => coll should containInAnyOrder(xs)) + .run() + } + + it should "pass correct TypedDatastoreJob" in { + testTypedDatastore((1L to 3L).map(MyEntity)) + } + + it should "fail incorrect TypedDatastoreJob" in { + an[AssertionError] should be thrownBy { + testTypedDatastore((1L to 2L).map(MyEntity)) + } + an[AssertionError] should be thrownBy { + 
testTypedDatastore((1L to 4L).map(MyEntity)) + } + } + } diff --git a/scio-managed/src/main/scala/com/spotify/scio/iceberg/IcebergIO.scala b/scio-managed/src/main/scala/com/spotify/scio/iceberg/IcebergIO.scala new file mode 100644 index 0000000000..2789cdd9cb --- /dev/null +++ b/scio-managed/src/main/scala/com/spotify/scio/iceberg/IcebergIO.scala @@ -0,0 +1,128 @@ +/* + * Copyright 2024 Spotify AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.spotify.scio.iceberg + +import com.spotify.scio.ScioContext +import com.spotify.scio.coders.Coder +import com.spotify.scio.io.{EmptyTap, EmptyTapOf, ScioIO, Tap, TapT} +import com.spotify.scio.values.SCollection +import magnolify.beam.RowType +import org.apache.beam.sdk.managed.Managed +import com.spotify.scio.managed.ManagedIO +import org.apache.beam.sdk.coders.RowCoder +import org.apache.beam.sdk.values.Row +import magnolia1._ + +private[scio] object ConfigMap { + type Typeclass[T] = ConfigMapType[T] + + trait ConfigMapType[T] { + def toMap(value: T): Map[String, AnyRef] + } + implicit def gen[T]: ConfigMapType[T] = macro Magnolia.gen[T] + + // just needed to satisfy magnolia + implicit val stringToMap: ConfigMapType[String] = _ => Map.empty + implicit val intToMap: ConfigMapType[Int] = _ => Map.empty + implicit val mapToMap: ConfigMapType[Map[String, String]] = _ => Map.empty + implicit val listToMap: ConfigMapType[List[String]] = _ => Map.empty + + private def toSnakeCase(s: String): String = + s.replaceAll("([^A-Z])([A-Z])", "$1_$2").toLowerCase + + def join[T](caseClass: CaseClass[ConfigMapType, T]): ConfigMapType[T] = (value: T) => { + caseClass.parameters.flatMap { p => + val fieldValue = p.dereference(value) + if (fieldValue == null) None + else Some(toSnakeCase(p.label) -> fieldValue.asInstanceOf[AnyRef]) + }.toMap + } +} + +final case class IcebergIO[T: RowType: Coder](table: String, catalogName: Option[String]) + extends ScioIO[T] { + override type ReadP = IcebergIO.ReadParam + override type WriteP = IcebergIO.WriteParam + override val tapT: TapT.Aux[T, Nothing] = EmptyTapOf[T] + + private lazy val rowType: RowType[T] = implicitly + private lazy val beamRowCoder: RowCoder = RowCoder.of(rowType.schema) + implicit private lazy val rowCoder: Coder[Row] = Coder.beam(beamRowCoder) + + override def testId: String = s"IcebergIO(${(Some(table) ++ catalogName).mkString(", ")})" + + private[scio] def config[P]( + params: P + )(implicit mapper: ConfigMap.ConfigMapType[P]): Map[String, AnyRef] = { + val b = Map.newBuilder[String, AnyRef] + b += ("table" -> table) + catalogName.foreach(name => b += ("catalog_name" -> name)) + b ++= mapper.toMap(params) + b.result() + } + + override protected def read(sc: ScioContext, params: IcebergIO.ReadParam): SCollection[T] = { + val io = ManagedIO(Managed.ICEBERG, config(params)) + sc.transform(_.read(io)(ManagedIO.ReadParam(rowType.schema)).map(rowType.from)) + } + + override protected def write(data: SCollection[T], params: IcebergIO.WriteParam): Tap[tapT.T] = { + val io = 
ManagedIO(Managed.ICEBERG, config(params)) + data.map(rowType.to).setCoder(beamRowCoder).write(io).underlying + } + + override def tap(read: IcebergIO.ReadParam): Tap[tapT.T] = EmptyTap +} + +object IcebergIO { + case class ReadParam private ( + catalogProperties: Map[String, String] = ReadParam.DefaultCatalogProperties, + configProperties: Map[String, String] = ReadParam.DefaultConfigProperties, + keep: List[String] = ReadParam.DefaultKeep, + drop: List[String] = ReadParam.DefaultDrop, + filter: String = ReadParam.DefaultFilter + ) + object ReadParam { + val DefaultCatalogProperties: Map[String, String] = null + val DefaultConfigProperties: Map[String, String] = null + val DefaultKeep: List[String] = null + val DefaultDrop: List[String] = null + val DefaultFilter: String = null + + implicit val configMap: ConfigMap.ConfigMapType[ReadParam] = ConfigMap.gen[ReadParam] + } + case class WriteParam private ( + catalogProperties: Map[String, String] = WriteParam.DefaultCatalogProperties, + configProperties: Map[String, String] = WriteParam.DefaultConfigProperties, + triggeringFrequencySeconds: Int = WriteParam.DefaultTriggeringFrequencySeconds, + directWriteByteLimit: Int = WriteParam.DefaultDirectWriteByteLimit, + keep: List[String] = WriteParam.DefaultKeep, + drop: List[String] = WriteParam.DefaultDrop, + only: String = WriteParam.DefaultOnly + ) + object WriteParam { + val DefaultCatalogProperties: Map[String, String] = null + val DefaultConfigProperties: Map[String, String] = null + val DefaultTriggeringFrequencySeconds: Int = null.asInstanceOf[Int] + val DefaultDirectWriteByteLimit: Int = null.asInstanceOf[Int] + val DefaultKeep: List[String] = null + val DefaultDrop: List[String] = null + val DefaultOnly: String = null + + implicit val configMap: ConfigMap.ConfigMapType[WriteParam] = ConfigMap.gen[WriteParam] + } +} diff --git a/scio-managed/src/main/scala/com/spotify/scio/iceberg/package.scala b/scio-managed/src/main/scala/com/spotify/scio/iceberg/package.scala new file mode 100644 index 0000000000..579f505a1a --- /dev/null +++ b/scio-managed/src/main/scala/com/spotify/scio/iceberg/package.scala @@ -0,0 +1,28 @@ +/* + * Copyright 2024 Spotify AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.spotify.scio + +import com.spotify.scio.iceberg.syntax.{SCollectionSyntax, ScioContextSyntax} + +/** + * Iceberg IO APIs. Import all. 
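The `ConfigMap` derivation above turns each camelCase parameter of `ReadParam`/`WriteParam` into the snake_case key expected by Beam's Managed transform, skipping parameters left at their null defaults; `table` and `catalog_name` are always added by `IcebergIO.config`. Roughly, for an illustrative read:

{{{
// a read of table "db.events" in catalog "my_catalog", with
//   filter = "id > 10"
//   catalogProperties = Map("warehouse" -> "gs://bucket/warehouse")
// is passed to Managed as roughly:
Map(
  "table"              -> "db.events",
  "catalog_name"       -> "my_catalog",
  "catalog_properties" -> Map("warehouse" -> "gs://bucket/warehouse"),
  "filter"             -> "id > 10"
)
// keep, drop and config_properties are omitted because they were left unset (null)
}}}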
+ * + * {{{ + * import com.spotify.scio.iceberg._ + * }}} + */ +package object iceberg extends ScioContextSyntax with SCollectionSyntax diff --git a/scio-managed/src/main/scala/com/spotify/scio/iceberg/syntax/IcebergSCollectionSyntax.scala b/scio-managed/src/main/scala/com/spotify/scio/iceberg/syntax/IcebergSCollectionSyntax.scala new file mode 100644 index 0000000000..54c54a0c42 --- /dev/null +++ b/scio-managed/src/main/scala/com/spotify/scio/iceberg/syntax/IcebergSCollectionSyntax.scala @@ -0,0 +1,61 @@ +/* + * Copyright 2024 Spotify AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.spotify.scio.iceberg.syntax + +import com.spotify.scio.coders.Coder +import com.spotify.scio.iceberg.IcebergIO +import com.spotify.scio.io.ClosedTap +import com.spotify.scio.values.SCollection +import magnolify.beam.RowType + +class IcebergSCollectionSyntax[T: RowType: Coder](self: SCollection[T]) { + + /** + * @see + * [[org.apache.beam.sdk.io.iceberg.IcebergWriteSchemaTransformProvider IcebergWriteSchemaTransformProvider]] + * https://github.com/apache/beam/blob/v2.68.0/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergWriteSchemaTransformProvider.java#L135-L153 + */ + def saveAsIceberg( + table: String, + catalogName: String = null, + catalogProperties: Map[String, String] = IcebergIO.WriteParam.DefaultCatalogProperties, + configProperties: Map[String, String] = IcebergIO.WriteParam.DefaultConfigProperties, + triggeringFrequencySeconds: Int = IcebergIO.WriteParam.DefaultTriggeringFrequencySeconds, + directWriteByteLimit: Int = IcebergIO.WriteParam.DefaultDirectWriteByteLimit, + keep: List[String] = IcebergIO.WriteParam.DefaultKeep, + drop: List[String] = IcebergIO.WriteParam.DefaultDrop, + only: String = IcebergIO.WriteParam.DefaultOnly + ): ClosedTap[Nothing] = { + val params = IcebergIO.WriteParam( + catalogProperties, + configProperties, + triggeringFrequencySeconds, + directWriteByteLimit, + keep, + drop, + only + ) + self.write(IcebergIO(table, Option(catalogName)))(params) + } +} + +trait SCollectionSyntax { + implicit def icebergSCollectionSyntax[T: RowType: Coder]( + self: SCollection[T] + ): IcebergSCollectionSyntax[T] = + new IcebergSCollectionSyntax(self) +} diff --git a/scio-managed/src/main/scala/com/spotify/scio/iceberg/syntax/IcebergScioContextSyntax.scala b/scio-managed/src/main/scala/com/spotify/scio/iceberg/syntax/IcebergScioContextSyntax.scala new file mode 100644 index 0000000000..515b3b0c54 --- /dev/null +++ b/scio-managed/src/main/scala/com/spotify/scio/iceberg/syntax/IcebergScioContextSyntax.scala @@ -0,0 +1,49 @@ +/* + * Copyright 2024 Spotify AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
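A write-side sketch of `saveAsIceberg`, assuming an element type with a magnolify-derived `RowType` and illustrative table and catalog settings (the exact catalog property keys depend on the Iceberg catalog in use):

{{{
import com.spotify.scio.iceberg._
import com.spotify.scio.values.SCollection

case class Event(id: Long, name: String) // RowType[Event] is derived by magnolify

val events: SCollection[Event] = ???
events.saveAsIceberg(
  "db.events",
  catalogName = "my_catalog",
  catalogProperties = Map("warehouse" -> "gs://my-bucket/warehouse")
)
}}}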
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.spotify.scio.iceberg.syntax + +import com.spotify.scio.ScioContext +import com.spotify.scio.coders.Coder +import com.spotify.scio.iceberg.IcebergIO +import com.spotify.scio.values.SCollection +import magnolify.beam.RowType + +class IcebergScioContextSyntax(self: ScioContext) { + + /** + * @see + * [[org.apache.beam.sdk.io.iceberg.IcebergReadSchemaTransformProvider IcebergReadSchemaTransformProvider]] + * https://github.com/apache/beam/blob/v2.68.0/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergReadSchemaTransformProvider.java#L107-L139 + */ + def iceberg[T: Coder]( + table: String, + catalogName: String = null, + catalogProperties: Map[String, String] = IcebergIO.ReadParam.DefaultCatalogProperties, + configProperties: Map[String, String] = IcebergIO.ReadParam.DefaultConfigProperties, + keep: List[String] = IcebergIO.ReadParam.DefaultKeep, + drop: List[String] = IcebergIO.ReadParam.DefaultDrop, + filter: String = IcebergIO.ReadParam.DefaultFilter + )(implicit rt: RowType[T]): SCollection[T] = { + val params = IcebergIO.ReadParam(catalogProperties, configProperties, keep, drop, filter) + self.read(IcebergIO(table, Option(catalogName)))(params) + } +} + +trait ScioContextSyntax { + implicit def icebergScioContextSyntax(self: ScioContext): IcebergScioContextSyntax = + new IcebergScioContextSyntax(self) +} diff --git a/scio-managed/src/main/scala/com/spotify/scio/managed/ManagedIO.scala b/scio-managed/src/main/scala/com/spotify/scio/managed/ManagedIO.scala new file mode 100644 index 0000000000..1bfea1bada --- /dev/null +++ b/scio-managed/src/main/scala/com/spotify/scio/managed/ManagedIO.scala @@ -0,0 +1,73 @@ +/* + * Copyright 2024 Spotify AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
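The read side is symmetric; `filter`, `keep` and `drop` map onto the Managed Iceberg read options described in the provider linked above. A sketch with the same illustrative names:

{{{
import com.spotify.scio.iceberg._

case class Event(id: Long, name: String)

val events = sc.iceberg[Event](
  "db.events",
  catalogName = "my_catalog",
  catalogProperties = Map("warehouse" -> "gs://my-bucket/warehouse"),
  filter = "id > 10"
) // SCollection[Event]
}}}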
+ */ + +package com.spotify.scio.managed + +import com.spotify.scio.ScioContext +import com.spotify.scio.io.{EmptyTap, EmptyTapOf, ScioIO, Tap, TapT} +import com.spotify.scio.values.SCollection +import org.apache.beam.sdk.managed.Managed +import org.apache.beam.sdk.schemas.Schema +import org.apache.beam.sdk.values.{PCollectionRowTuple, Row} +import scala.jdk.CollectionConverters._ + +final case class ManagedIO(ioName: String, config: Map[String, Object]) extends ScioIO[Row] { + override type ReadP = ManagedIO.ReadParam + override type WriteP = ManagedIO.WriteParam + override val tapT: TapT.Aux[Row, Nothing] = EmptyTapOf[Row] + + private lazy val _config: java.util.Map[String, Object] = ManagedIO.convertConfig(config) + + // not-ideal IO naming, but we have no identifier except the config map + override def testId: String = s"ManagedIO($ioName, ${config.toString})" + override protected def read(sc: ScioContext, params: ManagedIO.ReadParam): SCollection[Row] = { + sc.wrap( + sc.applyInternal[PCollectionRowTuple]( + Managed.read(ioName).withConfig(_config) + ).getSinglePCollection + ) + } + + override protected def write( + data: SCollection[Row], + params: ManagedIO.WriteParam + ): Tap[tapT.T] = { + data.applyInternal(Managed.write(ioName).withConfig(_config)) + EmptyTap + } + + override def tap(read: ManagedIO.ReadParam): Tap[tapT.T] = EmptyTap +} + +object ManagedIO { + final case class ReadParam(schema: Schema) + type WriteParam = Unit + + private[scio] def convertConfig(config: Map[String, Object]): java.util.Map[String, Object] = { + // recursively convert this yaml-compatible nested scala map to java map + // we either do this or the user has to create nested java maps in scala code + // both are bad + def _convert(a: Object): Object = { + a match { + case m: Map[_, _] => + m.asInstanceOf[Map[_, Object]].map { case (k, v) => k -> _convert(v) }.asJava + case i: Iterable[_] => i.map(o => _convert(o.asInstanceOf[Object])).asJava + case _ => a + } + } + config.map { case (k, v) => k -> _convert(v) }.asJava + } +} diff --git a/scio-managed/src/main/scala/com/spotify/scio/managed/package.scala b/scio-managed/src/main/scala/com/spotify/scio/managed/package.scala new file mode 100644 index 0000000000..dd0def4dd3 --- /dev/null +++ b/scio-managed/src/main/scala/com/spotify/scio/managed/package.scala @@ -0,0 +1,28 @@ +/* + * Copyright 2024 Spotify AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.spotify.scio + +import com.spotify.scio.managed.syntax.{SCollectionSyntax, ScioContextSyntax} + +/** + * Managed IO APIs. Import all. 
+ * + * {{{ + * import com.spotify.scio.managed._ + * }}} + */ +package object managed extends ScioContextSyntax with SCollectionSyntax diff --git a/scio-managed/src/main/scala/com/spotify/scio/managed/syntax/ManagedSCollectionSyntax.scala b/scio-managed/src/main/scala/com/spotify/scio/managed/syntax/ManagedSCollectionSyntax.scala new file mode 100644 index 0000000000..076c112c03 --- /dev/null +++ b/scio-managed/src/main/scala/com/spotify/scio/managed/syntax/ManagedSCollectionSyntax.scala @@ -0,0 +1,32 @@ +/* + * Copyright 2024 Spotify AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.spotify.scio.managed.syntax + +import com.spotify.scio.io.ClosedTap +import com.spotify.scio.managed.ManagedIO +import com.spotify.scio.values.SCollection +import org.apache.beam.sdk.values.Row + +class ManagedSCollectionSyntax(self: SCollection[Row]) { + def saveAsManaged(sink: String, config: Map[String, AnyRef] = Map.empty): ClosedTap[Nothing] = + self.write(ManagedIO(sink, config)) +} + +trait SCollectionSyntax { + implicit def managedSCollectionSyntax(self: SCollection[Row]): ManagedSCollectionSyntax = + new ManagedSCollectionSyntax(self) +} diff --git a/scio-managed/src/main/scala/com/spotify/scio/managed/syntax/ManagedScioContextSyntax.scala b/scio-managed/src/main/scala/com/spotify/scio/managed/syntax/ManagedScioContextSyntax.scala new file mode 100644 index 0000000000..a00b7f45b2 --- /dev/null +++ b/scio-managed/src/main/scala/com/spotify/scio/managed/syntax/ManagedScioContextSyntax.scala @@ -0,0 +1,37 @@ +/* + * Copyright 2024 Spotify AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
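For transforms that Beam's Managed API supports but Scio has no dedicated wrapper for, `ManagedIO` can be used directly: `saveAsManaged` above for writes, and the `managed` read defined in the ScioContext syntax just below. A sketch using the Iceberg identifier, with illustrative config values; note that reads require the Beam schema of the rows up front:

{{{
import com.spotify.scio.managed._
import org.apache.beam.sdk.managed.Managed
import org.apache.beam.sdk.schemas.Schema

val schema: Schema =
  Schema.builder().addInt64Field("id").addStringField("name").build()

val rows = sc.managed(
  Managed.ICEBERG,
  schema,
  Map("table" -> "db.events", "catalog_name" -> "my_catalog")
)
rows.saveAsManaged(Managed.ICEBERG, Map("table" -> "db.events_copy"))
}}}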
+ */ + +package com.spotify.scio.managed.syntax + +import com.spotify.scio.ScioContext +import com.spotify.scio.managed.ManagedIO +import com.spotify.scio.values.SCollection +import org.apache.beam.sdk.schemas.Schema +import org.apache.beam.sdk.values.Row + +class ManagedScioContextSyntax(self: ScioContext) { + def managed( + source: String, + schema: Schema, + config: Map[String, Object] = Map.empty + ): SCollection[Row] = + self.read[Row](ManagedIO(source, config))(ManagedIO.ReadParam(schema)) +} + +trait ScioContextSyntax { + implicit def managedScioContextSyntax(self: ScioContext): ManagedScioContextSyntax = + new ManagedScioContextSyntax(self) +} diff --git a/scio-managed/src/test/scala/com/spotify/scio/iceberg/IcebergIOTest.scala b/scio-managed/src/test/scala/com/spotify/scio/iceberg/IcebergIOTest.scala new file mode 100644 index 0000000000..363193d202 --- /dev/null +++ b/scio-managed/src/test/scala/com/spotify/scio/iceberg/IcebergIOTest.scala @@ -0,0 +1,88 @@ +/* + * Copyright 2025 Spotify AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.spotify.scio.iceberg + +import com.spotify.scio.managed.ManagedIO +import com.spotify.scio.testing.ScioIOSpec + +case class Fake(a: String, b: Int) + +class IcebergIOTest extends ScioIOSpec { + "IcebergIO" should "produce snake_case config maps" in { + val io = IcebergIO[Fake]("tableName", Some("catalogName")) + + val reads: Seq[IcebergIO.ReadParam] = List( + IcebergIO.ReadParam(), + IcebergIO.ReadParam( + Map.empty, + Map.empty, + List.empty, + List.empty, + "" + ), + IcebergIO.ReadParam( + Map("catalogProp1" -> "catalogProp1Value"), + Map("configProp1" -> "configProp1Value", "configProp2" -> "configProp2Value"), + List("keep1", "keep2", "keep3"), + List("drop1", "drop2", "drop3"), + "id > 10" + ) + ) + val writes: Seq[IcebergIO.WriteParam] = List( + IcebergIO.WriteParam(), + IcebergIO.WriteParam( + Map.empty, + Map.empty, + null.asInstanceOf[Int], + null.asInstanceOf[Int], + List.empty, + List.empty, + "" + ), + IcebergIO.WriteParam( + Map("catalogProp1" -> "catalogProp1Value"), + Map("configProp1" -> "configProp1Value", "configProp2" -> "configProp2Value"), + 10, + 100, + List("keep1", "keep2", "keep3"), + List("drop1", "drop2", "drop3"), + "only" + ) + ) + + val configs: Seq[Map[String, AnyRef]] = reads.map(io.config(_)) ++ writes.map(io.config(_)) + val expectedKeys = Set( + // common + "table", + "catalog_name", + "catalog_properties", + "config_properties", + "keep", + "drop", + // reads + "filter", + // writes + "triggering_frequency_seconds", + "direct_write_byte_limit", + "only" + ) + assert(configs.flatMap(_.keys).toSet.intersect(expectedKeys) == expectedKeys) + + // don't throw + configs.map(ManagedIO.convertConfig) + } +} diff --git a/scio-parquet/src/main/java/com/spotify/parquet/tensorflow/TensorflowExampleConverters.java b/scio-parquet/src/main/java/com/spotify/parquet/tensorflow/TensorflowExampleConverters.java index 3d3c7f5dc8..58eb039f87 100644 --- 
a/scio-parquet/src/main/java/com/spotify/parquet/tensorflow/TensorflowExampleConverters.java +++ b/scio-parquet/src/main/java/com/spotify/parquet/tensorflow/TensorflowExampleConverters.java @@ -26,12 +26,12 @@ import org.apache.parquet.schema.GroupType; import org.tensorflow.metadata.v0.FeatureType; import org.tensorflow.metadata.v0.Schema; -import org.tensorflow.proto.example.BytesList; -import org.tensorflow.proto.example.Example; -import org.tensorflow.proto.example.Feature; -import org.tensorflow.proto.example.Features; -import org.tensorflow.proto.example.FloatList; -import org.tensorflow.proto.example.Int64List; +import org.tensorflow.proto.BytesList; +import org.tensorflow.proto.Example; +import org.tensorflow.proto.Feature; +import org.tensorflow.proto.Features; +import org.tensorflow.proto.FloatList; +import org.tensorflow.proto.Int64List; class TensorflowExampleConverters { static class ExampleConverter extends GroupConverter { diff --git a/scio-parquet/src/main/java/com/spotify/parquet/tensorflow/TensorflowExampleParquetInputFormat.java b/scio-parquet/src/main/java/com/spotify/parquet/tensorflow/TensorflowExampleParquetInputFormat.java index 74ddcf0853..79947cd6e0 100644 --- a/scio-parquet/src/main/java/com/spotify/parquet/tensorflow/TensorflowExampleParquetInputFormat.java +++ b/scio-parquet/src/main/java/com/spotify/parquet/tensorflow/TensorflowExampleParquetInputFormat.java @@ -20,7 +20,7 @@ import org.apache.parquet.hadoop.ParquetInputFormat; import org.apache.parquet.hadoop.util.ContextUtil; import org.tensorflow.metadata.v0.Schema; -import org.tensorflow.proto.example.Example; +import org.tensorflow.proto.Example; public class TensorflowExampleParquetInputFormat extends ParquetInputFormat { diff --git a/scio-parquet/src/main/java/com/spotify/parquet/tensorflow/TensorflowExampleParquetOutputFormat.java b/scio-parquet/src/main/java/com/spotify/parquet/tensorflow/TensorflowExampleParquetOutputFormat.java index ad7abf3703..11689afa5c 100644 --- a/scio-parquet/src/main/java/com/spotify/parquet/tensorflow/TensorflowExampleParquetOutputFormat.java +++ b/scio-parquet/src/main/java/com/spotify/parquet/tensorflow/TensorflowExampleParquetOutputFormat.java @@ -20,7 +20,7 @@ import org.apache.parquet.hadoop.ParquetOutputFormat; import org.apache.parquet.hadoop.util.ContextUtil; import org.tensorflow.metadata.v0.Schema; -import org.tensorflow.proto.example.Example; +import org.tensorflow.proto.Example; public class TensorflowExampleParquetOutputFormat extends ParquetOutputFormat { diff --git a/scio-parquet/src/main/java/com/spotify/parquet/tensorflow/TensorflowExampleParquetReader.java b/scio-parquet/src/main/java/com/spotify/parquet/tensorflow/TensorflowExampleParquetReader.java index 9e0ad14aa7..f7138a7374 100644 --- a/scio-parquet/src/main/java/com/spotify/parquet/tensorflow/TensorflowExampleParquetReader.java +++ b/scio-parquet/src/main/java/com/spotify/parquet/tensorflow/TensorflowExampleParquetReader.java @@ -23,7 +23,7 @@ import org.apache.parquet.hadoop.ParquetReader; import org.apache.parquet.hadoop.api.ReadSupport; import org.apache.parquet.io.InputFile; -import org.tensorflow.proto.example.Example; +import org.tensorflow.proto.Example; public class TensorflowExampleParquetReader extends ParquetReader { diff --git a/scio-parquet/src/main/java/com/spotify/parquet/tensorflow/TensorflowExampleParquetWriter.java b/scio-parquet/src/main/java/com/spotify/parquet/tensorflow/TensorflowExampleParquetWriter.java index cc0aad26d4..899e7deb8a 100644 --- 
a/scio-parquet/src/main/java/com/spotify/parquet/tensorflow/TensorflowExampleParquetWriter.java +++ b/scio-parquet/src/main/java/com/spotify/parquet/tensorflow/TensorflowExampleParquetWriter.java @@ -25,7 +25,7 @@ import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.apache.parquet.io.OutputFile; import org.tensorflow.metadata.v0.Schema; -import org.tensorflow.proto.example.Example; +import org.tensorflow.proto.Example; public class TensorflowExampleParquetWriter extends ParquetWriter { diff --git a/scio-parquet/src/main/java/com/spotify/parquet/tensorflow/TensorflowExampleReadSupport.java b/scio-parquet/src/main/java/com/spotify/parquet/tensorflow/TensorflowExampleReadSupport.java index d5ed98f169..13b9135a7b 100644 --- a/scio-parquet/src/main/java/com/spotify/parquet/tensorflow/TensorflowExampleReadSupport.java +++ b/scio-parquet/src/main/java/com/spotify/parquet/tensorflow/TensorflowExampleReadSupport.java @@ -24,7 +24,7 @@ import org.apache.parquet.io.api.RecordMaterializer; import org.apache.parquet.schema.MessageType; import org.tensorflow.metadata.v0.Schema; -import org.tensorflow.proto.example.Example; +import org.tensorflow.proto.Example; public class TensorflowExampleReadSupport extends ReadSupport { diff --git a/scio-parquet/src/main/java/com/spotify/parquet/tensorflow/TensorflowExampleRecordMaterializer.java b/scio-parquet/src/main/java/com/spotify/parquet/tensorflow/TensorflowExampleRecordMaterializer.java index 8c3feb0dc3..83c9d5da23 100644 --- a/scio-parquet/src/main/java/com/spotify/parquet/tensorflow/TensorflowExampleRecordMaterializer.java +++ b/scio-parquet/src/main/java/com/spotify/parquet/tensorflow/TensorflowExampleRecordMaterializer.java @@ -20,7 +20,7 @@ import org.apache.parquet.io.api.RecordMaterializer; import org.apache.parquet.schema.MessageType; import org.tensorflow.metadata.v0.Schema; -import org.tensorflow.proto.example.Example; +import org.tensorflow.proto.Example; public class TensorflowExampleRecordMaterializer extends RecordMaterializer { diff --git a/scio-parquet/src/main/java/com/spotify/parquet/tensorflow/TensorflowExampleWriteSupport.java b/scio-parquet/src/main/java/com/spotify/parquet/tensorflow/TensorflowExampleWriteSupport.java index fa5555e710..5776068133 100644 --- a/scio-parquet/src/main/java/com/spotify/parquet/tensorflow/TensorflowExampleWriteSupport.java +++ b/scio-parquet/src/main/java/com/spotify/parquet/tensorflow/TensorflowExampleWriteSupport.java @@ -33,7 +33,7 @@ import org.tensorflow.metadata.v0.Feature; import org.tensorflow.metadata.v0.FeatureType; import org.tensorflow.metadata.v0.Schema; -import org.tensorflow.proto.example.Example; +import org.tensorflow.proto.Example; public class TensorflowExampleWriteSupport extends WriteSupport { @@ -98,17 +98,16 @@ public void write(Example record) { private void writeRecordFields(GroupType schema, Schema tfSchema, Example example) { List fields = schema.getFields(); List mdFeatures = tfSchema.getFeatureList(); - Map features = - example.getFeatures().getFeatureMap(); + Map features = example.getFeatures().getFeatureMap(); for (int index = 0; index < mdFeatures.size(); index++) { Feature mdFeature = mdFeatures.get(index); FeatureType type = mdFeature.getType(); // if feature is missing in the example, return an empty tensor - org.tensorflow.proto.example.Feature value = + org.tensorflow.proto.Feature value = features.getOrDefault( - mdFeature.getName(), org.tensorflow.proto.example.Feature.getDefaultInstance()); + mdFeature.getName(), 
org.tensorflow.proto.Feature.getDefaultInstance()); Type fieldType = fields.get(index); String fieldName = fieldType.getName(); switch (type) { diff --git a/scio-parquet/src/main/java/com/spotify/scio/parquet/WriterUtils.java b/scio-parquet/src/main/java/com/spotify/scio/parquet/WriterUtils.java index 5d2c6d1caf..f3f40a1a70 100644 --- a/scio-parquet/src/main/java/com/spotify/scio/parquet/WriterUtils.java +++ b/scio-parquet/src/main/java/com/spotify/scio/parquet/WriterUtils.java @@ -29,7 +29,10 @@ public class WriterUtils { public static > ParquetWriter build( - ParquetWriter.Builder builder, Configuration conf, CompressionCodecName compression) + ParquetWriter.Builder builder, + Configuration conf, + CompressionCodecName compression, + Map extraMetadata) throws IOException { // https://github.com/apache/parquet-mr/tree/master/parquet-hadoop#class-parquetoutputformat long rowGroupSize = @@ -53,6 +56,10 @@ public static > ParquetWriter builder = builder.withBloomFilterNDV(entry.getKey(), entry.getValue()); } + if (extraMetadata != null) { + builder = builder.withExtraMetaData(extraMetadata); + } + return builder .withConf(conf) .withCompressionCodec(compression) diff --git a/scio-parquet/src/main/java/com/spotify/scio/parquet/avro/ParquetAvroFileBasedSink.java b/scio-parquet/src/main/java/com/spotify/scio/parquet/avro/ParquetAvroFileBasedSink.java index bd9d695a29..d4a6a02f5c 100644 --- a/scio-parquet/src/main/java/com/spotify/scio/parquet/avro/ParquetAvroFileBasedSink.java +++ b/scio-parquet/src/main/java/com/spotify/scio/parquet/avro/ParquetAvroFileBasedSink.java @@ -20,6 +20,7 @@ import com.spotify.scio.parquet.BeamOutputFile; import com.spotify.scio.parquet.WriterUtils; import java.nio.channels.WritableByteChannel; +import java.util.Map; import org.apache.avro.Schema; import org.apache.beam.sdk.io.FileBasedSink; import org.apache.beam.sdk.io.fs.ResourceId; @@ -36,22 +37,25 @@ public class ParquetAvroFileBasedSink extends FileBasedSink { private final String schemaString; private final SerializableConfiguration conf; private final CompressionCodecName compression; + private final Map extraMetadata; public ParquetAvroFileBasedSink( ValueProvider baseOutputFileName, FileBasedSink.DynamicDestinations dynamicDestinations, Schema schema, Configuration conf, - CompressionCodecName compression) { + CompressionCodecName compression, + Map extraMetadata) { super(baseOutputFileName, dynamicDestinations); this.schemaString = schema.toString(); this.conf = new SerializableConfiguration(conf); this.compression = compression; + this.extraMetadata = extraMetadata; } @Override public FileBasedSink.WriteOperation createWriteOperation() { - return new ParquetAvroWriteOperation(this, schemaString, conf, compression); + return new ParquetAvroWriteOperation(this, schemaString, conf, compression, extraMetadata); } // ======================================================================= @@ -63,22 +67,25 @@ static class ParquetAvroWriteOperation extends WriteOperation { private final String schemaString; private final SerializableConfiguration conf; private final CompressionCodecName compression; + private final Map extraMetadata; public ParquetAvroWriteOperation( FileBasedSink sink, String schemaString, SerializableConfiguration conf, - CompressionCodecName compression) { + CompressionCodecName compression, + Map extraMetadata) { super(sink); this.schemaString = schemaString; this.conf = conf; this.compression = compression; + this.extraMetadata = extraMetadata; } @Override public Writer createWriter() throws 
Exception { return new ParquetAvroWriter<>( - this, new Schema.Parser().parse(schemaString), conf, compression); + this, new Schema.Parser().parse(schemaString), conf, compression, extraMetadata); } } @@ -91,17 +98,20 @@ static class ParquetAvroWriter extends FileBasedSink.Writer { private final Schema schema; private final SerializableConfiguration conf; private final CompressionCodecName compression; + private final Map extraMetadata; private ParquetWriter writer; public ParquetAvroWriter( WriteOperation writeOperation, Schema schema, SerializableConfiguration conf, - CompressionCodecName compression) { + CompressionCodecName compression, + Map extraMetadata) { super(writeOperation, MimeTypes.BINARY); this.schema = schema; this.conf = conf; this.compression = compression; + this.extraMetadata = extraMetadata; } @Override @@ -110,7 +120,7 @@ protected void prepareWrite(WritableByteChannel channel) throws Exception { Configuration configuration = conf.get(); AvroParquetWriter.Builder builder = AvroParquetWriter.builder(outputFile).withSchema(schema); - writer = WriterUtils.build(builder, configuration, compression); + writer = WriterUtils.build(builder, configuration, compression, extraMetadata); } @Override diff --git a/scio-parquet/src/main/java/com/spotify/scio/parquet/tensorflow/ParquetExampleFileBasedSink.java b/scio-parquet/src/main/java/com/spotify/scio/parquet/tensorflow/ParquetExampleFileBasedSink.java index a973a8d347..3adc088b17 100644 --- a/scio-parquet/src/main/java/com/spotify/scio/parquet/tensorflow/ParquetExampleFileBasedSink.java +++ b/scio-parquet/src/main/java/com/spotify/scio/parquet/tensorflow/ParquetExampleFileBasedSink.java @@ -21,6 +21,7 @@ import com.spotify.scio.parquet.BeamOutputFile; import com.spotify.scio.parquet.WriterUtils; import java.nio.channels.WritableByteChannel; +import java.util.Map; import org.apache.beam.sdk.io.FileBasedSink; import org.apache.beam.sdk.io.fs.ResourceId; import org.apache.beam.sdk.io.hadoop.SerializableConfiguration; @@ -30,29 +31,32 @@ import org.apache.parquet.hadoop.ParquetWriter; import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.tensorflow.metadata.v0.Schema; -import org.tensorflow.proto.example.Example; +import org.tensorflow.proto.Example; public class ParquetExampleFileBasedSink extends FileBasedSink { private final Schema schema; private final SerializableConfiguration conf; private final CompressionCodecName compression; + private final Map extraMetadata; public ParquetExampleFileBasedSink( ValueProvider baseOutputFileName, FileBasedSink.DynamicDestinations dynamicDestinations, Schema schema, Configuration conf, - CompressionCodecName compression) { + CompressionCodecName compression, + Map extraMetadata) { super(baseOutputFileName, dynamicDestinations); this.schema = schema; this.conf = new SerializableConfiguration(conf); this.compression = compression; + this.extraMetadata = extraMetadata; } @Override public FileBasedSink.WriteOperation createWriteOperation() { - return new ParquetExampleWriteOperation(this, schema, conf, compression); + return new ParquetExampleWriteOperation(this, schema, conf, compression, extraMetadata); } // ======================================================================= @@ -63,21 +67,24 @@ static class ParquetExampleWriteOperation extends FileBasedSink.WriteOperation extraMetadata; ParquetExampleWriteOperation( FileBasedSink sink, Schema schema, SerializableConfiguration conf, - CompressionCodecName compression) { + CompressionCodecName compression, + Map 
extraMetadata) { super(sink); this.schema = schema; this.conf = conf; this.compression = compression; + this.extraMetadata = extraMetadata; } @Override public Writer createWriter() throws Exception { - return new ParquetExampleWriter(this, schema, conf, compression); + return new ParquetExampleWriter(this, schema, conf, compression, extraMetadata); } } @@ -90,17 +97,20 @@ static class ParquetExampleWriter extends FileBasedSink.Writer { private final Schema schema; private final SerializableConfiguration conf; private final CompressionCodecName compression; + private final Map extraMetadata; private ParquetWriter writer; public ParquetExampleWriter( FileBasedSink.WriteOperation writeOperation, Schema schema, SerializableConfiguration conf, - CompressionCodecName compression) { + CompressionCodecName compression, + Map extraMetadata) { super(writeOperation, MimeTypes.BINARY); this.schema = schema; this.conf = conf; this.compression = compression; + this.extraMetadata = extraMetadata; } @Override @@ -108,7 +118,7 @@ protected void prepareWrite(WritableByteChannel channel) throws Exception { BeamOutputFile outputFile = BeamOutputFile.of(channel); TensorflowExampleParquetWriter.Builder builder = TensorflowExampleParquetWriter.builder(outputFile).withSchema(schema); - writer = WriterUtils.build(builder, conf.get(), compression); + writer = WriterUtils.build(builder, conf.get(), compression, extraMetadata); } @Override diff --git a/scio-parquet/src/main/java/com/spotify/scio/parquet/types/ParquetTypeFileBasedSink.java b/scio-parquet/src/main/java/com/spotify/scio/parquet/types/ParquetTypeFileBasedSink.java index e6c53da0c9..fb2b51a9e6 100644 --- a/scio-parquet/src/main/java/com/spotify/scio/parquet/types/ParquetTypeFileBasedSink.java +++ b/scio-parquet/src/main/java/com/spotify/scio/parquet/types/ParquetTypeFileBasedSink.java @@ -20,6 +20,7 @@ import com.spotify.scio.parquet.BeamOutputFile; import com.spotify.scio.parquet.WriterUtils; import java.nio.channels.WritableByteChannel; +import java.util.Map; import magnolify.parquet.ParquetType; import org.apache.beam.sdk.io.FileBasedSink; import org.apache.beam.sdk.io.fs.ResourceId; @@ -35,22 +36,25 @@ public class ParquetTypeFileBasedSink extends FileBasedSink { private final ParquetType type; private final SerializableConfiguration conf; private final CompressionCodecName compression; + private final Map extraMetadata; public ParquetTypeFileBasedSink( ValueProvider baseOutputFileName, FileBasedSink.DynamicDestinations dynamicDestinations, ParquetType type, Configuration conf, - CompressionCodecName compression) { + CompressionCodecName compression, + Map extraMetadata) { super(baseOutputFileName, dynamicDestinations); this.type = type; this.conf = new SerializableConfiguration(conf); this.compression = compression; + this.extraMetadata = extraMetadata; } @Override public FileBasedSink.WriteOperation createWriteOperation() { - return new ParquetTypeWriteOperation<>(this, type, conf, compression); + return new ParquetTypeWriteOperation<>(this, type, conf, compression, extraMetadata); } // ======================================================================= @@ -61,21 +65,24 @@ static class ParquetTypeWriteOperation extends WriteOperation { private final ParquetType type; private final SerializableConfiguration conf; private final CompressionCodecName compression; + private final Map extraMetadata; public ParquetTypeWriteOperation( FileBasedSink sink, ParquetType type, SerializableConfiguration conf, - CompressionCodecName compression) { + 
CompressionCodecName compression, + Map extraMetadata) { super(sink); this.type = type; this.conf = conf; this.compression = compression; + this.extraMetadata = extraMetadata; } @Override public Writer createWriter() throws Exception { - return new ParquetTypeWriter<>(this, type, conf, compression); + return new ParquetTypeWriter<>(this, type, conf, compression, extraMetadata); } } @@ -88,24 +95,28 @@ static class ParquetTypeWriter extends FileBasedSink.Writer { private final ParquetType type; private final SerializableConfiguration conf; private final CompressionCodecName compression; + private final Map extraMetadata; private ParquetWriter writer; public ParquetTypeWriter( WriteOperation writeOperation, ParquetType type, SerializableConfiguration conf, - CompressionCodecName compression) { + CompressionCodecName compression, + Map extraMetadata) { super(writeOperation, MimeTypes.BINARY); this.type = type; this.conf = conf; this.compression = compression; + this.extraMetadata = extraMetadata; } @SuppressWarnings("unchecked") @Override protected void prepareWrite(WritableByteChannel channel) throws Exception { BeamOutputFile outputFile = BeamOutputFile.of(channel); - writer = WriterUtils.build(type.writeBuilder(outputFile), conf.get(), compression); + writer = + WriterUtils.build(type.writeBuilder(outputFile), conf.get(), compression, extraMetadata); } @Override diff --git a/scio-parquet/src/main/scala/com/spotify/scio/parquet/avro/ParquetAvroIO.scala b/scio-parquet/src/main/scala/com/spotify/scio/parquet/avro/ParquetAvroIO.scala index ca65f80da2..1ca7577f2c 100644 --- a/scio-parquet/src/main/scala/com/spotify/scio/parquet/avro/ParquetAvroIO.scala +++ b/scio-parquet/src/main/scala/com/spotify/scio/parquet/avro/ParquetAvroIO.scala @@ -50,6 +50,7 @@ import org.apache.parquet.filter2.predicate.FilterPredicate import org.apache.parquet.hadoop.ParquetInputFormat import org.apache.parquet.hadoop.metadata.CompressionCodecName +import scala.jdk.CollectionConverters._ import scala.reflect.{classTag, ClassTag} final case class ParquetAvroIO[T: ClassTag: Coder](path: String) extends ScioIO[T] { @@ -85,7 +86,8 @@ final case class ParquetAvroIO[T: ClassTag: Coder](path: String) extends ScioIO[ shardNameTemplate: String, isWindowed: Boolean, tempDirectory: ResourceId, - isLocalRunner: Boolean + isLocalRunner: Boolean, + metadata: Map[String, String] ) = { require(tempDirectory != null, "tempDirectory must not be null") val fp = FilenamePolicySupplier.resolve( @@ -104,7 +106,8 @@ final case class ParquetAvroIO[T: ClassTag: Coder](path: String) extends ScioIO[ dynamicDestinations, schema, job.getConfiguration, - compression + compression, + metadata.asJava ) val transform = WriteFiles.to(sink).withNumShards(numShards) if (!isWindowed) transform else transform.withWindowedWrites() @@ -128,7 +131,8 @@ final case class ParquetAvroIO[T: ClassTag: Coder](path: String) extends ScioIO[ params.shardNameTemplate, ScioUtil.isWindowed(data), ScioUtil.tempDirOrDefault(params.tempDirectory, data.context), - ScioUtil.isLocalRunner(data.context.options.getRunner) + ScioUtil.isLocalRunner(data.context.options.getRunner), + params.metadata ) ) tap(ParquetAvroIO.ReadParam(params)) @@ -272,6 +276,7 @@ object ParquetAvroIO { val DefaultPrefix: String = null val DefaultShardNameTemplate: String = null val DefaultTempDirectory: String = null + val DefaultMetadata: Map[String, String] = null } final case class WriteParam private ( @@ -283,6 +288,7 @@ object ParquetAvroIO { filenamePolicySupplier: FilenamePolicySupplier = 
WriteParam.DefaultFilenamePolicySupplier, prefix: String = WriteParam.DefaultPrefix, shardNameTemplate: String = WriteParam.DefaultShardNameTemplate, - tempDirectory: String = WriteParam.DefaultTempDirectory + tempDirectory: String = WriteParam.DefaultTempDirectory, + metadata: Map[String, String] = WriteParam.DefaultMetadata ) } diff --git a/scio-parquet/src/main/scala/com/spotify/scio/parquet/avro/syntax/SCollectionSyntax.scala b/scio-parquet/src/main/scala/com/spotify/scio/parquet/avro/syntax/SCollectionSyntax.scala index d2b789ba7a..572e54974e 100644 --- a/scio-parquet/src/main/scala/com/spotify/scio/parquet/avro/syntax/SCollectionSyntax.scala +++ b/scio-parquet/src/main/scala/com/spotify/scio/parquet/avro/syntax/SCollectionSyntax.scala @@ -64,7 +64,8 @@ class SCollectionOps[T <: IndexedRecord](private val self: SCollection[T]) exten shardNameTemplate: String = WriteParam.DefaultShardNameTemplate, tempDirectory: String = WriteParam.DefaultTempDirectory, filenamePolicySupplier: FilenamePolicySupplier = WriteParam.DefaultFilenamePolicySupplier, - prefix: String = WriteParam.DefaultPrefix + prefix: String = WriteParam.DefaultPrefix, + metadata: Map[String, String] = WriteParam.DefaultMetadata )(implicit ct: ClassTag[T], coder: Coder[T]): ClosedTap[T] = { val param = WriteParam( schema = schema, @@ -75,7 +76,8 @@ class SCollectionOps[T <: IndexedRecord](private val self: SCollection[T]) exten filenamePolicySupplier = filenamePolicySupplier, prefix = prefix, shardNameTemplate = shardNameTemplate, - tempDirectory = tempDirectory + tempDirectory = tempDirectory, + metadata = metadata ) self.write(ParquetAvroIO[T](path))(param) } diff --git a/scio-parquet/src/main/scala/com/spotify/scio/parquet/read/ReadSupportFactory.scala b/scio-parquet/src/main/scala/com/spotify/scio/parquet/read/ReadSupportFactory.scala index 47a161fb8b..ee2fe276bd 100644 --- a/scio-parquet/src/main/scala/com/spotify/scio/parquet/read/ReadSupportFactory.scala +++ b/scio-parquet/src/main/scala/com/spotify/scio/parquet/read/ReadSupportFactory.scala @@ -20,7 +20,7 @@ import com.spotify.parquet.tensorflow.TensorflowExampleReadSupport import magnolify.parquet.ParquetType import org.apache.parquet.avro.AvroReadSupport import org.apache.parquet.hadoop.api.ReadSupport -import org.tensorflow.proto.example.Example +import org.tensorflow.proto.Example sealed trait ReadSupportFactory[T] extends Serializable { def readSupport: ReadSupport[T] diff --git a/scio-parquet/src/main/scala/com/spotify/scio/parquet/tensorflow/ParquetExampleIO.scala b/scio-parquet/src/main/scala/com/spotify/scio/parquet/tensorflow/ParquetExampleIO.scala index 4f619d8d0d..3efca5477d 100644 --- a/scio-parquet/src/main/scala/com/spotify/scio/parquet/tensorflow/ParquetExampleIO.scala +++ b/scio-parquet/src/main/scala/com/spotify/scio/parquet/tensorflow/ParquetExampleIO.scala @@ -46,7 +46,7 @@ import org.apache.hadoop.mapreduce.Job import org.apache.parquet.filter2.predicate.FilterPredicate import org.apache.parquet.hadoop.{ParquetInputFormat, ParquetReader} import org.apache.parquet.hadoop.metadata.CompressionCodecName -import org.tensorflow.proto.example.{Example, Features} +import org.tensorflow.proto.{Example, Features} import org.tensorflow.metadata.v0.Schema import scala.jdk.CollectionConverters._ @@ -159,7 +159,8 @@ final case class ParquetExampleIO(path: String) extends ScioIO[Example] { shardNameTemplate: String, isWindowed: Boolean, tempDirectory: ResourceId, - isLocalRunner: Boolean + isLocalRunner: Boolean, + metadata: Map[String, String] ) = { 
require(tempDirectory != null, "tempDirectory must not be null") val fp = FilenamePolicySupplier.resolve( @@ -177,7 +178,8 @@ final case class ParquetExampleIO(path: String) extends ScioIO[Example] { dynamicDestinations, schema, job.getConfiguration, - compression + compression, + metadata.asJava ) val transform = WriteFiles.to(sink).withNumShards(numShards) if (!isWindowed) transform else transform.withWindowedWrites() @@ -197,7 +199,8 @@ final case class ParquetExampleIO(path: String) extends ScioIO[Example] { params.shardNameTemplate, ScioUtil.isWindowed(data), ScioUtil.tempDirOrDefault(params.tempDirectory, data.context), - ScioUtil.isLocalRunner(data.context.options.getRunner) + ScioUtil.isLocalRunner(data.context.options.getRunner), + params.metadata ) ) tap(ParquetExampleIO.ReadParam(params)) @@ -237,6 +240,7 @@ object ParquetExampleIO { val DefaultPrefix: String = null val DefaultShardNameTemplate: String = null val DefaultTempDirectory: String = null + val DefaultMetadata: Map[String, String] = null } final case class WriteParam private ( @@ -248,7 +252,8 @@ object ParquetExampleIO { filenamePolicySupplier: FilenamePolicySupplier = WriteParam.DefaultFilenamePolicySupplier, prefix: String = WriteParam.DefaultPrefix, shardNameTemplate: String = WriteParam.DefaultShardNameTemplate, - tempDirectory: String = WriteParam.DefaultTempDirectory + tempDirectory: String = WriteParam.DefaultTempDirectory, + metadata: Map[String, String] = WriteParam.DefaultMetadata ) } diff --git a/scio-parquet/src/main/scala/com/spotify/scio/parquet/tensorflow/dynamic/ParquetExampleSink.scala b/scio-parquet/src/main/scala/com/spotify/scio/parquet/tensorflow/dynamic/ParquetExampleSink.scala index 23692ffc41..08b5fd0109 100644 --- a/scio-parquet/src/main/scala/com/spotify/scio/parquet/tensorflow/dynamic/ParquetExampleSink.scala +++ b/scio-parquet/src/main/scala/com/spotify/scio/parquet/tensorflow/dynamic/ParquetExampleSink.scala @@ -22,15 +22,17 @@ import org.apache.beam.sdk.io.FileIO import org.apache.beam.sdk.io.hadoop.SerializableConfiguration import org.apache.parquet.hadoop.metadata.CompressionCodecName import org.apache.parquet.hadoop.ParquetWriter -import org.tensorflow.proto.example.Example +import org.tensorflow.proto.Example import org.tensorflow.metadata.v0.Schema import java.nio.channels.WritableByteChannel +import scala.jdk.CollectionConverters._ class ParquetExampleSink( val schema: Schema, val compression: CompressionCodecName, - val conf: SerializableConfiguration + val conf: SerializableConfiguration, + val metadata: Map[String, String] ) extends FileIO.Sink[Example] { private var writer: ParquetWriter[Example] = _ @@ -38,7 +40,7 @@ class ParquetExampleSink( override def open(channel: WritableByteChannel): Unit = { val outputFile = BeamOutputFile.of(channel) val builder = TensorflowExampleParquetWriter.builder(outputFile).withSchema(schema) - writer = WriterUtils.build(builder, conf.get, compression) + writer = WriterUtils.build(builder, conf.get, compression, metadata.asJava) } override def write(element: Example): Unit = writer.write(element) diff --git a/scio-parquet/src/main/scala/com/spotify/scio/parquet/tensorflow/dynamic/syntax/SCollectionSyntax.scala b/scio-parquet/src/main/scala/com/spotify/scio/parquet/tensorflow/dynamic/syntax/SCollectionSyntax.scala index f2a4ac2227..ff085c8092 100644 --- a/scio-parquet/src/main/scala/com/spotify/scio/parquet/tensorflow/dynamic/syntax/SCollectionSyntax.scala +++ 
b/scio-parquet/src/main/scala/com/spotify/scio/parquet/tensorflow/dynamic/syntax/SCollectionSyntax.scala @@ -25,7 +25,7 @@ import com.spotify.scio.values.SCollection import org.apache.beam.sdk.io.hadoop.SerializableConfiguration import org.apache.hadoop.conf.Configuration import org.apache.parquet.hadoop.metadata.CompressionCodecName -import org.tensorflow.proto.example.Example +import org.tensorflow.proto.Example import org.tensorflow.metadata.v0.Schema final class DynamicParquetExampleSCollectionOps( @@ -44,7 +44,8 @@ final class DynamicParquetExampleSCollectionOps( compression: CompressionCodecName = ParquetExampleIO.WriteParam.DefaultCompression, conf: Configuration = ParquetExampleIO.WriteParam.DefaultConfiguration, tempDirectory: String = ParquetExampleIO.WriteParam.DefaultTempDirectory, - prefix: String = ParquetExampleIO.WriteParam.DefaultPrefix + prefix: String = ParquetExampleIO.WriteParam.DefaultPrefix, + metadata: Map[String, String] = ParquetExampleIO.WriteParam.DefaultMetadata )( destinationFn: Example => String ): ClosedTap[Nothing] = { @@ -56,7 +57,8 @@ final class DynamicParquetExampleSCollectionOps( val sink = new ParquetExampleSink( schema, compression, - new SerializableConfiguration(ParquetConfiguration.ofNullable(conf)) + new SerializableConfiguration(ParquetConfiguration.ofNullable(conf)), + metadata ) val write = writeDynamic( path = path, diff --git a/scio-parquet/src/main/scala/com/spotify/scio/parquet/tensorflow/syntax/SCollectionSyntax.scala b/scio-parquet/src/main/scala/com/spotify/scio/parquet/tensorflow/syntax/SCollectionSyntax.scala index 340be599da..99d7d45889 100644 --- a/scio-parquet/src/main/scala/com/spotify/scio/parquet/tensorflow/syntax/SCollectionSyntax.scala +++ b/scio-parquet/src/main/scala/com/spotify/scio/parquet/tensorflow/syntax/SCollectionSyntax.scala @@ -24,7 +24,7 @@ import com.spotify.scio.util.FilenamePolicySupplier import com.spotify.scio.values.SCollection import org.apache.hadoop.conf.Configuration import org.apache.parquet.hadoop.metadata.CompressionCodecName -import org.tensorflow.proto.example.Example +import org.tensorflow.proto.Example import org.tensorflow.metadata.v0.Schema /** Enhanced version of [[SCollection]] with Parquet [[Example]] methods. 
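(A quick sketch of the new `metadata` parameter threaded through the Parquet writes in this diff, shown with the typed Parquet API; the case class, output path, and key-value pairs are illustrative, and it assumes magnolify can derive a `ParquetType` for the case class.)

import com.spotify.scio.ContextAndArgs
import com.spotify.scio.parquet.types._
import magnolify.parquet.ParquetType

object ParquetFooterMetadataJob {
  case class LogEntry(id: Long, message: String)
  // Derived via magnolify; also picked up implicitly by saveAsTypedParquetFile.
  implicit val logEntryType: ParquetType[LogEntry] = ParquetType[LogEntry]

  def main(cmdlineArgs: Array[String]): Unit = {
    val (sc, _) = ContextAndArgs(cmdlineArgs)

    sc.parallelize((1L to 10L).map(i => LogEntry(i, s"event-$i")))
      .saveAsTypedParquetFile(
        "gs://bucket/logs",                      // illustrative output path
        metadata = Map("pipeline" -> "example")  // written into the Parquet file footer
      )

    sc.run()
  }
}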
*/ @@ -41,7 +41,8 @@ final class SCollectionOps(private val self: SCollection[Example]) extends AnyVa shardNameTemplate: String = WriteParam.DefaultShardNameTemplate, tempDirectory: String = WriteParam.DefaultTempDirectory, filenamePolicySupplier: FilenamePolicySupplier = WriteParam.DefaultFilenamePolicySupplier, - prefix: String = WriteParam.DefaultPrefix + prefix: String = WriteParam.DefaultPrefix, + metadata: Map[String, String] = WriteParam.DefaultMetadata ): ClosedTap[Example] = self.write(ParquetExampleIO(path))( WriteParam( @@ -53,7 +54,8 @@ final class SCollectionOps(private val self: SCollection[Example]) extends AnyVa filenamePolicySupplier, prefix, shardNameTemplate, - tempDirectory + tempDirectory, + metadata ) ) } diff --git a/scio-parquet/src/main/scala/com/spotify/scio/parquet/tensorflow/syntax/ScioContextSyntax.scala b/scio-parquet/src/main/scala/com/spotify/scio/parquet/tensorflow/syntax/ScioContextSyntax.scala index 55225863e5..501b21f0a9 100644 --- a/scio-parquet/src/main/scala/com/spotify/scio/parquet/tensorflow/syntax/ScioContextSyntax.scala +++ b/scio-parquet/src/main/scala/com/spotify/scio/parquet/tensorflow/syntax/ScioContextSyntax.scala @@ -24,7 +24,7 @@ import com.spotify.scio.values.SCollection import org.apache.hadoop.conf.Configuration import org.apache.parquet.filter2.predicate.FilterPredicate import org.tensorflow.metadata.v0.Schema -import org.tensorflow.proto.example.Example +import org.tensorflow.proto.Example /** Enhanced version of [[ScioContext]] with Parquet [[Example]] methods. */ final class ScioContextOps(private val self: ScioContext) extends AnyVal { diff --git a/scio-parquet/src/main/scala/com/spotify/scio/parquet/types/ParquetTypeIO.scala b/scio-parquet/src/main/scala/com/spotify/scio/parquet/types/ParquetTypeIO.scala index 98c4c6ee7a..7acbcf8154 100644 --- a/scio-parquet/src/main/scala/com/spotify/scio/parquet/types/ParquetTypeIO.scala +++ b/scio-parquet/src/main/scala/com/spotify/scio/parquet/types/ParquetTypeIO.scala @@ -133,7 +133,8 @@ final case class ParquetTypeIO[T: ClassTag: Coder: ParquetType]( shardNameTemplate: String, isWindowed: Boolean, tempDirectory: ResourceId, - isLocalRunner: Boolean + isLocalRunner: Boolean, + metadata: Map[String, String] ) = { require(tempDirectory != null, "tempDirectory must not be null") val fp = FilenamePolicySupplier.resolve( @@ -151,7 +152,8 @@ final case class ParquetTypeIO[T: ClassTag: Coder: ParquetType]( dynamicDestinations, tpe, job.getConfiguration, - compression + compression, + metadata.asJava ) val transform = WriteFiles.to(sink).withNumShards(numShards) if (!isWindowed) transform else transform.withWindowedWrites() @@ -170,7 +172,8 @@ final case class ParquetTypeIO[T: ClassTag: Coder: ParquetType]( params.shardNameTemplate, ScioUtil.isWindowed(data), ScioUtil.tempDirOrDefault(params.tempDirectory, data.context), - ScioUtil.isLocalRunner(data.context.options.getRunner) + ScioUtil.isLocalRunner(data.context.options.getRunner), + params.metadata ) ) tap(ParquetTypeIO.ReadParam(params)) @@ -208,6 +211,7 @@ object ParquetTypeIO { val DefaultPrefix: String = null val DefaultShardNameTemplate: String = null val DefaultTempDirectory: String = null + val DefaultMetadata: Map[String, String] = null } final case class WriteParam private ( @@ -218,7 +222,8 @@ object ParquetTypeIO { filenamePolicySupplier: FilenamePolicySupplier = WriteParam.DefaultFilenamePolicySupplier, prefix: String = WriteParam.DefaultPrefix, shardNameTemplate: String = WriteParam.DefaultShardNameTemplate, - tempDirectory: String = 
WriteParam.DefaultTempDirectory + tempDirectory: String = WriteParam.DefaultTempDirectory, + metadata: Map[String, String] = WriteParam.DefaultMetadata ) } diff --git a/scio-parquet/src/main/scala/com/spotify/scio/parquet/types/syntax/SCollectionSyntax.scala b/scio-parquet/src/main/scala/com/spotify/scio/parquet/types/syntax/SCollectionSyntax.scala index 17f466c9a2..0e175b59d5 100644 --- a/scio-parquet/src/main/scala/com/spotify/scio/parquet/types/syntax/SCollectionSyntax.scala +++ b/scio-parquet/src/main/scala/com/spotify/scio/parquet/types/syntax/SCollectionSyntax.scala @@ -42,7 +42,8 @@ final class SCollectionOps[T](private val self: SCollection[T]) extends AnyVal { shardNameTemplate: String = WriteParam.DefaultShardNameTemplate, tempDirectory: String = WriteParam.DefaultTempDirectory, filenamePolicySupplier: FilenamePolicySupplier = WriteParam.DefaultFilenamePolicySupplier, - prefix: String = WriteParam.DefaultPrefix + prefix: String = WriteParam.DefaultPrefix, + metadata: Map[String, String] = WriteParam.DefaultMetadata )(implicit ct: ClassTag[T], coder: Coder[T], pt: ParquetType[T]): ClosedTap[T] = self.write(ParquetTypeIO[T](path))( WriteParam( @@ -53,7 +54,8 @@ final class SCollectionOps[T](private val self: SCollection[T]) extends AnyVal { filenamePolicySupplier, prefix, shardNameTemplate, - tempDirectory + tempDirectory, + metadata ) ) } diff --git a/scio-parquet/src/test/scala/com/spotify/scio/parquet/avro/ParquetAvroIOTest.scala b/scio-parquet/src/test/scala/com/spotify/scio/parquet/avro/ParquetAvroIOTest.scala index 73bb78829e..a85ce94483 100644 --- a/scio-parquet/src/test/scala/com/spotify/scio/parquet/avro/ParquetAvroIOTest.scala +++ b/scio-parquet/src/test/scala/com/spotify/scio/parquet/avro/ParquetAvroIOTest.scala @@ -22,7 +22,7 @@ import com.spotify.scio._ import com.spotify.scio.avro._ import com.spotify.scio.coders.Coder import com.spotify.scio.io.{ClosedTap, FileNamePolicySpec, ScioIOTest, TapSpec, TextIO} -import com.spotify.scio.parquet.ParquetConfiguration +import com.spotify.scio.parquet.{BeamInputFile, ParquetConfiguration} import com.spotify.scio.parquet.read.ParquetReadConfiguration import com.spotify.scio.testing._ import com.spotify.scio.util.FilenamePolicySupplier @@ -43,7 +43,9 @@ import org.apache.beam.sdk.options.PipelineOptionsFactory import org.apache.beam.sdk.transforms.windowing.{BoundedWindow, IntervalWindow, PaneInfo} import org.apache.commons.io.FileUtils import org.apache.hadoop.conf.Configuration +import org.apache.parquet.HadoopReadOptions import org.apache.parquet.avro.{AvroDataSupplier, AvroReadSupport, AvroWriteSupport} +import org.apache.parquet.hadoop.ParquetFileReader import org.joda.time.{DateTimeFieldType, Duration, Instant} import org.scalatest.BeforeAndAfterAll import org.scalatest.prop.TableDrivenPropertyChecks.{forAll => forAllCases, Table} @@ -378,6 +380,27 @@ class ParquetAvroIOTest extends ScioIOSpec with TapSpec with BeforeAndAfterAll { } } + it should "write extra metadata" in withTempDir { dir => + val sc = ScioContext() + val outDir = s"${dir.toPath.resolve("test-metadata").toFile.getAbsolutePath}" + + sc + .parallelize(1 to 10) + .map(x => new Account(x, x.toString, x.toString, x.toDouble, AccountStatus.Active)) + .saveAsParquetAvroFile(outDir, metadata = Map("foo" -> "bar", "bar" -> "baz"), numShards = 1) + sc.run() + + val options = HadoopReadOptions.builder(ParquetConfiguration.empty()).build + val r = + ParquetFileReader.open(BeamInputFile.of(s"$outDir/part-00000-of-00001.parquet"), options) + val metadata = 
r.getFileMetaData.getKeyValueMetaData + + metadata.get("foo") shouldBe "bar" + metadata.get("bar") shouldBe "baz" + + r.close() + } + class TestRecordProjection(@unused str: String) "tap" should "use projection schema and GenericDataSupplier" in { diff --git a/scio-parquet/src/test/scala/com/spotify/scio/parquet/dynamic/ParquetDynamicTest.scala b/scio-parquet/src/test/scala/com/spotify/scio/parquet/dynamic/ParquetDynamicTest.scala index 0c610cd956..9dd9dd06a8 100644 --- a/scio-parquet/src/test/scala/com/spotify/scio/parquet/dynamic/ParquetDynamicTest.scala +++ b/scio-parquet/src/test/scala/com/spotify/scio/parquet/dynamic/ParquetDynamicTest.scala @@ -75,7 +75,7 @@ trait ParquetDynamicTest extends PipelineSpec { class ParquetTensorflowDynamicTest extends ParquetDynamicTest { import com.google.protobuf.ByteString - import org.tensorflow.proto.example._ + import org.tensorflow.proto._ import org.tensorflow.metadata.{v0 => tfmd} import com.spotify.scio.parquet.tensorflow._ import com.spotify.scio.parquet.tensorflow.dynamic._ diff --git a/scio-parquet/src/test/scala/com/spotify/scio/parquet/tensorflow/ParquetExampleIOTest.scala b/scio-parquet/src/test/scala/com/spotify/scio/parquet/tensorflow/ParquetExampleIOTest.scala index 2423c6cc8b..2a0090f31d 100644 --- a/scio-parquet/src/test/scala/com/spotify/scio/parquet/tensorflow/ParquetExampleIOTest.scala +++ b/scio-parquet/src/test/scala/com/spotify/scio/parquet/tensorflow/ParquetExampleIOTest.scala @@ -20,16 +20,19 @@ package com.spotify.scio.parquet.tensorflow import com.google.protobuf.ByteString import com.spotify.scio.ScioContext import com.spotify.scio.io.{ClosedTap, FileNamePolicySpec, ScioIOTest, TapSpec} +import com.spotify.scio.parquet.{BeamInputFile, ParquetConfiguration} import com.spotify.scio.parquet.types._ import com.spotify.scio.testing.ScioIOSpec import com.spotify.scio.util.FilenamePolicySupplier import com.spotify.scio.values.SCollection import magnolify.parquet.ParquetType import org.apache.commons.io.FileUtils +import org.apache.parquet.HadoopReadOptions import org.apache.parquet.filter2.predicate.FilterApi +import org.apache.parquet.hadoop.ParquetFileReader import org.scalatest.BeforeAndAfterAll import org.tensorflow.metadata.{v0 => tfmd} -import org.tensorflow.proto.example._ +import org.tensorflow.proto._ import java.nio.file.Files import scala.jdk.CollectionConverters._ @@ -278,4 +281,30 @@ class ParquetExampleIOTest extends ScioIOSpec with TapSpec with BeforeAndAfterAl _.saveAsParquetExampleFile(_, schema) ) } + + it should "write extra metadata" in withTempDir { dir => + val sc = ScioContext() + val outDir = s"${dir.toPath.resolve("test-metadata").toFile.getAbsolutePath}" + + sc + .parallelize(1 to 10) + .map(newExample) + .saveAsParquetExampleFile( + outDir, + schema, + metadata = Map("foo" -> "bar", "bar" -> "baz"), + numShards = 1 + ) + sc.run() + + val options = HadoopReadOptions.builder(ParquetConfiguration.empty()).build + val r = + ParquetFileReader.open(BeamInputFile.of(s"$outDir/part-00000-of-00001.parquet"), options) + val metadata = r.getFileMetaData.getKeyValueMetaData + + metadata.get("foo") shouldBe "bar" + metadata.get("bar") shouldBe "baz" + + r.close() + } } diff --git a/scio-parquet/src/test/scala/com/spotify/scio/parquet/types/ParquetTypeIOTest.scala b/scio-parquet/src/test/scala/com/spotify/scio/parquet/types/ParquetTypeIOTest.scala index 3b0f4df783..bcd7ac423f 100644 --- a/scio-parquet/src/test/scala/com/spotify/scio/parquet/types/ParquetTypeIOTest.scala +++ 
b/scio-parquet/src/test/scala/com/spotify/scio/parquet/types/ParquetTypeIOTest.scala @@ -20,11 +20,14 @@ package com.spotify.scio.parquet.types import java.{lang => jl} import com.spotify.scio.ScioContext import com.spotify.scio.io.{ClosedTap, FileNamePolicySpec, ScioIOTest, TapSpec} +import com.spotify.scio.parquet.{BeamInputFile, ParquetConfiguration} import com.spotify.scio.testing.ScioIOSpec import com.spotify.scio.util.FilenamePolicySupplier import com.spotify.scio.values.SCollection import org.apache.commons.io.FileUtils +import org.apache.parquet.HadoopReadOptions import org.apache.parquet.filter2.predicate.FilterApi +import org.apache.parquet.hadoop.ParquetFileReader import org.scalatest.BeforeAndAfterAll import java.nio.file.Files @@ -139,6 +142,26 @@ class ParquetTypeIOTest extends ScioIOSpec with TapSpec with BeforeAndAfterAll { data should containInAnyOrder(expected) sc.run() } + + it should "write extra metadata" in withTempDir { dir => + val sc = ScioContext() + val outDir = s"${dir.toPath.resolve("test-metadata").toFile.getAbsolutePath}" + + sc + .parallelize(records) + .saveAsTypedParquetFile(outDir, metadata = Map("foo" -> "bar", "bar" -> "baz"), numShards = 1) + sc.run() + + val options = HadoopReadOptions.builder(ParquetConfiguration.empty()).build + val r = + ParquetFileReader.open(BeamInputFile.of(s"$outDir/part-00000-of-00001.parquet"), options) + val metadata = r.getFileMetaData.getKeyValueMetaData + + metadata.get("foo") shouldBe "bar" + metadata.get("bar") shouldBe "baz" + + r.close() + } } case class Wide(i: Int, s: String, o: Option[Int], r: List[Int]) diff --git a/scio-smb/src/main/java/org/apache/beam/sdk/extensions/smb/TensorFlowBucketIO.java b/scio-smb/src/main/java/org/apache/beam/sdk/extensions/smb/TensorFlowBucketIO.java index 9cf0f908c8..1a9e0e3cbf 100644 --- a/scio-smb/src/main/java/org/apache/beam/sdk/extensions/smb/TensorFlowBucketIO.java +++ b/scio-smb/src/main/java/org/apache/beam/sdk/extensions/smb/TensorFlowBucketIO.java @@ -31,7 +31,7 @@ import org.apache.beam.sdk.io.fs.ResourceId; import org.apache.beam.sdk.values.TupleTag; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; -import org.tensorflow.proto.example.Example; +import org.tensorflow.proto.Example; /** * API for reading and writing sorted-bucket TensorFlow TFRecord files with TensorFlow {@link @@ -43,7 +43,7 @@ public class TensorFlowBucketIO { // make sure tensorflow is part of the classpath static { try { - Class.forName("org.tensorflow.proto.example.Example"); + Class.forName("org.tensorflow.proto.Example"); } catch (ClassNotFoundException e) { throw new MissingImplementationException("tensorflow", e); } diff --git a/scio-smb/src/main/java/org/apache/beam/sdk/extensions/smb/TensorFlowBucketMetadata.java b/scio-smb/src/main/java/org/apache/beam/sdk/extensions/smb/TensorFlowBucketMetadata.java index aa98494241..abb8d57d1c 100644 --- a/scio-smb/src/main/java/org/apache/beam/sdk/extensions/smb/TensorFlowBucketMetadata.java +++ b/scio-smb/src/main/java/org/apache/beam/sdk/extensions/smb/TensorFlowBucketMetadata.java @@ -30,11 +30,11 @@ import org.apache.beam.sdk.transforms.display.DisplayData; import org.apache.beam.sdk.transforms.display.DisplayData.Builder; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; -import org.tensorflow.proto.example.BytesList; -import org.tensorflow.proto.example.Example; -import org.tensorflow.proto.example.Feature; -import org.tensorflow.proto.example.FloatList; -import 
org.tensorflow.proto.example.Int64List; +import org.tensorflow.proto.BytesList; +import org.tensorflow.proto.Example; +import org.tensorflow.proto.Feature; +import org.tensorflow.proto.FloatList; +import org.tensorflow.proto.Int64List; /** {@link BucketMetadata} for TensorFlow {@link Example} records. */ public class TensorFlowBucketMetadata extends BucketMetadata { diff --git a/scio-smb/src/main/java/org/apache/beam/sdk/extensions/smb/TensorFlowFileOperations.java b/scio-smb/src/main/java/org/apache/beam/sdk/extensions/smb/TensorFlowFileOperations.java index ba25bdf9b6..efbd7fc81b 100644 --- a/scio-smb/src/main/java/org/apache/beam/sdk/extensions/smb/TensorFlowFileOperations.java +++ b/scio-smb/src/main/java/org/apache/beam/sdk/extensions/smb/TensorFlowFileOperations.java @@ -27,7 +27,7 @@ import org.apache.beam.sdk.io.FileIO; import org.apache.beam.sdk.io.TFRecordIO; import org.apache.beam.sdk.util.MimeTypes; -import org.tensorflow.proto.example.Example; +import org.tensorflow.proto.Example; /** * {@link org.apache.beam.sdk.extensions.smb.FileOperations} implementation for TensorFlow TFRecord diff --git a/scio-smb/src/test/java/org/apache/beam/sdk/extensions/smb/TensorFlowBucketMetadataTest.java b/scio-smb/src/test/java/org/apache/beam/sdk/extensions/smb/TensorFlowBucketMetadataTest.java index 1a84aad329..f55d627cd8 100644 --- a/scio-smb/src/test/java/org/apache/beam/sdk/extensions/smb/TensorFlowBucketMetadataTest.java +++ b/scio-smb/src/test/java/org/apache/beam/sdk/extensions/smb/TensorFlowBucketMetadataTest.java @@ -33,12 +33,12 @@ import org.hamcrest.MatcherAssert; import org.junit.Assert; import org.junit.Test; -import org.tensorflow.proto.example.BytesList; -import org.tensorflow.proto.example.Example; -import org.tensorflow.proto.example.Feature; -import org.tensorflow.proto.example.Features; -import org.tensorflow.proto.example.FloatList; -import org.tensorflow.proto.example.Int64List; +import org.tensorflow.proto.BytesList; +import org.tensorflow.proto.Example; +import org.tensorflow.proto.Feature; +import org.tensorflow.proto.Features; +import org.tensorflow.proto.FloatList; +import org.tensorflow.proto.Int64List; /** Unit tests for {@link TensorFlowBucketMetadata}. */ public class TensorFlowBucketMetadataTest { diff --git a/scio-smb/src/test/java/org/apache/beam/sdk/extensions/smb/TensorFlowFileOperationsTest.java b/scio-smb/src/test/java/org/apache/beam/sdk/extensions/smb/TensorFlowFileOperationsTest.java index 3427401e2f..5dd6557eeb 100644 --- a/scio-smb/src/test/java/org/apache/beam/sdk/extensions/smb/TensorFlowFileOperationsTest.java +++ b/scio-smb/src/test/java/org/apache/beam/sdk/extensions/smb/TensorFlowFileOperationsTest.java @@ -35,12 +35,12 @@ import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import org.tensorflow.proto.example.BytesList; -import org.tensorflow.proto.example.Example; -import org.tensorflow.proto.example.Feature; -import org.tensorflow.proto.example.Features; -import org.tensorflow.proto.example.FloatList; -import org.tensorflow.proto.example.Int64List; +import org.tensorflow.proto.BytesList; +import org.tensorflow.proto.Example; +import org.tensorflow.proto.Feature; +import org.tensorflow.proto.Features; +import org.tensorflow.proto.FloatList; +import org.tensorflow.proto.Int64List; /** Unit tests for {@link TensorFlowFileOperations}. 
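(The Parquet IO tests above verify the new footer metadata by reopening the written files; outside of tests, the same pattern can be used to inspect any Parquet footer. A short sketch, with an illustrative path.)

import com.spotify.scio.parquet.{BeamInputFile, ParquetConfiguration}
import org.apache.parquet.HadoopReadOptions
import org.apache.parquet.hadoop.ParquetFileReader

import scala.jdk.CollectionConverters._

object InspectParquetFooter {
  def main(args: Array[String]): Unit = {
    val path = "gs://bucket/logs/part-00000-of-00001.parquet" // illustrative
    val options = HadoopReadOptions.builder(ParquetConfiguration.empty()).build
    val reader = ParquetFileReader.open(BeamInputFile.of(path), options)
    try {
      // Key-value pairs written via the `metadata` parameter end up in the file footer.
      reader.getFileMetaData.getKeyValueMetaData.asScala.foreach { case (k, v) =>
        println(s"$k=$v")
      }
    } finally {
      reader.close()
    }
  }
}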
*/ public class TensorFlowFileOperationsTest { diff --git a/scio-tensorflow/src/main/scala/com/spotify/scio/tensorflow/TFRecordIO.scala b/scio-tensorflow/src/main/scala/com/spotify/scio/tensorflow/TFRecordIO.scala index 7d157c756e..f9767de053 100644 --- a/scio-tensorflow/src/main/scala/com/spotify/scio/tensorflow/TFRecordIO.scala +++ b/scio-tensorflow/src/main/scala/com/spotify/scio/tensorflow/TFRecordIO.scala @@ -29,9 +29,11 @@ import org.apache.beam.sdk.io.{ WriteFiles } import org.apache.beam.sdk.{io => beam} -import org.tensorflow.proto.example.{Example, SequenceExample} +import org.tensorflow.proto.{Example, SequenceExample} import com.spotify.scio.io.TapT +import com.spotify.scio.tensorflow.TFExampleIO.ReadParam import com.spotify.scio.util.FilenamePolicySupplier +import magnolify.tensorflow.ExampleType import org.apache.beam.sdk.io.fs.ResourceId import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider import org.apache.beam.sdk.transforms.SerializableFunctions @@ -94,6 +96,59 @@ object TFRecordIO { ) } +object TFExampleTypedIO { + type ReadParam = TFRecordIO.ReadParam + val ReadParam = TFRecordIO.ReadParam + type WriteParam = TFRecordIO.WriteParam + val WriteParam = TFRecordIO.WriteParam +} + +final case class TFExampleTypedIO[T: ExampleType: Coder](path: String) extends ScioIO[T] { + override type WriteP = TFExampleTypedIO.WriteParam + override type ReadP = TFExampleTypedIO.ReadParam + override val tapT: TapT.Aux[T, T] = TapOf[T] + + override def testId: String = s"TFExampleTypedIO($path)" + + override protected def write(data: SCollection[T], params: WriteP): Tap[T] = { + val exampleType: ExampleType[T] = implicitly + data.transform_ { scoll => + scoll + .map(t => exampleType(t).toByteArray) + .applyInternal( + TFRecordMethods.tfWrite( + path, + params.suffix, + params.numShards, + params.compression, + params.filenamePolicySupplier, + params.prefix, + params.shardNameTemplate, + ScioUtil.isWindowed(data), + ScioUtil.tempDirOrDefault(params.tempDirectory, data.context) + ) + ) + } + tap(TFExampleIO.ReadParam(params)) + } + + override def tap(params: ReadP): Tap[T] = { + val exampleType: ExampleType[T] = implicitly + TFRecordMethods + .tap(path, params) + .map(bytes => exampleType(Example.parseFrom(bytes))) + } + + override protected def read(sc: ScioContext, params: ReadParam): SCollection[T] = { + val exampleType: ExampleType[T] = implicitly + sc.transform { ctx => + TFRecordMethods + .read(ctx, path, params) + .map(bytes => exampleType(Example.parseFrom(bytes))) + } + } +} + final case class TFExampleIO(path: String) extends ScioIO[Example] { override type ReadP = TFExampleIO.ReadParam override type WriteP = TFExampleIO.WriteParam @@ -162,7 +217,7 @@ private object TFRecordMethods { ) } - private def tfWrite( + private[scio] def tfWrite( path: String, suffix: String, numShards: Int, diff --git a/scio-tensorflow/src/main/scala/com/spotify/scio/tensorflow/syntax/SCollectionSyntax.scala b/scio-tensorflow/src/main/scala/com/spotify/scio/tensorflow/syntax/SCollectionSyntax.scala index 21fa619837..1e94e82649 100644 --- a/scio-tensorflow/src/main/scala/com/spotify/scio/tensorflow/syntax/SCollectionSyntax.scala +++ b/scio-tensorflow/src/main/scala/com/spotify/scio/tensorflow/syntax/SCollectionSyntax.scala @@ -18,16 +18,50 @@ package com.spotify.scio.tensorflow.syntax import org.apache.beam.sdk.io.Compression -import org.tensorflow.proto.example.{Example, SequenceExample} +import org.tensorflow.proto.{Example, SequenceExample} +import com.spotify.scio.coders.Coder import 
com.spotify.scio.io.ClosedTap -import com.spotify.scio.tensorflow.{TFExampleIO, TFRecordIO, TFSequenceExampleIO} +import com.spotify.scio.tensorflow.{TFExampleIO, TFExampleTypedIO, TFRecordIO, TFSequenceExampleIO} import com.spotify.scio.util.FilenamePolicySupplier import com.spotify.scio.values.SCollection +import magnolify.tensorflow.ExampleType + +final class TypedExampleSCollectionOps[T](private val self: SCollection[T]) { + + /** + * Converts this collection of `T` into TensorFlow [[org.tensorflow.proto.Example]]s with + * the provided [[magnolify.tensorflow.ExampleType]], then saves these as a TensorFlow TFRecord + * file. + */ + def saveAsTfRecordFile( + path: String, + suffix: String = TFExampleIO.WriteParam.DefaultSuffix, + compression: Compression = TFExampleIO.WriteParam.DefaultCompression, + numShards: Int = TFExampleIO.WriteParam.DefaultNumShards, + shardNameTemplate: String = TFExampleIO.WriteParam.DefaultShardNameTemplate, + tempDirectory: String = TFExampleIO.WriteParam.DefaultTempDirectory, + filenamePolicySupplier: FilenamePolicySupplier = + TFExampleIO.WriteParam.DefaultFilenamePolicySupplier, + prefix: String = TFExampleIO.WriteParam.DefaultPrefix + )(implicit exampleType: ExampleType[T]): ClosedTap[T] = { + implicit val tCoder: Coder[T] = self.coder + val param = TFExampleTypedIO.WriteParam( + suffix, + compression, + numShards, + filenamePolicySupplier, + prefix, + shardNameTemplate, + tempDirectory + ) + self.write(TFExampleTypedIO(path))(param) + } +} final class ExampleSCollectionOps[T <: Example](private val self: SCollection[T]) extends AnyVal { /** - * Saves this SCollection of `org.tensorflow.proto.example.Example` as a TensorFlow TFRecord file. + * Saves this SCollection of `org.tensorflow.proto.Example` as a TensorFlow TFRecord file. * * @return */ @@ -97,9 +131,8 @@ final class TFRecordSCollectionOps[T <: Array[Byte]](private val self: SCollecti /** * Save this SCollection as a TensorFlow TFRecord file. Note that elements must be of type - * `Array[Byte]`. The recommended record encoding is `org.tensorflow.proto.example.Example` - * protocol buffers (which contain `org.tensorflow.proto.example.Features` as a field) serialized - * as bytes. + * `Array[Byte]`. The recommended record encoding is `org.tensorflow.proto.Example` protocol + * buffers (which contain `org.tensorflow.proto.Features` as a field) serialized as bytes. * * @group output */ @@ -131,8 +164,7 @@ final class SequenceExampleSCollectionOps[T <: SequenceExample](private val self extends AnyVal { /** - * Saves this SCollection of `org.tensorflow.proto.example.SequenceExample` as a TensorFlow - * TFRecord file. + * Saves this SCollection of `org.tensorflow.proto.SequenceExample` as a TensorFlow TFRecord file.
* * @return */ @@ -193,4 +225,8 @@ trait SCollectionSyntax { implicit def tensorFlowSequenceExampleSCollectionOps[T <: SequenceExample]( s: SCollection[T] ): SequenceExampleSCollectionOps[T] = new SequenceExampleSCollectionOps(s) + + implicit def tensorFlowTypedExampleSCollectionOps[T]( + s: SCollection[T] + ): TypedExampleSCollectionOps[T] = new TypedExampleSCollectionOps(s) } diff --git a/scio-tensorflow/src/main/scala/com/spotify/scio/tensorflow/syntax/ScioContextSyntax.scala b/scio-tensorflow/src/main/scala/com/spotify/scio/tensorflow/syntax/ScioContextSyntax.scala index 7ecedb47ce..e4f7724287 100644 --- a/scio-tensorflow/src/main/scala/com/spotify/scio/tensorflow/syntax/ScioContextSyntax.scala +++ b/scio-tensorflow/src/main/scala/com/spotify/scio/tensorflow/syntax/ScioContextSyntax.scala @@ -18,20 +18,34 @@ package com.spotify.scio.tensorflow.syntax import java.nio.file.Files - import com.spotify.scio.ScioContext -import com.spotify.scio.tensorflow.{TFExampleIO, TFRecordIO, TFSequenceExampleIO} +import com.spotify.scio.coders.Coder +import com.spotify.scio.tensorflow.{TFExampleIO, TFExampleTypedIO, TFRecordIO, TFSequenceExampleIO} import com.spotify.scio.values.{DistCache, SCollection} +import magnolify.tensorflow.ExampleType import org.apache.beam.sdk.io.Compression -import org.tensorflow.proto.example.{Example, SequenceExample} +import org.tensorflow.proto.{Example, SequenceExample} import org.tensorflow.metadata.v0._ final class ScioContextOps(private val self: ScioContext) extends AnyVal { + /** + * Get an SCollection for a TensorFlow TFRecord file. Records are read back as TensorFlow + * [[org.tensorflow.proto.Example]]s, then mapped to the user type `T` with the implicit + * [[magnolify.tensorflow.ExampleType]]. + * + * @group input + */ + def typedTfRecordFile[T: ExampleType: Coder]( + path: String, + compression: Compression = Compression.AUTO + ): SCollection[T] = + self.read(TFExampleTypedIO(path))(TFExampleTypedIO.ReadParam(compression)) + /** * Get an SCollection for a TensorFlow TFRecord file. Note that TFRecord files are not splittable. - * The recommended record encoding is [[org.tensorflow.proto.example.Example]] protocol buffers - * (which contain [[org.tensorflow.proto.example.Features]] as a field) serialized as bytes. + * The recommended record encoding is [[org.tensorflow.proto.Example]] protocol buffers (which + * contain [[org.tensorflow.proto.Features]] as a field) serialized as bytes. * @group input */ def tfRecordFile( @@ -41,8 +55,8 @@ final class ScioContextOps(private val self: ScioContext) extends AnyVal { self.read(TFRecordIO(path))(TFRecordIO.ReadParam(compression)) /** - * Get an SCollection of [[org.tensorflow.proto.example.Example]] from a TensorFlow TFRecord file - * encoded as serialized [[org.tensorflow.proto.example.Example]] protocol buffers. + * Get an SCollection of [[org.tensorflow.proto.Example]] from a TensorFlow TFRecord file encoded + * as serialized [[org.tensorflow.proto.Example]] protocol buffers. * @group input */ def tfRecordExampleFile( @@ -52,9 +66,8 @@ final class ScioContextOps(private val self: ScioContext) extends AnyVal { self.read(TFExampleIO(path))(TFExampleIO.ReadParam(compression)) /** - * Get an SCollection of [[org.tensorflow.proto.example.SequenceExample]] from a TensorFlow - * TFRecord file encoded as serialized [[org.tensorflow.proto.example.SequenceExample]] protocol - * buffers.
+ * Get an SCollection of [[org.tensorflow.proto.SequenceExample]] from a TensorFlow TFRecord file + * encoded as serialized [[org.tensorflow.proto.SequenceExample]] protocol buffers. * @group input */ def tfRecordSequenceExampleFile( @@ -64,9 +77,9 @@ final class ScioContextOps(private val self: ScioContext) extends AnyVal { self.read(TFSequenceExampleIO(path))(TFExampleIO.ReadParam(compression)) /** - * Get an SCollection of [[org.tensorflow.proto.example.Example]] from a TensorFlow TFRecord file - * encoded as serialized [[org.tensorflow.proto.example.Example]] protocol buffers, along with the - * remotely stored [[org.tensorflow.metadata.v0.Schema]] object available in a DistCache. + * Get an SCollection of [[org.tensorflow.proto.Example]] from a TensorFlow TFRecord file encoded + * as serialized [[org.tensorflow.proto.Example]] protocol buffers, along with the remotely stored + * [[org.tensorflow.metadata.v0.Schema]] object available in a DistCache. * @group input */ def tfRecordExampleFileWithSchema( @@ -82,9 +95,9 @@ final class ScioContextOps(private val self: ScioContext) extends AnyVal { } /** - * Get an SCollection of [[org.tensorflow.proto.example.Example]] from a TensorFlow TFRecord file - * encoded as serialized [[org.tensorflow.proto.example.Example]] protocol buffers, along with the - * remotely stored [[org.tensorflow.metadata.v0.Schema]] object available in a DistCache. + * Get an SCollection of [[org.tensorflow.proto.Example]] from a TensorFlow TFRecord file encoded + * as serialized [[org.tensorflow.proto.Example]] protocol buffers, along with the remotely stored + * [[org.tensorflow.metadata.v0.Schema]] object available in a DistCache. * @group input */ def tfRecordSequenceExampleFileWithSchema( diff --git a/scio-tensorflow/src/test/scala/com/spotify/scio/tensorflow/MetadataSchemaTest.scala b/scio-tensorflow/src/test/scala/com/spotify/scio/tensorflow/MetadataSchemaTest.scala index f5083cc931..d5e9552f78 100644 --- a/scio-tensorflow/src/test/scala/com/spotify/scio/tensorflow/MetadataSchemaTest.scala +++ b/scio-tensorflow/src/test/scala/com/spotify/scio/tensorflow/MetadataSchemaTest.scala @@ -18,7 +18,7 @@ package com.spotify.scio.tensorflow import com.google.protobuf.ByteString -import org.tensorflow.proto.example._ +import org.tensorflow.proto._ import scala.jdk.CollectionConverters._ diff --git a/scio-tensorflow/src/test/scala/com/spotify/scio/tensorflow/TFExampleIOTest.scala b/scio-tensorflow/src/test/scala/com/spotify/scio/tensorflow/TFExampleIOTest.scala index 743972c5a8..4fb475079a 100644 --- a/scio-tensorflow/src/test/scala/com/spotify/scio/tensorflow/TFExampleIOTest.scala +++ b/scio-tensorflow/src/test/scala/com/spotify/scio/tensorflow/TFExampleIOTest.scala @@ -22,7 +22,7 @@ import com.spotify.scio.testing._ import com.spotify.scio.util.FilenamePolicySupplier import com.spotify.scio.values.SCollection import magnolify.tensorflow._ -import org.tensorflow.proto.example.Example +import org.tensorflow.proto.Example object TFExampleIOTest { case class Record(i: Int, s: String) @@ -39,6 +39,13 @@ class TFExampleIOTest extends ScioIOSpec { testTap(xs)(_.saveAsTfRecordFile(_))(".tfrecords") testJobTest(xs)(TFExampleIO(_))(_.tfRecordExampleFile(_))(_.saveAsTfRecordFile(_)) } + + it should "work with typed records" in { + val xs = (1 to 100).map(x => Record(x, x.toString)) + implicit val exampleType: ExampleType[Record] = recordT + testTap(xs)(_.saveAsTfRecordFile(_))(".tfrecords") + testJobTest(xs)(TFExampleTypedIO(_))(_.typedTfRecordFile(_))(_.saveAsTfRecordFile(_)) + } 
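As a usage sketch (not part of the patch): the typed read/write path exercised by the new test could be driven from a pipeline as below. The record type, field names, and GCS paths are hypothetical; `Long` and `Float` are chosen because they map directly onto `Example`'s native int64/float features, and the `ExampleType` is derived by Magnolify.

```scala
import com.spotify.scio.ScioContext
import com.spotify.scio.tensorflow._
import com.spotify.scio.values.SCollection
import magnolify.tensorflow._

// Hypothetical record type with fields that map onto Example's native feature types
case class Click(userId: Long, score: Float)

// Magnolify-derived converter between Click and org.tensorflow.proto.Example
implicit val clickType: ExampleType[Click] = ExampleType[Click]

val sc: ScioContext = ???

// Read serialized Examples and map each one to a Click via the implicit ExampleType
val clicks: SCollection[Click] = sc.typedTfRecordFile[Click]("gs://input-bucket/clicks/*.tfrecords")

// Map Clicks back to Examples and save them as a TFRecord file
clicks.saveAsTfRecordFile("gs://output-bucket/clicks")
```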
} class TFExampleIOFileNamePolicyTest extends FileNamePolicySpec[Example] { @@ -65,7 +72,7 @@ class TFExampleIOFileNamePolicyTest extends FileNamePolicySpec[Example] { _.map(x => recordT(Record(x, x.toString))).saveAsTfRecordFile( "nonsense", shardNameTemplate = "SSS-of-NNN", - filenamePolicySupplier = testFilenamePolicySupplier + filenamePolicySupplier = testFilenamePolicySupplier(_, _) ) ) } diff --git a/scio-tensorflow/src/test/scala/com/spotify/scio/tensorflow/TFSequenceExampleIOTest.scala b/scio-tensorflow/src/test/scala/com/spotify/scio/tensorflow/TFSequenceExampleIOTest.scala index 1d8d73ca14..ac57fab381 100644 --- a/scio-tensorflow/src/test/scala/com/spotify/scio/tensorflow/TFSequenceExampleIOTest.scala +++ b/scio-tensorflow/src/test/scala/com/spotify/scio/tensorflow/TFSequenceExampleIOTest.scala @@ -19,7 +19,7 @@ package com.spotify.scio.tensorflow import com.google.protobuf.ByteString import com.spotify.scio.testing.ScioIOSpec -import org.tensorflow.proto.example._ +import org.tensorflow.proto._ import scala.jdk.CollectionConverters._ diff --git a/scio-test/core/src/main/scala/com/spotify/scio/testing/SCollectionMatchers.scala b/scio-test/core/src/main/scala/com/spotify/scio/testing/SCollectionMatchers.scala index 2f825b408f..f6e062de72 100644 --- a/scio-test/core/src/main/scala/com/spotify/scio/testing/SCollectionMatchers.scala +++ b/scio-test/core/src/main/scala/com/spotify/scio/testing/SCollectionMatchers.scala @@ -326,7 +326,7 @@ trait SCollectionMatchers extends EqInstances { ): Matcher[T] = matcher.matcher(_.inEarlyGlobalWindowPanes) - /** Assert that the SCollection in question contains the provided elements. */ + /** Assert that the SCollection in question contains exactly the provided elements. */ def containInAnyOrder[T: Coder: Eq]( value: Iterable[T] ): IterableMatcher[SCollection[T], T] = @@ -443,8 +443,6 @@ trait SCollectionMatchers extends EqInstances { } } - // TODO: investigate why multi-map doesn't work - /** Assert that the SCollection in question satisfies the provided function. 
*/ def satisfy[T: Coder: Eq]( predicate: Iterable[T] => Boolean diff --git a/scio-test/core/src/main/scala/com/spotify/scio/testing/ScioIOSpec.scala b/scio-test/core/src/main/scala/com/spotify/scio/testing/ScioIOSpec.scala index 183c8fd0d4..657cc884d3 100644 --- a/scio-test/core/src/main/scala/com/spotify/scio/testing/ScioIOSpec.scala +++ b/scio-test/core/src/main/scala/com/spotify/scio/testing/ScioIOSpec.scala @@ -149,30 +149,34 @@ trait ScioIOSpec extends PipelineSpec { } def testJobTest[T: Coder](xs: Seq[T], in: String = "in", out: String = "out")( - ioFn: String => ScioIO[T] + ioFn: String => ScioIO[T], + optOutIOFn: Option[String => ScioIO[T]] = None )( readFn: (ScioContext, String) => SCollection[T] )( writeFn: (SCollection[T], String) => ClosedTap[_] ): Unit = { + val inIO = ioFn(in) + val outIO = optOutIOFn.map(outIoFn => outIoFn(out)).getOrElse(ioFn(out)) + val testJob = (sc: ScioContext) => writeFn(readFn(sc, in), out) JobTest(testJob) - .input(ioFn(in), xs) - .output(ioFn(out))(_ should containInAnyOrder(xs)) + .input(inIO, xs) + .output(outIO)(_ should containInAnyOrder(xs)) .run() the[IllegalArgumentException] thrownBy { JobTest(testJob) .input(CustomIO[T](in), xs) - .output(ioFn(out))(_ should containInAnyOrder(xs)) + .output(outIO)(_ should containInAnyOrder(xs)) .run() - } should have message s"requirement failed: Missing test input: ${ioFn(in).testId}, available: [CustomIO($in)]" + } should have message s"requirement failed: Missing test input: ${inIO.testId}, available: [CustomIO($in)]" the[IllegalArgumentException] thrownBy { JobTest(testJob) - .input(ioFn(in), xs) + .input(inIO, xs) .output(CustomIO[T](out))(_ should containInAnyOrder(xs)) .run() - } should have message s"requirement failed: Missing test output: ${ioFn(out).testId}, available: [CustomIO($out)]" + } should have message s"requirement failed: Missing test output: ${outIO.testId}, available: [CustomIO($out)]" } } diff --git a/scio-test/core/src/test/scala/com/spotify/scio/testing/SCollectionMatchersTest.scala b/scio-test/core/src/test/scala/com/spotify/scio/testing/SCollectionMatchersTest.scala index 9c7c10c8be..43e6b3cf25 100644 --- a/scio-test/core/src/test/scala/com/spotify/scio/testing/SCollectionMatchersTest.scala +++ b/scio-test/core/src/test/scala/com/spotify/scio/testing/SCollectionMatchersTest.scala @@ -108,6 +108,9 @@ class SCollectionMatchersTest extends PipelineSpec { // lambda ser/de runWithContext(_.parallelize(Seq(newTR(1))) should containInAnyOrder(Seq(newTR(1)))) runWithContext(_.parallelize(Seq(newTR(1))) shouldNot containInAnyOrder(Seq(newTR(2)))) + runWithContext( + _.parallelize(Seq(newTR(1))) should satisfy[TestRecord](_.toList.contains(newTR(1))) + ) } it should "support containsInAnyOrder containing %" in { diff --git a/scio-test/parquet/src/main/scala/com/spotify/scio/testing/parquet/tensorflow/package.scala b/scio-test/parquet/src/main/scala/com/spotify/scio/testing/parquet/tensorflow/package.scala index 24e1578372..14854b45dd 100644 --- a/scio-test/parquet/src/main/scala/com/spotify/scio/testing/parquet/tensorflow/package.scala +++ b/scio-test/parquet/src/main/scala/com/spotify/scio/testing/parquet/tensorflow/package.scala @@ -27,7 +27,7 @@ import org.apache.hadoop.conf.Configuration import org.apache.parquet.filter2.predicate.FilterPredicate import org.apache.parquet.hadoop.ParquetInputFormat import org.tensorflow.metadata.{v0 => tfmd} -import org.tensorflow.proto.example.Example +import org.tensorflow.proto.Example package object tensorflow { implicit def 
toParquetExampleHelpers( diff --git a/scio-test/parquet/src/test/scala/com/spotify/scio/testing/parquet/ParquetTestUtilsTest.scala b/scio-test/parquet/src/test/scala/com/spotify/scio/testing/parquet/ParquetTestUtilsTest.scala index 575dfe43ae..45d33e7c6f 100644 --- a/scio-test/parquet/src/test/scala/com/spotify/scio/testing/parquet/ParquetTestUtilsTest.scala +++ b/scio-test/parquet/src/test/scala/com/spotify/scio/testing/parquet/ParquetTestUtilsTest.scala @@ -24,7 +24,7 @@ import org.apache.parquet.filter2.predicate.FilterApi import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers import org.tensorflow.metadata.{v0 => tfmd} -import org.tensorflow.proto.example._ +import org.tensorflow.proto._ import scala.jdk.CollectionConverters._ diff --git a/site/src/main/paradox/FAQ.md b/site/src/main/paradox/FAQ.md index 2fe9536556..740e1f43d4 100644 --- a/site/src/main/paradox/FAQ.md +++ b/site/src/main/paradox/FAQ.md @@ -484,7 +484,7 @@ If you encounter an SBT error with message "Initial heap size set to a larger va You might get an error message like `java.io.IOException: Unable to create parent directories of /Applications/IntelliJ IDEA CE.app/Contents/bin/.bigquery/012345abcdef.schema.json`. This usually happens to people who run IntelliJ IDEA with its bundled JVM. There are two solutions. -- Install JDK from [java.com](https://www.java.com/) and switch to it by following the "All platforms: switch between installed runtimes" section in this [page](https://intellij-support.jetbrains.com/hc/en-us/articles/206544879-Selecting-the-JDK-version-the-IDE-will-run-under). +- Install JDK from a vendor like [adoptium](https://adoptium.net/index.html) or [corretto](https://aws.amazon.com/corretto/) and switch to it by following the "All platforms: switch between installed runtimes" section in this [page](https://intellij-support.jetbrains.com/hc/en-us/articles/206544879-Selecting-the-JDK-version-the-IDE-will-run-under). - Override the bigquery `.cache` directory as a JVM compiler parameter. On the bottom right of the IntelliJ window, click the icon that looks like a clock, and then "Configure...". Then, edit the JVM parameters to include the line `-Dbigquery.cache.directory=/.bigquery`. Then, restart the compile server by clicking on the clock icon -> Stop, and then Start. #### How to make IntelliJ IDEA work with type safe BigQuery classes? diff --git a/site/src/main/paradox/io/Iceberg.md b/site/src/main/paradox/io/Iceberg.md new file mode 100644 index 0000000000..c01adcd865 --- /dev/null +++ b/site/src/main/paradox/io/Iceberg.md @@ -0,0 +1,47 @@ +# Iceberg + +Scio supports reading from and writing to [Apache Iceberg](https://iceberg.apache.org/) via Beam's @ref[Managed transforms](Managed.md). +[Magnolify's](https://github.com/spotify/magnolify) `RowType` (available as part of the `magnolify-beam` artifact) provides automatically-derived mappings between scala case classes and Beam's `Row`, used by the underlying managed transform. See [full documentation here](https://github.com/spotify/magnolify/blob/main/docs/beam.md). + +To read: + +```scala mdoc:compile-only +import com.spotify.scio.ScioContext +import com.spotify.scio.iceberg._ +import com.spotify.scio.values.SCollection +import magnolify.beam._ + +case class Record(a: Int, b: String) +implicit val rt: RowType[Record] = RowType[Record] + +val sc: ScioContext = ??? +val table: String = ??? +val catalogName: String = ??? +val catalogConfig: Map[String, String] = ??? 
+ +val records: SCollection[Record] = sc.iceberg[Record]( + table, + catalogName, + catalogConfig +) +``` + +To write: + +```scala mdoc:invisible +import com.spotify.scio.iceberg._ +import com.spotify.scio.values.SCollection +import magnolify.beam._ +case class Record(a: Int, b: String) +implicit val rt: RowType[Record] = RowType[Record] +``` + +```scala mdoc:compile-only +val records: SCollection[Record] = ??? + +val table: String = ??? +val catalogName: String = ??? +val catalogConfig: Map[String, String] = ??? + +records.saveAsIceberg(table, catalogName, catalogConfig) +``` diff --git a/site/src/main/paradox/io/Managed.md b/site/src/main/paradox/io/Managed.md new file mode 100644 index 0000000000..c6f2f6275a --- /dev/null +++ b/site/src/main/paradox/io/Managed.md @@ -0,0 +1,74 @@ +# Managed IO + +Beam's Managed transforms move responsibility for the creation of transform classes from user code to the runner, allowing runner-specific optimizations like hot-swapping an instance of a transform with an updated one. +Beam currently supports Iceberg and Kafka managed transforms. +See also [Dataflow's supported transforms](https://cloud.google.com/dataflow/docs/guides/managed-io). + +A Scio @ref:[Coder](../internals/Coders.md) must be defined for the Beam @javadoc[Row](org.apache.beam.sdk.values.Row), derived from the Beam @javadoc[Schema](org.apache.beam.sdk.schemas.Schema) expected from the datasource. +If you have more than one type of data being read into Beam Rows, you will need to provide the coders explicitly instead of implicitly. + +The source and sink parameters should be imported from Beam's @javadoc[Managed](org.apache.beam.sdk.managed.Managed). + +```scala mdoc:compile-only +import com.spotify.scio.ScioContext +import com.spotify.scio.coders.Coder +import com.spotify.scio.managed._ +import com.spotify.scio.values.SCollection +import org.apache.beam.sdk.managed.Managed +import org.apache.beam.sdk.schemas.Schema +import org.apache.beam.sdk.values.Row + +val sc: ScioContext = ??? + +val rowSchema: Schema = ??? +implicit val rowCoder: Coder[Row] = Coder.row(rowSchema) + +val config: Map[String, Object] = ??? +val rows: SCollection[Row] = sc.managed(Managed.ICEBERG, rowSchema, config) +``` + +Saving data to a Managed IO is similar: +```scala mdoc:invisible +import com.spotify.scio.managed._ +import com.spotify.scio.coders.Coder +import com.spotify.scio.values.SCollection +import org.apache.beam.sdk.managed.Managed +import org.apache.beam.sdk.schemas.Schema +import org.apache.beam.sdk.values.Row +``` + +```scala mdoc:compile-only +val rows: SCollection[Row] = ??? +val config: Map[String, Object] = ??? + +rows.saveAsManaged(Managed.ICEBERG, config) +``` + +[Magnolify's](https://github.com/spotify/magnolify) `RowType` (available as part of the `magnolify-beam` artifact) provides automatically-derived mappings between Beam's `Row` and scala case classes. See [full documentation here](https://github.com/spotify/magnolify/blob/main/docs/beam.md). + +```scala mdoc:invisible +import com.spotify.scio.ScioContext +import com.spotify.scio.managed._ +import org.apache.beam.sdk.managed.Managed +import org.apache.beam.sdk.schemas.Schema +import org.apache.beam.sdk.values.Row +``` + +```scala mdoc:compile-only +import magnolify.beam._ + +val config: Map[String, Object] = ??? + +case class Record(a: Int, b: String) +val rt = RowType[Record] +implicit val recordRowCoder: Coder[Row] = Coder.row(rt.schema) + +val sc: ScioContext = ??? 
+sc.managed(Managed.ICEBERG, rt.schema, config) + // convert the Row instance to a Record + .map(rt.apply) + .map(r => r.copy(a = r.a + 1)) + // convert the Record to a Row + .map(rt.apply) + .saveAsManaged(Managed.ICEBERG, config) +``` diff --git a/site/src/main/paradox/io/Neo4J.md b/site/src/main/paradox/io/Neo4J.md index 8a932965fc..40d4977d46 100644 --- a/site/src/main/paradox/io/Neo4J.md +++ b/site/src/main/paradox/io/Neo4J.md @@ -1,6 +1,6 @@ # Neo4J -Scio provides support [Neo4J](https://neo4j.com/) in the `scio-neo4j` artifact. +Scio provides support for [Neo4J](https://neo4j.com/) in the `scio-neo4j` artifact. Scio uses [magnolify's](https://github.com/spotify/magnolify) `magnolify-neo4j` to convert to and from Neo4J types. diff --git a/site/src/main/paradox/io/Tensorflow.md b/site/src/main/paradox/io/Tensorflow.md index 4d5091f71e..18b6e7b670 100644 --- a/site/src/main/paradox/io/Tensorflow.md +++ b/site/src/main/paradox/io/Tensorflow.md @@ -6,13 +6,13 @@ Scio supports several methods of reading and writing [Tensorflow](https://www.te Depending on your input format, and if you need to provide a schema or not, there are various ways to read Tensorflow files. -@scaladoc[tfRecordFile](com.spotify.scio.tensorflow.syntax.ScioContextOps#tfRecordFile(path:String,compression:org.apache.beam.sdk.io.Compression):com.spotify.scio.values.SCollection[Array[Byte]]) reads entire `TFRecord` files into byte array elements in the pipeline, @scaladoc[tfRecordExampleFile](com.spotify.scio.tensorflow.syntax.ScioContextOps#tfRecordExampleFile(path:String,compression:org.apache.beam.sdk.io.Compression):com.spotify.scio.values.SCollection[org.tensorflow.proto.example.Example]) (or @scaladoc[tfRecordExampleFileWithSchema](com.spotify.scio.tensorflow.syntax.ScioContextOps#tfRecordExampleFileWithSchema(path:String,schemaFilename:String,compression:org.apache.beam.sdk.io.Compression):(com.spotify.scio.values.SCollection[org.tensorflow.proto.example.Example],com.spotify.scio.values.DistCache[org.tensorflow.metadata.v0.Schema]))) will read @javadoc[Example](org.tensorflow.proto.example.Example) instances, and @scaladoc[tfRecordSequenceExampleFile](com.spotify.scio.tensorflow.syntax.ScioContextOps#tfRecordSequenceExampleFile(path:String,compression:org.apache.beam.sdk.io.Compression):com.spotify.scio.values.SCollection[org.tensorflow.proto.example.SequenceExample]) (or @scaladoc[tfRecordSequenceExampleFileWithSchema](com.spotify.scio.tensorflow.syntax.ScioContextOps#tfRecordSequenceExampleFileWithSchema(path:String,schemaFilename:String,compression:org.apache.beam.sdk.io.Compression):(com.spotify.scio.values.SCollection[org.tensorflow.proto.example.SequenceExample],com.spotify.scio.values.DistCache[org.tensorflow.metadata.v0.Schema]))) will read @javadoc[SequenceExample](org.tensorflow.proto.example.SequenceExample) instances: +@scaladoc[tfRecordFile](com.spotify.scio.tensorflow.syntax.ScioContextOps#tfRecordFile(path:String,compression:org.apache.beam.sdk.io.Compression):com.spotify.scio.values.SCollection[Array[Byte]]) reads entire `TFRecord` files into byte array elements in the pipeline, @scaladoc[tfRecordExampleFile](com.spotify.scio.tensorflow.syntax.ScioContextOps#tfRecordExampleFile(path:String,compression:org.apache.beam.sdk.io.Compression):com.spotify.scio.values.SCollection[]) (or 
@scaladoc[tfRecordExampleFileWithSchema](com.spotify.scio.tensorflow.syntax.ScioContextOps#tfRecordExampleFileWithSchema(path:String,schemaFilename:String,compression:org.apache.beam.sdk.io.Compression):(com.spotify.scio.values.SCollection[org.tensorflow.proto.Example],com.spotify.scio.values.DistCache[org.tensorflow.metadata.v0.Schema]))) will read @javadoc[Example](org.tensorflow.proto.Example) instances, and @scaladoc[tfRecordSequenceExampleFile](com.spotify.scio.tensorflow.syntax.ScioContextOps#tfRecordSequenceExampleFile(path:String,compression:org.apache.beam.sdk.io.Compression):com.spotify.scio.values.SCollection[org.tensorflow.proto.SequenceExample]) (or @scaladoc[tfRecordSequenceExampleFileWithSchema](com.spotify.scio.tensorflow.syntax.ScioContextOps#tfRecordSequenceExampleFileWithSchema(path:String,schemaFilename:String,compression:org.apache.beam.sdk.io.Compression):(com.spotify.scio.values.SCollection[org.tensorflow.proto.SequenceExample],com.spotify.scio.values.DistCache[org.tensorflow.metadata.v0.Schema]))) will read @javadoc[SequenceExample](org.tensorflow.proto.SequenceExample) instances: ```scala mdoc:compile-only import com.spotify.scio.ScioContext import com.spotify.scio.values.SCollection import com.spotify.scio.tensorflow._ -import org.tensorflow.proto.example.{Example, SequenceExample} +import org.tensorflow.proto.{Example, SequenceExample} val sc: ScioContext = ??? val recordBytes: SCollection[Array[Byte]] = sc.tfRecordFile("gs://input-record-path") @@ -25,16 +25,16 @@ val sequenceExamples: SCollection[SequenceExample] = sc.tfRecordSequenceExampleF Similar to reading, there are multiple ways to write Tensorflow files, depending on the format of the elements to be output. Each of these write methods is called `saveAsTfRecordFile`, but only one variant of the method is available based on the element type. 
-* For `SCollection[T]` where `T` is a subclass of `Example`: @scaladoc[saveAsTfRecordFile](com.spotify.scio.tensorflow.syntax.ExampleSCollectionOps#saveAsTfRecordFile(path:String,suffix:String,compression:org.apache.beam.sdk.io.Compression,numShards:Int,shardNameTemplate:String,tempDirectory:String,filenamePolicySupplier:com.spotify.scio.util.FilenamePolicySupplier):com.spotify.scio.io.ClosedTap[org.tensorflow.proto.example.Example]) -* For `SCollection[Seq[T]]` where `T` is a subclass of `Example`: @scaladoc[saveAsTfRecordFile](com.spotify.scio.tensorflow.syntax.SeqExampleSCollectionOps#saveAsTfRecordFile(path:String,suffix:String,compression:org.apache.beam.sdk.io.Compression,numShards:Int,shardNameTemplate:String,tempDirectory:String,filenamePolicySupplier:com.spotify.scio.util.FilenamePolicySupplier):com.spotify.scio.io.ClosedTap[org.tensorflow.proto.example.Example]) -* For `SCollection[T]` where `T` is a subclass of `SequenceExample`: @scaladoc[saveAsTfRecordFile](com.spotify.scio.tensorflow.syntax.SequenceExampleSCollectionOps#saveAsTfRecordFile(path:String,suffix:String,compression:org.apache.beam.sdk.io.Compression,numShards:Int,shardNameTemplate:String,tempDirectory:String,filenamePolicySupplier:com.spotify.scio.util.FilenamePolicySupplier):com.spotify.scio.io.ClosedTap[org.tensorflow.proto.example.SequenceExample]) +* For `SCollection[T]` where `T` is a subclass of `Example`: @scaladoc[saveAsTfRecordFile](com.spotify.scio.tensorflow.syntax.ExampleSCollectionOps#saveAsTfRecordFile(path:String,suffix:String,compression:org.apache.beam.sdk.io.Compression,numShards:Int,shardNameTemplate:String,tempDirectory:String,filenamePolicySupplier:com.spotify.scio.util.FilenamePolicySupplier):com.spotify.scio.io.ClosedTap[org.tensorflow.proto.Example]) +* For `SCollection[Seq[T]]` where `T` is a subclass of `Example`: @scaladoc[saveAsTfRecordFile](com.spotify.scio.tensorflow.syntax.SeqExampleSCollectionOps#saveAsTfRecordFile(path:String,suffix:String,compression:org.apache.beam.sdk.io.Compression,numShards:Int,shardNameTemplate:String,tempDirectory:String,filenamePolicySupplier:com.spotify.scio.util.FilenamePolicySupplier):com.spotify.scio.io.ClosedTap[org.tensorflow.proto.Example]) +* For `SCollection[T]` where `T` is a subclass of `SequenceExample`: @scaladoc[saveAsTfRecordFile](com.spotify.scio.tensorflow.syntax.SequenceExampleSCollectionOps#saveAsTfRecordFile(path:String,suffix:String,compression:org.apache.beam.sdk.io.Compression,numShards:Int,shardNameTemplate:String,tempDirectory:String,filenamePolicySupplier:com.spotify.scio.util.FilenamePolicySupplier):com.spotify.scio.io.ClosedTap[org.tensorflow.proto.SequenceExample]) * For `SCollection[Array[Byte]]`, where it is recommended that the bytes are a serialized `Example`: @scaladoc[saveAsTfRecordFile](com.spotify.scio.tensorflow.syntax.TFRecordSCollectionOps#saveAsTfRecordFile(path:String,suffix:String,compression:org.apache.beam.sdk.io.Compression,numShards:Int,shardNameTemplate:String,tempDirectory:String,filenamePolicySupplier:com.spotify.scio.util.FilenamePolicySupplier)(implicitev:T%3C:%3CArray[Byte]):com.spotify.scio.io.ClosedTap[Array[Byte]]) ```scala mdoc:compile-only import com.spotify.scio.values.SCollection import com.spotify.scio.tensorflow._ -import org.tensorflow.proto.example.{Example, SequenceExample} +import org.tensorflow.proto.{Example, SequenceExample} val recordBytes: SCollection[Array[Byte]] = ??? val examples: SCollection[Example] = ??? 
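The `Array[Byte]` variant in the last bullet pairs naturally with protobuf serialization, since the recommended byte encoding is a serialized `Example`. A minimal sketch (the output path is hypothetical) of serializing `Example`s and writing them through that overload:

```scala
import com.spotify.scio.values.SCollection
import com.spotify.scio.tensorflow._
import org.tensorflow.proto.Example

val examples: SCollection[Example] = ???

// Serialize each Example to its protobuf byte encoding, then write the bytes
// with the Array[Byte] variant of saveAsTfRecordFile
examples
  .map(_.toByteArray)
  .saveAsTfRecordFile("gs://output-bucket/examples")
```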
diff --git a/site/src/main/paradox/io/index.md b/site/src/main/paradox/io/index.md index 72a47d427a..422f8a6023 100644 --- a/site/src/main/paradox/io/index.md +++ b/site/src/main/paradox/io/index.md @@ -11,10 +11,12 @@ * @ref:[Cassandra](Cassandra.md) * @ref:[CSV](Csv.md) * @ref:[Datastore](Datastore.md) -* @ref:[Grpc](Grpc.md) * @ref:[Elasticsearch](Elasticsearch.md) +* @ref:[Grpc](Grpc.md) +* @ref:[Iceberg](Iceberg.md) * @ref:[JDBC](Jdbc.md) * @ref:[Json](Json.md) +* @ref:[Managed](Managed.md) * @ref:[Neo4J](Neo4J.md) * @ref:[Object](Object.md) * @ref:[Parquet](Parquet.md)
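Tying the new Iceberg and Managed pages together, a round-trip sketch using the typed Iceberg syntax documented above; the table names, catalog name, and catalog properties are placeholders whose exact keys depend on the catalog in use.

```scala
import com.spotify.scio.ScioContext
import com.spotify.scio.iceberg._
import magnolify.beam._

case class Record(a: Int, b: String)

// Magnolify-derived mapping between Record and Beam Row, used by the managed Iceberg transform
implicit val rt: RowType[Record] = RowType[Record]

val sc: ScioContext = ???

// Placeholder catalog configuration
val catalogConfig: Map[String, String] = Map("warehouse" -> "gs://my-warehouse")

sc.iceberg[Record]("db.input_table", "my_catalog", catalogConfig)
  .map(r => r.copy(a = r.a + 1))
  .saveAsIceberg("db.output_table", "my_catalog", catalogConfig)
```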