Skip to content

Commit 809f965

Browse files
[!439] - feature: Error in Kafka multi-topic writer when keyFieldName option is Some for some topics and None for others
# New features and improvements - Introduce NonEmptyList to allow for safer code # Breaking changes None. # Migration None. # Bug fixes - When a Kafka multi-topic writer is used, if some topics do not declare a keyFieldName the writer used to fail; now it starts the streaming query and sets the `key` column to null for records that come from topics that do not declare a `keyFieldName` # How this feature was tested Existing unit tests, new unit test, integration test on a downstream implementation # Related issue Closes github #573
1 parent 702462c commit 809f965

File tree

4 files changed

+209
-120
lines changed

4 files changed

+209
-120
lines changed
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
package it.agilelab.bigdata.utils
2+
3+
/** An immutable list that is guaranteed to hold at least one element. */
case class NonEmptyList[+A](head: A, tail: List[A]) {

  /** Total number of elements; always at least 1. */
  val size: Int = 1 + tail.length

  // Every element as a plain List, head first; shared by the traversal methods below.
  private def all: List[A] = head :: tail

  /** Transforms each element with `f`, preserving non-emptiness. */
  def map[B](f: A => B): NonEmptyList[B] =
    NonEmptyList(f(head), tail.map(f))

  /** Renders the elements as a single string, separated by `sep`. */
  def mkString(sep: String): String = all.mkString(sep)

  /** Returns true when at least one element satisfies `p`. */
  def exists(p: A => Boolean): Boolean = all.exists(p)
}

/** Factory helpers for [[NonEmptyList]]. */
object NonEmptyList {

  /** Builds a [[NonEmptyList]] containing exactly one element. */
  def one[A](a: A): NonEmptyList[A] = NonEmptyList(a, List.empty)
}
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
package it.agilelab.bigdata.utils
2+
3+
import org.scalatest.{FunSuite, Matchers}
4+
5+
/** Unit tests covering every public member of [[NonEmptyList]]. */
class NonEmptyListSpec extends FunSuite with Matchers {

  test("size should return correct size for different list lengths") {
    val singleton = NonEmptyList(1, Nil)
    val pair      = NonEmptyList(1, List(2))
    val five      = NonEmptyList(1, List(2, 3, 4, 5))
    assert(singleton.size == 1)
    assert(pair.size == 2)
    assert(five.size == 5)
  }

  test("map should correctly transform elements") {
    val input   = NonEmptyList(1, List(2, 3))
    val doubled = input.map(x => x * 2)
    assert(doubled == NonEmptyList(2, List(4, 6)))
  }

  test("map should handle identity function") {
    val input = NonEmptyList(1, List(2, 3))
    // mapping with the identity function must yield an equal list
    assert(input.map(x => x) == input)
  }

  test("map should work with different types") {
    val input      = NonEmptyList(1, List(2, 3))
    val stringified = input.map(n => n.toString)
    assert(stringified == NonEmptyList("1", List("2", "3")))
  }

  test("mkString should concatenate elements with separator") {
    val letters = NonEmptyList("a", List("b", "c"))
    val numbers = NonEmptyList(1, List(2, 3))
    assert(letters.mkString(",") == "a,b,c")
    assert(numbers.mkString(" - ") == "1 - 2 - 3")
  }

  test("mkString should handle empty tail correctly") {
    val single = NonEmptyList("only", Nil)
    assert(single.mkString(",") == "only")
  }

  test("exists should return true if the head matches the predicate") {
    val input = NonEmptyList(1, List(2, 3))
    assert(input.exists(x => x == 1))
  }

  test("exists should return true if an element in the tail matches the predicate") {
    val input = NonEmptyList(1, List(2, 3))
    assert(input.exists(x => x == 3))
  }

  test("exists should return false if no elements match the predicate") {
    val input = NonEmptyList(1, List(2, 3))
    assert(!input.exists(x => x == 10))
  }

  test("exists should handle always-true and always-false predicates") {
    val input = NonEmptyList(1, List(2, 3))
    assert(input.exists(_ => true))   // constant-true predicate matches
    assert(!input.exists(_ => false)) // constant-false predicate never matches
  }

  test("one should create a NonEmptyList with a single element") {
    val single = NonEmptyList.one(42)
    assert(single.head == 42)
    assert(single.tail.isEmpty)
    assert(single.size == 1)
  }

  test("one should support different types") {
    val single = NonEmptyList.one("Scala")
    assert(single.head == "Scala")
    assert(single.tail.isEmpty)
  }
}

plugin-kafka-spark/src/main/scala/it/agilelab/bigdata/wasp/consumers/spark/plugins/kafka/KafkaWriters.scala

Lines changed: 52 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package it.agilelab.bigdata.wasp.consumers.spark.plugins.kafka
22

33
import com.typesafe.config.Config
4+
import it.agilelab.bigdata.utils.NonEmptyList
45
import it.agilelab.bigdata.wasp.consumers.spark.utils.AvroSerializerExpression
56
import it.agilelab.bigdata.wasp.core.logging.Logging
67
import it.agilelab.bigdata.wasp.core.utils.SubjectUtils
@@ -40,21 +41,17 @@ object KafkaWriters extends Logging {
4041
private[kafka] def prepareDfToWrite(
4142
df: DataFrame,
4243
topicFieldNameOpt: Option[String],
43-
topics: Seq[TopicModel],
44+
topics: NonEmptyList[TopicModel],
4445
darwinConf: Option[Config]
4546
) = {
4647

47-
val throwException = udf { s: String =>
48-
throw new Exception(s"Unknown topic name $s")
49-
}
50-
5148
topicFieldNameOpt match {
5249
case Some(topicFieldName) =>
5350
require(topics.size > 1, s"Got topicFieldName = $topicFieldName but only one topic to write ($topics)")
54-
val keyCol: Option[Column] = keyExpression(topics, topicFieldNameOpt, throwException, df.col, darwinConf)
55-
val headersCol: Option[Column] = headerExpression(topics, topicFieldNameOpt, throwException)
51+
val keyCol: Option[Column] = keyExpression(topics, topicFieldNameOpt, df.col, darwinConf)
52+
val headersCol: Option[Column] = headerExpression(topics, topicFieldNameOpt)
5653
val topicCol: Column = col(topicFieldName)
57-
val valueCol: Column = valueExpression(topics, topicFieldNameOpt, df.schema, df.col, throwException, darwinConf)
54+
val valueCol: Column = valueExpression(topics, topicFieldNameOpt, df.schema, df.col, darwinConf)
5855

5956
val columns =
6057
(keyCol.map(_.as("key")) ++
@@ -68,9 +65,9 @@ object KafkaWriters extends Logging {
6865
topics.size == 1,
6966
"More than one topic to write specified but there's no column containing the topics' name."
7067
)
71-
val keyCol: Option[Column] = keyExpression(topics, topicFieldNameOpt, throwException, df.col, darwinConf)
72-
val headersCol: Option[Column] = headerExpression(topics, topicFieldNameOpt, throwException)
73-
val valueCol: Column = valueExpression(topics, topicFieldNameOpt, df.schema, df.col, throwException, darwinConf)
68+
val keyCol: Option[Column] = keyExpression(topics, topicFieldNameOpt, df.col, darwinConf)
69+
val headersCol: Option[Column] = headerExpression(topics, topicFieldNameOpt)
70+
val valueCol: Column = valueExpression(topics, topicFieldNameOpt, df.schema, df.col, darwinConf)
7471

7572
val columns =
7673
(keyCol.map(_.as("key")) ++
@@ -83,53 +80,38 @@ object KafkaWriters extends Logging {
8380
}
8481

8582
private def keyExpression(
86-
topics: Seq[TopicModel],
83+
topics: NonEmptyList[TopicModel],
8784
topicFieldName: Option[String],
88-
exceptionUdf: UserDefinedFunction,
8985
columnExtractor: String => Column,
9086
darwinConf: Option[Config]
91-
) = {
87+
): Option[Column] = {
9288

9389
def valueOfKey(topicModel: TopicModel): Column = {
94-
val keyField = topicModel.keyFieldName.get
95-
topicModel.topicDataType match {
96-
case "avro" => convertKeyForAvro(columnExtractor(keyField), topicModel, darwinConf)
97-
case dataType if dataType == "json" || dataType == "binary" || dataType == "plaintext" =>
98-
convertKeyToBinary(columnExtractor(keyField))
99-
case unknown => throw new UnsupportedOperationException(s"Unknown topic data type $unknown")
100-
}
90+
topicModel.keyFieldName
91+
.map(keyField =>
92+
topicModel.topicDataType match {
93+
case "avro" => convertKeyForAvro(columnExtractor(keyField), topicModel, darwinConf)
94+
case "json" | "binary" | "plaintext" => convertKeyToBinary(columnExtractor(keyField))
95+
case unknown => throw new UnsupportedOperationException(s"Unknown topic data type $unknown")
96+
}
97+
)
98+
.getOrElse(lit(null).cast(BinaryType))
10199
}
102-
103100
if (topics.exists(_.keyFieldName.isDefined)) {
104-
105-
if (topicFieldName.isDefined) {
106-
val head = topics.head
107-
val tail = topics.tail
108-
109-
Some(
110-
tail
111-
.foldLeft(when(conditionOnTopicName(topicFieldName.get, head), valueOfKey(head))) { (z, x) =>
112-
z.when(conditionOnTopicName(topicFieldName.get, x), valueOfKey(x))
113-
}
114-
.otherwise(exceptionUdf(col(topicFieldName.get)))
115-
)
116-
} else {
117-
Some(valueOfKey(topics.head))
118-
}
119-
101+
Some(computeFieldExpression(topics, topicFieldName, valueOfKey))
120102
} else {
121103
None
122104
}
105+
123106
}
124107

125108
private def valueExpression(
126-
topics: Seq[TopicModel],
109+
topics: NonEmptyList[TopicModel],
127110
topicFieldName: Option[String],
128111
dfSchema: StructType,
129112
columnExtractor: String => Column,
130-
exceptionUdf: UserDefinedFunction,
131113
darwinConf: Option[Config]
132-
) = {
114+
): Column = {
133115

134116
def valueOfValue(topicModel: TopicModel): Column = {
135117
val columnsInValues = topicModel.valueFieldsNames match {
@@ -145,56 +127,47 @@ object KafkaWriters extends Logging {
145127
case unknown => throw new UnsupportedOperationException(s"Unknown topic data type $unknown")
146128
}
147129
}
148-
149-
if (topicFieldName.isDefined) {
150-
val head = topics.head
151-
val tail = topics.tail
152-
153-
tail
154-
.foldLeft(when(conditionOnTopicName(topicFieldName.get, head), valueOfValue(head))) {
155-
(z: Column, x: TopicModel) =>
156-
z.when(conditionOnTopicName(topicFieldName.get, x), valueOfValue(x))
157-
}
158-
.otherwise(exceptionUdf(col(topicFieldName.get)))
159-
160-
} else {
161-
valueOfValue(topics.head)
162-
}
163-
130+
computeFieldExpression(topics, topicFieldName, valueOfValue)
164131
}
165132

166133
private def headerExpression(
167-
topics: Seq[TopicModel],
168-
topicFieldName: Option[String],
169-
exceptionUdf: UserDefinedFunction
170-
) = {
134+
topics: NonEmptyList[TopicModel],
135+
topicFieldName: Option[String]
136+
): Option[Column] = {
171137

172138
def valueOfHeader(head: TopicModel) = {
173139
head.headersFieldName.map(col).getOrElse(lit(null))
174140
}
175141

176142
if (topics.exists(_.headersFieldName.isDefined)) {
177-
178-
if (topicFieldName.isDefined) {
179-
val head = topics.head
180-
val tail = topics.tail
181-
Some(
182-
tail
183-
.foldLeft(when(conditionOnTopicName(topicFieldName.get, head), valueOfHeader(head))) {
184-
(z: Column, x: TopicModel) =>
185-
z.when(conditionOnTopicName(topicFieldName.get, x), valueOfHeader(x))
186-
}
187-
.otherwise(exceptionUdf(col(topicFieldName.get)))
188-
)
189-
} else {
190-
Some(valueOfHeader(topics.head))
191-
}
143+
Some(computeFieldExpression(topics, topicFieldName, valueOfHeader))
192144
} else {
193145
None
194146
}
195147

196148
}
197149

150+
private val unknownTopicExpression: UserDefinedFunction = udf { s: String =>
151+
throw new Exception(s"Unknown topic name $s")
152+
}
153+
154+
private def computeFieldExpression(
155+
topics: NonEmptyList[TopicModel], // this list is non empty
156+
maybeTopicFieldName: Option[String],
157+
valueExtractor: TopicModel => Column
158+
): Column = {
159+
val NonEmptyList(head, tail) = topics
160+
maybeTopicFieldName
161+
.map(topicFieldName =>
162+
tail
163+
.foldLeft(when(conditionOnTopicName(topicFieldName, head), valueExtractor(head))) { (z, x) =>
164+
z.when(conditionOnTopicName(topicFieldName, x), valueExtractor(x))
165+
}
166+
.otherwise(unknownTopicExpression(col(topicFieldName)))
167+
)
168+
.getOrElse(valueExtractor(head))
169+
}
170+
198171
private def conditionOnTopicName(topicFieldName: String, head: TopicModel) = {
199172
col(topicFieldName).equalTo(head.name)
200173
}
@@ -352,10 +325,9 @@ object KafkaWriters extends Logging {
352325
_ => ()
353326
)
354327

355-
val topicsToWrite = if (topics.isEmpty) {
356-
List(mainTopicModel.asInstanceOf[TopicModel])
357-
} else {
358-
topics
328+
val topicsToWrite = topics match {
329+
case Nil => NonEmptyList.one(mainTopicModel.asInstanceOf[TopicModel])
330+
case head :: tail => NonEmptyList(head, tail)
359331
}
360332

361333
logger.info(s"Writing with topic models: ${topicsToWrite.map(_.name).mkString(" ")}")

0 commit comments

Comments
 (0)