fix: Correct IntegerLookup output shape docstring and add test coverage (#21625)

harshaljanjani · web-flow · commit a0094551bc2a · 2025-08-29T10:06:12.000-07:00
diff --git a/keras/src/layers/preprocessing/integer_lookup.py b/keras/src/layers/preprocessing/integer_lookup.py
@@ -111,9 +111,12 @@ class IntegerLookup(IndexLookup):
                 appeared in the sample.
             - `"tf_idf"`: As `"multi_hot"`, but the TF-IDF algorithm is
                 applied to find the value in each token slot.
-            For `"int"` output, any shape of input and output is supported.
-            For all other output modes, currently only output up to rank 2
-            is supported. Defaults to `"int"`.
+            For `"int"` output, the output shape matches the input shape.
+            For `"one_hot"` output, the output shape is
+            `input_shape + (vocabulary_size,)`, where `input_shape` may
+            have arbitrary rank. For other output modes (`"multi_hot"`,
+            `"count"`, `"tf_idf"`), the output shape is
+            `(batch_size, vocabulary_size)`. Defaults to `"int"`.
         pad_to_max_tokens: Only applicable when `output_mode` is `"multi_hot"`,
             `"count"`, or `"tf_idf"`. If `True`, the output will have
             its feature axis padded to `max_tokens` even if the number
diff --git a/keras/src/layers/preprocessing/integer_lookup_test.py b/keras/src/layers/preprocessing/integer_lookup_test.py
@@ -104,3 +104,52 @@ def test_tf_data_compatibility(self):
         ds = tf_data.Dataset.from_tensor_slices(input_data).batch(4).map(layer)
         output = next(iter(ds)).numpy()
         self.assertAllClose(output, np.array([2, 3, 4, 0]))
+
+    def test_one_hot_output_with_higher_rank_input(self):
+        input_data = np.array([[1, 2], [3, 0]])
+        vocabulary = [1, 2, 3]
+        layer = layers.IntegerLookup(
+            vocabulary=vocabulary, output_mode="one_hot"
+        )
+        output_data = layer(input_data)
+        self.assertEqual(output_data.shape, (2, 2, 4))
+        expected_output = np.array(
+            [
+                [[0, 1, 0, 0], [0, 0, 1, 0]],
+                [[0, 0, 0, 1], [1, 0, 0, 0]],
+            ]
+        )
+        self.assertAllClose(output_data, expected_output)
+        output_data_3d = layer(np.expand_dims(input_data, axis=0))
+        self.assertEqual(output_data_3d.shape, (1, 2, 2, 4))
+        self.assertAllClose(
+            output_data_3d, np.expand_dims(expected_output, axis=0)
+        )
+
+    def test_multi_hot_output_shape(self):
+        input_data = np.array([[1, 2], [3, 0]])
+        vocabulary = [1, 2, 3]
+        layer = layers.IntegerLookup(
+            vocabulary=vocabulary, output_mode="multi_hot"
+        )
+        output_data = layer(input_data)
+        self.assertEqual(output_data.shape, (2, 4))
+
+    def test_count_output_shape(self):
+        input_data = np.array([[1, 2], [3, 0]])
+        vocabulary = [1, 2, 3]
+        layer = layers.IntegerLookup(vocabulary=vocabulary, output_mode="count")
+        output_data = layer(input_data)
+        self.assertEqual(output_data.shape, (2, 4))
+
+    def test_tf_idf_output_shape(self):
+        input_data = np.array([[1, 2], [3, 0]])
+        vocabulary = [1, 2, 3]
+        idf_weights = [1.0, 1.0, 1.0]
+        layer = layers.IntegerLookup(
+            vocabulary=vocabulary,
+            idf_weights=idf_weights,
+            output_mode="tf_idf",
+        )
+        output_data = layer(input_data)
+        self.assertEqual(output_data.shape, (2, 4))