
Commit c55ca25

[Python] Fix WriteToBigQuery transform using CopyJob not working with WRITE_TRUNCATE write disposition (#34247)
* With WRITE_TRUNCATE, only the first copy job per destination may truncate the table, but the deduplication originally keyed on the table-id alone, so identical table-ids under a different dataset-id or project-id were wrongly treated as already seen (the sketch below illustrates the difference).
1 parent 9039608 commit c55ca25
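
For context, here is a minimal, self-contained sketch (not Beam code; the destinations list and the dispositions helper are purely illustrative) of why keying the set of observed destinations on the table-id alone collapses distinct destinations, and why keying on the full project:dataset.table reference, as this commit does, keeps them apart.

# Minimal sketch (not Beam code): why keying only on tableId is wrong.
# The destinations below differ only by dataset, exactly the case in #34247.
destinations = [
    'project1:dataset1.table1',
    'project1:dataset2.table1',
    'project1:dataset3.table1',
]

def dispositions(keys):
  """Return the write disposition each copy job would get, given how the
  already-seen set is keyed."""
  seen, out = set(), []
  for key, dest in zip(keys, destinations):
    out.append((dest, 'WRITE_TRUNCATE' if key not in seen else 'WRITE_APPEND'))
    seen.add(key)
  return out

# Buggy behaviour: key by tableId only -> every destination shares 'table1'.
print(dispositions([d.split('.')[-1] for d in destinations]))
# [('project1:dataset1.table1', 'WRITE_TRUNCATE'),
#  ('project1:dataset2.table1', 'WRITE_APPEND'),   # wrongly never truncated
#  ('project1:dataset3.table1', 'WRITE_APPEND')]   # wrongly never truncated

# Fixed behaviour: key by the full 'project:dataset.table' reference,
# so every distinct destination gets WRITE_TRUNCATE on its first job.
print(dispositions(destinations))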

File tree

2 files changed, +208 -28 lines changed


sdks/python/apache_beam/io/gcp/bigquery_file_loads.py

Lines changed: 47 additions & 28 deletions
@@ -541,45 +541,27 @@ def process_one(self, element, job_name_prefix):
     copy_from_reference.projectId = vp.RuntimeValueProvider.get_value(
         'project', str, '') or self.project

-    copy_job_name = '%s_%s' % (
-        job_name_prefix,
-        _bq_uuid(
-            '%s:%s.%s' % (
-                copy_from_reference.projectId,
-                copy_from_reference.datasetId,
-                copy_from_reference.tableId)))
-
     _LOGGER.info(
         "Triggering copy job from %s to %s",
         copy_from_reference,
         copy_to_reference)
-    if copy_to_reference.tableId not in self._observed_tables:
-      # When the write_disposition for a job is WRITE_TRUNCATE,
-      # multiple copy jobs to the same destination can stump on
-      # each other, truncate data, and write to the BQ table over and
-      # over.
-      # Thus, the first copy job runs with the user's write_disposition,
-      # but afterwards, all jobs must always WRITE_APPEND to the table.
-      # If they do not, subsequent copy jobs will clear out data appended
-      # by previous jobs.
-      write_disposition = self.write_disposition
-      wait_for_job = True
-      self._observed_tables.add(copy_to_reference.tableId)
-      Lineage.sinks().add(
-          'bigquery',
-          copy_to_reference.projectId,
-          copy_to_reference.datasetId,
-          copy_to_reference.tableId)
-    else:
-      wait_for_job = False
-      write_disposition = 'WRITE_APPEND'
+
+    wait_for_job, write_disposition = (
+        self._determine_write_disposition(copy_to_reference))

     if not self.bq_io_metadata:
       self.bq_io_metadata = create_bigquery_io_metadata(self._step_name)

     project_id = (
         copy_to_reference.projectId
         if self.load_job_project_id is None else self.load_job_project_id)
+    copy_job_name = '%s_%s' % (
+        job_name_prefix,
+        _bq_uuid(
+            '%s:%s.%s' % (
+                copy_from_reference.projectId,
+                copy_from_reference.datasetId,
+                copy_from_reference.tableId)))
     job_reference = self.bq_wrapper._insert_copy_job(
         project_id,
         copy_job_name,
@@ -594,6 +576,43 @@ def process_one(self, element, job_name_prefix):
     self.pending_jobs.append(
         GlobalWindows.windowed_value((destination, job_reference)))

+  def _determine_write_disposition(self, copy_to_reference) -> tuple[bool, str]:
+    """
+    Determines the write disposition for a BigQuery copy job,
+    based on destination.
+
+    When the write_disposition for a job is WRITE_TRUNCATE, multiple copy jobs
+    to the same destination can interfere with each other, truncate data, and
+    write to the BigQuery table repeatedly. To prevent this, the first copy job
+    runs with the user's specified write_disposition, but subsequent jobs must
+    always use WRITE_APPEND. This ensures that subsequent copy jobs do not
+    clear out data appended by previous jobs.
+
+    Args:
+      copy_to_reference: The reference to the destination table.
+
+    Returns:
+      A tuple containing a boolean indicating whether to wait for the job to
+      complete and the write disposition to use for the job.
+    """
+    full_table_ref = '%s:%s.%s' % (
+        copy_to_reference.projectId,
+        copy_to_reference.datasetId,
+        copy_to_reference.tableId)
+    if full_table_ref not in self._observed_tables:
+      write_disposition = self.write_disposition
+      wait_for_job = True
+      self._observed_tables.add(full_table_ref)
+      Lineage.sinks().add(
+          'bigquery',
+          copy_to_reference.projectId,
+          copy_to_reference.datasetId,
+          copy_to_reference.tableId)
+    else:
+      wait_for_job = False
+      write_disposition = 'WRITE_APPEND'
+    return wait_for_job, write_disposition
+
   def finish_bundle(self):
     for windowed_value in self.pending_jobs:
       job_ref = windowed_value.value[1]
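
To make the new helper's contract concrete, the following rough sketch exercises _determine_write_disposition in isolation. It is untested and full of assumptions: the DoFn name TriggerCopyJobs and the module-level Lineage name are inferred from the surrounding module, __init__ is bypassed, and lineage reporting is stubbed out, so treat everything outside the diff above as illustrative rather than the transform's intended usage.

# Rough sketch only: exercises _determine_write_disposition in isolation.
# Assumes the DoFn is named TriggerCopyJobs and that Lineage is a
# module-level name in bigquery_file_loads; both are inferred, not confirmed.
from unittest import mock

from apache_beam.io.gcp import bigquery_file_loads as bqfl
from apache_beam.io.gcp.internal.clients import bigquery as bigquery_api

dofn = bqfl.TriggerCopyJobs.__new__(bqfl.TriggerCopyJobs)  # skip __init__ for the sketch
dofn._observed_tables = set()
dofn.write_disposition = 'WRITE_TRUNCATE'  # the user's disposition

def ref(dataset):
  # Same tableId everywhere; only the dataset differs.
  return bigquery_api.TableReference(
      projectId='project1', datasetId=dataset, tableId='table1')

with mock.patch.object(bqfl, 'Lineage'):  # lineage reporting is not the point here
  print(dofn._determine_write_disposition(ref('dataset1')))
  # (True, 'WRITE_TRUNCATE')  -- first job for this destination
  print(dofn._determine_write_disposition(ref('dataset2')))
  # (True, 'WRITE_TRUNCATE')  -- same tableId, different dataset: still a first job
  print(dofn._determine_write_disposition(ref('dataset1')))
  # (False, 'WRITE_APPEND')   -- destination already observed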

sdks/python/apache_beam/io/gcp/bigquery_file_loads_test.py

Lines changed: 161 additions & 0 deletions
@@ -24,6 +24,8 @@
 import secrets
 import time
 import unittest
+from unittest.mock import Mock
+from unittest.mock import call

 import mock
 import pytest
@@ -39,6 +41,7 @@
 from apache_beam.io.gcp import bigquery
 from apache_beam.io.gcp import bigquery_tools
 from apache_beam.io.gcp.bigquery import BigQueryDisposition
+from apache_beam.io.gcp.bigquery_tools import BigQueryWrapper
 from apache_beam.io.gcp.internal.clients import bigquery as bigquery_api
 from apache_beam.io.gcp.tests.bigquery_matcher import BigqueryFullResultMatcher
 from apache_beam.io.gcp.tests.bigquery_matcher import BigqueryFullResultStreamingMatcher
@@ -820,6 +823,164 @@ def test_multiple_partition_files_write_dispositions(
     # TriggerCopyJob only processes once
     self.assertEqual(mock_call_process.call_count, 1)

+  @mock.patch(
+      'apache_beam.io.gcp.bigquery_tools.BigQueryWrapper.wait_for_bq_job')
+  @mock.patch(
+      'apache_beam.io.gcp.bigquery_tools.BigQueryWrapper._insert_copy_job')
+  @mock.patch(
+      'apache_beam.io.gcp.bigquery_tools.BigQueryWrapper._start_job',
+      wraps=BigQueryWrapper._start_job)
+  def test_multiple_identical_destinations_on_write_truncate(
+      self, mock_perform_start_job, mock_insert_copy_job, mock_wait_for_bq_job):
+    """
+    Test that multiple identical table names,
+    but under different datasets are handled correctly.
+    This essentially means that the `write_disposition` is set
+    to `WRITE_TRUNCATE` for the first job and `WRITE_APPEND` for the rest.
+
+    Previously this was not the case and all jobs were set to `WRITE_APPEND`
+    from the 2nd table that was named identically with at least
+    one previous table - but from different dataset.
+    """
+    def dynamic_destination_resolver(element, *side_inputs):
+      """A dynamic destination resolver that returns a destination strictly the
+      same table, but different dataset."""
+      if element['name'] == 'beam':
+        return 'project1:dataset1.table1'
+      elif element['name'] == 'flink':
+        return 'project1:dataset2.table1'
+
+      return 'project1:dataset3.table1'
+
+    job_reference = bigquery_api.JobReference()
+    job_reference.projectId = 'project1'
+    job_reference.jobId = 'job_name1'
+    result_job = mock.Mock()
+    result_job.jobReference = job_reference
+
+    mock_job = mock.Mock()
+    mock_job.status.state = 'DONE'
+    mock_job.status.errorResult = None
+    mock_job.jobReference = job_reference
+
+    bq_client = mock.Mock()
+    bq_client.jobs.Get.return_value = mock_job
+
+    bq_client.jobs.Insert.return_value = result_job
+    bq_client.tables.Delete.return_value = None
+
+    m = bigquery_tools.BigQueryWrapper(bq_client)
+    m.wait_for_bq_job = mock.Mock()
+    m.wait_for_bq_job.return_value = None
+
+    mock_jobs = [
+        Mock(jobReference=bigquery_api.JobReference(jobId=f'job_name{i}'))
+        # Order matters in a sense to prove that jobs with different ids
+        # (`2` & `3`) are run with `WRITE_APPEND` without this current fix.
+        for i in [1, 2, 1, 3, 1]
+    ]
+    mock_perform_start_job.side_effect = mock_jobs
+
+    # For now we don't care about the return value.
+    mock_insert_copy_job.return_value = None
+
+    with TestPipeline('DirectRunner') as p:
+      _ = (
+          p
+          | beam.Create([
+              {
+                  'name': 'beam', 'language': 'java'
+              },
+              {
+                  'name': 'flink', 'language': 'java'
+              },
+              {
+                  'name': 'beam', 'language': 'java'
+              },
+              {
+                  'name': 'spark', 'language': 'java'
+              },
+              {
+                  'name': 'beam', 'language': 'java'
+              },
+          ],
+                        reshuffle=False)
+          | bqfl.BigQueryBatchFileLoads(
+              dynamic_destination_resolver,
+              custom_gcs_temp_location=self._new_tempdir(),
+              test_client=bq_client,
+              validate=False,
+              temp_file_format=bigquery_tools.FileFormat.JSON,
+              max_file_size=45,
+              max_partition_size=80,
+              max_files_per_partition=3,
+              write_disposition=BigQueryDisposition.WRITE_TRUNCATE))
+
+    from apache_beam.io.gcp.internal.clients.bigquery import TableReference
+    mock_insert_copy_job.assert_has_calls(
+        [
+            call(
+                'project1',
+                mock.ANY,
+                TableReference(
+                    datasetId='dataset1',
+                    projectId='project1',
+                    tableId='job_name1'),
+                TableReference(
+                    datasetId='dataset1',
+                    projectId='project1',
+                    tableId='table1'),
+                create_disposition=None,
+                write_disposition='WRITE_TRUNCATE',
+                job_labels={'step_name': 'bigquerybatchfileloads'}),
+            call(
+                'project1',
+                mock.ANY,
+                TableReference(
+                    datasetId='dataset1',
+                    projectId='project1',
+                    tableId='job_name2'),
+                TableReference(
+                    datasetId='dataset1',
+                    projectId='project1',
+                    tableId='table1'),
+                create_disposition=None,
+                write_disposition='WRITE_APPEND',
+                job_labels={'step_name': 'bigquerybatchfileloads'}),
+            call(
+                'project1',
+                mock.ANY,
+                TableReference(
+                    datasetId='dataset2',
+                    projectId='project1',
+                    tableId='job_name1'),
+                TableReference(
+                    datasetId='dataset2',
+                    projectId='project1',
+                    tableId='table1'),
+                create_disposition=None,
+                # Previously this was `WRITE_APPEND`.
+                write_disposition='WRITE_TRUNCATE',
+                job_labels={'step_name': 'bigquerybatchfileloads'}),
+            call(
+                'project1',
+                mock.ANY,
+                TableReference(
+                    datasetId='dataset3',
+                    projectId='project1',
+                    tableId='job_name3'),
+                TableReference(
+                    datasetId='dataset3',
+                    projectId='project1',
+                    tableId='table1'),
+                create_disposition=None,
+                # Previously this was `WRITE_APPEND`.
+                write_disposition='WRITE_TRUNCATE',
+                job_labels={'step_name': 'bigquerybatchfileloads'}),
+        ],
+        any_order=True)
+    self.assertEqual(4, mock_insert_copy_job.call_count)
+
   @parameterized.expand([
       param(is_streaming=False, with_auto_sharding=False, compat_version=None),
       param(is_streaming=True, with_auto_sharding=False, compat_version=None),
