Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
5.1
* Use LWTs for all auto-repair history mutations (CASSANDRA-20996)
* Add cqlsh autocompletion for the identity mapping feature (CASSANDRA-20021)
* Add DDL Guardrail enabling administrators to disallow creation/modification of keyspaces with durable_writes = false (CASSANDRA-20913)
* Optimize Counter, Meter and Histogram metrics using thread local counters (CASSANDRA-20250)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,27 +150,27 @@ public class AutoRepairUtils
COL_DELETE_HOSTS, COL_DELETE_HOSTS_UPDATE_TIME, COL_REPAIR_TYPE, COL_HOST_ID);

final static String DEL_AUTO_REPAIR_HISTORY = String.format(
"DELETE FROM %s.%s WHERE %s = ? AND %s = ?"
"DELETE FROM %s.%s WHERE %s = ? AND %s = ? IF EXISTS"
, SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_HISTORY, COL_REPAIR_TYPE,
COL_HOST_ID);

final static String RECORD_START_REPAIR_HISTORY = String.format(
"UPDATE %s.%s SET %s= ?, repair_turn = ? WHERE %s = ? AND %s = ?"
"UPDATE %s.%s SET %s= ?, repair_turn = ? WHERE %s = ? AND %s = ? IF EXISTS"
, SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_HISTORY, COL_REPAIR_START_TS,
COL_REPAIR_TYPE, COL_HOST_ID);

final static String RECORD_FINISH_REPAIR_HISTORY = String.format(
"UPDATE %s.%s SET %s= ?, %s=false WHERE %s = ? AND %s = ?"
"UPDATE %s.%s SET %s= ?, %s=false WHERE %s = ? AND %s = ? IF EXISTS"
, SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_HISTORY, COL_REPAIR_FINISH_TS,
COL_FORCE_REPAIR, COL_REPAIR_TYPE, COL_HOST_ID);

final static String CLEAR_DELETE_HOSTS = String.format(
"UPDATE %s.%s SET %s= {} WHERE %s = ? AND %s = ?"
"UPDATE %s.%s SET %s= {} WHERE %s = ? AND %s = ? IF EXISTS"
, SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_HISTORY, COL_DELETE_HOSTS,
COL_REPAIR_TYPE, COL_HOST_ID);

final static String SET_FORCE_REPAIR = String.format(
"UPDATE %s.%s SET %s=true WHERE %s = ? AND %s = ?"
"UPDATE %s.%s SET %s=true WHERE %s = ? AND %s = ? IF EXISTS"
, SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_HISTORY, COL_FORCE_REPAIR,
COL_REPAIR_TYPE, COL_HOST_ID);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -538,4 +538,42 @@ public void testSkipSystemTraces()
{
assertFalse(AutoRepairUtils.shouldConsiderKeyspace(Keyspace.open(SchemaConstants.TRACE_KEYSPACE_NAME)));
}

@Test
public void testAutoRepairHistoryOutOfOrderDeleteRaceCondition()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test passes even without any of the changes because ADD_HOST_ID_TO_DELETE_HOSTS already had IF EXISTS, however, the changes in this PR are necessary.
Please clarify in the description that the PR includes the test case and a certain cases we missed earlier.

{
// Setup: Create a node that will be deleted
UUID nodeToDelete = UUID.randomUUID();
UUID votingNode = UUID.randomUUID();

// Insert an initial row for the node to be deleted
QueryProcessor.executeInternal(String.format(
"INSERT INTO %s.%s (repair_type, host_id, repair_start_ts, repair_finish_ts) VALUES ('%s', %s, 100, 200)",
SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_HISTORY,
repairType, nodeToDelete));

// Verify the row exists
UntypedResultSet beforeDelete = QueryProcessor.executeInternal(String.format(
"SELECT * FROM %s.%s WHERE repair_type = '%s' AND host_id = %s",
SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_HISTORY,
repairType, nodeToDelete));
assertNotNull(beforeDelete);
assertEquals(1, beforeDelete.size());

// Simulate the race condition:
// 1. First, the delete is executed (this should create a tombstone)
AutoRepairUtils.deleteAutoRepairHistory(repairType, nodeToDelete);

// 2. Then, a vote to delete arrives after the row has already been deleted
AutoRepairUtils.addHostIdToDeleteHosts(repairType, votingNode, nodeToDelete);

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also clear deleted hosts as part of the late event

AutoRepairUtils.clearDeleteHosts(repairType, nodeToDelete);

// Verify that the row is still deleted despite the out-of-order vote
UntypedResultSet afterRace = QueryProcessor.executeInternal(String.format(
"SELECT * FROM %s.%s WHERE repair_type = '%s' AND host_id = %s",
SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_HISTORY,
repairType, nodeToDelete));
assertNotNull(afterRace);
// The row should not exist - the delete should win despite the vote arriving later
assertEquals("Row should remain deleted despite out-of-order vote", 0, afterRace.size());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,10 @@ public void prepare()
QueryProcessor.executeInternal(String.format(
"TRUNCATE %s.%s",
SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_PRIORITY));
QueryProcessor.executeInternal(String.format(
"INSERT INTO %s.%s (host_id, repair_type) VALUES (%s, '%s')",
SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_HISTORY,
StorageService.instance.getHostIdForEndpoint(InetAddressAndPort.getLocalHost()), repairTypeStr));
}

@Test
Expand Down