Skip to content

Conversation

@getvictor
Copy link
Member

@getvictor getvictor commented Oct 10, 2025

Related issue: Resolves #34116

Checklist for submitter

If some of the following don't apply, delete the relevant line.

  • Changes file added for user-visible changes in changes/, orbit/changes/ or ee/fleetd-chrome/changes.
    See Changes files for more information.

Testing

  • Added/updated automated tests (see below)

  • QA'd all new/changed functionality manually

Local test that reproduces the issue and verifies fix. Not checked in.

// testAggressiveDeadlockReproduction creates an aggressive test scenario to reproduce
// deadlocks that occur when multiple hosts update certificate sources concurrently.
//
// This test bypasses the retry logic to actually detect deadlocks and prove the fix works.
//
// Results without fix: ~1100 deadlocks (22% rate) across 50 iterations
// Results with fix: ~160 deadlocks (3% rate) - 86% improvement
func testAggressiveDeadlockReproduction(t *testing.T, ds *Datastore) {
	ctx := context.Background()

	// Create hosts and certificates
	numHosts := 30
	hosts := make([]*fleet.Host, numHosts)
	for i := 0; i < numHosts; i++ {
		host, err := ds.NewHost(ctx, &fleet.Host{
			DetailUpdatedAt: time.Now(),
			LabelUpdatedAt:  time.Now(),
			PolicyUpdatedAt: time.Now(),
			SeenTime:        time.Now(),
			OsqueryHostID:   ptr.String(fmt.Sprintf("deadlock-test-host-%d", i)),
			NodeKey:         ptr.String(fmt.Sprintf("deadlock-test-host-%d-key", i)),
			UUID:            fmt.Sprintf("deadlock-test-host-%d-uuid", i),
			Hostname:        fmt.Sprintf("deadlock-host-%d", i),
		})
		require.NoError(t, err)
		hosts[i] = host
	}

	// Create certificates for each host
	certsPerHost := 15
	allCertIDs := make([][]uint, numHosts)

	for i := 0; i < numHosts; i++ {
		certs := make([]*fleet.HostCertificateRecord, certsPerHost)
		for j := 0; j < certsPerHost; j++ {
			certTemplate := x509.Certificate{
				Subject: pkix.Name{
					Country:            []string{"US"},
					CommonName:         fmt.Sprintf("host%d-cert%d.test.com", i, j),
					Organization:       []string{"Test Org"},
					OrganizationalUnit: []string{"Engineering"},
				},
				Issuer: pkix.Name{
					Country:      []string{"US"},
					CommonName:   "issuer.test.com",
					Organization: []string{"Test Issuer"},
				},
				SerialNumber:          big.NewInt(int64(i*1000 + j)),
				KeyUsage:              x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature,
				ExtKeyUsage:           []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth},
				SignatureAlgorithm:    x509.SHA256WithRSA,
				NotBefore:             time.Now().Add(-time.Hour).Truncate(time.Second).UTC(),
				NotAfter:              time.Now().Add(24 * time.Hour).Truncate(time.Second).UTC(),
				BasicConstraintsValid: true,
			}

			cert := generateTestHostCertificateRecord(t, hosts[i].ID, &certTemplate)
			cert.Source = fleet.SystemHostCertificate
			certs[j] = cert
		}

		// Insert certificates
		err := ds.UpdateHostCertificates(ctx, hosts[i].ID, hosts[i].UUID, certs)
		require.NoError(t, err)

		// Load certificate IDs
		loadedCerts, _, err := ds.ListHostCertificates(ctx, hosts[i].ID, fleet.ListOptions{})
		require.NoError(t, err)
		require.Len(t, loadedCerts, certsPerHost)

		certIDs := make([]uint, certsPerHost)
		for j, cert := range loadedCerts {
			certIDs[j] = cert.ID
		}
		allCertIDs[i] = certIDs
	}

	// Run aggressive deadlock test
	totalDeadlocks := 0
	iterations := 50

	for iter := 0; iter < iterations; iter++ {
		t.Logf("Iteration %d/%d", iter+1, iterations)

		type result struct {
			txIdx int
			err   error
		}

		numTransactions := 100 // Many concurrent transactions
		resultsCh := make(chan result, numTransactions)

		// Launch concurrent transactions
		for txIdx := 0; txIdx < numTransactions; txIdx++ {
			go func(idx int) {
				hostIdx := idx % numHosts
				certIDs := allCertIDs[hostIdx]

				// Build UNSORTED source records to trigger deadlocks
				// Reverse order for even transactions to create lock conflicts
				toReplace := make([]*fleet.HostCertificateRecord, len(certIDs))
				for i := range certIDs {
					actualIdx := i
					if idx%2 == 0 {
						actualIdx = len(certIDs) - 1 - i
					}

					toReplace[i] = &fleet.HostCertificateRecord{
						ID:       certIDs[actualIdx],
						Source:   fleet.UserHostCertificate,
						Username: fmt.Sprintf("user%d", idx),
					}
				}

				// Call replaceHostCertsSourcesDB directly (no retry)
				err := ds.withTx(ctx, func(tx sqlx.ExtContext) error {
					return replaceHostCertsSourcesDB(ctx, tx, toReplace)
				})

				resultsCh <- result{txIdx: idx, err: err}
			}(txIdx)
		}

		// Collect results
		iterDeadlocks := 0
		for i := 0; i < numTransactions; i++ {
			res := <-resultsCh
			if res.err != nil {
				if strings.Contains(res.err.Error(), "Deadlock") || strings.Contains(res.err.Error(), "deadlock") {
					iterDeadlocks++
				} else {
					require.NoError(t, res.err, "Transaction %d unexpected error", res.txIdx)
				}
			}
		}

		if iterDeadlocks > 0 {
			t.Logf("  Deadlocks in iteration %d: %d/%d transactions", iter+1, iterDeadlocks, numTransactions)
			totalDeadlocks += iterDeadlocks
		}
	}

	// Report results
	deadlockRate := float64(totalDeadlocks) / float64(iterations*100) * 100
	t.Logf("\n=== DEADLOCK TEST RESULTS ===")
	t.Logf("Total deadlocks: %d across %d iterations", totalDeadlocks, iterations)
	t.Logf("Deadlock rate: %.1f%%", deadlockRate)
	t.Logf("\nExpected without fix: ~1100 deadlocks (22%% rate)")
	t.Logf("Expected with fix: ~160 deadlocks (3%% rate)")

	if totalDeadlocks > 500 {
		t.Fatalf("High deadlock count suggests fix is not applied or not working")
	}
}

Summary by CodeRabbit

  • Bug Fixes
    • Resolved rare database deadlocks when multiple hosts update certificates simultaneously, improving reliability of host vitals updates.
    • Reduced unnecessary delete operations during certificate updates to lower lock contention and improve stability under load.
    • Standardized processing of certificate sources to ensure consistent behavior across concurrent updates.
    • Overall improvements result in smoother certificate synchronization without user-facing changes.

@getvictor
Copy link
Member Author

@coderabbitai full review

@coderabbitai
Copy link
Contributor

coderabbitai bot commented Oct 10, 2025

✅ Actions performed

Full review triggered.

@coderabbitai
Copy link
Contributor

coderabbitai bot commented Oct 10, 2025

Walkthrough

Addresses MySQL deadlocks when concurrently updating host certificate sources by enforcing deterministic processing and conditional deletions: sorts replacement sources to standardize lock order, deduplicates IDs for deletion, checks for existing rows before deleting, and performs insertions after deletion logic. No public APIs changed.

Changes

Cohort / File(s) Summary
Deadlock mitigation in host certificate sources
server/datastore/mysql/host_certificates.go
- Sorts toReplaceSources by host_certificate_id, source, username to standardize lock order
- Deduplicates host_certificate_id list before deletion
- Adds pre-delete existence check to avoid unnecessary gap locks
- Conditionally deletes only when rows exist
- Inserts new sources after deletion handling

Sequence Diagram(s)

sequenceDiagram
  autonumber
  participant S as Service
  participant DS as Datastore
  participant SQL as MySQL

  S->>DS: Update host certificate sources
  DS->>DS: Build toReplaceSources (inputs)
  DS->>DS: Sort by (host_certificate_id, source, username)
  DS->>DS: Deduplicate certificate IDs for deletion
  DS->>SQL: SELECT EXISTS host_certificate_sources for IDs
  alt Rows exist
    DS->>SQL: DELETE FROM host_certificate_sources WHERE host_certificate_id IN (...)
  else No rows
    DS->>DS: Skip deletion
  end
  DS->>SQL: INSERT INTO host_certificate_sources (new rows)
  DS-->>S: Return
Loading

Estimated code review effort

🎯 3 (Moderate) | ⏱️ ~20 minutes

Possibly related PRs

Suggested reviewers

  • sgress454
  • dantecatalfamo
  • JordanMontgomery

Pre-merge checks and finishing touches

❌ Failed checks (2 warnings)
Check name Status Explanation Resolution
Docstring Coverage ⚠️ Warning Docstring coverage is 0.00% which is insufficient. The required threshold is 80.00%. You can run @coderabbitai generate docstrings to improve docstring coverage.
Description Check ⚠️ Warning The pull request description correctly references the related issue and includes a changes-file checklist, but it omits required sections of the repository template such as the unreleased bug fix confirmation options, explicit automated test updates, and the database migration guidance, and it does not mark non-applicable sections as removed. Please update the description to include the “For unreleased bug fixes” section with a selected confirmation checkbox, indicate whether automated tests were added or explain why they are not applicable, and add or explicitly remove the database migrations checklist as required by the template.
✅ Passed checks (3 passed)
Check name Status Explanation
Title Check ✅ Passed The title accurately conveys the primary purpose of the changeset by highlighting the fix for MySQL deadlocks during concurrent certificate updates, which directly reflects the modifications made to enforce deterministic lock ordering and avoid deadlocks.
Linked Issues Check ✅ Passed The code changes enforce deterministic lock ordering by sorting and deduplicating host_certificate_sources and add conditional deletion logic, which directly addresses the deadlock scenario described in issue #34116 by reducing conflicting locks under concurrent operations.
Out of Scope Changes Check ✅ Passed All modifications are confined to the MySQL host_certificates.go logic for certificate source handling and directly relate to mitigating deadlocks, with no unrelated or extraneous code changes detected.
✨ Finishing touches
  • 📝 Generate docstrings
🧪 Generate unit tests (beta)
  • Create PR with unit tests
  • Post copyable unit tests in a comment
  • Commit unit tests in branch victor/34116-cert-source-deadlocks

Thanks for using CodeRabbit! It's free for OSS, and your support helps us grow. If you like it, consider giving us a shout-out.

❤️ Share

Comment @coderabbitai help to get the list of available commands and usage tips.

@getvictor getvictor marked this pull request as ready for review October 10, 2025 21:26
@getvictor getvictor requested a review from a team as a code owner October 10, 2025 21:26
@codecov
Copy link

codecov bot commented Oct 10, 2025

Codecov Report

❌ Patch coverage is 69.69697% with 10 lines in your changes missing coverage. Please review.
✅ Project coverage is 64.23%. Comparing base (8973500) to head (dd7c88e).
⚠️ Report is 17 commits behind head on main.

Files with missing lines Patch % Lines
server/datastore/mysql/host_certificates.go 69.69% 7 Missing and 3 partials ⚠️
Additional details and impacted files
@@           Coverage Diff           @@
##             main   #34119   +/-   ##
=======================================
  Coverage   64.23%   64.23%           
=======================================
  Files        2058     2058           
  Lines      206902   206927   +25     
  Branches     6899     6899           
=======================================
+ Hits       132900   132921   +21     
- Misses      63576    63578    +2     
- Partials    10426    10428    +2     
Flag Coverage Δ
backend 65.33% <69.69%> (+<0.01%) ⬆️

Flags with carried forward coverage won't be shown. Click here to find out more.

☔ View full report in Codecov by Sentry.
📢 Have feedback on the report? Share it here.

🚀 New features to boost your workflow:
  • ❄️ Test Analytics: Detect flaky tests, report on failures, and find test suite problems.
  • 📦 JS Bundle Analysis: Save yourself from yourself by tracking and limiting bundle sizes in JS merges.

@getvictor getvictor merged commit 9cc7a02 into main Oct 13, 2025
39 checks passed
@getvictor getvictor deleted the victor/34116-cert-source-deadlocks branch October 13, 2025 14:48
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

Deadlocks inserting into host_certificate_sources

2 participants