diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index fca3415534..0641966ecf 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -1258,11 +1258,12 @@ fn print_task_blueprint_planner(details: &serde_json::Value) { but could not make it the target: {error}" ); } - BlueprintPlannerStatus::Targeted { blueprint_id, .. } => { + BlueprintPlannerStatus::Targeted { blueprint_id, report, .. } => { println!( " planned new blueprint {blueprint_id}, \ and made it the current target" ); + println!("{report}"); } } } diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-add-sled-no-disks-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-add-sled-no-disks-stdout index 4feccb11e4..1c0922f8df 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-add-sled-no-disks-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-add-sled-no-disks-stdout @@ -37,20 +37,7 @@ generated inventory collection eb0796d5-ab8a-4f7b-a884-b4aeacb8ab51 from configu > # we added has no disks. > blueprint-plan dbcbd3d6-41ff-48ae-ac0b-1becc9b2fd21 eb0796d5-ab8a-4f7b-a884-b4aeacb8ab51 INFO skipping noop image source check for all sleds, reason: no target release is currently set -INFO skipping sled (no zpools in service), sled_id: 00320471-945d-413c-85e7-03e091a70b3c -INFO sufficient BoundaryNtp zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient Clickhouse zones exist in plan, desired_count: 1, current_count: 1 -INFO sufficient ClickhouseKeeper zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient ClickhouseServer zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient CockroachDb zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient CruciblePantry zones exist in plan, desired_count: 3, current_count: 3 -INFO sufficient InternalDns zones exist in plan, desired_count: 3, current_count: 3 -INFO sufficient ExternalDns zones exist in plan, desired_count: 3, current_count: 3 -INFO sufficient Nexus zones exist in plan, desired_count: 3, current_count: 3 -INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0 WARN cannot issue more SP updates (no current artifacts) -INFO all zones up-to-date -INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify generated blueprint 8da82a8e-bf97-4fbd-8ddd-9f6462732cf1 based on parent blueprint dbcbd3d6-41ff-48ae-ac0b-1becc9b2fd21 > blueprint-show 8da82a8e-bf97-4fbd-8ddd-9f6462732cf1 diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout index 4b2c4f5e16..ef82f4056a 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout @@ -526,20 +526,7 @@ T ENA ID PARENT > blueprint-plan ade5749d-bdf3-4fab-a8ae-00bea01b3a5a INFO skipping noop image source check for all sleds, reason: no target release is currently set -INFO found sled missing NTP zone (will add one), sled_id: 89d02b1b-478c-401a-8e28-7a26f74fa41b -INFO sufficient BoundaryNtp zones exist in plan, desired_count: 0, current_count: 0 -WARN failed to place all new desired Clickhouse zones, placed: 0, wanted_to_place: 1 -INFO sufficient ClickhouseKeeper zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient ClickhouseServer zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient CockroachDb zones exist in 
plan, desired_count: 0, current_count: 0 -WARN failed to place all new desired CruciblePantry zones, placed: 0, wanted_to_place: 3 -WARN failed to place all new desired InternalDns zones, placed: 0, wanted_to_place: 3 -INFO sufficient ExternalDns zones exist in plan, desired_count: 0, current_count: 0 -WARN failed to place all new desired Nexus zones, placed: 0, wanted_to_place: 3 -INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0 WARN cannot issue more SP updates (no current artifacts) -INFO some zones not yet up-to-date, sled_id: 89d02b1b-478c-401a-8e28-7a26f74fa41b, zones_currently_updating: [ZoneCurrentlyUpdating { zone_id: b3c9c041-d2f0-4767-bdaf-0e52e9d7a013 (service), zone_kind: InternalNtp, reason: MissingInInventory { bp_image_source: InstallDataset } }] -INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify generated blueprint 86db3308-f817-4626-8838-4085949a6a41 based on parent blueprint ade5749d-bdf3-4fab-a8ae-00bea01b3a5a > blueprint-list @@ -1018,19 +1005,7 @@ parent: 02697f74-b14a-4418-90f0-c28b2a3a6aa9 > # sled to be expunged. > blueprint-plan latest INFO skipping noop image source check for all sleds, reason: no target release is currently set -INFO sufficient BoundaryNtp zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient Clickhouse zones exist in plan, desired_count: 1, current_count: 1 -INFO sufficient ClickhouseKeeper zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient ClickhouseServer zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient CockroachDb zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient CruciblePantry zones exist in plan, desired_count: 3, current_count: 3 -INFO sufficient InternalDns zones exist in plan, desired_count: 3, current_count: 3 -INFO sufficient ExternalDns zones exist in plan, desired_count: 3, current_count: 3 -INFO sufficient Nexus zones exist in plan, desired_count: 3, current_count: 3 -INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0 WARN cannot issue more SP updates (no current artifacts) -INFO all zones up-to-date -INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify generated blueprint 86db3308-f817-4626-8838-4085949a6a41 based on parent blueprint ade5749d-bdf3-4fab-a8ae-00bea01b3a5a > blueprint-diff ade5749d-bdf3-4fab-a8ae-00bea01b3a5a latest diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-external-dns-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-external-dns-stdout index a1876121be..6caf1afc88 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-external-dns-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-external-dns-stdout @@ -1026,19 +1026,7 @@ parent: 3f00b694-1b16-4aaa-8f78-e6b3a527b434 > # blueprint-plan will place a new external DNS zone, diff DNS to see the new zone has `ns` and NS records. 
> blueprint-plan 366b0b68-d80e-4bc1-abd3-dc69837847e0 INFO skipping noop image source check for all sleds, reason: no target release is currently set -INFO sufficient BoundaryNtp zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient Clickhouse zones exist in plan, desired_count: 1, current_count: 1 -INFO sufficient ClickhouseKeeper zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient ClickhouseServer zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient CockroachDb zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient CruciblePantry zones exist in plan, desired_count: 3, current_count: 3 -INFO sufficient InternalDns zones exist in plan, desired_count: 3, current_count: 3 -INFO added zone to sled, sled_id: 711ac7f8-d19e-4572-bdb9-e9b50f6e362a, kind: ExternalDns -INFO sufficient Nexus zones exist in plan, desired_count: 3, current_count: 3 -INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0 WARN cannot issue more SP updates (no current artifacts) -INFO some zones not yet up-to-date, sled_id: 711ac7f8-d19e-4572-bdb9-e9b50f6e362a, zones_currently_updating: [ZoneCurrentlyUpdating { zone_id: fe2d5287-24e3-4071-b214-2640b097a759 (service), zone_kind: ExternalDns, reason: MissingInInventory { bp_image_source: InstallDataset } }] -INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify generated blueprint 9c998c1d-1a7b-440a-ae0c-40f781dea6e2 based on parent blueprint 366b0b68-d80e-4bc1-abd3-dc69837847e0 > blueprint-diff 366b0b68-d80e-4bc1-abd3-dc69837847e0 9c998c1d-1a7b-440a-ae0c-40f781dea6e2 diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-internal-dns-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-internal-dns-stdout index ade6db2196..d391a5a657 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-internal-dns-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-internal-dns-stdout @@ -1043,19 +1043,7 @@ external DNS: > # Planning a new blueprint will now replace the expunged zone, with new records for its replacement. 
> blueprint-plan 58d5e830-0884-47d8-a7cd-b2b3751adeb4 INFO skipping noop image source check for all sleds, reason: no target release is currently set -INFO sufficient BoundaryNtp zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient Clickhouse zones exist in plan, desired_count: 1, current_count: 1 -INFO sufficient ClickhouseKeeper zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient ClickhouseServer zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient CockroachDb zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient CruciblePantry zones exist in plan, desired_count: 3, current_count: 3 -INFO added zone to sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, kind: InternalDns -INFO sufficient ExternalDns zones exist in plan, desired_count: 3, current_count: 3 -INFO sufficient Nexus zones exist in plan, desired_count: 3, current_count: 3 -INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0 WARN cannot issue more SP updates (no current artifacts) -INFO some zones not yet up-to-date, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, zones_currently_updating: [ZoneCurrentlyUpdating { zone_id: e375dd21-320b-43b7-bc92-a2c3dac9d9e1 (service), zone_kind: InternalDns, reason: MissingInInventory { bp_image_source: InstallDataset } }] -INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify generated blueprint af934083-59b5-4bf6-8966-6fb5292c29e1 based on parent blueprint 58d5e830-0884-47d8-a7cd-b2b3751adeb4 > blueprint-diff 58d5e830-0884-47d8-a7cd-b2b3751adeb4 af934083-59b5-4bf6-8966-6fb5292c29e1 diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-noop-image-source-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-noop-image-source-stdout index 95a4c206dd..dbcfe2d572 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-noop-image-source-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-noop-image-source-stdout @@ -165,22 +165,8 @@ INFO install dataset artifact hash not found in TUF repo, ignoring for noop chec INFO install dataset artifact hash not found in TUF repo, ignoring for noop checks, sled_id: b82ede02-399c-48c6-a1de-411df4fa49a7, zone_id: ecbe0b3d-1acc-44b2-b6d4-f4d2770516e4, kind: crucible, file_name: crucible.tar.gz, expected_hash: 866f6a7c2e51c056fb722b5113e80181cc9cd8b712a0d3dbf1edc4ce29e5229e INFO skipped noop image source check on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, reason: remove_mupdate_override is set in the blueprint (ffffffff-ffff-ffff-ffff-ffffffffffff) INFO skipped noop image source check on sled, sled_id: e96e226f-4ed9-4c01-91b9-69a9cd076c9e, reason: sled not found in inventory -INFO noop converting 6/6 install-dataset zones to artifact store, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 6, num_already_artifact: 0 -INFO noop converting 5/6 install-dataset zones to artifact store, sled_id: aff6c093-197d-42c5-ad80-9f10ba051a34, num_total: 6, num_already_artifact: 0 -INFO parent blueprint contains NTP zone, but it's not in inventory yet, sled_id: e96e226f-4ed9-4c01-91b9-69a9cd076c9e -INFO sufficient BoundaryNtp zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient Clickhouse zones exist in plan, desired_count: 1, current_count: 1 -INFO sufficient ClickhouseKeeper zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient ClickhouseServer zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient CockroachDb zones exist in plan, 
desired_count: 0, current_count: 0 -INFO sufficient CruciblePantry zones exist in plan, desired_count: 3, current_count: 3 -INFO sufficient InternalDns zones exist in plan, desired_count: 3, current_count: 3 -INFO sufficient ExternalDns zones exist in plan, desired_count: 3, current_count: 3 -INFO sufficient Nexus zones exist in plan, desired_count: 3, current_count: 3 -INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0 INFO configuring SP update, artifact_version: 1.0.0, artifact_hash: 7e6667e646ad001b54c8365a3d309c03f89c59102723d38d01697ee8079fe670, expected_inactive_version: NoValidVersion, expected_active_version: 0.0.1, component: sp, sp_slot: 0, sp_type: Sled, serial_number: serial0, part_number: model0 INFO reached maximum number of pending SP updates, max: 1 -INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify generated blueprint 58d5e830-0884-47d8-a7cd-b2b3751adeb4 based on parent blueprint 8da82a8e-bf97-4fbd-8ddd-9f6462732cf1 @@ -537,20 +523,8 @@ INFO install dataset artifact hash not found in TUF repo, ignoring for noop chec INFO install dataset artifact hash not found in TUF repo, ignoring for noop checks, sled_id: b82ede02-399c-48c6-a1de-411df4fa49a7, zone_id: ecbe0b3d-1acc-44b2-b6d4-f4d2770516e4, kind: crucible, file_name: crucible.tar.gz, expected_hash: 866f6a7c2e51c056fb722b5113e80181cc9cd8b712a0d3dbf1edc4ce29e5229e INFO skipped noop image source check on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, reason: remove_mupdate_override is set in the blueprint (ffffffff-ffff-ffff-ffff-ffffffffffff) INFO performed noop image source checks on sled, sled_id: e96e226f-4ed9-4c01-91b9-69a9cd076c9e, num_total: 2, num_already_artifact: 0, num_eligible: 2, num_ineligible: 0 -INFO noop converting 2/2 install-dataset zones to artifact store, sled_id: e96e226f-4ed9-4c01-91b9-69a9cd076c9e, num_total: 2, num_already_artifact: 0 -INFO sufficient BoundaryNtp zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient Clickhouse zones exist in plan, desired_count: 1, current_count: 1 -INFO sufficient ClickhouseKeeper zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient ClickhouseServer zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient CockroachDb zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient CruciblePantry zones exist in plan, desired_count: 3, current_count: 3 -INFO sufficient InternalDns zones exist in plan, desired_count: 3, current_count: 3 -INFO sufficient ExternalDns zones exist in plan, desired_count: 3, current_count: 3 -INFO sufficient Nexus zones exist in plan, desired_count: 3, current_count: 3 -INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0 INFO SP update not yet completed (will keep it), artifact_version: 1.0.0, artifact_hash: 7e6667e646ad001b54c8365a3d309c03f89c59102723d38d01697ee8079fe670, expected_inactive_version: NoValidVersion, expected_active_version: 0.0.1, component: sp, sp_slot: 0, sp_type: Sled, serial_number: serial0, part_number: model0 INFO reached maximum number of pending SP updates, max: 1 -INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify generated blueprint af934083-59b5-4bf6-8966-6fb5292c29e1 based on parent blueprint 58d5e830-0884-47d8-a7cd-b2b3751adeb4 diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout index 
a994234a81..0f8163333e 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout @@ -195,19 +195,8 @@ f45ba181-4b56-42cc-a762-874d90184a43 0 INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 9, num_already_artifact: 0, num_eligible: 0, num_ineligible: 9 INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 -INFO sufficient BoundaryNtp zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient Clickhouse zones exist in plan, desired_count: 1, current_count: 1 -INFO sufficient ClickhouseKeeper zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient ClickhouseServer zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient CockroachDb zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient CruciblePantry zones exist in plan, desired_count: 0, current_count: 3 -INFO sufficient InternalDns zones exist in plan, desired_count: 3, current_count: 3 -INFO sufficient ExternalDns zones exist in plan, desired_count: 3, current_count: 3 -INFO sufficient Nexus zones exist in plan, desired_count: 3, current_count: 3 -INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0 INFO configuring SP update, artifact_version: 1.0.0, artifact_hash: 7e6667e646ad001b54c8365a3d309c03f89c59102723d38d01697ee8079fe670, expected_inactive_version: NoValidVersion, expected_active_version: 0.0.1, component: sp, sp_slot: 0, sp_type: Sled, serial_number: serial0, part_number: model0 INFO reached maximum number of pending SP updates, max: 1 -INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify generated blueprint 8da82a8e-bf97-4fbd-8ddd-9f6462732cf1 based on parent blueprint dbcbd3d6-41ff-48ae-ac0b-1becc9b2fd21 > blueprint-diff dbcbd3d6-41ff-48ae-ac0b-1becc9b2fd21 8da82a8e-bf97-4fbd-8ddd-9f6462732cf1 @@ -380,19 +369,8 @@ external DNS: INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 9, num_already_artifact: 0, num_eligible: 0, num_ineligible: 9 INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 -INFO sufficient BoundaryNtp zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient Clickhouse zones exist in plan, desired_count: 1, current_count: 1 -INFO sufficient ClickhouseKeeper zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient ClickhouseServer zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient CockroachDb zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient CruciblePantry zones exist in plan, desired_count: 0, current_count: 3 -INFO sufficient InternalDns zones exist in plan, desired_count: 3, current_count: 3 -INFO sufficient ExternalDns zones exist in plan, desired_count: 3, current_count: 3 -INFO sufficient Nexus zones exist in plan, 
desired_count: 3, current_count: 3 -INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0 INFO SP update not yet completed (will keep it), artifact_version: 1.0.0, artifact_hash: 7e6667e646ad001b54c8365a3d309c03f89c59102723d38d01697ee8079fe670, expected_inactive_version: NoValidVersion, expected_active_version: 0.0.1, component: sp, sp_slot: 0, sp_type: Sled, serial_number: serial0, part_number: model0 INFO reached maximum number of pending SP updates, max: 1 -INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify generated blueprint 58d5e830-0884-47d8-a7cd-b2b3751adeb4 based on parent blueprint 8da82a8e-bf97-4fbd-8ddd-9f6462732cf1 > blueprint-diff 8da82a8e-bf97-4fbd-8ddd-9f6462732cf1 58d5e830-0884-47d8-a7cd-b2b3751adeb4 @@ -566,21 +544,10 @@ generated inventory collection eb0796d5-ab8a-4f7b-a884-b4aeacb8ab51 from configu INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 9, num_already_artifact: 0, num_eligible: 0, num_ineligible: 9 INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 -INFO sufficient BoundaryNtp zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient Clickhouse zones exist in plan, desired_count: 1, current_count: 1 -INFO sufficient ClickhouseKeeper zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient ClickhouseServer zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient CockroachDb zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient CruciblePantry zones exist in plan, desired_count: 0, current_count: 3 -INFO sufficient InternalDns zones exist in plan, desired_count: 3, current_count: 3 -INFO sufficient ExternalDns zones exist in plan, desired_count: 3, current_count: 3 -INFO sufficient Nexus zones exist in plan, desired_count: 3, current_count: 3 -INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0 INFO SP update completed (will remove it and re-evaluate board), artifact_version: 1.0.0, artifact_hash: 7e6667e646ad001b54c8365a3d309c03f89c59102723d38d01697ee8079fe670, expected_inactive_version: NoValidVersion, expected_active_version: 0.0.1, component: sp, sp_slot: 0, sp_type: Sled, serial_number: serial0, part_number: model0 INFO skipping board for SP update, serial_number: serial0, part_number: model0 INFO configuring SP update, artifact_version: 1.0.0, artifact_hash: 7e6667e646ad001b54c8365a3d309c03f89c59102723d38d01697ee8079fe670, expected_inactive_version: NoValidVersion, expected_active_version: 0.0.1, component: sp, sp_slot: 1, sp_type: Sled, serial_number: serial1, part_number: model1 INFO reached maximum number of pending SP updates, max: 1 -INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify generated blueprint af934083-59b5-4bf6-8966-6fb5292c29e1 based on parent blueprint 58d5e830-0884-47d8-a7cd-b2b3751adeb4 > blueprint-diff 58d5e830-0884-47d8-a7cd-b2b3751adeb4 af934083-59b5-4bf6-8966-6fb5292c29e1 @@ -762,20 +729,9 @@ generated inventory collection 61f451b3-2121-4ed6-91c7-a550054f6c21 from configu INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, 
num_total: 9, num_already_artifact: 0, num_eligible: 0, num_ineligible: 9 INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 -INFO sufficient BoundaryNtp zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient Clickhouse zones exist in plan, desired_count: 1, current_count: 1 -INFO sufficient ClickhouseKeeper zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient ClickhouseServer zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient CockroachDb zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient CruciblePantry zones exist in plan, desired_count: 0, current_count: 3 -INFO sufficient InternalDns zones exist in plan, desired_count: 3, current_count: 3 -INFO sufficient ExternalDns zones exist in plan, desired_count: 3, current_count: 3 -INFO sufficient Nexus zones exist in plan, desired_count: 3, current_count: 3 -INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0 INFO SP update impossible (will remove it and re-evaluate board), artifact_version: 1.0.0, artifact_hash: 7e6667e646ad001b54c8365a3d309c03f89c59102723d38d01697ee8079fe670, expected_inactive_version: NoValidVersion, expected_active_version: 0.0.1, component: sp, sp_slot: 1, sp_type: Sled, serial_number: serial1, part_number: model1 INFO configuring SP update, artifact_version: 1.0.0, artifact_hash: 7e6667e646ad001b54c8365a3d309c03f89c59102723d38d01697ee8079fe670, expected_inactive_version: Version(ArtifactVersion("0.5.0")), expected_active_version: 0.0.1, component: sp, sp_slot: 1, sp_type: Sled, serial_number: serial1, part_number: model1 INFO reached maximum number of pending SP updates, max: 1 -INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify generated blueprint df06bb57-ad42-4431-9206-abff322896c7 based on parent blueprint af934083-59b5-4bf6-8966-6fb5292c29e1 > blueprint-diff af934083-59b5-4bf6-8966-6fb5292c29e1 df06bb57-ad42-4431-9206-abff322896c7 @@ -956,22 +912,11 @@ generated inventory collection b1bda47d-2c19-4fba-96e3-d9df28db7436 from configu INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 9, num_already_artifact: 0, num_eligible: 0, num_ineligible: 9 INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 -INFO sufficient BoundaryNtp zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient Clickhouse zones exist in plan, desired_count: 1, current_count: 1 -INFO sufficient ClickhouseKeeper zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient ClickhouseServer zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient CockroachDb zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient CruciblePantry zones exist in plan, desired_count: 0, current_count: 3 -INFO sufficient InternalDns zones exist in plan, desired_count: 3, current_count: 3 -INFO sufficient ExternalDns zones exist in plan, 
desired_count: 3, current_count: 3 -INFO sufficient Nexus zones exist in plan, desired_count: 3, current_count: 3 -INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0 INFO SP update completed (will remove it and re-evaluate board), artifact_version: 1.0.0, artifact_hash: 7e6667e646ad001b54c8365a3d309c03f89c59102723d38d01697ee8079fe670, expected_inactive_version: Version(ArtifactVersion("0.5.0")), expected_active_version: 0.0.1, component: sp, sp_slot: 1, sp_type: Sled, serial_number: serial1, part_number: model1 INFO skipping board for SP update, serial_number: serial1, part_number: model1 INFO skipping board for SP update, serial_number: serial0, part_number: model0 INFO configuring SP update, artifact_version: 1.0.0, artifact_hash: 7e6667e646ad001b54c8365a3d309c03f89c59102723d38d01697ee8079fe670, expected_inactive_version: NoValidVersion, expected_active_version: 0.0.1, component: sp, sp_slot: 2, sp_type: Sled, serial_number: serial2, part_number: model2 INFO ran out of boards for SP update -INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify generated blueprint 7f976e0d-d2a5-4eeb-9e82-c82bc2824aba based on parent blueprint df06bb57-ad42-4431-9206-abff322896c7 > blueprint-diff df06bb57-ad42-4431-9206-abff322896c7 7f976e0d-d2a5-4eeb-9e82-c82bc2824aba @@ -1152,23 +1097,11 @@ generated inventory collection a71f7a73-35a6-45e8-acbe-f1c5925eed69 from configu INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 9, num_already_artifact: 0, num_eligible: 0, num_ineligible: 9 INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 -INFO sufficient BoundaryNtp zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient Clickhouse zones exist in plan, desired_count: 1, current_count: 1 -INFO sufficient ClickhouseKeeper zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient ClickhouseServer zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient CockroachDb zones exist in plan, desired_count: 0, current_count: 0 -INFO sufficient CruciblePantry zones exist in plan, desired_count: 0, current_count: 3 -INFO sufficient InternalDns zones exist in plan, desired_count: 3, current_count: 3 -INFO sufficient ExternalDns zones exist in plan, desired_count: 3, current_count: 3 -INFO sufficient Nexus zones exist in plan, desired_count: 3, current_count: 3 -INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0 INFO SP update completed (will remove it and re-evaluate board), artifact_version: 1.0.0, artifact_hash: 7e6667e646ad001b54c8365a3d309c03f89c59102723d38d01697ee8079fe670, expected_inactive_version: NoValidVersion, expected_active_version: 0.0.1, component: sp, sp_slot: 2, sp_type: Sled, serial_number: serial2, part_number: model2 INFO skipping board for SP update, serial_number: serial2, part_number: model2 INFO skipping board for SP update, serial_number: serial0, part_number: model0 INFO skipping board for SP update, serial_number: serial1, part_number: model1 INFO ran out of boards for SP update -INFO updating zone image source in-place, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, zone_id: 
353b3b65-20f7-48c3-88f7-495bd5d31545, kind: Clickhouse, image_source: artifact: version 1.0.0 -INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify generated blueprint 9034c710-3e57-45f3-99e5-4316145e87ac based on parent blueprint 7f976e0d-d2a5-4eeb-9e82-c82bc2824aba > blueprint-diff 7f976e0d-d2a5-4eeb-9e82-c82bc2824aba 9034c710-3e57-45f3-99e5-4316145e87ac diff --git a/nexus/reconfigurator/planning/src/lib.rs b/nexus/reconfigurator/planning/src/lib.rs index a11c5e4132..bdb02c88db 100644 --- a/nexus/reconfigurator/planning/src/lib.rs +++ b/nexus/reconfigurator/planning/src/lib.rs @@ -11,4 +11,5 @@ pub mod blueprint_editor; pub mod example; pub mod mgs_updates; pub mod planner; +pub mod reports; pub mod system; diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index 9b13b6323b..027347308e 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -18,6 +18,7 @@ use crate::planner::image_source::NoopConvertInfo; use crate::planner::image_source::NoopConvertSledStatus; use crate::planner::image_source::NoopConvertZoneStatus; use crate::planner::omicron_zone_placement::PlacementError; +use crate::reports::InterimPlanningReport; use gateway_client::types::SpType; use nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryResult; use nexus_sled_agent_shared::inventory::OmicronZoneImageSource; @@ -37,6 +38,13 @@ use nexus_types::deployment::SledDetails; use nexus_types::deployment::SledFilter; use nexus_types::deployment::TufRepoContentsError; use nexus_types::deployment::ZpoolFilter; +use nexus_types::deployment::{ + CockroachdbUnsafeToShutdown, PlanningAddStepReport, + PlanningCockroachdbSettingsStepReport, PlanningDecommissionStepReport, + PlanningExpungeStepReport, PlanningMgsUpdatesStepReport, + PlanningNoopImageSourceStepReport, PlanningReport, + PlanningZoneUpdatesStepReport, ZoneUnsafeToShutdown, ZoneUpdatesWaitingOn, +}; use nexus_types::external_api::views::PhysicalDiskPolicy; use nexus_types::external_api::views::SledPolicy; use nexus_types::external_api::views::SledState; @@ -46,8 +54,6 @@ use omicron_common::policy::INTERNAL_DNS_REDUNDANCY; use omicron_uuid_kinds::OmicronZoneUuid; use omicron_uuid_kinds::PhysicalDiskUuid; use omicron_uuid_kinds::SledUuid; -use slog::debug; -use slog::error; use slog::{Logger, info, warn}; use slog_error_chain::InlineErrorChain; use std::collections::BTreeMap; @@ -90,10 +96,8 @@ pub(crate) mod rng; /// services, etc.). const NUM_CONCURRENT_MGS_UPDATES: usize = 1; -enum UpdateStepResult { - ContinueToNextStep, - Waiting, -} +/// A receipt that `check_input_validity` has been run prior to planning. 
+struct InputChecked; pub struct Planner<'a> { log: Logger, @@ -143,38 +147,57 @@ impl<'a> Planner<'a> { } pub fn plan(mut self) -> Result { - self.check_input_validity()?; - self.do_plan()?; + let checked = self.check_input_validity()?; + self.do_plan(checked)?; Ok(self.blueprint.build()) } - fn check_input_validity(&self) -> Result<(), Error> { + pub fn plan_and_report( + mut self, + ) -> Result<(Blueprint, PlanningReport), Error> { + let checked = self.check_input_validity()?; + let report = self.do_plan(checked)?; + let blueprint = self.blueprint.build(); + let report = report.finalize(blueprint.id); + Ok((blueprint, report)) + } + + fn check_input_validity(&self) -> Result { if self.input.target_internal_dns_zone_count() > INTERNAL_DNS_REDUNDANCY { return Err(Error::PolicySpecifiesTooManyInternalDnsServers); } - Ok(()) + Ok(InputChecked) } - fn do_plan(&mut self) -> Result<(), Error> { - self.do_plan_expunge()?; - self.do_plan_decommission()?; - - let noop_info = - NoopConvertInfo::new(self.input, self.inventory, &self.blueprint)?; - noop_info.log_to(&self.log); - - self.do_plan_noop_image_source(noop_info)?; - self.do_plan_add()?; - if let UpdateStepResult::ContinueToNextStep = self.do_plan_mgs_updates() - { - self.do_plan_zone_updates()?; - } - self.do_plan_cockroachdb_settings(); - Ok(()) + fn do_plan( + &mut self, + _checked: InputChecked, + ) -> Result { + // Run the planning steps, recording their step reports as we go. + let expunge = self.do_plan_expunge()?; + let decommission = self.do_plan_decommission()?; + let noop_image_source = self.do_plan_noop_image_source()?; + let mgs_updates = self.do_plan_mgs_updates(); + let add = self.do_plan_add(&mgs_updates)?; + let zone_updates = self.do_plan_zone_updates(&add, &mgs_updates)?; + let cockroachdb_settings = self.do_plan_cockroachdb_settings(); + Ok(InterimPlanningReport { + expunge, + decommission, + noop_image_source, + add, + mgs_updates, + zone_updates, + cockroachdb_settings, + }) } - fn do_plan_decommission(&mut self) -> Result<(), Error> { + fn do_plan_decommission( + &mut self, + ) -> Result { + let mut report = PlanningDecommissionStepReport::new(); + // Check for any sleds that are currently commissioned but can be // decommissioned. Our gates for decommissioning are: // @@ -209,15 +232,10 @@ impl<'a> Planner<'a> { continue; } // If the sled is already decommissioned it... why is it showing - // up when we ask for commissioned sleds? Warn, but don't try to + // up when we ask for commissioned sleds? Report, but don't try to // decommission it again. (SledPolicy::Expunged, SledState::Decommissioned) => { - error!( - self.log, - "decommissioned sled returned by \ - SledFilter::Commissioned"; - "sled_id" => %sled_id, - ); + report.zombie_sleds.push(sled_id); continue; } // The sled is expunged but not yet decommissioned; fall through @@ -257,7 +275,7 @@ impl<'a> Planner<'a> { } } - Ok(()) + Ok(report) } fn do_plan_decommission_expunged_disks_for_in_service_sled( @@ -307,17 +325,22 @@ impl<'a> Planner<'a> { self.blueprint.sled_decommission_disks(sled_id, disks_to_decommission) } - fn do_plan_expunge(&mut self) -> Result<(), Error> { - let mut commissioned_sled_ids = BTreeSet::new(); + fn do_plan_expunge(&mut self) -> Result { + let mut report = PlanningExpungeStepReport::new(); // Remove services from sleds marked expunged. We use // `SledFilter::Commissioned` and have a custom `needs_zone_expungement` // function that allows us to produce better errors. 
+ let mut commissioned_sled_ids = BTreeSet::new(); for (sled_id, sled_details) in self.input.all_sleds(SledFilter::Commissioned) { commissioned_sled_ids.insert(sled_id); - self.do_plan_expunge_for_commissioned_sled(sled_id, sled_details)?; + self.do_plan_expunge_for_commissioned_sled( + sled_id, + sled_details, + &mut report, + )?; } // Check for any decommissioned sleds (i.e., sleds for which our @@ -348,13 +371,14 @@ impl<'a> Planner<'a> { } } - Ok(()) + Ok(report) } fn do_plan_expunge_for_commissioned_sled( &mut self, sled_id: SledUuid, sled_details: &SledDetails, + report: &mut PlanningExpungeStepReport, ) -> Result<(), Error> { match sled_details.policy { SledPolicy::InService { .. } => { @@ -391,14 +415,8 @@ impl<'a> Planner<'a> { // isn't in the blueprint at all (e.g., a disk could // have been added and then expunged since our // parent blueprint was created). We don't want to - // fail in this case, but will issue a warning. - warn!( - self.log, - "planning input contained expunged disk not \ - present in parent blueprint"; - "sled_id" => %sled_id, - "disk" => ?disk, - ); + // fail in this case, but will report it. + report.orphan_disks.insert(sled_id, disk.disk_id); } Err(err) => return Err(err), } @@ -513,11 +531,17 @@ impl<'a> Planner<'a> { fn do_plan_noop_image_source( &mut self, - noop_info: NoopConvertInfo, - ) -> Result<(), Error> { + ) -> Result { + use nexus_types::deployment::PlanningNoopImageSourceSkipSledReason as SkipSledReason; + let mut report = PlanningNoopImageSourceStepReport::new(); + + let noop_info = + NoopConvertInfo::new(self.input, self.inventory, &self.blueprint)?; + noop_info.log_to(&self.log); + let sleds = match noop_info { NoopConvertInfo::GlobalEligible { sleds } => sleds, - NoopConvertInfo::GlobalIneligible { .. } => return Ok(()), + NoopConvertInfo::GlobalIneligible { .. } => return Ok(report), }; for sled in sleds { let eligible = match &sled.status { @@ -527,23 +551,19 @@ impl<'a> Planner<'a> { let zone_counts = eligible.zone_counts(); if zone_counts.num_install_dataset() == 0 { - debug!( - self.log, - "all zones are already Artifact, so \ - no noop image source action required"; - "num_total" => zone_counts.num_total, + report.skip_sled( + sled.sled_id, + SkipSledReason::AllZonesAlreadyArtifact( + zone_counts.num_total, + ), ); continue; } if zone_counts.num_eligible > 0 { - info!( - self.log, - "noop converting {}/{} install-dataset zones to artifact store", + report.converted_zones( + sled.sled_id, zone_counts.num_eligible, - zone_counts.num_install_dataset(); - "sled_id" => %sled.sled_id, - "num_total" => zone_counts.num_total, - "num_already_artifact" => zone_counts.num_already_artifact, + zone_counts.num_install_dataset(), ); } @@ -571,10 +591,15 @@ impl<'a> Planner<'a> { } } - Ok(()) + Ok(report) } - fn do_plan_add(&mut self) -> Result<(), Error> { + fn do_plan_add( + &mut self, + mgs_updates: &PlanningMgsUpdatesStepReport, + ) -> Result { + let mut report = PlanningAddStepReport::new(); + // Internal DNS is a prerequisite for bringing up all other zones. At // this point, we assume that internal DNS (as a service) is already // functioning. @@ -588,8 +613,6 @@ impl<'a> Planner<'a> { // We will not mark sleds getting Crucible zones as ineligible; other // control plane service zones starting concurrently with Crucible zones // is fine. 
- let mut sleds_waiting_for_ntp_zone = BTreeSet::new(); - for (sled_id, sled_resources) in self.input.all_sled_resources(SledFilter::InService) { @@ -636,12 +659,7 @@ impl<'a> Planner<'a> { .next() .is_none() { - info!( - self.log, - "skipping sled (no zpools in service)"; - "sled_id" => %sled_id, - ); - sleds_waiting_for_ntp_zone.insert(sled_id); + report.sleds_with_no_zpools_for_ntp_zone.insert(sled_id); continue; } @@ -651,14 +669,13 @@ impl<'a> Planner<'a> { // provision anything else. if self.blueprint.sled_ensure_zone_ntp( sled_id, - self.image_source_for_new_zone(ZoneKind::InternalNtp)?, + self.image_source_for_new_zone( + ZoneKind::InternalNtp, + mgs_updates, + )?, )? == Ensure::Added { - info!( - &self.log, - "found sled missing NTP zone (will add one)"; - "sled_id" => %sled_id - ); + report.sleds_missing_ntp_zone.insert(sled_id); self.blueprint.record_operation(Operation::AddZone { sled_id, kind: ZoneKind::InternalNtp, @@ -686,14 +703,11 @@ impl<'a> Planner<'a> { .requires_timesync() }) { - info!( - &self.log, - "sled getting NTP zone has other services already; \ - considering it eligible for discretionary zones"; - "sled_id" => %sled_id, - ); + report + .sleds_getting_ntp_and_discretionary_zones + .insert(sled_id); } else { - sleds_waiting_for_ntp_zone.insert(sled_id); + report.sleds_waiting_for_ntp_zone.insert(sled_id); continue; } } @@ -738,12 +752,7 @@ impl<'a> Planner<'a> { }) .unwrap_or(false); if !has_ntp_inventory { - info!( - &self.log, - "parent blueprint contains NTP zone, but it's not in \ - inventory yet"; - "sled_id" => %sled_id, - ); + report.sleds_waiting_for_ntp_zone.insert(sled_id); continue; } @@ -754,15 +763,15 @@ impl<'a> Planner<'a> { if self.blueprint.sled_ensure_zone_crucible( sled_id, *zpool_id, - self.image_source_for_new_zone(ZoneKind::Crucible)?, + self.image_source_for_new_zone( + ZoneKind::Crucible, + mgs_updates, + )?, )? == Ensure::Added { - info!( - &self.log, - "found sled zpool missing Crucible zone (will add one)"; - "sled_id" => ?sled_id, - "zpool_id" => ?zpool_id, - ); + report + .sleds_missing_crucible_zone + .insert((sled_id, *zpool_id)); ncrucibles_added += 1; } } @@ -782,16 +791,19 @@ impl<'a> Planner<'a> { } } - self.do_plan_add_discretionary_zones(&sleds_waiting_for_ntp_zone)?; + self.do_plan_add_discretionary_zones(mgs_updates, &mut report)?; // Now that we've added all the disks and zones we plan on adding, // ensure that all sleds have the datasets they need to have. - self.do_plan_datasets()?; + self.do_plan_datasets(&mut report)?; - Ok(()) + Ok(report) } - fn do_plan_datasets(&mut self) -> Result<(), Error> { + fn do_plan_datasets( + &mut self, + _report: &mut PlanningAddStepReport, + ) -> Result<(), Error> { for sled_id in self.input.all_sled_ids(SledFilter::InService) { if let EnsureMultiple::Changed { added, @@ -823,7 +835,8 @@ impl<'a> Planner<'a> { fn do_plan_add_discretionary_zones( &mut self, - sleds_waiting_for_ntp_zone: &BTreeSet, + mgs_updates: &PlanningMgsUpdatesStepReport, + report: &mut PlanningAddStepReport, ) -> Result<(), Error> { // We usually don't need to construct an `OmicronZonePlacement` to add // discretionary zones, so defer its creation until it's needed. 
@@ -841,7 +854,8 @@ impl<'a> Planner<'a> { DiscretionaryOmicronZone::Nexus, DiscretionaryOmicronZone::Oximeter, ] { - let num_zones_to_add = self.num_additional_zones_needed(zone_kind); + let num_zones_to_add = + self.num_additional_zones_needed(zone_kind, report); if num_zones_to_add == 0 { continue; } @@ -858,7 +872,7 @@ impl<'a> Planner<'a> { .input .all_sled_resources(SledFilter::Discretionary) .filter(|(sled_id, _)| { - !sleds_waiting_for_ntp_zone.contains(&sled_id) + !report.sleds_waiting_for_ntp_zone.contains(&sled_id) }) .map(|(sled_id, sled_resources)| { OmicronZonePlacementSledState { @@ -886,17 +900,20 @@ impl<'a> Planner<'a> { zone_placement, zone_kind, num_zones_to_add, + mgs_updates, + report, )?; } Ok(()) } - // Given the current blueprint state and policy, returns the number of - // additional zones needed of the given `zone_kind` to satisfy the policy. + /// Given the current blueprint state and policy, returns the number of + /// additional zones needed of the given `zone_kind` to satisfy the policy. fn num_additional_zones_needed( &mut self, zone_kind: DiscretionaryOmicronZone, + report: &mut PlanningAddStepReport, ) -> usize { // Count the number of `kind` zones on all in-service sleds. This // will include sleds that are in service but not eligible for new @@ -959,30 +976,31 @@ impl<'a> Planner<'a> { }; // TODO-correctness What should we do if we have _too many_ - // `zone_kind` zones? For now, just log it the number of zones any - // time we have at least the minimum number. + // `zone_kind` zones? For now, just report the number of zones + // any time we have at least the minimum number. let num_zones_to_add = target_count.saturating_sub(num_existing_kind_zones); if num_zones_to_add == 0 { - info!( - self.log, "sufficient {zone_kind:?} zones exist in plan"; - "desired_count" => target_count, - "current_count" => num_existing_kind_zones, + report.sufficient_zones_exist.insert( + ZoneKind::from(zone_kind).report_str().to_owned(), + (target_count, num_existing_kind_zones), ); } num_zones_to_add } - // Attempts to place `num_zones_to_add` new zones of `kind`. - // - // It is not an error if there are too few eligible sleds to start a - // sufficient number of zones; instead, we'll log a warning and start as - // many as we can (up to `num_zones_to_add`). + /// Attempts to place `num_zones_to_add` new zones of `kind`. + /// + /// It is not an error if there are too few eligible sleds to start a + /// sufficient number of zones; instead, we'll log a warning and start as + /// many as we can (up to `num_zones_to_add`). fn add_discretionary_zones( &mut self, zone_placement: &mut OmicronZonePlacement, kind: DiscretionaryOmicronZone, num_zones_to_add: usize, + mgs_updates: &PlanningMgsUpdatesStepReport, + report: &mut PlanningAddStepReport, ) -> Result<(), Error> { for i in 0..num_zones_to_add { let sled_id = match zone_placement.place_zone(kind) { @@ -992,18 +1010,16 @@ impl<'a> Planner<'a> { // (albeit unlikely?) we're in a weird state where we need // more sleds or disks to come online, and we may need to be // able to produce blueprints to achieve that status. 
- warn!( - self.log, - "failed to place all new desired {kind:?} zones"; - "placed" => i, - "wanted_to_place" => num_zones_to_add, + report.out_of_eligible_sleds.insert( + ZoneKind::from(kind).report_str().to_owned(), + (i, num_zones_to_add), ); - break; } }; - let image_source = self.image_source_for_new_zone(kind.into())?; + let image_source = + self.image_source_for_new_zone(kind.into(), mgs_updates)?; match kind { DiscretionaryOmicronZone::BoundaryNtp => { self.blueprint.sled_promote_internal_ntp_to_boundary_ntp( @@ -1039,11 +1055,9 @@ impl<'a> Planner<'a> { .blueprint .sled_add_zone_oximeter(sled_id, image_source)?, }; - info!( - self.log, "added zone to sled"; - "sled_id" => %sled_id, - "kind" => ?kind, - ); + report + .discretionary_zones_placed + .push((sled_id, ZoneKind::from(kind).report_str().to_owned())); } Ok(()) @@ -1051,7 +1065,7 @@ impl<'a> Planner<'a> { /// Update at most one MGS-managed device (SP, RoT, etc.), if any are out of /// date. - fn do_plan_mgs_updates(&mut self) -> UpdateStepResult { + fn do_plan_mgs_updates(&mut self) -> PlanningMgsUpdatesStepReport { // Determine which baseboards we will consider updating. // // Sleds may be present but not adopted as part of the control plane. @@ -1095,24 +1109,45 @@ impl<'a> Planner<'a> { current_artifacts, NUM_CONCURRENT_MGS_UPDATES, ); + self.blueprint.pending_mgs_updates_replace_all(next.clone()); - // TODO This is not quite right. See oxidecomputer/omicron#8285. - let rv = if next.is_empty() { - UpdateStepResult::ContinueToNextStep - } else { - UpdateStepResult::Waiting - }; - self.blueprint.pending_mgs_updates_replace_all(next); - rv + PlanningMgsUpdatesStepReport::new(next) } /// Update at most one existing zone to use a new image source. - fn do_plan_zone_updates(&mut self) -> Result<(), Error> { - // We are only interested in non-decommissioned sleds. + fn do_plan_zone_updates( + &mut self, + add: &PlanningAddStepReport, + mgs_updates: &PlanningMgsUpdatesStepReport, + ) -> Result { + let mut report = PlanningZoneUpdatesStepReport::new(); + + // Do not update any zones if we've added any discretionary zones + // (e.g., in response to policy changes) ... + if add.any_discretionary_zones_placed() { + report.waiting_on(ZoneUpdatesWaitingOn::DiscretionaryZones); + return Ok(report); + } + + // ... or if there are still pending updates for the RoT / SP / + // Host OS / etc. + if mgs_updates.any_updates_pending() { + report.waiting_on(ZoneUpdatesWaitingOn::PendingMgsUpdates); + return Ok(report); + } + + // We are only interested in non-decommissioned sleds with + // running NTP zones (TODO: check time sync). let sleds = self .input .all_sleds(SledFilter::Commissioned) - .map(|(id, _details)| id) + .filter_map(|(sled_id, _details)| { + if add.sleds_waiting_for_ntp_zone.contains(&sled_id) { + None + } else { + Some(sled_id) + } + }) .collect::>(); // Wait for zones to appear up-to-date in the inventory. @@ -1223,14 +1258,14 @@ impl<'a> Planner<'a> { "sled_id" => %sled_id, "zones_currently_updating" => ?zones_currently_updating, ); - return Ok(()); + return Ok(report); } } // Find out of date zones, as defined by zones whose image source does // not match what it should be based on our current target release. 
let target_release = self.input.tuf_repo().description(); - let mut out_of_date_zones = sleds + let out_of_date_zones = sleds .into_iter() .flat_map(|sled_id| { let log = &self.log; @@ -1258,28 +1293,27 @@ impl<'a> Planner<'a> { } }; if zone.image_source != desired_image_source { - Some((sled_id, zone, desired_image_source)) + Some((sled_id, zone.clone(), desired_image_source)) } else { None } }) }) - .peekable(); - - // Before we filter out zones that can't be updated, do we have any out - // of date zones at all? We need this to explain why we didn't update - // any zones below, if we don't. - let have_out_of_date_zones = out_of_date_zones.peek().is_some(); + .collect::>(); + report.out_of_date_zones.extend(out_of_date_zones.iter().cloned()); // Of the out-of-date zones, filter out zones that can't be updated yet, // either because they're not ready or because it wouldn't be safe to // bounce them. - let mut updateable_zones = - out_of_date_zones.filter(|(_sled_id, zone, _new_image_source)| { - if !self.can_zone_be_shut_down_safely(zone) { + let mut updateable_zones = out_of_date_zones.iter().filter( + |(_sled_id, zone, _new_image_source)| { + if !self.can_zone_be_shut_down_safely(zone, &mut report) { return false; } - match self.is_zone_ready_for_update(zone.zone_type.kind()) { + match self.is_zone_ready_for_update( + zone.zone_type.kind(), + mgs_updates, + ) { Ok(true) => true, Ok(false) => false, Err(err) => { @@ -1294,35 +1328,22 @@ impl<'a> Planner<'a> { false } } - }); + }, + ); - // Update the first out-of-date zone. if let Some((sled_id, zone, new_image_source)) = updateable_zones.next() { - // Borrow check workaround: `self.update_or_expunge_zone` needs - // `&mut self`, but `self` is borrowed in the `updateable_zones` - // iterator. Clone the one zone we want to update, then drop the - // iterator; now we can call `&mut self` methods. - let zone = zone.clone(); - std::mem::drop(updateable_zones); - - return self.update_or_expunge_zone( - sled_id, - &zone, - new_image_source, - ); - } - - if have_out_of_date_zones { - info!( - self.log, - "not all zones up-to-date, but no zones can be updated now" - ); + // Update the first out-of-date zone. + self.update_or_expunge_zone( + *sled_id, + zone, + new_image_source.clone(), + report, + ) } else { - info!(self.log, "all zones up-to-date"); + // No zones to update. 
+ Ok(report) } - - Ok(()) } /// Update a zone to use a new image source, either in-place or by @@ -1332,7 +1353,8 @@ impl<'a> Planner<'a> { sled_id: SledUuid, zone: &BlueprintZoneConfig, new_image_source: BlueprintZoneImageSource, - ) -> Result<(), Error> { + mut report: PlanningZoneUpdatesStepReport, + ) -> Result { let zone_kind = zone.zone_type.kind(); // We're called by `do_plan_zone_updates()`, which guarantees the @@ -1345,18 +1367,12 @@ impl<'a> Planner<'a> { | ZoneKind::ClickhouseKeeper | ZoneKind::ClickhouseServer | ZoneKind::CockroachDb => { - info!( - self.log, "updating zone image source in-place"; - "sled_id" => %sled_id, - "zone_id" => %zone.id, - "kind" => ?zone.zone_type.kind(), - "image_source" => %new_image_source, - ); self.blueprint.comment(format!( "updating {:?} zone {} in-place", zone.zone_type.kind(), zone.id )); + report.updated_zones.push((sled_id, zone.clone())); self.blueprint.sled_set_zone_source( sled_id, zone.id, @@ -1370,25 +1386,24 @@ impl<'a> Planner<'a> { | ZoneKind::InternalNtp | ZoneKind::Nexus | ZoneKind::Oximeter => { - info!( - self.log, "expunging out-of-date zone"; - "sled_id" => %sled_id, - "zone_id" => %zone.id, - "kind" => ?zone.zone_type.kind(), - ); self.blueprint.comment(format!( "expunge {:?} zone {} for update", zone.zone_type.kind(), zone.id )); + report.expunged_zones.push((sled_id, zone.clone())); self.blueprint.sled_expunge_zone(sled_id, zone.id)?; } } - Ok(()) + Ok(report) } - fn do_plan_cockroachdb_settings(&mut self) { + fn do_plan_cockroachdb_settings( + &mut self, + ) -> PlanningCockroachdbSettingsStepReport { + let mut report = PlanningCockroachdbSettingsStepReport::new(); + // Figure out what we should set the CockroachDB "preserve downgrade // option" setting to based on the planning input. // @@ -1466,12 +1481,8 @@ impl<'a> Planner<'a> { Err(_) => CockroachDbPreserveDowngrade::DoNotModify, }; self.blueprint.cockroachdb_preserve_downgrade(value); - info!( - &self.log, - "will ensure cockroachdb setting"; - "setting" => "cluster.preserve_downgrade_option", - "value" => ?value, - ); + report.preserve_downgrade = value; + report // Hey! Listen! // @@ -1486,12 +1497,14 @@ impl<'a> Planner<'a> { fn image_source_for_new_zone( &self, zone_kind: ZoneKind, + mgs_updates: &PlanningMgsUpdatesStepReport, ) -> Result { - let source_repo = if self.is_zone_ready_for_update(zone_kind)? { - self.input.tuf_repo().description() - } else { - self.input.old_repo().description() - }; + let source_repo = + if self.is_zone_ready_for_update(zone_kind, mgs_updates)? { + self.input.tuf_repo().description() + } else { + self.input.old_repo().description() + }; source_repo.zone_image_source(zone_kind) } @@ -1500,10 +1513,14 @@ impl<'a> Planner<'a> { fn is_zone_ready_for_update( &self, zone_kind: ZoneKind, + mgs_updates: &PlanningMgsUpdatesStepReport, ) -> Result { - // TODO-correctness: We should return false regardless of `zone_kind` if - // there are still pending updates for components earlier in the update - // ordering than zones: RoT bootloader / RoT / SP / Host OS. + // We return false regardless of `zone_kind` if there are still + // pending updates for components earlier in the update ordering + // than zones: RoT bootloader / RoT / SP / Host OS. + if mgs_updates.any_updates_pending() { + return Ok(false); + } match zone_kind { ZoneKind::Nexus => { @@ -1552,46 +1569,58 @@ impl<'a> Planner<'a> { /// because the underlying disk / sled has been expunged" case. 
In this /// case, we have no choice but to reconcile with the fact that the zone is /// now gone. - fn can_zone_be_shut_down_safely(&self, zone: &BlueprintZoneConfig) -> bool { + fn can_zone_be_shut_down_safely( + &self, + zone: &BlueprintZoneConfig, + report: &mut PlanningZoneUpdatesStepReport, + ) -> bool { match zone.zone_type.kind() { ZoneKind::CockroachDb => { - debug!(self.log, "Checking if Cockroach node can shut down"); + use CockroachdbUnsafeToShutdown::*; + use ZoneUnsafeToShutdown::*; + // We must hear from all nodes let all_statuses = &self.inventory.cockroach_status; if all_statuses.len() < COCKROACHDB_REDUNDANCY { - warn!(self.log, "Not enough nodes"); + report.unsafe_zone(zone, Cockroachdb(NotEnoughNodes)); return false; } // All nodes must report: "We have the necessary redundancy, and // have observed no underreplicated ranges". - for (node_id, status) in all_statuses { - let log = self.log.new(slog::o!( - "operation" => "Checking Cockroach node status for shutdown safety", - "node_id" => node_id.to_string() - )); + for (_node_id, status) in all_statuses { let Some(ranges_underreplicated) = status.ranges_underreplicated else { - warn!(log, "Missing underreplicated stat"); + report.unsafe_zone( + zone, + Cockroachdb(MissingUnderreplicatedStat), + ); return false; }; if ranges_underreplicated != 0 { - warn!(log, "Underreplicated ranges != 0"; "ranges_underreplicated" => ranges_underreplicated); + report.unsafe_zone( + zone, + Cockroachdb(UnderreplicatedRanges( + ranges_underreplicated, + )), + ); return false; } let Some(live_nodes) = status.liveness_live_nodes else { - warn!(log, "Missing live_nodes"); + report.unsafe_zone( + zone, + Cockroachdb(MissingLiveNodesStat), + ); return false; }; if live_nodes < COCKROACHDB_REDUNDANCY as u64 { - warn!(log, "Live nodes < COCKROACHDB_REDUNDANCY"; "live_nodes" => live_nodes); + report.unsafe_zone( + zone, + Cockroachdb(NotEnoughLiveNodes(live_nodes)), + ); return false; } - info!( - log, - "CockroachDB Node status looks ready for shutdown" - ); } true } @@ -5758,7 +5787,7 @@ pub(crate) mod test { /// If incidental planner work changes this value occasionally, /// that's fine; but if we find we're changing it all the time, /// we should probably drop it and keep just the maximum below. - const EXP_PLANNING_ITERATIONS: usize = 57; + const EXP_PLANNING_ITERATIONS: usize = 55; /// Planning must not take more than this number of iterations. 
const MAX_PLANNING_ITERATIONS: usize = 100; @@ -5769,7 +5798,7 @@ pub(crate) mod test { update_collection_from_blueprint(&mut example, &parent); let blueprint_name = format!("blueprint{i}"); - let blueprint = Planner::new_based_on( + let (blueprint, report) = Planner::new_based_on( log.clone(), &parent, &input, @@ -5778,10 +5807,15 @@ pub(crate) mod test { ) .expect("can't create planner") .with_rng(PlannerRng::from_seed((TEST_NAME, &blueprint_name))) - .plan() + .plan_and_report() .unwrap_or_else(|_| panic!("can't re-plan after {i} iterations")); + eprintln!("{report}\n"); + assert_eq!(report.blueprint_id, blueprint.id); + // TODO: more report testing + let summary = blueprint.diff_since_blueprint(&parent); + eprintln!("diff to {blueprint_name}: {}", summary.display()); if summary.total_zones_added() == 0 && summary.total_zones_removed() == 0 && summary.total_zones_modified() == 0 diff --git a/nexus/reconfigurator/planning/src/reports.rs b/nexus/reconfigurator/planning/src/reports.rs new file mode 100644 index 0000000000..e72b63f76a --- /dev/null +++ b/nexus/reconfigurator/planning/src/reports.rs @@ -0,0 +1,60 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Utilities for structured reports on planning, i.e., Blueprint generation. +//! +//! Most of the important structures (e.g., `PlanningReport`, the step reports) +//! are defined in [`nexus_types::deployment`] so that they may be shared with +//! the `blueprint_planner` background task and `omdb`. + +use nexus_types::deployment::{ + PlanningAddStepReport, PlanningCockroachdbSettingsStepReport, + PlanningDecommissionStepReport, PlanningExpungeStepReport, + PlanningMgsUpdatesStepReport, PlanningNoopImageSourceStepReport, + PlanningReport, PlanningZoneUpdatesStepReport, +}; +use omicron_uuid_kinds::BlueprintUuid; + +/// A blueprint planning report minus the blueprint ID that the +/// report is for. Returned by [`crate::planner::Planner::do_plan`] +/// when all planning steps are complete, but before the blueprint +/// has been built (and so we don't yet know its ID). +#[derive(Debug)] +pub(crate) struct InterimPlanningReport { + pub expunge: PlanningExpungeStepReport, + pub decommission: PlanningDecommissionStepReport, + pub noop_image_source: PlanningNoopImageSourceStepReport, + pub mgs_updates: PlanningMgsUpdatesStepReport, + pub add: PlanningAddStepReport, + pub zone_updates: PlanningZoneUpdatesStepReport, + pub cockroachdb_settings: PlanningCockroachdbSettingsStepReport, +} + +impl InterimPlanningReport { + /// Attach a blueprint ID to an interim planning report. 
+ pub(crate) fn finalize( + self, + blueprint_id: BlueprintUuid, + ) -> PlanningReport { + let Self { + expunge, + decommission, + noop_image_source, + mgs_updates, + add, + zone_updates, + cockroachdb_settings, + } = self; + PlanningReport { + blueprint_id, + expunge, + decommission, + noop_image_source, + mgs_updates, + add, + zone_updates, + cockroachdb_settings, + } + } +} diff --git a/nexus/src/app/background/tasks/blueprint_planner.rs b/nexus/src/app/background/tasks/blueprint_planner.rs index 66a33f6e41..7aac222814 100644 --- a/nexus/src/app/background/tasks/blueprint_planner.rs +++ b/nexus/src/app/background/tasks/blueprint_planner.rs @@ -150,7 +150,7 @@ impl BlueprintPlanner { )); } }; - let blueprint = match planner.plan() { + let (blueprint, report) = match planner.plan_and_report() { Ok(blueprint) => blueprint, Err(error) => { error!(&opctx.log, "can't plan: {error}"); @@ -241,7 +241,11 @@ impl BlueprintPlanner { // We have a new target! self.tx_blueprint.send_replace(Some(Arc::new((target, blueprint)))); - BlueprintPlannerStatus::Targeted { parent_blueprint_id, blueprint_id } + BlueprintPlannerStatus::Targeted { + parent_blueprint_id, + blueprint_id, + report, + } } } @@ -332,8 +336,10 @@ mod test { BlueprintPlannerStatus::Targeted { parent_blueprint_id, blueprint_id, + report, } if parent_blueprint_id == initial_blueprint.id - && blueprint_id != initial_blueprint.id => + && blueprint_id != initial_blueprint.id + && blueprint_id == report.blueprint_id => { blueprint_id } diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index b6e46b26a1..8aa4d95ce6 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -67,6 +67,7 @@ mod clickhouse; pub mod execution; mod network_resources; mod planning_input; +mod planning_report; mod zone_type; use crate::inventory::BaseboardId; @@ -109,6 +110,19 @@ pub use planning_input::TargetReleaseDescription; pub use planning_input::TufRepoContentsError; pub use planning_input::TufRepoPolicy; pub use planning_input::ZpoolFilter; +pub use planning_report::CockroachdbUnsafeToShutdown; +pub use planning_report::PlanningAddStepReport; +pub use planning_report::PlanningCockroachdbSettingsStepReport; +pub use planning_report::PlanningDecommissionStepReport; +pub use planning_report::PlanningExpungeStepReport; +pub use planning_report::PlanningMgsUpdatesStepReport; +pub use planning_report::PlanningNoopImageSourceSkipSledReason; +pub use planning_report::PlanningNoopImageSourceSkipZoneReason; +pub use planning_report::PlanningNoopImageSourceStepReport; +pub use planning_report::PlanningReport; +pub use planning_report::PlanningZoneUpdatesStepReport; +pub use planning_report::ZoneUnsafeToShutdown; +pub use planning_report::ZoneUpdatesWaitingOn; use std::sync::Arc; pub use zone_type::BlueprintZoneType; pub use zone_type::DurableDataset; diff --git a/nexus/types/src/deployment/planning_report.rs b/nexus/types/src/deployment/planning_report.rs new file mode 100644 index 0000000000..c38af74f4d --- /dev/null +++ b/nexus/types/src/deployment/planning_report.rs @@ -0,0 +1,679 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Types representing a report on a planning run that produced a blueprint. 
+ +use super::ArtifactHash; +use super::BlueprintZoneConfig; +use super::BlueprintZoneImageSource; +use super::CockroachDbPreserveDowngrade; +use super::PendingMgsUpdates; + +use omicron_common::policy::COCKROACHDB_REDUNDANCY; +use omicron_uuid_kinds::BlueprintUuid; +use omicron_uuid_kinds::MupdateOverrideUuid; +use omicron_uuid_kinds::OmicronZoneUuid; +use omicron_uuid_kinds::PhysicalDiskUuid; +use omicron_uuid_kinds::SledUuid; +use omicron_uuid_kinds::ZpoolUuid; +use serde::Deserialize; +use serde::Serialize; + +use std::collections::BTreeMap; +use std::collections::BTreeSet; +use std::fmt; + +/// A full blueprint planning report. Other than the blueprint ID, each +/// field corresponds to a step in the update planner, i.e., a subroutine +/// of `omicron_nexus::reconfigurator::planning::Planner::do_plan`. +/// +/// The intent of a planning report is to capture information useful to an +/// operator or developer about the planning process itself, especially if +/// it has become "stuck" (unable to proceed with an update). It is *not* a +/// summary of the plan (blueprint), but rather a description of non-fatal +/// conditions the planner is waiting on, unexpected or invalid +/// configurations encountered during planning, etc. The planner may make +/// internal decisions based on the step reports; the intent is that an +/// operator may make administrative decisions based on the full report. +/// +/// Only successful planning runs are currently covered by this report. +/// Failures to plan (i.e., to generate a valid blueprint) are represented +/// by `nexus-reconfigurator-planning::blueprint_builder::Error`. +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] +#[must_use = "an unread report is not actionable"] +pub struct PlanningReport { + /// The blueprint produced by the planning run this report describes. + pub blueprint_id: BlueprintUuid, + + // Step reports. + pub expunge: PlanningExpungeStepReport, + pub decommission: PlanningDecommissionStepReport, + pub noop_image_source: PlanningNoopImageSourceStepReport, + pub mgs_updates: PlanningMgsUpdatesStepReport, + pub add: PlanningAddStepReport, + pub zone_updates: PlanningZoneUpdatesStepReport, + pub cockroachdb_settings: PlanningCockroachdbSettingsStepReport, +} + +impl fmt::Display for PlanningReport { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let Self { + blueprint_id, + expunge, + decommission, + noop_image_source, + mgs_updates, + add, + zone_updates, + cockroachdb_settings, + } = self; + writeln!(f, "Report on planning run for blueprint {blueprint_id}:")?; + expunge.fmt(f)?; + decommission.fmt(f)?; + noop_image_source.fmt(f)?; + mgs_updates.fmt(f)?; + add.fmt(f)?; + zone_updates.fmt(f)?; + cockroachdb_settings.fmt(f)?; + Ok(()) + } +} + +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] +pub struct PlanningExpungeStepReport { + /// Expunged disks not present in the parent blueprint. 
+    pub orphan_disks: BTreeMap<SledUuid, PhysicalDiskUuid>,
+}
+
+impl PlanningExpungeStepReport {
+    pub fn new() -> Self {
+        Self { orphan_disks: BTreeMap::new() }
+    }
+}
+
+impl fmt::Display for PlanningExpungeStepReport {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let Self { orphan_disks } = self;
+        if !orphan_disks.is_empty() {
+            writeln!(
+                f,
+                "* planning input contained expunged disks \
+                 not present in parent blueprint:",
+            )?;
+            for (sled, disk) in orphan_disks.iter() {
+                writeln!(f, "  * sled {sled}, disk {disk}",)?;
+            }
+        }
+        Ok(())
+    }
+}
+
+#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)]
+pub struct PlanningDecommissionStepReport {
+    /// Decommissioned sleds that unexpectedly appeared as commissioned.
+    pub zombie_sleds: Vec<SledUuid>,
+}
+
+impl PlanningDecommissionStepReport {
+    pub fn new() -> Self {
+        Self { zombie_sleds: Vec::new() }
+    }
+}
+
+impl fmt::Display for PlanningDecommissionStepReport {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let Self { zombie_sleds } = self;
+        if !zombie_sleds.is_empty() {
+            let n = zombie_sleds.len();
+            let s = if n == 1 { "" } else { "s" };
+            writeln!(
+                f,
+                "* decommissioned sled{s} returned by `SledFilter::Commissioned`: {}",
+                zombie_sleds
+                    .iter()
+                    .map(|sled_id| format!("{sled_id}"))
+                    .collect::<Vec<String>>()
+                    .join(", ")
+            )?;
+        }
+        Ok(())
+    }
+}
+
+#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)]
+pub struct PlanningNoopImageSourceStepReport {
+    pub no_target_release: bool,
+    pub skipped_sleds:
+        BTreeMap<SledUuid, PlanningNoopImageSourceSkipSledReason>,
+    pub skipped_zones:
+        BTreeMap<OmicronZoneUuid, PlanningNoopImageSourceSkipZoneReason>,
+    pub converted_zones: BTreeMap<SledUuid, (usize, usize)>,
+}
+
+impl PlanningNoopImageSourceStepReport {
+    pub fn new() -> Self {
+        Self {
+            no_target_release: false,
+            skipped_sleds: BTreeMap::new(),
+            skipped_zones: BTreeMap::new(),
+            converted_zones: BTreeMap::new(),
+        }
+    }
+
+    pub fn skip_sled(
+        &mut self,
+        sled_id: SledUuid,
+        reason: PlanningNoopImageSourceSkipSledReason,
+    ) {
+        self.skipped_sleds.insert(sled_id, reason);
+    }
+
+    pub fn skip_zone(
+        &mut self,
+        zone_id: OmicronZoneUuid,
+        reason: PlanningNoopImageSourceSkipZoneReason,
+    ) {
+        self.skipped_zones.insert(zone_id, reason);
+    }
+
+    pub fn converted_zones(
+        &mut self,
+        sled_id: SledUuid,
+        num_eligible: usize,
+        num_dataset: usize,
+    ) {
+        self.converted_zones.insert(sled_id, (num_eligible, num_dataset));
+    }
+}
+
+impl fmt::Display for PlanningNoopImageSourceStepReport {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let Self {
+            no_target_release,
+            skipped_sleds,
+            skipped_zones: _,
+            converted_zones,
+        } = self;
+
+        if *no_target_release {
+            return writeln!(
+                f,
+                "* Skipping noop image source check for all sleds (no current TUF repo)",
+            );
+        }
+
+        for (sled_id, reason) in skipped_sleds.iter() {
+            writeln!(
+                f,
+                "* Skipping noop image source check on sled {sled_id}: {reason}"
+            )?;
+        }
+
+        // Very noisy in tests.
+ // for (zone_id, reason) in skipped_zones.iter() { + // writeln!( + // f, + // "* Skipping noop image source check for zone {zone_id}: {reason}" + // )?; + // } + + for (sled_id, (m, n)) in converted_zones.iter() { + if *m > 0 && *n > 0 { + writeln!( + f, + "* Noop converting {m}/{n} install-dataset zones to artifact store \ + on sled {sled_id}", + )?; + } + } + + Ok(()) + } +} + +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] +pub enum PlanningNoopImageSourceSkipSledReason { + AllZonesAlreadyArtifact(usize), + SledNotInInventory, + ErrorRetrievingZoneManifest(String), + RemoveMupdateOverride(MupdateOverrideUuid), +} + +impl fmt::Display for PlanningNoopImageSourceSkipSledReason { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::AllZonesAlreadyArtifact(n) => { + write!(f, "all {n} zones are already from artifacts") + } + Self::SledNotInInventory => { + write!(f, "sled not present in latest inventory collection") + } + Self::ErrorRetrievingZoneManifest(error) => { + write!( + f, + "sled-agent encountered error retrieving zone manifest \ + (this is abnormal): {error}" + ) + } + Self::RemoveMupdateOverride(id) => { + write!( + f, + "blueprint has get_remove_mupdate_override set for sled: {id}", + ) + } + } + } +} + +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] +pub enum PlanningNoopImageSourceSkipZoneReason { + ZoneNotInManifest { + zone_kind: String, + file_name: String, + }, + InvalidArtifact { + zone_kind: String, + file_name: String, + error: String, + }, + ArtifactNotInRepo { + artifact_hash: ArtifactHash, + zone_kind: String, + file_name: String, + }, +} + +impl fmt::Display for PlanningNoopImageSourceSkipZoneReason { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::ZoneNotInManifest { file_name, .. } => { + write!(f, "artifact not found in zone manifest: {file_name}") + } + Self::InvalidArtifact { error, .. } => { + write!( + f, + "zone manifest inventory indicated install dataset artifact \ + is invalid, not using artifact (this is abnormal): {error}" + ) + } + Self::ArtifactNotInRepo { .. } => { + write!(f, "install dataset artifact hash not found in TUF repo") + } + } + } +} + +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] +pub struct PlanningMgsUpdatesStepReport { + pub pending_mgs_updates: PendingMgsUpdates, +} + +impl PlanningMgsUpdatesStepReport { + pub fn new(pending_mgs_updates: PendingMgsUpdates) -> Self { + Self { pending_mgs_updates } + } + + // TODO This is not quite right. See oxidecomputer/omicron#8285. 
+ pub fn any_updates_pending(&self) -> bool { + !self.pending_mgs_updates.is_empty() + } +} + +impl fmt::Display for PlanningMgsUpdatesStepReport { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let Self { pending_mgs_updates } = self; + if !pending_mgs_updates.is_empty() { + let n = pending_mgs_updates.len(); + let s = if n == 1 { "" } else { "s" }; + writeln!(f, "* {n} pending MGS update{s}:")?; + for update in pending_mgs_updates.iter() { + writeln!( + f, + " * {}: {:?}", + update.baseboard_id, update.details + )?; + } + } + Ok(()) + } +} + +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] +pub struct PlanningAddStepReport { + pub sleds_with_no_zpools_for_ntp_zone: BTreeSet, + pub sleds_waiting_for_ntp_zone: BTreeSet, + pub sleds_getting_ntp_and_discretionary_zones: BTreeSet, + pub sleds_missing_ntp_zone: BTreeSet, + pub sleds_missing_crucible_zone: BTreeSet<(SledUuid, ZpoolUuid)>, + + /// Discretionary zone kind → (placed, wanted to place) + pub out_of_eligible_sleds: BTreeMap, + + /// Discretionary zone kind → (wanted to place, num existing) + pub sufficient_zones_exist: BTreeMap, + + /// List of (Sled ID, kind of discretionary zone placed there) pairs. + // TODO: make `sled_add_zone_*` methods return the added zone config + // so that we can report it here. + pub discretionary_zones_placed: Vec<(SledUuid, String)>, +} + +impl PlanningAddStepReport { + pub fn new() -> Self { + Self { + sleds_with_no_zpools_for_ntp_zone: BTreeSet::new(), + sleds_waiting_for_ntp_zone: BTreeSet::new(), + sleds_getting_ntp_and_discretionary_zones: BTreeSet::new(), + sleds_missing_ntp_zone: BTreeSet::new(), + sleds_missing_crucible_zone: BTreeSet::new(), + out_of_eligible_sleds: BTreeMap::new(), + sufficient_zones_exist: BTreeMap::new(), + discretionary_zones_placed: Vec::new(), + } + } + + pub fn any_discretionary_zones_placed(&self) -> bool { + !self.discretionary_zones_placed.is_empty() + } +} + +impl fmt::Display for PlanningAddStepReport { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let Self { + sleds_with_no_zpools_for_ntp_zone, + sleds_waiting_for_ntp_zone, + sleds_getting_ntp_and_discretionary_zones, + sleds_missing_ntp_zone, + sleds_missing_crucible_zone, + out_of_eligible_sleds, + sufficient_zones_exist: _, + discretionary_zones_placed, + } = self; + + if !sleds_with_no_zpools_for_ntp_zone.is_empty() { + writeln!( + f, + "* No zpools in service for NTP zones on sleds: {}", + sleds_with_no_zpools_for_ntp_zone + .iter() + .map(|sled_id| format!("{sled_id}")) + .collect::>() + .join(", ") + )?; + } + + if !sleds_waiting_for_ntp_zone.is_empty() { + writeln!( + f, + "* Discretionary zone placement waiting for NTP zones on sleds: {}", + sleds_waiting_for_ntp_zone + .iter() + .map(|sled_id| format!("{sled_id}")) + .collect::>() + .join(", ") + )?; + } + + if !sleds_getting_ntp_and_discretionary_zones.is_empty() { + writeln!( + f, + "* Sleds getting NTP zones and which have other services already, \ + making them eligible for discretionary zones: {}", + sleds_getting_ntp_and_discretionary_zones + .iter() + .map(|sled_id| format!("{sled_id}")) + .collect::>() + .join(", ") + )?; + } + + for sled_id in sleds_missing_ntp_zone { + writeln!(f, "* Missing NTP zone on sled {sled_id}",)?; + } + + for (sled_id, zpool_id) in sleds_missing_crucible_zone { + writeln!( + f, + "* Missing Crucible zone for sled {sled_id}, zpool {zpool_id}", + )?; + } + + for (kind, (placed, desired)) in out_of_eligible_sleds.iter() { + writeln!( + f, + "* Only placed {placed}/{desired} desired 
{kind} zones"
+            )?;
+        }
+
+        // Noisy in tests.
+        // for (kind, (desired, existing)) in sufficient_zones_exist.iter() {
+        //     writeln!(
+        //         f,
+        //         "* Sufficient {kind} zones exist in plan: {desired}/{existing}"
+        //     )?;
+        // }
+
+        if !discretionary_zones_placed.is_empty() {
+            writeln!(f, "* Discretionary zones placed:")?;
+            for (sled_id, kind) in discretionary_zones_placed.iter() {
+                writeln!(f, "  * a {kind} zone on sled {sled_id}")?;
+            }
+        }
+
+        Ok(())
+    }
+}
+
+#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)]
+pub struct PlanningZoneUpdatesStepReport {
+    /// What are we waiting on to start zone updates?
+    pub waiting_on: Option<ZoneUpdatesWaitingOn>,
+
+    /// (Sled ID, zone, desired image)
+    pub out_of_date_zones:
+        Vec<(SledUuid, BlueprintZoneConfig, BlueprintZoneImageSource)>,
+
+    pub expunged_zones: Vec<(SledUuid, BlueprintZoneConfig)>,
+    pub updated_zones: Vec<(SledUuid, BlueprintZoneConfig)>,
+    pub unsafe_zones: Vec<(BlueprintZoneConfig, ZoneUnsafeToShutdown)>,
+}
+
+impl PlanningZoneUpdatesStepReport {
+    pub fn new() -> Self {
+        Self {
+            waiting_on: None,
+            out_of_date_zones: Vec::new(),
+            expunged_zones: Vec::new(),
+            updated_zones: Vec::new(),
+            unsafe_zones: Vec::new(),
+        }
+    }
+
+    pub fn waiting_on(&mut self, waiting_on: ZoneUpdatesWaitingOn) {
+        self.waiting_on = Some(waiting_on);
+    }
+
+    pub fn unsafe_zone(
+        &mut self,
+        zone: &BlueprintZoneConfig,
+        reason: ZoneUnsafeToShutdown,
+    ) {
+        self.unsafe_zones.push((zone.clone(), reason))
+    }
+}
+
+impl fmt::Display for PlanningZoneUpdatesStepReport {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let Self {
+            waiting_on,
+            out_of_date_zones,
+            expunged_zones,
+            updated_zones,
+            unsafe_zones,
+        } = self;
+
+        if let Some(waiting_on) = waiting_on {
+            writeln!(f, "* Zone updates waiting on {}", waiting_on.as_str())?;
+        }
+
+        if !expunged_zones.is_empty() {
+            let n = expunged_zones.len();
+            let s = if n == 1 { "" } else { "s" };
+            writeln!(f, "* Out-of-date zone{s} expunged:")?;
+            for (sled_id, zone) in expunged_zones.iter() {
+                writeln!(
+                    f,
+                    "  * sled {}, zone {} ({})",
+                    sled_id,
+                    zone.id,
+                    zone.zone_type.kind().report_str(),
+                )?;
+            }
+        }
+
+        if !updated_zones.is_empty() {
+            let n = updated_zones.len();
+            let s = if n == 1 { "" } else { "s" };
+            writeln!(f, "* Out-of-date zone{s} updated in-place:")?;
+            for (sled_id, zone) in updated_zones.iter() {
+                writeln!(
+                    f,
+                    "  * sled {}, zone {} ({})",
+                    sled_id,
+                    zone.id,
+                    zone.zone_type.kind().report_str(),
+                )?;
+            }
+        }
+
+        if !out_of_date_zones.is_empty() {
+            let n = out_of_date_zones.len();
+            let s = if n == 1 { "" } else { "s" };
+            writeln!(f, "* {n} out-of-date zone{s}:")?;
+            for (sled, zone, _image_source) in out_of_date_zones.iter() {
+                writeln!(
+                    f,
+                    "  * sled {}, zone {} ({})", // TODO: current → desired image source
+                    sled,
+                    zone.id,
+                    zone.zone_type.kind().report_str(),
+                )?;
+            }
+        }
+
+        if !unsafe_zones.is_empty() {
+            let n = unsafe_zones.len();
+            let s = if n == 1 { "" } else { "s" };
+            writeln!(f, "* {n} zone{s} not ready to shut down safely:")?;
+            for (zone, reason) in unsafe_zones.iter() {
+                writeln!(
+                    f,
+                    "  * zone {} ({}): {}",
+                    zone.id,
+                    zone.zone_type.kind().report_str(),
+                    reason,
+                )?;
+            }
+        }
+
+        Ok(())
+    }
+}
+
+#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)]
+pub enum ZoneUpdatesWaitingOn {
+    /// Waiting on discretionary zone placement.
+    DiscretionaryZones,
+
+    /// Waiting on updates to RoT / SP / Host OS / etc.
+ PendingMgsUpdates, +} + +impl ZoneUpdatesWaitingOn { + pub fn as_str(&self) -> &'static str { + match self { + Self::DiscretionaryZones => "discretionary zones", + Self::PendingMgsUpdates => { + "pending MGS updates (RoT / SP / Host OS / etc.)" + } + } + } +} + +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] +pub enum ZoneUnsafeToShutdown { + Cockroachdb(CockroachdbUnsafeToShutdown), +} + +impl fmt::Display for ZoneUnsafeToShutdown { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Cockroachdb(reason) => write!(f, "{reason}"), + } + } +} + +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] +pub enum CockroachdbUnsafeToShutdown { + MissingLiveNodesStat, + MissingUnderreplicatedStat, + NotEnoughLiveNodes(u64), + NotEnoughNodes, + UnderreplicatedRanges(u64), +} + +impl fmt::Display for CockroachdbUnsafeToShutdown { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::MissingLiveNodesStat => write!(f, "missing live_nodes stat"), + Self::MissingUnderreplicatedStat => { + write!(f, "missing ranges_underreplicated stat") + } + Self::NotEnoughLiveNodes(n) => { + write!( + f, + "not enough live nodes: {n} < {COCKROACHDB_REDUNDANCY}" + ) + } + Self::NotEnoughNodes => write!(f, "not enough nodes"), + Self::UnderreplicatedRanges(n) => { + if *n > 0 { + write!(f, "{n} > 0 underreplicated ranges") + } else { + write!( + f, + "no underreplicated ranges (this shouldn't happen)" + ) + } + } + } + } +} + +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] +pub struct PlanningCockroachdbSettingsStepReport { + pub preserve_downgrade: CockroachDbPreserveDowngrade, +} + +impl PlanningCockroachdbSettingsStepReport { + pub fn new() -> Self { + Self { preserve_downgrade: CockroachDbPreserveDowngrade::DoNotModify } + } +} + +impl fmt::Display for PlanningCockroachdbSettingsStepReport { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let PlanningCockroachdbSettingsStepReport { preserve_downgrade } = self; + if !matches!( + preserve_downgrade, + CockroachDbPreserveDowngrade::DoNotModify, + ) { + writeln!( + f, + "* Will ensure cockroachdb setting: {preserve_downgrade}" + )?; + } + Ok(()) + } +} diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs index ca97f5f892..f6a4d38530 100644 --- a/nexus/types/src/internal_api/background.rs +++ b/nexus/types/src/internal_api/background.rs @@ -2,6 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. +use crate::deployment::PlanningReport; use crate::external_api::views; use chrono::DateTime; use chrono::Utc; @@ -460,6 +461,7 @@ impl slog::KV for DebugDatasetsRendezvousStats { } /// The status of a `blueprint_planner` background task activation. +#[allow(clippy::large_enum_variant)] #[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] pub enum BlueprintPlannerStatus { /// Automatic blueprint planning has been explicitly disabled @@ -479,7 +481,11 @@ pub enum BlueprintPlannerStatus { /// Planing succeeded, and we saved and made the new blueprint the /// current target. - Targeted { parent_blueprint_id: BlueprintUuid, blueprint_id: BlueprintUuid }, + Targeted { + parent_blueprint_id: BlueprintUuid, + blueprint_id: BlueprintUuid, + report: PlanningReport, + }, } /// The status of a `alert_dispatcher` background task activation.