Skip to content

Commit aefe883

Browse files
committed
nexus: use debug dropbox to save Reconfigurator state
1 parent aa13f77 commit aefe883

16 files changed

Lines changed: 452 additions & 106 deletions

File tree

Cargo.lock

Lines changed: 89 additions & 65 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -645,6 +645,8 @@ omicron-certificates = { path = "certificates" }
645645
omicron-cockroach-admin = { path = "cockroach-admin" }
646646
omicron-cockroach-metrics = { path = "cockroach-metrics" }
647647
omicron-common = { path = "common" }
648+
# XXX-dap publish
649+
omicron-debug-dropbox = { git = "https://github.com/oxidecomputer/omicron-debug-dropbox", rev = "13e5461bc3a725789ae6463d2b3b7d64528ffdde" }
648650
omicron-dev-lib = { path = "dev-tools/omicron-dev-lib" }
649651
omicron-ledger = { path = "ledger" }
650652
omicron-gateway = { path = "gateway" }

nexus/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ nexus-test-interface.workspace = true
7777
ntp-admin-client.workspace = true
7878
num-integer.workspace = true
7979
omicron-cockroach-metrics.workspace = true
80+
omicron-debug-dropbox.workspace = true
8081
openssl.workspace = true
8182
oximeter-client.workspace = true
8283
oximeter-db = { workspace = true, default-features = false, features = [

nexus/reconfigurator/preparation/src/lib.rs

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ use nexus_db_queries::db::datastore::SQL_BATCH_SIZE;
1717
use nexus_db_queries::db::pagination::Paginator;
1818
use nexus_types::deployment::Blueprint;
1919
use nexus_types::deployment::BlueprintMetadata;
20+
use nexus_types::deployment::BlueprintTarget;
2021
use nexus_types::deployment::ClickhousePolicy;
2122
use nexus_types::deployment::CockroachDbClusterVersion;
2223
use nexus_types::deployment::CockroachDbSettings;
@@ -480,7 +481,8 @@ async fn fetch_all_service_ip_pool_ranges(
480481
Ok(ranges)
481482
}
482483

483-
/// Loads state for debugging or import into `reconfigurator-cli`
484+
/// Loads Reconfigurator-related state from a live system for debugging or
485+
/// import into `reconfigurator-cli`
484486
///
485487
/// This is used in omdb, tests, and in Nexus to collect support bundles
486488
pub async fn reconfigurator_state_load(
@@ -575,6 +577,35 @@ pub async fn reconfigurator_state_load(
575577
.collect::<Vec<Blueprint>>()
576578
.await;
577579

580+
// Delegate the rest of the assembly to `reconfigurator_state_assemble`.
581+
reconfigurator_state_assemble(
582+
opctx,
583+
datastore,
584+
planning_input,
585+
collections,
586+
blueprints,
587+
target_blueprint,
588+
)
589+
.await
590+
}
591+
592+
/// Assembles a Reconfigurator state file with caller-provided planning input,
593+
/// inventory collections, blueprints, and target blueprint
594+
///
595+
/// These parts of the returned state file will be exactly as the caller
596+
/// provided them. The other state that goes into the file will be loaded from
597+
/// the live system.
598+
///
599+
/// This is used to package up all the information that went into a specific
600+
/// planner run for future debugging.
601+
pub async fn reconfigurator_state_assemble(
602+
opctx: &OpContext,
603+
datastore: &DataStore,
604+
planning_input: PlanningInput,
605+
collections: Vec<Collection>,
606+
blueprints: Vec<Blueprint>,
607+
target_blueprint: BlueprintTarget,
608+
) -> Result<UnstableReconfiguratorState, anyhow::Error> {
578609
// It's also useful to include information about any DNS generations
579610
// mentioned in any blueprints.
580611
let blueprints_list = &blueprints;

nexus/src/app/background/init.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -585,6 +585,7 @@ impl BackgroundTasksInitializer {
585585
reconfigurator_config_watcher.clone(),
586586
inventory_load_watcher.clone(),
587587
rx_blueprint.clone(),
588+
args.debug_dropbox.clone(),
588589
);
589590
let rx_planner = blueprint_planner.watcher();
590591
driver.register(TaskDefinition {
@@ -1322,6 +1323,7 @@ pub struct BackgroundTasksData {
13221323
/// Console session absolute timeout, from
13231324
/// `pkg.console.session_absolute_timeout_minutes`.
13241325
pub console_session_absolute_timeout: chrono::TimeDelta,
1326+
pub debug_dropbox: Arc<omicron_debug_dropbox::Producer>,
13251327
}
13261328

13271329
/// Starts the three DNS-propagation-related background tasks for either

nexus/src/app/background/tasks/blueprint_planner.rs

Lines changed: 74 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,11 @@
55
//! Background task for automatic update planning.
66
77
use super::reconfigurator_config::ReconfiguratorConfigLoaderState;
8+
use crate::app::BlueprintDebugAction;
89
use crate::app::background::BackgroundTask;
910
use crate::app::background::tasks::blueprint_load::LoadedTargetBlueprint;
11+
use crate::app::blueprint_debug_filename;
12+
use anyhow::Context;
1013
use chrono::Utc;
1114
use futures::future::BoxFuture;
1215
use nexus_auth::authz;
@@ -16,6 +19,7 @@ use nexus_db_queries::db::datastore::BlueprintLimitReachedOutput;
1619
use nexus_reconfigurator_planning::planner::Planner;
1720
use nexus_reconfigurator_planning::planner::PlannerRng;
1821
use nexus_reconfigurator_preparation::PlanningInputFromDb;
22+
use nexus_reconfigurator_preparation::reconfigurator_state_assemble;
1923
use nexus_types::deployment::BlueprintSource;
2024
use nexus_types::deployment::BlueprintTarget;
2125
use nexus_types::deployment::PlanningReport;
@@ -52,6 +56,10 @@ enum PlanError {
5256
#[source]
5357
source: Error,
5458
},
59+
#[error("failed to assemble debug state")]
60+
AssembleDebugState(#[source] anyhow::Error),
61+
#[error("failed to save debug state to dropbox")]
62+
SaveDebugState(#[from] omicron_debug_dropbox::DepositError),
5563
}
5664

5765
/// Background task that runs the update planner.
@@ -62,6 +70,7 @@ pub struct BlueprintPlanner {
6270
rx_blueprint: Receiver<Option<LoadedTargetBlueprint>>,
6371
tx_planned: Sender<Option<BlueprintUuid>>,
6472
blueprint_limit: u64,
73+
debug_dropbox: Arc<omicron_debug_dropbox::Producer>,
6574
}
6675

6776
/// The default number of blueprints, beyond which the auto-planner will stop
@@ -86,6 +95,7 @@ impl BlueprintPlanner {
8695
rx_config: Receiver<ReconfiguratorConfigLoaderState>,
8796
rx_inventory: Receiver<Option<Arc<Collection>>>,
8897
rx_blueprint: Receiver<Option<LoadedTargetBlueprint>>,
98+
debug_dropbox: Arc<omicron_debug_dropbox::Producer>,
8999
) -> Self {
90100
let (tx_planned, _) = watch::channel(None);
91101
Self {
@@ -95,6 +105,7 @@ impl BlueprintPlanner {
95105
rx_blueprint,
96106
tx_planned,
97107
blueprint_limit: DEFAULT_BLUEPRINT_LIMIT,
108+
debug_dropbox,
98109
}
99110
}
100111

@@ -138,7 +149,9 @@ impl BlueprintPlanner {
138149
PlanError::AssemblePlanningInput(_)
139150
| PlanError::MakePlanner { .. }
140151
| PlanError::Plan(_)
141-
| PlanError::SaveBlueprint { .. } => {
152+
| PlanError::AssembleDebugState(_)
153+
| PlanError::SaveBlueprint { .. }
154+
| PlanError::SaveDebugState(_) => {
142155
error!(
143156
&opctx.log,
144157
"blueprint planning failed";
@@ -268,18 +281,54 @@ impl BlueprintPlanner {
268281
}
269282
}
270283

271-
// We have a fresh blueprint; save it.
284+
// We have a fresh blueprint. We're going to proceed with trying to
285+
// make it the target.
272286
let blueprint_id = blueprint.id;
273287
info!(
274288
&opctx.log,
275289
"planning produced new blueprint";
276290
"parent_blueprint_id" => %parent_blueprint_id,
277291
"blueprint_id" => %blueprint_id,
278292
);
293+
294+
// Assemble a Reconfigurator state file that we can archive for future
295+
// debugging. You could argue that this should be best-effort. But
296+
// this really shouldn't fail under normal operation. It should only
297+
// fail if the database is partially offline or something like that. In
298+
// that case, it's fairly likely this whole operation is going to fail
299+
// anyway. On the other hand, if we allowed this to be non-fatal, it
300+
// would be easy to not notice if some *bug* caused this to stop working
301+
// altogether, and then we'd silently lose valuable debugging
302+
// information from deployed systems. So we just treat this as fatal.
303+
let debug = reconfigurator_state_assemble(
304+
opctx,
305+
&self.datastore,
306+
input,
307+
vec![(*collection).clone()],
308+
vec![(*parent).clone(), blueprint.clone()],
309+
target,
310+
)
311+
.await
312+
.and_then(|s| {
313+
serde_json::to_string(&s)
314+
.context("serializing Reconfigurator state file")
315+
})
316+
.map_err(PlanError::AssembleDebugState)?;
317+
318+
// Insert the new blueprint into the database.
279319
self.datastore.blueprint_insert(opctx, &blueprint).await.map_err(
280320
|error| PlanError::SaveBlueprint { blueprint_id, source: error },
281321
)?;
282322

323+
// Archive the Reconfigurator state file. As above, we require that
324+
// this succeed.
325+
let debug_name = blueprint_debug_filename(
326+
&blueprint,
327+
BlueprintDebugAction::Autoplan,
328+
);
329+
let deposit =
330+
self.debug_dropbox.deposit_file_str(&debug_name, &debug).await?;
331+
283332
// Try to make it the current target.
284333
let target = BlueprintTarget {
285334
target_id: blueprint_id,
@@ -316,6 +365,11 @@ impl BlueprintPlanner {
316365
);
317366
}
318367
}
368+
369+
// Try to cancel the dropbox deposit. This information is
370+
// useless now. It's not a problem if this doesn't work.
371+
deposit.cancel_and_attempt_delete().await;
372+
319373
return Ok(BlueprintPlannerStatus::Planned {
320374
parent_blueprint_id,
321375
error: format!("{error}"),
@@ -448,6 +502,7 @@ mod test {
448502
use nexus_types::deployment::{
449503
PendingMgsUpdates, ReconfiguratorConfig, ReconfiguratorConfigView,
450504
};
505+
use omicron_debug_dropbox::DebugDropbox;
451506
use omicron_test_utils::dev;
452507
use omicron_uuid_kinds::OmicronZoneUuid;
453508
use std::collections::BTreeMap;
@@ -460,10 +515,8 @@ mod test {
460515
// Set up the test context.
461516
let nexus = &cptestctx.server.server_context().nexus;
462517
let datastore = nexus.datastore();
463-
let opctx = OpContext::for_tests(
464-
cptestctx.logctx.log.clone(),
465-
datastore.clone(),
466-
);
518+
let log = &cptestctx.logctx.log;
519+
let opctx = OpContext::for_tests(log.clone(), datastore.clone());
467520

468521
// Spin up the blueprint loader background task.
469522
let (tx_loader, _) = watch::channel(None);
@@ -479,7 +532,7 @@ mod test {
479532

480533
// Spin up the inventory collector background task.
481534
let resolver = internal_dns_resolver::Resolver::new_from_addrs(
482-
cptestctx.logctx.log.clone(),
535+
log.clone(),
483536
&[cptestctx.internal_dns.dns_server.local_address()],
484537
)
485538
.expect("can't start resolver");
@@ -510,13 +563,20 @@ mod test {
510563
time_modified: now_db_precision(),
511564
}),
512565
);
566+
let debug_dropbox = Arc::new(
567+
DebugDropbox::for_tests_noop(log)
568+
.initialize_producer("test")
569+
.await
570+
.unwrap(),
571+
);
513572

514573
// Finally, spin up the planner background task.
515574
let mut planner = BlueprintPlanner::new(
516575
datastore.clone(),
517576
rx_config_loader,
518577
rx_inventory,
519578
rx_loader.clone(),
579+
debug_dropbox,
520580
);
521581

522582
// On activation, the planner should run successfully and generate
@@ -686,12 +746,19 @@ mod test {
686746
// check_blueprint_limit_reached.
687747
let (_tx_inventory, rx_inventory) = watch::channel(None);
688748
let (_tx_blueprint, rx_blueprint) = watch::channel(None);
749+
let debug_dropbox = Arc::new(
750+
DebugDropbox::for_tests_noop(&logctx.log)
751+
.initialize_producer("test")
752+
.await
753+
.unwrap(),
754+
);
689755

690756
let mut planner = BlueprintPlanner::new(
691757
datastore.clone(),
692758
rx_config_loader,
693759
rx_inventory,
694760
rx_blueprint,
761+
debug_dropbox,
695762
);
696763

697764
// This limit matches the loop above.

0 commit comments

Comments
 (0)