55//! Background task for automatic update planning.
66
77use super :: reconfigurator_config:: ReconfiguratorConfigLoaderState ;
8+ use crate :: app:: BlueprintDebugAction ;
89use crate :: app:: background:: BackgroundTask ;
910use crate :: app:: background:: tasks:: blueprint_load:: LoadedTargetBlueprint ;
11+ use crate :: app:: blueprint_debug_filename;
12+ use anyhow:: Context ;
1013use chrono:: Utc ;
1114use futures:: future:: BoxFuture ;
1215use nexus_auth:: authz;
@@ -16,6 +19,7 @@ use nexus_db_queries::db::datastore::BlueprintLimitReachedOutput;
1619use nexus_reconfigurator_planning:: planner:: Planner ;
1720use nexus_reconfigurator_planning:: planner:: PlannerRng ;
1821use nexus_reconfigurator_preparation:: PlanningInputFromDb ;
22+ use nexus_reconfigurator_preparation:: reconfigurator_state_assemble;
1923use nexus_types:: deployment:: BlueprintSource ;
2024use nexus_types:: deployment:: BlueprintTarget ;
2125use nexus_types:: deployment:: PlanningReport ;
@@ -52,6 +56,10 @@ enum PlanError {
5256 #[ source]
5357 source : Error ,
5458 } ,
59+ #[ error( "failed to assemble debug state" ) ]
60+ AssembleDebugState ( #[ source] anyhow:: Error ) ,
61+ #[ error( "failed to save debug state to dropbox" ) ]
62+ SaveDebugState ( #[ from] omicron_debug_dropbox:: DepositError ) ,
5563}
5664
5765/// Background task that runs the update planner.
@@ -62,6 +70,7 @@ pub struct BlueprintPlanner {
6270 rx_blueprint : Receiver < Option < LoadedTargetBlueprint > > ,
6371 tx_planned : Sender < Option < BlueprintUuid > > ,
6472 blueprint_limit : u64 ,
73+ debug_dropbox : Arc < omicron_debug_dropbox:: Producer > ,
6574}
6675
6776/// The default number of blueprints, beyond which the auto-planner will stop
@@ -86,6 +95,7 @@ impl BlueprintPlanner {
8695 rx_config : Receiver < ReconfiguratorConfigLoaderState > ,
8796 rx_inventory : Receiver < Option < Arc < Collection > > > ,
8897 rx_blueprint : Receiver < Option < LoadedTargetBlueprint > > ,
98+ debug_dropbox : Arc < omicron_debug_dropbox:: Producer > ,
8999 ) -> Self {
90100 let ( tx_planned, _) = watch:: channel ( None ) ;
91101 Self {
@@ -95,6 +105,7 @@ impl BlueprintPlanner {
95105 rx_blueprint,
96106 tx_planned,
97107 blueprint_limit : DEFAULT_BLUEPRINT_LIMIT ,
108+ debug_dropbox,
98109 }
99110 }
100111
@@ -138,7 +149,9 @@ impl BlueprintPlanner {
138149 PlanError :: AssemblePlanningInput ( _)
139150 | PlanError :: MakePlanner { .. }
140151 | PlanError :: Plan ( _)
141- | PlanError :: SaveBlueprint { .. } => {
152+ | PlanError :: AssembleDebugState ( _)
153+ | PlanError :: SaveBlueprint { .. }
154+ | PlanError :: SaveDebugState ( _) => {
142155 error ! (
143156 & opctx. log,
144157 "blueprint planning failed" ;
@@ -268,18 +281,54 @@ impl BlueprintPlanner {
268281 }
269282 }
270283
271- // We have a fresh blueprint; save it.
284+ // We have a fresh blueprint. We're going to proceed with trying to
285+ // make it the target.
272286 let blueprint_id = blueprint. id ;
273287 info ! (
274288 & opctx. log,
275289 "planning produced new blueprint" ;
276290 "parent_blueprint_id" => %parent_blueprint_id,
277291 "blueprint_id" => %blueprint_id,
278292 ) ;
293+
294+ // Assemble a Reconfigurator state file that we can archive for future
295+ // debugging. You could argue that this should be best-effort. But
296+ // this really shouldn't fail under normal operation. It should only
297+ // fail if the database is partially offline or something like that. In
298+ // that case, it's fairly likely this whole operation is going to fail
299+ // anyway. On the other hand, if we allowed this to be non-fatal, it
300+ // would be easy to not notice if some *bug* caused this to stop working
301+ // altogether, and then we'd silently lose valuable debugging
302+ // information from deployed systems. So we just treat this as fatal.
303+ let debug = reconfigurator_state_assemble (
304+ opctx,
305+ & self . datastore ,
306+ input,
307+ vec ! [ ( * collection) . clone( ) ] ,
308+ vec ! [ ( * parent) . clone( ) , blueprint. clone( ) ] ,
309+ target,
310+ )
311+ . await
312+ . and_then ( |s| {
313+ serde_json:: to_string ( & s)
314+ . context ( "serializing Reconfigurator state file" )
315+ } )
316+ . map_err ( PlanError :: AssembleDebugState ) ?;
317+
318+ // Insert the new blueprint into the database.
279319 self . datastore . blueprint_insert ( opctx, & blueprint) . await . map_err (
280320 |error| PlanError :: SaveBlueprint { blueprint_id, source : error } ,
281321 ) ?;
282322
323+ // Archive the Reconfigurator state file. As above, we require that
324+ // this succeed.
325+ let debug_name = blueprint_debug_filename (
326+ & blueprint,
327+ BlueprintDebugAction :: Autoplan ,
328+ ) ;
329+ let deposit =
330+ self . debug_dropbox . deposit_file_str ( & debug_name, & debug) . await ?;
331+
283332 // Try to make it the current target.
284333 let target = BlueprintTarget {
285334 target_id : blueprint_id,
@@ -316,6 +365,11 @@ impl BlueprintPlanner {
316365 ) ;
317366 }
318367 }
368+
369+ // Try to cancel the dropbox deposit. This information is
370+ // useless now. It's not a problem if this doesn't work.
371+ deposit. cancel_and_attempt_delete ( ) . await ;
372+
319373 return Ok ( BlueprintPlannerStatus :: Planned {
320374 parent_blueprint_id,
321375 error : format ! ( "{error}" ) ,
@@ -448,6 +502,7 @@ mod test {
448502 use nexus_types:: deployment:: {
449503 PendingMgsUpdates , ReconfiguratorConfig , ReconfiguratorConfigView ,
450504 } ;
505+ use omicron_debug_dropbox:: DebugDropbox ;
451506 use omicron_test_utils:: dev;
452507 use omicron_uuid_kinds:: OmicronZoneUuid ;
453508 use std:: collections:: BTreeMap ;
@@ -460,10 +515,8 @@ mod test {
460515 // Set up the test context.
461516 let nexus = & cptestctx. server . server_context ( ) . nexus ;
462517 let datastore = nexus. datastore ( ) ;
463- let opctx = OpContext :: for_tests (
464- cptestctx. logctx . log . clone ( ) ,
465- datastore. clone ( ) ,
466- ) ;
518+ let log = & cptestctx. logctx . log ;
519+ let opctx = OpContext :: for_tests ( log. clone ( ) , datastore. clone ( ) ) ;
467520
468521 // Spin up the blueprint loader background task.
469522 let ( tx_loader, _) = watch:: channel ( None ) ;
@@ -479,7 +532,7 @@ mod test {
479532
480533 // Spin up the inventory collector background task.
481534 let resolver = internal_dns_resolver:: Resolver :: new_from_addrs (
482- cptestctx . logctx . log . clone ( ) ,
535+ log. clone ( ) ,
483536 & [ cptestctx. internal_dns . dns_server . local_address ( ) ] ,
484537 )
485538 . expect ( "can't start resolver" ) ;
@@ -510,13 +563,20 @@ mod test {
510563 time_modified : now_db_precision ( ) ,
511564 } ) ,
512565 ) ;
566+ let debug_dropbox = Arc :: new (
567+ DebugDropbox :: for_tests_noop ( log)
568+ . initialize_producer ( "test" )
569+ . await
570+ . unwrap ( ) ,
571+ ) ;
513572
514573 // Finally, spin up the planner background task.
515574 let mut planner = BlueprintPlanner :: new (
516575 datastore. clone ( ) ,
517576 rx_config_loader,
518577 rx_inventory,
519578 rx_loader. clone ( ) ,
579+ debug_dropbox,
520580 ) ;
521581
522582 // On activation, the planner should run successfully and generate
@@ -686,12 +746,19 @@ mod test {
686746 // check_blueprint_limit_reached.
687747 let ( _tx_inventory, rx_inventory) = watch:: channel ( None ) ;
688748 let ( _tx_blueprint, rx_blueprint) = watch:: channel ( None ) ;
749+ let debug_dropbox = Arc :: new (
750+ DebugDropbox :: for_tests_noop ( & logctx. log )
751+ . initialize_producer ( "test" )
752+ . await
753+ . unwrap ( ) ,
754+ ) ;
689755
690756 let mut planner = BlueprintPlanner :: new (
691757 datastore. clone ( ) ,
692758 rx_config_loader,
693759 rx_inventory,
694760 rx_blueprint,
761+ debug_dropbox,
695762 ) ;
696763
697764 // This limit matches the loop above.
0 commit comments