{"environment":"unified_incident_env","baselines":[{"scenario_id":"worker_deploy_cascade","name":"deterministic-remediation-baseline","description":"A bad worker deploy causes sustained database overload and login 502s at the gateway. The agent must diagnose from evidence, choose a safe remediation, verify recovery, and declare resolved only after checks pass.","optimal_ticks":10,"actions":[{"action":{"metadata":{},"action_type":"query_deploys","service":"worker","metric":null,"check_name":null,"hypothesis":null},"rationale":"Check whether any recent deploy aligns with the incident start."},{"action":{"metadata":{},"action_type":"query_logs","service":"worker","metric":null,"check_name":null,"hypothesis":null},"rationale":"Inspect worker logs because deploy timing and queue pressure suggest worker-originated harm."},{"action":{"metadata":{},"action_type":"query_metrics","service":"database","metric":"cpu","check_name":null,"hypothesis":null},"rationale":"Confirm that the database is overloaded as a downstream effect."},{"action":{"metadata":{},"action_type":"query_dependencies","service":"api-gateway","metric":null,"check_name":null,"hypothesis":null},"rationale":"Verify the gateway depends on the worker and database path."},{"action":{"metadata":{},"action_type":"submit_hypothesis","service":null,"metric":null,"check_name":null,"hypothesis":{"root_cause":"bad_worker_deploy","affected_services":["worker","database","api-gateway"],"confidence":0.82,"recommended_next_action":"rollback_deploy"}},"rationale":"Commit a calibrated hypothesis before taking an invasive mitigation step."},{"action":{"metadata":{},"action_type":"rollback_deploy","service":"worker","metric":null,"check_name":null,"hypothesis":null},"rationale":"Remove the triggering change before restarting downstream services."},{"action":{"metadata":{},"action_type":"restart_service","service":"database","metric":null,"check_name":null,"hypothesis":null},"rationale":"Bring the database back cleanly after the root cause is removed."},{"action":{"metadata":{},"action_type":"run_check","service":null,"metric":null,"check_name":"database_recovery","hypothesis":null},"rationale":"Verify the database is no longer crashing."},{"action":{"metadata":{},"action_type":"run_check","service":null,"metric":null,"check_name":"end_to_end","hypothesis":null},"rationale":"Verify gateway traffic succeeds end-to-end."},{"action":{"metadata":{},"action_type":"declare_resolved","service":null,"metric":null,"check_name":null,"hypothesis":null},"rationale":"Declare resolved only after objective checks pass."}]},{"scenario_id":"db_config_rollout","name":"deterministic-remediation-baseline","description":"A database config push cut connection pool size and write requests now time out. A separate worker deploy landed around the same time and looks suspicious but is not the cause. The agent must avoid the decoy, roll back the database config, restart it, and verify recovery.","optimal_ticks":10,"actions":[{"action":{"metadata":{},"action_type":"query_logs","service":"database","metric":null,"check_name":null,"hypothesis":null},"rationale":"Database is the loudest alert; inspect logs for the actual error signature."},{"action":{"metadata":{},"action_type":"query_deploys","service":"database","metric":null,"check_name":null,"hypothesis":null},"rationale":"Pool-acquire errors suggest a config change; check recent database rollouts."},{"action":{"metadata":{},"action_type":"query_metrics","service":"database","metric":"error_rate","check_name":null,"hypothesis":null},"rationale":"Confirm the error pattern is pool exhaustion rather than compute overload."},{"action":{"metadata":{},"action_type":"query_logs","service":"worker","metric":null,"check_name":null,"hypothesis":null},"rationale":"Rule out the decoy worker deploy by reading worker logs directly."},{"action":{"metadata":{},"action_type":"submit_hypothesis","service":null,"metric":null,"check_name":null,"hypothesis":{"root_cause":"database_only_failure","affected_services":["database","api-gateway","worker"],"confidence":0.8,"recommended_next_action":"rollback_deploy"}},"rationale":"Localize the fault to the database config before remediating."},{"action":{"metadata":{},"action_type":"rollback_deploy","service":"database","metric":null,"check_name":null,"hypothesis":null},"rationale":"Roll back the offending database config rollout."},{"action":{"metadata":{},"action_type":"restart_service","service":"database","metric":null,"check_name":null,"hypothesis":null},"rationale":"Restart the database cleanly against the restored pool config."},{"action":{"metadata":{},"action_type":"run_check","service":null,"metric":null,"check_name":"database_recovery","hypothesis":null},"rationale":"Verify database pool health and write latency are back within SLO."},{"action":{"metadata":{},"action_type":"run_check","service":null,"metric":null,"check_name":"end_to_end","hypothesis":null},"rationale":"Verify gateway write-path traffic succeeds end-to-end."},{"action":{"metadata":{},"action_type":"declare_resolved","service":null,"metric":null,"check_name":null,"hypothesis":null},"rationale":"Declare resolved only after objective checks pass."}]},{"scenario_id":"gateway_auth_rollout","name":"deterministic-remediation-baseline","description":"A new api-gateway auth-middleware rollout is rejecting ~40% of valid logins. A recent worker deploy and elevated worker queue depth make the worker look like a plausible suspect. The agent must localize to the gateway, roll back its deploy, and verify recovery without unnecessary restarts.","optimal_ticks":8,"actions":[{"action":{"metadata":{},"action_type":"query_logs","service":"api-gateway","metric":null,"check_name":null,"hypothesis":null},"rationale":"Gateway is rejecting logins; read gateway logs to localize the rejection class."},{"action":{"metadata":{},"action_type":"query_deploys","service":"api-gateway","metric":null,"check_name":null,"hypothesis":null},"rationale":"Login rejection aligns with a recent auth middleware rollout; confirm deploy timing."},{"action":{"metadata":{},"action_type":"query_deploys","service":"worker","metric":null,"check_name":null,"hypothesis":null},"rationale":"Rule out the worker deploy explicitly rather than assuming."},{"action":{"metadata":{},"action_type":"submit_hypothesis","service":null,"metric":null,"check_name":null,"hypothesis":{"root_cause":"api_gateway_fault","affected_services":["api-gateway","worker"],"confidence":0.85,"recommended_next_action":"rollback_deploy"}},"rationale":"Commit a calibrated hypothesis localizing to the gateway auth rollout."},{"action":{"metadata":{},"action_type":"rollback_deploy","service":"api-gateway","metric":null,"check_name":null,"hypothesis":null},"rationale":"Roll back the bad auth middleware rollout; no restart needed."},{"action":{"metadata":{},"action_type":"run_check","service":null,"metric":null,"check_name":"end_to_end","hypothesis":null},"rationale":"Verify that gateway login traffic now succeeds end-to-end."},{"action":{"metadata":{},"action_type":"run_check","service":null,"metric":null,"check_name":"database_recovery","hypothesis":null},"rationale":"Confirm the database is (and stayed) healthy throughout."},{"action":{"metadata":{},"action_type":"declare_resolved","service":null,"metric":null,"check_name":null,"hypothesis":null},"rationale":"Declare resolved only after objective checks pass."}]}]}