Skip to content

Commit

Permalink
Add task-dump endpoints to MGS
Browse files Browse the repository at this point in the history
This exposes the `faux-mgs dump` command, which shows the number of task
dumps present, and enables downloading a dehydrated dump.

Closes #7271
  • Loading branch information
wfchandler committed Dec 18, 2024
1 parent 7297d76 commit 8b13de1
Show file tree
Hide file tree
Showing 14 changed files with 466 additions and 9 deletions.
24 changes: 20 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -390,8 +390,8 @@ gateway-client = { path = "clients/gateway-client" }
# is "fine", because SP/MGS communication maintains forwards and backwards
# compatibility, but will mean that faux-mgs might be missing new
# functionality.)
gateway-messages = { git = "https://github.com/oxidecomputer/management-gateway-service", rev = "9bbac475dcaac88286c07a20b6bd3e94fc81d7f0", default-features = false, features = ["std"] }
gateway-sp-comms = { git = "https://github.com/oxidecomputer/management-gateway-service", rev = "9bbac475dcaac88286c07a20b6bd3e94fc81d7f0" }
gateway-messages = { git = "https://github.com/oxidecomputer/management-gateway-service", rev = "97301243f0707708ae9e629e2b4cdea5ae3fd078", default-features = false, features = ["std"] }
gateway-sp-comms = { git = "https://github.com/oxidecomputer/management-gateway-service", rev = "97301243f0707708ae9e629e2b4cdea5ae3fd078" }
gateway-test-utils = { path = "gateway-test-utils" }
gateway-types = { path = "gateway-types" }
gethostname = "0.5.0"
Expand Down
3 changes: 2 additions & 1 deletion dev-tools/omdb/src/bin/omdb/mgs/sensors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ pub(crate) struct Sensor {
impl Sensor {
fn units(&self) -> &str {
match self.kind {
MeasurementKind::Temperature => "°C",
MeasurementKind::Temperature | MeasurementKind::CpuTctl => "°C",
MeasurementKind::Current | MeasurementKind::InputCurrent => "A",
MeasurementKind::Voltage | MeasurementKind::InputVoltage => "V",
MeasurementKind::Speed => "RPM",
Expand Down Expand Up @@ -150,6 +150,7 @@ impl Sensor {
fn to_kind_string(&self) -> &str {
match self.kind {
MeasurementKind::Temperature => "temp",
MeasurementKind::CpuTctl => "tctl",
MeasurementKind::Power => "power",
MeasurementKind::Current => "current",
MeasurementKind::Voltage => "voltage",
Expand Down
31 changes: 31 additions & 0 deletions gateway-api/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ use gateway_types::{
ignition::{IgnitionCommand, SpIgnitionInfo},
rot::{RotCfpa, RotCfpaSlot, RotCmpa, RotState},
sensor::SpSensorReading,
task_dump::TaskDump,
update::{
HostPhase2Progress, HostPhase2RecoveryImageId, InstallinatorImageId,
SpUpdateStatus,
Expand Down Expand Up @@ -306,6 +307,26 @@ pub trait GatewayApi {
params: TypedBody<GetRotBootInfoParams>,
) -> Result<HttpResponseOk<RotState>, HttpError>;

/// Get the number of task dumps present on an SP
// The target SP is taken from the request path (`Path<PathSp>`), and the
// count is returned as a bare `u32` response body.
// NOTE(review): presumably the valid `task_dump_index` values for the
// sibling `.../task-dump/{task_dump_index}` endpoint are `0..count` --
// confirm against the SP's behavior.
#[endpoint {
method = GET,
path = "/sp/{type}/{slot}/task-dump",
}]
async fn sp_host_task_dump_count(
rqctx: RequestContext<Self::Context>,
path: Path<PathSp>,
) -> Result<HttpResponseOk<u32>, HttpError>;

/// Read a single task dump from an SP
// The final path segment, `{task_dump_index}`, selects which dump to read;
// it is deserialized together with the SP identifier into
// `PathSpTaskDumpIndex`. The dump is returned as a `TaskDump` response body.
#[endpoint {
method = GET,
path = "/sp/{type}/{slot}/task-dump/{task_dump_index}",
}]
async fn sp_host_task_dump_get(
rqctx: RequestContext<Self::Context>,
path: Path<PathSpTaskDumpIndex>,
) -> Result<HttpResponseOk<TaskDump>, HttpError>;

/// List SPs via Ignition
///
/// Retrieve information for all SPs via the Ignition controller. This is
Expand Down Expand Up @@ -498,6 +519,16 @@ pub struct PathSpComponent {
pub component: String,
}

// Path parameters for the `/sp/{type}/{slot}/task-dump/{task_dump_index}`
// endpoint: an SP identifier plus the index of the dump to read.
#[derive(Deserialize, JsonSchema)]
pub struct PathSpTaskDumpIndex {
/// ID for the SP that the gateway service translates into the appropriate
/// port for communicating with the given SP.
// `flatten` makes the fields of `SpIdentifier` deserialize from the same
// flat set of path parameters as `task_dump_index`, rather than from a
// nested `sp` object.
#[serde(flatten)]
pub sp: SpIdentifier,
/// The index of the task dump to be read.
// NOTE(review): this is the index of the *dump*, not the index of the
// crashed task (`TaskDump::task_index`) -- presumably in the range
// `0..task-dump count`; confirm.
pub task_dump_index: u32,
}

#[derive(Deserialize, JsonSchema)]
pub struct ComponentCabooseSlot {
/// The firmware slot for which we want to request caboose information.
Expand Down
2 changes: 2 additions & 0 deletions gateway-types/src/component_details.rs
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,7 @@ pub enum MeasurementKind {
InputCurrent,
InputVoltage,
Speed,
CpuTctl,
}

impl From<gateway_messages::measurement::MeasurementKind> for MeasurementKind {
Expand All @@ -387,6 +388,7 @@ impl From<gateway_messages::measurement::MeasurementKind> for MeasurementKind {
MeasurementKind::InputCurrent => Self::InputCurrent,
MeasurementKind::InputVoltage => Self::InputVoltage,
MeasurementKind::Speed => Self::Speed,
MeasurementKind::CpuTctl => Self::CpuTctl,
}
}
}
Expand Down
1 change: 1 addition & 0 deletions gateway-types/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,5 @@ pub mod host;
pub mod ignition;
pub mod rot;
pub mod sensor;
pub mod task_dump;
pub mod update;
35 changes: 35 additions & 0 deletions gateway-types/src/task_dump.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use std::collections::BTreeMap;

// API-facing representation of a single task dump read from an SP. Binary
// fields are carried as text: the archive ID as hex and each memory chunk as
// base64.
#[derive(
Debug,
Clone,
PartialEq,
Eq,
PartialOrd,
Ord,
Deserialize,
Serialize,
JsonSchema,
)]
pub struct TaskDump {
/// Index of the crashed task.
pub task_index: u16,
/// Timestamp at which the task crash occurred.
// NOTE(review): units and epoch are not stated here -- presumably an
// SP-local tick/uptime value rather than wall-clock time; confirm.
pub timestamp: u64,
/// Hex-encoded Hubris archive ID.
pub archive_id: String,
/// `BORD` field from the caboose.
pub bord: String,
/// `GITC` field from the caboose.
pub gitc: String,
/// `VERS` field from the caboose, if present.
pub vers: Option<String>,
/// Base64-encoded raw memory read from the SP.
// Each map value is the base64 encoding of one chunk of memory.
// NOTE(review): the `u32` key is presumably the starting address of that
// chunk -- confirm against the SP comms crate.
pub base64_memory: BTreeMap<u32, String>,
}
61 changes: 61 additions & 0 deletions gateway/src/http_entrypoints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ use gateway_types::rot::RotCfpaSlot;
use gateway_types::rot::RotCmpa;
use gateway_types::rot::RotState;
use gateway_types::sensor::SpSensorReading;
use gateway_types::task_dump::TaskDump;
use gateway_types::update::HostPhase2Progress;
use gateway_types::update::HostPhase2RecoveryImageId;
use gateway_types::update::InstallinatorImageId;
Expand Down Expand Up @@ -655,6 +656,66 @@ impl GatewayApi for GatewayImpl {
apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await
}

/// Ask the SP named in the request path how many task dumps it holds.
///
/// Communication failures are reported as
/// `SpCommsError::SpCommunicationFailed`, tagged with the SP that was
/// queried.
async fn sp_host_task_dump_count(
    rqctx: RequestContext<Self::Context>,
    path: Path<PathSp>,
) -> Result<HttpResponseOk<u32>, HttpError> {
    let apictx = rqctx.context();
    let sp_id = path.into_inner().sp.into();

    // Look up the SP handle and query it in one chain; either step may
    // bail out of the handler with `?`.
    let handler = async {
        let dump_count = apictx
            .mgmt_switch
            .sp(sp_id)?
            .task_dump_count()
            .await
            .map_err(|err| SpCommsError::SpCommunicationFailed {
                sp: sp_id,
                err,
            })?;

        Ok(HttpResponseOk(dump_count))
    };

    // Run the handler under the latency tracker.
    apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await
}

async fn sp_host_task_dump_get(
rqctx: RequestContext<Self::Context>,
path: Path<PathSpTaskDumpIndex>,
) -> Result<HttpResponseOk<TaskDump>, HttpError> {
let apictx = rqctx.context();
let path = path.into_inner();
let task_index = path.task_dump_index;
let sp_id = path.sp.into();

let handler = async {
let sp = apictx.mgmt_switch.sp(sp_id)?;
let raw_dump =
sp.task_dump_read(task_index).await.map_err(|err| {
SpCommsError::SpCommunicationFailed { sp: sp_id, err }
})?;

let archive_id = hex::encode(raw_dump.archive_id);
let base64_memory = raw_dump
.memory
.into_iter()
.map(|(key, mem)| {
let base64_mem =
base64::engine::general_purpose::STANDARD.encode(mem);
(key, base64_mem)
})
.collect();

let dump = TaskDump {
task_index: raw_dump.task_index,
timestamp: raw_dump.timestamp,
archive_id,
bord: raw_dump.bord,
gitc: raw_dump.gitc,
vers: raw_dump.vers,
base64_memory,
};

Ok(HttpResponseOk(dump))
};
apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await
}

async fn ignition_list(
rqctx: RequestContext<Self::Context>,
) -> Result<HttpResponseOk<Vec<SpIgnitionInfo>>, HttpError> {
Expand Down
14 changes: 14 additions & 0 deletions gateway/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -804,6 +804,7 @@ impl SpPoller {
MeasurementKind::InputCurrent => "input_current",
MeasurementKind::InputVoltage => "input_voltage",
MeasurementKind::Speed => "fan_speed",
MeasurementKind::CpuTctl => "cpu_tctl",
};
let error = match error {
MeasurementError::InvalidSensor => "invalid_sensor",
Expand Down Expand Up @@ -858,6 +859,10 @@ impl SpPoller {
&metric::AmdCpuTctl { sensor, datum },
)
}
(Ok(datum), MeasurementKind::CpuTctl) => Sample::new(
target,
&metric::AmdCpuTctl { sensor, datum },
),
// Other measurements with the "temperature" measurement
// kind are physical temperatures that actually exist in
// reality (and are always in Celsius).
Expand All @@ -873,6 +878,12 @@ impl SpPoller {
&metric::AmdCpuTctl { sensor, datum: 0.0 },
)
}
(Err(_), MeasurementKind::CpuTctl) => {
Sample::new_missing(
target,
&metric::AmdCpuTctl { sensor, datum: 0.0 },
)
}
(Err(_), MeasurementKind::Temperature) => {
Sample::new_missing(
target,
Expand Down Expand Up @@ -1205,5 +1216,8 @@ fn comms_error_str(error: CommunicationError) -> &'static str {
CommunicationError::BadTrailingDataSize { .. } => {
"bad_trailing_data_size"
}
CommunicationError::BadDecompressionSize { .. } => {
"bad_decompression_size"
}
}
}
1 change: 1 addition & 0 deletions nexus/tests/integration_tests/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -837,6 +837,7 @@ async fn test_mgs_metrics(
temp += 1;
}
}
Kind::CpuTctl => cpu_tctl += 1,
Kind::Current => current += 1,
Kind::Voltage => voltage += 1,
Kind::InputVoltage => input_voltage += 1,
Expand Down
Loading

0 comments on commit 8b13de1

Please sign in to comment.