diff --git a/Cargo.lock b/Cargo.lock index 86d7de4669..459467e96d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10803,8 +10803,12 @@ dependencies = [ name = "sled-diagnostics" version = "0.1.0" dependencies = [ + "cfg-if", + "fs-err", "futures", + "libc", "omicron-workspace-hack", + "slog", "thiserror 1.0.69", "tokio", ] diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 81d9211104..9b750b0cf7 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -715,6 +715,48 @@ } } }, + "/support/pargs-info": { + "get": { + "operationId": "support_pargs_info", + "responses": { + "200": { + "description": "", + "content": { + "*/*": { + "schema": {} + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/support/pstack-info": { + "get": { + "operationId": "support_pstack_info", + "responses": { + "200": { + "description": "", + "content": { + "*/*": { + "schema": {} + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/support/zoneadm-info": { "get": { "operationId": "support_zoneadm_info", diff --git a/sled-agent/api/src/lib.rs b/sled-agent/api/src/lib.rs index 56c6760be7..25b943064b 100644 --- a/sled-agent/api/src/lib.rs +++ b/sled-agent/api/src/lib.rs @@ -647,6 +647,22 @@ pub trait SledAgentApi { async fn support_dladm_info( request_context: RequestContext, ) -> Result, HttpError>; + + #[endpoint { + method = GET, + path = "/support/pargs-info", + }] + async fn support_pargs_info( + request_context: RequestContext, + ) -> Result, HttpError>; + + #[endpoint { + method = GET, + path = "/support/pstack-info", + }] + async fn support_pstack_info( + request_context: RequestContext, + ) -> Result, HttpError>; } #[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)] diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index 
edba1c47b1..54e93aa792 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -1001,4 +1001,36 @@ impl SledAgentApi for SledAgentImpl { Ok(HttpResponseOk(FreeformBody(output.into()))) } + + async fn support_pargs_info( + request_context: RequestContext, + ) -> Result, HttpError> { + let sa = request_context.context(); + let output = sa + .support_pargs_info() + .await + .into_iter() + .map(|cmd| cmd.get_output()) + .collect::>() + .as_slice() + .join("\n\n"); + + Ok(HttpResponseOk(FreeformBody(output.into()))) + } + + async fn support_pstack_info( + request_context: RequestContext, + ) -> Result, HttpError> { + let sa = request_context.context(); + let output = sa + .support_pstack_info() + .await + .into_iter() + .map(|cmd| cmd.get_output()) + .collect::>() + .as_slice() + .join("\n\n"); + + Ok(HttpResponseOk(FreeformBody(output.into()))) + } } diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs index 60dcb1be31..66414fc74e 100644 --- a/sled-agent/src/sim/http_entrypoints.rs +++ b/sled-agent/src/sim/http_entrypoints.rs @@ -737,6 +737,18 @@ impl SledAgentApi for SledAgentSimImpl { ) -> Result, HttpError> { method_unimplemented() } + + async fn support_pargs_info( + _request_context: RequestContext, + ) -> Result, HttpError> { + method_unimplemented() + } + + async fn support_pstack_info( + _request_context: RequestContext, + ) -> Result, HttpError> { + method_unimplemented() + } } fn method_unimplemented() -> Result { diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 37526690cb..c626957097 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -1388,6 +1388,18 @@ impl SledAgent { ) -> Vec> { sled_diagnostics::dladm_info().await } + + pub(crate) async fn support_pargs_info( + &self, + ) -> Vec> { + sled_diagnostics::pargs_oxide_processes(&self.log).await + } + + pub(crate) async fn support_pstack_info( + &self, + ) -> Vec> { + 
sled_diagnostics::pstack_oxide_processes(&self.log).await + } } #[derive(From, thiserror::Error, Debug)] diff --git a/sled-diagnostics/Cargo.toml b/sled-diagnostics/Cargo.toml index 98ac59b674..9e10dfe1c7 100644 --- a/sled-diagnostics/Cargo.toml +++ b/sled-diagnostics/Cargo.toml @@ -7,7 +7,11 @@ edition = "2021" workspace = true [dependencies] +cfg-if.workspace = true +fs-err.workspace = true futures.workspace = true +libc.workspace = true omicron-workspace-hack.workspace = true +slog.workspace = true thiserror.workspace = true tokio = { workspace = true, features = ["full"] } diff --git a/sled-diagnostics/src/contract.rs b/sled-diagnostics/src/contract.rs new file mode 100644 index 0000000000..f6bca904f9 --- /dev/null +++ b/sled-diagnostics/src/contract.rs @@ -0,0 +1,182 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Bindings to libcontract(3lib). 
+ +use fs_err as fs; +use libc::{c_char, c_int, c_void, pid_t}; +use slog::{warn, Logger}; +use thiserror::Error; + +use std::{ + collections::BTreeSet, + ffi::{CStr, CString}, + os::fd::AsRawFd, + path::Path, +}; + +const CT_ALL: &str = "/system/contract/all"; +// Most Oxide services +const OXIDE_FMRI: &str = "svc:/oxide/"; +// NB: Used for propolis zones +const ILLUMOS_FMRI: &str = "svc:/system/illumos/"; +const CTD_ALL: i32 = 2; + +#[allow(non_camel_case_types)] +type ct_stathdl_t = *mut c_void; + +#[link(name = "contract")] +extern "C" { + fn ct_status_read( + fd: c_int, + detail: c_int, + stathdlp: *mut ct_stathdl_t, + ) -> c_int; + fn ct_status_free(stathdlp: ct_stathdl_t); + fn ct_status_get_id(stathdlp: ct_stathdl_t) -> i32; + fn ct_pr_status_get_members( + stathdlp: ct_stathdl_t, + pidpp: *mut *mut pid_t, + n: *mut u32, + ) -> c_int; + fn ct_pr_status_get_svc_fmri( + stathdlp: ct_stathdl_t, + fmri: *mut *mut c_char, + ) -> c_int; +} + +#[derive(Error, Debug)] +pub enum ContractError { + #[error(transparent)] + FileIo(#[from] std::io::Error), + #[error( + "Failed to call ct_pr_status_get_svc_fmri for contract {ctid}: {error}" + )] + Fmri { ctid: i32, error: std::io::Error }, + #[error( + "Failed to call ct_pr_status_get_members for contract {ctid}: {error}" + )] + Members { ctid: i32, error: std::io::Error }, + #[error("ct_status_read returned successfully but handed back a null ptr for {0}")] + Null(std::path::PathBuf), + #[error("Failed to call ct_status_read on {path}: {error}")] + StatusRead { path: std::path::PathBuf, error: std::io::Error }, +} + +pub struct ContractStatus { + handle: ct_stathdl_t, +} + +impl Drop for ContractStatus { + fn drop(&mut self) { + unsafe { ct_status_free(self.handle) }; + } +} + +macro_rules! 
libcall_io { + ($fn: ident ( $($arg: expr), * $(,)*) ) => {{ + let res = unsafe { $fn($($arg, )*) }; + if res == 0 { + Ok(res) + } else { + Err(std::io::Error::last_os_error()) + } + }}; + } + +impl ContractStatus { + fn new(contract_status: &Path) -> Result { + let file = fs::File::open(contract_status)?; + let mut handle: ct_stathdl_t = std::ptr::null_mut(); + libcall_io!(ct_status_read(file.as_raw_fd(), CTD_ALL, &mut handle,)) + .map_err(|error| ContractError::StatusRead { + path: contract_status.to_path_buf(), + error, + })?; + + // We don't ever expect the system to hand back a null ptr when + // returning success but let's be extra cautious anyways. + if handle.is_null() { + return Err(ContractError::Null(contract_status.to_path_buf())); + } + + Ok(Self { handle }) + } + + fn get_members(&self) -> Result<&[i32], ContractError> { + let mut numpids = 0; + let mut pids: *mut pid_t = std::ptr::null_mut(); + + let pids = { + libcall_io!(ct_pr_status_get_members( + self.handle, + &mut pids, + &mut numpids, + )) + .map_err(|error| { + let ctid = unsafe { ct_status_get_id(self.handle) }; + ContractError::Members { ctid, error } + })?; + + unsafe { + if pids.is_null() { + &[] + } else { + std::slice::from_raw_parts(pids, numpids as usize) + } + } + }; + + Ok(pids) + } + + fn get_fmri(&self) -> Result, ContractError> { + // The lifetime of this string is tied to the lifetime of the status + // handle itself and will be cleaned up when the handle is freed. 
+ let mut ptr: *mut c_char = std::ptr::null_mut(); + libcall_io!(ct_pr_status_get_svc_fmri(self.handle, &mut ptr)).map_err( + |error| { + let ctid = unsafe { ct_status_get_id(self.handle) }; + ContractError::Fmri { ctid, error } + }, + )?; + + if ptr.is_null() { + return Ok(None); + } + + let cstr = unsafe { CStr::from_ptr(ptr) }; + Ok(Some(cstr.to_owned())) + } +} + +pub fn find_oxide_pids(log: &Logger) -> Result, ContractError> { + let mut pids = BTreeSet::new(); + let ents = fs::read_dir(CT_ALL)?; + for ct in ents { + let ctid = ct?; + let mut path = ctid.path(); + path.push("status"); + + let status = match ContractStatus::new(path.as_path()) { + Ok(status) => status, + Err(e) => { + // There's a race between the time we find the contracts to the + // time we attempt to read the contract's status. We can safely + // skip all of the errors for diagnostics purposes but we should + // leave a log in our wake. + warn!(log, "Failed to read contract ({:?}): {}", path, e); + continue; + } + }; + + let fmri_owned = status.get_fmri()?.unwrap_or_default(); + let fmri = fmri_owned.to_string_lossy(); + if fmri.starts_with(OXIDE_FMRI) || fmri.starts_with(ILLUMOS_FMRI) { + pids.extend(status.get_members()?); + } + } + + Ok(pids) +} diff --git a/sled-diagnostics/src/contract_stub.rs b/sled-diagnostics/src/contract_stub.rs new file mode 100644 index 0000000000..9637c3486d --- /dev/null +++ b/sled-diagnostics/src/contract_stub.rs @@ -0,0 +1,18 @@ +//! Stub implementation for platforms without libcontract(3lib). 
+ +use std::collections::BTreeSet; + +use slog::{warn, Logger}; +use thiserror::Error; + +#[derive(Error, Debug)] +pub enum ContractError {} + +pub fn find_oxide_pids(log: &Logger) -> Result, ContractError> { + warn!( + log, + "Unable to find oxide pids on a non illumos platform, \ + returning empty set" + ); + Ok(BTreeSet::new()) +} diff --git a/sled-diagnostics/src/lib.rs b/sled-diagnostics/src/lib.rs index 466c1ec446..cbb3a5a0d0 100644 --- a/sled-diagnostics/src/lib.rs +++ b/sled-diagnostics/src/lib.rs @@ -5,6 +5,16 @@ //! Diagnostics for an Oxide sled that exposes common support commands. use futures::{stream::FuturesUnordered, StreamExt}; +use slog::Logger; + +cfg_if::cfg_if! { + if #[cfg(target_os = "illumos")] { + mod contract; + } else { + mod contract_stub; + use contract_stub as contract; + } +} mod queries; pub use crate::queries::{ @@ -28,7 +38,7 @@ pub async fn ipadm_info( execute_command_with_timeout(c, DEFAULT_TIMEOUT).await }) .collect::>() - .collect::>>() + .collect::>>() .await } @@ -47,6 +57,48 @@ pub async fn dladm_info( execute_command_with_timeout(c, DEFAULT_TIMEOUT).await }) .collect::>() - .collect::>>() + .collect::>>() + .await +} + +pub async fn pargs_oxide_processes( + log: &Logger, +) -> Vec> { + // In a diagnostics context we care about looping over every pid we find, + // but on failure we should just return a single error in a vec that + // represents the entire failed operation. + let pids = match contract::find_oxide_pids(log) { + Ok(pids) => pids, + Err(e) => return vec![Err(e.into())], + }; + + pids.iter() + .map(|pid| pargs_process(*pid)) + .map(|c| async move { + execute_command_with_timeout(c, DEFAULT_TIMEOUT).await + }) + .collect::>() + .collect::>>() + .await +} + +pub async fn pstack_oxide_processes( + log: &Logger, +) -> Vec> { + // In a diagnostics context we care about looping over every pid we find, + // but on failure we should just return a single error in a vec that + // represents the entire failed operation. 
+ let pids = match contract::find_oxide_pids(log) { + Ok(pids) => pids, + Err(e) => return vec![Err(e.into())], + }; + + pids.iter() + .map(|pid| pstack_process(*pid)) + .map(|c| async move { + execute_command_with_timeout(c, DEFAULT_TIMEOUT).await + }) + .collect::>() + .collect::>>() .await } diff --git a/sled-diagnostics/src/queries.rs b/sled-diagnostics/src/queries.rs index 9a66842cb2..2f2b135f0d 100644 --- a/sled-diagnostics/src/queries.rs +++ b/sled-diagnostics/src/queries.rs @@ -9,9 +9,17 @@ use std::{process::Command, time::Duration}; use thiserror::Error; use tokio::io::AsyncReadExt; +#[cfg(target_os = "illumos")] +use crate::contract::ContractError; + +#[cfg(not(target_os = "illumos"))] +use crate::contract_stub::ContractError; + const DLADM: &str = "/usr/sbin/dladm"; const IPADM: &str = "/usr/sbin/ipadm"; const PFEXEC: &str = "/usr/bin/pfexec"; +const PSTACK: &str = "/usr/bin/pstack"; +const PARGS: &str = "/usr/bin/pargs"; const ZONEADM: &str = "/usr/sbin/zoneadm"; pub const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10); @@ -22,6 +30,8 @@ pub trait SledDiagnosticsCommandHttpOutput { #[derive(Error, Debug)] pub enum SledDiagnosticsCmdError { + #[error("libcontract error: {0}")] + Contract(#[from] ContractError), #[error("Failed to duplicate pipe for command [{command}]: {error}")] Dup { command: String, error: std::io::Error }, #[error("Failed to proccess output for command [{command}]: {error}")] @@ -205,9 +215,17 @@ pub fn dladm_show_linkprop() -> Command { cmd } -/* - * Public API - */ +pub fn pargs_process(pid: i32) -> Command { + let mut cmd = std::process::Command::new(PFEXEC); + cmd.env_clear().arg(PARGS).arg("-ae").arg(pid.to_string()); + cmd +} + +pub fn pstack_process(pid: i32) -> Command { + let mut cmd = std::process::Command::new(PFEXEC); + cmd.env_clear().arg(PSTACK).arg(pid.to_string()); + cmd +} #[cfg(test)] mod test {