Skip to content

Commit

Permalink
Add support for finding oxide processes on a sled (#7320)
Browse files Browse the repository at this point in the history
This PR adds support for finding all oxide processes via libcontract,
relying on the fact that we deploy oxide services with the fmri prefix
svc:/oxide/ or svc:/system/illumos. This allows us to find every pid
started via smf, even if the smf script uses ctrun.

You can see this for yourself by running pfexec ctstat -av on a
helios/illumos based system:
```
205     1       process owned   2492    0       -       -       
        cookie:                0
        informative event set: core
        critical event set:    hwerr empty
        fatal event set:       hwerr
        parameter set:         noorphan regent
        member processes:      2497
        inherited contracts:   none
        service fmri:          svc:/oxide/wicketd:default
        service fmri ctid:     204
        creator:               ctrun
        aux:                   
249     1       process owned   1334    0       -       -       
        cookie:                0x20
        informative event set: none
        critical event set:    hwerr empty
        fatal event set:       none
        parameter set:         inherit regent
        member processes:      2794
        inherited contracts:   none
        service fmri:          svc:/oxide/dendrite:default
        service fmri ctid:     249
        creator:               svc.startd
        aux:                   start
254     1       process owned   1334    0       -       -       
        cookie:                0x20
        informative event set: none
        critical event set:    hwerr empty
        fatal event set:       none
        parameter set:         inherit regent
        member processes:      2802
        inherited contracts:   none
        service fmri:          svc:/oxide/lldpd:default
        service fmri ctid:     254
        creator:               svc.startd
        aux:                   start
```
This PR is on top of:
- #7193
  • Loading branch information
papertigers authored Jan 8, 2025
1 parent d77cebe commit a77566a
Show file tree
Hide file tree
Showing 11 changed files with 397 additions and 5 deletions.
4 changes: 4 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

42 changes: 42 additions & 0 deletions openapi/sled-agent.json
Original file line number Diff line number Diff line change
Expand Up @@ -715,6 +715,48 @@
}
}
},
"/support/pargs-info": {
"get": {
"operationId": "support_pargs_info",
"responses": {
"200": {
"description": "",
"content": {
"*/*": {
"schema": {}
}
}
},
"4XX": {
"$ref": "#/components/responses/Error"
},
"5XX": {
"$ref": "#/components/responses/Error"
}
}
}
},
"/support/pstack-info": {
"get": {
"operationId": "support_pstack_info",
"responses": {
"200": {
"description": "",
"content": {
"*/*": {
"schema": {}
}
}
},
"4XX": {
"$ref": "#/components/responses/Error"
},
"5XX": {
"$ref": "#/components/responses/Error"
}
}
}
},
"/support/zoneadm-info": {
"get": {
"operationId": "support_zoneadm_info",
Expand Down
16 changes: 16 additions & 0 deletions sled-agent/api/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -647,6 +647,22 @@ pub trait SledAgentApi {
async fn support_dladm_info(
request_context: RequestContext<Self::Context>,
) -> Result<HttpResponseOk<FreeformBody>, HttpError>;

// GET /support/pargs-info: returns a freeform (unstructured) response body.
// NOTE(review): deliberately a plain `//` comment; a `///` doc comment on an
// endpoint declaration may flow into the generated OpenAPI document, so
// confirm the spec is regenerated before converting it.
#[endpoint {
method = GET,
path = "/support/pargs-info",
}]
async fn support_pargs_info(
request_context: RequestContext<Self::Context>,
) -> Result<HttpResponseOk<FreeformBody>, HttpError>;

// GET /support/pstack-info: returns a freeform (unstructured) response body.
// NOTE(review): deliberately a plain `//` comment; a `///` doc comment on an
// endpoint declaration may flow into the generated OpenAPI document, so
// confirm the spec is regenerated before converting it.
#[endpoint {
method = GET,
path = "/support/pstack-info",
}]
async fn support_pstack_info(
request_context: RequestContext<Self::Context>,
) -> Result<HttpResponseOk<FreeformBody>, HttpError>;
}

#[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)]
Expand Down
32 changes: 32 additions & 0 deletions sled-agent/src/http_entrypoints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1001,4 +1001,36 @@ impl SledAgentApi for SledAgentImpl {

Ok(HttpResponseOk(FreeformBody(output.into())))
}

// Handler for `GET /support/pargs-info`.
//
// Asks the sled agent for its pargs diagnostics, renders every command
// result to text, and returns the pieces as a single freeform body with a
// blank line between consecutive results.
async fn support_pargs_info(
    request_context: RequestContext<Self::Context>,
) -> Result<HttpResponseOk<FreeformBody>, HttpError> {
    let agent = request_context.context();
    let sections: Vec<_> = agent
        .support_pargs_info()
        .await
        .into_iter()
        .map(|result| result.get_output())
        .collect();
    let body = sections.join("\n\n");

    Ok(HttpResponseOk(FreeformBody(body.into())))
}

// Handler for `GET /support/pstack-info`.
//
// Asks the sled agent for its pstack diagnostics, renders every command
// result to text, and returns the pieces as a single freeform body with a
// blank line between consecutive results.
async fn support_pstack_info(
    request_context: RequestContext<Self::Context>,
) -> Result<HttpResponseOk<FreeformBody>, HttpError> {
    let agent = request_context.context();
    let sections: Vec<_> = agent
        .support_pstack_info()
        .await
        .into_iter()
        .map(|result| result.get_output())
        .collect();
    let body = sections.join("\n\n");

    Ok(HttpResponseOk(FreeformBody(body.into())))
}
}
12 changes: 12 additions & 0 deletions sled-agent/src/sim/http_entrypoints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -737,6 +737,18 @@ impl SledAgentApi for SledAgentSimImpl {
) -> Result<HttpResponseOk<FreeformBody>, HttpError> {
method_unimplemented()
}

// Simulated sled agent: no pargs diagnostics are gathered here; the request
// context is ignored and the call is handed straight to
// `method_unimplemented()`.
async fn support_pargs_info(
_request_context: RequestContext<Self::Context>,
) -> Result<HttpResponseOk<FreeformBody>, HttpError> {
method_unimplemented()
}

// Simulated sled agent: no pstack diagnostics are gathered here; the request
// context is ignored and the call is handed straight to
// `method_unimplemented()`.
async fn support_pstack_info(
_request_context: RequestContext<Self::Context>,
) -> Result<HttpResponseOk<FreeformBody>, HttpError> {
method_unimplemented()
}
}

fn method_unimplemented<T>() -> Result<T, HttpError> {
Expand Down
12 changes: 12 additions & 0 deletions sled-agent/src/sled_agent.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1388,6 +1388,18 @@ impl SledAgent {
) -> Vec<Result<SledDiagnosticsCmdOutput, SledDiagnosticsCmdError>> {
sled_diagnostics::dladm_info().await
}

/// Collects pargs diagnostics by delegating to
/// `sled_diagnostics::pargs_oxide_processes`, passing this agent's logger.
///
/// Returns the per-command results unchanged (presumably one entry per
/// inspected process — confirm against `sled_diagnostics`).
pub(crate) async fn support_pargs_info(
&self,
) -> Vec<Result<SledDiagnosticsCmdOutput, SledDiagnosticsCmdError>> {
sled_diagnostics::pargs_oxide_processes(&self.log).await
}

/// Collects pstack diagnostics by delegating to
/// `sled_diagnostics::pstack_oxide_processes`, passing this agent's logger.
///
/// Returns the per-command results unchanged (presumably one entry per
/// inspected process — confirm against `sled_diagnostics`).
pub(crate) async fn support_pstack_info(
&self,
) -> Vec<Result<SledDiagnosticsCmdOutput, SledDiagnosticsCmdError>> {
sled_diagnostics::pstack_oxide_processes(&self.log).await
}
}

#[derive(From, thiserror::Error, Debug)]
Expand Down
4 changes: 4 additions & 0 deletions sled-diagnostics/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,11 @@ edition = "2021"
workspace = true

[dependencies]
cfg-if.workspace = true
fs-err.workspace = true
futures.workspace = true
libc.workspace = true
omicron-workspace-hack.workspace = true
slog.workspace = true
thiserror.workspace = true
tokio = { workspace = true, features = ["full"] }
182 changes: 182 additions & 0 deletions sled-diagnostics/src/contract.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Bindings to libcontract(3lib).

use fs_err as fs;
use libc::{c_char, c_int, c_void, pid_t};
use slog::{warn, Logger};
use thiserror::Error;

use std::{
collections::BTreeSet,
ffi::{CStr, CString},
os::fd::AsRawFd,
path::Path,
};

// Directory enumerated by `find_oxide_pids`; a `status` file is read from
// inside each of its entries.
const CT_ALL: &str = "/system/contract/all";
// FMRI prefix used by most Oxide services.
const OXIDE_FMRI: &str = "svc:/oxide/";
// FMRI prefix matched in addition to the Oxide one.
// NB: Used for propolis zones
const ILLUMOS_FMRI: &str = "svc:/system/illumos/";
// Detail level handed to `ct_status_read`.
// NOTE(review): presumably mirrors `CTD_ALL` from <sys/contract.h> — confirm.
const CTD_ALL: i32 = 2;

// Opaque status handle filled in by `ct_status_read` and released with
// `ct_status_free`. The lint is silenced because the alias keeps the C-style
// name used by libcontract.
#[allow(non_camel_case_types)]
type ct_stathdl_t = *mut c_void;

// Raw FFI declarations for the libcontract routines used by this module.
// NOTE(review): per the libcontract man pages the `c_int`-returning routines
// return 0 on success and a non-zero error number otherwise — confirm.
#[link(name = "contract")]
extern "C" {
// Reads the status of the contract behind `fd` at the requested `detail`
// level and stores a newly obtained handle in `*stathdlp`.
fn ct_status_read(
fd: c_int,
detail: c_int,
stathdlp: *mut ct_stathdl_t,
) -> c_int;
// Releases a handle previously produced by `ct_status_read`.
fn ct_status_free(stathdlp: ct_stathdl_t);
// Returns the contract id recorded in the status handle.
fn ct_status_get_id(stathdlp: ct_stathdl_t) -> i32;
// Hands back a pointer to the member pid array in `*pidpp` and its length
// in `*n`.
fn ct_pr_status_get_members(
stathdlp: ct_stathdl_t,
pidpp: *mut *mut pid_t,
n: *mut u32,
) -> c_int;
// Hands back a pointer to the service FMRI C string in `*fmri`.
fn ct_pr_status_get_svc_fmri(
stathdlp: ct_stathdl_t,
fmri: *mut *mut c_char,
) -> c_int;
}

/// Errors produced while reading contract status information.
#[derive(Error, Debug)]
pub enum ContractError {
/// A plain filesystem operation failed (opening a status file or walking
/// the contract directory).
#[error(transparent)]
FileIo(#[from] std::io::Error),
/// `ct_pr_status_get_svc_fmri` reported failure for contract `ctid`.
#[error(
"Failed to call ct_pr_status_get_svc_fmri for contract {ctid}: {error}"
)]
Fmri { ctid: i32, error: std::io::Error },
/// `ct_pr_status_get_members` reported failure for contract `ctid`.
#[error(
"Failed to call ct_pr_status_get_members for contract {ctid}: {error}"
)]
Members { ctid: i32, error: std::io::Error },
/// `ct_status_read` reported success but left the handle null for the
/// given status file.
#[error("ct_status_read returned successfully but handed back a null ptr for {0}")]
Null(std::path::PathBuf),
/// `ct_status_read` reported failure for the status file at `path`.
#[error("Failed to call ct_status_read on {path}: {error}")]
StatusRead { path: std::path::PathBuf, error: std::io::Error },
}

/// Owning wrapper around a libcontract status handle.
///
/// The handle is obtained in `ContractStatus::new` and released when the
/// value is dropped.
pub struct ContractStatus {
// Raw handle returned by `ct_status_read`; `new` rejects null handles.
handle: ct_stathdl_t,
}

// Frees the underlying libcontract status handle when the wrapper goes away.
impl Drop for ContractStatus {
fn drop(&mut self) {
// SAFETY: within this module a `ContractStatus` is only built by `new`,
// which stores a non-null handle obtained from `ct_status_read`; `drop`
// runs once, so the handle is freed exactly once.
unsafe { ct_status_free(self.handle) };
}
}

/// Calls a libcontract routine and converts its integer return value into a
/// `Result`.
///
/// A return value of `0` becomes `Ok(0)`. Any other value is treated as the
/// error number itself and wrapped with `std::io::Error::from_raw_os_error`.
/// libcontract routines hand their error code back as the return value and
/// do not promise to set `errno`, so consulting `errno` afterwards could
/// report a stale or unrelated error.
///
/// The macro wraps the call in `unsafe`; the caller is responsible for
/// passing arguments that satisfy the contract of the function being invoked.
macro_rules! libcall_io {
    ($fn: ident ( $($arg: expr), * $(,)*) ) => {{
        // SAFETY: upheld by the macro's caller, who supplies the arguments.
        let res = unsafe { $fn($($arg, )*) };
        if res == 0 {
            Ok(res)
        } else {
            // The return value is the error number; do not read `errno`.
            Err(std::io::Error::from_raw_os_error(res))
        }
    }};
}

impl ContractStatus {
fn new(contract_status: &Path) -> Result<Self, ContractError> {
let file = fs::File::open(contract_status)?;
let mut handle: ct_stathdl_t = std::ptr::null_mut();
libcall_io!(ct_status_read(file.as_raw_fd(), CTD_ALL, &mut handle,))
.map_err(|error| ContractError::StatusRead {
path: contract_status.to_path_buf(),
error,
})?;

// We don't ever expect the system to hand back a null ptr when
// returning success but let's be extra cautious anyways.
if handle.is_null() {
return Err(ContractError::Null(contract_status.to_path_buf()));
}

Ok(Self { handle })
}

fn get_members(&self) -> Result<&[i32], ContractError> {
let mut numpids = 0;
let mut pids: *mut pid_t = std::ptr::null_mut();

let pids = {
libcall_io!(ct_pr_status_get_members(
self.handle,
&mut pids,
&mut numpids,
))
.map_err(|error| {
let ctid = unsafe { ct_status_get_id(self.handle) };
ContractError::Members { ctid, error }
})?;

unsafe {
if pids.is_null() {
&[]
} else {
std::slice::from_raw_parts(pids, numpids as usize)
}
}
};

Ok(pids)
}

fn get_fmri(&self) -> Result<Option<CString>, ContractError> {
// The lifetime of this string is tied to the lifetime of the status
// handle itself and will be cleaned up when the handle is freed.
let mut ptr: *mut c_char = std::ptr::null_mut();
libcall_io!(ct_pr_status_get_svc_fmri(self.handle, &mut ptr)).map_err(
|error| {
let ctid = unsafe { ct_status_get_id(self.handle) };
ContractError::Fmri { ctid, error }
},
)?;

if ptr.is_null() {
return Ok(None);
}

let cstr = unsafe { CStr::from_ptr(ptr) };
Ok(Some(cstr.to_owned()))
}
}

/// Collects the pids of every process that belongs to a contract whose
/// service FMRI starts with `OXIDE_FMRI` or `ILLUMOS_FMRI`.
///
/// Every entry under `CT_ALL` is visited and its `status` file is read.
/// Contracts whose status cannot be read are logged at warning level and
/// skipped. A contract without an FMRI is treated as having an empty one and
/// therefore never matches.
///
/// # Errors
///
/// Fails if the contract directory cannot be listed, or if looking up the
/// FMRI or the members of a successfully read contract fails.
///
/// NOTE(review): unlike status-read failures, an FMRI/members lookup failure
/// on a single contract aborts the entire scan — confirm this is intended.
pub fn find_oxide_pids(log: &Logger) -> Result<BTreeSet<i32>, ContractError> {
    let mut oxide_pids = BTreeSet::new();

    for entry in fs::read_dir(CT_ALL)? {
        let status_path = entry?.path().join("status");

        // The contract may disappear between listing the directory and
        // reading its status. For diagnostics it is fine to skip such
        // contracts, as long as a log line records that it happened.
        let status = match ContractStatus::new(&status_path) {
            Ok(status) => status,
            Err(e) => {
                warn!(log, "Failed to read contract ({:?}): {}", status_path, e);
                continue;
            }
        };

        let raw_fmri = status.get_fmri()?.unwrap_or_default();
        let fmri = raw_fmri.to_string_lossy();
        let is_oxide_service = [OXIDE_FMRI, ILLUMOS_FMRI]
            .iter()
            .any(|&prefix| fmri.starts_with(prefix));
        if !is_oxide_service {
            continue;
        }

        oxide_pids.extend(status.get_members()?);
    }

    Ok(oxide_pids)
}
18 changes: 18 additions & 0 deletions sled-diagnostics/src/contract_stub.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
//! Stub implementation for platforms without libcontract(3lib).
use std::collections::BTreeSet;

use slog::{warn, Logger};
use thiserror::Error;

/// Error type of the stub implementation. It has no variants, so a value of
/// this type can never be constructed.
#[derive(Error, Debug)]
pub enum ContractError {}

/// Stub counterpart of the libcontract-backed `find_oxide_pids`.
///
/// Logs a warning that pids cannot be discovered on this platform and
/// returns an empty set; it never returns an error.
pub fn find_oxide_pids(log: &Logger) -> Result<BTreeSet<i32>, ContractError> {
warn!(
log,
"Unable to find oxide pids on a non illumos platform, \
returning empty set"
);
Ok(BTreeSet::new())
}
Loading

0 comments on commit a77566a

Please sign in to comment.