Skip to content

Commit

Permalink
Initial implementation of debug information manager
Browse files Browse the repository at this point in the history
Currently, executables are opened as soon as possible to have a hold of
them to be able to symbolize later on. This approach is not ideal
because the number of opened file descriptors can balloon, and this is
done without first checking if the executables contain debug information
at all.

In the future all the local and remote use-cases will transition to
using the debug information manager. This first iteration is quite
simple and lacks a lot of features needed to make this really work. In
this commit it's still disabled by default, and just adds some basic
features to upload debug information to a server, if needed.

Left various TODOs on the planned changes.

Test Plan
=========

CI + manual checks.
  • Loading branch information
javierhonduco committed Dec 13, 2024
1 parent 4429694 commit e929617
Show file tree
Hide file tree
Showing 9 changed files with 249 additions and 23 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion lightswitch-object/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "lightswitch-object"
version = "0.1.0"
version = "0.1.1"
edition = "2021"
description = "Deals with object files"
license = "MIT"
Expand Down
5 changes: 5 additions & 0 deletions lightswitch-object/src/object.rs
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,11 @@ impl ObjectFile {
Ok(BuildId::sha256_from_digest(&self.code_hash))
}

/// Returns whether the object has debug symbols.
pub fn has_debug_info(&self) -> bool {
self.object.has_debug_symbols()
}

pub fn is_dynamic(&self) -> bool {
self.object.kind() == ObjectKind::Dynamic
}
Expand Down
10 changes: 10 additions & 0 deletions src/cli/args.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,14 @@ pub(crate) enum Symbolizer {
None,
}

#[derive(PartialEq, clap::ValueEnum, Debug, Clone, Default)]
pub(crate) enum DebugInfoBackend {
#[default]
None,
Copy,
Remote,
}

#[derive(Parser, Debug)]
pub(crate) struct CliArgs {
/// Specific PIDs to profile
Expand Down Expand Up @@ -126,4 +134,6 @@ pub(crate) struct CliArgs {
pub(crate) exclude_self: bool,
#[arg(long, default_value_t, value_enum)]
pub(crate) symbolizer: Symbolizer,
#[arg(long, default_value_t, value_enum)]
pub(crate) debug_info_backend: DebugInfoBackend,
}
51 changes: 34 additions & 17 deletions src/cli/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,13 @@ use std::io::Write;
use std::panic;
use std::path::PathBuf;
use std::sync::{Arc, Mutex};
use std::time::Duration;

use clap::Parser;
use crossbeam_channel::bounded;
use inferno::flamegraph;
use lightswitch::collector::{AggregatorCollector, Collector, NullCollector, StreamingCollector};
use lightswitch::debug_info::DebugInfoManager;
use lightswitch_metadata::metadata_provider::GlobalMetadataProvider;
use nix::unistd::Uid;
use prost::Message;
Expand All @@ -21,6 +23,9 @@ use tracing_subscriber::FmtSubscriber;
use lightswitch_capabilities::system_info::SystemInfo;
use lightswitch_metadata::metadata_provider::ThreadSafeGlobalMetadataProvider;

use lightswitch::debug_info::{
DebugInfoBackendFilesystem, DebugInfoBackendNull, DebugInfoBackendRemote,
};
use lightswitch::profile::symbolize_profile;
use lightswitch::profile::{fold_profile, to_pprof};
use lightswitch::profiler::{Profiler, ProfilerConfig};
Expand All @@ -32,12 +37,13 @@ mod args;
mod validators;

use crate::args::CliArgs;
use crate::args::DebugInfoBackend;
use crate::args::LoggingLevel;
use crate::args::ProfileFormat;
use crate::args::ProfileSender;
use crate::args::Symbolizer;

const DEFAULT_PPROF_INGEST_URL: &str = "http://localhost:4567/pprof/new";
const DEFAULT_SERVER_URL: &str = "http://localhost:4567";

/// Exit the main thread if any thread panics. We prefer this behaviour because pretty much every
/// thread is load bearing for the correct functioning.
Expand Down Expand Up @@ -98,24 +104,34 @@ fn main() -> Result<(), Box<dyn Error>> {
}
}

let server_url = args.server_url.unwrap_or(DEFAULT_SERVER_URL.into());

let metadata_provider: ThreadSafeGlobalMetadataProvider =
Arc::new(Mutex::new(GlobalMetadataProvider::default()));

let collector = Arc::new(Mutex::new(match args.sender {
ProfileSender::None => Box::new(NullCollector::new()) as Box<dyn Collector + Send>,
ProfileSender::LocalDisk => {
Box::new(AggregatorCollector::new()) as Box<dyn Collector + Send>
}
ProfileSender::Remote => Box::new(StreamingCollector::new(
args.symbolizer == Symbolizer::Local,
args.server_url
.as_ref()
.map_or(DEFAULT_PPROF_INGEST_URL, |v| v),
ProfilerConfig::default().session_duration,
args.sample_freq,
metadata_provider.clone(),
)) as Box<dyn Collector + Send>,
}));
let collector: Arc<Mutex<Box<dyn Collector + Send>>> =
Arc::new(Mutex::new(match args.sender {
ProfileSender::None => Box::new(NullCollector::new()),
ProfileSender::LocalDisk => Box::new(AggregatorCollector::new()),
ProfileSender::Remote => Box::new(StreamingCollector::new(
args.symbolizer == Symbolizer::Local,
&server_url,
ProfilerConfig::default().session_duration,
args.sample_freq,
metadata_provider.clone(),
)),
}));

let debug_info_manager: Box<dyn DebugInfoManager> = match args.debug_info_backend {
DebugInfoBackend::None => Box::new(DebugInfoBackendNull {}),
DebugInfoBackend::Copy => Box::new(DebugInfoBackendFilesystem {
path: PathBuf::from("/tmp"),
}),
DebugInfoBackend::Remote => Box::new(DebugInfoBackendRemote {
http_client_timeout: Duration::from_millis(500),
server_url,
}),
};

let profiler_config = ProfilerConfig {
libbpf_debug: args.libbpf_debug,
Expand All @@ -128,6 +144,7 @@ fn main() -> Result<(), Box<dyn Error>> {
mapsize_aggregated_stacks: args.mapsize_aggregated_stacks,
mapsize_rate_limits: args.mapsize_rate_limits,
exclude_self: args.exclude_self,
debug_info_manager,
..Default::default()
};

Expand Down Expand Up @@ -260,7 +277,7 @@ mod tests {
cmd.arg("--help");
cmd.assert().success();
let actual = String::from_utf8(cmd.unwrap().stdout).unwrap();
insta::assert_yaml_snapshot!(actual, @r#""Usage: lightswitch [OPTIONS]\n\nOptions:\n --pids <PIDS>\n Specific PIDs to profile\n\n --tids <TIDS>\n Specific TIDs to profile (these can be outside the PIDs selected above)\n\n --show-unwind-info <PATH_TO_BINARY>\n Show unwind info for given binary\n\n --show-info <PATH_TO_BINARY>\n Show build ID for given binary\n\n -D, --duration <DURATION>\n How long this agent will run in seconds\n \n [default: 18446744073709551615]\n\n --libbpf-debug\n Enable libbpf logs. This includes the BPF verifier output\n\n --bpf-logging\n Enable BPF programs logging\n\n --logging <LOGGING>\n Set lightswitch's logging level\n \n [default: info]\n [possible values: trace, debug, info, warn, error]\n\n --sample-freq <SAMPLE_FREQ_IN_HZ>\n Per-CPU Sampling Frequency in Hz\n \n [default: 19]\n\n --profile-format <PROFILE_FORMAT>\n Output file for Flame Graph in SVG format\n \n [default: flame-graph]\n [possible values: none, flame-graph, pprof]\n\n --profile-path <PROFILE_PATH>\n Path for the generated profile\n\n --profile-name <PROFILE_NAME>\n Name for the generated profile\n\n --sender <SENDER>\n Where to write the profile\n \n [default: local-disk]\n\n Possible values:\n - none: Discard the profile. Used for kernel tests\n - local-disk\n - remote\n\n --server-url <SERVER_URL>\n \n\n --perf-buffer-bytes <PERF_BUFFER_BYTES>\n Size of each profiler perf buffer, in bytes (must be a power of 2)\n \n [default: 524288]\n\n --mapsize-info\n Print eBPF map sizes after creation\n\n --mapsize-stacks <MAPSIZE_STACKS>\n max number of individual stacks to capture before aggregation\n \n [default: 100000]\n\n --mapsize-aggregated-stacks <MAPSIZE_AGGREGATED_STACKS>\n max number of unique stacks after aggregation\n \n [default: 10000]\n\n --mapsize-rate-limits <MAPSIZE_RATE_LIMITS>\n max number of rate limit entries\n \n [default: 5000]\n\n --exclude-self\n Do not profile the profiler (myself)\n\n --symbolizer <SYMBOLIZER>\n [default: local]\n [possible values: local, none]\n\n -h, --help\n Print help (see a summary with '-h')\n""#);
insta::assert_yaml_snapshot!(actual, @r#""Usage: lightswitch [OPTIONS]\n\nOptions:\n --pids <PIDS>\n Specific PIDs to profile\n\n --tids <TIDS>\n Specific TIDs to profile (these can be outside the PIDs selected above)\n\n --show-unwind-info <PATH_TO_BINARY>\n Show unwind info for given binary\n\n --show-info <PATH_TO_BINARY>\n Show build ID for given binary\n\n -D, --duration <DURATION>\n How long this agent will run in seconds\n \n [default: 18446744073709551615]\n\n --libbpf-debug\n Enable libbpf logs. This includes the BPF verifier output\n\n --bpf-logging\n Enable BPF programs logging\n\n --logging <LOGGING>\n Set lightswitch's logging level\n \n [default: info]\n [possible values: trace, debug, info, warn, error]\n\n --sample-freq <SAMPLE_FREQ_IN_HZ>\n Per-CPU Sampling Frequency in Hz\n \n [default: 19]\n\n --profile-format <PROFILE_FORMAT>\n Output file for Flame Graph in SVG format\n \n [default: flame-graph]\n [possible values: none, flame-graph, pprof]\n\n --profile-path <PROFILE_PATH>\n Path for the generated profile\n\n --profile-name <PROFILE_NAME>\n Name for the generated profile\n\n --sender <SENDER>\n Where to write the profile\n \n [default: local-disk]\n\n Possible values:\n - none: Discard the profile. Used for kernel tests\n - local-disk\n - remote\n\n --server-url <SERVER_URL>\n \n\n --perf-buffer-bytes <PERF_BUFFER_BYTES>\n Size of each profiler perf buffer, in bytes (must be a power of 2)\n \n [default: 524288]\n\n --mapsize-info\n Print eBPF map sizes after creation\n\n --mapsize-stacks <MAPSIZE_STACKS>\n max number of individual stacks to capture before aggregation\n \n [default: 100000]\n\n --mapsize-aggregated-stacks <MAPSIZE_AGGREGATED_STACKS>\n max number of unique stacks after aggregation\n \n [default: 10000]\n\n --mapsize-rate-limits <MAPSIZE_RATE_LIMITS>\n max number of rate limit entries\n \n [default: 5000]\n\n --exclude-self\n Do not profile the profiler (myself)\n\n --symbolizer <SYMBOLIZER>\n [default: local]\n [possible values: local, none]\n\n --debug-info-backend <DEBUG_INFO_BACKEND>\n [default: none]\n [possible values: none, copy, remote]\n\n -h, --help\n Print help (see a summary with '-h')\n""#);
}

#[rstest]
Expand Down
2 changes: 1 addition & 1 deletion src/collector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ impl StreamingCollector {
) -> Self {
Self {
local_symbolizer,
pprof_ingest_url: pprof_ingest_url.into(),
pprof_ingest_url: format!("{}/pprof/new", pprof_ingest_url),
http_client_timeout: Duration::from_secs(30),
profile_duration,
profile_frequency_hz,
Expand Down
165 changes: 165 additions & 0 deletions src/debug_info.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
use std::io::Read;
use std::path::PathBuf;
use std::time::Duration;

use reqwest::StatusCode;
use tracing::instrument;

use lightswitch_object::BuildId;
use lightswitch_object::ExecutableId;

/// Handles with debug information.
///
/// This currently experimental, not feature-complete and not used yet during
/// symbolization. The end goal would be to keep track of every debug info
/// that's either present locally or remotely (depending on configuration), while
/// minimizing the number of open FDs, file copies, and race condition windows.
pub trait DebugInfoManager {
fn add_if_not_present(
&self,
name: &str,
build_id: &BuildId,
executable_id: ExecutableId,
file: &mut std::fs::File,
) -> anyhow::Result<()>;
fn debug_info_path(&self) -> Option<PathBuf>;
}

pub struct DebugInfoBackendNull {}
impl DebugInfoManager for DebugInfoBackendNull {
fn add_if_not_present(
&self,
_name: &str,
_build_id: &BuildId,
_executable_id: ExecutableId,
_file: &mut std::fs::File,
) -> anyhow::Result<()> {
Ok(())
}

fn debug_info_path(&self) -> Option<PathBuf> {
None
}
}

#[derive(Debug)]
pub struct DebugInfoBackendFilesystem {
pub path: PathBuf,
}
impl DebugInfoManager for DebugInfoBackendFilesystem {
#[instrument]
fn add_if_not_present(
&self,
_name: &str,
build_id: &BuildId,
executable_id: ExecutableId,
file: &mut std::fs::File,
) -> anyhow::Result<()> {
// try to find, else extract
if self.find_in_fs(build_id) {
return Ok(());
}

self.add_to_fs(build_id, executable_id, file)
}

fn debug_info_path(&self) -> Option<PathBuf> {
todo!()
}
}

impl DebugInfoBackendFilesystem {
fn find_in_fs(&self, build_id: &BuildId) -> bool {
self.path.join(build_id.to_string()).exists()
}

fn add_to_fs(
&self,
build_id: &BuildId,
_executable_id: ExecutableId,
file: &mut std::fs::File,
) -> anyhow::Result<()> {
// TODO: add support for other methods beyond copying. For example
// hardlinks could be used and only fall back to copying if the src
// and dst filesystems differ.
let mut writer = std::fs::File::create(self.path.join(build_id.to_string()))?;
std::io::copy(file, &mut writer)?;
Ok(())
}
}

#[derive(Debug)]
pub struct DebugInfoBackendRemote {
pub http_client_timeout: Duration,
pub server_url: String,
}
impl DebugInfoManager for DebugInfoBackendRemote {
#[instrument(level = "debug")]
fn add_if_not_present(
&self,
name: &str,
build_id: &BuildId,
executable_id: ExecutableId,
file: &mut std::fs::File,
) -> anyhow::Result<()> {
// TODO: add a local cache to not have to reach to the backend
// unnecessarily.
if self.find_in_backend(build_id)? {
return Ok(());
}

// TODO: do this in another thread.
self.upload_to_backend(name, build_id, executable_id, file)?;
Ok(())
}

fn debug_info_path(&self) -> Option<PathBuf> {
None
}
}

impl DebugInfoBackendRemote {
/// Whether the backend knows about some debug information.
#[instrument(level = "debug")]
fn find_in_backend(&self, build_id: &BuildId) -> anyhow::Result<bool> {
let client_builder = reqwest::blocking::Client::builder().timeout(self.http_client_timeout);
let client = client_builder.build()?;
let response = client
.get(format!(
"{}/debuginfo/{}",
self.server_url.clone(),
build_id
))
.send();

Ok(response?.status() == StatusCode::OK)
}

/// Send the debug information to the backend.
#[instrument]
fn upload_to_backend(
&self,
name: &str,
build_id: &BuildId,
executable_id: ExecutableId,
file: &mut std::fs::File,
) -> anyhow::Result<()> {
let client_builder = reqwest::blocking::Client::builder().timeout(self.http_client_timeout);
let client = client_builder.build()?;
let mut debug_info = Vec::new();
file.read_to_end(&mut debug_info)?;

let response = client
.post(format!(
"{}/debuginfo/new/{}/{}/{}",
self.server_url.clone(),
name,
build_id,
executable_id
))
.body(debug_info)
.send()?;
println!("wrote debug info to server {:?}", response);
Ok(())
}
}
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
pub mod bpf;
pub mod collector;
pub mod debug_info;
pub mod ksym;
pub mod perf_events;
pub mod process;
Expand Down
Loading

0 comments on commit e929617

Please sign in to comment.