diff --git a/measureme/Cargo.toml b/measureme/Cargo.toml index 6cda378..161556c 100644 --- a/measureme/Cargo.toml +++ b/measureme/Cargo.toml @@ -13,6 +13,14 @@ repository = "https://github.com/rust-lang/measureme" travis-ci = { repository = "rust-lang/measureme" } [dependencies] +log = "0.4" parking_lot = "0.11.0" rustc-hash = "1.0.1" smallvec = "1.0" + +[features] +nightly = [] + +[target.'cfg(all(target_arch = "x86_64", target_os = "linux"))'.dependencies] +memmap = "0.7" +perf-event-open-sys = "1.0.1" diff --git a/measureme/src/counters.rs b/measureme/src/counters.rs new file mode 100644 index 0000000..6b4ab0e --- /dev/null +++ b/measureme/src/counters.rs @@ -0,0 +1,976 @@ +//! Profiling counters and their implementation. +//! +//! # Available counters +//! +//! Name (for [`Counter::by_name()`]) | Counter | OSes | CPUs +//! --------------------------------- | ------- | ---- | ---- +//! `wall-time` | [`WallTime`] | any | any +//! `instructions:u` | [`Instructions`] | Linux | `x86_64` +//! `instructions-minus-irqs:u` | [`InstructionsMinusIrqs`] | Linux | `x86_64`
- AMD (since K8)
- Intel (since Sandy Bridge) +//! `instructions-minus-r0420:u` | [`InstructionsMinusRaw0420`] | Linux | `x86_64`
- AMD (Zen) +//! +//! *Note: `:u` suffixes for hardware performance counters come from the Linux `perf` +//! tool, and indicate that the counter is only active while userspace code executes +//! (i.e. it's paused while the kernel handles syscalls, interrupts, etc.).* +//! +//! # Limitations and caveats +//! +//! *Note: for more information, also see the GitHub PR which first implemented hardware +//! performance counter support ([#143](https://github.com/rust-lang/measureme/pull/143)).* +//! +//! The hardware performance counters (i.e. all counters other than `wall-time`) are limited to: +//! * nightly Rust (gated on `features = ["nightly"]`), for `asm!` +//! * Linux, for out-of-the-box performance counter reads from userspace +//! * other OSes could work through custom kernel extensions/drivers, in the future +//! * `x86_64` CPUs, mostly due to lack of other available test hardware +//! * new architectures would be easier to support (on Linux) than new OSes +//! * easiest to add would be 32-bit `x86` (aka `i686`), which would reuse +//! most of the `x86_64` CPU model detection logic +//! * specific (newer) CPU models, for certain non-standard counters +//! * e.g. `instructions-minus-irqs:u` requires a "hardware interrupts" (aka "IRQs") +//! counter, which is implemented differently between vendors / models (if at all) +//! * single-threaded programs (counters only work on the thread they were created on) +//! * for profiling `rustc`, this means only "check mode" (`--emit=metadata`), +//! is supported currently (`-Z no-llvm-threads` could also work) +//! * unclear what the best approach for handling multiple threads would be +//! * changing the API (e.g. to require per-thread profiler handles) could result +//! in a more efficient implementation, but would also be less ergonomic +//! * profiling data from multithreaded programs would be harder to use due to +//! noise from synchronization mechanisms, non-deterministic work-stealing, etc. +//! +//! For ergonomic reasons, the public API doesn't vary based on `features` or target. +//! Instead, attempting to create any unsupported counter will return `Err`, just +//! like it does for any issue detected at runtime (e.g. incompatible CPU model). +//! +//! When counting instructions specifically, these factors will impact the profiling quality: +//! * high-level non-determinism (e.g. user interactions, networking) +//! * the ideal use-case is a mostly-deterministic program, e.g. a compiler like `rustc` +//! * if I/O can be isolated to separate profiling events, and doesn't impact +//! execution in a more subtle way (see below), the deterministic parts of +//! the program can still be profiled with high accuracy +//! * low-level non-determinism (e.g. ASLR, randomized `HashMap`s, thread scheduling) +//! * ASLR ("Address Space Layout Randomization"), may be provided by the OS for +//! security reasons, or accidentally caused through allocations that depend on +//! random data (even as low-entropy as e.g. the base 10 length of a process ID) +//! * on Linux ASLR can be disabled by running the process under `setarch -R` +//! * this impacts `rustc` and LLVM, which rely on keying `HashMap`s by addresses +//! (typically of interned data) as an optimization, and while non-determinstic +//! outputs are considered bugs, the instructions executed can still vary a lot, +//! even when the externally observable behavior is perfectly repeatable +//! * `HashMap`s are involved in one more than one way: +//! * both the executed instructions, and the shape of the allocations depend +//! on both the hasher state and choice of keys (as the buckets are in +//! a flat array indexed by some of the lower bits of the key hashes) +//! * so every `HashMap` with keys being/containing addresses will amplify +//! ASLR and ASLR-like effects, making the entire program more sensitive +//! * the default hasher is randomized, and while `rustc` doesn't use it, +//! proc macros can (and will), and it's harder to disable than Linux ASLR +//! * `jemalloc` (the allocator used by `rustc`, at least in official releases) +//! has a 10 second "purge timer", which can introduce an ASLR-like effect, +//! unless disabled with `MALLOC_CONF=dirty_decay_ms:0,muzzy_decay_ms:0` +//! * hardware flaws (whether in the design or implementation) +//! * hardware interrupts ("IRQs") and exceptions (like page faults) cause +//! overcounting (1 instruction per interrupt, possibly the `iret` from the +//! kernel handler back to the interrupted userspace program) +//! * this is the reason why `instructions-minus-irqs:u` should be preferred +//! to `instructions:u`, where the former is available +//! * there are system-wide options (e.g. `CONFIG_NO_HZ_FULL`) for removing +//! some interrupts from the cores used for profiling, but they're not as +//! complete of a solution, nor easy to set up in the first place +//! * AMD Zen CPUs have a speculative execution feature (dubbed `SpecLockMap`), +//! which can cause non-deterministic overcounting for instructions following +//! an atomic instruction (such as found in heap allocators, or `measureme`) +//! * this is automatically detected, with a `log` message pointing the user +//! to for guidance on how to +//! disable `SpecLockMap` on their system (sadly requires root access) +//! +//! Even if some of the above caveats apply for some profiling setup, as long as +//! the counters function, they can still be used, and compared with `wall-time`. +//! Chances are, they will still have less variance, as everything that impacts +//! instruction counts will also impact any time measurements. +//! +//! Also keep in mind that instruction counts do not properly reflect all kinds +//! of workloads, e.g. SIMD throughput and cache locality are unaccounted for. + +use std::error::Error; +use std::time::Instant; + +// HACK(eddyb) this is semantically `warn!` but uses `error!` because +// that's the only log level enabled by default - see also +// https://github.com/rust-lang/rust/issues/76824 +macro_rules! really_warn { + ($msg:literal $($rest:tt)*) => { + error!(concat!("[WARNING] ", $msg) $($rest)*) + } +} + +pub enum Counter { + WallTime(WallTime), + Instructions(Instructions), + InstructionsMinusIrqs(InstructionsMinusIrqs), + InstructionsMinusRaw0420(InstructionsMinusRaw0420), +} + +impl Counter { + pub fn by_name(name: &str) -> Result> { + Ok(match name { + WallTime::NAME => Counter::WallTime(WallTime::new()), + Instructions::NAME => Counter::Instructions(Instructions::new()?), + InstructionsMinusIrqs::NAME => { + Counter::InstructionsMinusIrqs(InstructionsMinusIrqs::new()?) + } + InstructionsMinusRaw0420::NAME => { + Counter::InstructionsMinusRaw0420(InstructionsMinusRaw0420::new()?) + } + _ => return Err(format!("{:?} is not a valid counter name", name).into()), + }) + } + + pub(super) fn describe_as_json(&self) -> String { + let (name, units) = match self { + Counter::WallTime(_) => ( + WallTime::NAME, + r#"[["ns", 1], ["μs", 1000], ["ms", 1000000], ["s", 1000000000]]"#, + ), + Counter::Instructions(_) => (Instructions::NAME, r#"[["instructions", 1]]"#), + Counter::InstructionsMinusIrqs(_) => { + (InstructionsMinusIrqs::NAME, r#"[["instructions", 1]]"#) + } + Counter::InstructionsMinusRaw0420(_) => { + (InstructionsMinusRaw0420::NAME, r#"[["instructions", 1]]"#) + } + }; + format!(r#"{{ "name": "{}", "units": {} }}"#, name, units) + } + + #[inline] + pub(super) fn since_start(&self) -> u64 { + match self { + Counter::WallTime(counter) => counter.since_start(), + Counter::Instructions(counter) => counter.since_start(), + Counter::InstructionsMinusIrqs(counter) => counter.since_start(), + Counter::InstructionsMinusRaw0420(counter) => counter.since_start(), + } + } +} + +/// "Monotonic clock" with nanosecond precision (using [`std::time::Instant`]). +/// +/// Can be obtained with `Counter::by_name("wall-time")`. +pub struct WallTime { + start: Instant, +} + +impl WallTime { + const NAME: &'static str = "wall-time"; + + pub fn new() -> Self { + WallTime { + start: Instant::now(), + } + } + + #[inline] + fn since_start(&self) -> u64 { + self.start.elapsed().as_nanos() as u64 + } +} + +/// "Instructions retired" hardware performance counter (userspace-only). +/// +/// Can be obtained with `Counter::by_name("instructions:u")`. +pub struct Instructions { + instructions: hw::Counter, + start: u64, +} + +impl Instructions { + const NAME: &'static str = "instructions:u"; + + pub fn new() -> Result> { + let model = hw::CpuModel::detect()?; + let instructions = hw::Counter::new(&model, HwCounterType::Instructions)?; + let start = instructions.read(); + Ok(Instructions { + instructions, + start, + }) + } + + #[inline] + fn since_start(&self) -> u64 { + self.instructions.read().wrapping_sub(self.start) + } +} + +/// More accurate [`Instructions`] (subtracting hardware interrupt counts). +/// +/// Can be obtained with `Counter::by_name("instructions-minus-irqs:u")`. +pub struct InstructionsMinusIrqs { + instructions: hw::Counter, + irqs: hw::Counter, + start: u64, +} + +impl InstructionsMinusIrqs { + const NAME: &'static str = "instructions-minus-irqs:u"; + + pub fn new() -> Result> { + let model = hw::CpuModel::detect()?; + let instructions = hw::Counter::new(&model, HwCounterType::Instructions)?; + let irqs = hw::Counter::new(&model, HwCounterType::Irqs)?; + let (start_instructions, start_irqs) = (&instructions, &irqs).read(); + let start = start_instructions.wrapping_sub(start_irqs); + Ok(InstructionsMinusIrqs { + instructions, + irqs, + start, + }) + } + + #[inline] + fn since_start(&self) -> u64 { + let (instructions, irqs) = (&self.instructions, &self.irqs).read(); + instructions.wrapping_sub(irqs).wrapping_sub(self.start) + } +} + +/// (Experimental) Like [`InstructionsMinusIrqs`] (but using an undocumented `r0420:u` counter). +/// +/// Can be obtained with `Counter::by_name("instructions-minus-r0420:u")`. +// +// HACK(eddyb) this is a variant of `instructions-minus-irqs:u`, where `r0420` +// is subtracted, instead of the usual "hardware interrupts" (aka IRQs). +// `r0420` is an undocumented counter on AMD Zen CPUs which appears to count +// both hardware interrupts and exceptions (such as page faults), though +// it's unclear yet what exactly it's counting (could even be `iret`s). +pub struct InstructionsMinusRaw0420(InstructionsMinusIrqs); + +impl InstructionsMinusRaw0420 { + const NAME: &'static str = "instructions-minus-r0420:u"; + + pub fn new() -> Result> { + let model = hw::CpuModel::detect()?; + let instructions = hw::Counter::new(&model, HwCounterType::Instructions)?; + let irqs = hw::Counter::new(&model, HwCounterType::Raw0420)?; + let (start_instructions, start_irqs) = (&instructions, &irqs).read(); + let start = start_instructions.wrapping_sub(start_irqs); + Ok(InstructionsMinusRaw0420(InstructionsMinusIrqs { + instructions, + irqs, + start, + })) + } + + #[inline] + fn since_start(&self) -> u64 { + self.0.since_start() + } +} + +trait HwCounterRead { + type Output; + fn read(&self) -> Self::Output; +} + +enum HwCounterType { + Instructions, + Irqs, + Raw0420, +} + +const BUG_REPORT_MSG: &str = + "please report this to https://github.com/rust-lang/measureme/issues/new"; + +/// Linux x86_64 implementation based on `perf_event_open` and `rdpmc`. +#[cfg(all(feature = "nightly", target_arch = "x86_64", target_os = "linux"))] +mod hw { + use memmap::{Mmap, MmapOptions}; + use perf_event_open_sys::{bindings::*, perf_event_open}; + use std::convert::TryInto; + use std::error::Error; + use std::fs; + use std::mem; + use std::os::unix::io::FromRawFd; + + pub(super) struct Counter { + mmap: Mmap, + reg_idx: u32, + } + + impl Counter { + pub(super) fn new( + model: &CpuModel, + counter_type: super::HwCounterType, + ) -> Result> { + let (type_, hw_id) = match counter_type { + super::HwCounterType::Instructions => ( + perf_type_id_PERF_TYPE_HARDWARE, + perf_hw_id_PERF_COUNT_HW_INSTRUCTIONS, + ), + super::HwCounterType::Irqs => { + (perf_type_id_PERF_TYPE_RAW, model.irqs_counter_config()?) + } + super::HwCounterType::Raw0420 => { + match model { + CpuModel::Amd(AmdGen::Zen) => {} + + _ => really_warn!( + "Counter::new: the undocumented `r0420` performance \ + counter has only been observed on AMD Zen CPUs" + ), + } + + (perf_type_id_PERF_TYPE_RAW, 0x04_20) + } + }; + Self::with_type_and_hw_id(type_, hw_id) + } + + fn with_type_and_hw_id( + type_: perf_type_id, + hw_id: u32, + ) -> Result> { + let mut attrs = perf_event_attr { + size: mem::size_of::().try_into().unwrap(), + type_, + config: hw_id.into(), + ..perf_event_attr::default() + }; + + // Only record same-thread, any CPUs, and only userspace (no kernel/hypervisor). + // NOTE(eddyb) `pid = 0`, despite talking about "process id", means + // "calling process/thread", *not* "any thread in the calling process" + // (i.e. "process" is interchangeable with "main thread of the process") + // FIXME(eddyb) introduce per-thread counters and/or use `inherit` + // (and `inherit_stat`? though they might not be appropriate here) + // to be able to read the counter on more than just the initial thread. + let pid = 0; + let cpu = -1; + let group_fd = -1; + attrs.set_exclude_kernel(1); + attrs.set_exclude_hv(1); + + let file = unsafe { + let fd = + perf_event_open(&mut attrs, pid, cpu, group_fd, PERF_FLAG_FD_CLOEXEC.into()); + if fd < 0 { + Err(std::io::Error::from_raw_os_error(-fd)) + } else { + Ok(fs::File::from_raw_fd(fd)) + } + }; + let file = file.map_err(|e| format!("perf_event_open failed: {:?}", e))?; + + let mmap = unsafe { + MmapOptions::new() + .len(mem::size_of::()) + .map(&file) + }; + let mmap = mmap.map_err(|e| format!("perf_event_mmap_page: mmap failed: {:?}", e))?; + + let mut counter = Counter { mmap, reg_idx: 0 }; + + let (version, compat_version, caps, index, pmc_width) = counter + .access_mmap_page_with_seqlock(|mp| { + ( + mp.version, + mp.compat_version, + unsafe { mp.__bindgen_anon_1.__bindgen_anon_1 }, + mp.index, + mp.pmc_width, + ) + }); + + info!( + "Counter::new: version={} compat_version={} index={:#x}", + version, compat_version, index, + ); + + if caps.cap_user_rdpmc() == 0 { + return Err(format!( + "perf_event_mmap_page: missing cap_user_rdpmc{}", + if caps.cap_bit0_is_deprecated() == 0 && caps.cap_bit0() == 1 { + " (ignoring legacy/broken rdpmc support)" + } else { + "" + } + ) + .into()); + } + + if index == 0 { + return Err(format!( + "perf_event_mmap_page: no allocated hardware register (ran out?)" + ) + .into()); + } + counter.reg_idx = index - 1; + + if (cfg!(not(accurate_seqlock_rdpmc)) || true) && pmc_width != 48 { + return Err(format!( + "perf_event_mmap_page: {}-bit hardware counter found, only 48-bit supported", + pmc_width + ) + .into()); + } + + Ok(counter) + } + + /// Try to access the mmap page, retrying the `attempt` closure as long + /// as the "seqlock" sequence number changes (which indicates the kernel + /// has updated one or more fields within the mmap page). + #[inline] + fn access_mmap_page_with_seqlock( + &self, + attempt: impl Fn(&perf_event_mmap_page) -> T, + ) -> T { + // FIXME(eddyb) it's probably UB to use regular reads, especially + // from behind `&T`, with the only synchronization being barriers. + // Probably needs atomic reads, and stronger ones at that, for the + // `lock` field, than the fields (which would be `Relaxed`?). + let mmap_page = unsafe { &*(self.mmap.as_ptr() as *const perf_event_mmap_page) }; + let barrier = || std::sync::atomic::fence(std::sync::atomic::Ordering::Acquire); + + loop { + // Grab the "seqlock" - the kernel will update this value when it + // updates any of the other fields that may be read in `attempt`. + let seq_lock = mmap_page.lock; + barrier(); + + let result = attempt(mmap_page); + + // If nothing has changed, we're done. Otherwise, keep retrying. + barrier(); + if mmap_page.lock == seq_lock { + return result; + } + } + } + } + + impl super::HwCounterRead for Counter { + type Output = u64; + + #[inline] + fn read(&self) -> u64 { + // HACK(eddyb) keep the accurate code around while not using it, + // to minimize overhead without losing the more complex implementation. + let (counter, offset, pmc_width) = if cfg!(accurate_seqlock_rdpmc) && false { + self.access_mmap_page_with_seqlock(|mp| { + let caps = unsafe { mp.__bindgen_anon_1.__bindgen_anon_1 }; + assert_ne!(caps.cap_user_rdpmc(), 0); + + ( + rdpmc(mp.index.checked_sub(1).unwrap()), + mp.offset, + mp.pmc_width, + ) + }) + } else { + (rdpmc(self.reg_idx), 0, 48) + }; + + let counter = offset + (counter as i64); + + // Sign-extend the `pmc_width`-bit value to `i64`. + (counter << (64 - pmc_width) >> (64 - pmc_width)) as u64 + } + } + + impl super::HwCounterRead for (&Counter, &Counter) { + type Output = (u64, u64); + + #[inline] + fn read(&self) -> (u64, u64) { + // HACK(eddyb) keep the accurate code around while not using it, + // to minimize overhead without losing the more complex implementation. + if (cfg!(accurate_seqlock_rdpmc) || cfg!(unserialized_rdpmc)) && false { + return (self.0.read(), self.1.read()); + } + + let pmc_width = 48; + + let (a_counter, b_counter) = rdpmc_pair(self.0.reg_idx, self.1.reg_idx); + + // Sign-extend the `pmc_width`-bit values to `i64`. + ( + ((a_counter as i64) << (64 - pmc_width) >> (64 - pmc_width)) as u64, + ((b_counter as i64) << (64 - pmc_width) >> (64 - pmc_width)) as u64, + ) + } + } + + /// Read the hardware performance counter indicated by `reg_idx`. + /// + /// If the counter is signed, sign extension should be performed based on + /// the width of the register (32 to 64 bits, e.g. 48-bit seems common). + #[inline(always)] + fn rdpmc(reg_idx: u32) -> u64 { + let (lo, hi): (u32, u32); + unsafe { + // NOTE(eddyb) below comment is outdated (the other branch uses `cpuid`). + if cfg!(unserialized_rdpmc) && false { + // FIXME(eddyb) the Intel and AMD manuals warn about the need for + // "serializing instructions" before/after `rdpmc`, if avoiding any + // reordering is desired, but do not agree on the full set of usable + // "serializing instructions" (e.g. `mfence` isn't listed in both). + // + // The only usable, and guaranteed to work, "serializing instruction" + // appears to be `cpuid`, but it doesn't seem easy to use, especially + // due to the overlap in registers with `rdpmc` itself, and it might + // have too high of a cost, compared to serialization benefits (if any). + asm!("rdpmc", in("ecx") reg_idx, out("eax") lo, out("edx") hi, options(nostack)); + } else { + asm!( + // Dummy `cpuid(0)` to serialize instruction execution. + "xor eax, eax", + "cpuid", + + "mov ecx, {rdpmc_ecx:e}", + "rdpmc", + rdpmc_ecx = in(reg) reg_idx, + out("eax") lo, + out("edx") hi, + + // `cpuid` clobbers (not overwritten by `rdpmc`). + out("ebx") _, + out("ecx") _, + + options(nostack), + ); + } + } + lo as u64 | (hi as u64) << 32 + } + + /// Read two hardware performance counters at once (see `rdpmc`). + /// + /// Should be more efficient/accurate than two `rdpmc` calls, as it + /// only requires one "serializing instruction", rather than two. + #[inline(always)] + fn rdpmc_pair(a_reg_idx: u32, b_reg_idx: u32) -> (u64, u64) { + let (a_lo, a_hi): (u32, u32); + let (b_lo, b_hi): (u32, u32); + unsafe { + asm!( + // Dummy `cpuid(0)` to serialize instruction execution. + "xor eax, eax", + "cpuid", + + "mov ecx, {a_rdpmc_ecx:e}", + "rdpmc", + "mov {a_rdpmc_eax:e}, eax", + "mov {a_rdpmc_edx:e}, edx", + "mov ecx, {b_rdpmc_ecx:e}", + "rdpmc", + a_rdpmc_ecx = in(reg) a_reg_idx, + a_rdpmc_eax = out(reg) a_lo, + a_rdpmc_edx = out(reg) a_hi, + b_rdpmc_ecx = in(reg) b_reg_idx, + out("eax") b_lo, + out("edx") b_hi, + + // `cpuid` clobbers (not overwritten by `rdpmc`). + out("ebx") _, + out("ecx") _, + + options(nostack), + ); + } + ( + a_lo as u64 | (a_hi as u64) << 32, + b_lo as u64 | (b_hi as u64) << 32, + ) + } + + /// Categorization of `x86_64` CPUs, primarily based on how they + /// support for counting "hardware interrupts" (documented or not). + pub(super) enum CpuModel { + Amd(AmdGen), + Intel(IntelGen), + } + + pub(super) enum AmdGen { + /// K8 (Hammer) to Jaguar / Puma. + PreZen, + + /// Zen / Zen+ / Zen 2. + Zen, + + /// Unknown AMD CPU, contemporary to/succeeding Zen/Zen+/Zen 2, + /// but likely similar to them. + UnknownMaybeZenLike, + } + + pub(super) enum IntelGen { + /// Intel CPU predating Sandy Bridge. These are the only CPUs we + /// can't support (more) accurate instruction counting on, as they + /// don't (appear to) have any way to count "hardware interrupts". + PreBridge, + + /// Sandy Bridge / Ivy Bridge: + /// * client: Sandy Bridge (M/H) / Ivy Bridge (M/H/Gladden) + /// * server: Sandy Bridge (E/EN/EP) / Ivy Bridge (E/EN/EP/EX) + /// + /// Intel doesn't document support for counting "hardware interrupts" + /// prior to Skylake, but testing found that `HW_INTERRUPTS.RECEIVED` + /// from Skylake has existed, with the same config, as far back as + /// "Sandy Bridge" (but before that it mapped to a different event). + /// + /// These are the (pre-Skylake) *Bridge CPU models confirmed so far: + /// * Sandy Bridge (client) Family 6 Model 42 + /// Intel(R) Core(TM) i5-2520M CPU @ 2.50GHz (@alyssais) + /// * Ivy Bridge (client) Family 6 Model 58 + /// Intel(R) Core(TM) i7-3520M CPU @ 2.90GHz (@eddyb) + /// + /// We later found this paper, which on page 5 lists 12 counters, + /// for each of Nehalem/Westmere, Sandy Bridge and Ivy Bridge: + /// http://web.eece.maine.edu/~vweaver/projects/deterministic/deterministic_counters.pdf + /// It appears that both Sandy Bridge and Ivy Bridge used to have + /// `HW_INTERRUPTS.RECEIVED` documented, before Intel removed every + /// mention of the counter from newer versions of their manuals. + Bridge, + + /// Haswell / Broadwell: + /// * client: Haswell (S/ULT/GT3e) / Broadwell (U/Y/S/H/C/W) + /// * server: Haswell (E/EP/EX) / Broadwell (E/EP/EX/DE/Hewitt Lake) + /// + /// Equally as undocumented as "Sandy Bridge / Ivy Bridge" (see above). + /// + /// These are the (pre-Skylake) *Well CPU models confirmed so far: + /// * Haswell (client) Family 6 Model 60 + /// Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz (@m-ou-se) + /// * Haswell (server) Family 6 Model 63 + /// Intel(R) Xeon(R) CPU E5-2697 v3 @ 2.60GHz (@cuviper) + /// * Haswell (client + GT3e) Family 6 Model 70 + /// Intel(R) Core(TM) i7-4750HQ CPU @ 2.00GHz (@nagisa) + /// Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz (@m-ou-se) + Well, + + /// Skylake / Skylake-derived: + /// * client: Skylake (Y/U/DT/H/S) / Kaby Lake (Y/U/DT/H/S/X) / Coffee Lake (U/S/H/E) + /// * server: Skylake (SP/X/DE/W) / Cascade Lake (SP/X/W) + /// + /// Both "client" and "server" product lines have documented support + /// for counting "hardware interrupts" (`HW_INTERRUPTS.RECEIVED`). + /// + /// Intel does not make it clear that future product lines, such as + /// "Ice Lake", will continue to support this (or with what config), + /// and even "Comet Lake" (aka "10th gen") isn't explicitly listed. + Lake, + + /// Unknown Intel CPU, contemporary to/succeeding *Bridge/*Well/*Lake, + /// but likely similar to them. + UnknownMaybeLakeLike, + } + + impl CpuModel { + /// Detect the model of the current CPU using `cpuid`. + pub(super) fn detect() -> Result> { + let cpuid0 = unsafe { std::arch::x86_64::__cpuid(0) }; + let cpuid1 = unsafe { std::arch::x86_64::__cpuid(1) }; + let mut vendor = [0; 12]; + vendor[0..4].copy_from_slice(&cpuid0.ebx.to_le_bytes()); + vendor[4..8].copy_from_slice(&cpuid0.edx.to_le_bytes()); + vendor[8..12].copy_from_slice(&cpuid0.ecx.to_le_bytes()); + + let vendor = std::str::from_utf8(&vendor).map_err(|_| { + format!( + "cpuid returned non-UTF-8 vendor name: cpuid(0)={:?} cpuid(1)={:?}", + cpuid0, cpuid1 + ) + })?; + + let version = cpuid1.eax; + + let mut family = (version >> 8) & 0xf; + if family == 15 { + // Extended family. + family += (version >> 20) & 0xff; + } + + let mut model = (version >> 4) & 0xf; + if family >= 15 || vendor == "GenuineIntel" && family == 6 { + // Extended model. + model += ((version >> 16) & 0xf) << 4; + } + + info!( + "CpuModel::detect: vendor={:?} family={} model={}", + vendor, family, model + ); + + match vendor { + "AuthenticAMD" => { + use self::AmdGen::*; + + let (gen, name) = match (family, model) { + (0..=14, _) | (19, _) => { + return Err(format!( + "impossible AMD64 CPU detected (Family {} Model {}); {}", + family, + model, + super::BUG_REPORT_MSG + ) + .into()); + } + + (15, _) => (PreZen, "K8 (Hammer)"), + (16, _) => (PreZen, "K10 (Barcelona/Shanghai/Istanbul)"), + (17, _) => (PreZen, "K8+K10 hybrid (Turion X2 Ultra)"), + (18, _) => (PreZen, "Fusion"), + (20, _) => (PreZen, "Bobcat"), + (21, _) => (PreZen, "Bulldozer / Piledriver / Steamroller / Excavator"), + (22, _) => (PreZen, "Jaguar / Puma"), + + (23, 1) => (Zen, "Zen (Naples/Whitehaven/Summit Ridge/Snowy Owl)"), + (23, 17) => (Zen, "Zen (Raven Ridge)"), + (23, 24) => (Zen, "Zen (Banded Kestrel/Dali) / Zen+ (Picasso)"), + (23, 8) => (Zen, "Zen+ (Pinnacle Ridge)"), + (23, 49) => (Zen, "Zen 2 (Rome/Castle Peak)"), + (23, 113) => (Zen, "Zen 2 (Matisse)"), + + (23..=0xffff_ffff, _) => { + really_warn!( + "CpuModel::detect: unknown AMD CPU (Family {} Model {}), \ + assuming Zen-like; {}", + family, + model, + super::BUG_REPORT_MSG + ); + + (UnknownMaybeZenLike, "") + } + }; + + if !name.is_empty() { + info!("CpuModel::detect: known AMD CPU: {}", name); + } + + // The `SpecLockMap` (speculative atomic aka `lock` instruction + // execution, unclear what "Map" refers to) feature in AMD Zen CPUs + // causes non-deterministic overcounting of atomic instructions, + // presumably whenever it has to roll back the speculation + // (as in, the performance counters aren't rolled back). + // Even this this may be rare when uncontended, it adds up. + // + // There is an MSR bit (`MSRC001_1020[54]`) that's not officially + // documented, but which several motherboards and profiling tools + // set whenever IBS (Instruction-Based Sampling) is in use, and + // it is sometimes referred to as "disabling `SpecLockMap`" + // (hence having a name for the feature that speculates `lock`s). + // + // One way we could detect that the bit has been set would be to + // parse `uname().release` (aka `uname -r`) and look for versions + // which are known to include the patch suggested in this thread: + // https://github.com/mozilla/rr/issues/2034#issuecomment-693761247 + // + // However, one may set the bit using e.g. `wrmsr`, even on older + // kernels, so a more reliable approach is to execute some atomics + // and look at the `SpecLockMapCommit` (`r0825:u`) Zen counter, + // which only reliably remains `0` when `SpecLockMap` is disabled. + if matches!(gen, Zen | UnknownMaybeZenLike) { + if let Ok(spec_lock_map_commit) = + Counter::with_type_and_hw_id(perf_type_id_PERF_TYPE_RAW, 0x08_25) + { + use super::HwCounterRead; + + let start_spec_lock_map_commit = spec_lock_map_commit.read(); + + // Execute an atomic (`lock`) instruction, which should + // start speculative execution for following instructions + // (as long as `SpecLockMap` isn't disabled). + let mut atomic: u64 = 0; + let mut _tmp: u64 = 0; + unsafe { + asm!( + "lock xadd qword ptr [{atomic}], {tmp}", + + atomic = in(reg) &mut atomic, + tmp = inout(reg) _tmp, + ); + } + + if spec_lock_map_commit.read() != start_spec_lock_map_commit { + really_warn!( + "CpuModel::detect: SpecLockMap detected, in AMD {} CPU; \ + this may add some non-deterministic noise - \ + for information on disabling SpecLockMap, see \ + https://github.com/mozilla/rr/wiki/Zen", + name + ); + } + } + } + + Ok(CpuModel::Amd(gen)) + } + + "GenuineIntel" => { + use self::IntelGen::*; + + let (gen, name) = match (family, model) { + // No need to name these, they're unsupported anyway. + (0..=5, _) => (PreBridge, ""), + (15, _) => (PreBridge, "Netburst"), + (6, 0..=41) => (PreBridge, ""), + + // Older Xeon Phi CPUs, misplaced in Family 6. + (6, 87) => (PreBridge, "Knights Landing"), + (6, 133) => (PreBridge, "Knights Mill"), + + // Older Atom CPUs, interleaved with other CPUs. + // FIXME(eddyb) figure out if these are like *Bridge/*Well. + (6, 53) | (6, 54) => (PreBridge, "Saltwell"), + (6, 55) | (6, 74) | (6, 77) | (6, 90) | (6, 93) => { + (PreBridge, "Silvermont") + } + (6, 76) => (PreBridge, "Airmont (Cherry Trail/Braswell)"), + + // Older server CPUs, numbered out of order. + (6, 44) => (PreBridge, "Westmere (Gulftown/EP)"), + (6, 46) => (PreBridge, "Nehalem (EX)"), + (6, 47) => (PreBridge, "Westmere (EX)"), + + (6, 42) => (Bridge, "Sandy Bridge (M/H)"), + (6, 45) => (Bridge, "Sandy Bridge (E/EN/EP)"), + (6, 58) => (Bridge, "Ivy Bridge (M/H/Gladden)"), + (6, 62) => (Bridge, "Ivy Bridge (E/EN/EP/EX)"), + + (6, 60) => (Well, "Haswell (S)"), + (6, 61) => (Well, "Broadwell (U/Y/S)"), + (6, 63) => (Well, "Haswell (E/EP/EX)"), + (6, 69) => (Well, "Haswell (ULT)"), + (6, 70) => (Well, "Haswell (GT3e)"), + (6, 71) => (Well, "Broadwell (H/C/W)"), + (6, 79) => (Well, "Broadwell (E/EP/EX)"), + (6, 86) => (Well, "Broadwell (DE/Hewitt Lake)"), + + (6, 78) => (Lake, "Skylake (Y/U)"), + (6, 85) => (Lake, "Skylake (SP/X/DE/W) / Cascade Lake (SP/X/W)"), + (6, 94) => (Lake, "Skylake (DT/H/S)"), + (6, 142) => (Lake, "Kaby Lake (Y/U) / Coffee Lake (U)"), + (6, 158) => (Lake, "Kaby Lake (DT/H/S/X) / Coffee Lake (S/H/E)"), + + (6..=14, _) | (16..=0xffff_ffff, _) => { + really_warn!( + "CpuModel::detect: unknown Intel CPU (Family {} Model {}), \ + assuming Skylake-like; {}", + family, + model, + super::BUG_REPORT_MSG + ); + + (UnknownMaybeLakeLike, "") + } + }; + + if !name.is_empty() { + info!("CpuModel::detect: known Intel CPU: {}", name); + } + + Ok(CpuModel::Intel(gen)) + } + + _ => Err(format!( + "cpuid returned unknown CPU vendor {:?}; version={:#x}", + vendor, version + ) + .into()), + } + } + + /// Return the hardware performance counter configuration for + /// counting "hardware interrupts" (documented or not). + fn irqs_counter_config(&self) -> Result> { + match self { + CpuModel::Amd(model) => match model { + AmdGen::PreZen => Ok(0x00_cf), + AmdGen::Zen | AmdGen::UnknownMaybeZenLike => Ok(0x00_2c), + }, + CpuModel::Intel(model) => match model { + IntelGen::PreBridge => Err(format!( + "counting IRQs not yet supported on Intel CPUs \ + predating Sandy Bridge; {}", + super::BUG_REPORT_MSG + ) + .into()), + IntelGen::Bridge + | IntelGen::Well + | IntelGen::Lake + | IntelGen::UnknownMaybeLakeLike => Ok(0x01_cb), + }, + } + } + } +} + +#[cfg(not(all(feature = "nightly", target_arch = "x86_64", target_os = "linux")))] +mod hw { + use std::error::Error; + + pub(super) enum Counter {} + + impl Counter { + pub(super) fn new( + model: &CpuModel, + _: super::HwCounterType, + ) -> Result> { + match *model {} + } + } + + impl super::HwCounterRead for Counter { + type Output = u64; + + #[inline] + fn read(&self) -> u64 { + match *self {} + } + } + + impl super::HwCounterRead for (&Counter, &Counter) { + type Output = (u64, u64); + + #[inline] + fn read(&self) -> (u64, u64) { + match *self.0 {} + } + } + + pub(super) enum CpuModel {} + + impl CpuModel { + pub(super) fn detect() -> Result> { + // HACK(eddyb) mark `really_warn!` (and transitively `log` macros) + // and `BUG_REPORT_MSG` as "used" to silence warnings. + if false { + really_warn!("unsupported; {}", super::BUG_REPORT_MSG); + } + + let mut msg = String::new(); + let mut add_error = |s| { + if !msg.is_empty() { + msg += "; "; + } + msg += s; + }; + + if cfg!(not(feature = "nightly")) { + add_error("only supported with measureme's \"nightly\" feature"); + } + + if cfg!(not(target_arch = "x86_64")) { + add_error("only supported architecture is x86_64"); + } + + if cfg!(not(target_os = "linux")) { + add_error("only supported OS is Linux"); + } + + Err(msg.into()) + } + } +} diff --git a/measureme/src/lib.rs b/measureme/src/lib.rs index 30c8674..446c90a 100644 --- a/measureme/src/lib.rs +++ b/measureme/src/lib.rs @@ -12,6 +12,10 @@ //! //! To create a [`Profiler`], call the [`Profiler::new()`] function and provide a `Path` with //! the directory and file name for the trace files. +//! Alternatively, call the [`Profiler::with_counter()`] function, to choose the [`Counter`] +//! the profiler will use for events (whereas [`Profiler::new()`] defaults to `wall-time`). +//! +//! For more information on available counters, see the [`counters`] module documentation. //! //! To record an event, call the [`Profiler::record_instant_event()`] method, passing a few //! arguments: @@ -28,16 +32,24 @@ //! - [`Profiler::alloc_string()`]: allocates a string and returns the [`StringId`] that refers //! to it //! +//! [`counters`]: counters +//! [`Counter`]: counters::Counter //! [`Profiler`]: Profiler //! [`Profiler::alloc_string()`]: Profiler::alloc_string //! [`Profiler::alloc_string_with_reserved_id()`]: Profiler::alloc_string_with_reserved_id //! [`Profiler::new()`]: Profiler::new //! [`Profiler::record_event()`]: Profiler::record_event //! [`Profiler::start_recording_interval_event()`]: Profiler::start_recording_interval_event +//! [`Profiler::with_counter()`]: Profiler::with_counter //! [`StringId`]: StringId #![allow(renamed_and_removed_lints)] // intra_doc_link_resolution_failure is renamed on nightly #![deny(warnings, intra_doc_link_resolution_failure)] +#![cfg_attr(feature = "nightly", feature(asm))] + +#[macro_use] +extern crate log; +pub mod counters; pub mod event_id; pub mod file_header; mod profiler; diff --git a/measureme/src/profiler.rs b/measureme/src/profiler.rs index c897c2b..f4dd3bb 100644 --- a/measureme/src/profiler.rs +++ b/measureme/src/profiler.rs @@ -1,3 +1,4 @@ +use crate::counters::Counter; use crate::file_header::{write_file_header, FILE_MAGIC_EVENT_STREAM, FILE_MAGIC_TOP_LEVEL}; use crate::raw_event::RawEvent; use crate::serialization::{PageTag, SerializationSink, SerializationSinkBuilder}; @@ -7,16 +8,25 @@ use std::error::Error; use std::fs; use std::path::Path; use std::sync::Arc; -use std::time::Instant; pub struct Profiler { event_sink: Arc, string_table: StringTableBuilder, - start_time: Instant, + counter: Counter, } impl Profiler { pub fn new>(path_stem: P) -> Result> { + Self::with_counter( + path_stem, + Counter::WallTime(crate::counters::WallTime::new()), + ) + } + + pub fn with_counter>( + path_stem: P, + counter: Counter, + ) -> Result> { let path = path_stem.as_ref().with_extension(FILE_EXTENSION); fs::create_dir_all(path.parent().unwrap())?; @@ -39,7 +49,7 @@ impl Profiler { let profiler = Profiler { event_sink, string_table, - start_time: Instant::now(), + counter, }; let mut args = String::new(); @@ -49,13 +59,14 @@ impl Profiler { } profiler.string_table.alloc_metadata(&*format!( - r#"{{ "start_time": {}, "process_id": {}, "cmd": "{}" }}"#, + r#"{{ "start_time": {}, "process_id": {}, "cmd": "{}", "counter": {} }}"#, std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH) .unwrap() .as_nanos(), std::process::id(), args, + profiler.counter.describe_as_json(), )); Ok(profiler) @@ -88,7 +99,7 @@ impl Profiler { /// automatically. pub fn record_instant_event(&self, event_kind: StringId, event_id: EventId, thread_id: u32) { let raw_event = - RawEvent::new_instant(event_kind, event_id, thread_id, self.nanos_since_start()); + RawEvent::new_instant(event_kind, event_id, thread_id, self.counter.since_start()); self.record_raw_event(&raw_event); } @@ -107,7 +118,7 @@ impl Profiler { event_id, event_kind, thread_id, - start_ns: self.nanos_since_start(), + start_count: self.counter.since_start(), } } @@ -117,10 +128,6 @@ impl Profiler { raw_event.serialize(bytes); }); } - - fn nanos_since_start(&self) -> u64 { - self.start_time.elapsed().as_nanos() as _ - } } /// When dropped, this `TimingGuard` will record an "end" event in the @@ -131,7 +138,7 @@ pub struct TimingGuard<'a> { event_id: EventId, event_kind: StringId, thread_id: u32, - start_ns: u64, + start_count: u64, } impl<'a> Drop for TimingGuard<'a> { @@ -141,8 +148,8 @@ impl<'a> Drop for TimingGuard<'a> { self.event_kind, self.event_id, self.thread_id, - self.start_ns, - self.profiler.nanos_since_start(), + self.start_count, + self.profiler.counter.since_start(), ); self.profiler.record_raw_event(&raw_event); diff --git a/measureme/src/raw_event.rs b/measureme/src/raw_event.rs index d8aac65..d2a8d20 100644 --- a/measureme/src/raw_event.rs +++ b/measureme/src/raw_event.rs @@ -12,22 +12,26 @@ pub struct RawEvent { pub event_id: EventId, pub thread_id: u32, - // The following 96 bits store the start and the end timestamp, using + // The following 96 bits store the start and the end counter value, using // 48 bits for each. + // FIXME(eddyb) s/time/count/ pub start_time_lower: u32, + // FIXME(eddyb) s/time/count/ pub end_time_lower: u32, pub start_and_end_upper: u32, } -/// `RawEvents` that have an end time stamp with this value are instant events. -const INSTANT_TIMESTAMP_MARKER: u64 = 0xFFFF_FFFF_FFFF; +/// `RawEvents` that have an end counter value with this value are instant events. +const INSTANT_COUNT_MARKER: u64 = 0xFFFF_FFFF_FFFF; -/// The max instant timestamp we can represent with the 48 bits available. +/// The max instant counter value we can represent with the 48 bits available. +// FIXME(eddyb) s/TIMESTAMP/COUNT/ pub const MAX_INSTANT_TIMESTAMP: u64 = 0xFFFF_FFFF_FFFF; -/// The max interval timestamp we can represent with the 48 bits available. -/// The highest value is reserved for the `INSTANT_TIMESTAMP_MARKER`. -pub const MAX_INTERVAL_TIMESTAMP: u64 = INSTANT_TIMESTAMP_MARKER - 1; +/// The max interval counter value we can represent with the 48 bits available. +/// The highest value is reserved for the `INSTANT_COUNT_MARKER`. +// FIXME(eddyb) s/TIMESTAMP/COUNT/ +pub const MAX_INTERVAL_TIMESTAMP: u64 = INSTANT_COUNT_MARKER - 1; impl RawEvent { #[inline] @@ -35,17 +39,17 @@ impl RawEvent { event_kind: StringId, event_id: EventId, thread_id: u32, - start_nanos: u64, - end_nanos: u64, + start_count: u64, + end_count: u64, ) -> RawEvent { - assert!(start_nanos <= end_nanos); - assert!(end_nanos <= MAX_INTERVAL_TIMESTAMP); + assert!(start_count <= end_count); + assert!(end_count <= MAX_INTERVAL_TIMESTAMP); - let start_time_lower = start_nanos as u32; - let end_time_lower = end_nanos as u32; + let start_time_lower = start_count as u32; + let end_time_lower = end_count as u32; - let start_time_upper = (start_nanos >> 16) as u32 & 0xFFFF_0000; - let end_time_upper = (end_nanos >> 32) as u32; + let start_time_upper = (start_count >> 16) as u32 & 0xFFFF_0000; + let end_time_upper = (end_count >> 32) as u32; let start_and_end_upper = start_time_upper | end_time_upper; @@ -64,14 +68,14 @@ impl RawEvent { event_kind: StringId, event_id: EventId, thread_id: u32, - timestamp_ns: u64, + count: u64, ) -> RawEvent { - assert!(timestamp_ns <= MAX_INSTANT_TIMESTAMP); + assert!(count <= MAX_INSTANT_TIMESTAMP); - let start_time_lower = timestamp_ns as u32; + let start_time_lower = count as u32; let end_time_lower = 0xFFFF_FFFF; - let start_time_upper = (timestamp_ns >> 16) as u32; + let start_time_upper = (count >> 16) as u32; let start_and_end_upper = start_time_upper | 0x0000_FFFF; RawEvent { @@ -85,18 +89,20 @@ impl RawEvent { } #[inline] + // FIXME(eddyb) s/nanos/count/ pub fn start_nanos(&self) -> u64 { self.start_time_lower as u64 | (((self.start_and_end_upper & 0xFFFF_0000) as u64) << 16) } #[inline] + // FIXME(eddyb) s/nanos/count/ pub fn end_nanos(&self) -> u64 { self.end_time_lower as u64 | (((self.start_and_end_upper & 0x0000_FFFF) as u64) << 32) } #[inline] pub fn is_instant(&self) -> bool { - self.end_nanos() == INSTANT_TIMESTAMP_MARKER + self.end_nanos() == INSTANT_COUNT_MARKER } #[inline] @@ -206,24 +212,24 @@ mod tests { #[test] #[should_panic] - fn invalid_instant_timestamp() { + fn invalid_instant_count() { let _ = RawEvent::new_instant( StringId::INVALID, EventId::INVALID, 123, - // timestamp too large + // count too large MAX_INSTANT_TIMESTAMP + 1, ); } #[test] #[should_panic] - fn invalid_start_timestamp() { + fn invalid_start_count() { let _ = RawEvent::new_interval( StringId::INVALID, EventId::INVALID, 123, - // start timestamp too large + // start count too large MAX_INTERVAL_TIMESTAMP + 1, MAX_INTERVAL_TIMESTAMP + 1, ); @@ -231,50 +237,50 @@ mod tests { #[test] #[should_panic] - fn invalid_end_timestamp() { + fn invalid_end_count() { let _ = RawEvent::new_interval( StringId::INVALID, EventId::INVALID, 123, 0, - // end timestamp too large + // end count too large MAX_INTERVAL_TIMESTAMP + 3, ); } #[test] #[should_panic] - fn invalid_end_timestamp2() { + fn invalid_end_count2() { let _ = RawEvent::new_interval( StringId::INVALID, EventId::INVALID, 123, 0, - INSTANT_TIMESTAMP_MARKER, + INSTANT_COUNT_MARKER, ); } #[test] #[should_panic] - fn start_greater_than_end_timestamp() { + fn start_greater_than_end_count() { let _ = RawEvent::new_interval( StringId::INVALID, EventId::INVALID, 123, - // start timestamp greater than end timestamp + // start count greater than end count 1, 0, ); } #[test] - fn start_equal_to_end_timestamp() { + fn start_equal_to_end_count() { // This is allowed, make sure we don't panic let _ = RawEvent::new_interval(StringId::INVALID, EventId::INVALID, 123, 1, 1); } #[test] - fn interval_timestamp_decoding() { + fn interval_count_decoding() { // Check the upper limits let e = RawEvent::new_interval( StringId::INVALID, @@ -319,7 +325,7 @@ mod tests { } #[test] - fn instant_timestamp_decoding() { + fn instant_count_decoding() { assert_eq!( RawEvent::new_instant(StringId::INVALID, EventId::INVALID, 987, 0,).start_nanos(), 0