From 38bdd0131d320b9ed92a2bfa3c86d00796c1c95f Mon Sep 17 00:00:00 2001 From: Naveen Naidu Date: Fri, 10 May 2024 11:53:49 +0530 Subject: [PATCH] Approach 8: Use mmap and bytes everywhere (Time: 78s) In this approach we do two things: 1. Use mmap so that we can directly use bytes instead of depending on vectors 2. Replace all instances of Vec with bytes We can see from the previous flamegraph that `to_vec` takes almost 14% of the cycles in the `read_line` function. The motivation to use mmap here was to convert everything to bytes so that we can reduce the allocation of memory on the heap for every line. P.S: I hate the use of unsafe here, but I'm going with it for now and plan to change it later once I make other stuff performant --- Cargo.toml | 1 + .../flamegraph.svg | 491 ++++++++++++++++++ src/main.rs | 48 +- 3 files changed, 515 insertions(+), 25 deletions(-) create mode 100644 flamegraphs/08-use-mmap-byte-everywhere/flamegraph.svg diff --git a/Cargo.toml b/Cargo.toml index 564c3af..c76e7ac 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,3 +14,4 @@ rust_decimal = "1.34" rust_decimal_macros = "1.34" fast-float = "0.2" rustc-hash = { version = "1.0"} +memmap2 = {version = "0.9.4"} diff --git a/flamegraphs/08-use-mmap-byte-everywhere/flamegraph.svg b/flamegraphs/08-use-mmap-byte-everywhere/flamegraph.svg new file mode 100644 index 0000000..6fd1b9e --- /dev/null +++ b/flamegraphs/08-use-mmap-byte-everywhere/flamegraph.svg @@ -0,0 +1,491 @@ +Flame Graph Reset ZoomSearch [ld-linux-x86-64.so.2] (6 samples, 0.02%)[ld-linux-x86-64.so.2] (8 samples, 0.02%)<core::ptr::non_null::NonNull<T> as core::cmp::PartialEq>::eq (536 samples, 1.53%)<core::slice::iter::Iter<T> as core::iter::traits::iterator::Iterator>::next (1,281 samples, 3.66%)<cor..<core::slice::iter::Split<T,P> as core::iter::traits::iterator::Iterator>::next::_{{closure}} (211 samples, 0.60%)rust_1brc::calculate_station_values::_{{closure}} (211 samples, 0.60%)<core::slice::iter::Iter<T> as 
core::iter::traits::iterator::Iterator>::position (3,162 samples, 9.03%)<core::slice:..[unknown] (77 samples, 0.22%)[unknown] (33 samples, 0.09%)[unknown] (30 samples, 0.09%)[unknown] (26 samples, 0.07%)[unknown] (21 samples, 0.06%)<core::slice::iter::Split<T,P> as core::iter::traits::iterator::Iterator>::next (4,302 samples, 12.29%)<core::slice::iter..core::slice::<impl [T]>::get_unchecked (1,067 samples, 3.05%)cor..<core::ops::range::RangeFrom<usize> as core::slice::index::SliceIndex<[T]>>::get_unchecked (1,067 samples, 3.05%)<co..<core::ops::range::Range<usize> as core::slice::index::SliceIndex<[T]>>::get_unchecked (1,067 samples, 3.05%)<co..core::ptr::const_ptr::<impl *const T>::add (1,067 samples, 3.05%)cor..<core::slice::iter::Iter<T> as core::iter::traits::double_ended::DoubleEndedIterator>::next_back (841 samples, 2.40%)<c..<core::ptr::non_null::NonNull<T> as core::cmp::PartialEq>::eq (700 samples, 2.00%)<..<core::slice::iter::Iter<T> as core::iter::traits::iterator::Iterator>::rposition (2,920 samples, 8.34%)<core::slice..<core::slice::iter::Split<T,P> as core::iter::traits::double_ended::DoubleEndedIterator>::next_back::_{{closure}} (1,389 samples, 3.97%)<cor..rust_1brc::read_line::_{{closure}} (1,389 samples, 3.97%)rust..<core::slice::iter::RSplit<T,P> as core::iter::traits::iterator::Iterator>::next (3,955 samples, 11.30%)<core::slice::ite..<core::slice::iter::Split<T,P> as core::iter::traits::double_ended::DoubleEndedIterator>::next_back (3,955 samples, 11.30%)<core::slice::ite..core::slice::<impl [T]>::get_unchecked (108 samples, 0.31%)<core::ops::range::RangeFrom<usize> as core::slice::index::SliceIndex<[T]>>::get_unchecked (108 samples, 0.31%)<core::ops::range::Range<usize> as core::slice::index::SliceIndex<[T]>>::get_unchecked (108 samples, 0.31%)core::ptr::const_ptr::<impl *const T>::add (108 samples, 0.31%)<f32 as core::ops::arith::Div>::div (5,479 samples, 15.65%)<f32 as core::ops::arith..<f32 as fast_float::float::Float>::pow10_fast_path (4 
samples, 0.01%)fast_float::number::Number::try_fast_path (6,897 samples, 19.70%)fast_float::number::Number::try..fast_float::number::Number::is_fast_path (243 samples, 0.69%)fast_float::common::AsciiStr::check_first (11 samples, 0.03%)fast_float::common::AsciiStr::is_empty (10 samples, 0.03%)fast_float::common::AsciiStr::check_first_either (6 samples, 0.02%)fast_float::common::AsciiStr::first (182 samples, 0.52%)fast_float::common::AsciiStr::is_empty (6 samples, 0.02%)fast_float::common::AsciiStr::offset_from (5 samples, 0.01%)core::num::<impl isize>::wrapping_sub (5 samples, 0.01%)fast_float::number::try_parse_8digits_le (262 samples, 0.75%)fast_float::common::AsciiStr::try_read_u64 (262 samples, 0.75%)fast_float::common::AsciiStr::check_len (255 samples, 0.73%)core::num::<impl u8>::is_ascii_digit (968 samples, 2.76%)co..fast_float::common::AsciiStr::is_empty (378 samples, 1.08%)fast_float::common::AsciiStr::step (231 samples, 0.66%)fast_float::common::AsciiStr::step_by (231 samples, 0.66%)core::ptr::const_ptr::<impl *const T>::add (231 samples, 0.66%)rust_1brc::read_line (15,364 samples, 43.88%)rust_1brc::read_linefast_float::parse (11,409 samples, 32.58%)fast_float::parsefast_float::FastFloat::parse_float (11,409 samples, 32.58%)fast_float::FastFloat::parse_floatfast_float::FastFloat::parse_float_partial (11,409 samples, 32.58%)fast_float::FastFloat::parse_float_partialfast_float::parse::parse_float (11,409 samples, 32.58%)fast_float::parse::parse_floatfast_float::number::parse_number (4,110 samples, 11.74%)fast_float::numbe..fast_float::number::try_parse_digits (2,674 samples, 7.64%)fast_float..fast_float::common::AsciiStr::parse_digits (2,674 samples, 7.64%)fast_float..fast_float::number::try_parse_digits::_{{closure}} (57 samples, 0.16%)core::num::<impl u64>::wrapping_add (53 samples, 0.15%)std::collections::hash::map::Entry<K,V>::and_modify (548 samples, 1.57%)rust_1brc::calculate_station_values::_{{closure}} (548 samples, 
1.57%)core::hash::Hasher::write_length_prefix (85 samples, 0.24%)<rustc_hash::FxHasher as core::hash::Hasher>::write_usize (85 samples, 0.24%)rustc_hash::FxHasher::add_to_hash (85 samples, 0.24%)core::num::<impl usize>::wrapping_mul (85 samples, 0.24%)core::slice::index::<impl core::ops::index::Index<I> for [T]>::index (113 samples, 0.32%)<core::ops::range::RangeFrom<usize> as core::slice::index::SliceIndex<[T]>>::index (113 samples, 0.32%)<core::ops::range::RangeFrom<usize> as core::slice::index::SliceIndex<[T]>>::get_unchecked (113 samples, 0.32%)<core::ops::range::Range<usize> as core::slice::index::SliceIndex<[T]>>::get_unchecked (113 samples, 0.32%)core::ptr::const_ptr::<impl *const T>::add (38 samples, 0.11%)<usize as core::ops::bit::BitXor>::bitxor (173 samples, 0.49%)core::num::<impl usize>::rotate_left (47 samples, 0.13%)<rustc_hash::FxHasher as core::hash::Hasher>::write (837 samples, 2.39%)<r..rustc_hash::FxHasher::add_to_hash (330 samples, 0.94%)core::num::<impl usize>::wrapping_mul (110 samples, 0.31%)hashbrown::map::make_hash (930 samples, 2.66%)ha..core::hash::BuildHasher::hash_one (930 samples, 2.66%)co..core::hash::impls::<impl core::hash::Hash for &T>::hash (930 samples, 2.66%)co..core::hash::impls::<impl core::hash::Hash for &T>::hash (930 samples, 2.66%)co..core::hash::impls::<impl core::hash::Hash for [T]>::hash (930 samples, 2.66%)co..core::hash::impls::<impl core::hash::Hash for u8>::hash_slice (845 samples, 2.41%)co..hashbrown::rustc_entry::<impl hashbrown::map::HashMap<K,V,S,A>>::rustc_entry (8 samples, 0.02%)<hashbrown::raw::bitmask::BitMaskIter as core::iter::traits::iterator::Iterator>::next (1,281 samples, 3.66%)<has..hashbrown::raw::bitmask::BitMask::lowest_set_bit (1,281 samples, 3.66%)hash..core::num::nonzero::NonZero<u16>::new (1,034 samples, 2.95%)cor..hashbrown::raw::RawTable<T,A>::bucket (479 samples, 1.37%)hashbrown::raw::Bucket<T>::from_base_index (479 samples, 1.37%)core::ptr::mut_ptr::<impl *mut T>::sub (479 samples, 
1.37%)core::ptr::mut_ptr::<impl *mut T>::offset (479 samples, 1.37%)<[A] as core::slice::cmp::SlicePartialEq<B>>::equal (6,728 samples, 19.22%)<[A] as core::slice::cmp::Slic..[libc.so.6] (4,578 samples, 13.07%)[libc.so.6]hashbrown::raw::RawTable<T,A>::find::_{{closure}} (7,224 samples, 20.63%)hashbrown::raw::RawTable<T,A>::f..hashbrown::rustc_entry::_<impl hashbrown::map::HashMap<K,V,S,A>>::rustc_entry::_{{closure}} (6,745 samples, 19.26%)hashbrown::rustc_entry::_<impl..core::cmp::impls::<impl core::cmp::PartialEq<&B> for &A>::eq (6,745 samples, 19.26%)core::cmp::impls::<impl core::..core::slice::cmp::<impl core::cmp::PartialEq<[B]> for [A]>::eq (6,745 samples, 19.26%)core::slice::cmp::<impl core::..hashbrown::rustc_entry::<impl hashbrown::map::HashMap<K,V,S,A>>::rustc_entry (17 samples, 0.05%)hashbrown::raw::h2 (513 samples, 1.47%)core::intrinsics::copy_nonoverlapping (10 samples, 0.03%)hashbrown::raw::sse2::Group::load (94 samples, 0.27%)core::core_arch::x86::sse2::_mm_loadu_si128 (94 samples, 0.27%)hashbrown::rustc_entry::<impl hashbrown::map::HashMap<K,V,S,A>>::rustc_entry (84 samples, 0.24%)hashbrown::raw::sse2::Group::match_byte (1,320 samples, 3.77%)hash..core::core_arch::x86::sse2::_mm_movemask_epi8 (1,320 samples, 3.77%)core..hashbrown::raw::sse2::Group::match_empty (308 samples, 0.88%)hashbrown::raw::sse2::Group::match_byte (308 samples, 0.88%)core::core_arch::x86::sse2::_mm_movemask_epi8 (308 samples, 0.88%)hashbrown::raw::RawTableInner::find_inner (11,167 samples, 31.89%)hashbrown::raw::RawTableInner::find_innerhashbrown::rustc_entry::<impl hashbrown::map::HashMap<K,V,S,A>>::rustc_entry (22 samples, 0.06%)hashbrown::rustc_entry::<impl hashbrown::map::HashMap<K,V,S,A>>::rustc_entry (13,254 samples, 37.85%)hashbrown::rustc_entry::<impl hashbrown::map::HashMap<K,V,S,A>..hashbrown::raw::RawTable<T,A>::find (11,388 samples, 32.52%)hashbrown::raw::RawTable<T,A>::findhashbrown::rustc_entry::<impl hashbrown::map::HashMap<K,V,S,A>>::rustc_entry (4 samples, 
0.01%)all (35,014 samples, 100%)rust-1brc (35,014 samples, 100.00%)rust-1brc_start (35,006 samples, 99.98%)_start__libc_start_main (35,006 samples, 99.98%)__libc_start_main[libc.so.6] (35,006 samples, 99.98%)[libc.so.6]main (35,006 samples, 99.98%)mainstd::rt::lang_start_internal (35,006 samples, 99.98%)std::rt::lang_start_internalstd::rt::lang_start::_{{closure}} (35,006 samples, 99.98%)std::rt::lang_start::_{{closure}}std::sys_common::backtrace::__rust_begin_short_backtrace (35,006 samples, 99.98%)std::sys_common::backtrace::__rust_begin_short_backtracecore::ops::function::FnOnce::call_once (35,006 samples, 99.98%)core::ops::function::FnOnce::call_oncerust_1brc::main (35,006 samples, 99.98%)rust_1brc::mainrust_1brc::calculate_station_values (35,006 samples, 99.98%)rust_1brc::calculate_station_valuesstd::collections::hash::map::HashMap<K,V,S>::entry (13,606 samples, 38.86%)std::collections::hash::map::HashMap<K,V,S>::entrystd::collections::hash::map::map_entry (187 samples, 0.53%) \ No newline at end of file diff --git a/src/main.rs b/src/main.rs index f209289..51cfa99 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,11 +1,10 @@ use clap::Parser; use std::collections::BTreeMap; -use std::fs::File; -use std::io::BufRead; -use std::io::BufReader; +// use std::str::from_utf8; use std::time::Instant; use fast_float; use rustc_hash::FxHashMap; +use memmap2::Mmap; #[derive(Parser, Debug)] #[command( @@ -26,26 +25,24 @@ struct StationValues { count: u32, } -fn read_line(data: &Vec) -> (Vec, f32) { +fn read_line(data: &[u8]) -> (&[u8], f32) { let mut parts = data.rsplit(|&c| c == b';'); let value_str = parts.next().expect("Failed to parse value string"); let value = fast_float::parse(value_str).expect("Failed to parse value"); let station_name = parts.next().expect("Failed to parse station name"); - (station_name.to_vec(), value) + (station_name, value) } // Calculate the station values -fn calculate_station_values(reader: &mut BufReader) -> FxHashMap, StationValues> { - 
let mut result: FxHashMap, StationValues> = FxHashMap::default(); - let mut buf = Vec::new(); - - while let Ok(bytes_read) = reader.read_until(b'\n', &mut buf) { - if bytes_read == 0 { - break; +fn calculate_station_values(data:&[u8]) -> FxHashMap<&[u8], StationValues> { + let mut result: FxHashMap<&[u8], StationValues> = FxHashMap::default(); + let lines = data.split(|&c| c == b'\n'); + for line in lines { + if line.is_empty() { + continue; } - // remove new line character - buf.truncate(bytes_read - 1); - let (station_name, value) = read_line(&buf); + + let (station_name, value) = read_line(line); result .entry(station_name) .and_modify(|e| { @@ -64,9 +61,6 @@ fn calculate_station_values(reader: &mut BufReader) -> FxHashMap, mean: value, count: 1, }); - - buf.clear(); - } // Calculate the mean for all entries and round off to 1 decimal place @@ -83,7 +77,7 @@ fn round_off(value: f32) -> f32 { (value * 10.0).round() / 10.0 } -fn write_result_stdout(result: FxHashMap, StationValues>) -> () { +fn write_result_stdout(result: FxHashMap<&[u8], StationValues>) -> () { let mut ordered_result = BTreeMap::new(); for (station_name, station_values) in result { ordered_result.insert(station_name, station_values); @@ -110,9 +104,10 @@ fn main() { let args = Args::parse(); let file = std::fs::File::open(&args.file).expect("Failed to open file"); - let mut reader = BufReader::new(file); + let mmap = unsafe { Mmap::map(&file).expect("Failed to map file") }; + let data = &*mmap; - let result = calculate_station_values(&mut reader); + let result = calculate_station_values(data); write_result_stdout(result); let duration = start.elapsed(); println!("\nTime taken is: {:?}", duration); @@ -122,7 +117,9 @@ fn main() { #[cfg(test)] mod tests { use crate::{calculate_station_values, StationValues}; - use std::{collections::HashMap, fs, io::BufReader, path::PathBuf}; + use std::{collections::HashMap, fs, path::PathBuf}; + use memmap2::Mmap; + #[test] fn test_measurement_data() { let 
test_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests"); @@ -138,13 +135,14 @@ mod tests { let test_output = read_test_output_file(output_file_name); let file = std::fs::File::open(test_file_name.clone()).expect("Failed to open file"); - let mut reader = BufReader::new(file); - let mut result = calculate_station_values(&mut reader); + let mmap = unsafe { Mmap::map(&file).expect("Failed to map file") }; + let data = &*mmap; + let mut result = calculate_station_values(data); let mut test_output_map_copy = test_output.clone(); // compare two hashmaps for (station_name, station_values) in test_output.into_iter() { - let result_station_values = result.remove(&station_name.as_bytes().to_vec()).expect( + let result_station_values = result.remove(station_name.as_bytes()).expect( ("Station not found: ".to_string() + &station_name + " in result hashmap") .as_str(), );