From 5813041271aa7b39c8f8e9c0a65fd31b57d7c7b8 Mon Sep 17 00:00:00 2001 From: HaoranYi Date: Fri, 6 Oct 2023 16:34:57 +0000 Subject: [PATCH 1/7] retry hash file allocation --- accounts-db/src/accounts_hash.rs | 62 ++++++++++++++++++++++++-------- 1 file changed, 47 insertions(+), 15 deletions(-) diff --git a/accounts-db/src/accounts_hash.rs b/accounts-db/src/accounts_hash.rs index 222f2b1a640984..765fe7cc8bba75 100644 --- a/accounts-db/src/accounts_hash.rs +++ b/accounts-db/src/accounts_hash.rs @@ -87,22 +87,54 @@ impl AccountHashesFile { if self.writer.is_none() { // we have hashes to write but no file yet, so create a file that will auto-delete on drop - let mut data = tempfile_in(&self.dir_for_temp_cache_files).unwrap_or_else(|err| { - panic!( - "Unable to create file within {}: {err}", - self.dir_for_temp_cache_files.display() - ) - }); + let get_file = || -> Result<_, std::io::Error> { + let mut data = tempfile_in(&self.dir_for_temp_cache_files).unwrap_or_else(|err| { + panic!( + "Unable to create file within {}: {err}", + self.dir_for_temp_cache_files.display() + ) + }); + + // Theoretical performance optimization: write a zero to the end of + // the file so that we won't have to resize it later, which may be + // expensive. + assert!(self.capacity > 0); + data.seek(SeekFrom::Start((self.capacity - 1) as u64))?; + data.write_all(&[0])?; + data.rewind()?; + data.flush()?; + Ok(data) + }; - // Theoretical performance optimization: write a zero to the end of - // the file so that we won't have to resize it later, which may be - // expensive. - assert!(self.capacity > 0); - data.seek(SeekFrom::Start((self.capacity - 1) as u64)) - .unwrap(); - data.write_all(&[0]).unwrap(); - data.rewind().unwrap(); - data.flush().unwrap(); + // Retry 5 times for allocation the AccountHashFile. The memory maybe fragmented and + // causes memory allocation failure. Therefore, retry after failure. Hoping that the + // kernel can defrag the memory and allocation retries can succeed. + let mut num_retries = 0; + let data = loop { + num_retries += 1; + + match get_file() { + Ok(data) => { + break data; + } + Err(err) => { + info!( + "Unable to create account hash file within {}: {}, retry counter {}", + self.dir_for_temp_cache_files.display(), + err, + num_retries + ); + + if num_retries > 5 { + panic!( + "Unable to create account hash file within {}: after {} retries", + self.dir_for_temp_cache_files.display(), + num_retries + ); + } + } + } + }; //UNSAFE: Required to create a Mmap let map = unsafe { MmapMut::map_mut(&data) }; From a16677a59c6b1f402055e14f2619740911b41155 Mon Sep 17 00:00:00 2001 From: HaoranYi Date: Mon, 9 Oct 2023 13:20:41 +0000 Subject: [PATCH 2/7] add sleep --- accounts-db/src/accounts_hash.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/accounts-db/src/accounts_hash.rs b/accounts-db/src/accounts_hash.rs index 765fe7cc8bba75..c59a41cbacd9d3 100644 --- a/accounts-db/src/accounts_hash.rs +++ b/accounts-db/src/accounts_hash.rs @@ -26,6 +26,7 @@ use { atomic::{AtomicU64, AtomicUsize, Ordering}, Arc, }, + thread, time, }, tempfile::tempfile_in, }; @@ -132,6 +133,7 @@ impl AccountHashesFile { num_retries ); } + thread::sleep(time::Duration::from_millis(num_retries * 100)); } } }; From 5a20ee76febe58996d89f587a874eb7cbe53f041 Mon Sep 17 00:00:00 2001 From: HaoranYi Date: Mon, 9 Oct 2023 14:46:20 +0000 Subject: [PATCH 3/7] submit a datapoint for retry --- accounts-db/src/accounts_hash.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/accounts-db/src/accounts_hash.rs b/accounts-db/src/accounts_hash.rs index c59a41cbacd9d3..d4e53520845e43 100644 --- a/accounts-db/src/accounts_hash.rs +++ b/accounts-db/src/accounts_hash.rs @@ -133,6 +133,7 @@ impl AccountHashesFile { num_retries ); } + datapoint_info!("account_hash_file_allocation_retry", ("retry", 1, i64),); thread::sleep(time::Duration::from_millis(num_retries * 100)); } } From 42245e1a4b04129d7300a1aa28acc7550c517fd3 Mon Sep 17 00:00:00 2001 From: HaoranYi Date: Tue, 10 Oct 2023 16:46:38 +0000 Subject: [PATCH 4/7] typo --- accounts-db/src/accounts_hash.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/accounts-db/src/accounts_hash.rs b/accounts-db/src/accounts_hash.rs index d4e53520845e43..86366a05e6cb3b 100644 --- a/accounts-db/src/accounts_hash.rs +++ b/accounts-db/src/accounts_hash.rs @@ -107,9 +107,9 @@ impl AccountHashesFile { Ok(data) }; - // Retry 5 times for allocation the AccountHashFile. The memory maybe fragmented and - // causes memory allocation failure. Therefore, retry after failure. Hoping that the - // kernel can defrag the memory and allocation retries can succeed. + // Retry 5 times to allocate the AccountHashesFile. The memory might be fragmented and + // causes memory allocation failure. Therefore, let's retry after failure. Hoping that the + // kernel have the chance to defrag the memory between the retires, and retries succeed. let mut num_retries = 0; let data = loop { num_retries += 1; @@ -120,7 +120,7 @@ impl AccountHashesFile { } Err(err) => { info!( - "Unable to create account hash file within {}: {}, retry counter {}", + "Unable to create account hashes file within {}: {}, retry counter {}", self.dir_for_temp_cache_files.display(), err, num_retries @@ -128,12 +128,12 @@ impl AccountHashesFile { if num_retries > 5 { panic!( - "Unable to create account hash file within {}: after {} retries", + "Unable to create account hashes file within {}: after {} retries", self.dir_for_temp_cache_files.display(), num_retries ); } - datapoint_info!("account_hash_file_allocation_retry", ("retry", 1, i64),); + datapoint_info!("retry_account_hashes_file_allocation", ("retry", 1, i64)); thread::sleep(time::Duration::from_millis(num_retries * 100)); } } From 816162af49c46d132dbfe0975e192b8ae74ae3ee Mon Sep 17 00:00:00 2001 From: HaoranYi Date: Tue, 10 Oct 2023 17:41:44 +0000 Subject: [PATCH 5/7] more typos --- accounts-db/src/accounts_hash.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/accounts-db/src/accounts_hash.rs b/accounts-db/src/accounts_hash.rs index 86366a05e6cb3b..3d567bdee51e4d 100644 --- a/accounts-db/src/accounts_hash.rs +++ b/accounts-db/src/accounts_hash.rs @@ -109,7 +109,7 @@ impl AccountHashesFile { // Retry 5 times to allocate the AccountHashesFile. The memory might be fragmented and // causes memory allocation failure. Therefore, let's retry after failure. Hoping that the - // kernel have the chance to defrag the memory between the retires, and retries succeed. + // kernel has the chance to defrag the memory between the retries, and retries succeed. let mut num_retries = 0; let data = loop { num_retries += 1; From 3f6a2cff262ea1e9a1c6cb89b333ce468a9c9172 Mon Sep 17 00:00:00 2001 From: HaoranYi Date: Tue, 10 Oct 2023 14:30:48 -0500 Subject: [PATCH 6/7] Update accounts-db/src/accounts_hash.rs Co-authored-by: Brooks --- accounts-db/src/accounts_hash.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/accounts-db/src/accounts_hash.rs b/accounts-db/src/accounts_hash.rs index 3d567bdee51e4d..bc59d2debfc608 100644 --- a/accounts-db/src/accounts_hash.rs +++ b/accounts-db/src/accounts_hash.rs @@ -133,7 +133,7 @@ impl AccountHashesFile { num_retries ); } - datapoint_info!("retry_account_hashes_file_allocation", ("retry", 1, i64)); + datapoint_info!("retry_account_hashes_file_allocation", ("retry", num_retries, i64)); thread::sleep(time::Duration::from_millis(num_retries * 100)); } } From afe0c3636082b187fab93a6cb2af644cfdae42ef Mon Sep 17 00:00:00 2001 From: HaoranYi Date: Tue, 10 Oct 2023 19:46:46 +0000 Subject: [PATCH 7/7] fmt --- accounts-db/src/accounts_hash.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/accounts-db/src/accounts_hash.rs b/accounts-db/src/accounts_hash.rs index bc59d2debfc608..ff832cb5612358 100644 --- a/accounts-db/src/accounts_hash.rs +++ b/accounts-db/src/accounts_hash.rs @@ -133,7 +133,10 @@ impl AccountHashesFile { num_retries ); } - datapoint_info!("retry_account_hashes_file_allocation", ("retry", num_retries, i64)); + datapoint_info!( + "retry_account_hashes_file_allocation", + ("retry", num_retries, i64) + ); thread::sleep(time::Duration::from_millis(num_retries * 100)); } }