Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Archiving of prover in gpu_prover_queue #1537

Merged
merged 27 commits into from
Apr 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
77c01dd
add archiver job for gpu_prover_queue
Artemka374 Apr 1, 2024
614b968
fix interval unit
Artemka374 Apr 1, 2024
6a90e5c
add availability checker
Artemka374 Apr 1, 2024
b7d84af
fix lint
Artemka374 Apr 1, 2024
8a4679f
Merge branch 'main' into afo/prover-archiving
Artemka374 Apr 1, 2024
8816883
fix lint
Artemka374 Apr 2, 2024
478c6e3
fix lint
Artemka374 Apr 2, 2024
ef01d97
add metric
Artemka374 Apr 2, 2024
d455609
Merge branch 'main' into afo/prover-archiving
Artemka374 Apr 2, 2024
86ecb00
fix build
Artemka374 Apr 2, 2024
1f6bfa3
Merge remote-tracking branch 'origin/afo/prover-archiving' into afo/p…
Artemka374 Apr 2, 2024
96cb068
Merge branch 'main' into afo/prover-archiving
Artemka374 Apr 2, 2024
feca4fb
add metrics
Artemka374 Apr 3, 2024
2b9d11a
fix configs
Artemka374 Apr 3, 2024
a1a78a3
fix build
Artemka374 Apr 3, 2024
0cc50d9
Merge branch 'main' into afo/prover-archiving
Artemka374 Apr 3, 2024
46de8a7
rename env variables
Artemka374 Apr 3, 2024
a6877f8
Merge remote-tracking branch 'origin/afo/prover-archiving' into afo/p…
Artemka374 Apr 3, 2024
f84c809
address comments
Artemka374 Apr 3, 2024
fde8379
Merge branch 'main' into afo/prover-archiving
Artemka374 Apr 3, 2024
ad880b1
fix few more things
Artemka374 Apr 3, 2024
eac463f
fix spellcheck
Artemka374 Apr 3, 2024
6991505
rename variables also for job archiver
Artemka374 Apr 3, 2024
d38b354
fix dic
Artemka374 Apr 3, 2024
d65b224
fix base configs
Artemka374 Apr 3, 2024
2ec84eb
fix dal query
Artemka374 Apr 3, 2024
2938875
fix config
Artemka374 Apr 3, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions checks-config/era.dic
Original file line number Diff line number Diff line change
Expand Up @@ -927,3 +927,4 @@ StorageMarker
SIGINT
opentelemetry
PubdataSendingMode
FriGpuProverArchiver
18 changes: 16 additions & 2 deletions core/lib/basic_types/src/prover_dal.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
//! Types exposed by the prover DAL for general-purpose use.
use std::{net::IpAddr, ops::Add};
use std::{net::IpAddr, ops::Add, str::FromStr};

use chrono::{DateTime, Duration, Utc};

Expand Down Expand Up @@ -204,7 +204,7 @@ pub struct JobExtendedStatistics {
pub active_area: Vec<ProverJobInfo>,
}

#[derive(Debug, Copy, Clone)]
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum GpuProverInstanceStatus {
// The instance is available for processing.
Available,
Expand All @@ -215,3 +215,17 @@ pub enum GpuProverInstanceStatus {
// The instance is not alive anymore.
Dead,
}

impl FromStr for GpuProverInstanceStatus {
    /// Parsing failures carry no extra information.
    type Err = ();

    /// Converts the textual form of a status into a [`GpuProverInstanceStatus`].
    ///
    /// Only the exact strings `"available"`, `"full"`, `"reserved"` and `"dead"`
    /// are accepted; the comparison is case-sensitive and no trimming is done.
    ///
    /// # Errors
    ///
    /// Returns `Err(())` for any other input.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let status = match s {
            "available" => Self::Available,
            "full" => Self::Full,
            "reserved" => Self::Reserved,
            "dead" => Self::Dead,
            // Anything else is not a known status.
            _ => return Err(()),
        };
        Ok(status)
    }
}
2 changes: 2 additions & 0 deletions core/lib/config/src/configs/fri_prover.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ pub struct FriProverConfig {
pub queue_capacity: usize,
pub witness_vector_receiver_port: u16,
pub zone_read_url: String,
pub availability_check_interval_in_secs: u32,

// whether to write to public GCS bucket for https://github.com/matter-labs/era-boojum-validator-cli
pub shall_save_to_public_bucket: bool,
pub object_store: Option<ObjectStoreConfig>,
Expand Down
17 changes: 12 additions & 5 deletions core/lib/config/src/configs/house_keeper.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,20 @@ pub struct HouseKeeperConfig {
pub prover_db_pool_size: u32,
pub proof_compressor_job_retrying_interval_ms: u64,
pub proof_compressor_stats_reporting_interval_ms: u64,
pub prover_job_archiver_reporting_interval_ms: Option<u64>,
pub prover_job_archiver_archiving_interval_secs: Option<u64>,
pub prover_job_archiver_archiving_interval_ms: Option<u64>,
pub prover_job_archiver_archive_after_secs: Option<u64>,
pub fri_gpu_prover_archiver_archiving_interval_ms: Option<u64>,
pub fri_gpu_prover_archiver_archive_after_secs: Option<u64>,
}

impl HouseKeeperConfig {
pub fn prover_job_archiver_enabled(&self) -> bool {
self.prover_job_archiver_reporting_interval_ms.is_some()
&& self.prover_job_archiver_archiving_interval_secs.is_some()
/// Returns the prover job archiver settings as the pair
/// `(prover_job_archiver_archiving_interval_ms, prover_job_archiver_archive_after_secs)`.
///
/// Yields `None` unless both fields are set.
pub fn prover_job_archiver_params(&self) -> Option<(u64, u64)> {
    let archiving_interval_ms = self.prover_job_archiver_archiving_interval_ms?;
    let archive_after_secs = self.prover_job_archiver_archive_after_secs?;
    Some((archiving_interval_ms, archive_after_secs))
}

/// Returns the FRI GPU prover archiver settings as the pair
/// `(fri_gpu_prover_archiver_archiving_interval_ms, fri_gpu_prover_archiver_archive_after_secs)`.
///
/// Yields `None` unless both fields are set.
pub fn fri_gpu_prover_archiver_params(&self) -> Option<(u64, u64)> {
    let archiving_interval_ms = self.fri_gpu_prover_archiver_archiving_interval_ms?;
    let archive_after_secs = self.fri_gpu_prover_archiver_archive_after_secs?;
    Some((archiving_interval_ms, archive_after_secs))
}
}
7 changes: 5 additions & 2 deletions core/lib/config/src/testonly.rs
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,7 @@ impl Distribution<configs::FriProverConfig> for EncodeDist {
witness_vector_receiver_port: self.sample(rng),
zone_read_url: self.sample(rng),
shall_save_to_public_bucket: self.sample(rng),
availability_check_interval_in_secs: self.sample(rng),
object_store: self.sample(rng),
}
}
Expand Down Expand Up @@ -563,8 +564,10 @@ impl Distribution<configs::house_keeper::HouseKeeperConfig> for EncodeDist {
witness_generator_job_retrying_interval_ms: self.sample(rng),
proof_compressor_job_retrying_interval_ms: self.sample(rng),
proof_compressor_stats_reporting_interval_ms: self.sample(rng),
prover_job_archiver_reporting_interval_ms: self.sample(rng),
prover_job_archiver_archiving_interval_secs: self.sample(rng),
prover_job_archiver_archiving_interval_ms: self.sample(rng),
prover_job_archiver_archive_after_secs: self.sample(rng),
fri_gpu_prover_archiver_archiving_interval_ms: self.sample(rng),
fri_gpu_prover_archiver_archive_after_secs: self.sample(rng),
}
}
}
Expand Down

This file was deleted.

2 changes: 2 additions & 0 deletions core/lib/env_config/src/fri_prover.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ mod tests {
},
max_retries: 5,
}),
availability_check_interval_in_secs: 1_800,
}
}

Expand All @@ -65,6 +66,7 @@ mod tests {
FRI_PROVER_WITNESS_VECTOR_RECEIVER_PORT="3316"
FRI_PROVER_ZONE_READ_URL="http://metadata.google.internal/computeMetadata/v1/instance/zone"
FRI_PROVER_SHALL_SAVE_TO_PUBLIC_BUCKET=true
FRI_PROVER_AVAILABILITY_CHECK_INTERVAL_IN_SECS="1800"
OBJECT_STORE_BUCKET_BASE_URL="/base/url"
OBJECT_STORE_MODE="GCSWithCredentialFile"
OBJECT_STORE_GCS_CREDENTIAL_FILE_PATH="/path/to/credentials.json"
Expand Down
14 changes: 10 additions & 4 deletions core/lib/env_config/src/house_keeper.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,12 @@ mod tests {
prover_db_pool_size: 2,
proof_compressor_job_retrying_interval_ms: 30_000,
proof_compressor_stats_reporting_interval_ms: 10_000,
prover_job_archiver_reporting_interval_ms: Some(1_800_000),
prover_job_archiver_archiving_interval_secs: Some(172_800),
prover_job_archiver_archiving_interval_ms: Some(1_800_000),
prover_job_archiver_archive_after_secs: Some(172_800),
// 24 hours
fri_gpu_prover_archiver_archiving_interval_ms: Some(86_400_000),
// 48 hours
fri_gpu_prover_archiver_archive_after_secs: Some(172_800),
}
}

Expand All @@ -48,8 +52,10 @@ mod tests {
HOUSE_KEEPER_PROVER_STATS_REPORTING_INTERVAL_MS="5000"
HOUSE_KEEPER_PROOF_COMPRESSOR_STATS_REPORTING_INTERVAL_MS="10000"
HOUSE_KEEPER_PROOF_COMPRESSOR_JOB_RETRYING_INTERVAL_MS="30000"
HOUSE_KEEPER_PROVER_JOB_ARCHIVER_REPORTING_INTERVAL_MS="1800000"
HOUSE_KEEPER_PROVER_JOB_ARCHIVER_ARCHIVING_INTERVAL_SECS="172800"
HOUSE_KEEPER_PROVER_JOB_ARCHIVER_ARCHIVING_INTERVAL_MS="1800000"
HOUSE_KEEPER_PROVER_JOB_ARCHIVER_ARCHIVE_AFTER_SECS="172800"
HOUSE_KEEPER_FRI_GPU_PROVER_ARCHIVER_ARCHIVING_INTERVAL_MS="86400000"
HOUSE_KEEPER_FRI_GPU_PROVER_ARCHIVER_ARCHIVE_AFTER_SECS="172800"
"#;
lock.set_env(config);

Expand Down
22 changes: 14 additions & 8 deletions core/lib/protobuf_config/src/house_keeper.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,13 @@ impl ProtoRepr for proto::HouseKeeper {
.context("proof_compressor_stats_reporting_interval_ms")?,

// TODO(PLA-862): Make the optional archiver variables below required
prover_job_archiver_reporting_interval_ms: self
.prover_job_archiver_reporting_interval_ms,
prover_job_archiver_archiving_interval_secs: self
.prover_job_archiver_archiving_interval_secs,
prover_job_archiver_archiving_interval_ms: self
.prover_job_archiver_archiving_interval_ms,
prover_job_archiver_archive_after_secs: self.prover_job_archiver_archive_after_secs,
fri_gpu_prover_archiver_archiving_interval_ms: self
.fri_gpu_prover_archiver_archiving_interval_ms,
fri_gpu_prover_archiver_archive_after_secs: self
.fri_gpu_prover_archiver_archive_after_secs,
})
}

Expand Down Expand Up @@ -73,10 +76,13 @@ impl ProtoRepr for proto::HouseKeeper {
proof_compressor_stats_reporting_interval_ms: Some(
this.proof_compressor_stats_reporting_interval_ms,
),
prover_job_archiver_reporting_interval_ms: this
.prover_job_archiver_reporting_interval_ms,
prover_job_archiver_archiving_interval_secs: this
.prover_job_archiver_archiving_interval_secs,
prover_job_archiver_archiving_interval_ms: this
.prover_job_archiver_archiving_interval_ms,
prover_job_archiver_archive_after_secs: this.prover_job_archiver_archive_after_secs,
fri_gpu_prover_archiver_archiving_interval_ms: this
.fri_gpu_prover_archiver_archiving_interval_ms,
fri_gpu_prover_archiver_archive_after_secs: this
.fri_gpu_prover_archiver_archive_after_secs,
}
}
}
26 changes: 14 additions & 12 deletions core/lib/protobuf_config/src/proto/house_keeper.proto
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,18 @@ syntax = "proto3";
package zksync.config.house_keeper;

// House keeper settings. Every field is declared `optional` in the schema; the
// trailing comment on each field states whether a value is actually required
// and, where given, its unit.
message HouseKeeper {
optional uint64 l1_batch_metrics_reporting_interval_ms = 1; // required; ms
optional uint64 gpu_prover_queue_reporting_interval_ms = 2; // required; ms
optional uint64 prover_job_retrying_interval_ms = 3; // required; ms
optional uint64 prover_stats_reporting_interval_ms = 4; // required; ms
optional uint64 witness_job_moving_interval_ms = 5; // required; ms
optional uint64 witness_generator_stats_reporting_interval_ms = 6; // required; ms
optional uint64 witness_generator_job_retrying_interval_ms = 9; // required; ms
optional uint32 prover_db_pool_size = 10; // required
optional uint64 proof_compressor_job_retrying_interval_ms = 12; // required; ms
optional uint64 proof_compressor_stats_reporting_interval_ms = 13; // required; ms
optional uint64 prover_job_archiver_reporting_interval_ms = 14; // optional; ms
optional uint64 prover_job_archiver_archiving_interval_secs = 15; // optional; seconds
optional uint64 l1_batch_metrics_reporting_interval_ms = 1; // required; ms
optional uint64 gpu_prover_queue_reporting_interval_ms = 2; // required; ms
optional uint64 prover_job_retrying_interval_ms = 3; // required; ms
optional uint64 prover_stats_reporting_interval_ms = 4; // required; ms
optional uint64 witness_job_moving_interval_ms = 5; // required; ms
optional uint64 witness_generator_stats_reporting_interval_ms = 6; // required; ms
optional uint64 witness_generator_job_retrying_interval_ms = 9; // required; ms
optional uint32 prover_db_pool_size = 10; // required
optional uint64 proof_compressor_job_retrying_interval_ms = 12; // required; ms
optional uint64 proof_compressor_stats_reporting_interval_ms = 13; // required; ms
optional uint64 prover_job_archiver_archiving_interval_ms = 14; // optional; ms
optional uint64 prover_job_archiver_archive_after_secs = 15; // optional; seconds
optional uint64 fri_gpu_prover_archiver_archiving_interval_ms = 16; // optional; ms
optional uint64 fri_gpu_prover_archiver_archive_after_secs = 17; // optional; seconds
}
Loading
Loading