Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Do not re-prepare PVFs if not needed #4211

Merged
merged 8 commits into from
Apr 25, 2024
Merged
19 changes: 12 additions & 7 deletions polkadot/node/core/pvf/src/artifacts.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ use crate::{host::PrecheckResultSender, worker_interface::WORKER_DIR_PREFIX};
use always_assert::always;
use polkadot_node_core_pvf_common::{error::PrepareError, prepare::PrepareStats, pvf::PvfPrepData};
use polkadot_parachain_primitives::primitives::ValidationCodeHash;
use polkadot_primitives::ExecutorParamsHash;
use polkadot_primitives::ExecutorParamsPrepHash;
use std::{
collections::HashMap,
fs,
Expand All @@ -85,22 +85,27 @@ pub fn generate_artifact_path(cache_path: &Path) -> PathBuf {
artifact_path
}

/// Identifier of an artifact. Encodes a code hash of the PVF and a hash of executor parameter set.
/// Identifier of an artifact. Encodes a code hash of the PVF and a hash of preparation-related
/// executor parameter set.
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct ArtifactId {
pub(crate) code_hash: ValidationCodeHash,
pub(crate) executor_params_hash: ExecutorParamsHash,
pub(crate) executor_params_prep_hash: ExecutorParamsPrepHash,
}

impl ArtifactId {
/// Creates a new artifact ID with the given hash.
pub fn new(code_hash: ValidationCodeHash, executor_params_hash: ExecutorParamsHash) -> Self {
Self { code_hash, executor_params_hash }
pub fn new(
code_hash: ValidationCodeHash,
executor_params_prep_hash: ExecutorParamsPrepHash,
) -> Self {
Self { code_hash, executor_params_prep_hash }
}

/// Returns an artifact ID that corresponds to the PVF with given executor params.
/// Returns an artifact ID that corresponds to the PVF with given preparation-related
/// executor parameters.
pub fn from_pvf_prep_data(pvf: &PvfPrepData) -> Self {
Self::new(pvf.code_hash(), pvf.executor_params().hash())
s0me0ne-unkn0wn marked this conversation as resolved.
Show resolved Hide resolved
Self::new(pvf.code_hash(), pvf.executor_params().prep_hash())
}
}

Expand Down
70 changes: 69 additions & 1 deletion polkadot/node/core/pvf/tests/it/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ use polkadot_node_core_pvf::{
ValidationHost, JOB_TIMEOUT_WALL_CLOCK_FACTOR,
};
use polkadot_parachain_primitives::primitives::{BlockData, ValidationParams, ValidationResult};
use polkadot_primitives::{ExecutorParam, ExecutorParams};
use polkadot_primitives::{ExecutorParam, ExecutorParams, PvfExecKind, PvfPrepKind};

use std::{io::Write, time::Duration};
use tokio::sync::Mutex;
Expand Down Expand Up @@ -556,3 +556,71 @@ async fn nonexistent_cache_dir() {
.await
.unwrap();
}

// Checks that the artifact is not re-prepared when the executor environment parameters change
// in a way that does not affect the preparation
#[tokio::test]
async fn artifact_does_not_reprepare_on_non_meaningful_exec_parameter_change() {
    let host = TestHost::new_with_config(|cfg| {
        cfg.prepare_workers_hard_max_num = 1;
    })
    .await;
    let cache_dir = host.cache_dir.path();

    let set1 = ExecutorParams::default();
    // `PvfExecTimeout` is execution-related only, so it must not trigger re-preparation.
    let set2 =
        ExecutorParams::from(&[ExecutorParam::PvfExecTimeout(PvfExecKind::Backing, 2500)][..]);

    // Returns the filesystem metadata of the single artifact file in the cache directory.
    // The cache dir is expected to hold exactly two entries: the artifact file and a
    // (worker) directory; the non-directory entry is the artifact.
    // Extracted as a closure because the lookup is needed after each precheck.
    let artifact_metadata = |dir: &std::path::Path| -> std::fs::Metadata {
        let mut entries: Vec<_> = std::fs::read_dir(dir).unwrap().collect();
        assert_eq!(entries.len(), 2);
        let mut artifact = entries.pop().unwrap().unwrap();
        if artifact.path().is_dir() {
            artifact = entries.pop().unwrap().unwrap();
        }
        std::fs::metadata(artifact.path()).unwrap()
    };

    let _stats = host.precheck_pvf(halt::wasm_binary_unwrap(), set1).await.unwrap();
    let md1 = artifact_metadata(cache_dir);

    // Sleep so that a re-created artifact would get an observably different creation time.
    tokio::time::sleep(Duration::from_secs(2)).await;

    let _stats = host.precheck_pvf(halt::wasm_binary_unwrap(), set2).await.unwrap();
    let md2 = artifact_metadata(cache_dir);

    // Identical creation times mean the artifact file was not re-created (re-prepared).
    assert_eq!(md1.created().unwrap(), md2.created().unwrap());
}

// Checks that the artifact is re-prepared when the change of the execution environment
// parameters is such that re-preparation is actually needed
#[tokio::test]
async fn artifact_does_reprepare_on_meaningful_exec_parameter_change() {
    let host = TestHost::new_with_config(|cfg| {
        cfg.prepare_workers_hard_max_num = 1;
    })
    .await;
    let cache_dir = host.cache_dir.path();

    let set1 = ExecutorParams::default();
    // `PvfPrepTimeout(Prepare, _)` is preparation-related, so changing it must produce a
    // new artifact.
    let set2 =
        ExecutorParams::from(&[ExecutorParam::PvfPrepTimeout(PvfPrepKind::Prepare, 60000)][..]);

    let _stats = host.precheck_pvf(halt::wasm_binary_unwrap(), set1).await.unwrap();
    let entries_after_first = std::fs::read_dir(cache_dir).unwrap().count();
    assert_eq!(entries_after_first, 2);

    let _stats = host.precheck_pvf(halt::wasm_binary_unwrap(), set2).await.unwrap();
    let entries_after_second = std::fs::read_dir(cache_dir).unwrap().count();
    // One more entry than before: a new artifact has been prepared for the changed set.
    assert_eq!(entries_after_second, 3);
}
2 changes: 1 addition & 1 deletion polkadot/primitives/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ pub use v7::{
CandidateReceipt, CheckedDisputeStatementSet, CheckedMultiDisputeStatementSet, CollatorId,
CollatorSignature, CommittedCandidateReceipt, CompactStatement, ConsensusLog, CoreIndex,
CoreState, DisputeState, DisputeStatement, DisputeStatementSet, DownwardMessage, EncodeAs,
ExecutorParam, ExecutorParamError, ExecutorParams, ExecutorParamsHash,
ExecutorParam, ExecutorParamError, ExecutorParams, ExecutorParamsHash, ExecutorParamsPrepHash,
ExplicitDisputeStatement, GroupIndex, GroupRotationInfo, Hash, HashT, HeadData, Header,
HorizontalMessages, HrmpChannelId, Id, InboundDownwardMessage, InboundHrmpMessage, IndexedVec,
InherentData, InvalidDisputeStatementKind, Moment, MultiDisputeStatementSet, NodeFeatures,
Expand Down
91 changes: 91 additions & 0 deletions polkadot/primitives/src/v7/executor_params.rs
Original file line number Diff line number Diff line change
Expand Up @@ -152,13 +152,43 @@ impl sp_std::fmt::LowerHex for ExecutorParamsHash {
}
}

/// Unit type wrapper around [`type@Hash`] that represents a hash of preparation-related
/// executor parameters.
///
/// Only parameters that may affect the prepared artifact contribute to this hash, so two
/// parameter sets that differ solely in execution-related entries map to the same value.
///
/// This type is produced by [`ExecutorParams::prep_hash`].
#[derive(Clone, Copy, Encode, Decode, Hash, Eq, PartialEq, PartialOrd, Ord, TypeInfo)]
pub struct ExecutorParamsPrepHash(Hash);

impl sp_std::fmt::Display for ExecutorParamsPrepHash {
    // Delegates to the inner hash's `Display` implementation, preserving formatter flags.
    fn fmt(&self, f: &mut sp_std::fmt::Formatter<'_>) -> sp_std::fmt::Result {
        sp_std::fmt::Display::fmt(&self.0, f)
    }
}

impl sp_std::fmt::Debug for ExecutorParamsPrepHash {
    fn fmt(&self, f: &mut sp_std::fmt::Formatter<'_>) -> sp_std::fmt::Result {
        // Equivalent to `write!(f, "{:?}", self.0)` — the macro expands to exactly this call.
        f.write_fmt(format_args!("{:?}", self.0))
    }
}

impl sp_std::fmt::LowerHex for ExecutorParamsPrepHash {
    // Delegates lower-hex formatting to the inner hash; direct trait delegation preserves
    // the caller's formatter flags (e.g. `{:#x}`, width).
    fn fmt(&self, f: &mut sp_std::fmt::Formatter<'_>) -> sp_std::fmt::Result {
        sp_std::fmt::LowerHex::fmt(&self.0, f)
    }
}

/// # Deterministically serialized execution environment semantics
/// Represents an arbitrary semantics of an arbitrary execution environment, so should be kept as
/// abstract as possible.
//
// ADR: For mandatory entries, mandatoriness should be enforced in code rather than separating them
// into individual fields of the structure. Thus, complex migrations shall be avoided when adding
// new entries and removing old ones. At the moment, there's no mandatory parameters defined. If
// they show up, they must be clearly documented as mandatory ones.
//
// !!! Any new parameter that affects the prepared artifact must be added to the set in
// !!! `prep_hash()`. Failing to do so may result in artifact not being re-prepared when a
// !!! relevant parameter changes which may lead to a consensus split.
#[derive(
Clone, Debug, Default, Encode, Decode, PartialEq, Eq, TypeInfo, Serialize, Deserialize,
)]
Expand All @@ -175,6 +205,19 @@ impl ExecutorParams {
ExecutorParamsHash(BlakeTwo256::hash(&self.encode()))
}

/// Returns hash of preparation-related executor parameters
pub fn prep_hash(&self) -> ExecutorParamsPrepHash {
sandreim marked this conversation as resolved.
Show resolved Hide resolved
use ExecutorParam::*;

let mut enc = b"prep".to_vec();
for param in &self.0 {
if matches!(param, StackLogicalMax(_) | PvfPrepTimeout(..) | WasmExtBulkMemory) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks brittle wrt adding new parameters. I would rewrite and use !matches such that any new parameter added is considered preparation related unless we explicitly name it here as non-preparation related.

Copy link
Contributor Author

@s0me0ne-unkn0wn s0me0ne-unkn0wn Apr 21, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's an obvious improvement I've overseen, thank you for the suggestion!

Another question I'm trying to answer: is PvfPrepTimeout a parameter that affects preparation indeed?

That may be a bit counterintuitive, but I'm more inclined to say "no" than "yes". We have a strict pre-checking timeout, and we never re-pre-check, even if executor environment params change. The preparation timeout is lenient, and anyway, those timeouts are non-deterministic. So I don't really see much sense in re-preparing artifacts if this timeout changes (especially given that we're not going to ever decrease them). WDYT?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's just try to see what would happen if we say no. Let's first pick the likely more troublesome situation, we for some reason reduce the timeout:

Artifacts are not being recompiled, but a node that restarts will and may now hit a timeout it did not hit before. If I am not mistaken, the node will not dispute, but simply don't vote - right? Now, if that happens on just a few nodes, not too much harm done.

Now, if this is a general problem, everytime a node restarts, we have another machine not voting. So we might run into finality issues.

Problems:

  1. Potential finality stall.
  2. The issue will only show up gradually over time and might be hard to debug. Real issues might only appear weeks later.

Now, let's flip it and let's say "yes". Now all validators will recompile immediately, including backers. I am assuming that anything that got backed with the old artifact, will also be approved and disputed with the old artifact (this is true, right?). In that case the situation would be way better, because now we only have a single parachain that will completely cease to make progress, but all other paras and the relay chain won't be affected at all. Only the backers for that para will try to prepare and fail, as long as they are assigned.

If we assume we will only ever increase that timeout, then of course this should be fine, but is this a sound assumption? Why would we change the timeout at all?

  1. Either, we are being attacked and someone abuses those long timeouts - we would need to reduce.
  2. We have legitimate PVFs that run into the limit ... we would need to increase or fix otherwise.

Couldn't say, which scenario is more likely. That we are never ever re-prechecking is more a bug than anything. What we should actually be doing is re-prechecking and only enacting the new paras, once this was successful. That would also avoid the finality issues on changing the parameters.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the node will not dispute, but simply don't vote - right?

Yes, I believe so. Although we treat the preparation timeout as a deterministic error (as it hasn't failed during the pre-check), we don't dispute.

I am assuming that anything that got backed with the old artifact, will also be approved and disputed with the old artifact (this is true, right?).

We should always use execution parameters from the session where the candidate was produced, so yes, they should always be executed with the same artifact.

we only have a single parachain that will completely cease to make progress

Why do you think it's only a single parachain? The timeout is per-network per-session, so after passing the session boundary where this parameter change is enacted, all the artifacts should be re-prepared, as we saw during the Kusama and Polkadot incidents.

If we assume we will only ever increase that timeout, then of course this should be fine, but is this a sound assumption?

I remember we talked about decreasing the timeouts as not being safe, as it may make PVFs that are already on-chain fail. At the very least, we should have some tooling that could be used to check all the already-existing PVFs not to fail with the new set of executor parameters on the reference hardware (I think @ordian was working on that one?)

Copy link
Member

@ordian ordian Apr 24, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At the very least, we should have some tooling that could be used to check all the already-existing PVFs not to fail with the new set of executor parameters on the reference hardware (I think @ordian was working on that one?)

The tooling we do have, but currently it's not integrated into the release process. But even if it could be used to check the current PVFs at the time of the release, one could imagine an attacker uploading a new PVF (with on-demand core) right after the release that will be on the edge of the old time limit. So it doesn't solve the problem by itself.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So it doesn't solve the problem by itself.

Rerunning pre-checking before enactment would.

Why do you think it's only a single parachain

Not necessarily a single one, but most certainly not all of them. Otherwise we would have been really stupid with the parameters.

I remember we talked about decreasing the timeouts as not being safe, as it may make PVFs that are already on-chain fail.

True, but I think this should be fixed. Without pre-checking we get a finality stall, that alone is reason enough to do it properly and not enact parameters that have not been checked. Like, assuming we want to change these parameters at all: Why would we only ever want to increase? Most likely we need to increase to mitigate some issue, but would actually want to decrease again later, once it is fixed. Machines also get faster, compilers get better, so the chances that we actually want to decrease at some point are likely higher that we would want to increase.

In other words, by saying "no" we dig ourselves deeper into a solution that is actually not sound.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should always use execution parameters from the session where the candidate was produced, so yes, they should always be executed with the same artifact.

But that's not true, is it ?

On approval voting we fetch executor params based on the block the candidate is included: https://github.com/paritytech/polkadot-sdk/blob/master/polkadot/node/core/approval-voting/src/lib.rs#L2967, .

On backing we fetch them based on the relay_parent:
https://github.com/paritytech/polkadot-sdk/blob/master/polkadot/node/core/backing/src/lib.rs#L666

I don't think there is anything preventing the two relay chain blocks being in different sessions at the boundary, especially with async backing.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But that's not true, is it ?

Hmmm, that may be an important find... If that does not hold, it should be fixed. The very purport of the executor parameters is to always use the same set of parameters with the same candidate.

enc.extend(param.encode())
}
}
ExecutorParamsPrepHash(BlakeTwo256::hash(&enc))
}

/// Returns a PVF preparation timeout, if any
pub fn pvf_prep_timeout(&self, kind: PvfPrepKind) -> Option<Duration> {
for param in &self.0 {
Expand Down Expand Up @@ -336,3 +379,51 @@ impl From<&[ExecutorParam]> for ExecutorParams {
ExecutorParams(arr.to_vec())
}
}

// This test ensures the hash generated by `prep_hash()` changes if any preparation-related
// executor parameter changes. If you're adding a new executor parameter, you must add it into
// this test, and if changing that parameter may affect the artifact produced on the preparation
// step, it must be added to `prep_hash()` as well, and a pair of executor params sets must be
// added to `match` in this test to ensure the hash changes. See also `prep_hash()` comments.
s0me0ne-unkn0wn marked this conversation as resolved.
Show resolved Hide resolved
#[test]
fn ensure_prep_hash_changes() {
    use ExecutorParam::*;
    // A set containing every currently-defined executor parameter. Iterating over it drives
    // the exhaustive `match` below, so adding a new variant causes a compile error here
    // until the new parameter is classified as preparation-related or not.
    let ep = ExecutorParams::from(
        &[
            MaxMemoryPages(0),
            StackLogicalMax(0),
            StackNativeMax(0),
            PrecheckingMaxMemory(0),
            PvfPrepTimeout(PvfPrepKind::Precheck, 0),
            PvfPrepTimeout(PvfPrepKind::Prepare, 0),
            PvfExecTimeout(PvfExecKind::Backing, 0),
            PvfExecTimeout(PvfExecKind::Approval, 0),
            WasmExtBulkMemory,
        ][..],
    );

    for p in ep.iter() {
        let (ep1, ep2) = match p {
            // Non-preparation-related parameters: changing them must NOT change the prep
            // hash (previously these were only skipped; now the non-influence is asserted).
            MaxMemoryPages(_) => {
                assert_eq!(
                    ExecutorParams::from(&[MaxMemoryPages(1)][..]).prep_hash(),
                    ExecutorParams::from(&[MaxMemoryPages(2)][..]).prep_hash(),
                );
                continue
            },
            StackNativeMax(_) => {
                assert_eq!(
                    ExecutorParams::from(&[StackNativeMax(1)][..]).prep_hash(),
                    ExecutorParams::from(&[StackNativeMax(2)][..]).prep_hash(),
                );
                continue
            },
            PrecheckingMaxMemory(_) => {
                assert_eq!(
                    ExecutorParams::from(&[PrecheckingMaxMemory(1)][..]).prep_hash(),
                    ExecutorParams::from(&[PrecheckingMaxMemory(2)][..]).prep_hash(),
                );
                continue
            },
            PvfExecTimeout(_, _) => {
                assert_eq!(
                    ExecutorParams::from(&[PvfExecTimeout(PvfExecKind::Backing, 1)][..])
                        .prep_hash(),
                    ExecutorParams::from(&[PvfExecTimeout(PvfExecKind::Backing, 2)][..])
                        .prep_hash(),
                );
                continue
            },
            // Preparation-related parameters: changing them MUST change the prep hash.
            StackLogicalMax(_) => (
                ExecutorParams::from(&[StackLogicalMax(1)][..]),
                ExecutorParams::from(&[StackLogicalMax(2)][..]),
            ),
            PvfPrepTimeout(PvfPrepKind::Precheck, _) => (
                ExecutorParams::from(&[PvfPrepTimeout(PvfPrepKind::Precheck, 1)][..]),
                ExecutorParams::from(&[PvfPrepTimeout(PvfPrepKind::Precheck, 2)][..]),
            ),
            PvfPrepTimeout(PvfPrepKind::Prepare, _) => (
                ExecutorParams::from(&[PvfPrepTimeout(PvfPrepKind::Prepare, 1)][..]),
                ExecutorParams::from(&[PvfPrepTimeout(PvfPrepKind::Prepare, 2)][..]),
            ),
            WasmExtBulkMemory =>
                (ExecutorParams::default(), ExecutorParams::from(&[WasmExtBulkMemory][..])),
        };

        assert_ne!(ep1.prep_hash(), ep2.prep_hash());
    }
}
4 changes: 3 additions & 1 deletion polkadot/primitives/src/v7/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,9 @@ pub mod executor_params;
pub mod slashing;

pub use async_backing::AsyncBackingParams;
pub use executor_params::{ExecutorParam, ExecutorParamError, ExecutorParams, ExecutorParamsHash};
pub use executor_params::{
ExecutorParam, ExecutorParamError, ExecutorParams, ExecutorParamsHash, ExecutorParamsPrepHash,
};

mod metrics;
pub use metrics::{
Expand Down
Loading