nydusify: introduce chunkdict generate subcommand #1572

Merged · 3 commits · Jul 4, 2024
280 changes: 280 additions & 0 deletions builder/src/chunkdict_generator.rs
@@ -0,0 +1,280 @@
// Copyright (C) 2023 Nydus Developers. All rights reserved.
//
// SPDX-License-Identifier: Apache-2.0

//! Generate a chunkdict RAFS bootstrap.
//! -------------------------------------------------------------------------------------------------
//! Bug 1: Inconsistent chunk sizes leading to blobs smaller than 4K (v6_block_size).
//! Description: Chunk sizes are not uniform, so a blob composed of a group of such chunks
//! may end up smaller than 4K (v6_block_size) and therefore fail the size check.
//! -------------------------------------------------------------------------------------------------
//! Bug 2: Incorrect chunk count caused by premature check logic.
//! Description: The chunk count is currently derived from the formula size / chunk_size.
//! This is flawed because the calculation runs before the validation pass that gathers
//! chunk statistics, so the resulting count is inaccurate.

use super::core::node::{ChunkSource, NodeInfo};
use super::{BlobManager, Bootstrap, BootstrapManager, BuildContext, BuildOutput, Tree};
use crate::core::node::Node;
use crate::NodeChunk;
use anyhow::Result;
use nydus_rafs::metadata::chunk::ChunkWrapper;
use nydus_rafs::metadata::inode::InodeWrapper;
use nydus_rafs::metadata::layout::RafsXAttrs;
use nydus_storage::meta::BlobChunkInfoV1Ondisk;
use nydus_utils::compress::Algorithm;
use nydus_utils::digest::RafsDigest;
use std::ffi::OsString;
use std::mem::size_of;
use std::path::PathBuf;
use std::str::FromStr;
use std::sync::Arc;

#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct ChunkdictChunkInfo {
pub image_reference: String,
pub version: String,
pub chunk_blob_id: String,
pub chunk_digest: String,
pub chunk_compressed_size: u32,
pub chunk_uncompressed_size: u32,
pub chunk_compressed_offset: u64,
pub chunk_uncompressed_offset: u64,
}

pub struct ChunkdictBlobInfo {
pub blob_id: String,
pub blob_compressed_size: u64,
pub blob_uncompressed_size: u64,
pub blob_compressor: String,
pub blob_meta_ci_compressed_size: u64,
pub blob_meta_ci_uncompressed_size: u64,
pub blob_meta_ci_offset: u64,
}

/// Struct to generate a chunkdict RAFS bootstrap.
pub struct Generator {}

impl Generator {
/// Generate a chunkdict RAFS bootstrap.
pub fn generate(
ctx: &mut BuildContext,
bootstrap_mgr: &mut BootstrapManager,
blob_mgr: &mut BlobManager,
chunkdict_chunks_origin: Vec<ChunkdictChunkInfo>,
chunkdict_blobs: Vec<ChunkdictBlobInfo>,
) -> Result<BuildOutput> {
// Validate chunks and remove those whose backing blob totals less than one block.
let mut chunkdict_chunks = chunkdict_chunks_origin.to_vec();
Self::validate_and_remove_chunks(ctx, &mut chunkdict_chunks);
// Build root tree.
let mut tree = Self::build_root_tree(ctx)?;

// Build child tree.
let child = Self::build_child_tree(ctx, blob_mgr, &chunkdict_chunks, &chunkdict_blobs)?;
let result = vec![child];
tree.children = result;

Self::validate_tree(&tree)?;

// Build bootstrap.
let mut bootstrap_ctx = bootstrap_mgr.create_ctx()?;
let mut bootstrap = Bootstrap::new(tree)?;
bootstrap.build(ctx, &mut bootstrap_ctx)?;

let blob_table = blob_mgr.to_blob_table(ctx)?;
let storage = &mut bootstrap_mgr.bootstrap_storage;
bootstrap.dump(ctx, storage, &mut bootstrap_ctx, &blob_table)?;

BuildOutput::new(blob_mgr, &bootstrap_mgr.bootstrap_storage)
}

/// Validate the tree, dumping each node and its chunks at debug level.
fn validate_tree(tree: &Tree) -> Result<()> {
let pre = &mut |t: &Tree| -> Result<()> {
let node = t.lock_node();
debug!("chunkdict tree: ");
debug!("inode: {}", node);
for chunk in &node.chunks {
debug!("\t chunk: {}", chunk);
}
Ok(())
};
tree.walk_dfs_pre(pre)?;
debug!("chunkdict tree is valid.");
Ok(())
}

/// Validate chunks and remove those whose backing blob's total uncompressed size is smaller than the block size.
fn validate_and_remove_chunks(ctx: &mut BuildContext, chunkdict: &mut Vec<ChunkdictChunkInfo>) {
let mut chunk_sizes = std::collections::HashMap::new();

// Accumulate the uncompressed size for each chunk_blob_id.
for chunk in chunkdict.iter() {
*chunk_sizes.entry(chunk.chunk_blob_id.clone()).or_insert(0) +=
chunk.chunk_uncompressed_size as u64;
}
// Find all chunk_blob_ids whose total uncompressed size is smaller than v6_block_size.
let small_chunks: Vec<String> = chunk_sizes
.into_iter()
.filter(|&(_, size)| size < ctx.v6_block_size())
.inspect(|(id, _)| {
eprintln!(
"Warning: Blob with id '{}' is smaller than {} bytes.",
id,
ctx.v6_block_size()
)
})
.map(|(id, _)| id)
.collect();

// Retain only chunks whose chunk_blob_id has a total uncompressed size of at least v6_block_size.
chunkdict.retain(|chunk| !small_chunks.contains(&chunk.chunk_blob_id));
}

/// Build the root tree.
pub fn build_root_tree(ctx: &mut BuildContext) -> Result<Tree> {
let mut inode = InodeWrapper::new(ctx.fs_version);
inode.set_ino(1);
inode.set_uid(1000);
inode.set_gid(1000);
inode.set_projid(0);
inode.set_mode(0o660 | libc::S_IFDIR as u32);
inode.set_nlink(3);
inode.set_name_size("/".len());
inode.set_rdev(0);
inode.set_blocks(256);
let node_info = NodeInfo {
explicit_uidgid: true,
src_dev: 0,
src_ino: 0,
rdev: 0,
source: PathBuf::from("/"),
path: PathBuf::from("/"),
target: PathBuf::from("/"),
target_vec: vec![OsString::from("/")],
symlink: None,
xattrs: RafsXAttrs::default(),
v6_force_extended_inode: true,
};
let root_node = Node::new(inode, node_info, 0);
let tree = Tree::new(root_node);
Ok(tree)
}

/// Build the child tree.
fn build_child_tree(
ctx: &mut BuildContext,
blob_mgr: &mut BlobManager,
chunkdict_chunks: &[ChunkdictChunkInfo],
chunkdict_blobs: &[ChunkdictBlobInfo],
) -> Result<Tree> {
let mut inode = InodeWrapper::new(ctx.fs_version);
inode.set_ino(2);
inode.set_uid(0);
inode.set_gid(0);
inode.set_projid(0);
inode.set_mode(0o660 | libc::S_IFREG as u32);
inode.set_nlink(1);
inode.set_name_size("chunkdict".len());
inode.set_rdev(0);
inode.set_blocks(256);
let node_info = NodeInfo {
explicit_uidgid: true,
src_dev: 0,
src_ino: 1,
rdev: 0,
source: PathBuf::from("/"),
path: PathBuf::from("/chunkdict"),
target: PathBuf::from("/chunkdict"),
target_vec: vec![OsString::from("/"), OsString::from("/chunkdict")],
symlink: None,
xattrs: RafsXAttrs::new(),
v6_force_extended_inode: true,
};
let mut node = Node::new(inode, node_info, 0);

// Insert chunks.
Self::insert_chunks(ctx, blob_mgr, &mut node, chunkdict_chunks, chunkdict_blobs)?;
let node_size: u64 = node
.chunks
.iter()
.map(|chunk| chunk.inner.uncompressed_size() as u64)
.sum();
node.inode.set_size(node_size);

// Update child count.
node.inode.set_child_count(node.chunks.len() as u32);
let child = Tree::new(node);
child
.lock_node()
.v5_set_dir_size(ctx.fs_version, &child.children);
Ok(child)
}

/// Insert chunks into the node, updating the owning blob's context for each chunk.
fn insert_chunks(
ctx: &mut BuildContext,
blob_mgr: &mut BlobManager,
node: &mut Node,
chunkdict_chunks: &[ChunkdictChunkInfo],
chunkdict_blobs: &[ChunkdictBlobInfo],
) -> Result<()> {
for (index, chunk_info) in chunkdict_chunks.iter().enumerate() {
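// Lay the chunks out back-to-back, deriving each chunk's file offset
// from its index and compressed size.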
let chunk_size: u32 = chunk_info.chunk_compressed_size;
let file_offset = index as u64 * chunk_size as u64;
let mut chunk = ChunkWrapper::new(ctx.fs_version);

// Update blob context.
let (blob_index, blob_ctx) =
blob_mgr.get_or_cerate_blob_for_chunkdict(ctx, &chunk_info.chunk_blob_id)?;
let chunk_uncompressed_size = chunk_info.chunk_uncompressed_size;
let pre_d_offset = blob_ctx.current_uncompressed_offset;
blob_ctx.uncompressed_blob_size = pre_d_offset + chunk_uncompressed_size as u64;
blob_ctx.current_uncompressed_offset += chunk_uncompressed_size as u64;

blob_ctx.blob_meta_header.set_ci_uncompressed_size(
blob_ctx.blob_meta_header.ci_uncompressed_size()
+ size_of::<BlobChunkInfoV1Ondisk>() as u64,
);
blob_ctx.blob_meta_header.set_ci_compressed_size(
blob_ctx.blob_meta_header.ci_uncompressed_size()
+ size_of::<BlobChunkInfoV1Ondisk>() as u64,
);
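// Look up the dictionary's recorded metadata for the blob that owns this chunk.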
let chunkdict_blob_info = chunkdict_blobs
.iter()
.find(|blob| blob.blob_id == chunk_info.chunk_blob_id)
.unwrap();
blob_ctx.blob_compressor =
Algorithm::from_str(chunkdict_blob_info.blob_compressor.as_str())?;
blob_ctx
.blob_meta_header
.set_ci_uncompressed_size(chunkdict_blob_info.blob_meta_ci_uncompressed_size);
blob_ctx
.blob_meta_header
.set_ci_compressed_size(chunkdict_blob_info.blob_meta_ci_compressed_size);
blob_ctx
.blob_meta_header
.set_ci_compressed_offset(chunkdict_blob_info.blob_meta_ci_offset);
blob_ctx.blob_meta_header.set_ci_compressor(Algorithm::Zstd);

// Update chunk context.
let chunk_index = blob_ctx.alloc_chunk_index()?;
chunk.set_blob_index(blob_index);
chunk.set_index(chunk_index);
chunk.set_file_offset(file_offset);
chunk.set_compressed_size(chunk_info.chunk_compressed_size);
chunk.set_compressed_offset(chunk_info.chunk_compressed_offset);
chunk.set_uncompressed_size(chunk_info.chunk_uncompressed_size);
chunk.set_uncompressed_offset(chunk_info.chunk_uncompressed_offset);
chunk.set_id(RafsDigest::from_string(&chunk_info.chunk_digest));

node.chunks.push(NodeChunk {
source: ChunkSource::Build,
inner: Arc::new(chunk.clone()),
});
}
Ok(())
}
}
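The file header above calls out the two bugs this generator works around. Below is a minimal, self-contained sketch of both fixes, assuming simplified stand-ins (`Chunk` for `ChunkdictChunkInfo`, `BLOCK_SIZE` for `ctx.v6_block_size()`, `drop_undersized_blobs` for `validate_and_remove_chunks`) rather than this patch's actual types:

```rust
use std::collections::HashMap;

// BLOCK_SIZE stands in for ctx.v6_block_size() (4 KiB for RAFS v6).
const BLOCK_SIZE: u64 = 4096;

// Simplified stand-in for ChunkdictChunkInfo.
struct Chunk {
    blob_id: String,
    uncompressed_size: u32,
}

// Bug 1 fix: drop every chunk whose backing blob totals less than one block.
fn drop_undersized_blobs(chunks: &mut Vec<Chunk>) {
    let mut totals: HashMap<String, u64> = HashMap::new();
    for c in chunks.iter() {
        *totals.entry(c.blob_id.clone()).or_insert(0) += u64::from(c.uncompressed_size);
    }
    chunks.retain(|c| totals[&c.blob_id] >= BLOCK_SIZE);
}

fn main() {
    let mut chunks = vec![
        Chunk { blob_id: "blob-a".into(), uncompressed_size: 8192 },
        Chunk { blob_id: "blob-b".into(), uncompressed_size: 1024 }, // blob-b totals < 4 KiB
    ];
    drop_undersized_blobs(&mut chunks);

    // Bug 2 fix: count chunks per blob only after the validation pass,
    // so the count reflects what actually lands in the bootstrap.
    let mut counts: HashMap<&str, u32> = HashMap::new();
    for c in &chunks {
        *counts.entry(c.blob_id.as_str()).or_insert(0) += 1;
    }
    assert_eq!(counts.get("blob-a"), Some(&1));
    assert_eq!(counts.get("blob-b"), None);
}
```

Filtering first and counting second is the essence of the Bug 2 fix: chunk counts are derived from the post-validation set instead of a premature size / chunk_size estimate.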
40 changes: 40 additions & 0 deletions builder/src/core/context.rs
@@ -597,6 +597,9 @@ impl BlobContext
blob_ctx
.blob_meta_header
.set_encrypted(features.contains(BlobFeatures::ENCRYPTED));
blob_ctx
.blob_meta_header
.set_is_chunkdict_generated(features.contains(BlobFeatures::IS_CHUNKDICT_GENERATED));

blob_ctx
}
@@ -955,6 +958,32 @@ impl BlobManager
}
}

/// Get or create the blob context for a chunkdict blob; used for chunk deduplication.
pub fn get_or_cerate_blob_for_chunkdict(
&mut self,
ctx: &BuildContext,
id: &str,
) -> Result<(u32, &mut BlobContext)> {
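// Reuse the existing blob context when the id is already known; otherwise allocate a new one.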
if self.get_blob_idx_by_id(id).is_none() {
let blob_ctx = Self::new_blob_ctx(ctx)?;
self.current_blob_index = Some(self.alloc_index()?);
self.add_blob(blob_ctx);
} else {
self.current_blob_index = self.get_blob_idx_by_id(id);
}
let (_, blob_ctx) = self.get_current_blob().unwrap();
if blob_ctx.blob_id.is_empty() {
blob_ctx.blob_id = id.to_string();
}
// Safe to unwrap because the blob context has been added.
Ok(self.get_current_blob().unwrap())
}

/// Determine if the given blob has been created.
pub fn has_blob(&self, blob_id: &str) -> bool {
self.get_blob_idx_by_id(blob_id).is_some()
}

/// Set the global chunk dictionary for chunk deduplication.
pub fn set_chunk_dict(&mut self, dict: Arc<dyn ChunkDict>) {
self.global_chunk_dict = dict
@@ -1097,6 +1126,7 @@ impl BlobManager
compressed_blob_size,
blob_features,
flags,
build_ctx.is_chunkdict_generated,
);
}
RafsBlobTable::V6(table) => {
@@ -1116,6 +1146,7 @@
ctx.blob_toc_digest,
ctx.blob_meta_size,
ctx.blob_toc_size,
build_ctx.is_chunkdict_generated,
ctx.blob_meta_header,
ctx.cipher_object.clone(),
ctx.cipher_ctx.clone(),
@@ -1293,6 +1324,9 @@ pub struct BuildContext
pub configuration: Arc<ConfigV2>,
/// Generate the blob cache and blob meta
pub blob_cache_generator: Option<BlobCacheGenerator>,

/// Whether the bootstrap is generated for a chunkdict.
pub is_chunkdict_generated: bool,
}

impl BuildContext {
@@ -1361,6 +1395,7 @@ impl BuildContext
features,
configuration: Arc::new(ConfigV2::default()),
blob_cache_generator: None,
is_chunkdict_generated: false,
}
}

@@ -1379,6 +1414,10 @@
pub fn set_configuration(&mut self, config: Arc<ConfigV2>) {
self.configuration = config;
}

/// Mark whether this build generates a chunkdict bootstrap.
pub fn set_is_chunkdict(&mut self, is_chunkdict: bool) {
self.is_chunkdict_generated = is_chunkdict;
}
}

impl Default for BuildContext {
@@ -1411,6 +1450,7 @@ impl Default for BuildContext
features: Features::new(),
configuration: Arc::new(ConfigV2::default()),
blob_cache_generator: None,
is_chunkdict_generated: false,
}
}
}
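`get_or_cerate_blob_for_chunkdict` above follows a common get-or-create shape: blob contexts live in insertion order behind a stable index, with an id-to-index map so every chunk of the same blob is routed to the same context. A hypothetical miniature of that shape (`BlobRegistry` and `BlobCtx` are illustrative stand-ins, not this crate's API):

```rust
use std::collections::HashMap;

// Hypothetical stand-ins; BlobContext in this patch carries far more state.
#[derive(Default)]
struct BlobCtx {
    blob_id: String,
    current_uncompressed_offset: u64,
}

#[derive(Default)]
struct BlobRegistry {
    blobs: Vec<BlobCtx>,
    index_by_id: HashMap<String, u32>,
}

impl BlobRegistry {
    // Return (index, context) for `id`, creating a fresh context on first sight.
    fn get_or_create(&mut self, id: &str) -> (u32, &mut BlobCtx) {
        let idx = match self.index_by_id.get(id) {
            Some(&i) => i,
            None => {
                let i = self.blobs.len() as u32;
                self.blobs.push(BlobCtx {
                    blob_id: id.to_string(),
                    ..Default::default()
                });
                self.index_by_id.insert(id.to_string(), i);
                i
            }
        };
        (idx, &mut self.blobs[idx as usize])
    }
}

fn main() {
    let mut reg = BlobRegistry::default();
    let (i0, _) = reg.get_or_create("sha256:aaa");
    let (i1, _) = reg.get_or_create("sha256:bbb");
    let (i0_again, ctx) = reg.get_or_create("sha256:aaa");
    ctx.current_uncompressed_offset += 4096; // chunks advance the blob's offset
    assert_eq!(i0, i0_again);
    assert_ne!(i0, i1);
    assert_eq!(reg.blobs[i0 as usize].blob_id, "sha256:aaa");
}
```

The real helper additionally creates contexts through `new_blob_ctx(ctx)` and backfills an empty `blob_id` on first use, but the lookup-then-allocate control flow is the same.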
4 changes: 4 additions & 0 deletions builder/src/lib.rs
@@ -23,6 +23,9 @@ use sha2::Digest;

use self::core::node::{Node, NodeInfo};

pub use self::chunkdict_generator::ChunkdictBlobInfo;
pub use self::chunkdict_generator::ChunkdictChunkInfo;
pub use self::chunkdict_generator::Generator;
pub use self::compact::BlobCompactor;
pub use self::core::bootstrap::Bootstrap;
pub use self::core::chunk_dict::{parse_chunk_dict_arg, ChunkDict, HashChunkDict};
@@ -40,6 +43,7 @@ pub use self::merge::Merger;
pub use self::stargz::StargzBuilder;
pub use self::tarball::TarballBuilder;

mod chunkdict_generator;
mod compact;
mod core;
mod directory;