From a9140a81f55ff71749a484ddda1cd937cf7abc79 Mon Sep 17 00:00:00 2001 From: Stefan Karpinski Date: Sat, 17 Apr 2021 12:44:49 -0400 Subject: [PATCH] support for hardlinks: extract, tree_hash, rewrite MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds support for hardlinks, including: - extracting them by copying the linked file (no hardlink created) - tree hashing them as they are extracted - rewriting by duplicating the linked file This only supports hardlinks whose target is a plain file that has already been seen in the tarball that is being processed. You cannot have a hardlink that appears before the file that is linked. If the target of a hardlink is overwritten later, the link copies the current version of the file at the time of extraction. Tree hashing and rewrite are both consistent with this behavior. It is not supported to extract hardlinks where the link involves symlinks, even if the link refers to a path that would be a file — the target must be a plain file. Close #101. --- README.md | 10 ++++------ src/create.jl | 13 +++++++++++-- src/extract.jl | 47 +++++++++++++++++++++++++++++++++++++++-------- src/header.jl | 12 +++++++++--- 4 files changed, 63 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index d7dd413..55c1eb4 100644 --- a/README.md +++ b/README.md @@ -370,18 +370,16 @@ supports only the following file types: * plain files * directories * symlinks +* hardlinks (extracted as copies of plain files) The `Tar` package does not support other file types that the TAR format can -represent, including: hard links, character devices, block devices, and FIFOs. -If you attempt to create or extract an archive that contains any of these kinds -of entries, `Tar` will raise an error. You can, however, list the contents of a +represent, including: character devices, block devices, and FIFOs. If you +attempt to create or extract an archive that contains any of these kinds of +entries, `Tar` will raise an error. You can, however, list the contents of a tarball containing other kinds of entries by passing the `strict=false` flag to the `list` function; without this option, `list` raises the same error as `extract` would. -In the future, optional support may be added for using hard links within -archives to avoid duplicating identical files. - ### Time Stamps Also in accordance with its design goal as a data transfer tool, the `Tar` diff --git a/src/create.jl b/src/create.jl index 639958f..e8849be 100644 --- a/src/create.jl +++ b/src/create.jl @@ -54,8 +54,17 @@ function rewrite_tarball( end node = node′ end - node[name] = (hdr, position(old_tar)) - skip_data(old_tar, hdr.size) + if hdr.type == :hardlink + node′ = tree + for part in split(hdr.link, '/') + node′ = node′[part] + end + hdr′ = Header(node′[1], path=hdr.path, mode=hdr.mode) + node[name] = (hdr′, node′[2]) + else + node[name] = (hdr, position(old_tar)) + skip_data(old_tar, hdr.size) + end end write_tarball(new_tar, tree, buf=buf) do node, tar_path if node isa Dict diff --git a/src/extract.jl b/src/extract.jl index cf6d7e5..454c5be 100644 --- a/src/extract.jl +++ b/src/extract.jl @@ -77,8 +77,17 @@ function extract_tarball( mkdir(sys_path) elseif hdr.type == :symlink copy_symlinks || symlink(hdr.link, sys_path) + elseif hdr.type == :hardlink + src_path = joinpath(root, hdr.link) + @assert isfile(src_path) + cp(src_path, sys_path) elseif hdr.type == :file read_data(tar, sys_path, size=hdr.size, buf=buf) + else # should already be caught by check_header + error("unsupported tarball entry type: $(hdr.type)") + end + # apply tarball permissions + if hdr.type in (:file, :hardlink) exec = 0o100 & hdr.mode != 0 tar_mode = exec ? 0o755 : 0o644 sys_mode = filemode(sys_path) @@ -91,8 +100,6 @@ function extract_tarball( # we don't have a way to do that afaik end chmod(sys_path, tar_mode & sys_mode) - else # should already be caught by check_header - error("unsupported tarball entry type: $(hdr.type)") end end copy_symlinks || return @@ -216,12 +223,18 @@ function git_tree_hash( if hdr.type == :directory node[name] = Dict{String,Any}() return - end - if hdr.type == :symlink + elseif hdr.type == :symlink mode = "120000" hash = git_object_hash("blob", HashType) do io write(io, hdr.link) end + elseif hdr.type == :hardlink + mode = iszero(hdr.mode & 0o100) ? "100644" : "100755" + node′ = tree + for part in split(hdr.link, '/') + node′ = node′[part] + end + hash = node′[2] # hash of linked file elseif hdr.type == :file mode = iszero(hdr.mode & 0o100) ? "100644" : "100755" hash = git_file_hash(tar, hdr.size, HashType, buf=buf) @@ -342,17 +355,35 @@ function read_tarball( # normalize path and check for symlink attacks path = "" for part in split(hdr.path, '/') - (isempty(part) || part == ".") && continue # check_header doesn't allow ".." in path + (isempty(part) || part == ".") && continue get(paths, path, nothing) isa String && error(""" - Refusing to extract path with symlink prefix, possible attack - * path to extract: $(repr(hdr.path)) - * symlink prefix: $(repr(path)) + Refusing to extract path with symlink prefix [possible attack] + * path: $(repr(hdr.path)) + * prefix: $(repr(path)) """) isempty(path) || (paths[path] = :directory) path = isempty(path) ? part : "$path/$part" end paths[path] = hdr.type == :symlink ? hdr.link : hdr.type + # check that hardlinks refer to already-seen files + if hdr.type == :hardlink + parts = split(hdr.link, '/') + filter!(parts) do part + # check_header doesn't allow ".." in link + !isempty(part) && part != "." + end + link = join(parts, '/') + type = get(paths, link, Symbol("non-existent")) + type == :file || error(""" + Refusing to extract hardlink with $type target [possible attack] + * path: $(repr(hdr.path)) + * target: $(repr(hdr.link)) + """) + # use normalized link path + hdr = Header(hdr, link=link) + end + # apply callback, checking that it consumes IO correctly before = applicable(position, tar) ? position(tar) : 0 callback(hdr, split(path, '/', keepempty=false)) applicable(position, tar) || continue diff --git a/src/header.jl b/src/header.jl index 49db676..d1e3601 100644 --- a/src/header.jl +++ b/src/header.jl @@ -99,12 +99,18 @@ function check_header(hdr::Header) err("path is absolute") occursin(r"(^|/)\.\.(/|$)", hdr.path) && err("path contains '..' component") - hdr.type in (:file, :symlink, :directory) || + hdr.type in (:file, :hardlink, :symlink, :directory) || err("unsupported entry type") hdr.type ∉ (:hardlink, :symlink) && !isempty(hdr.link) && err("non-link with link path") - hdr.type == :symlink && hdr.size != 0 && - err("symlink with non-zero size") + hdr.type ∈ (:hardlink, :symlink) && isempty(hdr.link) && + err("$(hdr.type) with empty link path") + hdr.type ∈ (:hardlink, :symlink) && hdr.size != 0 && + err("$(hdr.type) with non-zero size") + hdr.type == :hardlink && hdr.link[1] == '/' && + err("hardlink with absolute link path") + hdr.type == :hardlink && occursin(r"(^|/)\.\.(/|$)", hdr.link) && + err("hardlink contains '..' component") hdr.type == :directory && hdr.size != 0 && err("directory with non-zero size") hdr.type != :directory && endswith(hdr.path, "/") &&