Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add s2 commandline tools #160

Merged
merged 2 commits into from
Sep 14, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ script:
- go test -v -cpu=2 ./...
- go test -cpu=2 -tags=noasm ./...
- go test -cpu=1,4 -short -race ./...
- go build github.com/klauspost/compress/s2/cmd/s2c && go build github.com/klauspost/compress/s2/cmd/s2d && s2c && s2d
- GOOS=linux GOARCH=386 go install ./...

matrix:
Expand Down
64 changes: 63 additions & 1 deletion s2/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,69 @@ Similar to the Writer, a Reader can be reused using the `Reset` method.
For smaller data blocks, there is also a non-streaming interface: `Encode()`, `EncodeBetter()` and `Decode()`.
Do however note that these functions (similar to Snappy) do not provide validation of data,
so data corruption may be undetected. Stream encoding provides CRC checks of data.


# Commandline tools

Some very simple command line tools are provided; `s2c` for compression and `s2d` for decompression.

Installing these requires Go to be installed. To install them, use:

`go install github.com/klauspost/compress/s2/cmd/s2c && go install github.com/klauspost/compress/s2/cmd/s2d`

To build binaries to the current folder use:

`go build github.com/klauspost/compress/s2/cmd/s2c && go build github.com/klauspost/compress/s2/cmd/s2d`


## s2c

```
Usage: s2c [options] file1 file2

Compresses all files supplied as input separately.
Output files are written as 'filename.ext.s2'.
By default output files will be overwritten.
Use - as the only file name to read from stdin and write to stdout.

Wildcards are accepted: testdir/*.txt will compress all files in testdir ending with .txt
Directories can be wildcards as well. testdir/*/*.txt will match testdir/subdir/b.txt

Options:
-blocksize string
Max block size. Examples: 64K, 256K, 1M, 4M. Must be power of two and <= 4MB (default "1M")
-c Write all output to stdout. Multiple input files will be concatenated.
-cpu int
Compress using this amount of threads (default Auto)
-faster
Compress faster, but with a minor compression loss
-help
Display help
-pad string
Pad size to a multiple of this value, Examples: 64K, 256K, 1M, 4M, etc (default "1")
-safe
Do not overwrite output files
```

## s2d

```
Usage: s2d [options] file1 file2

Decompresses all files supplied as input. Input files must end with '.s2' or '.snappy'.
Output file names have the extension removed. By default output files will be overwritten.
Use - as the only file name to read from stdin and write to stdout.

Wildcards are accepted: testdir/*.txt will decompress all files in testdir ending with .txt
Directories can be wildcards as well. testdir/*/*.txt will match testdir/subdir/b.txt

Options:
-c Write all output to stdout. Multiple input files will be concatenated.
-help
Display help
-safe
Do not overwrite output files
```

# Performance

This section will focus on comparisons to Snappy.
Expand Down
169 changes: 169 additions & 0 deletions s2/cmd/s2c/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
package main

import (
"bufio"
"errors"
"flag"
"fmt"
"io"
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
"time"
"unicode"

"github.com/klauspost/compress/s2"
)

// Command line flags for s2c. Defaults match the usage text documented in
// the s2 README. -cpu defaults to the number of usable CPUs at startup.
var (
faster = flag.Bool("faster", false, "Compress faster, but with a minor compression loss")
cpu = flag.Int("cpu", runtime.GOMAXPROCS(0), "Compress using this amount of threads")
blockSize = flag.String("blocksize", "1M", "Max block size. Examples: 64K, 256K, 1M, 4M. Must be power of two and <= 4MB")
safe = flag.Bool("safe", false, "Do not overwrite output files")
padding = flag.String("pad", "1", "Pad size to a multiple of this value, Examples: 64K, 256K, 1M, 4M, etc")
stdout = flag.Bool("c", false, "Write all output to stdout. Multiple input files will be concatenated.")
help = flag.Bool("help", false, "Display help")
)

// main compresses every file named on the command line (or stdin when the
// only argument is "-") to the s2 format, writing each result next to the
// input as 'filename.ext.s2'. Any error terminates the program via exitErr.
func main() {
	flag.Parse()
	sz, err := toSize(*blockSize)
	exitErr(err)
	pad, err := toSize(*padding)
	exitErr(err)

	args := flag.Args()
	if len(args) == 0 || *help {
		fmt.Println(`Usage: s2c [options] file1 file2

Compresses all files supplied as input separately.
Output files are written as 'filename.ext.s2'.
By default output files will be overwritten.
Use - as the only file name to read from stdin and write to stdout.

Wildcards are accepted: testdir/*.txt will compress all files in testdir ending with .txt
Directories can be wildcards as well. testdir/*/*.txt will match testdir/subdir/b.txt

Options:`)
		flag.PrintDefaults()
		// Fix: without this exit, 's2c -help file1' printed the usage text
		// and then went on to compress file1 anyway.
		os.Exit(0)
	}
	opts := []s2.WriterOption{s2.WriterBlockSize(int(sz)), s2.WriterConcurrency(*cpu), s2.WriterPadding(int(pad))}
	if !*faster {
		opts = append(opts, s2.WriterBetterCompression())
	}
	wr := s2.NewWriter(nil, opts...)

	// A single "-" argument streams stdin -> stdout.
	if len(args) == 1 && args[0] == "-" {
		wr.Reset(os.Stdout)
		_, err := io.Copy(wr, os.Stdin)
		exitErr(err)
		exitErr(wr.Close())
		return
	}

	// Expand wildcard patterns; every pattern must match at least one file.
	var files []string
	for _, pattern := range args {
		found, err := filepath.Glob(pattern)
		exitErr(err)
		if len(found) == 0 {
			exitErr(fmt.Errorf("unable to find file %v", pattern))
		}
		files = append(files, found...)
	}

	for _, filename := range files {
		// Each file is handled in its own closure so the defers (file close,
		// buffer flush, writer close) fire at the end of every iteration
		// instead of accumulating until main returns.
		func() {
			dstFilename := fmt.Sprintf("%s%s", filename, ".s2")
			if !*stdout {
				fmt.Println("Compressing", filename, "->", dstFilename)
			}
			// Input file.
			file, err := os.Open(filename)
			exitErr(err)
			defer file.Close()
			src := bufio.NewReaderSize(file, int(sz)*2)
			finfo, err := file.Stat()
			exitErr(err)
			var out io.Writer
			if *stdout {
				out = os.Stdout
			} else {
				mode := finfo.Mode() // use the same mode for the output file
				if *safe {
					_, err := os.Stat(dstFilename)
					if !os.IsNotExist(err) {
						exitErr(errors.New("destination file exists"))
					}
				}
				// Fix: O_TRUNC added — without it, overwriting an existing,
				// longer destination left stale trailing bytes in the file.
				dstFile, err := os.OpenFile(dstFilename, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, mode)
				exitErr(err)
				defer dstFile.Close()
				bw := bufio.NewWriterSize(dstFile, int(sz)*2)
				defer bw.Flush()
				out = bw
			}
			// wCounter counts compressed output bytes for the ratio report.
			wc := wCounter{out: out}
			wr.Reset(&wc)
			defer wr.Close()
			start := time.Now()
			input, err := wr.ReadFrom(src)
			exitErr(err)
			err = wr.Close()
			exitErr(err)
			if !*stdout {
				elapsed := time.Since(start)
				mbpersec := (float64(input) / (1024 * 1024)) / (float64(elapsed) / (float64(time.Second)))
				pct := float64(wc.n) * 100 / float64(input)
				fmt.Printf("%d -> %d [%.02f%%]; %dMB/s\n", input, wc.n, pct, int(mbpersec))
			}
		}()
	}
}

// exitErr prints err to stderr and terminates the process with exit code 2.
// It does nothing when err is nil, so call sites can use it unconditionally.
func exitErr(err error) {
	if err == nil {
		return
	}
	fmt.Fprintln(os.Stderr, "ERROR:", err.Error())
	os.Exit(2)
}

// toSize converts a size indication to bytes.
func toSize(size string) (uint64, error) {
size = strings.ToUpper(strings.TrimSpace(size))
firstLetter := strings.IndexFunc(size, unicode.IsLetter)
if firstLetter == -1 {
firstLetter = len(size)
}

bytesString, multiple := size[:firstLetter], size[firstLetter:]
bytes, err := strconv.ParseUint(bytesString, 10, 64)
if err != nil {
return 0, fmt.Errorf("unable to parse size: %v", err)
}

switch multiple {
case "M", "MB", "MIB":
return bytes * 1 << 20, nil
case "K", "KB", "KIB":
return bytes * 1 << 10, nil
case "B", "":
return bytes, nil
default:
return 0, fmt.Errorf("unknown size suffix: %v", multiple)
}
}

type wCounter struct {
n int
out io.Writer
}

func (w *wCounter) Write(p []byte) (n int, err error) {
n, err = w.out.Write(p)
w.n += n
return n, err

}
132 changes: 132 additions & 0 deletions s2/cmd/s2d/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
package main

import (
"bufio"
"errors"
"flag"
"fmt"
"io"
"os"
"path/filepath"
"strings"
"time"

"github.com/klauspost/compress/s2"
)

// Command line flags for s2d. Defaults match the usage text documented in
// the s2 README.
var (
safe = flag.Bool("safe", false, "Do not overwrite output files")
stdout = flag.Bool("c", false, "Write all output to stdout. Multiple input files will be concatenated.")
help = flag.Bool("help", false, "Display help")
)

// main decompresses every '.s2' or '.snappy' file named on the command line
// (or stdin when the only argument is "-"), writing each result to the input
// name with the extension removed. Any error terminates via exitErr.
func main() {
	flag.Parse()
	r := s2.NewReader(nil)

	// No args, use stdin/stdout
	args := flag.Args()
	if len(args) == 0 || *help {
		fmt.Println(`Usage: s2d [options] file1 file2

Decompresses all files supplied as input. Input files must end with '.s2' or '.snappy'.
Output file names have the extension removed. By default output files will be overwritten.
Use - as the only file name to read from stdin and write to stdout.

Wildcards are accepted: testdir/*.txt will decompress all files in testdir ending with .txt
Directories can be wildcards as well. testdir/*/*.txt will match testdir/subdir/b.txt

Options:`)
		flag.PrintDefaults()
		// Fix: without this exit, 's2d -help file1' printed the usage text
		// and then went on to decompress file1 anyway.
		os.Exit(0)
	}
	if len(args) == 1 && args[0] == "-" {
		r.Reset(os.Stdin)
		_, err := io.Copy(os.Stdout, r)
		exitErr(err)
		return
	}

	// Expand wildcard patterns; every pattern must match at least one file.
	var files []string
	for _, pattern := range args {
		found, err := filepath.Glob(pattern)
		exitErr(err)
		if len(found) == 0 {
			exitErr(fmt.Errorf("unable to find file %v", pattern))
		}
		files = append(files, found...)
	}

	for _, filename := range files {
		// Derive the output name from the extension; skip unknown files.
		dstFilename := filename
		switch {
		case strings.HasSuffix(filename, ".s2"):
			dstFilename = strings.TrimSuffix(filename, ".s2")
		case strings.HasSuffix(filename, ".snappy"):
			dstFilename = strings.TrimSuffix(filename, ".snappy")
		default:
			fmt.Println("Skipping", filename)
			continue
		}

		// Each file is handled in its own closure so the defers (file close,
		// buffer flush) fire at the end of every iteration.
		func() {
			if !*stdout {
				fmt.Println("Decompressing", filename, "->", dstFilename)
			}
			// Input file.
			file, err := os.Open(filename)
			exitErr(err)
			defer file.Close()
			// rCounter counts compressed input bytes for the ratio report.
			rc := rCounter{in: file}
			src := bufio.NewReaderSize(&rc, 4<<20)
			finfo, err := file.Stat()
			exitErr(err)
			var out io.Writer
			if *stdout {
				out = os.Stdout
			} else {
				mode := finfo.Mode() // use the same mode for the output file
				// Fix: only check -safe when actually writing a file; the
				// check used to run with -c too and could abort a pure
				// stdout run just because the destination name existed.
				if *safe {
					_, err := os.Stat(dstFilename)
					if !os.IsNotExist(err) {
						exitErr(errors.New("destination file exists"))
					}
				}
				// Fix: O_TRUNC added — without it, overwriting an existing,
				// longer destination left stale trailing bytes in the file.
				dstFile, err := os.OpenFile(dstFilename, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, mode)
				exitErr(err)
				defer dstFile.Close()
				// Fix: the buffered writer must wrap dstFile. The original
				// wrapped 'out', which is still a nil io.Writer at this
				// point, so the first file write panicked.
				bw := bufio.NewWriterSize(dstFile, 4<<20)
				defer bw.Flush()
				out = bw
			}
			r.Reset(src)
			start := time.Now()
			output, err := io.Copy(out, r)
			exitErr(err)
			if !*stdout {
				elapsed := time.Since(start)
				mbPerSec := (float64(output) / (1024 * 1024)) / (float64(elapsed) / (float64(time.Second)))
				pct := float64(output) * 100 / float64(rc.n)
				fmt.Printf("%d -> %d [%.02f%%]; %dMB/s\n", rc.n, output, pct, int(mbPerSec))
			}
		}()
	}
}

// exitErr reports err on stderr and stops the program with exit code 2.
// A nil error is ignored, allowing unconditional use after fallible calls.
func exitErr(err error) {
	if err == nil {
		return
	}
	fmt.Fprintln(os.Stderr, "ERROR:", err.Error())
	os.Exit(2)
}

type rCounter struct {
n int
in io.Reader
}

func (w *rCounter) Read(p []byte) (n int, err error) {
n, err = w.in.Read(p)
w.n += n
return n, err

}