From 94f36847a3e720ecc2546d2ff8a85ddc681316b7 Mon Sep 17 00:00:00 2001
From: Klaus Post
Date: Thu, 12 Sep 2019 14:52:09 -0700
Subject: [PATCH 1/2] Add s2 commandline tools

---
 .travis.yml        |   1 +
 s2/README.md       |  64 ++++++++++++++++-
 s2/cmd/s2c/main.go | 169 +++++++++++++++++++++++++++++++++++++++++++++
 s2/cmd/s2d/main.go | 132 +++++++++++++++++++++++++++++++++++
 4 files changed, 365 insertions(+), 1 deletion(-)
 create mode 100644 s2/cmd/s2c/main.go
 create mode 100644 s2/cmd/s2d/main.go

diff --git a/.travis.yml b/.travis.yml
index 4ae7712305..22d94138dd 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -21,6 +21,7 @@ script:
  - go test -v -cpu=2 ./...
  - go test -cpu=2 -tags=noasm ./...
  - go test -cpu=1,4 -short -race ./...
+ - go build github.com/klauspost/compress/s2/cmd/s2c && go build github.com/klauspost/compress/s2/cmd/s2d && ./s2c && ./s2d
  - GOOS=linux GOARCH=386 go install ./...
 
 matrix:
diff --git a/s2/README.md b/s2/README.md
index 81657eda1f..11343474dd 100644
--- a/s2/README.md
+++ b/s2/README.md
@@ -72,7 +72,69 @@ Similar to the Writer, a Reader can be reused using the `Reset` method.
 
 For smaller data blocks, there is also a non-streaming interface: `Encode()`, `EncodeBetter()` and `Decode()`. Do however note that these functions (similar to Snappy) does not provide validation of data, so data corruption may be undetected. Stream encoding provides CRC checks of data.
 
-
+
+# Commandline tools
+
+Some very simple commandline tools are provided; `s2c` for compression and `s2d` for decompression.
+
+Installing them requires Go to be installed. To install them, use:
+
+`go install github.com/klauspost/compress/s2/cmd/s2c && go install github.com/klauspost/compress/s2/cmd/s2d`
+
+To build binaries in the current folder, use:
+
+`go build github.com/klauspost/compress/s2/cmd/s2c && go build github.com/klauspost/compress/s2/cmd/s2d`
+
+
+## s2c
+
+```
+Usage: s2c [options] file1 file2
+
+Compresses all files supplied as input separately.
+Output files are written as 'filename.ext.s2'.
+By default output files will be overwritten.
+Use - as the only file name to read from stdin and write to stdout.
+
+Wildcards are accepted: testdir/*.txt will compress all files in testdir ending with .txt
+Directories can be wildcards as well. testdir/*/*.txt will match testdir/subdir/b.txt
+
+Options:
+  -blocksize string
+    	Max block size. Examples: 64K, 256K, 1M, 4M. Must be power of two and <= 4MB (default "1M")
+  -c	Write all output to stdout. Multiple input files will be concatenated.
+  -cpu int
+    	Compress using this amount of threads (default 12)
+  -faster
+    	Compress faster, but with a minor compression loss
+  -help
+    	Display help
+  -padding string
+    	Pad size to a multiple of this value, Examples: 64K, 256K, 1M, 4M, etc (default "1")
+  -safe
+    	Do not overwrite output files
+```
+
+## s2d
+
+```
+Usage: s2d [options] file1 file2
+
+Decompresses all files supplied as input. Input files must end with '.s2' or '.snappy'.
+Output file names have the extension removed. By default output files will be overwritten.
+Use - as the only file name to read from stdin and write to stdout.
+
+Wildcards are accepted: testdir/*.txt will decompress all files in testdir ending with .txt
+Directories can be wildcards as well. testdir/*/*.txt will match testdir/subdir/b.txt
+
+Options:
+  -c	Write all output to stdout. Multiple input files will be concatenated.
+  -help
+    	Display help
+  -safe
+    	Do not overwrite output files
+```
+
 # Performance
 
 This section will focus on comparisons to Snappy.
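The same compression path that `s2c` wires up below can also be used directly from library code. The following is a minimal sketch only, assuming nothing beyond the `s2.NewWriter` options the tool itself uses (`WriterBlockSize`, `WriterConcurrency`, `WriterBetterCompression`); the file names are illustrative:

```Go
// Sketch: roughly what `s2c file.txt` does by default, minus flags,
// padding and progress reporting.
package main

import (
	"fmt"
	"io"
	"os"
	"runtime"

	"github.com/klauspost/compress/s2"
)

func compressFile(src, dst string) error {
	in, err := os.Open(src)
	if err != nil {
		return err
	}
	defer in.Close()

	out, err := os.Create(dst)
	if err != nil {
		return err
	}
	defer out.Close()

	// Mirror the s2c defaults: 1 MB blocks, one encoder goroutine per CPU,
	// and the slower-but-smaller "better" mode (s2c -faster drops it).
	enc := s2.NewWriter(out,
		s2.WriterBlockSize(1<<20),
		s2.WriterConcurrency(runtime.GOMAXPROCS(0)),
		s2.WriterBetterCompression(),
	)
	if _, err := io.Copy(enc, in); err != nil {
		enc.Close()
		return err
	}
	// Close flushes pending blocks and finishes the stream; its error matters.
	return enc.Close()
}

func main() {
	if err := compressFile("file.txt", "file.txt.s2"); err != nil {
		fmt.Fprintln(os.Stderr, "ERROR:", err)
		os.Exit(2)
	}
}
```

The `-padding` option of the tool maps onto `s2.WriterPadding` in the same way.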
diff --git a/s2/cmd/s2c/main.go b/s2/cmd/s2c/main.go
new file mode 100644
index 0000000000..7a973f7acd
--- /dev/null
+++ b/s2/cmd/s2c/main.go
@@ -0,0 +1,169 @@
+package main
+
+import (
+	"bufio"
+	"errors"
+	"flag"
+	"fmt"
+	"io"
+	"os"
+	"path/filepath"
+	"runtime"
+	"strconv"
+	"strings"
+	"time"
+	"unicode"
+
+	"github.com/klauspost/compress/s2"
+)
+
+var (
+	faster    = flag.Bool("faster", false, "Compress faster, but with a minor compression loss")
+	cpu       = flag.Int("cpu", runtime.GOMAXPROCS(0), "Compress using this amount of threads")
+	blockSize = flag.String("blocksize", "1M", "Max block size. Examples: 64K, 256K, 1M, 4M. Must be power of two and <= 4MB")
+	safe      = flag.Bool("safe", false, "Do not overwrite output files")
+	padding   = flag.String("padding", "1", "Pad size to a multiple of this value, Examples: 64K, 256K, 1M, 4M, etc")
+	stdout    = flag.Bool("c", false, "Write all output to stdout. Multiple input files will be concatenated.")
+	help      = flag.Bool("help", false, "Display help")
+)
+
+func main() {
+	flag.Parse()
+	sz, err := toSize(*blockSize)
+	exitErr(err)
+	pad, err := toSize(*padding)
+	exitErr(err)
+
+	args := flag.Args()
+	if len(args) == 0 || *help {
+		fmt.Println(`Usage: s2c [options] file1 file2
+
+Compresses all files supplied as input separately.
+Output files are written as 'filename.ext.s2'.
+By default output files will be overwritten.
+Use - as the only file name to read from stdin and write to stdout.
+
+Wildcards are accepted: testdir/*.txt will compress all files in testdir ending with .txt
+Directories can be wildcards as well. testdir/*/*.txt will match testdir/subdir/b.txt
+
+Options:`)
+		flag.PrintDefaults()
+	}
+	opts := []s2.WriterOption{s2.WriterBlockSize(int(sz)), s2.WriterConcurrency(*cpu), s2.WriterPadding(int(pad))}
+	if !*faster {
+		opts = append(opts, s2.WriterBetterCompression())
+	}
+	wr := s2.NewWriter(nil, opts...)
+
+	// Use stdin/stdout if '-' is the only file name given.
+	if len(args) == 1 && args[0] == "-" {
+		wr.Reset(os.Stdout)
+		_, err := io.Copy(wr, os.Stdin)
+		exitErr(err)
+		exitErr(wr.Close())
+		return
+	}
+	var files []string
+
+	for _, pattern := range args {
+		found, err := filepath.Glob(pattern)
+		exitErr(err)
+		if len(found) == 0 {
+			exitErr(fmt.Errorf("unable to find file %v", pattern))
+		}
+		files = append(files, found...)
+	}
+
+	for _, filename := range files {
+		func() {
+			dstFilename := fmt.Sprintf("%s%s", filename, ".s2")
+			if !*stdout {
+				fmt.Println("Compressing", filename, "->", dstFilename)
+			}
+			// Input file.
+			file, err := os.Open(filename)
+			exitErr(err)
+			defer file.Close()
+			src := bufio.NewReaderSize(file, int(sz)*2)
+			finfo, err := file.Stat()
+			exitErr(err)
+			var out io.Writer
+			if *stdout {
+				out = os.Stdout
+			} else {
+				mode := finfo.Mode() // use the same mode for the output file
+				if *safe {
+					_, err := os.Stat(dstFilename)
+					if !os.IsNotExist(err) {
+						exitErr(errors.New("destination file exists"))
+					}
+				}
+				dstFile, err := os.OpenFile(dstFilename, os.O_CREATE|os.O_WRONLY, mode)
+				exitErr(err)
+				defer dstFile.Close()
+				bw := bufio.NewWriterSize(dstFile, int(sz)*2)
+				defer bw.Flush()
+				out = bw
+			}
+			wc := wCounter{out: out}
+			wr.Reset(&wc)
+			defer wr.Close()
+			start := time.Now()
+			input, err := wr.ReadFrom(src)
+			exitErr(err)
+			err = wr.Close()
+			exitErr(err)
+			if !*stdout {
+				elapsed := time.Since(start)
+				mbpersec := (float64(input) / (1024 * 1024)) / (float64(elapsed) / (float64(time.Second)))
+				pct := float64(wc.n) * 100 / float64(input)
+				fmt.Printf("%d -> %d [%.02f%%]; %dMB/s\n", input, wc.n, pct, int(mbpersec))
+			}
+		}()
+	}
+}
+
+func exitErr(err error) {
+	if err != nil {
+		fmt.Fprintln(os.Stderr, "ERROR:", err.Error())
+		os.Exit(2)
+	}
+}
+
+// toSize converts a size indication to bytes.
+func toSize(size string) (uint64, error) {
+	size = strings.ToUpper(strings.TrimSpace(size))
+	firstLetter := strings.IndexFunc(size, unicode.IsLetter)
+	if firstLetter == -1 {
+		firstLetter = len(size)
+	}
+
+	bytesString, multiple := size[:firstLetter], size[firstLetter:]
+	bytes, err := strconv.ParseUint(bytesString, 10, 64)
+	if err != nil {
+		return 0, fmt.Errorf("unable to parse size: %v", err)
+	}
+
+	switch multiple {
+	case "M", "MB", "MIB":
+		return bytes * 1 << 20, nil
+	case "K", "KB", "KIB":
+		return bytes * 1 << 10, nil
+	case "B", "":
+		return bytes, nil
+	default:
+		return 0, fmt.Errorf("unknown size suffix: %v", multiple)
+	}
+}
+
+type wCounter struct {
+	n   int
+	out io.Writer
+}
+
+func (w *wCounter) Write(p []byte) (n int, err error) {
+	n, err = w.out.Write(p)
+	w.n += n
+	return n, err
+
+}
diff --git a/s2/cmd/s2d/main.go b/s2/cmd/s2d/main.go
new file mode 100644
index 0000000000..94a43cfd57
--- /dev/null
+++ b/s2/cmd/s2d/main.go
@@ -0,0 +1,132 @@
+package main
+
+import (
+	"bufio"
+	"errors"
+	"flag"
+	"fmt"
+	"io"
+	"os"
+	"path/filepath"
+	"strings"
+	"time"
+
+	"github.com/klauspost/compress/s2"
+)
+
+var (
+	safe   = flag.Bool("safe", false, "Do not overwrite output files")
+	stdout = flag.Bool("c", false, "Write all output to stdout. Multiple input files will be concatenated.")
+	help   = flag.Bool("help", false, "Display help")
+)
+
+func main() {
+	flag.Parse()
+	r := s2.NewReader(nil)
+
+	// No args, use stdin/stdout
+	args := flag.Args()
+	if len(args) == 0 || *help {
+		fmt.Println(`Usage: s2d [options] file1 file2
+
+Decompresses all files supplied as input. Input files must end with '.s2' or '.snappy'.
+Output file names have the extension removed. By default output files will be overwritten.
+Use - as the only file name to read from stdin and write to stdout.
+
+Wildcards are accepted: testdir/*.txt will decompress all files in testdir ending with .txt
+Directories can be wildcards as well. testdir/*/*.txt will match testdir/subdir/b.txt
+
+Options:`)
+		flag.PrintDefaults()
+	}
+	if len(args) == 1 && args[0] == "-" {
+		r.Reset(os.Stdin)
+		_, err := io.Copy(os.Stdout, r)
+		exitErr(err)
+		return
+	}
+	var files []string
+
+	for _, pattern := range args {
+		found, err := filepath.Glob(pattern)
+		exitErr(err)
+		if len(found) == 0 {
+			exitErr(fmt.Errorf("unable to find file %v", pattern))
+		}
+		files = append(files, found...)
+	}
+
+	for _, filename := range files {
+		dstFilename := filename
+		switch {
+		case strings.HasSuffix(filename, ".s2"):
+			dstFilename = strings.TrimSuffix(filename, ".s2")
+		case strings.HasSuffix(filename, ".snappy"):
+			dstFilename = strings.TrimSuffix(filename, ".snappy")
+		default:
+			fmt.Println("Skipping", filename)
+			continue
+		}
+
+		func() {
+			if !*stdout {
+				fmt.Println("Decompressing", filename, "->", dstFilename)
+			}
+			// Input file.
+			file, err := os.Open(filename)
+			exitErr(err)
+			defer file.Close()
+			rc := rCounter{in: file}
+			src := bufio.NewReaderSize(&rc, 4<<20)
+			finfo, err := file.Stat()
+			exitErr(err)
+			mode := finfo.Mode() // use the same mode for the output file
+			if *safe {
+				_, err := os.Stat(dstFilename)
+				if !os.IsNotExist(err) {
+					exitErr(errors.New("destination file exists"))
+				}
+			}
+			var out io.Writer
+			if *stdout {
+				out = os.Stdout
+			} else {
+				dstFile, err := os.OpenFile(dstFilename, os.O_CREATE|os.O_WRONLY, mode)
+				exitErr(err)
+				defer dstFile.Close()
+				bw := bufio.NewWriterSize(dstFile, 4<<20)
+				defer bw.Flush()
+				out = bw
+			}
+			r.Reset(src)
+			start := time.Now()
+			output, err := io.Copy(out, r)
+			exitErr(err)
+			if !*stdout {
+				elapsed := time.Since(start)
+				mbPerSec := (float64(output) / (1024 * 1024)) / (float64(elapsed) / (float64(time.Second)))
+				pct := float64(output) * 100 / float64(rc.n)
+				fmt.Printf("%d -> %d [%.02f%%]; %dMB/s\n", rc.n, output, pct, int(mbPerSec))
+			}
+		}()
+	}
+}
+
+func exitErr(err error) {
+	if err != nil {
+		fmt.Fprintln(os.Stderr, "ERROR:", err.Error())
+		os.Exit(2)
+	}
+}
+
+type rCounter struct {
+	n  int
+	in io.Reader
+}
+
+func (w *rCounter) Read(p []byte) (n int, err error) {
+	n, err = w.in.Read(p)
+	w.n += n
+	return n, err
+
+}

From 6049fa6277a3d172ced8e7e217cf2624bf38b2b1 Mon Sep 17 00:00:00 2001
From: Klaus Post
Date: Thu, 12 Sep 2019 14:59:30 -0700
Subject: [PATCH 2/2] Tweaks

---
 s2/README.md       | 4 ++--
 s2/cmd/s2c/main.go | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/s2/README.md b/s2/README.md
index 11343474dd..61b32a6cb5 100644
--- a/s2/README.md
+++ b/s2/README.md
@@ -104,12 +104,12 @@ Options:
     	Max block size. Examples: 64K, 256K, 1M, 4M. Must be power of two and <= 4MB (default "1M")
   -c	Write all output to stdout. Multiple input files will be concatenated.
   -cpu int
-    	Compress using this amount of threads (default 12)
+    	Compress using this amount of threads (default Auto)
   -faster
     	Compress faster, but with a minor compression loss
   -help
     	Display help
-  -padding string
+  -pad string
     	Pad size to a multiple of this value, Examples: 64K, 256K, 1M, 4M, etc (default "1")
   -safe
     	Do not overwrite output files
diff --git a/s2/cmd/s2c/main.go b/s2/cmd/s2c/main.go
index 7a973f7acd..2a8b062dc3 100644
--- a/s2/cmd/s2c/main.go
+++ b/s2/cmd/s2c/main.go
@@ -22,7 +22,7 @@ var (
 	cpu       = flag.Int("cpu", runtime.GOMAXPROCS(0), "Compress using this amount of threads")
 	blockSize = flag.String("blocksize", "1M", "Max block size. Examples: 64K, 256K, 1M, 4M. Must be power of two and <= 4MB")
 	safe      = flag.Bool("safe", false, "Do not overwrite output files")
-	padding   = flag.String("padding", "1", "Pad size to a multiple of this value, Examples: 64K, 256K, 1M, 4M, etc")
+	padding   = flag.String("pad", "1", "Pad size to a multiple of this value, Examples: 64K, 256K, 1M, 4M, etc")
 	stdout    = flag.Bool("c", false, "Write all output to stdout. Multiple input files will be concatenated.")
 	help      = flag.Bool("help", false, "Display help")
 )
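
For the decompression side, which the second patch leaves untouched, a similarly minimal sketch uses only the `s2.NewReader` streaming API already present in the patch; file names are again illustrative, and the framed stream carries CRC checks, so corruption is detected during decoding:

```Go
// Sketch: roughly what `s2d file.txt.s2` does, minus flags and reporting.
package main

import (
	"fmt"
	"io"
	"os"

	"github.com/klauspost/compress/s2"
)

func decompressFile(src, dst string) error {
	in, err := os.Open(src)
	if err != nil {
		return err
	}
	defer in.Close()

	out, err := os.Create(dst)
	if err != nil {
		return err
	}
	defer out.Close()

	// The reader decodes the framed stream and validates it as it is consumed.
	_, err = io.Copy(out, s2.NewReader(in))
	return err
}

func main() {
	if err := decompressFile("file.txt.s2", "file.txt"); err != nil {
		fmt.Fprintln(os.Stderr, "ERROR:", err)
		os.Exit(2)
	}
}
```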