Skip to content

Commit

Permalink
add pdf2plain ctl
Browse files Browse the repository at this point in the history
  • Loading branch information
Miachol committed Mar 25, 2020
1 parent 87d0e60 commit 307dfd4
Show file tree
Hide file tree
Showing 8 changed files with 243 additions and 0 deletions.
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@ require (
github.com/openbiox/ligo v0.0.0-20200119135627-2571ec0d416e
github.com/sirupsen/logrus v1.4.2
github.com/spf13/cobra v0.0.5
github.com/stretchr/testify v1.4.0 // indirect
)
45 changes: 45 additions & 0 deletions pdf2plain/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
## pdf2plain

`pdf2plain` converts PDF files to plain text for downstream processing by bioextr.


## Installation

Xpdf command line tools (required):

- Linux 32/64-bit: [download](https://xpdfreader-dl.s3.amazonaws.com/xpdf-tools-linux-4.02.tar.gz) ([GPG signature](https://xpdfreader-dl.s3.amazonaws.com/xpdf-tools-linux-4.02.tar.gz.sig))
- Windows 32/64-bit: [download](https://xpdfreader-dl.s3.amazonaws.com/xpdf-tools-win-4.02.zip) ([GPG signature](https://xpdfreader-dl.s3.amazonaws.com/xpdf-tools-win-4.02.zip.sig))
- Mac 64-bit: [download](https://xpdfreader-dl.s3.amazonaws.com/xpdf-tools-mac-4.02.tar.gz) ([GPG signature](https://xpdfreader-dl.s3.amazonaws.com/xpdf-tools-mac-4.02.tar.gz.sig))

```bash
# windows
wget https://github.com/openbiox/bioextr/releases/download/v0.1.0/pdf2plain.exe

# osx
wget https://github.com/openbiox/bioextr/releases/download/v0.1.0/pdf2plain_osx
mv pdf2plain_osx pdf2plain
chmod a+x pdf2plain

# linux
wget https://github.com/openbiox/bioextr/releases/download/v0.1.0/pdf2plain_linux64
mv pdf2plain_linux64 pdf2plain
chmod a+x pdf2plain

# get latest version
go get -u github.com/openbiox/bioextr/pdf2plain
```

## Usage

```
pdf2plain _examples/Multi-omic_approaches_to_improve_outcome_for_T-cel.pdf -o out.text
```

## Maintainer

- [@Jianfeng](https://github.com/Miachol)

## License

Academic Free License version 3.0

Binary file not shown.
85 changes: 85 additions & 0 deletions pdf2plain/cmd/root.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
package cmd

import (
"fmt"
"os"

"code.sajari.com/docconv"
"github.com/openbiox/ligo/io"
"github.com/spf13/cobra"
)

// RootClisT holds the global (persistent) command-line options of pdf2plain.
type RootClisT struct {
// Version is the version string of pdf2plain.
Version string
// Verbose is the --verbose level (0: no output, 1: basic level, 2: with env info).
Verbose int
// SaveLog is the --save-log flag: save log to file.
SaveLog bool
// TaskID is the --task-id flag; it is used as the log file base name.
TaskID string
// LogDir is the --log-dir flag: directory that holds log files.
LogDir string
// Clean is the --clean flag: remove the log dir.
Clean bool
// Out is the --out flag: destination file for the converted text (stdout when empty).
Out string

// HelpFlags controls whether the root command prints its help text after running.
HelpFlags bool
}

// RootClis is the package-wide instance of the global options. It starts with
// the package version, verbose level 1 and help output enabled; the flag
// fields are bound to command-line flags by setGlobalFlag.
var RootClis = RootClisT{
Version: version,
Verbose: 1,
HelpFlags: true,
}

// RootCmd is the root cobra command of pdf2plain.
//
// With at least one positional argument it initialises logging (initCmd) and
// converts args[0] to plain text (convertor). With no argument it removes the
// log directory when --clean is set, and otherwise prints the help text.
var RootCmd = &cobra.Command{
	Use:   "pdf2plain [input.pdf]",
	Short: "A wrapper command line tool to convert pdf files to plain text.",
	Long:  `A wrapper command line tool to convert pdf files to plain text. More see here https://github.com/openbiox/bioextr.`,
	Run: func(cmd *cobra.Command, args []string) {
		if RootClis.Clean {
			RootClis.HelpFlags = false
		}
		if len(args) > 0 {
			// initCmd performs the --clean removal itself when the flag is set.
			initCmd(cmd, args)
			convertor(cmd, args)
			RootClis.HelpFlags = false
		} else if RootClis.Clean {
			// Without an input file initCmd is never reached, so honour
			// --clean here instead of silently doing nothing.
			cleanLog()
		}
		if RootClis.HelpFlags {
			cmd.Help()
		}
	},
}

// convertor converts the document at args[0] to plain text with docconv and
// writes the text to the file named by RootClis.Out, or to stdout when
// RootClis.Out is empty.
//
// The caller must guarantee len(args) > 0. A conversion error is reported
// through log.Fatal; errors while preparing, opening or writing the output
// file are logged as warnings and the function returns.
func convertor(cmd *cobra.Command, args []string) {
	res, err := docconv.ConvertPath(args[0])
	if err != nil {
		log.Fatal(err)
	}
	if RootClis.Out == "" {
		fmt.Println(res.Body)
		return
	}
	if err := io.CreateFileParDir(RootClis.Out); err != nil {
		log.Warnln(err)
		return
	}
	con, err := io.Open(RootClis.Out)
	if err != nil {
		log.Warnln(err)
		return
	}
	defer con.Close()
	// Fprint rather than Fprintf: the extracted text is data, not a format
	// string, so any '%' it contains must be written verbatim.
	if _, err := fmt.Fprint(con, res.Body); err != nil {
		log.Warnln(err)
	}
}

// Execute is the entry point of the pdf2plain command line: it runs RootCmd.
// When RootCmd returns an error and the command has flags or subcommands, the
// error is printed and the process exits with status 1; otherwise the help
// text is shown.
func Execute() {
	runErr := RootCmd.Execute()
	if runErr == nil {
		return
	}
	if RootCmd.HasFlags() || RootCmd.HasSubCommands() {
		fmt.Println(runErr)
		os.Exit(1)
	}
	RootCmd.Help()
}

func init() {
wd, _ = os.Getwd()
RootCmd.Version = version
setGlobalFlag(RootCmd)
RootCmd.Example = ` pdf2plain _examples/Multi-omic_approaches_to_improve_outcome_for_T-cel.pdf`
}
73 changes: 73 additions & 0 deletions pdf2plain/cmd/utils.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
package cmd

import (
"fmt"
"io"
"os"
"path"
"path/filepath"
"strings"

cvrt "github.com/openbiox/ligo/convert"
cio "github.com/openbiox/ligo/io"
clog "github.com/openbiox/ligo/log"
"github.com/openbiox/ligo/stringo"
"github.com/sirupsen/logrus"
"github.com/spf13/cobra"
)

// Logging handles and shared state of the cmd package.
var (
	// log is the logger provided by ligo's log package; setLog configures its output.
	log = clog.Logger
	// logBash is ligo's LoggerBash; it is not referenced elsewhere in this file.
	logBash = clog.LoggerBash
	// logEnv is log with the field prefix=Env attached; initCmd uses it for
	// the environment dump at verbose level 2.
	logEnv = log.WithFields(logrus.Fields{"prefix": "Env"})
	// logPrefix is the log file path without the ".log" suffix; set by setLog.
	logPrefix string
	// wd caches the current working directory.
	wd string
)

// setGlobalFlag registers the persistent (global) flags on cmd and binds each
// of them to the matching field of RootClis.
//
// It also stores the current working directory in wd, which supplies the
// default value of --log-dir (<wd>/_log).
func setGlobalFlag(cmd *cobra.Command) {
	// The Getwd error is ignored: if the lookup fails the --log-dir default
	// is built from whatever wd holds (expected to be empty, i.e. "_log").
	wd, _ = os.Getwd()
	flags := cmd.PersistentFlags()
	flags.IntVarP(&(RootClis.Verbose), "verbose", "", 1, "verbose level(0:no output, 1: basic level, 2: with env info)")
	flags.StringVarP(&(RootClis.TaskID), "task-id", "k", stringo.RandString(15), "task ID (default is random).")
	flags.StringVarP(&(RootClis.LogDir), "log-dir", "", path.Join(wd, "_log"), "log dir.")
	flags.BoolVarP(&(RootClis.SaveLog), "save-log", "s", false, "save log to file.")
	flags.BoolVarP(&(RootClis.Clean), "clean", "", false, "remove log dir.")
	flags.StringVarP(&RootClis.Out, "out", "o", "", "out specifies destination of the returned data (default to stdout or current working directory).")
}
// initCmd prepares a command run: it configures the log stream, dumps the
// run environment when the verbose level is 2, and removes the log directory
// when --clean was given.
func initCmd(cmd *cobra.Command, args []string) {
	setLog()
	logEnvInfo(cmd, args)
	if RootClis.Clean {
		cleanLog()
	}
}

// logEnvInfo writes program path, task ID, log file, arguments and global
// options to logEnv. It does nothing unless the verbose level is exactly 2.
func logEnvInfo(cmd *cobra.Command, args []string) {
	if RootClis.Verbose != 2 {
		return
	}
	logEnv.Infof("Prog: %s", cmd.CommandPath())
	logEnv.Infof("TaskID: %s", RootClis.TaskID)
	if RootClis.SaveLog && logPrefix != "" {
		logEnv.Infof("Log: %s.log", logPrefix)
	}
	if len(args) != 0 {
		joined := strings.Join(args, " ")
		logEnv.Infof("Args: %s", joined)
	}
	logEnv.Infof("Global: %v", cvrt.Struct2Map(RootClis))
}

// setLog configures the output of the package logger.
//
// When --save-log is set it opens <log-dir>/<task-id>.log (falling back to
// <tmp>/_log when the log dir is empty) and records the path, minus the
// ".log" suffix, in logPrefix. If the log file cannot be opened a warning is
// emitted and file logging is switched off for this run instead of handing
// an unusable writer to the logger.
func setLog() {
	var logCon io.Writer
	saveLog := RootClis.SaveLog

	if saveLog {
		logDir := RootClis.LogDir
		if logDir == "" {
			logDir = filepath.Join(os.TempDir(), "_log")
		}
		logPrefix = fmt.Sprintf("%s/%s", logDir, RootClis.TaskID)
		// A directory that could not be created is expected to surface as
		// an error from Open below.
		cio.CreateDir(logDir)
		logFile, err := cio.Open(logPrefix + ".log")
		if err != nil {
			log.Warnf("cannot open log file %s.log: %v", logPrefix, err)
			// Clear the prefix so initCmd does not report a log file that
			// does not exist.
			logPrefix = ""
			saveLog = false
		} else {
			logCon = logFile
		}
	}
	clog.SetLogStream(log, RootClis.Verbose == 0, saveLog, &logCon)
}

// cleanLog deletes the log directory (RootClis.LogDir) and everything below
// it, and turns off the help output. A removal failure is only logged as a
// warning.
func cleanLog() {
	RootClis.HelpFlags = false
	rmErr := os.RemoveAll(RootClis.LogDir)
	if rmErr == nil {
		return
	}
	log.Warn(rmErr)
}
3 changes: 3 additions & 0 deletions pdf2plain/cmd/version.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
package cmd

// version is the pdf2plain release string; it is assigned to RootCmd.Version
// and RootClis.Version.
var version = "v0.1.0"
13 changes: 13 additions & 0 deletions pdf2plain/go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
module github.com/openbiox/bioextr/pdf2plain

go 1.14

require (
code.sajari.com/docconv v1.1.0
github.com/JalfResi/justext v0.0.0-20170829062021-c0282dea7198 // indirect
github.com/advancedlogic/GoOse v0.0.0-20191112112754-e742535969c1 // indirect
github.com/levigross/exp-html v0.0.0-20120902181939-8df60c69a8f5 // indirect
github.com/openbiox/ligo v0.0.0-20200324043626-a1096f60db64
github.com/sirupsen/logrus v1.5.0
github.com/spf13/cobra v0.0.6
)
23 changes: 23 additions & 0 deletions pdf2plain/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// Copyright © 2019 Jianfeng Li lee_jianfeng@openbiox.org
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package main

import (
"github.com/openbiox/bioextr/pdf2plain/cmd"
)

// main hands control to the cobra root command defined in package cmd.
func main() {
cmd.Execute()
}

0 comments on commit 307dfd4

Please sign in to comment.