Skip to content

Commit

Permalink
reconstruction
Browse files Browse the repository at this point in the history
  • Loading branch information
elvis972602 committed Feb 5, 2023
1 parent 20a5c38 commit 69d459c
Show file tree
Hide file tree
Showing 18 changed files with 1,269 additions and 246 deletions.
53 changes: 0 additions & 53 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,59 +4,6 @@ A simple scraper to filter and download images from kemono.party
## Downloader Released
Source code is in `./main`

## Usage
```go
go get github.com/elvis972602/kemono-scraper
```

## Example
```go
downloader := NewDownloader(
BaseURL("https://kemono.party"),
// the number of downloads running at the same time
MaxConcurrent(3),
Timeout(300*time.Second),
// async download: several files are downloaded at the same time,
// so files may finish in a different order than they appear in the post;
// use a save path rule to keep them ordered
Async(true),
// files are named <order>-<file name> so they sort by order; .zip files keep their original name
SavePath(func(creator Creator, post Post, i int, attachment File) string {
var name string
if filepath.Ext(attachment.Name) == ".zip" {
name = attachment.Name
} else {
name = fmt.Sprintf("%d-%s", i, attachment.Name)
}
return fmt.Sprintf(filepath.Join("./download", "%s", "%s", "%s"), ValidDirectoryName(creator.Name), ValidDirectoryName(post.Title), ValidDirectoryName(name))
}),
WithHeader(Header{
"User-Agent": UserAgent,
"Referer": "https://kemono.party",
"accept": Accept,
"accept-encoding": "gzip, deflate, br",
"accept-language": "ja-JP;q=0.8,ja;q=0.7,en-US;q=0.6,en;q=0.5",
}),
RateLimit(2),
Retry(3),
)
K := NewKemono(
WithUsers("123456", "service", "654321", "service2"),
WithBanner(true),
WithPostFilter(
ReleaseDateFilter(time.Now().AddDate(0, 0, -365), time.Now()),
),
WithAttachmentFilter(
ExtensionFilter(".jpg", ".png", ".zip", ".gif"),
),
SetDownloader(downloader),
)
K.Start()
```

## Features
With Kemono-scraper, you can implement a Downloader to take advantage of features such as multi-connection downloading, resuming broken downloads, and more.




87 changes: 64 additions & 23 deletions downloader.go → downloader/downloader.go
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
package kemono_scraper
package downloader

import (
"context"
"errors"
"fmt"
"io"
"log"
"github.com/elvis972602/kemono-scraper/kemono"
"github.com/elvis972602/kemono-scraper/term"
"github.com/elvis972602/kemono-scraper/utils"
"net/http"
"os"
"path/filepath"
"strconv"
"sync"
"time"
)
Expand All @@ -21,12 +23,14 @@ const (
Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
)

type Header map[string]string

type Downloader interface {
Download(<-chan FileWithIndex, Creator, Post) <-chan error
// Log is the output sink the downloader writes its messages to. The same
// value is also handed to utils.NewProgressBar when the downloader is built.
type Log interface {
// Printf takes a format string and its arguments; the downloader calls it to
// report skipped files and retry notices.
Printf(format string, v ...interface{})
// Print takes a single string.
// NOTE(review): presumably emits s as one log line — confirm against the implementation.
Print(s string)
// SetStatus receives a slice of strings.
// NOTE(review): presumably the current progress-bar lines to redraw — confirm
// against utils.ProgressBar.
SetStatus(s []string)
}

type Header map[string]string

type DownloadOption func(*downloader)

type downloader struct {
Expand All @@ -43,11 +47,11 @@ type downloader struct {
OverWrite bool

// SavePath return the path to save the file
SavePath func(creator Creator, post Post, i int, attachment File) string
SavePath func(creator kemono.Creator, post kemono.Post, i int, attachment kemono.File) string
// timeout
Timeout time.Duration

reteLimiter *rateLimiter
reteLimiter *utils.RateLimiter

Header Header

Expand All @@ -56,18 +60,23 @@ type downloader struct {
retry int

retryInterval time.Duration

progressBar *utils.ProgressBar

log Log
}

func NewDownloader(options ...DownloadOption) Downloader {
func NewDownloader(options ...DownloadOption) kemono.Downloader {
// with default options
d := &downloader{
MaxConcurrent: maxConcurrent,
MaxConnection: maxConnection,
SavePath: defaultSavePath,
Timeout: 300 * time.Second,
Async: false,
reteLimiter: newRateLimiter(rateLimit),
reteLimiter: utils.NewRateLimiter(rateLimit),
retry: 2,
progressBar: utils.NewProgressBar(term.NewTerminal(os.Stdout, os.Stderr, false)),
}
for _, option := range options {
option(d)
Expand All @@ -79,6 +88,21 @@ func NewDownloader(options ...DownloadOption) Downloader {
d.MaxConcurrent = 1
}
d.cookies = make(chan []*http.Cookie, d.MaxConcurrent)
if d.log == nil {
panic("log is nil")
}

d.progressBar = utils.NewProgressBar(d.log)

go func() {
tick := time.NewTicker(100 * time.Millisecond)
for {
select {
case <-tick.C:
d.progressBar.SetStatus()
}
}
}()
return d
}

Expand Down Expand Up @@ -106,7 +130,7 @@ func Timeout(timeout time.Duration) DownloadOption {
// limit the rate of download per second
func RateLimit(n int) DownloadOption {
return func(d *downloader) {
d.reteLimiter = newRateLimiter(n)
d.reteLimiter = utils.NewRateLimiter(n)
}
}

Expand All @@ -116,14 +140,21 @@ func WithHeader(header Header) DownloadOption {
}
}

func SavePath(savePath func(creator Creator, post Post, i int, attachment File) string) DownloadOption {
func SavePath(savePath func(creator kemono.Creator, post kemono.Post, i int, attachment kemono.File) string) DownloadOption {
return func(d *downloader) {
d.SavePath = savePath
}
}

func defaultSavePath(creator Creator, post Post, i int, attachment File) string {
return fmt.Sprintf(filepath.Join("./download", "%s", "%s", "%s"), ValidDirectoryName(creator.Name), ValidDirectoryName(post.Title), ValidDirectoryName(attachment.Name))
// SetLog returns a DownloadOption that stores log as the downloader's logger.
// NewDownloader panics when no logger has been set, so this option must be
// passed to it.
func SetLog(log Log) DownloadOption {
	apply := func(target *downloader) {
		target.log = log
	}
	return apply
}

// defaultSavePath is the save-path rule used when no SavePath option is given.
// It places each attachment under
// download/<creator name>/<post directory name>/<attachment name>, passing
// every component through utils.ValidDirectoryName first. The index i is
// unused.
func defaultSavePath(creator kemono.Creator, post kemono.Post, i int, attachment kemono.File) string {
	// The joined path is used as a format string (rather than joining the
	// components directly) so the components are not path-cleaned.
	layout := filepath.Join("./download", "%s", "%s", "%s")
	creatorDir := utils.ValidDirectoryName(creator.Name)
	postDir := utils.ValidDirectoryName(DirectoryName(post))
	fileName := utils.ValidDirectoryName(attachment.Name)
	return fmt.Sprintf(layout, creatorDir, postDir, fileName)
}

// Async set the async download option
Expand Down Expand Up @@ -152,7 +183,7 @@ func RetryInterval(interval time.Duration) DownloadOption {
}
}

func (d *downloader) Download(files <-chan FileWithIndex, creator Creator, post Post) <-chan error {
func (d *downloader) Download(files <-chan kemono.FileWithIndex, creator kemono.Creator, post kemono.Post) <-chan error {

//TODO: implement download
var (
Expand Down Expand Up @@ -214,7 +245,7 @@ func (d *downloader) download(filePath, url, fileHash string) error {
return err
}
if complete {
log.Printf("file %s already exists, skip", filePath)
d.log.Printf(utils.ShortenString("file ", filePath, " already exists, skip"))
return nil
}
} else {
Expand All @@ -239,7 +270,7 @@ func (d *downloader) download(filePath, url, fileHash string) error {
func (d *downloader) downloadFile(file *os.File, url string) error {
d.reteLimiter.Token()

log.Printf("downloading file %s", url)
//progressBar.Printf("downloading file %s", url)
ctx, cancel := context.WithTimeout(context.Background(), d.Timeout)
defer cancel()

Expand All @@ -266,10 +297,13 @@ func (d *downloader) downloadFile(file *os.File, url string) error {
}
defer resp.Body.Close()

// get content length
contentLength, err := strconv.ParseInt(resp.Header.Get("Content-Length"), 10, 64)

// 429 too many requests
if resp.StatusCode == http.StatusTooManyRequests {
if retry > 0 {
log.Printf("request too many times, retry after %.1f seconds...", d.retryInterval.Seconds())
d.log.Printf("request too many times, retry after %.1f seconds...", d.retryInterval.Seconds())
time.Sleep(d.retryInterval)
return get(retry - 1)
} else {
Expand All @@ -284,11 +318,14 @@ func (d *downloader) downloadFile(file *os.File, url string) error {
if len(resp.Cookies()) < d.MaxConcurrent {
d.cookies <- resp.Cookies()
}

_, err = io.Copy(file, resp.Body)
bar := &utils.Bar{Since: time.Now(), Prefix: "Download", Content: fmt.Sprintf("%s", filepath.Base(file.Name())), Max: contentLength, Length: 30}
d.progressBar.AddBar(bar)
_, err = utils.Copy(file, resp.Body, bar)
if err != nil {
d.progressBar.Fail(bar, err)
return fmt.Errorf("failed to write file: %w", err)
}
d.progressBar.Success(bar)
return nil
}

Expand Down Expand Up @@ -321,7 +358,7 @@ func checkFileExitAndComplete(filePath, fileHash string) (file *os.File, complet
err = fmt.Errorf("open file error: %w", err)
return
}
h, err = Hash(file)
h, err = utils.Hash(file)
if err != nil {
err = fmt.Errorf("get file hash error: %w", err)
return
Expand All @@ -333,7 +370,7 @@ func checkFileExitAndComplete(filePath, fileHash string) (file *os.File, complet
}
err = file.Truncate(0)
if err != nil {
log.Printf("truncate file error: %s", err)
err = fmt.Errorf("truncate file error: %w", err)
return nil, false, err
}
_, err := file.Seek(0, 0)
Expand All @@ -355,3 +392,7 @@ func newGetRequest(ctx context.Context, header Header, url string) (*http.Reques
}
return req, nil
}

// DirectoryName builds the directory name for a post in the form
// "[<published>][<id>]<title>", where <published> is p.Published formatted
// with the layout "20060102" (YYYYMMDD).
func DirectoryName(p kemono.Post) string {
	const dateLayout = "20060102"
	published := p.Published.Format(dateLayout)
	return fmt.Sprintf("[%s][%s]%s", published, p.Id, p.Title)
}
52 changes: 37 additions & 15 deletions example/example.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,51 +2,73 @@ package main

import (
"fmt"
kemono "github.com/elvis972602/kemono-scraper"
"github.com/elvis972602/kemono-scraper/downloader"
"github.com/elvis972602/kemono-scraper/kemono"
"github.com/elvis972602/kemono-scraper/term"
"github.com/elvis972602/kemono-scraper/utils"
"os"
"path/filepath"
"time"
)

func main() {
downloader := kemono.NewDownloader(
kemono.BaseURL("https://kemono.party"),
t := term.NewTerminal(os.Stdout, os.Stderr, false)

d := downloader.NewDownloader(
downloader.BaseURL("https://kemono.party"),
// the number of downloads running at the same time
kemono.MaxConcurrent(3),
kemono.Timeout(300*time.Second),
downloader.MaxConcurrent(3),
downloader.Timeout(300*time.Second),
// async download: several files are downloaded at the same time,
// so files may finish in a different order than they appear in the post;
// use a save path rule to keep them ordered
kemono.Async(true),
downloader.Async(true),
// files are named <order>-<file name> so they sort by order; .zip files keep their original name
kemono.SavePath(func(creator kemono.Creator, post kemono.Post, i int, attachment kemono.File) string {
downloader.SavePath(func(creator kemono.Creator, post kemono.Post, i int, attachment kemono.File) string {
var name string
if filepath.Ext(attachment.Name) == ".zip" {
name = attachment.Name
} else {
name = fmt.Sprintf("%d-%s", i, attachment.Name)
}
return fmt.Sprintf(filepath.Join("./download", "%s", "%s", "%s"), kemono.ValidDirectoryName(creator.Name), kemono.ValidDirectoryName(post.Title), kemono.ValidDirectoryName(name))
return fmt.Sprintf(filepath.Join("./download", "%s", "%s", "%s"), utils.ValidDirectoryName(creator.Name), utils.ValidDirectoryName(post.Title), utils.ValidDirectoryName(name))
}),
kemono.WithHeader(kemono.Header{
"User-Agent": kemono.UserAgent,
downloader.WithHeader(downloader.Header{
"User-Agent": downloader.UserAgent,
"Referer": "https://kemono.party",
"accept": kemono.Accept,
"accept": downloader.Accept,
"accept-encoding": "gzip, deflate, br",
"accept-language": "ja-JP;q=0.8,ja;q=0.7,en-US;q=0.6,en;q=0.5",
}),
kemono.RateLimit(2),
kemono.Retry(5),
downloader.RateLimit(2),
downloader.Retry(5),
downloader.RetryInterval(5*time.Second),
downloader.SetLog(t),
)
user1 := kemono.NewCreator("service1", "123456")
user2 := kemono.NewCreator("service2", "654321")
K := kemono.NewKemono(
kemono.WithUsers("74671556", "fanbox"),
kemono.WithUsers(user1, user2),
kemono.WithUsersPair("service3", "987654"),
kemono.WithBanner(true),
kemono.WithPostFilter(
kemono.ReleaseDateFilter(time.Now().AddDate(0, 0, -365), time.Now()),
),
kemono.WithAttachmentFilter(
kemono.ExtensionFilter(".jpg", ".png", ".zip", ".gif"),
),
kemono.SetDownloader(downloader),
// a post filter for specific user
kemono.WithUserPostFilter(user1, kemono.EditDateFilter(time.Now().AddDate(0, 0, -20), time.Now())),
// an attachment filter for specific user
kemono.WithUserAttachmentFilter(user2, func(i int, attachment kemono.File) bool {
if i%2 == 0 {
return false
}
return true
}),
kemono.SetDownloader(d),
// if not set, the default log is used
kemono.SetLog(t),
)
K.Start()
}
10 changes: 9 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,12 @@ module github.com/elvis972602/kemono-scraper

go 1.18

require gopkg.in/yaml.v3 v3.0.1 // indirect
require gopkg.in/yaml.v3 v3.0.1

require (
github.com/orcaman/concurrent-map/v2 v2.0.1 // indirect
golang.org/x/crypto v0.5.0 // indirect
golang.org/x/sys v0.4.0 // indirect
golang.org/x/term v0.4.0 // indirect
golang.org/x/text v0.6.0 // indirect
)
Loading

0 comments on commit 69d459c

Please sign in to comment.