This repository has been archived by the owner on Feb 29, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
config.yaml
67 lines (64 loc) · 1.43 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# Request and crawl limits used by the client.
# NOTE(review): time units are not stated in this file; presumably seconds —
# confirm against the code that reads these values.
client:
  # Maximum depth of the tree.
  max_depth: 30
  # Minimum sleep time between requests.
  min_sleep: 1
  # Maximum sleep time between requests.
  max_sleep: 5
  # Maximum timeout for requests.
  max_timeout: 15
  # Maximum number of redirections.
  max_redirections: 7
  # Maximum number of failures before stopping crawling (for a given URL).
  max_failures: 5
urls:
  # Root URLs; presumably the starting points of the crawl — confirm against
  # the consumer of this file.
  roots:
    - https://www.aliexpress.com
    - https://www.instagram.com
    - https://www.wikipedia.org
    - https://www.facebook.com
    - https://www.youtube.com
    - https://www.xvideos.com
    - https://www.twitter.com
    - https://www.pornhub.com
    - https://www.google.com
    - https://www.reddit.com
    - https://www.amazon.com
    - https://github.com
    - https://www.yahoo.com
    - https://www.baidu.com
    - https://www.ebay.com
# NOTE(review): the nesting level of `blacklist` cannot be determined from
# this copy of the file; it is kept at the column where it appears. Confirm
# whether the consumer expects it at top level or under another key.
blacklist:
  # Placeholder entry; presumably full URLs to skip — confirm.
  childs:
    - https://for-example
  # Placeholder entry; presumably href paths to skip — confirm.
  hrefs:
    - /for-example
  # File extensions, listed without the leading dot.
  types:
    - ico
    - png
    - bmp
    - jpeg
    - jpg
    - psd
    - svg
    - tga
    - tiff
    - gif
    - xml
    - pdf
    - css
    - webm
    - webp
    - json
    - mp3
    - mp4
    - ogg
    - rar
    - zip
    - gz
    - tar
    - 7z
    - msi
    - exe
    - txt
    - js
    - webmanifest
user_agent:
  # Not working yet.
  generate: false
  # Own user agent. Quoted so the value is always read as one literal string,
  # regardless of the punctuation it contains.
  current: 'Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101 Firefox/102.0'
machine_config:
  # File name.
  name: machine_config