forked from okfde/dokukratie
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Makefile
122 lines (97 loc) · 4.23 KB
/
Makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# Exported into the environment of every recipe (read by memorious, judging by
# the variable names — confirm against the memorious settings documentation).
export MEMORIOUS_CONFIG_PATH := dokukratie
# No surrounding quotes on purpose: make does not strip quotes from a variable
# value, so quoting here would export a string that literally begins and ends
# with a `"` character.
export MEMORIOUS_USER_AGENT := Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0
# Production use: `make <crawler>` runs the whole pipeline for one scraper:
# pull -> run_prod -> mmmeta -> upload.
# The steps rely on make handling prerequisites left to right, so these
# targets are not meant to be run with `make -j`.

# Currently available scrapers.
# State scrapers:
STATE_SCRAPERS := bb be bw by hh he mv ni nw rp st th
# Other scrapers:
OTHER_SCRAPERS := dip parlamentsspiegel sehrgutachten vsberichte

# Scraper names are commands, not files. Declaring them phony keeps make from
# running its implicit-rule search on them (e.g. trying to build `dip` from a
# `dip.sh` or `dip.c` that happens to exist).
.PHONY: $(STATE_SCRAPERS) $(OTHER_SCRAPERS)

# Static pattern rule: every scraper <name> depends on its four pipeline
# steps. `bb` is the first target and therefore remains the default goal.
$(STATE_SCRAPERS) $(OTHER_SCRAPERS): %: %.pull %.run_prod %.mmmeta %.upload
# all the things
# States covered by the <step>.states aggregate targets below.
# NOTE(review): `be` has its own scraper target in this file but was not part
# of the original lists — confirm whether leaving it out is intentional.
AGGREGATE_STATES := bb bw by hh he mv ni nw rp st th

# The aggregates are commands, not files.
.PHONY: config.states action.states pull.states mmmeta.states upload.states push.states sync.states

# <step>.states runs <state>.<step> for every state listed above.
config.states: $(addsuffix .config,$(AGGREGATE_STATES))
action.states: $(addsuffix .action,$(AGGREGATE_STATES))
pull.states: $(addsuffix .pull,$(AGGREGATE_STATES))
mmmeta.states: $(addsuffix .mmmeta,$(AGGREGATE_STATES))
upload.states: $(addsuffix .upload,$(AGGREGATE_STATES))
push.states: $(addsuffix .push,$(AGGREGATE_STATES))

# Full sync for all states: config, pull, mmmeta, upload.
# The prerequisites must be the <step>.states aggregates; the earlier spelling
# `states.<step>` matched the per-scraper pattern rules (%.config, %.pull, ...)
# with a bogus scraper called "states" and never touched the real states.
sync.states: config.states pull.states mmmeta.states upload.states
# Production run for the `he` scraper with an HTTP rate limit set, so the
# crawl does not hammer the server ("don't ddos hessen").
# This is an explicit rule, not a pattern rule, so the stem variable `$*` is
# empty here; the scraper name is therefore written out. With `$*` the config
# ended up in ./data/store//_mmmeta and the counts below looked at the wrong
# directory.
he.run_prod:
	mkdir -p ./data/store/he/_mmmeta
	sed "s/<scraper_name>/he/" config.yml.tmpl > ./data/store/he/_mmmeta/config.yml
	MEMORIOUS_HTTP_RATE_LIMIT=30 MMMETA=./data/store/he memorious run he --threads=4
	ls ./data/store/he/*.pdf | wc
	ls ./data/store/he/*.json | wc
# Production run for the parlamentsspiegel scraper. START_DATE_DELTA=2 is set
# so the crawl does not go back too far.
# Unlike the generic %.run_prod rule, no mmmeta config is generated here.
parlamentsspiegel.run_prod:
	START_DATE_DELTA=2 \
	MMMETA=./data/store/parlamentsspiegel \
	memorious run parlamentsspiegel --threads=4
# Production run for the vsberichte scraper. Deliberately runs without MMMETA
# (this scraper does not use mmmeta).
# $(basename $@) strips the `.run_prod` suffix from the target name, giving
# `vsberichte`.
vsberichte.run_prod:
	memorious run $(basename $@) --threads=4
# Generic production run, `make <name>.run_prod` ($* is the scraper name):
#   1. create the scraper's _mmmeta directory and render its config.yml from
#      config.yml.tmpl, substituting <scraper_name>;
#   2. run the crawler with MMMETA pointing at the scraper's store;
#   3. print `wc` counts for the listings of pdf and json files in the store.
# The explicit he/parlamentsspiegel/vsberichte rules take precedence over this
# pattern for those three scrapers.
%.run_prod:
	mkdir -p "./data/store/$*/_mmmeta"
	sed -e "s/<scraper_name>/$*/" config.yml.tmpl > "./data/store/$*/_mmmeta/config.yml"
	MMMETA="./data/store/$*" memorious run "$*" --threads=4
	ls "./data/store/$*"/*.pdf | wc
	ls "./data/store/$*"/*.json | wc
# Ad-hoc run, `make run.<name>`: runs the crawler without setting MMMETA or a
# thread count, then prints `wc` counts for the listings of pdf and json files
# in the scraper's store directory.
run.%:
	memorious run "$*"
	ls "./data/store/$*"/*.pdf | wc
	ls "./data/store/$*"/*.json | wc
# Installation targets. They are commands, not files: without .PHONY a file or
# directory named e.g. `install` would make `make install` a silent no-op.
.PHONY: install install.dev install.prod install.test

# Editable install of this package.
install:
	pip install -e .

# Package plus development requirements.
install.dev: install
	pip install -r requirements-dev.txt

# Package plus production requirements.
install.prod: install
	pip install -r requirements-prod.txt

# Development install plus the test, coverage, lint and packaging tools.
install.test: install.dev
	pip install twine coverage nose moto pytest pytest-cov black flake8 isort
# `make <name>.config`: render the mmmeta config for scraper <name> by
# replacing <scraper_name> in config.yml.tmpl and writing the result to
# ./data/store/<name>/_mmmeta/config.yml (the directory is created if needed).
%.config:
	mkdir -p "./data/store/$*/_mmmeta"
	sed -e "s/<scraper_name>/$*/" config.yml.tmpl > "./data/store/$*/_mmmeta/config.yml"
# `make <name>.action`: render a GitHub workflow file for scraper <name> by
# replacing <scraper_name> in workflow.yml.tmpl and writing the result to
# ./.github/workflows/<name>.yml (the directory is created if needed).
%.action:
	mkdir -p ./.github/workflows/
	sed -e "s/<scraper_name>/$*/" workflow.yml.tmpl > "./.github/workflows/$*.yml"
# `make <name>.mmmeta`: run `mmmeta generate` with MMMETA pointing at the
# store directory of scraper <name>.
%.mmmeta:
	MMMETA="./data/store/$*" mmmeta generate
# Pull step: syncing the mmmeta db from the archive bucket is currently
# disabled. The rule still needs a recipe — a pattern rule without one gives
# make no way to build `<name>.pull`, and every scraper target depends on it —
# so it runs an explicit no-op. Disabled command, kept for reference:
#   aws --endpoint-url $(ARCHIVE_ENDPOINT_URL) s3 sync s3://$(DATA_BUCKET)/$*/_mmmeta/db/ ./data/store/$*/_mmmeta/db
%.pull:
	@true
# Push step: syncing the local _mmmeta directory (excluding *.db* files) to
# the archive bucket is currently disabled. The rule still needs a recipe — a
# pattern rule without one gives make no way to build `<name>.push` — so it
# runs an explicit no-op. Disabled command, kept for reference:
#   aws --endpoint-url $(ARCHIVE_ENDPOINT_URL) s3 sync --exclude "*.db*" ./data/store/$*/_mmmeta/ s3://$(DATA_BUCKET)/$*/_mmmeta
%.push:
	@true
# Upload step: syncing the scraper's store (excluding *.db* files) to the
# archive bucket is currently disabled. The rule still needs a recipe — a
# pattern rule without one gives make no way to build `<name>.upload`, and
# every scraper target depends on it — so it runs an explicit no-op.
# Disabled command, kept for reference:
#   aws --endpoint-url $(ARCHIVE_ENDPOINT_URL) s3 sync --exclude "*.db*" ./data/store/$*/ s3://$(DATA_BUCKET)/$*
%.upload:
	@true
# Run the full test suite with coverage for the dokukratie package.
# Installs the test dependencies first and uses a fresh ./testdata directory.
# The final cleanup only runs when pytest succeeds, because make stops at the
# first failing recipe line; a failed run leaves ./testdata behind.
# Declared phony so a file or directory named `test` cannot shadow the command.
.PHONY: test
test: install.test
	rm -rf testdata
	mkdir testdata
	pytest -s --cov=dokukratie --cov-report term-missing ./tests/
	rm -rf testdata
# `make test.<name>`: run only the tests selected by `-k "test_<name>"`, using
# a fresh ./testdata/<name> directory that is removed again after a successful
# run. Unlike `make test`, this does not install the test dependencies first.
test.%:
	rm -rf "testdata/$*"
	mkdir -p "testdata/$*"
	pytest -s --cov=dokukratie --cov-report term-missing ./tests/ -k "test_$*"
	rm -rf "testdata/$*"
# Remove build artefacts and Python caches below the current directory.
# Declared phony so that a file named `clean` cannot turn this into a no-op.
.PHONY: clean
clean:
	rm -fr build/
	rm -fr dist/
	rm -fr .eggs/
	find . -name '*.egg-info' -exec rm -fr {} +
	find . -name '*.egg' -exec rm -f {} +
	find . -name '*.pyc' -exec rm -f {} +
	find . -name '*.pyo' -exec rm -f {} +
	find . -name '*~' -exec rm -f {} +
	find . -name '__pycache__' -exec rm -fr {} +
# Start a redis:alpine container in the foreground, publishing port 6379 on
# the host. Declared phony so a file or directory named `redis` cannot shadow
# the command.
.PHONY: redis
redis:
	docker run -p 6379:6379 redis:alpine