-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawler.sh
executable file
·65 lines (59 loc) · 1.56 KB
/
crawler.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/env bash
# Minimal crawler for the RoboCup archive: downloads binaries, then unpacks them.

# Host that serves the archive directory listings.
REMOTE_DIR="https://archive.robocup.info"

# Directory containing this script (symlinks resolved); downloads go in ./bin.
BASE_DIR="$(dirname "$(readlink -f "$0")")"
BIN_DIR="${BASE_DIR}/bin"

# A value greater than 0 makes `get` skip files that already exist locally.
SKIP=1

# RANGE is appended to ".../binaries/RoboCup" to choose the subtree to crawl.
# One day of one year:
# RANGE=/2021/Day3
# A whole year:
# RANGE=/2021
# The entire archive (all files since 1996):
RANGE=/

# Ensure the download root exists before anything is fetched.
[[ -d "$BIN_DIR" ]] || mkdir -p "$BIN_DIR"
#######################################
# Recursively crawl a directory listing on the archive and download its files.
#
# A listing page that contains the text "BIN" is treated as a leaf: every
# "file" entry on it is downloaded. Any other page is treated as an index and
# each "folder" entry on it is crawled in turn.
#
# Globals:   REMOTE_DIR (read) - base URL of the archive
#            BIN_DIR    (read) - local root for downloaded files
#            SKIP       (read) - if > 0, files already present are not refetched
# Arguments: $1 - path below REMOTE_DIR (a trailing slash is tolerated)
# Outputs:   one "Getting ..." line per download on stdout; warnings on stderr
# Returns:   1 if the listing for $1 cannot be fetched
# Exits:     255 if a target directory cannot be created
#######################################
function get {
  local url listing
  # Drop a trailing slash so the request URL never contains "//".
  url="${1%/}"

  # Fetch the listing once and reuse it for both the leaf test and the parsing.
  if ! listing="$(curl -fsS "$REMOTE_DIR/$url/")"; then
    printf 'get: cannot list %s\n' "$REMOTE_DIR/$url/" >&2
    return 1
  fi

  if grep -q BIN <<<"$listing"; then
    local -a team_urls=()
    # One URL per array element: no word splitting or glob expansion.
    mapfile -t team_urls < <(
      grep -oP '(?<=alt="file"/></td><td class="fb-n"><a href="/)(([\s\S])*?)(?=">)' <<<"$listing"
    )

    local team_url bin dir target_dir dest
    for team_url in "${team_urls[@]}"; do
      # Path of the file relative to the ".../RoboCup/" root of the archive.
      bin="${team_url##*RoboCup/}"
      dest="$BIN_DIR/$bin"

      # Files are stored under BIN_DIR, so that is where "already have it"
      # must be checked.
      if [[ -e $dest ]] && ((SKIP > 0)); then
        continue
      fi

      # A file that sits directly under RoboCup/ has no directory component;
      # "${bin%/*}" would then return the file name itself.
      if [[ $bin == */* ]]; then
        dir="${bin%/*}"
        target_dir="$BIN_DIR/$dir"
      else
        dir=""
        target_dir="$BIN_DIR"
      fi
      mkdir -p "$target_dir" || exit 255

      echo "Getting $REMOTE_DIR/$team_url/ to $target_dir"
      # Download to a temporary name and rename on success, so an HTTP error
      # page or an interrupted transfer never masquerades as a finished file.
      if curl -fsS -o "$dest.part" "$REMOTE_DIR/$team_url"; then
        mv -f -- "$dest.part" "$dest"
      else
        printf 'get: download failed: %s\n' "$REMOTE_DIR/$team_url" >&2
        rm -f -- "$dest.part"
      fi
    done
  else
    local -a folder_urls=()
    mapfile -t folder_urls < <(
      grep -oP '(?<=alt="folder"/></td><td class="fb-n"><a href="/)([\s\S]*?)(?=/">)' <<<"$listing"
    )

    # Local loop variable: this function recurses.
    local folder_url
    for folder_url in "${folder_urls[@]}"; do
      get "$folder_url"
    done
  fi
}
#######################################
# Recursively unpack every *.tar.gz archive found under a directory.
#
# Each archive is extracted into the directory that contains it, regardless of
# the current working directory, and is deleted only after tar reports
# success; an archive that fails to unpack is left in place.
#
# Arguments: $1 - directory to scan
# Outputs:   tar's verbose member listing on stdout; warnings on stderr
# Exits:     255 if $1 is not a directory
#######################################
function extract {
  local dir file
  dir=$1
  [[ -d $dir ]] || exit 255

  for file in "$dir"/*; do
    if [[ -d $file ]]; then
      extract "$file"
    elif [[ -f $file && $file == *.tar.gz ]]; then
      # -C pins the extraction target, so recursing into a subdirectory
      # earlier in the loop cannot redirect where this archive unpacks.
      if tar -xvzf "$file" -C "$dir"; then
        rm -- "$file"
      else
        printf 'extract: failed to unpack %s; keeping archive\n' "$file" >&2
      fi
    fi
  done
}
# Crawl the archive subtree selected by RANGE. Everything `get` writes to
# stderr is discarded (presumably to hide curl's progress output).
get "Soccer/Simulation/2D/binaries/RoboCup${RANGE}" 2>/dev/null
# Unpack the downloaded archives found under BIN_DIR.
extract "${BIN_DIR}"