-
Notifications
You must be signed in to change notification settings - Fork 30
/
git_downloader.py
executable file
·130 lines (106 loc) · 4.48 KB
/
git_downloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/env python
#
import sys, os, argparse, logging, fnmatch, posixpath, socket
import github
if sys.version_info < (3, 0):
# python 2
import urlparse
from urllib import urlretrieve
from urllib import quote
else:
# python 3
import urllib.parse as urlparse
from urllib.request import urlretrieve
from urllib.parse import quote
def main(args, loglevel):
    """Configure logging and the socket timeout, then run the download.

    Args:
        args: Parsed command-line namespace. This function reads
            ``timeout``, ``username``, ``password``, ``repo_file`` and
            ``last_repo``; the whole namespace is then handed to
            ``download_files``.
        loglevel: Logging level passed to ``logging.basicConfig``.
    """
    logging.basicConfig(format="%(levelname)s: %(message)s", level=loglevel)

    # The timeout may arrive as a string (an option parsed without a type),
    # but socket.setdefaulttimeout() only accepts a number or None.
    timeout = args.timeout
    if timeout is not None:
        timeout = float(timeout)
    socket.setdefaulttimeout(timeout)

    # Authenticate only when both credentials are present; otherwise fall
    # back to an anonymous client.
    if args.username and args.password:
        g = github.Github(args.username, args.password)
    else:
        g = github.Github()

    # Repositories come either from the user-supplied list file or from the
    # client's repository listing, starting after ``last_repo``.
    if args.repo_file:
        repo_gen = file_repo_gen(args.repo_file, g)
    else:
        repo_gen = g.get_repos(since=args.last_repo)

    download_files(args, g, repo_gen)
def file_repo_gen(repo_file, g):
    """Yield a repository object for every repository listed in *repo_file*.

    Each line is either a URL containing ``github.com/`` or a bare
    ``owner/name``. Everything up to and including the last ``github.com/``
    is dropped and the remainder is passed to ``g.get_repo``.

    Blank lines are skipped. A line whose lookup raises is logged and
    skipped, so one bad entry does not end the generator (and with it the
    caller's loop).

    Args:
        repo_file: Path of a text file with one repository per line.
        g: Object exposing ``get_repo(name)``.

    Yields:
        Whatever ``g.get_repo`` returns for each usable line.
    """
    with open(repo_file, 'r') as f:
        for line in f:
            # Trim surrounding whitespace and any trailing slash so that
            # "https://github.com/owner/name/" resolves to "owner/name".
            repo_str = line.strip().split('github.com/')[-1].rstrip('/')
            if not repo_str:
                continue
            try:
                repo = g.get_repo(repo_str)
            except Exception:
                logging.exception('Error looking up repository %s.', repo_str)
                continue
            # Yield outside the try so errors raised by the consumer are
            # not mistaken for lookup failures.
            yield repo
def _download_one(repo_name, branch, path, output_dir, counter):
    """Fetch one repository file from github.com's raw URL into *output_dir*."""
    # Quote the individual path pieces only: quoting the complete URL would
    # also escape the ':' after the scheme and produce an unusable URL.
    url = 'https://github.com/%s/raw/%s/%s' % (
        quote(repo_name), quote(branch), quote(path))
    logging.info('Downloading %s', url)

    # Name the local file after the real (unquoted) repository path.
    output_path = os.path.join(output_dir, posixpath.basename(path))
    if os.path.exists(output_path):
        # Avoid overwriting an earlier download that has the same name.
        output_path += "-" + str(counter)
    try:
        urlretrieve(url, output_path)
    except Exception:
        logging.exception('Error downloading %s.', url)


def _write_yara_meta(meta_path, output_dir):
    """Write one ``include "<name>"`` line per entry of *output_dir*."""
    meta_abs = os.path.abspath(meta_path)
    with open(meta_path, 'w') as f:
        # Sorted so the generated file is stable between runs.
        for name in sorted(os.listdir(output_dir)):
            # The index file is created before the listing is taken; leave
            # it out so the index never includes itself.
            if os.path.abspath(os.path.join(output_dir, name)) == meta_abs:
                continue
            try:
                f.write("include \"" + name + "\"\n")
            except Exception:
                logging.exception('Couldn\'t write to %s.', meta_path)


def download_files(args, g, repo_gen):
    """Download matching files from each repository and write a Yara index.

    For every repository the recursive git tree of its default branch
    (``master`` when the object reports none) is fetched, and each file
    whose path matches ``args.wildcard`` is downloaded into
    ``args.output_dir``. Failures for a single file or repository are logged
    and do not stop the run. Afterwards a file containing one ``include``
    line per entry of the output directory is written.

    Args:
        args: Namespace providing ``wildcard``, ``output_dir`` and
            ``yara_meta``. A missing wildcard matches every file and an
            empty ``output_dir`` means the current directory.
            ``args.yara_meta`` is overwritten with the index path joined
            onto ``args.output_dir``.
        g: Unused here; kept for interface compatibility.
        repo_gen: Iterable of repository objects exposing ``full_name``,
            ``id``, ``get_git_tree(sha, recursive=...)`` and optionally
            ``default_branch``.
    """
    pattern = args.wildcard or '*'
    # os.listdir('') raises, so map the empty default to the current dir.
    output_dir = args.output_dir or os.curdir
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    file_counter = 0
    for repo in repo_gen:
        try:
            logging.info('Fetching repository: %s (id: %i)',
                         repo.full_name, repo.id)
            branch = getattr(repo, 'default_branch', None) or 'master'
            tree = repo.get_git_tree(branch, recursive=True)
            # Tree entries that declare a non-blob type (e.g. directories)
            # cannot be fetched as raw files, so only blobs are kept.
            paths = [
                entry.path for entry in tree.tree
                if getattr(entry, 'type', 'blob') == 'blob'
                and fnmatch.fnmatch(entry.path, pattern)
            ]
            for path in paths:
                file_counter += 1
                _download_one(repo.full_name, branch, path,
                              output_dir, file_counter)
        except Exception:
            logging.exception('Error fetching repository.')

    args.yara_meta = os.path.join(args.output_dir, args.yara_meta)
    _write_yara_meta(args.yara_meta, output_dir)
if __name__ == '__main__':
    # Command-line entry point: declare the options, pick the log level
    # from --verbose and hand everything to main().
    parser = argparse.ArgumentParser(description="Github file downloader")
    parser.add_argument("-u",
                        "--username",
                        default=None,
                        help="Username used to authenticate with github for increased rate limit")
    parser.add_argument("-p",
                        "--password",
                        default=None,
                        help="Password or token used to authenticate with github")
    parser.add_argument("-r",
                        "--repo_file",
                        help="Path for the input file which contains a url of a Github repository for each separate line")
    parser.add_argument("-l",
                        "--last_repo",
                        type=int,
                        default=github.GithubObject.NotSet,
                        help="When not using a repo_file, this will be used as starting position for github repo crawl")
    # Without a default the wildcard would be None and every fnmatch call
    # in the download step would raise; "*" matches every file.
    parser.add_argument("-w",
                        "--wildcard",
                        default="*",
                        help="Unix shell-style wildcard to match files to download (for example: *.txt); defaults to *")
    parser.add_argument("-o",
                        "--output_dir",
                        default="",
                        help="Directory to store all downloaded files")
    # "--yara_meta" is accepted as well so the spelling matches the other
    # underscore-style options; the destination stays args.yara_meta.
    parser.add_argument("-y",
                        "--yara-meta",
                        "--yara_meta",
                        default="rules.yara",
                        help="Yara meta rule filename to create")
    # type=float: socket.setdefaulttimeout() rejects the raw string that
    # argparse would otherwise pass through.
    parser.add_argument("-t",
                        "--timeout",
                        type=float,
                        default=30,
                        help="Socket timeout (seconds)")
    parser.add_argument("-v",
                        "--verbose",
                        help="increase output verbosity",
                        action="store_true")
    args = parser.parse_args()

    # Setup logging
    loglevel = logging.DEBUG if args.verbose else logging.INFO

    main(args, loglevel)