-
Notifications
You must be signed in to change notification settings - Fork 1
/
data-collection.py
183 lines (160 loc) · 6.48 KB
/
data-collection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
# Python script to collect measurement data from the request IDs.
# Author: Arunesh Pandey
#
# The final data structure for Ping, DNS, Traceroute is:
# {
# domain1:
# {
# subdomain1: {
# probe_id1: {
#
# },
# probe_id2: {
#
# }
# },
# subdomain2: {
#
# }
# },
# domain2: {
#
# }
# }
#
import json
import urllib.request as req
from ripe.atlas.cousteau import Measurement, Probe
from ripe.atlas.sagan import PingResult, DnsResult, TracerouteResult
import tldextract as tld
import pickle # To dump the whole obj in file
FILE_NAME = "./final_request_ids"

# Global map filled while iterating results: probe_id -> ISO country code.
# Written out as JSON at the end of the run.
probe_country_dict = dict()


def _fetch_result_frames(measurement):
    """Download and JSON-decode the raw result list for one measurement."""
    # Close the HTTP handle even if decoding fails.
    with req.urlopen(measurement.result_url) as url_handle:
        return json.loads(url_handle.read().decode())


def _collect_probe_results(data_frame, result_cls):
    """Parse raw result frames into {probe_id: result_cls(frame)}.

    Side effect: records each probe's country code in probe_country_dict.
    Probe lookups are best-effort — a failed lookup only loses the country
    code, never the measurement result itself.
    """
    probe_dict = dict()
    for i, frame in enumerate(data_frame, start=1):
        probe_id = frame["prb_id"]
        try:
            probe_country_dict[probe_id] = Probe(id=probe_id).country_code
        except Exception:
            print("Exception at Probe ID- " + str(probe_id))
        probe_dict[probe_id] = result_cls(frame)
        print("Iteration Num- " + str(i))
    return probe_dict


def _add_to_domain_dict(final_dict, subdomain_name, probe_dict):
    """Append {subdomain: probe_dict} under the registered domain of subdomain_name."""
    domain_name = tld.extract(subdomain_name).domain
    final_dict.setdefault(domain_name, []).append({subdomain_name: probe_dict})


# Main entry point
if __name__ == "__main__":
    # Read the request-id file: line 1 = ping ids, line 2 = DNS ids,
    # line 3 = traceroute ids (each line is whitespace-separated ids).
    with open(FILE_NAME, "r") as f:
        lines = f.readlines()
    print(lines)
    ping_ids = lines[0].strip().split()
    dns_ids = lines[1].strip().split()
    trace_ids = lines[2].strip().split()

    final_ping_dict = dict()   # domain -> [{subdomain: {probe_id: PingResult}}]
    final_dns_dict = dict()    # domain -> [{subdomain: {probe_id: DnsResult}}]
    final_trace_dict = dict()  # domain -> [{subdomain: {probe_id: TracerouteResult}}]

    for req_id in ping_ids + dns_ids + trace_ids:
        try:
            m = Measurement(id=req_id)
            subdomain_name = m.meta_data['target']
            # Sanity-check the measurement type via its description prefix.
            if m.description.startswith("Ping to"):
                print("Ping- " + str(req_id))
                probe_dict = _collect_probe_results(
                    _fetch_result_frames(m), PingResult)
                _add_to_domain_dict(final_ping_dict, subdomain_name, probe_dict)
            elif m.description.startswith('DNS A request for '):
                print("DNS- " + str(req_id))
                # DNS measurements may have no 'target'; fall back to the query name.
                if not subdomain_name:
                    subdomain_name = m.meta_data['query_argument']
                probe_dict = _collect_probe_results(
                    _fetch_result_frames(m), DnsResult)
                _add_to_domain_dict(final_dns_dict, subdomain_name, probe_dict)
            elif m.description.startswith('Traceroute to '):
                print("Traceroute- " + str(req_id))
                probe_dict = _collect_probe_results(
                    _fetch_result_frames(m), TracerouteResult)
                print(subdomain_name)
                _add_to_domain_dict(final_trace_dict, subdomain_name, probe_dict)
        except Exception as e:
            # Best-effort: skip a measurement that fails to fetch/parse,
            # but say why instead of swallowing the reason.
            print("Exception at -" + str(req_id) + " : " + str(e))

    # Persist results. Only DNS data is pickled in this run (ping/trace
    # pickling was disabled in the original); `with` guarantees the files
    # are closed even if dumping fails.
    with open("./data/dns_data_newest", "wb") as dns_file:
        pickle.Pickler(dns_file, -1).dump(final_dns_dict)
    with open("./data/probe_country_code.json", "w") as probe_file:
        probe_file.write(json.dumps(probe_country_dict))