-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
77e3222
commit 217e405
Showing
4 changed files
with
288 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,152 @@ | ||
\begin{lstlisting}[language=Python] | ||
import json | ||
import re | ||
|
||
import distance | ||
import matplotlib.pyplot as plt | ||
import numpy as np | ||
import pydeep | ||
import tqdm | ||
from scipy.spatial.distance import pdist, squareform | ||
from sklearn.cluster import AffinityPropagation | ||
|
||
# Matches the first non-whitespace character; used to skip padding between
# stacked JSON documents.
NOT_WHITESPACE = re.compile(r'[^\s]')


def decode_stacked(document, pos=0, decoder=json.JSONDecoder()):
    """Yield each JSON value from a string of concatenated ("stacked") JSON documents.

    :param document: string containing zero or more whitespace-separated JSON values
    :param pos: index at which scanning starts
    :param decoder: JSON decoder used for raw decoding (stateless, safe to share)
    :raises json.JSONDecodeError: if a non-whitespace region is not valid JSON
    """
    while True:
        match = NOT_WHITESPACE.search(document, pos)
        if not match:
            return
        pos = match.start()
        # raw_decode returns the decoded value plus the index just past it,
        # letting us resume scanning for the next stacked document.
        # (The original wrapped this in a try/except that only re-raised,
        # which is a no-op and has been removed.)
        obj, pos = decoder.raw_decode(document, pos)
        yield obj
|
||
|
||
class contract:
    """One decoded contract record, self-registering into class-level lists.

    Every instantiation appends the contract (and its address, name, and
    fuzzy hash) to the shared class-level registries so the whole
    collection can later be read back by index.
    """

    # Shared registries, populated as contracts are constructed.
    address_list = []
    name_list = []
    hash_list = []
    contract_list = []

    def __init__(self, jsonObj):
        """Extract the relevant fields from *jsonObj* and register this instance.

        :param jsonObj: decoded JSON object with 'address', 'contract_name'
                        and 'bytecode' keys
        """
        self.address = jsonObj['address']
        self.name = jsonObj['contract_name']
        self.bytecode = jsonObj['bytecode']
        # ssdeep fuzzy hash of the bytecode, computed via pydeep.
        self.hash = pydeep.hash_buf(self.bytecode)
        contract.contract_list.append(self)
        for registry, value in ((contract.address_list, self.address),
                                (contract.name_list, self.name),
                                (contract.hash_list, self.hash)):
            registry.append(value)
|
||
|
||
def create_contracts(contracts_file):
    """Read a file of stacked JSON contract records and register each one.

    :param contracts_file: path to a file containing concatenated JSON objects
    """
    with open(contracts_file, "r") as myfile:
        contracts = myfile.read()

    # Instantiating `contract` registers it in the class-level lists;
    # the instance itself need not be kept (the original bound it to an
    # unused local `new_c`).
    for obj in tqdm.tqdm(decode_stacked(contracts)):
        contract(obj)

    print("Contract bytecode hashes computed.")
|
||
|
||
def compute_similarity(X, out_dir="../data"):
    """
    Compute similarity matrix with mean of 3 distances.

    Three pairwise distance matrices (Jaccard, Sorensen, and normalized
    Levenshtein) are computed over the hashes, each saved as a square CSV
    under *out_dir*; similarity is 1 minus their element-wise mean.

    :param X: List of contracts ssdeep hashes, shape (n, 1)
    :param out_dir: directory for the CSV output files (default "../data")
    :return: Similarity matrix, in condensed (pdist-style 1-D) form
    """
    distance_fns = [
        ('jaccard', lambda x, y: distance.jaccard(x[0], y[0])),
        ('sorensen', lambda x, y: distance.sorensen(x[0], y[0])),
        # normalized, so that the results can be meaningfully compared;
        # method=1 means the shortest alignment between the sequences is taken as factor
        ('levenshtein', lambda x, y: distance.nlevenshtein(x[0], y[0], method=1)),
    ]

    condensed_matrices = []
    for metric_name, fn in distance_fns:
        matrix = pdist(X, fn)
        np.savetxt("{}/{}_matrix.csv".format(out_dir, metric_name),
                   np.asarray(squareform(matrix)), delimiter=",")
        condensed_matrices.append(matrix)

    # Similarity = 1 - mean distance; saved and returned in condensed form.
    # NOTE(review): the downstream AffinityPropagation(affinity="precomputed")
    # call expects an (n, n) square matrix — confirm callers apply squareform
    # to this 1-D return value if needed.
    mean_matrix = 1 - np.mean(np.array(condensed_matrices), axis=0)
    np.savetxt("{}/similarity_matrix.csv".format(out_dir), np.asarray(mean_matrix), delimiter=",")

    print("Similarity matrix computed.")
    return mean_matrix
|
||
|
||
def clusterize(X):
    """
    Cluster contracts via Affinity Propagation on a precomputed similarity matrix.

    :param X: Similarity matrix
    :return: List of (contract name, contract address, ssdeep hash, cluster label)
    """

    # choose preference value based on unique contract names
    # (digits are stripped so e.g. "Token1"/"Token2" share a base name)
    names = [re.sub(r"\d+", "", name) for name in contract.name_list]
    (unique_name, index_name) = np.unique(names, return_index=True)
    print("Number of unique contract names: ", len(unique_name))

    # The first occurrence of each unique name gets the maximum preference
    # (more likely to become an exemplar); every other row gets the minimum.
    # BUGFIX: use the parameter X here — the original read the module-level
    # `similarity_matrix` global, which only worked because __main__ happened
    # to bind the argument under that name.
    preference = np.full(len(names), np.amin(X))
    for index in index_name:
        preference[index] = np.amax(X)

    af = AffinityPropagation(affinity="precomputed", max_iter=2000, convergence_iter=200, preference=preference,
                             damping=0.9)
    af.fit(X)
    cluster_centers_indices = af.cluster_centers_indices_
    num_of_clusters = len(cluster_centers_indices)
    labels = af.labels_

    print('Number of clusters: %d' % num_of_clusters)

    # One output row per contract: name, address, ssdeep hash, cluster label.
    output = []
    for i in range(len(labels)):
        c = [contract.name_list[i], contract.address_list[i], contract.hash_list[i], labels[i]]
        output.append(c)

    np.savetxt("../data/contracts_clustering.csv", output, delimiter=",", fmt='%s')
    return output
|
||
|
||
def autolabel(rects):
    """Attach a text label above each bar in *rects*, displaying its height.

    :param rects: iterable of bar artists (e.g. the result of ``ax.bar``)
    """
    for rect in rects:
        height = rect.get_height()
        # BUGFIX: annotate on the bar's own Axes (rect.axes) instead of
        # relying on a module-level global `ax` existing at call time.
        rect.axes.annotate('{}'.format(height),
                           xy=(rect.get_x() + rect.get_width() / 2, height),
                           xytext=(0, 3),  # 3 points vertical offset
                           textcoords="offset points",
                           ha='center', va='bottom')
|
||
if __name__ == '__main__':
    # Parse the contract dump and compute an ssdeep hash for every contract.
    create_contracts('../data/contracts_list')

    # Column vector of hashes, shape (n, 1): pdist callbacks receive
    # 1-element rows, hence the x[0]/y[0] indexing in compute_similarity.
    X = np.array(contract.hash_list).reshape(-1, 1)

    similarity_matrix = compute_similarity(X)

    output = clusterize(similarity_matrix)

    # Plot cluster size/exemplar name
    nb_clusters = 10
    # Unique cluster labels, a representative row index for each, and the
    # cluster sizes (frequency counts).
    (cluster, index, freq) = np.unique([i[3] for i in output], return_index=True, return_counts=True)
    name = [re.sub(r"\d+", "", output[i][0]) for i in index]
    address = [output[i][1] for i in index]
    # Sort clusters by size, largest first, keeping only the top nb_clusters below.
    sort = sorted([(address[i], name[i], freq[i]) for i in range(len(name))], key=lambda c: c[2],
                  reverse=True)

    fig, ax = plt.subplots()
    bars = ax.bar(range(nb_clusters), [v[2] for v in sort[:nb_clusters]], label=[v[1] for v in sort[:nb_clusters]])
    autolabel(bars)
    plt.ylabel("Cluster size")
    plt.xlabel("Exemplar name")

    # Place tick labels slightly past each bar's center, rotated for readability.
    xticks_pos = [0.65 * rect.get_width() + rect.get_xy()[0] for rect in bars]
    plt.xticks(xticks_pos, labels=[v[1] for v in sort[:nb_clusters]], ha='right', rotation=45)
    plt.show()
\end{lstlisting} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,135 @@ | ||
\begin{lstlisting}[language=Python] | ||
import os | ||
import signal | ||
import sys | ||
|
||
import requests | ||
from bs4 import BeautifulSoup | ||
from google.cloud import bigquery | ||
from tqdm import tqdm | ||
|
||
# Set to True by the SIGINT handler; the main loop checks it after each
# contract so a Ctrl+C stops collection cleanly.
terminate = False

# file to save list of contracts addresses (one address per line)
CONTRACTS_FILE = ''

# dir to save Solidity source code (.sol files)
CONTRACTS_DIR = ''

# Counters updated by get_delegate_code: contracts with verified source
# code, and contracts whose source contains 'delegatecall'.
NB_SOURCE = 0
NB_DELEGATE = 0
|
||
def get_contracts_from_bigQuery():
    """
    Fetch contracts from BigQuery Ethereum dataset.

    Writes one contract address per line to CONTRACTS_FILE.
    (The original declared `global CONTRACTS_FILE`, which is unnecessary
    for a name that is only read — removed.)
    """
    client = bigquery.Client()

    # SQL query to request all contracts from 01/01/2019 to 31/05/2019.
    # NOTE(review): the upper bound is exclusive (`<`), so May 31 itself is
    # excluded — confirm whether the end date should be "2019-06-01".
    new_contracts_query = 'SELECT address ' \
                          'FROM `bigquery-public-data.crypto_ethereum.contracts` ' \
                          'WHERE block_timestamp >= "{}-{}-{} 00:00:00" ' \
                          'AND block_timestamp < "{}-{}-{} 00:00:00" ' \
                          'AND bytecode != "0x" ' \
                          'ORDER BY block_timestamp'.format('2019', '01', '01', '2019', '05', '31')

    query_job = client.query(new_contracts_query)
    results = query_job.result()
    print('Contracts fetched')
    with open(CONTRACTS_FILE, 'w') as file:
        for row in results:
            file.write(row.address + '\n')
|
||
def get_delegate_code(address):
    """
    Save Solidity contract source code if available at Etherscan and using delegatecall in CONTRACTS_DIR.

    Increments NB_SOURCE for every contract with verified source code, and
    NB_DELEGATE for those whose source contains 'delegatecall'.

    :param address: address of the contract
    """
    global NB_SOURCE
    global NB_DELEGATE

    url = "https://etherscan.io/address/%s#code" % address

    try:
        r = requests.get(url)
    except requests.exceptions.RequestException as e:
        print('Error: {} Contract: {} Url: {}\n'.format(e, address, url))
        return

    html = r.text
    soup = BeautifulSoup(html, 'html.parser')

    # No verified source code on the page.
    # (BUGFIX: `is None`, not `== None`, for the None check.)
    if soup.find(id="editor") is None:
        return

    name = soup.find("span", class_="h6 font-weight-bold mb-0").contents[0]
    code = str(soup.find(id="editor")).replace('<pre class="js-sourcecopyarea editor" id="editor" style="margin-top: 5px;">', '').replace('</pre>', '')

    # Editor element present but empty: treat as no source code.
    if len(code) <= 0:
        return
    NB_SOURCE += 1

    # Keep only contracts whose source uses delegatecall; skip the rest.
    # (The original comment here said "No delegatecall in code", which
    # described the else branch, not the condition.)
    if 'delegatecall' in code:
        NB_DELEGATE += 1
    else:
        return

    bytecode = str(soup.find(id="verifiedbytecode2")).replace('<div id="verifiedbytecode2">', '').replace('</div>', '')
    # Sometimes no bytecode is returned, even if available — refetch until it is.
    # NOTE(review): this retries forever if the request keeps failing or the
    # bytecode never appears; consider a bounded retry count.
    while bytecode == "None":
        try:
            r = requests.get(url)
        except requests.exceptions.RequestException as e:
            print('Error: {} Contract: {} Url: {}\n'.format(e, address, url))
            continue
        html = r.text

        soup = BeautifulSoup(html, 'html.parser')
        bytecode = str(soup.find(id="verifiedbytecode2")).replace('<div id="verifiedbytecode2">', '').replace(
            '</div>', '')

    # Use the contract name as the base filename if available, else the address.
    fname = name if len(name) > 0 else address

    # Manage duplicate names by appending the first free numeric suffix.
    if os.path.isfile(CONTRACTS_DIR + fname + '.sol'):
        i = 0
        while os.path.isfile(CONTRACTS_DIR + fname + str(i) + '.sol'):
            i += 1
        fname += str(i)
    fname += ".sol"

    # Save .sol file with the address and bytecode prepended as comments.
    with open(CONTRACTS_DIR + fname, 'w') as of:
        of.write('//Contract address: ' + address + '\n//Bytecode: ' + bytecode + '\n' + code)
        of.flush()
|
||
def sigint_handler(signum, frame):
    """Signal handler for SIGINT (Ctrl+C): request a graceful stop.

    Sets the module-level `terminate` flag, which the main loop checks
    after each contract so the run can finish cleanly.

    :param signum: signal number (unused)
    :param frame: current stack frame (unused)
    """
    global terminate
    terminate = True
|
||
|
||
if __name__ == '__main__':
    contracts_count = 0
    # Install the Ctrl+C handler so a long crawl sets `terminate` instead
    # of being killed mid-write.
    signal.signal(signal.SIGINT,sigint_handler)

    # Uncomment to (re)build the address list from BigQuery first.
    #get_contracts_from_bigQuery()

    # Process every address in CONTRACTS_FILE, one per line.
    with open(CONTRACTS_FILE,'r') as file:
        lines = file.readlines()
        for line in tqdm(lines):
            get_delegate_code(line.rstrip())
            contracts_count += 1

            # Stop requested via SIGINT: report progress and exit cleanly.
            if terminate:
                print('\nStopped collecting at line: {}'.format(line))
                print('Number of contracts processed: {}\nNumber of contracts with source code: {}\nNumber of contracts with delegatecall: {}'.format(contracts_count,NB_SOURCE,NB_DELEGATE))
                sys.exit(0)

    print('\nNumber of contracts processed: {}\nNumber of contracts with source code: {}\nNumber of contracts with delegatecall: {}'.format(contracts_count, NB_SOURCE, NB_DELEGATE))
\end{lstlisting} |
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.