Skip to content

Commit

Permalink
commit final report
Browse files Browse the repository at this point in the history
  • Loading branch information
nostalg33k committed Aug 13, 2019
1 parent 77e3222 commit 217e405
Show file tree
Hide file tree
Showing 4 changed files with 288 additions and 1 deletion.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ before_install:
deploy:
- api_key:
secure: i7vW4blwSfk03XHT5dSVNvltoC3rodaIamo3BwNQotamFVPEPqtwqswMXSotuo0fHdeX4fkrhx3Yrzu/eh4oK9WWQmInpxhteB0ttu6lXxLuQpMHoZHVdmFTYE9uwjy4/6Ri0AQGaRoghNoLfeEly3TGa0yXzWMwolY6gmrlJKMizWWKZG/V3z/1kBF0QWeqRfH5N/1Hx1sBiwW0NNvKZuElLUT+b9hK4tmUkQOWlrnLhOGk9MaiDw18Xf3ifEAr2nZgsCTgUTJy6+wlW1rmnVkKS0oiIt89Z7goCBdjVXmKLNoswF2nN/qxFj15mQ99VZPent6BtGMBBeRnBq4GDswVeb2AcnrGi5cijl23AFQV3iN711Dc1U9db9H24eeixZA1+/z5QR5S9zLztd5NUHprOv0nIXefuZ2CVVQF8J5y76A2AAytLRNxqdYVO/Au9u71Hwa5OE0Oaf/y+lJZFPktK4NnmOdDKgwAHM1ebc/HynDQ2ZR9DeAOScO7mC2QpBIukPfLozb5DUKUgnuQODEsAL8V6JlTTk7xQeEpkxLUUMHrg0e5SgwnIoO2/234zyEJp9Pto04nOz6LNqxb7WyxIT1aTneJrUmS3/fu5fBEFgQybmZDSBnJxasYkIzS2o/R4Q4QtzQ95wwrrxZMcvuB+m1d6nN01V9NZSH6F6U=
draft: true
draft: false
file: rapport.pdf
provider: releases
skip_cleanup: true
Expand Down
152 changes: 152 additions & 0 deletions rapport_fr/codec.tex
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
\begin{lstlisting}[language=Python]
import json
import re

import distance
import matplotlib.pyplot as plt
import numpy as np
import pydeep
import tqdm
from scipy.spatial.distance import pdist, squareform
from sklearn.cluster import AffinityPropagation

# Matches the first non-whitespace character, i.e. the start of the next JSON value.
NOT_WHITESPACE = re.compile(r'[^\s]')


def decode_stacked(document, pos=0, decoder=json.JSONDecoder()):
    """Yield every JSON value from a string of concatenated ("stacked") JSON documents.

    :param document: string containing zero or more whitespace-separated JSON values
    :param pos: index at which decoding starts
    :param decoder: decoder whose ``raw_decode`` is used for incremental parsing
    :raises json.JSONDecodeError: if the text at the current position is not valid JSON
    """
    while True:
        match = NOT_WHITESPACE.search(document, pos)
        if not match:
            return
        pos = match.start()

        # raw_decode returns the decoded object plus the index just past it,
        # which lets us walk through several concatenated documents.
        # (The original wrapped this in a try/except that only re-raised the
        # same exception — dead code, removed.)
        obj, pos = decoder.raw_decode(document, pos)
        yield obj


class contract:
    """In-memory registry of parsed Ethereum contracts.

    The class-level lists accumulate data across every instance created,
    so the rest of the module can read them as parallel arrays
    (same index -> same contract).
    """

    # Parallel registries, appended to in the same order for every instance.
    address_list = []
    name_list = []
    hash_list = []
    contract_list = []

    def __init__(self, jsonObj):
        self.address = jsonObj['address']
        self.name = jsonObj['contract_name']
        self.bytecode = jsonObj['bytecode']
        # ssdeep fuzzy hash of the raw bytecode string.
        self.hash = pydeep.hash_buf(self.bytecode)

        # Record this instance in every class-level registry.
        contract.contract_list.append(self)
        for registry, value in ((contract.address_list, self.address),
                                (contract.name_list, self.name),
                                (contract.hash_list, self.hash)):
            registry.append(value)


def create_contracts(contracts_file):
    """Parse a file of stacked JSON contract records and register each one.

    Each decoded object becomes a ``contract`` instance, which registers
    itself in the class-level lists as a side effect of construction.

    :param contracts_file: path to the file holding concatenated JSON objects
    """
    with open(contracts_file, "r") as myfile:
        contracts = myfile.read()

    for obj in tqdm.tqdm(decode_stacked(contracts)):
        # The constructor records the instance in contract's class-level
        # registries, so the unused local binding (`new_c`) was removed.
        contract(obj)

    print("Contract bytecode hashes computed.")


def compute_similarity(X):
    """
    Compute similarity matrix with mean of 3 distances.
    Each per-metric distance matrix is also saved to ../data/ as CSV.
    :param X: List of contracts ssdeep hashes (one hash per row)
    :return: Similarity matrix (1 - mean of the three condensed distance matrices)
    """
    # (csv path, pairwise metric) for each of the three string distances.
    # nlevenshtein is normalized so the results can be meaningfully compared;
    # method=1 means the shortest alignment between the sequences is the factor.
    metrics = (
        ("../data/jaccard_matrix.csv",
         lambda x, y: distance.jaccard(x[0], y[0])),
        ("../data/sorensen_matrix.csv",
         lambda x, y: distance.sorensen(x[0], y[0])),
        ("../data/levenshtein_matrix.csv",
         lambda x, y: distance.nlevenshtein(x[0], y[0], method=1)),
    )

    condensed_matrices = []
    for path, metric in metrics:
        condensed = pdist(X, metric)
        np.savetxt(path, np.asarray(squareform(condensed)), delimiter=",")
        condensed_matrices.append(condensed)

    mean_matrix = 1 - np.mean(np.array(condensed_matrices), axis=0)
    np.savetxt("../data/similarity_matrix.csv", np.asarray(mean_matrix), delimiter=",")

    print("Similarity matrix computed.")
    return mean_matrix


def clusterize(X):
    """Cluster contracts with Affinity Propagation on a precomputed similarity matrix.

    Bug fix: the function previously ignored its ``X`` parameter and read the
    module-level ``similarity_matrix`` global; it now uses ``X`` consistently
    (the existing caller passes that same matrix, so results are unchanged).

    :param X: Similarity matrix
    :return: List of (contract name, contract address, ssdeep hash, cluster label)
    """

    # Choose preference values based on unique contract names: the first
    # occurrence of each (digit-stripped) name is encouraged to be an exemplar.
    names = [re.sub(r"\d+", "", name) for name in contract.name_list]
    (unique_name, index_name) = np.unique(names, return_index=True)
    print("Number of unique contract names: ", len(unique_name))

    preference = np.full(len(names), np.amin(X))
    for index in index_name:
        preference[index] = np.amax(X)

    af = AffinityPropagation(affinity="precomputed", max_iter=2000, convergence_iter=200, preference=preference,
                             damping=0.9)
    af.fit(X)
    cluster_centers_indices = af.cluster_centers_indices_
    num_of_clusters = len(cluster_centers_indices)
    labels = af.labels_

    print('Number of clusters: %d' % num_of_clusters)

    # One row per contract: name, address, hash, assigned cluster label.
    output = []
    for i in range(len(labels)):
        c = [contract.name_list[i], contract.address_list[i], contract.hash_list[i], labels[i]]
        output.append(c)

    np.savetxt("../data/contracts_clustering.csv", output, delimiter=",", fmt='%s')
    return output


def autolabel(rects, ax=None):
    """Attach a text label above each bar in *rects*, displaying its height.

    :param rects: bar container returned by ``Axes.bar``
    :param ax: axes to annotate; defaults to the current axes, so the existing
        call site (which relied on a module-level ``ax`` global) keeps working
        while the function also becomes usable on its own.
    """
    if ax is None:
        # Fall back to the current axes instead of a module-level global.
        ax = plt.gca()
    for rect in rects:
        height = rect.get_height()
        ax.annotate('{}'.format(height),
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')

if __name__ == '__main__':
    # Load stacked JSON records and register every contract (side effect:
    # fills the contract class-level lists).
    create_contracts('../data/contracts_list')

    # One ssdeep hash per row, as a (n, 1) column for pdist's custom metrics.
    X = np.array(contract.hash_list).reshape(-1, 1)

    similarity_matrix = compute_similarity(X)

    output = clusterize(similarity_matrix)

    # Plot cluster size / exemplar name for the nb_clusters largest clusters.
    nb_clusters = 10
    (cluster, index, freq) = np.unique([i[3] for i in output], return_index=True, return_counts=True)
    # Strip trailing digits added to de-duplicate names when collecting.
    name = [re.sub(r"\d+", "", output[i][0]) for i in index]
    address = [output[i][1] for i in index]
    # Sort clusters by size, biggest first.
    sort = sorted([(address[i], name[i], freq[i]) for i in range(len(name))], key=lambda c: c[2],
                  reverse=True)

    fig, ax = plt.subplots()
    bars = ax.bar(range(nb_clusters), [v[2] for v in sort[:nb_clusters]], label=[v[1] for v in sort[:nb_clusters]])
    autolabel(bars)
    plt.ylabel("Cluster size")
    plt.xlabel("Exemplar name")

    # Place tick labels slightly inside each bar and rotate for readability.
    xticks_pos = [0.65 * rect.get_width() + rect.get_xy()[0] for rect in bars]
    plt.xticks(xticks_pos, labels=[v[1] for v in sort[:nb_clusters]], ha='right', rotation=45)
    plt.show()
\end{lstlisting}
135 changes: 135 additions & 0 deletions rapport_fr/codes.tex
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
\begin{lstlisting}[language=Python]
import os
import signal
import sys

import requests
from bs4 import BeautifulSoup
from google.cloud import bigquery
from tqdm import tqdm

# Set to True by the SIGINT handler to request a graceful stop of the
# collection loop in __main__.
terminate = False

# File to save list of contracts addresses
# (empty string here — presumably a placeholder to fill in before running; verify).
CONTRACTS_FILE = ''

# Dir to save Solidity source code
# (empty string here — presumably a placeholder to fill in before running; verify).
CONTRACTS_DIR = ''

# Counters updated by get_delegate_code(): contracts with verified source
# code, and contracts whose source contains 'delegatecall'.
NB_SOURCE = 0
NB_DELEGATE = 0

def get_contracts_from_bigQuery():
    """
    Fetch contract addresses from the public BigQuery Ethereum dataset and
    write them, one per line, to CONTRACTS_FILE.
    """
    # CONTRACTS_FILE is only read here, so the original (redundant and
    # misleading) `global CONTRACTS_FILE` declaration was removed.
    client = bigquery.Client()

    # Contracts created from 2019-01-01 up to (but excluding) 2019-05-31,
    # skipping contracts with empty ("0x") bytecode.
    new_contracts_query = 'SELECT address ' \
                          'FROM `bigquery-public-data.crypto_ethereum.contracts` ' \
                          'WHERE block_timestamp >= "{}-{}-{} 00:00:00" ' \
                          'AND block_timestamp < "{}-{}-{} 00:00:00" ' \
                          'AND bytecode != "0x" ' \
                          'ORDER BY block_timestamp'.format('2019', '01', '01', '2019', '05', '31')

    query_job = client.query(new_contracts_query)
    results = query_job.result()
    print('Contracts fetched')
    with open(CONTRACTS_FILE, 'w') as file:
        for row in results:
            file.write(row.address + '\n')

def get_delegate_code(address):
    """
    Save Solidity contract source code if available at Etherscan and using
    delegatecall in CONTRACTS_DIR.

    Side effects: increments NB_SOURCE for every contract with verified
    source code, increments NB_DELEGATE when that source contains
    'delegatecall', and writes a .sol file for the latter.

    :param address: address of the contract
    """
    global NB_SOURCE
    global NB_DELEGATE

    url = "https://etherscan.io/address/%s#code" % address

    try:
        r = requests.get(url)
    except requests.exceptions.RequestException as e:
        print('Error: {} Contract: {} Url: {}\n'.format(e, address, url))
        return

    html = r.text
    soup = BeautifulSoup(html, 'html.parser')

    # No verified source code published for this address.
    # (was `== None`; `is None` is the correct identity test, and the
    # element lookup is now done once instead of twice)
    editor = soup.find(id="editor")
    if editor is None:
        return

    name = soup.find("span", class_="h6 font-weight-bold mb-0").contents[0]
    code = str(editor).replace('<pre class="js-sourcecopyarea editor" id="editor" style="margin-top: 5px;">', '').replace('</pre>', '')

    # Editor element present but empty: also treat as no source code.
    if len(code) <= 0:
        return
    NB_SOURCE += 1

    # Only keep contracts whose source actually uses delegatecall.
    # (original comment said the opposite of what this branch does)
    if 'delegatecall' in code:
        NB_DELEGATE += 1
    else:
        return

    bytecode = str(soup.find(id="verifiedbytecode2")).replace('<div id="verifiedbytecode2">', '').replace('</div>', '')
    # Sometimes no bytecode is returned, even if available: retry until the
    # element shows up. str(None) == "None", hence the string comparison.
    while bytecode == "None":
        try:
            r = requests.get(url)
        except requests.exceptions.RequestException as e:
            print('Error: {} Contract: {} Url: {}\n'.format(e, address, url))
            # NOTE(review): on a persistent network error this loops forever
            # (as the original did) — consider a retry limit.
            continue
        html = r.text

        soup = BeautifulSoup(html, 'html.parser')
        bytecode = str(soup.find(id="verifiedbytecode2")).replace('<div id="verifiedbytecode2">', '').replace(
            '</div>', '')

    # Use the contract name for the file when available, the address otherwise.
    fname = name if len(name) > 0 else address

    # Manage duplicate names by appending the first free numeric suffix.
    if os.path.isfile(CONTRACTS_DIR + fname + '.sol'):
        i = 0
        while os.path.isfile(CONTRACTS_DIR + fname + str(i) + '.sol'):
            i += 1
        fname += str(i)
    fname += ".sol"

    # Save .sol file with the address and bytecode embedded as comments.
    with open(CONTRACTS_DIR + fname, 'w') as of:
        of.write('//Contract address: ' + address + '\n//Bytecode: ' + bytecode + '\n' + code)
        of.flush()

def sigint_handler(signum, frame):
    """Signal handler: flag the main collection loop to stop gracefully.

    Registered for SIGINT; only records the request — the loop itself
    checks the module-level `terminate` flag and exits cleanly.
    """
    global terminate
    terminate = True


if __name__ == '__main__':
    contracts_count = 0
    # Allow Ctrl-C to stop the loop gracefully instead of killing the process.
    signal.signal(signal.SIGINT,sigint_handler)

    # Uncomment to (re)build the address list from BigQuery first.
    #get_contracts_from_bigQuery()

    # Process every address listed in CONTRACTS_FILE, one per line.
    with open(CONTRACTS_FILE,'r') as file:
        lines = file.readlines()
    for line in tqdm(lines):
        get_delegate_code(line.rstrip())
        contracts_count += 1

        # `terminate` is set by sigint_handler on SIGINT: report where we
        # stopped plus the running counters, then exit cleanly.
        if terminate:
            print('\nStopped collecting at line: {}'.format(line))
            print('Number of contracts processed: {}\nNumber of contracts with source code: {}\nNumber of contracts with delegatecall: {}'.format(contracts_count,NB_SOURCE,NB_DELEGATE))
            sys.exit(0)

    # Normal completion: final summary of the counters.
    print('\nNumber of contracts processed: {}\nNumber of contracts with source code: {}\nNumber of contracts with delegatecall: {}'.format(contracts_count, NB_SOURCE, NB_DELEGATE))
\end{lstlisting}
Binary file added rapport_fr/figures/calls_20_popular.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit 217e405

Please sign in to comment.