-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
77e3222
commit 217e405
Showing
4 changed files
with
288 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,152 @@ | ||
\begin{lstlisting}[language=Python] | ||
import json | ||
import re | ||
|
||
import distance | ||
import matplotlib.pyplot as plt | ||
import numpy as np | ||
import pydeep | ||
import tqdm | ||
from scipy.spatial.distance import pdist, squareform | ||
from sklearn.cluster import AffinityPropagation | ||
|
||
# Matches the first non-whitespace character; used to skip padding between
# stacked JSON documents.
NOT_WHITESPACE = re.compile(r'[^\s]')


def decode_stacked(document, pos=0, decoder=json.JSONDecoder()):
    """Yield each JSON value from a string of concatenated ("stacked") JSON documents.

    :param document: string containing zero or more whitespace-separated JSON values
    :param pos: index at which scanning starts
    :param decoder: JSON decoder used for raw decoding (stateless, safe to share)
    :raises json.JSONDecodeError: if a non-whitespace region is not valid JSON
    """
    while True:
        match = NOT_WHITESPACE.search(document, pos)
        if not match:
            return
        pos = match.start()
        # raw_decode returns the decoded value plus the index just past it,
        # letting us resume scanning for the next stacked document.
        # (The original wrapped this in a try/except that only re-raised,
        # which is a no-op and has been removed.)
        obj, pos = decoder.raw_decode(document, pos)
        yield obj
|
||
|
||
class contract:
    """One decoded contract record, self-registering into class-level lists.

    Every instantiation appends the contract (and its address, name, and
    fuzzy hash) to the shared class-level registries so the whole
    collection can later be read back by index.
    """

    # Shared registries, populated as contracts are constructed.
    address_list = []
    name_list = []
    hash_list = []
    contract_list = []

    def __init__(self, jsonObj):
        """Extract the relevant fields from *jsonObj* and register this instance.

        :param jsonObj: decoded JSON object with 'address', 'contract_name'
                        and 'bytecode' keys
        """
        self.address = jsonObj['address']
        self.name = jsonObj['contract_name']
        self.bytecode = jsonObj['bytecode']
        # ssdeep fuzzy hash of the bytecode, computed via pydeep.
        self.hash = pydeep.hash_buf(self.bytecode)
        contract.contract_list.append(self)
        for registry, value in ((contract.address_list, self.address),
                                (contract.name_list, self.name),
                                (contract.hash_list, self.hash)):
            registry.append(value)
|
||
|
||
def create_contracts(contracts_file):
    """Read a file of stacked JSON contract records and register each one.

    :param contracts_file: path to a file containing concatenated JSON objects
    """
    with open(contracts_file, "r") as myfile:
        contracts = myfile.read()

    # Instantiating `contract` registers it in the class-level lists;
    # the instance itself need not be kept (the original bound it to an
    # unused local `new_c`).
    for obj in tqdm.tqdm(decode_stacked(contracts)):
        contract(obj)

    print("Contract bytecode hashes computed.")
|
||
|
||
def compute_similarity(X, out_dir="../data"):
    """
    Compute similarity matrix with mean of 3 distances.

    Three pairwise distance matrices (Jaccard, Sorensen, and normalized
    Levenshtein) are computed over the hashes, each saved as a square CSV
    under *out_dir*; similarity is 1 minus their element-wise mean.

    :param X: List of contracts ssdeep hashes, shape (n, 1)
    :param out_dir: directory for the CSV output files (default "../data")
    :return: Similarity matrix, in condensed (pdist-style 1-D) form
    """
    distance_fns = [
        ('jaccard', lambda x, y: distance.jaccard(x[0], y[0])),
        ('sorensen', lambda x, y: distance.sorensen(x[0], y[0])),
        # normalized, so that the results can be meaningfully compared;
        # method=1 means the shortest alignment between the sequences is taken as factor
        ('levenshtein', lambda x, y: distance.nlevenshtein(x[0], y[0], method=1)),
    ]

    condensed_matrices = []
    for metric_name, fn in distance_fns:
        matrix = pdist(X, fn)
        np.savetxt("{}/{}_matrix.csv".format(out_dir, metric_name),
                   np.asarray(squareform(matrix)), delimiter=",")
        condensed_matrices.append(matrix)

    # Similarity = 1 - mean distance; saved and returned in condensed form.
    # NOTE(review): the downstream AffinityPropagation(affinity="precomputed")
    # call expects an (n, n) square matrix — confirm callers apply squareform
    # to this 1-D return value if needed.
    mean_matrix = 1 - np.mean(np.array(condensed_matrices), axis=0)
    np.savetxt("{}/similarity_matrix.csv".format(out_dir), np.asarray(mean_matrix), delimiter=",")

    print("Similarity matrix computed.")
    return mean_matrix
|
||
|
||
def clusterize(X):
    """
    Cluster contracts via Affinity Propagation on a precomputed similarity matrix.

    :param X: Similarity matrix
    :return: List of (contract name, contract address, ssdeep hash, cluster label)
    """

    # choose preference value based on unique contract names
    # (digits are stripped so e.g. "Token1"/"Token2" share a base name)
    names = [re.sub(r"\d+", "", name) for name in contract.name_list]
    (unique_name, index_name) = np.unique(names, return_index=True)
    print("Number of unique contract names: ", len(unique_name))

    # The first occurrence of each unique name gets the maximum preference
    # (more likely to become an exemplar); every other row gets the minimum.
    # BUGFIX: use the parameter X here — the original read the module-level
    # `similarity_matrix` global, which only worked because __main__ happened
    # to bind the argument under that name.
    preference = np.full(len(names), np.amin(X))
    for index in index_name:
        preference[index] = np.amax(X)

    af = AffinityPropagation(affinity="precomputed", max_iter=2000, convergence_iter=200, preference=preference,
                             damping=0.9)
    af.fit(X)
    cluster_centers_indices = af.cluster_centers_indices_
    num_of_clusters = len(cluster_centers_indices)
    labels = af.labels_

    print('Number of clusters: %d' % num_of_clusters)

    # One output row per contract: name, address, ssdeep hash, cluster label.
    output = []
    for i in range(len(labels)):
        c = [contract.name_list[i], contract.address_list[i], contract.hash_list[i], labels[i]]
        output.append(c)

    np.savetxt("../data/contracts_clustering.csv", output, delimiter=",", fmt='%s')
    return output
|
||
|
||
def autolabel(rects):
    """Attach a text label above each bar in *rects*, displaying its height.

    :param rects: iterable of bar artists (e.g. the result of ``ax.bar``)
    """
    for rect in rects:
        height = rect.get_height()
        # BUGFIX: annotate on the bar's own Axes (rect.axes) instead of
        # relying on a module-level global `ax` existing at call time.
        rect.axes.annotate('{}'.format(height),
                           xy=(rect.get_x() + rect.get_width() / 2, height),
                           xytext=(0, 3),  # 3 points vertical offset
                           textcoords="offset points",
                           ha='center', va='bottom')
|
||
if __name__ == '__main__':
    # Parse the contract dump and compute an ssdeep hash for every contract.
    create_contracts('../data/contracts_list')

    # Column vector of hashes, shape (n, 1): pdist callbacks receive
    # 1-element rows, hence the x[0]/y[0] indexing in compute_similarity.
    X = np.array(contract.hash_list).reshape(-1, 1)

    similarity_matrix = compute_similarity(X)

    output = clusterize(similarity_matrix)

    # Plot cluster size/exemplar name
    nb_clusters = 10
    # Unique cluster labels, a representative row index for each, and the
    # cluster sizes (frequency counts).
    (cluster, index, freq) = np.unique([i[3] for i in output], return_index=True, return_counts=True)
    name = [re.sub(r"\d+", "", output[i][0]) for i in index]
    address = [output[i][1] for i in index]
    # Sort clusters by size, largest first, keeping only the top nb_clusters below.
    sort = sorted([(address[i], name[i], freq[i]) for i in range(len(name))], key=lambda c: c[2],
                  reverse=True)

    fig, ax = plt.subplots()
    bars = ax.bar(range(nb_clusters), [v[2] for v in sort[:nb_clusters]], label=[v[1] for v in sort[:nb_clusters]])
    autolabel(bars)
    plt.ylabel("Cluster size")
    plt.xlabel("Exemplar name")

    # Place tick labels slightly past each bar's center, rotated for readability.
    xticks_pos = [0.65 * rect.get_width() + rect.get_xy()[0] for rect in bars]
    plt.xticks(xticks_pos, labels=[v[1] for v in sort[:nb_clusters]], ha='right', rotation=45)
    plt.show()
\end{lstlisting} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,135 @@ | ||
\begin{lstlisting}[language=Python] | ||
import os | ||
import signal | ||
import sys | ||
|
||
import requests | ||
from bs4 import BeautifulSoup | ||
from google.cloud import bigquery | ||
from tqdm import tqdm | ||
|
||
# Set to True by the SIGINT handler; the main loop checks it after each
# contract so a Ctrl+C stops collection cleanly.
terminate = False

# file to save list of contracts addresses (one address per line)
CONTRACTS_FILE = ''

# dir to save Solidity source code (.sol files)
CONTRACTS_DIR = ''

# Counters updated by get_delegate_code: contracts with verified source
# code, and contracts whose source contains 'delegatecall'.
NB_SOURCE = 0
NB_DELEGATE = 0
|
||
def get_contracts_from_bigQuery():
    """
    Fetch contracts from BigQuery Ethereum dataset.

    Writes one contract address per line to CONTRACTS_FILE.
    (The original declared `global CONTRACTS_FILE`, which is unnecessary
    for a name that is only read — removed.)
    """
    client = bigquery.Client()

    # SQL query to request all contracts from 01/01/2019 to 31/05/2019.
    # NOTE(review): the upper bound is exclusive (`<`), so May 31 itself is
    # excluded — confirm whether the end date should be "2019-06-01".
    new_contracts_query = 'SELECT address ' \
                          'FROM `bigquery-public-data.crypto_ethereum.contracts` ' \
                          'WHERE block_timestamp >= "{}-{}-{} 00:00:00" ' \
                          'AND block_timestamp < "{}-{}-{} 00:00:00" ' \
                          'AND bytecode != "0x" ' \
                          'ORDER BY block_timestamp'.format('2019', '01', '01', '2019', '05', '31')

    query_job = client.query(new_contracts_query)
    results = query_job.result()
    print('Contracts fetched')
    with open(CONTRACTS_FILE, 'w') as file:
        for row in results:
            file.write(row.address + '\n')
|
||
def get_delegate_code(address):
    """
    Save Solidity contract source code if available at Etherscan and using delegatecall in CONTRACTS_DIR.

    Increments NB_SOURCE for every contract with verified source code, and
    NB_DELEGATE for those whose source contains 'delegatecall'.

    :param address: address of the contract
    """
    global NB_SOURCE
    global NB_DELEGATE

    url = "https://etherscan.io/address/%s#code" % address

    try:
        r = requests.get(url)
    except requests.exceptions.RequestException as e:
        print('Error: {} Contract: {} Url: {}\n'.format(e, address, url))
        return

    html = r.text
    soup = BeautifulSoup(html, 'html.parser')

    # No verified source code on the page.
    # (BUGFIX: `is None`, not `== None`, for the None check.)
    if soup.find(id="editor") is None:
        return

    name = soup.find("span", class_="h6 font-weight-bold mb-0").contents[0]
    code = str(soup.find(id="editor")).replace('<pre class="js-sourcecopyarea editor" id="editor" style="margin-top: 5px;">', '').replace('</pre>', '')

    # Editor element present but empty: treat as no source code.
    if len(code) <= 0:
        return
    NB_SOURCE += 1

    # Keep only contracts whose source uses delegatecall; skip the rest.
    # (The original comment here said "No delegatecall in code", which
    # described the else branch, not the condition.)
    if 'delegatecall' in code:
        NB_DELEGATE += 1
    else:
        return

    bytecode = str(soup.find(id="verifiedbytecode2")).replace('<div id="verifiedbytecode2">', '').replace('</div>', '')
    # Sometimes no bytecode is returned, even if available — refetch until it is.
    # NOTE(review): this retries forever if the request keeps failing or the
    # bytecode never appears; consider a bounded retry count.
    while bytecode == "None":
        try:
            r = requests.get(url)
        except requests.exceptions.RequestException as e:
            print('Error: {} Contract: {} Url: {}\n'.format(e, address, url))
            continue
        html = r.text

        soup = BeautifulSoup(html, 'html.parser')
        bytecode = str(soup.find(id="verifiedbytecode2")).replace('<div id="verifiedbytecode2">', '').replace(
            '</div>', '')

    # Use the contract name as the base filename if available, else the address.
    fname = name if len(name) > 0 else address

    # Manage duplicate names by appending the first free numeric suffix.
    if os.path.isfile(CONTRACTS_DIR + fname + '.sol'):
        i = 0
        while os.path.isfile(CONTRACTS_DIR + fname + str(i) + '.sol'):
            i += 1
        fname += str(i)
    fname += ".sol"

    # Save .sol file with the address and bytecode prepended as comments.
    with open(CONTRACTS_DIR + fname, 'w') as of:
        of.write('//Contract address: ' + address + '\n//Bytecode: ' + bytecode + '\n' + code)
        of.flush()
|
||
def sigint_handler(signum, frame):
    """Signal handler for SIGINT (Ctrl+C): request a graceful stop.

    Sets the module-level `terminate` flag, which the main loop checks
    after each contract so the run can finish cleanly.

    :param signum: signal number (unused)
    :param frame: current stack frame (unused)
    """
    global terminate
    terminate = True
|
||
|
||
if __name__ == '__main__':
    contracts_count = 0
    # Install the Ctrl+C handler so a long crawl sets `terminate` instead
    # of being killed mid-write.
    signal.signal(signal.SIGINT,sigint_handler)

    # Uncomment to (re)build the address list from BigQuery first.
    #get_contracts_from_bigQuery()

    # Process every address in CONTRACTS_FILE, one per line.
    with open(CONTRACTS_FILE,'r') as file:
        lines = file.readlines()
        for line in tqdm(lines):
            get_delegate_code(line.rstrip())
            contracts_count += 1

            # Stop requested via SIGINT: report progress and exit cleanly.
            if terminate:
                print('\nStopped collecting at line: {}'.format(line))
                print('Number of contracts processed: {}\nNumber of contracts with source code: {}\nNumber of contracts with delegatecall: {}'.format(contracts_count,NB_SOURCE,NB_DELEGATE))
                sys.exit(0)

    print('\nNumber of contracts processed: {}\nNumber of contracts with source code: {}\nNumber of contracts with delegatecall: {}'.format(contracts_count, NB_SOURCE, NB_DELEGATE))
\end{lstlisting} |
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.