This repository has been archived by the owner on Oct 4, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 16
/
pagerank_test.py
67 lines (42 loc) · 1.68 KB
/
pagerank_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
"""
Acceptance tests for pagerank.py
Make sure that this file is in the same directory as pagerank.py!
'Why do we fall sir? So that we can learn to pick ourselves up.'
- Batman Begins (2005)
"""
import random as rd
import pytest as pt
from pagerank import DAMPING, crawl, iterate_pagerank, sample_pagerank
# Absolute error tolerance (±0.001) used when comparing sample and iterate results
TOLERANCE = 0.001
# Passed as ``n`` to sample_pagerank; more samples => better result
SAMPLES = 1_000_000
# Result of crawling "corpus0", computed once at import time and reused by the
# corpus0 tests below
corpus0 = crawl("corpus0")
def test_crawl0():
    """Check that crawling "corpus0" produced exactly four entries."""
    expected_pages = 4
    page_count = len(corpus0)
    assert page_count == expected_pages
def test_iterate0():
    """Check ``iterate_pagerank`` on ``corpus0`` against hard-coded reference ranks.

    The reference values are given to four decimal places; the comparison is
    done by ``compare``, which applies the module's absolute TOLERANCE.
    """
    expected = {"1.html": 0.2202, "2.html": 0.4289, "3.html": 0.2202, "4.html": 0.1307}
    iterate = iterate_pagerank(corpus0, damping_factor=DAMPING)
    # Check the page set explicitly so that an empty or partial result cannot
    # pass vacuously and an unexpected extra page is reported.
    assert set(iterate) == set(expected)
    compare(iterate, expected)
@pt.mark.parametrize("execution_number", range(10))
def test_sample_vs_iterate(execution_number):
    """Run the randomized sample-vs-iterate check.

    The parametrization over ``range(10)`` repeats the check ten times;
    ``execution_number`` itself is not used by the body.
    """
    outcome = run_sample_vs_iterate()
    return outcome
# Helper functions
def checksum(probability):
    """Assert that the values of ``probability`` add up to 1 within TOLERANCE."""
    total = sum(probability.values())
    assert total == pt.approx(1, abs=TOLERANCE)
def run_sample_vs_iterate():
    """Check that sampling and iteration agree on a randomly generated corpus.

    Generates a random corpus, ranks it with both ``sample_pagerank`` and
    ``iterate_pagerank``, verifies that each result sums to 1, and then
    compares the two results page by page.
    """
    corpus = generate_random_data()[0]  # the randomly chosen page is not needed
    by_sampling = sample_pagerank(corpus, damping_factor=DAMPING, n=SAMPLES)
    by_iteration = iterate_pagerank(corpus, damping_factor=DAMPING)
    for distribution in (by_sampling, by_iteration):
        checksum(distribution)
    return compare(by_sampling, by_iteration)
def compare(prob1, prob2):
    """Assert that two page -> probability mappings agree within TOLERANCE.

    Args:
        prob1: Mapping of page name to probability.
        prob2: Mapping of page name to probability, checked against ``prob1``.

    Raises:
        AssertionError: if the mappings do not cover the same pages, or if
            the values for any page differ by more than TOLERANCE.
    """
    # Iterating prob1 alone would ignore pages present only in prob2 and
    # would let an empty prob1 match anything, so compare the page sets first.
    assert set(prob1) == set(prob2)
    for page, value in prob1.items():
        assert value == pt.approx(prob2[page], abs=TOLERANCE)
def generate_random_data(max_pages=10, rng=None):
    """Build a random corpus of interlinked pages for randomized tests.

    Args:
        max_pages: Inclusive upper bound on the number of pages generated;
            at least one page is always produced. Defaults to 10.
        rng: Optional object providing ``randint``, ``choice`` and
            ``choices`` (for example a ``random.Random`` instance). When
            omitted, the module-level ``random`` functions are used, so
            seeding ``random`` still controls the output.

    Returns:
        A ``(corpus, page)`` tuple. ``corpus`` maps every page name
        (``"0.html"``, ``"1.html"``, ...) to the set of other pages it links
        to; the set may be empty and never contains the page itself.
        ``page`` is one page name picked at random from the corpus.

    Raises:
        ValueError: if ``max_pages`` is less than 1.
    """
    if max_pages < 1:
        raise ValueError("max_pages must be at least 1")
    source = rd if rng is None else rng
    links = [f"{i}.html" for i in range(source.randint(1, max_pages))]
    page = source.choice(links)
    corpus = {}
    for link in links:
        # Draw targets with replacement, then dedupe and drop the self-link.
        link_count = source.randint(0, len(links))
        corpus[link] = set(source.choices(links, k=link_count)) - {link}
    return corpus, page