import os
import random as rd
import re
import sys

DAMPING = 0.85
SAMPLES = 10000
THRESHOLD = 0.001  # a PageRank value must change by more than THRESHOLD for iteration to continue


def main():
    if len(sys.argv) != 2:
        sys.exit("Usage: python pagerank.py corpus")
    corpus = crawl(sys.argv[1])
    ranks = sample_pagerank(corpus, DAMPING, SAMPLES)
    print(f"PageRank Results from Sampling (n = {SAMPLES})")
    for page in sorted(ranks):
        print(f" {page}: {ranks[page]:.4f}")
    ranks = iterate_pagerank(corpus, DAMPING)
    print("PageRank Results from Iteration")
    for page in sorted(ranks):
        print(f" {page}: {ranks[page]:.4f}")


def crawl(directory):
    """
    Parse a directory of HTML pages and check for links to other pages.
    Return a dictionary where each key is a page, and values are
    a set of all other pages in the corpus that are linked to by the page.
    """
    pages = dict()

    # Extract all links from HTML files
    for filename in os.listdir(directory):
        if not filename.endswith(".html"):
            continue
        with open(os.path.join(directory, filename)) as f:
            contents = f.read()
            links = re.findall(r"<a\s+(?:[^>]*?)href=\"([^\"]*)\"", contents)
            pages[filename] = set(links) - {filename}

    # Only include links to other pages in the corpus
    for filename in pages:
        pages[filename] = set(link for link in pages[filename] if link in pages)

    return pages
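
# A sketch of crawl's output for a hypothetical directory "corpus0" containing
# 1.html (which links to 2.html) and 2.html (which links to 1.html):
#   crawl("corpus0")  =>  {"1.html": {"2.html"}, "2.html": {"1.html"}}
# Links pointing outside the corpus, and a page's links to itself, are dropped.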


def transition_model(corpus, page, damping_factor):
    """
    Return a probability distribution over which page to visit next,
    given a current page.
    With probability `damping_factor`, choose a link at random
    linked to by `page`. With probability `1 - damping_factor`, choose
    a page at random from all pages in the corpus.
    """
    # Case 2: every page starts with the baseline (1 - d) / N probability
    probability = {p: (1 - damping_factor) / len(corpus) for p in corpus}
    neighbors = corpus[page]

    # If page has no outgoing links, treat it as linking to all N pages with equal probability
    if not neighbors:
        neighbors = corpus.keys()

    # Case 1: spread the damping factor evenly across the outgoing links
    for neighbor in neighbors:
        probability[neighbor] += damping_factor / len(neighbors)

    return probability
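
# Worked example, reusing the hypothetical two-page corpus from above:
#   transition_model({"1.html": {"2.html"}, "2.html": {"1.html"}}, "1.html", 0.85)
# Each page gets the baseline (1 - 0.85) / 2 = 0.075, and 1.html's single
# outgoing link adds 0.85 to 2.html:
#   => {"1.html": 0.075, "2.html": 0.925}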


def normalize(sample):
    """
    Return the given distribution normalized in place
    (i.e., scaled so its values sum to 1, with relative proportions unchanged).
    """
    total = sum(sample.values())
    for key in sample.keys():
        sample[key] /= total
    return sample
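
# For example, raw visit counts are scaled down to proportions summing to 1:
#   normalize({"1.html": 5200, "2.html": 4800})  =>  {"1.html": 0.52, "2.html": 0.48}
# Note that the input dictionary itself is mutated and returned.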


def sample_pagerank(corpus, damping_factor, n):
    """
    Return PageRank values for each page by sampling `n` pages
    according to the transition model, starting with a page at random.
    Return a dictionary where keys are page names, and values are
    their estimated PageRank value (a value between 0 and 1). All
    PageRank values should sum to 1.
    """
    pages = corpus.keys()
    sample = {page: 0 for page in pages}
    for i in range(n):
        if i == 0:
            # Start sampling from a random page
            surfer = rd.choice(list(pages))
        else:
            # Move according to the transition model of the current page
            model = transition_model(corpus, surfer, damping_factor)
            surfer = rd.choices(list(model.keys()), list(model.values()), k=1)[0]
        sample[surfer] += 1
    return normalize(sample)
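
# On the hypothetical two-page corpus, the surfer mostly alternates between the
# pages, so both estimates approach 0.5; a run of sample_pagerank(corpus, 0.85,
# 10000) might return roughly {"1.html": 0.501, "2.html": 0.499}, with the
# exact values varying from run to run.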


def links_to(corpus, page):
    """
    Return a list of pages that link to `page`. A page with no
    outgoing links is treated as linking to every page in the corpus.
    """
    links = []
    for source, targets in corpus.items():
        if not targets or page in targets:
            links.append(source)
    return links
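
# Example with a hypothetical dangling page 3.html added to the corpus:
#   links_to({"1.html": {"2.html"}, "2.html": {"1.html"}, "3.html": set()}, "1.html")
#   => ["2.html", "3.html"]
# 2.html links to 1.html explicitly; 3.html has no links, so it counts as
# linking to every page.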


def calculate(pageRank, corpus, page, damping_factor):
    """
    Calculate pageRank[page] with one step of the iterative formula:
    PR(p) = (1 - d) / N + d * sum of PR(i) / NumLinks(i) over all pages i that link to p.
    """
    N = len(corpus)
    sigma = 0
    for link in links_to(corpus, page):
        # If the source page has no links, assume it links to all N pages
        numLinks = N if not corpus[link] else len(corpus[link])
        sigma += pageRank[link] / numLinks
    return (1 - damping_factor) / N + damping_factor * sigma
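
# Worked step for the three-page corpus above, with d = 0.85 and uniform
# starting ranks of 1/3: the update for 1.html sums contributions from 2.html
# (rank 1/3 over 1 link) and the dangling 3.html (rank 1/3 over N = 3 pages):
#   (1 - 0.85) / 3 + 0.85 * (1/3 + 1/9)  =  0.05 + 0.85 * 0.4444...  ≈  0.4278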


def has_changes(value1, value2):
    """
    Return True if the two values differ by more than THRESHOLD.
    """
    return abs(value1 - value2) > THRESHOLD


def iterate_pagerank(corpus, damping_factor):
    """
    Return PageRank values for each page by iteratively updating
    PageRank values until convergence.
    Return a dictionary where keys are page names, and values are
    their estimated PageRank value (a value between 0 and 1). All
    PageRank values should sum to 1.
    """
    pages = corpus.keys()
    # Start with an equal rank of 1 / N for every page
    pageRank = {page: 1 / len(pages) for page in pages}
    while True:
        prevRank = pageRank.copy()
        for page in pages:
            pageRank[page] = calculate(prevRank, corpus, page, damping_factor)
        # Repeat while at least one PageRank value changed by more than THRESHOLD
        if any(has_changes(prevRank[page], pageRank[page]) for page in pages):
            continue
        return pageRank
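
# On the symmetric two-page corpus, iteration converges to
# {"1.html": 0.5, "2.html": 0.5}. Because every page's rank is fully
# redistributed each step (dangling pages spread theirs over all N pages),
# the total stays at 1, and the loop stops once no rank moves by more than
# THRESHOLD.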


if __name__ == "__main__":
    main()