-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape_wikipedia.py
78 lines (62 loc) · 2.09 KB
/
scrape_wikipedia.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from bs4 import BeautifulSoup
import requests
import queue
import mysql.connector as connector
def toWikiLink(string):
    """Return the English Wikipedia URL for the article title *string*.

    Leading and trailing whitespace is removed first; every space that
    remains is turned into an underscore. No other escaping is applied.
    """
    trimmed = string.strip()
    slug = "_".join(trimmed.split(" "))
    return "".join(("https://en.wikipedia.org/wiki/", slug))
# Seconds to wait for Wikipedia before giving up on a single page; without a
# timeout one stalled connection would block the whole crawl indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

# Path prefix shared by every internal article link on a Wikipedia page.
WIKI_PATH_PREFIX = "/wiki/"


def _get_or_create_page_id(cursor, cnx, title):
    """Return the id of the ``page`` row for *title*, inserting it if absent.

    A newly inserted row is committed immediately.
    """
    # LIMIT 1 keeps the cursor free of unread rows even if the table already
    # holds duplicate titles from an earlier run.
    cursor.execute("SELECT id FROM page WHERE title = %s LIMIT 1", (title,))
    row = cursor.fetchone()
    if row:
        return row[0]
    cursor.execute("INSERT INTO page (title) VALUES (%s)", (title,))
    new_id = cursor.lastrowid
    cnx.commit()
    return new_id


def _linked_titles(anchors):
    """Return the distinct article titles referenced by *anchors*, in order.

    Only hrefs starting with ``/wiki/`` are kept, and those starting with
    ``/wiki/File`` are skipped. Any ``#fragment`` is dropped so that
    ``Foo#History`` and ``Foo`` are treated as the same page.
    """
    titles = []
    seen = set()
    for anchor in anchors:
        href = anchor.get("href", "")
        if not href.startswith(WIKI_PATH_PREFIX) or href.startswith("/wiki/File"):
            continue
        title = href[len(WIKI_PATH_PREFIX):].split("#", 1)[0]
        if not title or title in seen:
            continue
        seen.add(title)
        titles.append(title)
    return titles


def _crawl(cursor, cnx, start_title):
    """Breadth-first crawl of Wikipedia links starting at *start_title*.

    Every page encountered gets a row in ``page (id, title)`` and every
    link found on a scanned page gets a row in ``link (source, destination)``.
    Pages that cannot be fetched or parsed are reported and skipped.
    """
    start_id = _get_or_create_page_id(cursor, cnx, start_title)

    # Ids are enqueued at most once, so every page is scanned at most once.
    enqueued = {start_id}
    q = queue.Queue()
    q.put(start_id)

    scanned_count = 0
    while not q.empty():
        page_id = q.get()
        scanned_count += 1

        cursor.execute("SELECT title FROM page WHERE id = %s", (page_id,))
        title = cursor.fetchone()[0]
        print("Running (" + str(scanned_count) + "): " + title)

        try:
            page = requests.get(toWikiLink(title), timeout=REQUEST_TIMEOUT_SECONDS)
        except requests.RequestException as exc:
            # A single unreachable page must not abort the whole crawl.
            print("---------Error: page " + title + " cannot be accessed. " + str(exc))
            continue
        if page.status_code != 200:
            print("---------Error: page " + title + " cannot be accessed. Status code: " + str(page.status_code))
            continue

        soup = BeautifulSoup(page.content, 'html.parser')
        body = soup.find(id="bodyContent")
        if body is None:
            # find() returns None when the element is missing.
            print("---------Error: page " + title + " has no bodyContent element.")
            continue

        for link in _linked_titles(body.find_all('a')):
            print(" > Adding edge: " + link)
            new_id = _get_or_create_page_id(cursor, cnx, link)
            if new_id not in enqueued:
                q.put(new_id)
                enqueued.add(new_id)
            cursor.execute("INSERT INTO link (source, destination) VALUES (%s, %s)", (page_id, new_id,))
            cnx.commit()


start = "Hydrophobia_(disambiguation)"

cnx = connector.connect(user='root',
                        host='127.0.0.1',
                        database='separation')
try:
    cursor = cnx.cursor()
    try:
        _crawl(cursor, cnx, start)
    finally:
        cursor.close()
finally:
    # Always release the connection, even when the crawl raises.
    cnx.close()