-
Notifications
You must be signed in to change notification settings - Fork 1
/
CB3.py
141 lines (113 loc) · 4.33 KB
/
CB3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#!/usr/bin/env python
# coding: utf-8
# In[33]:
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, NavigableString, Tag
import requests
import re
import calendar
from sqlalchemy import create_engine, Column, String, Integer, MetaData
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
# In[34]:
# Declarative base class that the ORM models in this file inherit from.
Base = declarative_base()

# Engine for the SQLite database file ``events.db``.
engine = create_engine("sqlite:///events.db")

# Session factory bound to that engine, and the single module-level session
# the rest of this script works through.
Session = sessionmaker(bind=engine)
session = Session()
class Event(Base):
    """ORM model for one scraped calendar event, stored in the ``event`` table."""

    __tablename__ = 'event'

    # Integer primary key.
    id = Column('id', Integer, primary_key=True)
    # Integer code of the district the event was scraped for.
    district = Column(Integer)
    title = Column(String(100))
    # Date and time are stored as plain strings, not as date/time types.
    date = Column(String(50))
    details = Column(String(1000))
    time = Column(String(50))

    def __init__(self, title, date, details, time, district):
        """Set every column except the primary key ``id``."""
        self.district = district
        self.title, self.date, self.time = title, date, time
        self.details = details

    def __repr__(self):
        """Return a header line with title, date and time, then the details on the next line."""
        return f'{self.title} - {self.date}: {self.time}\n {self.details}'
# Create the tables for every model registered on Base (here: ``event``) in the
# database behind ``engine``.
Base.metadata.create_all(bind=engine)
# In[35]:
# Compiled once at import time instead of on every scanned line.
# In _DATE_RE, group 2 is the day of the month; group 1 only exists to
# alternate "Nov"/"Dec".
# NOTE(review): a zero-padded day such as "05" is captured as "0" by this
# pattern -- kept as-is to preserve the stored format; confirm against the page.
_DATE_RE = re.compile(r"\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|(Nov|Dec)(?:ember)?) (([1-3]\d)|(\d))")
# Clock times written without a space before AM/PM, e.g. "6:30pm".
_TIME_RE = re.compile(r"((1[0-2]|0?\d):(\d\d)([AaPp][Mm]))")


def _split_into_blocks(body):
    """Group the child nodes of *body* into lists separated by ``<hr>`` tags.

    Every ``<hr>`` closes the block collected so far. An ``<hr>`` met while
    the current block is still empty is stored as that block's first node
    instead. Nodes that follow the final ``<hr>`` are never closed and are
    therefore not returned.
    NOTE(review): dropping the trailing nodes matches the original behaviour;
    confirm the page has no event after its last ``<hr>``.
    """
    blocks = []
    current = []
    for node in body:
        if node.name == 'hr':
            if current:
                blocks.append(current)
                current = []
            else:
                current.append(node)
            continue
        current.append(node)
    return blocks


def _parse_block(block, month_name, month_number):
    """Extract one event dict from a block of nodes, or return ``None``.

    Each node is rendered to HTML and split on ``<br/>``. The first fragment
    containing ``<b>`` becomes the title. The first fragment that mentions
    *month_name* and contains a recognisable "<month> <day>" supplies the date
    (as ``"<month_number>/<day>"``) and the time. Every other fragment is
    appended to the details. ``None`` is returned when no non-empty title or
    no date was found.
    """
    title = None
    date = None
    time = ''
    details = []
    for node in block:
        for fragment in str(node).split('<br/>'):
            text = BeautifulSoup(fragment, features='lxml').text
            if title is None and '<b>' in fragment:
                title = text
                continue
            if date is None and month_name in fragment:
                date_match = _DATE_RE.search(text)
                if date_match:
                    time_match = _TIME_RE.search(text)
                    date = str(month_number) + '/' + date_match.group(2)
                    time = time_match.group(0) if time_match else ''
                    continue
                # The month is mentioned but no day follows it: keep looking
                # for the real date line and treat this one as details rather
                # than blanking the date and losing the whole event.
            details.append(" " + text.replace('\xa0', ' '))
    if not (title and date):
        return None
    return {
        'title': title,
        'date': date,
        'time': time,
        'details': ''.join(details),
    }


def CB3(url, session, monthInteger, district=103, timeout=30):
    """Scrape the calendar page at *url* and replace the stored events of a district.

    The second element with class ``bodytext`` is split into ``<hr>``-delimited
    blocks; each block that yields both a title and a date for the requested
    month becomes one ``Event`` row. Existing rows of *district* are deleted
    and the new rows inserted in a single transaction.

    Args:
        url: Address of the calendar page.
        session: SQLAlchemy session used for the delete/insert.
        monthInteger: Month number, 1-12; its English name is what date lines
            are recognised by.
        district: District code written to every row and used to select the
            rows to replace. Defaults to 103, the value previously hard-coded.
        timeout: Seconds to wait for the HTTP request before giving up.

    Raises:
        ValueError: If *monthInteger* is outside 1-12.
        requests.RequestException: On connection failure, timeout, or an HTTP
            error status.
        IndexError: If the page has fewer than two ``bodytext`` elements.
    """
    # calendar.month_name[0] is '' (which is "in" every string) and negative
    # indexes wrap around, so reject anything outside the real month range.
    if not 1 <= monthInteger <= 12:
        raise ValueError(f"monthInteger must be between 1 and 12, got {monthInteger!r}")
    month_name = calendar.month_name[monthInteger]

    # Fail fast on a hung server or an error page instead of parsing garbage.
    page = requests.get(url, timeout=timeout)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, features='lxml')

    # The event listing lives in the second 'bodytext' element of the page.
    sections = soup.find_all(class_='bodytext')
    if len(sections) < 2:
        raise IndexError(
            f"expected at least 2 'bodytext' elements at {url}, found {len(sections)}"
        )
    body = sections[1]

    events = []
    for block in _split_into_blocks(body):
        event = _parse_block(block, month_name, monthInteger)
        if event is not None:
            events.append(event)

    # Replace the district's rows atomically: if the insert fails, the delete
    # is rolled back too, so the previous data is not lost.
    try:
        session.query(Event).filter(Event.district == district).delete()
        session.add_all(
            Event(title=event['title'], date=event['date'],
                  details=event['details'], time=event['time'],
                  district=district)
            for event in events
        )
        session.commit()
    except Exception:
        session.rollback()
        raise
# In[36]:
# Scrape the calendar page for month 2 and store the results through the
# module-level session.
CB3(
    'https://www1.nyc.gov/html/mancb3/html/calendar/calendar.shtml',
    session,
    monthInteger=2,
)
# In[37]:
# Print every stored event of district 103, each followed by a separator line.
separator = "\n-------------------------------"
stored_events = session.query(Event).filter(Event.district == 103)
for row in stored_events:
    print(row, separator)