#!/usr/bin/env python3
#
# Scrape the FEC site for all Press Releases.
# Writes each PR to a unique .json file with meta+content.
#
# Usage:
#   python fec-press-release-scraper [href ...]
#
# With no arguments, scrapes the master release list; with one or more
# hrefs (relative to the press site), scrapes just those pages and prints
# their JSON to stdout.
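#
# Examples (the href below is illustrative, relative to
# http://www.fec.gov/press/):
#   DEBUG=1 python fec-press-release-scraper   # scrape everything, with debug output
#   python fec-press-release-scraper some_release.shtml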
from lxml import html
from lxml.etree import XPath
from lxml.etree import tostring
import hashlib
import requests
import json
import os
import sys
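
# all scraping is rooted at the FEC press office site; the release list is
# the master index page that links to every individual release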
base_url = 'http://www.fec.gov/press/'
release_list = 'news_releases.shtml'
###############################################################
# methods
def debug_mode():
    return os.environ.get('DEBUG')

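# each release is cached (html/) and written (json/) under the SHA-1 of its
# href, which yields stable, filesystem-safe file names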
def shasum_for_href(href):
    return hashlib.sha1(href.encode()).hexdigest()

def fetch_press_release(href):
    # create html cache dir if necessary
    if not os.path.isdir('./html'):
        os.mkdir('./html')
    html_cache_file = './html/' + shasum_for_href(href) + '.html'
    if os.path.isfile(html_cache_file):
        html_buf = open(html_cache_file, 'r').read()
    else:
        page = requests.get(base_url + href)
        # some releases are PDF-only; substitute a stub page that links to
        # the PDF. (substring check, since the header often carries a
        # charset suffix like 'text/html; charset=...')
        if 'text/html' not in page.headers.get('content-type', ''):
            html_buf = "<html><body><a href='{0}' class='pdf'>PDF</a></body></html>".format(href).encode('latin1')
        else:
            html_buf = page.content
        with open(html_cache_file, 'w') as cached_html:
            cached_html.write(html_buf.decode('latin1'))
    tree = html.fromstring(html_buf)
    pr_els = tree.xpath('//div[@class="press_release_content"]')
    if not pr_els:
        pr_els = tree.xpath('//div[@id="fec_mainContent"]')
    if not pr_els:
        pr_els = tree.xpath('//div[@id="fec_press_content"]')
    # this is a last gasp. there are likely other recognizable patterns
    # that should be matched first, but that haven't yet been found.
    if not pr_els:
        pr_els = tree.xpath('//body')
    if len(pr_els) == 0:
        print("ERROR: Failed to locate press release content for ", href)
        print("Cached in ", html_cache_file)
        debug_html_element(tree)
    return pr_els

def parse_press_release(pr, href):
    # remove all the 'style' attributes; the leading '.' keeps the xpath
    # search inside this element rather than the whole document
    for tag in pr.xpath('.//*[@style]'):
        tag.attrib.pop('style')
    pr_data = { 'html': tostring(pr).decode('UTF-8'), 'href': href }
    pr_data['pdf'] = parse_pdf_link(pr)
    return pr_data

def parse_pdf_link(pr):
    link = pr.xpath('.//a[@class="pdf"]')
    if link:
        # read the attribute rather than pop it, so looking up the link
        # does not mutate the element
        return link[0].get('href')
    return None

def debug_html_element(el):
    print('HTML: %s' % tostring(el, pretty_print=True).decode('UTF-8'))

def fetch_release_list():
    # fetch the master list, or use local cached version
    if not os.path.isfile(release_list):
        page = requests.get(base_url + release_list)
        with open(release_list, 'w') as cached_release_list:
            cached_release_list.write(page.content.decode('latin1'))
    # create output dir if it does not yet exist
    if not os.path.isdir('./json'):
        os.mkdir('./json')
    # parse the master list
    with open(release_list, 'r') as filehandle:
        return html.fromstring(filehandle.read())

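# each row of the #news_releases table is date / title / category, with the
# title cell carrying the link to the full release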
def process_release_list():
    tree = fetch_release_list()
    news_releases = tree.xpath('//table[@id="news_releases"]/tbody/tr')
    cell_text = XPath('./td//text()')
    title_cell = XPath('./td/*/a|./td/a')
    for row in news_releases:
        cells = cell_text(row)
        date = cells[0]
        title = cells[1]
        category = cells[2]
        pr_link = title_cell(row)
        if not pr_link:
            print("No PR link for row %s" % tostring(row))
            continue
        href = pr_link[0].get('href')
        if debug_mode():
            print("href=%s" % href)
        pr_data = process_release_page(href)
        if not pr_data:
            print("href %s was empty" % href)
            continue
        cache_json(href, date, title, category, pr_data)

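# merge the row metadata from the master list into the scraped page data
# before writing it out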
def cache_json(href, date, title, category, pr_data):
    pr_data['href'] = href
    pr_data['date'] = date
    pr_data['title'] = title
    pr_data['category'] = category
    write_json_file(href, pr_data)

def process_release_page(href):
    pr_els = fetch_press_release(href)
    # a truthiness test covers both None and an empty result list
    if not pr_els:
        print("ERROR: No PR content in %s" % href)
        return
    pr = pr_els[0]
    if debug_mode():
        debug_html_element(pr)
    return parse_press_release(pr, href)

def write_json_file(href, pr_data):
    json_file_name = shasum_for_href(href) + '.json'
    with open('json/' + json_file_name, 'w') as json_file:
        json.dump(pr_data, json_file)

##########################################################
# main
if len(sys.argv) > 1:
    # hrefs passed on the command line: scrape each page and print its JSON
    for href in sys.argv[1:]:
        pr_data = process_release_page(href)
        print(json.dumps(pr_data))
else:
    process_release_list()
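
# A minimal sketch of consuming the output (a hypothetical consumer, not
# part of this script): every json/<sha1>.json file carries href, date,
# title, category, pdf, and the cleaned press-release html.
#
#   import glob, json
#   for path in glob.glob('json/*.json'):
#       with open(path) as f:
#           pr = json.load(f)
#       print(pr['date'], pr['title'], pr['pdf'])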