-
Notifications
You must be signed in to change notification settings - Fork 0
/
atom-reader.py
executable file
·53 lines (43 loc) · 1.58 KB
/
atom-reader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu May 14 13:19:20 2020
@author: briancroxall
Script to manage the second portion of the data processing. This script will
open each result from atom-harvester.py and compile its data into one
data dictionary.
"""
from datetime import datetime
from glob import glob
from bs4 import BeautifulSoup
import re
# Start of the run; used to report the elapsed time when the script ends.
startTime = datetime.now()
def make_soup(xml):
    """Parse XML markup into a BeautifulSoup tree.

    Args:
        xml: The markup to parse. It is handed straight to BeautifulSoup;
            this script passes an open file handle.

    Returns:
        The BeautifulSoup object built with the 'lxml-xml' parser.
    """
    return BeautifulSoup(xml, 'lxml-xml')
# Patterns for pulling fields out of an entry's <title> text. They are
# compiled once here instead of being rebuilt for every entry.
# Location: contents of the first parenthesised group.
# https://regex101.com/r/e5WQw8/1
location_re = re.compile(r'\((.*?)\)', flags=re.I)
# Newspaper: the text before the first "(", without an optional trailing
# "[volume]" marker.
# https://regex101.com/r/e5WQw8/4
newspaper_re = re.compile(r'(.*?)\s(?:\[volume\])?\s?\(', flags=re.I)

# Corpus
data = glob('atom-data/*.xml')

# Create the output file and process data.
# The file is opened once in 'w' mode (which also truncates any previous
# run's output) and kept open for the whole loop, rather than being reopened
# in append mode for every entry. The encoding is explicit so the result does
# not depend on the platform default.
with open('data-dictionary.tsv', 'w', encoding='utf-8') as save_file:
    for counter, path in enumerate(data):
        # Progress indicator: one dot per 100 input files.
        if counter % 100 == 0:
            print('.', end='', flush=True)
        # Read as bytes so BeautifulSoup can work out the encoding from the
        # document itself instead of relying on the platform default.
        with open(path, 'rb') as data_file:
            soup = make_soup(data_file)
        for entry in soup.find_all('entry'):
            link = entry.find('link').get('href')
            # NOTE(review): assumes link ends with '/' -- confirm against the
            # feeds saved by atom-harvester.py.
            ocr = link + 'ocr.txt'
            # NOTE(review): positions 5 and 7 assume a URL shaped like
            # scheme://host/a/b/<date>/c/<image>/ -- confirm against real data.
            link_parts = link.split('/')
            date = link_parts[5]
            img_num = link_parts[7]
            title_tag = entry.find('title').get_text()
            # [0] raises IndexError when the title has no parenthesised part.
            location = location_re.findall(title_tag)[0]
            newspaper = newspaper_re.findall(title_tag)[0]
            print(newspaper, location, date, img_num, link, ocr, sep='\t',
                  file=save_file)
print('Time elapsed: ', datetime.now() - startTime)