-
Notifications
You must be signed in to change notification settings - Fork 0
/
ocr-extractor.py
executable file
·55 lines (40 loc) · 1.31 KB
/
ocr-extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri May 15 12:43:56 2020
@author: briancroxall
This script extracts the OCR text from HTML pages saved from Chronicling America
and writes each page's text to a matching .txt file in ocr-txt/.
"""
from glob import glob
from bs4 import BeautifulSoup
import os
from datetime import datetime
# Record when the run began so the elapsed time can be printed at the end.
startTime = datetime.now()
def make_soup(xml):
    """Parse markup with BeautifulSoup's 'lxml-xml' parser.

    Args:
        xml: The markup to parse; this script passes an open file handle.

    Returns:
        The BeautifulSoup object built from the markup.
    """
    return BeautifulSoup(xml, 'lxml-xml')
def get_name(filename):
    """Return the base name of a path without its directory or extension.

    Uses os.path so the platform's own path separator is honoured, and
    strips only the final extension so that a dot inside the name
    (e.g. 'st.louis_1891-03-19_p8.html') does not truncate the result.

    Args:
        filename: Path to a file, e.g. 'ocr-html/st_1891-03-19_p8.html'.

    Returns:
        The file's stem, e.g. 'st_1891-03-19_p8'.
    """
    base = os.path.basename(filename)
    # splitext removes just the last suffix; everything before it is kept.
    stem, _ext = os.path.splitext(base)
    return stem
# Make sure the output directory exists. exist_ok=True replaces the separate
# isdir() check and avoids the check-then-create race.
os.makedirs('ocr-txt', exist_ok=True)

# Every HTML page in ocr-html/, processed in sorted (stable) order.
corpus = sorted(glob('ocr-html/*.html'))

# Small hand-picked sample of pages. It is not referenced by the loop below;
# presumably it is swapped in for `corpus` when debugging -- confirm before
# removing.
test = ['ocr-html/the-republican-journal_1890-12-04_p2.html',
        'ocr-html/st_1891-03-19_p8.html',
        'ocr-html/the-state-republican_1891-02-26_p3.html']

for counter, each in enumerate(corpus):
    # Progress indicator: one dot for every 100 files.
    if counter % 100 == 0:
        print('.', end='', flush=True)
    # Read with an explicit encoding instead of the locale default.
    # NOTE(review): assumes the saved pages are UTF-8 -- confirm.
    with open(each, encoding='utf-8') as input_file:
        soup = make_soup(input_file)
    # The text is taken from the first <p> inside the first <div>; nested
    # text fragments are joined with a single space.
    ocr = soup.div.p.get_text(' ')
    filename = get_name(each)
    with open(f'ocr-txt/{filename}.txt', 'w', encoding='utf-8') as ocr_data:
        print(ocr, file=ocr_data)

# Use len(corpus) rather than counter + 1: `counter` is never bound when the
# glob matches no files, which would raise NameError here.
print('\nNumber of OCR files extracted: ', len(corpus))
print('\nTime elapsed: ', datetime.now() - startTime)