#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 20 2023
@author: Tiangeng Lu
Auto-download monthly immigrant visa statistics .pdf files with minimum loops.
This program checks the download status of the targeted files before downloading all files.
This program will only download the files that haven't been downloaded yet.
This program converts all pdfs to txts.
This program contains immigrant-visa-specific data cleaning steps.
This program converts all txts to a single dataframe.
This program saves monthly dataframes.
"""
import requests
from scrapy import Selector
import pandas as pd
from pandas.tseries.offsets import MonthEnd
import os
from urllib import request
from datetime import datetime
from PyPDF2 import PdfReader
import time as tm
# user-defined function(s)
def dtime(file):
    return datetime.fromtimestamp(os.path.getmtime(file)).strftime("%Y-%m-%d, %A, %H:%M:%S")
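# Illustrative output on a hypothetical file:
# dtime('iv_catalog.csv') -> '2023-06-21, Wednesday, 14:02:33' (date, weekday, HH:MM:SS)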
# URL
main_url = 'https://travel.state.gov/content/travel/en/legal/visa-law0/visa-statistics/immigrant-visa-statistics/monthly-immigrant-visa-issuances.html'
# request content from the main url
main_html = requests.get(main_url).content
# object type is 'bytes'
type(main_html)
# build a Selector from the html; `main_selector` is a `scrapy.selector.unified.Selector`, NOT subscriptable
main_selector = Selector(text = main_html)
# get urls that contain .pdf; this selector.xpath.extract() is important
all_links = main_selector.xpath('//*[contains(@href, ".pdf")]/@href').extract()
# examine the PATTERNS of these links, then subset them
print(all_links[:5])
# the following step is data/url-specific: keep only the urls that contain the 'FSC' (foreign state of chargeability) pattern
national_links = [link for link in all_links if "FSC" in link]
national_links = list(set(national_links))
print(len(national_links))
# Count the URLs in each year: 2017 should have 10 (from March), 2023 has 4 (up to April), and all other years should have 12
print("2017:",len([link for link in national_links if "2017" in link]))
print("2018:",len([link for link in national_links if "2018" in link]))
print("2019:",len([link for link in national_links if "2019" in link]))
print("2020:",len([link for link in national_links if "202020" in link])) # not a typo but by the actual links
print("2021:",len([link for link in national_links if "2021" in link]))
print("2022:",len([link for link in national_links if "2022" in link]))
print("2023:",len([link for link in national_links if "2023" in link]))
# add prefix
prefix = 'https://travel.state.gov'
# list comprehension/generator to replace a loop over list elements
national_links = [prefix + link for link in national_links if link.startswith('/content')]
# save urls
with open('URL_iv_visa_nationality.txt', 'w') as file:
    for url in national_links:
        file.write(url + "\n")
# build catalog by extracting yyyy-mm info
# given the observed patterns, splitting the filename substrings yields the yyyy-mm info
print(national_links[0].split('/')[-1].split('%'))
# use list comprehension/generator to replace loops over list elements
month = [link.split('/')[-1].split('%')[0].upper() for link in national_links]
year = [link.split('/')[-1].split('%')[1][2:] for link in national_links]
print(set(month)); print(set(year))
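# Worked example on a hypothetical filename (pattern assumed from the observed links):
#   'MARCH%202017%20-%20IV%20Issuances%20by%20FSC.pdf'
#   .split('/')[-1].split('%')[0].upper() -> 'MARCH'
#   .split('/')[-1].split('%')[1][2:]     -> '2017'
# ('%20' is a URL-encoded space, so split('%')[1] is '202017' and [2:] drops the '20')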
# fix the irregular patterns, starting with singling out the problematic observations
# the following row positions will serve as the indexer (the index is a default RangeIndex)
temp_list = [i for i, yr in enumerate(year) if len(yr) != 4]
national_catalog = pd.DataFrame(
    {'url': national_links,
     'year': year,
     'month': month})
# in the problematic rows, month holds e.g. 'AUGUST2021' while year holds '-'
# for year, take the last four characters of the month column
national_catalog.loc[temp_list, 'year'] = national_catalog['month'].iloc[temp_list].str[-4:]
# for month, drop those last four characters
national_catalog.loc[temp_list, 'month'] = national_catalog['month'].iloc[temp_list].str[:-4]
# fix the abbreviated 'SEPT' month label; enumerate gives the numeric index values
extr_fix_id = [i for i, mon in enumerate(national_catalog['month']) if mon == "SEPT"]
national_catalog.loc[extr_fix_id, 'month'] = "SEPTEMBER"
# alternative code
# national_catalog.loc[national_catalog['month'] == 'SEPT', 'month'] = 'SEPTEMBER'
print(set(national_catalog['month']))
# create time stamp
national_catalog['mmyy'] = national_catalog['year'].str.cat(national_catalog['month'], sep = '-')
# parse strings like '2021-AUGUST', then shift to month end given the nature of the data
national_catalog['mmyy'] = pd.to_datetime(national_catalog['mmyy']) + MonthEnd()
# remove HH:MM
national_catalog['mmyy'] = national_catalog['mmyy'].dt.date
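# MonthEnd() sketch: pd.to_datetime('2021-AUGUST') parses to 2021-08-01, and adding
# MonthEnd() rolls it forward to 2021-08-31; parsed dates here are always the 1st,
# so the offset always lands on the end of the same month.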
national_catalog = national_catalog.sort_values('mmyy', ascending = True).reset_index(drop = True)
path = 'iv'
path_Exist = os.path.exists(path)
if not path_Exist:
    os.makedirs(path)
print("The folder was last modified at:", datetime.fromtimestamp(os.path.getmtime(path)).strftime("%Y-%m-%d, %A"))
# print the newest document; the filenames embed dates, so a name sort is also a time sort
sorted(os.listdir(path))[-1]
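# On a first run the folder may still be empty; a defensive variant (an assumption,
# and `latest` is a hypothetical name):
# latest = sorted(os.listdir(path))[-1] if os.listdir(path) else None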
# create a list of filenames
all_filenames = [None] * len(national_catalog)
for i in range(len(national_catalog)):
    all_filenames[i] = os.getcwd() + '/' + path + '/' + 'iv_' + str(national_catalog['mmyy'][i])[:10] + '.pdf'
all_filenames
national_catalog['filename'] = all_filenames
# check whether all pdfs have been downloaded, and then add the download status to the catalog data
status = [None]*len(national_catalog)
# check the download status for each filename
for i,k in enumerate(national_catalog['filename']):
    if os.path.isfile(k):
        status[i] = True
        print(dtime(k))
    else:
        status[i] = False
# update the download status in national_catalog
if len(status) == len(national_catalog):
    national_catalog['downloaded'] = status
# if the file hasn't been downloaded yet, download it now
for i in range(len(national_catalog)):
    if national_catalog['downloaded'][i]:
        print(national_catalog['filename'][i], "was last downloaded at", dtime(national_catalog['filename'][i]))
    else:
        print(national_catalog['filename'][i], 'will be downloaded now.')
        request.urlretrieve(national_catalog['url'][i], national_catalog['filename'][i])
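        # optional courtesy pause between requests (an assumption that the server
        # prefers throttled clients; `tm` is already imported above)
        # tm.sleep(1)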
# update the national_catalog dataframe to reflect the most recent downloads
status = [None]*len(national_catalog)
# check the download status for each filename
for i,k in enumerate(national_catalog['filename']):
    if os.path.isfile(k):
        status[i] = True
        print(dtime(k))
    else:
        status[i] = False
# update the download status in national_catalog
if len(status) == len(national_catalog):
    national_catalog['downloaded'] = status
# save the catalog data
national_catalog['download_time'] = [dtime(pdf) for pdf in national_catalog['filename']]
national_catalog.to_csv('iv_catalog.csv', index = False)
########################################################
########################################################
iv_folder = 'ivtxt'
if not os.path.exists(iv_folder):
    os.makedirs(iv_folder)
print(dtime(iv_folder))
# check whether the catalog dataframe exists
if 'national_catalog' in globals():
    print("Yes. The catalog data is already here.")
else:
    print("No. Import the catalog data from disk.")
    national_catalog = pd.read_csv('iv_catalog.csv')
# convert every pdf file (including all pages within the pdf) to a txt
TEXT = [None] * len(national_catalog['filename'])
# outer loop: pdf files
for n, nth_pdf in enumerate(national_catalog['filename']):
    # create a reader object
    reader = PdfReader(nth_pdf)
    Pages = [None] * len(reader.pages)
    Text = [None] * len(reader.pages)
    # inner loop: pdf pages
    for i in range(len(reader.pages)):
        Pages[i] = reader.pages[i]
        Text[i] = Pages[i].extract_text()
    TEXT[n] = Text
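# Note: extract_text() yields an empty string for pages without an extractable text
# layer (e.g. scanned images); none were observed in these files, but such pages
# would need OCR rather than PyPDF2.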
# save txt to file
# mmyy holds datetime.date values, so cast to str first; otherwise:
# can only concatenate str (not "datetime.date") to str
txtnames = ['iv_'+str(txt)+'.txt' for txt in national_catalog['mmyy']]
# two layers of loops
for i, txt in enumerate(txtnames):
    with open(iv_folder + '/' + txt, 'w') as file:
        for individual_page in TEXT[i]:
            file.write(individual_page + "\n")
print("The txt folder now has", str(len(os.listdir(iv_folder))), "files as of", tm.strftime("%Y-%m-%d, %A, %H:%M"))
# txt files to dataframe
# start with a raw container of dataframes
DF_raw = [None] * len(TEXT)
# the following is another nested loop
# OUTER: each txt converted from a pdf document
for n in range(len(TEXT)):
    # the length of df_list gets updated in every iteration over n
    df_list = [None] * len(TEXT[n])
    # INNER: the ith page in the nth txt file
    for i, page in enumerate(TEXT[n]):
        # one dataframe per page, one row per text line
        df_list[i] = pd.DataFrame(page.split('\n'))
        # retain the page number from the original txt(pdf) file
        df_list[i]['pg'] = i + 1
    # end of INNER: pages in the nth txt file
    # OUTER: append the dataframe for every page to one bigger dataframe
    df_file = pd.concat(df_list).reset_index(drop = True)
    df_file = df_file.rename(columns = {0:'V'})
    # set to upper case and trim
    df_file['V'] = df_file['V'].str.upper().apply(lambda x: x.strip())
    # add yyyy-mm timestamp column, both of the following should work
    #df_file['time'] = txtnames[n][3:-4]
    df_file['time'] = national_catalog['mmyy'][n].strftime("%Y-%m-%d")
    # flag any document that does not have a grand total
    if not df_file['V'].str.contains('GRAND TOTAL', case = False).any():
        print(n)
    else:
        # get the row/index ids on or after the first "GRAND TOTAL" row
        remove = df_file.index >= df_file[df_file['V'].str.contains('GRAND TOTAL', case = False)].index[0]
        # exclude those rows (the grand total and anything after it)
        df_file = df_file[~remove]
    # the finished df_file becomes the nth dataframe in the DF_raw list
    DF_raw[n] = df_file
# concatenate all data frames; the result has a six-digit row count, all contents in column 'V',
# with the yyyy-mm and page information added
DF = pd.concat(DF_raw).reset_index(drop = True)
# Save a back-up copy of DF
DF_archive = DF.copy(deep = True)
# immigrant-visa specific headers and footers, observed in the pdf files
# (raw strings keep the regex escape \( intact)
headers = ['PAGE ', 'FOREIGN STATE OF', 'CHARGEABILITY', 'PLACE OF BIRTH', r'\(FY 20', r'\(FY20']
# the following df are the headers/footers rows
DF_headers = DF[DF['V'].str.contains('|'.join(headers))]
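# '|'.join(headers) builds a single alternation regex, e.g. 'PAGE |FOREIGN STATE OF|...',
# so one str.contains() call flags every header/footer row at once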
# get the indices/row ids
remove_ids = list(DF_headers.index)
# remove rows by remove_ids
DF = DF[~DF.index.isin(remove_ids)]
# remove blank and single-character rows in column 'V'
DF = DF[DF['V'].str.len() > 1]
# make sure all V values end with a number: find rows where header text got comingled
# with a data entry (they end with 'ISSUANCES')
comingled_rows = DF[DF['V'].str.endswith('ISSUANCES')]
# remove the unwanted partial string that was comingled with data entries in the same row
str_remove = 'IMMIGRANT VISA ISSUANCES'
DF['V'] = DF['V'].str.replace(str_remove, '')
DF['V'] = DF['V'].apply(lambda x: x.strip())
# the following should print nothing
for row in DF['V']:
    if not row[-1].isdigit():
        print(row)
# Create new columns
visa = [None]*len(DF)
issue = [None]*len(DF)
nationality = [None]*len(DF)
for i, row in enumerate(DF['V']):
    issue[i] = row.split(' ')[-1]
    visa[i] = row.split(' ')[-2]
    nationality[i] = ' '.join(row.split(' ')[:-2])
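# Worked example on a hypothetical row (format assumed from the pdfs):
# 'CHINA - MAINLAND BORN IR1 203' -> issuance '203', visa 'IR1',
# nationality 'CHINA - MAINLAND BORN' (the last two space-separated tokens, then the rest)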
## CREATE A NEW DATAFRAME
DF_final = pd.DataFrame({
    'time': DF['time'],
    'pg': DF['pg'],
    'nationality': nationality,
    'visa': visa,
    'issuance': issue
})
# country list
print(sorted(list(set(DF_final['nationality']))))
nationality_ct = DF_final['nationality'].value_counts().to_frame().reset_index().\
rename(columns = {'index':'country','nationality':'count'}).\
sort_values(['count','country'], ascending = True).reset_index(drop = True)
nationality_ct.to_csv('iv_nationalities.csv', index = False)
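# Version caveat (an assumption about the runtime pandas): the rename above matches
# pandas < 2.0, where value_counts().to_frame().reset_index() yields columns
# ['index', 'nationality']. In pandas >= 2.0 the counts column is already named
# 'count' and the value column keeps the series name, so here (and for visa_ct
# below) the rename would need {'nationality': 'country'} instead.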
# see the visa type count, make sure to remove extra leading/trailing space
DF_final['visa'] = DF_final['visa'].apply(lambda x: x.strip())
visa_ct = DF_final['visa'].value_counts().to_frame().reset_index()
visa_ct = visa_ct.rename(columns = {'visa':'count','index':'type'})
visa_ct.to_csv('iv_visa.csv', index = False)
# remove the big mark "," from the number column
print(DF_final['issuance'].dtypes)
# the monthly issuances of immigrant visas are much lower than nonimmigrant visas
DF_final['issuance'] = DF_final['issuance'].str.replace(',', '')
# the following should return an empty dataframe (no letters remain among the numbers)
DF_final[DF_final['issuance'].str.contains('[A-Za-z]')]
# the following should equal to the number of rows in the final dataframe
print(len(DF_final[DF_final['issuance'].str.contains('[0-9]')]))
# convert to integer
DF_final['issuance'] = DF_final['issuance'].astype('int')
print(DF_final['issuance'].dtypes)
# save final dataframe
DF_final.to_csv('iv_alltime.csv', index = False)
# final step: save a separate dataframe for each month
release_date = sorted(list(set(DF_final['time'])))
print(release_date)
# create a new dataframe folder
path_df = 'ivdf'
if os.path.exists(path_df):
    print("YES. The directory already exists. \nNO ACTIONS REQUIRED \n")
    print(sorted(os.listdir(path_df)))
else:
    print("The directory doesn't exist. Create it right now.")
    os.makedirs(path_df)
    print("Does the directory exist now?", os.path.exists(path_df))
# df names for each month
df_month_filenames = ['iv_'+str(name)+'.csv' for name in national_catalog['mmyy']]
# save dataframes of each month
for i in range(len(release_date)):
    DF_final.loc[DF_final['time'] == release_date[i]].to_csv(path_df + '/' + df_month_filenames[i], index = False)
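# Quick sanity check (illustrative; `check` is a hypothetical name):
# check = pd.read_csv(path_df + '/' + df_month_filenames[0])
# assert len(check) == (DF_final['time'] == release_date[0]).sum()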