"""
This script runs the full parsing pipeline.

It takes three positional arguments: the root path of the empty_homes_data folder
(file names are appended to it by string concatenation, so the path should end with
a trailing slash), the name of the input data file, and the name of the output file.
It requires that all data is found in that folder. I hope to update the script so
that it is easier to swap out the data for new data when it is created.

Example invocation:
docker run --rm -it -v $(pwd):/app jonno/parse_process:test ./app/enhance_ocod/full_ocod_parse_process.py ./app/data Malcom_UK_Owners_Missing.csv Malcom_UK_Owners_Missing_enhanced.csv
"""
import re
import sys
import zipfile

from address_parsing_helper_functions import *
args = sys.argv
root_path = str(args[1])
data_file = str(args[2])
output_file = str(args[3])
# Load the OCOD register, run the spaCy NER model over the property addresses,
# then parse and expand the predicted entities
ocod_data = load_and_prep_OCOD_data(root_path + data_file)
all_entities = spacy_pred_fn(spacy_model_path=root_path + 'spacy_cpu_model', ocod_data=ocod_data)
# all_entities = load_cleaned_labels(root_path + 'full_dataset_no_overlaps.json')
full_expanded_data = parsing_and_expansion_process(all_entities, expand_addresses=True)
del all_entities  # memory management
ocod_data = post_process_expanded_data(full_expanded_data, ocod_data)
del full_expanded_data  # memory management
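
# Second stage: locate each expanded address and classify it, using the helper
# functions imported below.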
from locate_and_classify_helper_functions import *
print("Load ONSPD")
# zip file handler
zip = zipfile.ZipFile(root_path + 'ONSPD.zip')
# looks in the data folder for a csv file that begins ONSPD
#This will obviously break if the ONS change the archive structure
target_zipped_file = [i for i in zip.namelist() if re.search(r'^Data\/ONSPD.+csv$',i) ][0]
postcode_district_lookup = load_postocde_district_lookup(root_path + "ONSPD.zip", target_zipped_file)
print("Pre-process expanded ocod data")
ocod_data = preprocess_expandaded_ocod_data(ocod_data, postcode_district_lookup)
print("Load and pre-process the Land Registry price paid dataset")
#loads from a folder of price paid files
price_paid_df = load_and_process_pricepaid_data(root_path+'price_paid_files/', postcode_district_lookup)
print("Add in missing Local authority codes to the ocoda dataset")
ocod_data = add_missing_lads_ocod(ocod_data, price_paid_df)
print("Load and pre-process the voa business ratings list dataset")
voa_businesses = load_voa_ratinglist(root_path +'VOA_ratings.csv', postcode_district_lookup)
del postcode_district_lookup #for memory purposes
print("Match street addresses and buildings")
ocod_data = street_and_building_matching(ocod_data, price_paid_df, voa_businesses)
print('Sub-street matching, this takes some time')
ocod_data = substreet_matching(ocod_data, price_paid_df, voa_businesses)
del price_paid_df #for memory purposes
print('Add in businesses per OA and LSOA')
ocod_data = counts_of_businesses_per_oa_lsoa(ocod_data, voa_businesses)
print('Identify businesses using address matching')
ocod_data = voa_address_match_all_data(ocod_data, voa_businesses)
del voa_businesses #probably not necessary but still delete to save memory
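
# Final stage: two classification passes, followed by contracting the expanded
# dataset (keeping the 'residential' class of the class2 classification) before saving.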
print('Classification type 1')
ocod_data = classification_type1(ocod_data)
print('Classification type 2')
ocod_data = classification_type2(ocod_data)
print('Contract ocod dataset')
ocod_data = contract_ocod_after_classification(ocod_data, class_type='class2', classes=['residential'])
print('Process complete, saving the enhanced ocod dataset to ' + root_path + output_file)
# Subset the dataframe to only the columns necessary for the dataset and save
ocod_data.loc[:, ['title_number', 'within_title_id', 'within_larger_title', 'unique_id', 'unit_id', 'unit_type',
                  'building_name', 'street_number', 'street_name', 'postcode', 'city',
                  'district', 'region', 'property_address', 'oa11cd', 'lsoa11cd',
                  'msoa11cd', 'lad11cd', 'class', 'class2']].rename(
    columns={'within_title_id': 'nested_id', 'within_larger_title': 'nested_title'}
).to_csv(root_path + output_file)
# FINISH!