# main.py

from csv import DictReader
from pathlib import Path
from typing import List, Dict, Any, Union
import pandas as pd
from collections import Counter
from voterfiles import StateFileFunctions
from targets import TargetFileReader
from targets.templates.validator import TargetValidator
from matching.matching import MatchVoters

# Raise pandas' display limits to 100 columns and 1000 rows for console inspection.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 1000)

# Build the voter-file helper for Texas and unpack the two values its
# validation returns.
# NOTE(review): presumably `passed` holds the records that validated and
# `failed` those that did not -- confirm against StateFileFunctions.validate.
texas_voterfile = StateFileFunctions('texas')
passed, failed = texas_voterfile.validate()

# texas_targets = TargetFileReader('texas')
# texas_targets.generate_column_set()
# texas_targets.combine_files()
# texas_targets.run_validation()
#
# matches = MatchVoters(passed, texas_targets.validator.valid)
# matches.get_absolute_matches()
# matches.run_match()
# all_matches = [dict(x) for x in matches.all_matches]

# pd.DataFrame.from_records(all_matches).to_csv(Path.home() / 'Downloads' / '20231214_vep_matches.csv', index=False)
# texas_df = pd.DataFrame.from_records(texas_targets.validator.invalid)
# errors = texas_df[texas_df['error_type'] == 'String should have at least 4 characters']
#
# print(texas_df.count())
def find_data(col: str, records: Union[List[Dict[str, Any]], None] = None) -> Counter:
    """Count the values of *col* that are shorter than four characters.

    Each record is queried with ``.get(col)``; values that are ``None`` are
    skipped, and the remaining values whose ``len`` is below 4 are sorted and
    tallied.

    Args:
        col: Key to look up in every record.
        records: Mapping-like rows to scan. When omitted, falls back to the
            module-level ``texas_targets.records``. That global must be
            defined before calling without ``records``; otherwise a
            ``NameError`` is raised.

    Returns:
        A ``Counter`` mapping each short value to its number of occurrences,
        with keys inserted in sorted order.
    """
    if records is None:
        # Preserve the original behaviour of reading the module-level target set.
        records = texas_targets.records
    # Look each value up once per row instead of three times.
    short_values = [
        value
        for value in (row.get(col) for row in records)
        if value is not None and len(value) < 4
    ]
    return Counter(sorted(short_values))
# zip4 = find_data('ZIP4')
# cell_phone = [x.get('CELLTELEPHONENUM') for x in texas_targets.records if x.get('CELLTELEPHONENUM') is not None]
# landline_phone = [x.get('LANDLINETELEPHONENUM') for x in texas_targets.records if x.get('LANDLINETELEPHONENUM') is not None]
#
# birthdate = find_data('BIRTHDATE')
# birthday = find_data('BIRTHDAY')
# county = find_data('COUNTY')
# county_type_count = Counter(county)
#
# errors = Counter([x['error_type'] for x in texas_voterfile.invalid])
# error_df = pd.DataFrame.from_dict(errors, orient='index', columns=['count'])

# temp_path = Path.home() / 'Downloads' / 'TX VEP Source Files As Of 11.27.23'
#
# files = list(x for x in temp_path.iterdir() if x.is_file() and x.suffix == '.csv')
#
# file_dicts = {}
# for file in files:
#     with file.open('r') as f:
#         reader = DictReader(f)
#         for row in reader:
#             file_dicts.setdefault(file.stem, []).append(row)
#
# file_columns = [cols for file in file_dicts.values() for cols in file[0].keys()]
# col_counts = Counter(file_columns)