Skip to content

Commit

Permalink
Linted the code and updated it to format age (adds a float-typed "Age" column derived from "AGE")
Browse files: browse the repository at this point in the history
  • Loading branch information
franciscojavierarceo committed Dec 7, 2023
1 parent e23fa5a commit debb72e
Show file tree
Hide file tree
Showing 2 changed files with 166 additions and 133 deletions.
188 changes: 111 additions & 77 deletions src/pyipums/clean_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,29 +3,29 @@
from ipumspy import readers, ddi

INCOME_COLUMN = "INC"
EDUC_LT_HS = 'Less than High School Diploma'
EDUC_HS = 'High school diploma or equivalent'
EDUC_VOC = 'Vocational Degree'
EDUC_LT_HS = "Less than High School Diploma"
EDUC_HS = "High school diploma or equivalent"
EDUC_VOC = "Vocational Degree"
EDUC_BS = "Bachelor's degree"
EDUC_GRAD = "Graduate degree"
EDUC_ATTAINMENT = {
'NIU or blank': 'Missing',
'Grades 1, 2, 3, or 4': EDUC_LT_HS,
'Grades 5 or 6': EDUC_LT_HS,
'Grades 7 or 8': EDUC_LT_HS,
'Grade 9': EDUC_LT_HS,
'Grade 10': EDUC_LT_HS,
'Grade 11': EDUC_LT_HS,
'12th grade, no diploma': EDUC_LT_HS,
'None or preschool': EDUC_LT_HS,
'High school diploma or equivalent': EDUC_HS,
"NIU or blank": "Missing",
"Grades 1, 2, 3, or 4": EDUC_LT_HS,
"Grades 5 or 6": EDUC_LT_HS,
"Grades 7 or 8": EDUC_LT_HS,
"Grade 9": EDUC_LT_HS,
"Grade 10": EDUC_LT_HS,
"Grade 11": EDUC_LT_HS,
"12th grade, no diploma": EDUC_LT_HS,
"None or preschool": EDUC_LT_HS,
"High school diploma or equivalent": EDUC_HS,
"Associate's degree, academic program": EDUC_HS,
"Associate's degree, occupational/vocational program": EDUC_VOC,
'Some college but no degree': EDUC_HS,
"Some college but no degree": EDUC_HS,
"Bachelor's degree": EDUC_BS,
"Master's degree": EDUC_GRAD,
'Doctorate degree': EDUC_GRAD,
'Professional school degree': EDUC_GRAD,
"Doctorate degree": EDUC_GRAD,
"Professional school degree": EDUC_GRAD,
}


Expand All @@ -49,49 +49,50 @@ def clean_cps_income(self):

self.df[f"{col}"] = self.df[col].replace(invalid_values, np.nan)

self.df['INCTOT'] = self.df['INCTOT'].astype(float)
self.df["INCTOT"] = self.df["INCTOT"].astype(float)

def clean_educ_attainment(self):
self.df['Educational Attainment'] = self.df['Education'].apply(
lambda x: EDUC_ATTAINMENT.get(x)
).astype(str)
self.df["Educational Attainment"] = (
self.df["Education"].apply(lambda x: EDUC_ATTAINMENT.get(x)).astype(str)
)

def clean_variables(self):
self.df['Occupation'] = map_codes(self.ddi_codebook, self.df, 'OCC2010')
self.df['Education'] = map_codes(self.ddi_codebook, self.df, 'EDUC')
self.df['Birthplace'] = map_codes(self.ddi_codebook, self.df, 'BPL')
self.df['Marital_Status'] = map_codes(self.ddi_codebook, self.df, 'MARST')
self.df['Nativity'] = map_codes(self.ddi_codebook, self.df, 'NATIVITY')
self.df['Class_of_worker'] = map_codes(self.ddi_codebook, self.df, 'CLASSWKR')
self.df['Hispanic'] = map_codes(self.ddi_codebook, self.df, 'HISPAN')
self.df['Asian'] = map_codes(self.ddi_codebook, self.df, 'ASIAN')
self.df['Race'] = map_codes(self.ddi_codebook, self.df, 'RACE')
self.df['Veteran_Status'] = map_codes(self.ddi_codebook, self.df, 'VETSTAT')
self.df["Occupation"] = map_codes(self.ddi_codebook, self.df, "OCC2010")
self.df["Education"] = map_codes(self.ddi_codebook, self.df, "EDUC")
self.df["Birthplace"] = map_codes(self.ddi_codebook, self.df, "BPL")
self.df["Marital_Status"] = map_codes(self.ddi_codebook, self.df, "MARST")
self.df["Nativity"] = map_codes(self.ddi_codebook, self.df, "NATIVITY")
self.df["Class_of_worker"] = map_codes(self.ddi_codebook, self.df, "CLASSWKR")
self.df["Hispanic"] = map_codes(self.ddi_codebook, self.df, "HISPAN")
self.df["Asian"] = map_codes(self.ddi_codebook, self.df, "ASIAN")
self.df["Race"] = map_codes(self.ddi_codebook, self.df, "RACE")
self.df["Veteran_Status"] = map_codes(self.ddi_codebook, self.df, "VETSTAT")
self.df["Age"] = self.df["AGE"].astype(float)

def clean_wages(self):
# Aggregating Wages
income_buckets = {
'INCSS': 'Government',
'INCWELFR': 'Government',
'INCRETIR': 'Investment',
'INCSSI': 'Government',
'INCINT': 'Investment',
'INCUNEMP': 'Government',
'INCWKCOM': 'Wage',
'INCVET': 'Government',
'INCSURV': 'Government',
'INCDISAB': 'Government',
'INCDIVID': 'Investment',
'INCRENT': 'Investment',
'INCEDUC': 'Government',
'INCCHILD': 'Government',
'INCASIST': 'Government',
'INCOTHER': 'Unknown',
'INCRANN': 'Investment',
'INCPENS': 'Wage',
'INCWAGE': 'Wage',
'INCBUS': 'Wage',
'INCFARM': 'Wage',
"INCSS": "Government",
"INCWELFR": "Government",
"INCRETIR": "Investment",
"INCSSI": "Government",
"INCINT": "Investment",
"INCUNEMP": "Government",
"INCWKCOM": "Wage",
"INCVET": "Government",
"INCSURV": "Government",
"INCDISAB": "Government",
"INCDIVID": "Investment",
"INCRENT": "Investment",
"INCEDUC": "Government",
"INCCHILD": "Government",
"INCASIST": "Government",
"INCOTHER": "Unknown",
"INCRANN": "Investment",
"INCPENS": "Wage",
"INCWAGE": "Wage",
"INCBUS": "Wage",
"INCFARM": "Wage",
}

column_to_bucket_map = {}
Expand All @@ -106,40 +107,73 @@ def clean_wages(self):
for k in column_to_bucket_map:
self.df[f"{k} Income"] = self.df[column_to_bucket_map[k]].sum(axis=1)

self.df['Investment Income as Percent of Total Income'] = np.where(
self.df['INCTOT'] == 0,
self.df["Investment Income as Percent of Total Income"] = np.where(
self.df["INCTOT"] == 0,
0,
self.df['Investment Income'].astype(float) / self.df['INCTOT'].astype(float))
self.df["Investment Income"].astype(float)
/ self.df["INCTOT"].astype(float),
)

self.df['Government Income as Percent of Total Income'] = np.where(
self.df['INCTOT'] == 0,
self.df["Government Income as Percent of Total Income"] = np.where(
self.df["INCTOT"] == 0,
0,
self.df['Government Income'].astype(float) / self.df['INCTOT'].astype(float))
self.df["Government Income"].astype(float)
/ self.df["INCTOT"].astype(float),
)

self.df['Wage Income as Percent of Total Income'] = np.where(
self.df['INCTOT'] == 0,
self.df["Wage Income as Percent of Total Income"] = np.where(
self.df["INCTOT"] == 0,
0,
self.df['Wage Income'].astype(float) / self.df['INCTOT'].astype(float))

self.df.loc[self.df['Investment Income as Percent of Total Income'] < 0, 'Investment Income as Percent of Total Income'] = 0.
self.df.loc[self.df['Investment Income as Percent of Total Income'] > 1, 'Investment Income as Percent of Total Income'] = 1
self.df.loc[self.df['Government Income as Percent of Total Income'] < 0, 'Government Income as Percent of Total Income'] = 0.
self.df.loc[self.df['Government Income as Percent of Total Income'] > 1, 'Government Income as Percent of Total Income'] = 1
self.df.loc[self.df['Wage Income as Percent of Total Income'] < 0, 'Wage Income as Percent of Total Income'] = 0.
self.df.loc[self.df['Wage Income as Percent of Total Income'] > 1, 'Wage Income as Percent of Total Income'] = 1

self.df['Weighted Total Income'] = self.df['INCTOT'] * self.df['ASECWT']
self.df['Weighted Government Income'] = self.df['Government Income'] * self.df['ASECWT']
self.df['Weighted Investment Income'] = self.df['Investment Income'] * self.df['ASECWT']
self.df['Weighted Wage Income'] = self.df['Wage Income'] * self.df['ASECWT']

self.df['Weighted Government Income as Percent of Total Income'] = self.df['Government Income as Percent of Total Income'] * self.df['ASECWT']
self.df['Weighted Investment Income as Percent of Total Income'] = self.df['Investment Income as Percent of Total Income'] * self.df['ASECWT']
self.df['Weighted Wage Income as Percent of Total Income'] = self.df['Wage Income as Percent of Total Income'] * self.df['ASECWT']
self.df["Wage Income"].astype(float) / self.df["INCTOT"].astype(float),
)

self.df.loc[
self.df["Investment Income as Percent of Total Income"] < 0,
"Investment Income as Percent of Total Income",
] = 0.0
self.df.loc[
self.df["Investment Income as Percent of Total Income"] > 1,
"Investment Income as Percent of Total Income",
] = 1
self.df.loc[
self.df["Government Income as Percent of Total Income"] < 0,
"Government Income as Percent of Total Income",
] = 0.0
self.df.loc[
self.df["Government Income as Percent of Total Income"] > 1,
"Government Income as Percent of Total Income",
] = 1
self.df.loc[
self.df["Wage Income as Percent of Total Income"] < 0,
"Wage Income as Percent of Total Income",
] = 0.0
self.df.loc[
self.df["Wage Income as Percent of Total Income"] > 1,
"Wage Income as Percent of Total Income",
] = 1

self.df["Weighted Total Income"] = self.df["INCTOT"] * self.df["ASECWT"]
self.df["Weighted Government Income"] = (
self.df["Government Income"] * self.df["ASECWT"]
)
self.df["Weighted Investment Income"] = (
self.df["Investment Income"] * self.df["ASECWT"]
)
self.df["Weighted Wage Income"] = self.df["Wage Income"] * self.df["ASECWT"]

self.df["Weighted Government Income as Percent of Total Income"] = (
self.df["Government Income as Percent of Total Income"] * self.df["ASECWT"]
)
self.df["Weighted Investment Income as Percent of Total Income"] = (
self.df["Investment Income as Percent of Total Income"] * self.df["ASECWT"]
)
self.df["Weighted Wage Income as Percent of Total Income"] = (
self.df["Wage Income as Percent of Total Income"] * self.df["ASECWT"]
)

def clean_data(self):
self.clean_variables()
self.clean_cps_income()
self.clean_educ_attainment()
self.clean_wages()
return self.df
return self.df
111 changes: 55 additions & 56 deletions tests/test_clean_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,60 +7,60 @@
from src.pyipums.clean_data import IpumsCleaner, EDUC_ATTAINMENT

xvars = [
'AGE',
'ADJGINC',
'ASECWT',
'ASECWTH',
'ASIAN',
'ASECFWT',
'STATEFIP',
'TAXINC',
'UHRSWORK1',
'RACE',
'SEX',
'SRCWELFR',
'YEAR',
'FOODSTAMP',
'STAMPVAL',
'WTFINL',
'BPL',
'HISPAN',
'EMPSTAT',
'LABFORCE',
'OCC',
'OCC2010',
'MARST',
'VETSTAT',
'CITIZEN',
'NATIVITY',
'CLASSWKR',
'WKSTAT',
'EDUC',
'OFFPOV',
'EARNWT',
'INCWAGE',
'INCBUS',
'INCFARM',
'INCSS',
'INCWELFR',
'INCRETIR',
'INCSSI',
'INCINT',
'INCUNEMP',
'INCWKCOM',
'INCVET',
'INCSURV',
'INCDISAB',
'INCDIVID',
'INCRENT',
'INCEDUC',
'INCCHILD',
'INCASIST',
'INCOTHER',
'INCRANN',
'INCPENS',
'INCTOT',
'STATECENSUS',
"AGE",
"ADJGINC",
"ASECWT",
"ASECWTH",
"ASIAN",
"ASECFWT",
"STATEFIP",
"TAXINC",
"UHRSWORK1",
"RACE",
"SEX",
"SRCWELFR",
"YEAR",
"FOODSTAMP",
"STAMPVAL",
"WTFINL",
"BPL",
"HISPAN",
"EMPSTAT",
"LABFORCE",
"OCC",
"OCC2010",
"MARST",
"VETSTAT",
"CITIZEN",
"NATIVITY",
"CLASSWKR",
"WKSTAT",
"EDUC",
"OFFPOV",
"EARNWT",
"INCWAGE",
"INCBUS",
"INCFARM",
"INCSS",
"INCWELFR",
"INCRETIR",
"INCSSI",
"INCINT",
"INCUNEMP",
"INCWKCOM",
"INCVET",
"INCSURV",
"INCDISAB",
"INCDIVID",
"INCRENT",
"INCEDUC",
"INCCHILD",
"INCASIST",
"INCOTHER",
"INCRANN",
"INCPENS",
"INCTOT",
"STATECENSUS",
]


Expand All @@ -77,7 +77,6 @@ def test_read(self):

df = IpumsCleaner(ipums_df, ddi_codebook).clean_data()
self.assertEqual(
set(df['Educational Attainment'].unique()),
set(df["Educational Attainment"].unique()),
set(EDUC_ATTAINMENT.values()),
)

0 comments on commit debb72e

Please sign in to comment.