Linted the code (quote style and line wrapping) and added an "Age" column that casts AGE to float

franciscojavierarceo · Dec 7, 2023 · debb72e
1 parent e23fa5a
commit debb72e
Show file tree

Hide file tree

Showing 2 changed files with 166 additions and 133 deletions.
diff --git a/src/pyipums/clean_data.py b/src/pyipums/clean_data.py
@@ -3,29 +3,29 @@
 from ipumspy import readers, ddi
 
 INCOME_COLUMN = "INC"
-EDUC_LT_HS = 'Less than High School Diploma'
-EDUC_HS = 'High school diploma or equivalent'
-EDUC_VOC = 'Vocational Degree'
+EDUC_LT_HS = "Less than High School Diploma"
+EDUC_HS = "High school diploma or equivalent"
+EDUC_VOC = "Vocational Degree"
 EDUC_BS = "Bachelor's degree"
 EDUC_GRAD = "Graduate degree"
 EDUC_ATTAINMENT = {
-    'NIU or blank': 'Missing',
-    'Grades 1, 2, 3, or 4': EDUC_LT_HS,
-    'Grades 5 or 6': EDUC_LT_HS,
-    'Grades 7 or 8': EDUC_LT_HS,
-    'Grade 9': EDUC_LT_HS,
-    'Grade 10': EDUC_LT_HS,
-    'Grade 11': EDUC_LT_HS,
-    '12th grade, no diploma': EDUC_LT_HS,
-    'None or preschool': EDUC_LT_HS,
-    'High school diploma or equivalent': EDUC_HS,
+    "NIU or blank": "Missing",
+    "Grades 1, 2, 3, or 4": EDUC_LT_HS,
+    "Grades 5 or 6": EDUC_LT_HS,
+    "Grades 7 or 8": EDUC_LT_HS,
+    "Grade 9": EDUC_LT_HS,
+    "Grade 10": EDUC_LT_HS,
+    "Grade 11": EDUC_LT_HS,
+    "12th grade, no diploma": EDUC_LT_HS,
+    "None or preschool": EDUC_LT_HS,
+    "High school diploma or equivalent": EDUC_HS,
     "Associate's degree, academic program": EDUC_HS,
     "Associate's degree, occupational/vocational program": EDUC_VOC,
-    'Some college but no degree': EDUC_HS,
+    "Some college but no degree": EDUC_HS,
     "Bachelor's degree": EDUC_BS,
     "Master's degree": EDUC_GRAD,
-    'Doctorate degree': EDUC_GRAD,
-    'Professional school degree': EDUC_GRAD,
+    "Doctorate degree": EDUC_GRAD,
+    "Professional school degree": EDUC_GRAD,
 }
 
 
@@ -49,49 +49,50 @@ def clean_cps_income(self):
 
             self.df[f"{col}"] = self.df[col].replace(invalid_values, np.nan)
 
-        self.df['INCTOT'] = self.df['INCTOT'].astype(float)
+        self.df["INCTOT"] = self.df["INCTOT"].astype(float)
 
     def clean_educ_attainment(self):
-        self.df['Educational Attainment'] = self.df['Education'].apply(
-            lambda x: EDUC_ATTAINMENT.get(x)
-        ).astype(str)
+        self.df["Educational Attainment"] = (
+            self.df["Education"].apply(lambda x: EDUC_ATTAINMENT.get(x)).astype(str)
+        )
 
     def clean_variables(self):
-        self.df['Occupation'] = map_codes(self.ddi_codebook, self.df, 'OCC2010')
-        self.df['Education'] = map_codes(self.ddi_codebook, self.df, 'EDUC')
-        self.df['Birthplace'] = map_codes(self.ddi_codebook, self.df, 'BPL')
-        self.df['Marital_Status'] = map_codes(self.ddi_codebook, self.df, 'MARST')
-        self.df['Nativity'] = map_codes(self.ddi_codebook, self.df, 'NATIVITY')
-        self.df['Class_of_worker'] = map_codes(self.ddi_codebook, self.df, 'CLASSWKR')
-        self.df['Hispanic'] = map_codes(self.ddi_codebook, self.df, 'HISPAN')
-        self.df['Asian'] = map_codes(self.ddi_codebook, self.df, 'ASIAN')
-        self.df['Race'] = map_codes(self.ddi_codebook, self.df, 'RACE')
-        self.df['Veteran_Status'] = map_codes(self.ddi_codebook, self.df, 'VETSTAT')
+        self.df["Occupation"] = map_codes(self.ddi_codebook, self.df, "OCC2010")
+        self.df["Education"] = map_codes(self.ddi_codebook, self.df, "EDUC")
+        self.df["Birthplace"] = map_codes(self.ddi_codebook, self.df, "BPL")
+        self.df["Marital_Status"] = map_codes(self.ddi_codebook, self.df, "MARST")
+        self.df["Nativity"] = map_codes(self.ddi_codebook, self.df, "NATIVITY")
+        self.df["Class_of_worker"] = map_codes(self.ddi_codebook, self.df, "CLASSWKR")
+        self.df["Hispanic"] = map_codes(self.ddi_codebook, self.df, "HISPAN")
+        self.df["Asian"] = map_codes(self.ddi_codebook, self.df, "ASIAN")
+        self.df["Race"] = map_codes(self.ddi_codebook, self.df, "RACE")
+        self.df["Veteran_Status"] = map_codes(self.ddi_codebook, self.df, "VETSTAT")
+        self.df["Age"] = self.df["AGE"].astype(float)
 
     def clean_wages(self):
         # Aggregating Wages
         income_buckets = {
-            'INCSS': 'Government',
-            'INCWELFR': 'Government',
-            'INCRETIR': 'Investment',
-            'INCSSI': 'Government',
-            'INCINT': 'Investment',
-            'INCUNEMP': 'Government',
-            'INCWKCOM': 'Wage',
-            'INCVET': 'Government',
-            'INCSURV': 'Government',
-            'INCDISAB': 'Government',
-            'INCDIVID': 'Investment',
-            'INCRENT': 'Investment',
-            'INCEDUC': 'Government',
-            'INCCHILD': 'Government',
-            'INCASIST': 'Government',
-            'INCOTHER': 'Unknown',
-            'INCRANN': 'Investment',
-            'INCPENS': 'Wage',
-            'INCWAGE': 'Wage',
-            'INCBUS': 'Wage',
-            'INCFARM': 'Wage',
+            "INCSS": "Government",
+            "INCWELFR": "Government",
+            "INCRETIR": "Investment",
+            "INCSSI": "Government",
+            "INCINT": "Investment",
+            "INCUNEMP": "Government",
+            "INCWKCOM": "Wage",
+            "INCVET": "Government",
+            "INCSURV": "Government",
+            "INCDISAB": "Government",
+            "INCDIVID": "Investment",
+            "INCRENT": "Investment",
+            "INCEDUC": "Government",
+            "INCCHILD": "Government",
+            "INCASIST": "Government",
+            "INCOTHER": "Unknown",
+            "INCRANN": "Investment",
+            "INCPENS": "Wage",
+            "INCWAGE": "Wage",
+            "INCBUS": "Wage",
+            "INCFARM": "Wage",
         }
 
         column_to_bucket_map = {}
@@ -106,40 +107,73 @@ def clean_wages(self):
         for k in column_to_bucket_map:
             self.df[f"{k} Income"] = self.df[column_to_bucket_map[k]].sum(axis=1)
 
-        self.df['Investment Income as Percent of Total Income'] = np.where(
-            self.df['INCTOT'] == 0,
+        self.df["Investment Income as Percent of Total Income"] = np.where(
+            self.df["INCTOT"] == 0,
             0,
-            self.df['Investment Income'].astype(float) / self.df['INCTOT'].astype(float))
+            self.df["Investment Income"].astype(float)
+            / self.df["INCTOT"].astype(float),
+        )
 
-        self.df['Government Income as Percent of Total Income'] = np.where(
-            self.df['INCTOT'] == 0,
+        self.df["Government Income as Percent of Total Income"] = np.where(
+            self.df["INCTOT"] == 0,
             0,
-            self.df['Government Income'].astype(float) / self.df['INCTOT'].astype(float))
+            self.df["Government Income"].astype(float)
+            / self.df["INCTOT"].astype(float),
+        )
 
-        self.df['Wage Income as Percent of Total Income'] = np.where(
-            self.df['INCTOT'] == 0,
+        self.df["Wage Income as Percent of Total Income"] = np.where(
+            self.df["INCTOT"] == 0,
             0,
-            self.df['Wage Income'].astype(float) / self.df['INCTOT'].astype(float))
-
-        self.df.loc[self.df['Investment Income as Percent of Total Income'] < 0, 'Investment Income as Percent of Total Income'] = 0.
-        self.df.loc[self.df['Investment Income as Percent of Total Income'] > 1, 'Investment Income as Percent of Total Income'] = 1
-        self.df.loc[self.df['Government Income as Percent of Total Income'] < 0, 'Government Income as Percent of Total Income'] = 0.
-        self.df.loc[self.df['Government Income as Percent of Total Income'] > 1, 'Government Income as Percent of Total Income'] = 1
-        self.df.loc[self.df['Wage Income as Percent of Total Income'] < 0, 'Wage Income as Percent of Total Income'] = 0.
-        self.df.loc[self.df['Wage Income as Percent of Total Income'] > 1, 'Wage Income as Percent of Total Income'] = 1
-
-        self.df['Weighted Total Income'] = self.df['INCTOT'] * self.df['ASECWT']
-        self.df['Weighted Government Income'] = self.df['Government Income'] * self.df['ASECWT']
-        self.df['Weighted Investment Income'] = self.df['Investment Income'] * self.df['ASECWT']
-        self.df['Weighted Wage Income'] = self.df['Wage Income'] * self.df['ASECWT']
-
-        self.df['Weighted Government Income as Percent of Total Income'] = self.df['Government Income as Percent of Total Income'] * self.df['ASECWT']
-        self.df['Weighted Investment Income as Percent of Total Income'] = self.df['Investment Income as Percent of Total Income'] * self.df['ASECWT']
-        self.df['Weighted Wage Income as Percent of Total Income'] = self.df['Wage Income as Percent of Total Income'] * self.df['ASECWT']
+            self.df["Wage Income"].astype(float) / self.df["INCTOT"].astype(float),
+        )
+
+        self.df.loc[
+            self.df["Investment Income as Percent of Total Income"] < 0,
+            "Investment Income as Percent of Total Income",
+        ] = 0.0
+        self.df.loc[
+            self.df["Investment Income as Percent of Total Income"] > 1,
+            "Investment Income as Percent of Total Income",
+        ] = 1
+        self.df.loc[
+            self.df["Government Income as Percent of Total Income"] < 0,
+            "Government Income as Percent of Total Income",
+        ] = 0.0
+        self.df.loc[
+            self.df["Government Income as Percent of Total Income"] > 1,
+            "Government Income as Percent of Total Income",
+        ] = 1
+        self.df.loc[
+            self.df["Wage Income as Percent of Total Income"] < 0,
+            "Wage Income as Percent of Total Income",
+        ] = 0.0
+        self.df.loc[
+            self.df["Wage Income as Percent of Total Income"] > 1,
+            "Wage Income as Percent of Total Income",
+        ] = 1
+
+        self.df["Weighted Total Income"] = self.df["INCTOT"] * self.df["ASECWT"]
+        self.df["Weighted Government Income"] = (
+            self.df["Government Income"] * self.df["ASECWT"]
+        )
+        self.df["Weighted Investment Income"] = (
+            self.df["Investment Income"] * self.df["ASECWT"]
+        )
+        self.df["Weighted Wage Income"] = self.df["Wage Income"] * self.df["ASECWT"]
+
+        self.df["Weighted Government Income as Percent of Total Income"] = (
+            self.df["Government Income as Percent of Total Income"] * self.df["ASECWT"]
+        )
+        self.df["Weighted Investment Income as Percent of Total Income"] = (
+            self.df["Investment Income as Percent of Total Income"] * self.df["ASECWT"]
+        )
+        self.df["Weighted Wage Income as Percent of Total Income"] = (
+            self.df["Wage Income as Percent of Total Income"] * self.df["ASECWT"]
+        )
 
     def clean_data(self):
         self.clean_variables()
         self.clean_cps_income()
         self.clean_educ_attainment()
         self.clean_wages()
-        return self.df
+        return self.df
diff --git a/tests/test_clean_data.py b/tests/test_clean_data.py
@@ -7,60 +7,60 @@
 from src.pyipums.clean_data import IpumsCleaner, EDUC_ATTAINMENT
 
 xvars = [
-    'AGE',
-    'ADJGINC',
-    'ASECWT',
-    'ASECWTH',
-    'ASIAN',
-    'ASECFWT',
-    'STATEFIP',
-    'TAXINC',
-    'UHRSWORK1',
-    'RACE',
-    'SEX',
-    'SRCWELFR',
-    'YEAR',
-    'FOODSTAMP',
-    'STAMPVAL',
-    'WTFINL',
-    'BPL',
-    'HISPAN',
-    'EMPSTAT',
-    'LABFORCE',
-    'OCC',
-    'OCC2010',
-    'MARST',
-    'VETSTAT',
-    'CITIZEN',
-    'NATIVITY',
-    'CLASSWKR',
-    'WKSTAT',
-    'EDUC',
-    'OFFPOV',
-    'EARNWT',
-    'INCWAGE',
-    'INCBUS',
-    'INCFARM',
-    'INCSS',
-    'INCWELFR',
-    'INCRETIR',
-    'INCSSI',
-    'INCINT',
-    'INCUNEMP',
-    'INCWKCOM',
-    'INCVET',
-    'INCSURV',
-    'INCDISAB',
-    'INCDIVID',
-    'INCRENT',
-    'INCEDUC',
-    'INCCHILD',
-    'INCASIST',
-    'INCOTHER',
-    'INCRANN',
-    'INCPENS',
-    'INCTOT',
-    'STATECENSUS',
+    "AGE",
+    "ADJGINC",
+    "ASECWT",
+    "ASECWTH",
+    "ASIAN",
+    "ASECFWT",
+    "STATEFIP",
+    "TAXINC",
+    "UHRSWORK1",
+    "RACE",
+    "SEX",
+    "SRCWELFR",
+    "YEAR",
+    "FOODSTAMP",
+    "STAMPVAL",
+    "WTFINL",
+    "BPL",
+    "HISPAN",
+    "EMPSTAT",
+    "LABFORCE",
+    "OCC",
+    "OCC2010",
+    "MARST",
+    "VETSTAT",
+    "CITIZEN",
+    "NATIVITY",
+    "CLASSWKR",
+    "WKSTAT",
+    "EDUC",
+    "OFFPOV",
+    "EARNWT",
+    "INCWAGE",
+    "INCBUS",
+    "INCFARM",
+    "INCSS",
+    "INCWELFR",
+    "INCRETIR",
+    "INCSSI",
+    "INCINT",
+    "INCUNEMP",
+    "INCWKCOM",
+    "INCVET",
+    "INCSURV",
+    "INCDISAB",
+    "INCDIVID",
+    "INCRENT",
+    "INCEDUC",
+    "INCCHILD",
+    "INCASIST",
+    "INCOTHER",
+    "INCRANN",
+    "INCPENS",
+    "INCTOT",
+    "STATECENSUS",
 ]
 
 
@@ -77,7 +77,6 @@ def test_read(self):
 
         df = IpumsCleaner(ipums_df, ddi_codebook).clean_data()
         self.assertEqual(
-            set(df['Educational Attainment'].unique()),
+            set(df["Educational Attainment"].unique()),
             set(EDUC_ATTAINMENT.values()),
         )
-