updated

franciscojavierarceo · Dec 10, 2023 · b698477 · b698477
1 parent 369acf4
commit b698477
Show file tree

Hide file tree

Showing 2 changed files with 1 addition and 191 deletions.
diff --git a/demo.ipynb b/demo.ipynb
@@ -503,14 +503,6 @@
     "wdf = ipums_df[ipums_df['ASECWT'].isnull()==False].reset_index(drop=True)"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "81db5b7c-774b-4978-8b09-fd660045763c",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
   {
    "cell_type": "code",
    "execution_count": 18,
@@ -571,189 +563,6 @@
     "    wdf[f\"{k} Income\"] = wdf[column_to_bucket_map[k]].sum(axis=1)\n"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 25,
-   "id": "d087ce82-68b4-4b18-9d57-dacd352ab832",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "(0, 31613)"
-      ]
-     },
-     "execution_count": 25,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "wdf['Investment Income'].astype(float).isnull().sum(), wdf['INCTOT'].astype(float).isnull().sum()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 35,
-   "id": "0248dbd4-a37a-4b1a-b6b0-affb052cbb0b",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([4.52284034e-04, 6.11583389e-05, 4.09590410e-02, ...,\n",
-       "                  nan,            nan,            nan])"
-      ]
-     },
-     "execution_count": 35,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "np.where(\n",
-    "    wdf['INCTOT'].astype(float) == 0,\n",
-    "    0,\n",
-    "    wdf['Investment Income'].astype(float) / wdf['INCTOT'].astype(float))\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 22,
-   "id": "9b44970f-23ef-4cf1-a7bc-22b840e32e3f",
-   "metadata": {},
-   "outputs": [
-    {
-     "ename": "TypeError",
-     "evalue": "boolean value of NA is ambiguous",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[22], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m wdf[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mInvestment Income as Percent of Total Income\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwhere\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m      2\u001b[0m \u001b[43m    \u001b[49m\u001b[43mwdf\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mINCTOT\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m==\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m      3\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m      4\u001b[0m \u001b[43m    \u001b[49m\u001b[43mwdf\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mInvestment Income\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mastype\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mfloat\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m/\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mwdf\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mINCTOT\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mastype\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mfloat\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m      6\u001b[0m wdf[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mGovernment Income as Percent of Total Income\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mwhere(\n\u001b[1;32m      7\u001b[0m     wdf[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mINCTOT\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m,\n\u001b[1;32m      8\u001b[0m     \u001b[38;5;241m0\u001b[39m,\n\u001b[1;32m      9\u001b[0m     wdf[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mGovernment Income\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mastype(\u001b[38;5;28mfloat\u001b[39m) \u001b[38;5;241m/\u001b[39m wdf[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mINCTOT\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mastype(\u001b[38;5;28mfloat\u001b[39m))\n\u001b[1;32m     11\u001b[0m wdf[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mWage Income as Percent of Total Income\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mwhere(\n\u001b[1;32m     12\u001b[0m     wdf[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mINCTOT\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m,\n\u001b[1;32m     13\u001b[0m     \u001b[38;5;241m0\u001b[39m,\n\u001b[1;32m     14\u001b[0m     wdf[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mWage Income\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mastype(\u001b[38;5;28mfloat\u001b[39m) \u001b[38;5;241m/\u001b[39m wdf[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mINCTOT\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mastype(\u001b[38;5;28mfloat\u001b[39m))\n",
-      "File \u001b[0;32m<__array_function__ internals>:180\u001b[0m, in \u001b[0;36mwhere\u001b[0;34m(*args, **kwargs)\u001b[0m\n",
-      "File \u001b[0;32m~/.pyenv/versions/3.8.16/lib/python3.8/site-packages/pandas/_libs/missing.pyx:382\u001b[0m, in \u001b[0;36mpandas._libs.missing.NAType.__bool__\u001b[0;34m()\u001b[0m\n",
-      "\u001b[0;31mTypeError\u001b[0m: boolean value of NA is ambiguous"
-     ]
-    }
-   ],
-   "source": [
-    "\n",
-    "wdf['Investment Income as Percent of Total Income'] = np.where(\n",
-    "    wdf['INCTOT'] == 0,\n",
-    "    0,\n",
-    "    wdf['Investment Income'].astype(float) / wdf['INCTOT'].astype(float))\n",
-    "\n",
-    "wdf['Government Income as Percent of Total Income'] = np.where(\n",
-    "    wdf['INCTOT'] == 0,\n",
-    "    0,\n",
-    "    wdf['Government Income'].astype(float) / wdf['INCTOT'].astype(float))\n",
-    "\n",
-    "wdf['Wage Income as Percent of Total Income'] = np.where(\n",
-    "    wdf['INCTOT'] == 0,\n",
-    "    0,\n",
-    "    wdf['Wage Income'].astype(float) / wdf['INCTOT'].astype(float))\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 29,
-   "id": "8483607a-359c-4615-be7b-ed57e5ec28d3",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "31615"
-      ]
-     },
-     "execution_count": 29,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df['INCTOT_2'].isnull().sum()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 33,
-   "id": "12edee1f-407d-482c-8fd2-1611028fc7ae",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "Int64Dtype()"
-      ]
-     },
-     "execution_count": 33,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "wdf['INCTOT'].dtype"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 31,
-   "id": "3b643d5f-c953-41cc-aa1b-b4c0ef853c26",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "0         44220\n",
-       "1         32702\n",
-       "2         50050\n",
-       "3         40001\n",
-       "4         20000\n",
-       "          ...  \n",
-       "152727    31300\n",
-       "152728        0\n",
-       "152729      NaN\n",
-       "152730      NaN\n",
-       "152731      NaN\n",
-       "Name: INCTOT_2, Length: 152732, dtype: object"
-      ]
-     },
-     "execution_count": 31,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df['INCTOT_2'] "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 30,
-   "id": "4c7fa010-cde2-46f8-8f21-ff7524391d4e",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([4.52284034e-04, 6.11583389e-05, 4.09590410e-02, ...,\n",
-       "                  nan,            nan,            nan])"
-      ]
-     },
-     "execution_count": 30,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "np.where(\n",
-    "    df['INCTOT_2'] == 0,\n",
-    "    0,\n",
-    "    df['Investment Income'].astype(float) / df['INCTOT_2'].astype(float))\n"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 10,

diff --git a/src/pyipums/clean_data.py b/src/pyipums/clean_data.py
@@ -64,6 +64,7 @@ def clean_variables(self):
         self.df["Nativity"] = map_codes(self.ddi_codebook, self.df, "NATIVITY")
         self.df["Class_of_worker"] = map_codes(self.ddi_codebook, self.df, "CLASSWKR")
         self.df["Hispanic"] = map_codes(self.ddi_codebook, self.df, "HISPAN")
+        self.df['Hispanic or Not'] = np.where(self.df['Hispanic'] != 'Not Hispanic', 'Hispanic', 'Not Hispanic')
         self.df["Asian"] = map_codes(self.ddi_codebook, self.df, "ASIAN")
         self.df["Race"] = map_codes(self.ddi_codebook, self.df, "RACE")
         self.df["Veteran_Status"] = map_codes(self.ddi_codebook, self.df, "VETSTAT")