Skip to content

Commit

Permalink
updated
Browse files: browse the repository at this point in the history
  • Loading branch information
franciscojavierarceo committed Dec 10, 2023
1 parent 369acf4 commit b698477
Show file tree
Hide file tree
Showing 2 changed files with 1 addition and 191 deletions.
191 changes: 0 additions & 191 deletions demo.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -503,14 +503,6 @@
"wdf = ipums_df[ipums_df['ASECWT'].isnull()==False].reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "81db5b7c-774b-4978-8b09-fd660045763c",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 18,
Expand Down Expand Up @@ -571,189 +563,6 @@
" wdf[f\"{k} Income\"] = wdf[column_to_bucket_map[k]].sum(axis=1)\n"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "d087ce82-68b4-4b18-9d57-dacd352ab832",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(0, 31613)"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wdf['Investment Income'].astype(float).isnull().sum(), wdf['INCTOT'].astype(float).isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "0248dbd4-a37a-4b1a-b6b0-affb052cbb0b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([4.52284034e-04, 6.11583389e-05, 4.09590410e-02, ...,\n",
" nan, nan, nan])"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.where(\n",
" wdf['INCTOT'].astype(float) == 0,\n",
" 0,\n",
" wdf['Investment Income'].astype(float) / wdf['INCTOT'].astype(float))\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "9b44970f-23ef-4cf1-a7bc-22b840e32e3f",
"metadata": {},
"outputs": [
{
"ename": "TypeError",
"evalue": "boolean value of NA is ambiguous",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[22], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m wdf[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mInvestment Income as Percent of Total Income\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwhere\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2\u001b[0m \u001b[43m \u001b[49m\u001b[43mwdf\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mINCTOT\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m==\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4\u001b[0m \u001b[43m \u001b[49m\u001b[43mwdf\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mInvestment Income\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mastype\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mfloat\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m/\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mwdf\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mINCTOT\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mastype\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mfloat\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 6\u001b[0m wdf[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mGovernment Income as Percent of Total Income\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mwhere(\n\u001b[1;32m 7\u001b[0m wdf[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mINCTOT\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m,\n\u001b[1;32m 8\u001b[0m 
\u001b[38;5;241m0\u001b[39m,\n\u001b[1;32m 9\u001b[0m wdf[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mGovernment Income\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mastype(\u001b[38;5;28mfloat\u001b[39m) \u001b[38;5;241m/\u001b[39m wdf[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mINCTOT\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mastype(\u001b[38;5;28mfloat\u001b[39m))\n\u001b[1;32m 11\u001b[0m wdf[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mWage Income as Percent of Total Income\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mwhere(\n\u001b[1;32m 12\u001b[0m wdf[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mINCTOT\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m,\n\u001b[1;32m 13\u001b[0m \u001b[38;5;241m0\u001b[39m,\n\u001b[1;32m 14\u001b[0m wdf[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mWage Income\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mastype(\u001b[38;5;28mfloat\u001b[39m) \u001b[38;5;241m/\u001b[39m wdf[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mINCTOT\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mastype(\u001b[38;5;28mfloat\u001b[39m))\n",
"File \u001b[0;32m<__array_function__ internals>:180\u001b[0m, in \u001b[0;36mwhere\u001b[0;34m(*args, **kwargs)\u001b[0m\n",
"File \u001b[0;32m~/.pyenv/versions/3.8.16/lib/python3.8/site-packages/pandas/_libs/missing.pyx:382\u001b[0m, in \u001b[0;36mpandas._libs.missing.NAType.__bool__\u001b[0;34m()\u001b[0m\n",
"\u001b[0;31mTypeError\u001b[0m: boolean value of NA is ambiguous"
]
}
],
"source": [
"\n",
"wdf['Investment Income as Percent of Total Income'] = np.where(\n",
" wdf['INCTOT'] == 0,\n",
" 0,\n",
" wdf['Investment Income'].astype(float) / wdf['INCTOT'].astype(float))\n",
"\n",
"wdf['Government Income as Percent of Total Income'] = np.where(\n",
" wdf['INCTOT'] == 0,\n",
" 0,\n",
" wdf['Government Income'].astype(float) / wdf['INCTOT'].astype(float))\n",
"\n",
"wdf['Wage Income as Percent of Total Income'] = np.where(\n",
" wdf['INCTOT'] == 0,\n",
" 0,\n",
" wdf['Wage Income'].astype(float) / wdf['INCTOT'].astype(float))\n"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "8483607a-359c-4615-be7b-ed57e5ec28d3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"31615"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['INCTOT_2'].isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "12edee1f-407d-482c-8fd2-1611028fc7ae",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Int64Dtype()"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wdf['INCTOT'].dtype"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "3b643d5f-c953-41cc-aa1b-b4c0ef853c26",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 44220\n",
"1 32702\n",
"2 50050\n",
"3 40001\n",
"4 20000\n",
" ... \n",
"152727 31300\n",
"152728 0\n",
"152729 NaN\n",
"152730 NaN\n",
"152731 NaN\n",
"Name: INCTOT_2, Length: 152732, dtype: object"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['INCTOT_2'] "
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "4c7fa010-cde2-46f8-8f21-ff7524391d4e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([4.52284034e-04, 6.11583389e-05, 4.09590410e-02, ...,\n",
" nan, nan, nan])"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.where(\n",
" df['INCTOT_2'] == 0,\n",
" 0,\n",
" df['Investment Income'].astype(float) / df['INCTOT_2'].astype(float))\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
Expand Down
1 change: 1 addition & 0 deletions src/pyipums/clean_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ def clean_variables(self):
self.df["Nativity"] = map_codes(self.ddi_codebook, self.df, "NATIVITY")
self.df["Class_of_worker"] = map_codes(self.ddi_codebook, self.df, "CLASSWKR")
self.df["Hispanic"] = map_codes(self.ddi_codebook, self.df, "HISPAN")
self.df['Hispanic or Not'] = np.where(self.df['Hispanic'] != 'Not Hispanic', 'Hispanic', 'Not Hispanic')
self.df["Asian"] = map_codes(self.ddi_codebook, self.df, "ASIAN")
self.df["Race"] = map_codes(self.ddi_codebook, self.df, "RACE")
self.df["Veteran_Status"] = map_codes(self.ddi_codebook, self.df, "VETSTAT")
Expand Down

0 comments on commit b698477

Please sign in to comment.