Suggested - break down the Chapter..N...py files into smaller files ch_1_01...py etc. #27

Open — wants to merge 7 commits into `master`
87 changes: 87 additions & 0 deletions python/code/README.md
@@ -0,0 +1,87 @@
## Instructions and Errors

Tested with Python 3.8.5 on an M1 (Arm) Mac and Python 3.8.11 on an Intel Mac.

For all chapters:
```
$ pip install pandas
$ pip install scikit-learn
$ pip install dmba
$ pip install matplotlib
$ pip install seaborn
```
Then run the Python files from the command line, e.g.:

`$ ./ch_1_01_states_populations_and_murders.py`

(or invoke them as `$ python ch_1_01_states_populations_and_murders.py`). At the top of the ch_*.py files is `#!/usr/local/bin/python`.
This should point to Python 3; adjust the shebang if your Python 3 lives elsewhere.
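All of the split files import a small `common` module (not included in this diff) for dataset paths and the `printx` echo-and-evaluate helper. A minimal sketch consistent with how `common` is called below — the data paths and exact behavior here are assumptions, not the PR's actual code:

```
# common.py -- hypothetical sketch; the PR ships its own version, not shown in this diff
from pathlib import Path

DATA = Path(__file__).resolve().parent / 'data'   # assumed location of the book's datasets
STATE_CSV = DATA / 'state.csv'
AIRPORT_DELAYS_CSV = DATA / 'dfw_airline.csv'
SP500_SECTORS_CSV = DATA / 'sp500_sectors.csv'
SP500_DATA_CSV = DATA / 'sp500_data.csv.gz'

def printx(prefix, expression, namespace):
    """Echo `prefix + expression`, then evaluate the expression and return the result."""
    print(prefix + expression)
    return eval(expression, namespace)
```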

#### Chapter 3:
`# python Chapter\ 3\ -\ Statistial\ Experiments\ and\ Significance\ Testing.py`

Line 77

`print(np.mean(perm_diffs > mean_b - mean_a))`

blows up under Python 3.8.5, scipy 1.7.0, pandas 1.2.4, numpy 1.20.2: `perm_diffs` is a plain Python list, so comparing it to a scalar with `>` raises a `TypeError` instead of broadcasting elementwise.

In `ch_3_01_resampling.py`, it is changed to

`print(np.mean(np.array(perm_diffs) > mean_b - mean_a))`
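
A minimal standalone reproduction of the failure and the fix (illustrative values, not the book's data):

```
# Sketch: comparing a plain list to a scalar fails; converting to an ndarray broadcasts.
import numpy as np

perm_diffs = [0.5, -1.2, 2.3]   # stand-in for the permutation differences
threshold = 1.0

# perm_diffs > threshold        # TypeError: '>' not supported between 'list' and 'float'
print(np.mean(np.array(perm_diffs) > threshold))  # 0.333...: fraction exceeding the threshold
```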


#### Chapter 6:
(On Mac M1 Arm processor)
```
$ arch -arm64 brew install libomp
$ sudo mkdir -p /usr/local/opt/libomp/lib
$ cd /usr/local/opt/libomp/lib
$ ln -s /opt/homebrew/lib/libomp.dylib libomp.dylib
$ pip install xgboost
$ python Chapter\ 6\ -\ Statistical\ Machine\ Learning.py
xgboost.core.XGBoostError: XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
* OpenMP runtime is not installed (vcomp140.dll or libgomp-1.dll for Windows, libomp.dylib for Mac OSX, libgomp.so for Linux and other UNIX-like OSes). Mac OSX users: Run `brew install libomp` to install OpenMP runtime.
* You are running 32-bit Python on a 64-bit OS
Error message(s): ['dlopen(/opt/homebrew/anaconda3/lib/python3.8/site-packages/xgboost/lib/libxgboost.dylib, 6): Library not loaded: /usr/local/opt/libomp/lib/libomp.dylib\n Referenced from: /opt/homebrew/anaconda3/lib/python3.8/site-packages/xgboost/lib/libxgboost.dylib\n
Reason: no suitable image found.
Did find:\n\t/usr/local/opt/libomp/lib/libomp.dylib: mach-o, but wrong architecture\n\t/opt/homebrew/Cellar/libomp/12.0.0/lib/libomp.dylib: mach-o, but wrong architecture']
```
I solved this by copying **libomp.a** and **libomp.dylib** from `/usr/local/Cellar/libomp/11.0.0/lib` on my Intel MacBook Air into `/opt/homebrew/Cellar/libomp/12.0.0/lib` on my Arm M1 MacBook Pro. Now Chapter 6, including all the `ch_6_*.py` files, runs on the Arm M1 box. (`brew install cmake` on the Intel Mac created these files originally.)
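
A quick way to see why the Intel dylibs were the right fix — a hypothetical sanity check, not part of the PR: an x86_64 interpreter (e.g. Anaconda running under Rosetta on an M1) needs x86_64 dylibs, whatever the machine's native architecture.

```
# Hypothetical check: which architecture is this Python, and does xgboost load?
import platform
print(platform.machine())   # 'x86_64' under Rosetta/Anaconda, 'arm64' for native builds

import xgboost              # raises XGBoostError if libomp has the wrong architecture
print(xgboost.__version__)
```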


(On Mac 64-bit Intel processor)
```
$ brew install cmake
$ pip install xgboost
```

#### Chapter 7:
```
$ pip install prince
$ python Chapter\ 7\ -\ Unsupervised\ Learning.py
...
4 : GOOGL
Traceback (most recent call last):
File "Chapter 7 - Unsupervised Learning.py", line 213, in <module>
ax = sns.scatterplot(x='XOM', y='CVX', hue=colors, style=colors,
File "/opt/homebrew/anaconda3/lib/python3.8/site-packages/seaborn/_decorators.py", line 46, in inner_f
return f(**kwargs)
File "/opt/homebrew/anaconda3/lib/python3.8/site-packages/seaborn/relational.py", line 794, in scatterplot
p = _ScatterPlotter(
File "/opt/homebrew/anaconda3/lib/python3.8/site-packages/seaborn/relational.py", line 580, in __init__
super().__init__(data=data, variables=variables)
File "/opt/homebrew/anaconda3/lib/python3.8/site-packages/seaborn/_core.py", line 604, in __init__
self.assign_variables(data, variables)
File "/opt/homebrew/anaconda3/lib/python3.8/site-packages/seaborn/_core.py", line 667, in assign_variables
plot_data, variables = self._assign_variables_longform(
File "/opt/homebrew/anaconda3/lib/python3.8/site-packages/seaborn/_core.py", line 895, in _assign_variables_longform
if val is not None and len(data) != len(val):
TypeError: object of type 'float' has no len()
```
See comment on line 32 of `ch_7_04_model_based_clustering_mixtures_of_normals_selecting_the_number_of_clusters.py`


Finally, a pedantic correction in the book.
First edition, page 194: "For each additional year that the worker is exposed to cotton dust the worker's PEFR measurement is reduced by -4.185." No — it's reduced by 4.185; the coefficient itself is -4.185, and its negative sign already encodes the reduction.
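
A one-line sign check (the -4.185 slope is the book's fitted coefficient; the intercept here is a made-up placeholder):

```
# Sign-convention check for the page-194 sentence.
INTERCEPT = 400.0            # placeholder, not the book's fitted intercept
slope = -4.185               # the book's fitted exposure coefficient

pefr = lambda years: INTERCEPT + slope * years
print(pefr(11) - pefr(10))   # ~ -4.185: one more year of exposure lowers predicted PEFR by 4.185
```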
92 changes: 92 additions & 0 deletions python/code/ch_1_01_states_populations_and_murders.py
@@ -0,0 +1,92 @@
#!/usr/local/bin/python

## Practical Statistics for Data Scientists (Python)
## Chapter 1. Exploratory Data Analysis
# > (c) 2019 Peter C. Bruce, Andrew Bruce, Peter Gedeck

from pathlib import Path
import pandas as pd
import numpy as np
from scipy.stats import trim_mean
from statsmodels import robust
import wquantiles
import seaborn as sns
import matplotlib.pylab as plt
import common

print(""" Estimates of Location
Example: Location Estimates of Population and Murder Rates""")

print("""
Compute the mean, trimmed mean, and median for Population.
For `mean` and `median` we can use the Pandas methods of the data frame.
The trimmed mean requires the `trim_mean` function in _scipy.stats_.
""")
print("state = pd.read_csv('state.csv')")
state = pd.read_csv(common.STATE_CSV)
print(common.printx( "", "state['Population'].mean()", {'state': state} ))

print(common.printx( "", "trim_mean(state['Population'], 0.1)", {'trim_mean': trim_mean, 'state': state} ))
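# The 0.1 above trims 10% of the observations from each end before averaging.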

print(common.printx( "", "state['Population'].median()", {'state': state} ))

print("""
Weighted mean is available with numpy.
For weighted median, we can use the specialised package wquantiles
(https://pypi.org/project/wquantiles/).""")
print(common.printx( "", "state['Murder.Rate'].mean()", {'state': state} ))

print(common.printx( "", "np.average(state['Murder.Rate'], weights=state['Population'])", {'np': np, 'state': state} ))

print(common.printx( "", "wquantiles.median(state['Murder.Rate'], weights=state['Population'])", {'wquantiles': wquantiles, 'state': state} ))

print()
print(" Estimates of Variability")
print( " # Table 1-2" )
print(common.printx( "", "state.head(8)", {'state': state} ))
print()

print(" Standard deviation")
print(common.printx( "", "state['Population'].std()", {'state': state} ))
print()

print(" Interquartile range is calculated as the difference of the 75% and 25% quantile.")
print(common.printx( "", "state['Population'].quantile(0.75) - state['Population'].quantile(0.25)", {'state': state} ))
print()

print(" Median absolute deviation from the median can be calculated with a method in statsmodels")

print(common.printx( "", "robust.scale.mad(state['Population'])", {'robust': robust, 'state': state} ))
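# 0.6744897501960817 is the 0.75 quantile of the standard normal distribution;
# dividing by it rescales the MAD to be comparable to the standard deviation
# for normally distributed data.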
print(common.printx( "", "abs(state['Population'] - state['Population'].median()).median() / 0.6744897501960817",
{'abs': abs, 'state': state} ))
print()

print(""" Percentiles and Boxplots
Pandas has the quantile method for data frames.""")
print(common.printx( "", "state['Murder.Rate'].quantile([0.05, 0.25, 0.5, 0.75, 0.95])", {'state': state} ))
print()

print( " # Table 1.4" )
print("""
percentages = [0.05, 0.25, 0.5, 0.75, 0.95]
df = pd.DataFrame(state['Murder.Rate'].quantile(percentages))
df.index = [f'{p * 100}%' for p in percentages]""")
percentages = [0.05, 0.25, 0.5, 0.75, 0.95]
df = pd.DataFrame(state['Murder.Rate'].quantile(percentages))
df.index = [f'{p * 100}%' for p in percentages]
print( "df.transpose()" )
df.transpose()

print("""
Pandas provides a number of basic exploratory plots; one of them is boxplots

ax = (state['Population']/1_000_000).plot.box(figsize=(3, 4))
ax.set_ylabel('Population (millions)')
plt.tight_layout()
plt.show()
""")

ax = (state['Population']/1_000_000).plot.box(figsize=(3, 4))
ax.set_ylabel('Population (millions)')
plt.tight_layout()
plt.show()
71 changes: 71 additions & 0 deletions python/code/ch_1_02_frequency_table_and_histograms.py
@@ -0,0 +1,71 @@
#!/usr/local/bin/python

## Practical Statistics for Data Scientists (Python)
## Chapter 1. Exploratory Data Analysis
# > (c) 2019 Peter C. Bruce, Andrew Bruce, Peter Gedeck

from pathlib import Path
import pandas as pd
import numpy as np
from scipy.stats import trim_mean
from statsmodels import robust
import wquantiles
import seaborn as sns
import matplotlib.pylab as plt
import common

print(""" Put the US states into \"bins\" by population:
Frequency Table and Histograms
The pandas `cut` function splits the dataset into bins.
There are a number of arguments for the function.
The following code creates 10 equal-width bins.
The method `value_counts` returns a frequency table.
""")

print("state = pd.read_csv('state.csv')")
state = pd.read_csv(common.STATE_CSV)
print("""binnedPopulation = pd.cut(state['Population'], 10)
print(binnedPopulation.value_counts())
""")
binnedPopulation = pd.cut(state['Population'], 10)
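# pd.cut returns an interval-valued Categorical; value_counts() tallies the states per bin.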
print(binnedPopulation.value_counts())

print()
print( " # Table 1.5" )
print()
print("binnedPopulation.name = 'binnedPopulation'")
binnedPopulation.name = 'binnedPopulation'
df = common.printx( "df = ", "pd.concat([state, binnedPopulation], axis=1)", {'state': state, 'pd': pd, 'binnedPopulation': binnedPopulation})
df = common.printx( "df = ", "df.sort_values(by='Population')", {'df': df})

print()
print("groups = []")
groups = []
print("""
for group, subset in df.groupby(by='binnedPopulation'):
groups.append({
'BinRange': group,
'Count': len(subset),
'States': ','.join(subset.Abbreviation)
})
print(pd.DataFrame(groups))""")

for group, subset in df.groupby(by='binnedPopulation'):
groups.append({
'BinRange': group,
'Count': len(subset),
'States': ','.join(subset.Abbreviation)
})
print(pd.DataFrame(groups))

print()
print("Pandas also supports histograms for exploratory data analysis.")

ax = common.printx( "ax = ", "(state['Population'] / 1_000_000).plot.hist(figsize=(4, 4))", {'state': state})
print("ax.set_xlabel('Population (millions)')")
ax.set_xlabel('Population (millions)')

print("""plt.tight_layout()
plt.show()""")
plt.tight_layout()
plt.show()
36 changes: 36 additions & 0 deletions python/code/ch_1_03_binary_and_categorical_data.py
@@ -0,0 +1,36 @@
#!/usr/local/bin/python

## Practical Statistics for Data Scientists (Python)
## Chapter 1. Exploratory Data Analysis
# > (c) 2019 Peter C. Bruce, Andrew Bruce, Peter Gedeck

from pathlib import Path
import pandas as pd
import numpy as np
from scipy.stats import trim_mean
from statsmodels import robust
import wquantiles
import seaborn as sns
import matplotlib.pylab as plt
import common

print(""" Exploring Binary and Categorical Data""")
print()
print( " # Table 1-6" )
print()
print(" Pandas also supports bar charts for displaying a single categorical variable.")

print("""
dfw = pd.read_csv('dfw_airline.csv')
ax = dfw.transpose().plot.bar(figsize=(4, 4), legend=False)
ax.set_xlabel('Cause of delay')
ax.set_ylabel('Count')
plt.tight_layout()
plt.show()
""")
dfw = pd.read_csv(common.AIRPORT_DELAYS_CSV)
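# dfw holds a single row of delay counts; transposing makes each cause its own bar.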
ax = dfw.transpose().plot.bar(figsize=(4, 4), legend=False)
ax.set_xlabel('Cause of delay')
ax.set_ylabel('Count')
plt.tight_layout()
plt.show()
34 changes: 34 additions & 0 deletions python/code/ch_1_04_density_estimates_murder_rate.py
@@ -0,0 +1,34 @@
#!/usr/local/bin/python

## Practical Statistics for Data Scientists (Python)
## Chapter 1. Exploratory Data Analysis
# > (c) 2019 Peter C. Bruce, Andrew Bruce, Peter Gedeck

from pathlib import Path
import pandas as pd
import numpy as np
from scipy.stats import trim_mean
from statsmodels import robust
import wquantiles
import seaborn as sns
import matplotlib.pylab as plt
import common

print(""" Density Estimates
A density plot is an alternative to the histogram that can provide more insight into the distribution of the data points.
Use the argument bw_method to control the smoothness of the density curve (bw_method = 0.3).
""")
print("state = pd.read_csv('state.csv')")
state = pd.read_csv(common.STATE_CSV)

ax = common.printx( "ax = ", """state['Murder.Rate'].plot.hist(density=True, xlim=[0, 12],
bins=range(1,12), figsize=(4, 4))""", {'state': state} )
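# bw_method sets the kernel bandwidth: smaller values follow the data more
# closely, larger values produce a smoother curve.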
print("""state['Murder.Rate'].plot.density(ax=ax, bw_method=0.3)
ax.set_xlabel('Murder Rate (per 100,000)')
plt.tight_layout()
plt.show()""")

state['Murder.Rate'].plot.density(ax=ax, bw_method=0.3)
ax.set_xlabel('Murder Rate (per 100,000)')
plt.tight_layout()
plt.show()
64 changes: 64 additions & 0 deletions python/code/ch_1_05_sectors_correlation.py
@@ -0,0 +1,64 @@
#!/usr/local/bin/python

## Practical Statistics for Data Scientists (Python)
## Chapter 1. Exploratory Data Analysis
# > (c) 2019 Peter C. Bruce, Andrew Bruce, Peter Gedeck

from pathlib import Path
import pandas as pd
import numpy as np
from scipy.stats import trim_mean
from statsmodels import robust
import wquantiles
import seaborn as sns
import matplotlib.pylab as plt
import common

print("""
sp500_sym = pd.read_csv('sp500_sectors.csv')
sp500_px = pd.read_csv( 'sp500_data.csv.gz', index_col=0)""")
sp500_sym = pd.read_csv(common.SP500_SECTORS_CSV)
sp500_px = pd.read_csv( common.SP500_DATA_CSV, index_col=0)

print()
print( " # Table 1-7" )
print("""
Determine telecommunications symbols

telecomSymbols = sp500_sym[sp500_sym['sector'] == 'telecommunications_services']['symbol']
""")
telecomSymbols = sp500_sym[sp500_sym['sector'] == 'telecommunications_services']['symbol']

print(" Filter data for dates July 2012 through June 2015")
telecom = common.printx( "telecom = ", "sp500_px.loc[sp500_px.index >= '2012-07-01', telecomSymbols]",
{'sp500_px': sp500_px, 'telecomSymbols': telecomSymbols} )
print("print(telecom.corr())")
print(telecom.corr())
print()

print(" Next we focus on funds traded on major exchanges (sector == 'etf').")

etfs = common.printx( "etfs = ", """sp500_px.loc[sp500_px.index > '2012-07-01',
sp500_sym[sp500_sym['sector'] == 'etf']['symbol']]""",
{'sp500_px': sp500_px, 'sp500_sym': sp500_sym} )
print("print(etfs.head())")
print(etfs.head())

print("""
Due to the large number of columns in this table, looking at the correlation matrix
is cumbersome and it's more convenient to plot the correlation as a heatmap.
The seaborn package provides a convenient implementation for heatmaps.
""")

print("fig, ax = plt.subplots(figsize=(5, 4))")
fig, ax = plt.subplots(figsize=(5, 4))
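# sns.diverging_palette(20, 220) builds a red-to-blue colormap centered at zero,
# so positive and negative correlations read as opposite hues.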

ax = common.printx( "ax = ", """sns.heatmap(etfs.corr(), vmin=-1, vmax=1,
cmap=sns.diverging_palette(20, 220, as_cmap=True),
ax=ax)""", {'sns': sns, 'etfs': etfs, 'ax': ax} )
print("""plt.tight_layout()
plt.show()""")
plt.tight_layout()
plt.show()