From cadbaf412566e356848b66afad88ba15c24085b2 Mon Sep 17 00:00:00 2001 From: Stephen Richer Date: Thu, 15 Sep 2022 10:50:05 +0100 Subject: [PATCH] Delete. --- development/1.buildCatBoost.md | 379 --------------------------------- development/1.simulateData.md | 74 ------- development/testDNAttend.md | 88 -------- 3 files changed, 541 deletions(-) delete mode 100644 development/1.buildCatBoost.md delete mode 100644 development/1.simulateData.md delete mode 100644 development/testDNAttend.md diff --git a/development/1.buildCatBoost.md b/development/1.buildCatBoost.md deleted file mode 100644 index 934f89c..0000000 --- a/development/1.buildCatBoost.md +++ /dev/null @@ -1,379 +0,0 @@ ---- -jupyter: - jupytext: - text_representation: - extension: .md - format_name: markdown - format_version: '1.3' - jupytext_version: 1.14.1 - kernelspec: - display_name: Python 3 (ipykernel) - language: python - name: python3 ---- - -```python -import sys -import json -import shap -import sklearn -import numpy as np -import pandas as pd -from pathlib import Path -from pprint import pprint -import matplotlib.pyplot as plt -from sklearn.pipeline import Pipeline -from sklearn.impute import SimpleImputer -from scipy.stats import randint, uniform -from catboost import CatBoostClassifier, Pool -from sklearn.compose import ColumnTransformer -from sklearn.model_selection import RandomizedSearchCV -from sklearn.base import BaseEstimator, TransformerMixin -from sklearn.model_selection import train_test_split, KFold -from sklearn.utils.class_weight import compute_class_weight -from sklearn.metrics import roc_curve, roc_auc_score, classification_report, RocCurveDisplay - -sklearn.set_config(display='diagram') -``` - -```python -from dnattend.data import generateData -``` - -```python -#df = pd.read_pickle('data/OPattendance.pkl').drop_duplicates() - -# Remove other non-attendance reasons (e.g. provider cancellation) -#df = df.loc[df['attendanceStatus'].isin(['Attended', 'Did Not Attend'])] - -df = generateData() -``` - -```python -# Train test validation split sizes -train_size = 0.7 -test_size = 0.15 -val_size = 0.15 - -prepArgs = ({ - 'catCols': [ - 'siteCode', 'priorityCode', 'consultationMediaCode', - 'specialityCode', 'appointmentWeekday', 'Age' - ], - 'boolCols': [ - 'firstAppointment' - ], -}) - -# Global seed for reproducibility -seed = 42 - -# Output JSON file to store best parameters and report -bestParamsOut = 'bestParams.json' -scoreReportOut = 'classifierReport.json' - -# Iterations for CatBoost during hyper-tuning -# Don't set too high (will slow down tuning) -# This is optimised later using early stopping -catboostIterations = 100 - -# Cross-validated hyper-parameter tuning -cvIterations = 5 # Increase for better tunings -nFolds = 5 # Number of cross-validation folds -nJobs = 1 # Cores - set to -1 to use all available -params = ({ - 'estimator__depth': randint(4, 10), - 'estimator__l2_leaf_reg': randint(2, 10), - 'estimator__random_strength': uniform.rvs(0, 10, size=100), -}) - -# Post-tuning early stopping iterations (with eval set) -evalIterations = 10000 -earlyStoppingRounds = 10 -``` - -```python -for var in ([ - seed, cvIterations, nFolds, nJobs, - evalIterations, earlyStoppingRounds - ]): - assert isinstance(var, int) - -for out in [bestParamsOut, scoreReportOut]: - assert out.endswith('.json') - assert Path(out).parent.exists() - -assert train_size + test_size + val_size == 1 - -np.random.seed(seed) # Set global seed -``` - -```python -# Keep only a single entry per patient (most recent) -if {'appointmentDateTime', 'patientID'}.issubset(df.columns): - df = ( - df.sort_values('appointmentDateTime', ascending=False) - .groupby('patientID').head(1) - ) -else: - msg = 'Recommended removing multiple records of same patient.' - print(msg, file=sys.stderr) -``` - -def mapTarget(x): - names = ({ - '1': 'DNA', 'y': 'DNA', 't': 'DNA', - 'd': 'DNA', '0': 'Attend', 'n': 'Attend', - 'f': 'Attend', 'a': 'Attend' - }) - x = str(x).lower().strip()[0] - return names[x] - -df['DNA'] = df['DNA'].apply(mapTarget) -assert set(df['DNA']) == set(['DNA', 'Attend']) - -```python -class prepareData(BaseEstimator, TransformerMixin): - - def __init__( - self, - catCols: list = None, - numericCols: list = None, - boolCols: list = None): - self.catCols = [] if catCols is None else catCols - self.numericCols = [] if numericCols is None else numericCols - self.boolCols = [] if boolCols is None else boolCols - self._setCatColIdx() - - def fit(self, X, y=None): - return self - - def transform(self, X, y=None): - for col in self.boolCols: - X[col] = self._mapBoolCol(X[col]) - X[self.catCols] = X[self.catCols].astype(str) - return X.loc[:, self.validCols] - - def _mapBoolCol(self, col): - col = col.apply(lambda x: str(x).lower().strip()[0]) - names = ({ - '1': 1, 'y': 1, 't': 1, - '0': 0, 'n': 0, 'f': 0 - }) - col = col.map(names) - col.loc[~col.isin([0,1])] = np.nan - return col - - @property - def validCols(self): - return self.catCols + self.numericCols + self.boolCols - - def _setCatColIdx(self): - """ Get indices of categoric cols """ - self.catColIdx = [] - for col in self.catCols: - if col in self.validCols: - self.catColIdx.append( - self.validCols.index(col)) -``` - -```python -def train_test_validation_split( - X, y, train_size=0.8, test_size=0.1, val_size=0.1, seed=None): - assert train_size + test_size + val_size == 1 - rng = np.random.default_rng(seed) - - X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=(1 - train_size), - random_state=rng.integers(1e9)) - - split2Size = test_size / (test_size + val_size) - X_val, X_test, y_val, y_test = train_test_split( - X_test, y_test, test_size=split2Size, - random_state=rng.integers(1e9)) - - - return ( - X_train.copy(), X_test.copy(), X_val.copy(), - y_train.copy(), y_test.copy(), y_val.copy() - ) -``` - -```python -prepData = prepareData(**prepArgs) - -X = df.copy() -y = X.pop('DNA') - -split = train_test_validation_split( - X, y, train_size=train_size, test_size=test_size, - val_size=val_size, seed=np.random.randint(1e9) -) -X_train, X_test, X_val, y_train, y_test, y_val = map(lambda x: x.copy(), split) -``` - -```python -# Set class weights - this balanced prediction probalities for more easy interpretation -classes = np.unique(y_train) -weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train) -class_weights = dict(zip(classes, weights)) -``` - -```python -# Combine all transformations into a ColumnTransformer -transformers = ([ - ('categories', SimpleImputer(strategy='constant'), prepData.catCols), - ('numeric', SimpleImputer(strategy='mean'), prepData.numericCols), - ('boolean', SimpleImputer(strategy='most_frequent'), prepData.boolCols), -]) -featureTransformer = ColumnTransformer( - transformers=transformers, remainder='drop') -``` - -```python -# Define a preProcessor Pipeline encompassing the FeatureEngineering and featureTransformation steps -preProcessor = Pipeline(steps=[ - ('prepare', prepareData(**prepArgs)), - ('columnTransform', featureTransformer), -]) -``` - -```python -# Combine processor and modelling steps into a Pipeline object -model = Pipeline(steps=[ - ('preprocess', preProcessor), - ('estimator', CatBoostClassifier( - cat_features=preProcessor.named_steps['prepare'].catColIdx, - eval_metric='Logloss', - class_weights=class_weights, - iterations=catboostIterations, verbose=0, - random_seed=np.random.randint(1e9))), -]) -``` - -```python -gridSearch = RandomizedSearchCV( - model, params, scoring='neg_log_loss', - random_state=np.random.randint(1e9), cv=nFolds, - refit=False, n_jobs=nJobs, n_iter=cvIterations, verbose=2) -_ = gridSearch.fit(X_train, y_train) -``` - -```python -# Extract best parameters from cross-validated randomised search -params = gridSearch.best_params_ -params['estimator__iterations'] = evalIterations -_ = model.set_params(**params) -``` - -```python -# Pre-process the validation set with the tuned model parameters. -# Required since eval_set is other not processed before CatBoost -X_val = model.named_steps['preprocess'].fit(X_train, y_train).transform(X_val) -evalSet = Pool(X_val, y_val, cat_features=prepData.catColIdx) -``` - -```python -_ = model.fit( - X_train, y_train, estimator__eval_set=evalSet, - estimator__early_stopping_rounds=earlyStoppingRounds) -``` - -```python -# Update iteration parameter to optimal and write to file -bestIteration = model.named_steps['estimator'].get_best_iteration() -params['estimator__iterations'] = bestIteration - -with open(bestParamsOut, 'w') as fh: - json.dump(params, fh) -``` - -### Tune Decision Threshold - -```python -def predict(model, X, threshold=0.5): - posProb = pd.Series(model.predict_proba(X)[:,1]) - classes = model.classes_ - predictions = ( - posProb.apply( - lambda x: classes[0] if x < threshold else classes[1]) - ) - return predictions -``` - -```python -y_trainInt = y_train.apply(lambda x: 1 if x == model.classes_[1] else 0) -y_predPos = model.predict_proba(X_train)[:,1] - -fpr, tpr, thresholds = roc_curve( - y_trainInt, y_predPos, drop_intermediate=False) -AUC = roc_auc_score(y_trainInt, y_predPos) - -idx = np.argmin(np.abs(fpr + tpr - 1)) -optimalThreshold = thresholds[idx] -``` - -```python -fig, ax = plt.subplots() -RocCurveDisplay.from_estimator(model, X_train, y_train, ax=ax) -ax.set_xlim([0, 1]) -ax.set_ylim([0, 1]) -ax.axhline(tpr[idx], xmax=fpr[idx], ls='--', alpha=0.5, c='black') -ax.axvline(fpr[idx], ymax=tpr[idx], ls='--', alpha=0.5, c='black') -ax.scatter(fpr[idx], tpr[idx], c='black') -ax.set_xlabel('False Positive Rate') -ax.set_ylabel('True Positive Rate') -label = f'AUC = {AUC:.2f}, Optimal Threshold = {optimalThreshold:.2f}' -ax.legend(labels=[label], loc='lower right') -fig.savefig('ROCcurve.pdf') -``` - -```python -importances = pd.Series( - model.named_steps['estimator'].feature_importances_, - prepareData(**prepArgs).validCols -).sort_values(ascending=False) -importances -``` - -```python -df['DNAprob'] = pd.DataFrame(model.predict_proba(df), columns=model.classes_)['DNA'].values -df['DNApredict'] = predict(model, df, threshold=optimalThreshold) -``` - -```python -testPredictions = predict(model, X_test, threshold=optimalThreshold) -report = classification_report(y_test, testPredictions, output_dict=True) -with open(scoreReportOut, 'w') as fh: - json.dump(report, fh) -``` - -```python -pprint(report) -``` - -```python -explainer = shap.Explainer(model.named_steps['estimator'], feature_names=prepData.validCols) -``` - -```python -preTransformer = model.named_steps['preprocess'].fit(X_train, y_train) -X_transformed = preTransformer.transform(X.sample(1).copy()) -``` - -```python -shap_values = explainer(X_transformed) -``` - -```python -# waterfall plot for first observation -shap.plots.waterfall(shap_values[0]) -``` - -```python -df.groupby(['firstAppointment'])['DNAprob'].agg(['mean', 'size']) -``` - -```python - -``` diff --git a/development/1.simulateData.md b/development/1.simulateData.md deleted file mode 100644 index ab7f4c7..0000000 --- a/development/1.simulateData.md +++ /dev/null @@ -1,74 +0,0 @@ ---- -jupyter: - jupytext: - formats: ipynb,md - text_representation: - extension: .md - format_name: markdown - format_version: '1.3' - jupytext_version: 1.14.1 - kernelspec: - display_name: Python 3 (ipykernel) - language: python - name: python3 ---- - -```python -import numpy as np -import pandas as pd -``` - -```python -def DNAprob(x): - """ Simulate DNA probability with artificial values """ - modifiers = ({ - 'weekday': 0.05, - 'priority': 0.1, - 'age': 0.1, - 'firstAppointment': 0.2, - 'consultationMedia': 0.3, - 'site': 0.4 - }) - maxModifier = np.array(list(modifiers.values())).sum() - minModifier = -maxModifier - if x['weekday'] in ['Saturday', 'Sunday']: - modifiers['weekday'] *= -1 - if x['priority'] == 'Two Week Wait': - modifiers['priority'] *= -1 - if x['ageGroup'] == 'Over 65': - modifiers['age'] *= -1 - if not x['firstAppointment']: - modifiers['firstAppointment'] *= -1 - if x['consultationMedia'] == 'In-Person': - modifiers['consultationMedia'] *= -1 - if x['site'] == 'Lakeside': - modifiers['site'] *= -1 - # Get probability score as sum of modifiers - p = np.array(list(modifiers.values())).sum() - # Normalise p to [0, 1] - p = ((p - minModifier) / (maxModifier - minModifier)) - DNA = np.random.choice([True, False], p=[p, 1-p]) - return pd.Series([p, DNA]) -``` - -```python -seed = 42 -samples = 50_000 -np.random.seed(seed) - -daysOfWeek = ([ - 'Wednesday', 'Tuesday', 'Monday', 'Sunday', - 'Saturday', 'Friday', 'Thursday' -]) - -data = pd.DataFrame({ - 'weekday': np.random.choice(daysOfWeek, samples), - 'priority': np.random.choice(['Urgent', 'Two Week Wait'], samples), - 'ageGroup': np.random.choice(['Over 65', 'Under 65'], samples), - 'speciality': np.random.choice(['Ophthalmology', 'Audiology'], samples), - 'firstAppointment': np.random.choice([True, False], samples), - 'consultationMedia': np.random.choice(['Telephone', 'In-Person'], samples), - 'site': np.random.choice(['Fairview', 'Lakeside'], samples) -}) -data[['DNAprob', 'DNA']] = data.apply(DNAprob, axis=1) -``` diff --git a/development/testDNAttend.md b/development/testDNAttend.md deleted file mode 100644 index 364f861..0000000 --- a/development/testDNAttend.md +++ /dev/null @@ -1,88 +0,0 @@ ---- -jupyter: - jupytext: - formats: ipynb,md - text_representation: - extension: .md - format_name: markdown - format_version: '1.3' - jupytext_version: 1.14.1 - kernelspec: - display_name: Python 3 (ipykernel) - language: python - name: python3 ---- - -```python -%matplotlib inline -from sklearn.base import clone -from scipy.stats import randint, uniform -from dnattend.data import generateData -from dnattend.train import trainModel, splitData, refitAllData -from dnattend.test import getFeatureImportance, plotROC, predict, evaluate -``` - -```python -df = generateData(size=50_000, seed=42) -``` - -```python -data = splitData(df, target='status', train_size=0.7, test_size=0.15, val_size=0.15) -``` - -```python -catCols = ['day', 'priority', 'speciality', 'consultationMedia', 'site'] -boolCols = ['firstAppointment'] -numericCols = ['age'] - -trainingParams = ({ - 'catCols': catCols, - 'boolCols': boolCols, - 'numericCols': numericCols, - 'cvFolds': 5, - 'catboostIterations': 100, - 'hypertuneIterations': 5, - 'evalIterations': 10_000, - 'earlyStoppingRounds': 10, - 'seed': 42 -}) - -hyperParams = ({ - 'estimator__depth': randint(4, 10), - 'estimator__l2_leaf_reg': randint(2, 10), - 'estimator__random_strength': uniform.rvs(0, 10, size=100), -}) - -model, params = trainModel(data, hyperParams=hyperParams, **trainingParams) -``` - -```python -oldParams = model.get_params() -``` - -```python -featureImportances = getFeatureImportance(model) -fig = featureImportances.plot.barh() -fig.figure.savefig('../README_files/featureImportances.svg', dpi=300) -``` - -```python -fig, ax = plotROC(model, data) -fig.figure.savefig('../README_files/ROCcurve.svg', dpi=300) -``` - -```python -report = evaluate(model, data) -``` - -```python -model = refitAllData(model, params, data) -``` - -```python -df[['Attend', 'DNA', 'class']] = predict(model, df) -``` - -```python - -```