From cadbaf412566e356848b66afad88ba15c24085b2 Mon Sep 17 00:00:00 2001
From: Stephen Richer <sr467@bath.ac.uk>
Date: Thu, 15 Sep 2022 10:50:05 +0100
Subject: [PATCH] Delete development notebooks.

---
 development/1.buildCatBoost.md | 379 ---------------------------------
 development/1.simulateData.md  |  74 -------
 development/testDNAttend.md    |  88 --------
 3 files changed, 541 deletions(-)
 delete mode 100644 development/1.buildCatBoost.md
 delete mode 100644 development/1.simulateData.md
 delete mode 100644 development/testDNAttend.md

diff --git a/development/1.buildCatBoost.md b/development/1.buildCatBoost.md
deleted file mode 100644
index 934f89c..0000000
--- a/development/1.buildCatBoost.md
+++ /dev/null
@@ -1,379 +0,0 @@
----
-jupyter:
-  jupytext:
-    text_representation:
-      extension: .md
-      format_name: markdown
-      format_version: '1.3'
-      jupytext_version: 1.14.1
-  kernelspec:
-    display_name: Python 3 (ipykernel)
-    language: python
-    name: python3
----
-
-```python
-import sys
-import json
-import shap
-import sklearn
-import numpy as np
-import pandas as pd
-from pathlib import Path
-from pprint import pprint
-import matplotlib.pyplot as plt
-from sklearn.pipeline import Pipeline
-from sklearn.impute import SimpleImputer
-from scipy.stats import randint, uniform
-from catboost import CatBoostClassifier, Pool
-from sklearn.compose import ColumnTransformer
-from sklearn.model_selection import RandomizedSearchCV
-from sklearn.base import BaseEstimator, TransformerMixin
-from sklearn.model_selection import train_test_split, KFold
-from sklearn.utils.class_weight import compute_class_weight
-from sklearn.metrics import roc_curve, roc_auc_score, classification_report, RocCurveDisplay
-
-sklearn.set_config(display='diagram')
-```
-
-```python
-from dnattend.data import generateData
-```
-
-```python
-#df = pd.read_pickle('data/OPattendance.pkl').drop_duplicates()
-
-# Remove other non-attendance reasons (e.g. provider cancellation)
-#df = df.loc[df['attendanceStatus'].isin(['Attended', 'Did Not Attend'])]
-
-df = generateData()
-```
-
-```python
-# Train test validation split sizes
-train_size = 0.7
-test_size = 0.15
-val_size = 0.15
-
-prepArgs = ({
-    'catCols': [
-        'siteCode', 'priorityCode', 'consultationMediaCode',
-        'specialityCode', 'appointmentWeekday', 'Age'
-    ],
-    'boolCols': [
-        'firstAppointment'
-    ],
-})
-
-# Global seed for reproducibility
-seed = 42
-
-# Output JSON file to store best parameters and report
-bestParamsOut = 'bestParams.json'
-scoreReportOut = 'classifierReport.json'
-
-# Iterations for CatBoost during hyper-tuning
-# Don't set too high (will slow down tuning)
-# This is optimised later using early stopping
-catboostIterations = 100
-
-# Cross-validated hyper-parameter tuning
-cvIterations = 5  # Increase for better tunings
-nFolds = 5        # Number of cross-validation folds
-nJobs = 1         # Cores - set to -1 to use all available
-params = ({
-    'estimator__depth':           randint(4, 10),
-    'estimator__l2_leaf_reg':     randint(2, 10),
-    'estimator__random_strength': uniform.rvs(0, 10, size=100),
-})
-
-# Post-tuning early stopping iterations (with eval set)
-evalIterations = 10000
-earlyStoppingRounds = 10
-```
-
-```python
-for var in ([
-        seed, cvIterations, nFolds, nJobs, 
-        evalIterations, earlyStoppingRounds
-    ]):
-    assert isinstance(var, int)
-
-for out in [bestParamsOut, scoreReportOut]:
-    assert out.endswith('.json')
-    assert Path(out).parent.exists()
-
-assert train_size + test_size + val_size == 1
-
-np.random.seed(seed) # Set global seed
-```
-
-```python
-# Keep only a single entry per patient (most recent)
-if {'appointmentDateTime', 'patientID'}.issubset(df.columns):
-    df = (
-        df.sort_values('appointmentDateTime', ascending=False)
-        .groupby('patientID').head(1)
-    )
-else:
-    msg = 'Recommended removing multiple records of same patient.'
-    print(msg, file=sys.stderr)
-```
-
-def mapTarget(x):
-    names = ({
-        '1': 'DNA', 'y': 'DNA', 't': 'DNA', 
-        'd': 'DNA', '0': 'Attend', 'n': 'Attend', 
-        'f': 'Attend', 'a': 'Attend'
-    })
-    x = str(x).lower().strip()[0]
-    return names[x]
-
-df['DNA'] = df['DNA'].apply(mapTarget)
-assert set(df['DNA']) == set(['DNA', 'Attend'])
-
-```python
-class prepareData(BaseEstimator, TransformerMixin):  
-    
-    def __init__(
-            self, 
-            catCols: list = None, 
-            numericCols: list = None, 
-            boolCols: list = None):
-        self.catCols = [] if catCols is None else catCols
-        self.numericCols = [] if numericCols is None else numericCols
-        self.boolCols = [] if boolCols is None else boolCols
-        self._setCatColIdx()
-    
-    def fit(self, X, y=None):
-        return self
-    
-    def transform(self, X, y=None):
-        for col in self.boolCols:
-            X[col] = self._mapBoolCol(X[col])
-        X[self.catCols] = X[self.catCols].astype(str)
-        return X.loc[:, self.validCols]
-    
-    def _mapBoolCol(self, col):
-        col = col.apply(lambda x: str(x).lower().strip()[0])
-        names = ({
-            '1': 1, 'y': 1, 't': 1,
-            '0': 0, 'n': 0, 'f': 0
-        })
-        col = col.map(names)
-        col.loc[~col.isin([0,1])] = np.nan
-        return col
-    
-    @property
-    def validCols(self):
-        return self.catCols + self.numericCols + self.boolCols 
-    
-    def _setCatColIdx(self):
-        """ Get indices of categoric cols """
-        self.catColIdx = []
-        for col in self.catCols:
-            if col in self.validCols:
-                self.catColIdx.append(
-                    self.validCols.index(col))
-```
-
-```python
-def train_test_validation_split(
-        X, y, train_size=0.8, test_size=0.1, val_size=0.1, seed=None):
-    assert train_size + test_size + val_size == 1
-    rng = np.random.default_rng(seed)
-    
-    X_train, X_test, y_train, y_test = train_test_split(
-        X, y, test_size=(1 - train_size), 
-        random_state=rng.integers(1e9))
-    
-    split2Size = test_size / (test_size + val_size)
-    X_val, X_test, y_val, y_test = train_test_split(
-        X_test, y_test, test_size=split2Size, 
-        random_state=rng.integers(1e9))
-    
-    
-    return (
-        X_train.copy(), X_test.copy(), X_val.copy(), 
-        y_train.copy(), y_test.copy(), y_val.copy()
-    )
-```
-
-```python
-prepData = prepareData(**prepArgs)
-
-X = df.copy()
-y = X.pop('DNA')
-
-split = train_test_validation_split(
-    X, y, train_size=train_size, test_size=test_size, 
-    val_size=val_size, seed=np.random.randint(1e9)
-)
-X_train, X_test, X_val, y_train, y_test, y_val = map(lambda x: x.copy(), split)
-```
-
-```python
-# Set class weights - this balanced prediction probalities for more easy interpretation
-classes = np.unique(y_train)
-weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
-class_weights = dict(zip(classes, weights))
-```
-
-```python
-# Combine all transformations into a ColumnTransformer
-transformers = ([
-    ('categories', SimpleImputer(strategy='constant'), prepData.catCols),
-    ('numeric',    SimpleImputer(strategy='mean'), prepData.numericCols),
-    ('boolean',    SimpleImputer(strategy='most_frequent'), prepData.boolCols),
-])
-featureTransformer = ColumnTransformer(
-    transformers=transformers, remainder='drop')
-```
-
-```python
-# Define a preProcessor Pipeline encompassing the FeatureEngineering and featureTransformation steps
-preProcessor = Pipeline(steps=[
-    ('prepare',         prepareData(**prepArgs)),
-    ('columnTransform', featureTransformer),
-])
-```
-
-```python
-# Combine processor and modelling steps into a Pipeline object
-model = Pipeline(steps=[
-    ('preprocess',     preProcessor),
-    ('estimator',      CatBoostClassifier(
-        cat_features=preProcessor.named_steps['prepare'].catColIdx,
-        eval_metric='Logloss',
-        class_weights=class_weights,
-        iterations=catboostIterations, verbose=0, 
-        random_seed=np.random.randint(1e9))),
-])
-```
-
-```python
-gridSearch = RandomizedSearchCV(
-    model, params, scoring='neg_log_loss', 
-    random_state=np.random.randint(1e9), cv=nFolds, 
-    refit=False, n_jobs=nJobs, n_iter=cvIterations, verbose=2)
-_ = gridSearch.fit(X_train, y_train)
-```
-
-```python
-# Extract best parameters from cross-validated randomised search
-params = gridSearch.best_params_
-params['estimator__iterations'] = evalIterations
-_ = model.set_params(**params)
-```
-
-```python
-# Pre-process the validation set with the tuned model parameters.
-# Required since eval_set is other not processed before CatBoost
-X_val = model.named_steps['preprocess'].fit(X_train, y_train).transform(X_val)
-evalSet = Pool(X_val, y_val, cat_features=prepData.catColIdx)
-```
-
-```python
-_ = model.fit(
-    X_train, y_train, estimator__eval_set=evalSet,
-    estimator__early_stopping_rounds=earlyStoppingRounds)
-```
-
-```python
-# Update iteration parameter to optimal and write to file
-bestIteration = model.named_steps['estimator'].get_best_iteration()
-params['estimator__iterations'] = bestIteration
-
-with open(bestParamsOut, 'w') as fh:
-    json.dump(params, fh)
-```
-
-### Tune Decision Threshold
-
-```python
-def predict(model, X, threshold=0.5):
-    posProb = pd.Series(model.predict_proba(X)[:,1])
-    classes = model.classes_
-    predictions = (
-        posProb.apply(
-            lambda x: classes[0] if x < threshold else classes[1])
-    )
-    return predictions
-```
-
-```python
-y_trainInt = y_train.apply(lambda x: 1 if x == model.classes_[1] else 0)
-y_predPos = model.predict_proba(X_train)[:,1]
-
-fpr, tpr, thresholds = roc_curve(
-    y_trainInt, y_predPos, drop_intermediate=False)
-AUC = roc_auc_score(y_trainInt, y_predPos)
-
-idx = np.argmin(np.abs(fpr + tpr - 1))
-optimalThreshold = thresholds[idx]
-```
-
-```python
-fig, ax = plt.subplots()
-RocCurveDisplay.from_estimator(model, X_train, y_train, ax=ax)
-ax.set_xlim([0, 1])
-ax.set_ylim([0, 1])
-ax.axhline(tpr[idx], xmax=fpr[idx], ls='--', alpha=0.5, c='black')
-ax.axvline(fpr[idx], ymax=tpr[idx], ls='--', alpha=0.5, c='black')
-ax.scatter(fpr[idx], tpr[idx], c='black')
-ax.set_xlabel('False Positive Rate')
-ax.set_ylabel('True Positive Rate')
-label = f'AUC = {AUC:.2f}, Optimal Threshold = {optimalThreshold:.2f}'
-ax.legend(labels=[label], loc='lower right')
-fig.savefig('ROCcurve.pdf')
-```
-
-```python
-importances = pd.Series(
-    model.named_steps['estimator'].feature_importances_,
-    prepareData(**prepArgs).validCols
-).sort_values(ascending=False)
-importances
-```
-
-```python
-df['DNAprob'] = pd.DataFrame(model.predict_proba(df), columns=model.classes_)['DNA'].values
-df['DNApredict'] = predict(model, df, threshold=optimalThreshold)
-```
-
-```python
-testPredictions = predict(model, X_test, threshold=optimalThreshold)
-report = classification_report(y_test, testPredictions, output_dict=True)
-with open(scoreReportOut, 'w') as fh:
-    json.dump(report, fh)
-```
-
-```python
-pprint(report)
-```
-
-```python
-explainer = shap.Explainer(model.named_steps['estimator'], feature_names=prepData.validCols)
-```
-
-```python
-preTransformer = model.named_steps['preprocess'].fit(X_train, y_train)
-X_transformed = preTransformer.transform(X.sample(1).copy())
-```
-
-```python
-shap_values = explainer(X_transformed)
-```
-
-```python
-# waterfall plot for first observation
-shap.plots.waterfall(shap_values[0])
-```
-
-```python
-df.groupby(['firstAppointment'])['DNAprob'].agg(['mean', 'size'])
-```
-
-```python
-
-```
diff --git a/development/1.simulateData.md b/development/1.simulateData.md
deleted file mode 100644
index ab7f4c7..0000000
--- a/development/1.simulateData.md
+++ /dev/null
@@ -1,74 +0,0 @@
----
-jupyter:
-  jupytext:
-    formats: ipynb,md
-    text_representation:
-      extension: .md
-      format_name: markdown
-      format_version: '1.3'
-      jupytext_version: 1.14.1
-  kernelspec:
-    display_name: Python 3 (ipykernel)
-    language: python
-    name: python3
----
-
-```python
-import numpy as np
-import pandas as pd
-```
-
-```python
-def DNAprob(x):
-    """ Simulate DNA probability with artificial values """
-    modifiers = ({
-        'weekday': 0.05,
-        'priority': 0.1,
-        'age': 0.1,
-        'firstAppointment': 0.2,
-        'consultationMedia': 0.3,
-        'site': 0.4
-    })
-    maxModifier = np.array(list(modifiers.values())).sum()
-    minModifier = -maxModifier
-    if x['weekday'] in ['Saturday', 'Sunday']:
-        modifiers['weekday'] *= -1
-    if x['priority'] == 'Two Week Wait':
-        modifiers['priority'] *= -1
-    if x['ageGroup'] == 'Over 65':
-        modifiers['age'] *= -1
-    if not x['firstAppointment']:
-        modifiers['firstAppointment'] *= -1
-    if x['consultationMedia'] == 'In-Person':
-        modifiers['consultationMedia'] *= -1
-    if x['site'] == 'Lakeside':
-        modifiers['site'] *= -1
-    # Get probability score as sum of modifiers
-    p = np.array(list(modifiers.values())).sum()
-    # Normalise p to [0, 1]
-    p = ((p - minModifier) / (maxModifier - minModifier))
-    DNA = np.random.choice([True, False], p=[p, 1-p])
-    return pd.Series([p, DNA])
-```
-
-```python
-seed = 42
-samples = 50_000
-np.random.seed(seed)
-
-daysOfWeek = ([
-    'Wednesday', 'Tuesday', 'Monday', 'Sunday', 
-    'Saturday', 'Friday', 'Thursday'
-])
-
-data = pd.DataFrame({
-    'weekday': np.random.choice(daysOfWeek, samples),
-    'priority': np.random.choice(['Urgent', 'Two Week Wait'], samples),
-    'ageGroup': np.random.choice(['Over 65', 'Under 65'], samples),
-    'speciality': np.random.choice(['Ophthalmology', 'Audiology'], samples),
-    'firstAppointment': np.random.choice([True, False], samples),
-    'consultationMedia': np.random.choice(['Telephone', 'In-Person'], samples),
-    'site': np.random.choice(['Fairview', 'Lakeside'], samples)
-})
-data[['DNAprob', 'DNA']] = data.apply(DNAprob, axis=1)
-```
diff --git a/development/testDNAttend.md b/development/testDNAttend.md
deleted file mode 100644
index 364f861..0000000
--- a/development/testDNAttend.md
+++ /dev/null
@@ -1,88 +0,0 @@
----
-jupyter:
-  jupytext:
-    formats: ipynb,md
-    text_representation:
-      extension: .md
-      format_name: markdown
-      format_version: '1.3'
-      jupytext_version: 1.14.1
-  kernelspec:
-    display_name: Python 3 (ipykernel)
-    language: python
-    name: python3
----
-
-```python
-%matplotlib inline
-from sklearn.base import clone
-from scipy.stats import randint, uniform
-from dnattend.data import generateData
-from dnattend.train import trainModel, splitData, refitAllData
-from dnattend.test import getFeatureImportance, plotROC, predict, evaluate
-```
-
-```python
-df = generateData(size=50_000, seed=42)
-```
-
-```python
-data = splitData(df, target='status', train_size=0.7, test_size=0.15, val_size=0.15)
-```
-
-```python
-catCols = ['day', 'priority', 'speciality', 'consultationMedia', 'site']
-boolCols = ['firstAppointment']
-numericCols = ['age']
-
-trainingParams = ({
-    'catCols':             catCols,
-    'boolCols':            boolCols,
-    'numericCols':         numericCols,
-    'cvFolds':             5,
-    'catboostIterations':  100,
-    'hypertuneIterations': 5,
-    'evalIterations':      10_000,
-    'earlyStoppingRounds': 10,
-    'seed':                42 
-})
-
-hyperParams = ({
-    'estimator__depth':           randint(4, 10),
-    'estimator__l2_leaf_reg':     randint(2, 10),
-    'estimator__random_strength': uniform.rvs(0, 10, size=100),
-})
-
-model, params = trainModel(data, hyperParams=hyperParams, **trainingParams)
-```
-
-```python
-oldParams = model.get_params()
-```
-
-```python
-featureImportances = getFeatureImportance(model)
-fig = featureImportances.plot.barh()
-fig.figure.savefig('../README_files/featureImportances.svg', dpi=300)
-```
-
-```python
-fig, ax = plotROC(model, data)
-fig.figure.savefig('../README_files/ROCcurve.svg', dpi=300)
-```
-
-```python
-report = evaluate(model, data)
-```
-
-```python
-model = refitAllData(model, params, data)
-```
-
-```python
-df[['Attend', 'DNA', 'class']] = predict(model, df)
-```
-
-```python
-
-```