# standard library
import warnings
from collections import Counter
from datetime import datetime, timedelta
from time import perf_counter

# third-party
import matplotlib.pyplot as plt
import nni
import numpy as np
import pandas as pd
import pandas_market_calendars as mcal
import pandas_ta as pd_ta
import pywt
import seaborn as sns
import yfinance as yf
from hep_ml import reweight
from hep_ml.metrics_utils import ks_2samp_weighted
from pandas import DataFrame
from scipy.stats import ttest_ind
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score, accuracy_score, roc_auc_score, f1_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import quantile_transform
from tqdm import tqdm
from xgboost import XGBClassifier

# local
from fetch_alpaca_data import fetch_alpaca_csv
warnings.filterwarnings('ignore')
# seed numpy's global RNG for reproducibility
np.random.seed(42)
# performance csv headers
header = ['test_acc', 'live_acc']
params: dict = {
'years': 3,
'wavelet': 'bior1.1',
'window': 70
}
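# NNI supplies each trial's tuned hyperparameters; in NNI's standalone mode
# get_next_parameter() returns an empty dict, so the defaults above are kept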
optimized_params = nni.get_next_parameter()
params.update(optimized_params)
# create script parameters (dynamic)
years = params['years']
trend = params['window']
wavelet = params['wavelet']
shuffle_split: bool = True
# previous session's closing price: the last row of the daily history can be the
# current (partial) session, so take the second-to-last close
ticker = yf.Ticker('SPY')
yesterday_price = ticker.history(interval='1d')['Close'].iloc[-2]
# mean absolute deviation estimator for the wavelet noise threshold
def madev(d, axis=None):
""" Mean absolute deviation of a signal """
return np.mean(np.absolute(d - np.mean(d, axis)), axis)
# de-noise original signal with selected wavelet function
def wavelet_denoising(x, wavelet='db4', level=1):
    """hard-threshold the detail coefficients using the universal (VisuShrink) threshold"""
    coeff = pywt.wavedec(x, wavelet, mode='per')
    # noise scale estimated from the finest detail coefficients (MAD / 0.6745)
    sigma = (1 / 0.6745) * madev(coeff[-level])
    uthresh = sigma * np.sqrt(2 * np.log(len(x)))
    coeff[1:] = (pywt.threshold(i, value=uthresh, mode='hard') for i in coeff[1:])
    # waverec can return one extra sample for odd-length inputs, so trim to len(x)
    return pywt.waverec(coeff, wavelet, mode='per')[:len(x)]
# smooth the initial model features to expose the longer-horizon trend pattern
def _wavelet_smooth(data, wavelet):
"""wavelet transformation applied to curve"""
processed_arr = []
for col in ['open', 'high', 'low', 'close', 'volume']:
processed_arr.append(wavelet_denoising(data[col], wavelet=wavelet, level=1))
return processed_arr
def _get_indicator_data(data_raw):
    """compute the six technical-indicator features and drop the raw price columns"""
    data_raw['rsi'] = pd_ta.rsi(data_raw['close'])
    data_raw['macd'] = pd_ta.macd(data_raw['close']).iloc[:, -1]
    data_raw['willr'] = pd_ta.willr(data_raw['high'], data_raw['low'], data_raw['close'])
    data_raw['obv'] = pd_ta.obv(data_raw['close'], data_raw['volume'])
    data_raw['proc'] = pd_ta.roc(data_raw['close'])
    data_raw['stoch_k'] = pd_ta.stoch(data_raw['high'], data_raw['low'], data_raw['close']).iloc[:, 0]
    # remove columns that won't be used as features ('close' is kept for target creation)
    data_raw.drop(columns=['open', 'high', 'low', 'volume'], inplace=True)
    data_raw.drop(columns=['Adj Close'], errors='ignore', inplace=True)
    return data_raw
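# Targets label whether the close `window` half-hour bars ahead, aligned to a 16:00
# session close, is higher (1) or lower (0); rows whose prediction span crosses data
# holes or calendar gaps, or whose price is unchanged, are tagged 'GAP' and dropped.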
def _produce_prediction_gaps(data, transformed_data, window, from_hardcoded=False, year=None) -> DataFrame:
    """create target values, dropping rows whose prediction window spans data gaps"""
    # optionally reuse targets precomputed for this (years, window) configuration
    try:
        if from_hardcoded:
            return pd.read_csv(f'targets/{year}y_{window}w.csv')
    except FileNotFoundError:
        pass
    # create containers and calendars
    nyse = mcal.get_calendar('NYSE')
    calendar = nyse.schedule(start_date=data['day'].iloc[0], end_date=data['day'].iloc[-1])
    # the most recent window // 14 days cannot have realized targets yet
    pred_days = data['day'].unique()[-(window // 14):]
    dropped_idx: list = []
    missing_days = []
    missing_counter = 0
    # a full session holds 14 half-hour bars; collect days with missing bars
    value_counts = data['day'].value_counts()
    for key, value in value_counts.items():
        if value != 14:
            missing_days.append(key)
            missing_counter += 1
            print(f'dropping incomplete day {key}: {value} bars')
    # drop incomplete days and reset to positional indices so the loop's positions
    # line up with the index labels used for row removal below
    data = data.loc[~data['day'].isin(missing_days)].reset_index(drop=True)
    transformed_data = transformed_data.loc[~transformed_data['day'].isin(missing_days)].reset_index(drop=True)
    # half-hour bars remaining until the 16:00 close at each intraday timestamp
    bars_to_close: dict = {
        '09:30:00': 13, '10:00:00': 12, '10:30:00': 11, '11:00:00': 10, '11:30:00': 9, '12:00:00': 8,
        '12:30:00': 7, '13:00:00': 6, '13:30:00': 5, '14:00:00': 4, '14:30:00': 3, '15:00:00': 2,
        '15:30:00': 1, '16:00:00': 0
    }
# create containers
times, closes, days = data['time'], data['close'], data['day']
targets = []
end_dates = []
compare_closes = []
    for start_idx in tqdm(range(data.shape[0])):
        # get prices to compare for the target variable
        current_close = closes.iloc[start_idx]
        if days.iloc[start_idx] in pred_days:
            end_dates.append(np.nan)
            compare_closes.append(np.nan)
            targets.append(np.nan)
            continue
        # index of the 16:00 close window // 14 sessions ahead
        offset = start_idx + window + bars_to_close[times.iloc[start_idx]]
        try:
            compare_close = closes.iloc[offset]
            end_dates.append(days.iloc[offset])
            compare_closes.append(compare_close)
        except IndexError:
            # only occurs within the trailing `window` span of the dataframe
            compare_close = np.nan
            end_dates.append(np.nan)
            compare_closes.append(np.nan)
        if not np.isnan(compare_close):
            # market days actually spanned, per the NYSE calendar
            diff = calendar.loc[days.iloc[start_idx]:days.iloc[offset]].shape[0] - 1
            # flag rows that span calendar gaps, miss the 16:00 close, or show no price change
            if diff != window // 14 or times.iloc[offset] != '16:00:00' or compare_close == current_close:
                dropped_idx.append(start_idx)
                targets.append('GAP')
            else:
                targets.append(1 if current_close < compare_close else 0)
        else:
            targets.append('GAP')
    print('rows:', data.shape[0], 'targets:', len(end_dates))
    data['end_timestamps'] = end_dates
    data['compare_closes'] = compare_closes
    data['pred'] = targets
    # remove the rows recorded in dropped_idx from both dataframes
    data = data.loc[~data.index.isin(dropped_idx)]
    transformed_data = transformed_data.loc[~transformed_data.index.isin(dropped_idx)]
    # reset indices
    data.reset_index(inplace=True, drop=True)
    transformed_data.reset_index(inplace=True, drop=True)
    # drop the remaining 'GAP' rows (unresolvable or unchanged prices)
    same_price_idx = data[data['pred'] == 'GAP'].index.values
    data = data.loc[~data.index.isin(same_price_idx)]
    transformed_data = transformed_data.loc[~transformed_data.index.isin(same_price_idx)]
    # insert the original target variables into the transformed dataframe
    transformed_data['pred'] = data['pred']
    transformed_data['end_timestamps'] = data['end_timestamps']
    transformed_data['compare_closes'] = data['compare_closes']
return transformed_data
def _produce_prediction(data, window):
    """create target values: 1 if the close `window` bars ahead (at 16:00) is higher, else 0"""
    # half-hour bars remaining until the 16:00 close at each intraday timestamp
    bars_to_close: dict = {
        '09:30:00': 13, '10:00:00': 12, '10:30:00': 11, '11:00:00': 10, '11:30:00': 9, '12:00:00': 8,
        '12:30:00': 7, '13:00:00': 6, '13:30:00': 5, '14:00:00': 4, '14:30:00': 3, '15:00:00': 2,
        '15:30:00': 1, '16:00:00': 0
    }
    # create containers
    times, closes = data['time'], data['close']
    targets: list = []
    for start_idx in range(data.shape[0]):
        # get prices to compare for the target variable
        current_close = closes.iloc[start_idx]
        try:
            compare_close = closes.iloc[start_idx + window + bars_to_close[times.iloc[start_idx]]]
        except IndexError:
            compare_close = np.nan
        # label the direction of the move; NaN where no future close exists yet
        if np.isnan(compare_close):
            targets.append(np.nan)
        else:
            targets.append(1 if current_close < compare_close else 0)
    data['pred'] = np.array(targets)
    return data['pred']
def test_boundaries_whole(df, ranges, training_data_boundaries):
    """flag whether every feature stays inside its training-data range"""
    boundary_flag: bool = True
    for var, bounds in zip(['rsi', 'macd', 'willr', 'obv', 'proc', 'stoch_k'], ranges):
        if training_data_boundaries(bounds[0], bounds[1], df[var]):
            boundary_flag = False
    return boundary_flag
def live_accuracy(trends, strikes, comp_price):
    """check the live trades' accuracy against the realized comparison price"""
    # create output container
    container: list = []
    for i, (trend, strike) in enumerate(zip(trends, strikes)):
        # only the first trading day (14 half-hour bars) has resolved by now
        if i <= 13:
            if trend == 1:
                container.append(strike < comp_price)
            else:
                container.append(strike > comp_price)
        else:
            container.append('-')
    return container
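# feature columns shared by training, reweighting, and live prediction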
features = ['rsi', 'macd', 'willr', 'obv', 'proc', 'stoch_k']
def draw_distributions(original, target, new_original_weights, plot=False):
    """plot weighted original vs. target feature densities; return the mean weighted KS distance"""
    ks_vals: list = []
    if plot:
        plt.figure(figsize=[15, 7])
    for id, column in enumerate(features, 1):
        if plot:
            plt.subplot(2, 3, id)
            sns.kdeplot(data=original, x=column, color='red', weights=new_original_weights)
            sns.kdeplot(data=target, x=column, color='blue')
            plt.title(column)
        ks_vals.append(ks_2samp_weighted(original[column], target[column],
                                         weights1=new_original_weights,
                                         weights2=np.ones(len(target), dtype=float)))
    if plot:
        plt.show()
    return sum(ks_vals) / len(ks_vals)
# daily SPY closes over the backtest range, used to resolve future prices
ticker = yf.Ticker('SPY')
yesterday_price_backtest = ticker.history(interval='1d', start="2020-01-01", end="2022-01-21")['Close']
nyse = mcal.get_calendar('NYSE')
def evaluate_preds(strike, close, trend):
    """check whether a predicted trend matched the realized price movement"""
    return (strike < close and trend == 1) or (strike > close and trend == 0)
def get_n_market_days_ahead(n, start_date):
    """return the close price n market days after start_date"""
    start_date = datetime.strptime(str(start_date), "%Y-%m-%d")
    # a 15-calendar-day buffer comfortably covers the market days needed here
    end_date = start_date + timedelta(days=15)
    early = nyse.schedule(start_date=start_date, end_date=end_date)
    actual_end = str(early.iloc[:n + 1].iloc[-1]['market_open']).split(' ')[0]
    return yesterday_price_backtest[yesterday_price_backtest.index == str(actual_end)].iloc[0]
def backtest_func(live_data, model, threshold):
    """backtest: resolve each live prediction against the realized future close"""
    live_data['trend'] = model.predict(live_data[features])
    live_data['day'] = live_data['start_date'].apply(lambda x: str(x).split(' ')[0])
    live_data['close_price'] = live_data['day'].apply(lambda x: get_n_market_days_ahead(params['window'] // 14, str(x)))
    live_data['actual'] = live_data.apply(lambda x: evaluate_preds(x.strikes, x.close_price, x.trend), axis=1).astype(int)
    live_data.drop('day', axis=1, inplace=True)
    # hit/miss counts among predictions above the confidence threshold
    print(live_data[live_data['prob'] > threshold]['actual'].value_counts())
# toggle backtesting mode on/off
backtest: bool = True
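# end-to-end pipeline: fetch intraday bars, wavelet-smooth, build indicators, label
# targets, split chronologically, optionally reweight, train XGBoost, and score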
def create_model(years=params['years'], trend=params['window'], wavelet=params['wavelet'], facade=False):
    # get data from backend
    data = fetch_alpaca_csv(years)
    data.rename(columns={col: col.lower() for col in data.columns}, inplace=True)
    copy_data = data.copy()
    print(data.shape)
    # replace the raw OHLCV columns with their wavelet-smoothed versions
    data['open'], data['high'], data['low'], data['close'], data['volume'] = _wavelet_smooth(data, wavelet)
    # create indicator features on both the smoothed and the raw dataframe
    data = _get_indicator_data(data)
    copy_data = _get_indicator_data(copy_data)
    # drop the CSV's leftover index column
    data.drop('unnamed: 0', axis=1, inplace=True)
    copy_data.drop('unnamed: 0', axis=1, inplace=True)
    # both frames must lose the same indicator warm-up rows, or they are misaligned
    if data.isna().sum().sum() != copy_data.isna().sum().sum():
        print('NaN Error')
        return
    # remove nan values and reset indices on both dataframes
    data.dropna(inplace=True)
    copy_data.dropna(inplace=True)
    data.reset_index(drop=True, inplace=True)
    copy_data.reset_index(drop=True, inplace=True)
    # targets come from the raw (unsmoothed) prices
    start = perf_counter()
    data['pred'] = _produce_prediction(copy_data, window=trend)
    print("Target Variable Creation Runtime", perf_counter() - start)
    # hold out the most recent `trend` rows as live samples from the training distribution
    live_data = data[-trend:].drop('pred', axis=1)[['rsi', 'macd', 'willr', 'obv', 'proc', 'stoch_k']]
    strike_prices = copy_data[-trend:]['close']
    live_dates = data[-trend:]['timestamp']
    # resolve formatting and nan issues
    del data['close']
    data.dropna(inplace=True)
    print('data shape after dropna:', data.shape)
    # data = data.sample(frac=1).reset_index(drop=True)
    features = ['rsi', 'macd', 'willr', 'obv', 'proc', 'stoch_k']
    target = 'pred'
    # chronological 80/20 train-test split (the commented random split used test_ratio)
    test_ratio = 0.30
    # X_train, X_test, y_train, y_test = train_test_split(data[features], data[target],
    #                                                     test_size=test_ratio, random_state=42)
    split_idx = int(0.8 * data.shape[0])
    X_train, X_test = data[features][:split_idx], data[features][split_idx:]
    y_train, y_test = data[target][:split_idx], data[target][split_idx:]
    # optionally map features onto a common (here uniform) distribution to
    # standardize the training, testing, and live samples
    live_data = live_data[features]
    transform: bool = False
    if transform:
        output_dist = 'uniform'
        # sklearn clamps n_quantiles to the sample size when it is larger
        train_trans = quantile_transform(X_train, output_distribution=output_dist, n_quantiles=len(X_train))
        test_trans = quantile_transform(X_test, output_distribution=output_dist, n_quantiles=len(X_train))
        live_trans = quantile_transform(live_data, output_distribution=output_dist, n_quantiles=len(X_train))
        for i, col in enumerate(X_train.columns):
            X_train[col] = train_trans[:, i]
            X_test[col] = test_trans[:, i]
            live_data[col] = live_trans[:, i]
    reweight_check = False
    opt_weights = np.ones(len(X_train))
    if reweight_check and not transform:
        # grid-search GBReweighter hyperparameters for the smallest mean KS distance
        # between the reweighted training features and the live features
        opt_weights, distance = None, np.inf
        for estimator in tqdm([150, 200, 250]):
            for lr in [0.05, 0.1]:
                for depth in [3, 4]:
                    for leaf in [1000, 2000, 3000]:
                        reweighter = reweight.GBReweighter(n_estimators=estimator, learning_rate=lr,
                                                           max_depth=depth, min_samples_leaf=leaf)
                        reweighter.fit(X_train, live_data[features])
                        gb_weights_test = reweighter.predict_weights(X_train)
                        dist_distance = draw_distributions(X_train, live_data, gb_weights_test)
                        if dist_distance < distance:
                            print(dist_distance, distance)
                            opt_weights = gb_weights_test
                            distance = dist_distance
    print(draw_distributions(X_train, live_data, opt_weights, plot=False))  # disable plotting when tuning
    # weight the positive class by the negative/positive ratio, per the xgboost convention
    ratio = Counter(y_train)
    scale_pos_weight = ratio[0] / ratio[1]
    model = XGBClassifier(scale_pos_weight=scale_pos_weight).fit(X_train, y_train, sample_weight=opt_weights)
    # model = RandomForestClassifier(class_weight='balanced').fit(X_train, y_train)
    # per-feature t-test between training and live samples as a drift check
    stat, p_value = ttest_ind(X_train, live_data)
    print("T-test P-value: ", p_value)
# determine accuracy and append to results
rf_prediction = model.predict(X_test)
print("Raw Accuracy: ", accuracy_score(list(y_test), list(rf_prediction)))
print("ROC AUC: ", roc_auc_score(y_test, rf_prediction))
print('F1 Score: ', f1_score(y_test, rf_prediction, average='macro'))
rf_accuracy = balanced_accuracy_score(list(y_test), list(rf_prediction))
print('Test Accuracy: ', rf_accuracy)
    print('params (years, window, wavelet):', years, trend, wavelet)
    # make live predictions
    live_predictions = model.predict(live_data)
    live_prob_predictions = model.predict_proba(live_data)
    live_data['pred'] = live_predictions
    live_data['strikes'] = strike_prices
    # confidence of the predicted class
    live_data['prob'] = np.max(live_prob_predictions, axis=1)
    live_data['date'] = live_dates
    # holding window expressed in market days
    live_data['window'] = trend // 14
    live_data['day'] = live_data['date'].apply(lambda x: str(x).split(' ')[0])
    # keep only the three most recent trading days of live samples
    live_data = live_data[live_data['day'].isin(live_data['day'].unique()[-3:])].drop('day', axis=1)
    live_data.rename(columns={'pred': 'trend', 'date': 'start_date'}, inplace=True)
    # measure accuracy when only acting on predictions above a confidence threshold
    correct, incorrect, skipped = 0, 0, 0
    threshold = 0.9
    try:
        for pred, prob, actual in zip(rf_prediction, model.predict_proba(X_test), y_test):
            if prob[int(pred)] >= threshold:
                if pred == actual:
                    correct += 1
                else:
                    incorrect += 1
            else:
                skipped += 1
        print(f"Acc: {correct / (correct + incorrect)}", f'Skip Ratio: {skipped / len(y_test)}')
    except ZeroDivisionError:
        # every prediction fell below the threshold
        pass
# create a backtest
if backtest:
backtest_func(live_data, model, 0.9)
if not facade:
live_data.to_csv('live_data_backtest.csv')
print(live_data[['window', 'trend', 'start_date', 'prob', 'strikes']][live_data['prob'] > 0.9])
    # report the final metric to the NNI console
    nni.report_final_result(rf_accuracy)
    # nni.report_intermediate_result(cross_val_acc)
if not facade:
return model, rf_accuracy, live_data[['trend', 'prob', 'strikes', 'start_date']], live_data[features], \
X_train, X_test, y_train, y_test, data
else:
return model, accuracy_score(list(y_test), list(rf_prediction)), \
live_data[['window', 'trend', 'start_date', 'prob', 'strikes']][live_data['prob'] > 0.9]
create_model()
#create_model(years=3, trend=42, wavelet='coif2')
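# hypothetical facade-mode call: returns the model, its raw test accuracy, and only
# the high-confidence (prob > 0.9) live rows, without writing live_data_backtest.csv:
#     model, raw_acc, signals = create_model(facade=True)
#     print(signals)  # columns: window, trend, start_date, prob, strikes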