Automatically detect the maximum amount of data required by the user-supplied indicators (to avoid NaNs in the dataset for rolling indicators). Add a new backtesting config parameter that lets users increase their startup_candles to accommodate high-timeframe indicators, and add docs explaining all of this. Add a new feature that automatically duplicates indicators according to user-defined intervals (now exhibited in the example strategy and configs).

This commit is contained in:
robcaulk 2022-05-31 18:42:27 +02:00
parent 9b3b08a2bb
commit 7523ed825e
7 changed files with 141 additions and 71 deletions

View File

@ -59,6 +59,7 @@
} }
], ],
"freqai": { "freqai": {
"startup_candles": 10000,
"timeframes": [ "timeframes": [
"3m", "3m",
"15m", "15m",
@ -79,7 +80,9 @@
"weight_factor": 0.9, "weight_factor": 0.9,
"principal_component_analysis": false, "principal_component_analysis": false,
"use_SVM_to_remove_outliers": true, "use_SVM_to_remove_outliers": true,
"stratify": 0 "stratify": 0,
"indicator_max_period": 20,
"indicator_interval": 10
}, },
"data_split_parameters": { "data_split_parameters": {
"test_size": 0.33, "test_size": 0.33,

View File

@ -7,7 +7,7 @@
"dry_run": true, "dry_run": true,
"timeframe": "5m", "timeframe": "5m",
"dry_run_wallet": 4000, "dry_run_wallet": 4000,
"dataformat_ohlcv": "hdf5", "dataformat_ohlcv": "json",
"cancel_open_orders_on_exit": true, "cancel_open_orders_on_exit": true,
"unfilledtimeout": { "unfilledtimeout": {
"entry": 10, "entry": 10,
@ -51,6 +51,7 @@
} }
], ],
"freqai": { "freqai": {
"startup_candles": 10000,
"timeframes": [ "timeframes": [
"5m", "5m",
"15m", "15m",
@ -74,7 +75,9 @@
"weight_factor": 0.9, "weight_factor": 0.9,
"principal_component_analysis": false, "principal_component_analysis": false,
"use_SVM_to_remove_outliers": false, "use_SVM_to_remove_outliers": false,
"stratify": 0 "stratify": 0,
"indicator_max_period": 50,
"indicator_interval": 10
}, },
"data_split_parameters": { "data_split_parameters": {
"test_size": 0.33, "test_size": 0.33,

View File

@ -76,6 +76,7 @@ config setup includes:
```json ```json
"freqai": { "freqai": {
"startup_candles": 10000,
"timeframes" : ["5m","15m","4h"], "timeframes" : ["5m","15m","4h"],
"train_period" : 30, "train_period" : 30,
"backtest_period" : 7, "backtest_period" : 7,
@ -105,6 +106,7 @@ config setup includes:
### Building the feature set ### Building the feature set
!! The example below is slightly out of date; please refer to templates/FreqaiExampleStrategy.py for the updated method !!
Features are added by the user inside the `populate_any_indicators()` method of the strategy Features are added by the user inside the `populate_any_indicators()` method of the strategy
by prepending indicators with `%`: by prepending indicators with `%`:
@ -194,7 +196,19 @@ Freqai will train 8 separate models (because the full range comprises 8 weeks),
and then backtest the subsequent week associated with each of the 8 training and then backtest the subsequent week associated with each of the 8 training
data set timerange months. Users can think of this as a "sliding window" which data set timerange months. Users can think of this as a "sliding window" which
emulates Freqai retraining itself once per week in live using the previous emulates Freqai retraining itself once per week in live using the previous
month of data. month of data._
In live, the required training data is automatically computed and downloaded. However, in backtesting
the user must manually enter the required number of `startup_candles` in the config. This value
is used to increase the amount of data available to FreqAI and should be sufficient for all indicators
to be NaN-free at the beginning of the first training timerange. This boils down to identifying the
highest timeframe (`4h` in the present example) and the longest indicator period (25 in the present example),
multiplying the two, and adding the result to the `train_period`. The units need to be in the base candle timeframe:
`startup_candles` = ( 4 hours * 25 max period * 60 minutes/hour + 30 day train_period * 1440 minutes per day ) / 5 min (base time frame) = 9840.
!!! Note: in dry/live, this is all precomputed and handled automatically. Thus, `startup_candles` has no
influence on dry/live.
## Running Freqai ## Running Freqai

View File

@ -60,11 +60,6 @@ class FreqaiDataKitchen:
self.pair = pair self.pair = pair
self.svm_model: linear_model.SGDOneClassSVM = None self.svm_model: linear_model.SGDOneClassSVM = None
if not self.live: if not self.live:
# if config.get('freqai', {}).get('backtest_period') < 1:
# raise OperationalException('backtest_period < 1,'
# 'Can only backtest on full day increments'
# 'backtest_period. Only live/dry mode'
# 'allows fractions of days')
self.full_timerange = self.create_fulltimerange(self.config["timerange"], self.full_timerange = self.create_fulltimerange(self.config["timerange"],
self.freqai_config.get("train_period") self.freqai_config.get("train_period")
) )
@ -291,10 +286,16 @@ class FreqaiDataKitchen:
labels = labels[ labels = labels[
(drop_index == 0) & (drop_index_labels == 0) (drop_index == 0) & (drop_index_labels == 0)
] # assuming the labels depend entirely on the dataframe here. ] # assuming the labels depend entirely on the dataframe here.
# logger.info( logger.info(
# "dropped %s training points due to NaNs, ensure all historical data downloaded", f'dropped {len(unfiltered_dataframe) - len(filtered_dataframe)} training points'
# len(unfiltered_dataframe) - len(filtered_dataframe), f' due to NaNs in populated dataset {len(unfiltered_dataframe)}.'
# ) )
if (1 - len(filtered_dataframe) / len(unfiltered_dataframe)) > 0.1 and self.live:
logger.warning(
f' {(1 - len(filtered_dataframe)/len(unfiltered_dataframe)) * 100} percent'
' of training data dropped due to NaNs, model may perform inconsistent'
'with expectations'
)
self.data["filter_drop_index_training"] = drop_index self.data["filter_drop_index_training"] = drop_index
else: else:
@ -685,10 +686,31 @@ class FreqaiDataKitchen:
return full_timerange return full_timerange
def check_if_new_training_required(self, trained_timestamp: int) -> Tuple[bool, TimeRange]: def check_if_new_training_required(self, trained_timestamp: int) -> Tuple[bool,
TimeRange, TimeRange]:
time = datetime.datetime.now(tz=datetime.timezone.utc).timestamp() time = datetime.datetime.now(tz=datetime.timezone.utc).timestamp()
trained_timerange = TimeRange() trained_timerange = TimeRange()
data_load_timerange = TimeRange()
# find the max indicator length required
max_timeframe_chars = self.freqai_config.get('timeframes')[-1]
max_period = self.freqai_config.get('feature_parameters', {}).get(
'indicator_max_period', 20)
additional_seconds = 0
if max_timeframe_chars[-1] == 'd':
additional_seconds = max_period * SECONDS_IN_DAY * int(max_timeframe_chars[-2])
elif max_timeframe_chars[-1] == 'h':
additional_seconds = max_period * 3600 * int(max_timeframe_chars[-2])
elif max_timeframe_chars[-1] == 'm':
if len(max_timeframe_chars) == 2:
additional_seconds = max_period * 60 * int(max_timeframe_chars[-2])
elif len(max_timeframe_chars) == 3:
additional_seconds = max_period * 60 * int(float(max_timeframe_chars[0:2]))
else:
logger.warning('FreqAI could not detect max timeframe and therefore may not '
'download the proper amount of data for training')
if trained_timestamp != 0: if trained_timestamp != 0:
elapsed_time = (time - trained_timestamp) / SECONDS_IN_DAY elapsed_time = (time - trained_timestamp) / SECONDS_IN_DAY
retrain = elapsed_time > self.freqai_config.get('backtest_period') retrain = elapsed_time > self.freqai_config.get('backtest_period')
@ -696,10 +718,22 @@ class FreqaiDataKitchen:
trained_timerange.startts = int(time - self.freqai_config.get( trained_timerange.startts = int(time - self.freqai_config.get(
'train_period', 0) * SECONDS_IN_DAY) 'train_period', 0) * SECONDS_IN_DAY)
trained_timerange.stopts = int(time) trained_timerange.stopts = int(time)
# we want to load/populate indicators on more data than we plan to train on
# because most of the indicators have a rolling timeperiod, and are thus NaNs
# unless they have data further back in time before the start of the train period
data_load_timerange.startts = int(time - self.freqai_config.get(
'train_period', 0) * SECONDS_IN_DAY
- additional_seconds)
data_load_timerange.stopts = int(time)
else: # user passed no live_trained_timerange in config else: # user passed no live_trained_timerange in config
trained_timerange.startts = int(time - self.freqai_config.get('train_period') * trained_timerange.startts = int(time - self.freqai_config.get('train_period') *
SECONDS_IN_DAY) SECONDS_IN_DAY)
trained_timerange.stopts = int(time) trained_timerange.stopts = int(time)
data_load_timerange.startts = int(time - self.freqai_config.get(
'train_period', 0) * SECONDS_IN_DAY
- additional_seconds)
data_load_timerange.stopts = int(time)
retrain = True retrain = True
# if retrain: # if retrain:
@ -714,7 +748,7 @@ class FreqaiDataKitchen:
# # enables persistence, but not fully implemented into save/load data yer # # enables persistence, but not fully implemented into save/load data yer
# self.data['live_trained_timerange'] = str(int(trained_timerange.stopts)) # self.data['live_trained_timerange'] = str(int(trained_timerange.stopts))
return retrain, trained_timerange return retrain, trained_timerange, data_load_timerange
def set_new_model_names(self, metadata: dict, trained_timerange: TimeRange): def set_new_model_names(self, metadata: dict, trained_timerange: TimeRange):

View File

@ -1,4 +1,5 @@
# import contextlib # import contextlib
import datetime
import gc import gc
import logging import logging
# import sys # import sys
@ -149,8 +150,15 @@ class IFreqaiModel(ABC):
# self.training_timerange_timerange = tr_train # self.training_timerange_timerange = tr_train
dataframe_train = dh.slice_dataframe(tr_train, dataframe) dataframe_train = dh.slice_dataframe(tr_train, dataframe)
dataframe_backtest = dh.slice_dataframe(tr_backtest, dataframe) dataframe_backtest = dh.slice_dataframe(tr_backtest, dataframe)
logger.info("training %s for %s", metadata["pair"], tr_train)
trained_timestamp = tr_train # TimeRange.parse_timerange(tr_train) trained_timestamp = tr_train # TimeRange.parse_timerange(tr_train)
tr_train_startts_str = datetime.datetime.utcfromtimestamp(
tr_train.startts).strftime('%Y-%m-%d %H:%M:%S')
tr_train_stopts_str = datetime.datetime.utcfromtimestamp(
tr_train.stopts).strftime('%Y-%m-%d %H:%M:%S')
logger.info("Training %s", metadata["pair"])
logger.info(f'Training {tr_train_startts_str} to {tr_train_stopts_str}')
dh.data_path = Path(dh.full_path / dh.data_path = Path(dh.full_path /
str("sub-train" + "-" + metadata['pair'].split("/")[0] + str("sub-train" + "-" + metadata['pair'].split("/")[0] +
str(int(trained_timestamp.stopts)))) str(int(trained_timestamp.stopts))))
@ -218,16 +226,19 @@ class IFreqaiModel(ABC):
model_filename=model_filename) model_filename=model_filename)
(self.retrain, (self.retrain,
new_trained_timerange) = dh.check_if_new_training_required(trained_timestamp) new_trained_timerange,
data_load_timerange) = dh.check_if_new_training_required(trained_timestamp)
dh.set_paths(metadata, new_trained_timerange.stopts) dh.set_paths(metadata, new_trained_timerange.stopts)
if self.retrain or not file_exists: if self.retrain or not file_exists:
if coin_first: if coin_first:
self.train_model_in_series(new_trained_timerange, metadata, strategy, dh) self.train_model_in_series(new_trained_timerange, metadata,
strategy, dh, data_load_timerange)
else: else:
self.training_on_separate_thread = True # acts like a lock self.training_on_separate_thread = True # acts like a lock
self.retrain_model_on_separate_thread(new_trained_timerange, self.retrain_model_on_separate_thread(new_trained_timerange,
metadata, strategy, dh) metadata, strategy,
dh, data_load_timerange)
elif self.training_on_separate_thread and not self.follow_mode: elif self.training_on_separate_thread and not self.follow_mode:
logger.info("FreqAI training a new model on background thread.") logger.info("FreqAI training a new model on background thread.")
@ -342,11 +353,12 @@ class IFreqaiModel(ABC):
@threaded @threaded
def retrain_model_on_separate_thread(self, new_trained_timerange: TimeRange, metadata: dict, def retrain_model_on_separate_thread(self, new_trained_timerange: TimeRange, metadata: dict,
strategy: IStrategy, dh: FreqaiDataKitchen): strategy: IStrategy, dh: FreqaiDataKitchen,
data_load_timerange: TimeRange):
# with nostdout(): # with nostdout():
dh.download_new_data_for_retraining(new_trained_timerange, metadata, strategy) dh.download_new_data_for_retraining(data_load_timerange, metadata, strategy)
corr_dataframes, base_dataframes = dh.load_pairs_histories(new_trained_timerange, corr_dataframes, base_dataframes = dh.load_pairs_histories(data_load_timerange,
metadata) metadata)
# protecting from common benign errors associated with grabbing new data from exchange: # protecting from common benign errors associated with grabbing new data from exchange:
@ -355,6 +367,8 @@ class IFreqaiModel(ABC):
corr_dataframes, corr_dataframes,
base_dataframes, base_dataframes,
metadata) metadata)
unfiltered_dataframe = dh.slice_dataframe(new_trained_timerange, unfiltered_dataframe)
except Exception: except Exception:
logger.warning('Mismatched sizes encountered in strategy') logger.warning('Mismatched sizes encountered in strategy')
# self.data_drawer.pair_to_end_of_training_queue(metadata['pair']) # self.data_drawer.pair_to_end_of_training_queue(metadata['pair'])
@ -390,10 +404,11 @@ class IFreqaiModel(ABC):
return return
def train_model_in_series(self, new_trained_timerange: TimeRange, metadata: dict, def train_model_in_series(self, new_trained_timerange: TimeRange, metadata: dict,
strategy: IStrategy, dh: FreqaiDataKitchen): strategy: IStrategy, dh: FreqaiDataKitchen,
data_load_timerange: TimeRange):
dh.download_new_data_for_retraining(new_trained_timerange, metadata, strategy) dh.download_new_data_for_retraining(data_load_timerange, metadata, strategy)
corr_dataframes, base_dataframes = dh.load_pairs_histories(new_trained_timerange, corr_dataframes, base_dataframes = dh.load_pairs_histories(data_load_timerange,
metadata) metadata)
unfiltered_dataframe = dh.use_strategy_to_populate_indicators(strategy, unfiltered_dataframe = dh.use_strategy_to_populate_indicators(strategy,
@ -401,6 +416,8 @@ class IFreqaiModel(ABC):
base_dataframes, base_dataframes,
metadata) metadata)
unfiltered_dataframe = dh.slice_dataframe(new_trained_timerange, unfiltered_dataframe)
model = self.train(unfiltered_dataframe, metadata, dh) model = self.train(unfiltered_dataframe, metadata, dh)
self.data_drawer.pair_dict[metadata['pair']][ self.data_drawer.pair_dict[metadata['pair']][

View File

@ -205,8 +205,7 @@ class Backtesting:
self.progress.init_step(BacktestState.DATALOAD, 1) self.progress.init_step(BacktestState.DATALOAD, 1)
if self.config.get('freqai') is not None: if self.config.get('freqai') is not None:
self.required_startup += int((self.config.get('freqai', {}).get('train_period') * self.required_startup += int(self.config.get('freqai', {}).get('startup_candles', 1000))
86400) / timeframe_to_seconds(self.config['timeframe']))
logger.info(f'Increasing startup_candle_count for freqai to {self.required_startup}') logger.info(f'Increasing startup_candle_count for freqai to {self.required_startup}')
self.config['startup_candle_count'] = self.required_startup self.config['startup_candle_count'] = self.required_startup

View File

@ -85,55 +85,58 @@ class FreqaiExampleStrategy(IStrategy):
:informative: the dataframe associated with the informative pair :informative: the dataframe associated with the informative pair
:coin: the name of the coin which will modify the feature names. :coin: the name of the coin which will modify the feature names.
""" """
if informative is None: if informative is None:
informative = self.dp.get_pair_dataframe(pair, tf) informative = self.dp.get_pair_dataframe(pair, tf)
informative['%-' + coin + "rsi"] = ta.RSI(informative, timeperiod=14) # first loop is automatically duplicating indicators for time periods
informative['%-' + coin + "mfi"] = ta.MFI(informative, timeperiod=25) for t in np.arange(10, self.freqai_info["feature_parameters"]["indicator_max_period"],
informative['%-' + coin + "adx"] = ta.ADX(informative, window=20) self.freqai_info["feature_parameters"]["indicator_interval"]):
informative[coin + "20sma"] = ta.SMA(informative, timeperiod=20) t = int(t)
informative[coin + "21ema"] = ta.EMA(informative, timeperiod=21) informative['%-' + coin + "rsi-period_" + str(t)] = ta.RSI(informative, timeperiod=t)
informative['%-' + coin + "bmsb"] = np.where( informative['%-' + coin + "mfi-period_" + str(t)] = ta.MFI(informative, timeperiod=t)
informative[coin + "20sma"].lt(informative[coin + "21ema"]), 1, 0 informative['%-' + coin + "adx-period_" + str(t)] = ta.ADX(informative, window=t)
) informative[coin + "20sma-period_" + str(t)] = ta.SMA(informative, timeperiod=t)
informative['%-' + coin + "close_over_20sma"] = informative["close"] / informative[ informative[coin + "21ema-period_" + str(t)] = ta.EMA(informative, timeperiod=t)
coin + "20sma"] informative['%-' + coin + "close_over_20sma-period_" +
str(t)] = (informative["close"] /
informative[coin + "20sma-period_" + str(t)])
informative['%-' + coin + "mfi"] = ta.MFI(informative, timeperiod=25) informative['%-' + coin + "mfi-period_" + str(t)] = ta.MFI(informative, timeperiod=t)
informative[coin + "ema21"] = ta.EMA(informative, timeperiod=21) informative[coin + "ema21-period_" + str(t)] = ta.EMA(informative, timeperiod=t)
informative[coin + "sma20"] = ta.SMA(informative, timeperiod=20) informative[coin + "sma20-period_" + str(t)] = ta.SMA(informative, timeperiod=t)
stoch = ta.STOCHRSI(informative, 15, 20, 2, 2)
informative['%-' + coin + "srsi-fk"] = stoch["fastk"]
informative['%-' + coin + "srsi-fd"] = stoch["fastd"]
bollinger = qtpylib.bollinger_bands(qtpylib.typical_price(informative), window=14, stds=2.2) bollinger = qtpylib.bollinger_bands(qtpylib.typical_price(informative), window=t,
informative[coin + "bb_lowerband"] = bollinger["lower"] stds=2.2)
informative[coin + "bb_middleband"] = bollinger["mid"] informative[coin + "bb_lowerband-period_" + str(t)] = bollinger["lower"]
informative[coin + "bb_upperband"] = bollinger["upper"] informative[coin + "bb_middleband-period_" + str(t)] = bollinger["mid"]
informative['%-' + coin + "bb_width"] = ( informative[coin + "bb_upperband-period_" + str(t)] = bollinger["upper"]
informative[coin + "bb_upperband"] - informative[coin + "bb_lowerband"] informative['%-' + coin + "bb_width-period_" + str(t)] = (
) / informative[coin + "bb_middleband"] informative[coin + "bb_upperband-period_" + str(t)] -
informative['%-' + coin + "close-bb_lower"] = ( informative[coin + "bb_lowerband-period_" + str(t)]
informative["close"] / informative[coin + "bb_lowerband"] ) / informative[coin + "bb_middleband-period_" + str(t)]
informative['%-' + coin + "close-bb_lower-period_" + str(t)] = (
informative["close"] / informative[coin + "bb_lowerband-period_" + str(t)]
) )
informative['%-' + coin + "roc"] = ta.ROC(informative, timeperiod=3) informative['%-' + coin + "roc-period_" + str(t)] = ta.ROC(informative, timeperiod=t)
informative['%-' + coin + "adx"] = ta.ADX(informative, window=14) informative['%-' + coin + "adx-period_" + str(t)] = ta.ADX(informative, window=t)
macd = ta.MACD(informative) macd = ta.MACD(informative, timeperiod=t)
informative['%-' + coin + "macd"] = macd["macd"] informative['%-' + coin + "macd-period_" + str(t)] = macd["macd"]
informative[coin + "pct-change"] = informative["close"].pct_change()
informative['%-' + coin + "relative_volume"] = ( informative['%-' + coin + "relative_volume-period_" + str(t)] = (
informative["volume"] / informative["volume"].rolling(10).mean() informative["volume"] / informative["volume"].rolling(t).mean()
) )
informative[coin + "pct-change"] = informative["close"].pct_change() informative['%-' + coin + "pct-change"] = informative["close"].pct_change()
informative['%-' + coin + "raw_volume"] = informative["volume"]
informative['%-' + coin + 'raw_price'] = informative['close']
# The following code automatically adds features according to the `shift` parameter passed
# in the config. Do not remove
indicators = [col for col in informative if col.startswith('%')] indicators = [col for col in informative if col.startswith('%')]
# This loop duplicates and shifts all indicators to add a sense of recency to data
for n in range(self.freqai_info["feature_parameters"]["shift"] + 1): for n in range(self.freqai_info["feature_parameters"]["shift"] + 1):
if n == 0: if n == 0:
continue continue
@ -141,15 +144,12 @@ class FreqaiExampleStrategy(IStrategy):
informative_shift = informative_shift.add_suffix("_shift-" + str(n)) informative_shift = informative_shift.add_suffix("_shift-" + str(n))
informative = pd.concat((informative, informative_shift), axis=1) informative = pd.concat((informative, informative_shift), axis=1)
# The following code safely merges into the base timeframe.
# Do not remove.
df = merge_informative_pair(df, informative, self.config["timeframe"], tf, ffill=True) df = merge_informative_pair(df, informative, self.config["timeframe"], tf, ffill=True)
skip_columns = [(s + "_" + tf) for s in ["date", "open", "high", "low", "close", "volume"]] skip_columns = [(s + "_" + tf) for s in ["date", "open", "high", "low", "close", "volume"]]
df = df.drop(columns=skip_columns) df = df.drop(columns=skip_columns)
# Add generalized indicators (not associated to any individual coin or timeframe) here # Add generalized indicators here (because in live, it will call this function to populate
# because in live, it will call this function to populate # indicators during training). Notice how we ensure not to add them multiple times
# indicators during training. Notice how we ensure not to add them multiple times
if pair == metadata['pair'] and tf == self.timeframe: if pair == metadata['pair'] and tf == self.timeframe:
df['%-day_of_week'] = (df["date"].dt.dayofweek + 1) / 7 df['%-day_of_week'] = (df["date"].dt.dayofweek + 1) / 7
df['%-hour_of_day'] = (df['date'].dt.hour + 1) / 25 df['%-hour_of_day'] = (df['date'].dt.hour + 1) / 25
@ -314,10 +314,10 @@ class FreqaiExampleStrategy(IStrategy):
last_candle = df.iloc[-1].squeeze() last_candle = df.iloc[-1].squeeze()
if side == 'long': if side == 'long':
if last_candle['close'] > (last_candle['close'] * (1 + 0.0025)): if rate > (last_candle['close'] * (1 + 0.0025)):
return False return False
else: else:
if last_candle['close'] < (last_candle['close'] * (1 - 0.0025)): if rate < (last_candle['close'] * (1 - 0.0025)):
return False return False
return True return True