From 7523ed825eed804be66451e2a08b936bb0b05f45 Mon Sep 17 00:00:00 2001 From: robcaulk Date: Tue, 31 May 2022 18:42:27 +0200 Subject: [PATCH] automatically detect maximum required data based on user fed indicators (to avoid NaNs in dataset for rolling indicators), add new config parameter for backtesting to let users increase their startup_candles to accommodate high timeframe indicators, add docs to explain all. Add new feature for automatic indicator duplication according to user defined intervals (exhibited in example strat and configs now). --- .../config_freqai_futures.example.json | 5 +- .../config_freqai_spot.example.json | 7 +- docs/freqai.md | 16 +++- freqtrade/freqai/data_kitchen.py | 56 +++++++++--- freqtrade/freqai/freqai_interface.py | 37 +++++--- freqtrade/optimize/backtesting.py | 3 +- freqtrade/templates/FreqaiExampleStrategy.py | 88 +++++++++---------- 7 files changed, 141 insertions(+), 71 deletions(-) diff --git a/config_examples/config_freqai_futures.example.json b/config_examples/config_freqai_futures.example.json index 5cd867e53..55217ee0c 100644 --- a/config_examples/config_freqai_futures.example.json +++ b/config_examples/config_freqai_futures.example.json @@ -59,6 +59,7 @@ } ], "freqai": { + "startup_candles": 10000, "timeframes": [ "3m", "15m", @@ -79,7 +80,9 @@ "weight_factor": 0.9, "principal_component_analysis": false, "use_SVM_to_remove_outliers": true, - "stratify": 0 + "stratify": 0, + "indicator_max_period": 20, + "indicator_interval": 10 }, "data_split_parameters": { "test_size": 0.33, diff --git a/config_examples/config_freqai_spot.example.json b/config_examples/config_freqai_spot.example.json index 0b4d4e7c5..5ba0615d2 100644 --- a/config_examples/config_freqai_spot.example.json +++ b/config_examples/config_freqai_spot.example.json @@ -7,7 +7,7 @@ "dry_run": true, "timeframe": "5m", "dry_run_wallet": 4000, - "dataformat_ohlcv": "hdf5", + "dataformat_ohlcv": "json", "cancel_open_orders_on_exit": true, "unfilledtimeout": { "entry": 
10, @@ -51,6 +51,7 @@ } ], "freqai": { + "startup_candles": 10000, "timeframes": [ "5m", "15m", @@ -74,7 +75,9 @@ "weight_factor": 0.9, "principal_component_analysis": false, "use_SVM_to_remove_outliers": false, - "stratify": 0 + "stratify": 0, + "indicator_max_period": 50, + "indicator_interval": 10 }, "data_split_parameters": { "test_size": 0.33, diff --git a/docs/freqai.md b/docs/freqai.md index fa2ca3724..8d54a7535 100644 --- a/docs/freqai.md +++ b/docs/freqai.md @@ -76,6 +76,7 @@ config setup includes: ```json "freqai": { + "startup_candles": 10000, "timeframes" : ["5m","15m","4h"], "train_period" : 30, "backtest_period" : 7, @@ -105,6 +106,7 @@ config setup includes: ### Building the feature set +!! slightly out of date, please refer to templates/FreqaiExampleStrategy.py for updated method !! Features are added by the user inside the `populate_any_indicators()` method of the strategy by prepending indicators with `%`: @@ -194,7 +196,19 @@ Freqai will train 8 separate models (because the full range comprises 8 weeks), and then backtest the subsequent week associated with each of the 8 training data set timerange months. Users can think of this as a "sliding window" which emulates Freqai retraining itself once per week in live using the previous -month of data. +month of data. + +In live, the required training data is automatically computed and downloaded. However, in backtesting +the user must manually enter the required number of `startup_candles` in the config. This value +is used to increase the available data to FreqAI and should be sufficient to enable all indicators +to be NaN free at the beginning of the first training timerange. This boils down to identifying the +highest timeframe (`4h` in present example) and the longest indicator period (25 in present example) +and adding this to the `train_period`. 
The units need to be in the base candle time frame: + +`startup_candles` = ( 4 hours * 25 max period * 60 minutes/hour + 30 day train_period * 1440 minutes per day ) / 5 min (base time frame) = 9840. + +!!! Note: in dry/live, this is all precomputed and handled automatically. Thus, `startup_candles` has no +influence on dry/live. ## Running Freqai diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py index 99bd4d6fc..dceb721c5 100644 --- a/freqtrade/freqai/data_kitchen.py +++ b/freqtrade/freqai/data_kitchen.py @@ -60,11 +60,6 @@ class FreqaiDataKitchen: self.pair = pair self.svm_model: linear_model.SGDOneClassSVM = None if not self.live: - # if config.get('freqai', {}).get('backtest_period') < 1: - # raise OperationalException('backtest_period < 1,' - # 'Can only backtest on full day increments' - # 'backtest_period. Only live/dry mode' - # 'allows fractions of days') self.full_timerange = self.create_fulltimerange(self.config["timerange"], self.freqai_config.get("train_period") ) @@ -291,10 +286,16 @@ class FreqaiDataKitchen: labels = labels[ (drop_index == 0) & (drop_index_labels == 0) ] # assuming the labels depend entirely on the dataframe here. - # logger.info( - # "dropped %s training points due to NaNs, ensure all historical data downloaded", - # len(unfiltered_dataframe) - len(filtered_dataframe), - # ) + logger.info( + f'dropped {len(unfiltered_dataframe) - len(filtered_dataframe)} training points' + f' due to NaNs in populated dataset {len(unfiltered_dataframe)}.' 
+ ) + if (1 - len(filtered_dataframe) / len(unfiltered_dataframe)) > 0.1 and self.live: + logger.warning( + f' {(1 - len(filtered_dataframe)/len(unfiltered_dataframe)) * 100} percent' + ' of training data dropped due to NaNs, model may perform inconsistent' + 'with expectations' + ) self.data["filter_drop_index_training"] = drop_index else: @@ -685,10 +686,31 @@ class FreqaiDataKitchen: return full_timerange - def check_if_new_training_required(self, trained_timestamp: int) -> Tuple[bool, TimeRange]: + def check_if_new_training_required(self, trained_timestamp: int) -> Tuple[bool, + TimeRange, TimeRange]: time = datetime.datetime.now(tz=datetime.timezone.utc).timestamp() trained_timerange = TimeRange() + data_load_timerange = TimeRange() + + # find the max indicator length required + max_timeframe_chars = self.freqai_config.get('timeframes')[-1] + max_period = self.freqai_config.get('feature_parameters', {}).get( + 'indicator_max_period', 20) + additional_seconds = 0 + if max_timeframe_chars[-1] == 'd': + additional_seconds = max_period * SECONDS_IN_DAY * int(max_timeframe_chars[-2]) + elif max_timeframe_chars[-1] == 'h': + additional_seconds = max_period * 3600 * int(max_timeframe_chars[-2]) + elif max_timeframe_chars[-1] == 'm': + if len(max_timeframe_chars) == 2: + additional_seconds = max_period * 60 * int(max_timeframe_chars[-2]) + elif len(max_timeframe_chars) == 3: + additional_seconds = max_period * 60 * int(float(max_timeframe_chars[0:2])) + else: + logger.warning('FreqAI could not detect max timeframe and therefore may not ' + 'download the proper amount of data for training') + if trained_timestamp != 0: elapsed_time = (time - trained_timestamp) / SECONDS_IN_DAY retrain = elapsed_time > self.freqai_config.get('backtest_period') @@ -696,10 +718,22 @@ class FreqaiDataKitchen: trained_timerange.startts = int(time - self.freqai_config.get( 'train_period', 0) * SECONDS_IN_DAY) trained_timerange.stopts = int(time) + # we want to load/populate indicators on 
more data than we plan to train on so + # because most of the indicators have a rolling timeperiod, and are thus NaNs + # unless they have data further back in time before the start of the train period + data_load_timerange.startts = int(time - self.freqai_config.get( + 'train_period', 0) * SECONDS_IN_DAY + - additional_seconds) + data_load_timerange.stopts = int(time) else: # user passed no live_trained_timerange in config trained_timerange.startts = int(time - self.freqai_config.get('train_period') * SECONDS_IN_DAY) trained_timerange.stopts = int(time) + + data_load_timerange.startts = int(time - self.freqai_config.get( + 'train_period', 0) * SECONDS_IN_DAY + - additional_seconds) + data_load_timerange.stopts = int(time) retrain = True # if retrain: @@ -714,7 +748,7 @@ class FreqaiDataKitchen: # # enables persistence, but not fully implemented into save/load data yer # self.data['live_trained_timerange'] = str(int(trained_timerange.stopts)) - return retrain, trained_timerange + return retrain, trained_timerange, data_load_timerange def set_new_model_names(self, metadata: dict, trained_timerange: TimeRange): diff --git a/freqtrade/freqai/freqai_interface.py b/freqtrade/freqai/freqai_interface.py index c8ce1a19a..ed7cab287 100644 --- a/freqtrade/freqai/freqai_interface.py +++ b/freqtrade/freqai/freqai_interface.py @@ -1,4 +1,5 @@ # import contextlib +import datetime import gc import logging # import sys @@ -149,8 +150,15 @@ class IFreqaiModel(ABC): # self.training_timerange_timerange = tr_train dataframe_train = dh.slice_dataframe(tr_train, dataframe) dataframe_backtest = dh.slice_dataframe(tr_backtest, dataframe) - logger.info("training %s for %s", metadata["pair"], tr_train) + trained_timestamp = tr_train # TimeRange.parse_timerange(tr_train) + tr_train_startts_str = datetime.datetime.utcfromtimestamp( + tr_train.startts).strftime('%Y-%m-%d %H:%M:%S') + tr_train_stopts_str = datetime.datetime.utcfromtimestamp( + tr_train.stopts).strftime('%Y-%m-%d %H:%M:%S') + 
logger.info("Training %s", metadata["pair"]) + logger.info(f'Training {tr_train_startts_str} to {tr_train_stopts_str}') + dh.data_path = Path(dh.full_path / str("sub-train" + "-" + metadata['pair'].split("/")[0] + str(int(trained_timestamp.stopts)))) @@ -218,16 +226,19 @@ class IFreqaiModel(ABC): model_filename=model_filename) (self.retrain, - new_trained_timerange) = dh.check_if_new_training_required(trained_timestamp) + new_trained_timerange, + data_load_timerange) = dh.check_if_new_training_required(trained_timestamp) dh.set_paths(metadata, new_trained_timerange.stopts) if self.retrain or not file_exists: if coin_first: - self.train_model_in_series(new_trained_timerange, metadata, strategy, dh) + self.train_model_in_series(new_trained_timerange, metadata, + strategy, dh, data_load_timerange) else: self.training_on_separate_thread = True # acts like a lock self.retrain_model_on_separate_thread(new_trained_timerange, - metadata, strategy, dh) + metadata, strategy, + dh, data_load_timerange) elif self.training_on_separate_thread and not self.follow_mode: logger.info("FreqAI training a new model on background thread.") @@ -342,11 +353,12 @@ class IFreqaiModel(ABC): @threaded def retrain_model_on_separate_thread(self, new_trained_timerange: TimeRange, metadata: dict, - strategy: IStrategy, dh: FreqaiDataKitchen): + strategy: IStrategy, dh: FreqaiDataKitchen, + data_load_timerange: TimeRange): # with nostdout(): - dh.download_new_data_for_retraining(new_trained_timerange, metadata, strategy) - corr_dataframes, base_dataframes = dh.load_pairs_histories(new_trained_timerange, + dh.download_new_data_for_retraining(data_load_timerange, metadata, strategy) + corr_dataframes, base_dataframes = dh.load_pairs_histories(data_load_timerange, metadata) # protecting from common benign errors associated with grabbing new data from exchange: @@ -355,6 +367,8 @@ class IFreqaiModel(ABC): corr_dataframes, base_dataframes, metadata) + unfiltered_dataframe = 
dh.slice_dataframe(new_trained_timerange, unfiltered_dataframe) + except Exception: logger.warning('Mismatched sizes encountered in strategy') # self.data_drawer.pair_to_end_of_training_queue(metadata['pair']) @@ -390,10 +404,11 @@ class IFreqaiModel(ABC): return def train_model_in_series(self, new_trained_timerange: TimeRange, metadata: dict, - strategy: IStrategy, dh: FreqaiDataKitchen): + strategy: IStrategy, dh: FreqaiDataKitchen, + data_load_timerange: TimeRange): - dh.download_new_data_for_retraining(new_trained_timerange, metadata, strategy) - corr_dataframes, base_dataframes = dh.load_pairs_histories(new_trained_timerange, + dh.download_new_data_for_retraining(data_load_timerange, metadata, strategy) + corr_dataframes, base_dataframes = dh.load_pairs_histories(data_load_timerange, metadata) unfiltered_dataframe = dh.use_strategy_to_populate_indicators(strategy, @@ -401,6 +416,8 @@ class IFreqaiModel(ABC): base_dataframes, metadata) + unfiltered_dataframe = dh.slice_dataframe(new_trained_timerange, unfiltered_dataframe) + model = self.train(unfiltered_dataframe, metadata, dh) self.data_drawer.pair_dict[metadata['pair']][ diff --git a/freqtrade/optimize/backtesting.py b/freqtrade/optimize/backtesting.py index 3996dd08d..607128eef 100755 --- a/freqtrade/optimize/backtesting.py +++ b/freqtrade/optimize/backtesting.py @@ -205,8 +205,7 @@ class Backtesting: self.progress.init_step(BacktestState.DATALOAD, 1) if self.config.get('freqai') is not None: - self.required_startup += int((self.config.get('freqai', {}).get('train_period') * - 86400) / timeframe_to_seconds(self.config['timeframe'])) + self.required_startup += int(self.config.get('freqai', {}).get('startup_candles', 1000)) logger.info(f'Increasing startup_candle_count for freqai to {self.required_startup}') self.config['startup_candle_count'] = self.required_startup diff --git a/freqtrade/templates/FreqaiExampleStrategy.py b/freqtrade/templates/FreqaiExampleStrategy.py index daf59fe12..bc1aba361 100644 --- 
a/freqtrade/templates/FreqaiExampleStrategy.py +++ b/freqtrade/templates/FreqaiExampleStrategy.py @@ -85,55 +85,58 @@ class FreqaiExampleStrategy(IStrategy): :informative: the dataframe associated with the informative pair :coin: the name of the coin which will modify the feature names. """ + if informative is None: informative = self.dp.get_pair_dataframe(pair, tf) - informative['%-' + coin + "rsi"] = ta.RSI(informative, timeperiod=14) - informative['%-' + coin + "mfi"] = ta.MFI(informative, timeperiod=25) - informative['%-' + coin + "adx"] = ta.ADX(informative, window=20) + # first loop is automatically duplicating indicators for time periods + for t in np.arange(10, self.freqai_info["feature_parameters"]["indicator_max_period"], + self.freqai_info["feature_parameters"]["indicator_interval"]): - informative[coin + "20sma"] = ta.SMA(informative, timeperiod=20) - informative[coin + "21ema"] = ta.EMA(informative, timeperiod=21) - informative['%-' + coin + "bmsb"] = np.where( - informative[coin + "20sma"].lt(informative[coin + "21ema"]), 1, 0 - ) - informative['%-' + coin + "close_over_20sma"] = informative["close"] / informative[ - coin + "20sma"] + t = int(t) + informative['%-' + coin + "rsi-period_" + str(t)] = ta.RSI(informative, timeperiod=t) + informative['%-' + coin + "mfi-period_" + str(t)] = ta.MFI(informative, timeperiod=t) + informative['%-' + coin + "adx-period_" + str(t)] = ta.ADX(informative, window=t) + informative[coin + "20sma-period_" + str(t)] = ta.SMA(informative, timeperiod=t) + informative[coin + "21ema-period_" + str(t)] = ta.EMA(informative, timeperiod=t) + informative['%-' + coin + "close_over_20sma-period_" + + str(t)] = (informative["close"] / + informative[coin + "20sma-period_" + str(t)]) - informative['%-' + coin + "mfi"] = ta.MFI(informative, timeperiod=25) + informative['%-' + coin + "mfi-period_" + str(t)] = ta.MFI(informative, timeperiod=t) - informative[coin + "ema21"] = ta.EMA(informative, timeperiod=21) - informative[coin + 
"sma20"] = ta.SMA(informative, timeperiod=20) - stoch = ta.STOCHRSI(informative, 15, 20, 2, 2) - informative['%-' + coin + "srsi-fk"] = stoch["fastk"] - informative['%-' + coin + "srsi-fd"] = stoch["fastd"] + informative[coin + "ema21-period_" + str(t)] = ta.EMA(informative, timeperiod=t) + informative[coin + "sma20-period_" + str(t)] = ta.SMA(informative, timeperiod=t) - bollinger = qtpylib.bollinger_bands(qtpylib.typical_price(informative), window=14, stds=2.2) - informative[coin + "bb_lowerband"] = bollinger["lower"] - informative[coin + "bb_middleband"] = bollinger["mid"] - informative[coin + "bb_upperband"] = bollinger["upper"] - informative['%-' + coin + "bb_width"] = ( - informative[coin + "bb_upperband"] - informative[coin + "bb_lowerband"] - ) / informative[coin + "bb_middleband"] - informative['%-' + coin + "close-bb_lower"] = ( - informative["close"] / informative[coin + "bb_lowerband"] - ) + bollinger = qtpylib.bollinger_bands(qtpylib.typical_price(informative), window=t, + stds=2.2) + informative[coin + "bb_lowerband-period_" + str(t)] = bollinger["lower"] + informative[coin + "bb_middleband-period_" + str(t)] = bollinger["mid"] + informative[coin + "bb_upperband-period_" + str(t)] = bollinger["upper"] + informative['%-' + coin + "bb_width-period_" + str(t)] = ( + informative[coin + "bb_upperband-period_" + str(t)] - + informative[coin + "bb_lowerband-period_" + str(t)] + ) / informative[coin + "bb_middleband-period_" + str(t)] + informative['%-' + coin + "close-bb_lower-period_" + str(t)] = ( + informative["close"] / informative[coin + "bb_lowerband-period_" + str(t)] + ) - informative['%-' + coin + "roc"] = ta.ROC(informative, timeperiod=3) - informative['%-' + coin + "adx"] = ta.ADX(informative, window=14) + informative['%-' + coin + "roc-period_" + str(t)] = ta.ROC(informative, timeperiod=t) + informative['%-' + coin + "adx-period_" + str(t)] = ta.ADX(informative, window=t) - macd = ta.MACD(informative) - informative['%-' + coin + "macd"] = 
macd["macd"] - informative[coin + "pct-change"] = informative["close"].pct_change() - informative['%-' + coin + "relative_volume"] = ( - informative["volume"] / informative["volume"].rolling(10).mean() - ) + macd = ta.MACD(informative, timeperiod=t) + informative['%-' + coin + "macd-period_" + str(t)] = macd["macd"] - informative[coin + "pct-change"] = informative["close"].pct_change() + informative['%-' + coin + "relative_volume-period_" + str(t)] = ( + informative["volume"] / informative["volume"].rolling(t).mean() + ) + + informative['%-' + coin + "pct-change"] = informative["close"].pct_change() + informative['%-' + coin + "raw_volume"] = informative["volume"] + informative['%-' + coin + 'raw_price'] = informative['close'] - # The following code automatically adds features according to the `shift` parameter passed - # in the config. Do not remove indicators = [col for col in informative if col.startswith('%')] + # This loop duplicates and shifts all indicators to add a sense of recency to data for n in range(self.freqai_info["feature_parameters"]["shift"] + 1): if n == 0: continue @@ -141,15 +144,12 @@ class FreqaiExampleStrategy(IStrategy): informative_shift = informative_shift.add_suffix("_shift-" + str(n)) informative = pd.concat((informative, informative_shift), axis=1) - # The following code safely merges into the base timeframe. - # Do not remove. df = merge_informative_pair(df, informative, self.config["timeframe"], tf, ffill=True) skip_columns = [(s + "_" + tf) for s in ["date", "open", "high", "low", "close", "volume"]] df = df.drop(columns=skip_columns) - # Add generalized indicators (not associated to any individual coin or timeframe) here - # because in live, it will call this function to populate - # indicators during training. Notice how we ensure not to add them multiple times + # Add generalized indicators here (because in live, it will call this function to populate + # indicators during training). 
Notice how we ensure not to add them multiple times if pair == metadata['pair'] and tf == self.timeframe: df['%-day_of_week'] = (df["date"].dt.dayofweek + 1) / 7 df['%-hour_of_day'] = (df['date'].dt.hour + 1) / 25 @@ -314,10 +314,10 @@ class FreqaiExampleStrategy(IStrategy): last_candle = df.iloc[-1].squeeze() if side == 'long': - if last_candle['close'] > (last_candle['close'] * (1 + 0.0025)): + if rate > (last_candle['close'] * (1 + 0.0025)): return False else: - if last_candle['close'] < (last_candle['close'] * (1 - 0.0025)): + if rate < (last_candle['close'] * (1 - 0.0025)): return False return True