From 7523ed825eed804be66451e2a08b936bb0b05f45 Mon Sep 17 00:00:00 2001
From: robcaulk <rob.caulk@gmail.com>
Date: Tue, 31 May 2022 18:42:27 +0200
Subject: [PATCH] automatically detect maximum required data based on user fed
 indicators (to avoid NaNs in dataset for rolling indicators), add new config
 parameter for backtesting to let users increase their startup_candles to
 accommodate high timeframe indicators, add docs to explain all. Add new
 feature for automatic indicator duplication according to user defined
 intervals (exhibited in example strat and configs now).

---
 .../config_freqai_futures.example.json        |  5 +-
 .../config_freqai_spot.example.json           |  7 +-
 docs/freqai.md                                | 16 +++-
 freqtrade/freqai/data_kitchen.py              | 56 +++++++++---
 freqtrade/freqai/freqai_interface.py          | 37 +++++---
 freqtrade/optimize/backtesting.py             |  3 +-
 freqtrade/templates/FreqaiExampleStrategy.py  | 88 +++++++++----------
 7 files changed, 141 insertions(+), 71 deletions(-)

diff --git a/config_examples/config_freqai_futures.example.json b/config_examples/config_freqai_futures.example.json
index 5cd867e53..55217ee0c 100644
--- a/config_examples/config_freqai_futures.example.json
+++ b/config_examples/config_freqai_futures.example.json
@@ -59,6 +59,7 @@
         }
     ],
     "freqai": {
+        "startup_candles": 10000,
         "timeframes": [
             "3m",
             "15m",
@@ -79,7 +80,9 @@
             "weight_factor": 0.9,
             "principal_component_analysis": false,
             "use_SVM_to_remove_outliers": true,
-            "stratify": 0
+            "stratify": 0,
+            "indicator_max_period": 20,
+            "indicator_interval": 10
         },
         "data_split_parameters": {
             "test_size": 0.33,
diff --git a/config_examples/config_freqai_spot.example.json b/config_examples/config_freqai_spot.example.json
index 0b4d4e7c5..5ba0615d2 100644
--- a/config_examples/config_freqai_spot.example.json
+++ b/config_examples/config_freqai_spot.example.json
@@ -7,7 +7,7 @@
     "dry_run": true,
     "timeframe": "5m",
     "dry_run_wallet": 4000,
-    "dataformat_ohlcv": "hdf5",
+    "dataformat_ohlcv": "json",
     "cancel_open_orders_on_exit": true,
     "unfilledtimeout": {
         "entry": 10,
@@ -51,6 +51,7 @@
         }
     ],
     "freqai": {
+        "startup_candles": 10000,
         "timeframes": [
             "5m",
             "15m",
@@ -74,7 +75,9 @@
             "weight_factor": 0.9,
             "principal_component_analysis": false,
             "use_SVM_to_remove_outliers": false,
-            "stratify": 0
+            "stratify": 0,
+            "indicator_max_period": 50,
+            "indicator_interval": 10
         },
         "data_split_parameters": {
             "test_size": 0.33,
diff --git a/docs/freqai.md b/docs/freqai.md
index fa2ca3724..8d54a7535 100644
--- a/docs/freqai.md
+++ b/docs/freqai.md
@@ -76,6 +76,7 @@ config setup includes:
 
 ```json
     "freqai": {
+                "startup_candles": 10000,
                 "timeframes" : ["5m","15m","4h"],
                 "train_period" : 30,
                 "backtest_period" : 7,
@@ -105,6 +106,7 @@ config setup includes:
 
 ### Building the feature set
 
+!! This section is slightly out of date; please refer to `templates/FreqaiExampleStrategy.py` for the updated method. !!
 Features are added by the user inside the `populate_any_indicators()` method of the strategy 
 by prepending indicators with `%`:
 
@@ -194,7 +196,19 @@ Freqai will train 8 separate models (because the full range comprises 8 weeks),
 and then backtest the subsequent week associated with each of the 8 training
 data set timerange months. Users can think of this as a "sliding window" which
 emulates Freqai retraining itself once per week in live using the previous
-month of data.
+month of data.
+
+In live, the required training data is automatically computed and downloaded. However, in backtesting
+the user must manually enter the required number of `startup_candles` in the config. This value
+is used to increase the data available to FreqAI and should be sufficient to enable all indicators
+to be NaN free at the beginning of the first training timerange. This boils down to identifying the
+highest timeframe (`4h` in the present example) and the longest indicator period (25 in the present example),
+multiplying the two, and adding the result to the `train_period`. The units need to be in the base candle time frame:
+
+`startup_candles` = ( 4 hours * 25 max period * 60 minutes/hour + 30 day train_period * 1440 minutes per day ) / 5 min (base time frame) = 9840.
+
+!!! Note: in dry/live, this is all precomputed and handled automatically. Thus, `startup_candles` has no
+influence on dry/live.
 
 ## Running Freqai
 
diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py
index 99bd4d6fc..dceb721c5 100644
--- a/freqtrade/freqai/data_kitchen.py
+++ b/freqtrade/freqai/data_kitchen.py
@@ -60,11 +60,6 @@ class FreqaiDataKitchen:
         self.pair = pair
         self.svm_model: linear_model.SGDOneClassSVM = None
         if not self.live:
-            # if config.get('freqai', {}).get('backtest_period') < 1:
-            #     raise OperationalException('backtest_period < 1,'
-            #                                'Can only backtest on full day increments'
-            #                                'backtest_period. Only live/dry mode'
-            #                                'allows fractions of days')
             self.full_timerange = self.create_fulltimerange(self.config["timerange"],
                                                             self.freqai_config.get("train_period")
                                                             )
@@ -291,10 +286,16 @@ class FreqaiDataKitchen:
             labels = labels[
                 (drop_index == 0) & (drop_index_labels == 0)
             ]  # assuming the labels depend entirely on the dataframe here.
-            # logger.info(
-            #     "dropped %s training points due to NaNs, ensure all historical data downloaded",
-            #     len(unfiltered_dataframe) - len(filtered_dataframe),
-            # )
+            logger.info(
+                f'dropped {len(unfiltered_dataframe) - len(filtered_dataframe)} training points'
+                f' due to NaNs in populated dataset {len(unfiltered_dataframe)}.'
+            )
+            if (1 - len(filtered_dataframe) / len(unfiltered_dataframe)) > 0.1 and self.live:
+                logger.warning(
+                    f' {(1 - len(filtered_dataframe)/len(unfiltered_dataframe)) * 100} percent'
+                    ' of training data dropped due to NaNs, model may perform inconsistent'
+                    'with expectations'
+                )
             self.data["filter_drop_index_training"] = drop_index
 
         else:
@@ -685,10 +686,31 @@ class FreqaiDataKitchen:
 
         return full_timerange
 
-    def check_if_new_training_required(self, trained_timestamp: int) -> Tuple[bool, TimeRange]:
+    def check_if_new_training_required(self, trained_timestamp: int) -> Tuple[bool,
+                                                                              TimeRange, TimeRange]:
 
         time = datetime.datetime.now(tz=datetime.timezone.utc).timestamp()
         trained_timerange = TimeRange()
+        data_load_timerange = TimeRange()
+
+        # find the max indicator length required
+        max_timeframe_chars = self.freqai_config.get('timeframes')[-1]
+        max_period = self.freqai_config.get('feature_parameters', {}).get(
+                                            'indicator_max_period', 20)
+        additional_seconds = 0
+        if max_timeframe_chars[-1] == 'd':
+            additional_seconds = max_period * SECONDS_IN_DAY * int(max_timeframe_chars[-2])
+        elif max_timeframe_chars[-1] == 'h':
+            additional_seconds = max_period * 3600 * int(max_timeframe_chars[-2])
+        elif max_timeframe_chars[-1] == 'm':
+            if len(max_timeframe_chars) == 2:
+                additional_seconds = max_period * 60 * int(max_timeframe_chars[-2])
+            elif len(max_timeframe_chars) == 3:
+                additional_seconds = max_period * 60 * int(float(max_timeframe_chars[0:2]))
+            else:
+                logger.warning('FreqAI could not detect max timeframe and therefore may not '
+                               'download the proper amount of data for training')
+
         if trained_timestamp != 0:
             elapsed_time = (time - trained_timestamp) / SECONDS_IN_DAY
             retrain = elapsed_time > self.freqai_config.get('backtest_period')
@@ -696,10 +718,22 @@ class FreqaiDataKitchen:
                 trained_timerange.startts = int(time - self.freqai_config.get(
                                              'train_period', 0) * SECONDS_IN_DAY)
                 trained_timerange.stopts = int(time)
+                # we want to load/populate indicators on more data than we plan to train on,
+                # because most of the indicators have a rolling timeperiod, and are thus NaNs
+                # unless they have data further back in time before the start of the train period
+                data_load_timerange.startts = int(time - self.freqai_config.get(
+                                             'train_period', 0) * SECONDS_IN_DAY
+                                             - additional_seconds)
+                data_load_timerange.stopts = int(time)
         else:  # user passed no live_trained_timerange in config
             trained_timerange.startts = int(time - self.freqai_config.get('train_period') *
                                             SECONDS_IN_DAY)
             trained_timerange.stopts = int(time)
+
+            data_load_timerange.startts = int(time - self.freqai_config.get(
+                                            'train_period', 0) * SECONDS_IN_DAY
+                                            - additional_seconds)
+            data_load_timerange.stopts = int(time)
             retrain = True
 
         # if retrain:
@@ -714,7 +748,7 @@ class FreqaiDataKitchen:
         #     # enables persistence, but not fully implemented into save/load data yer
         #     self.data['live_trained_timerange'] = str(int(trained_timerange.stopts))
 
-        return retrain, trained_timerange
+        return retrain, trained_timerange, data_load_timerange
 
     def set_new_model_names(self, metadata: dict, trained_timerange: TimeRange):
 
diff --git a/freqtrade/freqai/freqai_interface.py b/freqtrade/freqai/freqai_interface.py
index c8ce1a19a..ed7cab287 100644
--- a/freqtrade/freqai/freqai_interface.py
+++ b/freqtrade/freqai/freqai_interface.py
@@ -1,4 +1,5 @@
 # import contextlib
+import datetime
 import gc
 import logging
 # import sys
@@ -149,8 +150,15 @@ class IFreqaiModel(ABC):
             # self.training_timerange_timerange = tr_train
             dataframe_train = dh.slice_dataframe(tr_train, dataframe)
             dataframe_backtest = dh.slice_dataframe(tr_backtest, dataframe)
-            logger.info("training %s for %s", metadata["pair"], tr_train)
+
             trained_timestamp = tr_train  # TimeRange.parse_timerange(tr_train)
+            tr_train_startts_str = datetime.datetime.utcfromtimestamp(
+                tr_train.startts).strftime('%Y-%m-%d %H:%M:%S')
+            tr_train_stopts_str = datetime.datetime.utcfromtimestamp(
+                tr_train.stopts).strftime('%Y-%m-%d %H:%M:%S')
+            logger.info("Training %s", metadata["pair"])
+            logger.info(f'Training {tr_train_startts_str} to {tr_train_stopts_str}')
+
             dh.data_path = Path(dh.full_path /
                                 str("sub-train" + "-" + metadata['pair'].split("/")[0] +
                                     str(int(trained_timestamp.stopts))))
@@ -218,16 +226,19 @@ class IFreqaiModel(ABC):
                                                 model_filename=model_filename)
 
             (self.retrain,
-             new_trained_timerange) = dh.check_if_new_training_required(trained_timestamp)
+             new_trained_timerange,
+             data_load_timerange) = dh.check_if_new_training_required(trained_timestamp)
             dh.set_paths(metadata, new_trained_timerange.stopts)
 
             if self.retrain or not file_exists:
                 if coin_first:
-                    self.train_model_in_series(new_trained_timerange, metadata, strategy, dh)
+                    self.train_model_in_series(new_trained_timerange, metadata,
+                                               strategy, dh, data_load_timerange)
                 else:
                     self.training_on_separate_thread = True  # acts like a lock
                     self.retrain_model_on_separate_thread(new_trained_timerange,
-                                                          metadata, strategy, dh)
+                                                          metadata, strategy,
+                                                          dh, data_load_timerange)
 
         elif self.training_on_separate_thread and not self.follow_mode:
             logger.info("FreqAI training a new model on background thread.")
@@ -342,11 +353,12 @@ class IFreqaiModel(ABC):
 
     @threaded
     def retrain_model_on_separate_thread(self, new_trained_timerange: TimeRange, metadata: dict,
-                                         strategy: IStrategy, dh: FreqaiDataKitchen):
+                                         strategy: IStrategy, dh: FreqaiDataKitchen,
+                                         data_load_timerange: TimeRange):
 
         # with nostdout():
-        dh.download_new_data_for_retraining(new_trained_timerange, metadata, strategy)
-        corr_dataframes, base_dataframes = dh.load_pairs_histories(new_trained_timerange,
+        dh.download_new_data_for_retraining(data_load_timerange, metadata, strategy)
+        corr_dataframes, base_dataframes = dh.load_pairs_histories(data_load_timerange,
                                                                    metadata)
 
         # protecting from common benign errors associated with grabbing new data from exchange:
@@ -355,6 +367,8 @@ class IFreqaiModel(ABC):
                                                                           corr_dataframes,
                                                                           base_dataframes,
                                                                           metadata)
+            unfiltered_dataframe = dh.slice_dataframe(new_trained_timerange, unfiltered_dataframe)
+
         except Exception:
             logger.warning('Mismatched sizes encountered in strategy')
             # self.data_drawer.pair_to_end_of_training_queue(metadata['pair'])
@@ -390,10 +404,11 @@ class IFreqaiModel(ABC):
         return
 
     def train_model_in_series(self, new_trained_timerange: TimeRange, metadata: dict,
-                              strategy: IStrategy, dh: FreqaiDataKitchen):
+                              strategy: IStrategy, dh: FreqaiDataKitchen,
+                              data_load_timerange: TimeRange):
 
-        dh.download_new_data_for_retraining(new_trained_timerange, metadata, strategy)
-        corr_dataframes, base_dataframes = dh.load_pairs_histories(new_trained_timerange,
+        dh.download_new_data_for_retraining(data_load_timerange, metadata, strategy)
+        corr_dataframes, base_dataframes = dh.load_pairs_histories(data_load_timerange,
                                                                    metadata)
 
         unfiltered_dataframe = dh.use_strategy_to_populate_indicators(strategy,
@@ -401,6 +416,8 @@ class IFreqaiModel(ABC):
                                                                       base_dataframes,
                                                                       metadata)
 
+        unfiltered_dataframe = dh.slice_dataframe(new_trained_timerange, unfiltered_dataframe)
+
         model = self.train(unfiltered_dataframe, metadata, dh)
 
         self.data_drawer.pair_dict[metadata['pair']][
diff --git a/freqtrade/optimize/backtesting.py b/freqtrade/optimize/backtesting.py
index 3996dd08d..607128eef 100755
--- a/freqtrade/optimize/backtesting.py
+++ b/freqtrade/optimize/backtesting.py
@@ -205,8 +205,7 @@ class Backtesting:
         self.progress.init_step(BacktestState.DATALOAD, 1)
 
         if self.config.get('freqai') is not None:
-            self.required_startup += int((self.config.get('freqai', {}).get('train_period') *
-                                         86400) / timeframe_to_seconds(self.config['timeframe']))
+            self.required_startup += int(self.config.get('freqai', {}).get('startup_candles', 1000))
             logger.info(f'Increasing startup_candle_count for freqai to {self.required_startup}')
             self.config['startup_candle_count'] = self.required_startup
 
diff --git a/freqtrade/templates/FreqaiExampleStrategy.py b/freqtrade/templates/FreqaiExampleStrategy.py
index daf59fe12..bc1aba361 100644
--- a/freqtrade/templates/FreqaiExampleStrategy.py
+++ b/freqtrade/templates/FreqaiExampleStrategy.py
@@ -85,55 +85,58 @@ class FreqaiExampleStrategy(IStrategy):
         :informative: the dataframe associated with the informative pair
         :coin: the name of the coin which will modify the feature names.
         """
+
         if informative is None:
             informative = self.dp.get_pair_dataframe(pair, tf)
 
-        informative['%-' + coin + "rsi"] = ta.RSI(informative, timeperiod=14)
-        informative['%-' + coin + "mfi"] = ta.MFI(informative, timeperiod=25)
-        informative['%-' + coin + "adx"] = ta.ADX(informative, window=20)
+        # first loop is automatically duplicating indicators for time periods
+        for t in np.arange(10, self.freqai_info["feature_parameters"]["indicator_max_period"],
+                           self.freqai_info["feature_parameters"]["indicator_interval"]):
 
-        informative[coin + "20sma"] = ta.SMA(informative, timeperiod=20)
-        informative[coin + "21ema"] = ta.EMA(informative, timeperiod=21)
-        informative['%-' + coin + "bmsb"] = np.where(
-            informative[coin + "20sma"].lt(informative[coin + "21ema"]), 1, 0
-        )
-        informative['%-' + coin + "close_over_20sma"] = informative["close"] / informative[
-                                                                                    coin + "20sma"]
+            t = int(t)
+            informative['%-' + coin + "rsi-period_" + str(t)] = ta.RSI(informative, timeperiod=t)
+            informative['%-' + coin + "mfi-period_" + str(t)] = ta.MFI(informative, timeperiod=t)
+            informative['%-' + coin + "adx-period_" + str(t)] = ta.ADX(informative, window=t)
+            informative[coin + "20sma-period_" + str(t)] = ta.SMA(informative, timeperiod=t)
+            informative[coin + "21ema-period_" + str(t)] = ta.EMA(informative, timeperiod=t)
+            informative['%-' + coin + "close_over_20sma-period_" +
+                        str(t)] = (informative["close"] /
+                                   informative[coin + "20sma-period_" + str(t)])
 
-        informative['%-' + coin + "mfi"] = ta.MFI(informative, timeperiod=25)
+            informative['%-' + coin + "mfi-period_" + str(t)] = ta.MFI(informative, timeperiod=t)
 
-        informative[coin + "ema21"] = ta.EMA(informative, timeperiod=21)
-        informative[coin + "sma20"] = ta.SMA(informative, timeperiod=20)
-        stoch = ta.STOCHRSI(informative, 15, 20, 2, 2)
-        informative['%-' + coin + "srsi-fk"] = stoch["fastk"]
-        informative['%-' + coin + "srsi-fd"] = stoch["fastd"]
+            informative[coin + "ema21-period_" + str(t)] = ta.EMA(informative, timeperiod=t)
+            informative[coin + "sma20-period_" + str(t)] = ta.SMA(informative, timeperiod=t)
 
-        bollinger = qtpylib.bollinger_bands(qtpylib.typical_price(informative), window=14, stds=2.2)
-        informative[coin + "bb_lowerband"] = bollinger["lower"]
-        informative[coin + "bb_middleband"] = bollinger["mid"]
-        informative[coin + "bb_upperband"] = bollinger["upper"]
-        informative['%-' + coin + "bb_width"] = (
-            informative[coin + "bb_upperband"] - informative[coin + "bb_lowerband"]
-        ) / informative[coin + "bb_middleband"]
-        informative['%-' + coin + "close-bb_lower"] = (
-            informative["close"] / informative[coin + "bb_lowerband"]
-        )
+            bollinger = qtpylib.bollinger_bands(qtpylib.typical_price(informative), window=t,
+                                                stds=2.2)
+            informative[coin + "bb_lowerband-period_" + str(t)] = bollinger["lower"]
+            informative[coin + "bb_middleband-period_" + str(t)] = bollinger["mid"]
+            informative[coin + "bb_upperband-period_" + str(t)] = bollinger["upper"]
+            informative['%-' + coin + "bb_width-period_" + str(t)] = (
+                informative[coin + "bb_upperband-period_" + str(t)] -
+                informative[coin + "bb_lowerband-period_" + str(t)]
+            ) / informative[coin + "bb_middleband-period_" + str(t)]
+            informative['%-' + coin + "close-bb_lower-period_" + str(t)] = (
+                informative["close"] / informative[coin + "bb_lowerband-period_" + str(t)]
+            )
 
-        informative['%-' + coin + "roc"] = ta.ROC(informative, timeperiod=3)
-        informative['%-' + coin + "adx"] = ta.ADX(informative, window=14)
+            informative['%-' + coin + "roc-period_" + str(t)] = ta.ROC(informative, timeperiod=t)
+            informative['%-' + coin + "adx-period_" + str(t)] = ta.ADX(informative, window=t)
 
-        macd = ta.MACD(informative)
-        informative['%-' + coin + "macd"] = macd["macd"]
-        informative[coin + "pct-change"] = informative["close"].pct_change()
-        informative['%-' + coin + "relative_volume"] = (
-            informative["volume"] / informative["volume"].rolling(10).mean()
-        )
+            macd = ta.MACD(informative, timeperiod=t)
+            informative['%-' + coin + "macd-period_" + str(t)] = macd["macd"]
 
-        informative[coin + "pct-change"] = informative["close"].pct_change()
+            informative['%-' + coin + "relative_volume-period_" + str(t)] = (
+                informative["volume"] / informative["volume"].rolling(t).mean()
+            )
+
+        informative['%-' + coin + "pct-change"] = informative["close"].pct_change()
+        informative['%-' + coin + "raw_volume"] = informative["volume"]
+        informative['%-' + coin + 'raw_price'] = informative['close']
 
-        # The following code automatically adds features according to the `shift` parameter passed
-        # in the config. Do not remove
         indicators = [col for col in informative if col.startswith('%')]
+        # This loop duplicates and shifts all indicators to add a sense of recency to data
         for n in range(self.freqai_info["feature_parameters"]["shift"] + 1):
             if n == 0:
                 continue
@@ -141,15 +144,12 @@ class FreqaiExampleStrategy(IStrategy):
             informative_shift = informative_shift.add_suffix("_shift-" + str(n))
             informative = pd.concat((informative, informative_shift), axis=1)
 
-        # The following code safely merges into the base timeframe.
-        # Do not remove.
         df = merge_informative_pair(df, informative, self.config["timeframe"], tf, ffill=True)
         skip_columns = [(s + "_" + tf) for s in ["date", "open", "high", "low", "close", "volume"]]
         df = df.drop(columns=skip_columns)
 
-        # Add generalized indicators (not associated to any individual coin or timeframe) here
-        # because in live, it will call this function to populate
-        # indicators during training. Notice how we ensure not to add them multiple times
+        # Add generalized indicators here (because in live, it will call this function to populate
+        # indicators during training). Notice how we ensure not to add them multiple times
         if pair == metadata['pair'] and tf == self.timeframe:
             df['%-day_of_week'] = (df["date"].dt.dayofweek + 1) / 7
             df['%-hour_of_day'] = (df['date'].dt.hour + 1) / 25
@@ -314,10 +314,10 @@ class FreqaiExampleStrategy(IStrategy):
         last_candle = df.iloc[-1].squeeze()
 
         if side == 'long':
-            if last_candle['close'] > (last_candle['close'] * (1 + 0.0025)):
+            if rate > (last_candle['close'] * (1 + 0.0025)):
                 return False
         else:
-            if last_candle['close'] < (last_candle['close'] * (1 - 0.0025)):
+            if rate < (last_candle['close'] * (1 - 0.0025)):
                 return False
 
         return True