From 81f227cd9e82d321db4f0f70a54476576b246b8f Mon Sep 17 00:00:00 2001
From: robcaulk <rob.caulk@gmail.com>
Date: Mon, 9 May 2022 17:01:49 +0200
Subject: [PATCH] create more flexible whitelist, avoid duplicating whitelist
 features into corr_pairlist, update docs

---
 config_examples/config_freqai.example.json    |  5 +-
 docs/freqai.md                                | 82 ++++++++++++++++---
 freqtrade/freqai/data_kitchen.py              | 47 +++++++----
 freqtrade/freqai/freqai_interface.py          | 12 +--
 freqtrade/templates/ExamplePredictionModel.py |  7 +-
 freqtrade/templates/FreqaiExampleStrategy.py  |  5 +-
 6 files changed, 119 insertions(+), 39 deletions(-)

diff --git a/config_examples/config_freqai.example.json b/config_examples/config_freqai.example.json
index 351585d17..d89c835b1 100644
--- a/config_examples/config_freqai.example.json
+++ b/config_examples/config_freqai.example.json
@@ -24,7 +24,8 @@
             "rateLimit": 200
         },
         "pair_whitelist": [
-            "BTC/USDT"
+            "BTC/USDT",
+            "ETH/USDT"
         ],
         "pair_blacklist": []
     },
@@ -55,7 +56,7 @@
         ],
         "train_period": 30,
         "backtest_period": 7,
-        "identifier": "livetest5",
+        "identifier": "new_corrlist",
         "live_trained_timerange": "20220330-20220429",
         "live_full_backtestrange": "20220302-20220501",
         "base_features": [
diff --git a/docs/freqai.md b/docs/freqai.md
index 844881613..431705dd9 100644
--- a/docs/freqai.md
+++ b/docs/freqai.md
@@ -65,8 +65,6 @@ config setup includes:
                 "feature_parameters" : {
                         "period": 24,
                         "shift": 2,
-                        "drop_features": false,
-                        "DI_threshold": 1,
                         "weight_factor":  0,
                 },
                 "data_split_parameters" : {
@@ -79,8 +77,7 @@ config setup includes:
                     "learning_rate": 0.02,
                     "task_type": "CPU",
                 },
-        },
-
+        }
 ```
 
 ### Building the feature set
@@ -153,8 +150,6 @@ The Freqai strategy requires the user to include the following lines of code in
                 # the following loops are necessary for building the features 
                 # indicated by the user in the configuration file.
                 for tf in self.freqai_info['timeframes']:
-                        dataframe = self.populate_any_indicators(metadata['pair'],
-                                                                dataframe.copy(), tf)
                         for i in self.freqai_info['corr_pairlist']:
                         dataframe = self.populate_any_indicators(i,
                                         dataframe.copy(), tf, coin=i.split("/")[0]+'-')
@@ -177,8 +172,36 @@ and `make_labels()` to let them customize various aspects of their training proc
 
 ### Running the model live
 
-TODO: Freqai is not automated for live yet. 
+Freqai can be run dry/live using the following command:
 
+```bash
+freqtrade trade --strategy FreqaiExampleStrategy --config config_freqai.example.json --freqaimodel ExamplePredictionModel
+```
+
+By default, Freqai will not find any existing models and will start by training a new one
+given the user configuration settings. Following training, it will use that model to predict for the
+duration of `backtest_period`. After a full `backtest_period` has elapsed, Freqai will auto retrain 
+a new model, and begin making predictions with the updated model. 
+
+If the user wishes to start dry/live from a saved model, the following configuration 
+parameters need to be set:
+
+```json
+    "freqai": {
+        "identifier": "example",
+        "live_trained_timerange": "20220330-20220429",
+        "live_full_backtestrange": "20220302-20220501"
+    }
+```
+
+Where the `identifier` is the same identifier which was set during the backtesting/training. Meanwhile,
+the `live_trained_timerange` is the sub-trained timerange (the training window) which was set 
+during backtesting/training. These are available to the user inside `user_data/models/*/sub-train-*`. 
+`live_full_backtestrange` is the full data range associated with the backtest/training (the full time
+window that the training window and backtesting windows slide through). These values can be located 
+inside the `user_data/models/` directory. In this case, although Freqai will initiate with a 
+pretrained model, if a full `backtest_period` has elapsed since the end of the user set 
+`live_trained_timerange`, it will self retrain. 
 
 ## Data anylsis techniques
 ### Controlling the model learning process
@@ -226,12 +249,49 @@ $$ DI_k = d_k/\overline{d} $$
 Equity and crypto markets suffer from a high level of non-patterned noise in the
 form of outlier data points. The dissimilarity index allows predictions which
 are outliers and not existent in the model feature space, to be thrown out due
-to low levels of certainty. The user can tweak the DI with `DI_threshold` to increase
-or decrease the extrapolation of the trained model.
+to low levels of certainty. Activating the Dissimilarity Index can be achieved with:
+
+```json
+    "freqai": {
+        "feature_parameters" : {
+                "DI_threshold": 1
+        }
+    }
+```
+
+The user can tweak the DI with `DI_threshold` to increase or decrease the extrapolation of the 
+trained model.
 
 ### Reducing data dimensionality with Principal Component Analysis
 
-TO BE WRITTEN
+Users can reduce the dimensionality of their features by activating the `principal_component_analysis`:
+
+```json
+    "freqai": {
+        "feature_parameters" : {
+                "principal_component_analysis": true
+        }
+    }
+```
+
+This performs PCA on the features and reduces the dimensionality of the data so that the explained
+variance of the data set is >= 0.999.
+
+### Removing outliers based on feature statistical distributions
+
+The user can tell Freqai to remove outlier data points from the training/test data sets by setting:
+
+```json
+    "freqai": {
+        "feature_parameters" : {
+                "remove_outliers": true
+        }
+    }
+```
+
+Freqai will check the statistical distributions of each feature (or component if the user activated
+`principal_component_analysis`) and remove any data point that sits more than 3 standard deviations away 
+from the mean. 
 
 ## Additional information
 ### Feature standardization
@@ -242,5 +302,5 @@ data only. This includes all test data and unseen prediction data (dry/live/back
 ### File structure
 
 `user_data_dir/models/` contains all the data associated with the trainings and
-backtestings. This file structure is heavily controlled and read by the `DataHandler()`
+backtestings. This file structure is heavily controlled and read by the `FreqaiDataKitchen()`
 and should thus not be modified. 
diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py
index 7b6a65a59..961f26e57 100644
--- a/freqtrade/freqai/data_kitchen.py
+++ b/freqtrade/freqai/data_kitchen.py
@@ -485,7 +485,7 @@ class FreqaiDataKitchen:
 
         return
 
-    def build_feature_list(self, config: dict) -> list:
+    def build_feature_list(self, config: dict, metadata: dict) -> list:
         """
         Build the list of features that will be used to filter
         the full dataframe. Feature list is construced from the
@@ -501,8 +501,10 @@ class FreqaiDataKitchen:
                     shift = ""
                     if n > 0:
                         shift = "_shift-" + str(n)
-                    # features.append(ft + shift + "_" + tf)
+                    features.append(metadata['pair'].split("/")[0] + "-" + ft + shift + "_" + tf)
                     for p in config["freqai"]["corr_pairlist"]:
+                        if metadata['pair'] in p:
+                            continue  # avoid duplicate features
                         features.append(p.split("/")[0] + "-" + ft + shift + "_" + tf)
 
         # logger.info("number of features %s", len(features))
@@ -640,9 +642,10 @@ class FreqaiDataKitchen:
 
         exchange = ExchangeResolver.load_exchange(self.config['exchange']['name'],
                                                   self.config, validate=False)
-        pairs = self.freqai_config['corr_pairlist'] + [metadata['pair']]
+        pairs = list(self.freqai_config['corr_pairlist'])  # copy, never mutate the config list
+        if metadata['pair'] not in pairs:
+            pairs.append(metadata['pair'])  # `+=` with a str would add single characters
         timerange = TimeRange.parse_timerange(new_timerange)
-        # data_handler = get_datahandler(datadir, data_format)
 
         refresh_backtest_ohlcv_data(
                         exchange, pairs=pairs, timeframes=self.freqai_config['timeframes'],
@@ -656,33 +659,45 @@ class FreqaiDataKitchen:
     def load_pairs_histories(self, new_timerange: str, metadata: dict) -> Tuple[Dict[Any, Any],
-                                                                                DataFrame]:
+                                                                                Dict[Any, Any]]:
         corr_dataframes: Dict[Any, Any] = {}
-        # pair_dataframes: Dict[Any, Any] = {}
+        base_dataframes: Dict[Any, Any] = {}
         pairs = self.freqai_config['corr_pairlist']  # + [metadata['pair']]
         timerange = TimeRange.parse_timerange(new_timerange)
 
-        for p in pairs:
-            corr_dataframes[p] = {}
-            for tf in self.freqai_config['timeframes']:
+        for tf in self.freqai_config['timeframes']:
+            base_dataframes[tf] = load_pair_history(datadir=self.config['datadir'],
+                                                    timeframe=tf,
+                                                    pair=metadata['pair'], timerange=timerange)
+            for p in pairs:
+                if metadata['pair'] in p:
+                    continue  # dont repeat anything from whitelist
+                corr_dataframes.setdefault(p, {})  # keep timeframes loaded on earlier passes
                 corr_dataframes[p][tf] = load_pair_history(datadir=self.config['datadir'],
                                                            timeframe=tf,
                                                            pair=p, timerange=timerange)
 
-        base_dataframe = [dataframe for key, dataframe in corr_dataframes.items()
-                          if metadata['pair'] in key]
+        # corr_dataframes: {pair: {timeframe: history}} for the correlated pairs
+        # base_dataframes: {timeframe: history} for the whitelisted pair (metadata['pair'])
 
-        # [0] indexes the lowest tf for the basepair
-        return corr_dataframes, base_dataframe[0][self.config['timeframe']]
+        # both dictionaries are returned; consumers select the timeframe they need
+        return corr_dataframes, base_dataframes
 
-    def use_strategy_to_populate_indicators(self, strategy: IStrategy, metadata: dict,
+    def use_strategy_to_populate_indicators(self, strategy: IStrategy,
                                             corr_dataframes: dict,
-                                            dataframe: DataFrame) -> DataFrame:
+                                            base_dataframes: dict,
+                                            metadata: dict) -> DataFrame:
 
-        # dataframe = pair_dataframes[0]  # this is the base tf pair df
+        dataframe = base_dataframes[self.config['timeframe']]
 
         for tf in self.freqai_config["timeframes"]:
-            # dataframe = strategy.populate_any_indicators(metadata["pair"], dataframe.copy,
-            #                                              tf, pair_dataframes[tf])
+            dataframe = strategy.populate_any_indicators(metadata['pair'],
+                                                         dataframe.copy(),
+                                                         tf,
+                                                         base_dataframes[tf],
+                                                         coin=metadata['pair'].split("/")[0] + "-"
+                                                         )
             for i in self.freqai_config["corr_pairlist"]:
+                if metadata['pair'] in i:
+                    continue  # dont repeat anything from whitelist
                 dataframe = strategy.populate_any_indicators(i,
                                                              dataframe.copy(),
                                                              tf,
diff --git a/freqtrade/freqai/freqai_interface.py b/freqtrade/freqai/freqai_interface.py
index 222061e2a..e019eb842 100644
--- a/freqtrade/freqai/freqai_interface.py
+++ b/freqtrade/freqai/freqai_interface.py
@@ -93,7 +93,7 @@ class IFreqaiModel(ABC):
             else:
                 self.model = self.dh.load_data()
 
-            preds, do_preds = self.predict(dataframe_backtest)
+            preds, do_preds = self.predict(dataframe_backtest, metadata)
 
             self.dh.append_predictions(preds, do_preds, len(dataframe_backtest))
             print('predictions', len(self.dh.full_predictions),
@@ -120,13 +120,13 @@ class IFreqaiModel(ABC):
         if retrain or not file_exists:
             self.dh.download_new_data_for_retraining(new_trained_timerange, metadata)
             # dataframe = download-data
-            corr_dataframes, pair_dataframes = self.dh.load_pairs_histories(new_trained_timerange,
+            corr_dataframes, base_dataframes = self.dh.load_pairs_histories(new_trained_timerange,
                                                                             metadata)
 
             unfiltered_dataframe = self.dh.use_strategy_to_populate_indicators(strategy,
-                                                                               metadata,
                                                                                corr_dataframes,
-                                                                               pair_dataframes)
+                                                                               base_dataframes,
+                                                                               metadata)
 
             self.model = self.train(unfiltered_dataframe, metadata)
             self.dh.save_data(self.model)
@@ -134,7 +134,7 @@ class IFreqaiModel(ABC):
             self.freqai_info
 
         self.model = self.dh.load_data()
-        preds, do_preds = self.predict(dataframe)
+        preds, do_preds = self.predict(dataframe, metadata)
         self.dh.append_predictions(preds, do_preds, len(dataframe))
         # dataframe should have len 1 here
 
@@ -175,7 +175,7 @@ class IFreqaiModel(ABC):
         return
 
     @abstractmethod
-    def predict(self, dataframe: DataFrame) -> Tuple[npt.ArrayLike, npt.ArrayLike]:
+    def predict(self, dataframe: DataFrame, metadata: dict) -> Tuple[npt.ArrayLike, npt.ArrayLike]:
         """
         Filter the prediction features data and predict with it.
         :param: unfiltered_dataframe: Full dataframe for the current backtest period.
diff --git a/freqtrade/templates/ExamplePredictionModel.py b/freqtrade/templates/ExamplePredictionModel.py
index 08f9d2ba9..3db8d3aeb 100644
--- a/freqtrade/templates/ExamplePredictionModel.py
+++ b/freqtrade/templates/ExamplePredictionModel.py
@@ -53,7 +53,7 @@ class ExamplePredictionModel(IFreqaiModel):
         logger.info("--------------------Starting training--------------------")
 
         # create the full feature list based on user config info
-        self.dh.training_features_list = self.dh.build_feature_list(self.config)
+        self.dh.training_features_list = self.dh.build_feature_list(self.config, metadata)
         unfiltered_labels = self.make_labels(unfiltered_dataframe)
 
         # filter the features requested by user in the configuration file and elegantly handle NaNs
@@ -114,7 +114,8 @@ class ExamplePredictionModel(IFreqaiModel):
 
         return model
 
-    def predict(self, unfiltered_dataframe: DataFrame) -> Tuple[DataFrame, DataFrame]:
+    def predict(self, unfiltered_dataframe: DataFrame, metadata: dict) -> Tuple[DataFrame,
+                                                                                DataFrame]:
         """
         Filter the prediction features data and predict with it.
         :param: unfiltered_dataframe: Full dataframe for the current backtest period.
@@ -126,7 +127,7 @@ class ExamplePredictionModel(IFreqaiModel):
 
         # logger.info("--------------------Starting prediction--------------------")
 
-        original_feature_list = self.dh.build_feature_list(self.config)
+        original_feature_list = self.dh.build_feature_list(self.config, metadata)
         filtered_dataframe, _ = self.dh.filter_features(
             unfiltered_dataframe, original_feature_list, training_filter=False
         )
diff --git a/freqtrade/templates/FreqaiExampleStrategy.py b/freqtrade/templates/FreqaiExampleStrategy.py
index 13309d8c3..45526e2ac 100644
--- a/freqtrade/templates/FreqaiExampleStrategy.py
+++ b/freqtrade/templates/FreqaiExampleStrategy.py
@@ -142,8 +142,11 @@ class FreqaiExampleStrategy(IStrategy):
         # the following loops are necessary for building the features
         # indicated by the user in the configuration file.
         for tf in self.freqai_info["timeframes"]:
-            # dataframe = self.populate_any_indicators(metadata["pair"], dataframe.copy(), tf)
+            dataframe = self.populate_any_indicators(self.pair, dataframe.copy(), tf,
+                                                     coin=self.pair.split("/")[0] + "-")
             for pair in self.freqai_info["corr_pairlist"]:
+                if metadata['pair'] in pair:
+                    continue  # do not include whitelisted pair twice if it is in corr_pairlist
                 dataframe = self.populate_any_indicators(
                     pair, dataframe.copy(), tf, coin=pair.split("/")[0] + "-"
                 )