From 6193205012f681d233933fee50df66f33b63ddcc Mon Sep 17 00:00:00 2001
From: robcaulk <rob.caulk@gmail.com>
Date: Thu, 26 May 2022 21:07:50 +0200
Subject: [PATCH] fix bug for target_mean/std array merging in backtesting

---
 config_examples/config_freqai.example.json    |  19 ++-
 docs/freqai.md                                | 106 +++++++++++---
 freqtrade/freqai/data_kitchen.py              | 131 ++++++++++--------
 freqtrade/freqai/freqai_interface.py          |  23 +--
 .../CatboostPredictionModel.py                |  13 +-
 freqtrade/templates/FreqaiExampleStrategy.py  |   4 +-
 6 files changed, 186 insertions(+), 110 deletions(-)

diff --git a/config_examples/config_freqai.example.json b/config_examples/config_freqai.example.json
index 7582afef0..b6c7ba7d8 100644
--- a/config_examples/config_freqai.example.json
+++ b/config_examples/config_freqai.example.json
@@ -1,7 +1,7 @@
 {
     "max_open_trades": 1,
     "stake_currency": "USDT",
-    "stake_amount": 800,
+    "stake_amount": 900,
     "tradable_balance_ratio": 1,
     "fiat_display_currency": "USD",
     "dry_run": true,
@@ -24,8 +24,7 @@
             "rateLimit": 200
         },
         "pair_whitelist": [
-            "BTC/USDT",
-            "ETH/USDT"
+            "BTC/USDT"
         ],
         "pair_blacklist": []
     },
@@ -55,7 +54,7 @@
             "15m"
         ],
         "train_period": 30,
-        "backtest_period": 10,
+        "backtest_period": 7,
         "identifier": "example",
         "live_trained_timestamp": 0,
         "corr_pairlist": [
@@ -64,16 +63,16 @@
             "DOT/USDT"
         ],
         "feature_parameters": {
-            "period": 12,
+            "period": 24,
             "shift": 1,
-            "DI_threshold": 1,
-            "weight_factor": 0,
+            "DI_threshold": 0,
+            "weight_factor": 0.9,
             "principal_component_analysis": false,
-            "use_SVM_to_remove_outliers": false,
-            "stratify": 0
+            "use_SVM_to_remove_outliers": true,
+            "stratify": 3
         },
         "data_split_parameters": {
-            "test_size": 0.25,
+            "test_size": 0.33,
             "random_state": 1
         },
         "model_training_parameters": {
diff --git a/docs/freqai.md b/docs/freqai.md
index 403145525..821f42258 100644
--- a/docs/freqai.md
+++ b/docs/freqai.md
@@ -221,33 +221,43 @@ This way, the user can return to using any model they wish by simply changing th
 
 ### Building a freqai strategy
 
-The Freqai strategy requires the user to include the following lines of code in `populate_ any _indicators()`
+The Freqai strategy requires the user to include the following lines of code in the strategy:
 
 ```python
-        from freqtrade.freqai.strategy_bridge import CustomModel
+    from freqtrade.freqai.strategy_bridge import CustomModel
 
-        def populate_indicators(self, dataframe: DataFrame, metadata: dict) -> DataFrame:
-                # the configuration file parameters are stored here
-                self.freqai_info = self.config['freqai']
+    def informative_pairs(self):
+        whitelist_pairs = self.dp.current_whitelist()
+        corr_pairs = self.config["freqai"]["corr_pairlist"]
+        informative_pairs = []
+        for tf in self.config["freqai"]["timeframes"]:
+            for pair in whitelist_pairs:
+                informative_pairs.append((pair, tf))
+            for pair in corr_pairs:
+                if pair in whitelist_pairs:
+                    continue  # avoid duplication
+                informative_pairs.append((pair, tf))
+        return informative_pairs
 
-                # the model is instantiated here
-                self.model = CustomModel(self.config)
+    def bot_start(self):
+        self.model = CustomModel(self.config)
 
-                print('Populating indicators...')
+    def populate_indicators(self, dataframe: DataFrame, metadata: dict) -> DataFrame:
+            self.freqai_info = self.config['freqai']
 
-                # the following loops are necessary for building the features 
-                # indicated by the user in the configuration file.
-                for tf in self.freqai_info['timeframes']:
-                        for i in self.freqai_info['corr_pairlist']:
-                        dataframe = self.populate_any_indicators(i,
-                                        dataframe.copy(), tf, coin=i.split("/")[0]+'-')
+            # the following loops are necessary for building the features 
+            # indicated by the user in the configuration file.
+            for tf in self.freqai_info['timeframes']:
+                for i in self.freqai_info['corr_pairlist']:
+                    dataframe = self.populate_any_indicators(i,
+                                    dataframe.copy(), tf, coin=i.split("/")[0]+'-')
 
-                # the model will return 4 values, its prediction, an indication of whether or not the prediction 
-                # should be accepted, the target mean/std values from the labels used during each training period.
-                (dataframe['prediction'], dataframe['do_predict'], 
-                        dataframe['target_mean'], dataframe['target_std']) = self.model.bridge.start(dataframe, metadata)
+            # the model will return 4 values, its prediction, an indication of whether or not the prediction 
+            # should be accepted, the target mean/std values from the labels used during each training period.
+            (dataframe['prediction'], dataframe['do_predict'], 
+                    dataframe['target_mean'], dataframe['target_std']) = self.model.bridge.start(dataframe, metadata, self)
 
-                return dataframe
+            return dataframe
 ```
 
 The user should also include `populate_any_indicators()` from `templates/FreqaiExampleStrategy.py` which builds 
@@ -314,7 +324,7 @@ data point and all other training data points:
 
 $$ d_{ab} = \sqrt{\sum_{j=1}^p(X_{a,j}-X_{b,j})^2} $$
 
-where $d_{ab}$ is the distance between the standardized points $a$ and $b$. $p$
+where $d_{ab}$ is the distance between the normalized points $a$ and $b$. $p$
 is the number of features i.e. the length of the vector $X$. The
 characteristic distance, $\overline{d}$ for a set of training data points is simply the mean
 of the average distances:
@@ -392,13 +402,63 @@ The user can stratify the training/testing data using:
 
 which will split the data chronolocially so that every X data points is a testing data point. In the
 present example, the user is asking for every third data point in the dataframe to be used for 
-testing, the other points are used for training.
+testing, the other points are used for training. 
+
+<!-- ## Dynamic target expectation
+
+The labels used for model training have a unique statistical distribution for each separate model training. 
+We can use this information to know if our current prediction is in the realm of what the model was trained on, 
+and if so, what is the statistical probability of the current prediction. With this information, we can
+make more informed predictions.
+FreqAI builds this label distribution and provides a quantile to the strategy, which can be optionally used as a
+dynamic threshold. Setting `target_quantile: X` means that a fraction X of the labels lies below the returned value. So setting:
+
+```json
+    "freqai": {
+        "feature_parameters" : {
+            "target_quantile": 0.9
+        }
+    }
+```
+
+means the strategy receives the label threshold below which 90% of the labels fall.
+An example usage in the strategy may look something like:
+
+```python
+
+    def populate_indicators(self, dataframe: DataFrame, metadata: dict) -> DataFrame:
+
+        # ... #
+
+        (
+            dataframe["prediction"],
+            dataframe["do_predict"],
+            dataframe["target_upper_quantile"],
+            dataframe["target_lower_quantile"],
+        ) = self.model.bridge.start(dataframe, metadata, self)
+
+        return dataframe
+
+    def populate_buy_trend(self, dataframe: DataFrame, metadata: dict) -> DataFrame:
+
+        buy_conditions = [
+            (dataframe["prediction"] > dataframe["target_upper_quantile"]) & (dataframe["do_predict"] == 1)
+        ]
+
+        if buy_conditions:
+            dataframe.loc[reduce(lambda x, y: x | y, buy_conditions), "buy"] = 1
+
+        return dataframe
+
+``` -->
+
+
 
 ## Additional information
 
-### Feature standardization
+### Feature normalization
 
-The feature set created by the user is automatically standardized to the training
+The feature set created by the user is automatically normalized to the training
 data only. This includes all test data and unseen prediction data (dry/live/backtest).
 
 ### File structure
diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py
index eafb9cc46..b5f1f6edb 100644
--- a/freqtrade/freqai/data_kitchen.py
+++ b/freqtrade/freqai/data_kitchen.py
@@ -141,9 +141,9 @@ class FreqaiDataKitchen:
         :model: User trained model which can be inferenced for new predictions
         """
 
-        # if self.live:
-        self.model_filename = self.data_drawer.pair_dict[coin]['model_filename']
-        self.data_path = Path(self.data_drawer.pair_dict[coin]['data_path'])
+        if self.live:
+            self.model_filename = self.data_drawer.pair_dict[coin]['model_filename']
+            self.data_path = Path(self.data_drawer.pair_dict[coin]['data_path'])
 
         with open(self.data_path / str(self.model_filename + "_metadata.json"), "r") as fp:
             self.data = json.load(fp)
@@ -329,42 +329,6 @@ class FreqaiDataKitchen:
         :data_dictionary: updated dictionary with standardized values.
         """
         # standardize the data by training stats
-        train_mean = data_dictionary["train_features"].mean()
-        train_std = data_dictionary["train_features"].std()
-        data_dictionary["train_features"] = (
-            data_dictionary["train_features"] - train_mean
-        ) / train_std
-        data_dictionary["test_features"] = (
-            data_dictionary["test_features"] - train_mean
-        ) / train_std
-
-        train_labels_std = data_dictionary["train_labels"].std()
-        train_labels_mean = data_dictionary["train_labels"].mean()
-        data_dictionary["train_labels"] = (
-            data_dictionary["train_labels"] - train_labels_mean
-        ) / train_labels_std
-        data_dictionary["test_labels"] = (
-            data_dictionary["test_labels"] - train_labels_mean
-        ) / train_labels_std
-
-        for item in train_std.keys():
-            self.data[item + "_std"] = train_std[item]
-            self.data[item + "_mean"] = train_mean[item]
-
-        self.data["labels_std"] = train_labels_std
-        self.data["labels_mean"] = train_labels_mean
-
-        return data_dictionary
-
-    def standardize_data(self, data_dictionary: Dict) -> Dict[Any, Any]:
-        """
-        Standardize all data in the data_dictionary according to the training dataset
-        :params:
-        :data_dictionary: dictionary containing the cleaned and split training/test data/labels
-        :returns:
-        :data_dictionary: updated dictionary with standardized values.
-        """
-        # standardize the data by training stats
         train_max = data_dictionary["train_features"].max()
         train_min = data_dictionary["train_features"].min()
         data_dictionary["train_features"] = 2 * (
@@ -392,9 +356,9 @@ class FreqaiDataKitchen:
 
         return data_dictionary
 
-    def standardize_data_from_metadata(self, df: DataFrame) -> DataFrame:
+    def normalize_data_from_metadata(self, df: DataFrame) -> DataFrame:
         """
-        Standardizes a set of data using the mean and standard deviation from
+        Normalize a set of data using the mean and standard deviation from
         the associated training data.
         :params:
         :df: Dataframe to be standardized
@@ -406,19 +370,6 @@ class FreqaiDataKitchen:
 
         return df
 
-    def normalize_data_from_metadata(self, df: DataFrame) -> DataFrame:
-        """
-        Normalizes a set of data using the mean and standard deviation from
-        the associated training data.
-        :params:
-        :df: Dataframe to be standardized
-        """
-
-        for item in df.keys():
-            df[item] = (df[item] - self.data[item + "_mean"]) / self.data[item + "_std"]
-
-        return df
-
     def split_timerange(
         self, tr: str, train_split: int = 28, bt_split: int = 7
     ) -> Tuple[list, list]:
@@ -657,12 +608,12 @@ class FreqaiDataKitchen:
         """
 
         ones = np.ones(len_dataframe)
-        s_mean, s_std = ones * self.data["s_mean"], ones * self.data["s_std"]
+        target_mean, target_std = ones * self.data["target_mean"], ones * self.data["target_std"]
 
         self.full_predictions = np.append(self.full_predictions, predictions)
         self.full_do_predict = np.append(self.full_do_predict, do_predict)
-        self.full_target_mean = np.append(self.full_target_mean, s_mean)
-        self.full_target_std = np.append(self.full_target_std, s_std)
+        self.full_target_mean = np.append(self.full_target_mean, target_mean)
+        self.full_target_std = np.append(self.full_target_std, target_std)
 
         return
 
@@ -827,6 +778,23 @@ class FreqaiDataKitchen:
 
         return dataframe
 
+    def fit_labels(self) -> None:
+        """Fit a normal distribution to the training labels; store its mean/std in self.data."""
+        # import the submodule explicitly; a bare `import scipy` may not expose `scipy.stats`
+        from scipy import stats
+
+        f = stats.norm.fit(self.data_dictionary["train_labels"])
+
+        # KEEPME in case we want to let user start to grab quantiles.
+        # upper_q = stats.norm.ppf(self.freqai_config['feature_parameters'][
+        #                                                   'target_quantile'], *f)
+        # lower_q = stats.norm.ppf(1 - self.freqai_config['feature_parameters'][
+        #                                                       'target_quantile'], *f)
+        self.data["target_mean"], self.data["target_std"] = f[0], f[1]
+        # self.data["upper_quantile"] = upper_q
+        # self.data["lower_quantile"] = lower_q
+        return
+
     def np_encoder(self, object):
         if isinstance(object, np.generic):
             return object.item()
@@ -968,3 +936,52 @@ class FreqaiDataKitchen:
     #         )
 
     #     return
+
+    # def standardize_data(self, data_dictionary: Dict) -> Dict[Any, Any]:
+    #     """
+    #     standardize all data in the data_dictionary according to the training dataset
+    #     :params:
+    #     :data_dictionary: dictionary containing the cleaned and split training/test data/labels
+    #     :returns:
+    #     :data_dictionary: updated dictionary with standardized values.
+    #     """
+    #     # standardize the data by training stats
+    #     train_mean = data_dictionary["train_features"].mean()
+    #     train_std = data_dictionary["train_features"].std()
+    #     data_dictionary["train_features"] = (
+    #         data_dictionary["train_features"] - train_mean
+    #     ) / train_std
+    #     data_dictionary["test_features"] = (
+    #         data_dictionary["test_features"] - train_mean
+    #     ) / train_std
+
+    #     train_labels_std = data_dictionary["train_labels"].std()
+    #     train_labels_mean = data_dictionary["train_labels"].mean()
+    #     data_dictionary["train_labels"] = (
+    #         data_dictionary["train_labels"] - train_labels_mean
+    #     ) / train_labels_std
+    #     data_dictionary["test_labels"] = (
+    #         data_dictionary["test_labels"] - train_labels_mean
+    #     ) / train_labels_std
+
+    #     for item in train_std.keys():
+    #         self.data[item + "_std"] = train_std[item]
+    #         self.data[item + "_mean"] = train_mean[item]
+
+    #     self.data["labels_std"] = train_labels_std
+    #     self.data["labels_mean"] = train_labels_mean
+
+    #     return data_dictionary
+
+    # def standardize_data_from_metadata(self, df: DataFrame) -> DataFrame:
+    #     """
+    #     Standardizes a set of data using the mean and standard deviation from
+    #     the associated training data.
+    #     :params:
+    #     :df: Dataframe to be standardized
+    #     """
+
+    #     for item in df.keys():
+    #         df[item] = (df[item] - self.data[item + "_mean"]) / self.data[item + "_std"]
+
+    #     return df
diff --git a/freqtrade/freqai/freqai_interface.py b/freqtrade/freqai/freqai_interface.py
index d7bbc549a..68d21ecdc 100644
--- a/freqtrade/freqai/freqai_interface.py
+++ b/freqtrade/freqai/freqai_interface.py
@@ -158,12 +158,7 @@ class IFreqaiModel(ABC):
             else:
                 self.model = dh.load_data(metadata['pair'])
 
-                # strategy_provided_features = self.dh.find_features(dataframe_train)
-                # # FIXME doesnt work with PCA
-                # if strategy_provided_features != self.dh.training_features_list:
-                #     logger.info("User changed input features, retraining model.")
-                #     self.model = self.train(dataframe_train, metadata)
-                #     self.dh.save_data(self.model)
+            self.check_if_feature_list_matches_strategy(dataframe_train, dh)
 
             preds, do_preds = self.predict(dataframe_backtest, dh)
 
@@ -220,16 +215,23 @@ class IFreqaiModel(ABC):
 
         self.model = dh.load_data(coin=metadata['pair'])
 
-        # FIXME
-        # strategy_provided_features = dh.find_features(dataframe)
-        # if strategy_provided_features != dh.training_features_list:
-        #     self.train_model_in_series(new_trained_timerange, metadata, strategy)
+        self.check_if_feature_list_matches_strategy(dataframe, dh)
 
         preds, do_preds = self.predict(dataframe, dh)
         dh.append_predictions(preds, do_preds, len(dataframe))
 
         return dh
 
+    def check_if_feature_list_matches_strategy(self, dataframe: DataFrame,
+                                               dh: FreqaiDataKitchen) -> None:
+        """Raise OperationalException if the strategy features differ from the trained ones."""
+        strategy_provided_features = dh.find_features(dataframe)
+        if strategy_provided_features != dh.training_features_list:
+            raise OperationalException(
+                "Trying to access pretrained model with `identifier` but found different features "
+                "furnished by current strategy. Change `identifier` to train from scratch, or "
+                "ensure the strategy is furnishing the same features as the pretrained model")
+
     def data_cleaning_train(self, dh: FreqaiDataKitchen) -> None:
         """
         Base data cleaning method for train
@@ -237,6 +239,7 @@ class IFreqaiModel(ABC):
         based on user decided logic. See FreqaiDataKitchen::remove_outliers() for an example
         of how outlier data points are dropped from the dataframe used for training.
         """
+
         if self.freqai_info.get('feature_parameters', {}).get('principal_component_analysis'):
             dh.principal_component_analysis()
 
diff --git a/freqtrade/freqai/prediction_models/CatboostPredictionModel.py b/freqtrade/freqai/prediction_models/CatboostPredictionModel.py
index 73ea46032..3f70400d8 100644
--- a/freqtrade/freqai/prediction_models/CatboostPredictionModel.py
+++ b/freqtrade/freqai/prediction_models/CatboostPredictionModel.py
@@ -33,10 +33,6 @@ class CatboostPredictionModel(IFreqaiModel):
             / dataframe["close"]
             - 1
         )
-        dh.data["s_mean"] = dataframe["s"].mean()
-        dh.data["s_std"] = dataframe["s"].std()
-
-        # logger.info("label mean", dh.data["s_mean"], "label std", dh.data["s_std"])
 
         return dataframe["s"]
 
@@ -68,8 +64,9 @@ class CatboostPredictionModel(IFreqaiModel):
 
         # split data into train/test data.
         data_dictionary = dh.make_train_test_datasets(features_filtered, labels_filtered)
-        # standardize all data based on train_dataset only
-        data_dictionary = dh.standardize_data(data_dictionary)
+        dh.fit_labels()  # fit labels to a normal distribution so we know what to expect in strategy
+        # normalize all data based on train_dataset only
+        data_dictionary = dh.normalize_data(data_dictionary)
 
         # optional additional data cleaning/analysis
         self.data_cleaning_train(dh)
@@ -128,7 +125,7 @@ class CatboostPredictionModel(IFreqaiModel):
         filtered_dataframe, _ = dh.filter_features(
             unfiltered_dataframe, original_feature_list, training_filter=False
         )
-        filtered_dataframe = dh.standardize_data_from_metadata(filtered_dataframe)
+        filtered_dataframe = dh.normalize_data_from_metadata(filtered_dataframe)
         dh.data_dictionary["prediction_features"] = filtered_dataframe
 
         # optional additional data cleaning/analysis
@@ -136,7 +133,7 @@ class CatboostPredictionModel(IFreqaiModel):
 
         predictions = self.model.predict(dh.data_dictionary["prediction_features"])
 
-        # compute the non-standardized predictions
+        # compute the non-normalized predictions
         dh.predictions = (predictions + 1) * (dh.data["labels_max"] -
                                               dh.data["labels_min"]) / 2 + dh.data["labels_min"]
 
diff --git a/freqtrade/templates/FreqaiExampleStrategy.py b/freqtrade/templates/FreqaiExampleStrategy.py
index d2eb2c306..ed7c828cc 100644
--- a/freqtrade/templates/FreqaiExampleStrategy.py
+++ b/freqtrade/templates/FreqaiExampleStrategy.py
@@ -178,8 +178,8 @@ class FreqaiExampleStrategy(IStrategy):
             dataframe["target_std"],
         ) = self.model.bridge.start(dataframe, metadata, self)
 
-        dataframe["target_roi"] = dataframe["target_mean"] + dataframe["target_std"] * 1.5
-        dataframe["sell_roi"] = dataframe["target_mean"] - dataframe["target_std"] * 1
+        dataframe["target_roi"] = dataframe["target_mean"] + dataframe["target_std"]
+        dataframe["sell_roi"] = dataframe["target_mean"] - dataframe["target_std"]
         return dataframe
 
     def populate_buy_trend(self, dataframe: DataFrame, metadata: dict) -> DataFrame: