From 873d2a5069b35ff5b5ff35368999f014590ba98d Mon Sep 17 00:00:00 2001
From: robcaulk <rob.caulk@gmail.com>
Date: Sun, 25 Sep 2022 11:18:10 +0200
Subject: [PATCH 1/2] no model save backtest, plot features backtest, ensure
 inlier plays nice, doc

---
 docs/freqai.md                       |  2 +-
 freqtrade/freqai/data_drawer.py      | 23 ++++++++++++++-
 freqtrade/freqai/data_kitchen.py     | 10 +++++--
 freqtrade/freqai/freqai_interface.py | 43 ++++++++++++++--------------
 freqtrade/freqai/utils.py            |  2 +-
 5 files changed, 52 insertions(+), 28 deletions(-)
diff --git a/docs/freqai.md b/docs/freqai.md
index a03162b45..a186ce01a 100644
--- a/docs/freqai.md
+++ b/docs/freqai.md
@@ -110,7 +110,7 @@ Mandatory parameters are marked as **Required**, which means that they are requi
 | `indicator_periods_candles` | Calculate indicators for `indicator_periods_candles` time periods and add them to the feature set. <br> **Datatype:** List of positive integers.
 | `stratify_training_data` | This value is used to indicate the grouping of the data. For example, 2 would set every 2nd data point into a separate dataset to be pulled from during training/testing. See details about how it works [here](#stratifying-the-data-for-training-and-testing-the-model) <br> **Datatype:** Positive integer.
 | `principal_component_analysis` | Automatically reduce the dimensionality of the data set using Principal Component Analysis. See details about how it works [here](#reducing-data-dimensionality-with-principal-component-analysis)
-| `plot_feature_importance` | Create an interactive feature importance plot for each model.<br> **Datatype:** Boolean.<br> **Datatype:** Boolean, defaults to `False`
+| `plot_feature_importances` | Create a feature importance plot for each model for the top/bottom `plot_feature_importances` number of features.<br> **Datatype:** Boolean.<br> **Datatype:** Boolean, defaults to `0`
 | `DI_threshold` | Activates the Dissimilarity Index for outlier detection when > 0. See details about how it works [here](#removing-outliers-with-the-dissimilarity-index). <br> **Datatype:** Positive float (typically < 1).
 | `use_SVM_to_remove_outliers` | Train a support vector machine to detect and remove outliers from the training data set, as well as from incoming data points. See details about how it works [here](#removing-outliers-using-a-support-vector-machine-svm). <br> **Datatype:** Boolean.
 | `svm_params` | All parameters available in Sklearn's `SGDOneClassSVM()`. See details about some select parameters [here](#removing-outliers-using-a-support-vector-machine-svm). <br> **Datatype:** Dictionary.
diff --git a/freqtrade/freqai/data_drawer.py b/freqtrade/freqai/data_drawer.py
index 7f4459fa5..e6a39b6e7 100644
--- a/freqtrade/freqai/data_drawer.py
+++ b/freqtrade/freqai/data_drawer.py
@@ -313,6 +313,7 @@ class FreqaiDataDrawer:
         """
 
         dk.find_features(dataframe)
+        dk.find_labels(dataframe)
 
         full_labels = dk.label_list + dk.unique_class_list
 
@@ -376,7 +377,27 @@ class FreqaiDataDrawer:
         if self.config.get("freqai", {}).get("purge_old_models", False):
             self.purge_old_models()
 
-    # Functions pulled back from FreqaiDataKitchen because they relied on DataDrawer
+    def save_metaddata(self, dk: FreqaiDataKitchen) -> None:
+        """
+        Saves only metadata for backtesting studies if user prefers
+        not to save model data. This saves tremendous amounts of space
+        for users generating huge studies.
+        This is only active when `save_backtest_models` is set to false (not the default).
+        """
+        if not dk.data_path.is_dir():
+            dk.data_path.mkdir(parents=True, exist_ok=True)
+
+        save_path = Path(dk.data_path)
+
+        dk.data["data_path"] = str(dk.data_path)
+        dk.data["model_filename"] = str(dk.model_filename)
+        dk.data["training_features_list"] = list(dk.data_dictionary["train_features"].columns)
+        dk.data["label_list"] = dk.label_list
+
+        with open(save_path / f"{dk.model_filename}_metadata.json", "w") as fp:
+            rapidjson.dump(dk.data, fp, default=self.np_encoder, number_mode=rapidjson.NM_NATIVE)
+
+        return
 
     def save_data(self, model: Any, coin: str, dk: FreqaiDataKitchen) -> None:
         """
diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py
index 752cd0e45..f4fa4e5fd 100644
--- a/freqtrade/freqai/data_kitchen.py
+++ b/freqtrade/freqai/data_kitchen.py
@@ -831,7 +831,7 @@ class FreqaiDataKitchen:
 
         inlier_metric = pd.DataFrame(
             data=inliers.sum(axis=1) / no_prev_pts,
-            columns=['inlier_metric'],
+            columns=['%-inlier_metric'],
             index=compute_df.index
         )
 
@@ -881,11 +881,14 @@ class FreqaiDataKitchen:
         """
         column_names = dataframe.columns
         features = [c for c in column_names if "%" in c]
-        labels = [c for c in column_names if "&" in c]
         if not features:
             raise OperationalException("Could not find any features!")
 
         self.training_features_list = features
+
+    def find_labels(self, dataframe: DataFrame) -> None:
+        column_names = dataframe.columns
+        labels = [c for c in column_names if "&" in c]
         self.label_list = labels
 
     def check_if_pred_in_training_spaces(self) -> None:
@@ -1206,7 +1209,8 @@ class FreqaiDataKitchen:
 
     def get_unique_classes_from_labels(self, dataframe: DataFrame) -> None:
 
-        self.find_features(dataframe)
+        # self.find_features(dataframe)
+        self.find_labels(dataframe)
 
         for key in self.label_list:
             if dataframe[key].dtype == object:
diff --git a/freqtrade/freqai/freqai_interface.py b/freqtrade/freqai/freqai_interface.py
index e0a45fb38..988aae4f5 100644
--- a/freqtrade/freqai/freqai_interface.py
+++ b/freqtrade/freqai/freqai_interface.py
@@ -92,6 +92,7 @@ class IFreqaiModel(ABC):
         self.begin_time_train: float = 0
         self.base_tf_seconds = timeframe_to_seconds(self.config['timeframe'])
         self.continual_learning = self.freqai_info.get('continual_learning', False)
+        self.plot_features = self.ft_params.get("plot_feature_importances", 0)
 
         self._threads: List[threading.Thread] = []
         self._stop_event = threading.Event()
@@ -278,22 +279,24 @@ class IFreqaiModel(ABC):
                 append_df = dk.get_backtesting_prediction()
                 dk.append_predictions(append_df)
             else:
-                if not self.model_exists(
-                    pair, dk, trained_timestamp=trained_timestamp_int
-                ):
+                if not self.model_exists(dk):
                     dk.find_features(dataframe_train)
+                    dk.find_labels(dataframe_train)
                     self.model = self.train(dataframe_train, pair, dk)
                     self.dd.pair_dict[pair]["trained_timestamp"] = int(
                         trained_timestamp.stopts)
-
+                    if self.plot_features:
+                        plot_feature_importance(self.model, pair, dk, self.plot_features)
                     if self.save_backtest_models:
                         logger.info('Saving backtest model to disk.')
                         self.dd.save_data(self.model, pair, dk)
+                    else:
+                        logger.info('Saving metadata to disk.')
+                        self.dd.save_metaddata(dk)
                 else:
                     self.model = self.dd.load_data(pair, dk)
 
-                self.check_if_feature_list_matches_strategy(dataframe_train, dk)
-
+                # self.check_if_feature_list_matches_strategy(dataframe_train, dk)
                 pred_df, do_preds = self.predict(dataframe_backtest, dk)
                 append_df = dk.get_predictions_to_append(pred_df, do_preds)
                 dk.append_predictions(append_df)
@@ -372,8 +375,7 @@ class IFreqaiModel(ABC):
             self.dd.return_null_values_to_strategy(dataframe, dk)
             return dk
 
-        # ensure user is feeding the correct indicators to the model
-        self.check_if_feature_list_matches_strategy(dataframe, dk)
+        dk.find_labels(dataframe)
 
         self.build_strategy_return_arrays(dataframe, dk, metadata["pair"], trained_timestamp)
 
@@ -492,7 +494,7 @@ class IFreqaiModel(ABC):
         if ft_params.get(
             "principal_component_analysis", False
         ):
-            dk.pca_transform(self.dk.data_dictionary['prediction_features'])
+            dk.pca_transform(dk.data_dictionary['prediction_features'])
 
         if ft_params.get("use_SVM_to_remove_outliers", False):
             dk.use_SVM_to_remove_outliers(predict=True)
@@ -503,14 +505,10 @@ class IFreqaiModel(ABC):
         if ft_params.get("use_DBSCAN_to_remove_outliers", False):
             dk.use_DBSCAN_to_remove_outliers(predict=True)
 
-    def model_exists(
-        self,
-        pair: str,
-        dk: FreqaiDataKitchen,
-        trained_timestamp: int = None,
-        model_filename: str = "",
-        scanning: bool = False,
-    ) -> bool:
+        # ensure user is feeding the correct indicators to the model
+        self.check_if_feature_list_matches_strategy(dk.data_dictionary['prediction_features'], dk)
+
+    def model_exists(self, dk: FreqaiDataKitchen) -> bool:
         """
         Given a pair and path, check if a model already exists
         :param pair: pair e.g. BTC/USD
@@ -518,11 +516,11 @@ class IFreqaiModel(ABC):
         :return:
         :boolean: whether the model file exists or not.
         """
-        path_to_modelfile = Path(dk.data_path / f"{model_filename}_model.joblib")
+        path_to_modelfile = Path(dk.data_path / f"{dk.model_filename}_model.joblib")
         file_exists = path_to_modelfile.is_file()
-        if file_exists and not scanning:
+        if file_exists:
             logger.info("Found model at %s", dk.data_path / dk.model_filename)
-        elif not scanning:
+        else:
             logger.info("Could not find model at %s", dk.data_path / dk.model_filename)
         return file_exists
 
@@ -569,6 +567,7 @@ class IFreqaiModel(ABC):
 
         # find the features indicated by strategy and store in datakitchen
         dk.find_features(unfiltered_dataframe)
+        dk.find_labels(unfiltered_dataframe)
 
         model = self.train(unfiltered_dataframe, pair, dk)
 
@@ -576,8 +575,8 @@ class IFreqaiModel(ABC):
         dk.set_new_model_names(pair, new_trained_timerange)
         self.dd.save_data(model, pair, dk)
 
-        if self.freqai_info["feature_parameters"].get("plot_feature_importance", False):
-            plot_feature_importance(model, pair, dk)
+        if self.plot_features:
+            plot_feature_importance(model, pair, dk, self.plot_features)
 
         if self.freqai_info.get("purge_old_models", False):
             self.dd.purge_old_models()
diff --git a/freqtrade/freqai/utils.py b/freqtrade/freqai/utils.py
index f6358925c..22bc1e06e 100644
--- a/freqtrade/freqai/utils.py
+++ b/freqtrade/freqai/utils.py
@@ -170,7 +170,7 @@ def plot_feature_importance(model: Any, pair: str, dk: FreqaiDataKitchen,
 
         # Data preparation
         fi_df = pd.DataFrame({
-            "feature_names": np.array(dk.training_features_list),
+            "feature_names": np.array(dk.data_dictionary['train_features'].columns),
             "feature_importance": np.array(feature_importance)
         })
         fi_df_top = fi_df.nlargest(count_max, "feature_importance")[::-1]

From 48e89e68b90d942dcbc8d42e7863378400430f1f Mon Sep 17 00:00:00 2001
From: robcaulk <rob.caulk@gmail.com>
Date: Sun, 25 Sep 2022 20:22:19 +0200
Subject: [PATCH 2/2] fix typos

---
 docs/freqai.md                       | 4 ++--
 freqtrade/freqai/data_drawer.py      | 2 +-
 freqtrade/freqai/freqai_interface.py | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/freqai.md b/docs/freqai.md
index a186ce01a..e840e7136 100644
--- a/docs/freqai.md
+++ b/docs/freqai.md
@@ -109,8 +109,8 @@ Mandatory parameters are marked as **Required**, which means that they are requi
 | `indicator_max_period_candles` | **No longer used**. User must use the strategy set `startup_candle_count` which defines the maximum *period* used in `populate_any_indicators()` for indicator creation (timeframe independent). FreqAI uses this information in combination with the maximum timeframe to calculate how many data points it should download so that the first data point does not have a NaN <br> **Datatype:** positive integer.
 | `indicator_periods_candles` | Calculate indicators for `indicator_periods_candles` time periods and add them to the feature set. <br> **Datatype:** List of positive integers.
 | `stratify_training_data` | This value is used to indicate the grouping of the data. For example, 2 would set every 2nd data point into a separate dataset to be pulled from during training/testing. See details about how it works [here](#stratifying-the-data-for-training-and-testing-the-model) <br> **Datatype:** Positive integer.
-| `principal_component_analysis` | Automatically reduce the dimensionality of the data set using Principal Component Analysis. See details about how it works [here](#reducing-data-dimensionality-with-principal-component-analysis)
-| `plot_feature_importances` | Create a feature importance plot for each model for the top/bottom `plot_feature_importances` number of features.<br> **Datatype:** Boolean.<br> **Datatype:** Boolean, defaults to `0`
+| `principal_component_analysis` | Automatically reduce the dimensionality of the data set using Principal Component Analysis. See details about how it works [here](#reducing-data-dimensionality-with-principal-component-analysis) <br> **Datatype:** Boolean, defaults to `false`.
+| `plot_feature_importances` | Create a feature importance plot for each model for the top/bottom `plot_feature_importances` number of features.<br> **Datatype:** Integer, defaults to `0`.
 | `DI_threshold` | Activates the Dissimilarity Index for outlier detection when > 0. See details about how it works [here](#removing-outliers-with-the-dissimilarity-index). <br> **Datatype:** Positive float (typically < 1).
 | `use_SVM_to_remove_outliers` | Train a support vector machine to detect and remove outliers from the training data set, as well as from incoming data points. See details about how it works [here](#removing-outliers-using-a-support-vector-machine-svm). <br> **Datatype:** Boolean.
 | `svm_params` | All parameters available in Sklearn's `SGDOneClassSVM()`. See details about some select parameters [here](#removing-outliers-using-a-support-vector-machine-svm). <br> **Datatype:** Dictionary.
diff --git a/freqtrade/freqai/data_drawer.py b/freqtrade/freqai/data_drawer.py
index e6a39b6e7..1839724f8 100644
--- a/freqtrade/freqai/data_drawer.py
+++ b/freqtrade/freqai/data_drawer.py
@@ -377,7 +377,7 @@ class FreqaiDataDrawer:
         if self.config.get("freqai", {}).get("purge_old_models", False):
             self.purge_old_models()
 
-    def save_metaddata(self, dk: FreqaiDataKitchen) -> None:
+    def save_metadata(self, dk: FreqaiDataKitchen) -> None:
         """
         Saves only metadata for backtesting studies if user prefers
         not to save model data. This saves tremendous amounts of space
diff --git a/freqtrade/freqai/freqai_interface.py b/freqtrade/freqai/freqai_interface.py
index 988aae4f5..d9f917338 100644
--- a/freqtrade/freqai/freqai_interface.py
+++ b/freqtrade/freqai/freqai_interface.py
@@ -292,7 +292,7 @@ class IFreqaiModel(ABC):
                         self.dd.save_data(self.model, pair, dk)
                     else:
                         logger.info('Saving metadata to disk.')
-                        self.dd.save_metaddata(dk)
+                        self.dd.save_metadata(dk)
                 else:
                     self.model = self.dd.load_data(pair, dk)