Merge pull request #7465 from freqtrade/aggregate-fixes-fai

Aggregate recent feature requests
2022-09-25 20:43:23 +02:00
parent adb5b98a3d 48e89e68b9
commit 8051235171
5 changed files with 53 additions and 29 deletions
@@ -109,8 +109,8 @@ Mandatory parameters are marked as **Required**, which means that they are requi
 | `indicator_max_period_candles` | **No longer used**. User must use the strategy set `startup_candle_count` which defines the maximum *period* used in `populate_any_indicators()` for indicator creation (timeframe independent). FreqAI uses this information in combination with the maximum timeframe to calculate how many data points it should download so that the first data point does not have a NaN <br> **Datatype:** positive integer.
 | `indicator_periods_candles` | Calculate indicators for `indicator_periods_candles` time periods and add them to the feature set. <br> **Datatype:** List of positive integers.
 | `stratify_training_data` | This value is used to indicate the grouping of the data. For example, 2 would set every 2nd data point into a separate dataset to be pulled from during training/testing. See details about how it works [here](#stratifying-the-data-for-training-and-testing-the-model) <br> **Datatype:** Positive integer.
-| `principal_component_analysis` | Automatically reduce the dimensionality of the data set using Principal Component Analysis. See details about how it works [here](#reducing-data-dimensionality-with-principal-component-analysis)
-| `plot_feature_importance` | Create an interactive feature importance plot for each model.<br> **Datatype:** Boolean.<br> **Datatype:** Boolean, defaults to `False`
+| `principal_component_analysis` | Automatically reduce the dimensionality of the data set using Principal Component Analysis. See details about how it works [here](#reducing-data-dimensionality-with-principal-component-analysis) <br> **Datatype:** Boolean, defaults to `false`.
+| `plot_feature_importances` | Create a feature importance plot for each model, showing the top and bottom `plot_feature_importances` features by importance. No plot is created when set to `0`. <br> **Datatype:** Integer, defaults to `0`.
 | `DI_threshold` | Activates the Dissimilarity Index for outlier detection when > 0. See details about how it works [here](#removing-outliers-with-the-dissimilarity-index). <br> **Datatype:** Positive float (typically < 1).
 | `use_SVM_to_remove_outliers` | Train a support vector machine to detect and remove outliers from the training data set, as well as from incoming data points. See details about how it works [here](#removing-outliers-using-a-support-vector-machine-svm). <br> **Datatype:** Boolean.
 | `svm_params` | All parameters available in Sklearn's `SGDOneClassSVM()`. See details about some select parameters [here](#removing-outliers-using-a-support-vector-machine-svm). <br> **Datatype:** Dictionary.
@@ -313,6 +313,7 @@ class FreqaiDataDrawer:
        """

        dk.find_features(dataframe)
+        dk.find_labels(dataframe)

        full_labels = dk.label_list + dk.unique_class_list

@@ -376,7 +377,27 @@ class FreqaiDataDrawer:
        if self.config.get("freqai", {}).get("purge_old_models", False):
            self.purge_old_models()

-    # Functions pulled back from FreqaiDataKitchen because they relied on DataDrawer
+    def save_metadata(self, dk: FreqaiDataKitchen) -> None:
+        """
+        Save only the metadata of a backtesting study, for users who prefer
+        not to save the model data. This saves a tremendous amount of disk
+        space for users generating huge studies.
+        This is only active when `save_backtest_models` is false (not the default).
+        """
+        if not dk.data_path.is_dir():
+            dk.data_path.mkdir(parents=True, exist_ok=True)
+
+        save_path = Path(dk.data_path)
+
+        dk.data["data_path"] = str(dk.data_path)
+        dk.data["model_filename"] = str(dk.model_filename)
+        dk.data["training_features_list"] = list(dk.data_dictionary["train_features"].columns)
+        dk.data["label_list"] = dk.label_list
+
+        with open(save_path / f"{dk.model_filename}_metadata.json", "w") as fp:
+            rapidjson.dump(dk.data, fp, default=self.np_encoder, number_mode=rapidjson.NM_NATIVE)
+
+        return

    def save_data(self, model: Any, coin: str, dk: FreqaiDataKitchen) -> None:
        """
@@ -831,7 +831,7 @@ class FreqaiDataKitchen:

        inlier_metric = pd.DataFrame(
            data=inliers.sum(axis=1) / no_prev_pts,
-            columns=['inlier_metric'],
+            columns=['%-inlier_metric'],
            index=compute_df.index
        )

@@ -881,11 +881,14 @@ class FreqaiDataKitchen:
        """
        column_names = dataframe.columns
        features = [c for c in column_names if "%" in c]
-        labels = [c for c in column_names if "&" in c]
        if not features:
            raise OperationalException("Could not find any features!")

        self.training_features_list = features
+
+    def find_labels(self, dataframe: DataFrame) -> None:
+        column_names = dataframe.columns
+        labels = [c for c in column_names if "&" in c]
        self.label_list = labels

    def check_if_pred_in_training_spaces(self) -> None:
@@ -1206,7 +1209,8 @@ class FreqaiDataKitchen:

    def get_unique_classes_from_labels(self, dataframe: DataFrame) -> None:

-        self.find_features(dataframe)
+        # self.find_features(dataframe)
+        self.find_labels(dataframe)

        for key in self.label_list:
            if dataframe[key].dtype == object:
@@ -92,6 +92,7 @@ class IFreqaiModel(ABC):
        self.begin_time_train: float = 0
        self.base_tf_seconds = timeframe_to_seconds(self.config['timeframe'])
        self.continual_learning = self.freqai_info.get('continual_learning', False)
+        self.plot_features = self.ft_params.get("plot_feature_importances", 0)

        self._threads: List[threading.Thread] = []
        self._stop_event = threading.Event()
@@ -278,22 +279,24 @@ class IFreqaiModel(ABC):
                append_df = dk.get_backtesting_prediction()
                dk.append_predictions(append_df)
            else:
-                if not self.model_exists(
-                    pair, dk, trained_timestamp=trained_timestamp_int
-                ):
+                if not self.model_exists(dk):
                    dk.find_features(dataframe_train)
+                    dk.find_labels(dataframe_train)
                    self.model = self.train(dataframe_train, pair, dk)
                    self.dd.pair_dict[pair]["trained_timestamp"] = int(
                        trained_timestamp.stopts)
-
+                    if self.plot_features:
+                        plot_feature_importance(self.model, pair, dk, self.plot_features)
                    if self.save_backtest_models:
                        logger.info('Saving backtest model to disk.')
                        self.dd.save_data(self.model, pair, dk)
+                    else:
+                        logger.info('Saving metadata to disk.')
+                        self.dd.save_metadata(dk)
                else:
                    self.model = self.dd.load_data(pair, dk)

-                self.check_if_feature_list_matches_strategy(dataframe_train, dk)
-
+                # self.check_if_feature_list_matches_strategy(dataframe_train, dk)
                pred_df, do_preds = self.predict(dataframe_backtest, dk)
                append_df = dk.get_predictions_to_append(pred_df, do_preds)
                dk.append_predictions(append_df)
@@ -372,8 +375,7 @@ class IFreqaiModel(ABC):
            self.dd.return_null_values_to_strategy(dataframe, dk)
            return dk

-        # ensure user is feeding the correct indicators to the model
-        self.check_if_feature_list_matches_strategy(dataframe, dk)
+        dk.find_labels(dataframe)

        self.build_strategy_return_arrays(dataframe, dk, metadata["pair"], trained_timestamp)

@@ -492,7 +494,7 @@ class IFreqaiModel(ABC):
        if ft_params.get(
            "principal_component_analysis", False
        ):
-            dk.pca_transform(self.dk.data_dictionary['prediction_features'])
+            dk.pca_transform(dk.data_dictionary['prediction_features'])

        if ft_params.get("use_SVM_to_remove_outliers", False):
            dk.use_SVM_to_remove_outliers(predict=True)
@@ -503,14 +505,10 @@ class IFreqaiModel(ABC):
        if ft_params.get("use_DBSCAN_to_remove_outliers", False):
            dk.use_DBSCAN_to_remove_outliers(predict=True)

-    def model_exists(
-        self,
-        pair: str,
-        dk: FreqaiDataKitchen,
-        trained_timestamp: int = None,
-        model_filename: str = "",
-        scanning: bool = False,
-    ) -> bool:
+        # ensure user is feeding the correct indicators to the model
+        self.check_if_feature_list_matches_strategy(dk.data_dictionary['prediction_features'], dk)
+
+    def model_exists(self, dk: FreqaiDataKitchen) -> bool:
        """
        Given a pair and path, check if a model already exists
        :param pair: pair e.g. BTC/USD
@@ -518,11 +516,11 @@ class IFreqaiModel(ABC):
        :return:
        :boolean: whether the model file exists or not.
        """
-        path_to_modelfile = Path(dk.data_path / f"{model_filename}_model.joblib")
+        path_to_modelfile = Path(dk.data_path / f"{dk.model_filename}_model.joblib")
        file_exists = path_to_modelfile.is_file()
-        if file_exists and not scanning:
+        if file_exists:
            logger.info("Found model at %s", dk.data_path / dk.model_filename)
-        elif not scanning:
+        else:
            logger.info("Could not find model at %s", dk.data_path / dk.model_filename)
        return file_exists

@@ -569,6 +567,7 @@ class IFreqaiModel(ABC):

        # find the features indicated by strategy and store in datakitchen
        dk.find_features(unfiltered_dataframe)
+        dk.find_labels(unfiltered_dataframe)

        model = self.train(unfiltered_dataframe, pair, dk)

@@ -576,8 +575,8 @@ class IFreqaiModel(ABC):
        dk.set_new_model_names(pair, new_trained_timerange)
        self.dd.save_data(model, pair, dk)

-        if self.freqai_info["feature_parameters"].get("plot_feature_importance", False):
-            plot_feature_importance(model, pair, dk)
+        if self.plot_features:
+            plot_feature_importance(model, pair, dk, self.plot_features)

        if self.freqai_info.get("purge_old_models", False):
            self.dd.purge_old_models()
@@ -170,7 +170,7 @@ def plot_feature_importance(model: Any, pair: str, dk: FreqaiDataKitchen,

        # Data preparation
        fi_df = pd.DataFrame({
-            "feature_names": np.array(dk.training_features_list),
+            "feature_names": np.array(dk.data_dictionary['train_features'].columns),
            "feature_importance": np.array(feature_importance)
        })
        fi_df_top = fi_df.nlargest(count_max, "feature_importance")[::-1]