Merge pull request #7465 from freqtrade/aggregate-fixes-fai

Aggregate recent feature requests
2022-09-25 20:43:23 +02:00
parent adb5b98a3d 48e89e68b9
commit 8051235171
5 changed files with 53 additions and 29 deletions
@@ -109,8 +109,8 @@ Mandatory parameters are marked as **Required**, which means that they are requi
 | `indicator_max_period_candles` | **No longer used**. Users must instead set `startup_candle_count` in the strategy, which defines the maximum *period* used in `populate_any_indicators()` for indicator creation (timeframe independent). FreqAI uses this information in combination with the maximum timeframe to calculate how many data points it should download so that the first data point does not have a NaN. <br> **Datatype:** Positive integer.
 | `indicator_periods_candles` | Calculate indicators for `indicator_periods_candles` time periods and add them to the feature set. <br> **Datatype:** List of positive integers.
 | `stratify_training_data` | This value is used to indicate the grouping of the data. For example, 2 would set every 2nd data point into a separate dataset to be pulled from during training/testing. See details about how it works [here](#stratifying-the-data-for-training-and-testing-the-model). <br> **Datatype:** Positive integer.
-| `principal_component_analysis` | Automatically reduce the dimensionality of the data set using Principal Component Analysis. See details about how it works [here](#reducing-data-dimensionality-with-principal-component-analysis)
+| `principal_component_analysis` | Automatically reduce the dimensionality of the data set using Principal Component Analysis. See details about how it works [here](#reducing-data-dimensionality-with-principal-component-analysis). <br> **Datatype:** Boolean, defaults to `false`.
-| `plot_feature_importance` | Create an interactive feature importance plot for each model.<br> **Datatype:** Boolean.<br> **Datatype:** Boolean, defaults to `False`
+| `plot_feature_importances` | Create a feature importance plot for each model, showing the `plot_feature_importances` most and least important features. Plotting is disabled when set to `0`. <br> **Datatype:** Integer, defaults to `0`.
 | `DI_threshold` | Activates the Dissimilarity Index for outlier detection when > 0. See details about how it works [here](#removing-outliers-with-the-dissimilarity-index). <br> **Datatype:** Positive float (typically < 1).
 | `use_SVM_to_remove_outliers` | Train a support vector machine to detect and remove outliers from the training data set, as well as from incoming data points. See details about how it works [here](#removing-outliers-using-a-support-vector-machine-svm). <br> **Datatype:** Boolean.
 | `svm_params` | All parameters available in Sklearn's `SGDOneClassSVM()`. See details about some select parameters [here](#removing-outliers-using-a-support-vector-machine-svm). <br> **Datatype:** Dictionary.
@@ -313,6 +313,7 @@ class FreqaiDataDrawer:
        """
        dk.find_features(dataframe)
        dk.find_labels(dataframe)
        full_labels = dk.label_list + dk.unique_class_list
@@ -376,7 +377,27 @@ class FreqaiDataDrawer:
        if self.config.get("freqai", {}).get("purge_old_models", False):
            self.purge_old_models()
-    # Functions pulled back from FreqaiDataKitchen because they relied on DataDrawer
+    def save_metadata(self, dk: FreqaiDataKitchen) -> None:
        """
        Saves only metadata for backtesting studies if user prefers
        not to save model data. This saves tremendous amounts of space
        for users generating huge studies.
        This is only active when `save_backtest_models`: false (not default)
        """
        if not dk.data_path.is_dir():
            dk.data_path.mkdir(parents=True, exist_ok=True)
        save_path = Path(dk.data_path)
        dk.data["data_path"] = str(dk.data_path)
        dk.data["model_filename"] = str(dk.model_filename)
        dk.data["training_features_list"] = list(dk.data_dictionary["train_features"].columns)
        dk.data["label_list"] = dk.label_list
        with open(save_path / f"{dk.model_filename}_metadata.json", "w") as fp:
            rapidjson.dump(dk.data, fp, default=self.np_encoder, number_mode=rapidjson.NM_NATIVE)
        return
    def save_data(self, model: Any, coin: str, dk: FreqaiDataKitchen) -> None:
        """
@@ -831,7 +831,7 @@ class FreqaiDataKitchen:
        inlier_metric = pd.DataFrame(
            data=inliers.sum(axis=1) / no_prev_pts,
-            columns=['inlier_metric'],
+            columns=['%-inlier_metric'],
            index=compute_df.index
        )
@@ -881,11 +881,14 @@ class FreqaiDataKitchen:
        """
        column_names = dataframe.columns
        features = [c for c in column_names if "%" in c]
        labels = [c for c in column_names if "&" in c]
        if not features:
            raise OperationalException("Could not find any features!")
        self.training_features_list = features
    def find_labels(self, dataframe: DataFrame) -> None:
        column_names = dataframe.columns
        labels = [c for c in column_names if "&" in c]
        self.label_list = labels
    def check_if_pred_in_training_spaces(self) -> None:
@@ -1206,7 +1209,8 @@ class FreqaiDataKitchen:
    def get_unique_classes_from_labels(self, dataframe: DataFrame) -> None:
-        self.find_features(dataframe)
+        # self.find_features(dataframe)
        self.find_labels(dataframe)
        for key in self.label_list:
            if dataframe[key].dtype == object:
@@ -92,6 +92,7 @@ class IFreqaiModel(ABC):
        self.begin_time_train: float = 0
        self.base_tf_seconds = timeframe_to_seconds(self.config['timeframe'])
        self.continual_learning = self.freqai_info.get('continual_learning', False)
        self.plot_features = self.ft_params.get("plot_feature_importances", 0)
        self._threads: List[threading.Thread] = []
        self._stop_event = threading.Event()
@@ -278,22 +279,24 @@ class IFreqaiModel(ABC):
                append_df = dk.get_backtesting_prediction()
                dk.append_predictions(append_df)
            else:
-                if not self.model_exists(
+                if not self.model_exists(dk):
                    pair, dk, trained_timestamp=trained_timestamp_int
                ):
                    dk.find_features(dataframe_train)
                    dk.find_labels(dataframe_train)
                    self.model = self.train(dataframe_train, pair, dk)
                    self.dd.pair_dict[pair]["trained_timestamp"] = int(
                        trained_timestamp.stopts)
-
+                    if self.plot_features:
                        plot_feature_importance(self.model, pair, dk, self.plot_features)
                    if self.save_backtest_models:
                        logger.info('Saving backtest model to disk.')
                        self.dd.save_data(self.model, pair, dk)
                    else:
                        logger.info('Saving metadata to disk.')
                        self.dd.save_metadata(dk)
                else:
                    self.model = self.dd.load_data(pair, dk)
-                self.check_if_feature_list_matches_strategy(dataframe_train, dk)
+                # self.check_if_feature_list_matches_strategy(dataframe_train, dk)
                pred_df, do_preds = self.predict(dataframe_backtest, dk)
                append_df = dk.get_predictions_to_append(pred_df, do_preds)
                dk.append_predictions(append_df)
@@ -372,8 +375,7 @@ class IFreqaiModel(ABC):
            self.dd.return_null_values_to_strategy(dataframe, dk)
            return dk
-        # ensure user is feeding the correct indicators to the model
+        dk.find_labels(dataframe)
        self.check_if_feature_list_matches_strategy(dataframe, dk)
        self.build_strategy_return_arrays(dataframe, dk, metadata["pair"], trained_timestamp)
@@ -492,7 +494,7 @@ class IFreqaiModel(ABC):
        if ft_params.get(
            "principal_component_analysis", False
        ):
-            dk.pca_transform(self.dk.data_dictionary['prediction_features'])
+            dk.pca_transform(dk.data_dictionary['prediction_features'])
        if ft_params.get("use_SVM_to_remove_outliers", False):
            dk.use_SVM_to_remove_outliers(predict=True)
@@ -503,14 +505,10 @@ class IFreqaiModel(ABC):
        if ft_params.get("use_DBSCAN_to_remove_outliers", False):
            dk.use_DBSCAN_to_remove_outliers(predict=True)
-    def model_exists(
+        # ensure user is feeding the correct indicators to the model
-        self,
+        self.check_if_feature_list_matches_strategy(dk.data_dictionary['prediction_features'], dk)
-        pair: str,
+
-        dk: FreqaiDataKitchen,
+    def model_exists(self, dk: FreqaiDataKitchen) -> bool:
        trained_timestamp: int = None,
        model_filename: str = "",
        scanning: bool = False,
    ) -> bool:
        """
        Given a pair and path, check if a model already exists
        :param pair: pair e.g. BTC/USD
@@ -518,11 +516,11 @@ class IFreqaiModel(ABC):
        :return:
        :boolean: whether the model file exists or not.
        """
-        path_to_modelfile = Path(dk.data_path / f"{model_filename}_model.joblib")
+        path_to_modelfile = Path(dk.data_path / f"{dk.model_filename}_model.joblib")
        file_exists = path_to_modelfile.is_file()
-        if file_exists and not scanning:
+        if file_exists:
            logger.info("Found model at %s", dk.data_path / dk.model_filename)
-        elif not scanning:
+        else:
            logger.info("Could not find model at %s", dk.data_path / dk.model_filename)
        return file_exists
@@ -569,6 +567,7 @@ class IFreqaiModel(ABC):
        # find the features indicated by strategy and store in datakitchen
        dk.find_features(unfiltered_dataframe)
        dk.find_labels(unfiltered_dataframe)
        model = self.train(unfiltered_dataframe, pair, dk)
@@ -576,8 +575,8 @@ class IFreqaiModel(ABC):
        dk.set_new_model_names(pair, new_trained_timerange)
        self.dd.save_data(model, pair, dk)
-        if self.freqai_info["feature_parameters"].get("plot_feature_importance", False):
+        if self.plot_features:
-            plot_feature_importance(model, pair, dk)
+            plot_feature_importance(model, pair, dk, self.plot_features)
        if self.freqai_info.get("purge_old_models", False):
            self.dd.purge_old_models()
@@ -170,7 +170,7 @@ def plot_feature_importance(model: Any, pair: str, dk: FreqaiDataKitchen,
        # Data preparation
        fi_df = pd.DataFrame({
-            "feature_names": np.array(dk.training_features_list),
+            "feature_names": np.array(dk.data_dictionary['train_features'].columns),
            "feature_importance": np.array(feature_importance)
        })
        fi_df_top = fi_df.nlargest(count_max, "feature_importance")[::-1]