From 873d2a5069b35ff5b5ff35368999f014590ba98d Mon Sep 17 00:00:00 2001 From: robcaulk Date: Sun, 25 Sep 2022 11:18:10 +0200 Subject: [PATCH] no model save backtest, plot features backtest, ensure inlier plays nice, doc --- docs/freqai.md | 2 +- freqtrade/freqai/data_drawer.py | 23 ++++++++++++++- freqtrade/freqai/data_kitchen.py | 10 +++++-- freqtrade/freqai/freqai_interface.py | 43 ++++++++++++++-------------- freqtrade/freqai/utils.py | 2 +- 5 files changed, 52 insertions(+), 28 deletions(-) diff --git a/docs/freqai.md b/docs/freqai.md index a03162b45..a186ce01a 100644 --- a/docs/freqai.md +++ b/docs/freqai.md @@ -110,7 +110,7 @@ Mandatory parameters are marked as **Required**, which means that they are requi | `indicator_periods_candles` | Calculate indicators for `indicator_periods_candles` time periods and add them to the feature set.
**Datatype:** List of positive integers. | `stratify_training_data` | This value is used to indicate the grouping of the data. For example, 2 would set every 2nd data point into a separate dataset to be pulled from during training/testing. See details about how it works [here](#stratifying-the-data-for-training-and-testing-the-model)
**Datatype:** Positive integer. | `principal_component_analysis` | Automatically reduce the dimensionality of the data set using Principal Component Analysis. See details about how it works [here](#reducing-data-dimensionality-with-principal-component-analysis) -| `plot_feature_importance` | Create an interactive feature importance plot for each model.
**Datatype:** Boolean.
**Datatype:** Boolean, defaults to `False` +| `plot_feature_importances` | Create a feature importance plot for each model for the top/bottom `plot_feature_importances` number of features.
**Datatype:** Integer, defaults to `0` | `DI_threshold` | Activates the Dissimilarity Index for outlier detection when > 0. See details about how it works [here](#removing-outliers-with-the-dissimilarity-index).
**Datatype:** Positive float (typically < 1). | `use_SVM_to_remove_outliers` | Train a support vector machine to detect and remove outliers from the training data set, as well as from incoming data points. See details about how it works [here](#removing-outliers-using-a-support-vector-machine-svm).
**Datatype:** Boolean. | `svm_params` | All parameters available in Sklearn's `SGDOneClassSVM()`. See details about some select parameters [here](#removing-outliers-using-a-support-vector-machine-svm).
**Datatype:** Dictionary. diff --git a/freqtrade/freqai/data_drawer.py b/freqtrade/freqai/data_drawer.py index 7f4459fa5..e6a39b6e7 100644 --- a/freqtrade/freqai/data_drawer.py +++ b/freqtrade/freqai/data_drawer.py @@ -313,6 +313,7 @@ class FreqaiDataDrawer: """ dk.find_features(dataframe) + dk.find_labels(dataframe) full_labels = dk.label_list + dk.unique_class_list @@ -376,7 +377,27 @@ class FreqaiDataDrawer: if self.config.get("freqai", {}).get("purge_old_models", False): self.purge_old_models() - # Functions pulled back from FreqaiDataKitchen because they relied on DataDrawer + def save_metaddata(self, dk: FreqaiDataKitchen) -> None: + """ + Saves only metadata for backtesting studies if user prefers + not to save model data. This saves tremendous amounts of space + for users generating huge studies. + This is only active when `save_backtest_models`: false (not default) + """ + if not dk.data_path.is_dir(): + dk.data_path.mkdir(parents=True, exist_ok=True) + + save_path = Path(dk.data_path) + + dk.data["data_path"] = str(dk.data_path) + dk.data["model_filename"] = str(dk.model_filename) + dk.data["training_features_list"] = list(dk.data_dictionary["train_features"].columns) + dk.data["label_list"] = dk.label_list + + with open(save_path / f"{dk.model_filename}_metadata.json", "w") as fp: + rapidjson.dump(dk.data, fp, default=self.np_encoder, number_mode=rapidjson.NM_NATIVE) + + return def save_data(self, model: Any, coin: str, dk: FreqaiDataKitchen) -> None: """ diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py index 752cd0e45..f4fa4e5fd 100644 --- a/freqtrade/freqai/data_kitchen.py +++ b/freqtrade/freqai/data_kitchen.py @@ -831,7 +831,7 @@ class FreqaiDataKitchen: inlier_metric = pd.DataFrame( data=inliers.sum(axis=1) / no_prev_pts, - columns=['inlier_metric'], + columns=['%-inlier_metric'], index=compute_df.index ) @@ -881,11 +881,14 @@ class FreqaiDataKitchen: """ column_names = dataframe.columns features = [c for c in 
column_names if "%" in c] - labels = [c for c in column_names if "&" in c] if not features: raise OperationalException("Could not find any features!") self.training_features_list = features + + def find_labels(self, dataframe: DataFrame) -> None: + column_names = dataframe.columns + labels = [c for c in column_names if "&" in c] self.label_list = labels def check_if_pred_in_training_spaces(self) -> None: @@ -1206,7 +1209,8 @@ class FreqaiDataKitchen: def get_unique_classes_from_labels(self, dataframe: DataFrame) -> None: - self.find_features(dataframe) + # self.find_features(dataframe) + self.find_labels(dataframe) for key in self.label_list: if dataframe[key].dtype == object: diff --git a/freqtrade/freqai/freqai_interface.py b/freqtrade/freqai/freqai_interface.py index e0a45fb38..988aae4f5 100644 --- a/freqtrade/freqai/freqai_interface.py +++ b/freqtrade/freqai/freqai_interface.py @@ -92,6 +92,7 @@ class IFreqaiModel(ABC): self.begin_time_train: float = 0 self.base_tf_seconds = timeframe_to_seconds(self.config['timeframe']) self.continual_learning = self.freqai_info.get('continual_learning', False) + self.plot_features = self.ft_params.get("plot_feature_importances", 0) self._threads: List[threading.Thread] = [] self._stop_event = threading.Event() @@ -278,22 +279,24 @@ class IFreqaiModel(ABC): append_df = dk.get_backtesting_prediction() dk.append_predictions(append_df) else: - if not self.model_exists( - pair, dk, trained_timestamp=trained_timestamp_int - ): + if not self.model_exists(dk): dk.find_features(dataframe_train) + dk.find_labels(dataframe_train) self.model = self.train(dataframe_train, pair, dk) self.dd.pair_dict[pair]["trained_timestamp"] = int( trained_timestamp.stopts) - + if self.plot_features: + plot_feature_importance(self.model, pair, dk, self.plot_features) if self.save_backtest_models: logger.info('Saving backtest model to disk.') self.dd.save_data(self.model, pair, dk) + else: + logger.info('Saving metadata to disk.') + 
self.dd.save_metaddata(dk) else: self.model = self.dd.load_data(pair, dk) - self.check_if_feature_list_matches_strategy(dataframe_train, dk) - + # self.check_if_feature_list_matches_strategy(dataframe_train, dk) pred_df, do_preds = self.predict(dataframe_backtest, dk) append_df = dk.get_predictions_to_append(pred_df, do_preds) dk.append_predictions(append_df) @@ -372,8 +375,7 @@ class IFreqaiModel(ABC): self.dd.return_null_values_to_strategy(dataframe, dk) return dk - # ensure user is feeding the correct indicators to the model - self.check_if_feature_list_matches_strategy(dataframe, dk) + dk.find_labels(dataframe) self.build_strategy_return_arrays(dataframe, dk, metadata["pair"], trained_timestamp) @@ -492,7 +494,7 @@ class IFreqaiModel(ABC): if ft_params.get( "principal_component_analysis", False ): - dk.pca_transform(self.dk.data_dictionary['prediction_features']) + dk.pca_transform(dk.data_dictionary['prediction_features']) if ft_params.get("use_SVM_to_remove_outliers", False): dk.use_SVM_to_remove_outliers(predict=True) @@ -503,14 +505,10 @@ class IFreqaiModel(ABC): if ft_params.get("use_DBSCAN_to_remove_outliers", False): dk.use_DBSCAN_to_remove_outliers(predict=True) - def model_exists( - self, - pair: str, - dk: FreqaiDataKitchen, - trained_timestamp: int = None, - model_filename: str = "", - scanning: bool = False, - ) -> bool: + # ensure user is feeding the correct indicators to the model + self.check_if_feature_list_matches_strategy(dk.data_dictionary['prediction_features'], dk) + + def model_exists(self, dk: FreqaiDataKitchen) -> bool: """ Given a pair and path, check if a model already exists :param pair: pair e.g. BTC/USD @@ -518,11 +516,11 @@ class IFreqaiModel(ABC): :return: :boolean: whether the model file exists or not. 
""" - path_to_modelfile = Path(dk.data_path / f"{model_filename}_model.joblib") + path_to_modelfile = Path(dk.data_path / f"{dk.model_filename}_model.joblib") file_exists = path_to_modelfile.is_file() - if file_exists and not scanning: + if file_exists: logger.info("Found model at %s", dk.data_path / dk.model_filename) - elif not scanning: + else: logger.info("Could not find model at %s", dk.data_path / dk.model_filename) return file_exists @@ -569,6 +567,7 @@ class IFreqaiModel(ABC): # find the features indicated by strategy and store in datakitchen dk.find_features(unfiltered_dataframe) + dk.find_labels(unfiltered_dataframe) model = self.train(unfiltered_dataframe, pair, dk) @@ -576,8 +575,8 @@ class IFreqaiModel(ABC): dk.set_new_model_names(pair, new_trained_timerange) self.dd.save_data(model, pair, dk) - if self.freqai_info["feature_parameters"].get("plot_feature_importance", False): - plot_feature_importance(model, pair, dk) + if self.plot_features: + plot_feature_importance(model, pair, dk, self.plot_features) if self.freqai_info.get("purge_old_models", False): self.dd.purge_old_models() diff --git a/freqtrade/freqai/utils.py b/freqtrade/freqai/utils.py index f6358925c..22bc1e06e 100644 --- a/freqtrade/freqai/utils.py +++ b/freqtrade/freqai/utils.py @@ -170,7 +170,7 @@ def plot_feature_importance(model: Any, pair: str, dk: FreqaiDataKitchen, # Data preparation fi_df = pd.DataFrame({ - "feature_names": np.array(dk.training_features_list), + "feature_names": np.array(dk.data_dictionary['train_features'].columns), "feature_importance": np.array(feature_importance) }) fi_df_top = fi_df.nlargest(count_max, "feature_importance")[::-1]