diff --git a/freqtrade/freqai/data_drawer.py b/freqtrade/freqai/data_drawer.py index 0e9d2e605..b4e9d02ec 100644 --- a/freqtrade/freqai/data_drawer.py +++ b/freqtrade/freqai/data_drawer.py @@ -496,6 +496,10 @@ class FreqaiDataDrawer: save_path / f"{dk.model_filename}_trained_df.pkl" ) + dk.data_dictionary["train_features_no_transf"].to_pickle( + save_path / f"{dk.model_filename}_trained_df_no_transf.pkl" + ) + dk.data_dictionary["train_dates"].to_pickle( save_path / f"{dk.model_filename}_trained_dates_df.pkl" ) @@ -513,6 +517,8 @@ class FreqaiDataDrawer: if coin not in self.meta_data_dictionary: self.meta_data_dictionary[coin] = {} self.meta_data_dictionary[coin]["train_df"] = dk.data_dictionary["train_features"] + self.meta_data_dictionary[coin]["train_df_no_transf"] = \ + dk.data_dictionary["train_features_no_transf"] self.meta_data_dictionary[coin]["meta_data"] = dk.data self.save_drawer_to_disk() @@ -553,6 +559,8 @@ class FreqaiDataDrawer: if coin in self.meta_data_dictionary: dk.data = self.meta_data_dictionary[coin]["meta_data"] dk.data_dictionary["train_features"] = self.meta_data_dictionary[coin]["train_df"] + dk.data_dictionary["train_features_no_transf"] = \ + self.meta_data_dictionary[coin]["train_df_no_transf"] else: with open(dk.data_path / f"{dk.model_filename}_metadata.json", "r") as fp: dk.data = rapidjson.load(fp, number_mode=rapidjson.NM_NATIVE) diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py index e06709b2c..c0548d339 100644 --- a/freqtrade/freqai/data_kitchen.py +++ b/freqtrade/freqai/data_kitchen.py @@ -273,11 +273,15 @@ class FreqaiDataKitchen: test_labels: DataFrame, train_weights: Any, test_weights: Any, + train_df_no_transf: DataFrame = DataFrame(), + test_df_no_transf: DataFrame = DataFrame() ) -> Dict: self.data_dictionary = { "train_features": train_df, + "train_features_no_transf": train_df_no_transf, "test_features": test_df, + "test_features_no_transf": test_df_no_transf, "train_labels": train_labels, "test_labels": test_labels, "train_weights": train_weights, @@ -289,7 +293,7 @@ class FreqaiDataKitchen: def normalize_data(self, data_dictionary: Dict) -> Dict[Any, Any]: """ - Normalize all data in the data_dictionary according to the training dataset + Normalize all data in the data_dictionary according to the training dataset. :param data_dictionary: dictionary containing the cleaned and split training/test data/labels :returns: @@ -495,6 +499,9 @@ class FreqaiDataKitchen: """ from sklearn.decomposition import PCA # avoid importing if we dont need it + self.data_dictionary["train_features_no_transf"] = self.data_dictionary["train_features"] + self.data["training_features_list_no_transf"] = \ + self.data_dictionary["train_features"].columns pca = PCA(0.999) pca = pca.fit(self.data_dictionary["train_features"]) @@ -520,6 +527,7 @@ class FreqaiDataKitchen: self.training_features_list = self.data_dictionary["train_features"].columns if self.freqai_config.get('data_split_parameters', {}).get('test_size', 0.1) != 0: + self.data_dictionary["test_features_no_transf"] = self.data_dictionary["test_features"] test_components = pca.transform(self.data_dictionary["test_features"]) self.data_dictionary["test_features"] = pd.DataFrame( data=test_components, @@ -545,6 +553,8 @@ class FreqaiDataKitchen: Use an existing pca transform to transform data into components :param filtered_dataframe: DataFrame = the cleaned dataframe """ + self.data_dictionary["prediction_features_no_transf"] = \ + self.data_dictionary["prediction_features"] pca_components = self.pca.transform(filtered_dataframe) self.data_dictionary["prediction_features"] = pd.DataFrame( data=pca_components, @@ -559,7 +569,8 @@ class FreqaiDataKitchen: """ Compute distances between each training point and every other training point. This metric defines the neighborhood of trained data and is used - for prediction confidence in the Dissimilarity Index + for prediction confidence in the Dissimilarity Index. + Calculations are done on non-transformed (e.g., PCA) data. """ # logger.info("computing average mean distance for all training points") pairwise = pairwise_distances( @@ -586,7 +597,7 @@ class FreqaiDataKitchen: def use_SVM_to_remove_outliers(self, predict: bool) -> None: """ Build/inference a Support Vector Machine to detect outliers - in training data and prediction + in training data and prediction, before any transformation (e.g., PCA). :param predict: bool = If true, inference an existing SVM model, else construct one """ @@ -669,7 +680,7 @@ class FreqaiDataKitchen: def use_DBSCAN_to_remove_outliers(self, predict: bool, eps=None) -> None: """ - Use DBSCAN to cluster training data and remove "noisy" data (read outliers). + Use DBSCAN to cluster training data and remove outliers before transformation (e.g., PCA). User controls this via the config param `DBSCAN_outlier_pct` which indicates the pct of training data that they want to be considered outliers. :param predict: bool = If False (training), iterate to find the best hyper parameters @@ -682,7 +693,10 @@ class FreqaiDataKitchen: if predict: if not self.data['DBSCAN_eps']: return - train_ft_df = self.data_dictionary['train_features'] + if self.data_dictionary["train_features_no_transf"].empty: + train_ft_df = self.data_dictionary['train_features'] + else: + train_ft_df = self.data_dictionary["train_features_no_transf"] pred_ft_df = self.data_dictionary['prediction_features'] num_preds = len(pred_ft_df) df = pd.concat([train_ft_df, pred_ft_df], axis=0, ignore_index=True) @@ -773,7 +787,8 @@ class FreqaiDataKitchen: def compute_inlier_metric(self, set_='train') -> None: """ - Compute inlier metric from backwards distance distributions. + Compute inlier metric from backwards distance distributions before any transformation + (e.g., PCA). This metric defines how well features from a timepoint fit into previous timepoints. """ @@ -918,9 +933,13 @@ class FreqaiDataKitchen: and avoid making predictions on any points that are too far away from the training data set. """ + if self.data_dictionary["train_features_no_transf"].empty: + train_features = self.data_dictionary["train_features"] + else: + train_features = self.data_dictionary["train_features_no_transf"] distance = pairwise_distances( - self.data_dictionary["train_features"], + train_features, self.data_dictionary["prediction_features"], n_jobs=self.thread_count, ) diff --git a/freqtrade/freqai/freqai_interface.py b/freqtrade/freqai/freqai_interface.py index af158990b..2bbcc5a43 100644 --- a/freqtrade/freqai/freqai_interface.py +++ b/freqtrade/freqai/freqai_interface.py @@ -465,17 +465,9 @@ class IFreqaiModel(ABC): if self.freqai_info["data_split_parameters"]["test_size"] > 0: dk.compute_inlier_metric(set_='test') - if ft_params.get( - "principal_component_analysis", False - ): - dk.principal_component_analysis() - if ft_params.get("use_SVM_to_remove_outliers", False): dk.use_SVM_to_remove_outliers(predict=False) - if ft_params.get("DI_threshold", 0): - dk.data["avg_mean_dist"] = dk.compute_distances() - if ft_params.get("use_DBSCAN_to_remove_outliers", False): if dk.pair in self.dd.old_DBSCAN_eps: eps = self.dd.old_DBSCAN_eps[dk.pair] @@ -484,6 +476,14 @@ class IFreqaiModel(ABC): dk.use_DBSCAN_to_remove_outliers(predict=False, eps=eps) self.dd.old_DBSCAN_eps[dk.pair] = dk.data['DBSCAN_eps'] + if ft_params.get("DI_threshold", 0): + dk.data["avg_mean_dist"] = dk.compute_distances() + + if ft_params.get( + "principal_component_analysis", False + ): + dk.principal_component_analysis() + if self.freqai_info["feature_parameters"].get('noise_standard_deviation', 0): dk.add_noise_to_training_features() @@ -500,19 +500,19 @@ class IFreqaiModel(ABC): if ft_params.get('inlier_metric_window', 0): dk.compute_inlier_metric(set_='predict') - if ft_params.get( - "principal_component_analysis", False - ): - dk.pca_transform(dk.data_dictionary['prediction_features']) - if ft_params.get("use_SVM_to_remove_outliers", False): dk.use_SVM_to_remove_outliers(predict=True) + if ft_params.get("use_DBSCAN_to_remove_outliers", False): + dk.use_DBSCAN_to_remove_outliers(predict=True) + if ft_params.get("DI_threshold", 0): dk.check_if_pred_in_training_spaces() - if ft_params.get("use_DBSCAN_to_remove_outliers", False): - dk.use_DBSCAN_to_remove_outliers(predict=True) + if ft_params.get( + "principal_component_analysis", False + ): + dk.pca_transform(dk.data_dictionary['prediction_features']) def model_exists(self, dk: FreqaiDataKitchen) -> bool: """ diff --git a/tests/freqai/test_freqai_datakitchen.py b/tests/freqai/test_freqai_datakitchen.py index f1203877e..51399a362 100644 --- a/tests/freqai/test_freqai_datakitchen.py +++ b/tests/freqai/test_freqai_datakitchen.py @@ -157,5 +157,5 @@ def test_make_train_test_datasets(mocker, freqai_conf): data_dictionary = freqai.dk.make_train_test_datasets(features_filtered, labels_filtered) assert data_dictionary - assert len(data_dictionary) == 7 + assert len(data_dictionary) == 9 assert len(data_dictionary['train_features'].index) == 1916 diff --git a/tests/freqai/test_freqai_interface.py b/tests/freqai/test_freqai_interface.py index 2bc65d52e..e8ff3eb8d 100644 --- a/tests/freqai/test_freqai_interface.py +++ b/tests/freqai/test_freqai_interface.py @@ -338,7 +338,11 @@ def test_follow_mode(mocker, freqai_conf): freqai.dd.load_all_pair_histories(timerange, freqai.dk) df = strategy.dp.get_pair_dataframe('ADA/BTC', '5m') - + # import pytest + # pytest.set_trace() + freqai.dk.build_data_dictionary( + [], [], [], [], [], [] + ) freqai.start_live(df, metadata, strategy, freqai.dk) assert len(freqai.dk.return_dataframe.index) == 5702