Reorder data cleaning
This commit is contained in:
		| @@ -496,6 +496,10 @@ class FreqaiDataDrawer: | |||||||
|             save_path / f"{dk.model_filename}_trained_df.pkl" |             save_path / f"{dk.model_filename}_trained_df.pkl" | ||||||
|         ) |         ) | ||||||
|  |  | ||||||
|  |         dk.data_dictionary["train_features_no_transf"].to_pickle( | ||||||
|  |             save_path / f"{dk.model_filename}_trained_df_no_transf.pkl" | ||||||
|  |         ) | ||||||
|  |  | ||||||
|         dk.data_dictionary["train_dates"].to_pickle( |         dk.data_dictionary["train_dates"].to_pickle( | ||||||
|             save_path / f"{dk.model_filename}_trained_dates_df.pkl" |             save_path / f"{dk.model_filename}_trained_dates_df.pkl" | ||||||
|         ) |         ) | ||||||
| @@ -513,6 +517,8 @@ class FreqaiDataDrawer: | |||||||
|         if coin not in self.meta_data_dictionary: |         if coin not in self.meta_data_dictionary: | ||||||
|             self.meta_data_dictionary[coin] = {} |             self.meta_data_dictionary[coin] = {} | ||||||
|         self.meta_data_dictionary[coin]["train_df"] = dk.data_dictionary["train_features"] |         self.meta_data_dictionary[coin]["train_df"] = dk.data_dictionary["train_features"] | ||||||
|  |         self.meta_data_dictionary[coin]["train_df_no_transf"] = \ | ||||||
|  |             dk.data_dictionary["train_features_no_transf"] | ||||||
|         self.meta_data_dictionary[coin]["meta_data"] = dk.data |         self.meta_data_dictionary[coin]["meta_data"] = dk.data | ||||||
|         self.save_drawer_to_disk() |         self.save_drawer_to_disk() | ||||||
|  |  | ||||||
| @@ -553,6 +559,8 @@ class FreqaiDataDrawer: | |||||||
|         if coin in self.meta_data_dictionary: |         if coin in self.meta_data_dictionary: | ||||||
|             dk.data = self.meta_data_dictionary[coin]["meta_data"] |             dk.data = self.meta_data_dictionary[coin]["meta_data"] | ||||||
|             dk.data_dictionary["train_features"] = self.meta_data_dictionary[coin]["train_df"] |             dk.data_dictionary["train_features"] = self.meta_data_dictionary[coin]["train_df"] | ||||||
|  |             dk.data_dictionary["train_features_no_transf"] = \ | ||||||
|  |                 self.meta_data_dictionary[coin]["train_df_no_transf"] | ||||||
|         else: |         else: | ||||||
|             with open(dk.data_path / f"{dk.model_filename}_metadata.json", "r") as fp: |             with open(dk.data_path / f"{dk.model_filename}_metadata.json", "r") as fp: | ||||||
|                 dk.data = rapidjson.load(fp, number_mode=rapidjson.NM_NATIVE) |                 dk.data = rapidjson.load(fp, number_mode=rapidjson.NM_NATIVE) | ||||||
|   | |||||||
| @@ -273,11 +273,15 @@ class FreqaiDataKitchen: | |||||||
|         test_labels: DataFrame, |         test_labels: DataFrame, | ||||||
|         train_weights: Any, |         train_weights: Any, | ||||||
|         test_weights: Any, |         test_weights: Any, | ||||||
|  |         train_df_no_transf: DataFrame = DataFrame(), | ||||||
|  |         test_df_no_transf: DataFrame = DataFrame() | ||||||
|     ) -> Dict: |     ) -> Dict: | ||||||
|  |  | ||||||
|         self.data_dictionary = { |         self.data_dictionary = { | ||||||
|             "train_features": train_df, |             "train_features": train_df, | ||||||
|  |             "train_features_no_transf": train_df_no_transf, | ||||||
|             "test_features": test_df, |             "test_features": test_df, | ||||||
|  |             "test_features_no_transf": test_df_no_transf, | ||||||
|             "train_labels": train_labels, |             "train_labels": train_labels, | ||||||
|             "test_labels": test_labels, |             "test_labels": test_labels, | ||||||
|             "train_weights": train_weights, |             "train_weights": train_weights, | ||||||
| @@ -289,7 +293,7 @@ class FreqaiDataKitchen: | |||||||
|  |  | ||||||
|     def normalize_data(self, data_dictionary: Dict) -> Dict[Any, Any]: |     def normalize_data(self, data_dictionary: Dict) -> Dict[Any, Any]: | ||||||
|         """ |         """ | ||||||
|         Normalize all data in the data_dictionary according to the training dataset |         Normalize all data in the data_dictionary according to the training dataset. | ||||||
|         :param data_dictionary: dictionary containing the cleaned and |         :param data_dictionary: dictionary containing the cleaned and | ||||||
|                                 split training/test data/labels |                                 split training/test data/labels | ||||||
|         :returns: |         :returns: | ||||||
| @@ -495,6 +499,9 @@ class FreqaiDataKitchen: | |||||||
|         """ |         """ | ||||||
|  |  | ||||||
|         from sklearn.decomposition import PCA  # avoid importing if we don't need it |         from sklearn.decomposition import PCA  # avoid importing if we don't need it | ||||||
|  |         self.data_dictionary["train_features_no_transf"] = self.data_dictionary["train_features"] | ||||||
|  |         self.data["training_features_list_no_transf"] = \ | ||||||
|  |             self.data_dictionary["train_features"].columns | ||||||
|  |  | ||||||
|         pca = PCA(0.999) |         pca = PCA(0.999) | ||||||
|         pca = pca.fit(self.data_dictionary["train_features"]) |         pca = pca.fit(self.data_dictionary["train_features"]) | ||||||
| @@ -520,6 +527,7 @@ class FreqaiDataKitchen: | |||||||
|         self.training_features_list = self.data_dictionary["train_features"].columns |         self.training_features_list = self.data_dictionary["train_features"].columns | ||||||
|  |  | ||||||
|         if self.freqai_config.get('data_split_parameters', {}).get('test_size', 0.1) != 0: |         if self.freqai_config.get('data_split_parameters', {}).get('test_size', 0.1) != 0: | ||||||
|  |             self.data_dictionary["test_features_no_transf"] = self.data_dictionary["test_features"] | ||||||
|             test_components = pca.transform(self.data_dictionary["test_features"]) |             test_components = pca.transform(self.data_dictionary["test_features"]) | ||||||
|             self.data_dictionary["test_features"] = pd.DataFrame( |             self.data_dictionary["test_features"] = pd.DataFrame( | ||||||
|                 data=test_components, |                 data=test_components, | ||||||
| @@ -545,6 +553,8 @@ class FreqaiDataKitchen: | |||||||
|         Use an existing pca transform to transform data into components |         Use an existing pca transform to transform data into components | ||||||
|         :param filtered_dataframe: DataFrame = the cleaned dataframe |         :param filtered_dataframe: DataFrame = the cleaned dataframe | ||||||
|         """ |         """ | ||||||
|  |         self.data_dictionary["prediction_features_no_transf"] = \ | ||||||
|  |             self.data_dictionary["prediction_features"] | ||||||
|         pca_components = self.pca.transform(filtered_dataframe) |         pca_components = self.pca.transform(filtered_dataframe) | ||||||
|         self.data_dictionary["prediction_features"] = pd.DataFrame( |         self.data_dictionary["prediction_features"] = pd.DataFrame( | ||||||
|             data=pca_components, |             data=pca_components, | ||||||
| @@ -559,7 +569,8 @@ class FreqaiDataKitchen: | |||||||
|         """ |         """ | ||||||
|         Compute distances between each training point and every other training |         Compute distances between each training point and every other training | ||||||
|         point. This metric defines the neighborhood of trained data and is used |         point. This metric defines the neighborhood of trained data and is used | ||||||
|         for prediction confidence in the Dissimilarity Index |         for prediction confidence in the Dissimilarity Index. | ||||||
|  |         Calculations are done on the data before any transformation (e.g., PCA) is applied. | ||||||
|         """ |         """ | ||||||
|         # logger.info("computing average mean distance for all training points") |         # logger.info("computing average mean distance for all training points") | ||||||
|         pairwise = pairwise_distances( |         pairwise = pairwise_distances( | ||||||
| @@ -586,7 +597,7 @@ class FreqaiDataKitchen: | |||||||
|     def use_SVM_to_remove_outliers(self, predict: bool) -> None: |     def use_SVM_to_remove_outliers(self, predict: bool) -> None: | ||||||
|         """ |         """ | ||||||
|         Build/inference a Support Vector Machine to detect outliers |         Build/inference a Support Vector Machine to detect outliers | ||||||
|         in training data and prediction |         in training data and prediction, before any transformation (e.g., PCA). | ||||||
|         :param predict: bool = If true, inference an existing SVM model, else construct one |         :param predict: bool = If true, inference an existing SVM model, else construct one | ||||||
|         """ |         """ | ||||||
|  |  | ||||||
| @@ -669,7 +680,7 @@ class FreqaiDataKitchen: | |||||||
|  |  | ||||||
|     def use_DBSCAN_to_remove_outliers(self, predict: bool, eps=None) -> None: |     def use_DBSCAN_to_remove_outliers(self, predict: bool, eps=None) -> None: | ||||||
|         """ |         """ | ||||||
|         Use DBSCAN to cluster training data and remove "noisy" data (read outliers). |         Use DBSCAN to cluster training data and remove outliers before transformation (e.g., PCA). | ||||||
|         User controls this via the config param `DBSCAN_outlier_pct` which indicates the |         User controls this via the config param `DBSCAN_outlier_pct` which indicates the | ||||||
|         pct of training data that they want to be considered outliers. |         pct of training data that they want to be considered outliers. | ||||||
|         :param predict: bool = If False (training), iterate to find the best hyper parameters |         :param predict: bool = If False (training), iterate to find the best hyper parameters | ||||||
| @@ -682,7 +693,10 @@ class FreqaiDataKitchen: | |||||||
|         if predict: |         if predict: | ||||||
|             if not self.data['DBSCAN_eps']: |             if not self.data['DBSCAN_eps']: | ||||||
|                 return |                 return | ||||||
|  |             if self.data_dictionary["train_features_no_transf"].empty: | ||||||
|                 train_ft_df = self.data_dictionary['train_features'] |                 train_ft_df = self.data_dictionary['train_features'] | ||||||
|  |             else: | ||||||
|  |                 train_ft_df = self.data_dictionary["train_features_no_transf"] | ||||||
|             pred_ft_df = self.data_dictionary['prediction_features'] |             pred_ft_df = self.data_dictionary['prediction_features'] | ||||||
|             num_preds = len(pred_ft_df) |             num_preds = len(pred_ft_df) | ||||||
|             df = pd.concat([train_ft_df, pred_ft_df], axis=0, ignore_index=True) |             df = pd.concat([train_ft_df, pred_ft_df], axis=0, ignore_index=True) | ||||||
| @@ -773,7 +787,8 @@ class FreqaiDataKitchen: | |||||||
|  |  | ||||||
|     def compute_inlier_metric(self, set_='train') -> None: |     def compute_inlier_metric(self, set_='train') -> None: | ||||||
|         """ |         """ | ||||||
|         Compute inlier metric from backwards distance distributions. |         Compute inlier metric from backwards distance distributions before any transformation | ||||||
|  |         (e.g., PCA). | ||||||
|         This metric defines how well features from a timepoint fit |         This metric defines how well features from a timepoint fit | ||||||
|         into previous timepoints. |         into previous timepoints. | ||||||
|         """ |         """ | ||||||
| @@ -918,9 +933,13 @@ class FreqaiDataKitchen: | |||||||
|         and avoid making predictions on any points that are too far away |         and avoid making predictions on any points that are too far away | ||||||
|         from the training data set. |         from the training data set. | ||||||
|         """ |         """ | ||||||
|  |         if self.data_dictionary["train_features_no_transf"].empty: | ||||||
|  |             train_features = self.data_dictionary["train_features"] | ||||||
|  |         else: | ||||||
|  |             train_features = self.data_dictionary["train_features_no_transf"] | ||||||
|  |  | ||||||
|         distance = pairwise_distances( |         distance = pairwise_distances( | ||||||
|             self.data_dictionary["train_features"], |             train_features, | ||||||
|             self.data_dictionary["prediction_features"], |             self.data_dictionary["prediction_features"], | ||||||
|             n_jobs=self.thread_count, |             n_jobs=self.thread_count, | ||||||
|         ) |         ) | ||||||
|   | |||||||
| @@ -465,17 +465,9 @@ class IFreqaiModel(ABC): | |||||||
|             if self.freqai_info["data_split_parameters"]["test_size"] > 0: |             if self.freqai_info["data_split_parameters"]["test_size"] > 0: | ||||||
|                 dk.compute_inlier_metric(set_='test') |                 dk.compute_inlier_metric(set_='test') | ||||||
|  |  | ||||||
|         if ft_params.get( |  | ||||||
|             "principal_component_analysis", False |  | ||||||
|         ): |  | ||||||
|             dk.principal_component_analysis() |  | ||||||
|  |  | ||||||
|         if ft_params.get("use_SVM_to_remove_outliers", False): |         if ft_params.get("use_SVM_to_remove_outliers", False): | ||||||
|             dk.use_SVM_to_remove_outliers(predict=False) |             dk.use_SVM_to_remove_outliers(predict=False) | ||||||
|  |  | ||||||
|         if ft_params.get("DI_threshold", 0): |  | ||||||
|             dk.data["avg_mean_dist"] = dk.compute_distances() |  | ||||||
|  |  | ||||||
|         if ft_params.get("use_DBSCAN_to_remove_outliers", False): |         if ft_params.get("use_DBSCAN_to_remove_outliers", False): | ||||||
|             if dk.pair in self.dd.old_DBSCAN_eps: |             if dk.pair in self.dd.old_DBSCAN_eps: | ||||||
|                 eps = self.dd.old_DBSCAN_eps[dk.pair] |                 eps = self.dd.old_DBSCAN_eps[dk.pair] | ||||||
| @@ -484,6 +476,14 @@ class IFreqaiModel(ABC): | |||||||
|             dk.use_DBSCAN_to_remove_outliers(predict=False, eps=eps) |             dk.use_DBSCAN_to_remove_outliers(predict=False, eps=eps) | ||||||
|             self.dd.old_DBSCAN_eps[dk.pair] = dk.data['DBSCAN_eps'] |             self.dd.old_DBSCAN_eps[dk.pair] = dk.data['DBSCAN_eps'] | ||||||
|  |  | ||||||
|  |         if ft_params.get("DI_threshold", 0): | ||||||
|  |             dk.data["avg_mean_dist"] = dk.compute_distances() | ||||||
|  |  | ||||||
|  |         if ft_params.get( | ||||||
|  |             "principal_component_analysis", False | ||||||
|  |         ): | ||||||
|  |             dk.principal_component_analysis() | ||||||
|  |  | ||||||
|         if self.freqai_info["feature_parameters"].get('noise_standard_deviation', 0): |         if self.freqai_info["feature_parameters"].get('noise_standard_deviation', 0): | ||||||
|             dk.add_noise_to_training_features() |             dk.add_noise_to_training_features() | ||||||
|  |  | ||||||
| @@ -500,19 +500,19 @@ class IFreqaiModel(ABC): | |||||||
|         if ft_params.get('inlier_metric_window', 0): |         if ft_params.get('inlier_metric_window', 0): | ||||||
|             dk.compute_inlier_metric(set_='predict') |             dk.compute_inlier_metric(set_='predict') | ||||||
|  |  | ||||||
|         if ft_params.get( |  | ||||||
|             "principal_component_analysis", False |  | ||||||
|         ): |  | ||||||
|             dk.pca_transform(dk.data_dictionary['prediction_features']) |  | ||||||
|  |  | ||||||
|         if ft_params.get("use_SVM_to_remove_outliers", False): |         if ft_params.get("use_SVM_to_remove_outliers", False): | ||||||
|             dk.use_SVM_to_remove_outliers(predict=True) |             dk.use_SVM_to_remove_outliers(predict=True) | ||||||
|  |  | ||||||
|  |         if ft_params.get("use_DBSCAN_to_remove_outliers", False): | ||||||
|  |             dk.use_DBSCAN_to_remove_outliers(predict=True) | ||||||
|  |  | ||||||
|         if ft_params.get("DI_threshold", 0): |         if ft_params.get("DI_threshold", 0): | ||||||
|             dk.check_if_pred_in_training_spaces() |             dk.check_if_pred_in_training_spaces() | ||||||
|  |  | ||||||
|         if ft_params.get("use_DBSCAN_to_remove_outliers", False): |         if ft_params.get( | ||||||
|             dk.use_DBSCAN_to_remove_outliers(predict=True) |             "principal_component_analysis", False | ||||||
|  |         ): | ||||||
|  |             dk.pca_transform(dk.data_dictionary['prediction_features']) | ||||||
|  |  | ||||||
|     def model_exists(self, dk: FreqaiDataKitchen) -> bool: |     def model_exists(self, dk: FreqaiDataKitchen) -> bool: | ||||||
|         """ |         """ | ||||||
|   | |||||||
| @@ -157,5 +157,5 @@ def test_make_train_test_datasets(mocker, freqai_conf): | |||||||
|     data_dictionary = freqai.dk.make_train_test_datasets(features_filtered, labels_filtered) |     data_dictionary = freqai.dk.make_train_test_datasets(features_filtered, labels_filtered) | ||||||
|  |  | ||||||
|     assert data_dictionary |     assert data_dictionary | ||||||
|     assert len(data_dictionary) == 7 |     assert len(data_dictionary) == 9 | ||||||
|     assert len(data_dictionary['train_features'].index) == 1916 |     assert len(data_dictionary['train_features'].index) == 1916 | ||||||
|   | |||||||
| @@ -338,7 +338,11 @@ def test_follow_mode(mocker, freqai_conf): | |||||||
|     freqai.dd.load_all_pair_histories(timerange, freqai.dk) |     freqai.dd.load_all_pair_histories(timerange, freqai.dk) | ||||||
|  |  | ||||||
|     df = strategy.dp.get_pair_dataframe('ADA/BTC', '5m') |     df = strategy.dp.get_pair_dataframe('ADA/BTC', '5m') | ||||||
|  |     # import pytest | ||||||
|  |     # pytest.set_trace() | ||||||
|  |     freqai.dk.build_data_dictionary( | ||||||
|  |                 [], [], [], [], [], [] | ||||||
|  |                 ) | ||||||
|     freqai.start_live(df, metadata, strategy, freqai.dk) |     freqai.start_live(df, metadata, strategy, freqai.dk) | ||||||
|  |  | ||||||
|     assert len(freqai.dk.return_dataframe.index) == 5702 |     assert len(freqai.dk.return_dataframe.index) == 5702 | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user