Merge pull request #7243 from lolongcovas/newbranch_test
Improve PCA and pairwise distance calcs
This commit is contained in:
		| @@ -454,7 +454,6 @@ class FreqaiDataKitchen: | |||||||
|         logger.info("reduced feature dimension by %s", n_components - n_keep_components) |         logger.info("reduced feature dimension by %s", n_components - n_keep_components) | ||||||
|         logger.info("explained variance %f", np.sum(pca2.explained_variance_ratio_)) |         logger.info("explained variance %f", np.sum(pca2.explained_variance_ratio_)) | ||||||
|         train_components = pca2.transform(self.data_dictionary["train_features"]) |         train_components = pca2.transform(self.data_dictionary["train_features"]) | ||||||
|         test_components = pca2.transform(self.data_dictionary["test_features"]) |  | ||||||
|  |  | ||||||
|         self.data_dictionary["train_features"] = pd.DataFrame( |         self.data_dictionary["train_features"] = pd.DataFrame( | ||||||
|             data=train_components, |             data=train_components, | ||||||
| @@ -468,6 +467,7 @@ class FreqaiDataKitchen: | |||||||
|         self.training_features_list = self.data_dictionary["train_features"].columns |         self.training_features_list = self.data_dictionary["train_features"].columns | ||||||
|  |  | ||||||
|         if self.freqai_config.get('data_split_parameters', {}).get('test_size', 0.1) != 0: |         if self.freqai_config.get('data_split_parameters', {}).get('test_size', 0.1) != 0: | ||||||
|  |             test_components = pca2.transform(self.data_dictionary["test_features"]) | ||||||
|             self.data_dictionary["test_features"] = pd.DataFrame( |             self.data_dictionary["test_features"] = pd.DataFrame( | ||||||
|                 data=test_components, |                 data=test_components, | ||||||
|                 columns=["PC" + str(i) for i in range(0, n_keep_components)], |                 columns=["PC" + str(i) for i in range(0, n_keep_components)], | ||||||
| @@ -506,7 +506,10 @@ class FreqaiDataKitchen: | |||||||
|         # logger.info("computing average mean distance for all training points") |         # logger.info("computing average mean distance for all training points") | ||||||
|         pairwise = pairwise_distances( |         pairwise = pairwise_distances( | ||||||
|             self.data_dictionary["train_features"], n_jobs=self.thread_count) |             self.data_dictionary["train_features"], n_jobs=self.thread_count) | ||||||
|         avg_mean_dist = pairwise.mean(axis=1).mean() |         # exclude the diagonal entries: each is a point's distance to itself (~0) | ||||||
|  |         np.fill_diagonal(pairwise, np.NaN) | ||||||
|  |         pairwise = pairwise.reshape(-1, 1) | ||||||
|  |         avg_mean_dist = pairwise[~np.isnan(pairwise)].mean() | ||||||
|  |  | ||||||
|         return avg_mean_dist |         return avg_mean_dist | ||||||
|  |  | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user