From 9c38c27eede56022a71f08a31bd8c880793db225 Mon Sep 17 00:00:00 2001 From: longyu Date: Wed, 17 Aug 2022 15:09:57 +0200 Subject: [PATCH] ignore each sample's self-distance in avg_mean_dist computation --- freqtrade/freqai/data_kitchen.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py index b49df5a4d..85041515a 100644 --- a/freqtrade/freqai/data_kitchen.py +++ b/freqtrade/freqai/data_kitchen.py @@ -454,7 +454,6 @@ class FreqaiDataKitchen: logger.info("reduced feature dimension by %s", n_components - n_keep_components) logger.info("explained variance %f", np.sum(pca2.explained_variance_ratio_)) train_components = pca2.transform(self.data_dictionary["train_features"]) - test_components = pca2.transform(self.data_dictionary["test_features"]) self.data_dictionary["train_features"] = pd.DataFrame( data=train_components, @@ -468,6 +467,7 @@ class FreqaiDataKitchen: self.training_features_list = self.data_dictionary["train_features"].columns if self.freqai_config.get('data_split_parameters', {}).get('test_size', 0.1) != 0: + test_components = pca2.transform(self.data_dictionary["test_features"]) self.data_dictionary["test_features"] = pd.DataFrame( data=test_components, columns=["PC" + str(i) for i in range(0, n_keep_components)], @@ -506,7 +506,10 @@ class FreqaiDataKitchen: # logger.info("computing average mean distance for all training points") pairwise = pairwise_distances( self.data_dictionary["train_features"], n_jobs=self.thread_count) - avg_mean_dist = pairwise.mean(axis=1).mean() + # remove the diagonal distances which are itself distances ~0 + np.fill_diagonal(pairwise, np.NaN) + pairwise = pairwise.reshape(-1, 1) + avg_mean_dist = pairwise[~np.isnan(pairwise)].mean() return avg_mean_dist