Ignore each sample's distance to itself in the avg_mean_dist computation
This commit is contained in:
parent
72c34291e3
commit
9c38c27eed
@ -454,7 +454,6 @@ class FreqaiDataKitchen:
|
|||||||
logger.info("reduced feature dimension by %s", n_components - n_keep_components)
|
logger.info("reduced feature dimension by %s", n_components - n_keep_components)
|
||||||
logger.info("explained variance %f", np.sum(pca2.explained_variance_ratio_))
|
logger.info("explained variance %f", np.sum(pca2.explained_variance_ratio_))
|
||||||
train_components = pca2.transform(self.data_dictionary["train_features"])
|
train_components = pca2.transform(self.data_dictionary["train_features"])
|
||||||
test_components = pca2.transform(self.data_dictionary["test_features"])
|
|
||||||
|
|
||||||
self.data_dictionary["train_features"] = pd.DataFrame(
|
self.data_dictionary["train_features"] = pd.DataFrame(
|
||||||
data=train_components,
|
data=train_components,
|
||||||
@ -468,6 +467,7 @@ class FreqaiDataKitchen:
|
|||||||
self.training_features_list = self.data_dictionary["train_features"].columns
|
self.training_features_list = self.data_dictionary["train_features"].columns
|
||||||
|
|
||||||
if self.freqai_config.get('data_split_parameters', {}).get('test_size', 0.1) != 0:
|
if self.freqai_config.get('data_split_parameters', {}).get('test_size', 0.1) != 0:
|
||||||
|
test_components = pca2.transform(self.data_dictionary["test_features"])
|
||||||
self.data_dictionary["test_features"] = pd.DataFrame(
|
self.data_dictionary["test_features"] = pd.DataFrame(
|
||||||
data=test_components,
|
data=test_components,
|
||||||
columns=["PC" + str(i) for i in range(0, n_keep_components)],
|
columns=["PC" + str(i) for i in range(0, n_keep_components)],
|
||||||
@ -506,7 +506,10 @@ class FreqaiDataKitchen:
|
|||||||
# logger.info("computing average mean distance for all training points")
|
# logger.info("computing average mean distance for all training points")
|
||||||
pairwise = pairwise_distances(
|
pairwise = pairwise_distances(
|
||||||
self.data_dictionary["train_features"], n_jobs=self.thread_count)
|
self.data_dictionary["train_features"], n_jobs=self.thread_count)
|
||||||
avg_mean_dist = pairwise.mean(axis=1).mean()
|
# exclude the diagonal entries, which are each sample's distance to itself (~0)
|
||||||
|
np.fill_diagonal(pairwise, np.NaN)
|
||||||
|
pairwise = pairwise.reshape(-1, 1)
|
||||||
|
avg_mean_dist = pairwise[~np.isnan(pairwise)].mean()
|
||||||
|
|
||||||
|
|
||||||
return avg_mean_dist
|
return avg_mean_dist
|
||||||
|
Loading…
Reference in New Issue
Block a user