Merge pull request #7331 from th0rntwig/pca

Normalise PCA space
This commit is contained in:
Robert Caulk 2022-09-03 21:49:54 +02:00 committed by GitHub
commit 8545d74378
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -289,6 +289,7 @@ class FreqaiDataKitchen:
:returns: :returns:
:data_dictionary: updated dictionary with standardized values. :data_dictionary: updated dictionary with standardized values.
""" """
# standardize the data by training stats # standardize the data by training stats
train_max = data_dictionary["train_features"].max() train_max = data_dictionary["train_features"].max()
train_min = data_dictionary["train_features"].min() train_min = data_dictionary["train_features"].min()
@ -322,10 +323,24 @@ class FreqaiDataKitchen:
- 1 - 1
) )
self.data[f"{item}_max"] = train_labels_max # .to_dict() self.data[f"{item}_max"] = train_labels_max
self.data[f"{item}_min"] = train_labels_min # .to_dict() self.data[f"{item}_min"] = train_labels_min
return data_dictionary return data_dictionary
def normalize_single_dataframe(self, df: DataFrame) -> DataFrame:
    """
    Rescale each column of a dataframe linearly with that column's own
    minimum and maximum, so the minimum maps to -1 and the maximum to 1.
    The per-column extrema are recorded in self.data under the keys
    "<column>_max" and "<column>_min".
    :params:
    :df: Dataframe to be rescaled. The passed object is not modified;
    a new dataframe is returned.
    :returns:
    :df: Dataframe holding the rescaled values.
    """
    col_min = df.min()
    col_max = df.max()

    # NOTE: a column whose max equals its min gives a zero denominator here
    col_range = col_max - col_min
    rescaled = 2 * (df - col_min) / col_range - 1

    # remember the extrema of every column alongside the other metadata
    for column in col_max.index:
        self.data[column + "_max"] = col_max[column]
        self.data[column + "_min"] = col_min[column]

    return rescaled
def normalize_data_from_metadata(self, df: DataFrame) -> DataFrame: def normalize_data_from_metadata(self, df: DataFrame) -> DataFrame:
""" """
Normalize a set of data using the mean and standard deviation from Normalize a set of data using the mean and standard deviation from
@ -452,22 +467,23 @@ class FreqaiDataKitchen:
from sklearn.decomposition import PCA # avoid importing if we dont need it from sklearn.decomposition import PCA # avoid importing if we dont need it
n_components = self.data_dictionary["train_features"].shape[1] pca = PCA(0.999)
pca = PCA(n_components=n_components)
pca = pca.fit(self.data_dictionary["train_features"]) pca = pca.fit(self.data_dictionary["train_features"])
n_keep_components = np.argmin(pca.explained_variance_ratio_.cumsum() < 0.999) n_keep_components = pca.n_components_
pca2 = PCA(n_components=n_keep_components)
self.data["n_kept_components"] = n_keep_components self.data["n_kept_components"] = n_keep_components
pca2 = pca2.fit(self.data_dictionary["train_features"]) n_components = self.data_dictionary["train_features"].shape[1]
logger.info("reduced feature dimension by %s", n_components - n_keep_components) logger.info("reduced feature dimension by %s", n_components - n_keep_components)
logger.info("explained variance %f", np.sum(pca2.explained_variance_ratio_)) logger.info("explained variance %f", np.sum(pca.explained_variance_ratio_))
train_components = pca2.transform(self.data_dictionary["train_features"])
train_components = pca.transform(self.data_dictionary["train_features"])
self.data_dictionary["train_features"] = pd.DataFrame( self.data_dictionary["train_features"] = pd.DataFrame(
data=train_components, data=train_components,
columns=["PC" + str(i) for i in range(0, n_keep_components)], columns=["PC" + str(i) for i in range(0, n_keep_components)],
index=self.data_dictionary["train_features"].index, index=self.data_dictionary["train_features"].index,
) )
# normalising transformed training features
self.data_dictionary["train_features"] = self.normalize_single_dataframe(
self.data_dictionary["train_features"])
# keeping a copy of the non-transformed features so we can check for errors during # keeping a copy of the non-transformed features so we can check for errors during
# model load from disk # model load from disk
@ -475,15 +491,18 @@ class FreqaiDataKitchen:
self.training_features_list = self.data_dictionary["train_features"].columns self.training_features_list = self.data_dictionary["train_features"].columns
if self.freqai_config.get('data_split_parameters', {}).get('test_size', 0.1) != 0: if self.freqai_config.get('data_split_parameters', {}).get('test_size', 0.1) != 0:
test_components = pca2.transform(self.data_dictionary["test_features"]) test_components = pca.transform(self.data_dictionary["test_features"])
self.data_dictionary["test_features"] = pd.DataFrame( self.data_dictionary["test_features"] = pd.DataFrame(
data=test_components, data=test_components,
columns=["PC" + str(i) for i in range(0, n_keep_components)], columns=["PC" + str(i) for i in range(0, n_keep_components)],
index=self.data_dictionary["test_features"].index, index=self.data_dictionary["test_features"].index,
) )
# normalise transformed test feature to transformed training features
self.data_dictionary["test_features"] = self.normalize_data_from_metadata(
self.data_dictionary["test_features"])
self.data["n_kept_components"] = n_keep_components self.data["n_kept_components"] = n_keep_components
self.pca = pca2 self.pca = pca
logger.info(f"PCA reduced total features from {n_components} to {n_keep_components}") logger.info(f"PCA reduced total features from {n_components} to {n_keep_components}")
@ -504,6 +523,9 @@ class FreqaiDataKitchen:
columns=["PC" + str(i) for i in range(0, self.data["n_kept_components"])], columns=["PC" + str(i) for i in range(0, self.data["n_kept_components"])],
index=filtered_dataframe.index, index=filtered_dataframe.index,
) )
# normalise transformed predictions to transformed training features
self.data_dictionary["prediction_features"] = self.normalize_data_from_metadata(
self.data_dictionary["prediction_features"])
def compute_distances(self) -> float: def compute_distances(self) -> float:
""" """