Remove metadata redundancy, fix PCA bug
This commit is contained in:
		| @@ -288,25 +288,20 @@ class FreqaiDataKitchen: | |||||||
|         :data_dictionary: updated dictionary with standardized values. |         :data_dictionary: updated dictionary with standardized values. | ||||||
|         """ |         """ | ||||||
|  |  | ||||||
|         df_train_features = data_dictionary["train_features"] |         df = data_dictionary["train_features"] | ||||||
|         # standardize the data by training stats |         # standardize the data by training stats | ||||||
|         train_max = df_train_features.max() |         train_max = df.max() | ||||||
|         train_min = df_train_features.min() |         train_min = df.min() | ||||||
|         df_train_features = ( |         df = ( | ||||||
|             2 * (df_train_features - train_min) / (train_max - train_min) - 1 |             2 * (df - train_min) / (train_max - train_min) - 1 | ||||||
|         ) |         ) | ||||||
|         data_dictionary["test_features"] = ( |         data_dictionary["test_features"] = ( | ||||||
|             2 * (data_dictionary["test_features"] - train_min) / (train_max - train_min) - 1 |             2 * (data_dictionary["test_features"] - train_min) / (train_max - train_min) - 1 | ||||||
|         ) |         ) | ||||||
|  |  | ||||||
|         for item in train_max.keys(): |         for item in train_max.keys(): | ||||||
|             if not [col for col in df_train_features.columns if col.startswith('PC')]: |             self.data[item + "_max"] = train_max[item] | ||||||
|                 self.data[item + "_max"] = train_max[item] |             self.data[item + "_min"] = train_min[item] | ||||||
|                 self.data[item + "_min"] = train_min[item] |  | ||||||
|             else: |  | ||||||
|                 # if PCA is enabled and has transformed the training features |  | ||||||
|                 self.data[item + "_pca_max"] = train_max[item] |  | ||||||
|                 self.data[item + "_pca_min"] = train_min[item] |  | ||||||
|  |  | ||||||
|         for item in data_dictionary["train_labels"].keys(): |         for item in data_dictionary["train_labels"].keys(): | ||||||
|             if data_dictionary["train_labels"][item].dtype == object: |             if data_dictionary["train_labels"][item].dtype == object: | ||||||
| @@ -327,16 +322,24 @@ class FreqaiDataKitchen: | |||||||
|                     - 1 |                     - 1 | ||||||
|                 ) |                 ) | ||||||
|  |  | ||||||
|             if not [col for col in df_train_features.columns if col.startswith('PC')]: |                 self.data[f"{item}_max"] = train_labels_max | ||||||
|                 self.data[f"{item}_max"] = train_labels_max  # .to_dict() |                 self.data[f"{item}_min"] = train_labels_min | ||||||
|                 self.data[f"{item}_min"] = train_labels_min  # .to_dict() |  | ||||||
|             else: |  | ||||||
|                 # if PCA is enabled and has transformed the training features |  | ||||||
|                 self.data[f"{item}_pca_max"] = train_labels_max  # .to_dict() |  | ||||||
|                 self.data[f"{item}_pca_min"] = train_labels_min  # .to_dict() |  | ||||||
|  |  | ||||||
|         return data_dictionary |         return data_dictionary | ||||||
|  |  | ||||||
|  |     def normalize_single_dataframe(self, df: DataFrame) -> DataFrame: | ||||||
|  |  | ||||||
|  |         train_max = df.max() | ||||||
|  |         train_min = df.min() | ||||||
|  |         df = ( | ||||||
|  |             2 * (df - train_min) / (train_max - train_min) - 1 | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |         for item in train_max.keys(): | ||||||
|  |             self.data[item + "_max"] = train_max[item] | ||||||
|  |             self.data[item + "_min"] = train_min[item] | ||||||
|  |  | ||||||
|  |         return df | ||||||
|  |  | ||||||
|     def normalize_data_from_metadata(self, df: DataFrame) -> DataFrame: |     def normalize_data_from_metadata(self, df: DataFrame) -> DataFrame: | ||||||
|         """ |         """ | ||||||
|         Normalize a set of data using the minimum and maximum values from |         Normalize a set of data using the minimum and maximum values from | ||||||
| @@ -344,17 +347,11 @@ class FreqaiDataKitchen: | |||||||
|         :param df: Dataframe to be standardized |         :param df: Dataframe to be standardized | ||||||
|         """ |         """ | ||||||
|  |  | ||||||
|         if not [col for col in df.columns if col.startswith('PC')]: |  | ||||||
|             id_str = '' |  | ||||||
|         else: |  | ||||||
|             # if PCA is enabled |  | ||||||
|             id_str = '_pca' |  | ||||||
|  |  | ||||||
|         for item in df.keys(): |         for item in df.keys(): | ||||||
|             df[item] = ( |             df[item] = ( | ||||||
|                 2 |                 2 | ||||||
|                 * (df[item] - self.data[f"{item}{id_str}_min"]) |                 * (df[item] - self.data[f"{item}_min"]) | ||||||
|                 / (self.data[f"{item}{id_str}_max"] - self.data[f"{item}{id_str}_min"]) |                 / (self.data[f"{item}_max"] - self.data[f"{item}_min"]) | ||||||
|                 - 1 |                 - 1 | ||||||
|             ) |             ) | ||||||
|  |  | ||||||
| @@ -484,7 +481,7 @@ class FreqaiDataKitchen: | |||||||
|             index=self.data_dictionary["train_features"].index, |             index=self.data_dictionary["train_features"].index, | ||||||
|         ) |         ) | ||||||
|         # normalising transformed training features |         # normalising transformed training features | ||||||
|         self.data_dictionary["train_features"] = self.normalize_data( |         self.data_dictionary["train_features"] = self.normalize_single_dataframe( | ||||||
|             self.data_dictionary["train_features"]) |             self.data_dictionary["train_features"]) | ||||||
|  |  | ||||||
|         # keeping a copy of the non-transformed features so we can check for errors during |         # keeping a copy of the non-transformed features so we can check for errors during | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user