Merge branch 'feature/training_data_slice_normalization' into develop

This commit is contained in:
longyu
2022-10-08 09:23:40 +02:00
4 changed files with 690 additions and 8 deletions

View File

@@ -440,8 +440,8 @@ class FreqaiDataDrawer:
rapidjson.dump(dk.data, fp, default=self.np_encoder, number_mode=rapidjson.NM_NATIVE)
# save the train data to file so we can check preds for area of applicability later
dk.data_dictionary["train_features"].to_pickle(
save_path / f"{dk.model_filename}_trained_df.pkl"
dk.data_dictionary["train_features"].astype("float32").to_pickle(
save_path / f"{dk.model_filename}_trained_df.pkl.bz2"
)
dk.data_dictionary["train_dates"].to_pickle(
@@ -499,7 +499,7 @@ class FreqaiDataDrawer:
dk.label_list = dk.data["label_list"]
dk.data_dictionary["train_features"] = pd.read_pickle(
dk.data_path / f"{dk.model_filename}_trained_df.pkl"
dk.data_path / f"{dk.model_filename}_trained_df.pkl.bz2"
)
# try to access model in memory instead of loading object from disk to save time

View File

@@ -859,11 +859,25 @@ class FreqaiDataKitchen:
"""
Add noise to train features to reduce the risk of overfitting.
"""
mu = 0 # no shift
sigma = self.freqai_config["feature_parameters"]["noise_standard_deviation"]
compute_df = self.data_dictionary['train_features']
noise = np.random.normal(mu, sigma, [compute_df.shape[0], compute_df.shape[1]])
self.data_dictionary['train_features'] += noise
da = self.freqai_config["feature_parameters"]["data_augment"]
X = self.data_dictionary['train_features']
y = self.data_dictionary['train_labels']
da_type = da.get("type", "std")
if da_type == "std":
# generate alpha values of 0-mean and 1-std
alpha = np.random.randn(*X.shape)
scale = da.get("vaue", 0.01)
Xaugmented = X + alpha * scale * X.std(0)[None, :]
X = np.vstack((X, Xaugmented))
y = y.append(y)
self.data_dictionary['train_features'] = X
self.data_dictionary['train_labels'] = y
elif da_type == "constant":
mu = 0 # no shift
sigma = self.freqai_config["feature_parameters"]["data_augment"]["value"]
compute_df = self.data_dictionary['train_features']
noise = np.random.normal(mu, sigma, [compute_df.shape[0], compute_df.shape[1]])
self.data_dictionary['train_features'] += noise
return
def find_features(self, dataframe: DataFrame) -> None:
@@ -1209,6 +1223,7 @@ class FreqaiDataKitchen:
for key in self.label_list:
if dataframe[key].dtype == object:
# TODO: make sure the values of `dataframe[key].dropna().unique()` are of object type too!
self.unique_classes[key] = dataframe[key].dropna().unique()
if self.unique_classes: