allow user to pass test_size = 0 and avoid using eval sets in prediction models

robcaulk 2022-07-25 19:40:13 +02:00
parent 55cf378ec2
commit 56b17e6f3c
4 changed files with 67 additions and 44 deletions
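
For reference, the new behaviour keys off the existing `data_split_parameters` block of the FreqAI configuration (read in the diff as `self.config["freqai"]["data_split_parameters"]` with a `test_size` default of 0.1). A minimal sketch of the relevant fragment, with the surrounding keys assumed, showing how a user would disable the holdout/eval split:

    # Minimal sketch of the freqai config fragment (surrounding structure assumed).
    # test_size = 0 makes FreqaiDataKitchen skip train_test_split and makes the
    # prediction models train without an eval set; the assumed default is 0.1.
    config = {
        "freqai": {
            "data_split_parameters": {
                "test_size": 0,
            },
        },
    }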

View File

@@ -243,20 +243,28 @@ class FreqaiDataKitchen:
         else:
             stratification = None
 
-        (
-            train_features,
-            test_features,
-            train_labels,
-            test_labels,
-            train_weights,
-            test_weights,
-        ) = train_test_split(
-            filtered_dataframe[: filtered_dataframe.shape[0]],
-            labels,
-            weights,
-            stratify=stratification,
-            **self.config["freqai"]["data_split_parameters"],
-        )
+        if self.freqai_config.get('data_split_parameters', {}).get('test_size', 0.1) != 0:
+            (
+                train_features,
+                test_features,
+                train_labels,
+                test_labels,
+                train_weights,
+                test_weights,
+            ) = train_test_split(
+                filtered_dataframe[: filtered_dataframe.shape[0]],
+                labels,
+                weights,
+                stratify=stratification,
+                **self.config["freqai"]["data_split_parameters"],
+            )
+        else:
+            test_labels = np.zeros(2)
+            test_features = pd.DataFrame()
+            test_weights = np.zeros(2)
+            train_features = filtered_dataframe
+            train_labels = labels
+            train_weights = weights
 
         return self.build_data_dictionary(
             train_features, test_features, train_labels, test_labels, train_weights, test_weights
@@ -392,12 +400,13 @@ class FreqaiDataKitchen:
                 / (train_labels_max - train_labels_min)
                 - 1
             )
-            data_dictionary["test_labels"][item] = (
-                2
-                * (data_dictionary["test_labels"][item] - train_labels_min)
-                / (train_labels_max - train_labels_min)
-                - 1
-            )
+            if self.freqai_config.get('data_split_parameters', {}).get('test_size', 0.1) != 0:
+                data_dictionary["test_labels"][item] = (
+                    2
+                    * (data_dictionary["test_labels"][item] - train_labels_min)
+                    / (train_labels_max - train_labels_min)
+                    - 1
+                )
 
             self.data[f"{item}_max"] = train_labels_max  # .to_dict()
             self.data[f"{item}_min"] = train_labels_min  # .to_dict()
@@ -555,11 +564,12 @@ class FreqaiDataKitchen:
         self.data["training_features_list_raw"] = copy.deepcopy(self.training_features_list)
         self.training_features_list = self.data_dictionary["train_features"].columns
 
-        self.data_dictionary["test_features"] = pd.DataFrame(
-            data=test_components,
-            columns=["PC" + str(i) for i in range(0, n_keep_components)],
-            index=self.data_dictionary["test_features"].index,
-        )
+        if self.freqai_config.get('data_split_parameters', {}).get('test_size', 0.1) != 0:
+            self.data_dictionary["test_features"] = pd.DataFrame(
+                data=test_components,
+                columns=["PC" + str(i) for i in range(0, n_keep_components)],
+                index=self.data_dictionary["test_features"].index,
+            )
 
         self.data["n_kept_components"] = n_keep_components
         self.pca = pca2
@@ -652,15 +662,17 @@ class FreqaiDataKitchen:
         )
         # same for test data
 
-        y_pred = self.svm_model.predict(self.data_dictionary["test_features"])
-        dropped_points = np.where(y_pred == -1, 0, y_pred)
-        self.data_dictionary["test_features"] = self.data_dictionary["test_features"][
-            (y_pred == 1)
-        ]
-        self.data_dictionary["test_labels"] = self.data_dictionary["test_labels"][(y_pred == 1)]
-        self.data_dictionary["test_weights"] = self.data_dictionary["test_weights"][
-            (y_pred == 1)
-        ]
+        if self.freqai_config.get('data_split_parameters', {}).get('test_size', 0.1) != 0:
+            y_pred = self.svm_model.predict(self.data_dictionary["test_features"])
+            dropped_points = np.where(y_pred == -1, 0, y_pred)
+            self.data_dictionary["test_features"] = self.data_dictionary["test_features"][
+                (y_pred == 1)
+            ]
+            self.data_dictionary["test_labels"] = self.data_dictionary["test_labels"][(
+                y_pred == 1)]
+            self.data_dictionary["test_weights"] = self.data_dictionary["test_weights"][
+                (y_pred == 1)
+            ]
 
         logger.info(
             f"svm_remove_outliers() tossed {len(y_pred) - dropped_points.sum()}"

View File

@@ -28,17 +28,22 @@ class CatboostPredictionModel(BaseRegressionModel):
             label=data_dictionary["train_labels"],
             weight=data_dictionary["train_weights"],
         )
 
-        test_data = Pool(
-            data=data_dictionary["test_features"],
-            label=data_dictionary["test_labels"],
-            weight=data_dictionary["test_weights"],
-        )
+        if self.freqai_info.get('data_split_parameters', {}).get('test_size', 0.1) == 0:
+            test_data = None
+        else:
+            test_data = Pool(
+                data=data_dictionary["test_features"],
+                label=data_dictionary["test_labels"],
+                weight=data_dictionary["test_weights"],
+            )
 
         model = CatBoostRegressor(
             allow_writing_files=False,
             **self.model_training_parameters,
         )
 
+        if self.freqai_info.get('data_split_parameters', {}).get('test_size', 0.1) == 0:
+            test_data = None
         model.fit(X=train_data, eval_set=test_data)
         return model

View File

@@ -36,7 +36,9 @@ class CatboostPredictionMultiModel(BaseRegressionModel):
         model = MultiOutputRegressor(estimator=cbr)
         model.fit(X=X, y=y, sample_weight=sample_weight)  # , eval_set=eval_set)
-        train_score = model.score(X, y)
-        test_score = model.score(*eval_set)
-        logger.info(f"Train score {train_score}, Test score {test_score}")
+
+        if self.freqai_info.get('data_split_parameters', {}).get('test_size', 0.1) != 0:
+            train_score = model.score(X, y)
+            test_score = model.score(*eval_set)
+            logger.info(f"Train score {train_score}, Test score {test_score}")
 
         return model

View File

@@ -25,11 +25,15 @@ class LightGBMPredictionModel(BaseRegressionModel):
         all the training and test data/labels.
         """
 
-        eval_set = (data_dictionary["test_features"], data_dictionary["test_labels"])
+        if self.freqai_info.get('data_split_parameters', {}).get('test_size', 0.1) == 0:
+            eval_set = None
+        else:
+            eval_set = (data_dictionary["test_features"], data_dictionary["test_labels"])
+
         X = data_dictionary["train_features"]
         y = data_dictionary["train_labels"]
 
         model = LGBMRegressor(**self.model_training_parameters)
         model.fit(X=X, y=y, eval_set=eval_set)
 
         return model