Add outlier percentage check

This commit is contained in:
elintornquist 2022-08-26 23:05:07 +02:00
parent b2d664c63c
commit 86c5ac44e4

View File

@ -513,6 +513,19 @@ class FreqaiDataKitchen:
return avg_mean_dist return avg_mean_dist
def get_outlier_percentage(self, dropped_pts: npt.ArrayLike) -> float:
    """
    Check whether too many points were dropped during outlier detection.

    The share of dropped points is expressed as a percentage (0-100) and
    compared against ``feature_parameters["outlier_protection_percentage"]``
    from the FreqAI config (default: 30).

    :param dropped_pts: array-like of 0/1 flags, where 1 marks a point that
        the outlier detector wants to drop.
    :return: the percentage of dropped points if it reaches the configured
        protection threshold, otherwise 0.0. Callers treat a truthy return
        value as "skip outlier removal and keep the original dataset".
    """
    outlier_protection_pct = self.freqai_config["feature_parameters"].get(
        "outlier_protection_percentage", 30)

    # Coerce so that plain lists/tuples (valid ArrayLike) are accepted too.
    flags = np.asarray(dropped_pts)
    if flags.size == 0:
        # Nothing was evaluated, so nothing was dropped; also avoids a
        # division by zero below.
        return 0.0

    # Scale to 0-100 so the value is in the same unit as the threshold and
    # matches the "%" that callers print in their warning messages.
    outlier_pct = float(100.0 * flags.sum() / flags.size)

    if outlier_pct >= outlier_protection_pct:
        # NOTE(review): the SVM model is discarded here regardless of which
        # detector called this helper - confirm that is intended and that
        # later prediction code copes with ``svm_model`` being None.
        self.svm_model = None
        return outlier_pct
    return 0.0
def use_SVM_to_remove_outliers(self, predict: bool) -> None: def use_SVM_to_remove_outliers(self, predict: bool) -> None:
""" """
Build/inference a Support Vector Machine to detect outliers Build/inference a Support Vector Machine to detect outliers
@ -552,6 +565,14 @@ class FreqaiDataKitchen:
y_pred = self.svm_model.predict(self.data_dictionary["train_features"]) y_pred = self.svm_model.predict(self.data_dictionary["train_features"])
dropped_points = np.where(y_pred == -1, 0, y_pred) dropped_points = np.where(y_pred == -1, 0, y_pred)
# keep_index = np.where(y_pred == 1) # keep_index = np.where(y_pred == 1)
outlier_ptc = self.get_outlier_percentage(dropped_points)
if outlier_ptc:
logger.warning(
f"SVM detected >{outlier_ptc}% of the points as outliers."
f"Keeping original dataset."
)
return
self.data_dictionary["train_features"] = self.data_dictionary["train_features"][ self.data_dictionary["train_features"] = self.data_dictionary["train_features"][
(y_pred == 1) (y_pred == 1)
] ]
@ -667,6 +688,14 @@ class FreqaiDataKitchen:
self.data['DBSCAN_min_samples'] = MinPts self.data['DBSCAN_min_samples'] = MinPts
dropped_points = np.where(clustering.labels_ == -1, 1, 0) dropped_points = np.where(clustering.labels_ == -1, 1, 0)
outlier_ptc = self.get_outlier_percentage(dropped_points)
if outlier_ptc:
logger.warning(
f"DBSCAN detected >{outlier_ptc}% of the points as outliers."
f"Keeping original dataset."
)
return
self.data_dictionary['train_features'] = self.data_dictionary['train_features'][ self.data_dictionary['train_features'] = self.data_dictionary['train_features'][
(clustering.labels_ != -1) (clustering.labels_ != -1)
] ]
@ -722,6 +751,14 @@ class FreqaiDataKitchen:
0, 0,
) )
outlier_ptc = self.get_outlier_percentage(1 - do_predict)
if outlier_ptc:
logger.warning(
f"DBSCAN detected >{outlier_ptc}% of the points as outliers."
f"Keeping original dataset."
)
return
if (len(do_predict) - do_predict.sum()) > 0: if (len(do_predict) - do_predict.sum()) > 0:
logger.info( logger.info(
f"DI tossed {len(do_predict) - do_predict.sum()} predictions for " f"DI tossed {len(do_predict) - do_predict.sum()} predictions for "