fix outlier protection
This commit is contained in:
parent
22b42e91f3
commit
1e41c773a0
@ -113,7 +113,7 @@ Mandatory parameters are marked as **Required**, which means that they are requi
|
|||||||
| `use_SVM_to_remove_outliers` | Train a support vector machine to detect and remove outliers from the training data set, as well as from incoming data points. See details about how it works [here](#removing-outliers-using-a-support-vector-machine-svm). <br> **Datatype:** Boolean.
|
| `use_SVM_to_remove_outliers` | Train a support vector machine to detect and remove outliers from the training data set, as well as from incoming data points. See details about how it works [here](#removing-outliers-using-a-support-vector-machine-svm). <br> **Datatype:** Boolean.
|
||||||
| `svm_params` | All parameters available in Sklearn's `SGDOneClassSVM()`. See details about some select parameters [here](#removing-outliers-using-a-support-vector-machine-svm). <br> **Datatype:** Dictionary.
|
| `svm_params` | All parameters available in Sklearn's `SGDOneClassSVM()`. See details about some select parameters [here](#removing-outliers-using-a-support-vector-machine-svm). <br> **Datatype:** Dictionary.
|
||||||
| `use_DBSCAN_to_remove_outliers` | Cluster data using DBSCAN to identify and remove outliers from training and prediction data. See details about how it works [here](#removing-outliers-with-dbscan). <br> **Datatype:** Boolean.
|
| `use_DBSCAN_to_remove_outliers` | Cluster data using DBSCAN to identify and remove outliers from training and prediction data. See details about how it works [here](#removing-outliers-with-dbscan). <br> **Datatype:** Boolean.
|
||||||
| `outlier_protection_percentage` | If more than `outlier_protection_percentage` fraction of points are removed as outliers, FreqAI will log a warning message and ignore outlier detection while keeping the original dataset intact. <br> **Datatype:** float. Default: `0.3`
|
| `outlier_protection_percentage` | Percentage of points (on a 0-100 scale) that outlier detection is allowed to flag. If `outlier_protection_percentage` % or more of the points are detected as outliers, FreqAI will log a warning message and ignore outlier detection, keeping the original dataset intact. <br> **Datatype:** float. Default: `30`
|
||||||
| | **Data split parameters**
|
| | **Data split parameters**
|
||||||
| `data_split_parameters` | Include any additional parameters available from Scikit-learn `train_test_split()`, which are shown [here](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) (external website). <br> **Datatype:** Dictionary.
|
| `data_split_parameters` | Include any additional parameters available from Scikit-learn `train_test_split()`, which are shown [here](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) (external website). <br> **Datatype:** Dictionary.
|
||||||
| `test_size` | Fraction of data that should be used for testing instead of training. <br> **Datatype:** Positive float < 1.
|
| `test_size` | Fraction of data that should be used for testing instead of training. <br> **Datatype:** Positive float < 1.
|
||||||
|
@ -519,7 +519,7 @@ class FreqaiDataKitchen:
|
|||||||
"""
|
"""
|
||||||
outlier_protection_pct = self.freqai_config["feature_parameters"].get(
|
outlier_protection_pct = self.freqai_config["feature_parameters"].get(
|
||||||
"outlier_protection_percentage", 30)
|
"outlier_protection_percentage", 30)
|
||||||
outlier_pct = dropped_pts.sum() / len(dropped_pts)
|
outlier_pct = (dropped_pts.sum() / len(dropped_pts)) * 100
|
||||||
if outlier_pct >= outlier_protection_pct:
|
if outlier_pct >= outlier_protection_pct:
|
||||||
self.svm_model = None
|
self.svm_model = None
|
||||||
return outlier_pct
|
return outlier_pct
|
||||||
@ -563,12 +563,12 @@ class FreqaiDataKitchen:
|
|||||||
self.data_dictionary["train_features"]
|
self.data_dictionary["train_features"]
|
||||||
)
|
)
|
||||||
y_pred = self.svm_model.predict(self.data_dictionary["train_features"])
|
y_pred = self.svm_model.predict(self.data_dictionary["train_features"])
|
||||||
dropped_points = np.where(y_pred == -1, 0, y_pred)
|
kept_points = np.where(y_pred == -1, 0, y_pred)
|
||||||
# keep_index = np.where(y_pred == 1)
|
# keep_index = np.where(y_pred == 1)
|
||||||
outlier_ptc = self.get_outlier_percentage(dropped_points)
|
outlier_pct = self.get_outlier_percentage(1 - kept_points)
|
||||||
if outlier_ptc:
|
if outlier_pct:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"SVM detected > {outlier_ptc}% of the points as outliers."
|
f"SVM detected {outlier_pct:.2f}% of the points as outliers. "
|
||||||
f"Keeping original dataset."
|
f"Keeping original dataset."
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
@ -584,7 +584,7 @@ class FreqaiDataKitchen:
|
|||||||
]
|
]
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"SVM tossed {len(y_pred) - dropped_points.sum()}"
|
f"SVM tossed {len(y_pred) - kept_points.sum()}"
|
||||||
f" train points from {len(y_pred)} total points."
|
f" train points from {len(y_pred)} total points."
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -593,7 +593,7 @@ class FreqaiDataKitchen:
|
|||||||
# to reduce code duplication
|
# to reduce code duplication
|
||||||
if self.freqai_config['data_split_parameters'].get('test_size', 0.1) != 0:
|
if self.freqai_config['data_split_parameters'].get('test_size', 0.1) != 0:
|
||||||
y_pred = self.svm_model.predict(self.data_dictionary["test_features"])
|
y_pred = self.svm_model.predict(self.data_dictionary["test_features"])
|
||||||
dropped_points = np.where(y_pred == -1, 0, y_pred)
|
kept_points = np.where(y_pred == -1, 0, y_pred)
|
||||||
self.data_dictionary["test_features"] = self.data_dictionary["test_features"][
|
self.data_dictionary["test_features"] = self.data_dictionary["test_features"][
|
||||||
(y_pred == 1)
|
(y_pred == 1)
|
||||||
]
|
]
|
||||||
@ -604,7 +604,7 @@ class FreqaiDataKitchen:
|
|||||||
]
|
]
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"SVM tossed {len(y_pred) - dropped_points.sum()}"
|
f"SVM tossed {len(y_pred) - kept_points.sum()}"
|
||||||
f" test points from {len(y_pred)} total points."
|
f" test points from {len(y_pred)} total points."
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -688,10 +688,10 @@ class FreqaiDataKitchen:
|
|||||||
self.data['DBSCAN_min_samples'] = MinPts
|
self.data['DBSCAN_min_samples'] = MinPts
|
||||||
dropped_points = np.where(clustering.labels_ == -1, 1, 0)
|
dropped_points = np.where(clustering.labels_ == -1, 1, 0)
|
||||||
|
|
||||||
outlier_ptc = self.get_outlier_percentage(dropped_points)
|
outlier_pct = self.get_outlier_percentage(dropped_points)
|
||||||
if outlier_ptc:
|
if outlier_pct:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"DBSCAN detected > {outlier_ptc}% of the points as outliers."
|
f"DBSCAN detected {outlier_pct:.2f}% of the points as outliers. "
|
||||||
f"Keeping original dataset."
|
f"Keeping original dataset."
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
@ -751,10 +751,10 @@ class FreqaiDataKitchen:
|
|||||||
0,
|
0,
|
||||||
)
|
)
|
||||||
|
|
||||||
outlier_ptc = self.get_outlier_percentage(1 - do_predict)
|
outlier_pct = self.get_outlier_percentage(1 - do_predict)
|
||||||
if outlier_ptc:
|
if outlier_pct:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"DI detected > {outlier_ptc}% of the points as outliers."
|
f"DI detected {outlier_pct:.2f}% of the points as outliers. "
|
||||||
f"Keeping original dataset."
|
f"Keeping original dataset."
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
Loading…
Reference in New Issue
Block a user