Merge pull request #7296 from th0rntwig/dbscan
Improve MinPts calculation in DBSCAN, add outlier protection, and add data_kitchen tests
This commit is contained in:
commit
39a739eadb
@ -113,6 +113,7 @@ Mandatory parameters are marked as **Required**, which means that they are requi
|
|||||||
| `use_SVM_to_remove_outliers` | Train a support vector machine to detect and remove outliers from the training data set, as well as from incoming data points. See details about how it works [here](#removing-outliers-using-a-support-vector-machine-svm). <br> **Datatype:** Boolean.
|
| `use_SVM_to_remove_outliers` | Train a support vector machine to detect and remove outliers from the training data set, as well as from incoming data points. See details about how it works [here](#removing-outliers-using-a-support-vector-machine-svm). <br> **Datatype:** Boolean.
|
||||||
| `svm_params` | All parameters available in Sklearn's `SGDOneClassSVM()`. See details about some select parameters [here](#removing-outliers-using-a-support-vector-machine-svm). <br> **Datatype:** Dictionary.
|
| `svm_params` | All parameters available in Sklearn's `SGDOneClassSVM()`. See details about some select parameters [here](#removing-outliers-using-a-support-vector-machine-svm). <br> **Datatype:** Dictionary.
|
||||||
| `use_DBSCAN_to_remove_outliers` | Cluster data using DBSCAN to identify and remove outliers from training and prediction data. See details about how it works [here](#removing-outliers-with-dbscan). <br> **Datatype:** Boolean.
|
| `use_DBSCAN_to_remove_outliers` | Cluster data using DBSCAN to identify and remove outliers from training and prediction data. See details about how it works [here](#removing-outliers-with-dbscan). <br> **Datatype:** Boolean.
|
||||||
|
| `outlier_protection_percentage` | If `outlier_protection_percentage` percent or more of the points are flagged as outliers, FreqAI will log a warning message and skip outlier removal, keeping the original dataset intact. <br> **Datatype:** Float. Default: `30`.
|
||||||
| | **Data split parameters**
|
| | **Data split parameters**
|
||||||
| `data_split_parameters` | Include any additional parameters available from Scikit-learn `train_test_split()`, which are shown [here](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) (external website). <br> **Datatype:** Dictionary.
|
| `data_split_parameters` | Include any additional parameters available from Scikit-learn `train_test_split()`, which are shown [here](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) (external website). <br> **Datatype:** Dictionary.
|
||||||
| `test_size` | Fraction of data that should be used for testing instead of training. <br> **Datatype:** Positive float < 1.
|
| `test_size` | Fraction of data that should be used for testing instead of training. <br> **Datatype:** Positive float < 1.
|
||||||
|
@ -566,7 +566,6 @@ class FreqaiDataDrawer:
|
|||||||
for training according to user defined train_period_days
|
for training according to user defined train_period_days
|
||||||
metadata: dict = strategy furnished pair metadata
|
metadata: dict = strategy furnished pair metadata
|
||||||
"""
|
"""
|
||||||
|
|
||||||
with self.history_lock:
|
with self.history_lock:
|
||||||
corr_dataframes: Dict[Any, Any] = {}
|
corr_dataframes: Dict[Any, Any] = {}
|
||||||
base_dataframes: Dict[Any, Any] = {}
|
base_dataframes: Dict[Any, Any] = {}
|
||||||
|
@ -513,6 +513,19 @@ class FreqaiDataKitchen:
|
|||||||
|
|
||||||
return avg_mean_dist
|
return avg_mean_dist
|
||||||
|
|
||||||
|
def get_outlier_percentage(self, dropped_pts: npt.NDArray) -> float:
|
||||||
|
"""
|
||||||
|
Check if more than X% of points were dropped during outlier detection.
|
||||||
|
"""
|
||||||
|
outlier_protection_pct = self.freqai_config["feature_parameters"].get(
|
||||||
|
"outlier_protection_percentage", 30)
|
||||||
|
outlier_pct = (dropped_pts.sum() / len(dropped_pts)) * 100
|
||||||
|
if outlier_pct >= outlier_protection_pct:
|
||||||
|
self.svm_model = None
|
||||||
|
return outlier_pct
|
||||||
|
else:
|
||||||
|
return 0.0
|
||||||
|
|
||||||
def use_SVM_to_remove_outliers(self, predict: bool) -> None:
|
def use_SVM_to_remove_outliers(self, predict: bool) -> None:
|
||||||
"""
|
"""
|
||||||
Build/inference a Support Vector Machine to detect outliers
|
Build/inference a Support Vector Machine to detect outliers
|
||||||
@ -550,8 +563,16 @@ class FreqaiDataKitchen:
|
|||||||
self.data_dictionary["train_features"]
|
self.data_dictionary["train_features"]
|
||||||
)
|
)
|
||||||
y_pred = self.svm_model.predict(self.data_dictionary["train_features"])
|
y_pred = self.svm_model.predict(self.data_dictionary["train_features"])
|
||||||
dropped_points = np.where(y_pred == -1, 0, y_pred)
|
kept_points = np.where(y_pred == -1, 0, y_pred)
|
||||||
# keep_index = np.where(y_pred == 1)
|
# keep_index = np.where(y_pred == 1)
|
||||||
|
outlier_pct = self.get_outlier_percentage(1 - kept_points)
|
||||||
|
if outlier_pct:
|
||||||
|
logger.warning(
|
||||||
|
f"SVM detected {outlier_pct:.2f}% of the points as outliers. "
|
||||||
|
f"Keeping original dataset."
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
self.data_dictionary["train_features"] = self.data_dictionary["train_features"][
|
self.data_dictionary["train_features"] = self.data_dictionary["train_features"][
|
||||||
(y_pred == 1)
|
(y_pred == 1)
|
||||||
]
|
]
|
||||||
@ -563,7 +584,7 @@ class FreqaiDataKitchen:
|
|||||||
]
|
]
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"SVM tossed {len(y_pred) - dropped_points.sum()}"
|
f"SVM tossed {len(y_pred) - kept_points.sum()}"
|
||||||
f" train points from {len(y_pred)} total points."
|
f" train points from {len(y_pred)} total points."
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -572,7 +593,7 @@ class FreqaiDataKitchen:
|
|||||||
# to reduce code duplication
|
# to reduce code duplication
|
||||||
if self.freqai_config['data_split_parameters'].get('test_size', 0.1) != 0:
|
if self.freqai_config['data_split_parameters'].get('test_size', 0.1) != 0:
|
||||||
y_pred = self.svm_model.predict(self.data_dictionary["test_features"])
|
y_pred = self.svm_model.predict(self.data_dictionary["test_features"])
|
||||||
dropped_points = np.where(y_pred == -1, 0, y_pred)
|
kept_points = np.where(y_pred == -1, 0, y_pred)
|
||||||
self.data_dictionary["test_features"] = self.data_dictionary["test_features"][
|
self.data_dictionary["test_features"] = self.data_dictionary["test_features"][
|
||||||
(y_pred == 1)
|
(y_pred == 1)
|
||||||
]
|
]
|
||||||
@ -583,7 +604,7 @@ class FreqaiDataKitchen:
|
|||||||
]
|
]
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"SVM tossed {len(y_pred) - dropped_points.sum()}"
|
f"SVM tossed {len(y_pred) - kept_points.sum()}"
|
||||||
f" test points from {len(y_pred)} total points."
|
f" test points from {len(y_pred)} total points."
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -635,8 +656,8 @@ class FreqaiDataKitchen:
|
|||||||
cos(angle) * (point[1] - origin[1])
|
cos(angle) * (point[1] - origin[1])
|
||||||
return (x, y)
|
return (x, y)
|
||||||
|
|
||||||
MinPts = len(self.data_dictionary['train_features'].columns) * 2
|
MinPts = int(len(self.data_dictionary['train_features'].index) * 0.25)
|
||||||
# measure pairwise distances to train_features.shape[1]*2 nearest neighbours
|
# measure pairwise distances to nearest neighbours
|
||||||
neighbors = NearestNeighbors(
|
neighbors = NearestNeighbors(
|
||||||
n_neighbors=MinPts, n_jobs=self.thread_count)
|
n_neighbors=MinPts, n_jobs=self.thread_count)
|
||||||
neighbors_fit = neighbors.fit(self.data_dictionary['train_features'])
|
neighbors_fit = neighbors.fit(self.data_dictionary['train_features'])
|
||||||
@ -667,6 +688,14 @@ class FreqaiDataKitchen:
|
|||||||
self.data['DBSCAN_min_samples'] = MinPts
|
self.data['DBSCAN_min_samples'] = MinPts
|
||||||
dropped_points = np.where(clustering.labels_ == -1, 1, 0)
|
dropped_points = np.where(clustering.labels_ == -1, 1, 0)
|
||||||
|
|
||||||
|
outlier_pct = self.get_outlier_percentage(dropped_points)
|
||||||
|
if outlier_pct:
|
||||||
|
logger.warning(
|
||||||
|
f"DBSCAN detected {outlier_pct:.2f}% of the points as outliers. "
|
||||||
|
f"Keeping original dataset."
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
self.data_dictionary['train_features'] = self.data_dictionary['train_features'][
|
self.data_dictionary['train_features'] = self.data_dictionary['train_features'][
|
||||||
(clustering.labels_ != -1)
|
(clustering.labels_ != -1)
|
||||||
]
|
]
|
||||||
@ -722,6 +751,14 @@ class FreqaiDataKitchen:
|
|||||||
0,
|
0,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
outlier_pct = self.get_outlier_percentage(1 - do_predict)
|
||||||
|
if outlier_pct:
|
||||||
|
logger.warning(
|
||||||
|
f"DI detected {outlier_pct:.2f}% of the points as outliers. "
|
||||||
|
f"Keeping original dataset."
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
if (len(do_predict) - do_predict.sum()) > 0:
|
if (len(do_predict) - do_predict.sum()) > 0:
|
||||||
logger.info(
|
logger.info(
|
||||||
f"DI tossed {len(do_predict) - do_predict.sum()} predictions for "
|
f"DI tossed {len(do_predict) - do_predict.sum()} predictions for "
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from unittest.mock import MagicMock
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
@ -81,6 +82,51 @@ def get_patched_freqaimodel(mocker, freqaiconf):
|
|||||||
return freqaimodel
|
return freqaimodel
|
||||||
|
|
||||||
|
|
||||||
|
def make_data_dictionary(mocker, freqai_conf):
|
||||||
|
freqai_conf.update({"timerange": "20180110-20180130"})
|
||||||
|
|
||||||
|
strategy = get_patched_freqai_strategy(mocker, freqai_conf)
|
||||||
|
exchange = get_patched_exchange(mocker, freqai_conf)
|
||||||
|
strategy.dp = DataProvider(freqai_conf, exchange)
|
||||||
|
strategy.freqai_info = freqai_conf.get("freqai", {})
|
||||||
|
freqai = strategy.freqai
|
||||||
|
freqai.live = True
|
||||||
|
freqai.dk = FreqaiDataKitchen(freqai_conf)
|
||||||
|
freqai.dk.pair = "ADA/BTC"
|
||||||
|
timerange = TimeRange.parse_timerange("20180110-20180130")
|
||||||
|
freqai.dd.load_all_pair_histories(timerange, freqai.dk)
|
||||||
|
|
||||||
|
freqai.dd.pair_dict = MagicMock()
|
||||||
|
|
||||||
|
data_load_timerange = TimeRange.parse_timerange("20180110-20180130")
|
||||||
|
new_timerange = TimeRange.parse_timerange("20180120-20180130")
|
||||||
|
|
||||||
|
corr_dataframes, base_dataframes = freqai.dd.get_base_and_corr_dataframes(
|
||||||
|
data_load_timerange, freqai.dk.pair, freqai.dk
|
||||||
|
)
|
||||||
|
|
||||||
|
unfiltered_dataframe = freqai.dk.use_strategy_to_populate_indicators(
|
||||||
|
strategy, corr_dataframes, base_dataframes, freqai.dk.pair
|
||||||
|
)
|
||||||
|
|
||||||
|
unfiltered_dataframe = freqai.dk.slice_dataframe(new_timerange, unfiltered_dataframe)
|
||||||
|
|
||||||
|
freqai.dk.find_features(unfiltered_dataframe)
|
||||||
|
|
||||||
|
features_filtered, labels_filtered = freqai.dk.filter_features(
|
||||||
|
unfiltered_dataframe,
|
||||||
|
freqai.dk.training_features_list,
|
||||||
|
freqai.dk.label_list,
|
||||||
|
training_filter=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
data_dictionary = freqai.dk.make_train_test_datasets(features_filtered, labels_filtered)
|
||||||
|
|
||||||
|
data_dictionary = freqai.dk.normalize_data(data_dictionary)
|
||||||
|
|
||||||
|
return freqai
|
||||||
|
|
||||||
|
|
||||||
def get_freqai_live_analyzed_dataframe(mocker, freqaiconf):
|
def get_freqai_live_analyzed_dataframe(mocker, freqaiconf):
|
||||||
strategy = get_patched_freqai_strategy(mocker, freqaiconf)
|
strategy = get_patched_freqai_strategy(mocker, freqaiconf)
|
||||||
exchange = get_patched_exchange(mocker, freqaiconf)
|
exchange = get_patched_exchange(mocker, freqaiconf)
|
||||||
|
@ -5,7 +5,8 @@ from pathlib import Path
|
|||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from freqtrade.exceptions import OperationalException
|
from freqtrade.exceptions import OperationalException
|
||||||
from tests.freqai.conftest import get_patched_data_kitchen
|
from tests.conftest import log_has_re
|
||||||
|
from tests.freqai.conftest import get_patched_data_kitchen, make_data_dictionary
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@ -66,3 +67,30 @@ def test_check_if_model_expired(mocker, freqai_conf, timestamp, expected):
|
|||||||
dk = get_patched_data_kitchen(mocker, freqai_conf)
|
dk = get_patched_data_kitchen(mocker, freqai_conf)
|
||||||
assert dk.check_if_model_expired(timestamp) == expected
|
assert dk.check_if_model_expired(timestamp) == expected
|
||||||
shutil.rmtree(Path(dk.full_path))
|
shutil.rmtree(Path(dk.full_path))
|
||||||
|
|
||||||
|
|
||||||
|
def test_use_DBSCAN_to_remove_outliers(mocker, freqai_conf, caplog):
|
||||||
|
freqai = make_data_dictionary(mocker, freqai_conf)
|
||||||
|
# freqai_conf['freqai']['feature_parameters'].update({"outlier_protection_percentage": 1})
|
||||||
|
freqai.dk.use_DBSCAN_to_remove_outliers(predict=False)
|
||||||
|
assert log_has_re(
|
||||||
|
"DBSCAN found eps of 2.42.",
|
||||||
|
caplog,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_compute_distances(mocker, freqai_conf):
|
||||||
|
freqai = make_data_dictionary(mocker, freqai_conf)
|
||||||
|
freqai_conf['freqai']['feature_parameters'].update({"DI_threshold": 1})
|
||||||
|
avg_mean_dist = freqai.dk.compute_distances()
|
||||||
|
assert round(avg_mean_dist, 2) == 2.56
|
||||||
|
|
||||||
|
|
||||||
|
def test_use_SVM_to_remove_outliers_and_outlier_protection(mocker, freqai_conf, caplog):
|
||||||
|
freqai = make_data_dictionary(mocker, freqai_conf)
|
||||||
|
freqai_conf['freqai']['feature_parameters'].update({"outlier_protection_percentage": 0.1})
|
||||||
|
freqai.dk.use_SVM_to_remove_outliers(predict=False)
|
||||||
|
assert log_has_re(
|
||||||
|
"SVM detected 8.46%",
|
||||||
|
caplog,
|
||||||
|
)
|
||||||
|
Loading…
Reference in New Issue
Block a user