add DBSCAN outlier detection feature, add supporting documentation

robcaulk 2022-08-04 12:14:56 +02:00
parent 778833f90e
commit 29225e4baf
3 changed files with 94 additions and 9 deletions

View File

@ -105,6 +105,7 @@ Mandatory parameters are marked as **Required**, which means that they are requi
| `stratify_training_data` | This value is used to indicate the stratification of the data. e.g. 2 would set every 2nd data point into a separate dataset to be pulled from during training/testing. <br> **Datatype:** positive integer.
| `indicator_max_period_candles` | The maximum *period* used in `populate_any_indicators()` for indicator creation. FreqAI uses this information in combination with the maximum timeframe to calculate how many data points it should download so that the first data point does not have a NaN <br> **Datatype:** positive integer.
| `indicator_periods_candles` | A list of integers used to duplicate all indicators according to a set of periods and add them to the feature set. <br> **Datatype:** list of positive integers.
| `DBSCAN_outlier_pct` | Inactive by default. If the user sets this to a fractional value, DBSCAN is used to cluster the training data and remove that fraction of training data points as outliers. <br> **Datatype:** float (fraction of 1).
| | **Data split parameters**
| `data_split_parameters` | Include any additional parameters available from Scikit-learn `train_test_split()`, which are shown [here](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) <br> **Datatype:** dictionary.
| `test_size` | Fraction of data that should be used for testing instead of training. <br> **Datatype:** positive float below 1.
@ -519,7 +520,7 @@ variance of the data set is >= 0.999.
### Removing outliers using a Support Vector Machine (SVM)
The user can tell FreqAI to remove outlier data points from the training/test data sets by setting:
```json
"freqai": {
@ -529,9 +530,20 @@ The user can tell Freqai to remove outlier data points from the training/test da
}
```
FreqAI will train an SVM on the training data (or components if the user activated
`principal_component_analysis`) and remove any data point that it deems to be sitting beyond the feature space.
### Clustering the training data and removing outliers with DBSCAN
The user can tell FreqAI to use DBSCAN to cluster the training data and remove outliers from the training data set. The user-set
parameter `DBSCAN_outlier_pct` indicates the fraction of training data points to be removed as outliers during each training
(typically set below 0.05). A higher value increases confidence in the model predictions but reduces the entry frequency.

The FreqAI DBSCAN wrapper iteratively solves for the `eps` hyperparameter. `eps` sets the neighborhood radius DBSCAN uses to
build clusters and therefore determines what fraction of the training data is labeled as outliers - the iterative search finds
the `eps` value that yields the user-designated `DBSCAN_outlier_pct`. This search is performed once per training. FreqAI then
stores the resulting `eps` so that, when DBSCAN is called again at prediction time, it can determine whether incoming
prediction candles are outliers.
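A minimal configuration sketch for activating this filter, mirroring the SVM snippet above (the `0.02` value is only illustrative):

```json
    "freqai": {
        "feature_parameters" : {
            "DBSCAN_outlier_pct": 0.02
        }
    }
```

With this setting, roughly 2% of the training points are dropped as outliers at each training, and prediction candles that fall outside the resulting clusters are flagged.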
### Stratifying the data
The user can stratify the training/testing data using:

View File

@ -19,7 +19,7 @@ from freqtrade.data.history.history_utils import refresh_backtest_ohlcv_data
from freqtrade.exceptions import OperationalException
from freqtrade.resolvers import ExchangeResolver
from freqtrade.strategy.interface import IStrategy
from sklearn.cluster import DBSCAN
SECONDS_IN_DAY = 86400
SECONDS_IN_HOUR = 3600
@ -91,6 +91,7 @@ class FreqaiDataKitchen:
self.trade_database_df: DataFrame = pd.DataFrame()
self.data['extra_returns_per_train'] = self.freqai_config.get('extra_returns_per_train', {})
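# threads used for the pairwise-distance computations below; `data_kitchen_thread_count` is a new config option (default -1 uses all cores)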
self.thread_count = self.freqai_config.get("data_kitchen_thread_count", -1)
def set_paths(
self,
@ -498,8 +499,7 @@ class FreqaiDataKitchen:
for prediction confidence in the Dissimilarity Index
"""
logger.info("computing average mean distance for all training points")
pairwise = pairwise_distances(self.data_dictionary["train_features"], n_jobs=self.thread_count)
avg_mean_dist = pairwise.mean(axis=1).mean()
return avg_mean_dist
@ -580,6 +580,76 @@ class FreqaiDataKitchen:
return
def use_DBSCAN_to_remove_outliers(self, predict: bool) -> None:
"""
Use DBSCAN to cluster the training data and remove "noisy" data (i.e. outliers).
The user controls this via the config param `DBSCAN_outlier_pct`, which indicates the
fraction of training data they want flagged as outliers.
:params:
predict: bool = If False (training), iterate to find the `eps` hyperparameter that matches
the user-requested outlier fraction. If True (prediction), use the parameters
determined during the previous training to estimate whether the current prediction
points are outliers.
"""
if predict:
train_ft_df = self.data_dictionary['train_features']
pred_ft_df = self.data_dictionary['prediction_features']
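# append the incoming prediction candles to the training features so that DBSCAN
# (run with the eps/min_samples found at training time) can flag which of them fall outside the training clusters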
num_preds = len(pred_ft_df)
df = pd.concat([train_ft_df, pred_ft_df], axis=0, ignore_index=True)
clustering = DBSCAN(
eps=self.data['DBSCAN_eps'],
min_samples=self.data['DBSCAN_min_samples'],
n_jobs=-1
).fit(df)
do_predict = np.where(clustering.labels_[-num_preds:] == -1, 0, 1)
if (len(do_predict) - do_predict.sum()) > 0:
logger.info(
f"DBSCAN tossed {len(do_predict) - do_predict.sum()} predictions"
)
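# fold this filter's 0/1 mask into the running do_predict flag: adding the mask and
# subtracting 1 leaves a point at 1 only if it passed this check and every earlier one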
self.do_predict += do_predict
self.do_predict -= 1
else:
train_ft_df = self.data_dictionary['train_features']
outlier_target = self.freqai_config['feature_parameters'].get('DBSCAN_outlier_pct')
eps = 1.8
error = 1.
MinPts = len(train_ft_df.columns) * 2
logger.info(
f'DBSCAN finding best clustering for a {outlier_target} fraction of outliers.')
# find optimal value for epsilon using an iterative approach:
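# error is the relative miss versus the target outlier fraction; eps is nudged by 1% of that
# error each pass (too many outliers -> eps grows -> larger neighborhoods -> fewer outliers)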
while abs(error) > 0.01:
clustering = DBSCAN(eps=eps, min_samples=MinPts, n_jobs=-1).fit(
train_ft_df
)
outlier_pct = np.count_nonzero(clustering.labels_ == -1) / len(clustering.labels_)
error = (outlier_pct - outlier_target) / outlier_target
multiplier = 1 + error * (1.01 - 1.)
eps = multiplier * eps
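# store the solved parameters so the predict path can reuse the same clustering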
self.data['DBSCAN_eps'] = eps
self.data['DBSCAN_min_samples'] = MinPts
dropped_points = np.where(clustering.labels_ == -1, 1, 0)
self.data_dictionary['train_features'] = self.data_dictionary['train_features'][
(clustering.labels_ != -1)
]
self.data_dictionary["train_labels"] = self.data_dictionary["train_labels"][
(clustering.labels_ != -1)
]
self.data_dictionary["train_weights"] = self.data_dictionary["train_weights"][
(clustering.labels_ != -1)
]
logger.info(
f"DBSCAN tossed {dropped_points.sum()}"
f" train points from {len(clustering.labels_)}"
)
return
def find_features(self, dataframe: DataFrame) -> None:
"""
Find features in the strategy provided dataframe
@ -596,7 +666,6 @@ class FreqaiDataKitchen:
self.training_features_list = features
self.label_list = labels
# return features, labels
def check_if_pred_in_training_spaces(self) -> None:
"""
@ -606,11 +675,10 @@ class FreqaiDataKitchen:
from the training data set.
"""
tc = self.freqai_config.get("model_training_parameters", {}).get("thread_count", -1)
distance = pairwise_distances(
self.data_dictionary["train_features"],
self.data_dictionary["prediction_features"],
n_jobs=self.thread_count,
)
self.DI_values = distance.min(axis=0) / self.data["avg_mean_dist"]
@ -677,7 +745,6 @@ class FreqaiDataKitchen:
to_keep = [col for col in dataframe.columns if not col.startswith("&")]
self.return_dataframe = pd.concat([dataframe[to_keep], self.full_df], axis=1)
# self.append_df = DataFrame()
self.full_df = DataFrame()
return

View File

@ -384,6 +384,9 @@ class IFreqaiModel(ABC):
if self.freqai_info["feature_parameters"].get("DI_threshold", 0):
dk.data["avg_mean_dist"] = dk.compute_distances()
if self.freqai_info["feature_parameters"].get("DBSCAN_outlier_pct", 0):
dk.use_DBSCAN_to_remove_outliers(predict=False)
def data_cleaning_predict(self, dk: FreqaiDataKitchen, dataframe: DataFrame) -> None:
"""
Base data cleaning method for predict.
@ -406,6 +409,9 @@ class IFreqaiModel(ABC):
if self.freqai_info["feature_parameters"].get("DI_threshold", 0):
dk.check_if_pred_in_training_spaces()
if self.freqai_info["feature_parameters"].get("DBSCAN_outlier_pct", 0):
dk.use_DBSCAN_to_remove_outliers(predict=True)
def model_exists(
self,
pair: str,