fix bug in DBSCAN, update doc

This commit is contained in:
Robert Caulk 2022-08-04 17:00:59 +02:00
parent 29225e4baf
commit fe1b8515a8
2 changed files with 33 additions and 11 deletions

View File

@ -539,11 +539,19 @@ The user can tell FreqAI to use DBSCAN to cluster training data and remove outli
parameter `DBSCAN_outlier_pct` allows the user to indicate the percent of expected outliers to be removed during each training
(typically set below 0.05). Higher value increases confidence in the model predictions but reduces the entry frequency.
The FreqAI DBSCAN wrapper performs an iterative solution to solving the `eps` hyper parameter. `eps` controls the fraction of
training data considered to be an outlier - thus the iterative solution finds the exact value associated with the user designated
`DBSCAN_outlier_pct`. This iterative solution is performed once per training. FreqAI stores the `eps` to be used when DBSCAN
is again called to determine if incoming prediction candles are outliers.
```json
"freqai": {
"feature_parameters" : {
"DBSCAN_outlier_pct": 0.05
}
}
```
### Stratifying the data

The user can stratify the training/testing data using:

View File

@ -11,6 +11,7 @@ import numpy.typing as npt
import pandas as pd import pandas as pd
from pandas import DataFrame from pandas import DataFrame
from sklearn import linear_model from sklearn import linear_model
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import pairwise_distances from sklearn.metrics.pairwise import pairwise_distances
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
@ -19,7 +20,7 @@ from freqtrade.data.history.history_utils import refresh_backtest_ohlcv_data
from freqtrade.exceptions import OperationalException from freqtrade.exceptions import OperationalException
from freqtrade.resolvers import ExchangeResolver from freqtrade.resolvers import ExchangeResolver
from freqtrade.strategy.interface import IStrategy from freqtrade.strategy.interface import IStrategy
from sklearn.cluster import DBSCAN
SECONDS_IN_DAY = 86400 SECONDS_IN_DAY = 86400
SECONDS_IN_HOUR = 3600 SECONDS_IN_HOUR = 3600
@ -499,7 +500,8 @@ class FreqaiDataKitchen:
for prediction confidence in the Dissimilarity Index for prediction confidence in the Dissimilarity Index
""" """
logger.info("computing average mean distance for all training points") logger.info("computing average mean distance for all training points")
pairwise = pairwise_distances(self.data_dictionary["train_features"], n_jobs=self.thread_count) pairwise = pairwise_distances(
self.data_dictionary["train_features"], n_jobs=self.thread_count)
avg_mean_dist = pairwise.mean(axis=1).mean() avg_mean_dist = pairwise.mean(axis=1).mean()
return avg_mean_dist return avg_mean_dist
@ -613,21 +615,33 @@ class FreqaiDataKitchen:
else: else:
outlier_target = self.freqai_config['feature_parameters'].get('DBSCAN_outlier_pct') outlier_target = self.freqai_config['feature_parameters'].get('DBSCAN_outlier_pct')
eps = 1.8 if 'DBSCAN_eps' in self.data:
eps = self.data['DBSCAN_eps']
else:
eps = 10
logger.info('DBSCAN starting from high value. This should be faster next train.')
error = 1. error = 1.
MinPts = len(train_ft_df.columns) * 2 MinPts = len(self.data_dictionary['train_features'].columns)
logger.info( logger.info(
f'DBSCAN finding best clustering for {outlier_target}% outliers.') f'DBSCAN finding best clustering for {outlier_target}% outliers.')
# find optimal value for epsilon using an iterative approach: # find optimal value for epsilon using an iterative approach:
while abs(error) > 0.01: while abs(np.sqrt(error)) > 0.1:
clustering = DBSCAN(eps=eps, min_samples=MinPts, n_jobs=-1).fit( clustering = DBSCAN(eps=eps, min_samples=MinPts,
train_ft_df n_jobs=int(self.thread_count / 2)).fit(
) self.data_dictionary['train_features']
)
outlier_pct = np.count_nonzero(clustering.labels_ == -1) / len(clustering.labels_) outlier_pct = np.count_nonzero(clustering.labels_ == -1) / len(clustering.labels_)
error = (outlier_pct - outlier_target) / outlier_target error = (outlier_pct - outlier_target) ** 2 / outlier_target
multiplier = 1 + error * (1.01 - 1.) multiplier = (outlier_pct - outlier_target) if outlier_pct > 0 else 1 * \
np.sign(outlier_pct - outlier_target)
multiplier = 1 + error * multiplier
eps = multiplier * eps eps = multiplier * eps
logger.info(
f'DBSCAN error {error:.2f} for eps {eps:.2f} and outliet pct {outlier_pct:.2f}')
logger.info(f'DBSCAN found eps of {eps}.')
self.data['DBSCAN_eps'] = eps self.data['DBSCAN_eps'] = eps
self.data['DBSCAN_min_samples'] = MinPts self.data['DBSCAN_min_samples'] = MinPts