diff --git a/docs/freqai.md b/docs/freqai.md index 8537beac2..e38132cb1 100644 --- a/docs/freqai.md +++ b/docs/freqai.md @@ -539,11 +539,19 @@ The user can tell FreqAI to use DBSCAN to cluster training data and remove outli parameter `DBSCAN_outlier_pct` allows the user to indicate the percent of expected outliers to be removed during each training (typically set below 0.05). Higher value increases confidence in the model predictions but reduces the entry frequency. -The FreqAI DBSCAN wrapper performs an interative solution to solving the `eps` hyper parameter. `eps` controls the fraction of +The FreqAI DBSCAN wrapper iteratively solves for the `eps` hyperparameter. `eps` controls the fraction of training data considered to be an outlier - thus the iterative solution finds the exact value associated with the user designated `DBSCAN_outlier_pct`. This iterative solution is performed once per training. FreqAI stores the `eps` to be used when DBSCAN is again called to determine if incoming prediction candles are outliers. 
+```json + "freqai": { + "feature_parameters" : { + "DBSCAN_outlier_pct": 0.05 + } + } +``` + ### Stratifying the data The user can stratify the training/testing data using: diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py index 0cec0b9e1..823cf2a55 100644 --- a/freqtrade/freqai/data_kitchen.py +++ b/freqtrade/freqai/data_kitchen.py @@ -11,6 +11,7 @@ import numpy.typing as npt import pandas as pd from pandas import DataFrame from sklearn import linear_model +from sklearn.cluster import DBSCAN from sklearn.metrics.pairwise import pairwise_distances from sklearn.model_selection import train_test_split @@ -19,7 +20,7 @@ from freqtrade.data.history.history_utils import refresh_backtest_ohlcv_data from freqtrade.exceptions import OperationalException from freqtrade.resolvers import ExchangeResolver from freqtrade.strategy.interface import IStrategy -from sklearn.cluster import DBSCAN + SECONDS_IN_DAY = 86400 SECONDS_IN_HOUR = 3600 @@ -499,7 +500,8 @@ class FreqaiDataKitchen: for prediction confidence in the Dissimilarity Index """ logger.info("computing average mean distance for all training points") - pairwise = pairwise_distances(self.data_dictionary["train_features"], n_jobs=self.thread_count) + pairwise = pairwise_distances( + self.data_dictionary["train_features"], n_jobs=self.thread_count) avg_mean_dist = pairwise.mean(axis=1).mean() return avg_mean_dist @@ -613,21 +615,33 @@ class FreqaiDataKitchen: else: outlier_target = self.freqai_config['feature_parameters'].get('DBSCAN_outlier_pct') - eps = 1.8 + if 'DBSCAN_eps' in self.data: + eps = self.data['DBSCAN_eps'] + else: + eps = 10 + logger.info('DBSCAN starting from high value. This should be faster next train.') + error = 1. 
- MinPts = len(train_ft_df.columns) * 2 + MinPts = len(self.data_dictionary['train_features'].columns) logger.info( f'DBSCAN finding best clustering for {outlier_target}% outliers.') # find optimal value for epsilon using an iterative approach: - while abs(error) > 0.01: - clustering = DBSCAN(eps=eps, min_samples=MinPts, n_jobs=-1).fit( - train_ft_df - ) + while abs(np.sqrt(error)) > 0.1: + clustering = DBSCAN(eps=eps, min_samples=MinPts, + n_jobs=int(self.thread_count / 2)).fit( + self.data_dictionary['train_features'] + ) outlier_pct = np.count_nonzero(clustering.labels_ == -1) / len(clustering.labels_) - error = (outlier_pct - outlier_target) / outlier_target - multiplier = 1 + error * (1.01 - 1.) + error = (outlier_pct - outlier_target) ** 2 / outlier_target + multiplier = (outlier_pct - outlier_target) if outlier_pct > 0 else 1 * \ + np.sign(outlier_pct - outlier_target) + multiplier = 1 + error * multiplier eps = multiplier * eps + logger.info( + f'DBSCAN error {error:.2f} for eps {eps:.2f} and outliet pct {outlier_pct:.2f}') + + logger.info(f'DBSCAN found eps of {eps}.') self.data['DBSCAN_eps'] = eps self.data['DBSCAN_min_samples'] = MinPts