fix bug in DBSCAN, update doc
@@ -539,11 +539,19 @@ The user can tell FreqAI to use DBSCAN to cluster training data and remove outli
parameter `DBSCAN_outlier_pct` allows the user to indicate the percent of expected outliers to be removed during each training
(typically set below 0.05). A higher value increases confidence in the model predictions but reduces the entry frequency.

The FreqAI DBSCAN wrapper performs an interative solution to solving the `eps` hyper parameter. `eps` controls the fraction of
The FreqAI DBSCAN wrapper iteratively solves for the `eps` hyperparameter. `eps` controls the fraction of
training data treated as outliers, so the iteration searches for the value that matches the user-designated
`DBSCAN_outlier_pct`. This search is performed once per training. FreqAI stores the resulting `eps` so that DBSCAN
can be called again to determine whether incoming prediction candles are outliers.

```json
    "freqai": {
        "feature_parameters" : {
            "DBSCAN_outlier_pct": 0.05
        }
    }
```
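
Once training has produced the stored `eps` (and a matching `min_samples`), the idea described above is to re-apply them to incoming prediction candles. A minimal sketch of one way to do that, assuming the new candle is appended to the training features and re-labelled by a fresh DBSCAN fit (the helper below is hypothetical, not FreqAI's API):

```python
import pandas as pd
from sklearn.cluster import DBSCAN


def candle_is_outlier(train_features: pd.DataFrame, candle: pd.Series,
                      eps: float, min_samples: int) -> bool:
    """Hypothetical helper: flag an incoming candle as an outlier by refitting
    DBSCAN on the training features plus the new point, using the eps and
    min_samples stored from the last training."""
    combined = pd.concat([train_features, candle.to_frame().T], ignore_index=True)
    labels = DBSCAN(eps=eps, min_samples=min_samples, n_jobs=-1).fit(combined).labels_
    # DBSCAN assigns the label -1 to noise points, i.e. outliers
    return bool(labels[-1] == -1)
```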

### Stratifying the data

The user can stratify the training/testing data using:

@@ -11,6 +11,7 @@ import numpy.typing as npt
import pandas as pd
from pandas import DataFrame
from sklearn import linear_model
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.model_selection import train_test_split

@@ -19,7 +20,7 @@ from freqtrade.data.history.history_utils import refresh_backtest_ohlcv_data
from freqtrade.exceptions import OperationalException
from freqtrade.resolvers import ExchangeResolver
from freqtrade.strategy.interface import IStrategy
from sklearn.cluster import DBSCAN


SECONDS_IN_DAY = 86400
SECONDS_IN_HOUR = 3600
@@ -499,7 +500,8 @@ class FreqaiDataKitchen:
        for prediction confidence in the Dissimilarity Index
        """
        logger.info("computing average mean distance for all training points")
        pairwise = pairwise_distances(self.data_dictionary["train_features"], n_jobs=self.thread_count)
        pairwise = pairwise_distances(
            self.data_dictionary["train_features"], n_jobs=self.thread_count)
        avg_mean_dist = pairwise.mean(axis=1).mean()

        return avg_mean_dist
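
The wrapped call above computes the full pairwise distance matrix over the training features and then averages the per-point mean distances to obtain `avg_mean_dist`. A standalone sketch of the same computation, on made-up values (the toy array is illustrative only):

```python
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

# Toy feature matrix: 4 samples, 2 features (values are made up)
train_features = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [5.0, 5.0]])

# Mean distance from each point to every point, then the mean of those means,
# mirroring the method body shown in the diff above
pairwise = pairwise_distances(train_features, n_jobs=-1)
avg_mean_dist = pairwise.mean(axis=1).mean()
print(f"average mean distance: {avg_mean_dist:.3f}")
```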
@@ -613,21 +615,33 @@ class FreqaiDataKitchen:

        else:
            outlier_target = self.freqai_config['feature_parameters'].get('DBSCAN_outlier_pct')
            eps = 1.8
            if 'DBSCAN_eps' in self.data:
                eps = self.data['DBSCAN_eps']
            else:
                eps = 10
                logger.info('DBSCAN starting from high value. This should be faster next train.')

            error = 1.
            MinPts = len(train_ft_df.columns) * 2
            MinPts = len(self.data_dictionary['train_features'].columns)
            logger.info(
                    f'DBSCAN finding best clustering for {outlier_target}% outliers.')

            # find optimal value for epsilon using an iterative approach:
            while abs(error) > 0.01:
                clustering = DBSCAN(eps=eps, min_samples=MinPts, n_jobs=-1).fit(
                    train_ft_df
                )
            while abs(np.sqrt(error)) > 0.1:
                clustering = DBSCAN(eps=eps, min_samples=MinPts,
                                    n_jobs=int(self.thread_count / 2)).fit(
                                        self.data_dictionary['train_features']
                                    )
                outlier_pct = np.count_nonzero(clustering.labels_ == -1) / len(clustering.labels_)
                error = (outlier_pct - outlier_target) / outlier_target
                multiplier = 1 + error * (1.01 - 1.)
                error = (outlier_pct - outlier_target) ** 2 / outlier_target
                multiplier = (outlier_pct - outlier_target) if outlier_pct > 0 else 1 * \
                    np.sign(outlier_pct - outlier_target)
                multiplier = 1 + error * multiplier
                eps = multiplier * eps
                logger.info(
                    f'DBSCAN error {error:.2f} for eps {eps:.2f} and outlier pct {outlier_pct:.2f}')

            logger.info(f'DBSCAN found eps of {eps}.')

            self.data['DBSCAN_eps'] = eps
            self.data['DBSCAN_min_samples'] = MinPts

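The loop above grows or shrinks `eps` until the observed DBSCAN noise fraction lands near `DBSCAN_outlier_pct`, then stores the result for re-use on the next training. A self-contained sketch of that search on synthetic data, simplified from the patch (the function name, starting `eps`, iteration cap, and random data are assumptions, not the FreqAI implementation):

```python
import numpy as np
from sklearn.cluster import DBSCAN


def calibrate_eps(features: np.ndarray, outlier_target: float,
                  eps: float = 10.0, max_iter: int = 50) -> float:
    """Sketch of the iterative search above: grow or shrink eps until the
    fraction of DBSCAN noise points lands near outlier_target."""
    min_pts = features.shape[1]  # the patch uses the number of feature columns
    for _ in range(max_iter):
        labels = DBSCAN(eps=eps, min_samples=min_pts, n_jobs=-1).fit(features).labels_
        outlier_pct = np.count_nonzero(labels == -1) / len(labels)
        error = (outlier_pct - outlier_target) ** 2 / outlier_target
        if np.sqrt(error) <= 0.1:
            break
        # Push eps up when too many points were labelled noise, down when too few
        direction = (outlier_pct - outlier_target) if outlier_pct > 0 \
            else np.sign(outlier_pct - outlier_target)
        eps *= 1 + error * direction
    return eps


rng = np.random.default_rng(42)
synthetic_features = rng.normal(size=(500, 10))  # stand-in for train_features
print(calibrate_eps(synthetic_features, outlier_target=0.05))
```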