improve DBSCAN performance for subsequent trainings

This commit is contained in:
Robert Caulk 2022-08-04 17:41:58 +02:00
parent fe1b8515a8
commit 51a6b4289f
3 changed files with 17 additions and 10 deletions

View File

@@ -76,6 +76,7 @@ class FreqaiDataDrawer:
         self.load_historic_predictions_from_disk()
         self.training_queue: Dict[str, int] = {}
         self.history_lock = threading.Lock()
+        self.old_DBSCAN_eps: Dict[str, float] = {}

     def load_drawer_from_disk(self):
         """

View File

@@ -582,7 +582,7 @@ class FreqaiDataKitchen:
         return

-    def use_DBSCAN_to_remove_outliers(self, predict: bool) -> None:
+    def use_DBSCAN_to_remove_outliers(self, predict: bool, eps=None) -> None:
         """
         Use DBSCAN to cluster training data and remove "noisy" data (read outliers).
         User controls this via the config param `DBSCAN_outlier_pct` which indicates the
@@ -615,10 +615,10 @@ class FreqaiDataKitchen:
         else:
             outlier_target = self.freqai_config['feature_parameters'].get('DBSCAN_outlier_pct')

-        if 'DBSCAN_eps' in self.data:
-            eps = self.data['DBSCAN_eps']
+        if eps:
+            epsilon = eps
         else:
-            eps = 10
+            epsilon = 10
             logger.info('DBSCAN starting from high value. This should be faster next train.')

         error = 1.
@@ -628,7 +628,7 @@ class FreqaiDataKitchen:
         # find optimal value for epsilon using an iterative approach:
         while abs(np.sqrt(error)) > 0.1:
-            clustering = DBSCAN(eps=eps, min_samples=MinPts,
+            clustering = DBSCAN(eps=epsilon, min_samples=MinPts,
                                 n_jobs=int(self.thread_count / 2)).fit(
                 self.data_dictionary['train_features']
             )
@@ -637,13 +637,14 @@ class FreqaiDataKitchen:
             multiplier = (outlier_pct - outlier_target) if outlier_pct > 0 else 1 * \
                 np.sign(outlier_pct - outlier_target)
             multiplier = 1 + error * multiplier
-            eps = multiplier * eps
+            epsilon = multiplier * epsilon
             logger.info(
-                f'DBSCAN error {error:.2f} for eps {eps:.2f} and outliet pct {outlier_pct:.2f}')
+                f'DBSCAN error {error:.2f} for eps {epsilon:.2f}'
+                f' and outlier pct {outlier_pct:.2f}')

-        logger.info(f'DBSCAN found eps of {eps}.')
-        self.data['DBSCAN_eps'] = eps
+        logger.info(f'DBSCAN found eps of {epsilon}.')
+        self.data['DBSCAN_eps'] = epsilon
         self.data['DBSCAN_min_samples'] = MinPts

         dropped_points = np.where(clustering.labels_ == -1, 1, 0)

View File

@@ -385,7 +385,12 @@ class IFreqaiModel(ABC):
             dk.data["avg_mean_dist"] = dk.compute_distances()

         if self.freqai_info["feature_parameters"].get("DBSCAN_outlier_pct", 0):
-            dk.use_DBSCAN_to_remove_outliers(predict=False)
+            if dk.pair in self.dd.old_DBSCAN_eps:
+                eps = self.dd.old_DBSCAN_eps[dk.pair]
+            else:
+                eps = None
+            dk.use_DBSCAN_to_remove_outliers(predict=False, eps=eps)
+            self.dd.old_DBSCAN_eps[dk.pair] = dk.data['DBSCAN_eps']

     def data_cleaning_predict(self, dk: FreqaiDataKitchen, dataframe: DataFrame) -> None:
         """