From 52ee7fc981abf2efc153b52de5dcb151de636744 Mon Sep 17 00:00:00 2001 From: th0rntwig Date: Thu, 18 Aug 2022 14:44:49 +0200 Subject: [PATCH 1/3] Add inlier metric computation --- freqtrade/freqai/data_kitchen.py | 74 ++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py index 35f51baed..7a885659d 100644 --- a/freqtrade/freqai/data_kitchen.py +++ b/freqtrade/freqai/data_kitchen.py @@ -654,6 +654,80 @@ class FreqaiDataKitchen: ) return + + def compute_inlier_metric(self) -> None: + """ + + Compute inlier metric from backwards distance distributions. + This metric defines how well features from a timepoint fit + into previous timepoints. + """ + + import scipy.stats as ss + + nmb_previous_points = self.data['InlierMetric_nmb_points'] + weibull_percentile = self.data['InlierMetric_weib_perc'] + + train_ft_df = self.data_dictionary['train_features'] + train_ft_df_reindexed = train_ft_df.reindex( + index=np.flip(train_ft_df.index) + ) + + pairwise = pd.DataFrame( + np.triu( + pairwise_distances(train_ft_df_reindexed, n_jobs=self.thread_count) + ), + columns=train_ft_df_reindexed.index, + index=train_ft_df_reindexed.index + ) + pairwise = pairwise.round(5) + + column_labels = [ + '{}{}'.format('d', i) for i in range(1, nmb_previous_points+1) + ] + distances = pd.DataFrame( + columns=column_labels, index=train_ft_df.index + ) + for index in train_ft_df.index[nmb_previous_points]: + current_row = pairwise.loc[[index]] + current_row_no_zeros = current_row.loc[ + :, (current_row!=0).any(axis=0) + ] + distances.loc[[index]] = current_row_no_zeros.iloc[ + :, :nmb_previous_points + ] + distances = distances.replace([np.inf, -np.inf], np.nan) + drop_index = pd.isnull(distances).any(1) + distances = distances[drop_index==0] + + inliers = pd.DataFrame(index=distances.index) + for key in distances.keys(): + current_distances = distances[key].dropna() + fit_params = ss.weibull_min.fit(current_distances) + cutoff = ss.weibull_min.ppf(weibull_percentile, *fit_params) + is_inlier = np.where( + current_distances<=cutoff, 1, 0 + ) + df_inlier = pd.DataFrame( + {key+'_IsInlier':is_inlier}, index=distances.index + ) + inliers = pd.concat( + [inliers, df_inlier], axis=1 + ) + + self.data_dictionary['train_features'] = pd.DataFrame( + data=inliers.sum(axis=1)/nmb_previous_points, + columns=['inlier_metric'], + index = train_ft_df.index + ) + + percent_outliers = np.round( + 100*(1-self.data_dictionary['iniler_metric'].sum()/ + len(train_ft_df.index)), 2 + ) + logger.info('{percent_outliers}%% of data points were identified as outliers') + + return None def find_features(self, dataframe: DataFrame) -> None: """ From 98c62dad910ac74a8579e099d1a07e4cc5b0180c Mon Sep 17 00:00:00 2001 From: robcaulk Date: Thu, 18 Aug 2022 19:15:29 +0200 Subject: [PATCH 2/3] integrate inlier metric function --- freqtrade/freqai/data_kitchen.py | 85 ++++++++++++++++++---------- freqtrade/freqai/freqai_interface.py | 36 ++++++++---- 2 files changed, 79 insertions(+), 42 deletions(-) diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py index 7a885659d..ca4687902 100644 --- a/freqtrade/freqai/data_kitchen.py +++ b/freqtrade/freqai/data_kitchen.py @@ -654,81 +654,104 @@ class FreqaiDataKitchen: ) return - - def compute_inlier_metric(self) -> None: + + def compute_inlier_metric(self, set_='train') -> None: """ - - Compute inlier metric from backwards distance distributions. - This metric defines how well features from a timepoint fit + + Compute inlier metric from backwards distance distributions. + This metric defines how well features from a timepoint fit into previous timepoints. """ import scipy.stats as ss - - nmb_previous_points = self.data['InlierMetric_nmb_points'] - weibull_percentile = self.data['InlierMetric_weib_perc'] - train_ft_df = self.data_dictionary['train_features'] - train_ft_df_reindexed = train_ft_df.reindex( - index=np.flip(train_ft_df.index) + no_prev_pts = self.freqai_config["feature_parameters"]["inlier_metric_window"] + weib_pct = self.freqai_config["feature_parameters"]["inlier_metric_weibull_cutoff"] + + if set_ == 'train': + compute_df = copy.deepcopy(self.data_dictionary['train_features']) + elif set_ == 'test': + compute_df = copy.deepcopy(self.data_dictionary['test_features']) + else: + compute_df = copy.deepcopy(self.data_dictionary['prediction_features']) + + compute_df_reindexed = compute_df.reindex( + index=np.flip(compute_df.index) ) pairwise = pd.DataFrame( np.triu( - pairwise_distances(train_ft_df_reindexed, n_jobs=self.thread_count) + pairwise_distances(compute_df_reindexed, n_jobs=self.thread_count) ), - columns=train_ft_df_reindexed.index, - index=train_ft_df_reindexed.index + columns=compute_df_reindexed.index, + index=compute_df_reindexed.index ) pairwise = pairwise.round(5) column_labels = [ - '{}{}'.format('d', i) for i in range(1, nmb_previous_points+1) + '{}{}'.format('d', i) for i in range(1, no_prev_pts + 1) ] distances = pd.DataFrame( - columns=column_labels, index=train_ft_df.index + columns=column_labels, index=compute_df.index ) - for index in train_ft_df.index[nmb_previous_points]: + + for index in compute_df.index[no_prev_pts:]: current_row = pairwise.loc[[index]] current_row_no_zeros = current_row.loc[ - :, (current_row!=0).any(axis=0) + :, (current_row != 0).any(axis=0) ] distances.loc[[index]] = current_row_no_zeros.iloc[ - :, :nmb_previous_points + :, :no_prev_pts ] distances = distances.replace([np.inf, -np.inf], np.nan) drop_index = pd.isnull(distances).any(1) - distances = distances[drop_index==0] + distances = distances[drop_index == 0] inliers = pd.DataFrame(index=distances.index) for key in distances.keys(): current_distances = distances[key].dropna() fit_params = ss.weibull_min.fit(current_distances) - cutoff = ss.weibull_min.ppf(weibull_percentile, *fit_params) + cutoff = ss.weibull_min.ppf(weib_pct, *fit_params) is_inlier = np.where( - current_distances<=cutoff, 1, 0 + current_distances <= cutoff, 1, 0 ) df_inlier = pd.DataFrame( - {key+'_IsInlier':is_inlier}, index=distances.index + {key + '_IsInlier': is_inlier}, index=distances.index ) inliers = pd.concat( [inliers, df_inlier], axis=1 ) - self.data_dictionary['train_features'] = pd.DataFrame( - data=inliers.sum(axis=1)/nmb_previous_points, + inlier_metric = pd.DataFrame( + data=inliers.sum(axis=1) / no_prev_pts, columns=['inlier_metric'], - index = train_ft_df.index + index=compute_df.index ) - percent_outliers = np.round( - 100*(1-self.data_dictionary['iniler_metric'].sum()/ - len(train_ft_df.index)), 2 - ) - logger.info('{percent_outliers}%% of data points were identified as outliers') + inlier_metric = 2 * (inlier_metric - inlier_metric.min()) / \ + (inlier_metric.max() - inlier_metric.min()) - 1 + + if set_ in ('train', 'test'): + inlier_metric = inlier_metric.iloc[no_prev_pts:] + compute_df = compute_df.iloc[no_prev_pts:] + self.remove_beginning_points_from_data_dict(set_, no_prev_pts) + self.data_dictionary[f'{set_}_features'] = pd.concat( + [compute_df, inlier_metric], axis=1) + else: + self.data_dictionary['prediction_features'] = pd.concat( + [compute_df, inlier_metric], axis=1) + self.data_dictionary['prediction_features'].fillna(0, inplace=True) return None + def remove_beginning_points_from_data_dict(self, set_='train', no_prev_pts: int = 10): + features = self.data_dictionary[f'{set_}_features'] + weights = self.data_dictionary[f'{set_}_weights'] + labels = self.data_dictionary[f'{set_}_labels'] + self.data_dictionary[f'{set_}_weights'] = weights[no_prev_pts:] + self.data_dictionary[f'{set_}_features'] = features.iloc[no_prev_pts:] + self.data_dictionary[f'{set_}_labels'] = labels.iloc[no_prev_pts:] + def find_features(self, dataframe: DataFrame) -> None: """ Find features in the strategy provided dataframe diff --git a/freqtrade/freqai/freqai_interface.py b/freqtrade/freqai/freqai_interface.py index 49e4ce5c3..3535d7371 100644 --- a/freqtrade/freqai/freqai_interface.py +++ b/freqtrade/freqai/freqai_interface.py @@ -66,7 +66,6 @@ class IFreqaiModel(ABC): "data_split_parameters", {}) self.model_training_parameters: Dict[str, Any] = config.get("freqai", {}).get( "model_training_parameters", {}) - self.feature_parameters = config.get("freqai", {}).get("feature_parameters") self.retrain = False self.first = True self.set_full_path() @@ -74,11 +73,14 @@ class IFreqaiModel(ABC): self.dd = FreqaiDataDrawer(Path(self.full_path), self.config, self.follow_mode) self.identifier: str = self.freqai_info.get("identifier", "no_id_provided") self.scanning = False + self.ft_params = self.freqai_info["feature_parameters"] self.keras: bool = self.freqai_info.get("keras", False) - if self.keras and self.freqai_info.get("feature_parameters", {}).get("DI_threshold", 0): - self.freqai_info["feature_parameters"]["DI_threshold"] = 0 + if self.keras and self.ft_params.get("DI_threshold", 0): + self.ft_params["DI_threshold"] = 0 logger.warning("DI threshold is not configured for Keras models yet. Deactivating.") self.CONV_WIDTH = self.freqai_info.get("conv_width", 2) + if self.ft_params.get("inlier_metric_window", 0): + self.CONV_WIDTH = self.ft_params.get("inlier_metric_window", 0) * 2 self.pair_it = 0 self.total_pairs = len(self.config.get("exchange", {}).get("pair_whitelist")) self.last_trade_database_summary: DataFrame = {} @@ -389,18 +391,20 @@ class IFreqaiModel(ABC): example of how outlier data points are dropped from the dataframe used for training. """ - if self.freqai_info["feature_parameters"].get( + ft_params = self.freqai_info["feature_parameters"] + + if ft_params.get( "principal_component_analysis", False ): dk.principal_component_analysis() - if self.freqai_info["feature_parameters"].get("use_SVM_to_remove_outliers", False): + if ft_params.get("use_SVM_to_remove_outliers", False): dk.use_SVM_to_remove_outliers(predict=False) - if self.freqai_info["feature_parameters"].get("DI_threshold", 0): + if ft_params.get("DI_threshold", 0): dk.data["avg_mean_dist"] = dk.compute_distances() - if self.freqai_info["feature_parameters"].get("use_DBSCAN_to_remove_outliers", False): + if ft_params.get("use_DBSCAN_to_remove_outliers", False): if dk.pair in self.dd.old_DBSCAN_eps: eps = self.dd.old_DBSCAN_eps[dk.pair] else: @@ -408,6 +412,11 @@ class IFreqaiModel(ABC): dk.use_DBSCAN_to_remove_outliers(predict=False, eps=eps) self.dd.old_DBSCAN_eps[dk.pair] = dk.data['DBSCAN_eps'] + if ft_params.get('inlier_metric_window', 0): + dk.compute_inlier_metric(set_='train') + if self.freqai_info["data_split_parameters"]["test_size"] > 0: + dk.compute_inlier_metric(set_='test') + def data_cleaning_predict(self, dk: FreqaiDataKitchen, dataframe: DataFrame) -> None: """ Base data cleaning method for predict. @@ -419,18 +428,23 @@ class IFreqaiModel(ABC): of how the do_predict vector is modified. do_predict is ultimately passed back to strategy for buy signals. """ - if self.freqai_info["feature_parameters"].get( + ft_params = self.freqai_info["feature_parameters"] + + if ft_params.get('inlier_metric_window', 0): + dk.compute_inlier_metric(set_='predict') + + if ft_params.get( "principal_component_analysis", False ): dk.pca_transform(dataframe) - if self.freqai_info["feature_parameters"].get("use_SVM_to_remove_outliers", False): + if ft_params.get("use_SVM_to_remove_outliers", False): dk.use_SVM_to_remove_outliers(predict=True) - if self.freqai_info["feature_parameters"].get("DI_threshold", 0): + if ft_params.get("DI_threshold", 0): dk.check_if_pred_in_training_spaces() - if self.freqai_info["feature_parameters"].get("use_DBSCAN_to_remove_outliers", False): + if ft_params.get("use_DBSCAN_to_remove_outliers", False): dk.use_DBSCAN_to_remove_outliers(predict=True) def model_exists( From 755041c134989d10093f1d65f23ebe2d45c643fe Mon Sep 17 00:00:00 2001 From: robcaulk Date: Fri, 19 Aug 2022 18:35:24 +0200 Subject: [PATCH 3/3] add noise feature, improve docstrings --- freqtrade/freqai/data_kitchen.py | 11 +++++++++++ freqtrade/freqai/freqai_interface.py | 18 +++++++----------- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py index ca4687902..c8516a8bd 100644 --- a/freqtrade/freqai/data_kitchen.py +++ b/freqtrade/freqai/data_kitchen.py @@ -752,6 +752,17 @@ class FreqaiDataKitchen: self.data_dictionary[f'{set_}_features'] = features.iloc[no_prev_pts:] self.data_dictionary[f'{set_}_labels'] = labels.iloc[no_prev_pts:] + def add_noise_to_training_features(self) -> None: + """ + Add noise to train features to reduce the risk of overfitting. + """ + mu = 0 # no shift + sigma = self.freqai_config["feature_parameters"]["noise_standard_deviation"] + compute_df = self.data_dictionary['train_features'] + noise = np.random.normal(mu, sigma, [compute_df.shape[0], compute_df.shape[1]]) + self.data_dictionary['train_features'] += noise + return + def find_features(self, dataframe: DataFrame) -> None: """ Find features in the strategy provided dataframe diff --git a/freqtrade/freqai/freqai_interface.py b/freqtrade/freqai/freqai_interface.py index 3535d7371..07303b49f 100644 --- a/freqtrade/freqai/freqai_interface.py +++ b/freqtrade/freqai/freqai_interface.py @@ -385,10 +385,9 @@ class IFreqaiModel(ABC): def data_cleaning_train(self, dk: FreqaiDataKitchen) -> None: """ - Base data cleaning method for train - Any function inside this method should drop training data points from the filtered_dataframe - based on user decided logic. See FreqaiDataKitchen::use_SVM_to_remove_outliers() for an - example of how outlier data points are dropped from the dataframe used for training. + Base data cleaning method for train. + Functions here improve/modify the input data by identifying outliers, + computing additional metrics, adding noise, reducing dimensionality etc. """ ft_params = self.freqai_info["feature_parameters"] @@ -417,16 +416,13 @@ class IFreqaiModel(ABC): if self.freqai_info["data_split_parameters"]["test_size"] > 0: dk.compute_inlier_metric(set_='test') + if self.freqai_info["feature_parameters"].get('noise_standard_deviation', 0): + dk.add_noise_to_training_features() + def data_cleaning_predict(self, dk: FreqaiDataKitchen, dataframe: DataFrame) -> None: """ Base data cleaning method for predict. - These functions each modify dk.do_predict, which is a dataframe with equal length - to the number of candles coming from and returning to the strategy. Inside do_predict, - 1 allows prediction and < 0 signals to the strategy that the model is not confident in - the prediction. - See FreqaiDataKitchen::remove_outliers() for an example - of how the do_predict vector is modified. do_predict is ultimately passed back to strategy - for buy signals. + Functions here are complementary to the functions of data_cleaning_train. """ ft_params = self.freqai_info["feature_parameters"]