From d3cb211283ced68d082cfdbdac12f3d2ab90d63b Mon Sep 17 00:00:00 2001 From: th0rntwig Date: Thu, 18 Aug 2022 14:44:49 +0200 Subject: [PATCH 01/11] Add inlier metric computation --- freqtrade/freqai/data_kitchen.py | 74 ++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py index 763a07375..62e353949 100644 --- a/freqtrade/freqai/data_kitchen.py +++ b/freqtrade/freqai/data_kitchen.py @@ -723,6 +723,80 @@ class FreqaiDataKitchen: ) return + + def compute_inlier_metric(self) -> None: + """ + + Compute inlier metric from backwards distance distributions. + This metric defines how well features from a timepoint fit + into previous timepoints. + """ + + import scipy.stats as ss + + nmb_previous_points = self.data['InlierMetric_nmb_points'] + weibull_percentile = self.data['InlierMetric_weib_perc'] + + train_ft_df = self.data_dictionary['train_features'] + train_ft_df_reindexed = train_ft_df.reindex( + index=np.flip(train_ft_df.index) + ) + + pairwise = pd.DataFrame( + np.triu( + pairwise_distances(train_ft_df_reindexed, n_jobs=self.thread_count) + ), + columns=train_ft_df_reindexed.index, + index=train_ft_df_reindexed.index + ) + pairwise = pairwise.round(5) + + column_labels = [ + '{}{}'.format('d', i) for i in range(1, nmb_previous_points+1) + ] + distances = pd.DataFrame( + columns=column_labels, index=train_ft_df.index + ) + for index in train_ft_df.index[nmb_previous_points]: + current_row = pairwise.loc[[index]] + current_row_no_zeros = current_row.loc[ + :, (current_row!=0).any(axis=0) + ] + distances.loc[[index]] = current_row_no_zeros.iloc[ + :, :nmb_previous_points + ] + distances = distances.replace([np.inf, -np.inf], np.nan) + drop_index = pd.isnull(distances).any(1) + distances = distances[drop_index==0] + + inliers = pd.DataFrame(index=distances.index) + for key in distances.keys(): + current_distances = distances[key].dropna() + fit_params = ss.weibull_min.fit(current_distances) + cutoff = ss.weibull_min.ppf(weibull_percentile, *fit_params) + is_inlier = np.where( + current_distances<=cutoff, 1, 0 + ) + df_inlier = pd.DataFrame( + {key+'_IsInlier':is_inlier}, index=distances.index + ) + inliers = pd.concat( + [inliers, df_inlier], axis=1 + ) + + self.data_dictionary['train_features'] = pd.DataFrame( + data=inliers.sum(axis=1)/nmb_previous_points, + columns=['inlier_metric'], + index = train_ft_df.index + ) + + percent_outliers = np.round( + 100*(1-self.data_dictionary['iniler_metric'].sum()/ + len(train_ft_df.index)), 2 + ) + logger.info('{percent_outliers}%% of data points were identified as outliers') + + return None def find_features(self, dataframe: DataFrame) -> None: """ From b11742a4c5a7130d7aabc0d20c343372f7911379 Mon Sep 17 00:00:00 2001 From: robcaulk Date: Thu, 18 Aug 2022 19:15:29 +0200 Subject: [PATCH 02/11] integrate inlier metric function --- freqtrade/freqai/data_kitchen.py | 85 ++++++++++++++++++---------- freqtrade/freqai/freqai_interface.py | 36 ++++++++---- 2 files changed, 79 insertions(+), 42 deletions(-) diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py index 62e353949..80919626c 100644 --- a/freqtrade/freqai/data_kitchen.py +++ b/freqtrade/freqai/data_kitchen.py @@ -723,81 +723,104 @@ class FreqaiDataKitchen: ) return - - def compute_inlier_metric(self) -> None: + + def compute_inlier_metric(self, set_='train') -> None: """ - - Compute inlier metric from backwards distance distributions. - This metric defines how well features from a timepoint fit + + Compute inlier metric from backwards distance distributions. + This metric defines how well features from a timepoint fit into previous timepoints. """ import scipy.stats as ss - - nmb_previous_points = self.data['InlierMetric_nmb_points'] - weibull_percentile = self.data['InlierMetric_weib_perc'] - train_ft_df = self.data_dictionary['train_features'] - train_ft_df_reindexed = train_ft_df.reindex( - index=np.flip(train_ft_df.index) + no_prev_pts = self.freqai_config["feature_parameters"]["inlier_metric_window"] + weib_pct = self.freqai_config["feature_parameters"]["inlier_metric_weibull_cutoff"] + + if set_ == 'train': + compute_df = copy.deepcopy(self.data_dictionary['train_features']) + elif set_ == 'test': + compute_df = copy.deepcopy(self.data_dictionary['test_features']) + else: + compute_df = copy.deepcopy(self.data_dictionary['prediction_features']) + + compute_df_reindexed = compute_df.reindex( + index=np.flip(compute_df.index) ) pairwise = pd.DataFrame( np.triu( - pairwise_distances(train_ft_df_reindexed, n_jobs=self.thread_count) + pairwise_distances(compute_df_reindexed, n_jobs=self.thread_count) ), - columns=train_ft_df_reindexed.index, - index=train_ft_df_reindexed.index + columns=compute_df_reindexed.index, + index=compute_df_reindexed.index ) pairwise = pairwise.round(5) column_labels = [ - '{}{}'.format('d', i) for i in range(1, nmb_previous_points+1) + '{}{}'.format('d', i) for i in range(1, no_prev_pts + 1) ] distances = pd.DataFrame( - columns=column_labels, index=train_ft_df.index + columns=column_labels, index=compute_df.index ) - for index in train_ft_df.index[nmb_previous_points]: + + for index in compute_df.index[no_prev_pts:]: current_row = pairwise.loc[[index]] current_row_no_zeros = current_row.loc[ - :, (current_row!=0).any(axis=0) + :, (current_row != 0).any(axis=0) ] distances.loc[[index]] = current_row_no_zeros.iloc[ - :, :nmb_previous_points + :, :no_prev_pts ] distances = distances.replace([np.inf, -np.inf], np.nan) drop_index = pd.isnull(distances).any(1) - distances = distances[drop_index==0] + distances = distances[drop_index == 0] inliers = pd.DataFrame(index=distances.index) for key in distances.keys(): current_distances = distances[key].dropna() fit_params = ss.weibull_min.fit(current_distances) - cutoff = ss.weibull_min.ppf(weibull_percentile, *fit_params) + cutoff = ss.weibull_min.ppf(weib_pct, *fit_params) is_inlier = np.where( - current_distances<=cutoff, 1, 0 + current_distances <= cutoff, 1, 0 ) df_inlier = pd.DataFrame( - {key+'_IsInlier':is_inlier}, index=distances.index + {key + '_IsInlier': is_inlier}, index=distances.index ) inliers = pd.concat( [inliers, df_inlier], axis=1 ) - self.data_dictionary['train_features'] = pd.DataFrame( - data=inliers.sum(axis=1)/nmb_previous_points, + inlier_metric = pd.DataFrame( + data=inliers.sum(axis=1) / no_prev_pts, columns=['inlier_metric'], - index = train_ft_df.index + index=compute_df.index ) - percent_outliers = np.round( - 100*(1-self.data_dictionary['iniler_metric'].sum()/ - len(train_ft_df.index)), 2 - ) - logger.info('{percent_outliers}%% of data points were identified as outliers') + inlier_metric = 2 * (inlier_metric - inlier_metric.min()) / \ + (inlier_metric.max() - inlier_metric.min()) - 1 + + if set_ in ('train', 'test'): + inlier_metric = inlier_metric.iloc[no_prev_pts:] + compute_df = compute_df.iloc[no_prev_pts:] + self.remove_beginning_points_from_data_dict(set_, no_prev_pts) + self.data_dictionary[f'{set_}_features'] = pd.concat( + [compute_df, inlier_metric], axis=1) + else: + self.data_dictionary['prediction_features'] = pd.concat( + [compute_df, inlier_metric], axis=1) + self.data_dictionary['prediction_features'].fillna(0, inplace=True) return None + def remove_beginning_points_from_data_dict(self, set_='train', no_prev_pts: int = 10): + features = self.data_dictionary[f'{set_}_features'] + weights = self.data_dictionary[f'{set_}_weights'] + labels = self.data_dictionary[f'{set_}_labels'] + self.data_dictionary[f'{set_}_weights'] = weights[no_prev_pts:] + self.data_dictionary[f'{set_}_features'] = features.iloc[no_prev_pts:] + self.data_dictionary[f'{set_}_labels'] = labels.iloc[no_prev_pts:] + def find_features(self, dataframe: DataFrame) -> None: """ Find features in the strategy provided dataframe diff --git a/freqtrade/freqai/freqai_interface.py b/freqtrade/freqai/freqai_interface.py index 4106f24e0..e6e019b66 100644 --- a/freqtrade/freqai/freqai_interface.py +++ b/freqtrade/freqai/freqai_interface.py @@ -66,7 +66,6 @@ class IFreqaiModel(ABC): "data_split_parameters", {}) self.model_training_parameters: Dict[str, Any] = config.get("freqai", {}).get( "model_training_parameters", {}) - self.feature_parameters = config.get("freqai", {}).get("feature_parameters") self.retrain = False self.first = True self.set_full_path() @@ -74,11 +73,14 @@ class IFreqaiModel(ABC): self.dd = FreqaiDataDrawer(Path(self.full_path), self.config, self.follow_mode) self.identifier: str = self.freqai_info.get("identifier", "no_id_provided") self.scanning = False + self.ft_params = self.freqai_info["feature_parameters"] self.keras: bool = self.freqai_info.get("keras", False) - if self.keras and self.freqai_info.get("feature_parameters", {}).get("DI_threshold", 0): - self.freqai_info["feature_parameters"]["DI_threshold"] = 0 + if self.keras and self.ft_params.get("DI_threshold", 0): + self.ft_params["DI_threshold"] = 0 logger.warning("DI threshold is not configured for Keras models yet. Deactivating.") self.CONV_WIDTH = self.freqai_info.get("conv_width", 2) + if self.ft_params.get("inlier_metric_window", 0): + self.CONV_WIDTH = self.ft_params.get("inlier_metric_window", 0) * 2 self.pair_it = 0 self.pair_it_train = 0 self.total_pairs = len(self.config.get("exchange", {}).get("pair_whitelist")) @@ -403,18 +405,20 @@ class IFreqaiModel(ABC): example of how outlier data points are dropped from the dataframe used for training. """ - if self.freqai_info["feature_parameters"].get( + ft_params = self.freqai_info["feature_parameters"] + + if ft_params.get( "principal_component_analysis", False ): dk.principal_component_analysis() - if self.freqai_info["feature_parameters"].get("use_SVM_to_remove_outliers", False): + if ft_params.get("use_SVM_to_remove_outliers", False): dk.use_SVM_to_remove_outliers(predict=False) - if self.freqai_info["feature_parameters"].get("DI_threshold", 0): + if ft_params.get("DI_threshold", 0): dk.data["avg_mean_dist"] = dk.compute_distances() - if self.freqai_info["feature_parameters"].get("use_DBSCAN_to_remove_outliers", False): + if ft_params.get("use_DBSCAN_to_remove_outliers", False): if dk.pair in self.dd.old_DBSCAN_eps: eps = self.dd.old_DBSCAN_eps[dk.pair] else: @@ -422,6 +426,11 @@ class IFreqaiModel(ABC): dk.use_DBSCAN_to_remove_outliers(predict=False, eps=eps) self.dd.old_DBSCAN_eps[dk.pair] = dk.data['DBSCAN_eps'] + if ft_params.get('inlier_metric_window', 0): + dk.compute_inlier_metric(set_='train') + if self.freqai_info["data_split_parameters"]["test_size"] > 0: + dk.compute_inlier_metric(set_='test') + def data_cleaning_predict(self, dk: FreqaiDataKitchen, dataframe: DataFrame) -> None: """ Base data cleaning method for predict. @@ -433,18 +442,23 @@ class IFreqaiModel(ABC): of how the do_predict vector is modified. do_predict is ultimately passed back to strategy for buy signals. """ - if self.freqai_info["feature_parameters"].get( + ft_params = self.freqai_info["feature_parameters"] + + if ft_params.get('inlier_metric_window', 0): + dk.compute_inlier_metric(set_='predict') + + if ft_params.get( "principal_component_analysis", False ): dk.pca_transform(dataframe) - if self.freqai_info["feature_parameters"].get("use_SVM_to_remove_outliers", False): + if ft_params.get("use_SVM_to_remove_outliers", False): dk.use_SVM_to_remove_outliers(predict=True) - if self.freqai_info["feature_parameters"].get("DI_threshold", 0): + if ft_params.get("DI_threshold", 0): dk.check_if_pred_in_training_spaces() - if self.freqai_info["feature_parameters"].get("use_DBSCAN_to_remove_outliers", False): + if ft_params.get("use_DBSCAN_to_remove_outliers", False): dk.use_DBSCAN_to_remove_outliers(predict=True) def model_exists( From a58dd0bbf9347a69fecde43e5735bc1851de3b9b Mon Sep 17 00:00:00 2001 From: robcaulk Date: Fri, 19 Aug 2022 18:35:24 +0200 Subject: [PATCH 03/11] add noise feature, improve docstrings --- freqtrade/freqai/data_kitchen.py | 11 +++++++++++ freqtrade/freqai/freqai_interface.py | 18 +++++++----------- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py index 80919626c..0158996c7 100644 --- a/freqtrade/freqai/data_kitchen.py +++ b/freqtrade/freqai/data_kitchen.py @@ -821,6 +821,17 @@ class FreqaiDataKitchen: self.data_dictionary[f'{set_}_features'] = features.iloc[no_prev_pts:] self.data_dictionary[f'{set_}_labels'] = labels.iloc[no_prev_pts:] + def add_noise_to_training_features(self) -> None: + """ + Add noise to train features to reduce the risk of overfitting. + """ + mu = 0 # no shift + sigma = self.freqai_config["feature_parameters"]["noise_standard_deviation"] + compute_df = self.data_dictionary['train_features'] + noise = np.random.normal(mu, sigma, [compute_df.shape[0], compute_df.shape[1]]) + self.data_dictionary['train_features'] += noise + return + def find_features(self, dataframe: DataFrame) -> None: """ Find features in the strategy provided dataframe diff --git a/freqtrade/freqai/freqai_interface.py b/freqtrade/freqai/freqai_interface.py index e6e019b66..239cb1869 100644 --- a/freqtrade/freqai/freqai_interface.py +++ b/freqtrade/freqai/freqai_interface.py @@ -399,10 +399,9 @@ class IFreqaiModel(ABC): def data_cleaning_train(self, dk: FreqaiDataKitchen) -> None: """ - Base data cleaning method for train - Any function inside this method should drop training data points from the filtered_dataframe - based on user decided logic. See FreqaiDataKitchen::use_SVM_to_remove_outliers() for an - example of how outlier data points are dropped from the dataframe used for training. + Base data cleaning method for train. + Functions here improve/modify the input data by identifying outliers, + computing additional metrics, adding noise, reducing dimensionality etc. """ ft_params = self.freqai_info["feature_parameters"] @@ -431,16 +430,13 @@ class IFreqaiModel(ABC): if self.freqai_info["data_split_parameters"]["test_size"] > 0: dk.compute_inlier_metric(set_='test') + if self.freqai_info["feature_parameters"].get('noise_standard_deviation', 0): + dk.add_noise_to_training_features() + def data_cleaning_predict(self, dk: FreqaiDataKitchen, dataframe: DataFrame) -> None: """ Base data cleaning method for predict. - These functions each modify dk.do_predict, which is a dataframe with equal length - to the number of candles coming from and returning to the strategy. Inside do_predict, - 1 allows prediction and < 0 signals to the strategy that the model is not confident in - the prediction. - See FreqaiDataKitchen::remove_outliers() for an example - of how the do_predict vector is modified. do_predict is ultimately passed back to strategy - for buy signals. + Functions here are complementary to the functions of data_cleaning_train. """ ft_params = self.freqai_info["feature_parameters"] From 7f52908e87a61ed60c7fe3f2b9b933b6211c60da Mon Sep 17 00:00:00 2001 From: robcaulk Date: Tue, 30 Aug 2022 18:55:58 +0200 Subject: [PATCH 04/11] ensure the lost points are prepended for FreqUI --- freqtrade/freqai/freqai_interface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/freqtrade/freqai/freqai_interface.py b/freqtrade/freqai/freqai_interface.py index 239cb1869..893f960ea 100644 --- a/freqtrade/freqai/freqai_interface.py +++ b/freqtrade/freqai/freqai_interface.py @@ -587,7 +587,7 @@ class IFreqaiModel(ABC): # # for keras type models, the conv_window needs to be prepended so # # viewing is correct in frequi - if self.freqai_info.get('keras', False): + if self.freqai_info.get('keras', False) or self.ft_params.get('inlier_metric_window', 0): n_lost_points = self.freqai_info.get('conv_width', 2) zeros_df = DataFrame(np.zeros((n_lost_points, len(hist_preds_df.columns))), columns=hist_preds_df.columns) From 0b8482360f0ec1aea6e61ac08b3746584c03800a Mon Sep 17 00:00:00 2001 From: robcaulk Date: Tue, 30 Aug 2022 20:32:49 +0200 Subject: [PATCH 05/11] add documentation for inlier metric --- docs/freqai.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/docs/freqai.md b/docs/freqai.md index e4a451324..3fc76e9f0 100644 --- a/docs/freqai.md +++ b/docs/freqai.md @@ -115,6 +115,8 @@ Mandatory parameters are marked as **Required**, which means that they are requi | `use_DBSCAN_to_remove_outliers` | Cluster data using DBSCAN to identify and remove outliers from training and prediction data. See details about how it works [here](#removing-outliers-with-dbscan).
**Datatype:** Boolean. | `outlier_protection_percentage` | If more than `outlier_protection_percentage` fraction of points are removed as outliers, FreqAI will log a warning message and ignore outlier detection while keeping the original dataset intact.
**Datatype:** float. Default: `30` | `reverse_train_test_order` | If true, FreqAI will train on the latest data split and test on historical split of the data. This allows the model to be trained up to the most recent data point, while avoiding overfitting. However, users should be careful to understand unorthodox nature of this parameter before employing it.
**Datatype:** bool. Default: False +| `inlier_metric_window` | If set, FreqAI will add the `inlier_metric` to the training feature set and set the lookback to be the `inlier_metric_window`. Details of how the `inlier_metric` is computed can be found [here](#using-the-inliermetric)
**Datatype:** int. Default: 0 +| `inlier_metric_weibull_cutoff` | If the `inlier_metric_window` is set, this value is used to determine the tail cutoff in the weibull distribution fit. Details of how the `inlier_metric` is computed can be found [here](#using-the-inliermetric)
**Datatype:** float. Default: 0.95 | | **Data split parameters** | `data_split_parameters` | Include any additional parameters available from Scikit-learn `test_train_split()`, which are shown [here](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) (external website).
**Datatype:** Dictionary. | `test_size` | Fraction of data that should be used for testing instead of training.
**Datatype:** Positive float < 1. @@ -636,6 +638,20 @@ testing; the other points are used for training. The test data is used to evaluate the performance of the model after training. If the test score is high, the model is able to capture the behavior of the data well. If the test score is low, either the model either does not capture the complexity of the data, the test data is significantly different from the train data, or a different model should be used. +### Using the `inlier_metric` + +The `inlier_metric` is a metric aimed at quantifying how different a prediction data point is from the most recent historic data points. + +User can set `inlier_metric_window` to set the look back window. FreqAI will compute the distance between the present prediction point and each of the previous data points (total of `inlier_metric_window` points). + +This function goes one step further - during training, it computes the `inlier_metric` for all training data points and builds weibull distributions for each each lookback point. If one of the distances falls in the tail of the respective weibull distribution, it is considered an "outlier." If the distance to the lookback point is not in the tail, it is considered an "inlier." Inliers receive a value of 1, and outliers receive a value of 0. + +FreqAI adds this `inlier_metric` score to the training features! Thus, your model is trained to recognize how this temporal inlier metric is evolving. + +Users can control the weibull threshold using the `inlier_metric_weibull_cutoff` + +This function does not currently remove outliers from the data set. + ### Controlling the model learning process Model training parameters are unique to the machine learning library selected by the user. FreqAI allows the user to set any parameter for any library using the `model_training_parameters` dictionary in the user configuration file. The example configuration file (found in `config_examples/config_freqai.example.json`) show some of the example parameters associated with `Catboost` and `LightGBM`, but the user can add any parameters available in those libraries. From 7e8e29e42d4c2b0eb058d9408c27ebd220eb9f68 Mon Sep 17 00:00:00 2001 From: robcaulk Date: Tue, 30 Aug 2022 20:41:37 +0200 Subject: [PATCH 06/11] use continuous value for inlier_metric --- docs/freqai.md | 9 +++------ freqtrade/freqai/data_kitchen.py | 9 +++------ 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/docs/freqai.md b/docs/freqai.md index 3fc76e9f0..2c6efa3b9 100644 --- a/docs/freqai.md +++ b/docs/freqai.md @@ -116,7 +116,6 @@ Mandatory parameters are marked as **Required**, which means that they are requi | `outlier_protection_percentage` | If more than `outlier_protection_percentage` fraction of points are removed as outliers, FreqAI will log a warning message and ignore outlier detection while keeping the original dataset intact.
**Datatype:** float. Default: `30` | `reverse_train_test_order` | If true, FreqAI will train on the latest data split and test on historical split of the data. This allows the model to be trained up to the most recent data point, while avoiding overfitting. However, users should be careful to understand unorthodox nature of this parameter before employing it.
**Datatype:** bool. Default: False | `inlier_metric_window` | If set, FreqAI will add the `inlier_metric` to the training feature set and set the lookback to be the `inlier_metric_window`. Details of how the `inlier_metric` is computed can be found [here](#using-the-inliermetric)
**Datatype:** int. Default: 0 -| `inlier_metric_weibull_cutoff` | If the `inlier_metric_window` is set, this value is used to determine the tail cutoff in the weibull distribution fit. Details of how the `inlier_metric` is computed can be found [here](#using-the-inliermetric)
**Datatype:** float. Default: 0.95 | | **Data split parameters** | `data_split_parameters` | Include any additional parameters available from Scikit-learn `test_train_split()`, which are shown [here](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) (external website).
**Datatype:** Dictionary. | `test_size` | Fraction of data that should be used for testing instead of training.
**Datatype:** Positive float < 1. @@ -644,13 +643,11 @@ The `inlier_metric` is a metric aimed at quantifying how different a prediction User can set `inlier_metric_window` to set the look back window. FreqAI will compute the distance between the present prediction point and each of the previous data points (total of `inlier_metric_window` points). -This function goes one step further - during training, it computes the `inlier_metric` for all training data points and builds weibull distributions for each each lookback point. If one of the distances falls in the tail of the respective weibull distribution, it is considered an "outlier." If the distance to the lookback point is not in the tail, it is considered an "inlier." Inliers receive a value of 1, and outliers receive a value of 0. +This function goes one step further - during training, it computes the `inlier_metric` for all training data points and builds weibull distributions for each each lookback point. The cumulative distribution function for the weibull distribution is used to produce a quantile for each of the data points. The quantiles for each lookback point are averaged to create the `inlier_metric`. -FreqAI adds this `inlier_metric` score to the training features! Thus, your model is trained to recognize how this temporal inlier metric is evolving. +FreqAI adds this `inlier_metric` score to the training features! In other words, your model is trained to recognize how this temporal inlier metric is related to the user set labels. -Users can control the weibull threshold using the `inlier_metric_weibull_cutoff` - -This function does not currently remove outliers from the data set. +This function does **not** remove outliers from the data set. ### Controlling the model learning process diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py index 0158996c7..9d4a69287 100644 --- a/freqtrade/freqai/data_kitchen.py +++ b/freqtrade/freqai/data_kitchen.py @@ -735,7 +735,6 @@ class FreqaiDataKitchen: import scipy.stats as ss no_prev_pts = self.freqai_config["feature_parameters"]["inlier_metric_window"] - weib_pct = self.freqai_config["feature_parameters"]["inlier_metric_weibull_cutoff"] if set_ == 'train': compute_df = copy.deepcopy(self.data_dictionary['train_features']) @@ -780,12 +779,10 @@ class FreqaiDataKitchen: for key in distances.keys(): current_distances = distances[key].dropna() fit_params = ss.weibull_min.fit(current_distances) - cutoff = ss.weibull_min.ppf(weib_pct, *fit_params) - is_inlier = np.where( - current_distances <= cutoff, 1, 0 - ) + quantiles = ss.weibull_min.cdf(current_distances, *fit_params) + df_inlier = pd.DataFrame( - {key + '_IsInlier': is_inlier}, index=distances.index + {key: quantiles}, index=distances.index ) inliers = pd.concat( [inliers, df_inlier], axis=1 From c9be66b5b639f151abb1d9a5e76267d752eddb3b Mon Sep 17 00:00:00 2001 From: robcaulk Date: Sat, 3 Sep 2022 15:52:29 +0200 Subject: [PATCH 07/11] increase test coverage for dk, improve function naming, extra cleaning --- freqtrade/freqai/data_kitchen.py | 30 ++++++----- freqtrade/freqai/freqai_interface.py | 10 ++-- tests/freqai/conftest.py | 32 +++++++++++ tests/freqai/test_freqai_datakitchen.py | 72 ++++++++++++++++++++++++- tests/freqai/test_freqai_interface.py | 34 ++++++------ 5 files changed, 143 insertions(+), 35 deletions(-) diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py index 9d4a69287..fce9e8480 100644 --- a/freqtrade/freqai/data_kitchen.py +++ b/freqtrade/freqai/data_kitchen.py @@ -1,7 +1,7 @@ import copy -import datetime import logging import shutil +from datetime import datetime, timezone from pathlib import Path from typing import Any, Dict, List, Tuple @@ -345,7 +345,7 @@ class FreqaiDataKitchen: def denormalize_labels_from_metadata(self, df: DataFrame) -> DataFrame: """ - Normalize a set of data using the mean and standard deviation from + Denormalize a set of data using the mean and standard deviation from the associated training data. :param df: Dataframe of predictions to be denormalized """ @@ -384,7 +384,7 @@ class FreqaiDataKitchen: config_timerange = TimeRange.parse_timerange(self.config["timerange"]) if config_timerange.stopts == 0: config_timerange.stopts = int( - datetime.datetime.now(tz=datetime.timezone.utc).timestamp() + datetime.now(tz=timezone.utc).timestamp() ) timerange_train = copy.deepcopy(full_timerange) timerange_backtest = copy.deepcopy(full_timerange) @@ -401,8 +401,8 @@ class FreqaiDataKitchen: timerange_train.stopts = timerange_train.startts + train_period_days first = False - start = datetime.datetime.utcfromtimestamp(timerange_train.startts) - stop = datetime.datetime.utcfromtimestamp(timerange_train.stopts) + start = datetime.utcfromtimestamp(timerange_train.startts) + stop = datetime.utcfromtimestamp(timerange_train.stopts) tr_training_list.append(start.strftime("%Y%m%d") + "-" + stop.strftime("%Y%m%d")) tr_training_list_timerange.append(copy.deepcopy(timerange_train)) @@ -415,8 +415,8 @@ class FreqaiDataKitchen: if timerange_backtest.stopts > config_timerange.stopts: timerange_backtest.stopts = config_timerange.stopts - start = datetime.datetime.utcfromtimestamp(timerange_backtest.startts) - stop = datetime.datetime.utcfromtimestamp(timerange_backtest.stopts) + start = datetime.utcfromtimestamp(timerange_backtest.startts) + stop = datetime.utcfromtimestamp(timerange_backtest.stopts) tr_backtesting_list.append(start.strftime("%Y%m%d") + "-" + stop.strftime("%Y%m%d")) tr_backtesting_list_timerange.append(copy.deepcopy(timerange_backtest)) @@ -436,8 +436,8 @@ class FreqaiDataKitchen: it is sliced down to just the present training period. """ - start = datetime.datetime.fromtimestamp(timerange.startts, tz=datetime.timezone.utc) - stop = datetime.datetime.fromtimestamp(timerange.stopts, tz=datetime.timezone.utc) + start = datetime.fromtimestamp(timerange.startts, tz=timezone.utc) + stop = datetime.fromtimestamp(timerange.stopts, tz=timezone.utc) df = df.loc[df["date"] >= start, :] df = df.loc[df["date"] <= stop, :] @@ -808,6 +808,8 @@ class FreqaiDataKitchen: [compute_df, inlier_metric], axis=1) self.data_dictionary['prediction_features'].fillna(0, inplace=True) + logger.info('Inlier metric computed and added to features.') + return None def remove_beginning_points_from_data_dict(self, set_='train', no_prev_pts: int = 10): @@ -948,14 +950,14 @@ class FreqaiDataKitchen: "Please indicate the end date of your desired backtesting. " "timerange.") # backtest_timerange.stopts = int( - # datetime.datetime.now(tz=datetime.timezone.utc).timestamp() + # datetime.now(tz=timezone.utc).timestamp() # ) backtest_timerange.startts = ( backtest_timerange.startts - backtest_period_days * SECONDS_IN_DAY ) - start = datetime.datetime.utcfromtimestamp(backtest_timerange.startts) - stop = datetime.datetime.utcfromtimestamp(backtest_timerange.stopts) + start = datetime.utcfromtimestamp(backtest_timerange.startts) + stop = datetime.utcfromtimestamp(backtest_timerange.stopts) full_timerange = start.strftime("%Y%m%d") + "-" + stop.strftime("%Y%m%d") self.full_path = Path( @@ -981,7 +983,7 @@ class FreqaiDataKitchen: :return: bool = If the model is expired or not. """ - time = datetime.datetime.now(tz=datetime.timezone.utc).timestamp() + time = datetime.now(tz=timezone.utc).timestamp() elapsed_time = (time - trained_timestamp) / 3600 # hours max_time = self.freqai_config.get("expiration_hours", 0) if max_time > 0: @@ -993,7 +995,7 @@ class FreqaiDataKitchen: self, trained_timestamp: int ) -> Tuple[bool, TimeRange, TimeRange]: - time = datetime.datetime.now(tz=datetime.timezone.utc).timestamp() + time = datetime.now(tz=timezone.utc).timestamp() trained_timerange = TimeRange() data_load_timerange = TimeRange() diff --git a/freqtrade/freqai/freqai_interface.py b/freqtrade/freqai/freqai_interface.py index 893f960ea..fd0554248 100644 --- a/freqtrade/freqai/freqai_interface.py +++ b/freqtrade/freqai/freqai_interface.py @@ -1,10 +1,10 @@ # import contextlib -import datetime import logging import shutil import threading import time from abc import ABC, abstractmethod +from datetime import datetime from pathlib import Path from threading import Lock from typing import Any, Dict, Tuple @@ -174,7 +174,7 @@ class IFreqaiModel(ABC): if retrain: self.train_timer('start') - self.train_model_in_series( + self.extract_data_and_train_model( new_trained_timerange, pair, strategy, dk, data_load_timerange ) self.train_timer('stop') @@ -214,10 +214,10 @@ class IFreqaiModel(ABC): dataframe_backtest = dk.slice_dataframe(tr_backtest, dataframe) trained_timestamp = tr_train - tr_train_startts_str = datetime.datetime.utcfromtimestamp(tr_train.startts).strftime( + tr_train_startts_str = datetime.utcfromtimestamp(tr_train.startts).strftime( "%Y-%m-%d %H:%M:%S" ) - tr_train_stopts_str = datetime.datetime.utcfromtimestamp(tr_train.stopts).strftime( + tr_train_stopts_str = datetime.utcfromtimestamp(tr_train.stopts).strftime( "%Y-%m-%d %H:%M:%S" ) logger.info( @@ -495,7 +495,7 @@ class IFreqaiModel(ABC): Path(self.full_path, Path(self.config["config_files"][0]).name), ) - def train_model_in_series( + def extract_data_and_train_model( self, new_trained_timerange: TimeRange, pair: str, diff --git a/tests/freqai/conftest.py b/tests/freqai/conftest.py index dd148da77..6528347e8 100644 --- a/tests/freqai/conftest.py +++ b/tests/freqai/conftest.py @@ -82,6 +82,38 @@ def get_patched_freqaimodel(mocker, freqaiconf): return freqaimodel +def make_unfiltered_dataframe(mocker, freqai_conf): + freqai_conf.update({"timerange": "20180110-20180130"}) + + strategy = get_patched_freqai_strategy(mocker, freqai_conf) + exchange = get_patched_exchange(mocker, freqai_conf) + strategy.dp = DataProvider(freqai_conf, exchange) + strategy.freqai_info = freqai_conf.get("freqai", {}) + freqai = strategy.freqai + freqai.live = True + freqai.dk = FreqaiDataKitchen(freqai_conf) + freqai.dk.pair = "ADA/BTC" + timerange = TimeRange.parse_timerange("20180110-20180130") + freqai.dd.load_all_pair_histories(timerange, freqai.dk) + + freqai.dd.pair_dict = MagicMock() + + data_load_timerange = TimeRange.parse_timerange("20180110-20180130") + new_timerange = TimeRange.parse_timerange("20180120-20180130") + + corr_dataframes, base_dataframes = freqai.dd.get_base_and_corr_dataframes( + data_load_timerange, freqai.dk.pair, freqai.dk + ) + + unfiltered_dataframe = freqai.dk.use_strategy_to_populate_indicators( + strategy, corr_dataframes, base_dataframes, freqai.dk.pair + ) + + unfiltered_dataframe = freqai.dk.slice_dataframe(new_timerange, unfiltered_dataframe) + + return freqai, unfiltered_dataframe + + def make_data_dictionary(mocker, freqai_conf): freqai_conf.update({"timerange": "20180110-20180130"}) diff --git a/tests/freqai/test_freqai_datakitchen.py b/tests/freqai/test_freqai_datakitchen.py index 9ef955695..2204e94c6 100644 --- a/tests/freqai/test_freqai_datakitchen.py +++ b/tests/freqai/test_freqai_datakitchen.py @@ -6,7 +6,8 @@ import pytest from freqtrade.exceptions import OperationalException from tests.conftest import log_has_re -from tests.freqai.conftest import get_patched_data_kitchen, make_data_dictionary +from tests.freqai.conftest import (get_patched_data_kitchen, make_data_dictionary, + make_unfiltered_dataframe) @pytest.mark.parametrize( @@ -94,3 +95,72 @@ def test_use_SVM_to_remove_outliers_and_outlier_protection(mocker, freqai_conf, "SVM detected 8.46%", caplog, ) + + +def test_compute_inlier_metric(mocker, freqai_conf, caplog): + freqai = make_data_dictionary(mocker, freqai_conf) + freqai_conf['freqai']['feature_parameters'].update({"inlier_metric_window": 10}) + freqai.dk.compute_inlier_metric(set_='train') + assert log_has_re( + "Inlier metric computed and added to features.", + caplog, + ) + + +def test_add_noise_to_training_features(mocker, freqai_conf): + freqai = make_data_dictionary(mocker, freqai_conf) + freqai_conf['freqai']['feature_parameters'].update({"noise_standard_deviation": 0.1}) + freqai.dk.add_noise_to_training_features() + + +def test_remove_beginning_points_from_data_dict(mocker, freqai_conf): + freqai = make_data_dictionary(mocker, freqai_conf) + freqai.dk.remove_beginning_points_from_data_dict(set_='train') + + +def test_principal_component_analysis(mocker, freqai_conf, caplog): + freqai = make_data_dictionary(mocker, freqai_conf) + freqai.dk.principal_component_analysis() + assert log_has_re( + "reduced feature dimension by", + caplog, + ) + + +def test_normalize_data(mocker, freqai_conf): + freqai = make_data_dictionary(mocker, freqai_conf) + data_dict = freqai.dk.data_dictionary + freqai.dk.normalize_data(data_dict) + assert len(freqai.dk.data) == 56 + + +def test_filter_features(mocker, freqai_conf): + freqai, unfiltered_dataframe = make_unfiltered_dataframe(mocker, freqai_conf) + freqai.dk.find_features(unfiltered_dataframe) + + filtered_df, labels = freqai.dk.filter_features( + unfiltered_dataframe, + freqai.dk.training_features_list, + freqai.dk.label_list, + training_filter=True, + ) + + assert len(filtered_df.columns) == 26 + + +def test_make_train_test_datasets(mocker, freqai_conf): + freqai, unfiltered_dataframe = make_unfiltered_dataframe(mocker, freqai_conf) + freqai.dk.find_features(unfiltered_dataframe) + + features_filtered, labels_filtered = freqai.dk.filter_features( + unfiltered_dataframe, + freqai.dk.training_features_list, + freqai.dk.label_list, + training_filter=True, + ) + + data_dictionary = freqai.dk.make_train_test_datasets(features_filtered, labels_filtered) + + assert data_dictionary + assert len(data_dictionary) == 7 + assert len(data_dictionary['train_features'].index) == 1916 diff --git a/tests/freqai/test_freqai_interface.py b/tests/freqai/test_freqai_interface.py index 792ffc467..927af2a02 100644 --- a/tests/freqai/test_freqai_interface.py +++ b/tests/freqai/test_freqai_interface.py @@ -17,7 +17,7 @@ def is_arm() -> bool: return "arm" in machine or "aarch64" in machine -def test_train_model_in_series_LightGBM(mocker, freqai_conf): +def test_extract_data_and_train_model_LightGBM(mocker, freqai_conf): freqai_conf.update({"timerange": "20180110-20180130"}) strategy = get_patched_freqai_strategy(mocker, freqai_conf) @@ -35,7 +35,8 @@ def test_train_model_in_series_LightGBM(mocker, freqai_conf): data_load_timerange = TimeRange.parse_timerange("20180110-20180130") new_timerange = TimeRange.parse_timerange("20180120-20180130") - freqai.train_model_in_series(new_timerange, "ADA/BTC", strategy, freqai.dk, data_load_timerange) + freqai.extract_data_and_train_model( + new_timerange, "ADA/BTC", strategy, freqai.dk, data_load_timerange) assert Path(freqai.dk.data_path / f"{freqai.dk.model_filename}_model.joblib").is_file() assert Path(freqai.dk.data_path / f"{freqai.dk.model_filename}_metadata.json").is_file() @@ -45,7 +46,7 @@ def test_train_model_in_series_LightGBM(mocker, freqai_conf): shutil.rmtree(Path(freqai.dk.full_path)) -def test_train_model_in_series_LightGBMMultiModel(mocker, freqai_conf): +def test_extract_data_and_train_model_LightGBMMultiModel(mocker, freqai_conf): freqai_conf.update({"timerange": "20180110-20180130"}) freqai_conf.update({"strategy": "freqai_test_multimodel_strat"}) freqai_conf.update({"freqaimodel": "LightGBMRegressorMultiTarget"}) @@ -64,7 +65,8 @@ def test_train_model_in_series_LightGBMMultiModel(mocker, freqai_conf): data_load_timerange = TimeRange.parse_timerange("20180110-20180130") new_timerange = TimeRange.parse_timerange("20180120-20180130") - freqai.train_model_in_series(new_timerange, "ADA/BTC", strategy, freqai.dk, data_load_timerange) + freqai.extract_data_and_train_model( + new_timerange, "ADA/BTC", strategy, freqai.dk, data_load_timerange) assert len(freqai.dk.label_list) == 2 assert Path(freqai.dk.data_path / f"{freqai.dk.model_filename}_model.joblib").is_file() @@ -77,7 +79,7 @@ def test_train_model_in_series_LightGBMMultiModel(mocker, freqai_conf): @pytest.mark.skipif(is_arm(), reason="no ARM for Catboost ...") -def test_train_model_in_series_Catboost(mocker, freqai_conf): +def test_extract_data_and_train_model_Catboost(mocker, freqai_conf): freqai_conf.update({"timerange": "20180110-20180130"}) freqai_conf.update({"freqaimodel": "CatboostRegressor"}) # freqai_conf.get('freqai', {}).update( @@ -98,8 +100,8 @@ def test_train_model_in_series_Catboost(mocker, freqai_conf): data_load_timerange = TimeRange.parse_timerange("20180110-20180130") new_timerange = TimeRange.parse_timerange("20180120-20180130") - freqai.train_model_in_series(new_timerange, "ADA/BTC", - strategy, freqai.dk, data_load_timerange) + freqai.extract_data_and_train_model(new_timerange, "ADA/BTC", + strategy, freqai.dk, data_load_timerange) assert Path(freqai.dk.data_path / f"{freqai.dk.model_filename}_model.joblib").exists() assert Path(freqai.dk.data_path / f"{freqai.dk.model_filename}_metadata.json").exists() @@ -110,7 +112,7 @@ def test_train_model_in_series_Catboost(mocker, freqai_conf): @pytest.mark.skipif(is_arm(), reason="no ARM for Catboost ...") -def test_train_model_in_series_CatboostClassifier(mocker, freqai_conf): +def test_extract_data_and_train_model_CatboostClassifier(mocker, freqai_conf): freqai_conf.update({"timerange": "20180110-20180130"}) freqai_conf.update({"freqaimodel": "CatboostClassifier"}) freqai_conf.update({"strategy": "freqai_test_classifier"}) @@ -130,8 +132,8 @@ def test_train_model_in_series_CatboostClassifier(mocker, freqai_conf): data_load_timerange = TimeRange.parse_timerange("20180110-20180130") new_timerange = TimeRange.parse_timerange("20180120-20180130") - freqai.train_model_in_series(new_timerange, "ADA/BTC", - strategy, freqai.dk, data_load_timerange) + freqai.extract_data_and_train_model(new_timerange, "ADA/BTC", + strategy, freqai.dk, data_load_timerange) assert Path(freqai.dk.data_path / f"{freqai.dk.model_filename}_model.joblib").exists() assert Path(freqai.dk.data_path / f"{freqai.dk.model_filename}_metadata.json").exists() @@ -141,7 +143,7 @@ def test_train_model_in_series_CatboostClassifier(mocker, freqai_conf): shutil.rmtree(Path(freqai.dk.full_path)) -def test_train_model_in_series_LightGBMClassifier(mocker, freqai_conf): +def test_extract_data_and_train_model_LightGBMClassifier(mocker, freqai_conf): freqai_conf.update({"timerange": "20180110-20180130"}) freqai_conf.update({"freqaimodel": "LightGBMClassifier"}) freqai_conf.update({"strategy": "freqai_test_classifier"}) @@ -161,8 +163,8 @@ def test_train_model_in_series_LightGBMClassifier(mocker, freqai_conf): data_load_timerange = TimeRange.parse_timerange("20180110-20180130") new_timerange = TimeRange.parse_timerange("20180120-20180130") - freqai.train_model_in_series(new_timerange, "ADA/BTC", - strategy, freqai.dk, data_load_timerange) + freqai.extract_data_and_train_model(new_timerange, "ADA/BTC", + strategy, freqai.dk, data_load_timerange) assert Path(freqai.dk.data_path / f"{freqai.dk.model_filename}_model.joblib").exists() assert Path(freqai.dk.data_path / f"{freqai.dk.model_filename}_metadata.json").exists() @@ -289,7 +291,8 @@ def test_follow_mode(mocker, freqai_conf): data_load_timerange = TimeRange.parse_timerange("20180110-20180130") new_timerange = TimeRange.parse_timerange("20180120-20180130") - freqai.train_model_in_series(new_timerange, "ADA/BTC", strategy, freqai.dk, data_load_timerange) + freqai.extract_data_and_train_model( + new_timerange, "ADA/BTC", strategy, freqai.dk, data_load_timerange) assert Path(freqai.dk.data_path / f"{freqai.dk.model_filename}_model.joblib").is_file() assert Path(freqai.dk.data_path / f"{freqai.dk.model_filename}_metadata.json").is_file() @@ -338,7 +341,8 @@ def test_principal_component_analysis(mocker, freqai_conf): data_load_timerange = TimeRange.parse_timerange("20180110-20180130") new_timerange = TimeRange.parse_timerange("20180120-20180130") - freqai.train_model_in_series(new_timerange, "ADA/BTC", strategy, freqai.dk, data_load_timerange) + freqai.extract_data_and_train_model( + new_timerange, "ADA/BTC", strategy, freqai.dk, data_load_timerange) assert Path(freqai.dk.data_path / f"{freqai.dk.model_filename}_pca_object.pkl") From fa8d5b9834ab1ba872e5ca2397e24ed3df9968a7 Mon Sep 17 00:00:00 2001 From: robcaulk Date: Sat, 3 Sep 2022 16:05:18 +0200 Subject: [PATCH 08/11] add documentation for noise_standard_deviation` --- docs/freqai.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/freqai.md b/docs/freqai.md index 2c6efa3b9..d504c93d6 100644 --- a/docs/freqai.md +++ b/docs/freqai.md @@ -116,6 +116,7 @@ Mandatory parameters are marked as **Required**, which means that they are requi | `outlier_protection_percentage` | If more than `outlier_protection_percentage` fraction of points are removed as outliers, FreqAI will log a warning message and ignore outlier detection while keeping the original dataset intact.
**Datatype:** float. Default: `30` | `reverse_train_test_order` | If true, FreqAI will train on the latest data split and test on historical split of the data. This allows the model to be trained up to the most recent data point, while avoiding overfitting. However, users should be careful to understand unorthodox nature of this parameter before employing it.
**Datatype:** bool. Default: False | `inlier_metric_window` | If set, FreqAI will add the `inlier_metric` to the training feature set and set the lookback to be the `inlier_metric_window`. Details of how the `inlier_metric` is computed can be found [here](#using-the-inliermetric)
**Datatype:** int. Default: 0 +| `noise_standard_deviation` | If > 0, FreqAI adds noise to the training features. FreqAI generates random deviates from a gaussian distribution with a standard deviation of `noise_standard_deviation` and adds them to all data points. Value should be kept relative to the normalized space between -1 and 1). In other words, since data is always normalized between -1 and 1 in FreqAI, the user can expect a `noise_standard_deviation: 0.05` to see 32% of data randomly increased/decreased by more than 2.5%. Good for preventing overfitting.
**Datatype:** int. Default: 0 | | **Data split parameters** | `data_split_parameters` | Include any additional parameters available from Scikit-learn `test_train_split()`, which are shown [here](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) (external website).
**Datatype:** Dictionary. | `test_size` | Fraction of data that should be used for testing instead of training.
**Datatype:** Positive float < 1. From 4b28d0495f46337700686f1b430827336285511a Mon Sep 17 00:00:00 2001 From: robcaulk Date: Tue, 6 Sep 2022 19:46:58 +0200 Subject: [PATCH 09/11] fix timestamping, move imports, add words to doc --- docs/freqai.md | 2 +- freqtrade/freqai/data_kitchen.py | 26 ++++++++++++-------------- freqtrade/freqai/freqai_interface.py | 15 +++++++-------- tests/freqai/conftest.py | 12 +++++------- 4 files changed, 25 insertions(+), 30 deletions(-) diff --git a/docs/freqai.md b/docs/freqai.md index d504c93d6..a8379106a 100644 --- a/docs/freqai.md +++ b/docs/freqai.md @@ -116,7 +116,7 @@ Mandatory parameters are marked as **Required**, which means that they are requi | `outlier_protection_percentage` | If more than `outlier_protection_percentage` fraction of points are removed as outliers, FreqAI will log a warning message and ignore outlier detection while keeping the original dataset intact.
**Datatype:** float. Default: `30` | `reverse_train_test_order` | If true, FreqAI will train on the latest data split and test on historical split of the data. This allows the model to be trained up to the most recent data point, while avoiding overfitting. However, users should be careful to understand unorthodox nature of this parameter before employing it.
**Datatype:** bool. Default: False | `inlier_metric_window` | If set, FreqAI will add the `inlier_metric` to the training feature set and set the lookback to be the `inlier_metric_window`. Details of how the `inlier_metric` is computed can be found [here](#using-the-inliermetric)
**Datatype:** int. Default: 0 -| `noise_standard_deviation` | If > 0, FreqAI adds noise to the training features. FreqAI generates random deviates from a gaussian distribution with a standard deviation of `noise_standard_deviation` and adds them to all data points. Value should be kept relative to the normalized space between -1 and 1). In other words, since data is always normalized between -1 and 1 in FreqAI, the user can expect a `noise_standard_deviation: 0.05` to see 32% of data randomly increased/decreased by more than 2.5%. Good for preventing overfitting.
**Datatype:** int. Default: 0 +| `noise_standard_deviation` | If > 0, FreqAI adds noise to the training features. FreqAI generates random deviates from a gaussian distribution with a standard deviation of `noise_standard_deviation` and adds them to all data points. Value should be kept relative to the normalized space between -1 and 1). In other words, since data is always normalized between -1 and 1 in FreqAI, the user can expect a `noise_standard_deviation: 0.05` to see 32% of data randomly increased/decreased by more than 2.5% (i.e. the percent of data falling within the first standard deviation). Good for preventing overfitting.
**Datatype:** int. Default: 0 | | **Data split parameters** | `data_split_parameters` | Include any additional parameters available from Scikit-learn `test_train_split()`, which are shown [here](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) (external website).
**Datatype:** Dictionary. | `test_size` | Fraction of data that should be used for testing instead of training.
**Datatype:** Positive float < 1. diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py index fce9e8480..8ef2d6aea 100644 --- a/freqtrade/freqai/data_kitchen.py +++ b/freqtrade/freqai/data_kitchen.py @@ -2,12 +2,14 @@ import copy import logging import shutil from datetime import datetime, timezone +from math import cos, sin from pathlib import Path from typing import Any, Dict, List, Tuple import numpy as np import numpy.typing as npt import pandas as pd +import scipy.stats as stats from pandas import DataFrame from sklearn import linear_model from sklearn.cluster import DBSCAN @@ -401,8 +403,8 @@ class FreqaiDataKitchen: timerange_train.stopts = timerange_train.startts + train_period_days first = False - start = datetime.utcfromtimestamp(timerange_train.startts) - stop = datetime.utcfromtimestamp(timerange_train.stopts) + start = datetime.fromtimestamp(timerange_train.startts, tz=timezone.utc) + stop = datetime.fromtimestamp(timerange_train.stopts, tz=timezone.utc) tr_training_list.append(start.strftime("%Y%m%d") + "-" + stop.strftime("%Y%m%d")) tr_training_list_timerange.append(copy.deepcopy(timerange_train)) @@ -415,8 +417,8 @@ class FreqaiDataKitchen: if timerange_backtest.stopts > config_timerange.stopts: timerange_backtest.stopts = config_timerange.stopts - start = datetime.utcfromtimestamp(timerange_backtest.startts) - stop = datetime.utcfromtimestamp(timerange_backtest.stopts) + start = datetime.fromtimestamp(timerange_backtest.startts, tz=timezone.utc) + stop = datetime.fromtimestamp(timerange_backtest.stopts, tz=timezone.utc) tr_backtesting_list.append(start.strftime("%Y%m%d") + "-" + stop.strftime("%Y%m%d")) tr_backtesting_list_timerange.append(copy.deepcopy(timerange_backtest)) @@ -630,8 +632,6 @@ class FreqaiDataKitchen: is an outlier. """ - from math import cos, sin - if predict: if not self.data['DBSCAN_eps']: return @@ -732,8 +732,6 @@ class FreqaiDataKitchen: into previous timepoints. """ - import scipy.stats as ss - no_prev_pts = self.freqai_config["feature_parameters"]["inlier_metric_window"] if set_ == 'train': @@ -778,8 +776,8 @@ class FreqaiDataKitchen: inliers = pd.DataFrame(index=distances.index) for key in distances.keys(): current_distances = distances[key].dropna() - fit_params = ss.weibull_min.fit(current_distances) - quantiles = ss.weibull_min.cdf(current_distances, *fit_params) + fit_params = stats.weibull_min.fit(current_distances) + quantiles = stats.weibull_min.cdf(current_distances, *fit_params) df_inlier = pd.DataFrame( {key: quantiles}, index=distances.index @@ -794,8 +792,8 @@ class FreqaiDataKitchen: index=compute_df.index ) - inlier_metric = 2 * (inlier_metric - inlier_metric.min()) / \ - (inlier_metric.max() - inlier_metric.min()) - 1 + inlier_metric = (2 * (inlier_metric - inlier_metric.min()) / + (inlier_metric.max() - inlier_metric.min()) - 1) if set_ in ('train', 'test'): inlier_metric = inlier_metric.iloc[no_prev_pts:] @@ -956,8 +954,8 @@ class FreqaiDataKitchen: backtest_timerange.startts = ( backtest_timerange.startts - backtest_period_days * SECONDS_IN_DAY ) - start = datetime.utcfromtimestamp(backtest_timerange.startts) - stop = datetime.utcfromtimestamp(backtest_timerange.stopts) + start = datetime.fromtimestamp(backtest_timerange.startts, tz=timezone.utc) + stop = datetime.fromtimestamp(backtest_timerange.stopts, tz=timezone.utc) full_timerange = start.strftime("%Y%m%d") + "-" + stop.strftime("%Y%m%d") self.full_path = Path( diff --git a/freqtrade/freqai/freqai_interface.py b/freqtrade/freqai/freqai_interface.py index fd0554248..9b3e853ef 100644 --- a/freqtrade/freqai/freqai_interface.py +++ b/freqtrade/freqai/freqai_interface.py @@ -1,10 +1,9 @@ -# import contextlib import logging import shutil import threading import time from abc import ABC, abstractmethod -from datetime import datetime +from datetime import datetime, timezone from pathlib import Path from threading import Lock from typing import Any, Dict, Tuple @@ -214,12 +213,12 @@ class IFreqaiModel(ABC): dataframe_backtest = dk.slice_dataframe(tr_backtest, dataframe) trained_timestamp = tr_train - tr_train_startts_str = datetime.utcfromtimestamp(tr_train.startts).strftime( - "%Y-%m-%d %H:%M:%S" - ) - tr_train_stopts_str = datetime.utcfromtimestamp(tr_train.stopts).strftime( - "%Y-%m-%d %H:%M:%S" - ) + tr_train_startts_str = datetime.fromtimestamp( + tr_train.startts, + tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S") + tr_train_stopts_str = datetime.fromtimestamp( + tr_train.stopts, + tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S") logger.info( f"Training {metadata['pair']}, {self.pair_it}/{self.total_pairs} pairs" f" from {tr_train_startts_str} to {tr_train_stopts_str}, {train_it}/{total_trains} " diff --git a/tests/freqai/conftest.py b/tests/freqai/conftest.py index 6528347e8..ffdc52ebc 100644 --- a/tests/freqai/conftest.py +++ b/tests/freqai/conftest.py @@ -93,12 +93,11 @@ def make_unfiltered_dataframe(mocker, freqai_conf): freqai.live = True freqai.dk = FreqaiDataKitchen(freqai_conf) freqai.dk.pair = "ADA/BTC" - timerange = TimeRange.parse_timerange("20180110-20180130") - freqai.dd.load_all_pair_histories(timerange, freqai.dk) + data_load_timerange = TimeRange.parse_timerange("20180110-20180130") + freqai.dd.load_all_pair_histories(data_load_timerange, freqai.dk) freqai.dd.pair_dict = MagicMock() - - data_load_timerange = TimeRange.parse_timerange("20180110-20180130") + new_timerange = TimeRange.parse_timerange("20180120-20180130") corr_dataframes, base_dataframes = freqai.dd.get_base_and_corr_dataframes( @@ -125,12 +124,11 @@ def make_data_dictionary(mocker, freqai_conf): freqai.live = True freqai.dk = FreqaiDataKitchen(freqai_conf) freqai.dk.pair = "ADA/BTC" - timerange = TimeRange.parse_timerange("20180110-20180130") - freqai.dd.load_all_pair_histories(timerange, freqai.dk) + data_load_timerange = TimeRange.parse_timerange("20180110-20180130") + freqai.dd.load_all_pair_histories(data_load_timerange, freqai.dk) freqai.dd.pair_dict = MagicMock() - data_load_timerange = TimeRange.parse_timerange("20180110-20180130") new_timerange = TimeRange.parse_timerange("20180120-20180130") corr_dataframes, base_dataframes = freqai.dd.get_base_and_corr_dataframes( From e83c9b276dab1659da3ea245792fe858b43ed054 Mon Sep 17 00:00:00 2001 From: robcaulk Date: Tue, 6 Sep 2022 19:56:52 +0200 Subject: [PATCH 10/11] fix whitespace --- freqtrade/freqai/data_kitchen.py | 2 +- tests/freqai/conftest.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py index 8ef2d6aea..471634c85 100644 --- a/freqtrade/freqai/data_kitchen.py +++ b/freqtrade/freqai/data_kitchen.py @@ -9,7 +9,7 @@ from typing import Any, Dict, List, Tuple import numpy as np import numpy.typing as npt import pandas as pd -import scipy.stats as stats +from scipy import stats from pandas import DataFrame from sklearn import linear_model from sklearn.cluster import DBSCAN diff --git a/tests/freqai/conftest.py b/tests/freqai/conftest.py index ffdc52ebc..2bd744455 100644 --- a/tests/freqai/conftest.py +++ b/tests/freqai/conftest.py @@ -97,7 +97,7 @@ def make_unfiltered_dataframe(mocker, freqai_conf): freqai.dd.load_all_pair_histories(data_load_timerange, freqai.dk) freqai.dd.pair_dict = MagicMock() - + new_timerange = TimeRange.parse_timerange("20180120-20180130") corr_dataframes, base_dataframes = freqai.dd.get_base_and_corr_dataframes( From d44296783e6264e25a3c20cfc7f5e104883f8ce2 Mon Sep 17 00:00:00 2001 From: robcaulk Date: Tue, 6 Sep 2022 20:10:12 +0200 Subject: [PATCH 11/11] isort datakitchen --- freqtrade/freqai/data_kitchen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py index 471634c85..2ed0d73af 100644 --- a/freqtrade/freqai/data_kitchen.py +++ b/freqtrade/freqai/data_kitchen.py @@ -9,8 +9,8 @@ from typing import Any, Dict, List, Tuple import numpy as np import numpy.typing as npt import pandas as pd -from scipy import stats from pandas import DataFrame +from scipy import stats from sklearn import linear_model from sklearn.cluster import DBSCAN from sklearn.metrics.pairwise import pairwise_distances