From 683b084323d45e0647327756e71ed4ccd4f1d6dd Mon Sep 17 00:00:00 2001 From: th0rntwig Date: Wed, 28 Sep 2022 18:23:56 +0200 Subject: [PATCH 1/4] Set train-test-split shuffle=False as default and remove stratification --- docs/freqai-parameter-table.md | 3 +-- docs/freqai-running.md | 17 ----------------- freqtrade/freqai/data_kitchen.py | 12 +++--------- 3 files changed, 4 insertions(+), 28 deletions(-) diff --git a/docs/freqai-parameter-table.md b/docs/freqai-parameter-table.md index 5969f43c6..c4d044ba4 100644 --- a/docs/freqai-parameter-table.md +++ b/docs/freqai-parameter-table.md @@ -27,8 +27,7 @@ Mandatory parameters are marked as **Required** and have to be set in one of the | `weight_factor` | Weight training data points according to their recency (see details [here](freqai-feature-engineering.md#weighting-features-for-temporal-importance)).
**Datatype:** Positive float (typically < 1). | `indicator_max_period_candles` | **No longer used (#7325)**. Replaced by `startup_candle_count` which is set in the [strategy](freqai-configuration.md#building-a-freqai-strategy). `startup_candle_count` is timeframe independent and defines the maximum *period* used in `populate_any_indicators()` for indicator creation. `FreqAI` uses this parameter together with the maximum timeframe in `include_time_frames` to calculate how many data points to download such that the first data point does not include a NaN
**Datatype:** Positive integer. | `indicator_periods_candles` | Time periods to calculate indicators for. The indicators are added to the base indicator dataset.
**Datatype:** List of positive integers. -| `stratify_training_data` | Split the feature set into training and testing datasets. For example, `stratify_training_data: 2` would set every 2nd data point into a separate dataset to be pulled from during training/testing. See details about how it works [here](freqai-running.md#data-stratification-for-training-and-testing-the-model).
**Datatype:** Positive integer. -| `principal_component_analysis` | Automatically reduce the dimensionality of the data set using Principal Component Analysis. See details about how it works [here](#reducing-data-dimensionality-with-principal-component-analysis)
**Datatype:** Boolean. defaults to `false`. +| `principal_component_analysis` | Automatically reduce the dimensionality of the data set using Principal Component Analysis. See details about how it works [here](#reducing-data-dimensionality-with-principal-component-analysis)
**Datatype:** Boolean. defaults to `False`. | `plot_feature_importances` | Create a feature importance plot for each model for the top/bottom `plot_feature_importances` number of features.
**Datatype:** Integer, defaults to `0`. | `DI_threshold` | Activates the use of the Dissimilarity Index for outlier detection when set to > 0. See details about how it works [here](freqai-feature-engineering.md#identifying-outliers-with-the-dissimilarity-index-di).
**Datatype:** Positive float (typically < 1). | `use_SVM_to_remove_outliers` | Train a support vector machine to detect and remove outliers from the training dataset, as well as from incoming data points. See details about how it works [here](freqai-feature-engineering.md#identifying-outliers-using-a-support-vector-machine-svm).
**Datatype:** Boolean. diff --git a/docs/freqai-running.md b/docs/freqai-running.md index 6c7b56da1..bfefe88c2 100644 --- a/docs/freqai-running.md +++ b/docs/freqai-running.md @@ -105,23 +105,6 @@ During dry/live mode, FreqAI trains each coin pair sequentially (on separate thr In the presented example config, the user will only allow predictions on models that are less than 1/2 hours old. -## Data stratification for training and testing the model - -You can stratify (group) the training/testing data using: - -```json - "freqai": { - "feature_parameters" : { - "stratify_training_data": 3 - } - } -``` - -This will split the data chronologically so that every Xth data point is used to test the model after training. In the example above, the user is asking for every third data point in the dataframe to be used for -testing; the other points are used for training. - -The test data is used to evaluate the performance of the model after training. If the test score is high, the model is able to capture the behavior of the data well. If the test score is low, either the model does not capture the complexity of the data, the test data is significantly different from the train data, or a different type of model should be used. - ## Controlling the model learning process Model training parameters are unique to the selected machine learning library. FreqAI allows you to set any parameter for any library using the `model_training_parameters` dictionary in the config. The example config (found in `config_examples/config_freqai.example.json`) shows some of the example parameters associated with `Catboost` and `LightGBM`, but you can add any parameters available in those libraries or any other machine learning library you choose to implement. 
diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py index f4fa4e5fd..9e22667f3 100644 --- a/freqtrade/freqai/data_kitchen.py +++ b/freqtrade/freqai/data_kitchen.py @@ -134,20 +134,14 @@ class FreqaiDataKitchen: """ feat_dict = self.freqai_config["feature_parameters"] + shuffle = self.freqai_config.get('data_split_parameters', {}).get('shuffle', False) + weights: npt.ArrayLike if feat_dict.get("weight_factor", 0) > 0: weights = self.set_weights_higher_recent(len(filtered_dataframe)) else: weights = np.ones(len(filtered_dataframe)) - if feat_dict.get("stratify_training_data", 0) > 0: - stratification = np.zeros(len(filtered_dataframe)) - for i in range(1, len(stratification)): - if i % feat_dict.get("stratify_training_data", 0) == 0: - stratification[i] = 1 - else: - stratification = None - if self.freqai_config.get('data_split_parameters', {}).get('test_size', 0.1) != 0: ( train_features, @@ -160,7 +154,7 @@ class FreqaiDataKitchen: filtered_dataframe[: filtered_dataframe.shape[0]], labels, weights, - stratify=stratification, + shuffle=shuffle, **self.config["freqai"]["data_split_parameters"], ) else: From 772abfc6f033aec44c414cf3d183485bd5b5979c Mon Sep 17 00:00:00 2001 From: th0rntwig Date: Wed, 28 Sep 2022 19:29:02 +0200 Subject: [PATCH 2/4] Add default value for shuffle in docs --- docs/freqai-parameter-table.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/freqai-parameter-table.md b/docs/freqai-parameter-table.md index c4d044ba4..8e19226ba 100644 --- a/docs/freqai-parameter-table.md +++ b/docs/freqai-parameter-table.md @@ -40,7 +40,7 @@ Mandatory parameters are marked as **Required** and have to be set in one of the | | **Data split parameters** | `data_split_parameters` | Include any additional parameters available from Scikit-learn `test_train_split()`, which are shown [here](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) (external website).
**Datatype:** Dictionary. | `test_size` | The fraction of data that should be used for testing instead of training.
**Datatype:** Positive float < 1. -| `shuffle` | Shuffle the training data points during training. Typically, for time-series forecasting, this is set to `False`.
**Datatype:** Boolean. +| `shuffle` | Shuffle the training data points during training. Typically, to preserve the chronological order of data in time-series forecasting, this is set to `False`.
**Datatype:** Boolean.
Default: `False`. | | **Model training parameters** | `model_training_parameters` | A flexible dictionary that includes all parameters available by the selected model library. For example, if you use `LightGBMRegressor`, this dictionary can contain any parameter available by the `LightGBMRegressor` [here](https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRegressor.html) (external website). If you select a different model, this dictionary can contain any parameter from that model.
**Datatype:** Dictionary. | `n_estimators` | The number of boosted trees to fit in regression.
**Datatype:** Integer. From 38aca8e908fb532f44e889d35257961437866b93 Mon Sep 17 00:00:00 2001 From: robcaulk Date: Fri, 30 Sep 2022 00:22:31 +0200 Subject: [PATCH 3/4] fix failing svm test --- tests/freqai/test_freqai_datakitchen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/freqai/test_freqai_datakitchen.py b/tests/freqai/test_freqai_datakitchen.py index f7446420d..b99ac236d 100644 --- a/tests/freqai/test_freqai_datakitchen.py +++ b/tests/freqai/test_freqai_datakitchen.py @@ -86,7 +86,7 @@ def test_use_SVM_to_remove_outliers_and_outlier_protection(mocker, freqai_conf, freqai_conf['freqai']['feature_parameters'].update({"outlier_protection_percentage": 0.1}) freqai.dk.use_SVM_to_remove_outliers(predict=False) assert log_has_re( - "SVM detected 8.09%", + "SVM detected 8.66%", caplog, ) From be48131185764d0b707e706de4f0aa6d2a688193 Mon Sep 17 00:00:00 2001 From: robcaulk Date: Fri, 30 Sep 2022 00:33:08 +0200 Subject: [PATCH 4/4] make shuffle false in constants --- freqtrade/constants.py | 1 + freqtrade/freqai/data_kitchen.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/freqtrade/constants.py b/freqtrade/constants.py index e14e81343..acab8489c 100644 --- a/freqtrade/constants.py +++ b/freqtrade/constants.py @@ -567,6 +567,7 @@ CONF_SCHEMA = { "properties": { "test_size": {"type": "number"}, "random_state": {"type": "integer"}, + "shuffle": {"type": "boolean", "default": False} }, }, "model_training_parameters": { diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py index 9e22667f3..5cf9b2f03 100644 --- a/freqtrade/freqai/data_kitchen.py +++ b/freqtrade/freqai/data_kitchen.py @@ -134,7 +134,8 @@ class FreqaiDataKitchen: """ feat_dict = self.freqai_config["feature_parameters"] - shuffle = self.freqai_config.get('data_split_parameters', {}).get('shuffle', False) + if 'shuffle' not in self.freqai_config['data_split_parameters']: + 
self.freqai_config["data_split_parameters"].update({'shuffle': False}) weights: npt.ArrayLike if feat_dict.get("weight_factor", 0) > 0: @@ -154,7 +155,6 @@ class FreqaiDataKitchen: filtered_dataframe[: filtered_dataframe.shape[0]], labels, weights, - shuffle=shuffle, **self.config["freqai"]["data_split_parameters"], ) else: