From c9bc91c75b8414d70bbe6291497d068f5b9d355e Mon Sep 17 00:00:00 2001 From: robcaulk Date: Fri, 16 Dec 2022 11:20:37 +0100 Subject: [PATCH 1/3] add shuffle_after_split option --- freqtrade/freqai/data_kitchen.py | 14 ++++++++++++++ tests/freqai/test_freqai_interface.py | 20 +++++++++++--------- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py index 9c8158c8a..de6b74b21 100644 --- a/freqtrade/freqai/data_kitchen.py +++ b/freqtrade/freqai/data_kitchen.py @@ -1,5 +1,6 @@ import copy import logging +import random import shutil from datetime import datetime, timezone from math import cos, sin @@ -168,6 +169,19 @@ class FreqaiDataKitchen: train_labels = labels train_weights = weights + if feat_dict.get("shuffle_after_split", False): + rint1 = random.randint(0, 100) + rint2 = random.randint(0, 100) + train_features = train_features.sample( + frac=1, random_state=rint1).reset_index(drop=True) + train_labels = train_labels.sample(frac=1, random_state=rint1).reset_index(drop=True) + train_weights = pd.DataFrame(train_weights).sample( + frac=1, random_state=rint1).reset_index(drop=True).to_numpy()[:, 0] + test_features = test_features.sample(frac=1, random_state=rint2).reset_index(drop=True) + test_labels = test_labels.sample(frac=1, random_state=rint2).reset_index(drop=True) + test_weights = pd.DataFrame(test_weights).sample( + frac=1, random_state=rint2).reset_index(drop=True).to_numpy()[:, 0] + # Simplest way to reverse the order of training and test data: if self.freqai_config['feature_parameters'].get('reverse_train_test_order', False): return self.build_data_dictionary( diff --git a/tests/freqai/test_freqai_interface.py b/tests/freqai/test_freqai_interface.py index f19acb018..fde167823 100644 --- a/tests/freqai/test_freqai_interface.py +++ b/tests/freqai/test_freqai_interface.py @@ -27,16 +27,17 @@ def is_mac() -> bool: return "Darwin" in machine -@pytest.mark.parametrize('model, pca, 
dbscan, float32', [ - ('LightGBMRegressor', True, False, True), - ('XGBoostRegressor', False, True, False), - ('XGBoostRFRegressor', False, False, False), - ('CatboostRegressor', False, False, False), - ('ReinforcementLearner', False, True, False), - ('ReinforcementLearner_multiproc', False, False, False), - ('ReinforcementLearner_test_4ac', False, False, False) +@pytest.mark.parametrize('model, pca, dbscan, float32, shuffle', [ + ('LightGBMRegressor', True, False, True, False), + ('XGBoostRegressor', False, True, False, False), + ('XGBoostRFRegressor', False, False, False, False), + ('CatboostRegressor', False, False, False, True), + ('ReinforcementLearner', False, True, False, False), + ('ReinforcementLearner_multiproc', False, False, False, False), + ('ReinforcementLearner_test_4ac', False, False, False, False) ]) -def test_extract_data_and_train_model_Standard(mocker, freqai_conf, model, pca, dbscan, float32): +def test_extract_data_and_train_model_Standard(mocker, freqai_conf, model, pca, + dbscan, float32, shuffle): if is_arm() and model == 'CatboostRegressor': pytest.skip("CatBoost is not supported on ARM") @@ -50,6 +51,7 @@ def test_extract_data_and_train_model_Standard(mocker, freqai_conf, model, pca, freqai_conf['freqai']['feature_parameters'].update({"principal_component_analysis": pca}) freqai_conf['freqai']['feature_parameters'].update({"use_DBSCAN_to_remove_outliers": dbscan}) freqai_conf.update({"reduce_df_footprint": float32}) + freqai_conf['freqai']['feature_parameters'].update({"shuffle_after_split": shuffle}) if 'ReinforcementLearner' in model: model_save_ext = 'zip' From be85ef2707fbcc3dd68964da7b1af791238d890a Mon Sep 17 00:00:00 2001 From: robcaulk Date: Thu, 16 Feb 2023 18:50:11 +0100 Subject: [PATCH 2/3] add documentation for shuffle_after_split, add to constants --- docs/freqai-parameter-table.md | 1 + freqtrade/constants.py | 3 ++- freqtrade/freqai/data_kitchen.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git 
a/docs/freqai-parameter-table.md b/docs/freqai-parameter-table.md index 43a066fb8..328e7c0b5 100644 --- a/docs/freqai-parameter-table.md +++ b/docs/freqai-parameter-table.md @@ -45,6 +45,7 @@ Mandatory parameters are marked as **Required** and have to be set in one of the | `noise_standard_deviation` | If set, FreqAI adds noise to the training features with the aim of preventing overfitting. FreqAI generates random deviates from a gaussian distribution with a standard deviation of `noise_standard_deviation` and adds them to all data points. `noise_standard_deviation` should be kept relative to the normalized space, i.e., between -1 and 1. In other words, since data in FreqAI is always normalized to be between -1 and 1, `noise_standard_deviation: 0.05` would result in 32% of the data being randomly increased/decreased by more than 2.5% (i.e., the percent of data falling within the first standard deviation).
**Datatype:** Integer.
Default: `0`. | `outlier_protection_percentage` | Enable to prevent outlier detection methods from discarding too much data. If more than `outlier_protection_percentage` % of points are detected as outliers by the SVM or DBSCAN, FreqAI will log a warning message and ignore outlier detection, i.e., the original dataset will be kept intact. If the outlier protection is triggered, no predictions will be made based on the training dataset.
**Datatype:** Float.
Default: `30`. | `reverse_train_test_order` | Split the feature dataset (see below) and use the latest data split for training and test on historical split of the data. This allows the model to be trained up to the most recent data point, while avoiding overfitting. However, you should be careful to understand the unorthodox nature of this parameter before employing it.
**Datatype:** Boolean.
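Default: `False` (no reversal). +| `shuffle_after_split` | Split the data into train and test sets, and then shuffle each set individually, i.e., rows are reordered within the train set and within the test set but are never moved between the two sets.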
**Datatype:** Boolean.
Default: `False`. ### Data split parameters diff --git a/freqtrade/constants.py b/freqtrade/constants.py index b2e707d1a..a724664a4 100644 --- a/freqtrade/constants.py +++ b/freqtrade/constants.py @@ -568,7 +568,8 @@ CONF_SCHEMA = { "shuffle": {"type": "boolean", "default": False}, "nu": {"type": "number", "default": 0.1} }, - } + }, + "shuffle_after_split": {"type": "boolean", "default": False} }, "required": ["include_timeframes", "include_corr_pairlist", ] }, diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py index 9c7c9101c..30d2509b5 100644 --- a/freqtrade/freqai/data_kitchen.py +++ b/freqtrade/freqai/data_kitchen.py @@ -171,7 +171,7 @@ class FreqaiDataKitchen: train_labels = labels train_weights = weights - if feat_dict.get("shuffle_after_split", False): + if feat_dict["shuffle_after_split"]: rint1 = random.randint(0, 100) rint2 = random.randint(0, 100) train_features = train_features.sample( From 351c5fbf7f12319a46439b5b2c18fe8650a8e8c4 Mon Sep 17 00:00:00 2001 From: robcaulk Date: Thu, 16 Feb 2023 19:48:22 +0100 Subject: [PATCH 3/3] add shuffle_after_split to conftest --- tests/freqai/conftest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/freqai/conftest.py b/tests/freqai/conftest.py index bee7df27e..5e8945239 100644 --- a/tests/freqai/conftest.py +++ b/tests/freqai/conftest.py @@ -46,6 +46,7 @@ def freqai_conf(default_conf, tmpdir): "use_SVM_to_remove_outliers": True, "stratify_training_data": 0, "indicator_periods_candles": [10], + "shuffle_after_split": False }, "data_split_parameters": {"test_size": 0.33, "shuffle": False}, "model_training_parameters": {"n_estimators": 100},