allow users to buffer train data with buffer_train_data_candles parameter

robcaulk 2023-02-21 21:08:34 +01:00
parent 352f4962da
commit 2b5c11c7b4
6 changed files with 43 additions and 13 deletions

View File

@@ -46,6 +46,7 @@ Mandatory parameters are marked as **Required** and have to be set in one of the
| `outlier_protection_percentage` | Enable to prevent outlier detection methods from discarding too much data. If more than `outlier_protection_percentage` % of points are detected as outliers by the SVM or DBSCAN, FreqAI will log a warning message and ignore outlier detection, i.e., the original dataset will be kept intact. If the outlier protection is triggered, no predictions will be made based on the training dataset. <br> **Datatype:** Float. <br> Default: `30`.
| `reverse_train_test_order` | Split the feature dataset (see below) and use the latest data split for training and test on historical split of the data. This allows the model to be trained up to the most recent data point, while avoiding overfitting. However, you should be careful to understand the unorthodox nature of this parameter before employing it. <br> **Datatype:** Boolean. <br> Default: `False` (no reversal).
| `shuffle_after_split` | Split the data into train and test sets, and then shuffle both sets individually. <br> **Datatype:** Boolean. <br> Default: `False`.
| `buffer_train_data_candles` | Cut `buffer_train_data_candles` candles off the beginning and end of the training data *after* the indicators have been populated. The main use case is predicting maxima and minima: the argrelextrema function cannot know the maxima/minima at the edges of the timerange, so it is best to compute argrelextrema on the full timerange and then use this parameter to cut off the edges (the buffer) that fall within the kernel window. By contrast, if the targets are a shifted price movement, this buffer is unnecessary because the shifted candles at the end of the timerange will be NaN and FreqAI automatically cuts them from the training dataset. <br> **Datatype:** Integer. <br> Default: `0`.
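For illustration of the maxima/minima use case described in the row above, here is a minimal sketch of a strategy-side `set_freqai_targets`. The `&s-extrema` label and the kernel width of 24 are assumptions, not part of this commit; with targets built this way, `buffer_train_data_candles` would be set to at least the kernel width so that the unlabeled edge candles are trimmed from training.

```python
import numpy as np
from scipy.signal import argrelextrema


def set_freqai_targets(self, dataframe, metadata, **kwargs):
    # Hypothetical target: label local minima/maxima of the close price.
    # argrelextrema needs `order` candles on BOTH sides of a point, so the
    # first/last `order` candles of the timerange can never be confirmed
    # as extrema and stay labeled 0.
    order = 24
    dataframe["&s-extrema"] = 0
    minima = argrelextrema(dataframe["close"].values, np.less, order=order)[0]
    maxima = argrelextrema(dataframe["close"].values, np.greater, order=order)[0]
    dataframe.loc[dataframe.index[minima], "&s-extrema"] = -1
    dataframe.loc[dataframe.index[maxima], "&s-extrema"] = 1
    # Pair this with "buffer_train_data_candles": 24 in feature_parameters so
    # the unreliable edge labels are cut from the training data.
    return dataframe
```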
### Data split parameters

View File

@@ -569,7 +569,8 @@ CONF_SCHEMA = {
"nu": {"type": "number", "default": 0.1}
},
},
"shuffle_after_split": {"type": "boolean", "default": False},
"buffer_train_data_candles": {"type": "integer", "default": 0}
},
"required": ["include_timeframes", "include_corr_pairlist", ]
},
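For reference, a sketch of where the new key sits in a user configuration, written here as a Python dict; only `buffer_train_data_candles` is the new key, all surrounding keys and values are placeholders.

```python
# Illustrative feature_parameters excerpt (placeholder values).
freqai_feature_parameters = {
    "include_timeframes": ["5m", "1h"],
    "include_corr_pairlist": ["BTC/USDT"],
    "buffer_train_data_candles": 20,  # integer, default 0 (no buffering)
}
```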

View File

@@ -1562,3 +1562,25 @@ class FreqaiDataKitchen:
dataframe.columns = dataframe.columns.str.replace(c, "")
return dataframe
def buffer_timerange(self, timerange: TimeRange):
"""
Buffer the start and end of the timerange. This is used *after* the indicators
are populated.
The main use case is predicting maxima and minima: the argrelextrema
function cannot know the maxima/minima at the edges of the timerange, so it
is best to compute argrelextrema on the full timerange and then use this
function to cut off the edges (the buffer) that fall within the kernel window.
By contrast, if the targets are a shifted price movement, this buffer is
unnecessary because the shifted candles at the end of the timerange will be
NaN and FreqAI automatically cuts them from the training dataset.
"""
buffer = self.freqai_config["feature_parameters"]["buffer_train_data_candles"]
if buffer:
timerange.stopts -= buffer * timeframe_to_seconds(self.config["timeframe"])
timerange.startts += buffer * timeframe_to_seconds(self.config["timeframe"])
return timerange
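A quick worked example of the arithmetic in `buffer_timerange`, assuming a 5m timeframe and a buffer of 20 candles (both values chosen only for illustration), using freqtrade's `timeframe_to_seconds` helper as the data kitchen does:

```python
from freqtrade.exchange import timeframe_to_seconds

buffer = 20
shift = buffer * timeframe_to_seconds("5m")  # 20 * 300 = 6000 seconds
# startts moves forward by `shift` and stopts moves back by `shift`, so the
# training window loses 20 candles on each end: exactly the candles whose
# targets an edge-blind kernel such as argrelextrema could not label reliably.
```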

View File

@@ -330,6 +330,8 @@ class IFreqaiModel(ABC):
dataframe_base_backtest = strategy.set_freqai_targets(
dataframe_base_backtest, metadata=metadata)
tr_train = dk.buffer_timerange(tr_train)
dataframe_train = dk.slice_dataframe(tr_train, dataframe_base_train)
dataframe_backtest = dk.slice_dataframe(tr_backtest, dataframe_base_backtest)
@@ -614,6 +616,8 @@ class IFreqaiModel(ABC):
strategy, corr_dataframes, base_dataframes, pair
)
new_trained_timerange = dk.buffer_timerange(new_trained_timerange)
unfiltered_dataframe = dk.slice_dataframe(new_trained_timerange, unfiltered_dataframe)
# find the features indicated by strategy and store in datakitchen

View File

@@ -46,7 +46,8 @@ def freqai_conf(default_conf, tmpdir):
"use_SVM_to_remove_outliers": True,
"stratify_training_data": 0,
"indicator_periods_candles": [10],
"shuffle_after_split": False,
"buffer_train_data_candles": 0
},
"data_split_parameters": {"test_size": 0.33, "shuffle": False},
"model_training_parameters": {"n_estimators": 100},

View File

@@ -27,19 +27,19 @@ def is_mac() -> bool:
return "Darwin" in machine
@pytest.mark.parametrize('model, pca, dbscan, float32, can_short, shuffle, buffer', [
('LightGBMRegressor', True, False, True, True, False, 0),
('XGBoostRegressor', False, True, False, True, False, 10),
('XGBoostRFRegressor', False, False, False, True, False, 0),
('CatboostRegressor', False, False, False, True, True, 0),
('ReinforcementLearner', False, True, False, True, False, 0),
('ReinforcementLearner_multiproc', False, False, False, True, False, 0),
('ReinforcementLearner_test_3ac', False, False, False, False, False, 0),
('ReinforcementLearner_test_3ac', False, False, False, True, False, 0),
('ReinforcementLearner_test_4ac', False, False, False, True, False, 0)
])
def test_extract_data_and_train_model_Standard(mocker, freqai_conf, model, pca,
dbscan, float32, can_short, shuffle, buffer):
if is_arm() and model == 'CatboostRegressor':
pytest.skip("CatBoost is not supported on ARM")
@@ -55,6 +55,7 @@ def test_extract_data_and_train_model_Standard(mocker, freqai_conf, model, pca,
freqai_conf['freqai']['feature_parameters'].update({"use_DBSCAN_to_remove_outliers": dbscan})
freqai_conf.update({"reduce_df_footprint": float32})
freqai_conf['freqai']['feature_parameters'].update({"shuffle_after_split": shuffle})
freqai_conf['freqai']['feature_parameters'].update({"buffer_train_data_candles": buffer})
if 'ReinforcementLearner' in model:
model_save_ext = 'zip'