Enable hourly/minute retraining in live/dry. Suppress catboost folder output. Update config + constants + docs to reflect updates.

This commit is contained in:
robcaulk 2022-05-23 00:06:26 +02:00
parent 42d95af829
commit af0cc21af9
6 changed files with 66 additions and 35 deletions

View File

@ -71,7 +71,8 @@
"DI_threshold": 1,
"weight_factor": 0,
"principal_component_analysis": false,
"use_SVM_to_remove_outliers": false
"use_SVM_to_remove_outliers": false,
"stratify": 0
},
"data_split_parameters": {
"test_size": 0.25,

View File

@ -151,7 +151,8 @@ no. `timeframes` * no. `base_features` * no. `corr_pairlist` * no. `shift`_
Users define the backtesting timerange with the typical `--timerange` parameter in the user
configuration file. `train_period` is the duration of the sliding training window, while
`backtest_period` is the sliding backtesting window, both in number of days. In the present example,
`backtest_period` is the sliding backtesting window, both in number of days (`backtest_period` can be
a float to indicate sub-daily retraining in live/dry mode). In the present example,
the user is asking Freqai to use a training period of 30 days and backtest the subsequent 7 days.
This means that if the user sets `--timerange 20210501-20210701`,
Freqai will train 8 separate models (because the full range comprises 8 weeks),
@ -347,6 +348,22 @@ Freqai will train an SVM on the training data (or components if the user activat
`principal_component_analysis`) and remove any data point that it deems to sit beyond the
feature space.
## Stratifying the data
The user can stratify the training/testing data using:
```json
"freqai": {
"feature_parameters" : {
"stratify": 3
}
}
```
which will split the data chronologically so that every Xth data point is a testing data point. In the
present example, the user is asking for every third data point in the dataframe to be used for
testing; the other points are used for training.
## Additional information
### Feature standardization

View File

@ -438,7 +438,7 @@ CONF_SCHEMA = {
"properties": {
"timeframes": {"type": "list"},
"train_period": {"type": "integer", "default": 0},
"backtest_period": {"type": "integer", "default": 7},
"backtest_period": {"type": "float", "default": 7},
"identifier": {"type": "str", "default": "example"},
"live_trained_timerange": {"type": "str"},
"live_full_backtestrange": {"type": "str"},
@ -451,7 +451,7 @@ CONF_SCHEMA = {
"DI_threshold": {"type": "integer", "default": 0},
"weight_factor": {"type": "number", "default": 0},
"principal_component_analysis": {"type": "boolean", "default": False},
"remove_outliers": {"type": "boolean", "default": False},
"use_SVM_to_remove_outliers": {"type": "boolean", "default": False},
},
},
"data_split_parameters": {

View File

@ -689,50 +689,58 @@ class FreqaiDataKitchen:
return full_timerange
def check_if_new_training_required(self, training_timerange: str,
metadata: dict) -> Tuple[bool, str]:
def check_if_new_training_required(self, trained_timerange: TimeRange,
metadata: dict,
timestamp: int = 0) -> Tuple[bool, TimeRange, int]:
time = datetime.datetime.now(tz=datetime.timezone.utc).timestamp()
if training_timerange: # user passed no live_trained_timerange in config
trained_timerange = TimeRange.parse_timerange(training_timerange)
if trained_timerange.startts != 0:
# trained_timerange = TimeRange.parse_timerange(training_timerange)
# keep hour available in case user wants to train multiple times per day
# training_timerange is a str for day range only, so we add the extra hours
# original_stop_seconds = trained_timerange.stopts
# trained_timerange.stopts += int(timestamp - original_stop_seconds)
# trained_timerange.startts += int(timestamp - original_stop_seconds)
elapsed_time = (time - trained_timerange.stopts) / SECONDS_IN_DAY
trained_timerange.startts += self.freqai_config['backtest_period'] * SECONDS_IN_DAY
trained_timerange.stopts += self.freqai_config['backtest_period'] * SECONDS_IN_DAY
retrain = elapsed_time > self.freqai_config['backtest_period']
else:
trained_timerange = TimeRange.parse_timerange("20000101-20000201")
if retrain:
trained_timerange.startts += self.freqai_config['backtest_period'] * SECONDS_IN_DAY
trained_timerange.stopts += self.freqai_config['backtest_period'] * SECONDS_IN_DAY
else: # user passed no live_trained_timerange in config
trained_timerange = TimeRange.parse_timerange("20000101-20000201") # arbitrary date
trained_timerange.startts = int(time - self.freqai_config['train_period'] *
SECONDS_IN_DAY)
trained_timerange.stopts = int(time)
retrain = True
start = datetime.datetime.utcfromtimestamp(trained_timerange.startts)
stop = datetime.datetime.utcfromtimestamp(trained_timerange.stopts)
new_trained_timerange = start.strftime("%Y%m%d") + "-" + stop.strftime("%Y%m%d")
timestamp = trained_timerange.stopts
# start = datetime.datetime.utcfromtimestamp(trained_timerange.startts)
# stop = datetime.datetime.utcfromtimestamp(trained_timerange.stopts)
# new_trained_timerange_str = start.strftime("%Y%m%d") + "-" + stop.strftime("%Y%m%d")
if retrain:
coin, _ = metadata['pair'].split("/")
# set the new model_path
self.model_path = Path(self.full_path / str("sub-train" + "-" +
str(new_trained_timerange)))
str(timestamp)))
self.model_filename = "cb_" + coin.lower() + "_" + new_trained_timerange
self.model_filename = "cb_" + coin.lower() + "_" + str(timestamp)
# this is not persistent at the moment TODO
self.freqai_config['live_trained_timerange'] = new_trained_timerange
self.freqai_config['live_trained_timerange'] = str(timestamp)
# enables persistence, but not fully implemented into save/load data yet
self.data['live_trained_timerange'] = new_trained_timerange
self.data['live_trained_timerange'] = str(timestamp)
return retrain, new_trained_timerange
return retrain, trained_timerange, timestamp
def download_new_data_for_retraining(self, new_timerange: str, metadata: dict) -> None:
def download_new_data_for_retraining(self, timerange: TimeRange, metadata: dict) -> None:
exchange = ExchangeResolver.load_exchange(self.config['exchange']['name'],
self.config, validate=False)
pairs = self.freqai_config['corr_pairlist']
if metadata['pair'] not in pairs:
pairs += metadata['pair'] # dont include pair twice
timerange = TimeRange.parse_timerange(new_timerange)
# timerange = TimeRange.parse_timerange(new_timerange)
refresh_backtest_ohlcv_data(
exchange, pairs=pairs, timeframes=self.freqai_config['timeframes'],
@ -743,12 +751,12 @@ class FreqaiDataKitchen:
prepend=self.config.get('prepend_data', False)
)
def load_pairs_histories(self, new_timerange: str, metadata: dict) -> Tuple[Dict[Any, Any],
DataFrame]:
def load_pairs_histories(self, timerange: TimeRange, metadata: dict) -> Tuple[Dict[Any, Any],
DataFrame]:
corr_dataframes: Dict[Any, Any] = {}
base_dataframes: Dict[Any, Any] = {}
pairs = self.freqai_config['corr_pairlist'] # + [metadata['pair']]
timerange = TimeRange.parse_timerange(new_timerange)
# timerange = TimeRange.parse_timerange(new_timerange)
for tf in self.freqai_config['timeframes']:
base_dataframes[tf] = load_pair_history(datadir=self.config['datadir'],
@ -763,10 +771,6 @@ class FreqaiDataKitchen:
timeframe=tf,
pair=p, timerange=timerange)
# base_dataframe = [dataframe for key, dataframe in corr_dataframes.items()
# if metadata['pair'] in key]
# [0] indexes the lowest tf for the basepair
return corr_dataframes, base_dataframes
def use_strategy_to_populate_indicators(self, strategy: IStrategy,

View File

@ -11,6 +11,7 @@ import numpy.typing as npt
import pandas as pd
from pandas import DataFrame
from freqtrade.configuration import TimeRange
from freqtrade.enums import RunMode
from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
from freqtrade.strategy.interface import IStrategy
@ -63,6 +64,12 @@ class IFreqaiModel(ABC):
self.training_on_separate_thread = False
self.retrain = False
self.first = True
self.timestamp = 0
if self.freqai_info['live_trained_timerange']:
self.new_trained_timerange = TimeRange.parse_timerange(
self.freqai_info['live_trained_timerange'])
else:
self.new_trained_timerange = TimeRange()
def start(self, dataframe: DataFrame, metadata: dict, strategy: IStrategy) -> DataFrame:
"""
@ -150,9 +157,10 @@ class IFreqaiModel(ABC):
if not self.training_on_separate_thread:
# this will also prevent other pairs from trying to train simultaneously.
(self.retrain,
self.new_trained_timerange) = self.dh.check_if_new_training_required(self.freqai_info[
'live_trained_timerange'],
metadata)
self.new_trained_timerange,
self.timestamp) = self.dh.check_if_new_training_required(self.new_trained_timerange,
metadata,
timestamp=self.timestamp)
else:
logger.info("FreqAI training a new model on background thread.")
self.retrain = False
@ -250,7 +258,7 @@ class IFreqaiModel(ABC):
:param pair: pair e.g. BTC/USD
:param path: path to model
"""
if self.live and training_timerange is None:
if self.live and training_timerange == "":
return False
coin, _ = pair.split("/")
self.dh.model_filename = "cb_" + coin.lower() + "_" + training_timerange
@ -263,7 +271,7 @@ class IFreqaiModel(ABC):
return file_exists
@threaded
def retrain_model_on_separate_thread(self, new_trained_timerange: str, metadata: dict,
def retrain_model_on_separate_thread(self, new_trained_timerange: TimeRange, metadata: dict,
strategy: IStrategy):
# with nostdout():
@ -282,7 +290,7 @@ class IFreqaiModel(ABC):
self.training_on_separate_thread = False
self.retrain = False
def train_model_in_series(self, new_trained_timerange: str, metadata: dict,
def train_model_in_series(self, new_trained_timerange: TimeRange, metadata: dict,
strategy: IStrategy):
self.dh.download_new_data_for_retraining(new_trained_timerange, metadata)

View File

@ -101,6 +101,7 @@ class CatboostPredictionModel(IFreqaiModel):
)
model = CatBoostRegressor(
allow_writing_files=False,
verbose=100, early_stopping_rounds=400, **self.model_training_parameters
)
model.fit(X=train_data, eval_set=test_data)