Overhaul of backend data management: increase performance by holding candle history in memory, and reduce exchange rate-limit load by querying the exchange only once per new candle. Improve code readability.

This commit is contained in:
robcaulk
2022-06-03 15:19:46 +02:00
parent 4ac6ef2972
commit 16b4a5b71f
5 changed files with 342 additions and 70 deletions
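
As a rough illustration of the approach described in the commit message, the sketch below caches the full candle history in memory and appends only the newest candle on each update. The CandleCache class and its seed()/update() methods are hypothetical names used for illustration only; the actual implementation lives in FreqaiDataDrawer and FreqaiDataKitchen in the diff below.

from typing import Dict, List, Tuple

# (timestamp, open, high, low, close, volume)
Candle = Tuple[int, float, float, float, float, float]


class CandleCache:
    """Hypothetical in-memory candle store (illustration only)."""

    def __init__(self) -> None:
        # pair -> full candle history held in memory
        self.history: Dict[str, List[Candle]] = {}

    def seed(self, pair: str, candles: List[Candle]) -> None:
        # one-off bulk download at startup; afterwards only the newest
        # candle is requested from the exchange
        self.history[pair] = list(candles)

    def update(self, pair: str, latest: Candle) -> None:
        # called once per candle: append only if the candle is new, so the
        # exchange is hit once per candle instead of re-downloading history
        cached = self.history.setdefault(pair, [])
        if not cached or latest[0] > cached[-1][0]:
            cached.append(latest)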


@@ -44,9 +44,9 @@ class IFreqaiModel(ABC):
self.config = config
self.assert_config(self.config)
self.freqai_info = config["freqai"]
self.data_split_parameters = config["freqai"]["data_split_parameters"]
self.model_training_parameters = config["freqai"]["model_training_parameters"]
self.feature_parameters = config["freqai"]["feature_parameters"]
self.data_split_parameters = config.get('freqai', {}).get("data_split_parameters")
self.model_training_parameters = config.get("freqai", {}).get("model_training_parameters")
self.feature_parameters = config.get("freqai", {}).get("feature_parameters")
self.time_last_trained = None
self.current_time = None
self.model = None
@@ -54,6 +54,7 @@ class IFreqaiModel(ABC):
self.training_on_separate_thread = False
self.retrain = False
self.first = True
self.update_historic_data = 0
self.set_full_path()
self.follow_mode = self.freqai_info.get('follow_mode', False)
self.data_drawer = FreqaiDataDrawer(Path(self.full_path),
@@ -95,15 +96,12 @@ class IFreqaiModel(ABC):
self.dh = FreqaiDataKitchen(self.config, self.data_drawer,
self.live, metadata["pair"])
dh = self.start_live(dataframe, metadata, strategy, self.dh)
dh = self.start_live(dataframe, metadata, strategy, self.dh, trainable=True)
else:
# we will have at most 2 separate instances of the kitchen at once.
self.dh_fg = FreqaiDataKitchen(self.config, self.data_drawer,
self.live, metadata["pair"])
dh = self.start_live(dataframe, metadata, strategy, self.dh_fg)
# return (dh.full_predictions, dh.full_do_predict,
# dh.full_target_mean, dh.full_target_std)
dh = self.start_live(dataframe, metadata, strategy, self.dh_fg, trainable=False)
# For backtesting, each pair enters and then gets trained for each window along the
# sliding window defined by "train_period" (training window) and "backtest_period"
@@ -115,8 +113,9 @@ class IFreqaiModel(ABC):
logger.info(f'Training {len(self.dh.training_timeranges)} timeranges')
dh = self.start_backtesting(dataframe, metadata, self.dh)
return (dh.full_predictions, dh.full_do_predict,
dh.full_target_mean, dh.full_target_std)
return self.return_values(dataframe, dh)
# return (dh.full_predictions, dh.full_do_predict,
# dh.full_target_mean, dh.full_target_std)
def start_backtesting(self, dataframe: DataFrame, metadata: dict,
dh: FreqaiDataKitchen) -> FreqaiDataKitchen:
@@ -185,7 +184,8 @@ class IFreqaiModel(ABC):
return dh
def start_live(self, dataframe: DataFrame, metadata: dict,
strategy: IStrategy, dh: FreqaiDataKitchen) -> FreqaiDataKitchen:
strategy: IStrategy, dh: FreqaiDataKitchen,
trainable: bool) -> FreqaiDataKitchen:
"""
The main execution path for dry/live. This function checks whether retraining is
required and, if so, retrains and resets the model.
@@ -198,25 +198,35 @@ class IFreqaiModel(ABC):
dh: FreqaiDataKitchen = Data management/analysis tool associated with the present pair only
"""
# update follower
if self.follow_mode:
# follower needs to load from disk to get any changes made by leader to pair_dict
self.data_drawer.load_drawer_from_disk()
if self.freqai_info.get('purge_old_models', False):
self.data_drawer.purge_old_models()
self.data_drawer.update_follower_metadata()
# get the model metadata associated with the current pair
(model_filename,
trained_timestamp,
coin_first,
return_null_array) = self.data_drawer.get_pair_dict_info(metadata)
# if the files do not yet exist, the follower returns null arrays to strategy
# if the metadata doesn't exist, the follower returns null arrays to strategy
if self.follow_mode and return_null_array:
logger.info('Returning null array from follower to strategy')
self.data_drawer.return_null_values_to_strategy(dataframe, dh)
return dh
if (not self.training_on_separate_thread and not self.follow_mode
and self.data_drawer.pair_dict[metadata['pair']]['priority'] == 1) or coin_first:
# append the historic data once per round
if (self.data_drawer.historic_data and
self.update_historic_data >= len(self.config.get('exchange', {})
.get('pair_whitelist', []))):
dh.update_historic_data(strategy)
self.update_historic_data = 1
else:
self.update_historic_data += 1
# if trainable, check whether the model needs training; if so, compute a new timerange,
# then save the model and metadata.
# if not trainable, load the existing data
if (trainable and not self.follow_mode) or coin_first:
file_exists = False
if trained_timestamp != 0: # historical model available
@@ -231,6 +241,15 @@ class IFreqaiModel(ABC):
data_load_timerange) = dh.check_if_new_training_required(trained_timestamp)
dh.set_paths(metadata, new_trained_timerange.stopts)
# download candle history if it is not already in memory
if not self.data_drawer.historic_data:
logger.info('Downloading all training data for all pairs in the whitelist and '
'corr_pairlist; this may take a while if the data is not already '
'saved locally')
dh.download_all_data_for_training(data_load_timerange)
dh.load_all_pair_histories(data_load_timerange)
# train the model on the trained timerange
if self.retrain or not file_exists:
if coin_first:
self.train_model_in_series(new_trained_timerange, metadata,
@@ -241,17 +260,24 @@ class IFreqaiModel(ABC):
metadata, strategy,
dh, data_load_timerange)
elif self.training_on_separate_thread and not self.follow_mode:
logger.info("FreqAI training a new model on background thread.")
elif not trainable and not self.follow_mode:
logger.info(f'{metadata["pair"]} holds spot '
f'{self.data_drawer.pair_dict[metadata["pair"]]["priority"]} '
'in training queue')
elif self.follow_mode:
dh.set_paths(metadata, trained_timestamp)
logger.info('FreqAI instance set to follow_mode, finding existing pair '
f'using {self.identifier}')
# load the model and associated data into the data kitchen
self.model = dh.load_data(coin=metadata['pair'])
# ensure user is feeding the correct indicators to the model
self.check_if_feature_list_matches_strategy(dataframe, dh)
# hold the historical predictions in memory so we send the correct array
# back to the strategy. FIXME: currently broken, but only affects
# FreqUI reporting. Signals remain unaffected.
if metadata['pair'] not in self.data_drawer.model_return_values:
preds, do_preds = self.predict(dataframe, dh)
dh.append_predictions(preds, do_preds, len(dataframe))
@@ -268,6 +294,13 @@ class IFreqaiModel(ABC):
def check_if_feature_list_matches_strategy(self, dataframe: DataFrame,
dh: FreqaiDataKitchen) -> None:
"""
Ensure user is passing the proper feature set if they are reusing an `identifier` pointing
to a folder holding existing models.
:params:
dataframe: DataFrame = strategy provided dataframe
dh: FreqaiDataKitchen = non-persistent data container/analyzer for current coin/bot loop
"""
strategy_provided_features = dh.find_features(dataframe)
if 'training_features_list_raw' in dh.data:
feature_list = dh.data['training_features_list_raw']
@@ -356,11 +389,24 @@ class IFreqaiModel(ABC):
def retrain_model_on_separate_thread(self, new_trained_timerange: TimeRange, metadata: dict,
strategy: IStrategy, dh: FreqaiDataKitchen,
data_load_timerange: TimeRange):
"""
Retrieve data and train the model on a separate thread. Always called if the model folder already
contains a full set of trained models.
:params:
new_trained_timerange: TimeRange = the timerange to train the model on
metadata: dict = strategy provided metadata
strategy: IStrategy = user defined strategy object
dh: FreqaiDataKitchen = non-persistent data container for current coin/loop
data_load_timerange: TimeRange = the amount of data to be loaded for populate_any_indicators
(larger than new_trained_timerange so that new_trained_timerange does not contain any NaNs)
"""
# with nostdout():
dh.download_new_data_for_retraining(data_load_timerange, metadata, strategy)
corr_dataframes, base_dataframes = dh.load_pairs_histories(data_load_timerange,
metadata)
# dh.download_new_data_for_retraining(data_load_timerange, metadata, strategy)
# corr_dataframes, base_dataframes = dh.load_pairs_histories(data_load_timerange,
# metadata)
corr_dataframes, base_dataframes = dh.get_base_and_corr_dataframes(data_load_timerange,
metadata)
# protecting from common benign errors associated with grabbing new data from exchange:
try:
@@ -408,10 +454,22 @@ class IFreqaiModel(ABC):
def train_model_in_series(self, new_trained_timerange: TimeRange, metadata: dict,
strategy: IStrategy, dh: FreqaiDataKitchen,
data_load_timerange: TimeRange):
dh.download_new_data_for_retraining(data_load_timerange, metadata, strategy)
corr_dataframes, base_dataframes = dh.load_pairs_histories(data_load_timerange,
metadata)
"""
Retrieve data and train the model in single-threaded mode (only used if the model directory is empty
upon startup for dry/live)
:params:
new_trained_timerange: TimeRange = the timerange to train the model on
metadata: dict = strategy provided metadata
strategy: IStrategy = user defined strategy object
dh: FreqaiDataKitchen = non-persistent data container for current coin/loop
data_load_timerange: TimeRange = the amount of data to be loaded for populate_any_indicators
(larger than new_trained_timerange so that new_trained_timerange does not contain any NaNs)
"""
# dh.download_new_data_for_retraining(data_load_timerange, metadata, strategy)
# corr_dataframes, base_dataframes = dh.load_pairs_histories(data_load_timerange,
# metadata)
corr_dataframes, base_dataframes = dh.get_base_and_corr_dataframes(data_load_timerange,
metadata)
unfiltered_dataframe = dh.use_strategy_to_populate_indicators(strategy,
corr_dataframes,
@@ -481,3 +539,17 @@ class IFreqaiModel(ABC):
"""
return
@abstractmethod
def return_values(self, dataframe: DataFrame, dh: FreqaiDataKitchen) -> DataFrame:
"""
User defines the dataframe to be returned to strategy here.
:params:
dataframe: DataFrame = the full dataframe for the current prediction (live)
or --timerange (backtesting)
dh: FreqaiDataKitchen = Data management/analysis tool associated with the present pair only
:returns:
dataframe: DataFrame = dataframe filled with user defined data
"""
return
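
For context, the new return_values() abstract method might be filled in by a user's IFreqaiModel subclass roughly as in the sketch below. This is a minimal, hypothetical example: the prediction/do_predict/target_mean/target_std column names are illustrative, and it assumes the dh.full_predictions, dh.full_do_predict, dh.full_target_mean, and dh.full_target_std arrays referenced earlier in the diff are already aligned with the strategy dataframe.

from pandas import DataFrame

# inside a user's IFreqaiModel subclass (hypothetical example):
def return_values(self, dataframe: DataFrame, dh: "FreqaiDataKitchen") -> DataFrame:
    # attach the model outputs as columns so the strategy can consume them directly
    dataframe["prediction"] = dh.full_predictions
    dataframe["do_predict"] = dh.full_do_predict
    dataframe["target_mean"] = dh.full_target_mean
    dataframe["target_std"] = dh.full_target_std
    return dataframe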