add config asserts, use .get method with default values for optional functionality, move data_cleaning_* to freqai_interface (away from user custom pred model) since it is controlled by config params.

This commit is contained in:
robcaulk 2022-05-23 12:07:09 +02:00
parent dede128648
commit e1c068ca66
4 changed files with 162 additions and 93 deletions

View File

@ -43,6 +43,7 @@ class FreqaiDataKitchen:
self.data: Dict[Any, Any] = {} self.data: Dict[Any, Any] = {}
self.data_dictionary: Dict[Any, Any] = {} self.data_dictionary: Dict[Any, Any] = {}
self.config = config self.config = config
self.assert_config(self.config, live)
self.freqai_config = config["freqai"] self.freqai_config = config["freqai"]
self.predictions: npt.ArrayLike = np.array([]) self.predictions: npt.ArrayLike = np.array([])
self.do_predict: npt.ArrayLike = np.array([]) self.do_predict: npt.ArrayLike = np.array([])
@ -59,7 +60,7 @@ class FreqaiDataKitchen:
self.svm_model: linear_model.SGDOneClassSVM = None self.svm_model: linear_model.SGDOneClassSVM = None
if not self.live: if not self.live:
self.full_timerange = self.create_fulltimerange(self.config["timerange"], self.full_timerange = self.create_fulltimerange(self.config["timerange"],
self.freqai_config["train_period"] self.freqai_config.get("train_period")
) )
(self.training_timeranges, self.backtesting_timeranges) = self.split_timerange( (self.training_timeranges, self.backtesting_timeranges) = self.split_timerange(
@ -68,14 +69,33 @@ class FreqaiDataKitchen:
config["freqai"]["backtest_period"], config["freqai"]["backtest_period"],
) )
def assert_config(self, config: Dict[str, Any], live: bool) -> None:
    """
    Check that the mandatory Freqai entries are present in the user configuration.
    :params:
    :config: full configuration dictionary; only its 'freqai' section is inspected.
    :live: when False, backtest_period must additionally be a plain int.
    :raises:
    :AssertionError: if a required entry is missing/empty or has the wrong type.
    """
    # NOTE: these are plain assert statements, so they are skipped when Python runs with -O.
    freqai_config = config.get('freqai')
    assert freqai_config, "No Freqai parameters found in config file."

    train_period = freqai_config.get('train_period')
    assert train_period, "No Freqai train_period found in config file."
    # Exact type comparison on purpose: anything that is not a plain int (e.g. 1.5) fails.
    assert type(train_period) is int, ("Can only train on full day period. "
                                       "No fractional days permitted.")

    backtest_period = freqai_config.get('backtest_period')
    assert backtest_period, "No Freqai backtest_period found in config file."
    if not live:
        assert type(backtest_period) is int, ("Can only backtest on full day backtest_period. "
                                              "Only live/dry mode allows fractions of days")

    assert freqai_config.get('identifier'), "No Freqai identifier found in config file."
    assert freqai_config.get('feature_parameters'), ("No Freqai feature_parameters found in "
                                                     "config file.")
def set_paths(self) -> None: def set_paths(self) -> None:
self.full_path = Path(self.config['user_data_dir'] / self.full_path = Path(self.config['user_data_dir'] /
"models" / "models" /
str(self.freqai_config['live_full_backtestrange'] + str(self.freqai_config.get('live_full_backtestrange') +
self.freqai_config['identifier'])) self.freqai_config.get('identifier')))
self.model_path = Path(self.full_path / str("sub-train" + "-" + self.model_path = Path(self.full_path / str("sub-train" + "-" +
str(self.freqai_config['live_trained_timerange']))) str(self.freqai_config.get('live_trained_timerange'))))
return return
@ -117,7 +137,7 @@ class FreqaiDataKitchen:
# do not want them having to edit the default save/load methods here. Below is an example # do not want them having to edit the default save/load methods here. Below is an example
# of what we do NOT want. # of what we do NOT want.
# if self.freqai_config['feature_parameters']['determine_statistical_distributions']: # if self.freqai_config.get('feature_parameters','determine_statistical_distributions'):
# self.data_dictionary["upper_quantiles"].to_pickle( # self.data_dictionary["upper_quantiles"].to_pickle(
# save_path / str(self.model_filename + "_upper_quantiles.pkl") # save_path / str(self.model_filename + "_upper_quantiles.pkl")
# ) # )
@ -147,7 +167,7 @@ class FreqaiDataKitchen:
# do not want them having to edit the default save/load methods here. Below is an example # do not want them having to edit the default save/load methods here. Below is an example
# of what we do NOT want. # of what we do NOT want.
# if self.freqai_config['feature_parameters']['determine_statistical_distributions']: # if self.freqai_config.get('feature_parameters','determine_statistical_distributions'):
# self.data_dictionary["upper_quantiles"] = pd.read_pickle( # self.data_dictionary["upper_quantiles"] = pd.read_pickle(
# self.model_path / str(self.model_filename + "_upper_quantiles.pkl") # self.model_path / str(self.model_filename + "_upper_quantiles.pkl")
# ) # )
@ -193,15 +213,15 @@ class FreqaiDataKitchen:
""" """
weights: npt.ArrayLike weights: npt.ArrayLike
if self.config["freqai"]["feature_parameters"]["weight_factor"] > 0: if self.freqai_config["feature_parameters"].get("weight_factor", 0) > 0:
weights = self.set_weights_higher_recent(len(filtered_dataframe)) weights = self.set_weights_higher_recent(len(filtered_dataframe))
else: else:
weights = np.ones(len(filtered_dataframe)) weights = np.ones(len(filtered_dataframe))
if self.config["freqai"]["feature_parameters"]["stratify"] > 0: if self.freqai_config["feature_parameters"].get("stratify", 0) > 0:
stratification = np.zeros(len(filtered_dataframe)) stratification = np.zeros(len(filtered_dataframe))
for i in range(1, len(stratification)): for i in range(1, len(stratification)):
if i % self.config["freqai"]["feature_parameters"]["stratify"] == 0: if i % self.freqai_config.get("feature_parameters", {}).get("stratify", 0) == 0:
stratification[i] = 1 stratification[i] = 1
( (
@ -525,6 +545,14 @@ class FreqaiDataKitchen:
return None return None
def pca_transform(self, filtered_dataframe: DataFrame) -> None:
    """
    Run self.pca.transform on the given features and store the result as a DataFrame
    under self.data_dictionary["prediction_features"].
    :params:
    :filtered_dataframe: features handed to self.pca.transform; its index is reused
    for the stored frame.
    """
    transformed = self.pca.transform(filtered_dataframe)
    # One "PC<i>" column per kept component, as recorded in self.data.
    component_names = [f"PC{idx}" for idx in range(self.data["n_kept_components"])]
    self.data_dictionary["prediction_features"] = pd.DataFrame(
        transformed, index=filtered_dataframe.index, columns=component_names
    )
def compute_distances(self) -> float: def compute_distances(self) -> float:
logger.info("computing average mean distance for all training points") logger.info("computing average mean distance for all training points")
pairwise = pairwise_distances(self.data_dictionary["train_features"], n_jobs=-1) pairwise = pairwise_distances(self.data_dictionary["train_features"], n_jobs=-1)
@ -675,7 +703,7 @@ class FreqaiDataKitchen:
self.full_path = Path( self.full_path = Path(
self.config["user_data_dir"] self.config["user_data_dir"]
/ "models" / "models"
/ str(full_timerange + self.freqai_config["identifier"]) / str(full_timerange + self.freqai_config.get("identifier"))
) )
config_path = Path(self.config["config_files"][0]) config_path = Path(self.config["config_files"][0])
@ -696,13 +724,15 @@ class FreqaiDataKitchen:
if trained_timerange.startts != 0: if trained_timerange.startts != 0:
elapsed_time = (time - trained_timerange.stopts) / SECONDS_IN_DAY elapsed_time = (time - trained_timerange.stopts) / SECONDS_IN_DAY
retrain = elapsed_time > self.freqai_config['backtest_period'] retrain = elapsed_time > self.freqai_config.get('backtest_period')
if retrain: if retrain:
trained_timerange.startts += self.freqai_config['backtest_period'] * SECONDS_IN_DAY trained_timerange.startts += self.freqai_config.get(
trained_timerange.stopts += self.freqai_config['backtest_period'] * SECONDS_IN_DAY 'backtest_period', 0) * SECONDS_IN_DAY
trained_timerange.stopts += self.freqai_config.get(
'backtest_period', 0) * SECONDS_IN_DAY
else: # user passed no live_trained_timerange in config else: # user passed no live_trained_timerange in config
trained_timerange = TimeRange() trained_timerange = TimeRange()
trained_timerange.startts = int(time - self.freqai_config['train_period'] * trained_timerange.startts = int(time - self.freqai_config.get('train_period') *
SECONDS_IN_DAY) SECONDS_IN_DAY)
trained_timerange.stopts = int(time) trained_timerange.stopts = int(time)
retrain = True retrain = True
@ -725,13 +755,13 @@ class FreqaiDataKitchen:
exchange = ExchangeResolver.load_exchange(self.config['exchange']['name'], exchange = ExchangeResolver.load_exchange(self.config['exchange']['name'],
self.config, validate=False) self.config, validate=False)
pairs = self.freqai_config['corr_pairlist'] pairs = self.freqai_config.get('corr_pairlist', [])
if metadata['pair'] not in pairs: if metadata['pair'] not in pairs:
pairs += metadata['pair'] # dont include pair twice pairs += metadata['pair'] # dont include pair twice
# timerange = TimeRange.parse_timerange(new_timerange) # timerange = TimeRange.parse_timerange(new_timerange)
refresh_backtest_ohlcv_data( refresh_backtest_ohlcv_data(
exchange, pairs=pairs, timeframes=self.freqai_config['timeframes'], exchange, pairs=pairs, timeframes=self.freqai_config.get('timeframes'),
datadir=self.config['datadir'], timerange=timerange, datadir=self.config['datadir'], timerange=timerange,
new_pairs_days=self.config['new_pairs_days'], new_pairs_days=self.config['new_pairs_days'],
erase=False, data_format=self.config['dataformat_ohlcv'], erase=False, data_format=self.config['dataformat_ohlcv'],
@ -743,21 +773,22 @@ class FreqaiDataKitchen:
DataFrame]: DataFrame]:
corr_dataframes: Dict[Any, Any] = {} corr_dataframes: Dict[Any, Any] = {}
base_dataframes: Dict[Any, Any] = {} base_dataframes: Dict[Any, Any] = {}
pairs = self.freqai_config['corr_pairlist'] # + [metadata['pair']] pairs = self.freqai_config.get('corr_pairlist', []) # + [metadata['pair']]
# timerange = TimeRange.parse_timerange(new_timerange) # timerange = TimeRange.parse_timerange(new_timerange)
for tf in self.freqai_config['timeframes']: for tf in self.freqai_config.get('timeframes'):
base_dataframes[tf] = load_pair_history(datadir=self.config['datadir'], base_dataframes[tf] = load_pair_history(datadir=self.config['datadir'],
timeframe=tf, timeframe=tf,
pair=metadata['pair'], timerange=timerange) pair=metadata['pair'], timerange=timerange)
for p in pairs: if pairs:
if metadata['pair'] in p: for p in pairs:
continue # dont repeat anything from whitelist if metadata['pair'] in p:
if p not in corr_dataframes: continue # dont repeat anything from whitelist
corr_dataframes[p] = {} if p not in corr_dataframes:
corr_dataframes[p][tf] = load_pair_history(datadir=self.config['datadir'], corr_dataframes[p] = {}
timeframe=tf, corr_dataframes[p][tf] = load_pair_history(datadir=self.config['datadir'],
pair=p, timerange=timerange) timeframe=tf,
pair=p, timerange=timerange)
return corr_dataframes, base_dataframes return corr_dataframes, base_dataframes
@ -767,23 +798,25 @@ class FreqaiDataKitchen:
metadata: dict) -> DataFrame: metadata: dict) -> DataFrame:
dataframe = base_dataframes[self.config['timeframe']] dataframe = base_dataframes[self.config['timeframe']]
pairs = self.freqai_config.get("corr_pairlist", [])
for tf in self.freqai_config["timeframes"]: for tf in self.freqai_config.get("timeframes"):
dataframe = strategy.populate_any_indicators(metadata['pair'], dataframe = strategy.populate_any_indicators(metadata['pair'],
dataframe.copy(), dataframe.copy(),
tf, tf,
base_dataframes[tf], base_dataframes[tf],
coin=metadata['pair'].split("/")[0] + "-" coin=metadata['pair'].split("/")[0] + "-"
) )
for i in self.freqai_config["corr_pairlist"]: if pairs:
if metadata['pair'] in i: for i in pairs:
continue # dont repeat anything from whitelist if metadata['pair'] in i:
dataframe = strategy.populate_any_indicators(i, continue # dont repeat anything from whitelist
dataframe.copy(), dataframe = strategy.populate_any_indicators(i,
tf, dataframe.copy(),
corr_dataframes[i][tf], tf,
coin=i.split("/")[0] + "-" corr_dataframes[i][tf],
) coin=i.split("/")[0] + "-"
)
return dataframe return dataframe

View File

@ -20,7 +20,7 @@ from freqtrade.strategy.interface import IStrategy
pd.options.mode.chained_assignment = None pd.options.mode.chained_assignment = None
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# FIXME: suppress stdout for background training # FIXME: suppress stdout for background training?
# class DummyFile(object): # class DummyFile(object):
# def write(self, x): pass # def write(self, x): pass
@ -51,6 +51,7 @@ class IFreqaiModel(ABC):
def __init__(self, config: Dict[str, Any]) -> None: def __init__(self, config: Dict[str, Any]) -> None:
self.config = config self.config = config
self.assert_config(self.config)
self.freqai_info = config["freqai"] self.freqai_info = config["freqai"]
self.data_split_parameters = config["freqai"]["data_split_parameters"] self.data_split_parameters = config["freqai"]["data_split_parameters"]
self.model_training_parameters = config["freqai"]["model_training_parameters"] self.model_training_parameters = config["freqai"]["model_training_parameters"]
@ -64,12 +65,25 @@ class IFreqaiModel(ABC):
self.training_on_separate_thread = False self.training_on_separate_thread = False
self.retrain = False self.retrain = False
self.first = True self.first = True
if self.freqai_info['live_trained_timerange']: if self.freqai_info.get('live_trained_timerange'):
self.new_trained_timerange = TimeRange.parse_timerange( self.new_trained_timerange = TimeRange.parse_timerange(
self.freqai_info['live_trained_timerange']) self.freqai_info['live_trained_timerange'])
else: else:
self.new_trained_timerange = TimeRange() self.new_trained_timerange = TimeRange()
def assert_config(self, config: Dict[str, Any]) -> None:
    """
    Check that the Freqai sections required by the model interface exist in the config.
    :params:
    :config: full configuration dictionary; only its 'freqai' section is inspected.
    :raises:
    :AssertionError: if 'freqai' or one of the required sub-sections is missing/empty.
    """
    # NOTE: these are plain assert statements, so they are skipped when Python runs with -O.
    freqai_config = config.get('freqai')
    assert freqai_config, "No Freqai parameters found in config file."
    assert freqai_config.get('data_split_parameters'), (
        "No Freqai data_split_parameters found in config file.")
    assert freqai_config.get('model_training_parameters'), (
        "No Freqai model_training_parameters found in config file.")
    assert freqai_config.get('feature_parameters'), (
        "No Freqai feature_parameters found in config file.")
def start(self, dataframe: DataFrame, metadata: dict, strategy: IStrategy) -> DataFrame: def start(self, dataframe: DataFrame, metadata: dict, strategy: IStrategy) -> DataFrame:
""" """
Entry point to the FreqaiModel, it will train a new model if Entry point to the FreqaiModel, it will train a new model if
@ -192,55 +206,30 @@ class IFreqaiModel(ABC):
return return
@abstractmethod
def train(self, unfiltered_dataframe: DataFrame, metadata: dict) -> Any:
"""
Filter the training data and train a model to it. Train makes heavy use of the datahandler
for storing, saving, loading, and analyzing the data.
:params:
:unfiltered_dataframe: Full dataframe for the current training period
:metadata: pair metadata from strategy.
:returns:
:model: Trained model which can be used to inference (self.predict)
"""
@abstractmethod
def fit(self) -> Any:
"""
Most regressors use the same function names and arguments e.g. user
can drop in LGBMRegressor in place of CatBoostRegressor and all data
management will be properly handled by Freqai.
:params:
:data_dictionary: the dictionary constructed by DataHandler to hold
all the training and test data/labels.
"""
return
@abstractmethod
def predict(self, dataframe: DataFrame, metadata: dict) -> Tuple[npt.ArrayLike, npt.ArrayLike]:
"""
Filter the prediction features data and predict with it.
:param: unfiltered_dataframe: Full dataframe for the current backtest period.
:return:
:predictions: np.array of predictions
:do_predict: np.array of 1s and 0s to indicate places where freqai needed to remove
data (NaNs) or felt uncertain about data (PCA and DI index)
"""
@abstractmethod
def data_cleaning_train(self) -> None: def data_cleaning_train(self) -> None:
""" """
User can add data analysis and cleaning here. Base data cleaning method for train
Any function inside this method should drop training data points from the filtered_dataframe Any function inside this method should drop training data points from the filtered_dataframe
based on user decided logic. See FreqaiDataKitchen::remove_outliers() for an example based on user decided logic. See FreqaiDataKitchen::remove_outliers() for an example
of how outlier data points are dropped from the dataframe used for training. of how outlier data points are dropped from the dataframe used for training.
""" """
if self.freqai_info.get('feature_parameters', {}).get('principal_component_analysis'):
self.dh.principal_component_analysis()
@abstractmethod # if self.feature_parameters["determine_statistical_distributions"]:
def data_cleaning_predict(self) -> None: # self.dh.determine_statistical_distributions()
# if self.feature_parameters["remove_outliers"]:
# self.dh.remove_outliers(predict=False)
if self.freqai_info.get('feature_parameters', {}).get('use_SVM_to_remove_outliers'):
self.dh.use_SVM_to_remove_outliers(predict=False)
if self.freqai_info.get('feature_parameters', {}).get('DI_threshold'):
self.dh.data["avg_mean_dist"] = self.dh.compute_distances()
def data_cleaning_predict(self, filtered_dataframe: DataFrame) -> None:
""" """
User can add data analysis and cleaning here. Base data cleaning method for predict.
These functions each modify self.dh.do_predict, which is a dataframe with equal length These functions each modify self.dh.do_predict, which is a dataframe with equal length
to the number of candles coming from and returning to the strategy. Inside do_predict, to the number of candles coming from and returning to the strategy. Inside do_predict,
1 allows prediction and < 0 signals to the strategy that the model is not confident in 1 allows prediction and < 0 signals to the strategy that the model is not confident in
@ -249,6 +238,19 @@ class IFreqaiModel(ABC):
of how the do_predict vector is modified. do_predict is ultimately passed back to strategy of how the do_predict vector is modified. do_predict is ultimately passed back to strategy
for buy signals. for buy signals.
""" """
if self.freqai_info.get('feature_parameters', {}).get('principal_component_analysis'):
self.dh.pca_transform()
# if self.feature_parameters["determine_statistical_distributions"]:
# self.dh.determine_statistical_distributions()
# if self.feature_parameters["remove_outliers"]:
# self.dh.remove_outliers(predict=True) # creates dropped index
if self.freqai_info.get('feature_parameters', {}).get('use_SVM_to_remove_outliers'):
self.dh.use_SVM_to_remove_outliers(predict=True)
if self.freqai_info.get('feature_parameters', {}).get('DI_threshold'):
self.dh.check_if_pred_in_training_spaces() # sets do_predict
def model_exists(self, pair: str, training_timerange: str) -> bool: def model_exists(self, pair: str, training_timerange: str) -> bool:
""" """
@ -303,3 +305,42 @@ class IFreqaiModel(ABC):
self.model = self.train(unfiltered_dataframe, metadata) self.model = self.train(unfiltered_dataframe, metadata)
self.dh.save_data(self.model) self.dh.save_data(self.model)
self.retrain = False self.retrain = False
# Methods which are overridden by user made prediction models.
# See freqai/prediction_models/CatboostPredictionModel.py for an example.
@abstractmethod
def train(self, unfiltered_dataframe: DataFrame, metadata: dict) -> Any:
    """
    Abstract hook: filter the training data and fit a model to it. Implementations are
    expected to lean on the datahandler for storing, saving, loading, and analyzing data.
    :params:
    :unfiltered_dataframe: full dataframe for the current training period
    :metadata: pair metadata coming from the strategy
    :returns:
    :model: trained model that can later be used for inference (self.predict)
    """
@abstractmethod
def fit(self) -> Any:
    """
    Abstract hook for fitting the regressor. Most regressors share the same function
    names and arguments, so a user can e.g. drop in LGBMRegressor in place of
    CatBoostRegressor while Freqai keeps handling all data management.
    The training and test data/labels live in the data dictionary constructed by the
    DataHandler.
    """
@abstractmethod
def predict(self, dataframe: DataFrame, metadata: dict) -> Tuple[npt.ArrayLike, npt.ArrayLike]:
    """
    Abstract hook: filter the prediction features and predict with them.
    :params:
    :dataframe: full (unfiltered) dataframe for the current backtest period
    :metadata: pair metadata coming from the strategy
    :returns:
    :predictions: np.array of predictions
    :do_predict: np.array of 1s and 0s marking where freqai had to remove data (NaNs)
    or felt uncertain about the data (PCA and DI index)
    """

View File

@ -1,7 +1,6 @@
import logging import logging
from typing import Any, Dict, Tuple from typing import Any, Dict, Tuple
import pandas as pd
from catboost import CatBoostRegressor, Pool from catboost import CatBoostRegressor, Pool
from pandas import DataFrame from pandas import DataFrame
@ -149,7 +148,7 @@ class CatboostPredictionModel(IFreqaiModel):
based on user decided logic. See FreqaiDataKitchen::remove_outliers() for an example based on user decided logic. See FreqaiDataKitchen::remove_outliers() for an example
of how outlier data points are dropped from the dataframe used for training. of how outlier data points are dropped from the dataframe used for training.
""" """
if self.feature_parameters["principal_component_analysis"]: if self.freqai_info.get('feature_parameters', {}).get('principal_component_analysis'):
self.dh.principal_component_analysis() self.dh.principal_component_analysis()
# if self.feature_parameters["determine_statistical_distributions"]: # if self.feature_parameters["determine_statistical_distributions"]:
@ -157,9 +156,10 @@ class CatboostPredictionModel(IFreqaiModel):
# if self.feature_parameters["remove_outliers"]: # if self.feature_parameters["remove_outliers"]:
# self.dh.remove_outliers(predict=False) # self.dh.remove_outliers(predict=False)
if self.feature_parameters["use_SVM_to_remove_outliers"]: if self.freqai_info.get('feature_parameters', {}).get('use_SVM_to_remove_outliers'):
self.dh.use_SVM_to_remove_outliers(predict=False) self.dh.use_SVM_to_remove_outliers(predict=False)
if self.feature_parameters["DI_threshold"]:
if self.freqai_info.get('feature_parameters', {}).get('DI_threshold'):
self.dh.data["avg_mean_dist"] = self.dh.compute_distances() self.dh.data["avg_mean_dist"] = self.dh.compute_distances()
def data_cleaning_predict(self, filtered_dataframe: DataFrame) -> None: def data_cleaning_predict(self, filtered_dataframe: DataFrame) -> None:
@ -173,21 +173,16 @@ class CatboostPredictionModel(IFreqaiModel):
of how the do_predict vector is modified. do_predict is ultimately passed back to strategy of how the do_predict vector is modified. do_predict is ultimately passed back to strategy
for buy signals. for buy signals.
""" """
if self.feature_parameters["principal_component_analysis"]: if self.freqai_info.get('feature_parameters', {}).get('principal_component_analysis'):
pca_components = self.dh.pca.transform(filtered_dataframe) self.dh.pca_transform()
self.dh.data_dictionary["prediction_features"] = pd.DataFrame(
data=pca_components,
columns=["PC" + str(i) for i in range(0, self.dh.data["n_kept_components"])],
index=filtered_dataframe.index,
)
# if self.feature_parameters["determine_statistical_distributions"]: # if self.feature_parameters["determine_statistical_distributions"]:
# self.dh.determine_statistical_distributions() # self.dh.determine_statistical_distributions()
# if self.feature_parameters["remove_outliers"]: # if self.feature_parameters["remove_outliers"]:
# self.dh.remove_outliers(predict=True) # creates dropped index # self.dh.remove_outliers(predict=True) # creates dropped index
if self.feature_parameters["use_SVM_to_remove_outliers"]: if self.freqai_info.get('feature_parameters', {}).get('use_SVM_to_remove_outliers'):
self.dh.use_SVM_to_remove_outliers(predict=True) self.dh.use_SVM_to_remove_outliers(predict=True)
if self.feature_parameters["DI_threshold"]: if self.freqai_info.get('feature_parameters', {}).get('DI_threshold'):
self.dh.check_if_pred_in_training_spaces() # sets do_predict self.dh.check_if_pred_in_training_spaces() # sets do_predict

View File

@ -207,7 +207,7 @@ class Backtesting:
if self.config.get('freqai') is not None: if self.config.get('freqai') is not None:
self.required_startup += int((self.config.get('freqai', {}).get('train_period') * self.required_startup += int((self.config.get('freqai', {}).get('train_period') *
86400) / timeframe_to_seconds(self.config['timeframe'])) 86400) / timeframe_to_seconds(self.config['timeframe']))
logger.info("Increasing startup_candle_count for freqai to %s", self.required_startup) logger.info(f'Increasing startup_candle_count for freqai to {self.required_startup}')
self.config['startup_candle_count'] = self.required_startup self.config['startup_candle_count'] = self.required_startup
data = history.load_data( data = history.load_data(