Add config asserts; use the .get() method with default values for optional functionality; move data_cleaning_* to freqai_interface (away from the user's custom prediction model) since it is controlled by config params.

2022-05-23 12:07:09 +02:00 · 2022-05-23 12:07:09 +02:00 · e1c068ca66
commit e1c068ca66
parent dede128648
4 changed files with 162 additions and 93 deletions
--- a/freqtrade/freqai/data_kitchen.py
+++ b/freqtrade/freqai/data_kitchen.py
@ -43,6 +43,7 @@ class FreqaiDataKitchen:
        self.data: Dict[Any, Any] = {}
        self.data_dictionary: Dict[Any, Any] = {}
        self.config = config
        self.assert_config(self.config, live)
        self.freqai_config = config["freqai"]
        self.predictions: npt.ArrayLike = np.array([])
        self.do_predict: npt.ArrayLike = np.array([])
@ -59,7 +60,7 @@ class FreqaiDataKitchen:
        self.svm_model: linear_model.SGDOneClassSVM = None
        if not self.live:
            self.full_timerange = self.create_fulltimerange(self.config["timerange"],
-                                                            self.freqai_config["train_period"]
+                                                            self.freqai_config.get("train_period")
                                                            )
            (self.training_timeranges, self.backtesting_timeranges) = self.split_timerange(
@ -68,14 +69,33 @@ class FreqaiDataKitchen:
                config["freqai"]["backtest_period"],
            )
    def assert_config(self, config: Dict[str, Any], live: bool) -> None:
        """
        Check that the mandatory Freqai entries are present in the configuration.
        :params:
        :config: full configuration dictionary, expected to contain a 'freqai' section.
        :live: when False, backtest_period must additionally be an int (whole days).
        :raises:
        :AssertionError: if a mandatory entry is missing/falsy, or if a period that must
        be a whole number of days is not an int.
        """
        # NOTE: plain asserts are skipped when Python runs with -O; they are kept here so
        # callers that expect AssertionError continue to work.
        freqai_config = config.get('freqai')
        assert freqai_config, "No Freqai parameters found in config file."

        train_period = freqai_config.get('train_period')
        assert train_period, "No Freqai train_period found in config file."
        # exact type check on purpose: floats (and bools) are rejected
        assert type(train_period) is int, ('Can only train on full day period. '
                                           'No fractional days permitted.')

        backtest_period = freqai_config.get('backtest_period')
        assert backtest_period, "No Freqai backtest_period found in config file."
        if not live:
            assert type(backtest_period) is int, ('Can only backtest on full day '
                                                  'backtest_period. Only live/dry mode '
                                                  'allows fractions of days')

        assert freqai_config.get('identifier'), "No Freqai identifier found in config file."
        assert freqai_config.get('feature_parameters'), ("No Freqai feature_parameters "
                                                         "found in config file.")
    def set_paths(self) -> None:
        self.full_path = Path(self.config['user_data_dir'] /
                              "models" /
-                              str(self.freqai_config['live_full_backtestrange'] +
+                              str(self.freqai_config.get('live_full_backtestrange') +
-                                  self.freqai_config['identifier']))
+                                  self.freqai_config.get('identifier')))
        self.model_path = Path(self.full_path / str("sub-train" + "-" +
-                               str(self.freqai_config['live_trained_timerange'])))
+                               str(self.freqai_config.get('live_trained_timerange'))))
        return
@ -117,7 +137,7 @@ class FreqaiDataKitchen:
        # do not want them having to edit the default save/load methods here. Below is an example
        # of what we do NOT want.
-        # if self.freqai_config['feature_parameters']['determine_statistical_distributions']:
+        # if self.freqai_config.get('feature_parameters','determine_statistical_distributions'):
        #     self.data_dictionary["upper_quantiles"].to_pickle(
        #         save_path / str(self.model_filename + "_upper_quantiles.pkl")
        #     )
@ -147,7 +167,7 @@ class FreqaiDataKitchen:
        # do not want them having to edit the default save/load methods here. Below is an example
        # of what we do NOT want.
-        # if self.freqai_config['feature_parameters']['determine_statistical_distributions']:
+        # if self.freqai_config.get('feature_parameters','determine_statistical_distributions'):
        #     self.data_dictionary["upper_quantiles"] = pd.read_pickle(
        #         self.model_path / str(self.model_filename + "_upper_quantiles.pkl")
        #     )
@ -193,15 +213,15 @@ class FreqaiDataKitchen:
        """
        weights: npt.ArrayLike
-        if self.config["freqai"]["feature_parameters"]["weight_factor"] > 0:
+        if self.freqai_config["feature_parameters"].get("weight_factor", 0) > 0:
            weights = self.set_weights_higher_recent(len(filtered_dataframe))
        else:
            weights = np.ones(len(filtered_dataframe))
-        if self.config["freqai"]["feature_parameters"]["stratify"] > 0:
+        if self.freqai_config["feature_parameters"].get("stratify", 0) > 0:
            stratification = np.zeros(len(filtered_dataframe))
            for i in range(1, len(stratification)):
-                if i % self.config["freqai"]["feature_parameters"]["stratify"] == 0:
+                if i % self.freqai_config.get("feature_parameters", {}).get("stratify", 0) == 0:
                    stratification[i] = 1
        (
@ -525,6 +545,14 @@ class FreqaiDataKitchen:
        return None
    def pca_transform(self, filtered_dataframe: DataFrame) -> None:
        """
        Run self.pca.transform on the given features and store the result as a DataFrame
        in self.data_dictionary["prediction_features"].
        :params:
        :filtered_dataframe: features handed to self.pca.transform; its index is reused
        for the stored DataFrame.
        """
        transformed = self.pca.transform(filtered_dataframe)
        # one "PC<i>" column per kept component
        n_components = self.data["n_kept_components"]
        component_names = [f"PC{idx}" for idx in range(n_components)]
        self.data_dictionary["prediction_features"] = pd.DataFrame(
            data=transformed,
            columns=component_names,
            index=filtered_dataframe.index,
        )
    def compute_distances(self) -> float:
        logger.info("computing average mean distance for all training points")
        pairwise = pairwise_distances(self.data_dictionary["train_features"], n_jobs=-1)
@ -675,7 +703,7 @@ class FreqaiDataKitchen:
        self.full_path = Path(
            self.config["user_data_dir"]
            / "models"
-            / str(full_timerange + self.freqai_config["identifier"])
+            / str(full_timerange + self.freqai_config.get("identifier"))
        )
        config_path = Path(self.config["config_files"][0])
@ -696,13 +724,15 @@ class FreqaiDataKitchen:
        if trained_timerange.startts != 0:
            elapsed_time = (time - trained_timerange.stopts) / SECONDS_IN_DAY
-            retrain = elapsed_time > self.freqai_config['backtest_period']
+            retrain = elapsed_time > self.freqai_config.get('backtest_period')
            if retrain:
-                trained_timerange.startts += self.freqai_config['backtest_period'] * SECONDS_IN_DAY
+                trained_timerange.startts += self.freqai_config.get(
-                trained_timerange.stopts += self.freqai_config['backtest_period'] * SECONDS_IN_DAY
+                                             'backtest_period', 0) * SECONDS_IN_DAY
                trained_timerange.stopts += self.freqai_config.get(
                                            'backtest_period', 0) * SECONDS_IN_DAY
        else:  # user passed no live_trained_timerange in config
            trained_timerange = TimeRange()
-            trained_timerange.startts = int(time - self.freqai_config['train_period'] *
+            trained_timerange.startts = int(time - self.freqai_config.get('train_period') *
                                            SECONDS_IN_DAY)
            trained_timerange.stopts = int(time)
            retrain = True
@ -725,13 +755,13 @@ class FreqaiDataKitchen:
        exchange = ExchangeResolver.load_exchange(self.config['exchange']['name'],
                                                  self.config, validate=False)
-        pairs = self.freqai_config['corr_pairlist']
+        pairs = self.freqai_config.get('corr_pairlist', [])
        if metadata['pair'] not in pairs:
            pairs += metadata['pair']  # dont include pair twice
        # timerange = TimeRange.parse_timerange(new_timerange)
        refresh_backtest_ohlcv_data(
-                        exchange, pairs=pairs, timeframes=self.freqai_config['timeframes'],
+                        exchange, pairs=pairs, timeframes=self.freqai_config.get('timeframes'),
                        datadir=self.config['datadir'], timerange=timerange,
                        new_pairs_days=self.config['new_pairs_days'],
                        erase=False, data_format=self.config['dataformat_ohlcv'],
@ -743,21 +773,22 @@ class FreqaiDataKitchen:
                                                                                  DataFrame]:
        corr_dataframes: Dict[Any, Any] = {}
        base_dataframes: Dict[Any, Any] = {}
-        pairs = self.freqai_config['corr_pairlist']  # + [metadata['pair']]
+        pairs = self.freqai_config.get('corr_pairlist', [])  # + [metadata['pair']]
        # timerange = TimeRange.parse_timerange(new_timerange)
-        for tf in self.freqai_config['timeframes']:
+        for tf in self.freqai_config.get('timeframes'):
            base_dataframes[tf] = load_pair_history(datadir=self.config['datadir'],
                                                    timeframe=tf,
                                                    pair=metadata['pair'], timerange=timerange)
-            for p in pairs:
+            if pairs:
-                if metadata['pair'] in p:
+                for p in pairs:
-                    continue  # dont repeat anything from whitelist
+                    if metadata['pair'] in p:
-                if p not in corr_dataframes:
+                        continue  # dont repeat anything from whitelist
-                    corr_dataframes[p] = {}
+                    if p not in corr_dataframes:
-                corr_dataframes[p][tf] = load_pair_history(datadir=self.config['datadir'],
+                        corr_dataframes[p] = {}
-                                                           timeframe=tf,
+                    corr_dataframes[p][tf] = load_pair_history(datadir=self.config['datadir'],
-                                                           pair=p, timerange=timerange)
+                                                               timeframe=tf,
                                                               pair=p, timerange=timerange)
        return corr_dataframes, base_dataframes
@ -767,23 +798,25 @@ class FreqaiDataKitchen:
                                            metadata: dict) -> DataFrame:
        dataframe = base_dataframes[self.config['timeframe']]
        pairs = self.freqai_config.get("corr_pairlist", [])
-        for tf in self.freqai_config["timeframes"]:
+        for tf in self.freqai_config.get("timeframes"):
            dataframe = strategy.populate_any_indicators(metadata['pair'],
                                                         dataframe.copy(),
                                                         tf,
                                                         base_dataframes[tf],
                                                         coin=metadata['pair'].split("/")[0] + "-"
                                                         )
-            for i in self.freqai_config["corr_pairlist"]:
+            if pairs:
-                if metadata['pair'] in i:
+                for i in pairs:
-                    continue  # dont repeat anything from whitelist
+                    if metadata['pair'] in i:
-                dataframe = strategy.populate_any_indicators(i,
+                        continue  # dont repeat anything from whitelist
-                                                             dataframe.copy(),
+                    dataframe = strategy.populate_any_indicators(i,
-                                                             tf,
+                                                                 dataframe.copy(),
-                                                             corr_dataframes[i][tf],
+                                                                 tf,
-                                                             coin=i.split("/")[0] + "-"
+                                                                 corr_dataframes[i][tf],
-                                                             )
+                                                                 coin=i.split("/")[0] + "-"
                                                                 )
        return dataframe
--- a/freqtrade/freqai/freqai_interface.py
+++ b/freqtrade/freqai/freqai_interface.py
@ -20,7 +20,7 @@ from freqtrade.strategy.interface import IStrategy
 pd.options.mode.chained_assignment = None
 logger = logging.getLogger(__name__)
-# FIXME: suppress stdout for background training
+# FIXME: suppress stdout for background training?
 # class DummyFile(object):
 #     def write(self, x): pass
@ -51,6 +51,7 @@ class IFreqaiModel(ABC):
    def __init__(self, config: Dict[str, Any]) -> None:
        self.config = config
        self.assert_config(self.config)
        self.freqai_info = config["freqai"]
        self.data_split_parameters = config["freqai"]["data_split_parameters"]
        self.model_training_parameters = config["freqai"]["model_training_parameters"]
@ -64,12 +65,25 @@ class IFreqaiModel(ABC):
        self.training_on_separate_thread = False
        self.retrain = False
        self.first = True
-        if self.freqai_info['live_trained_timerange']:
+        if self.freqai_info.get('live_trained_timerange'):
            self.new_trained_timerange = TimeRange.parse_timerange(
                                                   self.freqai_info['live_trained_timerange'])
        else:
            self.new_trained_timerange = TimeRange()
    def assert_config(self, config: Dict[str, Any]) -> None:
        """
        Check that the Freqai sections needed by the model interface are present.
        :params:
        :config: full configuration dictionary, expected to contain a 'freqai' section.
        :raises:
        :AssertionError: if 'freqai' or one of its mandatory sub-sections is missing/falsy.
        """
        # NOTE: plain asserts are skipped when Python runs with -O; they are kept here so
        # callers that expect AssertionError continue to work.
        freqai_config = config.get('freqai')
        assert freqai_config, "No Freqai parameters found in config file."

        mandatory_sections = (
            'data_split_parameters',
            'model_training_parameters',
            'feature_parameters',
        )
        for section in mandatory_sections:
            assert freqai_config.get(section), f"No Freqai {section} found in config file."
    def start(self, dataframe: DataFrame, metadata: dict, strategy: IStrategy) -> DataFrame:
        """
        Entry point to the FreqaiModel, it will train a new model if
@ -192,55 +206,30 @@ class IFreqaiModel(ABC):
        return
    @abstractmethod
    def train(self, unfiltered_dataframe: DataFrame, metadata: dict) -> Any:
        """
        Filter the training data and train a model to it. Train makes heavy use of the datahandler
        for storing, saving, loading, and analyzing the data.
        :params:
        :unfiltered_dataframe: Full dataframe for the current training period
        :metadata: pair metadata from strategy.
        :returns:
        :model: Trained model which can be used to inference (self.predict)
        """
    @abstractmethod
    def fit(self) -> Any:
        """
        Most regressors use the same function names and arguments e.g. user
        can drop in LGBMRegressor in place of CatBoostRegressor and all data
        management will be properly handled by Freqai.
        :params:
        :data_dictionary: the dictionary constructed by DataHandler to hold
        all the training and test data/labels.
        """
        return
    @abstractmethod
    def predict(self, dataframe: DataFrame, metadata: dict) -> Tuple[npt.ArrayLike, npt.ArrayLike]:
        """
        Filter the prediction features data and predict with it.
        :param: unfiltered_dataframe: Full dataframe for the current backtest period.
        :return:
        :predictions: np.array of predictions
        :do_predict: np.array of 1s and 0s to indicate places where freqai needed to remove
        data (NaNs) or felt uncertain about data (PCA and DI index)
        """
    @abstractmethod
    def data_cleaning_train(self) -> None:
        """
-        User can add data analysis and cleaning here.
+        Base data cleaning method for train
        Any function inside this method should drop training data points from the filtered_dataframe
        based on user decided logic. See FreqaiDataKitchen::remove_outliers() for an example
        of how outlier data points are dropped from the dataframe used for training.
        """
        if self.freqai_info.get('feature_parameters', {}).get('principal_component_analysis'):
            self.dh.principal_component_analysis()
-    @abstractmethod
+        # if self.feature_parameters["determine_statistical_distributions"]:
-    def data_cleaning_predict(self) -> None:
+        #     self.dh.determine_statistical_distributions()
        # if self.feature_parameters["remove_outliers"]:
        #     self.dh.remove_outliers(predict=False)
        if self.freqai_info.get('feature_parameters', {}).get('use_SVM_to_remove_outliers'):
            self.dh.use_SVM_to_remove_outliers(predict=False)
        if self.freqai_info.get('feature_parameters', {}).get('DI_threshold'):
            self.dh.data["avg_mean_dist"] = self.dh.compute_distances()
    def data_cleaning_predict(self, filtered_dataframe: DataFrame) -> None:
        """
-        User can add data analysis and cleaning here.
+        Base data cleaning method for predict.
        These functions each modify self.dh.do_predict, which is a dataframe with equal length
        to the number of candles coming from and returning to the strategy. Inside do_predict,
         1 allows prediction and < 0 signals to the strategy that the model is not confident in
@ -249,6 +238,19 @@ class IFreqaiModel(ABC):
        of how the do_predict vector is modified. do_predict is ultimately passed back to strategy
        for buy signals.
        """
        if self.freqai_info.get('feature_parameters', {}).get('principal_component_analysis'):
            self.dh.pca_transform()
        # if self.feature_parameters["determine_statistical_distributions"]:
        #     self.dh.determine_statistical_distributions()
        # if self.feature_parameters["remove_outliers"]:
        #     self.dh.remove_outliers(predict=True)  # creates dropped index
        if self.freqai_info.get('feature_parameters', {}).get('use_SVM_to_remove_outliers'):
            self.dh.use_SVM_to_remove_outliers(predict=True)
        if self.freqai_info.get('feature_parameters', {}).get('DI_threshold'):
            self.dh.check_if_pred_in_training_spaces()  # sets do_predict
    def model_exists(self, pair: str, training_timerange: str) -> bool:
        """
@ -303,3 +305,42 @@ class IFreqaiModel(ABC):
        self.model = self.train(unfiltered_dataframe, metadata)
        self.dh.save_data(self.model)
        self.retrain = False
    # Methods which are overridden by user made prediction models.
    # See freqai/prediction_models/CatboostPredictionModel.py for an example.
    @abstractmethod
    def train(self, unfiltered_dataframe: DataFrame, metadata: dict) -> Any:
        """
        Abstract hook: filter the training data and fit a model to it.
        Implementations are expected to lean on the datahandler for storing, saving,
        loading, and analyzing the data.
        :params:
        :unfiltered_dataframe: Full dataframe for the current training period
        :metadata: pair metadata from strategy.
        :returns:
        :model: Trained model which can be used for inference (self.predict)
        """
    @abstractmethod
    def fit(self) -> Any:
        """
        Abstract hook for fitting the underlying regressor.
        Most regressors share function names and arguments, e.g. a user can drop in
        LGBMRegressor in place of CatBoostRegressor and all data management will be
        properly handled by Freqai.
        :params:
        :data_dictionary: the dictionary constructed by DataHandler to hold
        all the training and test data/labels.
        NOTE(review): this signature takes no data_dictionary argument - confirm how
        implementations are meant to receive it.
        """
    @abstractmethod
    def predict(self, dataframe: DataFrame, metadata: dict) -> Tuple[npt.ArrayLike, npt.ArrayLike]:
        """
        Abstract hook: filter the prediction features data and predict with it.
        :params:
        :dataframe: Full (unfiltered) dataframe for the current backtest period.
        :metadata: pair metadata from strategy.
        :returns:
        :predictions: np.array of predictions
        :do_predict: np.array of 1s and 0s to indicate places where freqai needed to remove
        data (NaNs) or felt uncertain about data (PCA and DI index)
        """
--- a/freqtrade/freqai/prediction_models/CatboostPredictionModel.py
+++ b/freqtrade/freqai/prediction_models/CatboostPredictionModel.py
@ -1,7 +1,6 @@
 import logging
 from typing import Any, Dict, Tuple
 import pandas as pd
 from catboost import CatBoostRegressor, Pool
 from pandas import DataFrame
@ -149,7 +148,7 @@ class CatboostPredictionModel(IFreqaiModel):
        based on user decided logic. See FreqaiDataKitchen::remove_outliers() for an example
        of how outlier data points are dropped from the dataframe used for training.
        """
-        if self.feature_parameters["principal_component_analysis"]:
+        if self.freqai_info.get('feature_parameters', {}).get('principal_component_analysis'):
            self.dh.principal_component_analysis()
        # if self.feature_parameters["determine_statistical_distributions"]:
@ -157,9 +156,10 @@ class CatboostPredictionModel(IFreqaiModel):
        # if self.feature_parameters["remove_outliers"]:
        #     self.dh.remove_outliers(predict=False)
-        if self.feature_parameters["use_SVM_to_remove_outliers"]:
+        if self.freqai_info.get('feature_parameters', {}).get('use_SVM_to_remove_outliers'):
            self.dh.use_SVM_to_remove_outliers(predict=False)
-        if self.feature_parameters["DI_threshold"]:
+
        if self.freqai_info.get('feature_parameters', {}).get('DI_threshold'):
            self.dh.data["avg_mean_dist"] = self.dh.compute_distances()
    def data_cleaning_predict(self, filtered_dataframe: DataFrame) -> None:
@ -173,21 +173,16 @@ class CatboostPredictionModel(IFreqaiModel):
        of how the do_predict vector is modified. do_predict is ultimately passed back to strategy
        for buy signals.
        """
-        if self.feature_parameters["principal_component_analysis"]:
+        if self.freqai_info.get('feature_parameters', {}).get('principal_component_analysis'):
-            pca_components = self.dh.pca.transform(filtered_dataframe)
+            self.dh.pca_transform()
            self.dh.data_dictionary["prediction_features"] = pd.DataFrame(
                data=pca_components,
                columns=["PC" + str(i) for i in range(0, self.dh.data["n_kept_components"])],
                index=filtered_dataframe.index,
            )
        # if self.feature_parameters["determine_statistical_distributions"]:
        #     self.dh.determine_statistical_distributions()
        # if self.feature_parameters["remove_outliers"]:
        #     self.dh.remove_outliers(predict=True)  # creates dropped index
-        if self.feature_parameters["use_SVM_to_remove_outliers"]:
+        if self.freqai_info.get('feature_parameters', {}).get('use_SVM_to_remove_outliers'):
            self.dh.use_SVM_to_remove_outliers(predict=True)
-        if self.feature_parameters["DI_threshold"]:
+        if self.freqai_info.get('feature_parameters', {}).get('DI_threshold'):
            self.dh.check_if_pred_in_training_spaces()  # sets do_predict
--- a/freqtrade/optimize/backtesting.py
+++ b/freqtrade/optimize/backtesting.py
@ -207,7 +207,7 @@ class Backtesting:
        if self.config.get('freqai') is not None:
            self.required_startup += int((self.config.get('freqai', {}).get('train_period') *
                                         86400) / timeframe_to_seconds(self.config['timeframe']))
-            logger.info("Increasing startup_candle_count for freqai to %s", self.required_startup)
+            logger.info(f'Increasing startup_candle_count for freqai to {self.required_startup}')
            self.config['startup_candle_count'] = self.required_startup
        data = history.load_data(