merge develop into feat/freqai-rl-dev

2022-11-12 10:54:34 +01:00
parent a2843165e1 e6172a68d7
commit 7a4bb040a5
60 changed files with 1314 additions and 264 deletions
--- a/freqtrade/freqai/freqai_interface.py
+++ b/freqtrade/freqai/freqai_interface.py
@@ -5,8 +5,7 @@ from abc import ABC, abstractmethod
 from collections import deque
 from datetime import datetime, timezone
 from pathlib import Path
-from threading import Lock
-from typing import Any, Dict, List, Optional, Literal, Tuple
+from typing import Any, Dict, List, Literal, Optional, Tuple

 import numpy as np
 import pandas as pd
@@ -70,22 +69,23 @@ class IFreqaiModel(ABC):
        if self.save_backtest_models:
            logger.info('Backtesting module configured to save all models.')
        self.dd = FreqaiDataDrawer(Path(self.full_path), self.config, self.follow_mode)
+        # set current candle to arbitrary historical date
+        self.current_candle: datetime = datetime.fromtimestamp(637887600, tz=timezone.utc)
+        self.dd.current_candle = self.current_candle
        self.scanning = False
        self.ft_params = self.freqai_info["feature_parameters"]
+        self.corr_pairlist: List[str] = self.ft_params.get("include_corr_pairlist", [])
        self.keras: bool = self.freqai_info.get("keras", False)
        if self.keras and self.ft_params.get("DI_threshold", 0):
            self.ft_params["DI_threshold"] = 0
            logger.warning("DI threshold is not configured for Keras models yet. Deactivating.")
-        self.CONV_WIDTH = self.freqai_info.get("conv_width", 2)
+        self.CONV_WIDTH = self.freqai_info.get('conv_width', 1)
        if self.ft_params.get("inlier_metric_window", 0):
            self.CONV_WIDTH = self.ft_params.get("inlier_metric_window", 0) * 2
        self.pair_it = 0
        self.pair_it_train = 0
        self.total_pairs = len(self.config.get("exchange", {}).get("pair_whitelist"))
        self.train_queue = self._set_train_queue()
-        self.last_trade_database_summary: DataFrame = {}
-        self.current_trade_database_summary: DataFrame = {}
-        self.analysis_lock = Lock()
        self.inference_time: float = 0
        self.train_time: float = 0
        self.begin_time: float = 0
@@ -93,7 +93,10 @@ class IFreqaiModel(ABC):
        self.base_tf_seconds = timeframe_to_seconds(self.config['timeframe'])
        self.continual_learning = self.freqai_info.get('continual_learning', False)
        self.plot_features = self.ft_params.get("plot_feature_importances", 0)
-
+        self.corr_dataframes: Dict[str, DataFrame] = {}
+        # get_corr_dataframes is controlling the caching of corr_dataframes
+        # for improved performance. Careful with this boolean.
+        self.get_corr_dataframes: bool = True
        self._threads: List[threading.Thread] = []
        self._stop_event = threading.Event()
        self.strategy: Optional[IStrategy] = None
@@ -140,7 +143,11 @@ class IFreqaiModel(ABC):
        # the concatenated results for the full backtesting period back to the strategy.
        elif not self.follow_mode:
            self.dk = FreqaiDataKitchen(self.config, self.live, metadata["pair"])
-            logger.info(f"Training {len(self.dk.training_timeranges)} timeranges")
+            if self.dk.backtest_live_models:
+                logger.info(
+                    f"Backtesting {len(self.dk.backtesting_timeranges)} timeranges (live models)")
+            else:
+                logger.info(f"Training {len(self.dk.training_timeranges)} timeranges")
            dataframe = self.dk.use_strategy_to_populate_indicators(
                strategy, prediction_dataframe=dataframe, pair=metadata["pair"]
            )
@@ -269,25 +276,20 @@ class IFreqaiModel(ABC):
            dataframe_train = dk.slice_dataframe(tr_train, dataframe)
            dataframe_backtest = dk.slice_dataframe(tr_backtest, dataframe)

-            trained_timestamp = tr_train
-            tr_train_startts_str = datetime.fromtimestamp(
-                                                tr_train.startts,
-                                                tz=timezone.utc).strftime(DATETIME_PRINT_FORMAT)
-            tr_train_stopts_str = datetime.fromtimestamp(
-                                                tr_train.stopts,
-                                                tz=timezone.utc).strftime(DATETIME_PRINT_FORMAT)
-            logger.info(
-                f"Training {pair}, {self.pair_it}/{self.total_pairs} pairs"
-                f" from {tr_train_startts_str} to {tr_train_stopts_str}, {train_it}/{total_trains} "
-                "trains"
-            )
+            if not self.ensure_data_exists(dataframe_backtest, tr_backtest, pair):
+                continue

-            trained_timestamp_int = int(trained_timestamp.stopts)
-            dk.set_paths(pair, trained_timestamp_int)
+            self.log_backtesting_progress(tr_train, pair, train_it, total_trains)

-            dk.set_new_model_names(pair, trained_timestamp)
+            timestamp_model_id = int(tr_train.stopts)
+            if dk.backtest_live_models:
+                timestamp_model_id = int(tr_backtest.startts)

-            if dk.check_if_backtest_prediction_exists():
+            dk.set_paths(pair, timestamp_model_id)
+
+            dk.set_new_model_names(pair, timestamp_model_id)
+
+            if dk.check_if_backtest_prediction_is_valid(len(dataframe_backtest)):
                self.dd.load_metadata(dk)
                dk.find_features(dataframe_train)
                self.check_if_feature_list_matches_strategy(dk)
@@ -299,7 +301,7 @@ class IFreqaiModel(ABC):
                    dk.find_labels(dataframe_train)
                    self.model = self.train(dataframe_train, pair, dk)
                    self.dd.pair_dict[pair]["trained_timestamp"] = int(
-                        trained_timestamp.stopts)
+                        tr_train.stopts)
                    if self.plot_features:
                        plot_feature_importance(self.model, pair, dk, self.plot_features)
                    if self.save_backtest_models:
@@ -351,6 +353,7 @@ class IFreqaiModel(ABC):
        if self.dd.historic_data:
            self.dd.update_historic_data(strategy, dk)
            logger.debug(f'Updating historic data on pair {metadata["pair"]}')
+            self.track_current_candle()

        if not self.follow_mode:

@@ -377,10 +380,10 @@ class IFreqaiModel(ABC):
        # load the model and associated data into the data kitchen
        self.model = self.dd.load_data(metadata["pair"], dk)

-        with self.analysis_lock:
-            dataframe = self.dk.use_strategy_to_populate_indicators(
-                strategy, prediction_dataframe=dataframe, pair=metadata["pair"]
-            )
+        dataframe = dk.use_strategy_to_populate_indicators(
+            strategy, prediction_dataframe=dataframe, pair=metadata["pair"],
+            do_corr_pairs=self.get_corr_dataframes
+        )

        if not self.model:
            logger.warning(
@@ -389,6 +392,9 @@ class IFreqaiModel(ABC):
            self.dd.return_null_values_to_strategy(dataframe, dk)
            return dk

+        if self.corr_pairlist:
+            dataframe = self.cache_corr_pairlist_dfs(dataframe, dk)
+
        dk.find_labels(dataframe)

        self.build_strategy_return_arrays(dataframe, dk, metadata["pair"], trained_timestamp)
@@ -572,10 +578,9 @@ class IFreqaiModel(ABC):
            data_load_timerange, pair, dk
        )

-        with self.analysis_lock:
-            unfiltered_dataframe = dk.use_strategy_to_populate_indicators(
-                strategy, corr_dataframes, base_dataframes, pair
-            )
+        unfiltered_dataframe = dk.use_strategy_to_populate_indicators(
+            strategy, corr_dataframes, base_dataframes, pair
+        )

        unfiltered_dataframe = dk.slice_dataframe(new_trained_timerange, unfiltered_dataframe)

@@ -586,7 +591,7 @@ class IFreqaiModel(ABC):
        model = self.train(unfiltered_dataframe, pair, dk)

        self.dd.pair_dict[pair]["trained_timestamp"] = new_trained_timerange.stopts
-        dk.set_new_model_names(pair, new_trained_timerange)
+        dk.set_new_model_names(pair, new_trained_timerange.stopts)
        self.dd.save_data(model, pair, dk)

        if self.plot_features:
@@ -751,6 +756,87 @@ class IFreqaiModel(ABC):
                    f'Best approximation queue: {best_queue}')
        return best_queue

+    def cache_corr_pairlist_dfs(self, dataframe: DataFrame, dk: FreqaiDataKitchen) -> DataFrame:
+        """
+        Cache the corr_pairlist dfs to speed up performance for subsequent pairs during the
+        current candle.
+        :param dataframe: strategy fed dataframe
+        :param dk: datakitchen object for current asset
+        :return: dataframe to attach/extract cached corr_pair dfs to/from.
+        """
+
+        if self.get_corr_dataframes:
+            self.corr_dataframes = dk.extract_corr_pair_columns_from_populated_indicators(dataframe)
+            if not self.corr_dataframes:
+                logger.warning("Couldn't cache corr_pair dataframes for improved performance. "
+                               "Consider ensuring that the full coin/stake, e.g. XYZ/USD, "
+                               "is included in the column names when you are creating features "
+                               "in `populate_any_indicators()`.")
+            self.get_corr_dataframes = not bool(self.corr_dataframes)
+        elif self.corr_dataframes:
+            dataframe = dk.attach_corr_pair_columns(
+                dataframe, self.corr_dataframes, dk.pair)
+
+        return dataframe
+
+    def track_current_candle(self):
+        """
+        Checks if the latest candle appended by the datadrawer is
+        equivalent to the latest candle seen by FreqAI. If not, it
+        asks to refresh the cached corr_dfs, and resets the pair
+        counter.
+        """
+        if self.dd.current_candle > self.current_candle:
+            self.get_corr_dataframes = True
+            self.pair_it = 1
+            self.current_candle = self.dd.current_candle
+
+    def ensure_data_exists(self, dataframe_backtest: DataFrame,
+                           tr_backtest: TimeRange, pair: str) -> bool:
+        """
+        Check if the dataframe is empty, if not, report useful information to user.
+        :param dataframe_backtest: the backtesting dataframe, maybe empty.
+        :param tr_backtest: current backtesting timerange.
+        :param pair: current pair
+        :return: if the data exists or not
+        """
+        if self.config.get("freqai_backtest_live_models", False) and len(dataframe_backtest) == 0:
+            tr_backtest_startts_str = datetime.fromtimestamp(
+                                            tr_backtest.startts,
+                                            tz=timezone.utc).strftime(DATETIME_PRINT_FORMAT)
+            tr_backtest_stopts_str = datetime.fromtimestamp(
+                                            tr_backtest.stopts,
+                                            tz=timezone.utc).strftime(DATETIME_PRINT_FORMAT)
+            logger.info(f"No data found for pair {pair} from {tr_backtest_startts_str} "
+                        f" from {tr_backtest_startts_str} to {tr_backtest_stopts_str}. "
+                        "Probably more than one training within the same candle period.")
+            return False
+        return True
+
+    def log_backtesting_progress(self, tr_train: TimeRange, pair: str,
+                                 train_it: int, total_trains: int):
+        """
+        Log the backtesting progress so user knows how many pairs have been trained and
+        how many more pairs/trains remain.
+        :param tr_train: the training timerange
+        :param train_it: the train iteration for the current pair (the sliding window progress)
+        :param pair: the current pair
+        :param total_trains: total trains (total number of slides for the sliding window)
+        """
+        tr_train_startts_str = datetime.fromtimestamp(
+                                            tr_train.startts,
+                                            tz=timezone.utc).strftime(DATETIME_PRINT_FORMAT)
+        tr_train_stopts_str = datetime.fromtimestamp(
+                                            tr_train.stopts,
+                                            tz=timezone.utc).strftime(DATETIME_PRINT_FORMAT)
+
+        if not self.config.get("freqai_backtest_live_models", False):
+            logger.info(
+                f"Training {pair}, {self.pair_it}/{self.total_pairs} pairs"
+                f" from {tr_train_startts_str} "
+                f"to {tr_train_stopts_str}, {train_it}/{total_trains} "
+                "trains"
+            )
    # Following methods which are overridden by user made prediction models.
    # See freqai/prediction_models/CatboostPredictionModel.py for an example.