Merge branch 'develop' into dev-merge-rl

2022-09-22 19:46:50 +02:00
parent 7b1d409c98 71e6c54ea4
commit ea8e34e192
121 changed files with 1525 additions and 564 deletions
@@ -16,6 +16,7 @@ from numpy.typing import NDArray
 from pandas import DataFrame

 from freqtrade.configuration import TimeRange
+from freqtrade.constants import Config
 from freqtrade.data.history import load_pair_history
 from freqtrade.exceptions import OperationalException
 from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
@@ -27,9 +28,7 @@ logger = logging.getLogger(__name__)

 class pair_info(TypedDict):
    model_filename: str
-    first: bool
    trained_timestamp: int
-    priority: int
    data_path: str
    extras: dict

@@ -58,7 +57,7 @@ class FreqaiDataDrawer:
    Juha Nykänen @suikula, Wagner Costa @wagnercosta, Johan Vlugt @Jooopieeert
    """

-    def __init__(self, full_path: Path, config: dict, follow_mode: bool = False):
+    def __init__(self, full_path: Path, config: Config, follow_mode: bool = False):

        self.config = config
        self.freqai_info = config.get("freqai", {})
@@ -91,7 +90,7 @@ class FreqaiDataDrawer:
        self.old_DBSCAN_eps: Dict[str, float] = {}
        self.empty_pair_dict: pair_info = {
                "model_filename": "", "trained_timestamp": 0,
-                "priority": 1, "first": True, "data_path": "", "extras": {}}
+                "data_path": "", "extras": {}}
        self.limit_ram_use = self.freqai_info.get('limit_ram_usage', False)

    def load_drawer_from_disk(self):
@@ -217,7 +216,6 @@ class FreqaiDataDrawer:
            self.pair_dict[pair] = self.empty_pair_dict.copy()
            model_filename = ""
            trained_timestamp = 0
-            self.pair_dict[pair]["priority"] = len(self.pair_dict)

        if not data_path_set and self.follow_mode:
            logger.warning(
@@ -237,18 +235,9 @@ class FreqaiDataDrawer:
            return
        else:
            self.pair_dict[metadata["pair"]] = self.empty_pair_dict.copy()
-            self.pair_dict[metadata["pair"]]["priority"] = len(self.pair_dict)

            return

-    def pair_to_end_of_training_queue(self, pair: str) -> None:
-        # march all pairs up in the queue
-        with self.pair_dict_lock:
-            for p in self.pair_dict:
-                self.pair_dict[p]["priority"] -= 1
-            # send pair to end of queue
-            self.pair_dict[pair]["priority"] = len(self.pair_dict)
-
    def set_initial_return_values(self, pair: str, pred_df: DataFrame) -> None:
        """
        Set the initial return values to the historical predictions dataframe. This avoids needing
@@ -356,7 +345,7 @@ class FreqaiDataDrawer:
        for dir in model_folders:
            result = pattern.match(str(dir.name))
            if result is None:
-                break
+                continue
            coin = result.group(1)
            timestamp = result.group(2)

@@ -18,6 +18,7 @@ from sklearn.model_selection import train_test_split
 from sklearn.neighbors import NearestNeighbors

 from freqtrade.configuration import TimeRange
+from freqtrade.constants import Config
 from freqtrade.exceptions import OperationalException
 from freqtrade.exchange import timeframe_to_seconds
 from freqtrade.strategy.interface import IStrategy
@@ -57,7 +58,7 @@ class FreqaiDataKitchen:

    def __init__(
        self,
-        config: Dict[str, Any],
+        config: Config,
        live: bool = False,
        pair: str = "",
    ):
@@ -774,12 +775,22 @@ class FreqaiDataKitchen:

    def compute_inlier_metric(self, set_='train') -> None:
        """
-
        Compute inlier metric from backwards distance distributions.
        This metric defines how well features from a timepoint fit
        into previous timepoints.
        """

+        def normalise(dataframe: DataFrame, key: str) -> DataFrame:
+            """
+            Min-max scale `dataframe` with bounds kept in self.data.
+            When the enclosing `set_` is 'train', the bounds are taken from `dataframe`
+            and stored under f'{key}_min' / f'{key}_max'; otherwise the stored
+            bounds are read back and reused.
+            """
+            min_key, max_key = f'{key}_min', f'{key}_max'
+            if set_ != 'train':
+                lower = self.data[min_key]
+                upper = self.data[max_key]
+            else:
+                lower = dataframe.min()
+                upper = dataframe.max()
+                self.data[min_key] = lower
+                self.data[max_key] = upper
+            span = upper - lower
+            return (dataframe - lower) / span
+
        no_prev_pts = self.freqai_config["feature_parameters"]["inlier_metric_window"]

        if set_ == 'train':
@@ -824,7 +835,12 @@ class FreqaiDataKitchen:
        inliers = pd.DataFrame(index=distances.index)
        for key in distances.keys():
            current_distances = distances[key].dropna()
-            fit_params = stats.weibull_min.fit(current_distances)
+            current_distances = normalise(current_distances, key)
+            if set_ == 'train':
+                fit_params = stats.weibull_min.fit(current_distances)
+                self.data[f'{key}_fit_params'] = fit_params
+            else:
+                fit_params = self.data[f'{key}_fit_params']
            quantiles = stats.weibull_min.cdf(current_distances, *fit_params)

            df_inlier = pd.DataFrame(
@@ -3,6 +3,7 @@ import shutil
 import threading
 import time
 from abc import ABC, abstractmethod
+from collections import deque
 from datetime import datetime, timezone
 from pathlib import Path
 from threading import Lock
@@ -14,12 +15,13 @@ from numpy.typing import NDArray
 from pandas import DataFrame

 from freqtrade.configuration import TimeRange
-from freqtrade.constants import DATETIME_PRINT_FORMAT
+from freqtrade.constants import DATETIME_PRINT_FORMAT, Config
 from freqtrade.enums import RunMode
 from freqtrade.exceptions import OperationalException
 from freqtrade.exchange import timeframe_to_seconds
 from freqtrade.freqai.data_drawer import FreqaiDataDrawer
 from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
+from freqtrade.freqai.utils import plot_feature_importance
 from freqtrade.strategy.interface import IStrategy


@@ -50,7 +52,7 @@ class IFreqaiModel(ABC):
    Juha Nykänen @suikula, Wagner Costa @wagnercosta, Johan Vlugt @Jooopieeert
    """

-    def __init__(self, config: Dict[str, Any]) -> None:
+    def __init__(self, config: Config) -> None:

        self.config = config
        self.assert_config(self.config)
@@ -80,6 +82,7 @@ class IFreqaiModel(ABC):
        self.pair_it = 0
        self.pair_it_train = 0
        self.total_pairs = len(self.config.get("exchange", {}).get("pair_whitelist"))
+        self.train_queue = self._set_train_queue()
        self.last_trade_database_summary: DataFrame = {}
        self.current_trade_database_summary: DataFrame = {}
        self.analysis_lock = Lock()
@@ -101,7 +104,7 @@ class IFreqaiModel(ABC):
        return ({})
        self.strategy: Optional[IStrategy] = None

-    def assert_config(self, config: Dict[str, Any]) -> None:
+    def assert_config(self, config: Config) -> None:

        if not config.get("freqai", {}):
            raise OperationalException("No freqai parameters found in configuration file.")
@@ -184,29 +187,40 @@ class IFreqaiModel(ABC):
        """
        while not self._stop_event.is_set():
            time.sleep(1)
-            for pair in self.config.get("exchange", {}).get("pair_whitelist"):
+            pair = self.train_queue[0]

-                (_, trained_timestamp, _) = self.dd.get_pair_dict_info(pair)
+            # ensure pair is available in dp
+            if pair not in strategy.dp.current_whitelist():
+                self.train_queue.popleft()
+                logger.warning(f'{pair} not in current whitelist, removing from train queue.')
+                continue

-                if self.dd.pair_dict[pair]["priority"] != 1:
-                    continue
-                dk = FreqaiDataKitchen(self.config, self.live, pair)
-                dk.set_paths(pair, trained_timestamp)
-                (
-                    retrain,
-                    new_trained_timerange,
-                    data_load_timerange,
-                ) = dk.check_if_new_training_required(trained_timestamp)
-                dk.set_paths(pair, new_trained_timerange.stopts)
+            (_, trained_timestamp, _) = self.dd.get_pair_dict_info(pair)

-                if retrain:
-                    self.train_timer('start')
+            dk = FreqaiDataKitchen(self.config, self.live, pair)
+            dk.set_paths(pair, trained_timestamp)
+            (
+                retrain,
+                new_trained_timerange,
+                data_load_timerange,
+            ) = dk.check_if_new_training_required(trained_timestamp)
+            dk.set_paths(pair, new_trained_timerange.stopts)
+
+            if retrain:
+                self.train_timer('start')
+                try:
                    self.extract_data_and_train_model(
                        new_trained_timerange, pair, strategy, dk, data_load_timerange
                    )
-                    self.train_timer('stop')
+                except Exception as msg:
+                    logger.warning(f'Training {pair} raised exception {msg}, skipping.')

-            self.dd.save_historic_predictions_to_disk()
+                self.train_timer('stop')
+
+                # only rotate the queue after the first has been trained.
+                self.train_queue.rotate(-1)
+
+                self.dd.save_historic_predictions_to_disk()

    def start_backtesting(
        self, dataframe: DataFrame, metadata: dict, dk: FreqaiDataKitchen
@@ -561,11 +575,11 @@ class IFreqaiModel(ABC):

        self.dd.pair_dict[pair]["trained_timestamp"] = new_trained_timerange.stopts
        dk.set_new_model_names(pair, new_trained_timerange)
-        self.dd.pair_dict[pair]["first"] = False
-        if self.dd.pair_dict[pair]["priority"] == 1 and self.scanning:
-            self.dd.pair_to_end_of_training_queue(pair)
        self.dd.save_data(model, pair, dk)

+        if self.freqai_info["feature_parameters"].get("plot_feature_importance", False):
+            plot_feature_importance(model, pair, dk)
+
        if self.freqai_info.get("purge_old_models", False):
            self.dd.purge_old_models()

@@ -689,6 +703,32 @@ class IFreqaiModel(ABC):

        return init_model

+    def _set_train_queue(self):
+        """
+        Build the initial train queue.
+        If the data drawer holds no pair_dict entries, the queue is the configured
+        whitelist in its given order. Otherwise whitelisted pairs present in pair_dict
+        are queued by ascending trained_timestamp, and whitelisted pairs missing from
+        pair_dict are pushed onto the front of the queue.
+        :return: deque of pair names
+        """
+        whitelist = self.config.get("exchange", {}).get("pair_whitelist")
+        known_pairs = self.dd.pair_dict
+        if not known_pairs:
+            logger.info('Set fresh train queue from whitelist. '
+                        f'Queue: {whitelist}')
+            return deque(whitelist)
+
+        by_timestamp = sorted(known_pairs.items(),
+                              key=lambda item: item[1]['trained_timestamp'])
+        queue = deque(name for name, _ in by_timestamp if name in whitelist)
+
+        # pairs without a pair_dict entry are placed at the front of the queue
+        for name in whitelist:
+            if name not in queue:
+                queue.appendleft(name)
+
+        logger.info('Set existing queue from trained timestamps. '
+                    f'Best approximation queue: {queue}')
+        return queue
+
    # Following methods which are overridden by user made prediction models.
    # See freqai/prediction_models/CatboostPredictionModel.py for an example.

@@ -0,0 +1,85 @@
+import logging
+from typing import Any, Dict, Tuple
+
+import numpy as np
+import numpy.typing as npt
+import pandas as pd
+from pandas import DataFrame
+from pandas.api.types import is_integer_dtype
+from sklearn.preprocessing import LabelEncoder
+from xgboost import XGBClassifier
+
+from freqtrade.freqai.base_models.BaseClassifierModel import BaseClassifierModel
+from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
+
+
+logger = logging.getLogger(__name__)
+
+
+class XGBoostClassifier(BaseClassifierModel):
+    """
+    Classifier prediction model backed by xgboost's XGBClassifier.
+    Overrides fit() and predict() of BaseClassifierModel. Labels that are not of
+    integer dtype are encoded with sklearn's LabelEncoder before training.
+    """
+
+    def fit(self, data_dictionary: Dict, dk: FreqaiDataKitchen, **kwargs) -> Any:
+        """
+        Train an XGBClassifier on the training split held in data_dictionary.
+        :param data_dictionary: dictionary providing "train_features", "train_labels",
+        "train_weights" and, unless test_size is 0, "test_features" and "test_labels".
+        :param dk: FreqaiDataKitchen for the current pair (dk.pair selects the init model).
+        :return: the fitted XGBClassifier
+        """
+
+        train_features = data_dictionary["train_features"].to_numpy()
+        train_labels = data_dictionary["train_labels"].to_numpy()[:, 0]
+
+        encoder = LabelEncoder()
+        if not is_integer_dtype(train_labels):
+            train_labels = pd.Series(encoder.fit_transform(train_labels), dtype="int64")
+
+        # an evaluation set is only handed to xgboost when test_size is non-zero
+        eval_set = None
+        split_params = self.freqai_info.get('data_split_parameters', {})
+        if split_params.get('test_size', 0.1) != 0:
+            eval_features = data_dictionary["test_features"].to_numpy()
+            eval_labels = data_dictionary["test_labels"].to_numpy()[:, 0]
+
+            if not is_integer_dtype(eval_labels):
+                eval_labels = pd.Series(encoder.transform(eval_labels), dtype="int64")
+
+            eval_set = [(eval_features, eval_labels)]
+
+        sample_weights = data_dictionary["train_weights"]
+
+        previous_model = self.get_init_model(dk.pair)
+
+        classifier = XGBClassifier(**self.model_training_parameters)
+
+        classifier.fit(
+            X=train_features,
+            y=train_labels,
+            eval_set=eval_set,
+            sample_weight=sample_weights,
+            xgb_model=previous_model,
+        )
+
+        return classifier
+
+    def predict(
+        self, unfiltered_df: DataFrame, dk: FreqaiDataKitchen, **kwargs
+    ) -> Tuple[DataFrame, npt.NDArray[np.int_]]:
+        """
+        Run the parent predict() and translate encoded labels back to their names.
+        A LabelEncoder is fitted on the keys of dk.data['labels_std']; it decodes the
+        first label column, and columns named by the encoded integers are renamed to
+        the original label names.
+        :param unfiltered_df: Full dataframe for the current backtest period.
+        :return:
+        :pred_df: dataframe containing the predictions
+        :do_predict: the dk.do_predict array produced by the parent predict()
+        """
+
+        (pred_df, dk.do_predict) = super().predict(unfiltered_df, dk, **kwargs)
+
+        encoder = LabelEncoder()
+        target = dk.label_list[0]
+        original_names = list(dk.data['labels_std'].keys())
+        encoded_names = encoder.fit_transform(original_names).tolist()
+        pred_df[target] = encoder.inverse_transform(pred_df[target])
+
+        # map encoded column names back to the original label names
+        column_map = dict(zip(encoded_names, original_names))
+        pred_df = pred_df.rename(columns=column_map)
+
+        return (pred_df, dk.do_predict)
@@ -1,19 +1,25 @@
 import logging
 from datetime import datetime, timezone
+from typing import Any
+
+import numpy as np
+import pandas as pd

 from freqtrade.configuration import TimeRange
+from freqtrade.constants import Config
 from freqtrade.data.dataprovider import DataProvider
 from freqtrade.data.history.history_utils import refresh_backtest_ohlcv_data
 from freqtrade.exceptions import OperationalException
 from freqtrade.exchange import timeframe_to_seconds
 from freqtrade.exchange.exchange import market_is_active
+from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
 from freqtrade.plugins.pairlist.pairlist_helpers import dynamic_expand_pairlist


 logger = logging.getLogger(__name__)


-def download_all_data_for_training(dp: DataProvider, config: dict) -> None:
+def download_all_data_for_training(dp: DataProvider, config: Config) -> None:
    """
    Called only once upon start of bot to download the necessary data for
    populating indicators and training the model.
@@ -47,9 +53,7 @@ def download_all_data_for_training(dp: DataProvider, config: dict) -> None:
    )


-def get_required_data_timerange(
-    config: dict
-) -> TimeRange:
+def get_required_data_timerange(config: Config) -> TimeRange:
    """
    Used to compute the required data download time range
    for auto data-download in FreqAI
@@ -86,7 +90,7 @@ def get_required_data_timerange(


 # Keep below for when we wish to download heterogeneously lengthed data for FreqAI.
-# def download_all_data_for_training(dp: DataProvider, config: dict) -> None:
+# def download_all_data_for_training(dp: DataProvider, config: Config) -> None:
 #     """
 #     Called only once upon start of bot to download the necessary data for
 #     populating indicators and training a FreqAI model.
@@ -132,3 +136,58 @@ def get_required_data_timerange(
 #             trading_mode=config.get("trading_mode", "spot"),
 #             prepend=config.get("prepend_data", False),
 #         )
+
+
+def plot_feature_importance(model: Any, pair: str, dk: FreqaiDataKitchen,
+                            count_max: int = 25) -> None:
+    """
+        Plot Best and worst features by importance for a single sub-train.
+        One HTML plot is stored per label. If a model is not a catboost, lightgbm
+        or xgboost model, a message is logged and the function returns without
+        plotting the remaining labels.
+        :param model: Any = A model which was `fit` using a common library
+                            such as catboost or lightgbm
+        :param pair: str = pair e.g. BTC/USD
+        :param dk: FreqaiDataKitchen = non-persistent data container for current coin/loop
+        :param count_max: int = the amount of features to be loaded per column
+    """
+    from freqtrade.plot.plotting import go, make_subplots, store_plot_file
+
+    # Extract feature importance from model
+    models = {}
+    if 'FreqaiMultiOutputRegressor' in str(model.__class__):
+        for estimator, label in zip(model.estimators_, dk.label_list):
+            models[label] = estimator
+    else:
+        models[dk.label_list[0]] = model
+
+    def add_feature_trace(fig, fi_df, col):
+        # horizontal bar chart of importances in the given subplot column
+        return fig.add_trace(
+            go.Bar(
+                x=fi_df["feature_importance"],
+                y=fi_df["feature_names"],
+                orientation='h', showlegend=False
+            ), row=1, col=col
+        )
+
+    for label, mdl in models.items():
+        model_class = str(mdl.__class__)
+        if "catboost.core" in model_class:
+            feature_importance = mdl.get_feature_importance()
+        # Each substring needs its own membership test: a bare non-empty string
+        # literal on the left of `or` is always truthy and would match every model.
+        elif "lightgbm.sklearn" in model_class or "xgb" in model_class:
+            feature_importance = mdl.feature_importances_
+        else:
+            logger.info('Model type not support for generating feature importances.')
+            return
+
+        # Data preparation
+        fi_df = pd.DataFrame({
+            "feature_names": np.array(dk.training_features_list),
+            "feature_importance": np.array(feature_importance)
+        })
+        # reversed so the selection order is flipped for the horizontal bars
+        fi_df_top = fi_df.nlargest(count_max, "feature_importance")[::-1]
+        fi_df_worst = fi_df.nsmallest(count_max, "feature_importance")[::-1]
+
+        # Plotting
+        fig = make_subplots(rows=1, cols=2, horizontal_spacing=0.5)
+        fig = add_feature_trace(fig, fi_df_top, 1)
+        fig = add_feature_trace(fig, fi_df_worst, 2)
+        fig.update_layout(title_text=f"Best and worst features by importance {pair}")
+        # strip two FreqAI specific characters from the label used in the file name
+        file_label = label.replace('&', '').replace('%', '')
+        store_plot_file(fig, f"{dk.model_filename}-{file_label}.html", dk.data_path)