Merge remote-tracking branch 'origin/develop' into add-metric-tracker

2022-10-14 19:00:49 +02:00
parent 88b8f18639 4a8cb3359b
commit b236e362ba
72 changed files with 570 additions and 341 deletions
@@ -78,7 +78,7 @@ class BaseClassifierModel(IFreqaiModel):
    ) -> Tuple[DataFrame, npt.NDArray[np.int_]]:
        """
        Filter the prediction features data and predict with it.
-        :param: unfiltered_df: Full dataframe for the current backtest period.
+        :param unfiltered_df: Full dataframe for the current backtest period.
        :return:
        :pred_df: dataframe containing the predictions
        :do_predict: np.array of 1s and 0s to indicate places where freqai needed to remove
@@ -77,7 +77,7 @@ class BaseRegressionModel(IFreqaiModel):
    ) -> Tuple[DataFrame, npt.NDArray[np.int_]]:
        """
        Filter the prediction features data and predict with it.
-        :param: unfiltered_df: Full dataframe for the current backtest period.
+        :param unfiltered_df: Full dataframe for the current backtest period.
        :return:
        :pred_df: dataframe containing the predictions
        :do_predict: np.array of 1s and 0s to indicate places where freqai needed to remove
@@ -461,9 +461,8 @@ class FreqaiDataDrawer:
    def save_data(self, model: Any, coin: str, dk: FreqaiDataKitchen) -> None:
        """
        Saves all data associated with a model for a single sub-train time range
-        :params:
-        :model: User trained model which can be reused for inferencing to generate
-        predictions
+        :param model: User trained model which can be reused for inferencing to generate
+                      predictions
        """

        if not dk.data_path.is_dir():
@@ -581,8 +580,7 @@ class FreqaiDataDrawer:
        Append new candles to our stores historic data (in memory) so that
        we do not need to load candle history from disk and we dont need to
        pinging exchange multiple times for the same candle.
-        :params:
-        dataframe: DataFrame = strategy provided dataframe
+        :param dataframe: DataFrame = strategy provided dataframe
        """
        feat_params = self.freqai_info["feature_parameters"]
        with self.history_lock:
@@ -628,9 +626,8 @@ class FreqaiDataDrawer:
        """
        Load pair histories for all whitelist and corr_pairlist pairs.
        Only called once upon startup of bot.
-        :params:
-        timerange: TimeRange = full timerange required to populate all indicators
-        for training according to user defined train_period_days
+        :param timerange: TimeRange = full timerange required to populate all indicators
+                          for training according to user defined train_period_days
        """
        history_data = self.historic_data

@@ -653,10 +650,9 @@ class FreqaiDataDrawer:
        """
        Searches through our historic_data in memory and returns the dataframes relevant
        to the present pair.
-        :params:
-        timerange: TimeRange = full timerange required to populate all indicators
-        for training according to user defined train_period_days
-        metadata: dict = strategy furnished pair metadata
+        :param timerange: TimeRange = full timerange required to populate all indicators
+                          for training according to user defined train_period_days
+        :param metadata: dict = strategy furnished pair metadata
        """
        with self.history_lock:
            corr_dataframes: Dict[Any, Any] = {}
@@ -107,9 +107,8 @@ class FreqaiDataKitchen:
    ) -> None:
        """
        Set the paths to the data for the present coin/botloop
-        :params:
-        metadata: dict = strategy furnished pair metadata
-        trained_timestamp: int = timestamp of most recent training
+        :param metadata: dict = strategy furnished pair metadata
+        :param trained_timestamp: int = timestamp of most recent training
        """
        self.full_path = Path(
            self.config["user_data_dir"] / "models" / str(self.freqai_config.get("identifier"))
@@ -129,8 +128,8 @@ class FreqaiDataKitchen:
        Given the dataframe for the full history for training, split the data into
        training and test data according to user specified parameters in configuration
        file.
-        :filtered_dataframe: cleaned dataframe ready to be split.
-        :labels: cleaned labels ready to be split.
+        :param filtered_dataframe: cleaned dataframe ready to be split.
+        :param labels: cleaned labels ready to be split.
        """
        feat_dict = self.freqai_config["feature_parameters"]

@@ -189,13 +188,14 @@ class FreqaiDataKitchen:
        remove all NaNs. Any row with a NaN is removed from training dataset or replaced with
        0s in the prediction dataset. However, prediction dataset do_predict will reflect any
        row that had a NaN and will shield user from that prediction.
-        :params:
-        :unfiltered_df: the full dataframe for the present training period
-        :training_feature_list: list, the training feature list constructed by
-        self.build_feature_list() according to user specified parameters in the configuration file.
-        :labels: the labels for the dataset
-        :training_filter: boolean which lets the function know if it is training data or
-        prediction data to be filtered.
+
+        :param unfiltered_df: the full dataframe for the present training period
+        :param training_feature_list: list, the training feature list constructed by
+                                      self.build_feature_list() according to user specified
+                                      parameters in the configuration file.
+        :param labels: the labels for the dataset
+        :param training_filter: boolean which lets the function know if it is training data or
+                                prediction data to be filtered.
        :returns:
        :filtered_df: dataframe cleaned of NaNs and only containing the user
        requested feature set.
@@ -241,6 +241,7 @@ class FreqaiDataKitchen:
            self.data["filter_drop_index_training"] = drop_index

        else:
+            filtered_df = self.check_pred_labels(filtered_df)
            # we are backtesting so we need to preserve row number to send back to strategy,
            # so now we use do_predict to avoid any prediction based on a NaN
            drop_index = pd.isnull(filtered_df).any(axis=1)
@@ -285,8 +286,8 @@ class FreqaiDataKitchen:
    def normalize_data(self, data_dictionary: Dict) -> Dict[Any, Any]:
        """
        Normalize all data in the data_dictionary according to the training dataset
-        :params:
-        :data_dictionary: dictionary containing the cleaned and split training/test data/labels
+        :param data_dictionary: dictionary containing the cleaned and
+                                split training/test data/labels
        :returns:
        :data_dictionary: updated dictionary with standardized values.
        """
@@ -460,6 +461,24 @@ class FreqaiDataKitchen:

        return df

+    def check_pred_labels(self, df_predictions: DataFrame) -> DataFrame:
+        """
+        Check that prediction feature labels match training feature labels.
+        :param df_predictions: incoming predictions
+        :return: df_predictions, restricted to the training feature columns if it had extras
+        """
+        train_labels = self.data_dictionary["train_features"].columns
+        pred_labels = df_predictions.columns
+        num_diffs = len(pred_labels.difference(train_labels))
+        if num_diffs != 0:
+            df_predictions = df_predictions[train_labels]
+            logger.warning(
+                f"Removed {num_diffs} features from prediction features, "
+                f"these were likely considered constant values during most recent training."
+            )
+
+        return df_predictions
+
    def principal_component_analysis(self) -> None:
        """
        Performs Principal Component Analysis on the data for dimensionality reduction
@@ -516,8 +535,7 @@ class FreqaiDataKitchen:
    def pca_transform(self, filtered_dataframe: DataFrame) -> None:
        """
        Use an existing pca transform to transform data into components
-        :params:
-        filtered_dataframe: DataFrame = the cleaned dataframe
+        :param filtered_dataframe: DataFrame = the cleaned dataframe
        """
        pca_components = self.pca.transform(filtered_dataframe)
        self.data_dictionary["prediction_features"] = pd.DataFrame(
@@ -561,8 +579,7 @@ class FreqaiDataKitchen:
        """
        Build/inference a Support Vector Machine to detect outliers
        in training data and prediction
-        :params:
-        predict: bool = If true, inference an existing SVM model, else construct one
+        :param predict: bool = If true, inference an existing SVM model, else construct one
        """

        if self.keras:
@@ -647,11 +664,11 @@ class FreqaiDataKitchen:
        Use DBSCAN to cluster training data and remove "noisy" data (read outliers).
        User controls this via the config param `DBSCAN_outlier_pct` which indicates the
        pct of training data that they want to be considered outliers.
-        :params:
-        predict: bool = If False (training), iterate to find the best hyper parameters to match
-        user requested outlier percent target. If True (prediction), use the parameters
-        determined from the previous training to estimate if the current prediction point
-        is an outlier.
+        :param predict: bool = If False (training), iterate to find the best hyper parameters
+                        to match user requested outlier percent target.
+                        If True (prediction), use the parameters determined from
+                        the previous training to estimate if the current prediction point
+                        is an outlier.
        """

        if predict:
@@ -1118,15 +1135,13 @@ class FreqaiDataKitchen:
        prediction_dataframe: DataFrame = pd.DataFrame(),
    ) -> DataFrame:
        """
-        Use the user defined strategy for populating indicators during
-        retrain
-        :params:
-        strategy: IStrategy = user defined strategy object
-        corr_dataframes: dict = dict containing the informative pair dataframes
-        (for user defined timeframes)
-        base_dataframes: dict = dict containing the current pair dataframes
-        (for user defined timeframes)
-        metadata: dict = strategy furnished pair metadata
+        Use the user defined strategy for populating indicators during retrain
+        :param strategy: IStrategy = user defined strategy object
+        :param corr_dataframes: dict = dict containing the informative pair dataframes
+                                (for user defined timeframes)
+        :param base_dataframes: dict = dict containing the current pair dataframes
+                                (for user defined timeframes)
+        :param metadata: dict = strategy furnished pair metadata
        :returns:
        dataframe: DataFrame = dataframe containing populated indicators
        """
@@ -196,16 +196,15 @@ class IFreqaiModel(ABC):
            (_, trained_timestamp, _) = self.dd.get_pair_dict_info(pair)

            dk = FreqaiDataKitchen(self.config, self.live, pair)
-            dk.set_paths(pair, trained_timestamp)
            (
                retrain,
                new_trained_timerange,
                data_load_timerange,
            ) = dk.check_if_new_training_required(trained_timestamp)
-            dk.set_paths(pair, new_trained_timerange.stopts)

            if retrain:
                self.train_timer('start')
+                dk.set_paths(pair, new_trained_timerange.stopts)
                try:
                    self.extract_data_and_train_model(
                        new_trained_timerange, pair, strategy, dk, data_load_timerange
@@ -270,9 +269,7 @@ class IFreqaiModel(ABC):
            )

            trained_timestamp_int = int(trained_timestamp.stopts)
-            dk.data_path = Path(
-                dk.full_path / f"sub-train-{pair.split('/')[0]}_{trained_timestamp_int}"
-                )
+            dk.set_paths(pair, trained_timestamp_int)

            dk.set_new_model_names(pair, trained_timestamp)

@@ -605,11 +602,11 @@ class IFreqaiModel(ABC):
        If the user reuses an identifier on a subsequent instance,
        this function will not be called. In that case, "real" predictions
        will be appended to the loaded set of historic predictions.
-        :param: df: DataFrame = the dataframe containing the training feature data
-        :param: model: Any = A model which was `fit` using a common library such as
-        catboost or lightgbm
-        :param: dk: FreqaiDataKitchen = object containing methods for data analysis
-        :param: pair: str = current pair
+        :param df: DataFrame = the dataframe containing the training feature data
+        :param model: Any = A model which was `fit` using a common library such as
+                      catboost or lightgbm
+        :param dk: FreqaiDataKitchen = object containing methods for data analysis
+        :param pair: str = current pair
        """

        self.dd.historic_predictions[pair] = pred_df
@@ -1,4 +1,5 @@
 import logging
+from pathlib import Path
 from typing import Any, Dict

 from catboost import CatBoostClassifier, Pool
@@ -20,9 +21,8 @@ class CatboostClassifier(BaseClassifierModel):
    def fit(self, data_dictionary: Dict, dk: FreqaiDataKitchen, **kwargs) -> Any:
        """
        User sets up the training and test data to fit their desired model here
-        :params:
-        :data_dictionary: the dictionary constructed by DataHandler to hold
-        all the training and test data/labels.
+        :param data_dictionary: the dictionary constructed by DataHandler to hold
+                                all the training and test data/labels.
        """

        train_data = Pool(
@@ -32,8 +32,9 @@ class CatboostClassifier(BaseClassifierModel):
        )

        cbr = CatBoostClassifier(
-            allow_writing_files=False,
+            allow_writing_files=True,
            loss_function='MultiClass',
+            train_dir=Path(dk.data_path),
            **self.model_training_parameters,
        )

@@ -1,4 +1,5 @@
 import logging
+from pathlib import Path
 from typing import Any, Dict

 from catboost import CatBoostRegressor, Pool
@@ -41,7 +42,8 @@ class CatboostRegressor(BaseRegressionModel):
        init_model = self.get_init_model(dk.pair)

        model = CatBoostRegressor(
-            allow_writing_files=False,
+            allow_writing_files=True,
+            train_dir=Path(dk.data_path),
            **self.model_training_parameters,
        )

@@ -1,4 +1,5 @@
 import logging
+from pathlib import Path
 from typing import Any, Dict

 from catboost import CatBoostRegressor, Pool
@@ -26,7 +27,8 @@ class CatboostRegressorMultiTarget(BaseRegressionModel):
        """

        cbr = CatBoostRegressor(
-            allow_writing_files=False,
+            allow_writing_files=True,
+            train_dir=Path(dk.data_path),
            **self.model_training_parameters,
        )

@@ -20,9 +20,8 @@ class LightGBMClassifier(BaseClassifierModel):
    def fit(self, data_dictionary: Dict, dk: FreqaiDataKitchen, **kwargs) -> Any:
        """
        User sets up the training and test data to fit their desired model here
-        :params:
-        :data_dictionary: the dictionary constructed by DataHandler to hold
-        all the training and test data/labels.
+        :param data_dictionary: the dictionary constructed by DataHandler to hold
+                                all the training and test data/labels.
        """

        if self.freqai_info.get('data_split_parameters', {}).get('test_size', 0.1) == 0:
@@ -26,9 +26,8 @@ class XGBoostClassifier(BaseClassifierModel):
    def fit(self, data_dictionary: Dict, dk: FreqaiDataKitchen, **kwargs) -> Any:
        """
        User sets up the training and test data to fit their desired model here
-        :params:
-        :data_dictionary: the dictionary constructed by DataHandler to hold
-        all the training and test data/labels.
+        :param data_dictionary: the dictionary constructed by DataHandler to hold
+                                all the training and test data/labels.
        """

        X = data_dictionary["train_features"].to_numpy()
@@ -65,7 +64,7 @@ class XGBoostClassifier(BaseClassifierModel):
    ) -> Tuple[DataFrame, npt.NDArray[np.int_]]:
        """
        Filter the prediction features data and predict with it.
-        :param: unfiltered_df: Full dataframe for the current backtest period.
+        :param unfiltered_df: Full dataframe for the current backtest period.
        :return:
        :pred_df: dataframe containing the predictions
        :do_predict: np.array of 1s and 0s to indicate places where freqai needed to remove