Merge branch 'develop' into add-inlier-metric

2022-09-06 20:40:21 +02:00
parent d44296783e dc4a4bdf09
commit d7585161b2
38 changed files with 664 additions and 349 deletions
--- a/freqtrade/freqai/data_kitchen.py
+++ b/freqtrade/freqai/data_kitchen.py
@@ -18,8 +18,6 @@ from sklearn.model_selection import train_test_split
 from sklearn.neighbors import NearestNeighbors

 from freqtrade.configuration import TimeRange
-from freqtrade.data.dataprovider import DataProvider
-from freqtrade.data.history.history_utils import refresh_backtest_ohlcv_data
 from freqtrade.exceptions import OperationalException
 from freqtrade.exchange import timeframe_to_seconds
 from freqtrade.strategy.interface import IStrategy
@@ -73,6 +71,8 @@ class FreqaiDataKitchen:
        self.label_list: List = []
        self.training_features_list: List = []
        self.model_filename: str = ""
+        self.backtesting_results_path = Path()
+        self.backtest_predictions_folder: str = "backtesting_predictions"
        self.live = live
        self.pair = pair

@@ -291,6 +291,7 @@ class FreqaiDataKitchen:
        :returns:
        :data_dictionary: updated dictionary with standardized values.
        """
+
        # standardize the data by training stats
        train_max = data_dictionary["train_features"].max()
        train_min = data_dictionary["train_features"].min()
@@ -324,10 +325,24 @@ class FreqaiDataKitchen:
                    - 1
                )

-            self.data[f"{item}_max"] = train_labels_max  # .to_dict()
-            self.data[f"{item}_min"] = train_labels_min  # .to_dict()
+            self.data[f"{item}_max"] = train_labels_max
+            self.data[f"{item}_min"] = train_labels_min
        return data_dictionary

+    def normalize_single_dataframe(self, df: DataFrame) -> DataFrame:
+
+        train_max = df.max()
+        train_min = df.min()
+        df = (
+            2 * (df - train_min) / (train_max - train_min) - 1
+        )
+
+        for item in train_max.keys():
+            self.data[item + "_max"] = train_max[item]
+            self.data[item + "_min"] = train_min[item]
+
+        return df
+
    def normalize_data_from_metadata(self, df: DataFrame) -> DataFrame:
        """
        Normalize a set of data using the mean and standard deviation from
@@ -441,7 +456,8 @@ class FreqaiDataKitchen:
        start = datetime.fromtimestamp(timerange.startts, tz=timezone.utc)
        stop = datetime.fromtimestamp(timerange.stopts, tz=timezone.utc)
        df = df.loc[df["date"] >= start, :]
-        df = df.loc[df["date"] <= stop, :]
+        if not self.live:
+            df = df.loc[df["date"] < stop, :]

        return df

@@ -454,22 +470,23 @@ class FreqaiDataKitchen:

        from sklearn.decomposition import PCA  # avoid importing if we dont need it

-        n_components = self.data_dictionary["train_features"].shape[1]
-        pca = PCA(n_components=n_components)
+        pca = PCA(0.999)
        pca = pca.fit(self.data_dictionary["train_features"])
-        n_keep_components = np.argmin(pca.explained_variance_ratio_.cumsum() < 0.999)
-        pca2 = PCA(n_components=n_keep_components)
+        n_keep_components = pca.n_components_
        self.data["n_kept_components"] = n_keep_components
-        pca2 = pca2.fit(self.data_dictionary["train_features"])
+        n_components = self.data_dictionary["train_features"].shape[1]
        logger.info("reduced feature dimension by %s", n_components - n_keep_components)
-        logger.info("explained variance %f", np.sum(pca2.explained_variance_ratio_))
-        train_components = pca2.transform(self.data_dictionary["train_features"])
+        logger.info("explained variance %f", np.sum(pca.explained_variance_ratio_))

+        train_components = pca.transform(self.data_dictionary["train_features"])
        self.data_dictionary["train_features"] = pd.DataFrame(
            data=train_components,
            columns=["PC" + str(i) for i in range(0, n_keep_components)],
            index=self.data_dictionary["train_features"].index,
        )
+        # normalsing transformed training features
+        self.data_dictionary["train_features"] = self.normalize_single_dataframe(
+            self.data_dictionary["train_features"])

        # keeping a copy of the non-transformed features so we can check for errors during
        # model load from disk
@@ -477,15 +494,18 @@ class FreqaiDataKitchen:
        self.training_features_list = self.data_dictionary["train_features"].columns

        if self.freqai_config.get('data_split_parameters', {}).get('test_size', 0.1) != 0:
-            test_components = pca2.transform(self.data_dictionary["test_features"])
+            test_components = pca.transform(self.data_dictionary["test_features"])
            self.data_dictionary["test_features"] = pd.DataFrame(
                data=test_components,
                columns=["PC" + str(i) for i in range(0, n_keep_components)],
                index=self.data_dictionary["test_features"].index,
            )
+            # normalise transformed test feature to transformed training features
+            self.data_dictionary["test_features"] = self.normalize_data_from_metadata(
+                self.data_dictionary["test_features"])

        self.data["n_kept_components"] = n_keep_components
-        self.pca = pca2
+        self.pca = pca

        logger.info(f"PCA reduced total features from  {n_components} to {n_keep_components}")

@@ -506,6 +526,9 @@ class FreqaiDataKitchen:
            columns=["PC" + str(i) for i in range(0, self.data["n_kept_components"])],
            index=filtered_dataframe.index,
        )
+        # normalise transformed predictions to transformed training features
+        self.data_dictionary["prediction_features"] = self.normalize_data_from_metadata(
+            self.data_dictionary["prediction_features"])

    def compute_distances(self) -> float:
        """
@@ -885,9 +908,10 @@ class FreqaiDataKitchen:
        weights = np.exp(-np.arange(num_weights) / (wfactor * num_weights))[::-1]
        return weights

-    def append_predictions(self, predictions: DataFrame, do_predict: npt.ArrayLike) -> None:
+    def get_predictions_to_append(self, predictions: DataFrame,
+                                  do_predict: npt.ArrayLike) -> DataFrame:
        """
-        Append backtest prediction from current backtest period to all previous periods
+        Get backtest prediction from current backtest period
        """

        append_df = DataFrame()
@@ -902,13 +926,18 @@ class FreqaiDataKitchen:
        if self.freqai_config["feature_parameters"].get("DI_threshold", 0) > 0:
            append_df["DI_values"] = self.DI_values

+        return append_df
+
+    def append_predictions(self, append_df: DataFrame) -> None:
+        """
+        Append backtest prediction from current backtest period to all previous periods
+        """
+
        if self.full_df.empty:
            self.full_df = append_df
        else:
            self.full_df = pd.concat([self.full_df, append_df], axis=0)

-        return
-
    def fill_predictions(self, dataframe):
        """
        Back fill values to before the backtesting range so that the dataframe matches size
@@ -1008,9 +1037,7 @@ class FreqaiDataKitchen:
        # We notice that users like to use exotic indicators where
        # they do not know the required timeperiod. Here we include a factor
        # of safety by multiplying the user considered "max" by 2.
-        max_period = self.freqai_config["feature_parameters"].get(
-            "indicator_max_period_candles", 20
-        ) * 2
+        max_period = self.config.get('startup_candle_count', 20) * 2
        additional_seconds = max_period * max_tf_seconds

        if trained_timestamp != 0:
@@ -1056,31 +1083,6 @@ class FreqaiDataKitchen:

        self.model_filename = f"cb_{coin.lower()}_{int(trained_timerange.stopts)}"

-    def download_all_data_for_training(self, timerange: TimeRange, dp: DataProvider) -> None:
-        """
-        Called only once upon start of bot to download the necessary data for
-        populating indicators and training the model.
-        :param timerange: TimeRange = The full data timerange for populating the indicators
-                                      and training the model.
-        :param dp: DataProvider instance attached to the strategy
-        """
-        new_pairs_days = int((timerange.stopts - timerange.startts) / SECONDS_IN_DAY)
-        if not dp._exchange:
-            # Not realistic - this is only called in live mode.
-            raise OperationalException("Dataprovider did not have an exchange attached.")
-        refresh_backtest_ohlcv_data(
-            dp._exchange,
-            pairs=self.all_pairs,
-            timeframes=self.freqai_config["feature_parameters"].get("include_timeframes"),
-            datadir=self.config["datadir"],
-            timerange=timerange,
-            new_pairs_days=new_pairs_days,
-            erase=False,
-            data_format=self.config.get("dataformat_ohlcv", "json"),
-            trading_mode=self.config.get("trading_mode", "spot"),
-            prepend=self.config.get("prepend_data", False),
-        )
-
    def set_all_pairs(self) -> None:

        self.all_pairs = copy.deepcopy(
@@ -1194,3 +1196,50 @@ class FreqaiDataKitchen:
        if self.unique_classes:
            for label in self.unique_classes:
                self.unique_class_list += list(self.unique_classes[label])
+
+    def save_backtesting_prediction(
+        self, append_df: DataFrame
+    ) -> None:
+
+        """
+        Save prediction dataframe from backtesting to h5 file format
+        :param append_df: dataframe for backtesting period
+        """
+        full_predictions_folder = Path(self.full_path / self.backtest_predictions_folder)
+        if not full_predictions_folder.is_dir():
+            full_predictions_folder.mkdir(parents=True, exist_ok=True)
+
+        append_df.to_hdf(self.backtesting_results_path, key='append_df', mode='w')
+
+    def get_backtesting_prediction(
+        self
+    ) -> DataFrame:
+
+        """
+        Get prediction dataframe from h5 file format
+        """
+        append_df = pd.read_hdf(self.backtesting_results_path)
+        return append_df
+
+    def check_if_backtest_prediction_exists(
+        self
+    ) -> bool:
+        """
+        Check if a backtesting prediction already exists
+        :param dk: FreqaiDataKitchen
+        :return:
+        :boolean: whether the prediction file exists or not.
+        """
+        path_to_predictionfile = Path(self.full_path /
+                                      self.backtest_predictions_folder /
+                                      f"{self.model_filename}_prediction.h5")
+        self.backtesting_results_path = path_to_predictionfile
+
+        file_exists = path_to_predictionfile.is_file()
+        if file_exists:
+            logger.info(f"Found backtesting prediction file at {path_to_predictionfile}")
+        else:
+            logger.info(
+                f"Could not find backtesting prediction file at {path_to_predictionfile}"
+            )
+        return file_exists