start collecting indefinite history of predictions. Allow user to generate statistics on these predictions. Direct FreqAI to save these to disk and reload them if available.

2022-07-11 22:01:48 +02:00
parent 3fc92b1b21
commit 8ce6b18318
5 changed files with 109 additions and 39 deletions
--- a/docs/freqai.md
+++ b/docs/freqai.md
@@ -562,6 +562,28 @@ a certain number of hours in age by setting the `expiration_hours` in the config
 In the present example, the user will only allow predictions on models that are less than 1/2 hours
 old. 
 ## Choosing the calculation of the `target_roi`
 As shown in `templates/FreqaiExampleStrategy.py`, the `target_roi` is based on two metrics computed
 by FreqAI: `label_mean` and `label_std`. These are the statistics associated with the labels used 
 *during the most recent training*. This allows the model to know what magnitude of target to
 expect, since it stems directly from the training data. By default, FreqAI computes this based
 on training data and it assumes the labels are Gaussian distributed. These are big assumptions
 that the user should consider when creating their labels. If the user wants to consider the population
 of *historical predictions* for creating the dynamic target instead of the trained labels, the user 
 can do so by setting `fit_live_predictions_candles` to the number of historical prediction candles
 the user wishes to use to generate target statistics. 
 ```json
    "freqai": {
        "fit_live_predictions_candles": 300,
    }
 ```
 If the user sets this value, FreqAI will initially use the predictions from the training data set
 and then subsequently begin introducing real prediction data as it is generated. FreqAI will save 
 this historical data to be reloaded if the user stops and restarts with the same `identifier`.
 <!-- ## Dynamic target expectation
 The labels used for model training have a unique statistical distribution for each separate model training. 
--- a/freqtrade/freqai/data_drawer.py
+++ b/freqtrade/freqai/data_drawer.py
@@ -38,12 +38,14 @@ class FreqaiDataDrawer:
        self.model_return_values: Dict[str, Any] = {}
        self.pair_data_dict: Dict[str, Any] = {}
        self.historic_data: Dict[str, Any] = {}
        self.historic_predictions: Dict[str, Any] = {}
        self.follower_dict: Dict[str, Any] = {}
        self.full_path = full_path
        self.follow_mode = follow_mode
        if follow_mode:
            self.create_follower_dict()
        self.load_drawer_from_disk()
        self.load_historic_predictions_from_disk()
        self.training_queue: Dict[str, int] = {}
        self.history_lock = threading.Lock()
@@ -68,6 +70,29 @@ class FreqaiDataDrawer:
        return exists
    def load_historic_predictions_from_disk(self):
        """
        Locate and load previously saved historic predictions.

        Looks for "historic_predictions.json" inside `self.full_path`. When the file
        exists, its parsed JSON content replaces `self.historic_predictions`; otherwise
        `self.historic_predictions` is left untouched and a message is logged (a warning
        when running in follow mode).
        :returns:
        exists: bool = whether or not the historic predictions file was located
        """
        predictions_file = self.full_path / str("historic_predictions.json")
        exists = Path(predictions_file).resolve().exists()
        if exists:
            with open(predictions_file, "r") as fp:
                # store into historic_predictions, NOT pair_dict: the pair dictionary is
                # a separate structure and must not be overwritten by prediction history
                # NOTE(review): json.load yields plain dicts/lists; confirm that consumers
                # expecting DataFrames convert the loaded values before use
                self.historic_predictions = json.load(fp)
            logger.info(f"Found existing historic predictions at {self.full_path}, but beware of "
                        "that statistics may be inaccurate if the bot has been offline for "
                        "an extended period of time.")
        elif not self.follow_mode:
            logger.info("Could not find existing historic_predictions, starting from scratch")
        else:
            logger.warning(
                f"Follower could not find historic predictions at {self.full_path} "
                "sending null values back to strategy"
            )
        return exists
    def save_drawer_to_disk(self):
        """
        Save data drawer full of all pair model metadata in present model folder.
@@ -75,6 +100,13 @@ class FreqaiDataDrawer:
        with open(self.full_path / str("pair_dictionary.json"), "w") as fp:
            json.dump(self.pair_dict, fp, default=self.np_encoder)
    def save_historic_predictions_to_disk(self):
        """
        Dump `self.historic_predictions` as JSON into "historic_predictions.json" inside
        `self.full_path`, overwriting any previous file. Values the json module cannot
        serialize on its own are handed to `self.np_encoder`.
        """
        # NOTE(review): whether np_encoder can serialize every stored value (e.g. pandas
        # objects) is not visible from here - confirm against its implementation
        destination = self.full_path / str("historic_predictions.json")
        with open(destination, "w") as handle:
            json.dump(self.historic_predictions, handle, default=self.np_encoder)
    def save_follower_dict_to_disk(self):
        """
        Save follower dictionary to disk (used by strategy for persistent prediction targets)
@@ -176,16 +208,18 @@ class FreqaiDataDrawer:
        historical candles, and also stores historical predictions despite retrainings (so stored
        predictions are true predictions, not just inferencing on trained data)
        """
-        self.model_return_values[pair] = pd.DataFrame()
+        # dynamic df returned to strategy and plotted in frequi
        mrv_df = self.model_return_values[pair] = pd.DataFrame()
        for label in dk.label_list:
-            self.model_return_values[pair][label] = pred_df[label]
+            mrv_df[label] = pred_df[label]
-            self.model_return_values[pair][f"{label}_mean"] = dk.data["labels_mean"][label]
+            mrv_df[f"{label}_mean"] = dk.data["labels_mean"][label]
-            self.model_return_values[pair][f"{label}_std"] = dk.data["labels_std"][label]
+            mrv_df[f"{label}_std"] = dk.data["labels_std"][label]
        if self.freqai_info.get("feature_parameters", {}).get("DI_threshold", 0) > 0:
-            self.model_return_values[pair]["DI_values"] = dk.DI_values
+            mrv_df["DI_values"] = dk.DI_values
-        self.model_return_values[pair]["do_predict"] = do_preds
+        mrv_df["do_predict"] = do_preds
    def append_model_predictions(self, pair: str, predictions, do_preds, dk, len_df) -> None:
@@ -201,6 +235,13 @@ class FreqaiDataDrawer:
            i = length_difference + 1
        df = self.model_return_values[pair] = self.model_return_values[pair].shift(-i)
        hp_df = self.historic_predictions[pair]
        # here are some pandas hula hoops to accommodate the possibility of a series
        # or dataframe depending number of labels requested by user
        nan_df = pd.DataFrame(np.nan, index=hp_df.index[-2:] + 2, columns=hp_df.columns)
        hp_df = pd.concat([hp_df, nan_df], ignore_index=True, axis=0)
        hp_df = pd.concat([hp_df, nan_df[-2:-1]], axis=0)
        for label in dk.label_list:
            df[label].iloc[-1] = predictions[label].iloc[-1]
@@ -212,6 +253,9 @@ class FreqaiDataDrawer:
        if self.freqai_info.get("feature_parameters", {}).get("DI_threshold", 0) > 0:
            df["DI_values"].iloc[-1] = dk.DI_values[-1]
        # append the new predictions to persistent storage
        hp_df.iloc[-1] = df[label].iloc[-1]
        if length_difference < 0:
            prepend_df = pd.DataFrame(
                np.zeros((abs(length_difference) - 1, len(df.columns))), columns=df.columns
--- a/freqtrade/freqai/data_kitchen.py
+++ b/freqtrade/freqai/data_kitchen.py
@@ -138,19 +138,6 @@ class FreqaiDataKitchen:
        self.dd.pair_dict[coin]["data_path"] = str(self.data_path)
        self.dd.save_drawer_to_disk()
        # TODO add a helper function to let user save/load any data they are custom adding. We
        # do not want them having to edit the default save/load methods here. Below is an example
        # of what we do NOT want.
        # if self.freqai_config.get('feature_parameters','determine_statistical_distributions'):
        #     self.data_dictionary["upper_quantiles"].to_pickle(
        #         save_path / str(self.model_filename + "_upper_quantiles.pkl")
        #     )
        #     self.data_dictionary["lower_quantiles"].to_pickle(
        #         save_path / str(self.model_filename + "_lower_quantiles.pkl")
        #     )
        return
    def load_data(self, coin: str = "", keras_model=False) -> Any:
@@ -184,22 +171,6 @@ class FreqaiDataKitchen:
            self.data_path / str(self.model_filename + "_trained_df.pkl")
        )
        # TODO add a helper function to let user save/load any data they are custom adding. We
        # do not want them having to edit the default save/load methods here. Below is an example
        # of what we do NOT want.
        # if self.freqai_config.get('feature_parameters','determine_statistical_distributions'):
        #     self.data_dictionary["upper_quantiles"] = pd.read_pickle(
        #         self.data_path / str(self.model_filename + "_upper_quantiles.pkl")
        #     )
        #     self.data_dictionary["lower_quantiles"] = pd.read_pickle(
        #         self.data_path / str(self.model_filename + "_lower_quantiles.pkl")
        #     )
        # self.data_path = Path(self.data["data_path"])
        # self.model_filename = self.data["model_filename"]
        # try to access model in memory instead of loading object from disk to save time
        if self.live and self.model_filename in self.dd.model_dictionary:
            model = self.dd.model_dictionary[self.model_filename]
@@ -207,7 +178,6 @@ class FreqaiDataKitchen:
            model = load(self.data_path / str(self.model_filename + "_model.joblib"))
        else:
            from tensorflow import keras
            model = keras.models.load_model(self.data_path / str(self.model_filename + "_model.h5"))
        if Path(self.data_path / str(self.model_filename + "_svm_model.joblib")).resolve().exists():
@@ -263,7 +233,6 @@ class FreqaiDataKitchen:
            labels,
            weights,
            stratify=stratification,
            # shuffle=False,
            **self.config["freqai"]["data_split_parameters"],
        )
@@ -276,7 +245,6 @@ class FreqaiDataKitchen:
        unfiltered_dataframe: DataFrame,
        training_feature_list: List,
        label_list: List = list(),
        # labels: DataFrame = pd.DataFrame(),
        training_filter: bool = True,
    ) -> Tuple[DataFrame, DataFrame]:
        """
@@ -1135,6 +1103,19 @@ class FreqaiDataKitchen:
        return dataframe
    def fit_live_predictions(self) -> None:
        """
        Fit the most recent historic predictions with a gaussian distribution.

        For every label in `self.label_list`, the last `fit_live_predictions_candles`
        rows (default 100) of `self.dd.historic_predictions[self.pair][label]` are fit
        with a normal distribution. The fitted location and scale are stored in
        `self.data["labels_mean"][label]` and `self.data["labels_std"][label]`.
        Both dictionaries are reset on every call.
        """
        # import the submodule explicitly: a bare `import scipy` is not guaranteed to
        # make `scipy.stats` available as an attribute
        from scipy.stats import norm

        num_candles = self.freqai_config.get('fit_live_predictions_candles', 100)
        self.data["labels_mean"], self.data["labels_std"] = {}, {}
        for label in self.label_list:
            recent = self.dd.historic_predictions[self.pair][label].tail(num_candles)
            mean, std = norm.fit(recent)
            self.data["labels_mean"][label] = mean
            self.data["labels_std"][label] = std
    def fit_labels(self) -> None:
        """
        Fit the labels with a gaussian distribution
--- a/freqtrade/freqai/freqai_interface.py
+++ b/freqtrade/freqai/freqai_interface.py
@@ -1,4 +1,5 @@
 # import contextlib
 import copy
 import datetime
 import gc
 import logging
@@ -484,6 +485,20 @@ class IFreqaiModel(ABC):
            self.dd.purge_old_models()
        # self.retrain = False
    def set_initial_historic_predictions(self, df: DataFrame, model: Any,
                                         dk: FreqaiDataKitchen, pair: str) -> None:
        """
        Seed the historic predictions of `pair` with the model's predictions on `df`.

        The output of `model.predict(df)` is rescaled per label with the linear map that
        sends -1 to `dk.data["labels_min"][label]` and +1 to `dk.data["labels_max"][label]`,
        then stored in `self.dd.historic_predictions[pair]`, replacing any previous entry.
        :params:
        df: features handed to `model.predict`
        model: object exposing `predict(df)`; its output must fit one column per entry
        of `dk.label_list`
        dk: data kitchen providing `label_list` and the label min/max values
        pair: key under which the predictions are stored
        """
        trained_predictions = model.predict(df)
        pred_df = DataFrame(trained_predictions, columns=dk.label_list)
        for label in dk.label_list:
            label_min = dk.data["labels_min"][label]
            label_max = dk.data["labels_max"][label]
            # presumably undoes a [-1, 1] label normalization - confirm against the
            # normalization applied during training
            pred_df[label] = ((pred_df[label] + 1) * (label_max - label_min) / 2) + label_min

        # store an independent copy; no placeholder frame is needed beforehand
        self.dd.historic_predictions[pair] = copy.deepcopy(pred_df)
    # The following methods are overridden by user-made prediction models.
    # See freqai/prediction_models/CatboostPredictionModel.py for an example.
--- a/freqtrade/freqai/prediction_models/BaseRegressionModel.py
+++ b/freqtrade/freqai/prediction_models/BaseRegressionModel.py
@@ -51,7 +51,8 @@ class BaseRegressionModel(IFreqaiModel):
        # split data into train/test data.
        data_dictionary = dk.make_train_test_datasets(features_filtered, labels_filtered)
-        dk.fit_labels()  # fit labels to a cauchy distribution so we know what to expect in strategy
+        if not self.freqai_info.get('fit_live_predictions', 0):
            dk.fit_labels()
        # normalize all data based on train_dataset only
        data_dictionary = dk.normalize_data(data_dictionary)
@@ -65,6 +66,13 @@ class BaseRegressionModel(IFreqaiModel):
        model = self.fit(data_dictionary)
        if pair not in self.dd.historic_predictions:
            self.set_initial_historic_predictions(
                data_dictionary['train_features'], model, dk, pair)
        elif self.freqai_info.get('fit_live_predictions_candles', 0):
            dk.fit_live_predictions()
            self.dd.save_historic_predictions_to_disk()
        logger.info(f"--------------------done training {pair}--------------------")
        return model