start collecting indefinite history of predictions. Allow user to generate statistics on these predictions. Direct FreqAI to save these to disk and reload them if available.

This commit is contained in:
Robert Caulk 2022-07-11 22:01:48 +02:00
parent 3fc92b1b21
commit 8ce6b18318
5 changed files with 109 additions and 39 deletions

View File

@@ -562,6 +562,28 @@ a certain number of hours in age by setting the `expiration_hours` in the config
In the present example, the user will only allow predictions on models that are less than 1/2 hours
old.
## Choosing the calculation of the `target_roi`
As shown in `templates/FreqaiExampleStrategy.py`, the `target_roi` is based on two metrics computed
by FreqAI: `label_mean` and `label_std`. These are the statistics associated with the labels used
*during the most recent training*. This allows the user to know what magnitude of a target to
expect, since it stems directly from the training data. By default, FreqAI computes these statistics
from the training data and assumes the labels are Gaussian distributed. These are big assumptions
that the user should consider when creating their labels. If the user wants the dynamic target to be
based on the population of *historical predictions* instead of the trained labels, they can do so by
setting `fit_live_prediction_candles` to the number of historical prediction candles to use when
generating the target statistics.
```json
"freqai": {
"fit_live_prediction_candles": 300,
}
```
If the user sets this value, FreqAI will initially use the predictions from the training data set
and will subsequently begin introducing real prediction data as it is generated. FreqAI will save
this historical data to be reloaded if the user stops and restarts with the same `identifier`.
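
For illustration, a minimal sketch (not the template strategy itself) of how a strategy could turn
the returned statistics into a dynamic target. The label name `&-s_close` and the 1.25 multiplier
are assumptions for this sketch, not values prescribed by FreqAI:

```python
import pandas as pd


def set_dynamic_target(dataframe: pd.DataFrame, label: str = "&-s_close") -> pd.DataFrame:
    # FreqAI returns f"{label}_mean" and f"{label}_std" columns alongside each prediction;
    # here the entry threshold is placed 1.25 standard deviations above the mean label value.
    dataframe["target_roi"] = dataframe[f"{label}_mean"] + dataframe[f"{label}_std"] * 1.25
    return dataframe
```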
<!-- ## Dynamic target expectation
The labels used for model training have a unique statistical distribution for each separate model training.

View File

@@ -38,12 +38,14 @@ class FreqaiDataDrawer:
        self.model_return_values: Dict[str, Any] = {}
        self.pair_data_dict: Dict[str, Any] = {}
        self.historic_data: Dict[str, Any] = {}
        self.historic_predictions: Dict[str, Any] = {}
        self.follower_dict: Dict[str, Any] = {}
        self.full_path = full_path
        self.follow_mode = follow_mode
        if follow_mode:
            self.create_follower_dict()
        self.load_drawer_from_disk()
        self.load_historic_predictions_from_disk()
        self.training_queue: Dict[str, int] = {}
        self.history_lock = threading.Lock()
@@ -68,6 +70,29 @@ class FreqaiDataDrawer:
        return exists
    def load_historic_predictions_from_disk(self):
        """
        Locate and load previously saved historic predictions.
        :returns:
        exists: bool = whether or not the historic predictions were located on disk
        """
        exists = Path(self.full_path / str("historic_predictions.json")).resolve().exists()
        if exists:
            with open(self.full_path / str("historic_predictions.json"), "r") as fp:
                self.historic_predictions = json.load(fp)
            logger.info(f"Found existing historic predictions at {self.full_path}, but beware "
                        "that statistics may be inaccurate if the bot has been offline for "
                        "an extended period of time.")
        elif not self.follow_mode:
            logger.info("Could not find existing historic_predictions, starting from scratch")
        else:
            logger.warning(
                f"Follower could not find historic predictions at {self.full_path} "
                "sending null values back to strategy"
            )

        return exists
    def save_drawer_to_disk(self):
        """
        Save data drawer full of all pair model metadata in present model folder.
@@ -75,6 +100,13 @@
        with open(self.full_path / str("pair_dictionary.json"), "w") as fp:
            json.dump(self.pair_dict, fp, default=self.np_encoder)
    def save_historic_predictions_to_disk(self):
        """
        Save historic predictions for all pairs to disk in the present model folder.
        """
        with open(self.full_path / str("historic_predictions.json"), "w") as fp:
            json.dump(self.historic_predictions, fp, default=self.np_encoder)
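
As an aside, a standalone sketch of the JSON round trip these two methods perform with
`historic_predictions.json`. The path, pair, and label names below are illustrative toy values,
not part of this commit:

```python
import json
from pathlib import Path

full_path = Path("user_data/models/example-identifier")  # hypothetical identifier path
full_path.mkdir(parents=True, exist_ok=True)

# toy stand-in for the per-pair prediction history kept by the data drawer
historic_predictions = {"BTC/USDT": {"&-s_close": [0.012, -0.004, 0.007]}}

with open(full_path / "historic_predictions.json", "w") as fp:
    json.dump(historic_predictions, fp)

if (full_path / "historic_predictions.json").resolve().exists():
    with open(full_path / "historic_predictions.json", "r") as fp:
        reloaded = json.load(fp)
    print(reloaded["BTC/USDT"]["&-s_close"])  # [0.012, -0.004, 0.007]
```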
    def save_follower_dict_to_disk(self):
        """
        Save follower dictionary to disk (used by strategy for persistent prediction targets)
@@ -176,16 +208,18 @@
        historical candles, and also stores historical predictions despite retrainings (so stored
        predictions are true predictions, not just inferencing on trained data)
        """

        # dynamic df returned to strategy and plotted in frequi
        mrv_df = self.model_return_values[pair] = pd.DataFrame()

        for label in dk.label_list:
            mrv_df[label] = pred_df[label]
            mrv_df[f"{label}_mean"] = dk.data["labels_mean"][label]
            mrv_df[f"{label}_std"] = dk.data["labels_std"][label]

        if self.freqai_info.get("feature_parameters", {}).get("DI_threshold", 0) > 0:
            mrv_df["DI_values"] = dk.DI_values

        mrv_df["do_predict"] = do_preds

    def append_model_predictions(self, pair: str, predictions, do_preds, dk, len_df) -> None:
@@ -201,6 +235,13 @@
        i = length_difference + 1
        df = self.model_return_values[pair] = self.model_return_values[pair].shift(-i)

        hp_df = self.historic_predictions[pair]
        # here are some pandas hula hoops to accommodate the possibility of a series
        # or dataframe depending on the number of labels requested by the user
        nan_df = pd.DataFrame(np.nan, index=hp_df.index[-2:] + 2, columns=hp_df.columns)
        hp_df = pd.concat([hp_df, nan_df], ignore_index=True, axis=0)
        hp_df = pd.concat([hp_df, nan_df[-2:-1]], axis=0)

        for label in dk.label_list:
            df[label].iloc[-1] = predictions[label].iloc[-1]
@@ -212,6 +253,9 @@
        if self.freqai_info.get("feature_parameters", {}).get("DI_threshold", 0) > 0:
            df["DI_values"].iloc[-1] = dk.DI_values[-1]

        # append the new predictions to persistent storage
        hp_df.iloc[-1] = df[label].iloc[-1]

        if length_difference < 0:
            prepend_df = pd.DataFrame(
                np.zeros((abs(length_difference) - 1, len(df.columns))), columns=df.columns
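
A simplified toy sketch of the append pattern used in `append_model_predictions` above (without the
exact index slicing): grow the persistent prediction frame by one empty row, then overwrite that row
with the newest prediction. Values and column names are illustrative:

```python
import numpy as np
import pandas as pd

hp_df = pd.DataFrame({"&-s_close": [0.010, 0.020]})       # existing historic predictions (toy)
nan_row = pd.DataFrame(np.nan, index=[0], columns=hp_df.columns)
hp_df = pd.concat([hp_df, nan_row], ignore_index=True)    # one new empty slot for this candle
hp_df.iloc[-1] = 0.015                                     # newest prediction fills the slot
print(hp_df)
```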

View File

@@ -138,19 +138,6 @@ class FreqaiDataKitchen:
        self.dd.pair_dict[coin]["data_path"] = str(self.data_path)
        self.dd.save_drawer_to_disk()

        # TODO add a helper function to let user save/load any data they are custom adding. We
        # do not want them having to edit the default save/load methods here. Below is an example
        # of what we do NOT want.
        # if self.freqai_config.get('feature_parameters','determine_statistical_distributions'):
        #     self.data_dictionary["upper_quantiles"].to_pickle(
        #         save_path / str(self.model_filename + "_upper_quantiles.pkl")
        #     )
        #     self.data_dictionary["lower_quantiles"].to_pickle(
        #         save_path / str(self.model_filename + "_lower_quantiles.pkl")
        #     )

        return

    def load_data(self, coin: str = "", keras_model=False) -> Any:
@@ -184,22 +171,6 @@ class FreqaiDataKitchen:
            self.data_path / str(self.model_filename + "_trained_df.pkl")
        )
        # TODO add a helper function to let user save/load any data they are custom adding. We
        # do not want them having to edit the default save/load methods here. Below is an example
        # of what we do NOT want.
        # if self.freqai_config.get('feature_parameters','determine_statistical_distributions'):
        #     self.data_dictionary["upper_quantiles"] = pd.read_pickle(
        #         self.data_path / str(self.model_filename + "_upper_quantiles.pkl")
        #     )
        #     self.data_dictionary["lower_quantiles"] = pd.read_pickle(
        #         self.data_path / str(self.model_filename + "_lower_quantiles.pkl")
        #     )
        # self.data_path = Path(self.data["data_path"])
        # self.model_filename = self.data["model_filename"]

        # try to access model in memory instead of loading object from disk to save time
        if self.live and self.model_filename in self.dd.model_dictionary:
            model = self.dd.model_dictionary[self.model_filename]
@@ -207,7 +178,6 @@
            model = load(self.data_path / str(self.model_filename + "_model.joblib"))
        else:
            from tensorflow import keras
            model = keras.models.load_model(self.data_path / str(self.model_filename + "_model.h5"))

        if Path(self.data_path / str(self.model_filename + "_svm_model.joblib")).resolve().exists():
@@ -263,7 +233,6 @@
            labels,
            weights,
            stratify=stratification,
            # shuffle=False,
            **self.config["freqai"]["data_split_parameters"],
        )
@@ -276,7 +245,6 @@
        unfiltered_dataframe: DataFrame,
        training_feature_list: List,
        label_list: List = list(),
        # labels: DataFrame = pd.DataFrame(),
        training_filter: bool = True,
    ) -> Tuple[DataFrame, DataFrame]:
        """
@@ -1135,6 +1103,19 @@
        return dataframe
    def fit_live_predictions(self) -> None:
        """
        Fit the recent historic predictions with a gaussian distribution
        """
        import scipy as spy

        num_candles = self.freqai_config.get('fit_live_prediction_candles', 100)
        self.data["labels_mean"], self.data["labels_std"] = {}, {}
        for label in self.label_list:
            f = spy.stats.norm.fit(
                self.dd.historic_predictions[self.pair][label].tail(num_candles))
            self.data["labels_mean"][label], self.data["labels_std"][label] = f[0], f[1]

        return
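
A standalone illustration (toy data) of the statistic this method computes: `scipy.stats.norm.fit`
returns the maximum-likelihood mean and standard deviation of the most recent predictions:

```python
import numpy as np
import pandas as pd
from scipy.stats import norm

# toy prediction history standing in for self.dd.historic_predictions[pair][label]
predictions = pd.Series(np.random.default_rng(42).normal(0.002, 0.01, 500))
num_candles = 300  # analogous to "fit_live_prediction_candles": 300
labels_mean, labels_std = norm.fit(predictions.tail(num_candles))
print(f"labels_mean={labels_mean:.5f}, labels_std={labels_std:.5f}")
```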
    def fit_labels(self) -> None:
        """
        Fit the labels with a gaussian distribution

View File

@@ -1,4 +1,5 @@
# import contextlib
import copy
import datetime
import gc
import logging
@@ -484,6 +485,20 @@ class IFreqaiModel(ABC):
        self.dd.purge_old_models()
        # self.retrain = False
    def set_initial_historic_predictions(self, df: DataFrame, model: Any,
                                         dk: FreqaiDataKitchen, pair: str) -> None:
        trained_predictions = model.predict(df)
        pred_df = DataFrame(trained_predictions, columns=dk.label_list)

        for label in dk.label_list:
            pred_df[label] = (
                (pred_df[label] + 1)
                * (dk.data["labels_max"][label] - dk.data["labels_min"][label])
                / 2
            ) + dk.data["labels_min"][label]

        self.dd.historic_predictions[pair] = pd.DataFrame()
        self.dd.historic_predictions[pair] = copy.deepcopy(pred_df)
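
A short numeric check (toy bounds) of the inverse transform in the loop above, assuming the labels
were min-max normalized into the range [-1, 1] during training (which is what the transform implies):

```python
# normalized = 2 * (raw - labels_min) / (labels_max - labels_min) - 1
# raw        = (normalized + 1) * (labels_max - labels_min) / 2 + labels_min
labels_min, labels_max = -0.05, 0.05  # assumed label bounds from training (toy values)
raw = 0.02
normalized = 2 * (raw - labels_min) / (labels_max - labels_min) - 1      # approx 0.4
recovered = (normalized + 1) * (labels_max - labels_min) / 2 + labels_min
assert abs(recovered - raw) < 1e-12                                       # round trip recovers 0.02
```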
    # Following methods which are overridden by user made prediction models.
    # See freqai/prediction_models/CatboostPredictionModel.py for an example.

View File

@@ -51,7 +51,8 @@ class BaseRegressionModel(IFreqaiModel):
        # split data into train/test data.
        data_dictionary = dk.make_train_test_datasets(features_filtered, labels_filtered)
        if not self.freqai_info.get('fit_live_prediction_candles', 0):
            dk.fit_labels()

        # normalize all data based on train_dataset only
        data_dictionary = dk.normalize_data(data_dictionary)
@@ -65,6 +66,13 @@
        model = self.fit(data_dictionary)

        if pair not in self.dd.historic_predictions:
            self.set_initial_historic_predictions(
                data_dictionary['train_features'], model, dk, pair)
        elif self.freqai_info.get('fit_live_prediction_candles', 0):
            dk.fit_live_predictions()

        self.dd.save_historic_predictions_to_disk()

        logger.info(f"--------------------done training {pair}--------------------")

        return model