Merge pull request #7465 from freqtrade/aggregate-fixes-fai

Aggregate recent feature requests
This commit is contained in:
Robert Caulk 2022-09-25 20:43:23 +02:00 committed by GitHub
commit 8051235171
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 53 additions and 29 deletions

View File

@ -109,8 +109,8 @@ Mandatory parameters are marked as **Required**, which means that they are requi
| `indicator_max_period_candles` | **No longer used**. User must use the strategy set `startup_candle_count` which defines the maximum *period* used in `populate_any_indicators()` for indicator creation (timeframe independent). FreqAI uses this information in combination with the maximum timeframe to calculate how many data points it should download so that the first data point does not have a NaN <br> **Datatype:** positive integer. | `indicator_max_period_candles` | **No longer used**. User must use the strategy set `startup_candle_count` which defines the maximum *period* used in `populate_any_indicators()` for indicator creation (timeframe independent). FreqAI uses this information in combination with the maximum timeframe to calculate how many data points it should download so that the first data point does not have a NaN <br> **Datatype:** positive integer.
| `indicator_periods_candles` | Calculate indicators for `indicator_periods_candles` time periods and add them to the feature set. <br> **Datatype:** List of positive integers. | `indicator_periods_candles` | Calculate indicators for `indicator_periods_candles` time periods and add them to the feature set. <br> **Datatype:** List of positive integers.
| `stratify_training_data` | This value is used to indicate the grouping of the data. For example, 2 would set every 2nd data point into a separate dataset to be pulled from during training/testing. See details about how it works [here](#stratifying-the-data-for-training-and-testing-the-model) <br> **Datatype:** Positive integer. | `stratify_training_data` | This value is used to indicate the grouping of the data. For example, 2 would set every 2nd data point into a separate dataset to be pulled from during training/testing. See details about how it works [here](#stratifying-the-data-for-training-and-testing-the-model) <br> **Datatype:** Positive integer.
| `principal_component_analysis` | Automatically reduce the dimensionality of the data set using Principal Component Analysis. See details about how it works [here](#reducing-data-dimensionality-with-principal-component-analysis) | `principal_component_analysis` | Automatically reduce the dimensionality of the data set using Principal Component Analysis. See details about how it works [here](#reducing-data-dimensionality-with-principal-component-analysis) <br> **Datatype:** Boolean. defaults to `false`.
| `plot_feature_importance` | Create an interactive feature importance plot for each model.<br> **Datatype:** Boolean.<br> **Datatype:** Boolean, defaults to `False` | `plot_feature_importances` | Create a feature importance plot for each model for the top/bottom `plot_feature_importances` number of features.<br> **Datatype:** Integer, defaults to `0`.
| `DI_threshold` | Activates the Dissimilarity Index for outlier detection when > 0. See details about how it works [here](#removing-outliers-with-the-dissimilarity-index). <br> **Datatype:** Positive float (typically < 1). | `DI_threshold` | Activates the Dissimilarity Index for outlier detection when > 0. See details about how it works [here](#removing-outliers-with-the-dissimilarity-index). <br> **Datatype:** Positive float (typically < 1).
| `use_SVM_to_remove_outliers` | Train a support vector machine to detect and remove outliers from the training data set, as well as from incoming data points. See details about how it works [here](#removing-outliers-using-a-support-vector-machine-svm). <br> **Datatype:** Boolean. | `use_SVM_to_remove_outliers` | Train a support vector machine to detect and remove outliers from the training data set, as well as from incoming data points. See details about how it works [here](#removing-outliers-using-a-support-vector-machine-svm). <br> **Datatype:** Boolean.
| `svm_params` | All parameters available in Sklearn's `SGDOneClassSVM()`. See details about some select parameters [here](#removing-outliers-using-a-support-vector-machine-svm). <br> **Datatype:** Dictionary. | `svm_params` | All parameters available in Sklearn's `SGDOneClassSVM()`. See details about some select parameters [here](#removing-outliers-using-a-support-vector-machine-svm). <br> **Datatype:** Dictionary.

View File

@ -313,6 +313,7 @@ class FreqaiDataDrawer:
""" """
dk.find_features(dataframe) dk.find_features(dataframe)
dk.find_labels(dataframe)
full_labels = dk.label_list + dk.unique_class_list full_labels = dk.label_list + dk.unique_class_list
@ -376,7 +377,27 @@ class FreqaiDataDrawer:
if self.config.get("freqai", {}).get("purge_old_models", False): if self.config.get("freqai", {}).get("purge_old_models", False):
self.purge_old_models() self.purge_old_models()
# Functions pulled back from FreqaiDataKitchen because they relied on DataDrawer def save_metadata(self, dk: FreqaiDataKitchen) -> None:
"""
Saves only metadata for backtesting studies if user prefers
not to save model data. This saves tremendous amounts of space
for users generating huge studies.
This is only active when `save_backtest_models`: false (not default)
"""
if not dk.data_path.is_dir():
dk.data_path.mkdir(parents=True, exist_ok=True)
save_path = Path(dk.data_path)
dk.data["data_path"] = str(dk.data_path)
dk.data["model_filename"] = str(dk.model_filename)
dk.data["training_features_list"] = list(dk.data_dictionary["train_features"].columns)
dk.data["label_list"] = dk.label_list
with open(save_path / f"{dk.model_filename}_metadata.json", "w") as fp:
rapidjson.dump(dk.data, fp, default=self.np_encoder, number_mode=rapidjson.NM_NATIVE)
return
def save_data(self, model: Any, coin: str, dk: FreqaiDataKitchen) -> None: def save_data(self, model: Any, coin: str, dk: FreqaiDataKitchen) -> None:
""" """

View File

@ -831,7 +831,7 @@ class FreqaiDataKitchen:
inlier_metric = pd.DataFrame( inlier_metric = pd.DataFrame(
data=inliers.sum(axis=1) / no_prev_pts, data=inliers.sum(axis=1) / no_prev_pts,
columns=['inlier_metric'], columns=['%-inlier_metric'],
index=compute_df.index index=compute_df.index
) )
@ -881,11 +881,14 @@ class FreqaiDataKitchen:
""" """
column_names = dataframe.columns column_names = dataframe.columns
features = [c for c in column_names if "%" in c] features = [c for c in column_names if "%" in c]
labels = [c for c in column_names if "&" in c]
if not features: if not features:
raise OperationalException("Could not find any features!") raise OperationalException("Could not find any features!")
self.training_features_list = features self.training_features_list = features
def find_labels(self, dataframe: DataFrame) -> None:
column_names = dataframe.columns
labels = [c for c in column_names if "&" in c]
self.label_list = labels self.label_list = labels
def check_if_pred_in_training_spaces(self) -> None: def check_if_pred_in_training_spaces(self) -> None:
@ -1206,7 +1209,8 @@ class FreqaiDataKitchen:
def get_unique_classes_from_labels(self, dataframe: DataFrame) -> None: def get_unique_classes_from_labels(self, dataframe: DataFrame) -> None:
self.find_features(dataframe) # self.find_features(dataframe)
self.find_labels(dataframe)
for key in self.label_list: for key in self.label_list:
if dataframe[key].dtype == object: if dataframe[key].dtype == object:

View File

@ -92,6 +92,7 @@ class IFreqaiModel(ABC):
self.begin_time_train: float = 0 self.begin_time_train: float = 0
self.base_tf_seconds = timeframe_to_seconds(self.config['timeframe']) self.base_tf_seconds = timeframe_to_seconds(self.config['timeframe'])
self.continual_learning = self.freqai_info.get('continual_learning', False) self.continual_learning = self.freqai_info.get('continual_learning', False)
self.plot_features = self.ft_params.get("plot_feature_importances", 0)
self._threads: List[threading.Thread] = [] self._threads: List[threading.Thread] = []
self._stop_event = threading.Event() self._stop_event = threading.Event()
@ -278,22 +279,24 @@ class IFreqaiModel(ABC):
append_df = dk.get_backtesting_prediction() append_df = dk.get_backtesting_prediction()
dk.append_predictions(append_df) dk.append_predictions(append_df)
else: else:
if not self.model_exists( if not self.model_exists(dk):
pair, dk, trained_timestamp=trained_timestamp_int
):
dk.find_features(dataframe_train) dk.find_features(dataframe_train)
dk.find_labels(dataframe_train)
self.model = self.train(dataframe_train, pair, dk) self.model = self.train(dataframe_train, pair, dk)
self.dd.pair_dict[pair]["trained_timestamp"] = int( self.dd.pair_dict[pair]["trained_timestamp"] = int(
trained_timestamp.stopts) trained_timestamp.stopts)
if self.plot_features:
plot_feature_importance(self.model, pair, dk, self.plot_features)
if self.save_backtest_models: if self.save_backtest_models:
logger.info('Saving backtest model to disk.') logger.info('Saving backtest model to disk.')
self.dd.save_data(self.model, pair, dk) self.dd.save_data(self.model, pair, dk)
else:
logger.info('Saving metadata to disk.')
self.dd.save_metadata(dk)
else: else:
self.model = self.dd.load_data(pair, dk) self.model = self.dd.load_data(pair, dk)
self.check_if_feature_list_matches_strategy(dataframe_train, dk) # self.check_if_feature_list_matches_strategy(dataframe_train, dk)
pred_df, do_preds = self.predict(dataframe_backtest, dk) pred_df, do_preds = self.predict(dataframe_backtest, dk)
append_df = dk.get_predictions_to_append(pred_df, do_preds) append_df = dk.get_predictions_to_append(pred_df, do_preds)
dk.append_predictions(append_df) dk.append_predictions(append_df)
@ -372,8 +375,7 @@ class IFreqaiModel(ABC):
self.dd.return_null_values_to_strategy(dataframe, dk) self.dd.return_null_values_to_strategy(dataframe, dk)
return dk return dk
# ensure user is feeding the correct indicators to the model dk.find_labels(dataframe)
self.check_if_feature_list_matches_strategy(dataframe, dk)
self.build_strategy_return_arrays(dataframe, dk, metadata["pair"], trained_timestamp) self.build_strategy_return_arrays(dataframe, dk, metadata["pair"], trained_timestamp)
@ -492,7 +494,7 @@ class IFreqaiModel(ABC):
if ft_params.get( if ft_params.get(
"principal_component_analysis", False "principal_component_analysis", False
): ):
dk.pca_transform(self.dk.data_dictionary['prediction_features']) dk.pca_transform(dk.data_dictionary['prediction_features'])
if ft_params.get("use_SVM_to_remove_outliers", False): if ft_params.get("use_SVM_to_remove_outliers", False):
dk.use_SVM_to_remove_outliers(predict=True) dk.use_SVM_to_remove_outliers(predict=True)
@ -503,14 +505,10 @@ class IFreqaiModel(ABC):
if ft_params.get("use_DBSCAN_to_remove_outliers", False): if ft_params.get("use_DBSCAN_to_remove_outliers", False):
dk.use_DBSCAN_to_remove_outliers(predict=True) dk.use_DBSCAN_to_remove_outliers(predict=True)
def model_exists( # ensure user is feeding the correct indicators to the model
self, self.check_if_feature_list_matches_strategy(dk.data_dictionary['prediction_features'], dk)
pair: str,
dk: FreqaiDataKitchen, def model_exists(self, dk: FreqaiDataKitchen) -> bool:
trained_timestamp: int = None,
model_filename: str = "",
scanning: bool = False,
) -> bool:
""" """
Given a pair and path, check if a model already exists Given a pair and path, check if a model already exists
:param pair: pair e.g. BTC/USD :param pair: pair e.g. BTC/USD
@ -518,11 +516,11 @@ class IFreqaiModel(ABC):
:return: :return:
:boolean: whether the model file exists or not. :boolean: whether the model file exists or not.
""" """
path_to_modelfile = Path(dk.data_path / f"{model_filename}_model.joblib") path_to_modelfile = Path(dk.data_path / f"{dk.model_filename}_model.joblib")
file_exists = path_to_modelfile.is_file() file_exists = path_to_modelfile.is_file()
if file_exists and not scanning: if file_exists:
logger.info("Found model at %s", dk.data_path / dk.model_filename) logger.info("Found model at %s", dk.data_path / dk.model_filename)
elif not scanning: else:
logger.info("Could not find model at %s", dk.data_path / dk.model_filename) logger.info("Could not find model at %s", dk.data_path / dk.model_filename)
return file_exists return file_exists
@ -569,6 +567,7 @@ class IFreqaiModel(ABC):
# find the features indicated by strategy and store in datakitchen # find the features indicated by strategy and store in datakitchen
dk.find_features(unfiltered_dataframe) dk.find_features(unfiltered_dataframe)
dk.find_labels(unfiltered_dataframe)
model = self.train(unfiltered_dataframe, pair, dk) model = self.train(unfiltered_dataframe, pair, dk)
@ -576,8 +575,8 @@ class IFreqaiModel(ABC):
dk.set_new_model_names(pair, new_trained_timerange) dk.set_new_model_names(pair, new_trained_timerange)
self.dd.save_data(model, pair, dk) self.dd.save_data(model, pair, dk)
if self.freqai_info["feature_parameters"].get("plot_feature_importance", False): if self.plot_features:
plot_feature_importance(model, pair, dk) plot_feature_importance(model, pair, dk, self.plot_features)
if self.freqai_info.get("purge_old_models", False): if self.freqai_info.get("purge_old_models", False):
self.dd.purge_old_models() self.dd.purge_old_models()

View File

@ -170,7 +170,7 @@ def plot_feature_importance(model: Any, pair: str, dk: FreqaiDataKitchen,
# Data preparation # Data preparation
fi_df = pd.DataFrame({ fi_df = pd.DataFrame({
"feature_names": np.array(dk.training_features_list), "feature_names": np.array(dk.data_dictionary['train_features'].columns),
"feature_importance": np.array(feature_importance) "feature_importance": np.array(feature_importance)
}) })
fi_df_top = fi_df.nlargest(count_max, "feature_importance")[::-1] fi_df_top = fi_df.nlargest(count_max, "feature_importance")[::-1]