Aggregated commit: add a support vector machine for outlier detection, improve the dry/live user interface, improve standardization, and fix various other bugs
parent c5ecf94177
commit 42d95af829
@@ -57,8 +57,8 @@
     "train_period": 30,
     "backtest_period": 7,
     "identifier": "example",
-    "live_trained_timerange": "20220330-20220429",
-    "live_full_backtestrange": "20220302-20220501",
+    "live_trained_timerange": "",
+    "live_full_backtestrange": "",
     "corr_pairlist": [
       "BTC/USDT",
       "ETH/USDT",
@@ -68,20 +68,19 @@
     "feature_parameters": {
       "period": 12,
       "shift": 1,
-      "drop_features": false,
       "DI_threshold": 1,
       "weight_factor": 0,
       "principal_component_analysis": false,
-      "remove_outliers": false
+      "use_SVM_to_remove_outliers": false
     },
     "data_split_parameters": {
       "test_size": 0.25,
       "random_state": 1
     },
     "model_training_parameters": {
-      "n_estimators": 2000,
+      "n_estimators": 1000,
       "random_state": 1,
-      "learning_rate": 0.02,
+      "learning_rate": 0.1,
       "task_type": "CPU"
     }
   },
@@ -331,21 +331,21 @@ Users can reduce the dimensionality of their features by activating the `princip
 Which will perform PCA on the features and reduce the dimensionality of the data so that the explained
 variance of the data set is >= 0.999.
 
-### Removing outliers based on feature statistical distributions
+### Removing outliers using a Support Vector Machine (SVM)
 
 The user can tell Freqai to remove outlier data points from the training/test data sets by setting:
 
 ```json
     "freqai": {
         "feature_parameters" : {
-            "remove_outliers": true
+            "use_SVM_to_remove_outliers": true
         }
     }
 ```
 
-Freqai will check the statistical distributions of each feature (or component if the user activated
-`principal_component_analysis`) and remove any data point that sits more than 3 standard deviations away
-from the mean.
+Freqai will train an SVM on the training data (or components if the user activated
+`principal_component_analysis`) and remove any data point that it deems to sit beyond the
+feature space.
 
 ## Additional information
 
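For readers skimming the diff: the feature documented above boils down to fitting a one-class SVM on the (already scaled) training features and dropping the rows it labels as outliers. The snippet below is a minimal, self-contained sketch of that idea using the same `SGDOneClassSVM` estimator the commit adopts; the synthetic data, seed, and variable names are illustrative only and are not part of Freqai.

```python
# Standalone sketch (not Freqai code) of SVM-based outlier removal:
# fit a one-class SVM on scaled training features, keep only rows it labels +1.
import numpy as np
from sklearn.linear_model import SGDOneClassSVM

rng = np.random.default_rng(42)
train_features = rng.normal(size=(1000, 10))   # stand-in for data_dictionary["train_features"]
train_features[:5] += 25                       # inject a few obvious outliers

svm = SGDOneClassSVM(nu=0.1).fit(train_features)
y_pred = svm.predict(train_features)           # +1 = inlier, -1 = outlier

train_features = train_features[y_pred == 1]   # drop the flagged rows, as the new code does
print(f"tossed {int((y_pred == -1).sum())} train points from {len(y_pred)}")
```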
@@ -10,8 +10,9 @@ from typing import Any, Dict, List, Tuple
 import numpy as np
 import numpy.typing as npt
 import pandas as pd
-from joblib import dump, load
+from joblib import dump, load  # , Parallel, delayed  # used for auto distribution assignment
 from pandas import DataFrame
+from sklearn import linear_model
 from sklearn.metrics.pairwise import pairwise_distances
 from sklearn.model_selection import train_test_split
 
@@ -22,6 +23,9 @@ from freqtrade.resolvers import ExchangeResolver
 from freqtrade.strategy.interface import IStrategy
 
 
+# import scipy as spy  # used for auto distribution assignment
+
+
 SECONDS_IN_DAY = 86400
 
 logger = logging.getLogger(__name__)
@@ -52,6 +56,7 @@ class FreqaiDataKitchen:
         self.model_filename: str = ""
         self.model_dictionary: Dict[Any, Any] = {}
         self.live = live
+        self.svm_model: linear_model.SGDOneClassSVM = None
         if not self.live:
             self.full_timerange = self.create_fulltimerange(self.config["timerange"],
                                                             self.freqai_config["train_period"]
@@ -89,6 +94,10 @@ class FreqaiDataKitchen:
 
         # Save the trained model
         dump(model, save_path / str(self.model_filename + "_model.joblib"))
+
+        if self.svm_model is not None:
+            dump(self.svm_model, save_path / str(self.model_filename + "_svm_model.joblib"))
+
         self.data["model_path"] = str(self.model_path)
         self.data["model_filename"] = str(self.model_filename)
         self.data["training_features_list"] = list(self.data_dictionary["train_features"].columns)
@ -104,6 +113,19 @@ class FreqaiDataKitchen:
|
|||||||
if self.live:
|
if self.live:
|
||||||
self.model_dictionary[self.model_filename] = model
|
self.model_dictionary[self.model_filename] = model
|
||||||
|
|
||||||
|
# TODO add a helper function to let user save/load any data they are custom adding. We
|
||||||
|
# do not want them having to edit the default save/load methods here. Below is an example
|
||||||
|
# of what we do NOT want.
|
||||||
|
|
||||||
|
# if self.freqai_config['feature_parameters']['determine_statistical_distributions']:
|
||||||
|
# self.data_dictionary["upper_quantiles"].to_pickle(
|
||||||
|
# save_path / str(self.model_filename + "_upper_quantiles.pkl")
|
||||||
|
# )
|
||||||
|
|
||||||
|
# self.data_dictionary["lower_quantiles"].to_pickle(
|
||||||
|
# save_path / str(self.model_filename + "_lower_quantiles.pkl")
|
||||||
|
# )
|
||||||
|
|
||||||
return
|
return
|
||||||
|
|
||||||
def load_data(self) -> Any:
|
def load_data(self) -> Any:
|
||||||
@ -121,6 +143,19 @@ class FreqaiDataKitchen:
|
|||||||
self.model_path / str(self.model_filename + "_trained_df.pkl")
|
self.model_path / str(self.model_filename + "_trained_df.pkl")
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# TODO add a helper function to let user save/load any data they are custom adding. We
|
||||||
|
# do not want them having to edit the default save/load methods here. Below is an example
|
||||||
|
# of what we do NOT want.
|
||||||
|
|
||||||
|
# if self.freqai_config['feature_parameters']['determine_statistical_distributions']:
|
||||||
|
# self.data_dictionary["upper_quantiles"] = pd.read_pickle(
|
||||||
|
# self.model_path / str(self.model_filename + "_upper_quantiles.pkl")
|
||||||
|
# )
|
||||||
|
|
||||||
|
# self.data_dictionary["lower_quantiles"] = pd.read_pickle(
|
||||||
|
# self.model_path / str(self.model_filename + "_lower_quantiles.pkl")
|
||||||
|
# )
|
||||||
|
|
||||||
self.model_path = Path(self.data["model_path"])
|
self.model_path = Path(self.data["model_path"])
|
||||||
self.model_filename = self.data["model_filename"]
|
self.model_filename = self.data["model_filename"]
|
||||||
|
|
||||||
@@ -130,6 +165,10 @@ class FreqaiDataKitchen:
         else:
             model = load(self.model_path / str(self.model_filename + "_model.joblib"))
 
+            if Path(self.model_path / str(self.model_filename +
+                    "_svm_model.joblib")).resolve().exists():
+                self.svm_model = load(self.model_path / str(self.model_filename + "_svm_model.joblib"))
+
         assert model, (
             f"Unable to load model, ensure model exists at "
             f"{self.model_path} "
@@ -159,6 +198,12 @@ class FreqaiDataKitchen:
         else:
             weights = np.ones(len(filtered_dataframe))
 
+        if self.config["freqai"]["feature_parameters"]["stratify"] > 0:
+            stratification = np.zeros(len(filtered_dataframe))
+            for i in range(1, len(stratification)):
+                if i % self.config["freqai"]["feature_parameters"]["stratify"] == 0:
+                    stratification[i] = 1
+
         (
             train_features,
             test_features,
@@ -170,6 +215,8 @@ class FreqaiDataKitchen:
             filtered_dataframe[: filtered_dataframe.shape[0]],
             labels,
             weights,
+            stratify=stratification,
+            # shuffle=False,
             **self.config["freqai"]["data_split_parameters"]
         )
 
@@ -261,9 +308,9 @@ class FreqaiDataKitchen:
 
         return self.data_dictionary
 
-    def standardize_data(self, data_dictionary: Dict) -> Dict[Any, Any]:
+    def normalize_data(self, data_dictionary: Dict) -> Dict[Any, Any]:
         """
-        Standardize all data in the data_dictionary according to the training dataset
+        Normalize all data in the data_dictionary according to the training dataset
         :params:
         :data_dictionary: dictionary containing the cleaned and split training/test data/labels
         :returns:
@ -297,6 +344,42 @@ class FreqaiDataKitchen:
|
|||||||
|
|
||||||
return data_dictionary
|
return data_dictionary
|
||||||
|
|
||||||
|
def standardize_data(self, data_dictionary: Dict) -> Dict[Any, Any]:
|
||||||
|
"""
|
||||||
|
Standardize all data in the data_dictionary according to the training dataset
|
||||||
|
:params:
|
||||||
|
:data_dictionary: dictionary containing the cleaned and split training/test data/labels
|
||||||
|
:returns:
|
||||||
|
:data_dictionary: updated dictionary with standardized values.
|
||||||
|
"""
|
||||||
|
# standardize the data by training stats
|
||||||
|
train_max = data_dictionary["train_features"].max()
|
||||||
|
train_min = data_dictionary["train_features"].min()
|
||||||
|
data_dictionary["train_features"] = 2 * (
|
||||||
|
data_dictionary["train_features"] - train_min
|
||||||
|
) / (train_max - train_min) - 1
|
||||||
|
data_dictionary["test_features"] = 2 * (
|
||||||
|
data_dictionary["test_features"] - train_min
|
||||||
|
) / (train_max - train_min) - 1
|
||||||
|
|
||||||
|
train_labels_max = data_dictionary["train_labels"].max()
|
||||||
|
train_labels_min = data_dictionary["train_labels"].min()
|
||||||
|
data_dictionary["train_labels"] = 2 * (
|
||||||
|
data_dictionary["train_labels"] - train_labels_min
|
||||||
|
) / (train_labels_max - train_labels_min) - 1
|
||||||
|
data_dictionary["test_labels"] = 2 * (
|
||||||
|
data_dictionary["test_labels"] - train_labels_min
|
||||||
|
) / (train_labels_max - train_labels_min) - 1
|
||||||
|
|
||||||
|
for item in train_max.keys():
|
||||||
|
self.data[item + "_max"] = train_max[item]
|
||||||
|
self.data[item + "_min"] = train_min[item]
|
||||||
|
|
||||||
|
self.data["labels_max"] = train_labels_max
|
||||||
|
self.data["labels_min"] = train_labels_min
|
||||||
|
|
||||||
|
return data_dictionary
|
||||||
|
|
||||||
def standardize_data_from_metadata(self, df: DataFrame) -> DataFrame:
|
def standardize_data_from_metadata(self, df: DataFrame) -> DataFrame:
|
||||||
"""
|
"""
|
||||||
Standardizes a set of data using the mean and standard deviation from
|
Standardizes a set of data using the mean and standard deviation from
|
||||||
@ -305,6 +388,20 @@ class FreqaiDataKitchen:
|
|||||||
:df: Dataframe to be standardized
|
:df: Dataframe to be standardized
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
for item in df.keys():
|
||||||
|
df[item] = 2 * (df[item] - self.data[item + "_min"]) / (self.data[item + "_max"] -
|
||||||
|
self.data[item + '_min']) - 1
|
||||||
|
|
||||||
|
return df
|
||||||
|
|
||||||
|
def normalize_data_from_metadata(self, df: DataFrame) -> DataFrame:
|
||||||
|
"""
|
||||||
|
Normalizes a set of data using the mean and standard deviation from
|
||||||
|
the associated training data.
|
||||||
|
:params:
|
||||||
|
:df: Dataframe to be standardized
|
||||||
|
"""
|
||||||
|
|
||||||
for item in df.keys():
|
for item in df.keys():
|
||||||
df[item] = (df[item] - self.data[item + "_mean"]) / self.data[item + "_std"]
|
df[item] = (df[item] - self.data[item + "_mean"]) / self.data[item + "_std"]
|
||||||
|
|
||||||
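The scaling helpers touched in the hunks above move Freqai from mean/std standardization toward min/max scaling into the range [-1, 1], and the prediction model later inverts that transform on its outputs. The toy example below (plain pandas/numpy, not the FreqaiDataKitchen methods themselves) shows the forward transform added in the new `standardize_data()` and the matching inverse the prediction model applies to map outputs back to label space.

```python
# Toy illustration of the [-1, 1] min/max scaling this commit introduces and of the
# inverse transform used to recover values in original label units.
import numpy as np
import pandas as pd

train = pd.DataFrame({"label": [1.0, 2.0, 3.0, 4.0]})
t_min, t_max = train.min(), train.max()

scaled = 2 * (train - t_min) / (t_max - t_min) - 1        # forward: values land in [-1, 1]
restored = (scaled + 1) * (t_max - t_min) / 2 + t_min     # inverse: recover original units

assert np.allclose(restored, train)
```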
@@ -420,6 +517,8 @@ class FreqaiDataKitchen:
         self.data["n_kept_components"] = n_keep_components
         self.pca = pca2
 
+        logger.info(f'PCA reduced total features from {n_components} to {n_keep_components}')
+
         if not self.model_path.is_dir():
             self.model_path.mkdir(parents=True, exist_ok=True)
         pk.dump(pca2, open(self.model_path / str(self.model_filename + "_pca_object.pkl"), "wb"))
@ -434,70 +533,53 @@ class FreqaiDataKitchen:
|
|||||||
|
|
||||||
return avg_mean_dist
|
return avg_mean_dist
|
||||||
|
|
||||||
def remove_outliers(self, predict: bool) -> None:
|
def use_SVM_to_remove_outliers(self, predict: bool) -> None:
|
||||||
"""
|
|
||||||
Remove data that looks like an outlier based on the distribution of each
|
|
||||||
variable.
|
|
||||||
:params:
|
|
||||||
:predict: boolean which tells the function if this is prediction data or
|
|
||||||
training data coming in.
|
|
||||||
"""
|
|
||||||
|
|
||||||
lower_quantile = self.data_dictionary["train_features"].quantile(0.001)
|
|
||||||
upper_quantile = self.data_dictionary["train_features"].quantile(0.999)
|
|
||||||
|
|
||||||
if predict:
|
if predict:
|
||||||
|
assert self.svm_model, "No svm model available for outlier removal"
|
||||||
df = self.data_dictionary["prediction_features"][
|
y_pred = self.svm_model.predict(self.data_dictionary["prediction_features"])
|
||||||
(self.data_dictionary["prediction_features"] < upper_quantile)
|
do_predict = np.where(y_pred == -1, 0, y_pred)
|
||||||
& (self.data_dictionary["prediction_features"] > lower_quantile)
|
|
||||||
]
|
|
||||||
drop_index = pd.isnull(df).any(1)
|
|
||||||
self.data_dictionary["prediction_features"].fillna(0, inplace=True)
|
|
||||||
drop_index = ~drop_index
|
|
||||||
do_predict = np.array(drop_index.replace(True, 1).replace(False, 0))
|
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
"remove_outliers() tossed %s predictions",
|
f'svm_remove_outliers() tossed {len(do_predict) - do_predict.sum()} predictions'
|
||||||
len(do_predict) - do_predict.sum(),
|
|
||||||
)
|
)
|
||||||
self.do_predict += do_predict
|
self.do_predict += do_predict
|
||||||
self.do_predict -= 1
|
self.do_predict -= 1
|
||||||
|
|
||||||
else:
|
else:
|
||||||
|
# use SGDOneClassSVM to increase speed?
|
||||||
|
self.svm_model = linear_model.SGDOneClassSVM(nu=0.1).fit(
|
||||||
|
self.data_dictionary["train_features"]
|
||||||
|
)
|
||||||
|
y_pred = self.svm_model.predict(self.data_dictionary["train_features"])
|
||||||
|
dropped_points = np.where(y_pred == -1, 0, y_pred)
|
||||||
|
# keep_index = np.where(y_pred == 1)
|
||||||
|
self.data_dictionary["train_features"] = self.data_dictionary[
|
||||||
|
"train_features"][(y_pred == 1)]
|
||||||
|
self.data_dictionary["train_labels"] = self.data_dictionary[
|
||||||
|
"train_labels"][(y_pred == 1)]
|
||||||
|
self.data_dictionary["train_weights"] = self.data_dictionary[
|
||||||
|
"train_weights"][(y_pred == 1)]
|
||||||
|
|
||||||
filter_train_df = self.data_dictionary["train_features"][
|
logger.info(
|
||||||
(self.data_dictionary["train_features"] < upper_quantile)
|
f'svm_remove_outliers() tossed {len(y_pred) - dropped_points.sum()}'
|
||||||
& (self.data_dictionary["train_features"] > lower_quantile)
|
f' train points from {len(y_pred)}'
|
||||||
]
|
)
|
||||||
drop_index = pd.isnull(filter_train_df).any(1)
|
|
||||||
drop_index = drop_index.replace(True, 1).replace(False, 0)
|
|
||||||
self.data_dictionary["train_features"] = self.data_dictionary["train_features"][
|
|
||||||
(drop_index == 0)
|
|
||||||
]
|
|
||||||
self.data_dictionary["train_labels"] = self.data_dictionary["train_labels"][
|
|
||||||
(drop_index == 0)
|
|
||||||
]
|
|
||||||
self.data_dictionary["train_weights"] = self.data_dictionary["train_weights"][
|
|
||||||
(drop_index == 0)
|
|
||||||
]
|
|
||||||
|
|
||||||
# do the same for the test data
|
# same for test data
|
||||||
filter_test_df = self.data_dictionary["test_features"][
|
y_pred = self.svm_model.predict(self.data_dictionary["test_features"])
|
||||||
(self.data_dictionary["test_features"] < upper_quantile)
|
dropped_points = np.where(y_pred == -1, 0, y_pred)
|
||||||
& (self.data_dictionary["test_features"] > lower_quantile)
|
self.data_dictionary["test_features"] = self.data_dictionary[
|
||||||
]
|
"test_features"][(y_pred == 1)]
|
||||||
drop_index = pd.isnull(filter_test_df).any(1)
|
self.data_dictionary["test_labels"] = self.data_dictionary[
|
||||||
drop_index = drop_index.replace(True, 1).replace(False, 0)
|
"test_labels"][(y_pred == 1)]
|
||||||
self.data_dictionary["test_labels"] = self.data_dictionary["test_labels"][
|
self.data_dictionary["test_weights"] = self.data_dictionary[
|
||||||
(drop_index == 0)
|
"test_weights"][(y_pred == 1)]
|
||||||
]
|
|
||||||
self.data_dictionary["test_features"] = self.data_dictionary["test_features"][
|
logger.info(
|
||||||
(drop_index == 0)
|
f'svm_remove_outliers() tossed {len(y_pred) - dropped_points.sum()}'
|
||||||
]
|
f' test points from {len(y_pred)}'
|
||||||
self.data_dictionary["test_weights"] = self.data_dictionary["test_weights"][
|
)
|
||||||
(drop_index == 0)
|
|
||||||
]
|
|
||||||
|
|
||||||
return
|
return
|
||||||
|
|
||||||
@ -507,32 +589,6 @@ class FreqaiDataKitchen:
|
|||||||
assert features, ("Could not find any features!")
|
assert features, ("Could not find any features!")
|
||||||
return features
|
return features
|
||||||
|
|
||||||
# def build_feature_list(self, config: dict, metadata: dict) -> list:
|
|
||||||
# """
|
|
||||||
# SUPERCEDED BY self.find_features()
|
|
||||||
# Build the list of features that will be used to filter
|
|
||||||
# the full dataframe. Feature list is construced from the
|
|
||||||
# user configuration file.
|
|
||||||
# :params:
|
|
||||||
# :config: Canonical freqtrade config file containing all
|
|
||||||
# user defined input in config['freqai] dictionary.
|
|
||||||
# """
|
|
||||||
# features = []
|
|
||||||
# for tf in config["freqai"]["timeframes"]:
|
|
||||||
# for ft in config["freqai"]["base_features"]:
|
|
||||||
# for n in range(config["freqai"]["feature_parameters"]["shift"] + 1):
|
|
||||||
# shift = ""
|
|
||||||
# if n > 0:
|
|
||||||
# shift = "_shift-" + str(n)
|
|
||||||
# features.append(metadata['pair'].split("/")[0] + "-" + ft + shift + "_" + tf)
|
|
||||||
# for p in config["freqai"]["corr_pairlist"]:
|
|
||||||
# if metadata['pair'] in p:
|
|
||||||
# continue # avoid duplicate features
|
|
||||||
# features.append(p.split("/")[0] + "-" + ft + shift + "_" + tf)
|
|
||||||
|
|
||||||
# # logger.info("number of features %s", len(features))
|
|
||||||
# return features
|
|
||||||
|
|
||||||
def check_if_pred_in_training_spaces(self) -> None:
|
def check_if_pred_in_training_spaces(self) -> None:
|
||||||
"""
|
"""
|
||||||
Compares the distance from each prediction point to each training data
|
Compares the distance from each prediction point to each training data
|
||||||
@@ -568,7 +624,7 @@ class FreqaiDataKitchen:
         training than older data.
         """
 
-        weights = np.zeros_like(num_weights)
+        weights = np.zeros(num_weights)
         for i in range(1, len(weights)):
             weights[len(weights) - i] = np.exp(
                 -i / (self.config["freqai"]["feature_parameters"]["weight_factor"] * num_weights)
@ -638,19 +694,23 @@ class FreqaiDataKitchen:
|
|||||||
|
|
||||||
time = datetime.datetime.now(tz=datetime.timezone.utc).timestamp()
|
time = datetime.datetime.now(tz=datetime.timezone.utc).timestamp()
|
||||||
|
|
||||||
trained_timerange = TimeRange.parse_timerange(training_timerange)
|
if training_timerange: # user passed no live_trained_timerange in config
|
||||||
|
trained_timerange = TimeRange.parse_timerange(training_timerange)
|
||||||
|
elapsed_time = (time - trained_timerange.stopts) / SECONDS_IN_DAY
|
||||||
|
trained_timerange.startts += self.freqai_config['backtest_period'] * SECONDS_IN_DAY
|
||||||
|
trained_timerange.stopts += self.freqai_config['backtest_period'] * SECONDS_IN_DAY
|
||||||
|
retrain = elapsed_time > self.freqai_config['backtest_period']
|
||||||
|
else:
|
||||||
|
trained_timerange = TimeRange.parse_timerange("20000101-20000201")
|
||||||
|
trained_timerange.startts = int(time - self.freqai_config['train_period'] *
|
||||||
|
SECONDS_IN_DAY)
|
||||||
|
trained_timerange.stopts = int(time)
|
||||||
|
retrain = True
|
||||||
|
|
||||||
elapsed_time = (time - trained_timerange.stopts) / SECONDS_IN_DAY
|
|
||||||
|
|
||||||
trained_timerange.startts += self.freqai_config['backtest_period'] * SECONDS_IN_DAY
|
|
||||||
trained_timerange.stopts += self.freqai_config['backtest_period'] * SECONDS_IN_DAY
|
|
||||||
start = datetime.datetime.utcfromtimestamp(trained_timerange.startts)
|
start = datetime.datetime.utcfromtimestamp(trained_timerange.startts)
|
||||||
stop = datetime.datetime.utcfromtimestamp(trained_timerange.stopts)
|
stop = datetime.datetime.utcfromtimestamp(trained_timerange.stopts)
|
||||||
|
|
||||||
new_trained_timerange = start.strftime("%Y%m%d") + "-" + stop.strftime("%Y%m%d")
|
new_trained_timerange = start.strftime("%Y%m%d") + "-" + stop.strftime("%Y%m%d")
|
||||||
|
|
||||||
retrain = elapsed_time > self.freqai_config['backtest_period']
|
|
||||||
|
|
||||||
if retrain:
|
if retrain:
|
||||||
coin, _ = metadata['pair'].split("/")
|
coin, _ = metadata['pair'].split("/")
|
||||||
# set the new model_path
|
# set the new model_path
|
||||||
@ -738,3 +798,141 @@ class FreqaiDataKitchen:
|
|||||||
def np_encoder(self, object):
|
def np_encoder(self, object):
|
||||||
if isinstance(object, np.generic):
|
if isinstance(object, np.generic):
|
||||||
return object.item()
|
return object.item()
|
||||||
|
|
||||||
|
# Functions containing useful data manpulation examples. but not actively in use.
|
||||||
|
|
||||||
|
# def build_feature_list(self, config: dict, metadata: dict) -> list:
|
||||||
|
# """
|
||||||
|
# SUPERCEDED BY self.find_features()
|
||||||
|
# Build the list of features that will be used to filter
|
||||||
|
# the full dataframe. Feature list is construced from the
|
||||||
|
# user configuration file.
|
||||||
|
# :params:
|
||||||
|
# :config: Canonical freqtrade config file containing all
|
||||||
|
# user defined input in config['freqai] dictionary.
|
||||||
|
# """
|
||||||
|
# features = []
|
||||||
|
# for tf in config["freqai"]["timeframes"]:
|
||||||
|
# for ft in config["freqai"]["base_features"]:
|
||||||
|
# for n in range(config["freqai"]["feature_parameters"]["shift"] + 1):
|
||||||
|
# shift = ""
|
||||||
|
# if n > 0:
|
||||||
|
# shift = "_shift-" + str(n)
|
||||||
|
# features.append(metadata['pair'].split("/")[0] + "-" + ft + shift + "_" + tf)
|
||||||
|
# for p in config["freqai"]["corr_pairlist"]:
|
||||||
|
# if metadata['pair'] in p:
|
||||||
|
# continue # avoid duplicate features
|
||||||
|
# features.append(p.split("/")[0] + "-" + ft + shift + "_" + tf)
|
||||||
|
|
||||||
|
# # logger.info("number of features %s", len(features))
|
||||||
|
# return features
|
||||||
|
|
||||||
|
# Possibly phasing these outlier removal methods below out in favor of
|
||||||
|
# use_SVM_to_remove_outliers (computationally more efficient and apparently higher performance).
|
||||||
|
# But these have good data manipulation examples, so keep them commented here for now.
|
||||||
|
|
||||||
|
# def determine_statistical_distributions(self) -> None:
|
||||||
|
# from fitter import Fitter
|
||||||
|
|
||||||
|
# logger.info('Determining best model for all features, may take some time')
|
||||||
|
|
||||||
|
# def compute_quantiles(ft):
|
||||||
|
# f = Fitter(self.data_dictionary["train_features"][ft],
|
||||||
|
# distributions=['gamma', 'cauchy', 'laplace',
|
||||||
|
# 'beta', 'uniform', 'lognorm'])
|
||||||
|
# f.fit()
|
||||||
|
# # f.summary()
|
||||||
|
# dist = list(f.get_best().items())[0][0]
|
||||||
|
# params = f.get_best()[dist]
|
||||||
|
# upper_q = getattr(spy.stats, list(f.get_best().items())[0][0]).ppf(0.999, **params)
|
||||||
|
# lower_q = getattr(spy.stats, list(f.get_best().items())[0][0]).ppf(0.001, **params)
|
||||||
|
|
||||||
|
# return ft, upper_q, lower_q, dist
|
||||||
|
|
||||||
|
# quantiles_tuple = Parallel(n_jobs=-1)(
|
||||||
|
# delayed(compute_quantiles)(ft) for ft in self.data_dictionary[
|
||||||
|
# 'train_features'].columns)
|
||||||
|
|
||||||
|
# df = pd.DataFrame(quantiles_tuple, columns=['features', 'upper_quantiles',
|
||||||
|
# 'lower_quantiles', 'dist'])
|
||||||
|
# self.data_dictionary['upper_quantiles'] = df['upper_quantiles']
|
||||||
|
# self.data_dictionary['lower_quantiles'] = df['lower_quantiles']
|
||||||
|
|
||||||
|
# return
|
||||||
|
|
||||||
|
# def remove_outliers(self, predict: bool) -> None:
|
||||||
|
# """
|
||||||
|
# Remove data that looks like an outlier based on the distribution of each
|
||||||
|
# variable.
|
||||||
|
# :params:
|
||||||
|
# :predict: boolean which tells the function if this is prediction data or
|
||||||
|
# training data coming in.
|
||||||
|
# """
|
||||||
|
|
||||||
|
# lower_quantile = self.data_dictionary["lower_quantiles"].to_numpy()
|
||||||
|
# upper_quantile = self.data_dictionary["upper_quantiles"].to_numpy()
|
||||||
|
|
||||||
|
# if predict:
|
||||||
|
|
||||||
|
# df = self.data_dictionary["prediction_features"][
|
||||||
|
# (self.data_dictionary["prediction_features"] < upper_quantile)
|
||||||
|
# & (self.data_dictionary["prediction_features"] > lower_quantile)
|
||||||
|
# ]
|
||||||
|
# drop_index = pd.isnull(df).any(1)
|
||||||
|
# self.data_dictionary["prediction_features"].fillna(0, inplace=True)
|
||||||
|
# drop_index = ~drop_index
|
||||||
|
# do_predict = np.array(drop_index.replace(True, 1).replace(False, 0))
|
||||||
|
|
||||||
|
# logger.info(
|
||||||
|
# "remove_outliers() tossed %s predictions",
|
||||||
|
# len(do_predict) - do_predict.sum(),
|
||||||
|
# )
|
||||||
|
# self.do_predict += do_predict
|
||||||
|
# self.do_predict -= 1
|
||||||
|
|
||||||
|
# else:
|
||||||
|
|
||||||
|
# filter_train_df = self.data_dictionary["train_features"][
|
||||||
|
# (self.data_dictionary["train_features"] < upper_quantile)
|
||||||
|
# & (self.data_dictionary["train_features"] > lower_quantile)
|
||||||
|
# ]
|
||||||
|
# drop_index = pd.isnull(filter_train_df).any(1)
|
||||||
|
# drop_index = drop_index.replace(True, 1).replace(False, 0)
|
||||||
|
# self.data_dictionary["train_features"] = self.data_dictionary["train_features"][
|
||||||
|
# (drop_index == 0)
|
||||||
|
# ]
|
||||||
|
# self.data_dictionary["train_labels"] = self.data_dictionary["train_labels"][
|
||||||
|
# (drop_index == 0)
|
||||||
|
# ]
|
||||||
|
# self.data_dictionary["train_weights"] = self.data_dictionary["train_weights"][
|
||||||
|
# (drop_index == 0)
|
||||||
|
# ]
|
||||||
|
|
||||||
|
# logger.info(
|
||||||
|
# f'remove_outliers() tossed {drop_index.sum()}'
|
||||||
|
# f' training points from {len(filter_train_df)}'
|
||||||
|
# )
|
||||||
|
|
||||||
|
# # do the same for the test data
|
||||||
|
# filter_test_df = self.data_dictionary["test_features"][
|
||||||
|
# (self.data_dictionary["test_features"] < upper_quantile)
|
||||||
|
# & (self.data_dictionary["test_features"] > lower_quantile)
|
||||||
|
# ]
|
||||||
|
# drop_index = pd.isnull(filter_test_df).any(1)
|
||||||
|
# drop_index = drop_index.replace(True, 1).replace(False, 0)
|
||||||
|
# self.data_dictionary["test_labels"] = self.data_dictionary["test_labels"][
|
||||||
|
# (drop_index == 0)
|
||||||
|
# ]
|
||||||
|
# self.data_dictionary["test_features"] = self.data_dictionary["test_features"][
|
||||||
|
# (drop_index == 0)
|
||||||
|
# ]
|
||||||
|
# self.data_dictionary["test_weights"] = self.data_dictionary["test_weights"][
|
||||||
|
# (drop_index == 0)
|
||||||
|
# ]
|
||||||
|
|
||||||
|
# logger.info(
|
||||||
|
# f'remove_outliers() tossed {drop_index.sum()}'
|
||||||
|
# f' test points from {len(filter_test_df)}'
|
||||||
|
# )
|
||||||
|
|
||||||
|
# return
|
||||||
|
@@ -62,6 +62,7 @@ class IFreqaiModel(ABC):
         self.predictions = None
         self.training_on_separate_thread = False
         self.retrain = False
+        self.first = True
 
     def start(self, dataframe: DataFrame, metadata: dict, strategy: IStrategy) -> DataFrame:
         """
@@ -80,12 +81,12 @@ class IFreqaiModel(ABC):
         :metadata: pair metadata coming from strategy.
         """
 
-        live = strategy.dp.runmode in (RunMode.DRY_RUN, RunMode.LIVE)
+        self.live = strategy.dp.runmode in (RunMode.DRY_RUN, RunMode.LIVE)
 
         self.pair = metadata["pair"]
-        self.dh = FreqaiDataKitchen(self.config, dataframe, live)
+        self.dh = FreqaiDataKitchen(self.config, dataframe, self.live)
 
-        if live:
+        if self.live:
             # logger.info('testing live')
             self.start_live(dataframe, metadata, strategy)
 
@ -115,11 +116,12 @@ class IFreqaiModel(ABC):
|
|||||||
self.dh.save_data(self.model)
|
self.dh.save_data(self.model)
|
||||||
else:
|
else:
|
||||||
self.model = self.dh.load_data()
|
self.model = self.dh.load_data()
|
||||||
strategy_provided_features = self.dh.find_features(dataframe_train)
|
# strategy_provided_features = self.dh.find_features(dataframe_train)
|
||||||
if strategy_provided_features != self.dh.training_features_list:
|
# # TOFIX doesnt work with PCA
|
||||||
logger.info("User changed input features, retraining model.")
|
# if strategy_provided_features != self.dh.training_features_list:
|
||||||
self.model = self.train(dataframe_train, metadata)
|
# logger.info("User changed input features, retraining model.")
|
||||||
self.dh.save_data(self.model)
|
# self.model = self.train(dataframe_train, metadata)
|
||||||
|
# self.dh.save_data(self.model)
|
||||||
|
|
||||||
preds, do_preds = self.predict(dataframe_backtest, metadata)
|
preds, do_preds = self.predict(dataframe_backtest, metadata)
|
||||||
|
|
||||||
@ -148,7 +150,7 @@ class IFreqaiModel(ABC):
|
|||||||
if not self.training_on_separate_thread:
|
if not self.training_on_separate_thread:
|
||||||
# this will also prevent other pairs from trying to train simultaneously.
|
# this will also prevent other pairs from trying to train simultaneously.
|
||||||
(self.retrain,
|
(self.retrain,
|
||||||
new_trained_timerange) = self.dh.check_if_new_training_required(self.freqai_info[
|
self.new_trained_timerange) = self.dh.check_if_new_training_required(self.freqai_info[
|
||||||
'live_trained_timerange'],
|
'live_trained_timerange'],
|
||||||
metadata)
|
metadata)
|
||||||
else:
|
else:
|
||||||
@ -156,14 +158,19 @@ class IFreqaiModel(ABC):
|
|||||||
self.retrain = False
|
self.retrain = False
|
||||||
|
|
||||||
if self.retrain or not file_exists:
|
if self.retrain or not file_exists:
|
||||||
self.training_on_separate_thread = True # acts like a lock
|
if self.first:
|
||||||
self.retrain_model_on_separate_thread(new_trained_timerange, metadata, strategy)
|
self.train_model_in_series(self.new_trained_timerange, metadata, strategy)
|
||||||
|
self.first = False
|
||||||
|
else:
|
||||||
|
self.training_on_separate_thread = True # acts like a lock
|
||||||
|
self.retrain_model_on_separate_thread(self.new_trained_timerange,
|
||||||
|
metadata, strategy)
|
||||||
|
|
||||||
self.model = self.dh.load_data()
|
self.model = self.dh.load_data()
|
||||||
|
|
||||||
strategy_provided_features = self.dh.find_features(dataframe)
|
strategy_provided_features = self.dh.find_features(dataframe)
|
||||||
if strategy_provided_features != self.dh.training_features_list:
|
if strategy_provided_features != self.dh.training_features_list:
|
||||||
self.train_model_in_series(new_trained_timerange, metadata, strategy)
|
self.train_model_in_series(self.new_trained_timerange, metadata, strategy)
|
||||||
|
|
||||||
preds, do_preds = self.predict(dataframe, metadata)
|
preds, do_preds = self.predict(dataframe, metadata)
|
||||||
self.dh.append_predictions(preds, do_preds, len(dataframe))
|
self.dh.append_predictions(preds, do_preds, len(dataframe))
|
||||||
@@ -215,12 +222,36 @@ class IFreqaiModel(ABC):
         data (NaNs) or felt uncertain about data (PCA and DI index)
         """
 
+    @abstractmethod
+    def data_cleaning_train(self) -> None:
+        """
+        User can add data analysis and cleaning here.
+        Any function inside this method should drop training data points from the filtered_dataframe
+        based on user decided logic. See FreqaiDataKitchen::remove_outliers() for an example
+        of how outlier data points are dropped from the dataframe used for training.
+        """
+
+    @abstractmethod
+    def data_cleaning_predict(self) -> None:
+        """
+        User can add data analysis and cleaning here.
+        These functions each modify self.dh.do_predict, which is a dataframe with equal length
+        to the number of candles coming from and returning to the strategy. Inside do_predict,
+        1 allows prediction and < 0 signals to the strategy that the model is not confident in
+        the prediction.
+        See FreqaiDataKitchen::remove_outliers() for an example
+        of how the do_predict vector is modified. do_predict is ultimately passed back to strategy
+        for buy signals.
+        """
+
     def model_exists(self, pair: str, training_timerange: str) -> bool:
         """
         Given a pair and path, check if a model already exists
         :param pair: pair e.g. BTC/USD
         :param path: path to model
         """
+        if self.live and training_timerange is None:
+            return False
         coin, _ = pair.split("/")
         self.dh.model_filename = "cb_" + coin.lower() + "_" + training_timerange
         path_to_modelfile = Path(self.dh.model_path / str(self.dh.model_filename + "_model.joblib"))
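The `do_predict` bookkeeping described in the docstrings above is easiest to see with a toy example. The snippet below is illustrative only (plain numpy with made-up masks); it mirrors the `self.do_predict += do_predict` / `self.do_predict -= 1` pattern applied by `use_SVM_to_remove_outliers()` and `check_if_pred_in_training_spaces()`, so a candle stays at 1 only when every cleaning step approves it.

```python
# Toy sketch of how successive cleaning steps combine into the do_predict vector.
import numpy as np

do_predict = np.ones(6, dtype=int)        # start: every candle is predictable

svm_ok = np.array([1, 1, 0, 1, 1, 1])     # step 1: SVM outlier check (0 = flagged)
do_predict += svm_ok
do_predict -= 1

di_ok = np.array([1, 0, 0, 1, 1, 1])      # step 2: dissimilarity-index check
do_predict += di_ok
do_predict -= 1

print(do_predict)  # [ 1  0 -1  1  1  1] -> the strategy only trusts candles still at 1
```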
@@ -265,3 +296,4 @@ class IFreqaiModel(ABC):
 
         self.model = self.train(unfiltered_dataframe, metadata)
         self.dh.save_data(self.model)
+        self.retrain = False
|
@@ -29,7 +29,7 @@ class CatboostPredictionModel(IFreqaiModel):
             dataframe["close"]
             .shift(-self.feature_parameters["period"])
             .rolling(self.feature_parameters["period"])
-            .max()
+            .mean()
             / dataframe["close"]
             - 1
         )
||||||
@@ -68,15 +68,11 @@ class CatboostPredictionModel(IFreqaiModel):
         # standardize all data based on train_dataset only
         data_dictionary = self.dh.standardize_data(data_dictionary)
 
-        # optional additional data cleaning
-        if self.feature_parameters["principal_component_analysis"]:
-            self.dh.principal_component_analysis()
-        if self.feature_parameters["remove_outliers"]:
-            self.dh.remove_outliers(predict=False)
-        if self.feature_parameters["DI_threshold"]:
-            self.dh.data["avg_mean_dist"] = self.dh.compute_distances()
+        # optional additional data cleaning/analysis
+        self.data_cleaning_train()
 
-        logger.info("length of train data %s", len(data_dictionary["train_features"]))
+        logger.info(f'Training model on {len(self.dh.training_features_list)} features')
+        logger.info(f'Training model on {len(data_dictionary["train_features"])} data points')
 
         model = self.fit(data_dictionary)
 
@@ -86,9 +82,7 @@ class CatboostPredictionModel(IFreqaiModel):
 
     def fit(self, data_dictionary: Dict) -> Any:
         """
-        Most regressors use the same function names and arguments e.g. user
-        can drop in LGBMRegressor in place of CatBoostRegressor and all data
-        management will be properly handled by Freqai.
+        User sets up the training and test data to fit their desired model here
         :params:
         :data_dictionary: the dictionary constructed by DataHandler to hold
         all the training and test data/labels.
@ -133,7 +127,51 @@ class CatboostPredictionModel(IFreqaiModel):
|
|||||||
filtered_dataframe = self.dh.standardize_data_from_metadata(filtered_dataframe)
|
filtered_dataframe = self.dh.standardize_data_from_metadata(filtered_dataframe)
|
||||||
self.dh.data_dictionary["prediction_features"] = filtered_dataframe
|
self.dh.data_dictionary["prediction_features"] = filtered_dataframe
|
||||||
|
|
||||||
# optional additional data cleaning
|
# optional additional data cleaning/analysis
|
||||||
|
self.data_cleaning_predict(filtered_dataframe)
|
||||||
|
|
||||||
|
predictions = self.model.predict(self.dh.data_dictionary["prediction_features"])
|
||||||
|
|
||||||
|
# compute the non-standardized predictions
|
||||||
|
self.dh.predictions = (predictions + 1) * (self.dh.data["labels_max"] -
|
||||||
|
self.dh.data["labels_min"]) / 2 + self.dh.data[
|
||||||
|
"labels_min"]
|
||||||
|
|
||||||
|
# logger.info("--------------------Finished prediction--------------------")
|
||||||
|
|
||||||
|
return (self.dh.predictions, self.dh.do_predict)
|
||||||
|
|
||||||
|
def data_cleaning_train(self) -> None:
|
||||||
|
"""
|
||||||
|
User can add data analysis and cleaning here.
|
||||||
|
Any function inside this method should drop training data points from the filtered_dataframe
|
||||||
|
based on user decided logic. See FreqaiDataKitchen::remove_outliers() for an example
|
||||||
|
of how outlier data points are dropped from the dataframe used for training.
|
||||||
|
"""
|
||||||
|
if self.feature_parameters["principal_component_analysis"]:
|
||||||
|
self.dh.principal_component_analysis()
|
||||||
|
|
||||||
|
# if self.feature_parameters["determine_statistical_distributions"]:
|
||||||
|
# self.dh.determine_statistical_distributions()
|
||||||
|
# if self.feature_parameters["remove_outliers"]:
|
||||||
|
# self.dh.remove_outliers(predict=False)
|
||||||
|
|
||||||
|
if self.feature_parameters["use_SVM_to_remove_outliers"]:
|
||||||
|
self.dh.use_SVM_to_remove_outliers(predict=False)
|
||||||
|
if self.feature_parameters["DI_threshold"]:
|
||||||
|
self.dh.data["avg_mean_dist"] = self.dh.compute_distances()
|
||||||
|
|
||||||
|
def data_cleaning_predict(self, filtered_dataframe: DataFrame) -> None:
|
||||||
|
"""
|
||||||
|
User can add data analysis and cleaning here.
|
||||||
|
These functions each modify self.dh.do_predict, which is a dataframe with equal length
|
||||||
|
to the number of candles coming from and returning to the strategy. Inside do_predict,
|
||||||
|
1 allows prediction and < 0 signals to the strategy that the model is not confident in
|
||||||
|
the prediction.
|
||||||
|
See FreqaiDataKitchen::remove_outliers() for an example
|
||||||
|
of how the do_predict vector is modified. do_predict is ultimately passed back to strategy
|
||||||
|
for buy signals.
|
||||||
|
"""
|
||||||
if self.feature_parameters["principal_component_analysis"]:
|
if self.feature_parameters["principal_component_analysis"]:
|
||||||
pca_components = self.dh.pca.transform(filtered_dataframe)
|
pca_components = self.dh.pca.transform(filtered_dataframe)
|
||||||
self.dh.data_dictionary["prediction_features"] = pd.DataFrame(
|
self.dh.data_dictionary["prediction_features"] = pd.DataFrame(
|
||||||
@ -142,17 +180,13 @@ class CatboostPredictionModel(IFreqaiModel):
|
|||||||
index=filtered_dataframe.index,
|
index=filtered_dataframe.index,
|
||||||
)
|
)
|
||||||
|
|
||||||
if self.feature_parameters["remove_outliers"]:
|
# if self.feature_parameters["determine_statistical_distributions"]:
|
||||||
self.dh.remove_outliers(predict=True) # creates dropped index
|
# self.dh.determine_statistical_distributions()
|
||||||
|
# if self.feature_parameters["remove_outliers"]:
|
||||||
|
# self.dh.remove_outliers(predict=True) # creates dropped index
|
||||||
|
|
||||||
|
if self.feature_parameters["use_SVM_to_remove_outliers"]:
|
||||||
|
self.dh.use_SVM_to_remove_outliers(predict=True)
|
||||||
|
|
||||||
if self.feature_parameters["DI_threshold"]:
|
if self.feature_parameters["DI_threshold"]:
|
||||||
self.dh.check_if_pred_in_training_spaces() # sets do_predict
|
self.dh.check_if_pred_in_training_spaces() # sets do_predict
|
||||||
|
|
||||||
predictions = self.model.predict(self.dh.data_dictionary["prediction_features"])
|
|
||||||
|
|
||||||
# compute the non-standardized predictions
|
|
||||||
self.dh.predictions = predictions * self.dh.data["labels_std"] + self.dh.data["labels_mean"]
|
|
||||||
|
|
||||||
# logger.info("--------------------Finished prediction--------------------")
|
|
||||||
|
|
||||||
return (self.dh.predictions, self.dh.do_predict)
|
|
||||||
|
@ -1,159 +0,0 @@
|
|||||||
import logging
|
|
||||||
from typing import Any, Dict, Tuple
|
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
from catboost import CatBoostRegressor, Pool
|
|
||||||
from pandas import DataFrame
|
|
||||||
|
|
||||||
from freqtrade.freqai.freqai_interface import IFreqaiModel
|
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
class ExamplePredictionModel(IFreqaiModel):
|
|
||||||
"""
|
|
||||||
User created prediction model. The class needs to override three necessary
|
|
||||||
functions, predict(), train(), fit(). The class inherits ModelHandler which
|
|
||||||
has its own DataHandler where data is held, saved, loaded, and managed.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def make_labels(self, dataframe: DataFrame) -> DataFrame:
|
|
||||||
"""
|
|
||||||
User defines the labels here (target values).
|
|
||||||
:params:
|
|
||||||
:dataframe: the full dataframe for the present training period
|
|
||||||
"""
|
|
||||||
|
|
||||||
dataframe["s"] = (
|
|
||||||
dataframe["close"]
|
|
||||||
.shift(-self.feature_parameters["period"])
|
|
||||||
.rolling(self.feature_parameters["period"])
|
|
||||||
.max()
|
|
||||||
/ dataframe["close"]
|
|
||||||
- 1
|
|
||||||
)
|
|
||||||
self.dh.data["s_mean"] = dataframe["s"].mean()
|
|
||||||
self.dh.data["s_std"] = dataframe["s"].std()
|
|
||||||
|
|
||||||
# logger.info("label mean", self.dh.data["s_mean"], "label std", self.dh.data["s_std"])
|
|
||||||
|
|
||||||
return dataframe["s"]
|
|
||||||
|
|
||||||
def train(self, unfiltered_dataframe: DataFrame, metadata: dict) -> Tuple[DataFrame, DataFrame]:
|
|
||||||
"""
|
|
||||||
Filter the training data and train a model to it. Train makes heavy use of the datakitchen
|
|
||||||
for storing, saving, loading, and analyzing the data.
|
|
||||||
:params:
|
|
||||||
:unfiltered_dataframe: Full dataframe for the current training period
|
|
||||||
:metadata: pair metadata from strategy.
|
|
||||||
:returns:
|
|
||||||
:model: Trained model which can be used to inference (self.predict)
|
|
||||||
"""
|
|
||||||
logger.info("--------------------Starting training--------------------")
|
|
||||||
|
|
||||||
# create the full feature list based on user config info
|
|
||||||
self.dh.training_features_list = self.dh.build_feature_list(self.config, metadata)
|
|
||||||
unfiltered_labels = self.make_labels(unfiltered_dataframe)
|
|
||||||
|
|
||||||
# filter the features requested by user in the configuration file and elegantly handle NaNs
|
|
||||||
features_filtered, labels_filtered = self.dh.filter_features(
|
|
||||||
unfiltered_dataframe,
|
|
||||||
self.dh.training_features_list,
|
|
||||||
unfiltered_labels,
|
|
||||||
training_filter=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
# split data into train/test data.
|
|
||||||
data_dictionary = self.dh.make_train_test_datasets(features_filtered, labels_filtered)
|
|
||||||
# standardize all data based on train_dataset only
|
|
||||||
data_dictionary = self.dh.standardize_data(data_dictionary)
|
|
||||||
|
|
||||||
# optional additional data cleaning
|
|
||||||
if self.feature_parameters["principal_component_analysis"]:
|
|
||||||
self.dh.principal_component_analysis()
|
|
||||||
if self.feature_parameters["remove_outliers"]:
|
|
||||||
self.dh.remove_outliers(predict=False)
|
|
||||||
if self.feature_parameters["DI_threshold"]:
|
|
||||||
self.dh.data["avg_mean_dist"] = self.dh.compute_distances()
|
|
||||||
|
|
||||||
logger.info("length of train data %s", len(data_dictionary["train_features"]))
|
|
||||||
|
|
||||||
model = self.fit(data_dictionary)
|
|
||||||
|
|
||||||
logger.info(f'--------------------done training {metadata["pair"]}--------------------')
|
|
||||||
|
|
||||||
return model
|
|
||||||
|
|
||||||
def fit(self, data_dictionary: Dict) -> Any:
|
|
||||||
"""
|
|
||||||
Most regressors use the same function names and arguments e.g. user
|
|
||||||
can drop in LGBMRegressor in place of CatBoostRegressor and all data
|
|
||||||
management will be properly handled by Freqai.
|
|
||||||
:params:
|
|
||||||
:data_dictionary: the dictionary constructed by DataHandler to hold
|
|
||||||
all the training and test data/labels.
|
|
||||||
"""
|
|
||||||
|
|
||||||
train_data = Pool(
|
|
||||||
data=data_dictionary["train_features"],
|
|
||||||
label=data_dictionary["train_labels"],
|
|
||||||
weight=data_dictionary["train_weights"],
|
|
||||||
)
|
|
||||||
|
|
||||||
test_data = Pool(
|
|
||||||
data=data_dictionary["test_features"],
|
|
||||||
label=data_dictionary["test_labels"],
|
|
||||||
weight=data_dictionary["test_weights"],
|
|
||||||
)
|
|
||||||
|
|
||||||
model = CatBoostRegressor(
|
|
||||||
verbose=100, early_stopping_rounds=400, **self.model_training_parameters
|
|
||||||
)
|
|
||||||
model.fit(X=train_data, eval_set=test_data)
|
|
||||||
|
|
||||||
return model
|
|
||||||
|
|
||||||
def predict(self, unfiltered_dataframe: DataFrame, metadata: dict) -> Tuple[DataFrame,
|
|
||||||
DataFrame]:
|
|
||||||
"""
|
|
||||||
Filter the prediction features data and predict with it.
|
|
||||||
:param: unfiltered_dataframe: Full dataframe for the current backtest period.
|
|
||||||
:return:
|
|
||||||
:predictions: np.array of predictions
|
|
||||||
:do_predict: np.array of 1s and 0s to indicate places where freqai needed to remove
|
|
||||||
data (NaNs) or felt uncertain about data (PCA and DI index)
|
|
||||||
"""
|
|
||||||
|
|
||||||
# logger.info("--------------------Starting prediction--------------------")
|
|
||||||
|
|
||||||
original_feature_list = self.dh.build_feature_list(self.config, metadata)
|
|
||||||
filtered_dataframe, _ = self.dh.filter_features(
|
|
||||||
unfiltered_dataframe, original_feature_list, training_filter=False
|
|
||||||
)
|
|
||||||
filtered_dataframe = self.dh.standardize_data_from_metadata(filtered_dataframe)
|
|
||||||
self.dh.data_dictionary["prediction_features"] = filtered_dataframe
|
|
||||||
|
|
||||||
# optional additional data cleaning
|
|
||||||
if self.feature_parameters["principal_component_analysis"]:
|
|
||||||
pca_components = self.dh.pca.transform(filtered_dataframe)
|
|
||||||
self.dh.data_dictionary["prediction_features"] = pd.DataFrame(
|
|
||||||
data=pca_components,
|
|
||||||
columns=["PC" + str(i) for i in range(0, self.dh.data["n_kept_components"])],
|
|
||||||
index=filtered_dataframe.index,
|
|
||||||
)
|
|
||||||
|
|
||||||
if self.feature_parameters["remove_outliers"]:
|
|
||||||
self.dh.remove_outliers(predict=True) # creates dropped index
|
|
||||||
|
|
||||||
if self.feature_parameters["DI_threshold"]:
|
|
||||||
self.dh.check_if_pred_in_training_spaces() # sets do_predict
|
|
||||||
|
|
||||||
predictions = self.model.predict(self.dh.data_dictionary["prediction_features"])
|
|
||||||
|
|
||||||
# compute the non-standardized predictions
|
|
||||||
self.dh.predictions = predictions * self.dh.data["labels_std"] + self.dh.data["labels_mean"]
|
|
||||||
|
|
||||||
# logger.info("--------------------Finished prediction--------------------")
|
|
||||||
|
|
||||||
return (self.dh.predictions, self.dh.do_predict)
|
|
@@ -166,8 +166,8 @@ class FreqaiExampleStrategy(IStrategy):
             dataframe["target_std"],
         ) = self.model.bridge.start(dataframe, metadata, self)
 
-        dataframe["target_roi"] = dataframe["target_mean"] + dataframe["target_std"] * 0.5
-        dataframe["sell_roi"] = dataframe["target_mean"] - dataframe["target_std"] * 1.5
+        dataframe["target_roi"] = dataframe["target_mean"] + dataframe["target_std"] * 1.5
+        dataframe["sell_roi"] = dataframe["target_mean"] - dataframe["target_std"] * 1
         return dataframe
 
     def populate_buy_trend(self, dataframe: DataFrame, metadata: dict) -> DataFrame:
@@ -183,7 +183,7 @@ class FreqaiExampleStrategy(IStrategy):
 
     def populate_sell_trend(self, dataframe: DataFrame, metadata: dict) -> DataFrame:
         sell_conditions = [
-            (dataframe["prediction"] < dataframe["sell_roi"]) & (dataframe["do_predict"] == 1)
+            (dataframe["do_predict"] <= 0)
         ]
         if sell_conditions:
             dataframe.loc[reduce(lambda x, y: x | y, sell_conditions), "sell"] = 1