diff --git a/config_examples/config_freqai.example.json b/config_examples/config_freqai.example.json
index 648f36917..a895a7341 100644
--- a/config_examples/config_freqai.example.json
+++ b/config_examples/config_freqai.example.json
@@ -57,8 +57,8 @@
         "train_period": 30,
         "backtest_period": 7,
         "identifier": "example",
-        "live_trained_timerange": "20220330-20220429",
-        "live_full_backtestrange": "20220302-20220501",
+        "live_trained_timerange": "",
+        "live_full_backtestrange": "",
         "corr_pairlist": [
             "BTC/USDT",
             "ETH/USDT",
@@ -68,20 +68,19 @@
         "feature_parameters": {
             "period": 12,
             "shift": 1,
-            "drop_features": false,
             "DI_threshold": 1,
             "weight_factor": 0,
             "principal_component_analysis": false,
-            "remove_outliers": false
+            "use_SVM_to_remove_outliers": false
         },
         "data_split_parameters": {
             "test_size": 0.25,
             "random_state": 1
         },
         "model_training_parameters": {
-            "n_estimators": 2000,
+            "n_estimators": 1000,
             "random_state": 1,
-            "learning_rate": 0.02,
+            "learning_rate": 0.1,
             "task_type": "CPU"
         }
     },
diff --git a/docs/freqai.md b/docs/freqai.md
index df41846a4..8a37e7d66 100644
--- a/docs/freqai.md
+++ b/docs/freqai.md
@@ -331,21 +331,21 @@ Users can reduce the dimensionality of their features by activating the `princip
 
 Which will perform PCA on the features and reduce the dimensionality of the data so that the
 explained variance of the data set is >= 0.999.
 
-### Removing outliers based on feature statistical distributions
+### Removing outliers using a Support Vector Machine (SVM)
 
 The user can tell Freqai to remove outlier data points from the training/test data sets by setting:
 
 ```json
     "freqai": {
         "feature_parameters" : {
-            "remove_outliers": true
+            "use_SVM_to_remove_outliers": true
        }
    }
 ```
 
-Freqai will check the statistical distributions of each feature (or component if the user activated
-`principal_component_analysis`) and remove any data point that sits more than 3 standard deviations away
-from the mean.
+Freqai will train an SVM on the training data (or components if the user activated
+`principal_component_analysis`) and remove any data point that it deems to lie beyond the
+feature space.
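+
+For those curious, a minimal standalone sketch of the idea (illustrative only — random data
+stands in for the real feature matrix, and `nu=0.1` mirrors the value currently hardcoded in
+`data_kitchen.py`):
+
+```python
+import numpy as np
+from sklearn import linear_model
+
+train_features = np.random.rand(1000, 10)  # stand-in for the training features
+
+# fit a one-class SVM; predict() then returns +1 for inliers and -1 for outliers
+svm = linear_model.SGDOneClassSVM(nu=0.1).fit(train_features)
+y_pred = svm.predict(train_features)
+
+inliers = train_features[y_pred == 1]
+print(f"tossed {(y_pred == -1).sum()} of {len(y_pred)} points")
+```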
 
 ## Additional information
diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py
index e35243f6a..f589a1c89 100644
--- a/freqtrade/freqai/data_kitchen.py
+++ b/freqtrade/freqai/data_kitchen.py
@@ -10,8 +10,9 @@ from typing import Any, Dict, List, Tuple
 import numpy as np
 import numpy.typing as npt
 import pandas as pd
-from joblib import dump, load
+from joblib import dump, load  # , Parallel, delayed  # used for auto distribution assignment
 from pandas import DataFrame
+from sklearn import linear_model
 from sklearn.metrics.pairwise import pairwise_distances
 from sklearn.model_selection import train_test_split
@@ -22,6 +23,9 @@ from freqtrade.resolvers import ExchangeResolver
 from freqtrade.strategy.interface import IStrategy
 
 
+# import scipy as spy  # used for auto distribution assignment
+
+
 SECONDS_IN_DAY = 86400
 
 logger = logging.getLogger(__name__)
@@ -52,6 +56,7 @@ class FreqaiDataKitchen:
         self.model_filename: str = ""
         self.model_dictionary: Dict[Any, Any] = {}
         self.live = live
+        self.svm_model: linear_model.SGDOneClassSVM = None
         if not self.live:
             self.full_timerange = self.create_fulltimerange(self.config["timerange"],
                                                             self.freqai_config["train_period"]
@@ -89,6 +94,10 @@
 
         # Save the trained model
         dump(model, save_path / str(self.model_filename + "_model.joblib"))
+
+        if self.svm_model is not None:
+            dump(self.svm_model, save_path / str(self.model_filename + "_svm_model.joblib"))
+
         self.data["model_path"] = str(self.model_path)
         self.data["model_filename"] = str(self.model_filename)
         self.data["training_features_list"] = list(self.data_dictionary["train_features"].columns)
@@ -104,6 +113,19 @@
         if self.live:
             self.model_dictionary[self.model_filename] = model
 
+        # TODO add a helper function to let user save/load any data they are custom adding. We
+        # do not want them having to edit the default save/load methods here. Below is an example
+        # of what we do NOT want.
+
+        # if self.freqai_config['feature_parameters']['determine_statistical_distributions']:
+        #     self.data_dictionary["upper_quantiles"].to_pickle(
+        #         save_path / str(self.model_filename + "_upper_quantiles.pkl")
+        #     )
+
+        #     self.data_dictionary["lower_quantiles"].to_pickle(
+        #         save_path / str(self.model_filename + "_lower_quantiles.pkl")
+        #     )
+
         return
 
     def load_data(self) -> Any:
@@ -121,6 +143,19 @@
             self.model_path / str(self.model_filename + "_trained_df.pkl")
         )
 
+        # TODO add a helper function to let user save/load any data they are custom adding. We
+        # do not want them having to edit the default save/load methods here. Below is an example
+        # of what we do NOT want.
+
+        # if self.freqai_config['feature_parameters']['determine_statistical_distributions']:
+        #     self.data_dictionary["upper_quantiles"] = pd.read_pickle(
+        #         self.model_path / str(self.model_filename + "_upper_quantiles.pkl")
+        #     )
+
+        #     self.data_dictionary["lower_quantiles"] = pd.read_pickle(
+        #         self.model_path / str(self.model_filename + "_lower_quantiles.pkl")
+        #     )
+
         self.model_path = Path(self.data["model_path"])
         self.model_filename = self.data["model_filename"]
@@ -130,6 +165,10 @@
         else:
             model = load(self.model_path / str(self.model_filename + "_model.joblib"))
 
+        if Path(self.model_path / str(self.model_filename +
+                                      "_svm_model.joblib")).resolve().exists():
+            self.svm_model = load(self.model_path / str(self.model_filename + "_svm_model.joblib"))
+
         assert model, (
             f"Unable to load model, ensure model exists at "
             f"{self.model_path} "
@@ -159,6 +198,12 @@
         else:
             weights = np.ones(len(filtered_dataframe))
 
+        stratification = None  # keeps train_test_split unstratified when "stratify" is 0
+        if self.config["freqai"]["feature_parameters"]["stratify"] > 0:
+            stratification = np.zeros(len(filtered_dataframe))
+            for i in range(1, len(stratification)):
+                if i % self.config["freqai"]["feature_parameters"]["stratify"] == 0:
+                    stratification[i] = 1
+
         (
             train_features,
             test_features,
@@ -170,6 +215,8 @@
             filtered_dataframe[: filtered_dataframe.shape[0]],
             labels,
             weights,
+            stratify=stratification,
+            # shuffle=False,
             **self.config["freqai"]["data_split_parameters"]
         )
 
@@ -261,9 +308,9 @@
 
         return self.data_dictionary
 
-    def standardize_data(self, data_dictionary: Dict) -> Dict[Any, Any]:
+    def normalize_data(self, data_dictionary: Dict) -> Dict[Any, Any]:
         """
-        Standardize all data in the data_dictionary according to the training dataset
+        Normalize all data in the data_dictionary according to the training dataset
         :params:
         :data_dictionary: dictionary containing the cleaned and split training/test data/labels
         :returns:
@@ -297,6 +344,42 @@
 
         return data_dictionary
 
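+    # NOTE: normalize_data() above applies a z-score, x' = (x - mean) / std, while
+    # standardize_data() below maps each column to [-1, 1] via
+    # x' = 2 * (x - min) / (max - min) - 1. The inverse of the latter,
+    # x = (x' + 1) * (max - min) / 2 + min, is how CatboostPredictionModel.predict()
+    # recovers non-standardized predictions from labels_max/labels_min.
+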
+ """ + # standardize the data by training stats + train_max = data_dictionary["train_features"].max() + train_min = data_dictionary["train_features"].min() + data_dictionary["train_features"] = 2 * ( + data_dictionary["train_features"] - train_min + ) / (train_max - train_min) - 1 + data_dictionary["test_features"] = 2 * ( + data_dictionary["test_features"] - train_min + ) / (train_max - train_min) - 1 + + train_labels_max = data_dictionary["train_labels"].max() + train_labels_min = data_dictionary["train_labels"].min() + data_dictionary["train_labels"] = 2 * ( + data_dictionary["train_labels"] - train_labels_min + ) / (train_labels_max - train_labels_min) - 1 + data_dictionary["test_labels"] = 2 * ( + data_dictionary["test_labels"] - train_labels_min + ) / (train_labels_max - train_labels_min) - 1 + + for item in train_max.keys(): + self.data[item + "_max"] = train_max[item] + self.data[item + "_min"] = train_min[item] + + self.data["labels_max"] = train_labels_max + self.data["labels_min"] = train_labels_min + + return data_dictionary + def standardize_data_from_metadata(self, df: DataFrame) -> DataFrame: """ Standardizes a set of data using the mean and standard deviation from @@ -305,6 +388,20 @@ class FreqaiDataKitchen: :df: Dataframe to be standardized """ + for item in df.keys(): + df[item] = 2 * (df[item] - self.data[item + "_min"]) / (self.data[item + "_max"] - + self.data[item + '_min']) - 1 + + return df + + def normalize_data_from_metadata(self, df: DataFrame) -> DataFrame: + """ + Normalizes a set of data using the mean and standard deviation from + the associated training data. + :params: + :df: Dataframe to be standardized + """ + for item in df.keys(): df[item] = (df[item] - self.data[item + "_mean"]) / self.data[item + "_std"] @@ -420,6 +517,8 @@ class FreqaiDataKitchen: self.data["n_kept_components"] = n_keep_components self.pca = pca2 + logger.info(f'PCA reduced total features from {n_components} to {n_keep_components}') + if not self.model_path.is_dir(): self.model_path.mkdir(parents=True, exist_ok=True) pk.dump(pca2, open(self.model_path / str(self.model_filename + "_pca_object.pkl"), "wb")) @@ -434,70 +533,53 @@ class FreqaiDataKitchen: return avg_mean_dist - def remove_outliers(self, predict: bool) -> None: - """ - Remove data that looks like an outlier based on the distribution of each - variable. - :params: - :predict: boolean which tells the function if this is prediction data or - training data coming in. 
- """ - - lower_quantile = self.data_dictionary["train_features"].quantile(0.001) - upper_quantile = self.data_dictionary["train_features"].quantile(0.999) + def use_SVM_to_remove_outliers(self, predict: bool) -> None: if predict: - - df = self.data_dictionary["prediction_features"][ - (self.data_dictionary["prediction_features"] < upper_quantile) - & (self.data_dictionary["prediction_features"] > lower_quantile) - ] - drop_index = pd.isnull(df).any(1) - self.data_dictionary["prediction_features"].fillna(0, inplace=True) - drop_index = ~drop_index - do_predict = np.array(drop_index.replace(True, 1).replace(False, 0)) + assert self.svm_model, "No svm model available for outlier removal" + y_pred = self.svm_model.predict(self.data_dictionary["prediction_features"]) + do_predict = np.where(y_pred == -1, 0, y_pred) logger.info( - "remove_outliers() tossed %s predictions", - len(do_predict) - do_predict.sum(), + f'svm_remove_outliers() tossed {len(do_predict) - do_predict.sum()} predictions' ) self.do_predict += do_predict self.do_predict -= 1 else: + # use SGDOneClassSVM to increase speed? + self.svm_model = linear_model.SGDOneClassSVM(nu=0.1).fit( + self.data_dictionary["train_features"] + ) + y_pred = self.svm_model.predict(self.data_dictionary["train_features"]) + dropped_points = np.where(y_pred == -1, 0, y_pred) + # keep_index = np.where(y_pred == 1) + self.data_dictionary["train_features"] = self.data_dictionary[ + "train_features"][(y_pred == 1)] + self.data_dictionary["train_labels"] = self.data_dictionary[ + "train_labels"][(y_pred == 1)] + self.data_dictionary["train_weights"] = self.data_dictionary[ + "train_weights"][(y_pred == 1)] - filter_train_df = self.data_dictionary["train_features"][ - (self.data_dictionary["train_features"] < upper_quantile) - & (self.data_dictionary["train_features"] > lower_quantile) - ] - drop_index = pd.isnull(filter_train_df).any(1) - drop_index = drop_index.replace(True, 1).replace(False, 0) - self.data_dictionary["train_features"] = self.data_dictionary["train_features"][ - (drop_index == 0) - ] - self.data_dictionary["train_labels"] = self.data_dictionary["train_labels"][ - (drop_index == 0) - ] - self.data_dictionary["train_weights"] = self.data_dictionary["train_weights"][ - (drop_index == 0) - ] + logger.info( + f'svm_remove_outliers() tossed {len(y_pred) - dropped_points.sum()}' + f' train points from {len(y_pred)}' + ) - # do the same for the test data - filter_test_df = self.data_dictionary["test_features"][ - (self.data_dictionary["test_features"] < upper_quantile) - & (self.data_dictionary["test_features"] > lower_quantile) - ] - drop_index = pd.isnull(filter_test_df).any(1) - drop_index = drop_index.replace(True, 1).replace(False, 0) - self.data_dictionary["test_labels"] = self.data_dictionary["test_labels"][ - (drop_index == 0) - ] - self.data_dictionary["test_features"] = self.data_dictionary["test_features"][ - (drop_index == 0) - ] - self.data_dictionary["test_weights"] = self.data_dictionary["test_weights"][ - (drop_index == 0) - ] + # same for test data + y_pred = self.svm_model.predict(self.data_dictionary["test_features"]) + dropped_points = np.where(y_pred == -1, 0, y_pred) + self.data_dictionary["test_features"] = self.data_dictionary[ + "test_features"][(y_pred == 1)] + self.data_dictionary["test_labels"] = self.data_dictionary[ + "test_labels"][(y_pred == 1)] + self.data_dictionary["test_weights"] = self.data_dictionary[ + "test_weights"][(y_pred == 1)] + + logger.info( + f'svm_remove_outliers() tossed {len(y_pred) - 
+                f' test points from {len(y_pred)}'
+            )
 
         return
 
@@ -507,32 +589,6 @@
         assert features, ("Could not find any features!")
         return features
 
-    # def build_feature_list(self, config: dict, metadata: dict) -> list:
-    #     """
-    #     SUPERCEDED BY self.find_features()
-    #     Build the list of features that will be used to filter
-    #     the full dataframe. Feature list is construced from the
-    #     user configuration file.
-    #     :params:
-    #     :config: Canonical freqtrade config file containing all
-    #     user defined input in config['freqai] dictionary.
-    #     """
-    #     features = []
-    #     for tf in config["freqai"]["timeframes"]:
-    #         for ft in config["freqai"]["base_features"]:
-    #             for n in range(config["freqai"]["feature_parameters"]["shift"] + 1):
-    #                 shift = ""
-    #                 if n > 0:
-    #                     shift = "_shift-" + str(n)
-    #                 features.append(metadata['pair'].split("/")[0] + "-" + ft + shift + "_" + tf)
-    #                 for p in config["freqai"]["corr_pairlist"]:
-    #                     if metadata['pair'] in p:
-    #                         continue  # avoid duplicate features
-    #                     features.append(p.split("/")[0] + "-" + ft + shift + "_" + tf)
-
-    #     # logger.info("number of features %s", len(features))
-    #     return features
-
     def check_if_pred_in_training_spaces(self) -> None:
         """
         Compares the distance from each prediction point to each training data
@@ -568,7 +624,7 @@
         training than older data.
         """
 
-        weights = np.zeros_like(num_weights)
+        weights = np.zeros(num_weights)
         for i in range(1, len(weights)):
             weights[len(weights) - i] = np.exp(
                 -i / (self.config["freqai"]["feature_parameters"]["weight_factor"] * num_weights)
@@ -638,19 +694,23 @@
 
         time = datetime.datetime.now(tz=datetime.timezone.utc).timestamp()
 
-        trained_timerange = TimeRange.parse_timerange(training_timerange)
+        if training_timerange:
+            trained_timerange = TimeRange.parse_timerange(training_timerange)
+            elapsed_time = (time - trained_timerange.stopts) / SECONDS_IN_DAY
+            trained_timerange.startts += self.freqai_config['backtest_period'] * SECONDS_IN_DAY
+            trained_timerange.stopts += self.freqai_config['backtest_period'] * SECONDS_IN_DAY
+            retrain = elapsed_time > self.freqai_config['backtest_period']
+        else:  # user passed no live_trained_timerange in config
+            trained_timerange = TimeRange.parse_timerange("20000101-20000201")
+            trained_timerange.startts = int(time - self.freqai_config['train_period'] *
+                                            SECONDS_IN_DAY)
+            trained_timerange.stopts = int(time)
+            retrain = True
 
-        elapsed_time = (time - trained_timerange.stopts) / SECONDS_IN_DAY
-
-        trained_timerange.startts += self.freqai_config['backtest_period'] * SECONDS_IN_DAY
-        trained_timerange.stopts += self.freqai_config['backtest_period'] * SECONDS_IN_DAY
 
         start = datetime.datetime.utcfromtimestamp(trained_timerange.startts)
         stop = datetime.datetime.utcfromtimestamp(trained_timerange.stopts)
-
         new_trained_timerange = start.strftime("%Y%m%d") + "-" + stop.strftime("%Y%m%d")
 
-        retrain = elapsed_time > self.freqai_config['backtest_period']
-
         if retrain:
             coin, _ = metadata['pair'].split("/")
             # set the new model_path
@@ -738,3 +798,141 @@
     def np_encoder(self, object):
         if isinstance(object, np.generic):
             return object.item()
+
+    # Functions containing useful data manipulation examples, but not actively in use.
+
+    # def build_feature_list(self, config: dict, metadata: dict) -> list:
+    #     """
+    #     SUPERSEDED BY self.find_features()
+    #     Build the list of features that will be used to filter
+    #     the full dataframe. Feature list is constructed from the
+    #     user configuration file.
+    #     :params:
+    #     :config: Canonical freqtrade config file containing all
+    #     user defined input in config['freqai'] dictionary.
+    #     """
+    #     features = []
+    #     for tf in config["freqai"]["timeframes"]:
+    #         for ft in config["freqai"]["base_features"]:
+    #             for n in range(config["freqai"]["feature_parameters"]["shift"] + 1):
+    #                 shift = ""
+    #                 if n > 0:
+    #                     shift = "_shift-" + str(n)
+    #                 features.append(metadata['pair'].split("/")[0] + "-" + ft + shift + "_" + tf)
+    #                 for p in config["freqai"]["corr_pairlist"]:
+    #                     if metadata['pair'] in p:
+    #                         continue  # avoid duplicate features
+    #                     features.append(p.split("/")[0] + "-" + ft + shift + "_" + tf)
+
+    #     # logger.info("number of features %s", len(features))
+    #     return features
+
+    # Possibly phasing out these outlier removal methods below in favor of
+    # use_SVM_to_remove_outliers (computationally more efficient and apparently
+    # higher-performing). But these have good data manipulation examples, so keep
+    # them commented here for now.
+
+    # def determine_statistical_distributions(self) -> None:
+    #     from fitter import Fitter
+
+    #     logger.info('Determining best model for all features, may take some time')
+
+    #     def compute_quantiles(ft):
+    #         f = Fitter(self.data_dictionary["train_features"][ft],
+    #                    distributions=['gamma', 'cauchy', 'laplace',
+    #                                   'beta', 'uniform', 'lognorm'])
+    #         f.fit()
+    #         # f.summary()
+    #         dist = list(f.get_best().items())[0][0]
+    #         params = f.get_best()[dist]
+    #         upper_q = getattr(spy.stats, list(f.get_best().items())[0][0]).ppf(0.999, **params)
+    #         lower_q = getattr(spy.stats, list(f.get_best().items())[0][0]).ppf(0.001, **params)
+
+    #         return ft, upper_q, lower_q, dist
+
+    #     quantiles_tuple = Parallel(n_jobs=-1)(
+    #         delayed(compute_quantiles)(ft) for ft in self.data_dictionary[
+    #             'train_features'].columns)
+
+    #     df = pd.DataFrame(quantiles_tuple, columns=['features', 'upper_quantiles',
+    #                                                 'lower_quantiles', 'dist'])
+    #     self.data_dictionary['upper_quantiles'] = df['upper_quantiles']
+    #     self.data_dictionary['lower_quantiles'] = df['lower_quantiles']
+
+    #     return
+
+    # def remove_outliers(self, predict: bool) -> None:
+    #     """
+    #     Remove data that looks like an outlier based on the distribution of each
+    #     variable.
+    #     :params:
+    #     :predict: boolean which tells the function if this is prediction data or
+    #     training data coming in.
+ # """ + + # lower_quantile = self.data_dictionary["lower_quantiles"].to_numpy() + # upper_quantile = self.data_dictionary["upper_quantiles"].to_numpy() + + # if predict: + + # df = self.data_dictionary["prediction_features"][ + # (self.data_dictionary["prediction_features"] < upper_quantile) + # & (self.data_dictionary["prediction_features"] > lower_quantile) + # ] + # drop_index = pd.isnull(df).any(1) + # self.data_dictionary["prediction_features"].fillna(0, inplace=True) + # drop_index = ~drop_index + # do_predict = np.array(drop_index.replace(True, 1).replace(False, 0)) + + # logger.info( + # "remove_outliers() tossed %s predictions", + # len(do_predict) - do_predict.sum(), + # ) + # self.do_predict += do_predict + # self.do_predict -= 1 + + # else: + + # filter_train_df = self.data_dictionary["train_features"][ + # (self.data_dictionary["train_features"] < upper_quantile) + # & (self.data_dictionary["train_features"] > lower_quantile) + # ] + # drop_index = pd.isnull(filter_train_df).any(1) + # drop_index = drop_index.replace(True, 1).replace(False, 0) + # self.data_dictionary["train_features"] = self.data_dictionary["train_features"][ + # (drop_index == 0) + # ] + # self.data_dictionary["train_labels"] = self.data_dictionary["train_labels"][ + # (drop_index == 0) + # ] + # self.data_dictionary["train_weights"] = self.data_dictionary["train_weights"][ + # (drop_index == 0) + # ] + + # logger.info( + # f'remove_outliers() tossed {drop_index.sum()}' + # f' training points from {len(filter_train_df)}' + # ) + + # # do the same for the test data + # filter_test_df = self.data_dictionary["test_features"][ + # (self.data_dictionary["test_features"] < upper_quantile) + # & (self.data_dictionary["test_features"] > lower_quantile) + # ] + # drop_index = pd.isnull(filter_test_df).any(1) + # drop_index = drop_index.replace(True, 1).replace(False, 0) + # self.data_dictionary["test_labels"] = self.data_dictionary["test_labels"][ + # (drop_index == 0) + # ] + # self.data_dictionary["test_features"] = self.data_dictionary["test_features"][ + # (drop_index == 0) + # ] + # self.data_dictionary["test_weights"] = self.data_dictionary["test_weights"][ + # (drop_index == 0) + # ] + + # logger.info( + # f'remove_outliers() tossed {drop_index.sum()}' + # f' test points from {len(filter_test_df)}' + # ) + + # return diff --git a/freqtrade/freqai/freqai_interface.py b/freqtrade/freqai/freqai_interface.py index 2523cd561..f1dd5550a 100644 --- a/freqtrade/freqai/freqai_interface.py +++ b/freqtrade/freqai/freqai_interface.py @@ -62,6 +62,7 @@ class IFreqaiModel(ABC): self.predictions = None self.training_on_separate_thread = False self.retrain = False + self.first = True def start(self, dataframe: DataFrame, metadata: dict, strategy: IStrategy) -> DataFrame: """ @@ -80,12 +81,12 @@ class IFreqaiModel(ABC): :metadata: pair metadata coming from strategy. 
""" - live = strategy.dp.runmode in (RunMode.DRY_RUN, RunMode.LIVE) + self.live = strategy.dp.runmode in (RunMode.DRY_RUN, RunMode.LIVE) self.pair = metadata["pair"] - self.dh = FreqaiDataKitchen(self.config, dataframe, live) + self.dh = FreqaiDataKitchen(self.config, dataframe, self.live) - if live: + if self.live: # logger.info('testing live') self.start_live(dataframe, metadata, strategy) @@ -115,11 +116,12 @@ class IFreqaiModel(ABC): self.dh.save_data(self.model) else: self.model = self.dh.load_data() - strategy_provided_features = self.dh.find_features(dataframe_train) - if strategy_provided_features != self.dh.training_features_list: - logger.info("User changed input features, retraining model.") - self.model = self.train(dataframe_train, metadata) - self.dh.save_data(self.model) + # strategy_provided_features = self.dh.find_features(dataframe_train) + # # TOFIX doesnt work with PCA + # if strategy_provided_features != self.dh.training_features_list: + # logger.info("User changed input features, retraining model.") + # self.model = self.train(dataframe_train, metadata) + # self.dh.save_data(self.model) preds, do_preds = self.predict(dataframe_backtest, metadata) @@ -148,7 +150,7 @@ class IFreqaiModel(ABC): if not self.training_on_separate_thread: # this will also prevent other pairs from trying to train simultaneously. (self.retrain, - new_trained_timerange) = self.dh.check_if_new_training_required(self.freqai_info[ + self.new_trained_timerange) = self.dh.check_if_new_training_required(self.freqai_info[ 'live_trained_timerange'], metadata) else: @@ -156,14 +158,19 @@ class IFreqaiModel(ABC): self.retrain = False if self.retrain or not file_exists: - self.training_on_separate_thread = True # acts like a lock - self.retrain_model_on_separate_thread(new_trained_timerange, metadata, strategy) + if self.first: + self.train_model_in_series(self.new_trained_timerange, metadata, strategy) + self.first = False + else: + self.training_on_separate_thread = True # acts like a lock + self.retrain_model_on_separate_thread(self.new_trained_timerange, + metadata, strategy) self.model = self.dh.load_data() strategy_provided_features = self.dh.find_features(dataframe) if strategy_provided_features != self.dh.training_features_list: - self.train_model_in_series(new_trained_timerange, metadata, strategy) + self.train_model_in_series(self.new_trained_timerange, metadata, strategy) preds, do_preds = self.predict(dataframe, metadata) self.dh.append_predictions(preds, do_preds, len(dataframe)) @@ -215,12 +222,36 @@ class IFreqaiModel(ABC): data (NaNs) or felt uncertain about data (PCA and DI index) """ + @abstractmethod + def data_cleaning_train(self) -> None: + """ + User can add data analysis and cleaning here. + Any function inside this method should drop training data points from the filtered_dataframe + based on user decided logic. See FreqaiDataKitchen::remove_outliers() for an example + of how outlier data points are dropped from the dataframe used for training. + """ + + @abstractmethod + def data_cleaning_predict(self) -> None: + """ + User can add data analysis and cleaning here. + These functions each modify self.dh.do_predict, which is a dataframe with equal length + to the number of candles coming from and returning to the strategy. Inside do_predict, + 1 allows prediction and < 0 signals to the strategy that the model is not confident in + the prediction. + See FreqaiDataKitchen::remove_outliers() for an example + of how the do_predict vector is modified. 
+        for buy signals.
+        """
+
     def model_exists(self, pair: str, training_timerange: str) -> bool:
         """
         Given a pair and path, check if a model already exists
         :param pair: pair e.g. BTC/USD
         :param path: path to model
         """
+        if self.live and not training_timerange:
+            return False
         coin, _ = pair.split("/")
         self.dh.model_filename = "cb_" + coin.lower() + "_" + training_timerange
         path_to_modelfile = Path(self.dh.model_path / str(self.dh.model_filename + "_model.joblib"))
@@ -265,3 +296,4 @@
 
         self.model = self.train(unfiltered_dataframe, metadata)
         self.dh.save_data(self.model)
+        self.retrain = False
diff --git a/freqtrade/freqai/prediction_models/CatboostPredictionModel.py b/freqtrade/freqai/prediction_models/CatboostPredictionModel.py
index e2ba6bd29..8550f3f15 100644
--- a/freqtrade/freqai/prediction_models/CatboostPredictionModel.py
+++ b/freqtrade/freqai/prediction_models/CatboostPredictionModel.py
@@ -29,7 +29,7 @@ class CatboostPredictionModel(IFreqaiModel):
             dataframe["close"]
             .shift(-self.feature_parameters["period"])
             .rolling(self.feature_parameters["period"])
-            .max()
+            .mean()
             / dataframe["close"]
             - 1
         )
@@ -68,15 +68,11 @@
         # standardize all data based on train_dataset only
         data_dictionary = self.dh.standardize_data(data_dictionary)
 
-        # optional additional data cleaning
-        if self.feature_parameters["principal_component_analysis"]:
-            self.dh.principal_component_analysis()
-        if self.feature_parameters["remove_outliers"]:
-            self.dh.remove_outliers(predict=False)
-        if self.feature_parameters["DI_threshold"]:
-            self.dh.data["avg_mean_dist"] = self.dh.compute_distances()
+        # optional additional data cleaning/analysis
+        self.data_cleaning_train()
 
-        logger.info("length of train data %s", len(data_dictionary["train_features"]))
+        logger.info(f'Training model on {len(self.dh.training_features_list)} features')
+        logger.info(f'Training model on {len(data_dictionary["train_features"])} data points')
 
         model = self.fit(data_dictionary)
 
@@ -86,9 +82,7 @@
 
     def fit(self, data_dictionary: Dict) -> Any:
         """
-        Most regressors use the same function names and arguments e.g. user
-        can drop in LGBMRegressor in place of CatBoostRegressor and all data
-        management will be properly handled by Freqai.
+        User sets up the training and test data to fit their desired model here
         :params:
         :data_dictionary: the dictionary constructed by DataHandler to hold
         all the training and test data/labels.
@@ -133,7 +127,51 @@
         filtered_dataframe = self.dh.standardize_data_from_metadata(filtered_dataframe)
         self.dh.data_dictionary["prediction_features"] = filtered_dataframe
 
-        # optional additional data cleaning
+        # optional additional data cleaning/analysis
+        self.data_cleaning_predict(filtered_dataframe)
+
+        predictions = self.model.predict(self.dh.data_dictionary["prediction_features"])
+
+        # compute the non-standardized predictions
+        self.dh.predictions = (predictions + 1) * (self.dh.data["labels_max"] -
+                                                   self.dh.data["labels_min"]) / 2 + self.dh.data[
+                                                       "labels_min"]
+
+        # logger.info("--------------------Finished prediction--------------------")
+
+        return (self.dh.predictions, self.dh.do_predict)
+
+    def data_cleaning_train(self) -> None:
+        """
+        User can add data analysis and cleaning here.
+        Any function inside this method should drop training data points from the
+        filtered_dataframe based on user-decided logic.
+        See FreqaiDataKitchen::use_SVM_to_remove_outliers() for an example
+        of how outlier data points are dropped from the dataframe used for training.
+        """
+        if self.feature_parameters["principal_component_analysis"]:
+            self.dh.principal_component_analysis()
+
+        # if self.feature_parameters["determine_statistical_distributions"]:
+        #     self.dh.determine_statistical_distributions()
+        # if self.feature_parameters["remove_outliers"]:
+        #     self.dh.remove_outliers(predict=False)
+
+        if self.feature_parameters["use_SVM_to_remove_outliers"]:
+            self.dh.use_SVM_to_remove_outliers(predict=False)
+        if self.feature_parameters["DI_threshold"]:
+            self.dh.data["avg_mean_dist"] = self.dh.compute_distances()
+
+    def data_cleaning_predict(self, filtered_dataframe: DataFrame) -> None:
+        """
+        User can add data analysis and cleaning here.
+        These functions each modify self.dh.do_predict, which is a dataframe with equal length
+        to the number of candles coming from and returning to the strategy. Inside do_predict,
+        1 allows prediction and < 0 signals to the strategy that the model is not confident in
+        the prediction.
+        See FreqaiDataKitchen::use_SVM_to_remove_outliers() for an example
+        of how the do_predict vector is modified. do_predict is ultimately passed back to strategy
+        for buy signals.
+        """
         if self.feature_parameters["principal_component_analysis"]:
             pca_components = self.dh.pca.transform(filtered_dataframe)
             self.dh.data_dictionary["prediction_features"] = pd.DataFrame(
                 data=pca_components,
                 columns=["PC" + str(i) for i in range(0, self.dh.data["n_kept_components"])],
                 index=filtered_dataframe.index,
             )
 
-        if self.feature_parameters["remove_outliers"]:
-            self.dh.remove_outliers(predict=True)  # creates dropped index
+        # if self.feature_parameters["determine_statistical_distributions"]:
+        #     self.dh.determine_statistical_distributions()
+        # if self.feature_parameters["remove_outliers"]:
+        #     self.dh.remove_outliers(predict=True)  # creates dropped index
+
+        if self.feature_parameters["use_SVM_to_remove_outliers"]:
+            self.dh.use_SVM_to_remove_outliers(predict=True)
 
         if self.feature_parameters["DI_threshold"]:
             self.dh.check_if_pred_in_training_spaces()  # sets do_predict
-
-        predictions = self.model.predict(self.dh.data_dictionary["prediction_features"])
-
-        # compute the non-standardized predictions
-        self.dh.predictions = predictions * self.dh.data["labels_std"] + self.dh.data["labels_mean"]
-
-        # logger.info("--------------------Finished prediction--------------------")
-
-        return (self.dh.predictions, self.dh.do_predict)
diff --git a/freqtrade/templates/ExamplePredictionModel.py b/freqtrade/templates/ExamplePredictionModel.py
deleted file mode 100644
index 3d2b7a808..000000000
--- a/freqtrade/templates/ExamplePredictionModel.py
+++ /dev/null
@@ -1,159 +0,0 @@
-import logging
-from typing import Any, Dict, Tuple
-
-import pandas as pd
-from catboost import CatBoostRegressor, Pool
-from pandas import DataFrame
-
-from freqtrade.freqai.freqai_interface import IFreqaiModel
-
-
-logger = logging.getLogger(__name__)
-
-
-class ExamplePredictionModel(IFreqaiModel):
-    """
-    User created prediction model. The class needs to override three necessary
-    functions, predict(), train(), fit(). The class inherits ModelHandler which
-    has its own DataHandler where data is held, saved, loaded, and managed.
-    """
-
-    def make_labels(self, dataframe: DataFrame) -> DataFrame:
-        """
-        User defines the labels here (target values).
-        :params:
-        :dataframe: the full dataframe for the present training period
-        """
-
-        dataframe["s"] = (
-            dataframe["close"]
-            .shift(-self.feature_parameters["period"])
-            .rolling(self.feature_parameters["period"])
-            .max()
-            / dataframe["close"]
-            - 1
-        )
-        self.dh.data["s_mean"] = dataframe["s"].mean()
-        self.dh.data["s_std"] = dataframe["s"].std()
-
-        # logger.info("label mean", self.dh.data["s_mean"], "label std", self.dh.data["s_std"])
-
-        return dataframe["s"]
-
-    def train(self, unfiltered_dataframe: DataFrame, metadata: dict) -> Tuple[DataFrame, DataFrame]:
-        """
-        Filter the training data and train a model to it. Train makes heavy use of the datakitchen
-        for storing, saving, loading, and analyzing the data.
-        :params:
-        :unfiltered_dataframe: Full dataframe for the current training period
-        :metadata: pair metadata from strategy.
-        :returns:
-        :model: Trained model which can be used to inference (self.predict)
-        """
-        logger.info("--------------------Starting training--------------------")
-
-        # create the full feature list based on user config info
-        self.dh.training_features_list = self.dh.build_feature_list(self.config, metadata)
-        unfiltered_labels = self.make_labels(unfiltered_dataframe)
-
-        # filter the features requested by user in the configuration file and elegantly handle NaNs
-        features_filtered, labels_filtered = self.dh.filter_features(
-            unfiltered_dataframe,
-            self.dh.training_features_list,
-            unfiltered_labels,
-            training_filter=True,
-        )
-
-        # split data into train/test data.
-        data_dictionary = self.dh.make_train_test_datasets(features_filtered, labels_filtered)
-        # standardize all data based on train_dataset only
-        data_dictionary = self.dh.standardize_data(data_dictionary)
-
-        # optional additional data cleaning
-        if self.feature_parameters["principal_component_analysis"]:
-            self.dh.principal_component_analysis()
-        if self.feature_parameters["remove_outliers"]:
-            self.dh.remove_outliers(predict=False)
-        if self.feature_parameters["DI_threshold"]:
-            self.dh.data["avg_mean_dist"] = self.dh.compute_distances()
-
-        logger.info("length of train data %s", len(data_dictionary["train_features"]))
-
-        model = self.fit(data_dictionary)
-
-        logger.info(f'--------------------done training {metadata["pair"]}--------------------')
-
-        return model
-
-    def fit(self, data_dictionary: Dict) -> Any:
-        """
-        Most regressors use the same function names and arguments e.g. user
-        can drop in LGBMRegressor in place of CatBoostRegressor and all data
-        management will be properly handled by Freqai.
-        :params:
-        :data_dictionary: the dictionary constructed by DataHandler to hold
-        all the training and test data/labels.
-        """
-
-        train_data = Pool(
-            data=data_dictionary["train_features"],
-            label=data_dictionary["train_labels"],
-            weight=data_dictionary["train_weights"],
-        )
-
-        test_data = Pool(
-            data=data_dictionary["test_features"],
-            label=data_dictionary["test_labels"],
-            weight=data_dictionary["test_weights"],
-        )
-
-        model = CatBoostRegressor(
-            verbose=100, early_stopping_rounds=400, **self.model_training_parameters
-        )
-        model.fit(X=train_data, eval_set=test_data)
-
-        return model
-
-    def predict(self, unfiltered_dataframe: DataFrame, metadata: dict) -> Tuple[DataFrame,
-                                                                                DataFrame]:
-        """
-        Filter the prediction features data and predict with it.
-        :param: unfiltered_dataframe: Full dataframe for the current backtest period.
-        :return:
-        :predictions: np.array of predictions
-        :do_predict: np.array of 1s and 0s to indicate places where freqai needed to remove
-        data (NaNs) or felt uncertain about data (PCA and DI index)
-        """
-
-        # logger.info("--------------------Starting prediction--------------------")
-
-        original_feature_list = self.dh.build_feature_list(self.config, metadata)
-        filtered_dataframe, _ = self.dh.filter_features(
-            unfiltered_dataframe, original_feature_list, training_filter=False
-        )
-        filtered_dataframe = self.dh.standardize_data_from_metadata(filtered_dataframe)
-        self.dh.data_dictionary["prediction_features"] = filtered_dataframe
-
-        # optional additional data cleaning
-        if self.feature_parameters["principal_component_analysis"]:
-            pca_components = self.dh.pca.transform(filtered_dataframe)
-            self.dh.data_dictionary["prediction_features"] = pd.DataFrame(
-                data=pca_components,
-                columns=["PC" + str(i) for i in range(0, self.dh.data["n_kept_components"])],
-                index=filtered_dataframe.index,
-            )
-
-        if self.feature_parameters["remove_outliers"]:
-            self.dh.remove_outliers(predict=True)  # creates dropped index
-
-        if self.feature_parameters["DI_threshold"]:
-            self.dh.check_if_pred_in_training_spaces()  # sets do_predict
-
-        predictions = self.model.predict(self.dh.data_dictionary["prediction_features"])
-
-        # compute the non-standardized predictions
-        self.dh.predictions = predictions * self.dh.data["labels_std"] + self.dh.data["labels_mean"]
-
-        # logger.info("--------------------Finished prediction--------------------")
-
-        return (self.dh.predictions, self.dh.do_predict)
diff --git a/freqtrade/templates/FreqaiExampleStrategy.py b/freqtrade/templates/FreqaiExampleStrategy.py
index c8befebcf..a76ea2303 100644
--- a/freqtrade/templates/FreqaiExampleStrategy.py
+++ b/freqtrade/templates/FreqaiExampleStrategy.py
@@ -166,8 +166,8 @@ class FreqaiExampleStrategy(IStrategy):
             dataframe["target_std"],
         ) = self.model.bridge.start(dataframe, metadata, self)
 
-        dataframe["target_roi"] = dataframe["target_mean"] + dataframe["target_std"] * 0.5
-        dataframe["sell_roi"] = dataframe["target_mean"] - dataframe["target_std"] * 1.5
+        dataframe["target_roi"] = dataframe["target_mean"] + dataframe["target_std"] * 1.5
+        dataframe["sell_roi"] = dataframe["target_mean"] - dataframe["target_std"] * 1
         return dataframe
 
     def populate_buy_trend(self, dataframe: DataFrame, metadata: dict) -> DataFrame:
@@ -183,7 +183,7 @@
 
    def populate_sell_trend(self, dataframe: DataFrame, metadata: dict) -> DataFrame:
         sell_conditions = [
-            (dataframe["prediction"] < dataframe["sell_roi"]) & (dataframe["do_predict"] == 1)
+            (dataframe["do_predict"] <= 0)
         ]
         if sell_conditions:
             dataframe.loc[reduce(lambda x, y: x | y, sell_conditions), "sell"] = 1
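
A minimal, self-contained sketch of what the new `stratify` option in
`make_train_test_datasets()` does (illustrative only — the data and `k` here are made up;
the real code reads `k` from `feature_parameters["stratify"]`):

```python
import numpy as np
from sklearn.model_selection import train_test_split

features = np.random.rand(100, 5)  # stand-in for filtered_dataframe
labels = np.random.rand(100)

# mark every k-th candle with a 1, as the loop in data_kitchen.py does
k = 10
stratification = np.zeros(len(features))
stratification[k::k] = 1

train_f, test_f, train_l, test_l = train_test_split(
    features, labels, stratify=stratification, test_size=0.25, random_state=1
)
# stratify splits the marked candles in the same 0.75/0.25 proportion as the
# unmarked ones, so the periodic samples are spread evenly across train and test
print(len(train_f), len(test_f))
```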