Aggregated commit. Add a support vector machine for outlier detection, improve the dry/live user interface, improve standardization, and fix various other bugs

robcaulk 2022-05-22 17:51:49 +02:00
parent c5ecf94177
commit 42d95af829
7 changed files with 404 additions and 300 deletions


@ -57,8 +57,8 @@
"train_period": 30, "train_period": 30,
"backtest_period": 7, "backtest_period": 7,
"identifier": "example", "identifier": "example",
"live_trained_timerange": "20220330-20220429", "live_trained_timerange": "",
"live_full_backtestrange": "20220302-20220501", "live_full_backtestrange": "",
"corr_pairlist": [ "corr_pairlist": [
"BTC/USDT", "BTC/USDT",
"ETH/USDT", "ETH/USDT",
@ -68,20 +68,19 @@
"feature_parameters": { "feature_parameters": {
"period": 12, "period": 12,
"shift": 1, "shift": 1,
"drop_features": false,
"DI_threshold": 1, "DI_threshold": 1,
"weight_factor": 0, "weight_factor": 0,
"principal_component_analysis": false, "principal_component_analysis": false,
"remove_outliers": false "use_SVM_to_remove_outliers": false
}, },
"data_split_parameters": { "data_split_parameters": {
"test_size": 0.25, "test_size": 0.25,
"random_state": 1 "random_state": 1
}, },
"model_training_parameters": { "model_training_parameters": {
"n_estimators": 2000, "n_estimators": 1000,
"random_state": 1, "random_state": 1,
"learning_rate": 0.02, "learning_rate": 0.1,
"task_type": "CPU" "task_type": "CPU"
} }
}, },


@ -331,21 +331,21 @@ Users can reduce the dimensionality of their features by activating the `princip
Which will perform PCA on the features and reduce the dimensionality of the data so that the explained Which will perform PCA on the features and reduce the dimensionality of the data so that the explained
variance of the data set is >= 0.999. variance of the data set is >= 0.999.
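For readers who want to see what that selection looks like in practice, here is a minimal sketch (not the exact FreqAI implementation) of keeping only enough PCA components to reach 0.999 cumulative explained variance, assuming sklearn and a pandas DataFrame of training features:

```python
# Minimal sketch: fit PCA, then keep only enough components to explain >= 99.9%
# of the variance of the training features. Names here are illustrative.
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

def fit_pca_keep_999(train_features: pd.DataFrame) -> PCA:
    pca = PCA()
    pca.fit(train_features)
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
    n_keep = min(int(np.searchsorted(cumulative_variance, 0.999)) + 1,
                 train_features.shape[1])
    # refit with only the retained components
    return PCA(n_components=n_keep).fit(train_features)
```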
### Removing outliers based on feature statistical distributions ### Removing outliers using a Support Vector Machine (SVM)
The user can tell Freqai to remove outlier data points from the training/test data sets by setting: The user can tell Freqai to remove outlier data points from the training/test data sets by setting:
```json ```json
"freqai": { "freqai": {
"feature_parameters" : { "feature_parameters" : {
"remove_outliers": true "use_SVM_to_remove_outliers: true
} }
} }
``` ```
Freqai will check the statistical distributions of each feature (or component if the user activated Freqai will train an SVM on the training data (or components if the user activated
`principal_component_analysis`) and remove any data point that sits more than 3 standard deviations away `principal_component_analysis`) and remove any data point that it deems to lie beyond the
from the mean. feature space.
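For readers unfamiliar with one-class SVMs, the idea behind the new flag can be sketched as follows. This mirrors the approach taken in the commit (sklearn's `SGDOneClassSVM` with `nu=0.1`) but is purely illustrative, and the function name is not FreqAI's:

```python
# Illustrative sketch: fit a one-class SVM on the training features and drop any
# row the model labels as an outlier (predict() returns -1 for outliers, 1 for inliers).
import pandas as pd
from sklearn import linear_model

def drop_svm_outliers(train_features: pd.DataFrame) -> pd.DataFrame:
    svm = linear_model.SGDOneClassSVM(nu=0.1).fit(train_features)
    y_pred = svm.predict(train_features)
    return train_features[y_pred == 1]
```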
## Additional information ## Additional information


@ -10,8 +10,9 @@ from typing import Any, Dict, List, Tuple
import numpy as np import numpy as np
import numpy.typing as npt import numpy.typing as npt
import pandas as pd import pandas as pd
from joblib import dump, load from joblib import dump, load # , Parallel, delayed # used for auto distribution assignment
from pandas import DataFrame from pandas import DataFrame
from sklearn import linear_model
from sklearn.metrics.pairwise import pairwise_distances from sklearn.metrics.pairwise import pairwise_distances
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
@ -22,6 +23,9 @@ from freqtrade.resolvers import ExchangeResolver
from freqtrade.strategy.interface import IStrategy from freqtrade.strategy.interface import IStrategy
# import scipy as spy # used for auto distribution assignment
SECONDS_IN_DAY = 86400 SECONDS_IN_DAY = 86400
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -52,6 +56,7 @@ class FreqaiDataKitchen:
self.model_filename: str = "" self.model_filename: str = ""
self.model_dictionary: Dict[Any, Any] = {} self.model_dictionary: Dict[Any, Any] = {}
self.live = live self.live = live
self.svm_model: linear_model.SGDOneClassSVM = None
if not self.live: if not self.live:
self.full_timerange = self.create_fulltimerange(self.config["timerange"], self.full_timerange = self.create_fulltimerange(self.config["timerange"],
self.freqai_config["train_period"] self.freqai_config["train_period"]
@ -89,6 +94,10 @@ class FreqaiDataKitchen:
# Save the trained model # Save the trained model
dump(model, save_path / str(self.model_filename + "_model.joblib")) dump(model, save_path / str(self.model_filename + "_model.joblib"))
if self.svm_model is not None:
dump(self.svm_model, save_path / str(self.model_filename + "_svm_model.joblib"))
self.data["model_path"] = str(self.model_path) self.data["model_path"] = str(self.model_path)
self.data["model_filename"] = str(self.model_filename) self.data["model_filename"] = str(self.model_filename)
self.data["training_features_list"] = list(self.data_dictionary["train_features"].columns) self.data["training_features_list"] = list(self.data_dictionary["train_features"].columns)
@ -104,6 +113,19 @@ class FreqaiDataKitchen:
if self.live: if self.live:
self.model_dictionary[self.model_filename] = model self.model_dictionary[self.model_filename] = model
# TODO add a helper function to let user save/load any data they are custom adding. We
# do not want them having to edit the default save/load methods here. Below is an example
# of what we do NOT want.
# if self.freqai_config['feature_parameters']['determine_statistical_distributions']:
# self.data_dictionary["upper_quantiles"].to_pickle(
# save_path / str(self.model_filename + "_upper_quantiles.pkl")
# )
# self.data_dictionary["lower_quantiles"].to_pickle(
# save_path / str(self.model_filename + "_lower_quantiles.pkl")
# )
return return
def load_data(self) -> Any: def load_data(self) -> Any:
@ -121,6 +143,19 @@ class FreqaiDataKitchen:
self.model_path / str(self.model_filename + "_trained_df.pkl") self.model_path / str(self.model_filename + "_trained_df.pkl")
) )
# TODO add a helper function to let user save/load any data they are custom adding. We
# do not want them having to edit the default save/load methods here. Below is an example
# of what we do NOT want.
# if self.freqai_config['feature_parameters']['determine_statistical_distributions']:
# self.data_dictionary["upper_quantiles"] = pd.read_pickle(
# self.model_path / str(self.model_filename + "_upper_quantiles.pkl")
# )
# self.data_dictionary["lower_quantiles"] = pd.read_pickle(
# self.model_path / str(self.model_filename + "_lower_quantiles.pkl")
# )
self.model_path = Path(self.data["model_path"]) self.model_path = Path(self.data["model_path"])
self.model_filename = self.data["model_filename"] self.model_filename = self.data["model_filename"]
@ -130,6 +165,10 @@ class FreqaiDataKitchen:
else: else:
model = load(self.model_path / str(self.model_filename + "_model.joblib")) model = load(self.model_path / str(self.model_filename + "_model.joblib"))
if Path(self.model_path / str(self.model_filename +
"_svm_model.joblib")).resolve().exists():
self.svm_model = load(self.model_path / str(self.model_filename + "_svm_model.joblib"))
assert model, ( assert model, (
f"Unable to load model, ensure model exists at " f"Unable to load model, ensure model exists at "
f"{self.model_path} " f"{self.model_path} "
@ -159,6 +198,12 @@ class FreqaiDataKitchen:
else: else:
weights = np.ones(len(filtered_dataframe)) weights = np.ones(len(filtered_dataframe))
if self.config["freqai"]["feature_parameters"]["stratify"] > 0:
stratification = np.zeros(len(filtered_dataframe))
for i in range(1, len(stratification)):
if i % self.config["freqai"]["feature_parameters"]["stratify"] == 0:
stratification[i] = 1
( (
train_features, train_features,
test_features, test_features,
@ -170,6 +215,8 @@ class FreqaiDataKitchen:
filtered_dataframe[: filtered_dataframe.shape[0]], filtered_dataframe[: filtered_dataframe.shape[0]],
labels, labels,
weights, weights,
stratify=stratification,
# shuffle=False,
**self.config["freqai"]["data_split_parameters"] **self.config["freqai"]["data_split_parameters"]
) )
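The `stratification` array built above marks every `stratify`-th row with a 1, and passing it to `train_test_split` makes sklearn preserve the ratio of marked to unmarked rows in both splits. A small illustration of the mechanism (values are made up, `stratify` assumed to be 3):

```python
# Illustration only: build the 0/1 stratification labels and let sklearn keep the
# label ratio identical in the train and test splits.
import numpy as np
from sklearn.model_selection import train_test_split

stratify_every = 3
n_rows = 12
stratification = np.zeros(n_rows)
for i in range(1, n_rows):
    if i % stratify_every == 0:
        stratification[i] = 1

X = np.arange(n_rows).reshape(-1, 1)
X_train, X_test = train_test_split(X, test_size=0.25, stratify=stratification)
print(len(X_train), len(X_test))  # 9 3, with both label values represented in each split
```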
@ -261,9 +308,9 @@ class FreqaiDataKitchen:
return self.data_dictionary return self.data_dictionary
def standardize_data(self, data_dictionary: Dict) -> Dict[Any, Any]: def normalize_data(self, data_dictionary: Dict) -> Dict[Any, Any]:
""" """
Standardize all data in the data_dictionary according to the training dataset Normalize all data in the data_dictionary according to the training dataset
:params: :params:
:data_dictionary: dictionary containing the cleaned and split training/test data/labels :data_dictionary: dictionary containing the cleaned and split training/test data/labels
:returns: :returns:
@ -297,6 +344,42 @@ class FreqaiDataKitchen:
return data_dictionary return data_dictionary
def standardize_data(self, data_dictionary: Dict) -> Dict[Any, Any]:
"""
Standardize all data in the data_dictionary according to the training dataset
:params:
:data_dictionary: dictionary containing the cleaned and split training/test data/labels
:returns:
:data_dictionary: updated dictionary with standardized values.
"""
# standardize the data by training stats
train_max = data_dictionary["train_features"].max()
train_min = data_dictionary["train_features"].min()
data_dictionary["train_features"] = 2 * (
data_dictionary["train_features"] - train_min
) / (train_max - train_min) - 1
data_dictionary["test_features"] = 2 * (
data_dictionary["test_features"] - train_min
) / (train_max - train_min) - 1
train_labels_max = data_dictionary["train_labels"].max()
train_labels_min = data_dictionary["train_labels"].min()
data_dictionary["train_labels"] = 2 * (
data_dictionary["train_labels"] - train_labels_min
) / (train_labels_max - train_labels_min) - 1
data_dictionary["test_labels"] = 2 * (
data_dictionary["test_labels"] - train_labels_min
) / (train_labels_max - train_labels_min) - 1
for item in train_max.keys():
self.data[item + "_max"] = train_max[item]
self.data[item + "_min"] = train_min[item]
self.data["labels_max"] = train_labels_max
self.data["labels_min"] = train_labels_min
return data_dictionary
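The renamed `normalize_data` keeps the previous z-score behaviour, while the new `standardize_data` above maps every feature and label into the range [-1, 1] using statistics from the training set only. A quick worked check of the formula (values are illustrative):

```python
# Worked check of the scaling used above: x_scaled = 2 * (x - min) / (max - min) - 1,
# where min/max come from the training data only.
train_min, train_max = 10.0, 30.0
for x in (10.0, 20.0, 30.0):
    x_scaled = 2 * (x - train_min) / (train_max - train_min) - 1
    print(x, "->", x_scaled)  # 10 -> -1.0, 20 -> 0.0, 30 -> 1.0
```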
def standardize_data_from_metadata(self, df: DataFrame) -> DataFrame: def standardize_data_from_metadata(self, df: DataFrame) -> DataFrame:
""" """
Standardizes a set of data using the mean and standard deviation from Standardizes a set of data using the mean and standard deviation from
@ -305,6 +388,20 @@ class FreqaiDataKitchen:
:df: Dataframe to be standardized :df: Dataframe to be standardized
""" """
for item in df.keys():
df[item] = 2 * (df[item] - self.data[item + "_min"]) / (self.data[item + "_max"] -
self.data[item + '_min']) - 1
return df
def normalize_data_from_metadata(self, df: DataFrame) -> DataFrame:
"""
Normalizes a set of data using the mean and standard deviation from
the associated training data.
:params:
:df: Dataframe to be standardized
"""
for item in df.keys(): for item in df.keys():
df[item] = (df[item] - self.data[item + "_mean"]) / self.data[item + "_std"] df[item] = (df[item] - self.data[item + "_mean"]) / self.data[item + "_std"]
@ -420,6 +517,8 @@ class FreqaiDataKitchen:
self.data["n_kept_components"] = n_keep_components self.data["n_kept_components"] = n_keep_components
self.pca = pca2 self.pca = pca2
logger.info(f'PCA reduced total features from {n_components} to {n_keep_components}')
if not self.model_path.is_dir(): if not self.model_path.is_dir():
self.model_path.mkdir(parents=True, exist_ok=True) self.model_path.mkdir(parents=True, exist_ok=True)
pk.dump(pca2, open(self.model_path / str(self.model_filename + "_pca_object.pkl"), "wb")) pk.dump(pca2, open(self.model_path / str(self.model_filename + "_pca_object.pkl"), "wb"))
@ -434,70 +533,53 @@ class FreqaiDataKitchen:
return avg_mean_dist return avg_mean_dist
def remove_outliers(self, predict: bool) -> None: def use_SVM_to_remove_outliers(self, predict: bool) -> None:
"""
Remove data that looks like an outlier based on the distribution of each
variable.
:params:
:predict: boolean which tells the function if this is prediction data or
training data coming in.
"""
lower_quantile = self.data_dictionary["train_features"].quantile(0.001)
upper_quantile = self.data_dictionary["train_features"].quantile(0.999)
if predict: if predict:
assert self.svm_model, "No svm model available for outlier removal"
df = self.data_dictionary["prediction_features"][ y_pred = self.svm_model.predict(self.data_dictionary["prediction_features"])
(self.data_dictionary["prediction_features"] < upper_quantile) do_predict = np.where(y_pred == -1, 0, y_pred)
& (self.data_dictionary["prediction_features"] > lower_quantile)
]
drop_index = pd.isnull(df).any(1)
self.data_dictionary["prediction_features"].fillna(0, inplace=True)
drop_index = ~drop_index
do_predict = np.array(drop_index.replace(True, 1).replace(False, 0))
logger.info( logger.info(
"remove_outliers() tossed %s predictions", f'svm_remove_outliers() tossed {len(do_predict) - do_predict.sum()} predictions'
len(do_predict) - do_predict.sum(),
) )
self.do_predict += do_predict self.do_predict += do_predict
self.do_predict -= 1 self.do_predict -= 1
else: else:
# use SGDOneClassSVM to increase speed?
self.svm_model = linear_model.SGDOneClassSVM(nu=0.1).fit(
self.data_dictionary["train_features"]
)
y_pred = self.svm_model.predict(self.data_dictionary["train_features"])
dropped_points = np.where(y_pred == -1, 0, y_pred)
# keep_index = np.where(y_pred == 1)
self.data_dictionary["train_features"] = self.data_dictionary[
"train_features"][(y_pred == 1)]
self.data_dictionary["train_labels"] = self.data_dictionary[
"train_labels"][(y_pred == 1)]
self.data_dictionary["train_weights"] = self.data_dictionary[
"train_weights"][(y_pred == 1)]
filter_train_df = self.data_dictionary["train_features"][ logger.info(
(self.data_dictionary["train_features"] < upper_quantile) f'svm_remove_outliers() tossed {len(y_pred) - dropped_points.sum()}'
& (self.data_dictionary["train_features"] > lower_quantile) f' train points from {len(y_pred)}'
] )
drop_index = pd.isnull(filter_train_df).any(1)
drop_index = drop_index.replace(True, 1).replace(False, 0)
self.data_dictionary["train_features"] = self.data_dictionary["train_features"][
(drop_index == 0)
]
self.data_dictionary["train_labels"] = self.data_dictionary["train_labels"][
(drop_index == 0)
]
self.data_dictionary["train_weights"] = self.data_dictionary["train_weights"][
(drop_index == 0)
]
# do the same for the test data # same for test data
filter_test_df = self.data_dictionary["test_features"][ y_pred = self.svm_model.predict(self.data_dictionary["test_features"])
(self.data_dictionary["test_features"] < upper_quantile) dropped_points = np.where(y_pred == -1, 0, y_pred)
& (self.data_dictionary["test_features"] > lower_quantile) self.data_dictionary["test_features"] = self.data_dictionary[
] "test_features"][(y_pred == 1)]
drop_index = pd.isnull(filter_test_df).any(1) self.data_dictionary["test_labels"] = self.data_dictionary[
drop_index = drop_index.replace(True, 1).replace(False, 0) "test_labels"][(y_pred == 1)]
self.data_dictionary["test_labels"] = self.data_dictionary["test_labels"][ self.data_dictionary["test_weights"] = self.data_dictionary[
(drop_index == 0) "test_weights"][(y_pred == 1)]
]
self.data_dictionary["test_features"] = self.data_dictionary["test_features"][ logger.info(
(drop_index == 0) f'svm_remove_outliers() tossed {len(y_pred) - dropped_points.sum()}'
] f' test points from {len(y_pred)}'
self.data_dictionary["test_weights"] = self.data_dictionary["test_weights"][ )
(drop_index == 0)
]
return return
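The prediction branch above folds the SVM verdict into `self.do_predict` with `+= do_predict` followed by `-= 1`, so a candle stays at 1 only if every active filter voted to keep it. A small illustration of that bookkeeping (the vote vectors are made up):

```python
# Illustration of the do_predict bookkeeping: each filter adds its 0/1 vote and
# subtracts 1, so only rows approved by every filter remain at 1.
import numpy as np

do_predict = np.ones(4, dtype=int)       # start: all candles allowed
svm_vote = np.array([1, 0, 1, 1])        # 0 = SVM flagged the candle as an outlier
do_predict += svm_vote
do_predict -= 1
print(do_predict)                        # [1 0 1 1]

di_vote = np.array([1, 1, 0, 1])         # a second filter, e.g. the DI threshold check
do_predict += di_vote
do_predict -= 1
print(do_predict)                        # [1 0 0 1] -> candles 1 and 2 are excluded
```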
@ -507,32 +589,6 @@ class FreqaiDataKitchen:
assert features, ("Could not find any features!") assert features, ("Could not find any features!")
return features return features
# def build_feature_list(self, config: dict, metadata: dict) -> list:
# """
# SUPERCEDED BY self.find_features()
# Build the list of features that will be used to filter
# the full dataframe. Feature list is construced from the
# user configuration file.
# :params:
# :config: Canonical freqtrade config file containing all
# user defined input in config['freqai] dictionary.
# """
# features = []
# for tf in config["freqai"]["timeframes"]:
# for ft in config["freqai"]["base_features"]:
# for n in range(config["freqai"]["feature_parameters"]["shift"] + 1):
# shift = ""
# if n > 0:
# shift = "_shift-" + str(n)
# features.append(metadata['pair'].split("/")[0] + "-" + ft + shift + "_" + tf)
# for p in config["freqai"]["corr_pairlist"]:
# if metadata['pair'] in p:
# continue # avoid duplicate features
# features.append(p.split("/")[0] + "-" + ft + shift + "_" + tf)
# # logger.info("number of features %s", len(features))
# return features
def check_if_pred_in_training_spaces(self) -> None: def check_if_pred_in_training_spaces(self) -> None:
""" """
Compares the distance from each prediction point to each training data Compares the distance from each prediction point to each training data
@ -568,7 +624,7 @@ class FreqaiDataKitchen:
training than older data. training than older data.
""" """
weights = np.zeros_like(num_weights) weights = np.zeros(num_weights)
for i in range(1, len(weights)): for i in range(1, len(weights)):
weights[len(weights) - i] = np.exp( weights[len(weights) - i] = np.exp(
-i / (self.config["freqai"]["feature_parameters"]["weight_factor"] * num_weights) -i / (self.config["freqai"]["feature_parameters"]["weight_factor"] * num_weights)
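The loop above assigns exponentially decaying weights so that the newest candles count most during training, with the decay rate set by `weight_factor`. A short illustrative computation (`weight_factor` assumed to be 0.5, five data points):

```python
# Illustration of the exponential recency weighting; weights[-1] is the newest point.
import numpy as np

weight_factor = 0.5
num_weights = 5
weights = np.zeros(num_weights)
for i in range(1, len(weights)):
    weights[len(weights) - i] = np.exp(-i / (weight_factor * num_weights))
print(weights)  # approximately [0, 0.20, 0.30, 0.45, 0.67], oldest to newest
```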
@ -638,19 +694,23 @@ class FreqaiDataKitchen:
time = datetime.datetime.now(tz=datetime.timezone.utc).timestamp() time = datetime.datetime.now(tz=datetime.timezone.utc).timestamp()
trained_timerange = TimeRange.parse_timerange(training_timerange) if training_timerange: # user provided a live_trained_timerange in config
trained_timerange = TimeRange.parse_timerange(training_timerange)
elapsed_time = (time - trained_timerange.stopts) / SECONDS_IN_DAY
trained_timerange.startts += self.freqai_config['backtest_period'] * SECONDS_IN_DAY
trained_timerange.stopts += self.freqai_config['backtest_period'] * SECONDS_IN_DAY
retrain = elapsed_time > self.freqai_config['backtest_period']
else:
trained_timerange = TimeRange.parse_timerange("20000101-20000201")
trained_timerange.startts = int(time - self.freqai_config['train_period'] *
SECONDS_IN_DAY)
trained_timerange.stopts = int(time)
retrain = True
elapsed_time = (time - trained_timerange.stopts) / SECONDS_IN_DAY
trained_timerange.startts += self.freqai_config['backtest_period'] * SECONDS_IN_DAY
trained_timerange.stopts += self.freqai_config['backtest_period'] * SECONDS_IN_DAY
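In live/dry-run mode the check above compares how many days have passed since the end of the last trained range against `backtest_period`; once the data has aged past that window, a retrain is scheduled on a timerange shifted forward by the same amount. A worked example of the arithmetic (timestamps are illustrative):

```python
# Illustration of the retrain decision: retrain once more than `backtest_period`
# days have elapsed since the last trained range ended.
SECONDS_IN_DAY = 86400
backtest_period = 7                          # days, from the freqai config

now = 1_653_000_000                          # illustrative current epoch timestamp
trained_stopts = now - 8 * SECONDS_IN_DAY    # last training window ended 8 days ago

elapsed_days = (now - trained_stopts) / SECONDS_IN_DAY
retrain = elapsed_days > backtest_period
print(elapsed_days, retrain)                 # 8.0 True -> a new model will be trained
```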
start = datetime.datetime.utcfromtimestamp(trained_timerange.startts) start = datetime.datetime.utcfromtimestamp(trained_timerange.startts)
stop = datetime.datetime.utcfromtimestamp(trained_timerange.stopts) stop = datetime.datetime.utcfromtimestamp(trained_timerange.stopts)
new_trained_timerange = start.strftime("%Y%m%d") + "-" + stop.strftime("%Y%m%d") new_trained_timerange = start.strftime("%Y%m%d") + "-" + stop.strftime("%Y%m%d")
retrain = elapsed_time > self.freqai_config['backtest_period']
if retrain: if retrain:
coin, _ = metadata['pair'].split("/") coin, _ = metadata['pair'].split("/")
# set the new model_path # set the new model_path
@ -738,3 +798,141 @@ class FreqaiDataKitchen:
def np_encoder(self, object): def np_encoder(self, object):
if isinstance(object, np.generic): if isinstance(object, np.generic):
return object.item() return object.item()
# Functions containing useful data manipulation examples, but not actively in use.
# def build_feature_list(self, config: dict, metadata: dict) -> list:
# """
# SUPERSEDED BY self.find_features()
# Build the list of features that will be used to filter
# the full dataframe. Feature list is constructed from the
# user configuration file.
# :params:
# :config: Canonical freqtrade config file containing all
# user defined input in config['freqai'] dictionary.
# """
# features = []
# for tf in config["freqai"]["timeframes"]:
# for ft in config["freqai"]["base_features"]:
# for n in range(config["freqai"]["feature_parameters"]["shift"] + 1):
# shift = ""
# if n > 0:
# shift = "_shift-" + str(n)
# features.append(metadata['pair'].split("/")[0] + "-" + ft + shift + "_" + tf)
# for p in config["freqai"]["corr_pairlist"]:
# if metadata['pair'] in p:
# continue # avoid duplicate features
# features.append(p.split("/")[0] + "-" + ft + shift + "_" + tf)
# # logger.info("number of features %s", len(features))
# return features
# Possibly phasing these outlier removal methods below out in favor of
# use_SVM_to_remove_outliers (computationally more efficient and apparently higher performance).
# But these have good data manipulation examples, so keep them commented here for now.
# def determine_statistical_distributions(self) -> None:
# from fitter import Fitter
# logger.info('Determining best model for all features, may take some time')
# def compute_quantiles(ft):
# f = Fitter(self.data_dictionary["train_features"][ft],
# distributions=['gamma', 'cauchy', 'laplace',
# 'beta', 'uniform', 'lognorm'])
# f.fit()
# # f.summary()
# dist = list(f.get_best().items())[0][0]
# params = f.get_best()[dist]
# upper_q = getattr(spy.stats, list(f.get_best().items())[0][0]).ppf(0.999, **params)
# lower_q = getattr(spy.stats, list(f.get_best().items())[0][0]).ppf(0.001, **params)
# return ft, upper_q, lower_q, dist
# quantiles_tuple = Parallel(n_jobs=-1)(
# delayed(compute_quantiles)(ft) for ft in self.data_dictionary[
# 'train_features'].columns)
# df = pd.DataFrame(quantiles_tuple, columns=['features', 'upper_quantiles',
# 'lower_quantiles', 'dist'])
# self.data_dictionary['upper_quantiles'] = df['upper_quantiles']
# self.data_dictionary['lower_quantiles'] = df['lower_quantiles']
# return
# def remove_outliers(self, predict: bool) -> None:
# """
# Remove data that looks like an outlier based on the distribution of each
# variable.
# :params:
# :predict: boolean which tells the function if this is prediction data or
# training data coming in.
# """
# lower_quantile = self.data_dictionary["lower_quantiles"].to_numpy()
# upper_quantile = self.data_dictionary["upper_quantiles"].to_numpy()
# if predict:
# df = self.data_dictionary["prediction_features"][
# (self.data_dictionary["prediction_features"] < upper_quantile)
# & (self.data_dictionary["prediction_features"] > lower_quantile)
# ]
# drop_index = pd.isnull(df).any(1)
# self.data_dictionary["prediction_features"].fillna(0, inplace=True)
# drop_index = ~drop_index
# do_predict = np.array(drop_index.replace(True, 1).replace(False, 0))
# logger.info(
# "remove_outliers() tossed %s predictions",
# len(do_predict) - do_predict.sum(),
# )
# self.do_predict += do_predict
# self.do_predict -= 1
# else:
# filter_train_df = self.data_dictionary["train_features"][
# (self.data_dictionary["train_features"] < upper_quantile)
# & (self.data_dictionary["train_features"] > lower_quantile)
# ]
# drop_index = pd.isnull(filter_train_df).any(1)
# drop_index = drop_index.replace(True, 1).replace(False, 0)
# self.data_dictionary["train_features"] = self.data_dictionary["train_features"][
# (drop_index == 0)
# ]
# self.data_dictionary["train_labels"] = self.data_dictionary["train_labels"][
# (drop_index == 0)
# ]
# self.data_dictionary["train_weights"] = self.data_dictionary["train_weights"][
# (drop_index == 0)
# ]
# logger.info(
# f'remove_outliers() tossed {drop_index.sum()}'
# f' training points from {len(filter_train_df)}'
# )
# # do the same for the test data
# filter_test_df = self.data_dictionary["test_features"][
# (self.data_dictionary["test_features"] < upper_quantile)
# & (self.data_dictionary["test_features"] > lower_quantile)
# ]
# drop_index = pd.isnull(filter_test_df).any(1)
# drop_index = drop_index.replace(True, 1).replace(False, 0)
# self.data_dictionary["test_labels"] = self.data_dictionary["test_labels"][
# (drop_index == 0)
# ]
# self.data_dictionary["test_features"] = self.data_dictionary["test_features"][
# (drop_index == 0)
# ]
# self.data_dictionary["test_weights"] = self.data_dictionary["test_weights"][
# (drop_index == 0)
# ]
# logger.info(
# f'remove_outliers() tossed {drop_index.sum()}'
# f' test points from {len(filter_test_df)}'
# )
# return


@ -62,6 +62,7 @@ class IFreqaiModel(ABC):
self.predictions = None self.predictions = None
self.training_on_separate_thread = False self.training_on_separate_thread = False
self.retrain = False self.retrain = False
self.first = True
def start(self, dataframe: DataFrame, metadata: dict, strategy: IStrategy) -> DataFrame: def start(self, dataframe: DataFrame, metadata: dict, strategy: IStrategy) -> DataFrame:
""" """
@ -80,12 +81,12 @@ class IFreqaiModel(ABC):
:metadata: pair metadata coming from strategy. :metadata: pair metadata coming from strategy.
""" """
live = strategy.dp.runmode in (RunMode.DRY_RUN, RunMode.LIVE) self.live = strategy.dp.runmode in (RunMode.DRY_RUN, RunMode.LIVE)
self.pair = metadata["pair"] self.pair = metadata["pair"]
self.dh = FreqaiDataKitchen(self.config, dataframe, live) self.dh = FreqaiDataKitchen(self.config, dataframe, self.live)
if live: if self.live:
# logger.info('testing live') # logger.info('testing live')
self.start_live(dataframe, metadata, strategy) self.start_live(dataframe, metadata, strategy)
@ -115,11 +116,12 @@ class IFreqaiModel(ABC):
self.dh.save_data(self.model) self.dh.save_data(self.model)
else: else:
self.model = self.dh.load_data() self.model = self.dh.load_data()
strategy_provided_features = self.dh.find_features(dataframe_train) # strategy_provided_features = self.dh.find_features(dataframe_train)
if strategy_provided_features != self.dh.training_features_list: # # TOFIX doesn't work with PCA
logger.info("User changed input features, retraining model.") # if strategy_provided_features != self.dh.training_features_list:
self.model = self.train(dataframe_train, metadata) # logger.info("User changed input features, retraining model.")
self.dh.save_data(self.model) # self.model = self.train(dataframe_train, metadata)
# self.dh.save_data(self.model)
preds, do_preds = self.predict(dataframe_backtest, metadata) preds, do_preds = self.predict(dataframe_backtest, metadata)
@ -148,7 +150,7 @@ class IFreqaiModel(ABC):
if not self.training_on_separate_thread: if not self.training_on_separate_thread:
# this will also prevent other pairs from trying to train simultaneously. # this will also prevent other pairs from trying to train simultaneously.
(self.retrain, (self.retrain,
new_trained_timerange) = self.dh.check_if_new_training_required(self.freqai_info[ self.new_trained_timerange) = self.dh.check_if_new_training_required(self.freqai_info[
'live_trained_timerange'], 'live_trained_timerange'],
metadata) metadata)
else: else:
@ -156,14 +158,19 @@ class IFreqaiModel(ABC):
self.retrain = False self.retrain = False
if self.retrain or not file_exists: if self.retrain or not file_exists:
self.training_on_separate_thread = True # acts like a lock if self.first:
self.retrain_model_on_separate_thread(new_trained_timerange, metadata, strategy) self.train_model_in_series(self.new_trained_timerange, metadata, strategy)
self.first = False
else:
self.training_on_separate_thread = True # acts like a lock
self.retrain_model_on_separate_thread(self.new_trained_timerange,
metadata, strategy)
self.model = self.dh.load_data() self.model = self.dh.load_data()
strategy_provided_features = self.dh.find_features(dataframe) strategy_provided_features = self.dh.find_features(dataframe)
if strategy_provided_features != self.dh.training_features_list: if strategy_provided_features != self.dh.training_features_list:
self.train_model_in_series(new_trained_timerange, metadata, strategy) self.train_model_in_series(self.new_trained_timerange, metadata, strategy)
preds, do_preds = self.predict(dataframe, metadata) preds, do_preds = self.predict(dataframe, metadata)
self.dh.append_predictions(preds, do_preds, len(dataframe)) self.dh.append_predictions(preds, do_preds, len(dataframe))
@ -215,12 +222,36 @@ class IFreqaiModel(ABC):
data (NaNs) or felt uncertain about data (PCA and DI index) data (NaNs) or felt uncertain about data (PCA and DI index)
""" """
@abstractmethod
def data_cleaning_train(self) -> None:
"""
User can add data analysis and cleaning here.
Any function inside this method should drop training data points from the filtered_dataframe
based on user decided logic. See FreqaiDataKitchen::remove_outliers() for an example
of how outlier data points are dropped from the dataframe used for training.
"""
@abstractmethod
def data_cleaning_predict(self) -> None:
"""
User can add data analysis and cleaning here.
These functions each modify self.dh.do_predict, which is a dataframe with equal length
to the number of candles coming from and returning to the strategy. Inside do_predict,
1 allows prediction and < 0 signals to the strategy that the model is not confident in
the prediction.
See FreqaiDataKitchen::remove_outliers() for an example
of how the do_predict vector is modified. do_predict is ultimately passed back to strategy
for buy signals.
"""
def model_exists(self, pair: str, training_timerange: str) -> bool: def model_exists(self, pair: str, training_timerange: str) -> bool:
""" """
Given a pair and path, check if a model already exists Given a pair and path, check if a model already exists
:param pair: pair e.g. BTC/USD :param pair: pair e.g. BTC/USD
:param path: path to model :param path: path to model
""" """
if self.live and training_timerange is None:
return False
coin, _ = pair.split("/") coin, _ = pair.split("/")
self.dh.model_filename = "cb_" + coin.lower() + "_" + training_timerange self.dh.model_filename = "cb_" + coin.lower() + "_" + training_timerange
path_to_modelfile = Path(self.dh.model_path / str(self.dh.model_filename + "_model.joblib")) path_to_modelfile = Path(self.dh.model_path / str(self.dh.model_filename + "_model.joblib"))
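For reference, the filename convention used in `model_exists` produces paths like the one below (the pair, timerange, and base path are illustrative, not taken from the commit):

```python
# Illustrative reconstruction of the model filename convention (all values made up).
from pathlib import Path

pair = "BTC/USDT"
training_timerange = "20220330-20220429"
model_path = Path("user_data/models/example")        # assumed base directory

coin, _ = pair.split("/")
model_filename = "cb_" + coin.lower() + "_" + training_timerange
path_to_modelfile = Path(model_path / str(model_filename + "_model.joblib"))
print(path_to_modelfile)  # user_data/models/example/cb_btc_20220330-20220429_model.joblib
```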
@ -265,3 +296,4 @@ class IFreqaiModel(ABC):
self.model = self.train(unfiltered_dataframe, metadata) self.model = self.train(unfiltered_dataframe, metadata)
self.dh.save_data(self.model) self.dh.save_data(self.model)
self.retrain = False


@ -29,7 +29,7 @@ class CatboostPredictionModel(IFreqaiModel):
dataframe["close"] dataframe["close"]
.shift(-self.feature_parameters["period"]) .shift(-self.feature_parameters["period"])
.rolling(self.feature_parameters["period"]) .rolling(self.feature_parameters["period"])
.max() .mean()
/ dataframe["close"] / dataframe["close"]
- 1 - 1
) )
@ -68,15 +68,11 @@ class CatboostPredictionModel(IFreqaiModel):
# standardize all data based on train_dataset only # standardize all data based on train_dataset only
data_dictionary = self.dh.standardize_data(data_dictionary) data_dictionary = self.dh.standardize_data(data_dictionary)
# optional additional data cleaning # optional additional data cleaning/analysis
if self.feature_parameters["principal_component_analysis"]: self.data_cleaning_train()
self.dh.principal_component_analysis()
if self.feature_parameters["remove_outliers"]:
self.dh.remove_outliers(predict=False)
if self.feature_parameters["DI_threshold"]:
self.dh.data["avg_mean_dist"] = self.dh.compute_distances()
logger.info("length of train data %s", len(data_dictionary["train_features"])) logger.info(f'Training model on {len(self.dh.training_features_list)} features')
logger.info(f'Training model on {len(data_dictionary["train_features"])} data points')
model = self.fit(data_dictionary) model = self.fit(data_dictionary)
@ -86,9 +82,7 @@ class CatboostPredictionModel(IFreqaiModel):
def fit(self, data_dictionary: Dict) -> Any: def fit(self, data_dictionary: Dict) -> Any:
""" """
Most regressors use the same function names and arguments e.g. user User sets up the training and test data to fit their desired model here
can drop in LGBMRegressor in place of CatBoostRegressor and all data
management will be properly handled by Freqai.
:params: :params:
:data_dictionary: the dictionary constructed by DataHandler to hold :data_dictionary: the dictionary constructed by DataHandler to hold
all the training and test data/labels. all the training and test data/labels.
@ -133,7 +127,51 @@ class CatboostPredictionModel(IFreqaiModel):
filtered_dataframe = self.dh.standardize_data_from_metadata(filtered_dataframe) filtered_dataframe = self.dh.standardize_data_from_metadata(filtered_dataframe)
self.dh.data_dictionary["prediction_features"] = filtered_dataframe self.dh.data_dictionary["prediction_features"] = filtered_dataframe
# optional additional data cleaning # optional additional data cleaning/analysis
self.data_cleaning_predict(filtered_dataframe)
predictions = self.model.predict(self.dh.data_dictionary["prediction_features"])
# compute the non-standardized predictions
self.dh.predictions = (predictions + 1) * (self.dh.data["labels_max"] -
self.dh.data["labels_min"]) / 2 + self.dh.data[
"labels_min"]
# logger.info("--------------------Finished prediction--------------------")
return (self.dh.predictions, self.dh.do_predict)
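Because labels are now scaled into [-1, 1] by `standardize_data`, the line above maps the model output back to real units with the inverse transform. A quick worked check (the label range is illustrative):

```python
# Worked check of the inverse label scaling used above:
# prediction = (prediction_scaled + 1) * (labels_max - labels_min) / 2 + labels_min
labels_min, labels_max = -0.05, 0.10
for p_scaled in (-1.0, 0.0, 1.0):
    p_real = (p_scaled + 1) * (labels_max - labels_min) / 2 + labels_min
    print(p_scaled, "->", round(p_real, 3))  # -1 -> -0.05, 0 -> 0.025, 1 -> 0.1
```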
def data_cleaning_train(self) -> None:
"""
User can add data analysis and cleaning here.
Any function inside this method should drop training data points from the filtered_dataframe
based on user decided logic. See FreqaiDataKitchen::remove_outliers() for an example
of how outlier data points are dropped from the dataframe used for training.
"""
if self.feature_parameters["principal_component_analysis"]:
self.dh.principal_component_analysis()
# if self.feature_parameters["determine_statistical_distributions"]:
# self.dh.determine_statistical_distributions()
# if self.feature_parameters["remove_outliers"]:
# self.dh.remove_outliers(predict=False)
if self.feature_parameters["use_SVM_to_remove_outliers"]:
self.dh.use_SVM_to_remove_outliers(predict=False)
if self.feature_parameters["DI_threshold"]:
self.dh.data["avg_mean_dist"] = self.dh.compute_distances()
def data_cleaning_predict(self, filtered_dataframe: DataFrame) -> None:
"""
User can add data analysis and cleaning here.
These functions each modify self.dh.do_predict, which is a dataframe with equal length
to the number of candles coming from and returning to the strategy. Inside do_predict,
1 allows prediction and < 0 signals to the strategy that the model is not confident in
the prediction.
See FreqaiDataKitchen::remove_outliers() for an example
of how the do_predict vector is modified. do_predict is ultimately passed back to strategy
for buy signals.
"""
if self.feature_parameters["principal_component_analysis"]: if self.feature_parameters["principal_component_analysis"]:
pca_components = self.dh.pca.transform(filtered_dataframe) pca_components = self.dh.pca.transform(filtered_dataframe)
self.dh.data_dictionary["prediction_features"] = pd.DataFrame( self.dh.data_dictionary["prediction_features"] = pd.DataFrame(
@ -142,17 +180,13 @@ class CatboostPredictionModel(IFreqaiModel):
index=filtered_dataframe.index, index=filtered_dataframe.index,
) )
if self.feature_parameters["remove_outliers"]: # if self.feature_parameters["determine_statistical_distributions"]:
self.dh.remove_outliers(predict=True) # creates dropped index # self.dh.determine_statistical_distributions()
# if self.feature_parameters["remove_outliers"]:
# self.dh.remove_outliers(predict=True) # creates dropped index
if self.feature_parameters["use_SVM_to_remove_outliers"]:
self.dh.use_SVM_to_remove_outliers(predict=True)
if self.feature_parameters["DI_threshold"]: if self.feature_parameters["DI_threshold"]:
self.dh.check_if_pred_in_training_spaces() # sets do_predict self.dh.check_if_pred_in_training_spaces() # sets do_predict
predictions = self.model.predict(self.dh.data_dictionary["prediction_features"])
# compute the non-standardized predictions
self.dh.predictions = predictions * self.dh.data["labels_std"] + self.dh.data["labels_mean"]
# logger.info("--------------------Finished prediction--------------------")
return (self.dh.predictions, self.dh.do_predict)


@ -1,159 +0,0 @@
import logging
from typing import Any, Dict, Tuple
import pandas as pd
from catboost import CatBoostRegressor, Pool
from pandas import DataFrame
from freqtrade.freqai.freqai_interface import IFreqaiModel
logger = logging.getLogger(__name__)
class ExamplePredictionModel(IFreqaiModel):
"""
User created prediction model. The class needs to override three necessary
functions, predict(), train(), fit(). The class inherits ModelHandler which
has its own DataHandler where data is held, saved, loaded, and managed.
"""
def make_labels(self, dataframe: DataFrame) -> DataFrame:
"""
User defines the labels here (target values).
:params:
:dataframe: the full dataframe for the present training period
"""
dataframe["s"] = (
dataframe["close"]
.shift(-self.feature_parameters["period"])
.rolling(self.feature_parameters["period"])
.max()
/ dataframe["close"]
- 1
)
self.dh.data["s_mean"] = dataframe["s"].mean()
self.dh.data["s_std"] = dataframe["s"].std()
# logger.info("label mean", self.dh.data["s_mean"], "label std", self.dh.data["s_std"])
return dataframe["s"]
def train(self, unfiltered_dataframe: DataFrame, metadata: dict) -> Tuple[DataFrame, DataFrame]:
"""
Filter the training data and train a model to it. Train makes heavy use of the datakitchen
for storing, saving, loading, and analyzing the data.
:params:
:unfiltered_dataframe: Full dataframe for the current training period
:metadata: pair metadata from strategy.
:returns:
:model: Trained model which can be used to inference (self.predict)
"""
logger.info("--------------------Starting training--------------------")
# create the full feature list based on user config info
self.dh.training_features_list = self.dh.build_feature_list(self.config, metadata)
unfiltered_labels = self.make_labels(unfiltered_dataframe)
# filter the features requested by user in the configuration file and elegantly handle NaNs
features_filtered, labels_filtered = self.dh.filter_features(
unfiltered_dataframe,
self.dh.training_features_list,
unfiltered_labels,
training_filter=True,
)
# split data into train/test data.
data_dictionary = self.dh.make_train_test_datasets(features_filtered, labels_filtered)
# standardize all data based on train_dataset only
data_dictionary = self.dh.standardize_data(data_dictionary)
# optional additional data cleaning
if self.feature_parameters["principal_component_analysis"]:
self.dh.principal_component_analysis()
if self.feature_parameters["remove_outliers"]:
self.dh.remove_outliers(predict=False)
if self.feature_parameters["DI_threshold"]:
self.dh.data["avg_mean_dist"] = self.dh.compute_distances()
logger.info("length of train data %s", len(data_dictionary["train_features"]))
model = self.fit(data_dictionary)
logger.info(f'--------------------done training {metadata["pair"]}--------------------')
return model
def fit(self, data_dictionary: Dict) -> Any:
"""
Most regressors use the same function names and arguments e.g. user
can drop in LGBMRegressor in place of CatBoostRegressor and all data
management will be properly handled by Freqai.
:params:
:data_dictionary: the dictionary constructed by DataHandler to hold
all the training and test data/labels.
"""
train_data = Pool(
data=data_dictionary["train_features"],
label=data_dictionary["train_labels"],
weight=data_dictionary["train_weights"],
)
test_data = Pool(
data=data_dictionary["test_features"],
label=data_dictionary["test_labels"],
weight=data_dictionary["test_weights"],
)
model = CatBoostRegressor(
verbose=100, early_stopping_rounds=400, **self.model_training_parameters
)
model.fit(X=train_data, eval_set=test_data)
return model
def predict(self, unfiltered_dataframe: DataFrame, metadata: dict) -> Tuple[DataFrame,
DataFrame]:
"""
Filter the prediction features data and predict with it.
:param: unfiltered_dataframe: Full dataframe for the current backtest period.
:return:
:predictions: np.array of predictions
:do_predict: np.array of 1s and 0s to indicate places where freqai needed to remove
data (NaNs) or felt uncertain about data (PCA and DI index)
"""
# logger.info("--------------------Starting prediction--------------------")
original_feature_list = self.dh.build_feature_list(self.config, metadata)
filtered_dataframe, _ = self.dh.filter_features(
unfiltered_dataframe, original_feature_list, training_filter=False
)
filtered_dataframe = self.dh.standardize_data_from_metadata(filtered_dataframe)
self.dh.data_dictionary["prediction_features"] = filtered_dataframe
# optional additional data cleaning
if self.feature_parameters["principal_component_analysis"]:
pca_components = self.dh.pca.transform(filtered_dataframe)
self.dh.data_dictionary["prediction_features"] = pd.DataFrame(
data=pca_components,
columns=["PC" + str(i) for i in range(0, self.dh.data["n_kept_components"])],
index=filtered_dataframe.index,
)
if self.feature_parameters["remove_outliers"]:
self.dh.remove_outliers(predict=True) # creates dropped index
if self.feature_parameters["DI_threshold"]:
self.dh.check_if_pred_in_training_spaces() # sets do_predict
predictions = self.model.predict(self.dh.data_dictionary["prediction_features"])
# compute the non-standardized predictions
self.dh.predictions = predictions * self.dh.data["labels_std"] + self.dh.data["labels_mean"]
# logger.info("--------------------Finished prediction--------------------")
return (self.dh.predictions, self.dh.do_predict)


@ -166,8 +166,8 @@ class FreqaiExampleStrategy(IStrategy):
dataframe["target_std"], dataframe["target_std"],
) = self.model.bridge.start(dataframe, metadata, self) ) = self.model.bridge.start(dataframe, metadata, self)
dataframe["target_roi"] = dataframe["target_mean"] + dataframe["target_std"] * 0.5 dataframe["target_roi"] = dataframe["target_mean"] + dataframe["target_std"] * 1.5
dataframe["sell_roi"] = dataframe["target_mean"] - dataframe["target_std"] * 1.5 dataframe["sell_roi"] = dataframe["target_mean"] - dataframe["target_std"] * 1
return dataframe return dataframe
def populate_buy_trend(self, dataframe: DataFrame, metadata: dict) -> DataFrame: def populate_buy_trend(self, dataframe: DataFrame, metadata: dict) -> DataFrame:
@ -183,7 +183,7 @@ class FreqaiExampleStrategy(IStrategy):
def populate_sell_trend(self, dataframe: DataFrame, metadata: dict) -> DataFrame: def populate_sell_trend(self, dataframe: DataFrame, metadata: dict) -> DataFrame:
sell_conditions = [ sell_conditions = [
(dataframe["prediction"] < dataframe["sell_roi"]) & (dataframe["do_predict"] == 1) (dataframe["do_predict"] <= 0)
] ]
if sell_conditions: if sell_conditions:
dataframe.loc[reduce(lambda x, y: x | y, sell_conditions), "sell"] = 1 dataframe.loc[reduce(lambda x, y: x | y, sell_conditions), "sell"] = 1