Aggregated commit: add a support vector machine for outlier detection, improve the dry/live user interface, improve standardization, and fix various other bugs
parent c5ecf94177
commit 42d95af829
@@ -57,8 +57,8 @@
     "train_period": 30,
     "backtest_period": 7,
     "identifier": "example",
-    "live_trained_timerange": "20220330-20220429",
-    "live_full_backtestrange": "20220302-20220501",
+    "live_trained_timerange": "",
+    "live_full_backtestrange": "",
     "corr_pairlist": [
       "BTC/USDT",
       "ETH/USDT",
@@ -68,20 +68,19 @@
     "feature_parameters": {
       "period": 12,
       "shift": 1,
-      "drop_features": false,
       "DI_threshold": 1,
       "weight_factor": 0,
       "principal_component_analysis": false,
-      "remove_outliers": false
+      "use_SVM_to_remove_outliers": false
     },
     "data_split_parameters": {
       "test_size": 0.25,
       "random_state": 1
     },
     "model_training_parameters": {
-      "n_estimators": 2000,
+      "n_estimators": 1000,
       "random_state": 1,
-      "learning_rate": 0.02,
+      "learning_rate": 0.1,
       "task_type": "CPU"
     }
   },
@@ -331,21 +331,21 @@ Users can reduce the dimensionality of their features by activating the `princip
 Which will perform PCA on the features and reduce the dimensionality of the data so that the explained
 variance of the data set is >= 0.999.
 
-### Removing outliers based on feature statistical distributions
+### Removing outliers using a Support Vector Machine (SVM)
 
 The user can tell Freqai to remove outlier data points from the training/test data sets by setting:
 
 ```json
     "freqai": {
         "feature_parameters" : {
-            "remove_outliers": true
+            "use_SVM_to_remove_outliers": true
         }
     }
 ```
 
-Freqai will check the statistical distributions of each feature (or component if the user activated
-`principal_component_analysis`) and remove any data point that sits more than 3 standard deviations away
-from the mean.
+Freqai will train an SVM on the training data (or components if the user activated
+`principal_component_analysis`) and remove any data point that it deems to sit beyond the
+feature space.
 
 ## Additional information
 
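For readers skimming the diff: the feature documented above boils down to fitting a one-class SVM on the (already scaled) training features and dropping the rows it labels as outliers. The snippet below is a minimal, self-contained sketch of that idea using the same `SGDOneClassSVM` estimator the commit adopts; the synthetic data, seed, and variable names are illustrative only and are not part of Freqai.

```python
# Standalone sketch (not Freqai code) of SVM-based outlier removal:
# fit a one-class SVM on scaled training features, keep only rows it labels +1.
import numpy as np
from sklearn.linear_model import SGDOneClassSVM

rng = np.random.default_rng(42)
train_features = rng.normal(size=(1000, 10))   # stand-in for data_dictionary["train_features"]
train_features[:5] += 25                       # inject a few obvious outliers

svm = SGDOneClassSVM(nu=0.1).fit(train_features)
y_pred = svm.predict(train_features)           # +1 = inlier, -1 = outlier

train_features = train_features[y_pred == 1]   # drop the flagged rows, as the new code does
print(f"tossed {int((y_pred == -1).sum())} train points from {len(y_pred)}")
```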
@@ -10,8 +10,9 @@ from typing import Any, Dict, List, Tuple
 import numpy as np
 import numpy.typing as npt
 import pandas as pd
-from joblib import dump, load
+from joblib import dump, load  # , Parallel, delayed  # used for auto distribution assignment
 from pandas import DataFrame
+from sklearn import linear_model
 from sklearn.metrics.pairwise import pairwise_distances
 from sklearn.model_selection import train_test_split
 
@@ -22,6 +23,9 @@ from freqtrade.resolvers import ExchangeResolver
 from freqtrade.strategy.interface import IStrategy
 
 
+# import scipy as spy  # used for auto distribution assignment
+
+
 SECONDS_IN_DAY = 86400
 
 logger = logging.getLogger(__name__)
@@ -52,6 +56,7 @@ class FreqaiDataKitchen:
         self.model_filename: str = ""
         self.model_dictionary: Dict[Any, Any] = {}
         self.live = live
+        self.svm_model: linear_model.SGDOneClassSVM = None
         if not self.live:
             self.full_timerange = self.create_fulltimerange(self.config["timerange"],
                                                             self.freqai_config["train_period"]
@@ -89,6 +94,10 @@ class FreqaiDataKitchen:
 
         # Save the trained model
         dump(model, save_path / str(self.model_filename + "_model.joblib"))
+
+        if self.svm_model is not None:
+            dump(self.svm_model, save_path / str(self.model_filename + "_svm_model.joblib"))
+
         self.data["model_path"] = str(self.model_path)
         self.data["model_filename"] = str(self.model_filename)
         self.data["training_features_list"] = list(self.data_dictionary["train_features"].columns)
@ -104,6 +113,19 @@ class FreqaiDataKitchen:
|
|||||||
if self.live:
|
if self.live:
|
||||||
self.model_dictionary[self.model_filename] = model
|
self.model_dictionary[self.model_filename] = model
|
||||||
|
|
||||||
|
# TODO add a helper function to let user save/load any data they are custom adding. We
|
||||||
|
# do not want them having to edit the default save/load methods here. Below is an example
|
||||||
|
# of what we do NOT want.
|
||||||
|
|
||||||
|
# if self.freqai_config['feature_parameters']['determine_statistical_distributions']:
|
||||||
|
# self.data_dictionary["upper_quantiles"].to_pickle(
|
||||||
|
# save_path / str(self.model_filename + "_upper_quantiles.pkl")
|
||||||
|
# )
|
||||||
|
|
||||||
|
# self.data_dictionary["lower_quantiles"].to_pickle(
|
||||||
|
# save_path / str(self.model_filename + "_lower_quantiles.pkl")
|
||||||
|
# )
|
||||||
|
|
||||||
return
|
return
|
||||||
|
|
||||||
def load_data(self) -> Any:
|
def load_data(self) -> Any:
|
||||||
@ -121,6 +143,19 @@ class FreqaiDataKitchen:
|
|||||||
self.model_path / str(self.model_filename + "_trained_df.pkl")
|
self.model_path / str(self.model_filename + "_trained_df.pkl")
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# TODO add a helper function to let user save/load any data they are custom adding. We
|
||||||
|
# do not want them having to edit the default save/load methods here. Below is an example
|
||||||
|
# of what we do NOT want.
|
||||||
|
|
||||||
|
# if self.freqai_config['feature_parameters']['determine_statistical_distributions']:
|
||||||
|
# self.data_dictionary["upper_quantiles"] = pd.read_pickle(
|
||||||
|
# self.model_path / str(self.model_filename + "_upper_quantiles.pkl")
|
||||||
|
# )
|
||||||
|
|
||||||
|
# self.data_dictionary["lower_quantiles"] = pd.read_pickle(
|
||||||
|
# self.model_path / str(self.model_filename + "_lower_quantiles.pkl")
|
||||||
|
# )
|
||||||
|
|
||||||
self.model_path = Path(self.data["model_path"])
|
self.model_path = Path(self.data["model_path"])
|
||||||
self.model_filename = self.data["model_filename"]
|
self.model_filename = self.data["model_filename"]
|
||||||
|
|
||||||
@@ -130,6 +165,10 @@ class FreqaiDataKitchen:
         else:
             model = load(self.model_path / str(self.model_filename + "_model.joblib"))
 
+            if Path(self.model_path / str(self.model_filename +
+                    "_svm_model.joblib")).resolve().exists():
+                self.svm_model = load(self.model_path / str(self.model_filename + "_svm_model.joblib"))
+
         assert model, (
             f"Unable to load model, ensure model exists at "
             f"{self.model_path} "
@@ -159,6 +198,12 @@ class FreqaiDataKitchen:
         else:
             weights = np.ones(len(filtered_dataframe))
 
+        if self.config["freqai"]["feature_parameters"]["stratify"] > 0:
+            stratification = np.zeros(len(filtered_dataframe))
+            for i in range(1, len(stratification)):
+                if i % self.config["freqai"]["feature_parameters"]["stratify"] == 0:
+                    stratification[i] = 1
+
         (
             train_features,
             test_features,
@@ -170,6 +215,8 @@ class FreqaiDataKitchen:
             filtered_dataframe[: filtered_dataframe.shape[0]],
             labels,
             weights,
+            stratify=stratification,
+            # shuffle=False,
             **self.config["freqai"]["data_split_parameters"]
         )
 
@@ -261,9 +308,9 @@ class FreqaiDataKitchen:
 
         return self.data_dictionary
 
-    def standardize_data(self, data_dictionary: Dict) -> Dict[Any, Any]:
+    def normalize_data(self, data_dictionary: Dict) -> Dict[Any, Any]:
         """
-        Standardize all data in the data_dictionary according to the training dataset
+        Normalize all data in the data_dictionary according to the training dataset
         :params:
         :data_dictionary: dictionary containing the cleaned and split training/test data/labels
         :returns:
@ -297,6 +344,42 @@ class FreqaiDataKitchen:
|
|||||||
|
|
||||||
return data_dictionary
|
return data_dictionary
|
||||||
|
|
||||||
|
def standardize_data(self, data_dictionary: Dict) -> Dict[Any, Any]:
|
||||||
|
"""
|
||||||
|
Standardize all data in the data_dictionary according to the training dataset
|
||||||
|
:params:
|
||||||
|
:data_dictionary: dictionary containing the cleaned and split training/test data/labels
|
||||||
|
:returns:
|
||||||
|
:data_dictionary: updated dictionary with standardized values.
|
||||||
|
"""
|
||||||
|
# standardize the data by training stats
|
||||||
|
train_max = data_dictionary["train_features"].max()
|
||||||
|
train_min = data_dictionary["train_features"].min()
|
||||||
|
data_dictionary["train_features"] = 2 * (
|
||||||
|
data_dictionary["train_features"] - train_min
|
||||||
|
) / (train_max - train_min) - 1
|
||||||
|
data_dictionary["test_features"] = 2 * (
|
||||||
|
data_dictionary["test_features"] - train_min
|
||||||
|
) / (train_max - train_min) - 1
|
||||||
|
|
||||||
|
train_labels_max = data_dictionary["train_labels"].max()
|
||||||
|
train_labels_min = data_dictionary["train_labels"].min()
|
||||||
|
data_dictionary["train_labels"] = 2 * (
|
||||||
|
data_dictionary["train_labels"] - train_labels_min
|
||||||
|
) / (train_labels_max - train_labels_min) - 1
|
||||||
|
data_dictionary["test_labels"] = 2 * (
|
||||||
|
data_dictionary["test_labels"] - train_labels_min
|
||||||
|
) / (train_labels_max - train_labels_min) - 1
|
||||||
|
|
||||||
|
for item in train_max.keys():
|
||||||
|
self.data[item + "_max"] = train_max[item]
|
||||||
|
self.data[item + "_min"] = train_min[item]
|
||||||
|
|
||||||
|
self.data["labels_max"] = train_labels_max
|
||||||
|
self.data["labels_min"] = train_labels_min
|
||||||
|
|
||||||
|
return data_dictionary
|
||||||
|
|
||||||
def standardize_data_from_metadata(self, df: DataFrame) -> DataFrame:
|
def standardize_data_from_metadata(self, df: DataFrame) -> DataFrame:
|
||||||
"""
|
"""
|
||||||
Standardizes a set of data using the mean and standard deviation from
|
Standardizes a set of data using the mean and standard deviation from
|
||||||
@ -305,6 +388,20 @@ class FreqaiDataKitchen:
|
|||||||
:df: Dataframe to be standardized
|
:df: Dataframe to be standardized
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
for item in df.keys():
|
||||||
|
df[item] = 2 * (df[item] - self.data[item + "_min"]) / (self.data[item + "_max"] -
|
||||||
|
self.data[item + '_min']) - 1
|
||||||
|
|
||||||
|
return df
|
||||||
|
|
||||||
|
def normalize_data_from_metadata(self, df: DataFrame) -> DataFrame:
|
||||||
|
"""
|
||||||
|
Normalizes a set of data using the mean and standard deviation from
|
||||||
|
the associated training data.
|
||||||
|
:params:
|
||||||
|
:df: Dataframe to be standardized
|
||||||
|
"""
|
||||||
|
|
||||||
for item in df.keys():
|
for item in df.keys():
|
||||||
df[item] = (df[item] - self.data[item + "_mean"]) / self.data[item + "_std"]
|
df[item] = (df[item] - self.data[item + "_mean"]) / self.data[item + "_std"]
|
||||||
|
|
||||||
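The scaling helpers touched in the hunks above move Freqai from mean/std standardization toward min/max scaling into the range [-1, 1], and the prediction model later inverts that transform on its outputs. The toy example below (plain pandas/numpy, not the FreqaiDataKitchen methods themselves) shows the forward transform added in the new `standardize_data()` and the matching inverse the prediction model applies to map outputs back to label space.

```python
# Toy illustration of the [-1, 1] min/max scaling this commit introduces and of the
# inverse transform used to recover values in original label units.
import numpy as np
import pandas as pd

train = pd.DataFrame({"label": [1.0, 2.0, 3.0, 4.0]})
t_min, t_max = train.min(), train.max()

scaled = 2 * (train - t_min) / (t_max - t_min) - 1        # forward: values land in [-1, 1]
restored = (scaled + 1) * (t_max - t_min) / 2 + t_min     # inverse: recover original units

assert np.allclose(restored, train)
```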
@@ -420,6 +517,8 @@ class FreqaiDataKitchen:
         self.data["n_kept_components"] = n_keep_components
         self.pca = pca2
 
+        logger.info(f'PCA reduced total features from {n_components} to {n_keep_components}')
+
         if not self.model_path.is_dir():
             self.model_path.mkdir(parents=True, exist_ok=True)
         pk.dump(pca2, open(self.model_path / str(self.model_filename + "_pca_object.pkl"), "wb"))
@ -434,70 +533,53 @@ class FreqaiDataKitchen:
|
|||||||
|
|
||||||
return avg_mean_dist
|
return avg_mean_dist
|
||||||
|
|
||||||
def remove_outliers(self, predict: bool) -> None:
|
def use_SVM_to_remove_outliers(self, predict: bool) -> None:
|
||||||
"""
|
|
||||||
Remove data that looks like an outlier based on the distribution of each
|
|
||||||
variable.
|
|
||||||
:params:
|
|
||||||
:predict: boolean which tells the function if this is prediction data or
|
|
||||||
training data coming in.
|
|
||||||
"""
|
|
||||||
|
|
||||||
lower_quantile = self.data_dictionary["train_features"].quantile(0.001)
|
|
||||||
upper_quantile = self.data_dictionary["train_features"].quantile(0.999)
|
|
||||||
|
|
||||||
if predict:
|
if predict:
|
||||||
|
assert self.svm_model, "No svm model available for outlier removal"
|
||||||
df = self.data_dictionary["prediction_features"][
|
y_pred = self.svm_model.predict(self.data_dictionary["prediction_features"])
|
||||||
(self.data_dictionary["prediction_features"] < upper_quantile)
|
do_predict = np.where(y_pred == -1, 0, y_pred)
|
||||||
& (self.data_dictionary["prediction_features"] > lower_quantile)
|
|
||||||
]
|
|
||||||
drop_index = pd.isnull(df).any(1)
|
|
||||||
self.data_dictionary["prediction_features"].fillna(0, inplace=True)
|
|
||||||
drop_index = ~drop_index
|
|
||||||
do_predict = np.array(drop_index.replace(True, 1).replace(False, 0))
|
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
"remove_outliers() tossed %s predictions",
|
f'svm_remove_outliers() tossed {len(do_predict) - do_predict.sum()} predictions'
|
||||||
len(do_predict) - do_predict.sum(),
|
|
||||||
)
|
)
|
||||||
self.do_predict += do_predict
|
self.do_predict += do_predict
|
||||||
self.do_predict -= 1
|
self.do_predict -= 1
|
||||||
|
|
||||||
else:
|
else:
|
||||||
|
# use SGDOneClassSVM to increase speed?
|
||||||
|
self.svm_model = linear_model.SGDOneClassSVM(nu=0.1).fit(
|
||||||
|
self.data_dictionary["train_features"]
|
||||||
|
)
|
||||||
|
y_pred = self.svm_model.predict(self.data_dictionary["train_features"])
|
||||||
|
dropped_points = np.where(y_pred == -1, 0, y_pred)
|
||||||
|
# keep_index = np.where(y_pred == 1)
|
||||||
|
self.data_dictionary["train_features"] = self.data_dictionary[
|
||||||
|
"train_features"][(y_pred == 1)]
|
||||||
|
self.data_dictionary["train_labels"] = self.data_dictionary[
|
||||||
|
"train_labels"][(y_pred == 1)]
|
||||||
|
self.data_dictionary["train_weights"] = self.data_dictionary[
|
||||||
|
"train_weights"][(y_pred == 1)]
|
||||||
|
|
||||||
filter_train_df = self.data_dictionary["train_features"][
|
logger.info(
|
||||||
(self.data_dictionary["train_features"] < upper_quantile)
|
f'svm_remove_outliers() tossed {len(y_pred) - dropped_points.sum()}'
|
||||||
& (self.data_dictionary["train_features"] > lower_quantile)
|
f' train points from {len(y_pred)}'
|
||||||
]
|
)
|
||||||
drop_index = pd.isnull(filter_train_df).any(1)
|
|
||||||
drop_index = drop_index.replace(True, 1).replace(False, 0)
|
|
||||||
self.data_dictionary["train_features"] = self.data_dictionary["train_features"][
|
|
||||||
(drop_index == 0)
|
|
||||||
]
|
|
||||||
self.data_dictionary["train_labels"] = self.data_dictionary["train_labels"][
|
|
||||||
(drop_index == 0)
|
|
||||||
]
|
|
||||||
self.data_dictionary["train_weights"] = self.data_dictionary["train_weights"][
|
|
||||||
(drop_index == 0)
|
|
||||||
]
|
|
||||||
|
|
||||||
# do the same for the test data
|
# same for test data
|
||||||
filter_test_df = self.data_dictionary["test_features"][
|
y_pred = self.svm_model.predict(self.data_dictionary["test_features"])
|
||||||
(self.data_dictionary["test_features"] < upper_quantile)
|
dropped_points = np.where(y_pred == -1, 0, y_pred)
|
||||||
& (self.data_dictionary["test_features"] > lower_quantile)
|
self.data_dictionary["test_features"] = self.data_dictionary[
|
||||||
]
|
"test_features"][(y_pred == 1)]
|
||||||
drop_index = pd.isnull(filter_test_df).any(1)
|
self.data_dictionary["test_labels"] = self.data_dictionary[
|
||||||
drop_index = drop_index.replace(True, 1).replace(False, 0)
|
"test_labels"][(y_pred == 1)]
|
||||||
self.data_dictionary["test_labels"] = self.data_dictionary["test_labels"][
|
self.data_dictionary["test_weights"] = self.data_dictionary[
|
||||||
(drop_index == 0)
|
"test_weights"][(y_pred == 1)]
|
||||||
]
|
|
||||||
self.data_dictionary["test_features"] = self.data_dictionary["test_features"][
|
logger.info(
|
||||||
(drop_index == 0)
|
f'svm_remove_outliers() tossed {len(y_pred) - dropped_points.sum()}'
|
||||||
]
|
f' test points from {len(y_pred)}'
|
||||||
self.data_dictionary["test_weights"] = self.data_dictionary["test_weights"][
|
)
|
||||||
(drop_index == 0)
|
|
||||||
]
|
|
||||||
|
|
||||||
return
|
return
|
||||||
|
|
||||||
@ -507,32 +589,6 @@ class FreqaiDataKitchen:
|
|||||||
assert features, ("Could not find any features!")
|
assert features, ("Could not find any features!")
|
||||||
return features
|
return features
|
||||||
|
|
||||||
# def build_feature_list(self, config: dict, metadata: dict) -> list:
|
|
||||||
# """
|
|
||||||
# SUPERCEDED BY self.find_features()
|
|
||||||
# Build the list of features that will be used to filter
|
|
||||||
# the full dataframe. Feature list is construced from the
|
|
||||||
# user configuration file.
|
|
||||||
# :params:
|
|
||||||
# :config: Canonical freqtrade config file containing all
|
|
||||||
# user defined input in config['freqai] dictionary.
|
|
||||||
# """
|
|
||||||
# features = []
|
|
||||||
# for tf in config["freqai"]["timeframes"]:
|
|
||||||
# for ft in config["freqai"]["base_features"]:
|
|
||||||
# for n in range(config["freqai"]["feature_parameters"]["shift"] + 1):
|
|
||||||
# shift = ""
|
|
||||||
# if n > 0:
|
|
||||||
# shift = "_shift-" + str(n)
|
|
||||||
# features.append(metadata['pair'].split("/")[0] + "-" + ft + shift + "_" + tf)
|
|
||||||
# for p in config["freqai"]["corr_pairlist"]:
|
|
||||||
# if metadata['pair'] in p:
|
|
||||||
# continue # avoid duplicate features
|
|
||||||
# features.append(p.split("/")[0] + "-" + ft + shift + "_" + tf)
|
|
||||||
|
|
||||||
# # logger.info("number of features %s", len(features))
|
|
||||||
# return features
|
|
||||||
|
|
||||||
def check_if_pred_in_training_spaces(self) -> None:
|
def check_if_pred_in_training_spaces(self) -> None:
|
||||||
"""
|
"""
|
||||||
Compares the distance from each prediction point to each training data
|
Compares the distance from each prediction point to each training data
|
||||||
@@ -568,7 +624,7 @@ class FreqaiDataKitchen:
         training than older data.
         """
 
-        weights = np.zeros_like(num_weights)
+        weights = np.zeros(num_weights)
         for i in range(1, len(weights)):
             weights[len(weights) - i] = np.exp(
                 -i / (self.config["freqai"]["feature_parameters"]["weight_factor"] * num_weights)
@ -638,19 +694,23 @@ class FreqaiDataKitchen:
|
|||||||
|
|
||||||
time = datetime.datetime.now(tz=datetime.timezone.utc).timestamp()
|
time = datetime.datetime.now(tz=datetime.timezone.utc).timestamp()
|
||||||
|
|
||||||
trained_timerange = TimeRange.parse_timerange(training_timerange)
|
if training_timerange: # user passed no live_trained_timerange in config
|
||||||
|
trained_timerange = TimeRange.parse_timerange(training_timerange)
|
||||||
|
elapsed_time = (time - trained_timerange.stopts) / SECONDS_IN_DAY
|
||||||
|
trained_timerange.startts += self.freqai_config['backtest_period'] * SECONDS_IN_DAY
|
||||||
|
trained_timerange.stopts += self.freqai_config['backtest_period'] * SECONDS_IN_DAY
|
||||||
|
retrain = elapsed_time > self.freqai_config['backtest_period']
|
||||||
|
else:
|
||||||
|
trained_timerange = TimeRange.parse_timerange("20000101-20000201")
|
||||||
|
trained_timerange.startts = int(time - self.freqai_config['train_period'] *
|
||||||
|
SECONDS_IN_DAY)
|
||||||
|
trained_timerange.stopts = int(time)
|
||||||
|
retrain = True
|
||||||
|
|
||||||
elapsed_time = (time - trained_timerange.stopts) / SECONDS_IN_DAY
|
|
||||||
|
|
||||||
trained_timerange.startts += self.freqai_config['backtest_period'] * SECONDS_IN_DAY
|
|
||||||
trained_timerange.stopts += self.freqai_config['backtest_period'] * SECONDS_IN_DAY
|
|
||||||
start = datetime.datetime.utcfromtimestamp(trained_timerange.startts)
|
start = datetime.datetime.utcfromtimestamp(trained_timerange.startts)
|
||||||
stop = datetime.datetime.utcfromtimestamp(trained_timerange.stopts)
|
stop = datetime.datetime.utcfromtimestamp(trained_timerange.stopts)
|
||||||
|
|
||||||
new_trained_timerange = start.strftime("%Y%m%d") + "-" + stop.strftime("%Y%m%d")
|
new_trained_timerange = start.strftime("%Y%m%d") + "-" + stop.strftime("%Y%m%d")
|
||||||
|
|
||||||
retrain = elapsed_time > self.freqai_config['backtest_period']
|
|
||||||
|
|
||||||
if retrain:
|
if retrain:
|
||||||
coin, _ = metadata['pair'].split("/")
|
coin, _ = metadata['pair'].split("/")
|
||||||
# set the new model_path
|
# set the new model_path
|
||||||
@ -738,3 +798,141 @@ class FreqaiDataKitchen:
|
|||||||
def np_encoder(self, object):
|
def np_encoder(self, object):
|
||||||
if isinstance(object, np.generic):
|
if isinstance(object, np.generic):
|
||||||
return object.item()
|
return object.item()
|
||||||
|
|
||||||
|
# Functions containing useful data manpulation examples. but not actively in use.
|
||||||
|
|
||||||
|
# def build_feature_list(self, config: dict, metadata: dict) -> list:
|
||||||
|
# """
|
||||||
|
# SUPERCEDED BY self.find_features()
|
||||||
|
# Build the list of features that will be used to filter
|
||||||
|
# the full dataframe. Feature list is construced from the
|
||||||
|
# user configuration file.
|
||||||
|
# :params:
|
||||||
|
# :config: Canonical freqtrade config file containing all
|
||||||
|
# user defined input in config['freqai] dictionary.
|
||||||
|
# """
|
||||||
|
# features = []
|
||||||
|
# for tf in config["freqai"]["timeframes"]:
|
||||||
|
# for ft in config["freqai"]["base_features"]:
|
||||||
|
# for n in range(config["freqai"]["feature_parameters"]["shift"] + 1):
|
||||||
|
# shift = ""
|
||||||
|
# if n > 0:
|
||||||
|
# shift = "_shift-" + str(n)
|
||||||
|
# features.append(metadata['pair'].split("/")[0] + "-" + ft + shift + "_" + tf)
|
||||||
|
# for p in config["freqai"]["corr_pairlist"]:
|
||||||
|
# if metadata['pair'] in p:
|
||||||
|
# continue # avoid duplicate features
|
||||||
|
# features.append(p.split("/")[0] + "-" + ft + shift + "_" + tf)
|
||||||
|
|
||||||
|
# # logger.info("number of features %s", len(features))
|
||||||
|
# return features
|
||||||
|
|
||||||
|
# Possibly phasing these outlier removal methods below out in favor of
|
||||||
|
# use_SVM_to_remove_outliers (computationally more efficient and apparently higher performance).
|
||||||
|
# But these have good data manipulation examples, so keep them commented here for now.
|
||||||
|
|
||||||
|
# def determine_statistical_distributions(self) -> None:
|
||||||
|
# from fitter import Fitter
|
||||||
|
|
||||||
|
# logger.info('Determining best model for all features, may take some time')
|
||||||
|
|
||||||
|
# def compute_quantiles(ft):
|
||||||
|
# f = Fitter(self.data_dictionary["train_features"][ft],
|
||||||
|
# distributions=['gamma', 'cauchy', 'laplace',
|
||||||
|
# 'beta', 'uniform', 'lognorm'])
|
||||||
|
# f.fit()
|
||||||
|
# # f.summary()
|
||||||
|
# dist = list(f.get_best().items())[0][0]
|
||||||
|
# params = f.get_best()[dist]
|
||||||
|
# upper_q = getattr(spy.stats, list(f.get_best().items())[0][0]).ppf(0.999, **params)
|
||||||
|
# lower_q = getattr(spy.stats, list(f.get_best().items())[0][0]).ppf(0.001, **params)
|
||||||
|
|
||||||
|
# return ft, upper_q, lower_q, dist
|
||||||
|
|
||||||
|
# quantiles_tuple = Parallel(n_jobs=-1)(
|
||||||
|
# delayed(compute_quantiles)(ft) for ft in self.data_dictionary[
|
||||||
|
# 'train_features'].columns)
|
||||||
|
|
||||||
|
# df = pd.DataFrame(quantiles_tuple, columns=['features', 'upper_quantiles',
|
||||||
|
# 'lower_quantiles', 'dist'])
|
||||||
|
# self.data_dictionary['upper_quantiles'] = df['upper_quantiles']
|
||||||
|
# self.data_dictionary['lower_quantiles'] = df['lower_quantiles']
|
||||||
|
|
||||||
|
# return
|
||||||
|
|
||||||
|
# def remove_outliers(self, predict: bool) -> None:
|
||||||
|
# """
|
||||||
|
# Remove data that looks like an outlier based on the distribution of each
|
||||||
|
# variable.
|
||||||
|
# :params:
|
||||||
|
# :predict: boolean which tells the function if this is prediction data or
|
||||||
|
# training data coming in.
|
||||||
|
# """
|
||||||
|
|
||||||
|
# lower_quantile = self.data_dictionary["lower_quantiles"].to_numpy()
|
||||||
|
# upper_quantile = self.data_dictionary["upper_quantiles"].to_numpy()
|
||||||
|
|
||||||
|
# if predict:
|
||||||
|
|
||||||
|
# df = self.data_dictionary["prediction_features"][
|
||||||
|
# (self.data_dictionary["prediction_features"] < upper_quantile)
|
||||||
|
# & (self.data_dictionary["prediction_features"] > lower_quantile)
|
||||||
|
# ]
|
||||||
|
# drop_index = pd.isnull(df).any(1)
|
||||||
|
# self.data_dictionary["prediction_features"].fillna(0, inplace=True)
|
||||||
|
# drop_index = ~drop_index
|
||||||
|
# do_predict = np.array(drop_index.replace(True, 1).replace(False, 0))
|
||||||
|
|
||||||
|
# logger.info(
|
||||||
|
# "remove_outliers() tossed %s predictions",
|
||||||
|
# len(do_predict) - do_predict.sum(),
|
||||||
|
# )
|
||||||
|
# self.do_predict += do_predict
|
||||||
|
# self.do_predict -= 1
|
||||||
|
|
||||||
|
# else:
|
||||||
|
|
||||||
|
# filter_train_df = self.data_dictionary["train_features"][
|
||||||
|
# (self.data_dictionary["train_features"] < upper_quantile)
|
||||||
|
# & (self.data_dictionary["train_features"] > lower_quantile)
|
||||||
|
# ]
|
||||||
|
# drop_index = pd.isnull(filter_train_df).any(1)
|
||||||
|
# drop_index = drop_index.replace(True, 1).replace(False, 0)
|
||||||
|
# self.data_dictionary["train_features"] = self.data_dictionary["train_features"][
|
||||||
|
# (drop_index == 0)
|
||||||
|
# ]
|
||||||
|
# self.data_dictionary["train_labels"] = self.data_dictionary["train_labels"][
|
||||||
|
# (drop_index == 0)
|
||||||
|
# ]
|
||||||
|
# self.data_dictionary["train_weights"] = self.data_dictionary["train_weights"][
|
||||||
|
# (drop_index == 0)
|
||||||
|
# ]
|
||||||
|
|
||||||
|
# logger.info(
|
||||||
|
# f'remove_outliers() tossed {drop_index.sum()}'
|
||||||
|
# f' training points from {len(filter_train_df)}'
|
||||||
|
# )
|
||||||
|
|
||||||
|
# # do the same for the test data
|
||||||
|
# filter_test_df = self.data_dictionary["test_features"][
|
||||||
|
# (self.data_dictionary["test_features"] < upper_quantile)
|
||||||
|
# & (self.data_dictionary["test_features"] > lower_quantile)
|
||||||
|
# ]
|
||||||
|
# drop_index = pd.isnull(filter_test_df).any(1)
|
||||||
|
# drop_index = drop_index.replace(True, 1).replace(False, 0)
|
||||||
|
# self.data_dictionary["test_labels"] = self.data_dictionary["test_labels"][
|
||||||
|
# (drop_index == 0)
|
||||||
|
# ]
|
||||||
|
# self.data_dictionary["test_features"] = self.data_dictionary["test_features"][
|
||||||
|
# (drop_index == 0)
|
||||||
|
# ]
|
||||||
|
# self.data_dictionary["test_weights"] = self.data_dictionary["test_weights"][
|
||||||
|
# (drop_index == 0)
|
||||||
|
# ]
|
||||||
|
|
||||||
|
# logger.info(
|
||||||
|
# f'remove_outliers() tossed {drop_index.sum()}'
|
||||||
|
# f' test points from {len(filter_test_df)}'
|
||||||
|
# )
|
||||||
|
|
||||||
|
# return
|
||||||
|
@@ -62,6 +62,7 @@ class IFreqaiModel(ABC):
         self.predictions = None
         self.training_on_separate_thread = False
         self.retrain = False
+        self.first = True
 
     def start(self, dataframe: DataFrame, metadata: dict, strategy: IStrategy) -> DataFrame:
         """
@@ -80,12 +81,12 @@ class IFreqaiModel(ABC):
         :metadata: pair metadata coming from strategy.
         """
 
-        live = strategy.dp.runmode in (RunMode.DRY_RUN, RunMode.LIVE)
+        self.live = strategy.dp.runmode in (RunMode.DRY_RUN, RunMode.LIVE)
 
         self.pair = metadata["pair"]
-        self.dh = FreqaiDataKitchen(self.config, dataframe, live)
+        self.dh = FreqaiDataKitchen(self.config, dataframe, self.live)
 
-        if live:
+        if self.live:
             # logger.info('testing live')
             self.start_live(dataframe, metadata, strategy)
 
@ -115,11 +116,12 @@ class IFreqaiModel(ABC):
|
|||||||
self.dh.save_data(self.model)
|
self.dh.save_data(self.model)
|
||||||
else:
|
else:
|
||||||
self.model = self.dh.load_data()
|
self.model = self.dh.load_data()
|
||||||
strategy_provided_features = self.dh.find_features(dataframe_train)
|
# strategy_provided_features = self.dh.find_features(dataframe_train)
|
||||||
if strategy_provided_features != self.dh.training_features_list:
|
# # TOFIX doesnt work with PCA
|
||||||
logger.info("User changed input features, retraining model.")
|
# if strategy_provided_features != self.dh.training_features_list:
|
||||||
self.model = self.train(dataframe_train, metadata)
|
# logger.info("User changed input features, retraining model.")
|
||||||
self.dh.save_data(self.model)
|
# self.model = self.train(dataframe_train, metadata)
|
||||||
|
# self.dh.save_data(self.model)
|
||||||
|
|
||||||
preds, do_preds = self.predict(dataframe_backtest, metadata)
|
preds, do_preds = self.predict(dataframe_backtest, metadata)
|
||||||
|
|
||||||
@ -148,7 +150,7 @@ class IFreqaiModel(ABC):
|
|||||||
if not self.training_on_separate_thread:
|
if not self.training_on_separate_thread:
|
||||||
# this will also prevent other pairs from trying to train simultaneously.
|
# this will also prevent other pairs from trying to train simultaneously.
|
||||||
(self.retrain,
|
(self.retrain,
|
||||||
new_trained_timerange) = self.dh.check_if_new_training_required(self.freqai_info[
|
self.new_trained_timerange) = self.dh.check_if_new_training_required(self.freqai_info[
|
||||||
'live_trained_timerange'],
|
'live_trained_timerange'],
|
||||||
metadata)
|
metadata)
|
||||||
else:
|
else:
|
||||||
@ -156,14 +158,19 @@ class IFreqaiModel(ABC):
|
|||||||
self.retrain = False
|
self.retrain = False
|
||||||
|
|
||||||
if self.retrain or not file_exists:
|
if self.retrain or not file_exists:
|
||||||
self.training_on_separate_thread = True # acts like a lock
|
if self.first:
|
||||||
self.retrain_model_on_separate_thread(new_trained_timerange, metadata, strategy)
|
self.train_model_in_series(self.new_trained_timerange, metadata, strategy)
|
||||||
|
self.first = False
|
||||||
|
else:
|
||||||
|
self.training_on_separate_thread = True # acts like a lock
|
||||||
|
self.retrain_model_on_separate_thread(self.new_trained_timerange,
|
||||||
|
metadata, strategy)
|
||||||
|
|
||||||
self.model = self.dh.load_data()
|
self.model = self.dh.load_data()
|
||||||
|
|
||||||
strategy_provided_features = self.dh.find_features(dataframe)
|
strategy_provided_features = self.dh.find_features(dataframe)
|
||||||
if strategy_provided_features != self.dh.training_features_list:
|
if strategy_provided_features != self.dh.training_features_list:
|
||||||
self.train_model_in_series(new_trained_timerange, metadata, strategy)
|
self.train_model_in_series(self.new_trained_timerange, metadata, strategy)
|
||||||
|
|
||||||
preds, do_preds = self.predict(dataframe, metadata)
|
preds, do_preds = self.predict(dataframe, metadata)
|
||||||
self.dh.append_predictions(preds, do_preds, len(dataframe))
|
self.dh.append_predictions(preds, do_preds, len(dataframe))
|
||||||
@@ -215,12 +222,36 @@ class IFreqaiModel(ABC):
         data (NaNs) or felt uncertain about data (PCA and DI index)
         """
 
+    @abstractmethod
+    def data_cleaning_train(self) -> None:
+        """
+        User can add data analysis and cleaning here.
+        Any function inside this method should drop training data points from the filtered_dataframe
+        based on user decided logic. See FreqaiDataKitchen::remove_outliers() for an example
+        of how outlier data points are dropped from the dataframe used for training.
+        """
+
+    @abstractmethod
+    def data_cleaning_predict(self) -> None:
+        """
+        User can add data analysis and cleaning here.
+        These functions each modify self.dh.do_predict, which is a dataframe with equal length
+        to the number of candles coming from and returning to the strategy. Inside do_predict,
+        1 allows prediction and < 0 signals to the strategy that the model is not confident in
+        the prediction.
+        See FreqaiDataKitchen::remove_outliers() for an example
+        of how the do_predict vector is modified. do_predict is ultimately passed back to strategy
+        for buy signals.
+        """
+
     def model_exists(self, pair: str, training_timerange: str) -> bool:
         """
         Given a pair and path, check if a model already exists
         :param pair: pair e.g. BTC/USD
         :param path: path to model
         """
+        if self.live and training_timerange is None:
+            return False
         coin, _ = pair.split("/")
         self.dh.model_filename = "cb_" + coin.lower() + "_" + training_timerange
         path_to_modelfile = Path(self.dh.model_path / str(self.dh.model_filename + "_model.joblib"))
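The `do_predict` bookkeeping described in the docstrings above is easiest to see with a toy example. The snippet below is illustrative only (plain numpy with made-up masks); it mirrors the `self.do_predict += do_predict` / `self.do_predict -= 1` pattern applied by `use_SVM_to_remove_outliers()` and `check_if_pred_in_training_spaces()`, so a candle stays at 1 only when every cleaning step approves it.

```python
# Toy sketch of how successive cleaning steps combine into the do_predict vector.
import numpy as np

do_predict = np.ones(6, dtype=int)        # start: every candle is predictable

svm_ok = np.array([1, 1, 0, 1, 1, 1])     # step 1: SVM outlier check (0 = flagged)
do_predict += svm_ok
do_predict -= 1

di_ok = np.array([1, 0, 0, 1, 1, 1])      # step 2: dissimilarity-index check
do_predict += di_ok
do_predict -= 1

print(do_predict)  # [ 1  0 -1  1  1  1] -> the strategy only trusts candles still at 1
```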
@@ -265,3 +296,4 @@ class IFreqaiModel(ABC):
 
         self.model = self.train(unfiltered_dataframe, metadata)
         self.dh.save_data(self.model)
+        self.retrain = False
|
@@ -29,7 +29,7 @@ class CatboostPredictionModel(IFreqaiModel):
             dataframe["close"]
             .shift(-self.feature_parameters["period"])
             .rolling(self.feature_parameters["period"])
-            .max()
+            .mean()
             / dataframe["close"]
             - 1
         )
||||||
@@ -68,15 +68,11 @@ class CatboostPredictionModel(IFreqaiModel):
         # standardize all data based on train_dataset only
         data_dictionary = self.dh.standardize_data(data_dictionary)
 
-        # optional additional data cleaning
-        if self.feature_parameters["principal_component_analysis"]:
-            self.dh.principal_component_analysis()
-        if self.feature_parameters["remove_outliers"]:
-            self.dh.remove_outliers(predict=False)
-        if self.feature_parameters["DI_threshold"]:
-            self.dh.data["avg_mean_dist"] = self.dh.compute_distances()
+        # optional additional data cleaning/analysis
+        self.data_cleaning_train()
 
-        logger.info("length of train data %s", len(data_dictionary["train_features"]))
+        logger.info(f'Training model on {len(self.dh.training_features_list)} features')
+        logger.info(f'Training model on {len(data_dictionary["train_features"])} data points')
 
         model = self.fit(data_dictionary)
 
@@ -86,9 +82,7 @@ class CatboostPredictionModel(IFreqaiModel):
 
     def fit(self, data_dictionary: Dict) -> Any:
         """
-        Most regressors use the same function names and arguments e.g. user
-        can drop in LGBMRegressor in place of CatBoostRegressor and all data
-        management will be properly handled by Freqai.
+        User sets up the training and test data to fit their desired model here
         :params:
         :data_dictionary: the dictionary constructed by DataHandler to hold
         all the training and test data/labels.
@ -133,7 +127,51 @@ class CatboostPredictionModel(IFreqaiModel):
|
|||||||
filtered_dataframe = self.dh.standardize_data_from_metadata(filtered_dataframe)
|
filtered_dataframe = self.dh.standardize_data_from_metadata(filtered_dataframe)
|
||||||
self.dh.data_dictionary["prediction_features"] = filtered_dataframe
|
self.dh.data_dictionary["prediction_features"] = filtered_dataframe
|
||||||
|
|
||||||
# optional additional data cleaning
|
# optional additional data cleaning/analysis
|
||||||
|
self.data_cleaning_predict(filtered_dataframe)
|
||||||
|
|
||||||
|
predictions = self.model.predict(self.dh.data_dictionary["prediction_features"])
|
||||||
|
|
||||||
|
# compute the non-standardized predictions
|
||||||
|
self.dh.predictions = (predictions + 1) * (self.dh.data["labels_max"] -
|
||||||
|
self.dh.data["labels_min"]) / 2 + self.dh.data[
|
||||||
|
"labels_min"]
|
||||||
|
|
||||||
|
# logger.info("--------------------Finished prediction--------------------")
|
||||||
|
|
||||||
|
return (self.dh.predictions, self.dh.do_predict)
|
||||||
|
|
||||||
|
def data_cleaning_train(self) -> None:
|
||||||
|
"""
|
||||||
|
User can add data analysis and cleaning here.
|
||||||
|
Any function inside this method should drop training data points from the filtered_dataframe
|
||||||
|
based on user decided logic. See FreqaiDataKitchen::remove_outliers() for an example
|
||||||
|
of how outlier data points are dropped from the dataframe used for training.
|
||||||
|
"""
|
||||||
|
if self.feature_parameters["principal_component_analysis"]:
|
||||||
|
self.dh.principal_component_analysis()
|
||||||
|
|
||||||
|
# if self.feature_parameters["determine_statistical_distributions"]:
|
||||||
|
# self.dh.determine_statistical_distributions()
|
||||||
|
# if self.feature_parameters["remove_outliers"]:
|
||||||
|
# self.dh.remove_outliers(predict=False)
|
||||||
|
|
||||||
|
if self.feature_parameters["use_SVM_to_remove_outliers"]:
|
||||||
|
self.dh.use_SVM_to_remove_outliers(predict=False)
|
||||||
|
if self.feature_parameters["DI_threshold"]:
|
||||||
|
self.dh.data["avg_mean_dist"] = self.dh.compute_distances()
|
||||||
|
|
||||||
|
def data_cleaning_predict(self, filtered_dataframe: DataFrame) -> None:
|
||||||
|
"""
|
||||||
|
User can add data analysis and cleaning here.
|
||||||
|
These functions each modify self.dh.do_predict, which is a dataframe with equal length
|
||||||
|
to the number of candles coming from and returning to the strategy. Inside do_predict,
|
||||||
|
1 allows prediction and < 0 signals to the strategy that the model is not confident in
|
||||||
|
the prediction.
|
||||||
|
See FreqaiDataKitchen::remove_outliers() for an example
|
||||||
|
of how the do_predict vector is modified. do_predict is ultimately passed back to strategy
|
||||||
|
for buy signals.
|
||||||
|
"""
|
||||||
if self.feature_parameters["principal_component_analysis"]:
|
if self.feature_parameters["principal_component_analysis"]:
|
||||||
pca_components = self.dh.pca.transform(filtered_dataframe)
|
pca_components = self.dh.pca.transform(filtered_dataframe)
|
||||||
self.dh.data_dictionary["prediction_features"] = pd.DataFrame(
|
self.dh.data_dictionary["prediction_features"] = pd.DataFrame(
|
||||||
@ -142,17 +180,13 @@ class CatboostPredictionModel(IFreqaiModel):
|
|||||||
index=filtered_dataframe.index,
|
index=filtered_dataframe.index,
|
||||||
)
|
)
|
||||||
|
|
||||||
if self.feature_parameters["remove_outliers"]:
|
# if self.feature_parameters["determine_statistical_distributions"]:
|
||||||
self.dh.remove_outliers(predict=True) # creates dropped index
|
# self.dh.determine_statistical_distributions()
|
||||||
|
# if self.feature_parameters["remove_outliers"]:
|
||||||
|
# self.dh.remove_outliers(predict=True) # creates dropped index
|
||||||
|
|
||||||
|
if self.feature_parameters["use_SVM_to_remove_outliers"]:
|
||||||
|
self.dh.use_SVM_to_remove_outliers(predict=True)
|
||||||
|
|
||||||
if self.feature_parameters["DI_threshold"]:
|
if self.feature_parameters["DI_threshold"]:
|
||||||
self.dh.check_if_pred_in_training_spaces() # sets do_predict
|
self.dh.check_if_pred_in_training_spaces() # sets do_predict
|
||||||
|
|
||||||
predictions = self.model.predict(self.dh.data_dictionary["prediction_features"])
|
|
||||||
|
|
||||||
# compute the non-standardized predictions
|
|
||||||
self.dh.predictions = predictions * self.dh.data["labels_std"] + self.dh.data["labels_mean"]
|
|
||||||
|
|
||||||
# logger.info("--------------------Finished prediction--------------------")
|
|
||||||
|
|
||||||
return (self.dh.predictions, self.dh.do_predict)
|
|
||||||
|
@ -1,159 +0,0 @@
|
|||||||
import logging
|
|
||||||
from typing import Any, Dict, Tuple
|
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
from catboost import CatBoostRegressor, Pool
|
|
||||||
from pandas import DataFrame
|
|
||||||
|
|
||||||
from freqtrade.freqai.freqai_interface import IFreqaiModel
|
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
class ExamplePredictionModel(IFreqaiModel):
|
|
||||||
"""
|
|
||||||
User created prediction model. The class needs to override three necessary
|
|
||||||
functions, predict(), train(), fit(). The class inherits ModelHandler which
|
|
||||||
has its own DataHandler where data is held, saved, loaded, and managed.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def make_labels(self, dataframe: DataFrame) -> DataFrame:
|
|
||||||
"""
|
|
||||||
User defines the labels here (target values).
|
|
||||||
:params:
|
|
||||||
:dataframe: the full dataframe for the present training period
|
|
||||||
"""
|
|
||||||
|
|
||||||
dataframe["s"] = (
|
|
||||||
dataframe["close"]
|
|
||||||
.shift(-self.feature_parameters["period"])
|
|
||||||
.rolling(self.feature_parameters["period"])
|
|
||||||
.max()
|
|
||||||
/ dataframe["close"]
|
|
||||||
- 1
|
|
||||||
)
|
|
||||||
self.dh.data["s_mean"] = dataframe["s"].mean()
|
|
||||||
self.dh.data["s_std"] = dataframe["s"].std()
|
|
||||||
|
|
||||||
# logger.info("label mean", self.dh.data["s_mean"], "label std", self.dh.data["s_std"])
|
|
||||||
|
|
||||||
return dataframe["s"]
|
|
||||||
|
|
||||||
def train(self, unfiltered_dataframe: DataFrame, metadata: dict) -> Tuple[DataFrame, DataFrame]:
|
|
||||||
"""
|
|
||||||
Filter the training data and train a model to it. Train makes heavy use of the datakitchen
|
|
||||||
for storing, saving, loading, and analyzing the data.
|
|
||||||
:params:
|
|
||||||
:unfiltered_dataframe: Full dataframe for the current training period
|
|
||||||
:metadata: pair metadata from strategy.
|
|
||||||
:returns:
|
|
||||||
:model: Trained model which can be used to inference (self.predict)
|
|
||||||
"""
|
|
||||||
logger.info("--------------------Starting training--------------------")
|
|
||||||
|
|
||||||
# create the full feature list based on user config info
|
|
||||||
self.dh.training_features_list = self.dh.build_feature_list(self.config, metadata)
|
|
||||||
unfiltered_labels = self.make_labels(unfiltered_dataframe)
|
|
||||||
|
|
||||||
# filter the features requested by user in the configuration file and elegantly handle NaNs
|
|
||||||
features_filtered, labels_filtered = self.dh.filter_features(
|
|
||||||
unfiltered_dataframe,
|
|
||||||
self.dh.training_features_list,
|
|
||||||
unfiltered_labels,
|
|
||||||
training_filter=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
# split data into train/test data.
|
|
||||||
data_dictionary = self.dh.make_train_test_datasets(features_filtered, labels_filtered)
|
|
||||||
# standardize all data based on train_dataset only
|
|
||||||
data_dictionary = self.dh.standardize_data(data_dictionary)
|
|
||||||
|
|
||||||
# optional additional data cleaning
|
|
||||||
if self.feature_parameters["principal_component_analysis"]:
|
|
||||||
self.dh.principal_component_analysis()
|
|
||||||
if self.feature_parameters["remove_outliers"]:
|
|
||||||
self.dh.remove_outliers(predict=False)
|
|
||||||
if self.feature_parameters["DI_threshold"]:
|
|
||||||
self.dh.data["avg_mean_dist"] = self.dh.compute_distances()
|
|
||||||
|
|
||||||
logger.info("length of train data %s", len(data_dictionary["train_features"]))
|
|
||||||
|
|
||||||
model = self.fit(data_dictionary)
|
|
||||||
|
|
||||||
logger.info(f'--------------------done training {metadata["pair"]}--------------------')
|
|
||||||
|
|
||||||
return model
|
|
||||||
|
|
||||||
def fit(self, data_dictionary: Dict) -> Any:
|
|
||||||
"""
|
|
||||||
Most regressors use the same function names and arguments e.g. user
|
|
||||||
can drop in LGBMRegressor in place of CatBoostRegressor and all data
|
|
||||||
management will be properly handled by Freqai.
|
|
||||||
:params:
|
|
||||||
:data_dictionary: the dictionary constructed by DataHandler to hold
|
|
||||||
all the training and test data/labels.
|
|
||||||
"""
|
|
||||||
|
|
||||||
train_data = Pool(
|
|
||||||
data=data_dictionary["train_features"],
|
|
||||||
label=data_dictionary["train_labels"],
|
|
||||||
weight=data_dictionary["train_weights"],
|
|
||||||
)
|
|
||||||
|
|
||||||
test_data = Pool(
|
|
||||||
data=data_dictionary["test_features"],
|
|
||||||
label=data_dictionary["test_labels"],
|
|
||||||
weight=data_dictionary["test_weights"],
|
|
||||||
)
|
|
||||||
|
|
||||||
model = CatBoostRegressor(
|
|
||||||
verbose=100, early_stopping_rounds=400, **self.model_training_parameters
|
|
||||||
)
|
|
||||||
model.fit(X=train_data, eval_set=test_data)
|
|
||||||
|
|
||||||
return model
|
|
||||||
|
|
||||||
def predict(self, unfiltered_dataframe: DataFrame, metadata: dict) -> Tuple[DataFrame,
|
|
||||||
DataFrame]:
|
|
||||||
"""
|
|
||||||
Filter the prediction features data and predict with it.
|
|
||||||
:param: unfiltered_dataframe: Full dataframe for the current backtest period.
|
|
||||||
:return:
|
|
||||||
:predictions: np.array of predictions
|
|
||||||
:do_predict: np.array of 1s and 0s to indicate places where freqai needed to remove
|
|
||||||
data (NaNs) or felt uncertain about data (PCA and DI index)
|
|
||||||
"""
|
|
||||||
|
|
||||||
# logger.info("--------------------Starting prediction--------------------")
|
|
||||||
|
|
||||||
original_feature_list = self.dh.build_feature_list(self.config, metadata)
|
|
||||||
filtered_dataframe, _ = self.dh.filter_features(
|
|
||||||
unfiltered_dataframe, original_feature_list, training_filter=False
|
|
||||||
)
|
|
||||||
filtered_dataframe = self.dh.standardize_data_from_metadata(filtered_dataframe)
|
|
||||||
self.dh.data_dictionary["prediction_features"] = filtered_dataframe
|
|
||||||
|
|
||||||
# optional additional data cleaning
|
|
||||||
if self.feature_parameters["principal_component_analysis"]:
|
|
||||||
pca_components = self.dh.pca.transform(filtered_dataframe)
|
|
||||||
self.dh.data_dictionary["prediction_features"] = pd.DataFrame(
|
|
||||||
data=pca_components,
|
|
||||||
columns=["PC" + str(i) for i in range(0, self.dh.data["n_kept_components"])],
|
|
||||||
index=filtered_dataframe.index,
|
|
||||||
)
|
|
||||||
|
|
||||||
if self.feature_parameters["remove_outliers"]:
|
|
||||||
self.dh.remove_outliers(predict=True) # creates dropped index
|
|
||||||
|
|
||||||
if self.feature_parameters["DI_threshold"]:
|
|
||||||
self.dh.check_if_pred_in_training_spaces() # sets do_predict
|
|
||||||
|
|
||||||
predictions = self.model.predict(self.dh.data_dictionary["prediction_features"])
|
|
||||||
|
|
||||||
# compute the non-standardized predictions
|
|
||||||
self.dh.predictions = predictions * self.dh.data["labels_std"] + self.dh.data["labels_mean"]
|
|
||||||
|
|
||||||
# logger.info("--------------------Finished prediction--------------------")
|
|
||||||
|
|
||||||
return (self.dh.predictions, self.dh.do_predict)
|
|
@@ -166,8 +166,8 @@ class FreqaiExampleStrategy(IStrategy):
             dataframe["target_std"],
         ) = self.model.bridge.start(dataframe, metadata, self)
 
-        dataframe["target_roi"] = dataframe["target_mean"] + dataframe["target_std"] * 0.5
-        dataframe["sell_roi"] = dataframe["target_mean"] - dataframe["target_std"] * 1.5
+        dataframe["target_roi"] = dataframe["target_mean"] + dataframe["target_std"] * 1.5
+        dataframe["sell_roi"] = dataframe["target_mean"] - dataframe["target_std"] * 1
         return dataframe
 
     def populate_buy_trend(self, dataframe: DataFrame, metadata: dict) -> DataFrame:
@@ -183,7 +183,7 @@ class FreqaiExampleStrategy(IStrategy):
 
     def populate_sell_trend(self, dataframe: DataFrame, metadata: dict) -> DataFrame:
         sell_conditions = [
-            (dataframe["prediction"] < dataframe["sell_roi"]) & (dataframe["do_predict"] == 1)
+            (dataframe["do_predict"] <= 0)
         ]
         if sell_conditions:
             dataframe.loc[reduce(lambda x, y: x | y, sell_conditions), "sell"] = 1