Merge branch 'develop' into dev-merge-rl

This commit is contained in:
robcaulk
2022-09-04 11:23:25 +02:00
97 changed files with 1710 additions and 958 deletions

View File

@@ -579,7 +579,6 @@ class FreqaiDataDrawer:
for training according to user defined train_period_days
metadata: dict = strategy furnished pair metadata
"""
with self.history_lock:
corr_dataframes: Dict[Any, Any] = {}
base_dataframes: Dict[Any, Any] = {}

View File

@@ -16,8 +16,6 @@ from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from freqtrade.configuration import TimeRange
from freqtrade.data.dataprovider import DataProvider
from freqtrade.data.history.history_utils import refresh_backtest_ohlcv_data
from freqtrade.exceptions import OperationalException
from freqtrade.exchange import timeframe_to_seconds
from freqtrade.strategy.interface import IStrategy
@@ -71,6 +69,8 @@ class FreqaiDataKitchen:
self.label_list: List = []
self.training_features_list: List = []
self.model_filename: str = ""
self.backtesting_results_path = Path()
self.backtest_predictions_folder: str = "backtesting_predictions"
self.live = live
self.pair = pair
@@ -168,9 +168,17 @@ class FreqaiDataKitchen:
train_labels = labels
train_weights = weights
return self.build_data_dictionary(
train_features, test_features, train_labels, test_labels, train_weights, test_weights
)
# Simplest way to reverse the order of training and test data:
if self.freqai_config['feature_parameters'].get('reverse_train_test_order', False):
return self.build_data_dictionary(
test_features, train_features, test_labels,
train_labels, test_weights, train_weights
)
else:
return self.build_data_dictionary(
train_features, test_features, train_labels,
test_labels, train_weights, test_weights
)
def filter_features(
self,
@@ -281,6 +289,7 @@ class FreqaiDataKitchen:
:returns:
:data_dictionary: updated dictionary with standardized values.
"""
# standardize the data by training stats
train_max = data_dictionary["train_features"].max()
train_min = data_dictionary["train_features"].min()
@@ -314,10 +323,24 @@ class FreqaiDataKitchen:
- 1
)
self.data[f"{item}_max"] = train_labels_max # .to_dict()
self.data[f"{item}_min"] = train_labels_min # .to_dict()
self.data[f"{item}_max"] = train_labels_max
self.data[f"{item}_min"] = train_labels_min
return data_dictionary
def normalize_single_dataframe(self, df: DataFrame) -> DataFrame:
    """
    Rescale every column of a dataframe with its own per-column minimum and
    maximum, so that the column minimum maps to -1 and the maximum to 1.
    The per-column extremes are recorded in self.data under the keys
    "<column>_max" and "<column>_min".
    :param df: dataframe to normalize
    :returns: a new, rescaled dataframe
    """
    col_max = df.max()
    col_min = df.min()
    col_range = col_max - col_min

    # NOTE(review): a constant column has a zero range, so the division below
    # produces non-finite values for it - confirm callers never pass one.
    scaled = 2 * (df - col_min) / col_range - 1

    for column in col_max.keys():
        self.data[column + "_max"] = col_max[column]
        self.data[column + "_min"] = col_min[column]

    return scaled
def normalize_data_from_metadata(self, df: DataFrame) -> DataFrame:
"""
Normalize a set of data using the mean and standard deviation from
@@ -444,22 +467,23 @@ class FreqaiDataKitchen:
from sklearn.decomposition import PCA # avoid importing if we dont need it
n_components = self.data_dictionary["train_features"].shape[1]
pca = PCA(n_components=n_components)
pca = PCA(0.999)
pca = pca.fit(self.data_dictionary["train_features"])
n_keep_components = np.argmin(pca.explained_variance_ratio_.cumsum() < 0.999)
pca2 = PCA(n_components=n_keep_components)
n_keep_components = pca.n_components_
self.data["n_kept_components"] = n_keep_components
pca2 = pca2.fit(self.data_dictionary["train_features"])
n_components = self.data_dictionary["train_features"].shape[1]
logger.info("reduced feature dimension by %s", n_components - n_keep_components)
logger.info("explained variance %f", np.sum(pca2.explained_variance_ratio_))
train_components = pca2.transform(self.data_dictionary["train_features"])
logger.info("explained variance %f", np.sum(pca.explained_variance_ratio_))
train_components = pca.transform(self.data_dictionary["train_features"])
self.data_dictionary["train_features"] = pd.DataFrame(
data=train_components,
columns=["PC" + str(i) for i in range(0, n_keep_components)],
index=self.data_dictionary["train_features"].index,
)
# normalising transformed training features
self.data_dictionary["train_features"] = self.normalize_single_dataframe(
self.data_dictionary["train_features"])
# keeping a copy of the non-transformed features so we can check for errors during
# model load from disk
@@ -467,15 +491,18 @@ class FreqaiDataKitchen:
self.training_features_list = self.data_dictionary["train_features"].columns
if self.freqai_config.get('data_split_parameters', {}).get('test_size', 0.1) != 0:
test_components = pca2.transform(self.data_dictionary["test_features"])
test_components = pca.transform(self.data_dictionary["test_features"])
self.data_dictionary["test_features"] = pd.DataFrame(
data=test_components,
columns=["PC" + str(i) for i in range(0, n_keep_components)],
index=self.data_dictionary["test_features"].index,
)
# normalise transformed test feature to transformed training features
self.data_dictionary["test_features"] = self.normalize_data_from_metadata(
self.data_dictionary["test_features"])
self.data["n_kept_components"] = n_keep_components
self.pca = pca2
self.pca = pca
logger.info(f"PCA reduced total features from {n_components} to {n_keep_components}")
@@ -496,6 +523,9 @@ class FreqaiDataKitchen:
columns=["PC" + str(i) for i in range(0, self.data["n_kept_components"])],
index=filtered_dataframe.index,
)
# normalise transformed predictions to transformed training features
self.data_dictionary["prediction_features"] = self.normalize_data_from_metadata(
self.data_dictionary["prediction_features"])
def compute_distances(self) -> float:
"""
@@ -513,6 +543,18 @@ class FreqaiDataKitchen:
return avg_mean_dist
def get_outlier_percentage(self, dropped_pts: npt.NDArray) -> float:
    """
    Check if more than X% of points were dropped during outlier detection.
    X is read from feature_parameters["outlier_protection_percentage"]
    (30 when the key is absent).
    :param dropped_pts: array whose sum is the number of dropped points and
    whose length is the total number of points
    :returns: the percentage of dropped points when it reaches the protection
    threshold, otherwise 0.0 (which callers treat as "no protection needed")
    """
    outlier_protection_pct = self.freqai_config["feature_parameters"].get(
        "outlier_protection_percentage", 30)

    num_points = len(dropped_pts)
    if num_points == 0:
        # Nothing was evaluated, so nothing can have been dropped. Returning
        # early avoids a 0/0 division (numpy RuntimeWarning and NaN).
        return 0.0

    # Cast so that a plain Python float is returned, as the annotation promises.
    outlier_pct = float((dropped_pts.sum() / num_points) * 100)
    if outlier_pct >= outlier_protection_pct:
        return outlier_pct
    return 0.0
def use_SVM_to_remove_outliers(self, predict: bool) -> None:
"""
Build/inference a Support Vector Machine to detect outliers
@@ -550,8 +592,17 @@ class FreqaiDataKitchen:
self.data_dictionary["train_features"]
)
y_pred = self.svm_model.predict(self.data_dictionary["train_features"])
dropped_points = np.where(y_pred == -1, 0, y_pred)
kept_points = np.where(y_pred == -1, 0, y_pred)
# keep_index = np.where(y_pred == 1)
outlier_pct = self.get_outlier_percentage(1 - kept_points)
if outlier_pct:
logger.warning(
f"SVM detected {outlier_pct:.2f}% of the points as outliers. "
f"Keeping original dataset."
)
self.svm_model = None
return
self.data_dictionary["train_features"] = self.data_dictionary["train_features"][
(y_pred == 1)
]
@@ -563,7 +614,7 @@ class FreqaiDataKitchen:
]
logger.info(
f"SVM tossed {len(y_pred) - dropped_points.sum()}"
f"SVM tossed {len(y_pred) - kept_points.sum()}"
f" train points from {len(y_pred)} total points."
)
@@ -572,7 +623,7 @@ class FreqaiDataKitchen:
# to reduce code duplication
if self.freqai_config['data_split_parameters'].get('test_size', 0.1) != 0:
y_pred = self.svm_model.predict(self.data_dictionary["test_features"])
dropped_points = np.where(y_pred == -1, 0, y_pred)
kept_points = np.where(y_pred == -1, 0, y_pred)
self.data_dictionary["test_features"] = self.data_dictionary["test_features"][
(y_pred == 1)
]
@@ -583,7 +634,7 @@ class FreqaiDataKitchen:
]
logger.info(
f"SVM tossed {len(y_pred) - dropped_points.sum()}"
f"SVM tossed {len(y_pred) - kept_points.sum()}"
f" test points from {len(y_pred)} total points."
)
@@ -604,6 +655,8 @@ class FreqaiDataKitchen:
from math import cos, sin
if predict:
if not self.data['DBSCAN_eps']:
return
train_ft_df = self.data_dictionary['train_features']
pred_ft_df = self.data_dictionary['prediction_features']
num_preds = len(pred_ft_df)
@@ -635,8 +688,8 @@ class FreqaiDataKitchen:
cos(angle) * (point[1] - origin[1])
return (x, y)
MinPts = len(self.data_dictionary['train_features'].columns) * 2
# measure pairwise distances to train_features.shape[1]*2 nearest neighbours
MinPts = int(len(self.data_dictionary['train_features'].index) * 0.25)
# measure pairwise distances to nearest neighbours
neighbors = NearestNeighbors(
n_neighbors=MinPts, n_jobs=self.thread_count)
neighbors_fit = neighbors.fit(self.data_dictionary['train_features'])
@@ -667,6 +720,15 @@ class FreqaiDataKitchen:
self.data['DBSCAN_min_samples'] = MinPts
dropped_points = np.where(clustering.labels_ == -1, 1, 0)
outlier_pct = self.get_outlier_percentage(dropped_points)
if outlier_pct:
logger.warning(
f"DBSCAN detected {outlier_pct:.2f}% of the points as outliers. "
f"Keeping original dataset."
)
self.data['DBSCAN_eps'] = 0
return
self.data_dictionary['train_features'] = self.data_dictionary['train_features'][
(clustering.labels_ != -1)
]
@@ -725,7 +787,7 @@ class FreqaiDataKitchen:
if (len(do_predict) - do_predict.sum()) > 0:
logger.info(
f"DI tossed {len(do_predict) - do_predict.sum()} predictions for "
"being too far from training data"
"being too far from training data."
)
self.do_predict += do_predict
@@ -740,9 +802,10 @@ class FreqaiDataKitchen:
weights = np.exp(-np.arange(num_weights) / (wfactor * num_weights))[::-1]
return weights
def append_predictions(self, predictions: DataFrame, do_predict: npt.ArrayLike) -> None:
def get_predictions_to_append(self, predictions: DataFrame,
do_predict: npt.ArrayLike) -> DataFrame:
"""
Append backtest prediction from current backtest period to all previous periods
Get backtest prediction from current backtest period
"""
append_df = DataFrame()
@@ -757,13 +820,18 @@ class FreqaiDataKitchen:
if self.freqai_config["feature_parameters"].get("DI_threshold", 0) > 0:
append_df["DI_values"] = self.DI_values
return append_df
def append_predictions(self, append_df: DataFrame) -> None:
    """
    Add the predictions of the current backtest period to the predictions
    accumulated in self.full_df.
    :param append_df: prediction dataframe for the current backtest period
    """
    if not self.full_df.empty:
        # Stack the new rows below the rows gathered so far.
        self.full_df = pd.concat([self.full_df, append_df], axis=0)
        return

    # First period: there is nothing to concatenate with yet.
    self.full_df = append_df
def fill_predictions(self, dataframe):
"""
Back fill values to before the backtesting range so that the dataframe matches size
@@ -863,9 +931,7 @@ class FreqaiDataKitchen:
# We notice that users like to use exotic indicators where
# they do not know the required timeperiod. Here we include a factor
# of safety by multiplying the user considered "max" by 2.
max_period = self.freqai_config["feature_parameters"].get(
"indicator_max_period_candles", 20
) * 2
max_period = self.config.get('startup_candle_count', 20) * 2
additional_seconds = max_period * max_tf_seconds
if trained_timestamp != 0:
@@ -911,31 +977,6 @@ class FreqaiDataKitchen:
self.model_filename = f"cb_{coin.lower()}_{int(trained_timerange.stopts)}"
def download_all_data_for_training(self, timerange: TimeRange, dp: DataProvider) -> None:
    """
    Download the candle data needed for populating indicators and training
    the model. Per the original note, this is called only once, upon start of
    the bot.
    :param timerange: TimeRange = The full data timerange for populating the indicators
    and training the model.
    :param dp: DataProvider instance attached to the strategy
    :raises OperationalException: if the DataProvider has no exchange attached
    """
    span_seconds = timerange.stopts - timerange.startts
    new_pairs_days = int(span_seconds / SECONDS_IN_DAY)

    exchange = dp._exchange
    if not exchange:
        # Not realistic - this is only called in live mode.
        raise OperationalException("Dataprovider did not have an exchange attached.")

    download_kwargs = dict(
        pairs=self.all_pairs,
        timeframes=self.freqai_config["feature_parameters"].get("include_timeframes"),
        datadir=self.config["datadir"],
        timerange=timerange,
        new_pairs_days=new_pairs_days,
        erase=False,
        data_format=self.config.get("dataformat_ohlcv", "json"),
        trading_mode=self.config.get("trading_mode", "spot"),
        prepend=self.config.get("prepend_data", False),
    )
    refresh_backtest_ohlcv_data(exchange, **download_kwargs)
def set_all_pairs(self) -> None:
self.all_pairs = copy.deepcopy(
@@ -1049,3 +1090,50 @@ class FreqaiDataKitchen:
if self.unique_classes:
for label in self.unique_classes:
self.unique_class_list += list(self.unique_classes[label])
def save_backtesting_prediction(
    self, append_df: DataFrame
) -> None:
    """
    Write the prediction dataframe of a backtesting period to disk in h5 format.
    The predictions folder below self.full_path is created when missing; the
    file itself is written to self.backtesting_results_path, replacing any
    existing file (mode 'w').
    :param append_df: dataframe for backtesting period
    """
    predictions_dir = Path(self.full_path / self.backtest_predictions_folder)
    # With exist_ok=True this is a no-op when the directory already exists.
    predictions_dir.mkdir(parents=True, exist_ok=True)

    append_df.to_hdf(self.backtesting_results_path, key='append_df', mode='w')
def get_backtesting_prediction(
    self
) -> DataFrame:
    """
    Load the prediction dataframe stored in h5 format at
    self.backtesting_results_path.
    :returns: the object read from the h5 file
    """
    return pd.read_hdf(self.backtesting_results_path)
def check_if_backtest_prediction_exists(
    self
) -> bool:
    """
    Check if a backtesting prediction file already exists for the current
    model filename. As a side effect, self.backtesting_results_path is set to
    the path that was checked, whether or not the file exists.
    :return:
    :boolean: whether the prediction file exists or not.
    """
    prediction_filename = f"{self.model_filename}_prediction.h5"
    path_to_predictionfile = Path(
        self.full_path / self.backtest_predictions_folder / prediction_filename
    )
    # Remember the location for the later save/load of the prediction file.
    self.backtesting_results_path = path_to_predictionfile

    file_exists = path_to_predictionfile.is_file()
    if not file_exists:
        logger.info(
            f"Could not find backtesting prediction file at {path_to_predictionfile}"
        )
        return file_exists

    logger.info(f"Found backtesting prediction file at {path_to_predictionfile}")
    return file_exists

View File

@@ -71,6 +71,9 @@ class IFreqaiModel(ABC):
self.first = True
self.set_full_path()
self.follow_mode: bool = self.freqai_info.get("follow_mode", False)
self.save_backtest_models: bool = self.freqai_info.get("save_backtest_models", False)
if self.save_backtest_models:
logger.info('Backtesting module configured to save all models.')
self.dd = FreqaiDataDrawer(Path(self.full_path), self.config, self.follow_mode)
self.identifier: str = self.freqai_info.get("identifier", "no_id_provided")
self.scanning = False
@@ -125,10 +128,9 @@ class IFreqaiModel(ABC):
elif not self.follow_mode:
self.dk = FreqaiDataKitchen(self.config, self.live, metadata["pair"])
logger.info(f"Training {len(self.dk.training_timeranges)} timeranges")
with self.analysis_lock:
dataframe = self.dk.use_strategy_to_populate_indicators(
strategy, prediction_dataframe=dataframe, pair=metadata["pair"]
)
dataframe = self.dk.use_strategy_to_populate_indicators(
strategy, prediction_dataframe=dataframe, pair=metadata["pair"]
)
dk = self.start_backtesting(dataframe, metadata, self.dk)
dataframe = dk.remove_features_from_df(dk.return_dataframe)
@@ -225,28 +227,39 @@ class IFreqaiModel(ABC):
"trains"
)
trained_timestamp_int = int(trained_timestamp.stopts)
dk.data_path = Path(
dk.full_path
/
f"sub-train-{metadata['pair'].split('/')[0]}_{int(trained_timestamp.stopts)}"
f"sub-train-{metadata['pair'].split('/')[0]}_{trained_timestamp_int}"
)
if not self.model_exists(
metadata["pair"], dk, trained_timestamp=int(trained_timestamp.stopts)
):
dk.find_features(dataframe_train)
self.model = self.train(dataframe_train, metadata["pair"], dk)
self.dd.pair_dict[metadata["pair"]]["trained_timestamp"] = int(
trained_timestamp.stopts)
dk.set_new_model_names(metadata["pair"], trained_timestamp)
self.dd.save_data(self.model, metadata["pair"], dk)
dk.set_new_model_names(metadata["pair"], trained_timestamp)
if dk.check_if_backtest_prediction_exists():
append_df = dk.get_backtesting_prediction()
dk.append_predictions(append_df)
else:
self.model = self.dd.load_data(metadata["pair"], dk)
if not self.model_exists(
metadata["pair"], dk, trained_timestamp=trained_timestamp_int
):
dk.find_features(dataframe_train)
self.model = self.train(dataframe_train, metadata["pair"], dk)
self.dd.pair_dict[metadata["pair"]]["trained_timestamp"] = int(
trained_timestamp.stopts)
self.check_if_feature_list_matches_strategy(dataframe_train, dk)
if self.save_backtest_models:
logger.info('Saving backtest model to disk.')
self.dd.save_data(self.model, metadata["pair"], dk)
else:
self.model = self.dd.load_data(metadata["pair"], dk)
pred_df, do_preds = self.predict(dataframe_backtest, dk)
self.check_if_feature_list_matches_strategy(dataframe_train, dk)
dk.append_predictions(pred_df, do_preds)
pred_df, do_preds = self.predict(dataframe_backtest, dk)
append_df = dk.get_predictions_to_append(pred_df, do_preds)
dk.append_predictions(append_df)
dk.save_backtesting_prediction(append_df)
dk.fill_predictions(dataframe)
@@ -291,14 +304,8 @@ class IFreqaiModel(ABC):
)
dk.set_paths(metadata["pair"], new_trained_timerange.stopts)
# download candle history if it is not already in memory
# load candle history into memory if it is not yet.
if not self.dd.historic_data:
logger.info(
"Downloading all training data for all pairs in whitelist and "
"corr_pairlist, this may take a while if you do not have the "
"data saved"
)
dk.download_all_data_for_training(data_load_timerange, strategy.dp)
self.dd.load_all_pair_histories(data_load_timerange, dk)
if not self.scanning:
@@ -463,11 +470,6 @@ class IFreqaiModel(ABC):
:return:
:boolean: whether the model file exists or not.
"""
coin, _ = pair.split("/")
if not self.live:
dk.model_filename = model_filename = f"cb_{coin.lower()}_{trained_timestamp}"
path_to_modelfile = Path(dk.data_path / f"{model_filename}_model.joblib")
file_exists = path_to_modelfile.is_file()
if file_exists and not scanning:
@@ -620,8 +622,8 @@ class IFreqaiModel(ABC):
logger.info(
f'Total time spent inferencing pairlist {self.inference_time:.2f} seconds')
if self.inference_time > 0.25 * self.base_tf_seconds:
logger.warning('Inference took over 25/% of the candle time. Reduce pairlist to'
' avoid blinding open trades and degrading performance.')
logger.warning("Inference took over 25% of the candle time. Reduce pairlist to"
" avoid blinding open trades and degrading performance.")
self.pair_it = 0
self.inference_time = 0
return

134
freqtrade/freqai/utils.py Normal file
View File

@@ -0,0 +1,134 @@
import logging
from datetime import datetime, timezone
from freqtrade.configuration import TimeRange
from freqtrade.data.dataprovider import DataProvider
from freqtrade.data.history.history_utils import refresh_backtest_ohlcv_data
from freqtrade.exceptions import OperationalException
from freqtrade.exchange import timeframe_to_seconds
from freqtrade.exchange.exchange import market_is_active
from freqtrade.plugins.pairlist.pairlist_helpers import dynamic_expand_pairlist
logger = logging.getLogger(__name__)
def download_all_data_for_training(dp: DataProvider, config: dict) -> None:
    """
    Called only once upon start of bot to download the necessary data for
    populating indicators and training the model.
    The download timerange is computed from the configuration by
    get_required_data_timerange().
    :param dp: DataProvider instance attached to the strategy
    :param config: configuration dict; must contain "datadir" and
    config["freqai"]["feature_parameters"]
    :raises OperationalException: if the DataProvider has no exchange attached
    """
    exchange = dp._exchange
    if exchange is None:
        raise OperationalException('No exchange object found.')

    # Keep active markets, or every market when 'include_inactive' is set.
    markets = []
    for pair, market in exchange.markets.items():
        if market_is_active(market) or config.get('include_inactive'):
            markets.append(pair)

    all_pairs = dynamic_expand_pairlist(config, markets)

    timerange = get_required_data_timerange(config)

    # 86400 = seconds per day
    new_pairs_days = int((timerange.stopts - timerange.startts) / 86400)

    download_kwargs = dict(
        pairs=all_pairs,
        timeframes=config["freqai"]["feature_parameters"].get("include_timeframes"),
        datadir=config["datadir"],
        timerange=timerange,
        new_pairs_days=new_pairs_days,
        erase=False,
        data_format=config.get("dataformat_ohlcv", "json"),
        trading_mode=config.get("trading_mode", "spot"),
        prepend=config.get("prepend_data", False),
    )
    refresh_backtest_ohlcv_data(exchange, **download_kwargs)
def get_required_data_timerange(
    config: dict
) -> TimeRange:
    """
    Used to compute the required data download time range
    for auto data-download in FreqAI.
    The range ends at the current UTC time and reaches back over the training
    period plus a number of startup candles of the largest included timeframe.
    Side effect: config['startup_candle_count'] is overwritten with the
    startup candle count computed here.
    :param config: configuration dict containing the "freqai" section
    :returns: TimeRange covering the data to download
    """
    now_ts = datetime.now(tz=timezone.utc).timestamp()

    timeframes = config["freqai"]["feature_parameters"].get("include_timeframes")
    # Largest timeframe in seconds, never below 0.
    max_tf_seconds = max([0] + [timeframe_to_seconds(tf) for tf in timeframes])

    startup_candles = config.get('startup_candle_count', 0)
    indicator_periods = config["freqai"]["feature_parameters"]["indicator_periods_candles"]

    # Apply a 1.5x factor of safety to the longest required period.
    longest_period = max(startup_candles, max(indicator_periods))
    max_period = int(longest_period * 1.5)
    config['startup_candle_count'] = max_period
    logger.info(f'FreqAI auto-downloader using {max_period} startup candles.')

    additional_seconds = max_period * max_tf_seconds
    # 86400 = seconds per day
    train_seconds = config["freqai"].get("train_period_days", 0) * 86400

    startts = int(now_ts - train_seconds - additional_seconds)
    stopts = int(now_ts)
    return TimeRange('date', 'date', startts, stopts)
# Kept below for when we wish to download data of heterogeneous lengths for FreqAI.
# def download_all_data_for_training(dp: DataProvider, config: dict) -> None:
# """
# Called only once upon start of bot to download the necessary data for
# populating indicators and training a FreqAI model.
# :param timerange: TimeRange = The full data timerange for populating the indicators
# and training the model.
# :param dp: DataProvider instance attached to the strategy
# """
# if dp._exchange is not None:
# markets = [p for p, m in dp._exchange.markets.items() if market_is_active(m)
# or config.get('include_inactive')]
# else:
# # This should not occur:
# raise OperationalException('No exchange object found.')
# all_pairs = dynamic_expand_pairlist(config, markets)
# if not dp._exchange:
# # Not realistic - this is only called in live mode.
# raise OperationalException("Dataprovider did not have an exchange attached.")
# time = datetime.now(tz=timezone.utc).timestamp()
# for tf in config["freqai"]["feature_parameters"].get("include_timeframes"):
# timerange = TimeRange()
# timerange.startts = int(time)
# timerange.stopts = int(time)
# startup_candles = dp.get_required_startup(str(tf))
# tf_seconds = timeframe_to_seconds(str(tf))
# timerange.subtract_start(tf_seconds * startup_candles)
# new_pairs_days = int((timerange.stopts - timerange.startts) / 86400)
# # FIXME: now that we are looping on `refresh_backtest_ohlcv_data`, the function
# # redownloads the funding rate for each pair.
# refresh_backtest_ohlcv_data(
# dp._exchange,
# pairs=all_pairs,
# timeframes=[tf],
# datadir=config["datadir"],
# timerange=timerange,
# new_pairs_days=new_pairs_days,
# erase=False,
# data_format=config.get("dataformat_ohlcv", "json"),
# trading_mode=config.get("trading_mode", "spot"),
# prepend=config.get("prepend_data", False),
# )