Merge branch 'freqtrade:develop' into fixHyperoptFreqai

This commit is contained in:
wagnercosta
2022-09-07 11:07:51 -03:00
committed by GitHub
11 changed files with 370 additions and 73 deletions

View File

@@ -1,7 +1,8 @@
import copy
import datetime
import logging
import shutil
from datetime import datetime, timezone
from math import cos, sin
from pathlib import Path
from typing import Any, Dict, List, Tuple
@@ -9,6 +10,7 @@ import numpy as np
import numpy.typing as npt
import pandas as pd
from pandas import DataFrame
from scipy import stats
from sklearn import linear_model
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import pairwise_distances
@@ -360,7 +362,7 @@ class FreqaiDataKitchen:
def denormalize_labels_from_metadata(self, df: DataFrame) -> DataFrame:
"""
Normalize a set of data using the mean and standard deviation from
Denormalize a set of data using the mean and standard deviation from
the associated training data.
:param df: Dataframe of predictions to be denormalized
"""
@@ -399,7 +401,7 @@ class FreqaiDataKitchen:
config_timerange = TimeRange.parse_timerange(self.config["timerange"])
if config_timerange.stopts == 0:
config_timerange.stopts = int(
datetime.datetime.now(tz=datetime.timezone.utc).timestamp()
datetime.now(tz=timezone.utc).timestamp()
)
timerange_train = copy.deepcopy(full_timerange)
timerange_backtest = copy.deepcopy(full_timerange)
@@ -416,8 +418,8 @@ class FreqaiDataKitchen:
timerange_train.stopts = timerange_train.startts + train_period_days
first = False
start = datetime.datetime.utcfromtimestamp(timerange_train.startts)
stop = datetime.datetime.utcfromtimestamp(timerange_train.stopts)
start = datetime.fromtimestamp(timerange_train.startts, tz=timezone.utc)
stop = datetime.fromtimestamp(timerange_train.stopts, tz=timezone.utc)
tr_training_list.append(start.strftime("%Y%m%d") + "-" + stop.strftime("%Y%m%d"))
tr_training_list_timerange.append(copy.deepcopy(timerange_train))
@@ -430,8 +432,8 @@ class FreqaiDataKitchen:
if timerange_backtest.stopts > config_timerange.stopts:
timerange_backtest.stopts = config_timerange.stopts
start = datetime.datetime.utcfromtimestamp(timerange_backtest.startts)
stop = datetime.datetime.utcfromtimestamp(timerange_backtest.stopts)
start = datetime.fromtimestamp(timerange_backtest.startts, tz=timezone.utc)
stop = datetime.fromtimestamp(timerange_backtest.stopts, tz=timezone.utc)
tr_backtesting_list.append(start.strftime("%Y%m%d") + "-" + stop.strftime("%Y%m%d"))
tr_backtesting_list_timerange.append(copy.deepcopy(timerange_backtest))
@@ -451,8 +453,8 @@ class FreqaiDataKitchen:
it is sliced down to just the present training period.
"""
start = datetime.datetime.fromtimestamp(timerange.startts, tz=datetime.timezone.utc)
stop = datetime.datetime.fromtimestamp(timerange.stopts, tz=datetime.timezone.utc)
start = datetime.fromtimestamp(timerange.startts, tz=timezone.utc)
stop = datetime.fromtimestamp(timerange.stopts, tz=timezone.utc)
df = df.loc[df["date"] >= start, :]
if not self.live:
df = df.loc[df["date"] < stop, :]
@@ -653,8 +655,6 @@ class FreqaiDataKitchen:
is an outlier.
"""
from math import cos, sin
if predict:
if not self.data['DBSCAN_eps']:
return
@@ -747,6 +747,111 @@ class FreqaiDataKitchen:
return
def compute_inlier_metric(self, set_='train') -> None:
    """
    Compute inlier metric from backwards distance distributions.
    This metric defines how well features from a timepoint fit
    into previous timepoints.

    The result is appended as an `inlier_metric` column to the selected
    feature set inside `self.data_dictionary`.
    :param set_: 'train' or 'test' selects the `<set_>_features` entry, any
        other value selects 'prediction_features'.
    """
    no_prev_pts = self.freqai_config["feature_parameters"]["inlier_metric_window"]

    is_fit_set = set_ in ('train', 'test')
    features_key = f'{set_}_features' if is_fit_set else 'prediction_features'
    compute_df = copy.deepcopy(self.data_dictionary[features_key])

    # Reverse the row order so that, after np.triu, every row only keeps the
    # distances to rows located at or before it in the original ordering.
    compute_df_reindexed = compute_df.reindex(
        index=np.flip(compute_df.index)
    )

    pairwise = pd.DataFrame(
        np.triu(
            pairwise_distances(compute_df_reindexed, n_jobs=self.thread_count)
        ),
        columns=compute_df_reindexed.index,
        index=compute_df_reindexed.index
    )
    pairwise = pairwise.round(5)

    column_labels = [f'd{i}' for i in range(1, no_prev_pts + 1)]
    distances = pd.DataFrame(
        columns=column_labels, index=compute_df.index
    )

    # The first `no_prev_pts` rows are skipped, their distance rows stay NaN.
    for index in compute_df.index[no_prev_pts:]:
        current_row = pairwise.loc[[index]]
        # Discard the zeroed entries (lower triangle and diagonal), then keep
        # the first `no_prev_pts` remaining distances.
        current_row_no_zeros = current_row.loc[
            :, (current_row != 0).any(axis=0)
        ]
        distances.loc[[index]] = current_row_no_zeros.iloc[
            :, :no_prev_pts
        ]

    distances = distances.replace([np.inf, -np.inf], np.nan)
    # `axis` must be passed by keyword: the positional form is deprecated in
    # pandas 1.5 and rejected by pandas 2.0.
    has_missing = pd.isnull(distances).any(axis=1)
    distances = distances[~has_missing]

    # Fit a Weibull distribution to each distance column and express every
    # distance as its CDF quantile under that fit.
    quantile_columns = {}
    for key in distances.keys():
        current_distances = distances[key].dropna()
        fit_params = stats.weibull_min.fit(current_distances)
        quantile_columns[key] = stats.weibull_min.cdf(current_distances, *fit_params)
    inliers = pd.DataFrame(quantile_columns, index=distances.index)

    # Average the quantiles per row; rows without distances become NaN once
    # the result is aligned on the full feature index.
    inlier_metric = pd.DataFrame(
        data=inliers.sum(axis=1) / no_prev_pts,
        columns=['inlier_metric'],
        index=compute_df.index
    )

    # Min-max rescale the metric to the range [-1, 1].
    metric_min = inlier_metric.min()
    metric_max = inlier_metric.max()
    inlier_metric = 2 * (inlier_metric - metric_min) / (metric_max - metric_min) - 1

    if is_fit_set:
        # Drop the leading rows that were skipped above, both from the new
        # feature frame and from the stored features/labels/weights.
        inlier_metric = inlier_metric.iloc[no_prev_pts:]
        compute_df = compute_df.iloc[no_prev_pts:]
        self.remove_beginning_points_from_data_dict(set_, no_prev_pts)
        self.data_dictionary[features_key] = pd.concat(
            [compute_df, inlier_metric], axis=1)
    else:
        # Prediction data keeps every row; missing values are filled with 0.
        self.data_dictionary['prediction_features'] = pd.concat(
            [compute_df, inlier_metric], axis=1)
        self.data_dictionary['prediction_features'].fillna(0, inplace=True)

    logger.info('Inlier metric computed and added to features.')
def remove_beginning_points_from_data_dict(self, set_='train', no_prev_pts: int = 10):
    """
    Drop the first `no_prev_pts` entries from the features, weights and
    labels stored in `self.data_dictionary` for the given set.
    :param set_: prefix of the data dictionary keys to trim, e.g. 'train'
        trims 'train_features', 'train_weights' and 'train_labels'.
    :param no_prev_pts: number of leading entries to remove.
    """
    store = self.data_dictionary
    features_key = f'{set_}_features'
    weights_key = f'{set_}_weights'
    labels_key = f'{set_}_labels'

    # Read all three entries before writing any of them back.
    feature_frame = store[features_key]
    weight_values = store[weights_key]
    label_frame = store[labels_key]

    # Weights are sliced directly, features and labels positionally via iloc.
    store[weights_key] = weight_values[no_prev_pts:]
    store[features_key] = feature_frame.iloc[no_prev_pts:]
    store[labels_key] = label_frame.iloc[no_prev_pts:]
def add_noise_to_training_features(self) -> None:
    """
    Add noise to train features to reduce the risk of overfitting.

    The noise is drawn from a zero-mean normal distribution whose standard
    deviation is the `noise_standard_deviation` feature parameter, one sample
    per cell of the train features.
    """
    noise_std = self.freqai_config["feature_parameters"]["noise_standard_deviation"]
    train_features = self.data_dictionary['train_features']
    noise_shape = [train_features.shape[0], train_features.shape[1]]
    # Mean of 0: the features are perturbed but not shifted.
    gaussian_noise = np.random.normal(0, noise_std, noise_shape)
    self.data_dictionary['train_features'] += gaussian_noise
    return
def find_features(self, dataframe: DataFrame) -> None:
"""
Find features in the strategy provided dataframe
@@ -872,14 +977,14 @@ class FreqaiDataKitchen:
"Please indicate the end date of your desired backtesting. "
"timerange.")
# backtest_timerange.stopts = int(
# datetime.datetime.now(tz=datetime.timezone.utc).timestamp()
# datetime.now(tz=timezone.utc).timestamp()
# )
backtest_timerange.startts = (
backtest_timerange.startts - backtest_period_days * SECONDS_IN_DAY
)
start = datetime.datetime.utcfromtimestamp(backtest_timerange.startts)
stop = datetime.datetime.utcfromtimestamp(backtest_timerange.stopts)
start = datetime.fromtimestamp(backtest_timerange.startts, tz=timezone.utc)
stop = datetime.fromtimestamp(backtest_timerange.stopts, tz=timezone.utc)
full_timerange = start.strftime("%Y%m%d") + "-" + stop.strftime("%Y%m%d")
self.full_path = Path(
@@ -905,7 +1010,7 @@ class FreqaiDataKitchen:
:return:
bool = If the model is expired or not.
"""
time = datetime.datetime.now(tz=datetime.timezone.utc).timestamp()
time = datetime.now(tz=timezone.utc).timestamp()
elapsed_time = (time - trained_timestamp) / 3600 # hours
max_time = self.freqai_config.get("expiration_hours", 0)
if max_time > 0:
@@ -917,7 +1022,7 @@ class FreqaiDataKitchen:
self, trained_timestamp: int
) -> Tuple[bool, TimeRange, TimeRange]:
time = datetime.datetime.now(tz=datetime.timezone.utc).timestamp()
time = datetime.now(tz=timezone.utc).timestamp()
trained_timerange = TimeRange()
data_load_timerange = TimeRange()

View File

@@ -1,10 +1,9 @@
# import contextlib
import datetime
import logging
import shutil
import threading
import time
from abc import ABC, abstractmethod
from datetime import datetime, timezone
from pathlib import Path
from threading import Lock
from typing import Any, Dict, List, Tuple
@@ -59,7 +58,6 @@ class IFreqaiModel(ABC):
"data_split_parameters", {})
self.model_training_parameters: Dict[str, Any] = config.get("freqai", {}).get(
"model_training_parameters", {})
self.feature_parameters = config.get("freqai", {}).get("feature_parameters")
self.retrain = False
self.first = True
self.set_full_path()
@@ -70,11 +68,14 @@ class IFreqaiModel(ABC):
self.dd = FreqaiDataDrawer(Path(self.full_path), self.config, self.follow_mode)
self.identifier: str = self.freqai_info.get("identifier", "no_id_provided")
self.scanning = False
self.ft_params = self.freqai_info["feature_parameters"]
self.keras: bool = self.freqai_info.get("keras", False)
if self.keras and self.freqai_info.get("feature_parameters", {}).get("DI_threshold", 0):
self.freqai_info["feature_parameters"]["DI_threshold"] = 0
if self.keras and self.ft_params.get("DI_threshold", 0):
self.ft_params["DI_threshold"] = 0
logger.warning("DI threshold is not configured for Keras models yet. Deactivating.")
self.CONV_WIDTH = self.freqai_info.get("conv_width", 2)
if self.ft_params.get("inlier_metric_window", 0):
self.CONV_WIDTH = self.ft_params.get("inlier_metric_window", 0) * 2
self.pair_it = 0
self.pair_it_train = 0
self.total_pairs = len(self.config.get("exchange", {}).get("pair_whitelist"))
@@ -195,7 +196,7 @@ class IFreqaiModel(ABC):
if retrain:
self.train_timer('start')
self.train_model_in_series(
self.extract_data_and_train_model(
new_trained_timerange, pair, strategy, dk, data_load_timerange
)
self.train_timer('stop')
@@ -235,12 +236,12 @@ class IFreqaiModel(ABC):
dataframe_backtest = dk.slice_dataframe(tr_backtest, dataframe)
trained_timestamp = tr_train
tr_train_startts_str = datetime.datetime.utcfromtimestamp(tr_train.startts).strftime(
"%Y-%m-%d %H:%M:%S"
)
tr_train_stopts_str = datetime.datetime.utcfromtimestamp(tr_train.stopts).strftime(
"%Y-%m-%d %H:%M:%S"
)
tr_train_startts_str = datetime.fromtimestamp(
tr_train.startts,
tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
tr_train_stopts_str = datetime.fromtimestamp(
tr_train.stopts,
tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
logger.info(
f"Training {metadata['pair']}, {self.pair_it}/{self.total_pairs} pairs"
f" from {tr_train_startts_str} to {tr_train_stopts_str}, {train_it}/{total_trains} "
@@ -425,24 +426,25 @@ class IFreqaiModel(ABC):
def data_cleaning_train(self, dk: FreqaiDataKitchen) -> None:
"""
Base data cleaning method for train
Any function inside this method should drop training data points from the filtered_dataframe
based on user decided logic. See FreqaiDataKitchen::use_SVM_to_remove_outliers() for an
example of how outlier data points are dropped from the dataframe used for training.
Base data cleaning method for train.
Functions here improve/modify the input data by identifying outliers,
computing additional metrics, adding noise, reducing dimensionality etc.
"""
if self.freqai_info["feature_parameters"].get(
ft_params = self.freqai_info["feature_parameters"]
if ft_params.get(
"principal_component_analysis", False
):
dk.principal_component_analysis()
if self.freqai_info["feature_parameters"].get("use_SVM_to_remove_outliers", False):
if ft_params.get("use_SVM_to_remove_outliers", False):
dk.use_SVM_to_remove_outliers(predict=False)
if self.freqai_info["feature_parameters"].get("DI_threshold", 0):
if ft_params.get("DI_threshold", 0):
dk.data["avg_mean_dist"] = dk.compute_distances()
if self.freqai_info["feature_parameters"].get("use_DBSCAN_to_remove_outliers", False):
if ft_params.get("use_DBSCAN_to_remove_outliers", False):
if dk.pair in self.dd.old_DBSCAN_eps:
eps = self.dd.old_DBSCAN_eps[dk.pair]
else:
@@ -450,29 +452,36 @@ class IFreqaiModel(ABC):
dk.use_DBSCAN_to_remove_outliers(predict=False, eps=eps)
self.dd.old_DBSCAN_eps[dk.pair] = dk.data['DBSCAN_eps']
if ft_params.get('inlier_metric_window', 0):
dk.compute_inlier_metric(set_='train')
if self.freqai_info["data_split_parameters"]["test_size"] > 0:
dk.compute_inlier_metric(set_='test')
if self.freqai_info["feature_parameters"].get('noise_standard_deviation', 0):
dk.add_noise_to_training_features()
def data_cleaning_predict(self, dk: FreqaiDataKitchen, dataframe: DataFrame) -> None:
"""
Base data cleaning method for predict.
These functions each modify dk.do_predict, which is a dataframe with equal length
to the number of candles coming from and returning to the strategy. Inside do_predict,
1 allows prediction and < 0 signals to the strategy that the model is not confident in
the prediction.
See FreqaiDataKitchen::remove_outliers() for an example
of how the do_predict vector is modified. do_predict is ultimately passed back to strategy
for buy signals.
Functions here are complementary to the functions of data_cleaning_train.
"""
if self.freqai_info["feature_parameters"].get(
ft_params = self.freqai_info["feature_parameters"]
if ft_params.get('inlier_metric_window', 0):
dk.compute_inlier_metric(set_='predict')
if ft_params.get(
"principal_component_analysis", False
):
dk.pca_transform(dataframe)
if self.freqai_info["feature_parameters"].get("use_SVM_to_remove_outliers", False):
if ft_params.get("use_SVM_to_remove_outliers", False):
dk.use_SVM_to_remove_outliers(predict=True)
if self.freqai_info["feature_parameters"].get("DI_threshold", 0):
if ft_params.get("DI_threshold", 0):
dk.check_if_pred_in_training_spaces()
if self.freqai_info["feature_parameters"].get("use_DBSCAN_to_remove_outliers", False):
if ft_params.get("use_DBSCAN_to_remove_outliers", False):
dk.use_DBSCAN_to_remove_outliers(predict=True)
def model_exists(
@@ -508,7 +517,7 @@ class IFreqaiModel(ABC):
Path(self.full_path, Path(self.config["config_files"][0]).name),
)
def train_model_in_series(
def extract_data_and_train_model(
self,
new_trained_timerange: TimeRange,
pair: str,
@@ -600,7 +609,7 @@ class IFreqaiModel(ABC):
# # for keras type models, the conv_window needs to be prepended so
# # viewing is correct in frequi
if self.freqai_info.get('keras', False):
if self.freqai_info.get('keras', False) or self.ft_params.get('inlier_metric_window', 0):
n_lost_points = self.freqai_info.get('conv_width', 2)
zeros_df = DataFrame(np.zeros((n_lost_points, len(hist_preds_df.columns))),
columns=hist_preds_df.columns)