Compare commits

...

3 Commits

Author SHA1 Message Date
robcaulk
755041c134 add noise feature, improve docstrings 2022-08-19 18:35:24 +02:00
robcaulk
98c62dad91 integrate inlier metric function 2022-08-19 15:22:54 +02:00
th0rntwig
52ee7fc981 Add inlier metric computation 2022-08-19 15:22:54 +02:00
2 changed files with 140 additions and 22 deletions

View File

@ -655,6 +655,114 @@ class FreqaiDataKitchen:
return return
def compute_inlier_metric(self, set_='train') -> None:
"""
Compute inlier metric from backwards distance distributions.
This metric defines how well features from a timepoint fit
into previous timepoints.
"""
import scipy.stats as ss
no_prev_pts = self.freqai_config["feature_parameters"]["inlier_metric_window"]
weib_pct = self.freqai_config["feature_parameters"]["inlier_metric_weibull_cutoff"]
if set_ == 'train':
compute_df = copy.deepcopy(self.data_dictionary['train_features'])
elif set_ == 'test':
compute_df = copy.deepcopy(self.data_dictionary['test_features'])
else:
compute_df = copy.deepcopy(self.data_dictionary['prediction_features'])
compute_df_reindexed = compute_df.reindex(
index=np.flip(compute_df.index)
)
pairwise = pd.DataFrame(
np.triu(
pairwise_distances(compute_df_reindexed, n_jobs=self.thread_count)
),
columns=compute_df_reindexed.index,
index=compute_df_reindexed.index
)
pairwise = pairwise.round(5)
column_labels = [
'{}{}'.format('d', i) for i in range(1, no_prev_pts + 1)
]
distances = pd.DataFrame(
columns=column_labels, index=compute_df.index
)
for index in compute_df.index[no_prev_pts:]:
current_row = pairwise.loc[[index]]
current_row_no_zeros = current_row.loc[
:, (current_row != 0).any(axis=0)
]
distances.loc[[index]] = current_row_no_zeros.iloc[
:, :no_prev_pts
]
distances = distances.replace([np.inf, -np.inf], np.nan)
drop_index = pd.isnull(distances).any(1)
distances = distances[drop_index == 0]
inliers = pd.DataFrame(index=distances.index)
for key in distances.keys():
current_distances = distances[key].dropna()
fit_params = ss.weibull_min.fit(current_distances)
cutoff = ss.weibull_min.ppf(weib_pct, *fit_params)
is_inlier = np.where(
current_distances <= cutoff, 1, 0
)
df_inlier = pd.DataFrame(
{key + '_IsInlier': is_inlier}, index=distances.index
)
inliers = pd.concat(
[inliers, df_inlier], axis=1
)
inlier_metric = pd.DataFrame(
data=inliers.sum(axis=1) / no_prev_pts,
columns=['inlier_metric'],
index=compute_df.index
)
inlier_metric = 2 * (inlier_metric - inlier_metric.min()) / \
(inlier_metric.max() - inlier_metric.min()) - 1
if set_ in ('train', 'test'):
inlier_metric = inlier_metric.iloc[no_prev_pts:]
compute_df = compute_df.iloc[no_prev_pts:]
self.remove_beginning_points_from_data_dict(set_, no_prev_pts)
self.data_dictionary[f'{set_}_features'] = pd.concat(
[compute_df, inlier_metric], axis=1)
else:
self.data_dictionary['prediction_features'] = pd.concat(
[compute_df, inlier_metric], axis=1)
self.data_dictionary['prediction_features'].fillna(0, inplace=True)
return None
def remove_beginning_points_from_data_dict(self, set_='train', no_prev_pts: int = 10):
features = self.data_dictionary[f'{set_}_features']
weights = self.data_dictionary[f'{set_}_weights']
labels = self.data_dictionary[f'{set_}_labels']
self.data_dictionary[f'{set_}_weights'] = weights[no_prev_pts:]
self.data_dictionary[f'{set_}_features'] = features.iloc[no_prev_pts:]
self.data_dictionary[f'{set_}_labels'] = labels.iloc[no_prev_pts:]
def add_noise_to_training_features(self) -> None:
"""
Add noise to train features to reduce the risk of overfitting.
"""
mu = 0 # no shift
sigma = self.freqai_config["feature_parameters"]["noise_standard_deviation"]
compute_df = self.data_dictionary['train_features']
noise = np.random.normal(mu, sigma, [compute_df.shape[0], compute_df.shape[1]])
self.data_dictionary['train_features'] += noise
return
def find_features(self, dataframe: DataFrame) -> None: def find_features(self, dataframe: DataFrame) -> None:
""" """
Find features in the strategy provided dataframe Find features in the strategy provided dataframe

View File

@ -66,7 +66,6 @@ class IFreqaiModel(ABC):
"data_split_parameters", {}) "data_split_parameters", {})
self.model_training_parameters: Dict[str, Any] = config.get("freqai", {}).get( self.model_training_parameters: Dict[str, Any] = config.get("freqai", {}).get(
"model_training_parameters", {}) "model_training_parameters", {})
self.feature_parameters = config.get("freqai", {}).get("feature_parameters")
self.retrain = False self.retrain = False
self.first = True self.first = True
self.set_full_path() self.set_full_path()
@ -74,11 +73,14 @@ class IFreqaiModel(ABC):
self.dd = FreqaiDataDrawer(Path(self.full_path), self.config, self.follow_mode) self.dd = FreqaiDataDrawer(Path(self.full_path), self.config, self.follow_mode)
self.identifier: str = self.freqai_info.get("identifier", "no_id_provided") self.identifier: str = self.freqai_info.get("identifier", "no_id_provided")
self.scanning = False self.scanning = False
self.ft_params = self.freqai_info["feature_parameters"]
self.keras: bool = self.freqai_info.get("keras", False) self.keras: bool = self.freqai_info.get("keras", False)
if self.keras and self.freqai_info.get("feature_parameters", {}).get("DI_threshold", 0): if self.keras and self.ft_params.get("DI_threshold", 0):
self.freqai_info["feature_parameters"]["DI_threshold"] = 0 self.ft_params["DI_threshold"] = 0
logger.warning("DI threshold is not configured for Keras models yet. Deactivating.") logger.warning("DI threshold is not configured for Keras models yet. Deactivating.")
self.CONV_WIDTH = self.freqai_info.get("conv_width", 2) self.CONV_WIDTH = self.freqai_info.get("conv_width", 2)
if self.ft_params.get("inlier_metric_window", 0):
self.CONV_WIDTH = self.ft_params.get("inlier_metric_window", 0) * 2
self.pair_it = 0 self.pair_it = 0
self.total_pairs = len(self.config.get("exchange", {}).get("pair_whitelist")) self.total_pairs = len(self.config.get("exchange", {}).get("pair_whitelist"))
self.last_trade_database_summary: DataFrame = {} self.last_trade_database_summary: DataFrame = {}
@ -383,24 +385,25 @@ class IFreqaiModel(ABC):
def data_cleaning_train(self, dk: FreqaiDataKitchen) -> None: def data_cleaning_train(self, dk: FreqaiDataKitchen) -> None:
""" """
Base data cleaning method for train Base data cleaning method for train.
Any function inside this method should drop training data points from the filtered_dataframe Functions here improve/modify the input data by identifying outliers,
based on user decided logic. See FreqaiDataKitchen::use_SVM_to_remove_outliers() for an computing additional metrics, adding noise, reducing dimensionality etc.
example of how outlier data points are dropped from the dataframe used for training.
""" """
if self.freqai_info["feature_parameters"].get( ft_params = self.freqai_info["feature_parameters"]
if ft_params.get(
"principal_component_analysis", False "principal_component_analysis", False
): ):
dk.principal_component_analysis() dk.principal_component_analysis()
if self.freqai_info["feature_parameters"].get("use_SVM_to_remove_outliers", False): if ft_params.get("use_SVM_to_remove_outliers", False):
dk.use_SVM_to_remove_outliers(predict=False) dk.use_SVM_to_remove_outliers(predict=False)
if self.freqai_info["feature_parameters"].get("DI_threshold", 0): if ft_params.get("DI_threshold", 0):
dk.data["avg_mean_dist"] = dk.compute_distances() dk.data["avg_mean_dist"] = dk.compute_distances()
if self.freqai_info["feature_parameters"].get("use_DBSCAN_to_remove_outliers", False): if ft_params.get("use_DBSCAN_to_remove_outliers", False):
if dk.pair in self.dd.old_DBSCAN_eps: if dk.pair in self.dd.old_DBSCAN_eps:
eps = self.dd.old_DBSCAN_eps[dk.pair] eps = self.dd.old_DBSCAN_eps[dk.pair]
else: else:
@ -408,29 +411,36 @@ class IFreqaiModel(ABC):
dk.use_DBSCAN_to_remove_outliers(predict=False, eps=eps) dk.use_DBSCAN_to_remove_outliers(predict=False, eps=eps)
self.dd.old_DBSCAN_eps[dk.pair] = dk.data['DBSCAN_eps'] self.dd.old_DBSCAN_eps[dk.pair] = dk.data['DBSCAN_eps']
if ft_params.get('inlier_metric_window', 0):
dk.compute_inlier_metric(set_='train')
if self.freqai_info["data_split_parameters"]["test_size"] > 0:
dk.compute_inlier_metric(set_='test')
if self.freqai_info["feature_parameters"].get('noise_standard_deviation', 0):
dk.add_noise_to_training_features()
def data_cleaning_predict(self, dk: FreqaiDataKitchen, dataframe: DataFrame) -> None: def data_cleaning_predict(self, dk: FreqaiDataKitchen, dataframe: DataFrame) -> None:
""" """
Base data cleaning method for predict. Base data cleaning method for predict.
These functions each modify dk.do_predict, which is a dataframe with equal length Functions here are complementary to the functions of data_cleaning_train.
to the number of candles coming from and returning to the strategy. Inside do_predict,
1 allows prediction and < 0 signals to the strategy that the model is not confident in
the prediction.
See FreqaiDataKitchen::remove_outliers() for an example
of how the do_predict vector is modified. do_predict is ultimately passed back to strategy
for buy signals.
""" """
if self.freqai_info["feature_parameters"].get( ft_params = self.freqai_info["feature_parameters"]
if ft_params.get('inlier_metric_window', 0):
dk.compute_inlier_metric(set_='predict')
if ft_params.get(
"principal_component_analysis", False "principal_component_analysis", False
): ):
dk.pca_transform(dataframe) dk.pca_transform(dataframe)
if self.freqai_info["feature_parameters"].get("use_SVM_to_remove_outliers", False): if ft_params.get("use_SVM_to_remove_outliers", False):
dk.use_SVM_to_remove_outliers(predict=True) dk.use_SVM_to_remove_outliers(predict=True)
if self.freqai_info["feature_parameters"].get("DI_threshold", 0): if ft_params.get("DI_threshold", 0):
dk.check_if_pred_in_training_spaces() dk.check_if_pred_in_training_spaces()
if self.freqai_info["feature_parameters"].get("use_DBSCAN_to_remove_outliers", False): if ft_params.get("use_DBSCAN_to_remove_outliers", False):
dk.use_DBSCAN_to_remove_outliers(predict=True) dk.use_DBSCAN_to_remove_outliers(predict=True)
def model_exists( def model_exists(