integrate inlier metric function

This commit is contained in:
robcaulk 2022-08-18 19:15:29 +02:00
parent d3cb211283
commit b11742a4c5
2 changed files with 79 additions and 42 deletions

View File

@ -723,81 +723,104 @@ class FreqaiDataKitchen:
) )
return return
def compute_inlier_metric(self) -> None: def compute_inlier_metric(self, set_='train') -> None:
""" """
Compute inlier metric from backwards distance distributions. Compute inlier metric from backwards distance distributions.
This metric defines how well features from a timepoint fit This metric defines how well features from a timepoint fit
into previous timepoints. into previous timepoints.
""" """
import scipy.stats as ss import scipy.stats as ss
nmb_previous_points = self.data['InlierMetric_nmb_points']
weibull_percentile = self.data['InlierMetric_weib_perc']
train_ft_df = self.data_dictionary['train_features'] no_prev_pts = self.freqai_config["feature_parameters"]["inlier_metric_window"]
train_ft_df_reindexed = train_ft_df.reindex( weib_pct = self.freqai_config["feature_parameters"]["inlier_metric_weibull_cutoff"]
index=np.flip(train_ft_df.index)
if set_ == 'train':
compute_df = copy.deepcopy(self.data_dictionary['train_features'])
elif set_ == 'test':
compute_df = copy.deepcopy(self.data_dictionary['test_features'])
else:
compute_df = copy.deepcopy(self.data_dictionary['prediction_features'])
compute_df_reindexed = compute_df.reindex(
index=np.flip(compute_df.index)
) )
pairwise = pd.DataFrame( pairwise = pd.DataFrame(
np.triu( np.triu(
pairwise_distances(train_ft_df_reindexed, n_jobs=self.thread_count) pairwise_distances(compute_df_reindexed, n_jobs=self.thread_count)
), ),
columns=train_ft_df_reindexed.index, columns=compute_df_reindexed.index,
index=train_ft_df_reindexed.index index=compute_df_reindexed.index
) )
pairwise = pairwise.round(5) pairwise = pairwise.round(5)
column_labels = [ column_labels = [
'{}{}'.format('d', i) for i in range(1, nmb_previous_points+1) '{}{}'.format('d', i) for i in range(1, no_prev_pts + 1)
] ]
distances = pd.DataFrame( distances = pd.DataFrame(
columns=column_labels, index=train_ft_df.index columns=column_labels, index=compute_df.index
) )
for index in train_ft_df.index[nmb_previous_points]:
for index in compute_df.index[no_prev_pts:]:
current_row = pairwise.loc[[index]] current_row = pairwise.loc[[index]]
current_row_no_zeros = current_row.loc[ current_row_no_zeros = current_row.loc[
:, (current_row!=0).any(axis=0) :, (current_row != 0).any(axis=0)
] ]
distances.loc[[index]] = current_row_no_zeros.iloc[ distances.loc[[index]] = current_row_no_zeros.iloc[
:, :nmb_previous_points :, :no_prev_pts
] ]
distances = distances.replace([np.inf, -np.inf], np.nan) distances = distances.replace([np.inf, -np.inf], np.nan)
drop_index = pd.isnull(distances).any(1) drop_index = pd.isnull(distances).any(1)
distances = distances[drop_index==0] distances = distances[drop_index == 0]
inliers = pd.DataFrame(index=distances.index) inliers = pd.DataFrame(index=distances.index)
for key in distances.keys(): for key in distances.keys():
current_distances = distances[key].dropna() current_distances = distances[key].dropna()
fit_params = ss.weibull_min.fit(current_distances) fit_params = ss.weibull_min.fit(current_distances)
cutoff = ss.weibull_min.ppf(weibull_percentile, *fit_params) cutoff = ss.weibull_min.ppf(weib_pct, *fit_params)
is_inlier = np.where( is_inlier = np.where(
current_distances<=cutoff, 1, 0 current_distances <= cutoff, 1, 0
) )
df_inlier = pd.DataFrame( df_inlier = pd.DataFrame(
{key+'_IsInlier':is_inlier}, index=distances.index {key + '_IsInlier': is_inlier}, index=distances.index
) )
inliers = pd.concat( inliers = pd.concat(
[inliers, df_inlier], axis=1 [inliers, df_inlier], axis=1
) )
self.data_dictionary['train_features'] = pd.DataFrame( inlier_metric = pd.DataFrame(
data=inliers.sum(axis=1)/nmb_previous_points, data=inliers.sum(axis=1) / no_prev_pts,
columns=['inlier_metric'], columns=['inlier_metric'],
index = train_ft_df.index index=compute_df.index
) )
percent_outliers = np.round( inlier_metric = 2 * (inlier_metric - inlier_metric.min()) / \
100*(1-self.data_dictionary['iniler_metric'].sum()/ (inlier_metric.max() - inlier_metric.min()) - 1
len(train_ft_df.index)), 2
) if set_ in ('train', 'test'):
logger.info('{percent_outliers}%% of data points were identified as outliers') inlier_metric = inlier_metric.iloc[no_prev_pts:]
compute_df = compute_df.iloc[no_prev_pts:]
self.remove_beginning_points_from_data_dict(set_, no_prev_pts)
self.data_dictionary[f'{set_}_features'] = pd.concat(
[compute_df, inlier_metric], axis=1)
else:
self.data_dictionary['prediction_features'] = pd.concat(
[compute_df, inlier_metric], axis=1)
self.data_dictionary['prediction_features'].fillna(0, inplace=True)
return None return None
def remove_beginning_points_from_data_dict(self, set_='train', no_prev_pts: int = 10):
features = self.data_dictionary[f'{set_}_features']
weights = self.data_dictionary[f'{set_}_weights']
labels = self.data_dictionary[f'{set_}_labels']
self.data_dictionary[f'{set_}_weights'] = weights[no_prev_pts:]
self.data_dictionary[f'{set_}_features'] = features.iloc[no_prev_pts:]
self.data_dictionary[f'{set_}_labels'] = labels.iloc[no_prev_pts:]
def find_features(self, dataframe: DataFrame) -> None: def find_features(self, dataframe: DataFrame) -> None:
""" """
Find features in the strategy provided dataframe Find features in the strategy provided dataframe

View File

@ -66,7 +66,6 @@ class IFreqaiModel(ABC):
"data_split_parameters", {}) "data_split_parameters", {})
self.model_training_parameters: Dict[str, Any] = config.get("freqai", {}).get( self.model_training_parameters: Dict[str, Any] = config.get("freqai", {}).get(
"model_training_parameters", {}) "model_training_parameters", {})
self.feature_parameters = config.get("freqai", {}).get("feature_parameters")
self.retrain = False self.retrain = False
self.first = True self.first = True
self.set_full_path() self.set_full_path()
@ -74,11 +73,14 @@ class IFreqaiModel(ABC):
self.dd = FreqaiDataDrawer(Path(self.full_path), self.config, self.follow_mode) self.dd = FreqaiDataDrawer(Path(self.full_path), self.config, self.follow_mode)
self.identifier: str = self.freqai_info.get("identifier", "no_id_provided") self.identifier: str = self.freqai_info.get("identifier", "no_id_provided")
self.scanning = False self.scanning = False
self.ft_params = self.freqai_info["feature_parameters"]
self.keras: bool = self.freqai_info.get("keras", False) self.keras: bool = self.freqai_info.get("keras", False)
if self.keras and self.freqai_info.get("feature_parameters", {}).get("DI_threshold", 0): if self.keras and self.ft_params.get("DI_threshold", 0):
self.freqai_info["feature_parameters"]["DI_threshold"] = 0 self.ft_params["DI_threshold"] = 0
logger.warning("DI threshold is not configured for Keras models yet. Deactivating.") logger.warning("DI threshold is not configured for Keras models yet. Deactivating.")
self.CONV_WIDTH = self.freqai_info.get("conv_width", 2) self.CONV_WIDTH = self.freqai_info.get("conv_width", 2)
if self.ft_params.get("inlier_metric_window", 0):
self.CONV_WIDTH = self.ft_params.get("inlier_metric_window", 0) * 2
self.pair_it = 0 self.pair_it = 0
self.pair_it_train = 0 self.pair_it_train = 0
self.total_pairs = len(self.config.get("exchange", {}).get("pair_whitelist")) self.total_pairs = len(self.config.get("exchange", {}).get("pair_whitelist"))
@ -403,18 +405,20 @@ class IFreqaiModel(ABC):
example of how outlier data points are dropped from the dataframe used for training. example of how outlier data points are dropped from the dataframe used for training.
""" """
if self.freqai_info["feature_parameters"].get( ft_params = self.freqai_info["feature_parameters"]
if ft_params.get(
"principal_component_analysis", False "principal_component_analysis", False
): ):
dk.principal_component_analysis() dk.principal_component_analysis()
if self.freqai_info["feature_parameters"].get("use_SVM_to_remove_outliers", False): if ft_params.get("use_SVM_to_remove_outliers", False):
dk.use_SVM_to_remove_outliers(predict=False) dk.use_SVM_to_remove_outliers(predict=False)
if self.freqai_info["feature_parameters"].get("DI_threshold", 0): if ft_params.get("DI_threshold", 0):
dk.data["avg_mean_dist"] = dk.compute_distances() dk.data["avg_mean_dist"] = dk.compute_distances()
if self.freqai_info["feature_parameters"].get("use_DBSCAN_to_remove_outliers", False): if ft_params.get("use_DBSCAN_to_remove_outliers", False):
if dk.pair in self.dd.old_DBSCAN_eps: if dk.pair in self.dd.old_DBSCAN_eps:
eps = self.dd.old_DBSCAN_eps[dk.pair] eps = self.dd.old_DBSCAN_eps[dk.pair]
else: else:
@ -422,6 +426,11 @@ class IFreqaiModel(ABC):
dk.use_DBSCAN_to_remove_outliers(predict=False, eps=eps) dk.use_DBSCAN_to_remove_outliers(predict=False, eps=eps)
self.dd.old_DBSCAN_eps[dk.pair] = dk.data['DBSCAN_eps'] self.dd.old_DBSCAN_eps[dk.pair] = dk.data['DBSCAN_eps']
if ft_params.get('inlier_metric_window', 0):
dk.compute_inlier_metric(set_='train')
if self.freqai_info["data_split_parameters"]["test_size"] > 0:
dk.compute_inlier_metric(set_='test')
def data_cleaning_predict(self, dk: FreqaiDataKitchen, dataframe: DataFrame) -> None: def data_cleaning_predict(self, dk: FreqaiDataKitchen, dataframe: DataFrame) -> None:
""" """
Base data cleaning method for predict. Base data cleaning method for predict.
@ -433,18 +442,23 @@ class IFreqaiModel(ABC):
of how the do_predict vector is modified. do_predict is ultimately passed back to strategy of how the do_predict vector is modified. do_predict is ultimately passed back to strategy
for buy signals. for buy signals.
""" """
if self.freqai_info["feature_parameters"].get( ft_params = self.freqai_info["feature_parameters"]
if ft_params.get('inlier_metric_window', 0):
dk.compute_inlier_metric(set_='predict')
if ft_params.get(
"principal_component_analysis", False "principal_component_analysis", False
): ):
dk.pca_transform(dataframe) dk.pca_transform(dataframe)
if self.freqai_info["feature_parameters"].get("use_SVM_to_remove_outliers", False): if ft_params.get("use_SVM_to_remove_outliers", False):
dk.use_SVM_to_remove_outliers(predict=True) dk.use_SVM_to_remove_outliers(predict=True)
if self.freqai_info["feature_parameters"].get("DI_threshold", 0): if ft_params.get("DI_threshold", 0):
dk.check_if_pred_in_training_spaces() dk.check_if_pred_in_training_spaces()
if self.freqai_info["feature_parameters"].get("use_DBSCAN_to_remove_outliers", False): if ft_params.get("use_DBSCAN_to_remove_outliers", False):
dk.use_DBSCAN_to_remove_outliers(predict=True) dk.use_DBSCAN_to_remove_outliers(predict=True)
def model_exists( def model_exists(