integrate inlier metric function
This commit is contained in:
parent
d3cb211283
commit
b11742a4c5
@ -723,81 +723,104 @@ class FreqaiDataKitchen:
|
|||||||
)
|
)
|
||||||
|
|
||||||
return
|
return
|
||||||
|
|
||||||
def compute_inlier_metric(self, set_='train') -> None:
    """
    Compute inlier metric from backwards distance distributions.
    This metric defines how well features from a timepoint fit
    into previous timepoints.

    For every row, the distances to the `inlier_metric_window` preceding rows
    are collected. For each of those "look-back" positions a Weibull
    distribution is fitted over all rows, and a distance counts as an inlier
    when it is below the `inlier_metric_weibull_cutoff` percentile of that fit.
    The metric is the fraction of inlier distances per row, rescaled to
    [-1, 1], and is appended to the feature frame as column 'inlier_metric'.

    :param set_: 'train' or 'test' select the matching '<set_>_features'
        entry of self.data_dictionary; any other value (callers pass
        'predict') selects 'prediction_features'.

    Side effects:
    - train/test: the first `inlier_metric_window` rows are removed from the
      set's features, labels and weights (no look-back exists for them) and
      the features entry is replaced by features + 'inlier_metric'.
    - otherwise: 'prediction_features' is replaced by features +
      'inlier_metric', with every NaN filled with 0.
    """
    # Kept as a function-scope import, as in the original implementation.
    import scipy.stats as ss

    ft_params = self.freqai_config["feature_parameters"]
    no_prev_pts = ft_params["inlier_metric_window"]
    weib_pct = ft_params["inlier_metric_weibull_cutoff"]

    is_fit_set = set_ in ('train', 'test')
    features_key = f'{set_}_features' if is_fit_set else 'prediction_features'
    # Work on a deep copy so the stored frame is only replaced at the very end.
    compute_df = self.data_dictionary[features_key].copy()

    # Reverse the row order before computing distances: combined with the
    # upper-triangular mask below, each row then keeps non-zero distances only
    # to rows that come *before* it in the original order, nearest first.
    # NOTE(review): this assumes the index is in chronological order - confirm.
    compute_df_reindexed = compute_df.reindex(index=np.flip(compute_df.index))

    pairwise = pd.DataFrame(
        np.triu(pairwise_distances(compute_df_reindexed, n_jobs=self.thread_count)),
        columns=compute_df_reindexed.index,
        index=compute_df_reindexed.index,
    ).round(5)

    column_labels = [f'd{i}' for i in range(1, no_prev_pts + 1)]
    distances = pd.DataFrame(columns=column_labels, index=compute_df.index)

    # The first `no_prev_pts` rows have too few predecessors and stay NaN.
    for index in compute_df.index[no_prev_pts:]:
        current_row = pairwise.loc[[index]]
        # Zeros are the masked lower triangle / the self-distance.
        current_row_no_zeros = current_row.loc[:, (current_row != 0).any(axis=0)]
        distances.loc[[index]] = current_row_no_zeros.iloc[:, :no_prev_pts]

    distances = distances.replace([np.inf, -np.inf], np.nan)
    # `axis` must be passed by keyword: positional use is rejected by pandas 2.x.
    drop_index = pd.isnull(distances).any(axis=1)
    distances = distances[~drop_index]

    # One 0/1 column per look-back position, built once instead of
    # concatenating a new frame on every iteration.
    inlier_flags = {}
    for key in distances.columns:
        current_distances = distances[key].dropna()
        fit_params = ss.weibull_min.fit(current_distances)
        cutoff = ss.weibull_min.ppf(weib_pct, *fit_params)
        inlier_flags[f'{key}_IsInlier'] = np.where(current_distances <= cutoff, 1, 0)
    inliers = pd.DataFrame(inlier_flags, index=distances.index)

    # Re-indexing onto the full index leaves NaN for rows dropped above.
    inlier_metric = pd.DataFrame(
        data=inliers.sum(axis=1) / no_prev_pts,
        columns=['inlier_metric'],
        index=compute_df.index,
    )

    # Min-max rescale to [-1, 1].
    # NOTE(review): if every row has the same metric, max == min and this
    # yields NaN (0 / 0); only the prediction branch below fills NaN values.
    metric_min = inlier_metric.min()
    metric_max = inlier_metric.max()
    inlier_metric = 2 * (inlier_metric - metric_min) / (metric_max - metric_min) - 1

    if is_fit_set:
        inlier_metric = inlier_metric.iloc[no_prev_pts:]
        compute_df = compute_df.iloc[no_prev_pts:]
        # Trims features, labels and weights of this set by the same amount;
        # the features entry is overwritten right after with the merged frame.
        self.remove_beginning_points_from_data_dict(set_, no_prev_pts)
        self.data_dictionary[features_key] = pd.concat(
            [compute_df, inlier_metric], axis=1)
    else:
        self.data_dictionary['prediction_features'] = pd.concat(
            [compute_df, inlier_metric], axis=1)
        # Rows are never dropped at prediction time; rows without a metric get 0.
        self.data_dictionary['prediction_features'].fillna(0, inplace=True)
|
||||||
|
|
||||||
|
def remove_beginning_points_from_data_dict(self, set_='train', no_prev_pts: int = 10) -> None:
    """
    Drop the first `no_prev_pts` entries from one data set in self.data_dictionary.

    The '<set_>_features', '<set_>_weights' and '<set_>_labels' entries are
    each replaced by a copy/view that starts at position `no_prev_pts`.

    :param set_: prefix of the entries to trim, e.g. 'train' or 'test'.
    :param no_prev_pts: number of leading entries to remove.
    :raises KeyError: if any of the three entries is missing; all three are
        looked up before anything is written, so nothing is modified then.
    """
    features_key = f'{set_}_features'
    weights_key = f'{set_}_weights'
    labels_key = f'{set_}_labels'

    # Read everything first so a missing key cannot leave the set half-trimmed.
    features = self.data_dictionary[features_key]
    weights = self.data_dictionary[weights_key]
    labels = self.data_dictionary[labels_key]

    # Weights use a plain slice rather than .iloc
    # (presumably a numpy array rather than a pandas object - confirm).
    self.data_dictionary[weights_key] = weights[no_prev_pts:]
    self.data_dictionary[features_key] = features.iloc[no_prev_pts:]
    self.data_dictionary[labels_key] = labels.iloc[no_prev_pts:]
|
||||||
|
|
||||||
def find_features(self, dataframe: DataFrame) -> None:
|
def find_features(self, dataframe: DataFrame) -> None:
|
||||||
"""
|
"""
|
||||||
Find features in the strategy provided dataframe
|
Find features in the strategy provided dataframe
|
||||||
|
@ -66,7 +66,6 @@ class IFreqaiModel(ABC):
|
|||||||
"data_split_parameters", {})
|
"data_split_parameters", {})
|
||||||
self.model_training_parameters: Dict[str, Any] = config.get("freqai", {}).get(
|
self.model_training_parameters: Dict[str, Any] = config.get("freqai", {}).get(
|
||||||
"model_training_parameters", {})
|
"model_training_parameters", {})
|
||||||
self.feature_parameters = config.get("freqai", {}).get("feature_parameters")
|
|
||||||
self.retrain = False
|
self.retrain = False
|
||||||
self.first = True
|
self.first = True
|
||||||
self.set_full_path()
|
self.set_full_path()
|
||||||
@ -74,11 +73,14 @@ class IFreqaiModel(ABC):
|
|||||||
self.dd = FreqaiDataDrawer(Path(self.full_path), self.config, self.follow_mode)
|
self.dd = FreqaiDataDrawer(Path(self.full_path), self.config, self.follow_mode)
|
||||||
self.identifier: str = self.freqai_info.get("identifier", "no_id_provided")
|
self.identifier: str = self.freqai_info.get("identifier", "no_id_provided")
|
||||||
self.scanning = False
|
self.scanning = False
|
||||||
|
self.ft_params = self.freqai_info["feature_parameters"]
|
||||||
self.keras: bool = self.freqai_info.get("keras", False)
|
self.keras: bool = self.freqai_info.get("keras", False)
|
||||||
if self.keras and self.freqai_info.get("feature_parameters", {}).get("DI_threshold", 0):
|
if self.keras and self.ft_params.get("DI_threshold", 0):
|
||||||
self.freqai_info["feature_parameters"]["DI_threshold"] = 0
|
self.ft_params["DI_threshold"] = 0
|
||||||
logger.warning("DI threshold is not configured for Keras models yet. Deactivating.")
|
logger.warning("DI threshold is not configured for Keras models yet. Deactivating.")
|
||||||
self.CONV_WIDTH = self.freqai_info.get("conv_width", 2)
|
self.CONV_WIDTH = self.freqai_info.get("conv_width", 2)
|
||||||
|
if self.ft_params.get("inlier_metric_window", 0):
|
||||||
|
self.CONV_WIDTH = self.ft_params.get("inlier_metric_window", 0) * 2
|
||||||
self.pair_it = 0
|
self.pair_it = 0
|
||||||
self.pair_it_train = 0
|
self.pair_it_train = 0
|
||||||
self.total_pairs = len(self.config.get("exchange", {}).get("pair_whitelist"))
|
self.total_pairs = len(self.config.get("exchange", {}).get("pair_whitelist"))
|
||||||
@ -403,18 +405,20 @@ class IFreqaiModel(ABC):
|
|||||||
example of how outlier data points are dropped from the dataframe used for training.
|
example of how outlier data points are dropped from the dataframe used for training.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if self.freqai_info["feature_parameters"].get(
|
ft_params = self.freqai_info["feature_parameters"]
|
||||||
|
|
||||||
|
if ft_params.get(
|
||||||
"principal_component_analysis", False
|
"principal_component_analysis", False
|
||||||
):
|
):
|
||||||
dk.principal_component_analysis()
|
dk.principal_component_analysis()
|
||||||
|
|
||||||
if self.freqai_info["feature_parameters"].get("use_SVM_to_remove_outliers", False):
|
if ft_params.get("use_SVM_to_remove_outliers", False):
|
||||||
dk.use_SVM_to_remove_outliers(predict=False)
|
dk.use_SVM_to_remove_outliers(predict=False)
|
||||||
|
|
||||||
if self.freqai_info["feature_parameters"].get("DI_threshold", 0):
|
if ft_params.get("DI_threshold", 0):
|
||||||
dk.data["avg_mean_dist"] = dk.compute_distances()
|
dk.data["avg_mean_dist"] = dk.compute_distances()
|
||||||
|
|
||||||
if self.freqai_info["feature_parameters"].get("use_DBSCAN_to_remove_outliers", False):
|
if ft_params.get("use_DBSCAN_to_remove_outliers", False):
|
||||||
if dk.pair in self.dd.old_DBSCAN_eps:
|
if dk.pair in self.dd.old_DBSCAN_eps:
|
||||||
eps = self.dd.old_DBSCAN_eps[dk.pair]
|
eps = self.dd.old_DBSCAN_eps[dk.pair]
|
||||||
else:
|
else:
|
||||||
@ -422,6 +426,11 @@ class IFreqaiModel(ABC):
|
|||||||
dk.use_DBSCAN_to_remove_outliers(predict=False, eps=eps)
|
dk.use_DBSCAN_to_remove_outliers(predict=False, eps=eps)
|
||||||
self.dd.old_DBSCAN_eps[dk.pair] = dk.data['DBSCAN_eps']
|
self.dd.old_DBSCAN_eps[dk.pair] = dk.data['DBSCAN_eps']
|
||||||
|
|
||||||
|
if ft_params.get('inlier_metric_window', 0):
|
||||||
|
dk.compute_inlier_metric(set_='train')
|
||||||
|
if self.freqai_info["data_split_parameters"]["test_size"] > 0:
|
||||||
|
dk.compute_inlier_metric(set_='test')
|
||||||
|
|
||||||
def data_cleaning_predict(self, dk: FreqaiDataKitchen, dataframe: DataFrame) -> None:
|
def data_cleaning_predict(self, dk: FreqaiDataKitchen, dataframe: DataFrame) -> None:
|
||||||
"""
|
"""
|
||||||
Base data cleaning method for predict.
|
Base data cleaning method for predict.
|
||||||
@ -433,18 +442,23 @@ class IFreqaiModel(ABC):
|
|||||||
of how the do_predict vector is modified. do_predict is ultimately passed back to strategy
|
of how the do_predict vector is modified. do_predict is ultimately passed back to strategy
|
||||||
for buy signals.
|
for buy signals.
|
||||||
"""
|
"""
|
||||||
if self.freqai_info["feature_parameters"].get(
|
ft_params = self.freqai_info["feature_parameters"]
|
||||||
|
|
||||||
|
if ft_params.get('inlier_metric_window', 0):
|
||||||
|
dk.compute_inlier_metric(set_='predict')
|
||||||
|
|
||||||
|
if ft_params.get(
|
||||||
"principal_component_analysis", False
|
"principal_component_analysis", False
|
||||||
):
|
):
|
||||||
dk.pca_transform(dataframe)
|
dk.pca_transform(dataframe)
|
||||||
|
|
||||||
if self.freqai_info["feature_parameters"].get("use_SVM_to_remove_outliers", False):
|
if ft_params.get("use_SVM_to_remove_outliers", False):
|
||||||
dk.use_SVM_to_remove_outliers(predict=True)
|
dk.use_SVM_to_remove_outliers(predict=True)
|
||||||
|
|
||||||
if self.freqai_info["feature_parameters"].get("DI_threshold", 0):
|
if ft_params.get("DI_threshold", 0):
|
||||||
dk.check_if_pred_in_training_spaces()
|
dk.check_if_pred_in_training_spaces()
|
||||||
|
|
||||||
if self.freqai_info["feature_parameters"].get("use_DBSCAN_to_remove_outliers", False):
|
if ft_params.get("use_DBSCAN_to_remove_outliers", False):
|
||||||
dk.use_DBSCAN_to_remove_outliers(predict=True)
|
dk.use_DBSCAN_to_remove_outliers(predict=True)
|
||||||
|
|
||||||
def model_exists(
|
def model_exists(
|
||||||
|
Loading…
Reference in New Issue
Block a user