integrate inlier metric function
@@ -654,81 +654,104 @@ class FreqaiDataKitchen:
             )
 
         return
-
-    def compute_inlier_metric(self) -> None:
+
+    def compute_inlier_metric(self, set_='train') -> None:
         """
-
-        Compute inlier metric from backwards distance distributions.
-        This metric defines how well features from a timepoint fit
+
+        Compute inlier metric from backwards distance distributions.
+        This metric defines how well features from a timepoint fit
         into previous timepoints.
         """
 
         import scipy.stats as ss
-
-        nmb_previous_points = self.data['InlierMetric_nmb_points']
-        weibull_percentile = self.data['InlierMetric_weib_perc']
-
-        train_ft_df = self.data_dictionary['train_features']
-        train_ft_df_reindexed = train_ft_df.reindex(
-            index=np.flip(train_ft_df.index)
+
+        no_prev_pts = self.freqai_config["feature_parameters"]["inlier_metric_window"]
+        weib_pct = self.freqai_config["feature_parameters"]["inlier_metric_weibull_cutoff"]
+
+        if set_ == 'train':
+            compute_df = copy.deepcopy(self.data_dictionary['train_features'])
+        elif set_ == 'test':
+            compute_df = copy.deepcopy(self.data_dictionary['test_features'])
+        else:
+            compute_df = copy.deepcopy(self.data_dictionary['prediction_features'])
+
+        compute_df_reindexed = compute_df.reindex(
+            index=np.flip(compute_df.index)
         )
 
         pairwise = pd.DataFrame(
             np.triu(
-                pairwise_distances(train_ft_df_reindexed, n_jobs=self.thread_count)
+                pairwise_distances(compute_df_reindexed, n_jobs=self.thread_count)
             ),
-            columns=train_ft_df_reindexed.index,
-            index=train_ft_df_reindexed.index
+            columns=compute_df_reindexed.index,
+            index=compute_df_reindexed.index
         )
         pairwise = pairwise.round(5)
 
         column_labels = [
-            '{}{}'.format('d', i) for i in range(1, nmb_previous_points+1)
+            '{}{}'.format('d', i) for i in range(1, no_prev_pts + 1)
         ]
         distances = pd.DataFrame(
-            columns=column_labels, index=train_ft_df.index
+            columns=column_labels, index=compute_df.index
         )
-        for index in train_ft_df.index[nmb_previous_points]:
+
+        for index in compute_df.index[no_prev_pts:]:
             current_row = pairwise.loc[[index]]
             current_row_no_zeros = current_row.loc[
-                :, (current_row!=0).any(axis=0)
+                :, (current_row != 0).any(axis=0)
             ]
             distances.loc[[index]] = current_row_no_zeros.iloc[
-                :, :nmb_previous_points
+                :, :no_prev_pts
             ]
         distances = distances.replace([np.inf, -np.inf], np.nan)
         drop_index = pd.isnull(distances).any(1)
-        distances = distances[drop_index==0]
+        distances = distances[drop_index == 0]
 
         inliers = pd.DataFrame(index=distances.index)
         for key in distances.keys():
             current_distances = distances[key].dropna()
             fit_params = ss.weibull_min.fit(current_distances)
-            cutoff = ss.weibull_min.ppf(weibull_percentile, *fit_params)
+            cutoff = ss.weibull_min.ppf(weib_pct, *fit_params)
             is_inlier = np.where(
-                current_distances<=cutoff, 1, 0
+                current_distances <= cutoff, 1, 0
             )
             df_inlier = pd.DataFrame(
-                {key+'_IsInlier':is_inlier}, index=distances.index
+                {key + '_IsInlier': is_inlier}, index=distances.index
             )
             inliers = pd.concat(
                 [inliers, df_inlier], axis=1
             )
 
-        self.data_dictionary['train_features'] = pd.DataFrame(
-            data=inliers.sum(axis=1)/nmb_previous_points,
+        inlier_metric = pd.DataFrame(
+            data=inliers.sum(axis=1) / no_prev_pts,
             columns=['inlier_metric'],
-            index = train_ft_df.index
+            index=compute_df.index
         )
 
-        percent_outliers = np.round(
-            100*(1-self.data_dictionary['iniler_metric'].sum()/
-            len(train_ft_df.index)), 2
-        )
-        logger.info('{percent_outliers}%% of data points were identified as outliers')
+        inlier_metric = 2 * (inlier_metric - inlier_metric.min()) / \
+            (inlier_metric.max() - inlier_metric.min()) - 1
+
+        if set_ in ('train', 'test'):
+            inlier_metric = inlier_metric.iloc[no_prev_pts:]
+            compute_df = compute_df.iloc[no_prev_pts:]
+            self.remove_beginning_points_from_data_dict(set_, no_prev_pts)
+            self.data_dictionary[f'{set_}_features'] = pd.concat(
+                [compute_df, inlier_metric], axis=1)
+        else:
+            self.data_dictionary['prediction_features'] = pd.concat(
+                [compute_df, inlier_metric], axis=1)
+            self.data_dictionary['prediction_features'].fillna(0, inplace=True)
 
         return None
 
+    def remove_beginning_points_from_data_dict(self, set_='train', no_prev_pts: int = 10):
+        features = self.data_dictionary[f'{set_}_features']
+        weights = self.data_dictionary[f'{set_}_weights']
+        labels = self.data_dictionary[f'{set_}_labels']
+        self.data_dictionary[f'{set_}_weights'] = weights[no_prev_pts:]
+        self.data_dictionary[f'{set_}_features'] = features.iloc[no_prev_pts:]
+        self.data_dictionary[f'{set_}_labels'] = labels.iloc[no_prev_pts:]
+
     def find_features(self, dataframe: DataFrame) -> None:
         """
         Find features in the strategy provided dataframe
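The new `compute_inlier_metric` builds a backwards distance table, fits a Weibull distribution to each backward-step column, and records the share of distances that fall below the chosen percentile cutoff. For readers who want to try the idea in isolation, here is a minimal, self-contained sketch of the same technique; the function name `inlier_metric_sketch` and its parameters are illustrative only and are not FreqAI API.

```python
# Minimal sketch of the inlier-metric idea (illustrative names, not FreqAI API):
# for each row, measure distances to the previous `window` rows, fit a Weibull
# distribution per backward step, and count how many distances sit below the
# chosen percentile cutoff.
import numpy as np
import pandas as pd
import scipy.stats as ss
from sklearn.metrics import pairwise_distances


def inlier_metric_sketch(features: pd.DataFrame, window: int = 10,
                         weibull_cutoff: float = 0.9) -> pd.Series:
    dist = pairwise_distances(features.values)  # full pairwise distance matrix

    # distances["d<j>"] at row i = distance from row i to the row j steps back
    cols = [f"d{j}" for j in range(1, window + 1)]
    distances = pd.DataFrame(index=features.index, columns=cols, dtype=float)
    for i in range(window, len(features)):
        distances.iloc[i] = [dist[i, i - j] for j in range(1, window + 1)]
    distances = distances.dropna()  # drops the warm-up rows with no full window

    # One Weibull fit per backward step; a distance below the percentile cutoff
    # counts as an inlier for that step.
    inlier_flags = pd.DataFrame(index=distances.index)
    for col in cols:
        fit_params = ss.weibull_min.fit(distances[col])
        cutoff = ss.weibull_min.ppf(weibull_cutoff, *fit_params)
        inlier_flags[col] = np.where(distances[col] <= cutoff, 1, 0)

    # Fraction of backward steps judged "inlier", rescaled to [-1, 1] as in the commit.
    metric = inlier_flags.sum(axis=1) / window
    return 2 * (metric - metric.min()) / (metric.max() - metric.min()) - 1


# Example usage on random data:
# df = pd.DataFrame(np.random.rand(200, 5))
# print(inlier_metric_sketch(df, window=10, weibull_cutoff=0.9).tail())
```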
@@ -66,7 +66,6 @@ class IFreqaiModel(ABC):
             "data_split_parameters", {})
         self.model_training_parameters: Dict[str, Any] = config.get("freqai", {}).get(
             "model_training_parameters", {})
-        self.feature_parameters = config.get("freqai", {}).get("feature_parameters")
         self.retrain = False
         self.first = True
         self.set_full_path()
@@ -74,11 +73,14 @@ class IFreqaiModel(ABC):
         self.dd = FreqaiDataDrawer(Path(self.full_path), self.config, self.follow_mode)
         self.identifier: str = self.freqai_info.get("identifier", "no_id_provided")
         self.scanning = False
+        self.ft_params = self.freqai_info["feature_parameters"]
         self.keras: bool = self.freqai_info.get("keras", False)
-        if self.keras and self.freqai_info.get("feature_parameters", {}).get("DI_threshold", 0):
-            self.freqai_info["feature_parameters"]["DI_threshold"] = 0
+        if self.keras and self.ft_params.get("DI_threshold", 0):
+            self.ft_params["DI_threshold"] = 0
             logger.warning("DI threshold is not configured for Keras models yet. Deactivating.")
         self.CONV_WIDTH = self.freqai_info.get("conv_width", 2)
+        if self.ft_params.get("inlier_metric_window", 0):
+            self.CONV_WIDTH = self.ft_params.get("inlier_metric_window", 0) * 2
         self.pair_it = 0
         self.total_pairs = len(self.config.get("exchange", {}).get("pair_whitelist"))
         self.last_trade_database_summary: DataFrame = {}
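For orientation, the keys the new code reads all live under `freqai.feature_parameters`, and when `inlier_metric_window` is set, `CONV_WIDTH` is set to twice that window instead of the `conv_width` default. The excerpt below is a hedged illustration only; the values and the surrounding structure are examples, not a canonical FreqAI configuration.

```python
# Hedged illustration of the config keys touched by this commit; values are examples.
freqai_config_excerpt = {
    "freqai": {
        "conv_width": 2,                          # base CONV_WIDTH when no inlier metric is used
        "data_split_parameters": {"test_size": 0.25},
        "feature_parameters": {
            "inlier_metric_window": 10,           # backward window used by compute_inlier_metric
            "inlier_metric_weibull_cutoff": 0.9,  # percentile cutoff for the Weibull fit
            "DI_threshold": 0,
            "principal_component_analysis": False,
            "use_SVM_to_remove_outliers": False,
            "use_DBSCAN_to_remove_outliers": False,
        },
    }
}

# With inlier_metric_window set, __init__ above yields:
# CONV_WIDTH = inlier_metric_window * 2  ->  20 in this example.
```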
@@ -389,18 +391,20 @@ class IFreqaiModel(ABC):
         example of how outlier data points are dropped from the dataframe used for training.
         """
 
-        if self.freqai_info["feature_parameters"].get(
+        ft_params = self.freqai_info["feature_parameters"]
+
+        if ft_params.get(
             "principal_component_analysis", False
         ):
             dk.principal_component_analysis()
 
-        if self.freqai_info["feature_parameters"].get("use_SVM_to_remove_outliers", False):
+        if ft_params.get("use_SVM_to_remove_outliers", False):
             dk.use_SVM_to_remove_outliers(predict=False)
 
-        if self.freqai_info["feature_parameters"].get("DI_threshold", 0):
+        if ft_params.get("DI_threshold", 0):
             dk.data["avg_mean_dist"] = dk.compute_distances()
 
-        if self.freqai_info["feature_parameters"].get("use_DBSCAN_to_remove_outliers", False):
+        if ft_params.get("use_DBSCAN_to_remove_outliers", False):
             if dk.pair in self.dd.old_DBSCAN_eps:
                 eps = self.dd.old_DBSCAN_eps[dk.pair]
             else:
@@ -408,6 +412,11 @@ class IFreqaiModel(ABC):
             dk.use_DBSCAN_to_remove_outliers(predict=False, eps=eps)
             self.dd.old_DBSCAN_eps[dk.pair] = dk.data['DBSCAN_eps']
 
+        if ft_params.get('inlier_metric_window', 0):
+            dk.compute_inlier_metric(set_='train')
+            if self.freqai_info["data_split_parameters"]["test_size"] > 0:
+                dk.compute_inlier_metric(set_='test')
+
     def data_cleaning_predict(self, dk: FreqaiDataKitchen, dataframe: DataFrame) -> None:
         """
         Base data cleaning method for predict.
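`data_cleaning_train` now computes the metric for the train split and, when a test split exists, for the test split as well. Because the first `inlier_metric_window` rows have no complete backward window, they are dropped from the features, and `remove_beginning_points_from_data_dict` trims labels and weights by the same amount. A small sketch of that row-alignment invariant, with made-up data and illustrative names:

```python
# Sketch of the alignment invariant behind remove_beginning_points_from_data_dict:
# dropping the warm-up rows from features must be mirrored in labels and weights.
import numpy as np
import pandas as pd

no_prev_pts = 10  # example value of inlier_metric_window
features = pd.DataFrame(np.random.rand(100, 5))
labels = pd.DataFrame(np.random.rand(100, 1))
weights = np.ones(100)

features = features.iloc[no_prev_pts:]
labels = labels.iloc[no_prev_pts:]
weights = weights[no_prev_pts:]

assert len(features) == len(labels) == len(weights)  # still row-aligned after trimming
```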
@@ -419,18 +428,23 @@ class IFreqaiModel(ABC):
         of how the do_predict vector is modified. do_predict is ultimately passed back to strategy
         for buy signals.
         """
-        if self.freqai_info["feature_parameters"].get(
+        ft_params = self.freqai_info["feature_parameters"]
+
+        if ft_params.get('inlier_metric_window', 0):
+            dk.compute_inlier_metric(set_='predict')
+
+        if ft_params.get(
             "principal_component_analysis", False
         ):
             dk.pca_transform(dataframe)
 
-        if self.freqai_info["feature_parameters"].get("use_SVM_to_remove_outliers", False):
+        if ft_params.get("use_SVM_to_remove_outliers", False):
             dk.use_SVM_to_remove_outliers(predict=True)
 
-        if self.freqai_info["feature_parameters"].get("DI_threshold", 0):
+        if ft_params.get("DI_threshold", 0):
             dk.check_if_pred_in_training_spaces()
 
-        if self.freqai_info["feature_parameters"].get("use_DBSCAN_to_remove_outliers", False):
+        if ft_params.get("use_DBSCAN_to_remove_outliers", False):
             dk.use_DBSCAN_to_remove_outliers(predict=True)
 
     def model_exists(
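On the predict path the warm-up rows are kept rather than dropped: their metric stays NaN and is filled with 0, which sits at the midpoint of the [-1, 1] range produced by the min-max rescaling in `compute_inlier_metric`. A tiny numeric illustration of that rescaling (the input fractions are made up):

```python
# Tiny numeric illustration of the [-1, 1] rescaling used by compute_inlier_metric
# (the fractions below are made-up example values, not real output).
import pandas as pd

inlier_fraction = pd.Series([0.2, 0.5, 1.0])   # share of backward distances judged inliers
rescaled = 2 * (inlier_fraction - inlier_fraction.min()) / \
    (inlier_fraction.max() - inlier_fraction.min()) - 1
print(rescaled.tolist())   # [-1.0, -0.25, 1.0]

# Prediction rows without enough history have no metric (NaN) and are filled
# with 0 on the predict path, i.e. the midpoint of the rescaled range.
```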