From 52ee7fc981abf2efc153b52de5dcb151de636744 Mon Sep 17 00:00:00 2001
From: th0rntwig <tornquist.elin@gmail.com>
Date: Thu, 18 Aug 2022 14:44:49 +0200
Subject: [PATCH 1/3] Add inlier metric computation

---
 freqtrade/freqai/data_kitchen.py | 74 ++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)

diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py
index 35f51baed..7a885659d 100644
--- a/freqtrade/freqai/data_kitchen.py
+++ b/freqtrade/freqai/data_kitchen.py
@@ -654,6 +654,80 @@ class FreqaiDataKitchen:
             )
 
         return
+        
+    def compute_inlier_metric(self) -> None:
+        """
+        
+        Compute inlier metric from backwards distance distributions. 
+        This metric defines how well features from a timepoint fit 
+        into previous timepoints.
+        """
+
+        import scipy.stats as ss
+    
+        nmb_previous_points = self.data['InlierMetric_nmb_points']
+        weibull_percentile = self.data['InlierMetric_weib_perc']
+
+        train_ft_df = self.data_dictionary['train_features']
+        train_ft_df_reindexed = train_ft_df.reindex(
+            index=np.flip(train_ft_df.index) 
+        )
+
+        pairwise = pd.DataFrame(
+            np.triu(
+                pairwise_distances(train_ft_df_reindexed, n_jobs=self.thread_count)
+            ),
+            columns=train_ft_df_reindexed.index,
+            index=train_ft_df_reindexed.index
+        )
+        pairwise = pairwise.round(5)
+
+        column_labels = [
+            '{}{}'.format('d', i) for i in range(1, nmb_previous_points+1)
+        ]
+        distances = pd.DataFrame(
+            columns=column_labels, index=train_ft_df.index
+        )
+        for index in train_ft_df.index[nmb_previous_points]:
+            current_row = pairwise.loc[[index]]
+            current_row_no_zeros = current_row.loc[
+                :, (current_row!=0).any(axis=0)
+            ]
+            distances.loc[[index]] = current_row_no_zeros.iloc[
+                :, :nmb_previous_points
+            ]
+        distances = distances.replace([np.inf, -np.inf], np.nan)
+        drop_index = pd.isnull(distances).any(1)
+        distances = distances[drop_index==0]
+
+        inliers = pd.DataFrame(index=distances.index)
+        for key in distances.keys():
+            current_distances = distances[key].dropna()
+            fit_params = ss.weibull_min.fit(current_distances)
+            cutoff = ss.weibull_min.ppf(weibull_percentile, *fit_params)
+            is_inlier = np.where(
+                current_distances<=cutoff, 1, 0
+            )
+            df_inlier = pd.DataFrame(
+                {key+'_IsInlier':is_inlier}, index=distances.index
+            )
+            inliers = pd.concat(
+                [inliers, df_inlier], axis=1
+            )
+
+        self.data_dictionary['train_features'] = pd.DataFrame(
+            data=inliers.sum(axis=1)/nmb_previous_points,
+            columns=['inlier_metric'],
+            index = train_ft_df.index
+        )
+
+        percent_outliers = np.round(
+            100*(1-self.data_dictionary['iniler_metric'].sum()/
+            len(train_ft_df.index)), 2
+        )
+        logger.info('{percent_outliers}%% of data points were identified as outliers')
+
+        return None
 
     def find_features(self, dataframe: DataFrame) -> None:
         """

From 98c62dad910ac74a8579e099d1a07e4cc5b0180c Mon Sep 17 00:00:00 2001
From: robcaulk <rob.caulk@gmail.com>
Date: Thu, 18 Aug 2022 19:15:29 +0200
Subject: [PATCH 2/3] integrate inlier metric function

---
 freqtrade/freqai/data_kitchen.py     | 85 ++++++++++++++++++----------
 freqtrade/freqai/freqai_interface.py | 36 ++++++++----
 2 files changed, 79 insertions(+), 42 deletions(-)

diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py
index 7a885659d..ca4687902 100644
--- a/freqtrade/freqai/data_kitchen.py
+++ b/freqtrade/freqai/data_kitchen.py
@@ -654,81 +654,104 @@ class FreqaiDataKitchen:
             )
 
         return
-        
-    def compute_inlier_metric(self) -> None:
+
+    def compute_inlier_metric(self, set_='train') -> None:
         """
-        
-        Compute inlier metric from backwards distance distributions. 
-        This metric defines how well features from a timepoint fit 
+
+        Compute inlier metric from backwards distance distributions.
+        This metric defines how well features from a timepoint fit
         into previous timepoints.
         """
 
         import scipy.stats as ss
-    
-        nmb_previous_points = self.data['InlierMetric_nmb_points']
-        weibull_percentile = self.data['InlierMetric_weib_perc']
 
-        train_ft_df = self.data_dictionary['train_features']
-        train_ft_df_reindexed = train_ft_df.reindex(
-            index=np.flip(train_ft_df.index) 
+        no_prev_pts = self.freqai_config["feature_parameters"]["inlier_metric_window"]
+        weib_pct = self.freqai_config["feature_parameters"]["inlier_metric_weibull_cutoff"]
+
+        if set_ == 'train':
+            compute_df = copy.deepcopy(self.data_dictionary['train_features'])
+        elif set_ == 'test':
+            compute_df = copy.deepcopy(self.data_dictionary['test_features'])
+        else:
+            compute_df = copy.deepcopy(self.data_dictionary['prediction_features'])
+
+        compute_df_reindexed = compute_df.reindex(
+            index=np.flip(compute_df.index)
         )
 
         pairwise = pd.DataFrame(
             np.triu(
-                pairwise_distances(train_ft_df_reindexed, n_jobs=self.thread_count)
+                pairwise_distances(compute_df_reindexed, n_jobs=self.thread_count)
             ),
-            columns=train_ft_df_reindexed.index,
-            index=train_ft_df_reindexed.index
+            columns=compute_df_reindexed.index,
+            index=compute_df_reindexed.index
         )
         pairwise = pairwise.round(5)
 
         column_labels = [
-            '{}{}'.format('d', i) for i in range(1, nmb_previous_points+1)
+            '{}{}'.format('d', i) for i in range(1, no_prev_pts + 1)
         ]
         distances = pd.DataFrame(
-            columns=column_labels, index=train_ft_df.index
+            columns=column_labels, index=compute_df.index
         )
-        for index in train_ft_df.index[nmb_previous_points]:
+
+        for index in compute_df.index[no_prev_pts:]:
             current_row = pairwise.loc[[index]]
             current_row_no_zeros = current_row.loc[
-                :, (current_row!=0).any(axis=0)
+                :, (current_row != 0).any(axis=0)
             ]
             distances.loc[[index]] = current_row_no_zeros.iloc[
-                :, :nmb_previous_points
+                :, :no_prev_pts
             ]
         distances = distances.replace([np.inf, -np.inf], np.nan)
         drop_index = pd.isnull(distances).any(1)
-        distances = distances[drop_index==0]
+        distances = distances[drop_index == 0]
 
         inliers = pd.DataFrame(index=distances.index)
         for key in distances.keys():
             current_distances = distances[key].dropna()
             fit_params = ss.weibull_min.fit(current_distances)
-            cutoff = ss.weibull_min.ppf(weibull_percentile, *fit_params)
+            cutoff = ss.weibull_min.ppf(weib_pct, *fit_params)
             is_inlier = np.where(
-                current_distances<=cutoff, 1, 0
+                current_distances <= cutoff, 1, 0
             )
             df_inlier = pd.DataFrame(
-                {key+'_IsInlier':is_inlier}, index=distances.index
+                {key + '_IsInlier': is_inlier}, index=distances.index
             )
             inliers = pd.concat(
                 [inliers, df_inlier], axis=1
             )
 
-        self.data_dictionary['train_features'] = pd.DataFrame(
-            data=inliers.sum(axis=1)/nmb_previous_points,
+        inlier_metric = pd.DataFrame(
+            data=inliers.sum(axis=1) / no_prev_pts,
             columns=['inlier_metric'],
-            index = train_ft_df.index
+            index=compute_df.index
         )
 
-        percent_outliers = np.round(
-            100*(1-self.data_dictionary['iniler_metric'].sum()/
-            len(train_ft_df.index)), 2
-        )
-        logger.info('{percent_outliers}%% of data points were identified as outliers')
+        inlier_metric = 2 * (inlier_metric - inlier_metric.min()) / \
+            (inlier_metric.max() - inlier_metric.min()) - 1
+
+        if set_ in ('train', 'test'):
+            inlier_metric = inlier_metric.iloc[no_prev_pts:]
+            compute_df = compute_df.iloc[no_prev_pts:]
+            self.remove_beginning_points_from_data_dict(set_, no_prev_pts)
+            self.data_dictionary[f'{set_}_features'] = pd.concat(
+                [compute_df, inlier_metric], axis=1)
+        else:
+            self.data_dictionary['prediction_features'] = pd.concat(
+                [compute_df, inlier_metric], axis=1)
+            self.data_dictionary['prediction_features'].fillna(0, inplace=True)
 
         return None
 
+    def remove_beginning_points_from_data_dict(self, set_='train', no_prev_pts: int = 10):
+        features = self.data_dictionary[f'{set_}_features']
+        weights = self.data_dictionary[f'{set_}_weights']
+        labels = self.data_dictionary[f'{set_}_labels']
+        self.data_dictionary[f'{set_}_weights'] = weights[no_prev_pts:]
+        self.data_dictionary[f'{set_}_features'] = features.iloc[no_prev_pts:]
+        self.data_dictionary[f'{set_}_labels'] = labels.iloc[no_prev_pts:]
+
     def find_features(self, dataframe: DataFrame) -> None:
         """
         Find features in the strategy provided dataframe
diff --git a/freqtrade/freqai/freqai_interface.py b/freqtrade/freqai/freqai_interface.py
index 49e4ce5c3..3535d7371 100644
--- a/freqtrade/freqai/freqai_interface.py
+++ b/freqtrade/freqai/freqai_interface.py
@@ -66,7 +66,6 @@ class IFreqaiModel(ABC):
             "data_split_parameters", {})
         self.model_training_parameters: Dict[str, Any] = config.get("freqai", {}).get(
             "model_training_parameters", {})
-        self.feature_parameters = config.get("freqai", {}).get("feature_parameters")
         self.retrain = False
         self.first = True
         self.set_full_path()
@@ -74,11 +73,14 @@ class IFreqaiModel(ABC):
         self.dd = FreqaiDataDrawer(Path(self.full_path), self.config, self.follow_mode)
         self.identifier: str = self.freqai_info.get("identifier", "no_id_provided")
         self.scanning = False
+        self.ft_params = self.freqai_info["feature_parameters"]
         self.keras: bool = self.freqai_info.get("keras", False)
-        if self.keras and self.freqai_info.get("feature_parameters", {}).get("DI_threshold", 0):
-            self.freqai_info["feature_parameters"]["DI_threshold"] = 0
+        if self.keras and self.ft_params.get("DI_threshold", 0):
+            self.ft_params["DI_threshold"] = 0
             logger.warning("DI threshold is not configured for Keras models yet. Deactivating.")
         self.CONV_WIDTH = self.freqai_info.get("conv_width", 2)
+        if self.ft_params.get("inlier_metric_window", 0):
+            self.CONV_WIDTH = self.ft_params.get("inlier_metric_window", 0) * 2
         self.pair_it = 0
         self.total_pairs = len(self.config.get("exchange", {}).get("pair_whitelist"))
         self.last_trade_database_summary: DataFrame = {}
@@ -389,18 +391,20 @@ class IFreqaiModel(ABC):
         example of how outlier data points are dropped from the dataframe used for training.
         """
 
-        if self.freqai_info["feature_parameters"].get(
+        ft_params = self.freqai_info["feature_parameters"]
+
+        if ft_params.get(
             "principal_component_analysis", False
         ):
             dk.principal_component_analysis()
 
-        if self.freqai_info["feature_parameters"].get("use_SVM_to_remove_outliers", False):
+        if ft_params.get("use_SVM_to_remove_outliers", False):
             dk.use_SVM_to_remove_outliers(predict=False)
 
-        if self.freqai_info["feature_parameters"].get("DI_threshold", 0):
+        if ft_params.get("DI_threshold", 0):
             dk.data["avg_mean_dist"] = dk.compute_distances()
 
-        if self.freqai_info["feature_parameters"].get("use_DBSCAN_to_remove_outliers", False):
+        if ft_params.get("use_DBSCAN_to_remove_outliers", False):
             if dk.pair in self.dd.old_DBSCAN_eps:
                 eps = self.dd.old_DBSCAN_eps[dk.pair]
             else:
@@ -408,6 +412,11 @@ class IFreqaiModel(ABC):
             dk.use_DBSCAN_to_remove_outliers(predict=False, eps=eps)
             self.dd.old_DBSCAN_eps[dk.pair] = dk.data['DBSCAN_eps']
 
+        if ft_params.get('inlier_metric_window', 0):
+            dk.compute_inlier_metric(set_='train')
+            if self.freqai_info["data_split_parameters"]["test_size"] > 0:
+                dk.compute_inlier_metric(set_='test')
+
     def data_cleaning_predict(self, dk: FreqaiDataKitchen, dataframe: DataFrame) -> None:
         """
         Base data cleaning method for predict.
@@ -419,18 +428,23 @@ class IFreqaiModel(ABC):
         of how the do_predict vector is modified. do_predict is ultimately passed back to strategy
         for buy signals.
         """
-        if self.freqai_info["feature_parameters"].get(
+        ft_params = self.freqai_info["feature_parameters"]
+
+        if ft_params.get('inlier_metric_window', 0):
+            dk.compute_inlier_metric(set_='predict')
+
+        if ft_params.get(
             "principal_component_analysis", False
         ):
             dk.pca_transform(dataframe)
 
-        if self.freqai_info["feature_parameters"].get("use_SVM_to_remove_outliers", False):
+        if ft_params.get("use_SVM_to_remove_outliers", False):
             dk.use_SVM_to_remove_outliers(predict=True)
 
-        if self.freqai_info["feature_parameters"].get("DI_threshold", 0):
+        if ft_params.get("DI_threshold", 0):
             dk.check_if_pred_in_training_spaces()
 
-        if self.freqai_info["feature_parameters"].get("use_DBSCAN_to_remove_outliers", False):
+        if ft_params.get("use_DBSCAN_to_remove_outliers", False):
             dk.use_DBSCAN_to_remove_outliers(predict=True)
 
     def model_exists(

From 755041c134989d10093f1d65f23ebe2d45c643fe Mon Sep 17 00:00:00 2001
From: robcaulk <rob.caulk@gmail.com>
Date: Fri, 19 Aug 2022 18:35:24 +0200
Subject: [PATCH 3/3] add noise feature, improve docstrings

---
 freqtrade/freqai/data_kitchen.py     | 11 +++++++++++
 freqtrade/freqai/freqai_interface.py | 18 +++++++-----------
 2 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py
index ca4687902..c8516a8bd 100644
--- a/freqtrade/freqai/data_kitchen.py
+++ b/freqtrade/freqai/data_kitchen.py
@@ -752,6 +752,17 @@ class FreqaiDataKitchen:
         self.data_dictionary[f'{set_}_features'] = features.iloc[no_prev_pts:]
         self.data_dictionary[f'{set_}_labels'] = labels.iloc[no_prev_pts:]
 
+    def add_noise_to_training_features(self) -> None:
+        """
+        Perturb the train features with zero-mean Gaussian noise (std taken from
+        the "noise_standard_deviation" feature parameter) to limit overfitting.
+        """
+        sigma = self.freqai_config["feature_parameters"]["noise_standard_deviation"]
+        train_features = self.data_dictionary['train_features']
+        noise = np.random.normal(loc=0, scale=sigma, size=train_features.shape)
+        train_features += noise
+        self.data_dictionary['train_features'] = train_features
+
     def find_features(self, dataframe: DataFrame) -> None:
         """
         Find features in the strategy provided dataframe
diff --git a/freqtrade/freqai/freqai_interface.py b/freqtrade/freqai/freqai_interface.py
index 3535d7371..07303b49f 100644
--- a/freqtrade/freqai/freqai_interface.py
+++ b/freqtrade/freqai/freqai_interface.py
@@ -385,10 +385,9 @@ class IFreqaiModel(ABC):
 
     def data_cleaning_train(self, dk: FreqaiDataKitchen) -> None:
         """
-        Base data cleaning method for train
-        Any function inside this method should drop training data points from the filtered_dataframe
-        based on user decided logic. See FreqaiDataKitchen::use_SVM_to_remove_outliers() for an
-        example of how outlier data points are dropped from the dataframe used for training.
+        Base data cleaning method for train.
+        Functions here improve/modify the input data by identifying outliers,
+        computing additional metrics, adding noise, reducing dimensionality etc.
         """
 
         ft_params = self.freqai_info["feature_parameters"]
@@ -417,16 +416,13 @@ class IFreqaiModel(ABC):
             if self.freqai_info["data_split_parameters"]["test_size"] > 0:
                 dk.compute_inlier_metric(set_='test')
 
+        if ft_params.get('noise_standard_deviation', 0):
+            dk.add_noise_to_training_features()
+
     def data_cleaning_predict(self, dk: FreqaiDataKitchen, dataframe: DataFrame) -> None:
         """
         Base data cleaning method for predict.
-        These functions each modify dk.do_predict, which is a dataframe with equal length
-        to the number of candles coming from and returning to the strategy. Inside do_predict,
-         1 allows prediction and < 0 signals to the strategy that the model is not confident in
-         the prediction.
-         See FreqaiDataKitchen::remove_outliers() for an example
-        of how the do_predict vector is modified. do_predict is ultimately passed back to strategy
-        for buy signals.
+        Functions here are complementary to the functions of data_cleaning_train.
         """
         ft_params = self.freqai_info["feature_parameters"]