remove trade database analyzer, clean up a bit
This commit is contained in:
parent 91d0c91287
commit 2cae3c42e6
@@ -663,79 +663,13 @@ The user needs to set the standard dictionary in the config so FreqAI can return
 These values will likely be overridden by the user prediction model, but in the case where the user model has yet to set them, or needs
 a default initial value - this is the value that will be returned.
 
-## Analyzing the trade live database
-
-Users can analyze the live trade database by calling `analyze_trade_database()` in their custom prediction model. FreqAI already has the
-database setup in a pandas dataframe and ready to be analyzed. Here is an example usecase:
-
-```python
-def analyze_trade_database(self, dk: FreqaiDataKitchen, pair: str) -> None:
-    """
-    User analyzes the trade database here and returns summary stats which will be passed back
-    to the strategy for reinforcement learning or for additional adaptive metrics for use
-    in entry/exit signals. Store these metrics in dk.data['extra_returns_per_train'] and
-    they will format themselves into the dataframe as an additional column in the user
-    strategy. User has access to the current trade database in dk.trade_database_df.
-    """
-    total_profit = dk.trade_database_df['close_profit_abs'].sum()
-    dk.data['extra_returns_per_train']['total_profit'] = total_profit
-
-    return
-```
-
 ## Building an IFreqaiModel
 
 FreqAI has multiple example prediction model based libraries such as `Catboost` regression (`freqai/prediction_models/CatboostRegressor.py`) and `LightGBM` regression.
 However, users can customize and create their own prediction models using the `IFreqaiModel` class.
 Users are encouraged to inherit `train()` and `predict()` to let them customize various aspects of their training procedures.
 
-<!-- ## Dynamic target expectation
-
-The labels used for model training have a unique statistical distribution for each separate model training.
-We can use this information to know if our current prediction is in the realm of what the model was trained on,
-and if so, what is the statistical probability of the current prediction. With this information, we can
-make more informed prediction.
-FreqAI builds this label distribution and provides a quantile to the strategy, which can be optionally used as a
-dynamic threshold. The `target_quantile: X` means that X% of the labels are below this value. So setting:
-
-```json
-"freqai": {
-    "feature_parameters" : {
-        "target_quantile": 0.9
-    }
-}
-```
-
-Means the user will get back in the strategy the label threshold at which 90% of the labels were
-below this value. An example usage in the strategy may look something like:
-
-```python
-def populate_indicators(self, dataframe: DataFrame, metadata: dict) -> DataFrame:
-
-    # ... #
-
-    (
-        dataframe["prediction"],
-        dataframe["do_predict"],
-        dataframe["target_upper_quantile"],
-        dataframe["target_lower_quantile"],
-    ) = self.freqai.start(dataframe, metadata, self)
-
-    return dataframe
-
-def populate_buy_trend(self, dataframe: DataFrame, metadata: dict) -> DataFrame:
-
-    buy_conditions = [
-        (dataframe["prediction"] > dataframe["target_upper_quantile"]) & (dataframe["do_predict"] == 1)
-    ]
-
-    if buy_conditions:
-        dataframe.loc[reduce(lambda x, y: x | y, buy_conditions), "buy"] = 1
-
-    return dataframe
-``` -->
-
 ## Additional information
 
 ### Common pitfalls
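For anyone who depended on the docs section removed above: the same summary statistic can still be computed outside FreqAI by reading the trade database directly. A minimal standalone sketch based on the deleted `get_current_trade_database()` helper; the `tradesv3.sqlite` filename is an assumption (use whatever your `db_url` points at), not something this commit prescribes:

```python
import sqlite3

import pandas as pd


def load_trade_database(db_file: str = "tradesv3.sqlite") -> pd.DataFrame:
    """Read the trades table into a DataFrame, keeping only closed trades.

    Mirrors the removed FreqaiDataKitchen.get_current_trade_database().
    """
    conn = sqlite3.connect(db_file)
    try:
        query = conn.execute("SELECT * FROM trades")
        cols = [column[0] for column in query.description]
        df = pd.DataFrame.from_records(data=query.fetchall(), columns=cols)
    finally:
        conn.close()
    # Open trades have no close_date yet; drop them before aggregating.
    return df.dropna(subset=["close_date"])


trades = load_trade_database()
total_profit = trades["close_profit_abs"].sum()  # the stat from the removed docs example
```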
@@ -2,9 +2,8 @@ import copy
 import datetime
 import logging
 import shutil
-import sqlite3
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Tuple
 
 import numpy as np
 import numpy.typing as npt
@@ -88,20 +87,6 @@ class FreqaiDataKitchen:
             config["freqai"]["backtest_period_days"],
         )
 
-        self.database_path: Optional[Path] = None
-
-        if self.live:
-            db_url = self.config.get('db_url', None)
-            self.database_path = Path(db_url)
-            if 'sqlite' not in self.database_path.parts[0]:
-                self.database_path = None
-                logger.warning('FreqAI database analyzer only available for sqlite dbs. '
-                               ' FreqAI will still run, but user cannot use database analyzer.')
-            else:
-                self.database_name = Path(*self.database_path.parts[1:])
-
-        self.trade_database_df: DataFrame = pd.DataFrame()
-
         self.data['extra_returns_per_train'] = self.freqai_config.get('extra_returns_per_train', {})
         self.thread_count = self.freqai_config.get("data_kitchen_thread_count", -1)
         self.train_dates: DataFrame = pd.DataFrame()
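Note that the `extra_returns_per_train` seed kept in `__init__` above still lets users provide default return values from the config before a model first sets them. A hedged example of such a config entry; `total_profit` reuses the key from the removed docs example, and the default of 0 is an illustrative placeholder:

```json
"freqai": {
    "extra_returns_per_train": {
        "total_profit": 0
    }
}
```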
@@ -1007,13 +992,6 @@ class FreqaiDataKitchen:
             f = spy.stats.norm.fit(self.data_dictionary["train_labels"][label])
             self.data["labels_mean"][label], self.data["labels_std"][label] = f[0], f[1]
 
-        # KEEPME incase we want to let user start to grab quantiles.
-        # upper_q = spy.stats.norm.ppf(self.freqai_config['feature_parameters'][
-        #     'target_quantile'], *f)
-        # lower_q = spy.stats.norm.ppf(1 - self.freqai_config['feature_parameters'][
-        #     'target_quantile'], *f)
-        # self.data["upper_quantile"] = upper_q
-        # self.data["lower_quantile"] = lower_q
         return
 
     def remove_features_from_df(self, dataframe: DataFrame) -> DataFrame:
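The `KEEPME` block deleted in this hunk derived label quantiles from the fitted normal. For reference, the idea in isolation as a standalone sketch; the random `labels` array stands in for `data_dictionary["train_labels"]`, and `target_quantile` is a plain variable here, not a live config key:

```python
import numpy as np
from scipy import stats

labels = np.random.normal(loc=0.01, scale=0.05, size=1000)  # stand-in for train labels

# norm.fit returns (mean, std); ppf inverts the CDF at the requested quantile.
mean, std = stats.norm.fit(labels)
target_quantile = 0.9
upper_q = stats.norm.ppf(target_quantile, mean, std)      # 90% of labels modelled below this
lower_q = stats.norm.ppf(1 - target_quantile, mean, std)  # 10% modelled below this
```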
@@ -1025,181 +1003,3 @@ class FreqaiDataKitchen:
             col for col in dataframe.columns if not col.startswith("%") or col.startswith("%%")
         ]
         return dataframe[to_keep]
-
-    def get_current_trade_database(self) -> None:
-
-        if self.database_path is None:
-            logger.warning('No trade database found. Skipping analysis.')
-            return
-
-        data = sqlite3.connect(self.database_name)
-        query = data.execute("SELECT * From trades")
-        cols = [column[0] for column in query.description]
-        df = pd.DataFrame.from_records(data=query.fetchall(), columns=cols)
-        self.trade_database_df = df.dropna(subset='close_date')
-        data.close()
-
-    def np_encoder(self, object):
-        if isinstance(object, np.generic):
-            return object.item()
-
-    # Functions containing useful data manipulation examples. but not actively in use.
-
-    # Possibly phasing these outlier removal methods below out in favor of
-    # use_SVM_to_remove_outliers (computationally more efficient and apparently higher performance).
-    # But these have good data manipulation examples, so keep them commented here for now.
-
-    # def determine_statistical_distributions(self) -> None:
-    #     from fitter import Fitter
-
-    #     logger.info('Determining best model for all features, may take some time')
-
-    #     def compute_quantiles(ft):
-    #         f = Fitter(self.data_dictionary["train_features"][ft],
-    #                    distributions=['gamma', 'cauchy', 'laplace',
-    #                                   'beta', 'uniform', 'lognorm'])
-    #         f.fit()
-    #         # f.summary()
-    #         dist = list(f.get_best().items())[0][0]
-    #         params = f.get_best()[dist]
-    #         upper_q = getattr(spy.stats, list(f.get_best().items())[0][0]).ppf(0.999, **params)
-    #         lower_q = getattr(spy.stats, list(f.get_best().items())[0][0]).ppf(0.001, **params)
-
-    #         return ft, upper_q, lower_q, dist
-
-    #     quantiles_tuple = Parallel(n_jobs=-1)(
-    #         delayed(compute_quantiles)(ft) for ft in self.data_dictionary[
-    #             'train_features'].columns)
-
-    #     df = pd.DataFrame(quantiles_tuple, columns=['features', 'upper_quantiles',
-    #                                                 'lower_quantiles', 'dist'])
-    #     self.data_dictionary['upper_quantiles'] = df['upper_quantiles']
-    #     self.data_dictionary['lower_quantiles'] = df['lower_quantiles']
-
-    #     return
-
-    # def remove_outliers(self, predict: bool) -> None:
-    #     """
-    #     Remove data that looks like an outlier based on the distribution of each
-    #     variable.
-    #     :params:
-    #     :predict: boolean which tells the function if this is prediction data or
-    #     training data coming in.
-    #     """
-
-    #     lower_quantile = self.data_dictionary["lower_quantiles"].to_numpy()
-    #     upper_quantile = self.data_dictionary["upper_quantiles"].to_numpy()
-
-    #     if predict:
-
-    #         df = self.data_dictionary["prediction_features"][
-    #             (self.data_dictionary["prediction_features"] < upper_quantile)
-    #             & (self.data_dictionary["prediction_features"] > lower_quantile)
-    #         ]
-    #         drop_index = pd.isnull(df).any(1)
-    #         self.data_dictionary["prediction_features"].fillna(0, inplace=True)
-    #         drop_index = ~drop_index
-    #         do_predict = np.array(drop_index.replace(True, 1).replace(False, 0))
-
-    #         logger.info(
-    #             "remove_outliers() tossed %s predictions",
-    #             len(do_predict) - do_predict.sum(),
-    #         )
-    #         self.do_predict += do_predict
-    #         self.do_predict -= 1
-
-    #     else:
-
-    #         filter_train_df = self.data_dictionary["train_features"][
-    #             (self.data_dictionary["train_features"] < upper_quantile)
-    #             & (self.data_dictionary["train_features"] > lower_quantile)
-    #         ]
-    #         drop_index = pd.isnull(filter_train_df).any(1)
-    #         drop_index = drop_index.replace(True, 1).replace(False, 0)
-    #         self.data_dictionary["train_features"] = self.data_dictionary["train_features"][
-    #             (drop_index == 0)
-    #         ]
-    #         self.data_dictionary["train_labels"] = self.data_dictionary["train_labels"][
-    #             (drop_index == 0)
-    #         ]
-    #         self.data_dictionary["train_weights"] = self.data_dictionary["train_weights"][
-    #             (drop_index == 0)
-    #         ]
-
-    #         logger.info(
-    #             f'remove_outliers() tossed {drop_index.sum()}'
-    #             f' training points from {len(filter_train_df)}'
-    #         )
-
-    #         # do the same for the test data
-    #         filter_test_df = self.data_dictionary["test_features"][
-    #             (self.data_dictionary["test_features"] < upper_quantile)
-    #             & (self.data_dictionary["test_features"] > lower_quantile)
-    #         ]
-    #         drop_index = pd.isnull(filter_test_df).any(1)
-    #         drop_index = drop_index.replace(True, 1).replace(False, 0)
-    #         self.data_dictionary["test_labels"] = self.data_dictionary["test_labels"][
-    #             (drop_index == 0)
-    #         ]
-    #         self.data_dictionary["test_features"] = self.data_dictionary["test_features"][
-    #             (drop_index == 0)
-    #         ]
-    #         self.data_dictionary["test_weights"] = self.data_dictionary["test_weights"][
-    #             (drop_index == 0)
-    #         ]
-
-    #         logger.info(
-    #             f'remove_outliers() tossed {drop_index.sum()}'
-    #             f' test points from {len(filter_test_df)}'
-    #         )
-
-    #         return
-
-    # def standardize_data(self, data_dictionary: Dict) -> Dict[Any, Any]:
-    #     """
-    #     standardize all data in the data_dictionary according to the training dataset
-    #     :params:
-    #     :data_dictionary: dictionary containing the cleaned and split training/test data/labels
-    #     :returns:
-    #     :data_dictionary: updated dictionary with standardized values.
-    #     """
-    #     # standardize the data by training stats
-    #     train_mean = data_dictionary["train_features"].mean()
-    #     train_std = data_dictionary["train_features"].std()
-    #     data_dictionary["train_features"] = (
-    #         data_dictionary["train_features"] - train_mean
-    #     ) / train_std
-    #     data_dictionary["test_features"] = (
-    #         data_dictionary["test_features"] - train_mean
-    #     ) / train_std
-
-    #     train_labels_std = data_dictionary["train_labels"].std()
-    #     train_labels_mean = data_dictionary["train_labels"].mean()
-    #     data_dictionary["train_labels"] = (
-    #         data_dictionary["train_labels"] - train_labels_mean
-    #     ) / train_labels_std
-    #     data_dictionary["test_labels"] = (
-    #         data_dictionary["test_labels"] - train_labels_mean
-    #     ) / train_labels_std
-
-    #     for item in train_std.keys():
-    #         self.data[item + "_std"] = train_std[item]
-    #         self.data[item + "_mean"] = train_mean[item]
-
-    #     self.data["labels_std"] = train_labels_std
-    #     self.data["labels_mean"] = train_labels_mean
-
-    #     return data_dictionary
-
-    # def standardize_data_from_metadata(self, df: DataFrame) -> DataFrame:
-    #     """
-    #     Normalizes a set of data using the mean and standard deviation from
-    #     the associated training data.
-    #     :params:
-    #     :df: Dataframe to be standardized
-    #     """
-
-    #     for item in df.keys():
-    #         df[item] = (df[item] - self.data[item + "_mean"]) / self.data[item + "_std"]
-
-    #     return df
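Among the commented-out examples deleted above, `standardize_data` is the standard train-statistics z-score. The technique in isolation, as a sketch detached from `FreqaiDataKitchen` (the `data_dictionary` keys follow the deleted code):

```python
from typing import Dict

import pandas as pd


def standardize(data_dictionary: Dict[str, pd.DataFrame]) -> Dict[str, pd.DataFrame]:
    """Z-score train and test features with training-set statistics only,
    so no information from the test set leaks into the transform."""
    train_mean = data_dictionary["train_features"].mean()
    train_std = data_dictionary["train_features"].std()
    for key in ("train_features", "test_features"):
        data_dictionary[key] = (data_dictionary[key] - train_mean) / train_std
    return data_dictionary
```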
@@ -491,9 +491,6 @@ class IFreqaiModel(ABC):
 
         model = self.train(unfiltered_dataframe, pair, dk)
 
-        dk.get_current_trade_database()
-        self.analyze_trade_database(dk, pair)
-
         self.dd.pair_dict[pair]["trained_timestamp"] = new_trained_timerange.stopts
         dk.set_new_model_names(pair, new_trained_timerange)
         self.dd.pair_dict[pair]["first"] = False
@@ -612,20 +609,3 @@ class IFreqaiModel(ABC):
         :do_predict: np.array of 1s and 0s to indicate places where freqai needed to remove
         data (NaNs) or felt uncertain about data (i.e. SVM and/or DI index)
         """
-
-    def analyze_trade_database(self, dk: FreqaiDataKitchen, pair: str) -> None:
-        """
-        User analyzes the trade database here and returns summary stats which will be passed back
-        to the strategy for reinforcement learning or for additional adaptive metrics for use
-        in entry/exit signals. Store these metrics in dk.data['extra_returns_per_train'] and
-        they will format themselves into the dataframe as an additional column in the user
-        strategy. User has access to the current trade database in dk.trade_database_df.
-        """
-        # if dk.trade_database_df.empty:
-        #     logger.warning(f'No trades found for {pair} to analyze DB')
-        #     return
-
-        # total_profit = dk.trade_database_df['close_profit_abs'].sum()
-        # dk.data['extra_returns_per_train']['total_profit'] = total_profit
-
-        return