Merge branch 'develop' into add-xgboostclassifier

2022-09-10 23:59:11 +03:00
parent 60eb02bb62 075748b21a
commit 330d7068ab
24 changed files with 254 additions and 403 deletions
--- a/freqtrade/freqai/base_models/FreqaiMultiOutputRegressor.py
+++ b/freqtrade/freqai/base_models/FreqaiMultiOutputRegressor.py
@@ -36,9 +36,6 @@ class FreqaiMultiOutputRegressor(MultiOutputRegressor):

        y = self._validate_data(X="no_validation", y=y, multi_output=True)

-        # if is_classifier(self):
-        #     check_classification_targets(y)
-
        if y.ndim == 1:
            raise ValueError(
                "y must have at least two dimensions for "
@@ -50,19 +47,12 @@ class FreqaiMultiOutputRegressor(MultiOutputRegressor):
        ):
            raise ValueError("Underlying estimator does not support sample weights.")

-        # fit_params_validated = _check_fit_params(X, fit_params)
-
        if not fit_params:
            fit_params = [None] * y.shape[1]

-        # if not init_models:
-        #     init_models = [None] * y.shape[1]
-
        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_estimator)(
                self.estimator, X, y[:, i], sample_weight, **fit_params[i]
-                # init_model=init_models[i], eval_set=eval_sets[i],
-                # **fit_params_validated
            )
            for i in range(y.shape[1])
        )
--- a/freqtrade/freqai/data_kitchen.py
+++ b/freqtrade/freqai/data_kitchen.py
@@ -184,7 +184,7 @@ class FreqaiDataKitchen:

    def filter_features(
        self,
-        unfiltered_dataframe: DataFrame,
+        unfiltered_df: DataFrame,
        training_feature_list: List,
        label_list: List = list(),
        training_filter: bool = True,
@@ -195,31 +195,35 @@ class FreqaiDataKitchen:
        0s in the prediction dataset. However, prediction dataset do_predict will reflect any
        row that had a NaN and will shield user from that prediction.
        :params:
-        :unfiltered_dataframe: the full dataframe for the present training period
+        :unfiltered_df: the full dataframe for the present training period
        :training_feature_list: list, the training feature list constructed by
        self.build_feature_list() according to user specified parameters in the configuration file.
        :labels: the labels for the dataset
        :training_filter: boolean which lets the function know if it is training data or
        prediction data to be filtered.
        :returns:
-        :filtered_dataframe: dataframe cleaned of NaNs and only containing the user
+        :filtered_df: dataframe cleaned of NaNs and only containing the user
        requested feature set.
        :labels: labels cleaned of NaNs.
        """
-        filtered_dataframe = unfiltered_dataframe.filter(training_feature_list, axis=1)
-        filtered_dataframe = filtered_dataframe.replace([np.inf, -np.inf], np.nan)
+        filtered_df = unfiltered_df.filter(training_feature_list, axis=1)
+        filtered_df = filtered_df.replace([np.inf, -np.inf], np.nan)

-        drop_index = pd.isnull(filtered_dataframe).any(1)  # get the rows that have NaNs,
+        drop_index = pd.isnull(filtered_df).any(1)  # get the rows that have NaNs,
        drop_index = drop_index.replace(True, 1).replace(False, 0)  # pep8 requirement.
        if (training_filter):
+            const_cols = list((filtered_df.nunique() == 1).loc[lambda x: x].index)
+            if const_cols:
+                filtered_df = filtered_df.filter(filtered_df.columns.difference(const_cols))
+                logger.warning(f"Removed features {const_cols} with constant values.")
            # we don't care about total row number (total no. datapoints) in training, we only care
            # about removing any row with NaNs
            # if labels has multiple columns (user wants to train multiple models), we detect here
-            labels = unfiltered_dataframe.filter(label_list, axis=1)
+            labels = unfiltered_df.filter(label_list, axis=1)
            drop_index_labels = pd.isnull(labels).any(1)
            drop_index_labels = drop_index_labels.replace(True, 1).replace(False, 0)
-            dates = unfiltered_dataframe['date']
-            filtered_dataframe = filtered_dataframe[
+            dates = unfiltered_df['date']
+            filtered_df = filtered_df[
                (drop_index == 0) & (drop_index_labels == 0)
            ]  # dropping values
            labels = labels[
@@ -229,13 +233,13 @@ class FreqaiDataKitchen:
                (drop_index == 0) & (drop_index_labels == 0)
            ]
            logger.info(
-                f"dropped {len(unfiltered_dataframe) - len(filtered_dataframe)} training points"
-                f" due to NaNs in populated dataset {len(unfiltered_dataframe)}."
+                f"dropped {len(unfiltered_df) - len(filtered_df)} training points"
+                f" due to NaNs in populated dataset {len(unfiltered_df)}."
            )
-            if (1 - len(filtered_dataframe) / len(unfiltered_dataframe)) > 0.1 and self.live:
-                worst_indicator = str(unfiltered_dataframe.count().idxmin())
+            if (1 - len(filtered_df) / len(unfiltered_df)) > 0.1 and self.live:
+                worst_indicator = str(unfiltered_df.count().idxmin())
                logger.warning(
-                    f" {(1 - len(filtered_dataframe)/len(unfiltered_dataframe)) * 100:.0f} percent "
+                    f" {(1 - len(filtered_df)/len(unfiltered_df)) * 100:.0f} percent "
                    " of training data dropped due to NaNs, model may perform inconsistent "
                    f"with expectations. Verify {worst_indicator}"
                )
@@ -244,9 +248,9 @@ class FreqaiDataKitchen:
        else:
            # we are backtesting so we need to preserve row number to send back to strategy,
            # so now we use do_predict to avoid any prediction based on a NaN
-            drop_index = pd.isnull(filtered_dataframe).any(1)
+            drop_index = pd.isnull(filtered_df).any(1)
            self.data["filter_drop_index_prediction"] = drop_index
-            filtered_dataframe.fillna(0, inplace=True)
+            filtered_df.fillna(0, inplace=True)
            # replacing all NaNs with zeros to avoid issues in 'prediction', but any prediction
            # that was based on a single NaN is ultimately protected from buys with do_predict
            drop_index = ~drop_index
@@ -255,11 +259,11 @@ class FreqaiDataKitchen:
                logger.info(
                    "dropped %s of %s prediction data points due to NaNs.",
                    len(self.do_predict) - self.do_predict.sum(),
-                    len(filtered_dataframe),
+                    len(filtered_df),
                )
            labels = []

-        return filtered_dataframe, labels
+        return filtered_df, labels

    def build_data_dictionary(
        self,
@@ -466,10 +470,17 @@ class FreqaiDataKitchen:
    ) -> DataFrame:
        """
        Function which takes the backtesting time range and
-        remove training data from dataframe
+        removes training data from the dataframe, keeping only the
+        startup_candle_count candles
        """
+        startup_candle_count = self.config.get('startup_candle_count', 0)
+        tf = self.config['timeframe']
        tr = self.config["timerange"]
+
        backtesting_timerange = TimeRange.parse_timerange(tr)
+        if startup_candle_count > 0 and backtesting_timerange:
+            backtesting_timerange.subtract_start(timeframe_to_seconds(tf) * startup_candle_count)
+
        start = datetime.fromtimestamp(backtesting_timerange.startts, tz=timezone.utc)
        df = self.return_dataframe
        df = df.loc[df["date"] >= start, :]
@@ -1215,7 +1226,6 @@ class FreqaiDataKitchen:
    def save_backtesting_prediction(
        self, append_df: DataFrame
    ) -> None:
-
        """
        Save prediction dataframe from backtesting to h5 file format
        :param append_df: dataframe for backtesting period
@@ -1229,7 +1239,6 @@ class FreqaiDataKitchen:
    def get_backtesting_prediction(
        self
    ) -> DataFrame:
-
        """
        Get prediction dataframe from h5 file format
        """
--- a/freqtrade/freqai/freqai_interface.py
+++ b/freqtrade/freqai/freqai_interface.py
@@ -14,6 +14,7 @@ from numpy.typing import NDArray
 from pandas import DataFrame

 from freqtrade.configuration import TimeRange
+from freqtrade.constants import DATETIME_PRINT_FORMAT
 from freqtrade.enums import RunMode
 from freqtrade.exceptions import OperationalException
 from freqtrade.exchange import timeframe_to_seconds
@@ -92,6 +93,12 @@ class IFreqaiModel(ABC):
        self._threads: List[threading.Thread] = []
        self._stop_event = threading.Event()

+    def __getstate__(self):
+        """
+        Return an empty state to be pickled in hyperopt
+        """
+        return ({})
+
    def assert_config(self, config: Dict[str, Any]) -> None:

        if not config.get("freqai", {}):
@@ -233,10 +240,10 @@ class IFreqaiModel(ABC):
            trained_timestamp = tr_train
            tr_train_startts_str = datetime.fromtimestamp(
                                                tr_train.startts,
-                                                tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
+                                                tz=timezone.utc).strftime(DATETIME_PRINT_FORMAT)
            tr_train_stopts_str = datetime.fromtimestamp(
                                                tr_train.stopts,
-                                                tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
+                                                tz=timezone.utc).strftime(DATETIME_PRINT_FORMAT)
            logger.info(
                f"Training {metadata['pair']}, {self.pair_it}/{self.total_pairs} pairs"
                f" from {tr_train_startts_str} to {tr_train_stopts_str}, {train_it}/{total_trains} "
--- a/freqtrade/freqai/prediction_models/CatboostRegressorMultiTarget.py
+++ b/freqtrade/freqai/prediction_models/CatboostRegressorMultiTarget.py
@@ -60,6 +60,9 @@ class CatboostRegressorMultiTarget(BaseRegressionModel):
                {'eval_set': eval_sets[i],  'init_model': init_models[i]})

        model = FreqaiMultiOutputRegressor(estimator=cbr)
+        thread_training = self.freqai_info.get('multitarget_parallel_training', False)
+        if thread_training:
+            model.n_jobs = y.shape[1]
        model.fit(X=X, y=y, sample_weight=sample_weight, fit_params=fit_params)

        return model
--- a/freqtrade/freqai/prediction_models/LightGBMRegressorMultiTarget.py
+++ b/freqtrade/freqai/prediction_models/LightGBMRegressorMultiTarget.py
@@ -56,9 +56,9 @@ class LightGBMRegressorMultiTarget(BaseRegressionModel):
                 'init_model': init_models[i]})

        model = FreqaiMultiOutputRegressor(estimator=lgb)
+        thread_training = self.freqai_info.get('multitarget_parallel_training', False)
+        if thread_training:
+            model.n_jobs = y.shape[1]
        model.fit(X=X, y=y, sample_weight=sample_weight, fit_params=fit_params)

-        # model = FreqaiMultiOutputRegressor(estimator=lgb)
-        # model.fit(X=X, y=y, sample_weight=sample_weight, init_models=init_models,
-        #           eval_sets=eval_sets, eval_sample_weight=eval_weights)
        return model
--- a/freqtrade/freqai/prediction_models/XGBoostRegressorMultiTarget.py
+++ b/freqtrade/freqai/prediction_models/XGBoostRegressorMultiTarget.py
@@ -55,6 +55,9 @@ class XGBoostRegressorMultiTarget(BaseRegressionModel):
                 'xgb_model': init_models[i]})

        model = FreqaiMultiOutputRegressor(estimator=xgb)
+        thread_training = self.freqai_info.get('multitarget_parallel_training', False)
+        if thread_training:
+            model.n_jobs = y.shape[1]
        model.fit(X=X, y=y, sample_weight=sample_weight, fit_params=fit_params)

        return model