add min max strategies and data augmentation training
user_data/freqaimodels/CatboostPredictionBinaryMultiModel.py (new file, 118 lines)
@@ -0,0 +1,118 @@
import logging
from typing import Any, Dict, Tuple

from catboost import CatBoostClassifier  # , Pool
from sklearn.multioutput import MultiOutputClassifier
from pandas import DataFrame, concat
import numpy as np

from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
from freqtrade.freqai.base_models.BaseClassifierModel import BaseClassifierModel


logger = logging.getLogger(__name__)


class CatboostPredictionBinaryMultiModel(BaseClassifierModel):
    """
    User created prediction model. The class needs to override three necessary
    functions, predict(), train(), fit(). The class inherits ModelHandler which
    has its own DataHandler where data is held, saved, loaded, and managed.
    """
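
    # Usage note: a model class saved under user_data/freqaimodels/ can typically
    # be selected at runtime with freqtrade's --freqaimodel argument, e.g.
    # --freqaimodel CatboostPredictionBinaryMultiModel.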
    def fit_augmented(self, data_dictionary: Dict, dk: FreqaiDataKitchen) -> Any:
        """
        User sets up the training and test data to fit their desired model here
        :params:
        :data_dictionary: the dictionary constructed by DataHandler to hold
        all the training and test data/labels.
        """

        cbr = CatBoostClassifier(
            allow_writing_files=False,
            gpu_ram_part=0.5,
            verbose=100,
            early_stopping_rounds=400,
            **self.model_training_parameters,
        )
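
        # data_dictionary is assembled by FreqaiDataKitchen; "train_features" and
        # "train_labels" typically arrive as DataFrames and "train_weights" as a
        # 1-D array, while the "test_*" entries are empty when no test split is
        # configured.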
        X = data_dictionary["train_features"]
        y = data_dictionary["train_labels"]
        if data_dictionary["test_features"].size:
            eval_set = (data_dictionary["test_features"],
                        data_dictionary["test_labels"])
        sample_weight = data_dictionary["train_weights"]

        # Data augmentation: duplicate the training set with Gaussian noise added
        # at roughly 1% of each feature's standard deviation.
        if True:
            # mu = 0
            # sigma = 0.01
            # noise = np.random.normal(mu, sigma, [X.shape[0], X.shape[1]])
            # Xaugmented = X + noise

            Xaugmented = X + np.random.randn(*X.shape) / 100 * np.asarray(X.std(0))[None, :]
            X = np.vstack((X, Xaugmented))
            y = concat([y, y], ignore_index=True)
            sample_weight = np.tile(sample_weight, 2)

        # Inverse class-frequency weights: every sample gets len(y) / count(label)
        # for its value in each label column.
        from collections import Counter
        weights = y.copy()
        for col_name in y:
            cnt = Counter(y[col_name])
            for k, v in cnt.items():
                weights.loc[y[col_name] == k, col_name] = len(y) / v
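
        # Worked example with hypothetical labels: if a label column held
        # ["up", "up", "up", "down"], Counter gives {"up": 3, "down": 1}, so the
        # "up" rows receive weight 4/3 and the "down" row 4/1 = 4.0.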
        # model = MultiOutputClassifier(estimator=cbr)
        model = cbr

        # Retrieved for potential warm-starting from the previous model of this
        # pair, but not passed on to fit() below.
        init_model = self.get_init_model(dk.pair)

        model.fit(X=X, y=y,
                  sample_weight=sample_weight * weights.sum(1))  # , eval_set=eval_set)
        train_score = model.score(X, y)
        test_score = "Empty"
        if data_dictionary["test_features"].size:
            test_score = model.score(*eval_set)
        logger.info(f"Augmented Train score {train_score}, Augmented Test score {test_score}")

        return model

    def fit(self, data_dictionary: Dict, dk: FreqaiDataKitchen) -> Any:
        """
        User sets up the training and test data to fit their desired model here
        :params:
        :data_dictionary: the dictionary constructed by DataHandler to hold
        all the training and test data/labels.
        """

        # cbr = CatBoostClassifier(
        #     allow_writing_files=False,
        #     gpu_ram_part=0.5,
        #     verbose=100,
        #     early_stopping_rounds=400,
        #     **self.model_training_parameters,
        # )

        # X = data_dictionary["train_features"]
        # y = data_dictionary["train_labels"]
        # if data_dictionary["test_features"].size:
        #     eval_set = (data_dictionary["test_features"],
        #                 data_dictionary["test_labels"].values.astype(np.float32))
        # sample_weight = data_dictionary["train_weights"]

        # from collections import Counter
        # weights = y.copy()
        # for col_name in y:
        #     cnt = Counter(y[col_name])
        #     for k, v in cnt.items():
        #         weights[col_name][y[col_name] == k] = len(y) / v

        # model = MultiOutputClassifier(estimator=cbr)
        # model.fit(X=X, Y=y.values.astype(np.float32),
        #           sample_weight=sample_weight * weights.sum(1))  # , eval_set=eval_set)
        # train_score = model.score(X, y.values.astype(np.float32))
        # test_score = "Empty"
        # if data_dictionary["test_features"].size:
        #     test_score = model.score(*eval_set)
        # logger.info(f"Train score {train_score}, Test score {test_score}")

        # The original, non-augmented training path is kept commented out above;
        # fit() simply delegates to the augmented variant.
        model = self.fit_augmented(data_dictionary, dk)
        return model
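
For reference, a minimal standalone sketch of the augmentation step used in fit_augmented above, on toy numpy data (shapes, values, and variable names are illustrative, not freqai's real data_dictionary contents):

import numpy as np

rng = np.random.default_rng(42)

# Toy stand-ins for train_features, train_labels and train_weights.
X = rng.normal(size=(6, 3))
y = np.array([0, 1, 0, 1, 1, 0])
sample_weight = np.ones(len(X))

# Jitter each feature by roughly 1% of its own standard deviation, then stack
# the noisy copy under the original rows; labels and weights are duplicated.
X_aug = X + rng.standard_normal(X.shape) / 100 * X.std(0)[None, :]
X = np.vstack((X, X_aug))
y = np.concatenate((y, y))
sample_weight = np.tile(sample_weight, 2)

print(X.shape, y.shape, sample_weight.shape)  # (12, 3) (12,) (12,)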