reuse callback, allow user to access all stable_baselines3 agents via config
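This change drops the per-agent save types (stable_baselines_ppo, stable_baselines_dqn) in favour of a single "stable_baselines" model_save_type and resolves the concrete agent class at runtime from rl_config.model_type, so any algorithm exported at the top level of stable_baselines3 (PPO, A2C, DQN, ...) can be selected from the config. A minimal sketch of that lookup, equivalent to the __import__/getattr calls in the diff below (the "PPO" value is just the configured example):

    import importlib

    model_type = "PPO"  # read from freqai.rl_config.model_type in the config
    sb3 = importlib.import_module("stable_baselines3")
    MODELCLASS = getattr(sb3, model_type)  # e.g. stable_baselines3.PPO or stable_baselines3.DQN
    # MODELCLASS is then used both to build the agent (MODELCLASS(policy_type, env, ...))
    # and to reload saved models (MODELCLASS.load(path)) in FreqaiDataDrawer.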
This commit is contained in:
		| @@ -55,7 +55,7 @@ | ||||
|     ], | ||||
|     "freqai": { | ||||
|         "enabled": true, | ||||
|         "model_save_type": "stable_baselines_dqn", | ||||
|         "model_save_type": "stable_baselines", | ||||
|         "conv_width": 10, | ||||
|         "purge_old_models": true, | ||||
|         "train_period_days": 10, | ||||
| @@ -85,8 +85,11 @@ | ||||
|             "verbose": 1 | ||||
|         }, | ||||
|         "rl_config": { | ||||
|             "train_cycles": 15, | ||||
|             "eval_cycles": 5, | ||||
|             "train_cycles": 10, | ||||
|             "eval_cycles": 3, | ||||
|             "thread_count": 4, | ||||
|             "model_type": "PPO", | ||||
|             "policy_type": "MlpPolicy", | ||||
|             "model_reward_parameters": { | ||||
|                 "rr": 1, | ||||
|                 "profit_aim": 0.02 | ||||
|   | ||||
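For a rough sense of scale, the training code in this commit derives its step counts directly from these settings; the row counts below are hypothetical, while the formulas mirror fit_rl and set_train_and_eval_environments:

    train_rows, test_rows = 5000, 1000           # hypothetical dataframe sizes
    train_cycles, eval_cycles = 10, 3            # values from the rl_config above
    total_timesteps = train_cycles * train_rows  # 50_000 steps passed to model.learn()
    eval_freq = eval_cycles * test_rows          # evaluate every 3_000 steps via EvalCallback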
| @@ -266,59 +266,28 @@ class Base5ActionRLEnv(gym.Env): | ||||
|  | ||||
|         # close long | ||||
|         if action == Actions.Long_exit.value and self._position == Positions.Long: | ||||
|             if len(self.close_trade_profit): | ||||
|                 # aim x2 rw | ||||
|                 if self.close_trade_profit[-1] > self.profit_aim * self.rr: | ||||
|                     last_trade_price = self.add_buy_fee( | ||||
|                         self.prices.iloc[self._last_trade_tick].open) | ||||
|                     current_price = self.add_sell_fee( | ||||
|                         self.prices.iloc[self._current_tick].open) | ||||
|                     return float((np.log(current_price) - np.log(last_trade_price)) * 2) | ||||
|                 # less than aim x1 rw | ||||
|                 elif self.close_trade_profit[-1] < self.profit_aim * self.rr: | ||||
|                     last_trade_price = self.add_buy_fee( | ||||
|                         self.prices.iloc[self._last_trade_tick].open | ||||
|                     ) | ||||
|                     current_price = self.add_sell_fee( | ||||
|                         self.prices.iloc[self._current_tick].open | ||||
|                     ) | ||||
|                     return float(np.log(current_price) - np.log(last_trade_price)) | ||||
|                 # # less than RR SL x2 neg rw | ||||
|                 # elif self.close_trade_profit[-1] < (self.profit_aim * -1): | ||||
|                 #     last_trade_price = self.add_buy_fee( | ||||
|                 #         self.prices.iloc[self._last_trade_tick].open) | ||||
|                 #     current_price = self.add_sell_fee( | ||||
|                 #         self.prices.iloc[self._current_tick].open) | ||||
|                 #     return float((np.log(current_price) - np.log(last_trade_price)) * 2) * -1 | ||||
|             last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|             current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) | ||||
|             return float(np.log(current_price) - np.log(last_trade_price)) | ||||
|  | ||||
|         if action == Actions.Long_exit.value and self._position == Positions.Long: | ||||
|             if self.close_trade_profit[-1] > self.profit_aim * self.rr: | ||||
|                 last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|                 current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) | ||||
|                 return float((np.log(current_price) - np.log(last_trade_price)) * 2) | ||||
|  | ||||
|         # close short | ||||
|         if action == Actions.Short_exit.value and self._position == Positions.Short: | ||||
|             if len(self.close_trade_profit): | ||||
|                 # aim x2 rw | ||||
|                 if self.close_trade_profit[-1] > self.profit_aim * self.rr: | ||||
|                     last_trade_price = self.add_sell_fee( | ||||
|                         self.prices.iloc[self._last_trade_tick].open | ||||
|                     ) | ||||
|                     current_price = self.add_buy_fee( | ||||
|                         self.prices.iloc[self._current_tick].open | ||||
|                     ) | ||||
|                     return float((np.log(last_trade_price) - np.log(current_price)) * 2) | ||||
|                 # less than aim x1 rw | ||||
|                 elif self.close_trade_profit[-1] < self.profit_aim * self.rr: | ||||
|                     last_trade_price = self.add_sell_fee( | ||||
|                         self.prices.iloc[self._last_trade_tick].open | ||||
|                     ) | ||||
|                     current_price = self.add_buy_fee( | ||||
|                         self.prices.iloc[self._current_tick].open | ||||
|                     ) | ||||
|                     return float(np.log(last_trade_price) - np.log(current_price)) | ||||
|                 # # less than RR SL x2 neg rw | ||||
|                 # elif self.close_trade_profit[-1] > self.profit_aim * self.rr: | ||||
|                 #     last_trade_price = self.add_sell_fee( | ||||
|                 #         self.prices.iloc[self._last_trade_tick].open) | ||||
|                 #     current_price = self.add_buy_fee( | ||||
|                 #         self.prices.iloc[self._current_tick].open) | ||||
|                 #     return float((np.log(last_trade_price) - np.log(current_price)) * 2) * -1 | ||||
|             last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|             current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) | ||||
|             return float(np.log(last_trade_price) - np.log(current_price)) | ||||
|  | ||||
|         if action == Actions.Short_exit.value and self._position == Positions.Short: | ||||
|             if self.close_trade_profit[-1] > self.profit_aim * self.rr: | ||||
|                 last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|                 current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) | ||||
|                 return float((np.log(last_trade_price) - np.log(current_price)) * 2) | ||||
|  | ||||
|         return 0. | ||||
|  | ||||
|     def _update_profit(self, action): | ||||
|   | ||||
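The simplified branches above all reduce to the fee-adjusted log return of the position being closed. A worked example, assuming add_buy_fee/add_sell_fee adjust the open price by the exchange fee (prices and fee are hypothetical):

    import numpy as np

    fee = 0.001
    last_trade_price = 100.0 * (1 + fee)   # hypothetical long entry with buy fee added
    current_price = 102.0 * (1 - fee)      # hypothetical exit with sell fee subtracted
    reward = float(np.log(current_price) - np.log(last_trade_price))  # ~0.0178 for closing the long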
| @@ -11,8 +11,12 @@ from freqtrade.freqai.freqai_interface import IFreqaiModel | ||||
| from freqtrade.freqai.RL.Base5ActionRLEnv import Base5ActionRLEnv, Actions, Positions | ||||
| from freqtrade.persistence import Trade | ||||
| import torch.multiprocessing | ||||
| from stable_baselines3.common.callbacks import EvalCallback | ||||
| from stable_baselines3.common.monitor import Monitor | ||||
| import torch as th | ||||
| from typing import Callable | ||||
| from stable_baselines3.common.utils import set_random_seed | ||||
| import gym | ||||
| logger = logging.getLogger(__name__) | ||||
|  | ||||
| torch.multiprocessing.set_sharing_strategy('file_system') | ||||
| @@ -25,9 +29,15 @@ class BaseReinforcementLearningModel(IFreqaiModel): | ||||
|  | ||||
|     def __init__(self, **kwargs): | ||||
|         super().__init__(config=kwargs['config']) | ||||
|         th.set_num_threads(self.freqai_info.get('data_kitchen_thread_count', 4)) | ||||
|         th.set_num_threads(self.freqai_info['rl_config'].get('thread_count', 4)) | ||||
|         self.reward_params = self.freqai_info['rl_config']['model_reward_parameters'] | ||||
|         self.train_env: Base5ActionRLEnv = None | ||||
|         self.eval_env: Base5ActionRLEnv = None | ||||
|         self.eval_callback: EvalCallback = None | ||||
|         mod = __import__('stable_baselines3', fromlist=[ | ||||
|                          self.freqai_info['rl_config']['model_type']]) | ||||
|         self.MODELCLASS = getattr(mod, self.freqai_info['rl_config']['model_type']) | ||||
|         self.policy_type = self.freqai_info['rl_config']['policy_type'] | ||||
|  | ||||
|     def train( | ||||
|         self, unfiltered_dataframe: DataFrame, pair: str, dk: FreqaiDataKitchen | ||||
| @@ -67,7 +77,7 @@ class BaseReinforcementLearningModel(IFreqaiModel): | ||||
|         ) | ||||
|         logger.info(f'Training model on {len(data_dictionary["train_features"])} data points') | ||||
|  | ||||
|         self.set_train_and_eval_environments(data_dictionary, prices_train, prices_test) | ||||
|         self.set_train_and_eval_environments(data_dictionary, prices_train, prices_test, dk) | ||||
|  | ||||
|         model = self.fit_rl(data_dictionary, dk) | ||||
|  | ||||
| @@ -75,13 +85,13 @@ class BaseReinforcementLearningModel(IFreqaiModel): | ||||
|  | ||||
|         return model | ||||
|  | ||||
|     def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test): | ||||
|     def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test, dk): | ||||
|         """ | ||||
|         User overrides this in their prediction model if they are custom a MyRLEnv. Othwerwise | ||||
|         leaving this will default to Base5ActEnv | ||||
|         User overrides this as shown here if they are using a custom MyRLEnv | ||||
|         """ | ||||
|         train_df = data_dictionary["train_features"] | ||||
|         test_df = data_dictionary["test_features"] | ||||
|         eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df) | ||||
|  | ||||
|         # environments | ||||
|         if not self.train_env: | ||||
| @@ -90,11 +100,17 @@ class BaseReinforcementLearningModel(IFreqaiModel): | ||||
|             self.eval_env = Monitor(MyRLEnv(df=test_df, prices=prices_test, | ||||
|                                     window_size=self.CONV_WIDTH, | ||||
|                                     reward_kwargs=self.reward_params), ".") | ||||
|             self.eval_callback = EvalCallback(self.eval_env, deterministic=True, | ||||
|                                               render=False, eval_freq=eval_freq, | ||||
|                                               best_model_save_path=dk.data_path) | ||||
|         else: | ||||
|             self.train_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params) | ||||
|             self.eval_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params) | ||||
|             self.train_env.reset() | ||||
|             self.eval_env.reset() | ||||
|             self.train_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params) | ||||
|             self.eval_env.reset_env(test_df, prices_test, self.CONV_WIDTH, self.reward_params) | ||||
|             self.eval_callback.__init__(self.eval_env, deterministic=True, | ||||
|                                         render=False, eval_freq=eval_freq, | ||||
|                                         best_model_save_path=dk.data_path) | ||||
|  | ||||
|     @abstractmethod | ||||
|     def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen): | ||||
| @@ -206,16 +222,28 @@ class BaseReinforcementLearningModel(IFreqaiModel): | ||||
|     # all the other existing fit() functions to include dk argument. For now we instantiate and | ||||
|     # leave it. | ||||
|     def fit(self, data_dictionary: Dict[str, Any], pair: str = '') -> Any: | ||||
|         """ | ||||
|         Most regressors use the same function names and arguments e.g. user | ||||
|         can drop in LGBMRegressor in place of CatBoostRegressor and all data | ||||
|         management will be properly handled by Freqai. | ||||
|         :param data_dictionary: Dict = the dictionary constructed by DataHandler to hold | ||||
|                                 all the training and test data/labels. | ||||
|         """ | ||||
|  | ||||
|         return | ||||
|  | ||||
| def make_env(env_id: str, rank: int, seed: int, train_df, price, | ||||
|              reward_params, window_size, monitor=False) -> Callable: | ||||
|     """ | ||||
|     Utility function for multiprocessed env. | ||||
|  | ||||
|     :param env_id: (str) the environment ID | ||||
|     :param num_env: (int) the number of environments you wish to have in subprocesses | ||||
|     :param seed: (int) the initial seed for RNG | ||||
|     :param rank: (int) index of the subprocess | ||||
|     :return: (Callable) | ||||
|     """ | ||||
|     def _init() -> gym.Env: | ||||
|  | ||||
|         env = MyRLEnv(df=train_df, prices=price, window_size=window_size, | ||||
|                       reward_kwargs=reward_params, id=env_id, seed=seed + rank) | ||||
|         if monitor: | ||||
|             env = Monitor(env, ".") | ||||
|         return env | ||||
|     set_random_seed(seed) | ||||
|     return _init | ||||
|  | ||||
| class MyRLEnv(Base5ActionRLEnv): | ||||
|     """ | ||||
| @@ -229,24 +257,24 @@ class MyRLEnv(Base5ActionRLEnv): | ||||
|             return 0. | ||||
|  | ||||
|         # close long | ||||
|         if action == Actions.Long_sell.value and self._position == Positions.Long: | ||||
|         if action == Actions.Long_exit.value and self._position == Positions.Long: | ||||
|             last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|             current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) | ||||
|             return float(np.log(current_price) - np.log(last_trade_price)) | ||||
|  | ||||
|         if action == Actions.Long_sell.value and self._position == Positions.Long: | ||||
|         if action == Actions.Long_exit.value and self._position == Positions.Long: | ||||
|             if self.close_trade_profit[-1] > self.profit_aim * self.rr: | ||||
|                 last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|                 current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) | ||||
|                 return float((np.log(current_price) - np.log(last_trade_price)) * 2) | ||||
|  | ||||
|         # close short | ||||
|         if action == Actions.Short_buy.value and self._position == Positions.Short: | ||||
|         if action == Actions.Short_exit.value and self._position == Positions.Short: | ||||
|             last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|             current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) | ||||
|             return float(np.log(last_trade_price) - np.log(current_price)) | ||||
|  | ||||
|         if action == Actions.Short_buy.value and self._position == Positions.Short: | ||||
|         if action == Actions.Short_exit.value and self._position == Positions.Short: | ||||
|             if self.close_trade_profit[-1] > self.profit_aim * self.rr: | ||||
|                 last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|                 current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) | ||||
|   | ||||
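make_env above returns a picklable initializer rather than an environment, which is the form SubprocVecEnv expects when spawning worker processes; ReinforcementLearner_multiproc later in this commit uses it exactly that way. A condensed sketch, assuming the dataframes and reward parameters from set_train_and_eval_environments are in scope and with an illustrative worker count:

    from stable_baselines3.common.vec_env import SubprocVecEnv

    num_cpu = 4  # illustrative; the multiproc model derives this from data_kitchen_thread_count
    train_env = SubprocVecEnv([
        make_env("train_env", i, seed=1, train_df=train_df, price=prices_train,
                 reward_params=reward_params, window_size=10)
        for i in range(num_cpu)
    ])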
| @@ -471,12 +471,11 @@ class FreqaiDataDrawer: | ||||
|         elif model_type == 'keras': | ||||
|             from tensorflow import keras | ||||
|             model = keras.models.load_model(dk.data_path / f"{dk.model_filename}_model.h5") | ||||
|         elif model_type == 'stable_baselines_ppo': | ||||
|             from stable_baselines3.ppo.ppo import PPO | ||||
|             model = PPO.load(dk.data_path / f"{dk.model_filename}_model") | ||||
|         elif model_type == 'stable_baselines_dqn': | ||||
|             from stable_baselines3 import DQN | ||||
|             model = DQN.load(dk.data_path / f"{dk.model_filename}_model") | ||||
|         elif model_type == 'stable_baselines': | ||||
|             mod = __import__('stable_baselines3', fromlist=[ | ||||
|                              self.freqai_info['rl_config']['model_type']]) | ||||
|             MODELCLASS = getattr(mod, self.freqai_info['rl_config']['model_type']) | ||||
|             model = MODELCLASS.load(dk.data_path / f"{dk.model_filename}_model") | ||||
|  | ||||
|         if Path(dk.data_path / f"{dk.model_filename}_svm_model.joblib").is_file(): | ||||
|             dk.svm_model = load(dk.data_path / f"{dk.model_filename}_svm_model.joblib") | ||||
|   | ||||
freqtrade/freqai/prediction_models/ReinforcementLearner.py (new file, 82 lines added)
| @@ -0,0 +1,82 @@ | ||||
| import logging | ||||
| from typing import Any, Dict  # , Tuple | ||||
|  | ||||
| # import numpy.typing as npt | ||||
| import torch as th | ||||
| import numpy as np | ||||
| from freqtrade.freqai.data_kitchen import FreqaiDataKitchen | ||||
| from freqtrade.freqai.RL.Base5ActionRLEnv import Actions, Base5ActionRLEnv, Positions | ||||
| from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel | ||||
| from pathlib import Path | ||||
|  | ||||
| logger = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| class ReinforcementLearner(BaseReinforcementLearningModel): | ||||
|     """ | ||||
|     User created Reinforcement Learning Model prediction model. | ||||
|     """ | ||||
|  | ||||
|     def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen): | ||||
|  | ||||
|         train_df = data_dictionary["train_features"] | ||||
|         total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df) | ||||
|  | ||||
|         policy_kwargs = dict(activation_fn=th.nn.ReLU, | ||||
|                              net_arch=[256, 256, 128]) | ||||
|  | ||||
|         model = self.MODELCLASS(self.policy_type, self.train_env, policy_kwargs=policy_kwargs, | ||||
|                                 tensorboard_log=Path(dk.data_path / "tensorboard"), | ||||
|                                 **self.freqai_info['model_training_parameters'] | ||||
|                                 ) | ||||
|  | ||||
|         model.learn( | ||||
|             total_timesteps=int(total_timesteps), | ||||
|             callback=self.eval_callback | ||||
|         ) | ||||
|  | ||||
|         if Path(dk.data_path / "best_model.zip").is_file(): | ||||
|             logger.info('Callback found a best model.') | ||||
|             best_model = self.MODELCLASS.load(dk.data_path / "best_model") | ||||
|             return best_model | ||||
|  | ||||
|         logger.info("Couldn't find best model, using final model instead.") | ||||
|  | ||||
|         return model | ||||
|  | ||||
|  | ||||
| class MyRLEnv(Base5ActionRLEnv): | ||||
|     """ | ||||
|     User can modify any part of the environment by overriding base | ||||
|     functions | ||||
|     """ | ||||
|     def calculate_reward(self, action): | ||||
|  | ||||
|         if self._last_trade_tick is None: | ||||
|             return 0. | ||||
|  | ||||
|         # close long | ||||
|         if action == Actions.Long_exit.value and self._position == Positions.Long: | ||||
|             last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|             current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) | ||||
|             return float(np.log(current_price) - np.log(last_trade_price)) | ||||
|  | ||||
|         if action == Actions.Long_exit.value and self._position == Positions.Long: | ||||
|             if self.close_trade_profit[-1] > self.profit_aim * self.rr: | ||||
|                 last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|                 current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) | ||||
|                 return float((np.log(current_price) - np.log(last_trade_price)) * 2) | ||||
|  | ||||
|         # close short | ||||
|         if action == Actions.Short_exit.value and self._position == Positions.Short: | ||||
|             last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|             current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) | ||||
|             return float(np.log(last_trade_price) - np.log(current_price)) | ||||
|  | ||||
|         if action == Actions.Short_exit.value and self._position == Positions.Short: | ||||
|             if self.close_trade_profit[-1] > self.profit_aim * self.rr: | ||||
|                 last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|                 current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) | ||||
|                 return float((np.log(last_trade_price) - np.log(current_price)) * 2) | ||||
|  | ||||
|         return 0. | ||||
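Because fit_rl expands **self.freqai_info['model_training_parameters'] into the agent constructor, the keys of that config block must be valid keyword arguments of the chosen stable_baselines3 class. A hypothetical block for the PPO example (values are placeholders, not tuned recommendations):

    # assumed contents of "model_training_parameters" in the freqai config
    model_training_parameters = {
        "learning_rate": 3e-4,  # accepted by stable_baselines3 PPO
        "gamma": 0.9,
        "verbose": 1,           # matches the "verbose": 1 shown in the config hunk above
    }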
| @@ -1,17 +1,59 @@ | ||||
| from typing import Any, Dict, List, Optional, Tuple, Type, Union | ||||
|  | ||||
| import gym | ||||
| import torch | ||||
| import logging | ||||
| import torch as th | ||||
| from typing import Any, Dict, List, Optional, Tuple, Type, Union | ||||
| from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel | ||||
| from stable_baselines3 import DQN | ||||
| from stable_baselines3.common.buffers import ReplayBuffer | ||||
| from stable_baselines3.common.policies import BasePolicy | ||||
| from stable_baselines3.common.torch_layers import (BaseFeaturesExtractor, | ||||
|                                                    FlattenExtractor) | ||||
| from stable_baselines3.common.type_aliases import GymEnv, Schedule | ||||
| from freqtrade.freqai.data_kitchen import FreqaiDataKitchen | ||||
| from pathlib import Path | ||||
| from stable_baselines3.dqn.policies import (CnnPolicy, DQNPolicy, MlpPolicy, | ||||
|                                             QNetwork) | ||||
| from torch import nn | ||||
| import gym | ||||
| from stable_baselines3.common.torch_layers import (BaseFeaturesExtractor, | ||||
|                                                    FlattenExtractor) | ||||
| from stable_baselines3.common.type_aliases import GymEnv, Schedule | ||||
| from stable_baselines3.common.policies import BasePolicy | ||||
|  | ||||
| logger = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| class ReinforcementLearnerCustomAgent(BaseReinforcementLearningModel): | ||||
|     """ | ||||
|     User can customize agent by defining the class and using it directly. | ||||
|     Here the example is "TDQN" | ||||
|     """ | ||||
|  | ||||
|     def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen): | ||||
|  | ||||
|         train_df = data_dictionary["train_features"] | ||||
|         total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df) | ||||
|  | ||||
|         policy_kwargs = dict(activation_fn=th.nn.ReLU, | ||||
|                              net_arch=[256, 256, 128]) | ||||
|  | ||||
|         # TDQN is a custom agent defined below | ||||
|         model = TDQN(self.policy_type, self.train_env, | ||||
|                      tensorboard_log=Path(dk.data_path / "tensorboard"), | ||||
|                      policy_kwargs=policy_kwargs, | ||||
|                      **self.freqai_info['model_training_parameters'] | ||||
|                      ) | ||||
|  | ||||
|         model.learn( | ||||
|             total_timesteps=int(total_timesteps), | ||||
|             callback=self.eval_callback | ||||
|         ) | ||||
|  | ||||
|         if Path(dk.data_path / "best_model.zip").is_file(): | ||||
|             logger.info('Callback found a best model.') | ||||
|             best_model = self.MODELCLASS.load(dk.data_path / "best_model") | ||||
|             return best_model | ||||
|  | ||||
|         logger.info("Couldn't find best model, using final model instead.") | ||||
|  | ||||
|         return model | ||||
|  | ||||
| # User creates their custom agent and networks as shown below | ||||
|  | ||||
|  | ||||
| def create_mlp_( | ||||
| @@ -72,7 +114,7 @@ class TDQNetwork(QNetwork): | ||||
|  | ||||
|     def init_weights(self, m): | ||||
|         if type(m) == nn.Linear: | ||||
|             torch.nn.init.kaiming_uniform_(m.weight) | ||||
|             th.nn.init.kaiming_uniform_(m.weight) | ||||
|  | ||||
|  | ||||
| class TDQNPolicy(DQNPolicy): | ||||
| @@ -175,7 +217,7 @@ class TDQN(DQN): | ||||
|         exploration_initial_eps: float = 1.0, | ||||
|         exploration_final_eps: float = 0.05, | ||||
|         max_grad_norm: float = 10, | ||||
|         tensorboard_log: Optional[str] = None, | ||||
|         tensorboard_log: Optional[Path] = None, | ||||
|         create_eval_env: bool = False, | ||||
|         policy_kwargs: Optional[Dict[str, Any]] = None, | ||||
|         verbose: int = 1, | ||||
| @@ -0,0 +1,84 @@ | ||||
| import logging | ||||
| from typing import Any, Dict  # , Tuple | ||||
|  | ||||
| # import numpy.typing as npt | ||||
| import torch as th | ||||
| from stable_baselines3.common.callbacks import EvalCallback | ||||
| from stable_baselines3.common.vec_env import SubprocVecEnv | ||||
| from freqtrade.freqai.RL.BaseReinforcementLearningModel import (BaseReinforcementLearningModel, | ||||
|                                                                 make_env) | ||||
| from freqtrade.freqai.data_kitchen import FreqaiDataKitchen | ||||
|  | ||||
| from pathlib import Path | ||||
|  | ||||
| logger = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| class ReinforcementLearner_multiproc(BaseReinforcementLearningModel): | ||||
|     """ | ||||
|     User created Reinforcement Learning Model prediction model. | ||||
|     """ | ||||
|  | ||||
|     def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen): | ||||
|  | ||||
|         train_df = data_dictionary["train_features"] | ||||
|         total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df) | ||||
|  | ||||
|         # model arch | ||||
|         policy_kwargs = dict(activation_fn=th.nn.ReLU, | ||||
|                              net_arch=[512, 512, 512]) | ||||
|  | ||||
|         model = self.MODELCLASS(self.policy_type, self.train_env, policy_kwargs=policy_kwargs, | ||||
|                                 tensorboard_log=Path(dk.data_path / "tensorboard"), | ||||
|                                 **self.freqai_info['model_training_parameters'] | ||||
|                                 ) | ||||
|  | ||||
|         model.learn( | ||||
|             total_timesteps=int(total_timesteps), | ||||
|             callback=self.eval_callback | ||||
|         ) | ||||
|  | ||||
|         if Path(dk.data_path / "best_model.zip").is_file(): | ||||
|             logger.info('Callback found a best model.') | ||||
|             best_model = self.MODELCLASS.load(dk.data_path / "best_model") | ||||
|             return best_model | ||||
|  | ||||
|         logger.info("Couldn't find best model, using final model instead.") | ||||
|  | ||||
|         return model | ||||
|  | ||||
|     def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test, dk): | ||||
|         """ | ||||
|         If user has particular environment configuration needs, they can do that by | ||||
|         overriding this function. In the present case, the user wants to set up training | ||||
|         environments for multiple workers. | ||||
|         """ | ||||
|         train_df = data_dictionary["train_features"] | ||||
|         test_df = data_dictionary["test_features"] | ||||
|         eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df) | ||||
|  | ||||
|         # environments | ||||
|         if not self.train_env: | ||||
|             env_id = "train_env" | ||||
|             num_cpu = int(self.freqai_info["data_kitchen_thread_count"] / 2) | ||||
|             self.train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train, | ||||
|                                             self.reward_params, self.CONV_WIDTH) for i | ||||
|                                             in range(num_cpu)]) | ||||
|  | ||||
|             eval_env_id = 'eval_env' | ||||
|             self.eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test, | ||||
|                                            self.reward_params, self.CONV_WIDTH, monitor=True) for i | ||||
|                                            in range(num_cpu)]) | ||||
|             self.eval_callback = EvalCallback(self.eval_env, deterministic=True, | ||||
|                                               render=False, eval_freq=eval_freq, | ||||
|                                               best_model_save_path=dk.data_path) | ||||
|         else: | ||||
|             self.train_env.env_method('reset') | ||||
|             self.eval_env.env_method('reset') | ||||
|             self.train_env.env_method('reset_env', train_df, prices_train, | ||||
|                                       self.CONV_WIDTH, self.reward_params) | ||||
|             self.eval_env.env_method('reset_env', train_df, prices_train, | ||||
|                                      self.CONV_WIDTH, self.reward_params) | ||||
|             self.eval_callback.__init__(self.eval_env, deterministic=True, | ||||
|                                         render=False, eval_freq=eval_freq, | ||||
|                                         best_model_save_path=dk.data_path) | ||||
| @@ -1,104 +0,0 @@ | ||||
| import gc | ||||
| import logging | ||||
| from typing import Any, Dict  # , Tuple | ||||
|  | ||||
| import numpy as np | ||||
| # import numpy.typing as npt | ||||
| import torch as th | ||||
| from stable_baselines3 import PPO | ||||
| from stable_baselines3.common.callbacks import EvalCallback | ||||
| from stable_baselines3.common.monitor import Monitor | ||||
|  | ||||
| from freqtrade.freqai.data_kitchen import FreqaiDataKitchen | ||||
| from freqtrade.freqai.RL.Base3ActionRLEnv import Actions, Base3ActionRLEnv, Positions | ||||
| from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel | ||||
|  | ||||
|  | ||||
| logger = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| class ReinforcementLearningPPO(BaseReinforcementLearningModel): | ||||
|     """ | ||||
|     User created Reinforcement Learning Model prediction model. | ||||
|     """ | ||||
|  | ||||
|     def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen): | ||||
|  | ||||
|         train_df = data_dictionary["train_features"] | ||||
|         test_df = data_dictionary["test_features"] | ||||
|         eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df) | ||||
|         total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df) | ||||
|  | ||||
|         path = dk.data_path | ||||
|         eval_callback = EvalCallback(self.eval_env, best_model_save_path=f"{path}/", | ||||
|                                      log_path=f"{path}/ppo/logs/", eval_freq=int(eval_freq), | ||||
|                                      deterministic=True, render=False) | ||||
|  | ||||
|         # model arch | ||||
|         policy_kwargs = dict(activation_fn=th.nn.ReLU, | ||||
|                              net_arch=[256, 256, 128]) | ||||
|  | ||||
|         model = PPO('MlpPolicy', self.train_env, policy_kwargs=policy_kwargs, | ||||
|                     tensorboard_log=f"{path}/ppo/tensorboard/", | ||||
|                     **self.freqai_info['model_training_parameters'] | ||||
|                     ) | ||||
|  | ||||
|         model.learn( | ||||
|             total_timesteps=int(total_timesteps), | ||||
|             callback=eval_callback | ||||
|         ) | ||||
|  | ||||
|         del model | ||||
|         best_model = PPO.load(dk.data_path / "best_model") | ||||
|  | ||||
|         print('Training finished!') | ||||
|         gc.collect() | ||||
|  | ||||
|         return best_model | ||||
|  | ||||
|     def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test): | ||||
|         """ | ||||
|         User overrides this as shown here if they are using a custom MyRLEnv | ||||
|         """ | ||||
|         train_df = data_dictionary["train_features"] | ||||
|         test_df = data_dictionary["test_features"] | ||||
|  | ||||
|         # environments | ||||
|         if not self.train_env: | ||||
|             self.train_env = MyRLEnv(df=train_df, prices=prices_train, window_size=self.CONV_WIDTH, | ||||
|                                      reward_kwargs=self.reward_params) | ||||
|             self.eval_env = Monitor(MyRLEnv(df=test_df, prices=prices_test, | ||||
|                                     window_size=self.CONV_WIDTH, | ||||
|                                     reward_kwargs=self.reward_params), ".") | ||||
|         else: | ||||
|             self.train_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params) | ||||
|             self.eval_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params) | ||||
|             self.train_env.reset() | ||||
|             self.eval_env.reset() | ||||
|  | ||||
|  | ||||
| class MyRLEnv(Base3ActionRLEnv): | ||||
|     """ | ||||
|     User can override any function in BaseRLEnv and gym.Env | ||||
|     """ | ||||
|  | ||||
|     def calculate_reward(self, action): | ||||
|  | ||||
|         if self._last_trade_tick is None: | ||||
|             return 0. | ||||
|  | ||||
|         # close long | ||||
|         if (action == Actions.Short.value or | ||||
|                 action == Actions.Neutral.value) and self._position == Positions.Long: | ||||
|             last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|             current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) | ||||
|             return float(np.log(current_price) - np.log(last_trade_price)) | ||||
|  | ||||
|         # close short | ||||
|         if (action == Actions.Long.value or | ||||
|                 action == Actions.Neutral.value) and self._position == Positions.Short: | ||||
|             last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|             current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) | ||||
|             return float(np.log(last_trade_price) - np.log(current_price)) | ||||
|  | ||||
|         return 0. | ||||
| @@ -1,132 +0,0 @@ | ||||
| import logging | ||||
| from typing import Any, Dict  # , Tuple | ||||
|  | ||||
| import numpy as np | ||||
| # import numpy.typing as npt | ||||
| import torch as th | ||||
| from stable_baselines3.common.monitor import Monitor | ||||
| from typing import Callable | ||||
| from stable_baselines3 import PPO | ||||
| from stable_baselines3.common.callbacks import EvalCallback | ||||
| from stable_baselines3.common.vec_env import SubprocVecEnv | ||||
| from stable_baselines3.common.utils import set_random_seed | ||||
| from freqtrade.freqai.RL.Base3ActionRLEnv import Base3ActionRLEnv, Actions, Positions | ||||
| from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel | ||||
| from freqtrade.freqai.data_kitchen import FreqaiDataKitchen | ||||
| import gym | ||||
|  | ||||
| logger = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| def make_env(env_id: str, rank: int, seed: int, train_df, price, | ||||
|              reward_params, window_size, monitor=False) -> Callable: | ||||
|     """ | ||||
|     Utility function for multiprocessed env. | ||||
|  | ||||
|     :param env_id: (str) the environment ID | ||||
|     :param num_env: (int) the number of environment you wish to have in subprocesses | ||||
|     :param seed: (int) the inital seed for RNG | ||||
|     :param rank: (int) index of the subprocess | ||||
|     :return: (Callable) | ||||
|     """ | ||||
|     def _init() -> gym.Env: | ||||
|  | ||||
|         env = MyRLEnv(df=train_df, prices=price, window_size=window_size, | ||||
|                       reward_kwargs=reward_params, id=env_id, seed=seed + rank) | ||||
|         if monitor: | ||||
|             env = Monitor(env, ".") | ||||
|         return env | ||||
|     set_random_seed(seed) | ||||
|     return _init | ||||
|  | ||||
|  | ||||
| class ReinforcementLearningPPO_multiproc(BaseReinforcementLearningModel): | ||||
|     """ | ||||
|     User created Reinforcement Learning Model prediction model. | ||||
|     """ | ||||
|  | ||||
|     def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen): | ||||
|  | ||||
|         train_df = data_dictionary["train_features"] | ||||
|         test_df = data_dictionary["test_features"] | ||||
|         eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df) | ||||
|         total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df) | ||||
|  | ||||
|         path = dk.data_path | ||||
|         eval_callback = EvalCallback(self.eval_env, best_model_save_path=f"{path}/", | ||||
|                                      log_path=f"{path}/ppo/logs/", eval_freq=int(eval_freq), | ||||
|                                      deterministic=True, render=False) | ||||
|  | ||||
|         # model arch | ||||
|         policy_kwargs = dict(activation_fn=th.nn.ReLU, | ||||
|                              net_arch=[512, 512, 512]) | ||||
|  | ||||
|         model = PPO('MlpPolicy', self.train_env, policy_kwargs=policy_kwargs, | ||||
|                     tensorboard_log=f"{path}/ppo/tensorboard/", | ||||
|                     **self.freqai_info['model_training_parameters'] | ||||
|                     ) | ||||
|  | ||||
|         model.learn( | ||||
|             total_timesteps=int(total_timesteps), | ||||
|             callback=eval_callback | ||||
|         ) | ||||
|  | ||||
|         best_model = PPO.load(dk.data_path / "best_model") | ||||
|         print('Training finished!') | ||||
|  | ||||
|         return best_model | ||||
|  | ||||
|     def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test): | ||||
|         """ | ||||
|         User overrides this in their prediction model if they are custom a MyRLEnv. Othwerwise | ||||
|         leaving this will default to Base5ActEnv | ||||
|         """ | ||||
|         train_df = data_dictionary["train_features"] | ||||
|         test_df = data_dictionary["test_features"] | ||||
|  | ||||
|         # environments | ||||
|         if not self.train_env: | ||||
|             env_id = "train_env" | ||||
|             num_cpu = int(self.freqai_info["data_kitchen_thread_count"] / 2) | ||||
|             self.train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train, | ||||
|                                             self.reward_params, self.CONV_WIDTH) for i | ||||
|                                             in range(num_cpu)]) | ||||
|  | ||||
|             eval_env_id = 'eval_env' | ||||
|             self.eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test, | ||||
|                                            self.reward_params, self.CONV_WIDTH, monitor=True) for i | ||||
|                                            in range(num_cpu)]) | ||||
|         else: | ||||
|             self.train_env.env_method('reset_env', train_df, prices_train, | ||||
|                                       self.CONV_WIDTH, self.reward_params) | ||||
|             self.eval_env.env_method('reset_env', train_df, prices_train, | ||||
|                                      self.CONV_WIDTH, self.reward_params) | ||||
|             self.train_env.env_method('reset') | ||||
|             self.eval_env.env_method('reset') | ||||
|  | ||||
|  | ||||
| class MyRLEnv(Base3ActionRLEnv): | ||||
|     """ | ||||
|     User can override any function in BaseRLEnv and gym.Env | ||||
|     """ | ||||
|  | ||||
|     def calculate_reward(self, action): | ||||
|  | ||||
|         if self._last_trade_tick is None: | ||||
|             return 0. | ||||
|  | ||||
|         # close long | ||||
|         if (action == Actions.Short.value or | ||||
|                 action == Actions.Neutral.value) and self._position == Positions.Long: | ||||
|             last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|             current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) | ||||
|             return float(np.log(current_price) - np.log(last_trade_price)) | ||||
|  | ||||
|         # close short | ||||
|         if (action == Actions.Long.value or | ||||
|                 action == Actions.Neutral.value) and self._position == Positions.Short: | ||||
|             last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|             current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) | ||||
|             return float(np.log(last_trade_price) - np.log(current_price)) | ||||
|  | ||||
|         return 0. | ||||
| @@ -1,115 +0,0 @@ | ||||
| import logging | ||||
| from typing import Any, Dict  # Optional | ||||
| import torch as th | ||||
| from stable_baselines3.common.callbacks import EvalCallback | ||||
| from stable_baselines3.common.monitor import Monitor | ||||
| from freqtrade.freqai.RL.Base5ActionRLEnv import Base5ActionRLEnv, Actions, Positions | ||||
| from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel | ||||
| from freqtrade.freqai.RL.TDQNagent import TDQN | ||||
| from stable_baselines3 import DQN | ||||
| from stable_baselines3.common.buffers import ReplayBuffer | ||||
| import numpy as np | ||||
| import gc | ||||
| from freqtrade.freqai.data_kitchen import FreqaiDataKitchen | ||||
|  | ||||
| logger = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| class ReinforcementLearningTDQN(BaseReinforcementLearningModel): | ||||
|     """ | ||||
|     User created Reinforcement Learning Model prediction model. | ||||
|     """ | ||||
|  | ||||
|     def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen): | ||||
|  | ||||
|         train_df = data_dictionary["train_features"] | ||||
|         test_df = data_dictionary["test_features"] | ||||
|         eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df) | ||||
|         total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df) | ||||
|  | ||||
|         path = dk.data_path | ||||
|         eval_callback = EvalCallback(self.eval_env, best_model_save_path=f"{path}/", | ||||
|                                      log_path=f"{path}/tdqn/logs/", eval_freq=int(eval_freq), | ||||
|                                      deterministic=True, render=False) | ||||
|  | ||||
|         # model arch | ||||
|         policy_kwargs = dict(activation_fn=th.nn.ReLU, | ||||
|                              net_arch=[256, 256, 128]) | ||||
|  | ||||
|         model = TDQN('TMultiInputPolicy', self.train_env, | ||||
|                      tensorboard_log=f"{path}/tdqn/tensorboard/", | ||||
|                      policy_kwargs=policy_kwargs, | ||||
|                      replay_buffer_class=ReplayBuffer, | ||||
|                      **self.freqai_info['model_training_parameters'] | ||||
|                      ) | ||||
|  | ||||
|         model.learn( | ||||
|             total_timesteps=int(total_timesteps), | ||||
|             callback=eval_callback | ||||
|         ) | ||||
|  | ||||
|         del model | ||||
|         best_model = DQN.load(dk.data_path / "best_model") | ||||
|  | ||||
|         print('Training finished!') | ||||
|         gc.collect() | ||||
|         return best_model | ||||
|  | ||||
|     def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test): | ||||
|         """ | ||||
|         User overrides this as shown here if they are using a custom MyRLEnv | ||||
|         """ | ||||
|         train_df = data_dictionary["train_features"] | ||||
|         test_df = data_dictionary["test_features"] | ||||
|  | ||||
|         # environments | ||||
|         if not self.train_env: | ||||
|             self.train_env = MyRLEnv(df=train_df, prices=prices_train, window_size=self.CONV_WIDTH, | ||||
|                                      reward_kwargs=self.reward_params) | ||||
|             self.eval_env = Monitor(MyRLEnv(df=test_df, prices=prices_test, | ||||
|                                     window_size=self.CONV_WIDTH, | ||||
|                                     reward_kwargs=self.reward_params), ".") | ||||
|         else: | ||||
|             self.train_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params) | ||||
|             self.eval_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params) | ||||
|             self.train_env.reset() | ||||
|             self.eval_env.reset() | ||||
|  | ||||
|  | ||||
| # User can inherit and customize 5 action environment | ||||
| class MyRLEnv(Base5ActionRLEnv): | ||||
|     """ | ||||
|     User can override any function in BaseRLEnv and gym.Env. Here the user | ||||
|     Adds 5 actions. | ||||
|     """ | ||||
|  | ||||
|     def calculate_reward(self, action): | ||||
|  | ||||
|         if self._last_trade_tick is None: | ||||
|             return 0. | ||||
|  | ||||
|         # close long | ||||
|         if action == Actions.Long_sell.value and self._position == Positions.Long: | ||||
|             last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|             current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) | ||||
|             return float(np.log(current_price) - np.log(last_trade_price)) | ||||
|  | ||||
|         if action == Actions.Long_sell.value and self._position == Positions.Long: | ||||
|             if self.close_trade_profit[-1] > self.profit_aim * self.rr: | ||||
|                 last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|                 current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) | ||||
|                 return float((np.log(current_price) - np.log(last_trade_price)) * 2) | ||||
|  | ||||
|         # close short | ||||
|         if action == Actions.Short_buy.value and self._position == Positions.Short: | ||||
|             last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|             current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) | ||||
|             return float(np.log(last_trade_price) - np.log(current_price)) | ||||
|  | ||||
|         if action == Actions.Short_buy.value and self._position == Positions.Short: | ||||
|             if self.close_trade_profit[-1] > self.profit_aim * self.rr: | ||||
|                 last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|                 current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) | ||||
|                 return float((np.log(last_trade_price) - np.log(current_price)) * 2) | ||||
|  | ||||
|         return 0. | ||||
| @@ -1,148 +0,0 @@ | ||||
| import logging | ||||
| from typing import Any, Dict  # Optional | ||||
| import torch as th | ||||
| import numpy as np | ||||
| import gym | ||||
| from typing import Callable | ||||
| from stable_baselines3.common.callbacks import EvalCallback | ||||
| # EvalCallback , StopTrainingOnNoModelImprovement, StopTrainingOnRewardThreshold | ||||
| from stable_baselines3.common.monitor import Monitor | ||||
| from stable_baselines3.common.vec_env import SubprocVecEnv | ||||
| from stable_baselines3.common.utils import set_random_seed | ||||
| from stable_baselines3 import DQN | ||||
| from freqtrade.freqai.RL.Base5ActionRLEnv import Base5ActionRLEnv, Actions, Positions | ||||
| from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel | ||||
| from freqtrade.freqai.RL.TDQNagent import TDQN | ||||
| from stable_baselines3.common.buffers import ReplayBuffer | ||||
| from freqtrade.freqai.data_kitchen import FreqaiDataKitchen | ||||
|  | ||||
| logger = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| def make_env(env_id: str, rank: int, seed: int, train_df, price, | ||||
|              reward_params, window_size, monitor=False) -> Callable: | ||||
|     """ | ||||
|     Utility function for multiprocessed env. | ||||
|  | ||||
|     :param env_id: (str) the environment ID | ||||
|     :param num_env: (int) the number of environment you wish to have in subprocesses | ||||
|     :param seed: (int) the inital seed for RNG | ||||
|     :param rank: (int) index of the subprocess | ||||
|     :return: (Callable) | ||||
|     """ | ||||
|     def _init() -> gym.Env: | ||||
|  | ||||
|         env = MyRLEnv(df=train_df, prices=price, window_size=window_size, | ||||
|                       reward_kwargs=reward_params, id=env_id, seed=seed + rank) | ||||
|         if monitor: | ||||
|             env = Monitor(env, ".") | ||||
|         return env | ||||
|     set_random_seed(seed) | ||||
|     return _init | ||||
|  | ||||
|  | ||||
| class ReinforcementLearningTDQN_multiproc(BaseReinforcementLearningModel): | ||||
|     """ | ||||
|     User created Reinforcement Learning Model prediction model. | ||||
|     """ | ||||
|  | ||||
|     def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen): | ||||
|  | ||||
|         train_df = data_dictionary["train_features"] | ||||
|         test_df = data_dictionary["test_features"] | ||||
|         eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df) | ||||
|         total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df) | ||||
|  | ||||
|         path = dk.data_path | ||||
|  | ||||
|         eval_callback = EvalCallback(self.eval_env, best_model_save_path=f"{path}/", | ||||
|                                      log_path=f"{path}/tdqn/logs/", eval_freq=int(eval_freq), | ||||
|                                      deterministic=True, render=False) | ||||
|         # model arch | ||||
|         policy_kwargs = dict(activation_fn=th.nn.ReLU, | ||||
|                              net_arch=[512, 512, 512]) | ||||
|  | ||||
|         model = TDQN('TMultiInputPolicy', self.train_env, | ||||
|                      policy_kwargs=policy_kwargs, | ||||
|                      tensorboard_log=f"{path}/tdqn/tensorboard/", | ||||
|                      replay_buffer_class=ReplayBuffer, | ||||
|                      **self.freqai_info['model_training_parameters'] | ||||
|                      ) | ||||
|  | ||||
|         model.learn( | ||||
|             total_timesteps=int(total_timesteps), | ||||
|             callback=eval_callback | ||||
|         ) | ||||
|  | ||||
|         best_model = DQN.load(dk.data_path / "best_model.zip") | ||||
|         print('Training finished!') | ||||
|  | ||||
|         return best_model | ||||
|  | ||||
|     def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test): | ||||
|         """ | ||||
|         User overrides this in their prediction model if they are custom a MyRLEnv. Othwerwise | ||||
|         leaving this will default to Base5ActEnv | ||||
|         """ | ||||
|         train_df = data_dictionary["train_features"] | ||||
|         test_df = data_dictionary["test_features"] | ||||
|  | ||||
|         # environments | ||||
|         if not self.train_env: | ||||
|             env_id = "train_env" | ||||
|             num_cpu = int(self.freqai_info["data_kitchen_thread_count"] / 2) | ||||
|             self.train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train, | ||||
|                                             self.reward_params, self.CONV_WIDTH) for i | ||||
|                                             in range(num_cpu)]) | ||||
|  | ||||
|             eval_env_id = 'eval_env' | ||||
|             self.eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test, | ||||
|                                            self.reward_params, self.CONV_WIDTH, monitor=True) for i | ||||
|                                            in range(num_cpu)]) | ||||
|         else: | ||||
|             self.train_env.env_method('reset_env', train_df, prices_train, | ||||
|                                       self.CONV_WIDTH, self.reward_params) | ||||
|             self.eval_env.env_method('reset_env', train_df, prices_train, | ||||
|                                      self.CONV_WIDTH, self.reward_params) | ||||
|             self.train_env.env_method('reset') | ||||
|             self.eval_env.env_method('reset') | ||||
|  | ||||
| # User can inherit and customize 5 action environment | ||||
|  | ||||
|  | ||||
| class MyRLEnv(Base5ActionRLEnv): | ||||
|     """ | ||||
|     User can override any function in BaseRLEnv and gym.Env. Here the user | ||||
|     Adds 5 actions. | ||||
|     """ | ||||
|  | ||||
|     def calculate_reward(self, action): | ||||
|  | ||||
|         if self._last_trade_tick is None: | ||||
|             return 0. | ||||
|  | ||||
|         # close long | ||||
|         if action == Actions.Long_sell.value and self._position == Positions.Long: | ||||
|             last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|             current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) | ||||
|             return float(np.log(current_price) - np.log(last_trade_price)) | ||||
|  | ||||
|         if action == Actions.Long_sell.value and self._position == Positions.Long: | ||||
|             if self.close_trade_profit[-1] > self.profit_aim * self.rr: | ||||
|                 last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|                 current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) | ||||
|                 return float((np.log(current_price) - np.log(last_trade_price)) * 2) | ||||
|  | ||||
|         # close short | ||||
|         if action == Actions.Short_buy.value and self._position == Positions.Short: | ||||
|             last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|             current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) | ||||
|             return float(np.log(last_trade_price) - np.log(current_price)) | ||||
|  | ||||
|         if action == Actions.Short_buy.value and self._position == Positions.Short: | ||||
|             if self.close_trade_profit[-1] > self.profit_aim * self.rr: | ||||
|                 last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|                 current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) | ||||
|                 return float((np.log(last_trade_price) - np.log(current_price)) * 2) | ||||
|  | ||||
|         return 0. | ||||