reuse callback, allow user to access all stable_baselines3 agents via config

2022-08-20 16:35:29 +02:00
parent 4b9499e321
commit 3eb897c2f8
11 changed files with 295 additions and 587 deletions
@@ -0,0 +1,82 @@
+import logging
+from typing import Any, Dict  # , Tuple
+
+# import numpy.typing as npt
+import torch as th
+import numpy as np
+from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
+from freqtrade.freqai.RL.Base5ActionRLEnv import Actions, Base5ActionRLEnv, Positions
+from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+
+class ReinforcementLearner(BaseReinforcementLearningModel):
+    """
+    Reinforcement learning prediction model. The agent class and the policy are
+    read from `self.MODELCLASS` and `self.policy_type`.
+    """
+
+    def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen):
+        """
+        Train an agent on `self.train_env` and return the model to predict with.
+
+        :param data_dictionary: dictionary holding the "train_features" dataframe
+        :param dk: data kitchen; tensorboard logs go below `dk.data_path` and
+            "best_model.zip" is looked up in the same folder after training
+        :return: the model loaded from "best_model.zip" when that file exists,
+            otherwise the model in its state at the end of training
+        """
+        features = data_dictionary["train_features"]
+        # one "train cycle" is one pass over the training dataframe
+        n_steps = self.freqai_info["rl_config"]["train_cycles"] * len(features)
+
+        # network settings handed to the policy
+        net_settings = {"activation_fn": th.nn.ReLU, "net_arch": [256, 256, 128]}
+
+        agent = self.MODELCLASS(
+            self.policy_type,
+            self.train_env,
+            policy_kwargs=net_settings,
+            tensorboard_log=Path(dk.data_path / "tensorboard"),
+            **self.freqai_info['model_training_parameters'],
+        )
+        agent.learn(total_timesteps=int(n_steps), callback=self.eval_callback)
+
+        best_model_file = Path(dk.data_path / "best_model.zip")
+        if not best_model_file.is_file():
+            logger.info('Couldnt find best model, using final model instead.')
+            return agent
+
+        logger.info('Callback found a best model.')
+        return self.MODELCLASS.load(dk.data_path / "best_model")
+
+
+class MyRLEnv(Base5ActionRLEnv):
+    """
+    User can modify any part of the environment by overriding base
+    functions. This example overrides the reward function only.
+    """
+    def calculate_reward(self, action):
+        """
+        Reward closing a position with the fee-adjusted log return of the trade.
+
+        The reward is doubled when the latest entry of `self.close_trade_profit`
+        exceeds `self.profit_aim * self.rr`. Every other action/position
+        combination, and every step before a trade was opened, yields 0.
+
+        :param action: action value, compared against `Actions.<name>.value`
+        :return: float reward
+        """
+        if self._last_trade_tick is None:
+            return 0.
+
+        closing_long = (action == Actions.Long_exit.value
+                        and self._position == Positions.Long)
+        closing_short = (action == Actions.Short_exit.value
+                         and self._position == Positions.Short)
+        if not (closing_long or closing_short):
+            return 0.
+
+        entry_open = self.prices.iloc[self._last_trade_tick].open
+        exit_open = self.prices.iloc[self._current_tick].open
+
+        if closing_long:
+            # bought at entry, sold now
+            last_trade_price = self.add_buy_fee(entry_open)
+            current_price = self.add_sell_fee(exit_open)
+            reward = float(np.log(current_price) - np.log(last_trade_price))
+        else:
+            # sold at entry, bought back now
+            last_trade_price = self.add_sell_fee(entry_open)
+            current_price = self.add_buy_fee(exit_open)
+            reward = float(np.log(last_trade_price) - np.log(current_price))
+
+        # The bonus used to sit behind a second, identical `if` that could never be
+        # reached because the first one always returned.
+        # NOTE(review): confirm that close_trade_profit[-1] already refers to the
+        # trade being closed when this method is called.
+        if len(self.close_trade_profit) > 0 and \
+                self.close_trade_profit[-1] > self.profit_aim * self.rr:
+            reward *= 2
+
+        return reward
@@ -0,0 +1,255 @@
+import logging
+import torch as th
+from typing import Any, Dict, List, Optional, Tuple, Type, Union
+from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel
+from stable_baselines3 import DQN
+from stable_baselines3.common.buffers import ReplayBuffer
+from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
+from pathlib import Path
+from stable_baselines3.dqn.policies import (CnnPolicy, DQNPolicy, MlpPolicy,
+                                            QNetwork)
+from torch import nn
+import gym
+from stable_baselines3.common.torch_layers import (BaseFeaturesExtractor,
+                                                   FlattenExtractor)
+from stable_baselines3.common.type_aliases import GymEnv, Schedule
+from stable_baselines3.common.policies import BasePolicy
+
+logger = logging.getLogger(__name__)
+
+
+class ReinforcementLearnerCustomAgent(BaseReinforcementLearningModel):
+    """
+    User can customize agent by defining the class and using it directly.
+    Here the example is "TDQN"
+    """
+
+    def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen):
+        """
+        Train a `TDQN` agent on `self.train_env` and return the model to predict with.
+
+        :param data_dictionary: dictionary holding the "train_features" dataframe
+        :param dk: data kitchen; tensorboard logs go below `dk.data_path` and
+            "best_model.zip" is looked up in the same folder after training
+        :return: the model loaded from "best_model.zip" when that file exists,
+            otherwise the model in its state at the end of training
+        """
+
+        train_df = data_dictionary["train_features"]
+        total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df)
+
+        policy_kwargs = dict(activation_fn=th.nn.ReLU,
+                             net_arch=[256, 256, 128])
+
+        # TDQN is a custom agent defined below
+        model = TDQN(self.policy_type, self.train_env,
+                     tensorboard_log=Path(dk.data_path / "tensorboard"),
+                     policy_kwargs=policy_kwargs,
+                     **self.freqai_info['model_training_parameters']
+                     )
+
+        model.learn(
+            total_timesteps=int(total_timesteps),
+            callback=self.eval_callback
+        )
+
+        if Path(dk.data_path / "best_model.zip").is_file():
+            logger.info('Callback found a best model.')
+            # reload with the class that trained the model, not with whatever
+            # agent class the config selected for self.MODELCLASS
+            best_model = TDQN.load(dk.data_path / "best_model")
+            return best_model
+
+        logger.info('Couldnt find best model, using final model instead.')
+
+        return model
+
+# User creates their custom agent and networks as shown below
+
+
+def create_mlp_(
+    input_dim: int,
+    output_dim: int,
+    net_arch: List[int],
+    activation_fn: Type[nn.Module] = nn.ReLU,
+    squash_output: bool = False,
+) -> List[nn.Module]:
+    """
+    Build the layer list of a fixed-depth MLP.
+
+    The network consists of four groups of Linear -> BatchNorm1d -> LeakyReLU ->
+    Dropout(0.2), all of width `net_arch[0]`, followed by a Linear output layer.
+    Only the first entry of `net_arch` is used. With an empty `net_arch` a single
+    Linear(input_dim, output_dim) layer is returned; this used to fail with an
+    unbound local variable.
+
+    :param input_dim: number of input features
+    :param output_dim: number of output features
+    :param net_arch: layer widths; only the first entry is read
+    :param activation_fn: accepted for signature compatibility, not used
+        (the activation is always LeakyReLU)
+    :param squash_output: accepted for signature compatibility, not used
+    :return: list of modules, ready to be wrapped in `nn.Sequential`
+    """
+    dropout = 0.2
+    hidden_groups = 4
+
+    if len(net_arch) == 0:
+        # no hidden width given: map the input straight to the output
+        return [nn.Linear(input_dim, output_dim)]
+
+    number_of_neural = net_arch[0]
+
+    modules: List[nn.Module] = []
+    in_features = input_dim
+    for _ in range(hidden_groups):
+        modules.extend([
+            nn.Linear(in_features, number_of_neural),
+            nn.BatchNorm1d(number_of_neural),
+            nn.LeakyReLU(),
+            nn.Dropout(dropout),
+        ])
+        in_features = number_of_neural
+    modules.append(nn.Linear(number_of_neural, output_dim))
+    return modules
+
+
+class TDQNetwork(QNetwork):
+    """
+    Q-network whose `q_net` is replaced by the layers from `create_mlp_`.
+    """
+
+    def __init__(self,
+                 observation_space: gym.spaces.Space,
+                 action_space: gym.spaces.Space,
+                 features_extractor: nn.Module,
+                 features_dim: int,
+                 net_arch: Optional[List[int]] = None,
+                 activation_fn: Type[nn.Module] = nn.ReLU,
+                 normalize_images: bool = True
+                 ):
+        """
+        Run the `QNetwork` constructor with the given arguments, then overwrite
+        `self.q_net` with a sequential model built by `create_mlp_`.
+        """
+        super().__init__(
+            observation_space=observation_space,
+            action_space=action_space,
+            features_extractor=features_extractor,
+            features_dim=features_dim,
+            net_arch=net_arch,
+            activation_fn=activation_fn,
+            normalize_images=normalize_images
+        )
+        action_dim = self.action_space.n
+        q_net = create_mlp_(self.features_dim, action_dim, self.net_arch, self.activation_fn)
+        # apply() visits every submodule, init_weights only touches the linear ones
+        self.q_net = nn.Sequential(*q_net).apply(self.init_weights)
+
+    def init_weights(self, m):
+        """
+        Kaiming-uniform initialization of the weight of linear layers; other
+        modules are left untouched.
+
+        :param m: module visited by `nn.Module.apply`
+        """
+        if isinstance(m, nn.Linear):
+            th.nn.init.kaiming_uniform_(m.weight)
+
+
+class TDQNPolicy(DQNPolicy):
+    """
+    DQN policy whose Q-networks are created as `TDQNetwork` instances.
+    """
+
+    def __init__(
+        self,
+        observation_space: gym.spaces.Space,
+        action_space: gym.spaces.Space,
+        lr_schedule: Schedule,
+        net_arch: Optional[List[int]] = None,
+        activation_fn: Type[nn.Module] = nn.ReLU,
+        features_extractor_class: Type[BaseFeaturesExtractor] = FlattenExtractor,
+        features_extractor_kwargs: Optional[Dict[str, Any]] = None,
+        normalize_images: bool = True,
+        optimizer_class: Type[th.optim.Optimizer] = th.optim.Adam,
+        optimizer_kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        """Pass every argument through, unchanged, to `DQNPolicy.__init__`."""
+        forwarded = dict(
+            observation_space=observation_space,
+            action_space=action_space,
+            lr_schedule=lr_schedule,
+            net_arch=net_arch,
+            activation_fn=activation_fn,
+            features_extractor_class=features_extractor_class,
+            features_extractor_kwargs=features_extractor_kwargs,
+            normalize_images=normalize_images,
+            optimizer_class=optimizer_class,
+            optimizer_kwargs=optimizer_kwargs,
+        )
+        super().__init__(**forwarded)
+
+    @staticmethod
+    def init_weights(module: nn.Module, gain: float = 1) -> None:
+        """
+        Kaiming-uniform initialization of the weight of linear and 2D convolution
+        layers; their bias, when present, is set to zero. `gain` is not used.
+        """
+        if not isinstance(module, (nn.Linear, nn.Conv2d)):
+            return
+        nn.init.kaiming_uniform_(module.weight)
+        bias = module.bias
+        if bias is not None:
+            bias.data.fill_(0.0)
+
+    def make_q_net(self) -> TDQNetwork:
+        """Create a `TDQNetwork` from `self.net_args` and move it to `self.device`."""
+        # Make sure we always have separate networks for features extractors etc
+        q_net_kwargs = self._update_features_extractor(self.net_args, features_extractor=None)
+        network = TDQNetwork(**q_net_kwargs)
+        return network.to(self.device)
+
+
+class TMultiInputPolicy(TDQNPolicy):
+    """
+    `TDQNPolicy` under a separate name; the constructor takes the same arguments
+    and hands them to the parent class unchanged.
+    """
+
+    def __init__(
+        self,
+        observation_space: gym.spaces.Space,
+        action_space: gym.spaces.Space,
+        lr_schedule: Schedule,
+        net_arch: Optional[List[int]] = None,
+        activation_fn: Type[nn.Module] = nn.ReLU,
+        features_extractor_class: Type[BaseFeaturesExtractor] = FlattenExtractor,
+        features_extractor_kwargs: Optional[Dict[str, Any]] = None,
+        normalize_images: bool = True,
+        optimizer_class: Type[th.optim.Optimizer] = th.optim.Adam,
+        optimizer_kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        """Forward all arguments to `TDQNPolicy.__init__`."""
+        super().__init__(
+            observation_space=observation_space,
+            action_space=action_space,
+            lr_schedule=lr_schedule,
+            net_arch=net_arch,
+            activation_fn=activation_fn,
+            features_extractor_class=features_extractor_class,
+            features_extractor_kwargs=features_extractor_kwargs,
+            normalize_images=normalize_images,
+            optimizer_class=optimizer_class,
+            optimizer_kwargs=optimizer_kwargs,
+        )
+
+
+class TDQN(DQN):
+    """
+    DQN agent with its own policy alias table: "MlpPolicy" and "CnnPolicy" map to
+    the stable_baselines3 DQN policies, "TMultiInputPolicy" maps to the custom
+    `TMultiInputPolicy`.
+    """
+
+    policy_aliases: Dict[str, Type[BasePolicy]] = {
+        "MlpPolicy": MlpPolicy,
+        "CnnPolicy": CnnPolicy,
+        "TMultiInputPolicy": TMultiInputPolicy,
+    }
+
+    def __init__(
+        self,
+        policy: Union[str, Type[TDQNPolicy]],
+        env: Union[GymEnv, str],
+        learning_rate: Union[float, Schedule] = 1e-4,
+        buffer_size: int = 1000000,  # 1e6
+        learning_starts: int = 50000,
+        batch_size: int = 32,
+        tau: float = 1.0,
+        gamma: float = 0.99,
+        train_freq: Union[int, Tuple[int, str]] = 4,
+        gradient_steps: int = 1,
+        replay_buffer_class: Optional[ReplayBuffer] = None,
+        replay_buffer_kwargs: Optional[Dict[str, Any]] = None,
+        optimize_memory_usage: bool = False,
+        target_update_interval: int = 10000,
+        exploration_fraction: float = 0.1,
+        exploration_initial_eps: float = 1.0,
+        exploration_final_eps: float = 0.05,
+        max_grad_norm: float = 10,
+        tensorboard_log: Optional[Path] = None,
+        create_eval_env: bool = False,
+        policy_kwargs: Optional[Dict[str, Any]] = None,
+        verbose: int = 1,
+        seed: Optional[int] = None,
+        device: Union[th.device, str] = "auto",
+        _init_setup_model: bool = True,
+    ):
+        """Hand every argument, unchanged, to the `DQN` constructor."""
+        dqn_arguments = dict(
+            policy=policy,
+            env=env,
+            learning_rate=learning_rate,
+            buffer_size=buffer_size,
+            learning_starts=learning_starts,
+            batch_size=batch_size,
+            tau=tau,
+            gamma=gamma,
+            train_freq=train_freq,
+            gradient_steps=gradient_steps,
+            replay_buffer_class=replay_buffer_class,
+            replay_buffer_kwargs=replay_buffer_kwargs,
+            optimize_memory_usage=optimize_memory_usage,
+            target_update_interval=target_update_interval,
+            exploration_fraction=exploration_fraction,
+            exploration_initial_eps=exploration_initial_eps,
+            exploration_final_eps=exploration_final_eps,
+            max_grad_norm=max_grad_norm,
+            tensorboard_log=tensorboard_log,
+            create_eval_env=create_eval_env,
+            policy_kwargs=policy_kwargs,
+            verbose=verbose,
+            seed=seed,
+            device=device,
+            _init_setup_model=_init_setup_model,
+        )
+        super().__init__(**dqn_arguments)
@@ -0,0 +1,84 @@
+import logging
+from typing import Any, Dict  # , Tuple
+
+# import numpy.typing as npt
+import torch as th
+from stable_baselines3.common.callbacks import EvalCallback
+from stable_baselines3.common.vec_env import SubprocVecEnv
+from freqtrade.freqai.RL.BaseReinforcementLearningModel import (BaseReinforcementLearningModel,
+                                                                make_env)
+from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
+
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+
+class ReinforcementLearner_multiproc(BaseReinforcementLearningModel):
+    """
+    Reinforcement learning prediction model that trains and evaluates on several
+    environment copies running in subprocesses (`SubprocVecEnv`).
+    """
+
+    def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen):
+        """
+        Train an agent on `self.train_env` and return the model to predict with.
+
+        :param data_dictionary: dictionary holding the "train_features" dataframe
+        :param dk: data kitchen; tensorboard logs go below `dk.data_path` and
+            "best_model.zip" is looked up in the same folder after training
+        :return: the model loaded from "best_model.zip" when that file exists,
+            otherwise the model in its state at the end of training
+        """
+
+        train_df = data_dictionary["train_features"]
+        total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df)
+
+        # model arch
+        policy_kwargs = dict(activation_fn=th.nn.ReLU,
+                             net_arch=[512, 512, 512])
+
+        model = self.MODELCLASS(self.policy_type, self.train_env, policy_kwargs=policy_kwargs,
+                                tensorboard_log=Path(dk.data_path / "tensorboard"),
+                                **self.freqai_info['model_training_parameters']
+                                )
+
+        model.learn(
+            total_timesteps=int(total_timesteps),
+            callback=self.eval_callback
+        )
+
+        if Path(dk.data_path / "best_model.zip").is_file():
+            logger.info('Callback found a best model.')
+            best_model = self.MODELCLASS.load(dk.data_path / "best_model")
+            return best_model
+
+        logger.info('Couldnt find best model, using final model instead.')
+
+        return model
+
+    def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test, dk):
+        """
+        If user has particular environment configuration needs, they can do that by
+        overriding this function. In the present case, the user wants to setup training
+        environments for multiple workers.
+
+        On the first call the vectorized train/eval environments and the eval
+        callback are created; on later calls the existing environments are reset
+        with the new data and the callback is re-initialized.
+
+        :param data_dictionary: dictionary holding "train_features" and "test_features"
+        :param prices_train: prices belonging to the training features
+        :param prices_test: prices belonging to the test features
+        :param dk: data kitchen; `dk.data_path` is where the callback saves the best model
+        """
+        train_df = data_dictionary["train_features"]
+        test_df = data_dictionary["test_features"]
+        eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df)
+
+        # environments
+        if not self.train_env:
+            # half of the configured threads, but never fewer than one worker
+            # (a thread count of 1 used to produce an empty environment list)
+            num_cpu = max(1, int(self.freqai_info["data_kitchen_thread_count"] / 2))
+
+            env_id = "train_env"
+            self.train_env = SubprocVecEnv([
+                make_env(env_id, i, 1, train_df, prices_train,
+                         self.reward_params, self.CONV_WIDTH)
+                for i in range(num_cpu)
+            ])
+
+            eval_env_id = 'eval_env'
+            self.eval_env = SubprocVecEnv([
+                make_env(eval_env_id, i, 1, test_df, prices_test,
+                         self.reward_params, self.CONV_WIDTH, monitor=True)
+                for i in range(num_cpu)
+            ])
+            self.eval_callback = EvalCallback(self.eval_env, deterministic=True,
+                                              render=False, eval_freq=eval_freq,
+                                              best_model_save_path=dk.data_path)
+        else:
+            self.train_env.env_method('reset')
+            self.eval_env.env_method('reset')
+            self.train_env.env_method('reset_env', train_df, prices_train,
+                                      self.CONV_WIDTH, self.reward_params)
+            # the eval environments must keep evaluating on the held-out test data,
+            # exactly as when they were first created
+            self.eval_env.env_method('reset_env', test_df, prices_test,
+                                     self.CONV_WIDTH, self.reward_params)
+            # re-run the constructor on the existing callback object so that it
+            # picks up the new eval frequency and save path
+            self.eval_callback.__init__(self.eval_env, deterministic=True,
+                                        render=False, eval_freq=eval_freq,
+                                        best_model_save_path=dk.data_path)
@@ -1,104 +0,0 @@
-import gc
-import logging
-from typing import Any, Dict  # , Tuple
-
-import numpy as np
-# import numpy.typing as npt
-import torch as th
-from stable_baselines3 import PPO
-from stable_baselines3.common.callbacks import EvalCallback
-from stable_baselines3.common.monitor import Monitor
-
-from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
-from freqtrade.freqai.RL.Base3ActionRLEnv import Actions, Base3ActionRLEnv, Positions
-from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel
-
-
-logger = logging.getLogger(__name__)
-
-
-class ReinforcementLearningPPO(BaseReinforcementLearningModel):
-    """
-    User created Reinforcement Learning Model prediction model.
-    """
-
-    def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen):
-
-        train_df = data_dictionary["train_features"]
-        test_df = data_dictionary["test_features"]
-        eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df)
-        total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df)
-
-        path = dk.data_path
-        eval_callback = EvalCallback(self.eval_env, best_model_save_path=f"{path}/",
-                                     log_path=f"{path}/ppo/logs/", eval_freq=int(eval_freq),
-                                     deterministic=True, render=False)
-
-        # model arch
-        policy_kwargs = dict(activation_fn=th.nn.ReLU,
-                             net_arch=[256, 256, 128])
-
-        model = PPO('MlpPolicy', self.train_env, policy_kwargs=policy_kwargs,
-                    tensorboard_log=f"{path}/ppo/tensorboard/",
-                    **self.freqai_info['model_training_parameters']
-                    )
-
-        model.learn(
-            total_timesteps=int(total_timesteps),
-            callback=eval_callback
-        )
-
-        del model
-        best_model = PPO.load(dk.data_path / "best_model")
-
-        print('Training finished!')
-        gc.collect()
-
-        return best_model
-
-    def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test):
-        """
-        User overrides this as shown here if they are using a custom MyRLEnv
-        """
-        train_df = data_dictionary["train_features"]
-        test_df = data_dictionary["test_features"]
-
-        # environments
-        if not self.train_env:
-            self.train_env = MyRLEnv(df=train_df, prices=prices_train, window_size=self.CONV_WIDTH,
-                                     reward_kwargs=self.reward_params)
-            self.eval_env = Monitor(MyRLEnv(df=test_df, prices=prices_test,
-                                    window_size=self.CONV_WIDTH,
-                                    reward_kwargs=self.reward_params), ".")
-        else:
-            self.train_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params)
-            self.eval_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params)
-            self.train_env.reset()
-            self.eval_env.reset()
-
-
-class MyRLEnv(Base3ActionRLEnv):
-    """
-    User can override any function in BaseRLEnv and gym.Env
-    """
-
-    def calculate_reward(self, action):
-
-        if self._last_trade_tick is None:
-            return 0.
-
-        # close long
-        if (action == Actions.Short.value or
-                action == Actions.Neutral.value) and self._position == Positions.Long:
-            last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
-            current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
-            return float(np.log(current_price) - np.log(last_trade_price))
-
-        # close short
-        if (action == Actions.Long.value or
-                action == Actions.Neutral.value) and self._position == Positions.Short:
-            last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
-            current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
-            return float(np.log(last_trade_price) - np.log(current_price))
-
-        return 0.
@@ -1,132 +0,0 @@
-import logging
-from typing import Any, Dict  # , Tuple
-
-import numpy as np
-# import numpy.typing as npt
-import torch as th
-from stable_baselines3.common.monitor import Monitor
-from typing import Callable
-from stable_baselines3 import PPO
-from stable_baselines3.common.callbacks import EvalCallback
-from stable_baselines3.common.vec_env import SubprocVecEnv
-from stable_baselines3.common.utils import set_random_seed
-from freqtrade.freqai.RL.Base3ActionRLEnv import Base3ActionRLEnv, Actions, Positions
-from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel
-from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
-import gym
-
-logger = logging.getLogger(__name__)
-
-
-def make_env(env_id: str, rank: int, seed: int, train_df, price,
-             reward_params, window_size, monitor=False) -> Callable:
-    """
-    Utility function for multiprocessed env.
-
-    :param env_id: (str) the environment ID
-    :param num_env: (int) the number of environment you wish to have in subprocesses
-    :param seed: (int) the inital seed for RNG
-    :param rank: (int) index of the subprocess
-    :return: (Callable)
-    """
-    def _init() -> gym.Env:
-
-        env = MyRLEnv(df=train_df, prices=price, window_size=window_size,
-                      reward_kwargs=reward_params, id=env_id, seed=seed + rank)
-        if monitor:
-            env = Monitor(env, ".")
-        return env
-    set_random_seed(seed)
-    return _init
-
-
-class ReinforcementLearningPPO_multiproc(BaseReinforcementLearningModel):
-    """
-    User created Reinforcement Learning Model prediction model.
-    """
-
-    def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen):
-
-        train_df = data_dictionary["train_features"]
-        test_df = data_dictionary["test_features"]
-        eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df)
-        total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df)
-
-        path = dk.data_path
-        eval_callback = EvalCallback(self.eval_env, best_model_save_path=f"{path}/",
-                                     log_path=f"{path}/ppo/logs/", eval_freq=int(eval_freq),
-                                     deterministic=True, render=False)
-
-        # model arch
-        policy_kwargs = dict(activation_fn=th.nn.ReLU,
-                             net_arch=[512, 512, 512])
-
-        model = PPO('MlpPolicy', self.train_env, policy_kwargs=policy_kwargs,
-                    tensorboard_log=f"{path}/ppo/tensorboard/",
-                    **self.freqai_info['model_training_parameters']
-                    )
-
-        model.learn(
-            total_timesteps=int(total_timesteps),
-            callback=eval_callback
-        )
-
-        best_model = PPO.load(dk.data_path / "best_model")
-        print('Training finished!')
-
-        return best_model
-
-    def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test):
-        """
-        User overrides this in their prediction model if they are custom a MyRLEnv. Othwerwise
-        leaving this will default to Base5ActEnv
-        """
-        train_df = data_dictionary["train_features"]
-        test_df = data_dictionary["test_features"]
-
-        # environments
-        if not self.train_env:
-            env_id = "train_env"
-            num_cpu = int(self.freqai_info["data_kitchen_thread_count"] / 2)
-            self.train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train,
-                                            self.reward_params, self.CONV_WIDTH) for i
-                                            in range(num_cpu)])
-
-            eval_env_id = 'eval_env'
-            self.eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test,
-                                           self.reward_params, self.CONV_WIDTH, monitor=True) for i
-                                           in range(num_cpu)])
-        else:
-            self.train_env.env_method('reset_env', train_df, prices_train,
-                                      self.CONV_WIDTH, self.reward_params)
-            self.eval_env.env_method('reset_env', train_df, prices_train,
-                                     self.CONV_WIDTH, self.reward_params)
-            self.train_env.env_method('reset')
-            self.eval_env.env_method('reset')
-
-
-class MyRLEnv(Base3ActionRLEnv):
-    """
-    User can override any function in BaseRLEnv and gym.Env
-    """
-
-    def calculate_reward(self, action):
-
-        if self._last_trade_tick is None:
-            return 0.
-
-        # close long
-        if (action == Actions.Short.value or
-                action == Actions.Neutral.value) and self._position == Positions.Long:
-            last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
-            current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
-            return float(np.log(current_price) - np.log(last_trade_price))
-
-        # close short
-        if (action == Actions.Long.value or
-                action == Actions.Neutral.value) and self._position == Positions.Short:
-            last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
-            current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
-            return float(np.log(last_trade_price) - np.log(current_price))
-
-        return 0.
@@ -1,115 +0,0 @@
-import logging
-from typing import Any, Dict  # Optional
-import torch as th
-from stable_baselines3.common.callbacks import EvalCallback
-from stable_baselines3.common.monitor import Monitor
-from freqtrade.freqai.RL.Base5ActionRLEnv import Base5ActionRLEnv, Actions, Positions
-from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel
-from freqtrade.freqai.RL.TDQNagent import TDQN
-from stable_baselines3 import DQN
-from stable_baselines3.common.buffers import ReplayBuffer
-import numpy as np
-import gc
-from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
-
-logger = logging.getLogger(__name__)
-
-
-class ReinforcementLearningTDQN(BaseReinforcementLearningModel):
-    """
-    User created Reinforcement Learning Model prediction model.
-    """
-
-    def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen):
-
-        train_df = data_dictionary["train_features"]
-        test_df = data_dictionary["test_features"]
-        eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df)
-        total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df)
-
-        path = dk.data_path
-        eval_callback = EvalCallback(self.eval_env, best_model_save_path=f"{path}/",
-                                     log_path=f"{path}/tdqn/logs/", eval_freq=int(eval_freq),
-                                     deterministic=True, render=False)
-
-        # model arch
-        policy_kwargs = dict(activation_fn=th.nn.ReLU,
-                             net_arch=[256, 256, 128])
-
-        model = TDQN('TMultiInputPolicy', self.train_env,
-                     tensorboard_log=f"{path}/tdqn/tensorboard/",
-                     policy_kwargs=policy_kwargs,
-                     replay_buffer_class=ReplayBuffer,
-                     **self.freqai_info['model_training_parameters']
-                     )
-
-        model.learn(
-            total_timesteps=int(total_timesteps),
-            callback=eval_callback
-        )
-
-        del model
-        best_model = DQN.load(dk.data_path / "best_model")
-
-        print('Training finished!')
-        gc.collect()
-        return best_model
-
-    def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test):
-        """
-        User overrides this as shown here if they are using a custom MyRLEnv
-        """
-        train_df = data_dictionary["train_features"]
-        test_df = data_dictionary["test_features"]
-
-        # environments
-        if not self.train_env:
-            self.train_env = MyRLEnv(df=train_df, prices=prices_train, window_size=self.CONV_WIDTH,
-                                     reward_kwargs=self.reward_params)
-            self.eval_env = Monitor(MyRLEnv(df=test_df, prices=prices_test,
-                                    window_size=self.CONV_WIDTH,
-                                    reward_kwargs=self.reward_params), ".")
-        else:
-            self.train_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params)
-            self.eval_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params)
-            self.train_env.reset()
-            self.eval_env.reset()
-
-
-# User can inherit and customize 5 action environment
-class MyRLEnv(Base5ActionRLEnv):
-    """
-    User can override any function in BaseRLEnv and gym.Env. Here the user
-    Adds 5 actions.
-    """
-
-    def calculate_reward(self, action):
-
-        if self._last_trade_tick is None:
-            return 0.
-
-        # close long
-        if action == Actions.Long_sell.value and self._position == Positions.Long:
-            last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
-            current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
-            return float(np.log(current_price) - np.log(last_trade_price))
-
-        if action == Actions.Long_sell.value and self._position == Positions.Long:
-            if self.close_trade_profit[-1] > self.profit_aim * self.rr:
-                last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
-                current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
-                return float((np.log(current_price) - np.log(last_trade_price)) * 2)
-
-        # close short
-        if action == Actions.Short_buy.value and self._position == Positions.Short:
-            last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
-            current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
-            return float(np.log(last_trade_price) - np.log(current_price))
-
-        if action == Actions.Short_buy.value and self._position == Positions.Short:
-            if self.close_trade_profit[-1] > self.profit_aim * self.rr:
-                last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
-                current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
-                return float((np.log(last_trade_price) - np.log(current_price)) * 2)
-
-        return 0.
@@ -1,148 +0,0 @@
-import logging
-from typing import Any, Dict  # Optional
-import torch as th
-import numpy as np
-import gym
-from typing import Callable
-from stable_baselines3.common.callbacks import EvalCallback
-# EvalCallback , StopTrainingOnNoModelImprovement, StopTrainingOnRewardThreshold
-from stable_baselines3.common.monitor import Monitor
-from stable_baselines3.common.vec_env import SubprocVecEnv
-from stable_baselines3.common.utils import set_random_seed
-from stable_baselines3 import DQN
-from freqtrade.freqai.RL.Base5ActionRLEnv import Base5ActionRLEnv, Actions, Positions
-from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel
-from freqtrade.freqai.RL.TDQNagent import TDQN
-from stable_baselines3.common.buffers import ReplayBuffer
-from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
-
-logger = logging.getLogger(__name__)
-
-
-def make_env(env_id: str, rank: int, seed: int, train_df, price,
-             reward_params, window_size, monitor=False) -> Callable:
-    """
-    Utility function for multiprocessed env.
-
-    :param env_id: (str) the environment ID
-    :param num_env: (int) the number of environment you wish to have in subprocesses
-    :param seed: (int) the inital seed for RNG
-    :param rank: (int) index of the subprocess
-    :return: (Callable)
-    """
-    def _init() -> gym.Env:
-
-        env = MyRLEnv(df=train_df, prices=price, window_size=window_size,
-                      reward_kwargs=reward_params, id=env_id, seed=seed + rank)
-        if monitor:
-            env = Monitor(env, ".")
-        return env
-    set_random_seed(seed)
-    return _init
-
-
-class ReinforcementLearningTDQN_multiproc(BaseReinforcementLearningModel):
-    """
-    User created Reinforcement Learning Model prediction model.
-    """
-
-    def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen):
-
-        train_df = data_dictionary["train_features"]
-        test_df = data_dictionary["test_features"]
-        eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df)
-        total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df)
-
-        path = dk.data_path
-
-        eval_callback = EvalCallback(self.eval_env, best_model_save_path=f"{path}/",
-                                     log_path=f"{path}/tdqn/logs/", eval_freq=int(eval_freq),
-                                     deterministic=True, render=False)
-        # model arch
-        policy_kwargs = dict(activation_fn=th.nn.ReLU,
-                             net_arch=[512, 512, 512])
-
-        model = TDQN('TMultiInputPolicy', self.train_env,
-                     policy_kwargs=policy_kwargs,
-                     tensorboard_log=f"{path}/tdqn/tensorboard/",
-                     replay_buffer_class=ReplayBuffer,
-                     **self.freqai_info['model_training_parameters']
-                     )
-
-        model.learn(
-            total_timesteps=int(total_timesteps),
-            callback=eval_callback
-        )
-
-        best_model = DQN.load(dk.data_path / "best_model.zip")
-        print('Training finished!')
-
-        return best_model
-
-    def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test):
-        """
-        User overrides this in their prediction model if they are custom a MyRLEnv. Othwerwise
-        leaving this will default to Base5ActEnv
-        """
-        train_df = data_dictionary["train_features"]
-        test_df = data_dictionary["test_features"]
-
-        # environments
-        if not self.train_env:
-            env_id = "train_env"
-            num_cpu = int(self.freqai_info["data_kitchen_thread_count"] / 2)
-            self.train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train,
-                                            self.reward_params, self.CONV_WIDTH) for i
-                                            in range(num_cpu)])
-
-            eval_env_id = 'eval_env'
-            self.eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test,
-                                           self.reward_params, self.CONV_WIDTH, monitor=True) for i
-                                           in range(num_cpu)])
-        else:
-            self.train_env.env_method('reset_env', train_df, prices_train,
-                                      self.CONV_WIDTH, self.reward_params)
-            self.eval_env.env_method('reset_env', train_df, prices_train,
-                                     self.CONV_WIDTH, self.reward_params)
-            self.train_env.env_method('reset')
-            self.eval_env.env_method('reset')
-
-# User can inherit and customize 5 action environment
-
-
-class MyRLEnv(Base5ActionRLEnv):
-    """
-    User can override any function in BaseRLEnv and gym.Env. Here the user
-    Adds 5 actions.
-    """
-
-    def calculate_reward(self, action):
-
-        if self._last_trade_tick is None:
-            return 0.
-
-        # close long
-        if action == Actions.Long_sell.value and self._position == Positions.Long:
-            last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
-            current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
-            return float(np.log(current_price) - np.log(last_trade_price))
-
-        if action == Actions.Long_sell.value and self._position == Positions.Long:
-            if self.close_trade_profit[-1] > self.profit_aim * self.rr:
-                last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
-                current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
-                return float((np.log(current_price) - np.log(last_trade_price)) * 2)
-
-        # close short
-        if action == Actions.Short_buy.value and self._position == Positions.Short:
-            last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
-            current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
-            return float(np.log(last_trade_price) - np.log(current_price))
-
-        if action == Actions.Short_buy.value and self._position == Positions.Short:
-            if self.close_trade_profit[-1] > self.profit_aim * self.rr:
-                last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
-                current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
-                return float((np.log(last_trade_price) - np.log(current_price)) * 2)
-
-        return 0.