diff --git a/config_examples/config_freqai-rl.example.json b/config_examples/config_freqai-rl.example.json index 1af872552..fa08cdd60 100644 --- a/config_examples/config_freqai-rl.example.json +++ b/config_examples/config_freqai-rl.example.json @@ -55,7 +55,7 @@ ], "freqai": { "enabled": true, - "model_save_type": "stable_baselines_dqn", + "model_save_type": "stable_baselines", "conv_width": 10, "purge_old_models": true, "train_period_days": 10, @@ -85,8 +85,11 @@ "verbose": 1 }, "rl_config": { - "train_cycles": 15, - "eval_cycles": 5, + "train_cycles": 10, + "eval_cycles": 3, + "thread_count": 4, + "model_type": "PPO", + "policy_type": "MlpPolicy", "model_reward_parameters": { "rr": 1, "profit_aim": 0.02 diff --git a/freqtrade/freqai/RL/Base5ActionRLEnv.py b/freqtrade/freqai/RL/Base5ActionRLEnv.py index 4c946a5b2..7d3cbffbe 100644 --- a/freqtrade/freqai/RL/Base5ActionRLEnv.py +++ b/freqtrade/freqai/RL/Base5ActionRLEnv.py @@ -266,59 +266,28 @@ class Base5ActionRLEnv(gym.Env): # close long if action == Actions.Long_exit.value and self._position == Positions.Long: - if len(self.close_trade_profit): - # aim x2 rw - if self.close_trade_profit[-1] > self.profit_aim * self.rr: - last_trade_price = self.add_buy_fee( - self.prices.iloc[self._last_trade_tick].open) - current_price = self.add_sell_fee( - self.prices.iloc[self._current_tick].open) - return float((np.log(current_price) - np.log(last_trade_price)) * 2) - # less than aim x1 rw - elif self.close_trade_profit[-1] < self.profit_aim * self.rr: - last_trade_price = self.add_buy_fee( - self.prices.iloc[self._last_trade_tick].open - ) - current_price = self.add_sell_fee( - self.prices.iloc[self._current_tick].open - ) - return float(np.log(current_price) - np.log(last_trade_price)) - # # less than RR SL x2 neg rw - # elif self.close_trade_profit[-1] < (self.profit_aim * -1): - # last_trade_price = self.add_buy_fee( - # self.prices.iloc[self._last_trade_tick].open) - # current_price = self.add_sell_fee( - # self.prices.iloc[self._current_tick].open) - # return float((np.log(current_price) - np.log(last_trade_price)) * 2) * -1 + last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) + current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) + return float(np.log(current_price) - np.log(last_trade_price)) + + if action == Actions.Long_exit.value and self._position == Positions.Long: + if self.close_trade_profit[-1] > self.profit_aim * self.rr: + last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) + current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) + return float((np.log(current_price) - np.log(last_trade_price)) * 2) # close short if action == Actions.Short_exit.value and self._position == Positions.Short: - if len(self.close_trade_profit): - # aim x2 rw - if self.close_trade_profit[-1] > self.profit_aim * self.rr: - last_trade_price = self.add_sell_fee( - self.prices.iloc[self._last_trade_tick].open - ) - current_price = self.add_buy_fee( - self.prices.iloc[self._current_tick].open - ) - return float((np.log(last_trade_price) - np.log(current_price)) * 2) - # less than aim x1 rw - elif self.close_trade_profit[-1] < self.profit_aim * self.rr: - last_trade_price = self.add_sell_fee( - self.prices.iloc[self._last_trade_tick].open - ) - current_price = self.add_buy_fee( - self.prices.iloc[self._current_tick].open - ) - return float(np.log(last_trade_price) - np.log(current_price)) - # # less than RR SL x2 neg rw - # elif 
self.close_trade_profit[-1] > self.profit_aim * self.rr: - # last_trade_price = self.add_sell_fee( - # self.prices.iloc[self._last_trade_tick].open) - # current_price = self.add_buy_fee( - # self.prices.iloc[self._current_tick].open) - # return float((np.log(last_trade_price) - np.log(current_price)) * 2) * -1 + last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) + current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) + return float(np.log(last_trade_price) - np.log(current_price)) + + if action == Actions.Short_exit.value and self._position == Positions.Short: + if self.close_trade_profit[-1] > self.profit_aim * self.rr: + last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) + current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) + return float((np.log(last_trade_price) - np.log(current_price)) * 2) + return 0. def _update_profit(self, action): diff --git a/freqtrade/freqai/RL/BaseReinforcementLearningModel.py b/freqtrade/freqai/RL/BaseReinforcementLearningModel.py index 9c7b1e4b4..9cada2bf0 100644 --- a/freqtrade/freqai/RL/BaseReinforcementLearningModel.py +++ b/freqtrade/freqai/RL/BaseReinforcementLearningModel.py @@ -11,8 +11,12 @@ from freqtrade.freqai.freqai_interface import IFreqaiModel from freqtrade.freqai.RL.Base5ActionRLEnv import Base5ActionRLEnv, Actions, Positions from freqtrade.persistence import Trade import torch.multiprocessing +from stable_baselines3.common.callbacks import EvalCallback from stable_baselines3.common.monitor import Monitor import torch as th +from typing import Callable +from stable_baselines3.common.utils import set_random_seed +import gym logger = logging.getLogger(__name__) torch.multiprocessing.set_sharing_strategy('file_system') @@ -25,9 +29,15 @@ class BaseReinforcementLearningModel(IFreqaiModel): def __init__(self, **kwargs): super().__init__(config=kwargs['config']) - th.set_num_threads(self.freqai_info.get('data_kitchen_thread_count', 4)) + th.set_num_threads(self.freqai_info['rl_config'].get('thread_count', 4)) self.reward_params = self.freqai_info['rl_config']['model_reward_parameters'] self.train_env: Base5ActionRLEnv = None + self.eval_env: Base5ActionRLEnv = None + self.eval_callback: EvalCallback = None + mod = __import__('stable_baselines3', fromlist=[ + self.freqai_info['rl_config']['model_type']]) + self.MODELCLASS = getattr(mod, self.freqai_info['rl_config']['model_type']) + self.policy_type = self.freqai_info['rl_config']['policy_type'] def train( self, unfiltered_dataframe: DataFrame, pair: str, dk: FreqaiDataKitchen @@ -67,7 +77,7 @@ class BaseReinforcementLearningModel(IFreqaiModel): ) logger.info(f'Training model on {len(data_dictionary["train_features"])} data points') - self.set_train_and_eval_environments(data_dictionary, prices_train, prices_test) + self.set_train_and_eval_environments(data_dictionary, prices_train, prices_test, dk) model = self.fit_rl(data_dictionary, dk) @@ -75,13 +85,13 @@ class BaseReinforcementLearningModel(IFreqaiModel): return model - def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test): + def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test, dk): """ - User overrides this in their prediction model if they are custom a MyRLEnv. 
Othwerwise - leaving this will default to Base5ActEnv + User overrides this as shown here if they are using a custom MyRLEnv """ train_df = data_dictionary["train_features"] test_df = data_dictionary["test_features"] + eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df) # environments if not self.train_env: @@ -90,11 +100,17 @@ class BaseReinforcementLearningModel(IFreqaiModel): self.eval_env = Monitor(MyRLEnv(df=test_df, prices=prices_test, window_size=self.CONV_WIDTH, reward_kwargs=self.reward_params), ".") + self.eval_callback = EvalCallback(self.eval_env, deterministic=True, + render=False, eval_freq=eval_freq, + best_model_save_path=dk.data_path) else: - self.train_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params) - self.eval_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params) self.train_env.reset() self.eval_env.reset() + self.train_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params) + self.eval_env.reset_env(test_df, prices_test, self.CONV_WIDTH, self.reward_params) + self.eval_callback.__init__(self.eval_env, deterministic=True, + render=False, eval_freq=eval_freq, + best_model_save_path=dk.data_path) @abstractmethod def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen): @@ -206,16 +222,28 @@ class BaseReinforcementLearningModel(IFreqaiModel): # all the other existing fit() functions to include dk argument. For now we instantiate and # leave it. def fit(self, data_dictionary: Dict[str, Any], pair: str = '') -> Any: - """ - Most regressors use the same function names and arguments e.g. user - can drop in LGBMRegressor in place of CatBoostRegressor and all data - management will be properly handled by Freqai. - :param data_dictionary: Dict = the dictionary constructed by DataHandler to hold - all the training and test data/labels. - """ - return +def make_env(env_id: str, rank: int, seed: int, train_df, price, + reward_params, window_size, monitor=False) -> Callable: + """ + Utility function for multiprocessed env. + + :param env_id: (str) the environment ID + :param num_env: (int) the number of environment you wish to have in subprocesses + :param seed: (int) the inital seed for RNG + :param rank: (int) index of the subprocess + :return: (Callable) + """ + def _init() -> gym.Env: + + env = MyRLEnv(df=train_df, prices=price, window_size=window_size, + reward_kwargs=reward_params, id=env_id, seed=seed + rank) + if monitor: + env = Monitor(env, ".") + return env + set_random_seed(seed) + return _init class MyRLEnv(Base5ActionRLEnv): """ @@ -229,24 +257,24 @@ class MyRLEnv(Base5ActionRLEnv): return 0. 
# close long - if action == Actions.Long_sell.value and self._position == Positions.Long: + if action == Actions.Long_exit.value and self._position == Positions.Long: last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) return float(np.log(current_price) - np.log(last_trade_price)) - if action == Actions.Long_sell.value and self._position == Positions.Long: + if action == Actions.Long_exit.value and self._position == Positions.Long: if self.close_trade_profit[-1] > self.profit_aim * self.rr: last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) return float((np.log(current_price) - np.log(last_trade_price)) * 2) # close short - if action == Actions.Short_buy.value and self._position == Positions.Short: + if action == Actions.Short_exit.value and self._position == Positions.Short: last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) return float(np.log(last_trade_price) - np.log(current_price)) - if action == Actions.Short_buy.value and self._position == Positions.Short: + if action == Actions.Short_exit.value and self._position == Positions.Short: if self.close_trade_profit[-1] > self.profit_aim * self.rr: last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) diff --git a/freqtrade/freqai/data_drawer.py b/freqtrade/freqai/data_drawer.py index 9603fb9ab..c37973551 100644 --- a/freqtrade/freqai/data_drawer.py +++ b/freqtrade/freqai/data_drawer.py @@ -471,12 +471,11 @@ class FreqaiDataDrawer: elif model_type == 'keras': from tensorflow import keras model = keras.models.load_model(dk.data_path / f"{dk.model_filename}_model.h5") - elif model_type == 'stable_baselines_ppo': - from stable_baselines3.ppo.ppo import PPO - model = PPO.load(dk.data_path / f"{dk.model_filename}_model") - elif model_type == 'stable_baselines_dqn': - from stable_baselines3 import DQN - model = DQN.load(dk.data_path / f"{dk.model_filename}_model") + elif model_type == 'stable_baselines': + mod = __import__('stable_baselines3', fromlist=[ + self.freqai_info['rl_config']['model_type']]) + MODELCLASS = getattr(mod, self.freqai_info['rl_config']['model_type']) + model = MODELCLASS.load(dk.data_path / f"{dk.model_filename}_model") if Path(dk.data_path / f"{dk.model_filename}_svm_model.joblib").is_file(): dk.svm_model = load(dk.data_path / f"{dk.model_filename}_svm_model.joblib") diff --git a/freqtrade/freqai/prediction_models/ReinforcementLearner.py b/freqtrade/freqai/prediction_models/ReinforcementLearner.py new file mode 100644 index 000000000..2faa6eb3a --- /dev/null +++ b/freqtrade/freqai/prediction_models/ReinforcementLearner.py @@ -0,0 +1,82 @@ +import logging +from typing import Any, Dict # , Tuple + +# import numpy.typing as npt +import torch as th +import numpy as np +from freqtrade.freqai.data_kitchen import FreqaiDataKitchen +from freqtrade.freqai.RL.Base5ActionRLEnv import Actions, Base5ActionRLEnv, Positions +from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel +from pathlib import Path + +logger = logging.getLogger(__name__) + + +class ReinforcementLearner(BaseReinforcementLearningModel): + """ + User created Reinforcement Learning Model prediction model. 
+ """ + + def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen): + + train_df = data_dictionary["train_features"] + total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df) + + policy_kwargs = dict(activation_fn=th.nn.ReLU, + net_arch=[256, 256, 128]) + + model = self.MODELCLASS(self.policy_type, self.train_env, policy_kwargs=policy_kwargs, + tensorboard_log=Path(dk.data_path / "tensorboard"), + **self.freqai_info['model_training_parameters'] + ) + + model.learn( + total_timesteps=int(total_timesteps), + callback=self.eval_callback + ) + + if Path(dk.data_path / "best_model.zip").is_file(): + logger.info('Callback found a best model.') + best_model = self.MODELCLASS.load(dk.data_path / "best_model") + return best_model + + logger.info('Couldnt find best model, using final model instead.') + + return model + + +class MyRLEnv(Base5ActionRLEnv): + """ + User can modify any part of the environment by overriding base + functions + """ + def calculate_reward(self, action): + + if self._last_trade_tick is None: + return 0. + + # close long + if action == Actions.Long_exit.value and self._position == Positions.Long: + last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) + current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) + return float(np.log(current_price) - np.log(last_trade_price)) + + if action == Actions.Long_exit.value and self._position == Positions.Long: + if self.close_trade_profit[-1] > self.profit_aim * self.rr: + last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) + current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) + return float((np.log(current_price) - np.log(last_trade_price)) * 2) + + # close short + if action == Actions.Short_exit.value and self._position == Positions.Short: + last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) + current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) + return float(np.log(last_trade_price) - np.log(current_price)) + + if action == Actions.Short_exit.value and self._position == Positions.Short: + if self.close_trade_profit[-1] > self.profit_aim * self.rr: + last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) + current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) + return float((np.log(last_trade_price) - np.log(current_price)) * 2) + + return 0. 
diff --git a/freqtrade/freqai/RL/TDQNagent.py b/freqtrade/freqai/prediction_models/ReinforcementLearnerCustomAgent.py similarity index 81% rename from freqtrade/freqai/RL/TDQNagent.py rename to freqtrade/freqai/prediction_models/ReinforcementLearnerCustomAgent.py index 584f6a8ef..bb16b612b 100644 --- a/freqtrade/freqai/RL/TDQNagent.py +++ b/freqtrade/freqai/prediction_models/ReinforcementLearnerCustomAgent.py @@ -1,17 +1,59 @@ -from typing import Any, Dict, List, Optional, Tuple, Type, Union - -import gym -import torch +import logging import torch as th +from typing import Any, Dict, List, Optional, Tuple, Type, Union +from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel from stable_baselines3 import DQN from stable_baselines3.common.buffers import ReplayBuffer -from stable_baselines3.common.policies import BasePolicy -from stable_baselines3.common.torch_layers import (BaseFeaturesExtractor, - FlattenExtractor) -from stable_baselines3.common.type_aliases import GymEnv, Schedule +from freqtrade.freqai.data_kitchen import FreqaiDataKitchen +from pathlib import Path from stable_baselines3.dqn.policies import (CnnPolicy, DQNPolicy, MlpPolicy, QNetwork) from torch import nn +import gym +from stable_baselines3.common.torch_layers import (BaseFeaturesExtractor, + FlattenExtractor) +from stable_baselines3.common.type_aliases import GymEnv, Schedule +from stable_baselines3.common.policies import BasePolicy + +logger = logging.getLogger(__name__) + + +class ReinforcementLearnerCustomAgent(BaseReinforcementLearningModel): + """ + User can customize agent by defining the class and using it directly. + Here the example is "TDQN" + """ + + def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen): + + train_df = data_dictionary["train_features"] + total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df) + + policy_kwargs = dict(activation_fn=th.nn.ReLU, + net_arch=[256, 256, 128]) + + # TDQN is a custom agent defined below + model = TDQN(self.policy_type, self.train_env, + tensorboard_log=Path(dk.data_path / "tensorboard"), + policy_kwargs=policy_kwargs, + **self.freqai_info['model_training_parameters'] + ) + + model.learn( + total_timesteps=int(total_timesteps), + callback=self.eval_callback + ) + + if Path(dk.data_path / "best_model.zip").is_file(): + logger.info('Callback found a best model.') + best_model = self.MODELCLASS.load(dk.data_path / "best_model") + return best_model + + logger.info('Couldnt find best model, using final model instead.') + + return model + +# User creates their custom agent and networks as shown below def create_mlp_( @@ -72,7 +114,7 @@ class TDQNetwork(QNetwork): def init_weights(self, m): if type(m) == nn.Linear: - torch.nn.init.kaiming_uniform_(m.weight) + th.nn.init.kaiming_uniform_(m.weight) class TDQNPolicy(DQNPolicy): @@ -175,7 +217,7 @@ class TDQN(DQN): exploration_initial_eps: float = 1.0, exploration_final_eps: float = 0.05, max_grad_norm: float = 10, - tensorboard_log: Optional[str] = None, + tensorboard_log: Optional[Path] = None, create_eval_env: bool = False, policy_kwargs: Optional[Dict[str, Any]] = None, verbose: int = 1, diff --git a/freqtrade/freqai/prediction_models/ReinforcementLearner_multiproc.py b/freqtrade/freqai/prediction_models/ReinforcementLearner_multiproc.py new file mode 100644 index 000000000..1854bb1a5 --- /dev/null +++ b/freqtrade/freqai/prediction_models/ReinforcementLearner_multiproc.py @@ -0,0 +1,84 @@ +import logging +from typing import Any, Dict # , Tuple + 
+# import numpy.typing as npt +import torch as th +from stable_baselines3.common.callbacks import EvalCallback +from stable_baselines3.common.vec_env import SubprocVecEnv +from freqtrade.freqai.RL.BaseReinforcementLearningModel import (BaseReinforcementLearningModel, + make_env) +from freqtrade.freqai.data_kitchen import FreqaiDataKitchen + +from pathlib import Path + +logger = logging.getLogger(__name__) + + +class ReinforcementLearner_multiproc(BaseReinforcementLearningModel): + """ + User created Reinforcement Learning Model prediction model. + """ + + def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen): + + train_df = data_dictionary["train_features"] + total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df) + + # model arch + policy_kwargs = dict(activation_fn=th.nn.ReLU, + net_arch=[512, 512, 512]) + + model = self.MODELCLASS(self.policy_type, self.train_env, policy_kwargs=policy_kwargs, + tensorboard_log=Path(dk.data_path / "tensorboard"), + **self.freqai_info['model_training_parameters'] + ) + + model.learn( + total_timesteps=int(total_timesteps), + callback=self.eval_callback + ) + + if Path(dk.data_path / "best_model.zip").is_file(): + logger.info('Callback found a best model.') + best_model = self.MODELCLASS.load(dk.data_path / "best_model") + return best_model + + logger.info('Couldnt find best model, using final model instead.') + + return model + + def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test, dk): + """ + If user has particular environment configuration needs, they can do that by + overriding this function. In the present case, the user wants to setup training + environments for multiple workers. + """ + train_df = data_dictionary["train_features"] + test_df = data_dictionary["test_features"] + eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df) + + # environments + if not self.train_env: + env_id = "train_env" + num_cpu = int(self.freqai_info["data_kitchen_thread_count"] / 2) + self.train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train, + self.reward_params, self.CONV_WIDTH) for i + in range(num_cpu)]) + + eval_env_id = 'eval_env' + self.eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test, + self.reward_params, self.CONV_WIDTH, monitor=True) for i + in range(num_cpu)]) + self.eval_callback = EvalCallback(self.eval_env, deterministic=True, + render=False, eval_freq=eval_freq, + best_model_save_path=dk.data_path) + else: + self.train_env.env_method('reset') + self.eval_env.env_method('reset') + self.train_env.env_method('reset_env', train_df, prices_train, + self.CONV_WIDTH, self.reward_params) + self.eval_env.env_method('reset_env', train_df, prices_train, + self.CONV_WIDTH, self.reward_params) + self.eval_callback.__init__(self.eval_env, deterministic=True, + render=False, eval_freq=eval_freq, + best_model_save_path=dk.data_path) diff --git a/freqtrade/freqai/prediction_models/ReinforcementLearningPPO.py b/freqtrade/freqai/prediction_models/ReinforcementLearningPPO.py deleted file mode 100644 index 993ac263b..000000000 --- a/freqtrade/freqai/prediction_models/ReinforcementLearningPPO.py +++ /dev/null @@ -1,104 +0,0 @@ -import gc -import logging -from typing import Any, Dict # , Tuple - -import numpy as np -# import numpy.typing as npt -import torch as th -from stable_baselines3 import PPO -from stable_baselines3.common.callbacks import EvalCallback -from stable_baselines3.common.monitor import Monitor - -from 
freqtrade.freqai.data_kitchen import FreqaiDataKitchen -from freqtrade.freqai.RL.Base3ActionRLEnv import Actions, Base3ActionRLEnv, Positions -from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel - - -logger = logging.getLogger(__name__) - - -class ReinforcementLearningPPO(BaseReinforcementLearningModel): - """ - User created Reinforcement Learning Model prediction model. - """ - - def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen): - - train_df = data_dictionary["train_features"] - test_df = data_dictionary["test_features"] - eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df) - total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df) - - path = dk.data_path - eval_callback = EvalCallback(self.eval_env, best_model_save_path=f"{path}/", - log_path=f"{path}/ppo/logs/", eval_freq=int(eval_freq), - deterministic=True, render=False) - - # model arch - policy_kwargs = dict(activation_fn=th.nn.ReLU, - net_arch=[256, 256, 128]) - - model = PPO('MlpPolicy', self.train_env, policy_kwargs=policy_kwargs, - tensorboard_log=f"{path}/ppo/tensorboard/", - **self.freqai_info['model_training_parameters'] - ) - - model.learn( - total_timesteps=int(total_timesteps), - callback=eval_callback - ) - - del model - best_model = PPO.load(dk.data_path / "best_model") - - print('Training finished!') - gc.collect() - - return best_model - - def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test): - """ - User overrides this as shown here if they are using a custom MyRLEnv - """ - train_df = data_dictionary["train_features"] - test_df = data_dictionary["test_features"] - - # environments - if not self.train_env: - self.train_env = MyRLEnv(df=train_df, prices=prices_train, window_size=self.CONV_WIDTH, - reward_kwargs=self.reward_params) - self.eval_env = Monitor(MyRLEnv(df=test_df, prices=prices_test, - window_size=self.CONV_WIDTH, - reward_kwargs=self.reward_params), ".") - else: - self.train_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params) - self.eval_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params) - self.train_env.reset() - self.eval_env.reset() - - -class MyRLEnv(Base3ActionRLEnv): - """ - User can override any function in BaseRLEnv and gym.Env - """ - - def calculate_reward(self, action): - - if self._last_trade_tick is None: - return 0. - - # close long - if (action == Actions.Short.value or - action == Actions.Neutral.value) and self._position == Positions.Long: - last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) - current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) - return float(np.log(current_price) - np.log(last_trade_price)) - - # close short - if (action == Actions.Long.value or - action == Actions.Neutral.value) and self._position == Positions.Short: - last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) - current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) - return float(np.log(last_trade_price) - np.log(current_price)) - - return 0. 
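
The deleted `ReinforcementLearningPPO.py` above (and the `_multiproc` variants below) each carried their own copy of the environment setup; after this patch a single `make_env()` factory lives in `BaseReinforcementLearningModel` and is consumed by `ReinforcementLearner_multiproc` via `SubprocVecEnv`. A self-contained sketch of that pattern, with a registered `gym` environment standing in for `MyRLEnv`:

```python
from typing import Callable

import gym
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.utils import set_random_seed
from stable_baselines3.common.vec_env import SubprocVecEnv


def make_env(env_id: str, rank: int, seed: int, monitor: bool = False) -> Callable:
    """Return an env constructor; SubprocVecEnv calls it once per worker process."""
    def _init() -> gym.Env:
        # the real factory builds MyRLEnv(df=..., prices=..., id=env_id, seed=seed + rank);
        # a registered gym env keeps this sketch runnable on its own
        env = gym.make("CartPole-v1")
        if monitor:
            env = Monitor(env, ".")
        return env
    set_random_seed(seed)
    return _init


if __name__ == "__main__":  # required for SubprocVecEnv on spawn-based platforms
    num_cpu = 4
    train_env = SubprocVecEnv([make_env("train_env", i, 1) for i in range(num_cpu)])
    eval_env = SubprocVecEnv([make_env("eval_env", i, 1, monitor=True)
                              for i in range(num_cpu)])
    train_env.env_method("reset")   # same per-worker call the model issues on retrains
```
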
diff --git a/freqtrade/freqai/prediction_models/ReinforcementLearningPPO_multiproc.py b/freqtrade/freqai/prediction_models/ReinforcementLearningPPO_multiproc.py deleted file mode 100644 index 5fa24a599..000000000 --- a/freqtrade/freqai/prediction_models/ReinforcementLearningPPO_multiproc.py +++ /dev/null @@ -1,132 +0,0 @@ -import logging -from typing import Any, Dict # , Tuple - -import numpy as np -# import numpy.typing as npt -import torch as th -from stable_baselines3.common.monitor import Monitor -from typing import Callable -from stable_baselines3 import PPO -from stable_baselines3.common.callbacks import EvalCallback -from stable_baselines3.common.vec_env import SubprocVecEnv -from stable_baselines3.common.utils import set_random_seed -from freqtrade.freqai.RL.Base3ActionRLEnv import Base3ActionRLEnv, Actions, Positions -from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel -from freqtrade.freqai.data_kitchen import FreqaiDataKitchen -import gym - -logger = logging.getLogger(__name__) - - -def make_env(env_id: str, rank: int, seed: int, train_df, price, - reward_params, window_size, monitor=False) -> Callable: - """ - Utility function for multiprocessed env. - - :param env_id: (str) the environment ID - :param num_env: (int) the number of environment you wish to have in subprocesses - :param seed: (int) the inital seed for RNG - :param rank: (int) index of the subprocess - :return: (Callable) - """ - def _init() -> gym.Env: - - env = MyRLEnv(df=train_df, prices=price, window_size=window_size, - reward_kwargs=reward_params, id=env_id, seed=seed + rank) - if monitor: - env = Monitor(env, ".") - return env - set_random_seed(seed) - return _init - - -class ReinforcementLearningPPO_multiproc(BaseReinforcementLearningModel): - """ - User created Reinforcement Learning Model prediction model. - """ - - def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen): - - train_df = data_dictionary["train_features"] - test_df = data_dictionary["test_features"] - eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df) - total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df) - - path = dk.data_path - eval_callback = EvalCallback(self.eval_env, best_model_save_path=f"{path}/", - log_path=f"{path}/ppo/logs/", eval_freq=int(eval_freq), - deterministic=True, render=False) - - # model arch - policy_kwargs = dict(activation_fn=th.nn.ReLU, - net_arch=[512, 512, 512]) - - model = PPO('MlpPolicy', self.train_env, policy_kwargs=policy_kwargs, - tensorboard_log=f"{path}/ppo/tensorboard/", - **self.freqai_info['model_training_parameters'] - ) - - model.learn( - total_timesteps=int(total_timesteps), - callback=eval_callback - ) - - best_model = PPO.load(dk.data_path / "best_model") - print('Training finished!') - - return best_model - - def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test): - """ - User overrides this in their prediction model if they are custom a MyRLEnv. 
Othwerwise - leaving this will default to Base5ActEnv - """ - train_df = data_dictionary["train_features"] - test_df = data_dictionary["test_features"] - - # environments - if not self.train_env: - env_id = "train_env" - num_cpu = int(self.freqai_info["data_kitchen_thread_count"] / 2) - self.train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train, - self.reward_params, self.CONV_WIDTH) for i - in range(num_cpu)]) - - eval_env_id = 'eval_env' - self.eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test, - self.reward_params, self.CONV_WIDTH, monitor=True) for i - in range(num_cpu)]) - else: - self.train_env.env_method('reset_env', train_df, prices_train, - self.CONV_WIDTH, self.reward_params) - self.eval_env.env_method('reset_env', train_df, prices_train, - self.CONV_WIDTH, self.reward_params) - self.train_env.env_method('reset') - self.eval_env.env_method('reset') - - -class MyRLEnv(Base3ActionRLEnv): - """ - User can override any function in BaseRLEnv and gym.Env - """ - - def calculate_reward(self, action): - - if self._last_trade_tick is None: - return 0. - - # close long - if (action == Actions.Short.value or - action == Actions.Neutral.value) and self._position == Positions.Long: - last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) - current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) - return float(np.log(current_price) - np.log(last_trade_price)) - - # close short - if (action == Actions.Long.value or - action == Actions.Neutral.value) and self._position == Positions.Short: - last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) - current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) - return float(np.log(last_trade_price) - np.log(current_price)) - - return 0. diff --git a/freqtrade/freqai/prediction_models/ReinforcementLearningTDQN.py b/freqtrade/freqai/prediction_models/ReinforcementLearningTDQN.py deleted file mode 100644 index 3c4ac6bdb..000000000 --- a/freqtrade/freqai/prediction_models/ReinforcementLearningTDQN.py +++ /dev/null @@ -1,115 +0,0 @@ -import logging -from typing import Any, Dict # Optional -import torch as th -from stable_baselines3.common.callbacks import EvalCallback -from stable_baselines3.common.monitor import Monitor -from freqtrade.freqai.RL.Base5ActionRLEnv import Base5ActionRLEnv, Actions, Positions -from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel -from freqtrade.freqai.RL.TDQNagent import TDQN -from stable_baselines3 import DQN -from stable_baselines3.common.buffers import ReplayBuffer -import numpy as np -import gc -from freqtrade.freqai.data_kitchen import FreqaiDataKitchen - -logger = logging.getLogger(__name__) - - -class ReinforcementLearningTDQN(BaseReinforcementLearningModel): - """ - User created Reinforcement Learning Model prediction model. 
- """ - - def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen): - - train_df = data_dictionary["train_features"] - test_df = data_dictionary["test_features"] - eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df) - total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df) - - path = dk.data_path - eval_callback = EvalCallback(self.eval_env, best_model_save_path=f"{path}/", - log_path=f"{path}/tdqn/logs/", eval_freq=int(eval_freq), - deterministic=True, render=False) - - # model arch - policy_kwargs = dict(activation_fn=th.nn.ReLU, - net_arch=[256, 256, 128]) - - model = TDQN('TMultiInputPolicy', self.train_env, - tensorboard_log=f"{path}/tdqn/tensorboard/", - policy_kwargs=policy_kwargs, - replay_buffer_class=ReplayBuffer, - **self.freqai_info['model_training_parameters'] - ) - - model.learn( - total_timesteps=int(total_timesteps), - callback=eval_callback - ) - - del model - best_model = DQN.load(dk.data_path / "best_model") - - print('Training finished!') - gc.collect() - return best_model - - def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test): - """ - User overrides this as shown here if they are using a custom MyRLEnv - """ - train_df = data_dictionary["train_features"] - test_df = data_dictionary["test_features"] - - # environments - if not self.train_env: - self.train_env = MyRLEnv(df=train_df, prices=prices_train, window_size=self.CONV_WIDTH, - reward_kwargs=self.reward_params) - self.eval_env = Monitor(MyRLEnv(df=test_df, prices=prices_test, - window_size=self.CONV_WIDTH, - reward_kwargs=self.reward_params), ".") - else: - self.train_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params) - self.eval_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params) - self.train_env.reset() - self.eval_env.reset() - - -# User can inherit and customize 5 action environment -class MyRLEnv(Base5ActionRLEnv): - """ - User can override any function in BaseRLEnv and gym.Env. Here the user - Adds 5 actions. - """ - - def calculate_reward(self, action): - - if self._last_trade_tick is None: - return 0. 
- - # close long - if action == Actions.Long_sell.value and self._position == Positions.Long: - last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) - current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) - return float(np.log(current_price) - np.log(last_trade_price)) - - if action == Actions.Long_sell.value and self._position == Positions.Long: - if self.close_trade_profit[-1] > self.profit_aim * self.rr: - last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) - current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) - return float((np.log(current_price) - np.log(last_trade_price)) * 2) - - # close short - if action == Actions.Short_buy.value and self._position == Positions.Short: - last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) - current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) - return float(np.log(last_trade_price) - np.log(current_price)) - - if action == Actions.Short_buy.value and self._position == Positions.Short: - if self.close_trade_profit[-1] > self.profit_aim * self.rr: - last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) - current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) - return float((np.log(last_trade_price) - np.log(current_price)) * 2) - - return 0. diff --git a/freqtrade/freqai/prediction_models/ReinforcementLearningTDQN_multiproc.py b/freqtrade/freqai/prediction_models/ReinforcementLearningTDQN_multiproc.py deleted file mode 100644 index 8634fd958..000000000 --- a/freqtrade/freqai/prediction_models/ReinforcementLearningTDQN_multiproc.py +++ /dev/null @@ -1,148 +0,0 @@ -import logging -from typing import Any, Dict # Optional -import torch as th -import numpy as np -import gym -from typing import Callable -from stable_baselines3.common.callbacks import EvalCallback -# EvalCallback , StopTrainingOnNoModelImprovement, StopTrainingOnRewardThreshold -from stable_baselines3.common.monitor import Monitor -from stable_baselines3.common.vec_env import SubprocVecEnv -from stable_baselines3.common.utils import set_random_seed -from stable_baselines3 import DQN -from freqtrade.freqai.RL.Base5ActionRLEnv import Base5ActionRLEnv, Actions, Positions -from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel -from freqtrade.freqai.RL.TDQNagent import TDQN -from stable_baselines3.common.buffers import ReplayBuffer -from freqtrade.freqai.data_kitchen import FreqaiDataKitchen - -logger = logging.getLogger(__name__) - - -def make_env(env_id: str, rank: int, seed: int, train_df, price, - reward_params, window_size, monitor=False) -> Callable: - """ - Utility function for multiprocessed env. - - :param env_id: (str) the environment ID - :param num_env: (int) the number of environment you wish to have in subprocesses - :param seed: (int) the inital seed for RNG - :param rank: (int) index of the subprocess - :return: (Callable) - """ - def _init() -> gym.Env: - - env = MyRLEnv(df=train_df, prices=price, window_size=window_size, - reward_kwargs=reward_params, id=env_id, seed=seed + rank) - if monitor: - env = Monitor(env, ".") - return env - set_random_seed(seed) - return _init - - -class ReinforcementLearningTDQN_multiproc(BaseReinforcementLearningModel): - """ - User created Reinforcement Learning Model prediction model. 
- """ - - def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen): - - train_df = data_dictionary["train_features"] - test_df = data_dictionary["test_features"] - eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df) - total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df) - - path = dk.data_path - - eval_callback = EvalCallback(self.eval_env, best_model_save_path=f"{path}/", - log_path=f"{path}/tdqn/logs/", eval_freq=int(eval_freq), - deterministic=True, render=False) - # model arch - policy_kwargs = dict(activation_fn=th.nn.ReLU, - net_arch=[512, 512, 512]) - - model = TDQN('TMultiInputPolicy', self.train_env, - policy_kwargs=policy_kwargs, - tensorboard_log=f"{path}/tdqn/tensorboard/", - replay_buffer_class=ReplayBuffer, - **self.freqai_info['model_training_parameters'] - ) - - model.learn( - total_timesteps=int(total_timesteps), - callback=eval_callback - ) - - best_model = DQN.load(dk.data_path / "best_model.zip") - print('Training finished!') - - return best_model - - def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test): - """ - User overrides this in their prediction model if they are custom a MyRLEnv. Othwerwise - leaving this will default to Base5ActEnv - """ - train_df = data_dictionary["train_features"] - test_df = data_dictionary["test_features"] - - # environments - if not self.train_env: - env_id = "train_env" - num_cpu = int(self.freqai_info["data_kitchen_thread_count"] / 2) - self.train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train, - self.reward_params, self.CONV_WIDTH) for i - in range(num_cpu)]) - - eval_env_id = 'eval_env' - self.eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test, - self.reward_params, self.CONV_WIDTH, monitor=True) for i - in range(num_cpu)]) - else: - self.train_env.env_method('reset_env', train_df, prices_train, - self.CONV_WIDTH, self.reward_params) - self.eval_env.env_method('reset_env', train_df, prices_train, - self.CONV_WIDTH, self.reward_params) - self.train_env.env_method('reset') - self.eval_env.env_method('reset') - -# User can inherit and customize 5 action environment - - -class MyRLEnv(Base5ActionRLEnv): - """ - User can override any function in BaseRLEnv and gym.Env. Here the user - Adds 5 actions. - """ - - def calculate_reward(self, action): - - if self._last_trade_tick is None: - return 0. 
- - # close long - if action == Actions.Long_sell.value and self._position == Positions.Long: - last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) - current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) - return float(np.log(current_price) - np.log(last_trade_price)) - - if action == Actions.Long_sell.value and self._position == Positions.Long: - if self.close_trade_profit[-1] > self.profit_aim * self.rr: - last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) - current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) - return float((np.log(current_price) - np.log(last_trade_price)) * 2) - - # close short - if action == Actions.Short_buy.value and self._position == Positions.Short: - last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) - current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) - return float(np.log(last_trade_price) - np.log(current_price)) - - if action == Actions.Short_buy.value and self._position == Positions.Short: - if self.close_trade_profit[-1] > self.profit_aim * self.rr: - last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) - current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) - return float((np.log(last_trade_price) - np.log(current_price)) * 2) - - return 0.
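
Every `calculate_reward()` touched by this patch — the consolidated one in `Base5ActionRLEnv`, the `MyRLEnv` overrides, and the deleted per-agent copies above — is built from the same two ingredients: a fee-adjusted log return on closing a position, and a doubled reward when the closed trade beat `profit_aim * rr` (the "aim x2 rw" case). The sketch below folds both into pure functions so the arithmetic is easy to check; the exact behaviour of `add_buy_fee` / `add_sell_fee` is an assumption here (a symmetric ±fee on the candle open), as is the placeholder `FEE` value.

```python
import numpy as np

# model_reward_parameters from the example config
PROFIT_AIM = 0.02
RR = 1.0
FEE = 0.001   # assumed fee rate; the env's add_buy_fee/add_sell_fee encapsulate the real one


def close_long_reward(entry_open: float, exit_open: float, last_trade_profit: float) -> float:
    """Fee-adjusted log return of a closed long; doubled when the trade beat profit_aim * rr."""
    entry = entry_open * (1 + FEE)    # assumed behaviour of add_buy_fee on the entry open
    exit_ = exit_open * (1 - FEE)     # assumed behaviour of add_sell_fee on the exit open
    reward = float(np.log(exit_) - np.log(entry))
    if last_trade_profit > PROFIT_AIM * RR:
        reward *= 2                   # the "aim x2 rw" branch
    return reward


def close_short_reward(entry_open: float, exit_open: float, last_trade_profit: float) -> float:
    """Mirror image for shorts: log(entry) - log(exit), with fees applied the other way round."""
    entry = entry_open * (1 - FEE)    # assumed behaviour of add_sell_fee on the entry open
    exit_ = exit_open * (1 + FEE)     # assumed behaviour of add_buy_fee on the exit open
    reward = float(np.log(entry) - np.log(exit_))
    if last_trade_profit > PROFIT_AIM * RR:
        reward *= 2
    return reward


print(close_long_reward(100.0, 103.0, last_trade_profit=0.03))   # doubled: 0.03 > 0.02 * 1
print(close_short_reward(100.0, 99.0, last_trade_profit=0.005))  # not doubled
```
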