diff --git a/config_examples/config_freqai-rl.example.json b/config_examples/config_freqai-rl.example.json index 053c1a08e..1f12cbc6c 100644 --- a/config_examples/config_freqai-rl.example.json +++ b/config_examples/config_freqai-rl.example.json @@ -8,7 +8,7 @@ "tradable_balance_ratio": 1, "fiat_display_currency": "USD", "dry_run": true, - "timeframe": "3m", + "timeframe": "5m", "dataformat_ohlcv": "json", "dry_run_wallet": 12000, "cancel_open_orders_on_exit": true, @@ -35,7 +35,6 @@ }, "entry_pricing": { "price_side": "same", - "purge_old_models": true, "use_order_book": true, "order_book_top": 1, "price_last_balance": 0.0, @@ -56,10 +55,8 @@ ], "freqai": { "enabled": true, - "startup_candles": 1000, - "model_save_type": "stable_baselines_ppo", + "model_save_type": "stable_baselines_dqn", "conv_width": 10, - "follow_mode": false, "purge_old_models": true, "train_period_days": 10, "backtest_period_days": 2, @@ -71,13 +68,9 @@ "ETH/USDT" ], "include_timeframes": [ - "3m", - "15m" + "5m", + "30m" ], - "include_shifted_candles": 0, - "weight_factor": 0.9, - "principal_component_analysis": false, - "use_SVM_to_remove_outliers": false, "indicator_max_period_candles": 10, "indicator_periods_candles": [5, 10] }, @@ -86,16 +79,22 @@ "random_state": 1, "shuffle": false }, - "model_training_parameters": { - "ent_coef": 0.005, - "learning_rate": 0.000025, - "batch_size": 256, - "eval_cycles" : 5, - "train_cycles" : 15 + "model_training_parameters": { + "learning_rate": 0.00025, + "gamma": 0.9, + "target_update_interval": 5000, + "buffer_size": 50000, + "exploration_initial_eps":1, + "exploration_final_eps": 0.1, + "verbose": 1 }, - "model_reward_parameters": { - "rr": 1, - "profit_aim": 0.01 + "rl_config": { + "train_cycles": 15, + "eval_cycles": 5, + "model_reward_parameters": { + "rr": 1, + "profit_aim": 0.02 + } } }, "bot_name": "RL_test", diff --git a/freqtrade/freqai/RL/Base3ActionRLEnv.py b/freqtrade/freqai/RL/Base3ActionRLEnv.py index bf7b2fc7b..9d17b982d 100644 --- a/freqtrade/freqai/RL/Base3ActionRLEnv.py +++ b/freqtrade/freqai/RL/Base3ActionRLEnv.py @@ -6,6 +6,7 @@ import gym import numpy as np from gym import spaces from gym.utils import seeding +from pandas import DataFrame logger = logging.getLogger(__name__) @@ -35,7 +36,8 @@ class Base3ActionRLEnv(gym.Env): metadata = {'render.modes': ['human']} - def __init__(self, df, prices, reward_kwargs, window_size=10, starting_point=True, + def __init__(self, df: DataFrame = DataFrame(), prices: DataFrame = DataFrame(), + reward_kwargs: dict = {}, window_size=10, starting_point=True, id: str = 'baseenv-1', seed: int = 1): assert df.ndim == 2 diff --git a/freqtrade/freqai/RL/Base5ActionRLEnv.py b/freqtrade/freqai/RL/Base5ActionRLEnv.py index 5f817f14e..d7ceb5ff3 100644 --- a/freqtrade/freqai/RL/Base5ActionRLEnv.py +++ b/freqtrade/freqai/RL/Base5ActionRLEnv.py @@ -6,6 +6,7 @@ import gym import numpy as np from gym import spaces from gym.utils import seeding +from pandas import DataFrame logger = logging.getLogger(__name__) @@ -39,7 +40,8 @@ class Base5ActionRLEnv(gym.Env): """ metadata = {'render.modes': ['human']} - def __init__(self, df, prices, reward_kwargs, window_size=10, starting_point=True, + def __init__(self, df: DataFrame = DataFrame(), prices: DataFrame = DataFrame(), + reward_kwargs: dict = {}, window_size=10, starting_point=True, id: str = 'baseenv-1', seed: int = 1): assert df.ndim == 2 @@ -56,7 +58,7 @@ class Base5ActionRLEnv(gym.Env): self.fee = 0.0015 # # spaces - self.shape = (window_size, self.signal_features.shape[1]) + 
self.shape = (window_size, self.signal_features.shape[1] + 2) self.action_space = spaces.Discrete(len(Actions)) self.observation_space = spaces.Box( low=-np.inf, high=np.inf, shape=self.shape, dtype=np.float32) @@ -161,19 +163,26 @@ class Base5ActionRLEnv(gym.Env): self._done = True self._position_history.append(self._position) - observation = self._get_observation() + info = dict( tick=self._current_tick, total_reward=self.total_reward, total_profit=self._total_profit, position=self._position.value ) + + observation = self._get_observation() + self._update_history(info) return observation, step_reward, self._done, info def _get_observation(self): - return self.signal_features[(self._current_tick - self.window_size):self._current_tick] + features_and_state = self.signal_features[( + self._current_tick - self.window_size):self._current_tick] + features_and_state['current_profit_pct'] = self.get_unrealized_profit() + features_and_state['position'] = self._position.value + return features_and_state def get_unrealized_profit(self): diff --git a/freqtrade/freqai/RL/BaseReinforcementLearningModel.py b/freqtrade/freqai/RL/BaseReinforcementLearningModel.py index 78feea6d1..395b2a1a6 100644 --- a/freqtrade/freqai/RL/BaseReinforcementLearningModel.py +++ b/freqtrade/freqai/RL/BaseReinforcementLearningModel.py @@ -13,7 +13,7 @@ from freqtrade.persistence import Trade import torch.multiprocessing import torch as th logger = logging.getLogger(__name__) -th.set_num_threads(8) + torch.multiprocessing.set_sharing_strategy('file_system') @@ -22,6 +22,11 @@ class BaseReinforcementLearningModel(IFreqaiModel): User created Reinforcement Learning Model prediction model. """ + def __init__(self, **kwargs): + super().__init__(config=kwargs['config']) + th.set_num_threads(self.freqai_info.get('data_kitchen_thread_count', 4)) + self.reward_params = self.freqai_info['rl_config']['model_reward_parameters'] + def train( self, unfiltered_dataframe: DataFrame, pair: str, dk: FreqaiDataKitchen ) -> Any: @@ -62,12 +67,6 @@ class BaseReinforcementLearningModel(IFreqaiModel): model = self.fit_rl(data_dictionary, pair, dk, prices_train, prices_test) - if pair not in self.dd.historic_predictions: - self.set_initial_historic_predictions( - data_dictionary['train_features'], model, dk, pair) - - self.dd.save_historic_predictions_to_disk() - logger.info(f"--------------------done training {pair}--------------------") return model @@ -127,7 +126,8 @@ class BaseReinforcementLearningModel(IFreqaiModel): # optional additional data cleaning/analysis self.data_cleaning_predict(dk, filtered_dataframe) - pred_df = self.rl_model_predict(dk.data_dictionary["prediction_features"], dk, self.model) + pred_df = self.rl_model_predict( + dk.data_dictionary["prediction_features"], dk, self.model) pred_df.fillna(0, inplace=True) return (pred_df, dk.do_predict) @@ -135,10 +135,13 @@ class BaseReinforcementLearningModel(IFreqaiModel): def rl_model_predict(self, dataframe: DataFrame, dk: FreqaiDataKitchen, model: Any) -> DataFrame: - output = pd.DataFrame(np.full((len(dataframe), 1), 2), columns=dk.label_list) + output = pd.DataFrame(np.zeros(len(dataframe)), columns=dk.label_list) def _predict(window): + market_side, current_profit, total_profit = self.get_state_info(dk.pair) observations = dataframe.iloc[window.index] + observations['current_profit'] = current_profit + observations['position'] = market_side res, _ = model.predict(observations, deterministic=True) return res @@ -174,29 +177,6 @@ class BaseReinforcementLearningModel(IFreqaiModel): 
return prices_train, prices_test - def set_initial_historic_predictions( - self, df: DataFrame, model: Any, dk: FreqaiDataKitchen, pair: str - ) -> None: - - pred_df = self.rl_model_predict(df, dk, model) - pred_df.fillna(0, inplace=True) - self.dd.historic_predictions[pair] = pred_df - hist_preds_df = self.dd.historic_predictions[pair] - - for label in hist_preds_df.columns: - if hist_preds_df[label].dtype == object: - continue - hist_preds_df[f'{label}_mean'] = 0 - hist_preds_df[f'{label}_std'] = 0 - - hist_preds_df['do_predict'] = 0 - - if self.freqai_info['feature_parameters'].get('DI_threshold', 0) > 0: - hist_preds_df['DI_values'] = 0 - - for return_str in dk.data['extra_returns_per_train']: - hist_preds_df[return_str] = 0 - # TODO take care of this appendage. Right now it needs to be called because FreqAI enforces it. # But FreqaiRL needs more objects passed to fit() (like DK) and we dont want to go refactor # all the other existing fit() functions to include dk argument. For now we instantiate and diff --git a/freqtrade/freqai/prediction_models/ReinforcementLearningPPO.py b/freqtrade/freqai/prediction_models/ReinforcementLearningPPO.py index b437ea8aa..5dc7735d3 100644 --- a/freqtrade/freqai/prediction_models/ReinforcementLearningPPO.py +++ b/freqtrade/freqai/prediction_models/ReinforcementLearningPPO.py @@ -24,18 +24,16 @@ class ReinforcementLearningPPO(BaseReinforcementLearningModel): def fit_rl(self, data_dictionary: Dict[str, Any], pair: str, dk: FreqaiDataKitchen, prices_train: DataFrame, prices_test: DataFrame): - agent_params = self.freqai_info['model_training_parameters'] - reward_params = self.freqai_info['model_reward_parameters'] train_df = data_dictionary["train_features"] test_df = data_dictionary["test_features"] - eval_freq = agent_params.get("eval_cycles", 4) * len(test_df) - total_timesteps = agent_params["train_cycles"] * len(train_df) + eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df) + total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df) # environments train_env = MyRLEnv(df=train_df, prices=prices_train, window_size=self.CONV_WIDTH, - reward_kwargs=reward_params) + reward_kwargs=self.reward_params) eval = MyRLEnv(df=test_df, prices=prices_test, - window_size=self.CONV_WIDTH, reward_kwargs=reward_params) + window_size=self.CONV_WIDTH, reward_kwargs=self.reward_params) eval_env = Monitor(eval, ".") path = dk.data_path @@ -49,7 +47,7 @@ class ReinforcementLearningPPO(BaseReinforcementLearningModel): model = PPO('MlpPolicy', train_env, policy_kwargs=policy_kwargs, tensorboard_log=f"{path}/ppo/tensorboard/", learning_rate=0.00025, - gamma=0.9, verbose=1 + **self.freqai_info['model_training_parameters'] ) model.learn( diff --git a/freqtrade/freqai/prediction_models/ReinforcementLearningPPO_multiproc.py b/freqtrade/freqai/prediction_models/ReinforcementLearningPPO_multiproc.py index b1c5f316f..337e94607 100644 --- a/freqtrade/freqai/prediction_models/ReinforcementLearningPPO_multiproc.py +++ b/freqtrade/freqai/prediction_models/ReinforcementLearningPPO_multiproc.py @@ -51,23 +51,20 @@ class ReinforcementLearningPPO_multiproc(BaseReinforcementLearningModel): def fit_rl(self, data_dictionary: Dict[str, Any], pair: str, dk: FreqaiDataKitchen, prices_train: DataFrame, prices_test: DataFrame): - agent_params = self.freqai_info['model_training_parameters'] - reward_params = self.freqai_info['model_reward_parameters'] train_df = data_dictionary["train_features"] test_df = data_dictionary["test_features"] - eval_freq = 
agent_params.get("eval_cycles", 4) * len(test_df) - total_timesteps = agent_params["train_cycles"] * len(train_df) - learning_rate = agent_params["learning_rate"] + eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df) + total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df) env_id = "train_env" - th.set_num_threads(dk.thread_count) num_cpu = int(dk.thread_count / 2) - train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train, reward_params, - self.CONV_WIDTH) for i in range(num_cpu)]) + train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train, + self.reward_params, self.CONV_WIDTH) for i in range(num_cpu)]) eval_env_id = 'eval_env' - eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test, reward_params, - self.CONV_WIDTH, monitor=True) for i in range(num_cpu)]) + eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test, + self.reward_params, self.CONV_WIDTH, monitor=True) for i in + range(num_cpu)]) path = dk.data_path eval_callback = EvalCallback(eval_env, best_model_save_path=f"{path}/", @@ -80,9 +77,7 @@ class ReinforcementLearningPPO_multiproc(BaseReinforcementLearningModel): model = PPO('MlpPolicy', train_env, policy_kwargs=policy_kwargs, tensorboard_log=f"{path}/ppo/tensorboard/", - learning_rate=learning_rate, - gamma=0.9, - verbose=1 + **self.freqai_info['model_training_parameters'] ) model.learn( diff --git a/freqtrade/freqai/prediction_models/ReinforcementLearningTDQN.py b/freqtrade/freqai/prediction_models/ReinforcementLearningTDQN.py index a60bc1fa1..3a57142cf 100644 --- a/freqtrade/freqai/prediction_models/ReinforcementLearningTDQN.py +++ b/freqtrade/freqai/prediction_models/ReinforcementLearningTDQN.py @@ -3,8 +3,7 @@ from typing import Any, Dict # Optional import torch as th from stable_baselines3.common.callbacks import EvalCallback from stable_baselines3.common.monitor import Monitor -# from stable_baselines3.common.vec_env import SubprocVecEnv -from freqtrade.freqai.RL.Base3ActionRLEnv import Base3ActionRLEnv, Actions, Positions +from freqtrade.freqai.RL.Base5ActionRLEnv import Base5ActionRLEnv, Actions, Positions from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel from freqtrade.freqai.RL.TDQNagent import TDQN from stable_baselines3 import DQN @@ -25,18 +24,16 @@ class ReinforcementLearningTDQN(BaseReinforcementLearningModel): def fit_rl(self, data_dictionary: Dict[str, Any], pair: str, dk: FreqaiDataKitchen, prices_train: DataFrame, prices_test: DataFrame): - agent_params = self.freqai_info['model_training_parameters'] - reward_params = self.freqai_info['model_reward_parameters'] train_df = data_dictionary["train_features"] test_df = data_dictionary["test_features"] - eval_freq = agent_params["eval_cycles"] * len(test_df) - total_timesteps = agent_params["train_cycles"] * len(train_df) + eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df) + total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df) # environments train_env = MyRLEnv(df=train_df, prices=prices_train, window_size=self.CONV_WIDTH, - reward_kwargs=reward_params) + reward_kwargs=self.reward_params) eval = MyRLEnv(df=test_df, prices=prices_test, - window_size=self.CONV_WIDTH, reward_kwargs=reward_params) + window_size=self.CONV_WIDTH, reward_kwargs=self.reward_params) eval_env = Monitor(eval, ".") eval_env.reset() @@ -50,12 +47,10 @@ class ReinforcementLearningTDQN(BaseReinforcementLearningModel): net_arch=[256, 256, 
128]) model = TDQN('TMultiInputPolicy', train_env, - policy_kwargs=policy_kwargs, tensorboard_log=f"{path}/tdqn/tensorboard/", - learning_rate=0.00025, gamma=0.9, - target_update_interval=5000, buffer_size=50000, - exploration_initial_eps=1, exploration_final_eps=0.1, - replay_buffer_class=ReplayBuffer + policy_kwargs=policy_kwargs, + replay_buffer_class=ReplayBuffer, + **self.freqai_info['model_training_parameters'] ) model.learn( @@ -70,9 +65,11 @@ class ReinforcementLearningTDQN(BaseReinforcementLearningModel): return best_model -class MyRLEnv(Base3ActionRLEnv): +# User can inherit and customize 5 action environment +class MyRLEnv(Base5ActionRLEnv): """ - User can override any function in BaseRLEnv and gym.Env + User can override any function in BaseRLEnv and gym.Env. Here the user + Adds 5 actions. """ def calculate_reward(self, action): @@ -81,55 +78,27 @@ class MyRLEnv(Base3ActionRLEnv): return 0. # close long - if (action == Actions.Short.value or - action == Actions.Neutral.value) and self._position == Positions.Long: + if action == Actions.Long_sell.value and self._position == Positions.Long: last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) return float(np.log(current_price) - np.log(last_trade_price)) + if action == Actions.Long_sell.value and self._position == Positions.Long: + if self.close_trade_profit[-1] > self.profit_aim * self.rr: + last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) + current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) + return float((np.log(current_price) - np.log(last_trade_price)) * 2) + # close short - if (action == Actions.Long.value or - action == Actions.Neutral.value) and self._position == Positions.Short: + if action == Actions.Short_buy.value and self._position == Positions.Short: last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) return float(np.log(last_trade_price) - np.log(current_price)) + if action == Actions.Short_buy.value and self._position == Positions.Short: + if self.close_trade_profit[-1] > self.profit_aim * self.rr: + last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) + current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) + return float((np.log(last_trade_price) - np.log(current_price)) * 2) + return 0. - -# User can inherit and customize 5 action environment -# class MyRLEnv(Base5ActionRLEnv): -# """ -# User can override any function in BaseRLEnv and gym.Env. Here the user -# Adds 5 actions. -# """ - -# def calculate_reward(self, action): - -# if self._last_trade_tick is None: -# return 0. 
- -# # close long -# if action == Actions.Long_sell.value and self._position == Positions.Long: -# last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) -# current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) -# return float(np.log(current_price) - np.log(last_trade_price)) - -# if action == Actions.Long_sell.value and self._position == Positions.Long: -# if self.close_trade_profit[-1] > self.profit_aim * self.rr: -# last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) -# current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) -# return float((np.log(current_price) - np.log(last_trade_price)) * 2) - -# # close short -# if action == Actions.Short_buy.value and self._position == Positions.Short: -# last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) -# current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) -# return float(np.log(last_trade_price) - np.log(current_price)) - -# if action == Actions.Short_buy.value and self._position == Positions.Short: -# if self.close_trade_profit[-1] > self.profit_aim * self.rr: -# last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) -# current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) -# return float((np.log(last_trade_price) - np.log(current_price)) * 2) - -# return 0. diff --git a/freqtrade/freqai/prediction_models/ReinforcementLearningTDQN_multiproc.py b/freqtrade/freqai/prediction_models/ReinforcementLearningTDQN_multiproc.py index 51e3c07c4..bf9e03b7f 100644 --- a/freqtrade/freqai/prediction_models/ReinforcementLearningTDQN_multiproc.py +++ b/freqtrade/freqai/prediction_models/ReinforcementLearningTDQN_multiproc.py @@ -10,7 +10,7 @@ from stable_baselines3.common.monitor import Monitor from stable_baselines3.common.vec_env import SubprocVecEnv from stable_baselines3.common.utils import set_random_seed from stable_baselines3 import DQN -from freqtrade.freqai.RL.Base3ActionRLEnv import Base3ActionRLEnv, Actions, Positions +from freqtrade.freqai.RL.Base5ActionRLEnv import Base5ActionRLEnv, Actions, Positions from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel from freqtrade.freqai.RL.TDQNagent import TDQN from stable_baselines3.common.buffers import ReplayBuffer @@ -50,22 +50,20 @@ class ReinforcementLearningTDQN_multiproc(BaseReinforcementLearningModel): def fit_rl(self, data_dictionary: Dict[str, Any], pair: str, dk: FreqaiDataKitchen, prices_train: DataFrame, prices_test: DataFrame): - agent_params = self.freqai_info['model_training_parameters'] - reward_params = self.freqai_info['model_reward_parameters'] train_df = data_dictionary["train_features"] test_df = data_dictionary["test_features"] - eval_freq = agent_params["eval_cycles"] * len(test_df) - total_timesteps = agent_params["train_cycles"] * len(train_df) - learning_rate = agent_params["learning_rate"] + eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df) + total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df) env_id = "train_env" num_cpu = int(dk.thread_count / 2) - train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train, reward_params, - self.CONV_WIDTH) for i in range(num_cpu)]) + train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train, + self.reward_params, self.CONV_WIDTH) for i in range(num_cpu)]) eval_env_id = 'eval_env' - eval_env = 
SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test, reward_params, - self.CONV_WIDTH, monitor=True) for i in range(num_cpu)]) + eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test, + self.reward_params, self.CONV_WIDTH, monitor=True) for i in + range(num_cpu)]) path = dk.data_path stop_train_callback = StopTrainingOnNoModelImprovement( @@ -91,10 +89,8 @@ class ReinforcementLearningTDQN_multiproc(BaseReinforcementLearningModel): model = TDQN('TMultiInputPolicy', train_env, policy_kwargs=policy_kwargs, tensorboard_log=f"{path}/tdqn/tensorboard/", - learning_rate=learning_rate, gamma=0.9, - target_update_interval=5000, buffer_size=50000, - exploration_initial_eps=1, exploration_final_eps=0.1, - replay_buffer_class=ReplayBuffer + replay_buffer_class=ReplayBuffer, + **self.freqai_info['model_training_parameters'] ) model.learn( @@ -109,9 +105,11 @@ class ReinforcementLearningTDQN_multiproc(BaseReinforcementLearningModel): return best_model -class MyRLEnv(Base3ActionRLEnv): +# User can inherit and customize 5 action environment +class MyRLEnv(Base5ActionRLEnv): """ - User can override any function in BaseRLEnv and gym.Env + User can override any function in BaseRLEnv and gym.Env. Here the user + Adds 5 actions. """ def calculate_reward(self, action): @@ -120,55 +118,27 @@ class MyRLEnv(Base3ActionRLEnv): return 0. # close long - if (action == Actions.Short.value or - action == Actions.Neutral.value) and self._position == Positions.Long: + if action == Actions.Long_sell.value and self._position == Positions.Long: last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) return float(np.log(current_price) - np.log(last_trade_price)) + if action == Actions.Long_sell.value and self._position == Positions.Long: + if self.close_trade_profit[-1] > self.profit_aim * self.rr: + last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) + current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) + return float((np.log(current_price) - np.log(last_trade_price)) * 2) + # close short - if (action == Actions.Long.value or - action == Actions.Neutral.value) and self._position == Positions.Short: + if action == Actions.Short_buy.value and self._position == Positions.Short: last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) return float(np.log(last_trade_price) - np.log(current_price)) + if action == Actions.Short_buy.value and self._position == Positions.Short: + if self.close_trade_profit[-1] > self.profit_aim * self.rr: + last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) + current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) + return float((np.log(last_trade_price) - np.log(current_price)) * 2) + return 0. - -# User can inherit and customize 5 action environment -# class MyRLEnv(Base5ActionRLEnv): -# """ -# User can override any function in BaseRLEnv and gym.Env. Here the user -# Adds 5 actions. -# """ - -# def calculate_reward(self, action): - -# if self._last_trade_tick is None: -# return 0. 
- -# # close long -# if action == Actions.Long_sell.value and self._position == Positions.Long: -# last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) -# current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) -# return float(np.log(current_price) - np.log(last_trade_price)) - -# if action == Actions.Long_sell.value and self._position == Positions.Long: -# if self.close_trade_profit[-1] > self.profit_aim * self.rr: -# last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) -# current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) -# return float((np.log(current_price) - np.log(last_trade_price)) * 2) - -# # close short -# if action == Actions.Short_buy.value and self._position == Positions.Short: -# last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) -# current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) -# return float(np.log(last_trade_price) - np.log(current_price)) - -# if action == Actions.Short_buy.value and self._position == Positions.Short: -# if self.close_trade_profit[-1] > self.profit_aim * self.rr: -# last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) -# current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) -# return float((np.log(last_trade_price) - np.log(current_price)) * 2) - -# return 0.
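
Note on the config refactor in `config_freqai-rl.example.json` and the `fit_rl` implementations above: the agent hyperparameters move out of hard-coded constructor arguments and into `model_training_parameters`, which is now splatted directly into the stable-baselines3 agent, while the new `rl_config` block carries the FreqAI-side settings (`train_cycles`, `eval_cycles`, `model_reward_parameters`). A minimal sketch of how these sections map onto a training run, using placeholder dataframe lengths:

```python
# Sketch only: shows how the new config sections are consumed, assuming a
# freqai_info dict shaped like config_freqai-rl.example.json above.
freqai_info = {
    "rl_config": {
        "train_cycles": 15,
        "eval_cycles": 5,
        "model_reward_parameters": {"rr": 1, "profit_aim": 0.02},
    },
    "model_training_parameters": {
        "learning_rate": 0.00025,
        "gamma": 0.9,
        "target_update_interval": 5000,
        "buffer_size": 50000,
        "exploration_initial_eps": 1,
        "exploration_final_eps": 0.1,
        "verbose": 1,
    },
}

n_train, n_test = 5000, 1000              # stand-ins for len(train_df), len(test_df)
rl_config = freqai_info["rl_config"]

# one "cycle" is one pass over the respective dataframe
total_timesteps = rl_config["train_cycles"] * n_train   # 75000
eval_freq = rl_config["eval_cycles"] * n_test           # 5000

# stored as self.reward_params in BaseReinforcementLearningModel.__init__
# and forwarded to the environment as reward_kwargs
reward_params = rl_config["model_reward_parameters"]

# everything in model_training_parameters goes straight to the agent, e.g.
#   DQN('MlpPolicy', train_env, **freqai_info['model_training_parameters'])
print(total_timesteps, eval_freq, reward_params)
```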
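The `Base5ActionRLEnv` change widens the observation from a pure feature window to a window plus two state columns, `current_profit_pct` and `position`, which is why `self.shape` grows to `signal_features.shape[1] + 2`. The predict path in `rl_model_predict` appends the same state via `get_state_info`, though under the column name `current_profit`. A small self-contained sketch of the resulting observation, with hypothetical feature names:

```python
import numpy as np
import pandas as pd

# Sketch of the widened observation; column names mirror
# Base5ActionRLEnv._get_observation(), feature names are hypothetical.
window_size = 3
signal_features = pd.DataFrame({
    "%-rsi": np.random.rand(10),
    "%-mfi": np.random.rand(10),
})

current_tick = 7
unrealized_profit = 0.012     # stand-in for self.get_unrealized_profit()
position_value = 1            # stand-in for self._position.value

obs = signal_features.iloc[current_tick - window_size:current_tick].copy()
obs["current_profit_pct"] = unrealized_profit
obs["position"] = position_value

# matches self.shape = (window_size, self.signal_features.shape[1] + 2)
assert obs.shape == (window_size, signal_features.shape[1] + 2)
print(obs)
```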
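For readers unfamiliar with the `*_multiproc` variants: `make_env` (defined in those files but not shown in these hunks) follows the standard stable-baselines3 factory pattern consumed by `SubprocVecEnv`. The sketch below illustrates that pattern under stated assumptions; the `env_class` parameter is added here only so the snippet stands alone and is not part of the actual signature in the diff.

```python
from typing import Callable

from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.utils import set_random_seed


def make_env(env_class, env_id: str, rank: int, seed: int, df, prices,
             reward_params: dict, window_size: int, monitor: bool = False) -> Callable:
    """Return a no-argument environment factory for SubprocVecEnv.

    env_class would be MyRLEnv in the files above; passing it in explicitly
    is an assumption made so this sketch is self-contained.
    """
    def _init():
        env = env_class(df=df, prices=prices, reward_kwargs=reward_params,
                        window_size=window_size, id=env_id, seed=seed + rank)
        if monitor:
            env = Monitor(env, ".")
        return env

    set_random_seed(seed)
    return _init


# Usage (assuming MyRLEnv, train_df, prices_train and num_cpu are available):
#   train_env = SubprocVecEnv([make_env(MyRLEnv, "train_env", i, 1, train_df,
#                                       prices_train, reward_params, 10)
#                              for i in range(num_cpu)])
```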