From 5d4e5e69fe44aa9dedb9dcfdf43adfe240d9832b Mon Sep 17 00:00:00 2001
From: robcaulk
Date: Thu, 18 Aug 2022 13:02:47 +0200
Subject: [PATCH] reinforce training with state info, reinforce prediction
 with state info, restructure config to accommodate all parameters from any
 user imported model type. Set 5Act to default env on TDQN. Clean example
 config.

---
 config_examples/config_freqai-rl.example.json | 39 ++++-----
 freqtrade/freqai/RL/Base3ActionRLEnv.py       |  4 +-
 freqtrade/freqai/RL/Base5ActionRLEnv.py       | 17 +++-
 .../RL/BaseReinforcementLearningModel.py      | 44 +++-------
 .../ReinforcementLearningPPO.py               | 12 ++-
 .../ReinforcementLearningPPO_multiproc.py     | 21 ++---
 .../ReinforcementLearningTDQN.py              | 83 ++++++------------
 .../ReinforcementLearningTDQN_multiproc.py    | 86 ++++++-------------
 8 files changed, 114 insertions(+), 192 deletions(-)

diff --git a/config_examples/config_freqai-rl.example.json b/config_examples/config_freqai-rl.example.json
index 053c1a08e..1f12cbc6c 100644
--- a/config_examples/config_freqai-rl.example.json
+++ b/config_examples/config_freqai-rl.example.json
@@ -8,7 +8,7 @@
     "tradable_balance_ratio": 1,
     "fiat_display_currency": "USD",
     "dry_run": true,
-    "timeframe": "3m",
+    "timeframe": "5m",
     "dataformat_ohlcv": "json",
     "dry_run_wallet": 12000,
     "cancel_open_orders_on_exit": true,
@@ -35,7 +35,6 @@
     },
     "entry_pricing": {
         "price_side": "same",
-        "purge_old_models": true,
         "use_order_book": true,
         "order_book_top": 1,
         "price_last_balance": 0.0,
@@ -56,10 +55,8 @@
     ],
     "freqai": {
         "enabled": true,
-        "startup_candles": 1000,
-        "model_save_type": "stable_baselines_ppo",
+        "model_save_type": "stable_baselines_dqn",
         "conv_width": 10,
-        "follow_mode": false,
         "purge_old_models": true,
         "train_period_days": 10,
         "backtest_period_days": 2,
@@ -71,13 +68,9 @@
                 "ETH/USDT"
             ],
             "include_timeframes": [
-                "3m",
-                "15m"
+                "5m",
+                "30m"
             ],
-            "include_shifted_candles": 0,
-            "weight_factor": 0.9,
-            "principal_component_analysis": false,
-            "use_SVM_to_remove_outliers": false,
             "indicator_max_period_candles": 10,
             "indicator_periods_candles": [5, 10]
         },
@@ -86,16 +79,22 @@
             "random_state": 1,
             "shuffle": false
         },
-        "model_training_parameters": {
-            "ent_coef": 0.005,
-            "learning_rate": 0.000025,
-            "batch_size": 256,
-            "eval_cycles" : 5,
-            "train_cycles" : 15
+        "model_training_parameters": {
+            "learning_rate": 0.00025,
+            "gamma": 0.9,
+            "target_update_interval": 5000,
+            "buffer_size": 50000,
+            "exploration_initial_eps":1,
+            "exploration_final_eps": 0.1,
+            "verbose": 1
         },
-        "model_reward_parameters": {
-            "rr": 1,
-            "profit_aim": 0.01
+        "rl_config": {
+            "train_cycles": 15,
+            "eval_cycles": 5,
+            "model_reward_parameters": {
+                "rr": 1,
+                "profit_aim": 0.02
+            }
         }
     },
     "bot_name": "RL_test",
diff --git a/freqtrade/freqai/RL/Base3ActionRLEnv.py b/freqtrade/freqai/RL/Base3ActionRLEnv.py
index bf7b2fc7b..9d17b982d 100644
--- a/freqtrade/freqai/RL/Base3ActionRLEnv.py
+++ b/freqtrade/freqai/RL/Base3ActionRLEnv.py
@@ -6,6 +6,7 @@ import gym
 import numpy as np
 from gym import spaces
 from gym.utils import seeding
+from pandas import DataFrame
 
 logger = logging.getLogger(__name__)
 
@@ -35,7 +36,8 @@ class Base3ActionRLEnv(gym.Env):
 
     metadata = {'render.modes': ['human']}
 
-    def __init__(self, df, prices, reward_kwargs, window_size=10, starting_point=True,
+    def __init__(self, df: DataFrame = DataFrame(), prices: DataFrame = DataFrame(),
+                 reward_kwargs: dict = {}, window_size=10, starting_point=True,
                  id: str = 'baseenv-1', seed: int = 1):
         assert df.ndim == 2
 
diff --git a/freqtrade/freqai/RL/Base5ActionRLEnv.py b/freqtrade/freqai/RL/Base5ActionRLEnv.py
index 5f817f14e..d7ceb5ff3 100644
--- a/freqtrade/freqai/RL/Base5ActionRLEnv.py
+++ b/freqtrade/freqai/RL/Base5ActionRLEnv.py
@@ -6,6 +6,7 @@ import gym
 import numpy as np
 from gym import spaces
 from gym.utils import seeding
+from pandas import DataFrame
 
 logger = logging.getLogger(__name__)
 
@@ -39,7 +40,8 @@
     """
     metadata = {'render.modes': ['human']}
 
-    def __init__(self, df, prices, reward_kwargs, window_size=10, starting_point=True,
+    def __init__(self, df: DataFrame = DataFrame(), prices: DataFrame = DataFrame(),
+                 reward_kwargs: dict = {}, window_size=10, starting_point=True,
                  id: str = 'baseenv-1', seed: int = 1):
         assert df.ndim == 2
 
@@ -56,7 +58,7 @@ class Base5ActionRLEnv(gym.Env):
         self.fee = 0.0015
 
         # # spaces
-        self.shape = (window_size, self.signal_features.shape[1])
+        self.shape = (window_size, self.signal_features.shape[1] + 2)
         self.action_space = spaces.Discrete(len(Actions))
         self.observation_space = spaces.Box(
             low=-np.inf, high=np.inf, shape=self.shape, dtype=np.float32)
@@ -161,19 +163,26 @@ class Base5ActionRLEnv(gym.Env):
            self._done = True
 
        self._position_history.append(self._position)
-        observation = self._get_observation()
+
        info = dict(
            tick=self._current_tick,
            total_reward=self.total_reward,
            total_profit=self._total_profit,
            position=self._position.value
        )
+
+        observation = self._get_observation()
+
        self._update_history(info)
 
        return observation, step_reward, self._done, info
 
    def _get_observation(self):
-        return self.signal_features[(self._current_tick - self.window_size):self._current_tick]
+        features_and_state = self.signal_features[(
+            self._current_tick - self.window_size):self._current_tick]
+        features_and_state['current_profit_pct'] = self.get_unrealized_profit()
+        features_and_state['position'] = self._position.value
+        return features_and_state
 
    def get_unrealized_profit(self):
diff --git a/freqtrade/freqai/RL/BaseReinforcementLearningModel.py b/freqtrade/freqai/RL/BaseReinforcementLearningModel.py
index 78feea6d1..395b2a1a6 100644
--- a/freqtrade/freqai/RL/BaseReinforcementLearningModel.py
+++ b/freqtrade/freqai/RL/BaseReinforcementLearningModel.py
@@ -13,7 +13,7 @@ from freqtrade.persistence import Trade
 import torch.multiprocessing
 import torch as th
 
 logger = logging.getLogger(__name__)
-th.set_num_threads(8)
+
 torch.multiprocessing.set_sharing_strategy('file_system')
 
@@ -22,6 +22,11 @@ class BaseReinforcementLearningModel(IFreqaiModel):
     User created Reinforcement Learning Model prediction model.
""" + def __init__(self, **kwargs): + super().__init__(config=kwargs['config']) + th.set_num_threads(self.freqai_info.get('data_kitchen_thread_count', 4)) + self.reward_params = self.freqai_info['rl_config']['model_reward_parameters'] + def train( self, unfiltered_dataframe: DataFrame, pair: str, dk: FreqaiDataKitchen ) -> Any: @@ -62,12 +67,6 @@ class BaseReinforcementLearningModel(IFreqaiModel): model = self.fit_rl(data_dictionary, pair, dk, prices_train, prices_test) - if pair not in self.dd.historic_predictions: - self.set_initial_historic_predictions( - data_dictionary['train_features'], model, dk, pair) - - self.dd.save_historic_predictions_to_disk() - logger.info(f"--------------------done training {pair}--------------------") return model @@ -127,7 +126,8 @@ class BaseReinforcementLearningModel(IFreqaiModel): # optional additional data cleaning/analysis self.data_cleaning_predict(dk, filtered_dataframe) - pred_df = self.rl_model_predict(dk.data_dictionary["prediction_features"], dk, self.model) + pred_df = self.rl_model_predict( + dk.data_dictionary["prediction_features"], dk, self.model) pred_df.fillna(0, inplace=True) return (pred_df, dk.do_predict) @@ -135,10 +135,13 @@ class BaseReinforcementLearningModel(IFreqaiModel): def rl_model_predict(self, dataframe: DataFrame, dk: FreqaiDataKitchen, model: Any) -> DataFrame: - output = pd.DataFrame(np.full((len(dataframe), 1), 2), columns=dk.label_list) + output = pd.DataFrame(np.zeros(len(dataframe)), columns=dk.label_list) def _predict(window): + market_side, current_profit, total_profit = self.get_state_info(dk.pair) observations = dataframe.iloc[window.index] + observations['current_profit'] = current_profit + observations['position'] = market_side res, _ = model.predict(observations, deterministic=True) return res @@ -174,29 +177,6 @@ class BaseReinforcementLearningModel(IFreqaiModel): return prices_train, prices_test - def set_initial_historic_predictions( - self, df: DataFrame, model: Any, dk: FreqaiDataKitchen, pair: str - ) -> None: - - pred_df = self.rl_model_predict(df, dk, model) - pred_df.fillna(0, inplace=True) - self.dd.historic_predictions[pair] = pred_df - hist_preds_df = self.dd.historic_predictions[pair] - - for label in hist_preds_df.columns: - if hist_preds_df[label].dtype == object: - continue - hist_preds_df[f'{label}_mean'] = 0 - hist_preds_df[f'{label}_std'] = 0 - - hist_preds_df['do_predict'] = 0 - - if self.freqai_info['feature_parameters'].get('DI_threshold', 0) > 0: - hist_preds_df['DI_values'] = 0 - - for return_str in dk.data['extra_returns_per_train']: - hist_preds_df[return_str] = 0 - # TODO take care of this appendage. Right now it needs to be called because FreqAI enforces it. # But FreqaiRL needs more objects passed to fit() (like DK) and we dont want to go refactor # all the other existing fit() functions to include dk argument. 
diff --git a/freqtrade/freqai/prediction_models/ReinforcementLearningPPO.py b/freqtrade/freqai/prediction_models/ReinforcementLearningPPO.py
index b437ea8aa..5dc7735d3 100644
--- a/freqtrade/freqai/prediction_models/ReinforcementLearningPPO.py
+++ b/freqtrade/freqai/prediction_models/ReinforcementLearningPPO.py
@@ -24,18 +24,16 @@ class ReinforcementLearningPPO(BaseReinforcementLearningModel):
     def fit_rl(self, data_dictionary: Dict[str, Any], pair: str, dk: FreqaiDataKitchen,
                prices_train: DataFrame, prices_test: DataFrame):
 
-        agent_params = self.freqai_info['model_training_parameters']
-        reward_params = self.freqai_info['model_reward_parameters']
         train_df = data_dictionary["train_features"]
         test_df = data_dictionary["test_features"]
-        eval_freq = agent_params.get("eval_cycles", 4) * len(test_df)
-        total_timesteps = agent_params["train_cycles"] * len(train_df)
+        eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df)
+        total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df)
 
         # environments
         train_env = MyRLEnv(df=train_df, prices=prices_train, window_size=self.CONV_WIDTH,
-                            reward_kwargs=reward_params)
+                            reward_kwargs=self.reward_params)
         eval = MyRLEnv(df=test_df, prices=prices_test,
-                       window_size=self.CONV_WIDTH, reward_kwargs=reward_params)
+                       window_size=self.CONV_WIDTH, reward_kwargs=self.reward_params)
         eval_env = Monitor(eval, ".")
 
         path = dk.data_path
@@ -49,7 +47,7 @@ class ReinforcementLearningPPO(BaseReinforcementLearningModel):
 
         model = PPO('MlpPolicy', train_env, policy_kwargs=policy_kwargs,
                     tensorboard_log=f"{path}/ppo/tensorboard/",
                     learning_rate=0.00025,
-                    gamma=0.9, verbose=1
+                    **self.freqai_info['model_training_parameters']
                     )
 
         model.learn(
diff --git a/freqtrade/freqai/prediction_models/ReinforcementLearningPPO_multiproc.py b/freqtrade/freqai/prediction_models/ReinforcementLearningPPO_multiproc.py
index b1c5f316f..337e94607 100644
--- a/freqtrade/freqai/prediction_models/ReinforcementLearningPPO_multiproc.py
+++ b/freqtrade/freqai/prediction_models/ReinforcementLearningPPO_multiproc.py
@@ -51,23 +51,20 @@ class ReinforcementLearningPPO_multiproc(BaseReinforcementLearningModel):
     def fit_rl(self, data_dictionary: Dict[str, Any], pair: str, dk: FreqaiDataKitchen,
                prices_train: DataFrame, prices_test: DataFrame):
 
-        agent_params = self.freqai_info['model_training_parameters']
-        reward_params = self.freqai_info['model_reward_parameters']
         train_df = data_dictionary["train_features"]
         test_df = data_dictionary["test_features"]
-        eval_freq = agent_params.get("eval_cycles", 4) * len(test_df)
-        total_timesteps = agent_params["train_cycles"] * len(train_df)
-        learning_rate = agent_params["learning_rate"]
+        eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df)
+        total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df)
 
         env_id = "train_env"
-        th.set_num_threads(dk.thread_count)
         num_cpu = int(dk.thread_count / 2)
-        train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train, reward_params,
-                                            self.CONV_WIDTH) for i in range(num_cpu)])
+        train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train,
+                                            self.reward_params, self.CONV_WIDTH) for i in range(num_cpu)])
 
         eval_env_id = 'eval_env'
-        eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test, reward_params,
-                                           self.CONV_WIDTH, monitor=True) for i in range(num_cpu)])
+        eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test,
+                                           self.reward_params, self.CONV_WIDTH, monitor=True) for i in
+                                           range(num_cpu)])
 
         path = dk.data_path
         eval_callback = EvalCallback(eval_env, best_model_save_path=f"{path}/",
@@ -80,9 +77,7 @@ class ReinforcementLearningPPO_multiproc(BaseReinforcementLearningModel):
 
         model = PPO('MlpPolicy', train_env, policy_kwargs=policy_kwargs,
                     tensorboard_log=f"{path}/ppo/tensorboard/",
-                    learning_rate=learning_rate,
-                    gamma=0.9,
-                    verbose=1
+                    **self.freqai_info['model_training_parameters']
                     )
 
         model.learn(
diff --git a/freqtrade/freqai/prediction_models/ReinforcementLearningTDQN.py b/freqtrade/freqai/prediction_models/ReinforcementLearningTDQN.py
index a60bc1fa1..3a57142cf 100644
--- a/freqtrade/freqai/prediction_models/ReinforcementLearningTDQN.py
+++ b/freqtrade/freqai/prediction_models/ReinforcementLearningTDQN.py
@@ -3,8 +3,7 @@ from typing import Any, Dict  # Optional
 import torch as th
 from stable_baselines3.common.callbacks import EvalCallback
 from stable_baselines3.common.monitor import Monitor
-# from stable_baselines3.common.vec_env import SubprocVecEnv
-from freqtrade.freqai.RL.Base3ActionRLEnv import Base3ActionRLEnv, Actions, Positions
+from freqtrade.freqai.RL.Base5ActionRLEnv import Base5ActionRLEnv, Actions, Positions
 from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel
 from freqtrade.freqai.RL.TDQNagent import TDQN
 from stable_baselines3 import DQN
@@ -25,18 +24,16 @@ class ReinforcementLearningTDQN(BaseReinforcementLearningModel):
     def fit_rl(self, data_dictionary: Dict[str, Any], pair: str, dk: FreqaiDataKitchen,
                prices_train: DataFrame, prices_test: DataFrame):
 
-        agent_params = self.freqai_info['model_training_parameters']
-        reward_params = self.freqai_info['model_reward_parameters']
         train_df = data_dictionary["train_features"]
         test_df = data_dictionary["test_features"]
-        eval_freq = agent_params["eval_cycles"] * len(test_df)
-        total_timesteps = agent_params["train_cycles"] * len(train_df)
+        eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df)
+        total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df)
 
         # environments
         train_env = MyRLEnv(df=train_df, prices=prices_train, window_size=self.CONV_WIDTH,
-                            reward_kwargs=reward_params)
+                            reward_kwargs=self.reward_params)
         eval = MyRLEnv(df=test_df, prices=prices_test,
-                       window_size=self.CONV_WIDTH, reward_kwargs=reward_params)
+                       window_size=self.CONV_WIDTH, reward_kwargs=self.reward_params)
         eval_env = Monitor(eval, ".")
         eval_env.reset()
 
@@ -50,12 +47,10 @@ class ReinforcementLearningTDQN(BaseReinforcementLearningModel):
                              net_arch=[256, 256, 128])
 
         model = TDQN('TMultiInputPolicy', train_env,
-                     policy_kwargs=policy_kwargs,
                      tensorboard_log=f"{path}/tdqn/tensorboard/",
-                     learning_rate=0.00025, gamma=0.9,
-                     target_update_interval=5000, buffer_size=50000,
-                     exploration_initial_eps=1, exploration_final_eps=0.1,
-                     replay_buffer_class=ReplayBuffer
+                     policy_kwargs=policy_kwargs,
+                     replay_buffer_class=ReplayBuffer,
+                     **self.freqai_info['model_training_parameters']
                      )
 
         model.learn(
@@ -70,9 +65,11 @@ class ReinforcementLearningTDQN(BaseReinforcementLearningModel):
         return best_model
 
 
-class MyRLEnv(Base3ActionRLEnv):
+# User can inherit and customize 5 action environment
+class MyRLEnv(Base5ActionRLEnv):
     """
-    User can override any function in BaseRLEnv and gym.Env
+    User can override any function in BaseRLEnv and gym.Env. Here the user
+    Adds 5 actions.
     """
 
     def calculate_reward(self, action):
@@ -81,55 +78,27 @@ class MyRLEnv(Base3ActionRLEnv):
 
         if self._last_trade_tick is None:
             return 0.
 
         # close long
-        if (action == Actions.Short.value or
-                action == Actions.Neutral.value) and self._position == Positions.Long:
+        if action == Actions.Long_sell.value and self._position == Positions.Long:
             last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
             current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
             return float(np.log(current_price) - np.log(last_trade_price))
 
+        if action == Actions.Long_sell.value and self._position == Positions.Long:
+            if self.close_trade_profit[-1] > self.profit_aim * self.rr:
+                last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
+                current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
+                return float((np.log(current_price) - np.log(last_trade_price)) * 2)
+
         # close short
-        if (action == Actions.Long.value or
-                action == Actions.Neutral.value) and self._position == Positions.Short:
+        if action == Actions.Short_buy.value and self._position == Positions.Short:
             last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
             current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
             return float(np.log(last_trade_price) - np.log(current_price))
 
+        if action == Actions.Short_buy.value and self._position == Positions.Short:
+            if self.close_trade_profit[-1] > self.profit_aim * self.rr:
+                last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
+                current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
+                return float((np.log(last_trade_price) - np.log(current_price)) * 2)
+
         return 0.
-
-# User can inherit and customize 5 action environment
-# class MyRLEnv(Base5ActionRLEnv):
-#     """
-#     User can override any function in BaseRLEnv and gym.Env. Here the user
-#     Adds 5 actions.
-#     """
-
-#     def calculate_reward(self, action):
-
-#         if self._last_trade_tick is None:
-#             return 0.
-
-#         # close long
-#         if action == Actions.Long_sell.value and self._position == Positions.Long:
-#             last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
-#             current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
-#             return float(np.log(current_price) - np.log(last_trade_price))
-
-#         if action == Actions.Long_sell.value and self._position == Positions.Long:
-#             if self.close_trade_profit[-1] > self.profit_aim * self.rr:
-#                 last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
-#                 current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
-#                 return float((np.log(current_price) - np.log(last_trade_price)) * 2)
-
-#         # close short
-#         if action == Actions.Short_buy.value and self._position == Positions.Short:
-#             last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
-#             current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
-#             return float(np.log(last_trade_price) - np.log(current_price))
-
-#         if action == Actions.Short_buy.value and self._position == Positions.Short:
-#             if self.close_trade_profit[-1] > self.profit_aim * self.rr:
-#                 last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
-#                 current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
-#                 return float((np.log(last_trade_price) - np.log(current_price)) * 2)
-
-#         return 0.
diff --git a/freqtrade/freqai/prediction_models/ReinforcementLearningTDQN_multiproc.py b/freqtrade/freqai/prediction_models/ReinforcementLearningTDQN_multiproc.py
index 51e3c07c4..bf9e03b7f 100644
--- a/freqtrade/freqai/prediction_models/ReinforcementLearningTDQN_multiproc.py
+++ b/freqtrade/freqai/prediction_models/ReinforcementLearningTDQN_multiproc.py
@@ -10,7 +10,7 @@ from stable_baselines3.common.monitor import Monitor
 from stable_baselines3.common.vec_env import SubprocVecEnv
 from stable_baselines3.common.utils import set_random_seed
 from stable_baselines3 import DQN
-from freqtrade.freqai.RL.Base3ActionRLEnv import Base3ActionRLEnv, Actions, Positions
+from freqtrade.freqai.RL.Base5ActionRLEnv import Base5ActionRLEnv, Actions, Positions
 from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel
 from freqtrade.freqai.RL.TDQNagent import TDQN
 from stable_baselines3.common.buffers import ReplayBuffer
@@ -50,22 +50,20 @@ class ReinforcementLearningTDQN_multiproc(BaseReinforcementLearningModel):
     def fit_rl(self, data_dictionary: Dict[str, Any], pair: str, dk: FreqaiDataKitchen,
                prices_train: DataFrame, prices_test: DataFrame):
 
-        agent_params = self.freqai_info['model_training_parameters']
-        reward_params = self.freqai_info['model_reward_parameters']
         train_df = data_dictionary["train_features"]
         test_df = data_dictionary["test_features"]
-        eval_freq = agent_params["eval_cycles"] * len(test_df)
-        total_timesteps = agent_params["train_cycles"] * len(train_df)
-        learning_rate = agent_params["learning_rate"]
+        eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df)
+        total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df)
 
         env_id = "train_env"
         num_cpu = int(dk.thread_count / 2)
-        train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train, reward_params,
-                                            self.CONV_WIDTH) for i in range(num_cpu)])
+        train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train,
+                                            self.reward_params, self.CONV_WIDTH) for i in range(num_cpu)])
 
         eval_env_id = 'eval_env'
-        eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test, reward_params,
-                                           self.CONV_WIDTH, monitor=True) for i in range(num_cpu)])
+        eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test,
+                                           self.reward_params, self.CONV_WIDTH, monitor=True) for i in
+                                           range(num_cpu)])
 
         path = dk.data_path
         stop_train_callback = StopTrainingOnNoModelImprovement(
@@ -91,10 +89,8 @@ class ReinforcementLearningTDQN_multiproc(BaseReinforcementLearningModel):
         model = TDQN('TMultiInputPolicy', train_env,
                      policy_kwargs=policy_kwargs,
                      tensorboard_log=f"{path}/tdqn/tensorboard/",
-                     learning_rate=learning_rate, gamma=0.9,
-                     target_update_interval=5000, buffer_size=50000,
-                     exploration_initial_eps=1, exploration_final_eps=0.1,
-                     replay_buffer_class=ReplayBuffer
+                     replay_buffer_class=ReplayBuffer,
+                     **self.freqai_info['model_training_parameters']
                      )
 
         model.learn(
@@ -109,9 +105,11 @@ class ReinforcementLearningTDQN_multiproc(BaseReinforcementLearningModel):
         return best_model
 
 
-class MyRLEnv(Base3ActionRLEnv):
+# User can inherit and customize 5 action environment
+class MyRLEnv(Base5ActionRLEnv):
     """
-    User can override any function in BaseRLEnv and gym.Env
+    User can override any function in BaseRLEnv and gym.Env. Here the user
+    Adds 5 actions.
     """
 
     def calculate_reward(self, action):
@@ -120,55 +118,27 @@ class MyRLEnv(Base3ActionRLEnv):
 
         if self._last_trade_tick is None:
             return 0.
 
         # close long
-        if (action == Actions.Short.value or
-                action == Actions.Neutral.value) and self._position == Positions.Long:
+        if action == Actions.Long_sell.value and self._position == Positions.Long:
             last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
             current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
             return float(np.log(current_price) - np.log(last_trade_price))
 
+        if action == Actions.Long_sell.value and self._position == Positions.Long:
+            if self.close_trade_profit[-1] > self.profit_aim * self.rr:
+                last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
+                current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
+                return float((np.log(current_price) - np.log(last_trade_price)) * 2)
+
         # close short
-        if (action == Actions.Long.value or
-                action == Actions.Neutral.value) and self._position == Positions.Short:
+        if action == Actions.Short_buy.value and self._position == Positions.Short:
             last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
             current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
             return float(np.log(last_trade_price) - np.log(current_price))
 
+        if action == Actions.Short_buy.value and self._position == Positions.Short:
+            if self.close_trade_profit[-1] > self.profit_aim * self.rr:
+                last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
+                current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
+                return float((np.log(last_trade_price) - np.log(current_price)) * 2)
+
         return 0.
-
-# User can inherit and customize 5 action environment
-# class MyRLEnv(Base5ActionRLEnv):
-#     """
-#     User can override any function in BaseRLEnv and gym.Env. Here the user
-#     Adds 5 actions.
-#     """
-
-#     def calculate_reward(self, action):
-
-#         if self._last_trade_tick is None:
-#             return 0.
-
-#         # close long
-#         if action == Actions.Long_sell.value and self._position == Positions.Long:
-#             last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
-#             current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
-#             return float(np.log(current_price) - np.log(last_trade_price))
-
-#         if action == Actions.Long_sell.value and self._position == Positions.Long:
-#             if self.close_trade_profit[-1] > self.profit_aim * self.rr:
-#                 last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
-#                 current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
-#                 return float((np.log(current_price) - np.log(last_trade_price)) * 2)
-
-#         # close short
-#         if action == Actions.Short_buy.value and self._position == Positions.Short:
-#             last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
-#             current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
-#             return float(np.log(last_trade_price) - np.log(current_price))
-
-#         if action == Actions.Short_buy.value and self._position == Positions.Short:
-#             if self.close_trade_profit[-1] > self.profit_aim * self.rr:
-#                 last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
-#                 current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
-#                 return float((np.log(last_trade_price) - np.log(current_price)) * 2)
-
-#         return 0.
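
Illustrative sketch (not part of the diff above): how the restructured "freqai" config section is consumed after this change. Everything under "model_training_parameters" is forwarded untouched to the chosen stable-baselines3 model via **kwargs, while the RL-specific settings now live under "rl_config". The make_model() stub and the placeholder dataframe lengths below are hypothetical and only echo what the real constructors would receive.

# Sketch, assuming the example config above; make_model() stands in for PPO(...)/TDQN(...).
freqai_info = {
    "model_training_parameters": {   # anything the SB3 model constructor accepts
        "learning_rate": 0.00025,
        "gamma": 0.9,
        "verbose": 1,
    },
    "rl_config": {                   # consumed by the FreqAI RL wrapper itself
        "train_cycles": 15,
        "eval_cycles": 5,
        "model_reward_parameters": {"rr": 1, "profit_aim": 0.02},
    },
}


def make_model(policy, **model_kwargs):
    # stand-in for the SB3 constructor; it only echoes the kwargs it would receive
    return policy, model_kwargs


train_len, test_len = 1000, 200      # placeholder dataframe lengths
total_timesteps = freqai_info["rl_config"]["train_cycles"] * train_len
eval_freq = freqai_info["rl_config"]["eval_cycles"] * test_len
reward_params = freqai_info["rl_config"]["model_reward_parameters"]

print(make_model("MlpPolicy", **freqai_info["model_training_parameters"]))
print(total_timesteps, eval_freq, reward_params)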
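
A second sketch, showing why Base5ActionRLEnv.shape grows to (window_size, n_features + 2): the observation is the usual feature window with the unrealized profit and the current position appended as two extra columns, mirroring _get_observation() in the diff. The random features and hard-coded state values are placeholders; .copy() is used here only to keep the sketch side-effect free, whereas the patch assigns to the slice directly.

# Sketch of the state-augmented observation built by _get_observation().
import numpy as np
import pandas as pd

window_size, n_features = 10, 4
signal_features = pd.DataFrame(np.random.rand(100, n_features),
                               columns=[f"feat_{i}" for i in range(n_features)])

current_tick = 50
unrealized_profit = 0.012            # placeholder for self.get_unrealized_profit()
position_value = 1                   # placeholder for self._position.value

observation = signal_features.iloc[current_tick - window_size:current_tick].copy()
observation["current_profit_pct"] = unrealized_profit
observation["position"] = position_value

assert observation.shape == (window_size, n_features + 2)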
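
Finally, a sketch of the prediction path: rl_model_predict() now attaches the live trade state to every rolling window before calling model.predict(), so the prediction-time columns match what the environment saw during training. get_state_info() is defined elsewhere in the RL base model and is stubbed out here, as are the loaded model and the label column name.

# Sketch of the state-aware prediction loop; StubModel, get_state_info() and the
# "action" label are hypothetical stand-ins for the real FreqAI objects.
import numpy as np
import pandas as pd


class StubModel:
    def predict(self, obs, deterministic=True):
        return 0, None               # always return the "neutral" action


def get_state_info(pair):
    return 0, 0.0, 0.0               # market_side, current_profit, total_profit


def rl_model_predict(dataframe, window_size, model, pair):
    output = pd.DataFrame(np.zeros(len(dataframe)), columns=["action"])

    def _predict(window):
        market_side, current_profit, _total_profit = get_state_info(pair)
        observations = dataframe.iloc[window.index].copy()
        observations["current_profit"] = current_profit
        observations["position"] = market_side
        res, _ = model.predict(observations, deterministic=True)
        return res

    # rows before the first full window stay NaN; callers fillna(0) afterwards
    return output.rolling(window=window_size).apply(_predict)


features = pd.DataFrame(np.random.rand(50, 3), columns=["a", "b", "c"])
print(rl_model_predict(features, 10, StubModel(), "BTC/USDT").tail())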