reuse callback, allow user to access all stable_baselines3 agents via config
		| @@ -55,7 +55,7 @@ | |||||||
|     ], |     ], | ||||||
|     "freqai": { |     "freqai": { | ||||||
|         "enabled": true, |         "enabled": true, | ||||||
|         "model_save_type": "stable_baselines_dqn", |         "model_save_type": "stable_baselines", | ||||||
|         "conv_width": 10, |         "conv_width": 10, | ||||||
|         "purge_old_models": true, |         "purge_old_models": true, | ||||||
|         "train_period_days": 10, |         "train_period_days": 10, | ||||||
| @@ -85,8 +85,11 @@ | |||||||
|             "verbose": 1 |             "verbose": 1 | ||||||
|         }, |         }, | ||||||
|         "rl_config": { |         "rl_config": { | ||||||
|             "train_cycles": 15, |             "train_cycles": 10, | ||||||
|             "eval_cycles": 5, |             "eval_cycles": 3, | ||||||
|  |             "thread_count": 4, | ||||||
|  |             "model_type": "PPO", | ||||||
|  |             "policy_type": "MlpPolicy", | ||||||
|             "model_reward_parameters": { |             "model_reward_parameters": { | ||||||
|                 "rr": 1, |                 "rr": 1, | ||||||
|                 "profit_aim": 0.02 |                 "profit_aim": 0.02 | ||||||
|   | |||||||
| @@ -266,59 +266,28 @@ class Base5ActionRLEnv(gym.Env): | |||||||
|  |  | ||||||
|         # close long |         # close long | ||||||
|         if action == Actions.Long_exit.value and self._position == Positions.Long: |         if action == Actions.Long_exit.value and self._position == Positions.Long: | ||||||
|             if len(self.close_trade_profit): |             last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) | ||||||
|                 # aim x2 rw |             current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) | ||||||
|                 if self.close_trade_profit[-1] > self.profit_aim * self.rr: |  | ||||||
|                     last_trade_price = self.add_buy_fee( |  | ||||||
|                         self.prices.iloc[self._last_trade_tick].open) |  | ||||||
|                     current_price = self.add_sell_fee( |  | ||||||
|                         self.prices.iloc[self._current_tick].open) |  | ||||||
|                     return float((np.log(current_price) - np.log(last_trade_price)) * 2) |  | ||||||
|                 # less than aim x1 rw |  | ||||||
|                 elif self.close_trade_profit[-1] < self.profit_aim * self.rr: |  | ||||||
|                     last_trade_price = self.add_buy_fee( |  | ||||||
|                         self.prices.iloc[self._last_trade_tick].open |  | ||||||
|                     ) |  | ||||||
|                     current_price = self.add_sell_fee( |  | ||||||
|                         self.prices.iloc[self._current_tick].open |  | ||||||
|                     ) |  | ||||||
|             return float(np.log(current_price) - np.log(last_trade_price)) |             return float(np.log(current_price) - np.log(last_trade_price)) | ||||||
|                 # # less than RR SL x2 neg rw |  | ||||||
|                 # elif self.close_trade_profit[-1] < (self.profit_aim * -1): |         if action == Actions.Long_exit.value and self._position == Positions.Long: | ||||||
|                 #     last_trade_price = self.add_buy_fee( |             if self.close_trade_profit[-1] > self.profit_aim * self.rr: | ||||||
|                 #         self.prices.iloc[self._last_trade_tick].open) |                 last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) | ||||||
|                 #     current_price = self.add_sell_fee( |                 current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) | ||||||
|                 #         self.prices.iloc[self._current_tick].open) |                 return float((np.log(current_price) - np.log(last_trade_price)) * 2) | ||||||
|                 #     return float((np.log(current_price) - np.log(last_trade_price)) * 2) * -1 |  | ||||||
|  |  | ||||||
|         # close short |         # close short | ||||||
|         if action == Actions.Short_exit.value and self._position == Positions.Short: |         if action == Actions.Short_exit.value and self._position == Positions.Short: | ||||||
|             if len(self.close_trade_profit): |             last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) | ||||||
|                 # aim x2 rw |             current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) | ||||||
|                 if self.close_trade_profit[-1] > self.profit_aim * self.rr: |  | ||||||
|                     last_trade_price = self.add_sell_fee( |  | ||||||
|                         self.prices.iloc[self._last_trade_tick].open |  | ||||||
|                     ) |  | ||||||
|                     current_price = self.add_buy_fee( |  | ||||||
|                         self.prices.iloc[self._current_tick].open |  | ||||||
|                     ) |  | ||||||
|                     return float((np.log(last_trade_price) - np.log(current_price)) * 2) |  | ||||||
|                 # less than aim x1 rw |  | ||||||
|                 elif self.close_trade_profit[-1] < self.profit_aim * self.rr: |  | ||||||
|                     last_trade_price = self.add_sell_fee( |  | ||||||
|                         self.prices.iloc[self._last_trade_tick].open |  | ||||||
|                     ) |  | ||||||
|                     current_price = self.add_buy_fee( |  | ||||||
|                         self.prices.iloc[self._current_tick].open |  | ||||||
|                     ) |  | ||||||
|             return float(np.log(last_trade_price) - np.log(current_price)) |             return float(np.log(last_trade_price) - np.log(current_price)) | ||||||
|                 # # less than RR SL x2 neg rw |  | ||||||
|                 # elif self.close_trade_profit[-1] > self.profit_aim * self.rr: |         if action == Actions.Short_exit.value and self._position == Positions.Short: | ||||||
|                 #     last_trade_price = self.add_sell_fee( |             if self.close_trade_profit[-1] > self.profit_aim * self.rr: | ||||||
|                 #         self.prices.iloc[self._last_trade_tick].open) |                 last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) | ||||||
|                 #     current_price = self.add_buy_fee( |                 current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) | ||||||
|                 #         self.prices.iloc[self._current_tick].open) |                 return float((np.log(last_trade_price) - np.log(current_price)) * 2) | ||||||
|                 #     return float((np.log(last_trade_price) - np.log(current_price)) * 2) * -1 |  | ||||||
|         return 0. |         return 0. | ||||||
|  |  | ||||||
|     def _update_profit(self, action): |     def _update_profit(self, action): | ||||||
|   | |||||||
| @@ -11,8 +11,12 @@ from freqtrade.freqai.freqai_interface import IFreqaiModel | |||||||
| from freqtrade.freqai.RL.Base5ActionRLEnv import Base5ActionRLEnv, Actions, Positions | from freqtrade.freqai.RL.Base5ActionRLEnv import Base5ActionRLEnv, Actions, Positions | ||||||
| from freqtrade.persistence import Trade | from freqtrade.persistence import Trade | ||||||
| import torch.multiprocessing | import torch.multiprocessing | ||||||
|  | from stable_baselines3.common.callbacks import EvalCallback | ||||||
| from stable_baselines3.common.monitor import Monitor | from stable_baselines3.common.monitor import Monitor | ||||||
| import torch as th | import torch as th | ||||||
|  | from typing import Callable | ||||||
|  | from stable_baselines3.common.utils import set_random_seed | ||||||
|  | import gym | ||||||
| logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||||
|  |  | ||||||
| torch.multiprocessing.set_sharing_strategy('file_system') | torch.multiprocessing.set_sharing_strategy('file_system') | ||||||
| @@ -25,9 +29,15 @@ class BaseReinforcementLearningModel(IFreqaiModel): | |||||||
|  |  | ||||||
|     def __init__(self, **kwargs): |     def __init__(self, **kwargs): | ||||||
|         super().__init__(config=kwargs['config']) |         super().__init__(config=kwargs['config']) | ||||||
|         th.set_num_threads(self.freqai_info.get('data_kitchen_thread_count', 4)) |         th.set_num_threads(self.freqai_info['rl_config'].get('thread_count', 4)) | ||||||
|         self.reward_params = self.freqai_info['rl_config']['model_reward_parameters'] |         self.reward_params = self.freqai_info['rl_config']['model_reward_parameters'] | ||||||
|         self.train_env: Base5ActionRLEnv = None |         self.train_env: Base5ActionRLEnv = None | ||||||
|  |         self.eval_env: Base5ActionRLEnv = None | ||||||
|  |         self.eval_callback: EvalCallback = None | ||||||
|  |         mod = __import__('stable_baselines3', fromlist=[ | ||||||
|  |                          self.freqai_info['rl_config']['model_type']]) | ||||||
|  |         self.MODELCLASS = getattr(mod, self.freqai_info['rl_config']['model_type']) | ||||||
|  |         self.policy_type = self.freqai_info['rl_config']['policy_type'] | ||||||
|  |  | ||||||
|     def train( |     def train( | ||||||
|         self, unfiltered_dataframe: DataFrame, pair: str, dk: FreqaiDataKitchen |         self, unfiltered_dataframe: DataFrame, pair: str, dk: FreqaiDataKitchen | ||||||
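Note on the __init__ change above: resolving the agent class via __import__/getattr is what lets any stable_baselines3 algorithm (PPO, A2C, DQN, ...) be selected purely from rl_config. A minimal standalone sketch of the same lookup, assuming the PPO/MlpPolicy values from the config hunk above and using CartPole only as a stand-in for the FreqAI trading environment:

import importlib

rl_config = {"model_type": "PPO", "policy_type": "MlpPolicy"}  # mirrors the example config above

# importlib.import_module is equivalent to the __import__('stable_baselines3', fromlist=[...]) call in the diff
sb3 = importlib.import_module("stable_baselines3")
ModelClass = getattr(sb3, rl_config["model_type"])

# "CartPole-v1" is a placeholder environment; FreqAI passes its Base5ActionRLEnv instead
model = ModelClass(rl_config["policy_type"], "CartPole-v1", verbose=0)
model.learn(total_timesteps=1_000)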
| @@ -67,7 +77,7 @@ class BaseReinforcementLearningModel(IFreqaiModel): | |||||||
|         ) |         ) | ||||||
|         logger.info(f'Training model on {len(data_dictionary["train_features"])} data points') |         logger.info(f'Training model on {len(data_dictionary["train_features"])} data points') | ||||||
|  |  | ||||||
|         self.set_train_and_eval_environments(data_dictionary, prices_train, prices_test) |         self.set_train_and_eval_environments(data_dictionary, prices_train, prices_test, dk) | ||||||
|  |  | ||||||
|         model = self.fit_rl(data_dictionary, dk) |         model = self.fit_rl(data_dictionary, dk) | ||||||
|  |  | ||||||
| @@ -75,13 +85,13 @@ class BaseReinforcementLearningModel(IFreqaiModel): | |||||||
|  |  | ||||||
|         return model |         return model | ||||||
|  |  | ||||||
|     def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test): |     def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test, dk): | ||||||
|         """ |         """ | ||||||
|         User overrides this in their prediction model if they are custom a MyRLEnv. Othwerwise |         User overrides this as shown here if they are using a custom MyRLEnv | ||||||
|         leaving this will default to Base5ActEnv |  | ||||||
|         """ |         """ | ||||||
|         train_df = data_dictionary["train_features"] |         train_df = data_dictionary["train_features"] | ||||||
|         test_df = data_dictionary["test_features"] |         test_df = data_dictionary["test_features"] | ||||||
|  |         eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df) | ||||||
|  |  | ||||||
|         # environments |         # environments | ||||||
|         if not self.train_env: |         if not self.train_env: | ||||||
| @@ -90,11 +100,17 @@ class BaseReinforcementLearningModel(IFreqaiModel): | |||||||
|             self.eval_env = Monitor(MyRLEnv(df=test_df, prices=prices_test, |             self.eval_env = Monitor(MyRLEnv(df=test_df, prices=prices_test, | ||||||
|                                     window_size=self.CONV_WIDTH, |                                     window_size=self.CONV_WIDTH, | ||||||
|                                     reward_kwargs=self.reward_params), ".") |                                     reward_kwargs=self.reward_params), ".") | ||||||
|  |             self.eval_callback = EvalCallback(self.eval_env, deterministic=True, | ||||||
|  |                                               render=False, eval_freq=eval_freq, | ||||||
|  |                                               best_model_save_path=dk.data_path) | ||||||
|         else: |         else: | ||||||
|             self.train_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params) |  | ||||||
|             self.eval_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params) |  | ||||||
|             self.train_env.reset() |             self.train_env.reset() | ||||||
|             self.eval_env.reset() |             self.eval_env.reset() | ||||||
|  |             self.train_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params) | ||||||
|  |             self.eval_env.reset_env(test_df, prices_test, self.CONV_WIDTH, self.reward_params) | ||||||
|  |             self.eval_callback.__init__(self.eval_env, deterministic=True, | ||||||
|  |                                         render=False, eval_freq=eval_freq, | ||||||
|  |                                         best_model_save_path=dk.data_path) | ||||||
|  |  | ||||||
|     @abstractmethod |     @abstractmethod | ||||||
|     def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen): |     def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen): | ||||||
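The "reuse callback" part of this commit is the else-branch above: the EvalCallback is created once and, on subsequent retrainings, re-pointed at the refreshed eval environment by calling __init__ on the existing object instead of constructing a new callback. A rough standalone sketch of that pattern (CartPole and the local save path are stand-ins for the FreqAI environment and dk.data_path):

import gym
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor

eval_env = Monitor(gym.make("CartPole-v1"), ".")
eval_callback = EvalCallback(eval_env, deterministic=True, render=False,
                             eval_freq=500, best_model_save_path="./outputs")

model = PPO("MlpPolicy", gym.make("CartPole-v1"), verbose=0)
model.learn(total_timesteps=1_000, callback=eval_callback)

# on the next training window, the same callback object is re-initialised in place
eval_callback.__init__(eval_env, deterministic=True, render=False,
                       eval_freq=500, best_model_save_path="./outputs")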
| @@ -206,16 +222,28 @@ class BaseReinforcementLearningModel(IFreqaiModel): | |||||||
|     # all the other existing fit() functions to include dk argument. For now we instantiate and |     # all the other existing fit() functions to include dk argument. For now we instantiate and | ||||||
|     # leave it. |     # leave it. | ||||||
|     def fit(self, data_dictionary: Dict[str, Any], pair: str = '') -> Any: |     def fit(self, data_dictionary: Dict[str, Any], pair: str = '') -> Any: | ||||||
|         """ |  | ||||||
|         Most regressors use the same function names and arguments e.g. user |  | ||||||
|         can drop in LGBMRegressor in place of CatBoostRegressor and all data |  | ||||||
|         management will be properly handled by Freqai. |  | ||||||
|         :param data_dictionary: Dict = the dictionary constructed by DataHandler to hold |  | ||||||
|                                 all the training and test data/labels. |  | ||||||
|         """ |  | ||||||
|  |  | ||||||
|         return |         return | ||||||
|  |  | ||||||
|  | def make_env(env_id: str, rank: int, seed: int, train_df, price, | ||||||
|  |              reward_params, window_size, monitor=False) -> Callable: | ||||||
|  |     """ | ||||||
|  |     Utility function for multiprocessed env. | ||||||
|  |  | ||||||
|  |     :param env_id: (str) the environment ID | ||||||
|  |     :param num_env: (int) the number of environments you wish to have in subprocesses | ||||||
|  |     :param seed: (int) the initial seed for RNG | ||||||
|  |     :param rank: (int) index of the subprocess | ||||||
|  |     :return: (Callable) | ||||||
|  |     """ | ||||||
|  |     def _init() -> gym.Env: | ||||||
|  |  | ||||||
|  |         env = MyRLEnv(df=train_df, prices=price, window_size=window_size, | ||||||
|  |                       reward_kwargs=reward_params, id=env_id, seed=seed + rank) | ||||||
|  |         if monitor: | ||||||
|  |             env = Monitor(env, ".") | ||||||
|  |         return env | ||||||
|  |     set_random_seed(seed) | ||||||
|  |     return _init | ||||||
|  |  | ||||||
| class MyRLEnv(Base5ActionRLEnv): | class MyRLEnv(Base5ActionRLEnv): | ||||||
|     """ |     """ | ||||||
| @@ -229,24 +257,24 @@ class MyRLEnv(Base5ActionRLEnv): | |||||||
|             return 0. |             return 0. | ||||||
|  |  | ||||||
|         # close long |         # close long | ||||||
|         if action == Actions.Long_sell.value and self._position == Positions.Long: |         if action == Actions.Long_exit.value and self._position == Positions.Long: | ||||||
|             last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) |             last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) | ||||||
|             current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) |             current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) | ||||||
|             return float(np.log(current_price) - np.log(last_trade_price)) |             return float(np.log(current_price) - np.log(last_trade_price)) | ||||||
|  |  | ||||||
|         if action == Actions.Long_sell.value and self._position == Positions.Long: |         if action == Actions.Long_exit.value and self._position == Positions.Long: | ||||||
|             if self.close_trade_profit[-1] > self.profit_aim * self.rr: |             if self.close_trade_profit[-1] > self.profit_aim * self.rr: | ||||||
|                 last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) |                 last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) | ||||||
|                 current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) |                 current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) | ||||||
|                 return float((np.log(current_price) - np.log(last_trade_price)) * 2) |                 return float((np.log(current_price) - np.log(last_trade_price)) * 2) | ||||||
|  |  | ||||||
|         # close short |         # close short | ||||||
|         if action == Actions.Short_buy.value and self._position == Positions.Short: |         if action == Actions.Short_exit.value and self._position == Positions.Short: | ||||||
|             last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) |             last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) | ||||||
|             current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) |             current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) | ||||||
|             return float(np.log(last_trade_price) - np.log(current_price)) |             return float(np.log(last_trade_price) - np.log(current_price)) | ||||||
|  |  | ||||||
|         if action == Actions.Short_buy.value and self._position == Positions.Short: |         if action == Actions.Short_exit.value and self._position == Positions.Short: | ||||||
|             if self.close_trade_profit[-1] > self.profit_aim * self.rr: |             if self.close_trade_profit[-1] > self.profit_aim * self.rr: | ||||||
|                 last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) |                 last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) | ||||||
|                 current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) |                 current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) | ||||||
|   | |||||||
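The make_env helper added above returns a thunk that each worker process calls to build its own environment; it is consumed by SubprocVecEnv in the multiprocessing learner further down. A generic, self-contained sketch of that pattern, with a plain gym environment standing in for MyRLEnv:

import gym
from stable_baselines3.common.utils import set_random_seed
from stable_baselines3.common.vec_env import SubprocVecEnv

def make_env(env_id: str, rank: int, seed: int = 0):
    def _init() -> gym.Env:
        # each subprocess builds and seeds its own environment instance
        env = gym.make(env_id)
        env.seed(seed + rank)  # gym < 0.26 seeding API, as used by this codebase
        return env
    set_random_seed(seed)
    return _init

if __name__ == "__main__":
    num_cpu = 4  # hypothetical worker count
    train_env = SubprocVecEnv([make_env("CartPole-v1", i) for i in range(num_cpu)])
    obs = train_env.reset()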
| @@ -471,12 +471,11 @@ class FreqaiDataDrawer: | |||||||
|         elif model_type == 'keras': |         elif model_type == 'keras': | ||||||
|             from tensorflow import keras |             from tensorflow import keras | ||||||
|             model = keras.models.load_model(dk.data_path / f"{dk.model_filename}_model.h5") |             model = keras.models.load_model(dk.data_path / f"{dk.model_filename}_model.h5") | ||||||
|         elif model_type == 'stable_baselines_ppo': |         elif model_type == 'stable_baselines': | ||||||
|             from stable_baselines3.ppo.ppo import PPO |             mod = __import__('stable_baselines3', fromlist=[ | ||||||
|             model = PPO.load(dk.data_path / f"{dk.model_filename}_model") |                              self.freqai_info['rl_config']['model_type']]) | ||||||
|         elif model_type == 'stable_baselines_dqn': |             MODELCLASS = getattr(mod, self.freqai_info['rl_config']['model_type']) | ||||||
|             from stable_baselines3 import DQN |             model = MODELCLASS.load(dk.data_path / f"{dk.model_filename}_model") | ||||||
|             model = DQN.load(dk.data_path / f"{dk.model_filename}_model") |  | ||||||
|  |  | ||||||
|         if Path(dk.data_path / f"{dk.model_filename}_svm_model.joblib").is_file(): |         if Path(dk.data_path / f"{dk.model_filename}_svm_model.joblib").is_file(): | ||||||
|             dk.svm_model = load(dk.data_path / f"{dk.model_filename}_svm_model.joblib") |             dk.svm_model = load(dk.data_path / f"{dk.model_filename}_svm_model.joblib") | ||||||
|   | |||||||
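With model_save_type collapsed to a single "stable_baselines" value, FreqaiDataDrawer resolves the class to load from rl_config["model_type"] rather than hard-coding PPO or DQN. A small save/load round trip in the same style (the file name is hypothetical):

from stable_baselines3 import PPO

model = PPO("MlpPolicy", "CartPole-v1", verbose=0)
model.save("example_model")  # writes example_model.zip

model_type = "PPO"  # would come from self.freqai_info['rl_config']['model_type']
mod = __import__("stable_baselines3", fromlist=[model_type])
ModelClass = getattr(mod, model_type)
reloaded = ModelClass.load("example_model")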
freqtrade/freqai/prediction_models/ReinforcementLearner.py (new file, 82 lines)
							| @@ -0,0 +1,82 @@ | |||||||
|  | import logging | ||||||
|  | from typing import Any, Dict  # , Tuple | ||||||
|  |  | ||||||
|  | # import numpy.typing as npt | ||||||
|  | import torch as th | ||||||
|  | import numpy as np | ||||||
|  | from freqtrade.freqai.data_kitchen import FreqaiDataKitchen | ||||||
|  | from freqtrade.freqai.RL.Base5ActionRLEnv import Actions, Base5ActionRLEnv, Positions | ||||||
|  | from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel | ||||||
|  | from pathlib import Path | ||||||
|  |  | ||||||
|  | logger = logging.getLogger(__name__) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class ReinforcementLearner(BaseReinforcementLearningModel): | ||||||
|  |     """ | ||||||
|  |     User created Reinforcement Learning Model prediction model. | ||||||
|  |     """ | ||||||
|  |  | ||||||
|  |     def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen): | ||||||
|  |  | ||||||
|  |         train_df = data_dictionary["train_features"] | ||||||
|  |         total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df) | ||||||
|  |  | ||||||
|  |         policy_kwargs = dict(activation_fn=th.nn.ReLU, | ||||||
|  |                              net_arch=[256, 256, 128]) | ||||||
|  |  | ||||||
|  |         model = self.MODELCLASS(self.policy_type, self.train_env, policy_kwargs=policy_kwargs, | ||||||
|  |                                 tensorboard_log=Path(dk.data_path / "tensorboard"), | ||||||
|  |                                 **self.freqai_info['model_training_parameters'] | ||||||
|  |                                 ) | ||||||
|  |  | ||||||
|  |         model.learn( | ||||||
|  |             total_timesteps=int(total_timesteps), | ||||||
|  |             callback=self.eval_callback | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |         if Path(dk.data_path / "best_model.zip").is_file(): | ||||||
|  |             logger.info('Callback found a best model.') | ||||||
|  |             best_model = self.MODELCLASS.load(dk.data_path / "best_model") | ||||||
|  |             return best_model | ||||||
|  |  | ||||||
|  |         logger.info('Could not find best model, using final model instead.') | ||||||
|  |  | ||||||
|  |         return model | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class MyRLEnv(Base5ActionRLEnv): | ||||||
|  |     """ | ||||||
|  |     User can modify any part of the environment by overriding base | ||||||
|  |     functions | ||||||
|  |     """ | ||||||
|  |     def calculate_reward(self, action): | ||||||
|  |  | ||||||
|  |         if self._last_trade_tick is None: | ||||||
|  |             return 0. | ||||||
|  |  | ||||||
|  |         # close long | ||||||
|  |         if action == Actions.Long_exit.value and self._position == Positions.Long: | ||||||
|  |             last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) | ||||||
|  |             current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) | ||||||
|  |             return float(np.log(current_price) - np.log(last_trade_price)) | ||||||
|  |  | ||||||
|  |         if action == Actions.Long_exit.value and self._position == Positions.Long: | ||||||
|  |             if self.close_trade_profit[-1] > self.profit_aim * self.rr: | ||||||
|  |                 last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) | ||||||
|  |                 current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) | ||||||
|  |                 return float((np.log(current_price) - np.log(last_trade_price)) * 2) | ||||||
|  |  | ||||||
|  |         # close short | ||||||
|  |         if action == Actions.Short_exit.value and self._position == Positions.Short: | ||||||
|  |             last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) | ||||||
|  |             current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) | ||||||
|  |             return float(np.log(last_trade_price) - np.log(current_price)) | ||||||
|  |  | ||||||
|  |         if action == Actions.Short_exit.value and self._position == Positions.Short: | ||||||
|  |             if self.close_trade_profit[-1] > self.profit_aim * self.rr: | ||||||
|  |                 last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) | ||||||
|  |                 current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) | ||||||
|  |                 return float((np.log(last_trade_price) - np.log(current_price)) * 2) | ||||||
|  |  | ||||||
|  |         return 0. | ||||||
| @@ -1,17 +1,59 @@ | |||||||
| from typing import Any, Dict, List, Optional, Tuple, Type, Union | import logging | ||||||
|  |  | ||||||
| import gym |  | ||||||
| import torch |  | ||||||
| import torch as th | import torch as th | ||||||
|  | from typing import Any, Dict, List, Optional, Tuple, Type, Union | ||||||
|  | from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel | ||||||
| from stable_baselines3 import DQN | from stable_baselines3 import DQN | ||||||
| from stable_baselines3.common.buffers import ReplayBuffer | from stable_baselines3.common.buffers import ReplayBuffer | ||||||
| from stable_baselines3.common.policies import BasePolicy | from freqtrade.freqai.data_kitchen import FreqaiDataKitchen | ||||||
| from stable_baselines3.common.torch_layers import (BaseFeaturesExtractor, | from pathlib import Path | ||||||
|                                                    FlattenExtractor) |  | ||||||
| from stable_baselines3.common.type_aliases import GymEnv, Schedule |  | ||||||
| from stable_baselines3.dqn.policies import (CnnPolicy, DQNPolicy, MlpPolicy, | from stable_baselines3.dqn.policies import (CnnPolicy, DQNPolicy, MlpPolicy, | ||||||
|                                             QNetwork) |                                             QNetwork) | ||||||
| from torch import nn | from torch import nn | ||||||
|  | import gym | ||||||
|  | from stable_baselines3.common.torch_layers import (BaseFeaturesExtractor, | ||||||
|  |                                                    FlattenExtractor) | ||||||
|  | from stable_baselines3.common.type_aliases import GymEnv, Schedule | ||||||
|  | from stable_baselines3.common.policies import BasePolicy | ||||||
|  |  | ||||||
|  | logger = logging.getLogger(__name__) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class ReinforcementLearnerCustomAgent(BaseReinforcementLearningModel): | ||||||
|  |     """ | ||||||
|  |     User can customize agent by defining the class and using it directly. | ||||||
|  |     Here the example is "TDQN" | ||||||
|  |     """ | ||||||
|  |  | ||||||
|  |     def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen): | ||||||
|  |  | ||||||
|  |         train_df = data_dictionary["train_features"] | ||||||
|  |         total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df) | ||||||
|  |  | ||||||
|  |         policy_kwargs = dict(activation_fn=th.nn.ReLU, | ||||||
|  |                              net_arch=[256, 256, 128]) | ||||||
|  |  | ||||||
|  |         # TDQN is a custom agent defined below | ||||||
|  |         model = TDQN(self.policy_type, self.train_env, | ||||||
|  |                      tensorboard_log=Path(dk.data_path / "tensorboard"), | ||||||
|  |                      policy_kwargs=policy_kwargs, | ||||||
|  |                      **self.freqai_info['model_training_parameters'] | ||||||
|  |                      ) | ||||||
|  |  | ||||||
|  |         model.learn( | ||||||
|  |             total_timesteps=int(total_timesteps), | ||||||
|  |             callback=self.eval_callback | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |         if Path(dk.data_path / "best_model.zip").is_file(): | ||||||
|  |             logger.info('Callback found a best model.') | ||||||
|  |             best_model = self.MODELCLASS.load(dk.data_path / "best_model") | ||||||
|  |             return best_model | ||||||
|  |  | ||||||
|  |         logger.info('Could not find best model, using final model instead.') | ||||||
|  |  | ||||||
|  |         return model | ||||||
|  |  | ||||||
|  | # User creates their custom agent and networks as shown below | ||||||
|  |  | ||||||
|  |  | ||||||
| def create_mlp_( | def create_mlp_( | ||||||
| @@ -72,7 +114,7 @@ class TDQNetwork(QNetwork): | |||||||
|  |  | ||||||
|     def init_weights(self, m): |     def init_weights(self, m): | ||||||
|         if type(m) == nn.Linear: |         if type(m) == nn.Linear: | ||||||
|             torch.nn.init.kaiming_uniform_(m.weight) |             th.nn.init.kaiming_uniform_(m.weight) | ||||||
|  |  | ||||||
|  |  | ||||||
| class TDQNPolicy(DQNPolicy): | class TDQNPolicy(DQNPolicy): | ||||||
| @@ -175,7 +217,7 @@ class TDQN(DQN): | |||||||
|         exploration_initial_eps: float = 1.0, |         exploration_initial_eps: float = 1.0, | ||||||
|         exploration_final_eps: float = 0.05, |         exploration_final_eps: float = 0.05, | ||||||
|         max_grad_norm: float = 10, |         max_grad_norm: float = 10, | ||||||
|         tensorboard_log: Optional[str] = None, |         tensorboard_log: Optional[Path] = None, | ||||||
|         create_eval_env: bool = False, |         create_eval_env: bool = False, | ||||||
|         policy_kwargs: Optional[Dict[str, Any]] = None, |         policy_kwargs: Optional[Dict[str, Any]] = None, | ||||||
|         verbose: int = 1, |         verbose: int = 1, | ||||||
| @@ -0,0 +1,84 @@ | |||||||
|  | import logging | ||||||
|  | from typing import Any, Dict  # , Tuple | ||||||
|  |  | ||||||
|  | # import numpy.typing as npt | ||||||
|  | import torch as th | ||||||
|  | from stable_baselines3.common.callbacks import EvalCallback | ||||||
|  | from stable_baselines3.common.vec_env import SubprocVecEnv | ||||||
|  | from freqtrade.freqai.RL.BaseReinforcementLearningModel import (BaseReinforcementLearningModel, | ||||||
|  |                                                                 make_env) | ||||||
|  | from freqtrade.freqai.data_kitchen import FreqaiDataKitchen | ||||||
|  |  | ||||||
|  | from pathlib import Path | ||||||
|  |  | ||||||
|  | logger = logging.getLogger(__name__) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class ReinforcementLearner_multiproc(BaseReinforcementLearningModel): | ||||||
|  |     """ | ||||||
|  |     User created Reinforcement Learning Model prediction model. | ||||||
|  |     """ | ||||||
|  |  | ||||||
|  |     def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen): | ||||||
|  |  | ||||||
|  |         train_df = data_dictionary["train_features"] | ||||||
|  |         total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df) | ||||||
|  |  | ||||||
|  |         # model arch | ||||||
|  |         policy_kwargs = dict(activation_fn=th.nn.ReLU, | ||||||
|  |                              net_arch=[512, 512, 512]) | ||||||
|  |  | ||||||
|  |         model = self.MODELCLASS(self.policy_type, self.train_env, policy_kwargs=policy_kwargs, | ||||||
|  |                                 tensorboard_log=Path(dk.data_path / "tensorboard"), | ||||||
|  |                                 **self.freqai_info['model_training_parameters'] | ||||||
|  |                                 ) | ||||||
|  |  | ||||||
|  |         model.learn( | ||||||
|  |             total_timesteps=int(total_timesteps), | ||||||
|  |             callback=self.eval_callback | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |         if Path(dk.data_path / "best_model.zip").is_file(): | ||||||
|  |             logger.info('Callback found a best model.') | ||||||
|  |             best_model = self.MODELCLASS.load(dk.data_path / "best_model") | ||||||
|  |             return best_model | ||||||
|  |  | ||||||
|  |         logger.info('Could not find best model, using final model instead.') | ||||||
|  |  | ||||||
|  |         return model | ||||||
|  |  | ||||||
|  |     def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test, dk): | ||||||
|  |         """ | ||||||
|  |         If user has particular environment configuration needs, they can do that by | ||||||
|  |         overriding this function. In the present case, the user wants to setup training | ||||||
|  |         environments for multiple workers. | ||||||
|  |         """ | ||||||
|  |         train_df = data_dictionary["train_features"] | ||||||
|  |         test_df = data_dictionary["test_features"] | ||||||
|  |         eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df) | ||||||
|  |  | ||||||
|  |         # environments | ||||||
|  |         if not self.train_env: | ||||||
|  |             env_id = "train_env" | ||||||
|  |             num_cpu = int(self.freqai_info["data_kitchen_thread_count"] / 2) | ||||||
|  |             self.train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train, | ||||||
|  |                                             self.reward_params, self.CONV_WIDTH) for i | ||||||
|  |                                             in range(num_cpu)]) | ||||||
|  |  | ||||||
|  |             eval_env_id = 'eval_env' | ||||||
|  |             self.eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test, | ||||||
|  |                                            self.reward_params, self.CONV_WIDTH, monitor=True) for i | ||||||
|  |                                            in range(num_cpu)]) | ||||||
|  |             self.eval_callback = EvalCallback(self.eval_env, deterministic=True, | ||||||
|  |                                               render=False, eval_freq=eval_freq, | ||||||
|  |                                               best_model_save_path=dk.data_path) | ||||||
|  |         else: | ||||||
|  |             self.train_env.env_method('reset') | ||||||
|  |             self.eval_env.env_method('reset') | ||||||
|  |             self.train_env.env_method('reset_env', train_df, prices_train, | ||||||
|  |                                       self.CONV_WIDTH, self.reward_params) | ||||||
|  |             self.eval_env.env_method('reset_env', test_df, prices_test, | ||||||
|  |                                      self.CONV_WIDTH, self.reward_params) | ||||||
|  |             self.eval_callback.__init__(self.eval_env, deterministic=True, | ||||||
|  |                                         render=False, eval_freq=eval_freq, | ||||||
|  |                                         best_model_save_path=dk.data_path) | ||||||
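Because SubprocVecEnv keeps the environments in worker processes, the multiprocessing override above cannot call reset()/reset_env() directly and routes them through env_method(), which forwards a method call to every sub-environment. A tiny illustration using DummyVecEnv so no subprocess setup is required:

import gym
from stable_baselines3.common.vec_env import DummyVecEnv

vec_env = DummyVecEnv([lambda: gym.make("CartPole-v1") for _ in range(2)])
vec_env.env_method("reset")      # calls reset() on every wrapped environment
vec_env.env_method("seed", 42)   # same mechanism for any method name plus arguments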
| @@ -1,104 +0,0 @@ | |||||||
| import gc |  | ||||||
| import logging |  | ||||||
| from typing import Any, Dict  # , Tuple |  | ||||||
|  |  | ||||||
| import numpy as np |  | ||||||
| # import numpy.typing as npt |  | ||||||
| import torch as th |  | ||||||
| from stable_baselines3 import PPO |  | ||||||
| from stable_baselines3.common.callbacks import EvalCallback |  | ||||||
| from stable_baselines3.common.monitor import Monitor |  | ||||||
|  |  | ||||||
| from freqtrade.freqai.data_kitchen import FreqaiDataKitchen |  | ||||||
| from freqtrade.freqai.RL.Base3ActionRLEnv import Actions, Base3ActionRLEnv, Positions |  | ||||||
| from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel |  | ||||||
|  |  | ||||||
|  |  | ||||||
| logger = logging.getLogger(__name__) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class ReinforcementLearningPPO(BaseReinforcementLearningModel): |  | ||||||
|     """ |  | ||||||
|     User created Reinforcement Learning Model prediction model. |  | ||||||
|     """ |  | ||||||
|  |  | ||||||
|     def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen): |  | ||||||
|  |  | ||||||
|         train_df = data_dictionary["train_features"] |  | ||||||
|         test_df = data_dictionary["test_features"] |  | ||||||
|         eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df) |  | ||||||
|         total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df) |  | ||||||
|  |  | ||||||
|         path = dk.data_path |  | ||||||
|         eval_callback = EvalCallback(self.eval_env, best_model_save_path=f"{path}/", |  | ||||||
|                                      log_path=f"{path}/ppo/logs/", eval_freq=int(eval_freq), |  | ||||||
|                                      deterministic=True, render=False) |  | ||||||
|  |  | ||||||
|         # model arch |  | ||||||
|         policy_kwargs = dict(activation_fn=th.nn.ReLU, |  | ||||||
|                              net_arch=[256, 256, 128]) |  | ||||||
|  |  | ||||||
|         model = PPO('MlpPolicy', self.train_env, policy_kwargs=policy_kwargs, |  | ||||||
|                     tensorboard_log=f"{path}/ppo/tensorboard/", |  | ||||||
|                     **self.freqai_info['model_training_parameters'] |  | ||||||
|                     ) |  | ||||||
|  |  | ||||||
|         model.learn( |  | ||||||
|             total_timesteps=int(total_timesteps), |  | ||||||
|             callback=eval_callback |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|         del model |  | ||||||
|         best_model = PPO.load(dk.data_path / "best_model") |  | ||||||
|  |  | ||||||
|         print('Training finished!') |  | ||||||
|         gc.collect() |  | ||||||
|  |  | ||||||
|         return best_model |  | ||||||
|  |  | ||||||
|     def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test): |  | ||||||
|         """ |  | ||||||
|         User overrides this as shown here if they are using a custom MyRLEnv |  | ||||||
|         """ |  | ||||||
|         train_df = data_dictionary["train_features"] |  | ||||||
|         test_df = data_dictionary["test_features"] |  | ||||||
|  |  | ||||||
|         # environments |  | ||||||
|         if not self.train_env: |  | ||||||
|             self.train_env = MyRLEnv(df=train_df, prices=prices_train, window_size=self.CONV_WIDTH, |  | ||||||
|                                      reward_kwargs=self.reward_params) |  | ||||||
|             self.eval_env = Monitor(MyRLEnv(df=test_df, prices=prices_test, |  | ||||||
|                                     window_size=self.CONV_WIDTH, |  | ||||||
|                                     reward_kwargs=self.reward_params), ".") |  | ||||||
|         else: |  | ||||||
|             self.train_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params) |  | ||||||
|             self.eval_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params) |  | ||||||
|             self.train_env.reset() |  | ||||||
|             self.eval_env.reset() |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class MyRLEnv(Base3ActionRLEnv): |  | ||||||
|     """ |  | ||||||
|     User can override any function in BaseRLEnv and gym.Env |  | ||||||
|     """ |  | ||||||
|  |  | ||||||
|     def calculate_reward(self, action): |  | ||||||
|  |  | ||||||
|         if self._last_trade_tick is None: |  | ||||||
|             return 0. |  | ||||||
|  |  | ||||||
|         # close long |  | ||||||
|         if (action == Actions.Short.value or |  | ||||||
|                 action == Actions.Neutral.value) and self._position == Positions.Long: |  | ||||||
|             last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) |  | ||||||
|             current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) |  | ||||||
|             return float(np.log(current_price) - np.log(last_trade_price)) |  | ||||||
|  |  | ||||||
|         # close short |  | ||||||
|         if (action == Actions.Long.value or |  | ||||||
|                 action == Actions.Neutral.value) and self._position == Positions.Short: |  | ||||||
|             last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) |  | ||||||
|             current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) |  | ||||||
|             return float(np.log(last_trade_price) - np.log(current_price)) |  | ||||||
|  |  | ||||||
|         return 0. |  | ||||||
| @@ -1,132 +0,0 @@ | |||||||
| import logging |  | ||||||
| from typing import Any, Dict  # , Tuple |  | ||||||
|  |  | ||||||
| import numpy as np |  | ||||||
| # import numpy.typing as npt |  | ||||||
| import torch as th |  | ||||||
| from stable_baselines3.common.monitor import Monitor |  | ||||||
| from typing import Callable |  | ||||||
| from stable_baselines3 import PPO |  | ||||||
| from stable_baselines3.common.callbacks import EvalCallback |  | ||||||
| from stable_baselines3.common.vec_env import SubprocVecEnv |  | ||||||
| from stable_baselines3.common.utils import set_random_seed |  | ||||||
| from freqtrade.freqai.RL.Base3ActionRLEnv import Base3ActionRLEnv, Actions, Positions |  | ||||||
| from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel |  | ||||||
| from freqtrade.freqai.data_kitchen import FreqaiDataKitchen |  | ||||||
| import gym |  | ||||||
|  |  | ||||||
| logger = logging.getLogger(__name__) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def make_env(env_id: str, rank: int, seed: int, train_df, price, |  | ||||||
|              reward_params, window_size, monitor=False) -> Callable: |  | ||||||
|     """ |  | ||||||
|     Utility function for multiprocessed env. |  | ||||||
|  |  | ||||||
|     :param env_id: (str) the environment ID |  | ||||||
|     :param num_env: (int) the number of environment you wish to have in subprocesses |  | ||||||
|     :param seed: (int) the inital seed for RNG |  | ||||||
|     :param rank: (int) index of the subprocess |  | ||||||
|     :return: (Callable) |  | ||||||
|     """ |  | ||||||
|     def _init() -> gym.Env: |  | ||||||
|  |  | ||||||
|         env = MyRLEnv(df=train_df, prices=price, window_size=window_size, |  | ||||||
|                       reward_kwargs=reward_params, id=env_id, seed=seed + rank) |  | ||||||
|         if monitor: |  | ||||||
|             env = Monitor(env, ".") |  | ||||||
|         return env |  | ||||||
|     set_random_seed(seed) |  | ||||||
|     return _init |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class ReinforcementLearningPPO_multiproc(BaseReinforcementLearningModel): |  | ||||||
|     """ |  | ||||||
|     User created Reinforcement Learning Model prediction model. |  | ||||||
|     """ |  | ||||||
|  |  | ||||||
|     def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen): |  | ||||||
|  |  | ||||||
|         train_df = data_dictionary["train_features"] |  | ||||||
|         test_df = data_dictionary["test_features"] |  | ||||||
|         eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df) |  | ||||||
|         total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df) |  | ||||||
|  |  | ||||||
|         path = dk.data_path |  | ||||||
|         eval_callback = EvalCallback(self.eval_env, best_model_save_path=f"{path}/", |  | ||||||
|                                      log_path=f"{path}/ppo/logs/", eval_freq=int(eval_freq), |  | ||||||
|                                      deterministic=True, render=False) |  | ||||||
|  |  | ||||||
|         # model arch |  | ||||||
|         policy_kwargs = dict(activation_fn=th.nn.ReLU, |  | ||||||
|                              net_arch=[512, 512, 512]) |  | ||||||
|  |  | ||||||
|         model = PPO('MlpPolicy', self.train_env, policy_kwargs=policy_kwargs, |  | ||||||
|                     tensorboard_log=f"{path}/ppo/tensorboard/", |  | ||||||
|                     **self.freqai_info['model_training_parameters'] |  | ||||||
|                     ) |  | ||||||
|  |  | ||||||
|         model.learn( |  | ||||||
|             total_timesteps=int(total_timesteps), |  | ||||||
|             callback=eval_callback |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|         best_model = PPO.load(dk.data_path / "best_model") |  | ||||||
|         print('Training finished!') |  | ||||||
|  |  | ||||||
|         return best_model |  | ||||||
|  |  | ||||||
|     def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test): |  | ||||||
|         """ |  | ||||||
|         User overrides this in their prediction model if they are custom a MyRLEnv. Othwerwise |  | ||||||
|         leaving this will default to Base5ActEnv |  | ||||||
|         """ |  | ||||||
|         train_df = data_dictionary["train_features"] |  | ||||||
|         test_df = data_dictionary["test_features"] |  | ||||||
|  |  | ||||||
|         # environments |  | ||||||
|         if not self.train_env: |  | ||||||
|             env_id = "train_env" |  | ||||||
|             num_cpu = int(self.freqai_info["data_kitchen_thread_count"] / 2) |  | ||||||
|             self.train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train, |  | ||||||
|                                             self.reward_params, self.CONV_WIDTH) for i |  | ||||||
|                                             in range(num_cpu)]) |  | ||||||
|  |  | ||||||
|             eval_env_id = 'eval_env' |  | ||||||
|             self.eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test, |  | ||||||
|                                            self.reward_params, self.CONV_WIDTH, monitor=True) for i |  | ||||||
|                                            in range(num_cpu)]) |  | ||||||
|         else: |  | ||||||
|             self.train_env.env_method('reset_env', train_df, prices_train, |  | ||||||
|                                       self.CONV_WIDTH, self.reward_params) |  | ||||||
|             self.eval_env.env_method('reset_env', train_df, prices_train, |  | ||||||
|                                      self.CONV_WIDTH, self.reward_params) |  | ||||||
|             self.train_env.env_method('reset') |  | ||||||
|             self.eval_env.env_method('reset') |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class MyRLEnv(Base3ActionRLEnv): |  | ||||||
|     """ |  | ||||||
|     User can override any function in BaseRLEnv and gym.Env |  | ||||||
|     """ |  | ||||||
|  |  | ||||||
|     def calculate_reward(self, action): |  | ||||||
|  |  | ||||||
|         if self._last_trade_tick is None: |  | ||||||
|             return 0. |  | ||||||
|  |  | ||||||
|         # close long |  | ||||||
|         if (action == Actions.Short.value or |  | ||||||
|                 action == Actions.Neutral.value) and self._position == Positions.Long: |  | ||||||
|             last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) |  | ||||||
|             current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) |  | ||||||
|             return float(np.log(current_price) - np.log(last_trade_price)) |  | ||||||
|  |  | ||||||
|         # close short |  | ||||||
|         if (action == Actions.Long.value or |  | ||||||
|                 action == Actions.Neutral.value) and self._position == Positions.Short: |  | ||||||
|             last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) |  | ||||||
|             current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) |  | ||||||
|             return float(np.log(last_trade_price) - np.log(current_price)) |  | ||||||
|  |  | ||||||
|         return 0. |  | ||||||
| @@ -1,115 +0,0 @@ | |||||||
| import logging |  | ||||||
| from typing import Any, Dict  # Optional |  | ||||||
| import torch as th |  | ||||||
| from stable_baselines3.common.callbacks import EvalCallback |  | ||||||
| from stable_baselines3.common.monitor import Monitor |  | ||||||
| from freqtrade.freqai.RL.Base5ActionRLEnv import Base5ActionRLEnv, Actions, Positions |  | ||||||
| from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel |  | ||||||
| from freqtrade.freqai.RL.TDQNagent import TDQN |  | ||||||
| from stable_baselines3 import DQN |  | ||||||
| from stable_baselines3.common.buffers import ReplayBuffer |  | ||||||
| import numpy as np |  | ||||||
| import gc |  | ||||||
| from freqtrade.freqai.data_kitchen import FreqaiDataKitchen |  | ||||||
|  |  | ||||||
| logger = logging.getLogger(__name__) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class ReinforcementLearningTDQN(BaseReinforcementLearningModel): |  | ||||||
|     """ |  | ||||||
|     User created Reinforcement Learning Model prediction model. |  | ||||||
|     """ |  | ||||||
|  |  | ||||||
|     def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen): |  | ||||||
|  |  | ||||||
|         train_df = data_dictionary["train_features"] |  | ||||||
|         test_df = data_dictionary["test_features"] |  | ||||||
|         eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df) |  | ||||||
|         total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df) |  | ||||||
|  |  | ||||||
|         path = dk.data_path |  | ||||||
|         eval_callback = EvalCallback(self.eval_env, best_model_save_path=f"{path}/", |  | ||||||
|                                      log_path=f"{path}/tdqn/logs/", eval_freq=int(eval_freq), |  | ||||||
|                                      deterministic=True, render=False) |  | ||||||
|  |  | ||||||
|         # model arch |  | ||||||
|         policy_kwargs = dict(activation_fn=th.nn.ReLU, |  | ||||||
|                              net_arch=[256, 256, 128]) |  | ||||||
|  |  | ||||||
|         model = TDQN('TMultiInputPolicy', self.train_env, |  | ||||||
|                      tensorboard_log=f"{path}/tdqn/tensorboard/", |  | ||||||
|                      policy_kwargs=policy_kwargs, |  | ||||||
|                      replay_buffer_class=ReplayBuffer, |  | ||||||
|                      **self.freqai_info['model_training_parameters'] |  | ||||||
|                      ) |  | ||||||
|  |  | ||||||
|         model.learn( |  | ||||||
|             total_timesteps=int(total_timesteps), |  | ||||||
|             callback=eval_callback |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|         del model |  | ||||||
|         best_model = DQN.load(dk.data_path / "best_model") |  | ||||||
|  |  | ||||||
|         print('Training finished!') |  | ||||||
|         gc.collect() |  | ||||||
|         return best_model |  | ||||||
|  |  | ||||||
|     def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test): |  | ||||||
|         """ |  | ||||||
|         User overrides this as shown here if they are using a custom MyRLEnv |  | ||||||
|         """ |  | ||||||
|         train_df = data_dictionary["train_features"] |  | ||||||
|         test_df = data_dictionary["test_features"] |  | ||||||
|  |  | ||||||
|         # environments |  | ||||||
|         if not self.train_env: |  | ||||||
|             self.train_env = MyRLEnv(df=train_df, prices=prices_train, window_size=self.CONV_WIDTH, |  | ||||||
|                                      reward_kwargs=self.reward_params) |  | ||||||
|             self.eval_env = Monitor(MyRLEnv(df=test_df, prices=prices_test, |  | ||||||
|                                     window_size=self.CONV_WIDTH, |  | ||||||
|                                     reward_kwargs=self.reward_params), ".") |  | ||||||
|         else: |  | ||||||
|             self.train_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params) |  | ||||||
|             self.eval_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params) |  | ||||||
|             self.train_env.reset() |  | ||||||
|             self.eval_env.reset() |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # User can inherit and customize 5 action environment |  | ||||||
| class MyRLEnv(Base5ActionRLEnv): |  | ||||||
|     """ |  | ||||||
|     User can override any function in BaseRLEnv and gym.Env. Here the user |  | ||||||
|     Adds 5 actions. |  | ||||||
|     """ |  | ||||||
|  |  | ||||||
|     def calculate_reward(self, action): |  | ||||||
|  |  | ||||||
|         if self._last_trade_tick is None: |  | ||||||
|             return 0. |  | ||||||
|  |  | ||||||
|         # close long |  | ||||||
|         if action == Actions.Long_sell.value and self._position == Positions.Long: |  | ||||||
|             last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) |  | ||||||
|             current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) |  | ||||||
|             return float(np.log(current_price) - np.log(last_trade_price)) |  | ||||||
|  |  | ||||||
|         if action == Actions.Long_sell.value and self._position == Positions.Long: |  | ||||||
|             if self.close_trade_profit[-1] > self.profit_aim * self.rr: |  | ||||||
|                 last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) |  | ||||||
|                 current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) |  | ||||||
|                 return float((np.log(current_price) - np.log(last_trade_price)) * 2) |  | ||||||
|  |  | ||||||
|         # close short |  | ||||||
|         if action == Actions.Short_buy.value and self._position == Positions.Short: |  | ||||||
|             last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) |  | ||||||
|             current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) |  | ||||||
|             return float(np.log(last_trade_price) - np.log(current_price)) |  | ||||||
|  |  | ||||||
|         if action == Actions.Short_buy.value and self._position == Positions.Short: |  | ||||||
|             if self.close_trade_profit[-1] > self.profit_aim * self.rr: |  | ||||||
|                 last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) |  | ||||||
|                 current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) |  | ||||||
|                 return float((np.log(last_trade_price) - np.log(current_price)) * 2) |  | ||||||
|  |  | ||||||
|         return 0. |  | ||||||
| @@ -1,148 +0,0 @@ | |||||||
| import logging |  | ||||||
| from typing import Any, Dict  # Optional |  | ||||||
| import torch as th |  | ||||||
| import numpy as np |  | ||||||
| import gym |  | ||||||
| from typing import Callable |  | ||||||
| from stable_baselines3.common.callbacks import EvalCallback |  | ||||||
| # EvalCallback , StopTrainingOnNoModelImprovement, StopTrainingOnRewardThreshold |  | ||||||
| from stable_baselines3.common.monitor import Monitor |  | ||||||
| from stable_baselines3.common.vec_env import SubprocVecEnv |  | ||||||
| from stable_baselines3.common.utils import set_random_seed |  | ||||||
| from stable_baselines3 import DQN |  | ||||||
| from freqtrade.freqai.RL.Base5ActionRLEnv import Base5ActionRLEnv, Actions, Positions |  | ||||||
| from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel |  | ||||||
| from freqtrade.freqai.RL.TDQNagent import TDQN |  | ||||||
| from stable_baselines3.common.buffers import ReplayBuffer |  | ||||||
| from freqtrade.freqai.data_kitchen import FreqaiDataKitchen |  | ||||||
|  |  | ||||||
| logger = logging.getLogger(__name__) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def make_env(env_id: str, rank: int, seed: int, train_df, price, |  | ||||||
|              reward_params, window_size, monitor=False) -> Callable: |  | ||||||
|     """ |  | ||||||
|     Utility function for multiprocessed env. |  | ||||||
|  |  | ||||||
|     :param env_id: (str) the environment ID |  | ||||||
|     :param num_env: (int) the number of environment you wish to have in subprocesses |  | ||||||
|     :param seed: (int) the inital seed for RNG |  | ||||||
|     :param rank: (int) index of the subprocess |  | ||||||
|     :return: (Callable) |  | ||||||
|     """ |  | ||||||
|     def _init() -> gym.Env: |  | ||||||
|  |  | ||||||
|         env = MyRLEnv(df=train_df, prices=price, window_size=window_size, |  | ||||||
|                       reward_kwargs=reward_params, id=env_id, seed=seed + rank) |  | ||||||
|         if monitor: |  | ||||||
|             env = Monitor(env, ".") |  | ||||||
|         return env |  | ||||||
|     set_random_seed(seed) |  | ||||||
|     return _init |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class ReinforcementLearningTDQN_multiproc(BaseReinforcementLearningModel): |  | ||||||
|     """ |  | ||||||
|     User created Reinforcement Learning Model prediction model. |  | ||||||
|     """ |  | ||||||
|  |  | ||||||
|     def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen): |  | ||||||
|  |  | ||||||
|         train_df = data_dictionary["train_features"] |  | ||||||
|         test_df = data_dictionary["test_features"] |  | ||||||
|         eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df) |  | ||||||
|         total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df) |  | ||||||
|  |  | ||||||
|         path = dk.data_path |  | ||||||
|  |  | ||||||
|         eval_callback = EvalCallback(self.eval_env, best_model_save_path=f"{path}/", |  | ||||||
|                                      log_path=f"{path}/tdqn/logs/", eval_freq=int(eval_freq), |  | ||||||
|                                      deterministic=True, render=False) |  | ||||||
|         # model arch |  | ||||||
|         policy_kwargs = dict(activation_fn=th.nn.ReLU, |  | ||||||
|                              net_arch=[512, 512, 512]) |  | ||||||
|  |  | ||||||
|         model = TDQN('TMultiInputPolicy', self.train_env, |  | ||||||
|                      policy_kwargs=policy_kwargs, |  | ||||||
|                      tensorboard_log=f"{path}/tdqn/tensorboard/", |  | ||||||
|                      replay_buffer_class=ReplayBuffer, |  | ||||||
|                      **self.freqai_info['model_training_parameters'] |  | ||||||
|                      ) |  | ||||||
|  |  | ||||||
|         model.learn( |  | ||||||
|             total_timesteps=int(total_timesteps), |  | ||||||
|             callback=eval_callback |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|         best_model = DQN.load(dk.data_path / "best_model.zip") |  | ||||||
|         print('Training finished!') |  | ||||||
|  |  | ||||||
|         return best_model |  | ||||||
|  |  | ||||||
|     def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test): |  | ||||||
|         """ |  | ||||||
|         User overrides this in their prediction model if they are custom a MyRLEnv. Othwerwise |  | ||||||
|         leaving this will default to Base5ActEnv |  | ||||||
|         """ |  | ||||||
|         train_df = data_dictionary["train_features"] |  | ||||||
|         test_df = data_dictionary["test_features"] |  | ||||||
|  |  | ||||||
|         # environments |  | ||||||
|         if not self.train_env: |  | ||||||
|             env_id = "train_env" |  | ||||||
|             num_cpu = int(self.freqai_info["data_kitchen_thread_count"] / 2) |  | ||||||
|             self.train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train, |  | ||||||
|                                             self.reward_params, self.CONV_WIDTH) for i |  | ||||||
|                                             in range(num_cpu)]) |  | ||||||
|  |  | ||||||
|             eval_env_id = 'eval_env' |  | ||||||
|             self.eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test, |  | ||||||
|                                            self.reward_params, self.CONV_WIDTH, monitor=True) for i |  | ||||||
|                                            in range(num_cpu)]) |  | ||||||
|         else: |  | ||||||
|             self.train_env.env_method('reset_env', train_df, prices_train, |  | ||||||
|                                       self.CONV_WIDTH, self.reward_params) |  | ||||||
|             self.eval_env.env_method('reset_env', train_df, prices_train, |  | ||||||
|                                      self.CONV_WIDTH, self.reward_params) |  | ||||||
|             self.train_env.env_method('reset') |  | ||||||
|             self.eval_env.env_method('reset') |  | ||||||
|  |  | ||||||
| # User can inherit and customize 5 action environment |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class MyRLEnv(Base5ActionRLEnv): |  | ||||||
|     """ |  | ||||||
|     User can override any function in BaseRLEnv and gym.Env. Here the user |  | ||||||
|     Adds 5 actions. |  | ||||||
|     """ |  | ||||||
|  |  | ||||||
|     def calculate_reward(self, action): |  | ||||||
|  |  | ||||||
|         if self._last_trade_tick is None: |  | ||||||
|             return 0. |  | ||||||
|  |  | ||||||
|         # close long |  | ||||||
|         if action == Actions.Long_sell.value and self._position == Positions.Long: |  | ||||||
|             last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) |  | ||||||
|             current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) |  | ||||||
|             return float(np.log(current_price) - np.log(last_trade_price)) |  | ||||||
|  |  | ||||||
|         if action == Actions.Long_sell.value and self._position == Positions.Long: |  | ||||||
|             if self.close_trade_profit[-1] > self.profit_aim * self.rr: |  | ||||||
|                 last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) |  | ||||||
|                 current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) |  | ||||||
|                 return float((np.log(current_price) - np.log(last_trade_price)) * 2) |  | ||||||
|  |  | ||||||
|         # close short |  | ||||||
|         if action == Actions.Short_buy.value and self._position == Positions.Short: |  | ||||||
|             last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) |  | ||||||
|             current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) |  | ||||||
|             return float(np.log(last_trade_price) - np.log(current_price)) |  | ||||||
|  |  | ||||||
|         if action == Actions.Short_buy.value and self._position == Positions.Short: |  | ||||||
|             if self.close_trade_profit[-1] > self.profit_aim * self.rr: |  | ||||||
|                 last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) |  | ||||||
|                 current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) |  | ||||||
|                 return float((np.log(last_trade_price) - np.log(current_price)) * 2) |  | ||||||
|  |  | ||||||
|         return 0. |  | ||||||