Merge pull request #7289 from freqtrade/feat/freqai-rl-dev

Add reinforcement learning module to FreqAI
2022-11-27 17:15:21 +01:00
parent 2219d2f491 732757e087
commit f4025ee5de
27 changed files with 1874 additions and 43 deletions
@@ -0,0 +1,135 @@
+import logging
+from enum import Enum
+
+from gym import spaces
+
+from freqtrade.freqai.RL.BaseEnvironment import BaseEnvironment, Positions
+
+
+logger = logging.getLogger(__name__)
+
+
+class Actions(Enum):
+    """
+    Discrete actions of the 4 action environment. The integer values are
+    what `step()` compares the incoming `action` against.
+    """
+    Neutral = 0
+    Exit = 1  # single exit action, used for both long and short positions
+    Long_enter = 2
+    Short_enter = 3
+
+
+class Base4ActionRLEnv(BaseEnvironment):
+    """
+    Base class for a 4 action environment.
+
+    Per candle the agent picks one member of `Actions`: stay neutral, exit the
+    open position, enter long or enter short.
+    """
+
+    def set_action_space(self):
+        """
+        Define the discrete action space, one entry per member of `Actions`.
+        """
+        self.action_space = spaces.Discrete(len(Actions))
+
+    def step(self, action: int):
+        """
+        Logic for a single step (incrementing one candle in time)
+        by the agent
+        :param: action: int = the action type that the agent plans
+            to take for the current step.
+        :returns:
+            observation = current state of environment
+            step_reward = the reward from `calculate_reward()`
+            _done = if the agent "died" or if the candles finished
+            info = dict passed back to openai gym lib
+        """
+        self._done = False
+        self._current_tick += 1
+
+        if self._current_tick == self._end_tick:
+            self._done = True
+
+        self._update_unrealized_total_profit()
+
+        # The reward is computed before the position is changed below.
+        step_reward = self.calculate_reward(action)
+        self.total_reward += step_reward
+
+        trade_type = None
+        if self.is_tradesignal(action):
+            # Transitions accepted by `is_tradesignal()` of this class:
+            #   Long_enter,  position Neutral       -> open long
+            #   Short_enter, position Neutral       -> open short
+            #   Exit,        position Long or Short -> close it, realize the profit
+            # The Neutral branch is only reachable if a subclass overrides
+            # `is_tradesignal()`.
+            if action == Actions.Neutral.value:
+                self._position = Positions.Neutral
+                trade_type = "neutral"
+                self._last_trade_tick = None
+            elif action == Actions.Long_enter.value:
+                self._position = Positions.Long
+                trade_type = "long"
+                self._last_trade_tick = self._current_tick
+            elif action == Actions.Short_enter.value:
+                self._position = Positions.Short
+                trade_type = "short"
+                self._last_trade_tick = self._current_tick
+            elif action == Actions.Exit.value:
+                # Realize the profit while the position is still set.
+                self._update_total_profit()
+                self._position = Positions.Neutral
+                trade_type = "neutral"
+                self._last_trade_tick = None
+            else:
+                # Library code should not write to stdout; use the module logger.
+                logger.warning("case not defined")
+
+            if trade_type is not None:
+                self.trade_history.append(
+                    {'price': self.current_price(), 'index': self._current_tick,
+                     'type': trade_type})
+
+        # End the episode once realized or unrealized profit falls below the
+        # drawdown floor (same rule as Base5ActionRLEnv).
+        if (self._total_profit < self.max_drawdown or
+                self._total_unrealized_profit < self.max_drawdown):
+            self._done = True
+
+        self._position_history.append(self._position)
+
+        info = dict(
+            tick=self._current_tick,
+            total_reward=self.total_reward,
+            total_profit=self._total_profit,
+            position=self._position.value
+        )
+
+        observation = self._get_observation()
+
+        self._update_history(info)
+
+        return observation, step_reward, self._done, info
+
+    def is_tradesignal(self, action: int) -> bool:
+        """
+        Determine if the signal is a trade signal, i.e. if it changes the position.
+        e.g.: Actions.Exit while in Positions.Long is a trade signal,
+        Actions.Long_enter while already in a position is not.
+        Values that match no member of `Actions` return True so that `step()`
+        reports them as an undefined case.
+        """
+        if action == Actions.Neutral.value:
+            return False
+        if action in (Actions.Long_enter.value, Actions.Short_enter.value):
+            # entries only count when no position is open
+            return self._position not in (Positions.Long, Positions.Short)
+        if action == Actions.Exit.value:
+            # exits only count when a position is open
+            return self._position != Positions.Neutral
+        return True
+
+    def _is_valid(self, action: int) -> bool:
+        """
+        Determine if the signal is valid.
+        e.g.: Actions.Exit while in Positions.Neutral is not valid.
+        """
+        in_position = self._position in (Positions.Short, Positions.Long)
+
+        # Agent should only try to exit if it is in position
+        if action == Actions.Exit.value and not in_position:
+            return False
+
+        # Agent should only try to enter if it is not in position
+        if action in (Actions.Short_enter.value, Actions.Long_enter.value):
+            if self._position != Positions.Neutral:
+                return False
+
+        return True
@@ -0,0 +1,145 @@
+import logging
+from enum import Enum
+
+from gym import spaces
+
+from freqtrade.freqai.RL.BaseEnvironment import BaseEnvironment, Positions
+
+
+logger = logging.getLogger(__name__)
+
+
+class Actions(Enum):
+    """
+    Discrete actions of the 5 action environment. The integer values are
+    what `step()` compares the incoming `action` against.
+    """
+    Neutral = 0
+    Long_enter = 1
+    Long_exit = 2
+    Short_enter = 3
+    Short_exit = 4
+
+
+class Base5ActionRLEnv(BaseEnvironment):
+    """
+    Base class for a 5 action environment.
+
+    Per candle the agent picks one member of `Actions`: stay neutral, enter or
+    exit a long, enter or exit a short.
+    """
+
+    def set_action_space(self):
+        """
+        Define the discrete action space, one entry per member of `Actions`.
+        """
+        self.action_space = spaces.Discrete(len(Actions))
+
+    def step(self, action: int):
+        """
+        Logic for a single step (incrementing one candle in time)
+        by the agent
+        :param: action: int = the action type that the agent plans
+            to take for the current step.
+        :returns:
+            observation = current state of environment
+            step_reward = the reward from `calculate_reward()`
+            _done = if the agent "died" or if the candles finished
+            info = dict passed back to openai gym lib
+        """
+        self._done = False
+        self._current_tick += 1
+
+        if self._current_tick == self._end_tick:
+            self._done = True
+
+        self._update_unrealized_total_profit()
+        # The reward is computed before the position is changed below.
+        step_reward = self.calculate_reward(action)
+        self.total_reward += step_reward
+
+        trade_type = None
+        if self.is_tradesignal(action):
+            # Transitions accepted by `is_tradesignal()` of this class:
+            #   Long_enter,  position Neutral -> open long
+            #   Short_enter, position Neutral -> open short
+            #   Long_exit,   position Long    -> close long, realize the profit
+            #   Short_exit,  position Short   -> close short, realize the profit
+            # The Neutral branch is only reachable if a subclass overrides
+            # `is_tradesignal()`.
+            if action == Actions.Neutral.value:
+                self._position = Positions.Neutral
+                trade_type = "neutral"
+                self._last_trade_tick = None
+            elif action == Actions.Long_enter.value:
+                self._position = Positions.Long
+                trade_type = "long"
+                self._last_trade_tick = self._current_tick
+            elif action == Actions.Short_enter.value:
+                self._position = Positions.Short
+                trade_type = "short"
+                self._last_trade_tick = self._current_tick
+            elif action in (Actions.Long_exit.value, Actions.Short_exit.value):
+                # Realize the profit while the position is still set.
+                self._update_total_profit()
+                self._position = Positions.Neutral
+                trade_type = "neutral"
+                self._last_trade_tick = None
+            else:
+                # Library code should not write to stdout; use the module logger.
+                logger.warning("case not defined")
+
+            if trade_type is not None:
+                self.trade_history.append(
+                    {'price': self.current_price(), 'index': self._current_tick,
+                     'type': trade_type})
+
+        # End the episode once realized or unrealized profit falls below the
+        # drawdown floor.
+        if (self._total_profit < self.max_drawdown or
+                self._total_unrealized_profit < self.max_drawdown):
+            self._done = True
+
+        self._position_history.append(self._position)
+
+        info = dict(
+            tick=self._current_tick,
+            total_reward=self.total_reward,
+            total_profit=self._total_profit,
+            position=self._position.value
+        )
+
+        observation = self._get_observation()
+
+        self._update_history(info)
+
+        return observation, step_reward, self._done, info
+
+    def is_tradesignal(self, action: int) -> bool:
+        """
+        Determine if the signal is a trade signal, i.e. if it changes the position.
+        e.g.: Actions.Long_exit while in Positions.Short is not a trade signal.
+        Values that match no member of `Actions` return True so that `step()`
+        reports them as an undefined case.
+        """
+        if action == Actions.Neutral.value:
+            return False
+        if action in (Actions.Long_enter.value, Actions.Short_enter.value):
+            # entries only count when no position is open
+            return self._position not in (Positions.Long, Positions.Short)
+        if action == Actions.Long_exit.value:
+            return self._position not in (Positions.Short, Positions.Neutral)
+        if action == Actions.Short_exit.value:
+            return self._position not in (Positions.Long, Positions.Neutral)
+        return True
+
+    def _is_valid(self, action: int) -> bool:
+        """
+        Determine if the signal is valid.
+        e.g.: an exit action while in Positions.Neutral is not valid.
+        Note that an exit action is accepted for either open position; the side
+        is not checked here.
+        """
+        in_position = self._position in (Positions.Short, Positions.Long)
+
+        # Agent should only try to exit if it is in position
+        if action in (Actions.Short_exit.value, Actions.Long_exit.value) and not in_position:
+            return False
+
+        # Agent should only try to enter if it is not in position
+        if action in (Actions.Short_enter.value, Actions.Long_enter.value):
+            if self._position != Positions.Neutral:
+                return False
+
+        return True
@@ -0,0 +1,302 @@
+import logging
+from abc import abstractmethod
+from enum import Enum
+from typing import Optional
+
+import gym
+import numpy as np
+import pandas as pd
+from gym import spaces
+from gym.utils import seeding
+from pandas import DataFrame
+
+from freqtrade.data.dataprovider import DataProvider
+
+
+logger = logging.getLogger(__name__)
+
+
+class Positions(Enum):
+    """
+    Market side held by the agent.
+    """
+    Short = 0
+    Long = 1
+    Neutral = 0.5
+
+    def opposite(self):
+        """
+        Return Short for Long; every other member (Neutral included) maps to Long.
+        """
+        if self == Positions.Long:
+            return Positions.Short
+        return Positions.Long
+
+
+class BaseEnvironment(gym.Env):
+    """
+    Base class for environments. This class is agnostic to action count.
+    Inherited classes customize this to include varying action counts/types,
+    See RL/Base5ActionRLEnv.py and RL/Base4ActionRLEnv.py
+    """
+
+    def __init__(self, df: Optional[DataFrame] = None, prices: Optional[DataFrame] = None,
+                 reward_kwargs: Optional[dict] = None, window_size=10, starting_point=True,
+                 id: str = 'baseenv-1', seed: int = 1, config: Optional[dict] = None,
+                 dp: Optional[DataProvider] = None):
+        """
+        Initializes the training/eval environment.
+        :param df: dataframe of features (defaults to an empty dataframe)
+        :param prices: dataframe of prices to be used in the training environment
+            (defaults to an empty dataframe)
+        :param window_size: size of window (temporal) to pass to the agent
+        :param reward_kwargs: extra config settings assigned by user in `rl_config`;
+            must contain `rr` and `profit_aim`
+        :param starting_point: start at edge of window or not
+        :param id: string id of the environment (used in backend for multiprocessed env)
+        :param seed: Sets the seed of the environment higher in the gym.Env object
+        :param config: Typical user configuration file; must contain
+            `freqai.rl_config` and `stake_amount`
+        :param dp: dataprovider from freqtrade
+        :raises KeyError: if `config` or `reward_kwargs` lack the required keys
+        """
+        # Fresh objects per call: default arguments are evaluated once at
+        # definition time and would otherwise be shared between all instances.
+        df = DataFrame() if df is None else df
+        prices = DataFrame() if prices is None else prices
+        reward_kwargs = {} if reward_kwargs is None else reward_kwargs
+        config = {} if config is None else config
+
+        self.config = config
+        self.rl_config = config['freqai']['rl_config']
+        self.add_state_info = self.rl_config.get('add_state_info', False)
+        self.id = id
+        self.seed(seed)
+        self.reset_env(df, prices, window_size, reward_kwargs, starting_point)
+        # total profit below this value ends the episode
+        self.max_drawdown = 1 - self.rl_config.get('max_training_drawdown_pct', 0.8)
+        self.compound_trades = config['stake_amount'] == 'unlimited'
+        # fee precedence: explicit config value, then the exchange, then a constant
+        if self.config.get('fee', None) is not None:
+            self.fee = self.config['fee']
+        elif dp is not None:
+            self.fee = dp._exchange.get_fee(symbol=dp.current_whitelist()[0])  # type: ignore
+        else:
+            self.fee = 0.0015
+
+    def reset_env(self, df: DataFrame, prices: DataFrame, window_size: int,
+                  reward_kwargs: dict, starting_point=True):
+        """
+        Load the data into the environment and rebuild the spaces and the
+        episode bookkeeping. Called from `__init__()`.
+        :param df: dataframe of features
+        :param prices: dataframe of prices to be used in the training environment
+        :param window_size: size of window (temporal) to pass to the agent
+        :param reward_kwargs: extra config settings assigned by user in `rl_config`
+        :param starting_point: start at edge of window or not
+        """
+        # data and user settings
+        self.df = df
+        self.signal_features = self.df
+        self.prices = prices
+        self.window_size = window_size
+        self.starting_point = starting_point
+        self.rr = reward_kwargs["rr"]
+        self.profit_aim = reward_kwargs["profit_aim"]
+
+        # spaces: three state columns are added on top of the features when
+        # `add_state_info` is set
+        n_state_columns = 3 if self.add_state_info else 0
+        self.total_features = self.signal_features.shape[1] + n_state_columns
+        self.shape = (window_size, self.total_features)
+        self.set_action_space()
+        self.observation_space = spaces.Box(
+            low=-1, high=1, shape=self.shape, dtype=np.float32)
+
+        # episode boundaries
+        self._start_tick: int = self.window_size
+        self._end_tick: int = len(self.prices) - 1
+        self._current_tick: int = self._start_tick
+        self._done: bool = False
+
+        # position bookkeeping
+        self._position = Positions.Neutral
+        self._last_trade_tick: Optional[int] = None
+        self._position_history: list = [None]
+
+        # rewards and profits; a profit of 1 is the unit starting balance
+        self.total_reward: float = 0
+        self._total_profit: float = 1
+        self._total_unrealized_profit: float = 1
+        self.history: dict = {}
+        self.trade_history: list = []
+
+    @abstractmethod
+    def set_action_space(self):
+        """
+        Assign `self.action_space`. This is unique to the environment's action
+        count, so it must be implemented by the inheriting class.
+        """
+
+    def seed(self, seed: int = 1):
+        """
+        Seed the environment's random number generator.
+        :param seed: seed handed to `gym.utils.seeding.np_random`
+        :return: single element list holding the seed that was actually used
+        """
+        generator, used_seed = seeding.np_random(seed)
+        self.np_random = generator
+        return [used_seed]
+
+    def reset(self):
+        """
+        Start a new episode: rewind to the start tick, go back to a neutral
+        position and clear all reward/profit/history bookkeeping.
+        :return: the first observation of the episode, from `_get_observation()`
+        """
+        self._done = False
+
+        self._current_tick = self._start_tick
+        self._last_trade_tick = None
+        # Reset the position *before* seeding the history below, otherwise the
+        # first entry of a new episode is the final position of the previous one.
+        self._position = Positions.Neutral
+
+        if self.starting_point is True:
+            self._position_history = (self._start_tick * [None]) + [self._position]
+        else:
+            self._position_history = (self.window_size * [None]) + [self._position]
+
+        self.total_reward = 0.
+        self._total_profit = 1.  # unit
+        self.history = {}
+        self.trade_history = []
+        self.portfolio_log_returns = np.zeros(len(self.prices))
+
+        self._profits = [(self._start_tick, 1)]
+        self.close_trade_profit = []
+        self._total_unrealized_profit = 1
+
+        return self._get_observation()
+
+    @abstractmethod
+    def step(self, action: int):
+        """
+        Step depends on the action types, so this must be implemented by the
+        inheriting class.
+        :param action: int = the action chosen by the agent for the current candle
+        """
+        return
+
+    def _get_observation(self):
+        """
+        Return the last `window_size` rows of features up to (excluding) the
+        current tick. With `add_state_info` set, the columns
+        `current_profit_pct`, `position` and `trade_duration` are appended,
+        each holding one value repeated over the whole window.
+        This may or may not be independent of action types, user can inherit
+        this in their custom "MyRLEnv"
+        """
+        window_start = self._current_tick - self.window_size
+        features_window = self.signal_features[window_start:self._current_tick]
+        if not self.add_state_info:
+            return features_window
+
+        state_columns = ['current_profit_pct', 'position', 'trade_duration']
+        state = DataFrame(np.zeros((len(features_window), 3)),
+                          columns=state_columns,
+                          index=features_window.index)
+        state['current_profit_pct'] = self.get_unrealized_profit()
+        state['position'] = self._position.value
+        state['trade_duration'] = self.get_trade_duration()
+        return pd.concat([features_window, state], axis=1)
+
+    def get_trade_duration(self):
+        """
+        Number of candles since the current trade was opened, or 0 if no
+        trade is open.
+        """
+        opened_at = self._last_trade_tick
+        return 0 if opened_at is None else self._current_tick - opened_at
+
+    def get_unrealized_profit(self):
+        """
+        Get the unrealized profit ratio of the open trade, fees included, based
+        on the open price of the entry candle and of the current candle.
+        :return: 0. if no trade is open, otherwise (proceeds - cost) relative to
+            the fee-adjusted entry price
+
+        The fee helpers are named from a long trade's point of view:
+        `add_entry_fee()` gives the price paid when buying and `add_exit_fee()`
+        gives the price received when selling. Fees must always reduce the
+        result, so a round trip at an unchanged price yields a small loss.
+        """
+        if (self._last_trade_tick is None
+                or self._position not in (Positions.Long, Positions.Short)):
+            return 0.
+
+        entry_open = self.prices.iloc[self._last_trade_tick].open
+        current_open = self.prices.iloc[self._current_tick].open
+
+        if self._position == Positions.Long:
+            # bought at entry, would sell now
+            cost = self.add_entry_fee(entry_open)
+            proceeds = self.add_exit_fee(current_open)
+            return (proceeds - cost) / cost
+
+        # Short: sold at entry, would buy back now
+        proceeds = self.add_exit_fee(entry_open)
+        cost = self.add_entry_fee(current_open)
+        return (proceeds - cost) / proceeds
+
+    @abstractmethod
+    def is_tradesignal(self, action: int) -> bool:
+        """
+        Determine if the signal is a trade signal. This is
+        unique to the actions in the environment, and therefore must be
+        implemented by the inheriting class.
+        :param action: int = the action chosen by the agent for the current candle
+        """
+        return True
+
+    def _is_valid(self, action: int) -> bool:
+        """
+        Determine if the signal is valid. This base implementation accepts
+        every action; validity is unique to the actions in the environment,
+        so inheriting classes are expected to override it.
+        :param action: int = the action chosen by the agent for the current candle
+        """
+        return True
+
+    def add_entry_fee(self, price):
+        """
+        Return `price` increased by the fee, i.e. `price * (1 + fee)`.
+        """
+        fee_factor = 1 + self.fee
+        return price * fee_factor
+
+    def add_exit_fee(self, price):
+        """
+        Return `price` decreased by the fee, i.e. `price / (1 + fee)`.
+        """
+        fee_factor = 1 + self.fee
+        return price / fee_factor
+
+    def _update_history(self, info):
+        """
+        Append every value of `info` to the per-key lists in `self.history`.
+        The lists are created from the keys of the first `info` that is seen.
+        :param info: dict built by `step()` for the current candle
+        """
+        if not self.history:
+            self.history = {name: [] for name in info}
+
+        for name in info:
+            self.history[name].append(info[name])
+
+    @abstractmethod
+    def calculate_reward(self, action: int) -> float:
+        """
+        The reward function, which must be implemented by the inheriting class.
+        This is the one function that users will likely wish to inject their
+        own creativity into.
+        :param action: int = The action made by the agent for the current candle.
+        :return:
+        float = the reward to give to the agent for current step (used for optimization
+            of weights in NN)
+        """
+
+    def _update_unrealized_total_profit(self):
+        """
+        Update the unrealized total profit in case of episode end. Nothing is
+        changed while the agent holds no position.
+        """
+        if self._position not in (Positions.Long, Positions.Short):
+            return
+
+        pnl = self.get_unrealized_profit()
+        # unit stake in both cases: compounding multiplies, otherwise pnl is added
+        self._total_unrealized_profit = (
+            self._total_profit * (1 + pnl) if self.compound_trades
+            else self._total_profit + pnl
+        )
+
+    def _update_total_profit(self):
+        """
+        Fold the profit of the open trade into the realized total profit.
+        """
+        trade_pnl = self.get_unrealized_profit()
+        if not self.compound_trades:
+            # assumes unit stake and no compounding
+            self._total_profit = self._total_profit + trade_pnl
+            return
+        # assumes unit stake and compounding
+        self._total_profit = self._total_profit * (1 + trade_pnl)
+
+    def current_price(self) -> float:
+        """
+        Open price of the candle at the current tick.
+        """
+        current_candle = self.prices.iloc[self._current_tick]
+        return current_candle.open
+
+    # Keeping around in case we want to start building more complex environment
+    # templates in the future.
+    # def most_recent_return(self):
+    #     """
+    #     Calculate the tick to tick return if in a trade.
+    #     Return is generated from rising prices in Long
+    #     and falling prices in Short positions.
+    #     The actions Sell/Buy or Hold during a Long position trigger the sell/buy-fee.
+    #     """
+    #     # Long positions
+    #     if self._position == Positions.Long:
+    #         current_price = self.prices.iloc[self._current_tick].open
+    #         previous_price = self.prices.iloc[self._current_tick - 1].open
+
+    #         if (self._position_history[self._current_tick - 1] == Positions.Short
+    #                 or self._position_history[self._current_tick - 1] == Positions.Neutral):
+    #             previous_price = self.add_entry_fee(previous_price)
+
+    #         return np.log(current_price) - np.log(previous_price)
+
+    #     # Short positions
+    #     if self._position == Positions.Short:
+    #         current_price = self.prices.iloc[self._current_tick].open
+    #         previous_price = self.prices.iloc[self._current_tick - 1].open
+    #         if (self._position_history[self._current_tick - 1] == Positions.Long
+    #                 or self._position_history[self._current_tick - 1] == Positions.Neutral):
+    #             previous_price = self.add_exit_fee(previous_price)
+
+    #         return np.log(previous_price) - np.log(current_price)
+
+    #     return 0
+
+    # def update_portfolio_log_returns(self, action):
+    #     self.portfolio_log_returns[self._current_tick] = self.most_recent_return(action)
@@ -0,0 +1,395 @@
+import importlib
+import logging
+from abc import abstractmethod
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Callable, Dict, Optional, Tuple, Type, Union
+
+import gym
+import numpy as np
+import numpy.typing as npt
+import pandas as pd
+import torch as th
+import torch.multiprocessing
+from pandas import DataFrame
+from stable_baselines3.common.callbacks import EvalCallback
+from stable_baselines3.common.monitor import Monitor
+from stable_baselines3.common.utils import set_random_seed
+from stable_baselines3.common.vec_env import SubprocVecEnv
+
+from freqtrade.exceptions import OperationalException
+from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
+from freqtrade.freqai.freqai_interface import IFreqaiModel
+from freqtrade.freqai.RL.Base5ActionRLEnv import Actions, Base5ActionRLEnv
+from freqtrade.freqai.RL.BaseEnvironment import Positions
+from freqtrade.persistence import Trade
+
+
+logger = logging.getLogger(__name__)
+
+# Module-level side effect: select the 'file_system' sharing strategy of
+# torch.multiprocessing as soon as this module is imported.
+torch.multiprocessing.set_sharing_strategy('file_system')
+
+# `model_type` values that are loaded from the `stable_baselines3` package
+SB3_MODELS = ['PPO', 'A2C', 'DQN']
+# `model_type` values that are loaded from the `sb3_contrib` package
+SB3_CONTRIB_MODELS = ['TRPO', 'ARS', 'RecurrentPPO', 'MaskablePPO']
+
+
+class BaseReinforcementLearningModel(IFreqaiModel):
+    """
+    User created Reinforcement Learning Model prediction class
+    """
+
+    def __init__(self, **kwargs) -> None:
+        """
+        Read the `rl_config` section of the FreqAI settings, cap the torch
+        thread count and resolve the model class to train.
+        :param kwargs: must contain `config`, which is passed on to the parent class
+        :raises OperationalException: if `model_type` is listed in neither
+            SB3_MODELS nor SB3_CONTRIB_MODELS
+        """
+        super().__init__(config=kwargs['config'])
+        self.rl_config = self.freqai_info['rl_config']
+        # at most half of the system threads, but never less than one
+        self.max_threads = min(self.rl_config.get('cpu_count', 1),
+                               max(int(self.max_system_threads / 2), 1))
+        th.set_num_threads(self.max_threads)
+        self.reward_params = self.rl_config['model_reward_parameters']
+        self.train_env: Union[SubprocVecEnv, gym.Env] = None
+        self.eval_env: Union[SubprocVecEnv, gym.Env] = None
+        self.eval_callback: Optional[EvalCallback] = None
+        self.model_type = self.rl_config['model_type']
+        self.continual_learning = self.freqai_info.get('continual_learning', False)
+        if self.model_type in SB3_MODELS:
+            import_str = 'stable_baselines3'
+        elif self.model_type in SB3_CONTRIB_MODELS:
+            import_str = 'sb3_contrib'
+        else:
+            raise OperationalException(f'{self.model_type} not available in stable_baselines3 or '
+                                       f'sb3_contrib. please choose one of {SB3_MODELS} or '
+                                       f'{SB3_CONTRIB_MODELS}')
+
+        # The second argument of `import_module` is only an anchor for relative
+        # module names, so it is not passed; the class is looked up on the package.
+        mod = importlib.import_module(import_str)
+        self.MODELCLASS = getattr(mod, self.model_type)
+        self.policy_type = self.rl_config['policy_type']
+        self.unset_outlier_removal()
+        self.net_arch = self.rl_config.get('net_arch', [128, 128])
+
+    def unset_outlier_removal(self):
+        """
+        Switch off every user setting that may remove or reorder training
+        points (SVM and DBSCAN outlier removal, shuffling of the data split)
+        and warn about each setting that had to be changed.
+        """
+        outlier_flags = (
+            ('use_SVM_to_remove_outliers',
+             'User tried to use SVM with RL. Deactivating SVM.'),
+            ('use_DBSCAN_to_remove_outliers',
+             'User tried to use DBSCAN with RL. Deactivating DBSCAN.'),
+        )
+        for flag, warning in outlier_flags:
+            if self.ft_params.get(flag, False):
+                self.ft_params.update({flag: False})
+                logger.warning(warning)
+
+        split_params = self.freqai_info['data_split_parameters']
+        if split_params.get('shuffle', False):
+            split_params.update({'shuffle': False})
+            logger.warning('User tried to shuffle training data. Setting shuffle to False')
+
+    def train(
+        self, unfiltered_df: DataFrame, pair: str, dk: FreqaiDataKitchen, **kwargs
+    ) -> Any:
+        """
+        Filter the training data and train a model to it. Train makes heavy use of the datakitchen
+        for storing, saving, loading, and analyzing the data.
+        :param unfiltered_df: Full dataframe for the current training period
+        :param pair: pair to train on, used for logging and for building the price dataframes
+        :param dk: the datakitchen for the current pair
+        :returns:
+        :model: Trained model which can be used to inference (self.predict)
+        """
+
+        logger.info("--------------------Starting training " f"{pair} --------------------")
+
+        features_filtered, labels_filtered = dk.filter_features(
+            unfiltered_df,
+            dk.training_features_list,
+            dk.label_list,
+            training_filter=True,
+        )
+
+        data_dictionary: Dict[str, Any] = dk.make_train_test_datasets(
+            features_filtered, labels_filtered)
+        dk.fit_labels()  # FIXME useless for now, but just satiating append methods
+
+        # The price dataframes are built from `dk.data_dictionary` before the
+        # features are normalized.
+        # NOTE(review): presumably `make_train_test_datasets()` also stores its
+        # result on `dk.data_dictionary` - confirm.
+        prices_train, prices_test = self.build_ohlc_price_dataframes(dk.data_dictionary, pair, dk)
+        # normalize all data based on train_dataset only
+        data_dictionary = dk.normalize_data(data_dictionary)
+
+        # data cleaning/analysis
+        self.data_cleaning_train(dk)
+
+        logger.info(
+            f'Training model on {len(dk.data_dictionary["train_features"].columns)}'
+            f' features and {len(data_dictionary["train_features"])} data points'
+        )
+
+        # the environments must exist before `fit()` is called
+        self.set_train_and_eval_environments(data_dictionary, prices_train, prices_test, dk)
+
+        model = self.fit(data_dictionary, dk)
+
+        logger.info(f"--------------------done training {pair}--------------------")
+
+        return model
+
+    def set_train_and_eval_environments(self, data_dictionary: Dict[str, DataFrame],
+                                        prices_train: DataFrame, prices_test: DataFrame,
+                                        dk: FreqaiDataKitchen):
+        """
+        Build the training environment, the monitored evaluation environment and
+        the evaluation callback, and store them on the model.
+        User can override this if they are using a custom MyRLEnv
+        :param data_dictionary: dict = common data dictionary containing train and test
+            features/labels/weights.
+        :param prices_train/test: DataFrame = dataframe comprised of the prices to be used in the
+            environment during training or testing
+        :param dk: FreqaiDataKitchen = the datakitchen for the current pair
+        """
+        train_features = data_dictionary["train_features"]
+        test_features = data_dictionary["test_features"]
+
+        # settings shared by the training and the evaluation environment
+        shared_env_kwargs = dict(window_size=self.CONV_WIDTH,
+                                 reward_kwargs=self.reward_params,
+                                 config=self.config,
+                                 dp=self.data_provider)
+
+        self.train_env = self.MyRLEnv(df=train_features, prices=prices_train,
+                                      **shared_env_kwargs)
+        self.eval_env = Monitor(
+            self.MyRLEnv(df=test_features, prices=prices_test, **shared_env_kwargs))
+        # evaluate once per pass over the training data; the best model is
+        # saved to the datakitchen's data path
+        self.eval_callback = EvalCallback(self.eval_env, deterministic=True,
+                                          render=False, eval_freq=len(train_features),
+                                          best_model_save_path=str(dk.data_path))
+
+    @abstractmethod
+    def fit(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen, **kwargs):
+        """
+        Agent customizations and abstract Reinforcement Learning customizations
+        go in here. Abstract method, so this function must be overridden by
+        user class.
+        :param data_dictionary: dict = common data dictionary containing train and test
+            features/labels/weights.
+        :param dk: FreqaiDataKitchen = the datakitchen for the current pair
+        :return: the trained model; `train()` returns this value to its caller
+        """
+        return
+
+    def get_state_info(self, pair: str) -> Tuple[float, float, int]:
+        """
+        State info during dry/live (not backtesting) which is fed back
+        into the model.
+        :param pair: str = COIN/STAKE to get the environment information for
+        :return:
+        :market_side: float = representing short, long, or neutral for
+            pair (the values of the `Positions` enum)
+        :current_profit: float = unrealized profit of the current trade
+        :trade_duration: int = the number of candles that the trade has
+            been open for
+        """
+        open_trades = Trade.get_trades_proxy(is_open=True)
+        # defaults describe "no open trade for this pair"
+        market_side = Positions.Neutral.value
+        current_profit: float = 0
+        trade_duration = 0
+        for trade in open_trades:
+            if trade.pair != pair:
+                continue
+
+            if self.data_provider._exchange is None:  # type: ignore
+                logger.error('No exchange available.')
+                # NOTE(review): 0 is also the value of Positions.Short - confirm
+                # that callers treat this tuple as an error marker.
+                return 0, 0, 0
+            current_rate = self.data_provider._exchange.get_rate(  # type: ignore
+                pair, refresh=False, side="exit", is_short=trade.is_short)
+
+            now = datetime.now(timezone.utc).timestamp()
+            trade_duration = int((now - trade.open_date_utc.timestamp()) / self.base_tf_seconds)
+            current_profit = trade.calc_profit_ratio(current_rate)
+            # Report the side of the open trade; previously this was left at the
+            # neutral default even while a trade was open.
+            market_side = (Positions.Short if trade.is_short else Positions.Long).value
+
+        return market_side, current_profit, int(trade_duration)
+
+    def predict(
+        self, unfiltered_df: DataFrame, dk: FreqaiDataKitchen, **kwargs
+    ) -> Tuple[DataFrame, npt.NDArray[np.int_]]:
+        """
+        Filter the prediction features data and predict with it.
+        :param unfiltered_df: Full dataframe for the current backtest period.
+        :param dk: FreqaiDataKitchen = data kitchen for the current pair
+        :return:
+        :pred_df: dataframe containing the predictions (NaNs replaced by 0)
+        :do_predict: np.array of 1s and 0s to indicate places where freqai needed to remove
+        data (NaNs) or felt uncertain about data (PCA and DI index)
+        """
+
+        dk.find_features(unfiltered_df)
+        features, _ = dk.filter_features(
+            unfiltered_df, dk.training_features_list, training_filter=False
+        )
+        dk.data_dictionary["prediction_features"] = dk.normalize_data_from_metadata(features)
+
+        # optional additional data cleaning/analysis; works on dk.data_dictionary
+        self.data_cleaning_predict(dk)
+
+        # read the features back from dk, since the cleaning step may have replaced them
+        predictions = self.rl_model_predict(
+            dk.data_dictionary["prediction_features"], dk, self.model)
+        predictions.fillna(0, inplace=True)
+
+        return predictions, dk.do_predict
+
+    def rl_model_predict(self, dataframe: DataFrame,
+                         dk: FreqaiDataKitchen, model: Any) -> DataFrame:
+        """
+        A helper function to make predictions in the Reinforcement learning module.
+        The model is queried once per rolling window of `self.CONV_WIDTH` rows.
+        :param dataframe: DataFrame = the dataframe of features to make the predictions on
+        :param dk: FreqaiDatakitchen = data kitchen for the current pair
+        :param model: Any = the trained model used to inference the features.
+        :return: DataFrame = one row per row of `dataframe`, columns taken from
+            `dk.label_list`, holding the value returned by `model.predict` for the
+            window ending at that row
+        """
+        # Zero-filled placeholder; it only supplies the shape and index that the
+        # rolling windows below iterate over.
+        # NOTE(review): a 1-D array with columns=dk.label_list presumably requires
+        # exactly one label - confirm.
+        output = pd.DataFrame(np.zeros(len(dataframe)), columns=dk.label_list)
+
+        def _predict(window):
+            # `window` is a slice of the placeholder; only its index is used, to
+            # pick the matching feature rows out of `dataframe`.
+            observations = dataframe.iloc[window.index]
+            # In dry/live runs with `add_state_info` enabled, the current trade
+            # state is appended as three extra feature columns.
+            if self.live and self.rl_config.get('add_state_info', False):
+                market_side, current_profit, trade_duration = self.get_state_info(dk.pair)
+                observations['current_profit_pct'] = current_profit
+                observations['position'] = market_side
+                observations['trade_duration'] = trade_duration
+            res, _ = model.predict(observations, deterministic=True)
+            return res
+
+        # NOTE(review): with pandas' default min_periods, rows before the first
+        # full window presumably come back as NaN - confirm.
+        output = output.rolling(window=self.CONV_WIDTH).apply(_predict)
+
+        return output
+
+    def build_ohlc_price_dataframes(self, data_dictionary: dict,
+                                    pair: str, dk: FreqaiDataKitchen) -> Tuple[DataFrame,
+                                                                               DataFrame]:
+        """
+        Builds the train prices and test prices for the environment.
+        :param data_dictionary: dict = must hold "train_features" and "test_features"
+        :param pair: str = pair whose raw price columns are extracted (any ':' is removed
+            before the column names are built)
+        :param dk: FreqaiDataKitchen = data kitchen for the current pair (not used here)
+        :return: (prices_train, prices_test), each restricted to the raw price columns,
+            renamed to 'open', 'low', 'high', 'close' and re-indexed from 0
+        :raises OperationalException: if the train features contain none of the raw
+            price columns
+        """
+
+        pair = pair.replace(':', '')
+        train_df = data_dictionary["train_features"]
+        test_df = data_dictionary["test_features"]
+
+        # price data for model training and evaluation
+        tf = self.config['timeframe']
+        # raw feature column -> plain OHLC name ('high' used to carry a stray leading space)
+        rename_dict = {f'%-{pair}raw_{col}_{tf}': col
+                       for col in ('open', 'low', 'high', 'close')}
+        ohlc_list = list(rename_dict)
+
+        prices_train = train_df.filter(ohlc_list, axis=1)
+        if prices_train.empty:
+            raise OperationalException('Reinforcement learning module didnt find the raw prices '
+                                       'assigned in populate_any_indicators. Please assign them '
+                                       'with:\n'
+                                       'informative[f"%-{pair}raw_close"] = informative["close"]\n'
+                                       'informative[f"%-{pair}raw_open"] = informative["open"]\n'
+                                       'informative[f"%-{pair}raw_high"] = informative["high"]\n'
+                                       'informative[f"%-{pair}raw_low"] = informative["low"]\n')
+        # reset_index() returns a new frame, so the result has to be kept
+        prices_train = prices_train.rename(columns=rename_dict).reset_index(drop=True)
+
+        prices_test = test_df.filter(ohlc_list, axis=1)
+        prices_test = prices_test.rename(columns=rename_dict).reset_index(drop=True)
+
+        return prices_train, prices_test
+
+    def load_model_from_disk(self, dk: FreqaiDataKitchen) -> Any:
+        """
+        Can be used by user if they are trying to limit_ram_usage *and*
+        perform continual learning.
+        For now, this is unused.
+        :param dk: FreqaiDataKitchen = data kitchen providing `data_path` and
+            `model_filename` of the model to load
+        :return: the model loaded through `self.MODELCLASS.load`, or None if no
+            model file exists on disk
+        """
+        model_path = Path(dk.data_path / f"{dk.model_filename}_model")
+        # Models are saved as "<model_filename>_model.zip", so look for the zipped
+        # file as well as the bare name.
+        zip_path = Path(f"{model_path}.zip")
+        if not (zip_path.is_file() or model_path.is_file()):
+            logger.info('No model file on disk to continue learning from.')
+            # previously fell through to `return model` with `model` never assigned
+            return None
+
+        return self.MODELCLASS.load(model_path)
+
+    def _on_stop(self):
+        """
+        Hook called on bot shutdown. Closes the training environment and then the
+        evaluation environment (when set), so that SubprocVecEnv subprocesses are
+        shut down cleanly.
+        """
+        for env_attr in ('train_env', 'eval_env'):
+            env = getattr(self, env_attr)
+            if env:
+                env.close()
+
+    # Nested class which can be overridden by user to customize further
+    class MyRLEnv(Base5ActionRLEnv):
+        """
+        Default environment. Any function of BaseRLEnv and gym.Env can be
+        overridden by the user; this implementation sets a custom reward based
+        on profit and trade duration.
+        """
+
+        def calculate_reward(self, action: int) -> float:
+            """
+            An example reward function. This is the one function that users will likely
+            wish to inject their own creativity into.
+            :param action: int = The action made by the agent for the current candle.
+            :return:
+            float = the reward to give to the agent for current step (used for optimization
+                of weights in NN)
+            """
+            # invalid actions are penalized before anything else is looked at
+            if not self._is_valid(action):
+                return -2
+
+            pnl = self.get_unrealized_profit()
+
+            if self._position == Positions.Neutral:
+                # reward agent for entering trades
+                if action in (Actions.Long_enter.value, Actions.Short_enter.value):
+                    return 25
+                # discourage agent from not entering trades
+                if action == Actions.Neutral.value:
+                    return -1
+
+            max_trade_duration = self.rl_config.get('max_trade_duration_candles', 300)
+            trade_duration = (self._current_tick - self._last_trade_tick
+                              if self._last_trade_tick else 0)
+
+            # trades within the allowed duration get a larger factor than overlong ones
+            factor = 100. * (1.5 if trade_duration <= max_trade_duration else 0.5)
+
+            # discourage sitting in position
+            in_position = self._position in (Positions.Short, Positions.Long)
+            if in_position and action == Actions.Neutral.value:
+                return -1 * trade_duration / max_trade_duration
+
+            # close long / close short share the same reward computation
+            closes_long = (action == Actions.Long_exit.value
+                           and self._position == Positions.Long)
+            closes_short = (action == Actions.Short_exit.value
+                            and self._position == Positions.Short)
+            if closes_long or closes_short:
+                if pnl > self.profit_aim * self.rr:
+                    factor *= self.rl_config['model_reward_parameters'].get('win_reward_factor', 2)
+                return float(pnl * factor)
+
+            return 0.
+
+
+def make_env(MyRLEnv: Type[gym.Env], env_id: str, rank: int,
+             seed: int, train_df: DataFrame, price: DataFrame,
+             reward_params: Dict[str, int], window_size: int, monitor: bool = False,
+             config: Dict[str, Any] = {}) -> Callable:
+    """
+    Utility function for multiprocessed env.
+
+    :param MyRLEnv: (Type[gym.Env]) environment class to instantiate
+    :param env_id: (str) the environment ID
+    :param rank: (int) index of the subprocess, added to `seed` for the env seed
+    :param seed: (int) the initial seed for RNG
+    :param train_df: (DataFrame) features passed to the environment as `df`
+    :param price: (DataFrame) prices passed to the environment as `prices`
+    :param reward_params: (dict) passed to the environment as `reward_kwargs`
+    :param window_size: (int) passed to the environment as `window_size`
+    :param monitor: (bool) wrap the environment in `Monitor` when True
+    :param config: (dict) passed to the environment as `config`
+    :return: (Callable) zero-argument function that builds the environment
+    """
+    # NOTE(review): the `{}` default of `config` is one dict shared across calls;
+    # it is only passed through here, never mutated.
+    set_random_seed(seed)
+
+    def _init() -> gym.Env:
+        base_env = MyRLEnv(df=train_df, prices=price, window_size=window_size,
+                           reward_kwargs=reward_params, id=env_id, seed=seed + rank,
+                           config=config)
+        return Monitor(base_env) if monitor else base_env
+
+    return _init
@@ -1,4 +1,5 @@
 import collections
+import importlib
 import logging
 import re
 import shutil
@@ -98,6 +99,12 @@ class FreqaiDataDrawer:
        self.empty_pair_dict: pair_info = {
                "model_filename": "", "trained_timestamp": 0,
                "data_path": "", "extras": {}}
+        if 'Reinforcement' in self.config['freqaimodel']:
+            self.model_type = 'stable_baselines'
+            logger.warning('User passed a ReinforcementLearner model, FreqAI will '
+                           'now use stable_baselines3 to save models.')
+        else:
+            self.model_type = self.freqai_info.get('model_save_type', 'joblib')

    def update_metric_tracker(self, metric: str, value: float, pair: str) -> None:
        """
@@ -476,10 +483,12 @@ class FreqaiDataDrawer:
        save_path = Path(dk.data_path)

        # Save the trained model
-        if not dk.keras:
+        if self.model_type == 'joblib':
            dump(model, save_path / f"{dk.model_filename}_model.joblib")
-        else:
+        elif self.model_type == 'keras':
            model.save(save_path / f"{dk.model_filename}_model.h5")
+        elif 'stable_baselines' in self.model_type:
+            model.save(save_path / f"{dk.model_filename}_model.zip")

        if dk.svm_model is not None:
            dump(dk.svm_model, save_path / f"{dk.model_filename}_svm_model.joblib")
@@ -506,11 +515,10 @@ class FreqaiDataDrawer:
                dk.pca, open(dk.data_path / f"{dk.model_filename}_pca_object.pkl", "wb")
            )

-        # if self.live:
-        # store as much in ram as possible to increase performance
        self.model_dictionary[coin] = model
        self.pair_dict[coin]["model_filename"] = dk.model_filename
        self.pair_dict[coin]["data_path"] = str(dk.data_path)
+
        if coin not in self.meta_data_dictionary:
            self.meta_data_dictionary[coin] = {}
        self.meta_data_dictionary[coin]["train_df"] = dk.data_dictionary["train_features"]
@@ -542,14 +550,6 @@ class FreqaiDataDrawer:
        if dk.live:
            dk.model_filename = self.pair_dict[coin]["model_filename"]
            dk.data_path = Path(self.pair_dict[coin]["data_path"])
-            if self.freqai_info.get("follow_mode", False):
-                # follower can be on a different system which is rsynced from the leader:
-                dk.data_path = Path(
-                    self.config["user_data_dir"]
-                    / "models"
-                    / dk.data_path.parts[-2]
-                    / dk.data_path.parts[-1]
-                )

        if coin in self.meta_data_dictionary:
            dk.data = self.meta_data_dictionary[coin]["meta_data"]
@@ -568,12 +568,16 @@ class FreqaiDataDrawer:
        # try to access model in memory instead of loading object from disk to save time
        if dk.live and coin in self.model_dictionary:
            model = self.model_dictionary[coin]
-        elif not dk.keras:
+        elif self.model_type == 'joblib':
            model = load(dk.data_path / f"{dk.model_filename}_model.joblib")
-        else:
+        elif self.model_type == 'keras':
            from tensorflow import keras
-
            model = keras.models.load_model(dk.data_path / f"{dk.model_filename}_model.h5")
+        elif self.model_type == 'stable_baselines':
+            mod = importlib.import_module(
+                'stable_baselines3', self.freqai_info['rl_config']['model_type'])
+            MODELCLASS = getattr(mod, self.freqai_info['rl_config']['model_type'])
+            model = MODELCLASS.load(dk.data_path / f"{dk.model_filename}_model")

        if Path(dk.data_path / f"{dk.model_filename}_svm_model.joblib").is_file():
            dk.svm_model = load(dk.data_path / f"{dk.model_filename}_svm_model.joblib")
@@ -583,6 +587,10 @@ class FreqaiDataDrawer:
                f"Unable to load model, ensure model exists at " f"{dk.data_path} "
            )

+        # load it into ram if it was loaded from disk
+        if coin not in self.model_dictionary:
+            self.model_dictionary[coin] = model
+
        if self.config["freqai"]["feature_parameters"]["principal_component_analysis"]:
            dk.pca = cloudpickle.load(
                open(dk.data_path / f"{dk.model_filename}_pca_object.pkl", "rb")
@@ -9,6 +9,7 @@ from typing import Any, Dict, List, Tuple
 import numpy as np
 import numpy.typing as npt
 import pandas as pd
+import psutil
 from pandas import DataFrame
 from scipy import stats
 from sklearn import linear_model
@@ -102,7 +103,10 @@ class FreqaiDataKitchen:
                )

        self.data['extra_returns_per_train'] = self.freqai_config.get('extra_returns_per_train', {})
-        self.thread_count = self.freqai_config.get("data_kitchen_thread_count", -1)
+        if not self.freqai_config.get("data_kitchen_thread_count", 0):
+            self.thread_count = max(int(psutil.cpu_count() * 2 - 2), 1)
+        else:
+            self.thread_count = self.freqai_config["data_kitchen_thread_count"]
        self.train_dates: DataFrame = pd.DataFrame()
        self.unique_classes: Dict[str, list] = {}
        self.unique_class_list: list = []
@@ -5,15 +5,17 @@ from abc import ABC, abstractmethod
 from collections import deque
 from datetime import datetime, timezone
 from pathlib import Path
-from typing import Any, Dict, List, Literal, Tuple
+from typing import Any, Dict, List, Literal, Optional, Tuple

 import numpy as np
 import pandas as pd
+import psutil
 from numpy.typing import NDArray
 from pandas import DataFrame

 from freqtrade.configuration import TimeRange
 from freqtrade.constants import Config
+from freqtrade.data.dataprovider import DataProvider
 from freqtrade.enums import RunMode
 from freqtrade.exceptions import OperationalException
 from freqtrade.exchange import timeframe_to_seconds
@@ -98,6 +100,8 @@ class IFreqaiModel(ABC):
        self.get_corr_dataframes: bool = True
        self._threads: List[threading.Thread] = []
        self._stop_event = threading.Event()
+        self.data_provider: Optional[DataProvider] = None
+        self.max_system_threads = max(int(psutil.cpu_count() * 2 - 2), 1)

        record_params(config, self.full_path)

@@ -126,6 +130,7 @@ class IFreqaiModel(ABC):

        self.live = strategy.dp.runmode in (RunMode.DRY_RUN, RunMode.LIVE)
        self.dd.set_pair_dict_info(metadata)
+        self.data_provider = strategy.dp

        if self.live:
            self.inference_timer('start')
@@ -164,6 +169,13 @@ class IFreqaiModel(ABC):
        self.model = None
        self.dk = None

+    def _on_stop(self):
+        """
+        No-op shutdown hook. Subclasses override it to include logic for
+        shutting down their resources when SIGINT is sent.
+        """
+        return None
+
    def shutdown(self):
        """
        Cleans up threads on Shutdown, set stop event. Join threads to wait
@@ -172,6 +184,9 @@ class IFreqaiModel(ABC):
        logger.info("Stopping FreqAI")
        self._stop_event.set()

+        self.data_provider = None
+        self._on_stop()
+
        logger.info("Waiting on Training iteration")
        for _thread in self._threads:
            _thread.join()
@@ -0,0 +1,141 @@
+import logging
+from pathlib import Path
+from typing import Any, Dict
+
+import torch as th
+
+from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
+from freqtrade.freqai.RL.Base5ActionRLEnv import Actions, Base5ActionRLEnv, Positions
+from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel
+
+
+logger = logging.getLogger(__name__)
+
+
+class ReinforcementLearner(BaseReinforcementLearningModel):
+    """
+    Reinforcement Learning prediction model.
+
+    Users can inherit from this class to make their own RL model with custom
+    environment/training controls. Define the file as follows:
+
+    ```
+    from freqtrade.freqai.prediction_models.ReinforcementLearner import ReinforcementLearner
+
+    class MyCoolRLModel(ReinforcementLearner):
+    ```
+
+    Save the file to `user_data/freqaimodels`, then run it with:
+
+    freqtrade trade --freqaimodel MyCoolRLModel --config config.json --strategy SomeCoolStrat
+
+    Here the users can override any of the functions
+    available in the `IFreqaiModel` inheritance tree. Most importantly for RL, this
+    is where the user overrides `MyRLEnv` (see below), to define custom
+    `calculate_reward()` function, or to override any other parts of the environment.
+
+    This class also allows users to override any other part of the IFreqaiModel tree.
+    For example, the user can override `def fit()` or `def train()` or `def predict()`
+    to take fine-tuned control over these processes.
+
+    Another common override may be `def data_cleaning_predict()` where the user can
+    take fine-tuned control over the data handling pipeline.
+    """
+
+    def fit(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen, **kwargs):
+        """
+        User customizable fit method
+        :param data_dictionary: dict = common data dictionary containing all train/test
+            features/labels/weights.
+        :param dk: FreqaiDatakitchen = data kitchen for current pair.
+        :return:
+        model Any = trained model to be used for inference in dry/live/backtesting.
+            This is the "best_model" written to `dk.data_path` if that file exists,
+            otherwise the model as it stands after training.
+        """
+        train_df = data_dictionary["train_features"]
+        # one train cycle is one pass over the training features
+        total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df)
+
+        policy_kwargs = dict(activation_fn=th.nn.ReLU,
+                             net_arch=self.net_arch)
+
+        if dk.pair not in self.dd.model_dictionary or not self.continual_learning:
+            # fresh agent: no previous model for this pair, or continual learning is off
+            model = self.MODELCLASS(self.policy_type, self.train_env, policy_kwargs=policy_kwargs,
+                                    tensorboard_log=Path(
+                                        dk.full_path / "tensorboard" / dk.pair.split('/')[0]),
+                                    **self.freqai_info['model_training_parameters']
+                                    )
+        else:
+            logger.info('Continual training activated - starting training from previously '
+                        'trained agent.')
+            model = self.dd.model_dictionary[dk.pair]
+            model.set_env(self.train_env)
+
+        model.learn(
+            total_timesteps=int(total_timesteps),
+            callback=self.eval_callback
+        )
+
+        if Path(dk.data_path / "best_model.zip").is_file():
+            logger.info('Callback found a best model.')
+            best_model = self.MODELCLASS.load(dk.data_path / "best_model")
+            return best_model
+
+        logger.info('Couldnt find best model, using final model instead.')
+
+        return model
+
+    class MyRLEnv(Base5ActionRLEnv):
+        """
+        User can override any function in BaseRLEnv and gym.Env. Here the user
+        sets a custom reward based on profit and trade duration.
+        """
+
+        def calculate_reward(self, action: int) -> float:
+            """
+            An example reward function. This is the one function that users will likely
+            wish to inject their own creativity into.
+            :param action: int = The action made by the agent for the current candle.
+            :return:
+            float = the reward to give to the agent for current step (used for optimization
+                of weights in NN)
+            """
+            # first, penalize if the action is not valid
+            if not self._is_valid(action):
+                return -2
+
+            pnl = self.get_unrealized_profit()
+            factor = 100.
+
+            # reward agent for entering trades
+            if (action in (Actions.Long_enter.value, Actions.Short_enter.value)
+                    and self._position == Positions.Neutral):
+                return 25
+            # discourage agent from not entering trades
+            if action == Actions.Neutral.value and self._position == Positions.Neutral:
+                return -1
+
+            max_trade_duration = self.rl_config.get('max_trade_duration_candles', 300)
+            # _last_trade_tick can be unset while no trade has been entered yet; count
+            # that as a zero-length trade instead of failing on the subtraction
+            if self._last_trade_tick is not None:
+                trade_duration = self._current_tick - self._last_trade_tick
+            else:
+                trade_duration = 0
+
+            # trades within the allowed duration get a larger factor than overlong ones
+            if trade_duration <= max_trade_duration:
+                factor *= 1.5
+            else:
+                factor *= 0.5
+
+            # discourage sitting in position
+            if (self._position in (Positions.Short, Positions.Long) and
+                    action == Actions.Neutral.value):
+                return -1 * trade_duration / max_trade_duration
+
+            # close long
+            if action == Actions.Long_exit.value and self._position == Positions.Long:
+                if pnl > self.profit_aim * self.rr:
+                    factor *= self.rl_config['model_reward_parameters'].get('win_reward_factor', 2)
+                return float(pnl * factor)
+
+            # close short
+            if action == Actions.Short_exit.value and self._position == Positions.Short:
+                if pnl > self.profit_aim * self.rr:
+                    factor *= self.rl_config['model_reward_parameters'].get('win_reward_factor', 2)
+                return float(pnl * factor)
+
+            return 0.
@@ -0,0 +1,51 @@
+import logging
+from typing import Any, Dict  # , Tuple
+
+# import numpy.typing as npt
+from pandas import DataFrame
+from stable_baselines3.common.callbacks import EvalCallback
+from stable_baselines3.common.vec_env import SubprocVecEnv
+
+from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
+from freqtrade.freqai.prediction_models.ReinforcementLearner import ReinforcementLearner
+from freqtrade.freqai.RL.BaseReinforcementLearningModel import make_env
+
+
+logger = logging.getLogger(__name__)
+
+
+class ReinforcementLearner_multiproc(ReinforcementLearner):
+    """
+    Demonstration of how to build vectorized environments
+    """
+
+    def set_train_and_eval_environments(self, data_dictionary: Dict[str, Any],
+                                        prices_train: DataFrame, prices_test: DataFrame,
+                                        dk: FreqaiDataKitchen):
+        """
+        Build the vectorized train and eval environments plus the eval callback.
+        User can override this if they are using a custom MyRLEnv
+        :param data_dictionary: dict = common data dictionary containing train and test
+            features/labels/weights.
+        :param prices_train/test: DataFrame = dataframe comprised of the prices to be used in
+            the environment during training or testing
+        :param dk: FreqaiDataKitchen = the datakitchen for the current pair
+        """
+        train_df = data_dictionary["train_features"]
+        test_df = data_dictionary["test_features"]
+
+        def _vec_env(env_id: str, features: DataFrame, prices: DataFrame) -> SubprocVecEnv:
+            # one monitored environment factory per thread, all built with seed 1
+            env_fns = [
+                make_env(self.MyRLEnv, env_id, rank, 1, features, prices,
+                         self.reward_params, self.CONV_WIDTH, monitor=True,
+                         config=self.config)
+                for rank in range(self.max_threads)
+            ]
+            return SubprocVecEnv(env_fns)
+
+        self.train_env = _vec_env("train_env", train_df, prices_train)
+        self.eval_env = _vec_env('eval_env', test_df, prices_test)
+
+        self.eval_callback = EvalCallback(self.eval_env, deterministic=True,
+                                          render=False, eval_freq=len(train_df),
+                                          best_model_save_path=str(dk.data_path))