refactor environment inheritance tree to accommodate flexible action types/counts. fix bug in train profit handling

This commit is contained in:
robcaulk 2022-08-28 19:21:57 +02:00
parent 8c313b431d
commit 7766350c15
8 changed files with 339 additions and 440 deletions
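The refactor moves the episode, fee, and profit bookkeeping that was duplicated across Base4ActionRLEnv and Base5ActionRLEnv into a single BaseEnvironment; the concrete environments now only declare their Actions enum, override set_action_space(), and implement step(). A minimal sketch of the resulting layout (simplified from the files below; enum values other than Short_enter = 3 are assumed, and the real base class marks set_action_space() with @abstractmethod):

from enum import Enum

import gym
from gym import spaces


class BaseEnvironment(gym.Env):
    """Action-count-agnostic base: episode state, fees and profit logic live here."""

    def set_action_space(self):
        # each subclass declares its own action count
        raise NotImplementedError


class Actions(Enum):
    Neutral = 0
    Exit = 1
    Long_enter = 2
    Short_enter = 3


class Base4ActionRLEnv(BaseEnvironment):
    def set_action_space(self):
        self.action_space = spaces.Discrete(len(Actions))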

View File

@@ -1,14 +1,11 @@
import logging import logging
from enum import Enum from enum import Enum
from typing import Optional
import gym
import numpy as np
from gym import spaces from gym import spaces
from gym.utils import seeding
from pandas import DataFrame from freqtrade.freqai.RL.BaseEnvironment import BaseEnvironment, Positions
import pandas as pd
from abc import abstractmethod
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -19,95 +16,13 @@ class Actions(Enum):
Short_enter = 3 Short_enter = 3
class Base4ActionRLEnv(BaseEnvironment):
class Positions(Enum):
Short = 0
Long = 1
Neutral = 0.5
def opposite(self):
return Positions.Short if self == Positions.Long else Positions.Long
def mean_over_std(x):
std = np.std(x, ddof=1)
mean = np.mean(x)
return mean / std if std > 0 else 0
class Base4ActionRLEnv(gym.Env):
""" """
Base class for a 5 action environment Base class for a 4 action environment
""" """
metadata = {'render.modes': ['human']}
def __init__(self, df: DataFrame = DataFrame(), prices: DataFrame = DataFrame(), def set_action_space(self):
reward_kwargs: dict = {}, window_size=10, starting_point=True,
id: str = 'baseenv-1', seed: int = 1, config: dict = {}):
self.rl_config = config['freqai']['rl_config']
self.id = id
self.seed(seed)
self.reset_env(df, prices, window_size, reward_kwargs, starting_point)
def reset_env(self, df: DataFrame, prices: DataFrame, window_size: int,
reward_kwargs: dict, starting_point=True):
self.df = df
self.signal_features = self.df
self.prices = prices
self.window_size = window_size
self.starting_point = starting_point
self.rr = reward_kwargs["rr"]
self.profit_aim = reward_kwargs["profit_aim"]
self.fee = 0.0015
# # spaces
self.shape = (window_size, self.signal_features.shape[1] + 3)
self.action_space = spaces.Discrete(len(Actions)) self.action_space = spaces.Discrete(len(Actions))
self.observation_space = spaces.Box(
low=-np.inf, high=np.inf, shape=self.shape, dtype=np.float32)
# episode
self._start_tick: int = self.window_size
self._end_tick: int = len(self.prices) - 1
self._done: bool = False
self._current_tick: int = self._start_tick
self._last_trade_tick: Optional[int] = None
self._position = Positions.Neutral
self._position_history: list = [None]
self.total_reward: float = 0
self._total_profit: float = 1
self.history: dict = {}
self.trade_history: list = []
def seed(self, seed: int = 1):
self.np_random, seed = seeding.np_random(seed)
return [seed]
def reset(self):
self._done = False
if self.starting_point is True:
self._position_history = (self._start_tick * [None]) + [self._position]
else:
self._position_history = (self.window_size * [None]) + [self._position]
self._current_tick = self._start_tick
self._last_trade_tick = None
self._position = Positions.Neutral
self.total_reward = 0.
self._total_profit = 1. # unit
self.history = {}
self.trade_history = []
self.portfolio_log_returns = np.zeros(len(self.prices))
self._profits = [(self._start_tick, 1)]
self.close_trade_profit = []
return self._get_observation()
def step(self, action: int): def step(self, action: int):
self._done = False self._done = False
@@ -181,43 +96,6 @@ class Base4ActionRLEnv(gym.Env):
return observation, step_reward, self._done, info return observation, step_reward, self._done, info
def _get_observation(self):
features_window = self.signal_features[(
self._current_tick - self.window_size):self._current_tick]
features_and_state = DataFrame(np.zeros((len(features_window), 3)),
columns=['current_profit_pct', 'position', 'trade_duration'],
index=features_window.index)
features_and_state['current_profit_pct'] = self.get_unrealized_profit()
features_and_state['position'] = self._position.value
features_and_state['trade_duration'] = self.get_trade_duration()
features_and_state = pd.concat([features_window, features_and_state], axis=1)
return features_and_state
def get_trade_duration(self):
if self._last_trade_tick is None:
return 0
else:
return self._current_tick - self._last_trade_tick
def get_unrealized_profit(self):
if self._last_trade_tick is None:
return 0.
if self._position == Positions.Neutral:
return 0.
elif self._position == Positions.Short:
current_price = self.add_entry_fee(self.prices.iloc[self._current_tick].open)
last_trade_price = self.add_exit_fee(self.prices.iloc[self._last_trade_tick].open)
return (last_trade_price - current_price) / last_trade_price
elif self._position == Positions.Long:
current_price = self.add_exit_fee(self.prices.iloc[self._current_tick].open)
last_trade_price = self.add_entry_fee(self.prices.iloc[self._last_trade_tick].open)
return (current_price - last_trade_price) / last_trade_price
else:
return 0.
def is_tradesignal(self, action: int): def is_tradesignal(self, action: int):
# trade signal # trade signal
""" """
@@ -228,7 +106,7 @@ class Base4ActionRLEnv(gym.Env):
(action == Actions.Neutral.value and self._position == Positions.Short) or (action == Actions.Neutral.value and self._position == Positions.Short) or
(action == Actions.Neutral.value and self._position == Positions.Long) or (action == Actions.Neutral.value and self._position == Positions.Long) or
(action == Actions.Short_enter.value and self._position == Positions.Short) or (action == Actions.Short_enter.value and self._position == Positions.Short) or
(action == Actions.Short_enter.value and self._position == Positions.Long) or (action == Actions.Short_enter.value and self._position == Positions.Long) or
(action == Actions.Exit.value and self._position == Positions.Neutral) or (action == Actions.Exit.value and self._position == Positions.Neutral) or
(action == Actions.Long_enter.value and self._position == Positions.Long) or (action == Actions.Long_enter.value and self._position == Positions.Long) or
(action == Actions.Long_enter.value and self._position == Positions.Short)) (action == Actions.Long_enter.value and self._position == Positions.Short))
@@ -240,7 +118,7 @@ class Base4ActionRLEnv(gym.Env):
e.g.: agent wants a Actions.Long_exit while it is in a Positions.short e.g.: agent wants a Actions.Long_exit while it is in a Positions.short
""" """
# Agent should only try to exit if it is in position # Agent should only try to exit if it is in position
if action in (Actions.Exit.value): if action == Actions.Exit.value:
if self._position not in (Positions.Short, Positions.Long): if self._position not in (Positions.Short, Positions.Long):
return False return False
@@ -250,97 +128,3 @@ class Base4ActionRLEnv(gym.Env):
return False return False
return True return True
def _is_trade(self, action: Actions):
return ((action == Actions.Long_enter.value and self._position == Positions.Neutral) or
(action == Actions.Short_enter.value and self._position == Positions.Neutral))
def is_hold(self, action):
return ((action == Actions.Short_enter.value and self._position == Positions.Short) or
(action == Actions.Long_enter.value and self._position == Positions.Long) or
(action == Actions.Neutral.value and self._position == Positions.Long) or
(action == Actions.Neutral.value and self._position == Positions.Short) or
(action == Actions.Neutral.value and self._position == Positions.Neutral))
def add_entry_fee(self, price):
return price * (1 + self.fee)
def add_exit_fee(self, price):
return price / (1 + self.fee)
def _update_history(self, info):
if not self.history:
self.history = {key: [] for key in info.keys()}
for key, value in info.items():
self.history[key].append(value)
def get_sharpe_ratio(self):
return mean_over_std(self.get_portfolio_log_returns())
@abstractmethod
def calculate_reward(self, action):
"""
Reward is created by BaseReinforcementLearningModel and can
be inherited/edited by the user made ReinforcementLearner file.
"""
return 0.
def _update_profit(self, action):
if self._is_trade(action) or self._done:
pnl = self.get_unrealized_profit()
if self._position in (Positions.Long, Positions.Short):
self._total_profit *= (1 + pnl)
self._profits.append((self._current_tick, self._total_profit))
self.close_trade_profit.append(pnl)
def most_recent_return(self, action: int):
"""
Calculate the tick to tick return if in a trade.
Return is generated from rising prices in Long
and falling prices in Short positions.
The actions Sell/Buy or Hold during a Long position trigger the sell/buy-fee.
"""
# Long positions
if self._position == Positions.Long:
current_price = self.prices.iloc[self._current_tick].open
previous_price = self.prices.iloc[self._current_tick - 1].open
if (self._position_history[self._current_tick - 1] == Positions.Short
or self._position_history[self._current_tick - 1] == Positions.Neutral):
previous_price = self.add_entry_fee(previous_price)
return np.log(current_price) - np.log(previous_price)
# Short positions
if self._position == Positions.Short:
current_price = self.prices.iloc[self._current_tick].open
previous_price = self.prices.iloc[self._current_tick - 1].open
if (self._position_history[self._current_tick - 1] == Positions.Long
or self._position_history[self._current_tick - 1] == Positions.Neutral):
previous_price = self.add_exit_fee(previous_price)
return np.log(previous_price) - np.log(current_price)
return 0
def get_portfolio_log_returns(self):
return self.portfolio_log_returns[1:self._current_tick + 1]
def update_portfolio_log_returns(self, action):
self.portfolio_log_returns[self._current_tick] = self.most_recent_return(action)
def current_price(self) -> float:
return self.prices.iloc[self._current_tick].open
def prev_price(self) -> float:
return self.prices.iloc[self._current_tick - 1].open
def sharpe_ratio(self):
if len(self.close_trade_profit) == 0:
return 0.
returns = np.array(self.close_trade_profit)
reward = (np.mean(returns) - 0. + 1e-9) / (np.std(returns) + 1e-9)
return reward
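Note on the _is_valid change above: (Actions.Exit.value) is a parenthesised int rather than a tuple, so the old membership test raises TypeError the moment it runs. A small self-contained illustration (enum values other than Short_enter = 3 are assumed from the rest of the file):

from enum import Enum


class Actions(Enum):
    Neutral = 0
    Exit = 1
    Long_enter = 2
    Short_enter = 3


action = Actions.Exit.value

# Old check: `action in (Actions.Exit.value)` fails with
# "TypeError: argument of type 'int' is not iterable" because the right-hand
# side is the int 1, not a tuple; a tuple would need a trailing comma,
# i.e. (Actions.Exit.value,). The commit switches to plain equality instead:
assert action == Actions.Exit.value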

View File

@@ -1,14 +1,14 @@
import logging import logging
from enum import Enum from enum import Enum
from typing import Optional
import gym
import numpy as np import numpy as np
from gym import spaces
from gym.utils import seeding
from pandas import DataFrame
import pandas as pd import pandas as pd
from abc import abstractmethod from gym import spaces
from pandas import DataFrame
from freqtrade.freqai.RL.BaseEnvironment import BaseEnvironment, Positions
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -20,70 +20,19 @@ class Actions(Enum):
Short_exit = 4 Short_exit = 4
class Positions(Enum):
Short = 0
Long = 1
Neutral = 0.5
def opposite(self):
return Positions.Short if self == Positions.Long else Positions.Long
def mean_over_std(x): def mean_over_std(x):
std = np.std(x, ddof=1) std = np.std(x, ddof=1)
mean = np.mean(x) mean = np.mean(x)
return mean / std if std > 0 else 0 return mean / std if std > 0 else 0
class Base5ActionRLEnv(gym.Env): class Base5ActionRLEnv(BaseEnvironment):
""" """
Base class for a 5 action environment Base class for a 5 action environment
""" """
metadata = {'render.modes': ['human']}
def __init__(self, df: DataFrame = DataFrame(), prices: DataFrame = DataFrame(), def set_action_space(self):
reward_kwargs: dict = {}, window_size=10, starting_point=True,
id: str = 'baseenv-1', seed: int = 1, config: dict = {}):
self.rl_config = config['freqai']['rl_config']
self.id = id
self.seed(seed)
self.reset_env(df, prices, window_size, reward_kwargs, starting_point)
def reset_env(self, df: DataFrame, prices: DataFrame, window_size: int,
reward_kwargs: dict, starting_point=True):
self.df = df
self.signal_features = self.df
self.prices = prices
self.window_size = window_size
self.starting_point = starting_point
self.rr = reward_kwargs["rr"]
self.profit_aim = reward_kwargs["profit_aim"]
self.fee = 0.0015
# # spaces
self.shape = (window_size, self.signal_features.shape[1] + 3)
self.action_space = spaces.Discrete(len(Actions)) self.action_space = spaces.Discrete(len(Actions))
self.observation_space = spaces.Box(
low=-np.inf, high=np.inf, shape=self.shape, dtype=np.float32)
# episode
self._start_tick: int = self.window_size
self._end_tick: int = len(self.prices) - 1
self._done: bool = False
self._current_tick: int = self._start_tick
self._last_trade_tick: Optional[int] = None
self._position = Positions.Neutral
self._position_history: list = [None]
self.total_reward: float = 0
self._total_profit: float = 1
self.history: dict = {}
self.trade_history: list = []
def seed(self, seed: int = 1):
self.np_random, seed = seeding.np_random(seed)
return [seed]
def reset(self): def reset(self):
@@ -106,6 +55,7 @@ class Base5ActionRLEnv(gym.Env):
self._profits = [(self._start_tick, 1)] self._profits = [(self._start_tick, 1)]
self.close_trade_profit = [] self.close_trade_profit = []
self._total_unrealized_profit = 1
return self._get_observation() return self._get_observation()
@@ -118,7 +68,7 @@ class Base5ActionRLEnv(gym.Env):
self.update_portfolio_log_returns(action) self.update_portfolio_log_returns(action)
self._update_profit(action) self._update_unrealized_total_profit()
step_reward = self.calculate_reward(action) step_reward = self.calculate_reward(action)
self.total_reward += step_reward self.total_reward += step_reward
@@ -148,10 +98,12 @@ class Base5ActionRLEnv(gym.Env):
trade_type = "short" trade_type = "short"
self._last_trade_tick = self._current_tick self._last_trade_tick = self._current_tick
elif action == Actions.Long_exit.value: elif action == Actions.Long_exit.value:
self._update_total_profit()
self._position = Positions.Neutral self._position = Positions.Neutral
trade_type = "neutral" trade_type = "neutral"
self._last_trade_tick = None self._last_trade_tick = None
elif action == Actions.Short_exit.value: elif action == Actions.Short_exit.value:
self._update_total_profit()
self._position = Positions.Neutral self._position = Positions.Neutral
trade_type = "neutral" trade_type = "neutral"
self._last_trade_tick = None self._last_trade_tick = None
@@ -163,7 +115,8 @@ class Base5ActionRLEnv(gym.Env):
{'price': self.current_price(), 'index': self._current_tick, {'price': self.current_price(), 'index': self._current_tick,
'type': trade_type}) 'type': trade_type})
if self._total_profit < 1 - self.rl_config.get('max_training_drawdown_pct', 0.8): if (self._total_profit < self.max_drawdown or
self._total_unrealized_profit < self.max_drawdown):
self._done = True self._done = True
self._position_history.append(self._position) self._position_history.append(self._position)
@@ -200,24 +153,6 @@ class Base5ActionRLEnv(gym.Env):
else: else:
return self._current_tick - self._last_trade_tick return self._current_tick - self._last_trade_tick
def get_unrealized_profit(self):
if self._last_trade_tick is None:
return 0.
if self._position == Positions.Neutral:
return 0.
elif self._position == Positions.Short:
current_price = self.add_entry_fee(self.prices.iloc[self._current_tick].open)
last_trade_price = self.add_exit_fee(self.prices.iloc[self._last_trade_tick].open)
return (last_trade_price - current_price) / last_trade_price
elif self._position == Positions.Long:
current_price = self.add_exit_fee(self.prices.iloc[self._current_tick].open)
last_trade_price = self.add_entry_fee(self.prices.iloc[self._last_trade_tick].open)
return (current_price - last_trade_price) / last_trade_price
else:
return 0.
def is_tradesignal(self, action: int): def is_tradesignal(self, action: int):
# trade signal # trade signal
""" """
@@ -253,97 +188,3 @@ class Base5ActionRLEnv(gym.Env):
return False return False
return True return True
def _is_trade(self, action: Actions):
return ((action == Actions.Long_enter.value and self._position == Positions.Neutral) or
(action == Actions.Short_enter.value and self._position == Positions.Neutral))
def is_hold(self, action):
return ((action == Actions.Short_enter.value and self._position == Positions.Short) or
(action == Actions.Long_enter.value and self._position == Positions.Long) or
(action == Actions.Neutral.value and self._position == Positions.Long) or
(action == Actions.Neutral.value and self._position == Positions.Short) or
(action == Actions.Neutral.value and self._position == Positions.Neutral))
def add_entry_fee(self, price):
return price * (1 + self.fee)
def add_exit_fee(self, price):
return price / (1 + self.fee)
def _update_history(self, info):
if not self.history:
self.history = {key: [] for key in info.keys()}
for key, value in info.items():
self.history[key].append(value)
def get_sharpe_ratio(self):
return mean_over_std(self.get_portfolio_log_returns())
@abstractmethod
def calculate_reward(self, action):
"""
Reward is created by BaseReinforcementLearningModel and can
be inherited/edited by the user made ReinforcementLearner file.
"""
return 0.
def _update_profit(self, action):
if self._is_trade(action) or self._done:
pnl = self.get_unrealized_profit()
if self._position in (Positions.Long, Positions.Short):
self._total_profit *= (1 + pnl)
self._profits.append((self._current_tick, self._total_profit))
self.close_trade_profit.append(pnl)
def most_recent_return(self, action: int):
"""
Calculate the tick to tick return if in a trade.
Return is generated from rising prices in Long
and falling prices in Short positions.
The actions Sell/Buy or Hold during a Long position trigger the sell/buy-fee.
"""
# Long positions
if self._position == Positions.Long:
current_price = self.prices.iloc[self._current_tick].open
previous_price = self.prices.iloc[self._current_tick - 1].open
if (self._position_history[self._current_tick - 1] == Positions.Short
or self._position_history[self._current_tick - 1] == Positions.Neutral):
previous_price = self.add_entry_fee(previous_price)
return np.log(current_price) - np.log(previous_price)
# Short positions
if self._position == Positions.Short:
current_price = self.prices.iloc[self._current_tick].open
previous_price = self.prices.iloc[self._current_tick - 1].open
if (self._position_history[self._current_tick - 1] == Positions.Long
or self._position_history[self._current_tick - 1] == Positions.Neutral):
previous_price = self.add_exit_fee(previous_price)
return np.log(previous_price) - np.log(current_price)
return 0
def get_portfolio_log_returns(self):
return self.portfolio_log_returns[1:self._current_tick + 1]
def update_portfolio_log_returns(self, action):
self.portfolio_log_returns[self._current_tick] = self.most_recent_return(action)
def current_price(self) -> float:
return self.prices.iloc[self._current_tick].open
def prev_price(self) -> float:
return self.prices.iloc[self._current_tick - 1].open
def sharpe_ratio(self):
if len(self.close_trade_profit) == 0:
return 0.
returns = np.array(self.close_trade_profit)
reward = (np.mean(returns) - 0. + 1e-9) / (np.std(returns) + 1e-9)
return reward
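The train-profit fix shows up in the step() hunks above: realized profit is now booked only when a position is actually closed (Long_exit/Short_exit), an unrealized running total is refreshed every step, and both totals are checked against the drawdown limit. A condensed sketch of that flow follows; step_profit_handling is a hypothetical helper that just restates the relevant diff lines, not a function added by the commit:

def step_profit_handling(env, action) -> None:
    # refreshed every step so an open losing trade can end the episode early
    env._update_unrealized_total_profit()

    # realized profit is only updated when the trade is closed
    if action in (Actions.Long_exit.value, Actions.Short_exit.value):
        env._update_total_profit()
        env._position = Positions.Neutral
        env._last_trade_tick = None

    # env.max_drawdown = 1 - rl_config.get('max_training_drawdown_pct', 0.8)
    if (env._total_profit < env.max_drawdown
            or env._total_unrealized_profit < env.max_drawdown):
        env._done = True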

View File

@@ -0,0 +1,270 @@
import logging
from abc import abstractmethod
from enum import Enum
from typing import Optional
import gym
import numpy as np
import pandas as pd
from gym import spaces
from gym.utils import seeding
from pandas import DataFrame
logger = logging.getLogger(__name__)
class Positions(Enum):
Short = 0
Long = 1
Neutral = 0.5
def opposite(self):
return Positions.Short if self == Positions.Long else Positions.Long
class BaseEnvironment(gym.Env):
"""
Base class for environments. This class is agnostic to action count.
Inherited classes customize this to include varying action counts/types;
see RL/Base5ActionRLEnv.py and RL/Base4ActionRLEnv.py.
"""
def __init__(self, df: DataFrame = DataFrame(), prices: DataFrame = DataFrame(),
reward_kwargs: dict = {}, window_size=10, starting_point=True,
id: str = 'baseenv-1', seed: int = 1, config: dict = {}):
self.rl_config = config['freqai']['rl_config']
self.id = id
self.seed(seed)
self.reset_env(df, prices, window_size, reward_kwargs, starting_point)
self.max_drawdown = 1 - self.rl_config.get('max_training_drawdown_pct', 0.8)
self.compound_trades = config['stake_amount'] == 'unlimited'
def reset_env(self, df: DataFrame, prices: DataFrame, window_size: int,
reward_kwargs: dict, starting_point=True):
self.df = df
self.signal_features = self.df
self.prices = prices
self.window_size = window_size
self.starting_point = starting_point
self.rr = reward_kwargs["rr"]
self.profit_aim = reward_kwargs["profit_aim"]
self.fee = 0.0015
# # spaces
self.shape = (window_size, self.signal_features.shape[1] + 3)
self.set_action_space()
self.observation_space = spaces.Box(
low=-np.inf, high=np.inf, shape=self.shape, dtype=np.float32)
# episode
self._start_tick: int = self.window_size
self._end_tick: int = len(self.prices) - 1
self._done: bool = False
self._current_tick: int = self._start_tick
self._last_trade_tick: Optional[int] = None
self._position = Positions.Neutral
self._position_history: list = [None]
self.total_reward: float = 0
self._total_profit: float = 1
self._total_unrealized_profit: float = 1
self.history: dict = {}
self.trade_history: list = []
@abstractmethod
def set_action_space(self):
"""
Unique to the environment action count. Must be inherited.
"""
def seed(self, seed: int = 1):
self.np_random, seed = seeding.np_random(seed)
return [seed]
def reset(self):
self._done = False
if self.starting_point is True:
self._position_history = (self._start_tick * [None]) + [self._position]
else:
self._position_history = (self.window_size * [None]) + [self._position]
self._current_tick = self._start_tick
self._last_trade_tick = None
self._position = Positions.Neutral
self.total_reward = 0.
self._total_profit = 1. # unit
self.history = {}
self.trade_history = []
self.portfolio_log_returns = np.zeros(len(self.prices))
self._profits = [(self._start_tick, 1)]
self.close_trade_profit = []
self._total_unrealized_profit = 1
return self._get_observation()
@abstractmethod
def step(self, action: int):
"""
Step depends on action types; this must be inherited.
"""
return
def _get_observation(self):
"""
This may or may not be independent of action types; the user can override
this in their custom "MyRLEnv"
"""
features_window = self.signal_features[(
self._current_tick - self.window_size):self._current_tick]
features_and_state = DataFrame(np.zeros((len(features_window), 3)),
columns=['current_profit_pct', 'position', 'trade_duration'],
index=features_window.index)
features_and_state['current_profit_pct'] = self.get_unrealized_profit()
features_and_state['position'] = self._position.value
features_and_state['trade_duration'] = self.get_trade_duration()
features_and_state = pd.concat([features_window, features_and_state], axis=1)
return features_and_state
def get_trade_duration(self):
if self._last_trade_tick is None:
return 0
else:
return self._current_tick - self._last_trade_tick
def get_unrealized_profit(self):
if self._last_trade_tick is None:
return 0.
if self._position == Positions.Neutral:
return 0.
elif self._position == Positions.Short:
current_price = self.add_entry_fee(self.prices.iloc[self._current_tick].open)
last_trade_price = self.add_exit_fee(self.prices.iloc[self._last_trade_tick].open)
return (last_trade_price - current_price) / last_trade_price
elif self._position == Positions.Long:
current_price = self.add_exit_fee(self.prices.iloc[self._current_tick].open)
last_trade_price = self.add_entry_fee(self.prices.iloc[self._last_trade_tick].open)
return (current_price - last_trade_price) / last_trade_price
else:
return 0.
@abstractmethod
def is_tradesignal(self, action: int):
# trade signal
"""
Determine if the signal is a trade signal. This is
unique to the actions in the environment, and therefore must be
inherited.
"""
return
def _is_valid(self, action: int):
# trade signal
"""
Determine if the signal is valid. This is
unique to the actions in the environment, and therefore must be
inherited.
"""
return
def add_entry_fee(self, price):
return price * (1 + self.fee)
def add_exit_fee(self, price):
return price / (1 + self.fee)
def _update_history(self, info):
if not self.history:
self.history = {key: [] for key in info.keys()}
for key, value in info.items():
self.history[key].append(value)
@abstractmethod
def calculate_reward(self, action):
"""
Reward is created by BaseReinforcementLearningModel and can
be inherited/edited by the user made ReinforcementLearner file.
"""
return 0.
def _update_unrealized_total_profit(self):
"""
Update the unrealized total profit in case of episode end.
"""
if self._position in (Positions.Long, Positions.Short):
pnl = self.get_unrealized_profit()
if self.compound_trades:
# assumes unit stake and compounding
unrl_profit = self._total_profit * (1 + pnl)
else:
# assumes unit stake and no compounding
unrl_profit = self._total_profit + pnl
self._total_unrealized_profit = unrl_profit
def _update_total_profit(self):
pnl = self.get_unrealized_profit()
if self.compound_trades:
# assumes unit stake and compounding
self._total_profit = self._total_profit * (1 + pnl)
else:
# assumes unit stake and no compounding
self._total_profit += pnl
def most_recent_return(self, action: int):
"""
Calculate the tick to tick return if in a trade.
Return is generated from rising prices in Long
and falling prices in Short positions.
The actions Sell/Buy or Hold during a Long position trigger the sell/buy-fee.
"""
# Long positions
if self._position == Positions.Long:
current_price = self.prices.iloc[self._current_tick].open
previous_price = self.prices.iloc[self._current_tick - 1].open
if (self._position_history[self._current_tick - 1] == Positions.Short
or self._position_history[self._current_tick - 1] == Positions.Neutral):
previous_price = self.add_entry_fee(previous_price)
return np.log(current_price) - np.log(previous_price)
# Short positions
if self._position == Positions.Short:
current_price = self.prices.iloc[self._current_tick].open
previous_price = self.prices.iloc[self._current_tick - 1].open
if (self._position_history[self._current_tick - 1] == Positions.Long
or self._position_history[self._current_tick - 1] == Positions.Neutral):
previous_price = self.add_exit_fee(previous_price)
return np.log(previous_price) - np.log(current_price)
return 0
def get_portfolio_log_returns(self):
return self.portfolio_log_returns[1:self._current_tick + 1]
def update_portfolio_log_returns(self, action):
self.portfolio_log_returns[self._current_tick] = self.most_recent_return(action)
def current_price(self) -> float:
return self.prices.iloc[self._current_tick].open
def prev_price(self) -> float:
return self.prices.iloc[self._current_tick - 1].open
def sharpe_ratio(self):
if len(self.close_trade_profit) == 0:
return 0.
returns = np.array(self.close_trade_profit)
reward = (np.mean(returns) - 0. + 1e-9) / (np.std(returns) + 1e-9)
return reward
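The new _update_total_profit() / _update_unrealized_total_profit() pair switches between compounded and flat unit-stake accounting based on config['stake_amount'] == 'unlimited'. A small numeric illustration of the two modes (the per-trade returns are made up for the example):

total_profit = 1.0
trade_pnls = [0.02, -0.01, 0.03]       # illustrative per-trade returns

compound_trades = True                 # config['stake_amount'] == 'unlimited'
for pnl in trade_pnls:
    if compound_trades:
        total_profit *= (1 + pnl)      # profits are reinvested
    else:
        total_profit += pnl            # fixed unit stake per trade

# compounding:      1.0 * 1.02 * 0.99 * 1.03 ~= 1.0401
# non-compounding:  1.0 + 0.02 - 0.01 + 0.03  = 1.04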

View File

@@ -1,25 +1,28 @@
import logging import logging
from typing import Any, Dict, Tuple from abc import abstractmethod
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Callable, Dict, Tuple
import gym
import numpy as np import numpy as np
import numpy.typing as npt import numpy.typing as npt
import pandas as pd import pandas as pd
import torch as th
import torch.multiprocessing
from pandas import DataFrame from pandas import DataFrame
from abc import abstractmethod from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.utils import set_random_seed
from freqtrade.exceptions import OperationalException from freqtrade.exceptions import OperationalException
from freqtrade.freqai.data_kitchen import FreqaiDataKitchen from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
from freqtrade.freqai.freqai_interface import IFreqaiModel from freqtrade.freqai.freqai_interface import IFreqaiModel
from freqtrade.freqai.RL.Base5ActionRLEnv import Base5ActionRLEnv, Actions, Positions from freqtrade.freqai.RL.Base5ActionRLEnv import Actions, Base5ActionRLEnv
from freqtrade.freqai.RL.BaseEnvironment import BaseEnvironment, Positions
from freqtrade.persistence import Trade from freqtrade.persistence import Trade
import torch.multiprocessing
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor
import torch as th
from typing import Callable
from datetime import datetime, timezone
from stable_baselines3.common.utils import set_random_seed
import gym
from pathlib import Path
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
torch.multiprocessing.set_sharing_strategy('file_system') torch.multiprocessing.set_sharing_strategy('file_system')
@@ -37,8 +40,8 @@ class BaseReinforcementLearningModel(IFreqaiModel):
super().__init__(config=kwargs['config']) super().__init__(config=kwargs['config'])
th.set_num_threads(self.freqai_info['rl_config'].get('thread_count', 4)) th.set_num_threads(self.freqai_info['rl_config'].get('thread_count', 4))
self.reward_params = self.freqai_info['rl_config']['model_reward_parameters'] self.reward_params = self.freqai_info['rl_config']['model_reward_parameters']
self.train_env: Base5ActionRLEnv = None self.train_env: BaseEnvironment = None
self.eval_env: Base5ActionRLEnv = None self.eval_env: BaseEnvironment = None
self.eval_callback: EvalCallback = None self.eval_callback: EvalCallback = None
self.model_type = self.freqai_info['rl_config']['model_type'] self.model_type = self.freqai_info['rl_config']['model_type']
self.rl_config = self.freqai_info['rl_config'] self.rl_config = self.freqai_info['rl_config']
@@ -194,7 +197,7 @@ class BaseReinforcementLearningModel(IFreqaiModel):
def _predict(window): def _predict(window):
market_side, current_profit, trade_duration = self.get_state_info(dk.pair) market_side, current_profit, trade_duration = self.get_state_info(dk.pair)
observations = dataframe.iloc[window.index] observations = dataframe.iloc[window.index]
observations['current_profit'] = current_profit observations['current_profit_pct'] = current_profit
observations['position'] = market_side observations['position'] = market_side
observations['trade_duration'] = trade_duration observations['trade_duration'] = trade_duration
res, _ = model.predict(observations, deterministic=True) res, _ = model.predict(observations, deterministic=True)
@@ -306,7 +309,7 @@ class BaseReinforcementLearningModel(IFreqaiModel):
return return
def make_env(MyRLEnv: Base5ActionRLEnv, env_id: str, rank: int, def make_env(MyRLEnv: BaseEnvironment, env_id: str, rank: int,
seed: int, train_df: DataFrame, price: DataFrame, seed: int, train_df: DataFrame, price: DataFrame,
reward_params: Dict[str, int], window_size: int, monitor: bool = False, reward_params: Dict[str, int], window_size: int, monitor: bool = False,
config: Dict[str, Any] = {}) -> Callable: config: Dict[str, Any] = {}) -> Callable:
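The rename from 'current_profit' to 'current_profit_pct' in _predict matters because these columns are appended to the live observation and are expected to line up with the state columns that BaseEnvironment._get_observation() produced during training. A hedged sketch of that alignment (add_state_columns is an illustrative helper, not code from the commit):

import pandas as pd


def add_state_columns(observations: pd.DataFrame, current_profit: float,
                      market_side: float, trade_duration: int) -> pd.DataFrame:
    # Same three columns as _get_observation(); before this commit the first
    # one was written as 'current_profit' and therefore never matched the
    # 'current_profit_pct' feature name seen in training.
    observations['current_profit_pct'] = current_profit
    observations['position'] = market_side
    observations['trade_duration'] = trade_duration
    return observations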

View File

@@ -1,19 +1,20 @@
import logging import logging
import torch as th from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Type, Union from typing import Any, Dict, List, Optional, Tuple, Type, Union
from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel
import gym
import torch as th
from stable_baselines3 import DQN from stable_baselines3 import DQN
from stable_baselines3.common.buffers import ReplayBuffer from stable_baselines3.common.buffers import ReplayBuffer
from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
from pathlib import Path
from stable_baselines3.dqn.policies import (CnnPolicy, DQNPolicy, MlpPolicy,
QNetwork)
from torch import nn
import gym
from stable_baselines3.common.torch_layers import (BaseFeaturesExtractor,
FlattenExtractor)
from stable_baselines3.common.type_aliases import GymEnv, Schedule
from stable_baselines3.common.policies import BasePolicy from stable_baselines3.common.policies import BasePolicy
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor, FlattenExtractor
from stable_baselines3.common.type_aliases import GymEnv, Schedule
from stable_baselines3.dqn.policies import CnnPolicy, DQNPolicy, MlpPolicy, QNetwork
from torch import nn
from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)

View File

@@ -7,7 +7,7 @@ import time
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from pathlib import Path from pathlib import Path
from threading import Lock from threading import Lock
from typing import Any, Dict, Tuple, Optional from typing import Any, Dict, Optional, Tuple
import numpy as np import numpy as np
import pandas as pd import pandas as pd

View File

@@ -1,15 +1,14 @@
import logging import logging
from pathlib import Path
from typing import Any, Dict from typing import Any, Dict
import torch as th
from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
from freqtrade.freqai.RL.Base5ActionRLEnv import Actions, Positions
from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel
from pathlib import Path
# from pandas import DataFrame
# from stable_baselines3.common.callbacks import EvalCallback
# from stable_baselines3.common.monitor import Monitor
import numpy as np import numpy as np
import torch as th
from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
from freqtrade.freqai.RL.Base5ActionRLEnv import Actions, Base5ActionRLEnv, Positions
from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -53,7 +52,7 @@ class ReinforcementLearner(BaseReinforcementLearningModel):
return model return model
class MyRLEnv(BaseReinforcementLearningModel.MyRLEnv): class MyRLEnv(Base5ActionRLEnv):
""" """
User can override any function in BaseRLEnv and gym.Env. Here the user User can override any function in BaseRLEnv and gym.Env. Here the user
sets a custom reward based on profit and trade duration. sets a custom reward based on profit and trade duration.
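With the new tree, a user environment subclasses the concrete action environment directly instead of BaseReinforcementLearningModel.MyRLEnv. A minimal sketch of that pattern (the reward values and conditions are arbitrary examples, not the reward shipped in ReinforcementLearner):

from freqtrade.freqai.RL.Base5ActionRLEnv import Actions, Base5ActionRLEnv, Positions


class MyRLEnv(Base5ActionRLEnv):
    """Example user environment: only calculate_reward is overridden."""

    def calculate_reward(self, action: int) -> float:
        # penalise actions the environment would reject
        if not self._is_valid(action):
            return -2.0
        # pay out the unrealized profit when a long is closed
        if action == Actions.Long_exit.value and self._position == Positions.Long:
            return 100.0 * self.get_unrealized_profit()
        # small penalty for idling while a position is open
        if action == Actions.Neutral.value and self._position != Positions.Neutral:
            return -1.0
        return 0.0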

View File

@@ -1,15 +1,16 @@
import logging import logging
from pathlib import Path
from typing import Any, Dict # , Tuple from typing import Any, Dict # , Tuple
# import numpy.typing as npt # import numpy.typing as npt
import torch as th import torch as th
from stable_baselines3.common.callbacks import EvalCallback from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.vec_env import SubprocVecEnv from stable_baselines3.common.vec_env import SubprocVecEnv
from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
from freqtrade.freqai.RL.BaseReinforcementLearningModel import (BaseReinforcementLearningModel, from freqtrade.freqai.RL.BaseReinforcementLearningModel import (BaseReinforcementLearningModel,
make_env) make_env)
from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
from pathlib import Path
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -26,7 +27,7 @@ class ReinforcementLearner_multiproc(BaseReinforcementLearningModel):
# model arch # model arch
policy_kwargs = dict(activation_fn=th.nn.ReLU, policy_kwargs = dict(activation_fn=th.nn.ReLU,
net_arch=[256, 256]) net_arch=[256, 256, 128])
if dk.pair not in self.dd.model_dictionary or not self.continual_learning: if dk.pair not in self.dd.model_dictionary or not self.continual_learning:
model = self.MODELCLASS(self.policy_type, self.train_env, policy_kwargs=policy_kwargs, model = self.MODELCLASS(self.policy_type, self.train_env, policy_kwargs=policy_kwargs,
@@ -64,9 +65,9 @@ class ReinforcementLearner_multiproc(BaseReinforcementLearningModel):
test_df = data_dictionary["test_features"] test_df = data_dictionary["test_features"]
env_id = "train_env" env_id = "train_env"
num_cpu = int(self.freqai_info["rl_config"]["thread_count"] / 2) num_cpu = int(self.freqai_info["rl_config"]["thread_count"])
self.train_env = SubprocVecEnv([make_env(self.MyRLEnv, env_id, i, 1, train_df, prices_train, self.train_env = SubprocVecEnv([make_env(self.MyRLEnv, env_id, i, 1, train_df, prices_train,
self.reward_params, self.CONV_WIDTH, self.reward_params, self.CONV_WIDTH, monitor=True,
config=self.config) for i config=self.config) for i
in range(num_cpu)]) in range(num_cpu)])
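make_env is called here with monitor=True and one environment per configured thread; its body is not part of this diff, but a typical SB3-style factory consistent with the signature and imports shown above would look roughly like this (a sketch under those assumptions, not the project's implementation):

from typing import Any, Callable, Dict

from pandas import DataFrame
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.utils import set_random_seed


def make_env_sketch(MyRLEnv, env_id: str, rank: int, seed: int,
                    train_df: DataFrame, price: DataFrame,
                    reward_params: Dict[str, int], window_size: int,
                    monitor: bool = False,
                    config: Dict[str, Any] = {}) -> Callable:
    def _init():
        # BaseEnvironment.__init__ accepts df/prices/reward_kwargs/window_size/id/seed/config
        env = MyRLEnv(df=train_df, prices=price, reward_kwargs=reward_params,
                      window_size=window_size, id=env_id, seed=seed + rank,
                      config=config)
        return Monitor(env) if monitor else env

    set_random_seed(seed)
    return _init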