add continual retraining feature, handle mypy typing reqs, improve docstrings

robcaulk 2022-08-24 12:54:02 +02:00
parent b708134c1a
commit c0cee5df07
11 changed files with 387 additions and 362 deletions

View File

@ -85,12 +85,13 @@
"verbose": 1 "verbose": 1
}, },
"rl_config": { "rl_config": {
"train_cycles": 10, "train_cycles": 3,
"eval_cycles": 3, "eval_cycles": 3,
"thread_count": 4, "thread_count": 4,
"max_trade_duration_candles": 100, "max_trade_duration_candles": 100,
"model_type": "PPO", "model_type": "PPO",
"policy_type": "MlpPolicy", "policy_type": "MlpPolicy",
"continual_retraining": true,
"model_reward_parameters": { "model_reward_parameters": {
"rr": 1, "rr": 1,
"profit_aim": 0.02, "profit_aim": 0.02,

View File

@ -1,330 +1,330 @@
import logging # import logging
from enum import Enum # from enum import Enum
import gym # import gym
import numpy as np # import numpy as np
import pandas as pd # import pandas as pd
from gym import spaces # from gym import spaces
from gym.utils import seeding # from gym.utils import seeding
from pandas import DataFrame # from pandas import DataFrame
# from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union # # from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
logger = logging.getLogger(__name__) # logger = logging.getLogger(__name__)
class Actions(Enum): # class Actions(Enum):
Short = 0 # Short = 0
Long = 1 # Long = 1
Neutral = 2 # Neutral = 2
class Positions(Enum): # class Positions(Enum):
Short = 0 # Short = 0
Long = 1 # Long = 1
Neutral = 0.5 # Neutral = 0.5
def opposite(self): # def opposite(self):
return Positions.Short if self == Positions.Long else Positions.Long # return Positions.Short if self == Positions.Long else Positions.Long
def mean_over_std(x): # def mean_over_std(x):
std = np.std(x, ddof=1) # std = np.std(x, ddof=1)
mean = np.mean(x) # mean = np.mean(x)
return mean / std if std > 0 else 0 # return mean / std if std > 0 else 0
class Base3ActionRLEnv(gym.Env): # class Base3ActionRLEnv(gym.Env):
metadata = {'render.modes': ['human']} # metadata = {'render.modes': ['human']}
def __init__(self, df: DataFrame = DataFrame(), prices: DataFrame = DataFrame(), # def __init__(self, df: DataFrame = DataFrame(), prices: DataFrame = DataFrame(),
reward_kwargs: dict = {}, window_size=10, starting_point=True, # reward_kwargs: dict = {}, window_size=10, starting_point=True,
id: str = 'baseenv-1', seed: int = 1): # id: str = 'baseenv-1', seed: int = 1):
assert df.ndim == 2 # assert df.ndim == 2
self.id = id # self.id = id
self.seed(seed) # self.seed(seed)
self.reset_env(df, prices, window_size, reward_kwargs, starting_point) # self.reset_env(df, prices, window_size, reward_kwargs, starting_point)
def reset_env(self, df, prices, window_size, reward_kwargs, starting_point=True): # def reset_env(self, df, prices, window_size, reward_kwargs, starting_point=True):
self.df = df # self.df = df
self.signal_features = self.df # self.signal_features = self.df
self.prices = prices # self.prices = prices
self.window_size = window_size # self.window_size = window_size
self.starting_point = starting_point # self.starting_point = starting_point
self.rr = reward_kwargs["rr"] # self.rr = reward_kwargs["rr"]
self.profit_aim = reward_kwargs["profit_aim"] # self.profit_aim = reward_kwargs["profit_aim"]
self.fee = 0.0015 # self.fee = 0.0015
# # spaces # # # spaces
self.shape = (window_size, self.signal_features.shape[1] + 2) # self.shape = (window_size, self.signal_features.shape[1] + 2)
self.action_space = spaces.Discrete(len(Actions)) # self.action_space = spaces.Discrete(len(Actions))
self.observation_space = spaces.Box( # self.observation_space = spaces.Box(
low=-np.inf, high=np.inf, shape=self.shape, dtype=np.float32) # low=-np.inf, high=np.inf, shape=self.shape, dtype=np.float32)
# episode # # episode
self._start_tick = self.window_size # self._start_tick = self.window_size
self._end_tick = len(self.prices) - 1 # self._end_tick = len(self.prices) - 1
self._done = None # self._done = None
self._current_tick = None # self._current_tick = None
self._last_trade_tick = None # self._last_trade_tick = None
self._position = Positions.Neutral # self._position = Positions.Neutral
self._position_history = None # self._position_history = None
self.total_reward = None # self.total_reward = None
self._total_profit = None # self._total_profit = None
self._first_rendering = None # self._first_rendering = None
self.history = None # self.history = None
self.trade_history = [] # self.trade_history = []
def seed(self, seed: int = 1): # def seed(self, seed: int = 1):
self.np_random, seed = seeding.np_random(seed) # self.np_random, seed = seeding.np_random(seed)
return [seed] # return [seed]
def reset(self): # def reset(self):
self._done = False # self._done = False
if self.starting_point is True: # if self.starting_point is True:
self._position_history = (self._start_tick * [None]) + [self._position] # self._position_history = (self._start_tick * [None]) + [self._position]
else: # else:
self._position_history = (self.window_size * [None]) + [self._position] # self._position_history = (self.window_size * [None]) + [self._position]
self._current_tick = self._start_tick # self._current_tick = self._start_tick
self._last_trade_tick = None # self._last_trade_tick = None
self._position = Positions.Neutral # self._position = Positions.Neutral
self.total_reward = 0. # self.total_reward = 0.
self._total_profit = 1. # unit # self._total_profit = 1. # unit
self._first_rendering = True # self._first_rendering = True
self.history = {} # self.history = {}
self.trade_history = [] # self.trade_history = []
self.portfolio_log_returns = np.zeros(len(self.prices)) # self.portfolio_log_returns = np.zeros(len(self.prices))
self._profits = [(self._start_tick, 1)] # self._profits = [(self._start_tick, 1)]
self.close_trade_profit = [] # self.close_trade_profit = []
return self._get_observation() # return self._get_observation()
def step(self, action: int): # def step(self, action: int):
self._done = False # self._done = False
self._current_tick += 1 # self._current_tick += 1
if self._current_tick == self._end_tick: # if self._current_tick == self._end_tick:
self._done = True # self._done = True
self.update_portfolio_log_returns(action) # self.update_portfolio_log_returns(action)
self._update_profit(action) # self._update_profit(action)
step_reward = self.calculate_reward(action) # step_reward = self.calculate_reward(action)
self.total_reward += step_reward # self.total_reward += step_reward
trade_type = None # trade_type = None
if self.is_tradesignal(action): # exclude 3 case not trade # if self.is_tradesignal(action): # exclude 3 case not trade
# Update position # # Update position
""" # """
Action: Neutral, position: Long -> Close Long # Action: Neutral, position: Long -> Close Long
Action: Neutral, position: Short -> Close Short # Action: Neutral, position: Short -> Close Short
Action: Long, position: Neutral -> Open Long # Action: Long, position: Neutral -> Open Long
Action: Long, position: Short -> Close Short and Open Long # Action: Long, position: Short -> Close Short and Open Long
Action: Short, position: Neutral -> Open Short # Action: Short, position: Neutral -> Open Short
Action: Short, position: Long -> Close Long and Open Short # Action: Short, position: Long -> Close Long and Open Short
""" # """
if action == Actions.Neutral.value: # if action == Actions.Neutral.value:
self._position = Positions.Neutral # self._position = Positions.Neutral
trade_type = "neutral" # trade_type = "neutral"
elif action == Actions.Long.value: # elif action == Actions.Long.value:
self._position = Positions.Long # self._position = Positions.Long
trade_type = "long" # trade_type = "long"
elif action == Actions.Short.value: # elif action == Actions.Short.value:
self._position = Positions.Short # self._position = Positions.Short
trade_type = "short" # trade_type = "short"
else: # else:
print("case not defined") # print("case not defined")
# Update last trade tick # # Update last trade tick
self._last_trade_tick = self._current_tick # self._last_trade_tick = self._current_tick
if trade_type is not None: # if trade_type is not None:
self.trade_history.append( # self.trade_history.append(
{'price': self.current_price(), 'index': self._current_tick, # {'price': self.current_price(), 'index': self._current_tick,
'type': trade_type}) # 'type': trade_type})
if self._total_profit < 0.2: # if self._total_profit < 0.2:
self._done = True # self._done = True
self._position_history.append(self._position) # self._position_history.append(self._position)
observation = self._get_observation() # observation = self._get_observation()
info = dict( # info = dict(
tick=self._current_tick, # tick=self._current_tick,
total_reward=self.total_reward, # total_reward=self.total_reward,
total_profit=self._total_profit, # total_profit=self._total_profit,
position=self._position.value # position=self._position.value
) # )
self._update_history(info) # self._update_history(info)
return observation, step_reward, self._done, info # return observation, step_reward, self._done, info
def _get_observation(self): # def _get_observation(self):
features_window = self.signal_features[( # features_window = self.signal_features[(
self._current_tick - self.window_size):self._current_tick] # self._current_tick - self.window_size):self._current_tick]
features_and_state = DataFrame(np.zeros((len(features_window), 2)), # features_and_state = DataFrame(np.zeros((len(features_window), 2)),
columns=['current_profit_pct', 'position'], # columns=['current_profit_pct', 'position'],
index=features_window.index) # index=features_window.index)
features_and_state['current_profit_pct'] = self.get_unrealized_profit() # features_and_state['current_profit_pct'] = self.get_unrealized_profit()
features_and_state['position'] = self._position.value # features_and_state['position'] = self._position.value
features_and_state = pd.concat([features_window, features_and_state], axis=1) # features_and_state = pd.concat([features_window, features_and_state], axis=1)
return features_and_state # return features_and_state
def get_unrealized_profit(self): # def get_unrealized_profit(self):
if self._last_trade_tick is None: # if self._last_trade_tick is None:
return 0. # return 0.
if self._position == Positions.Neutral: # if self._position == Positions.Neutral:
return 0. # return 0.
elif self._position == Positions.Short: # elif self._position == Positions.Short:
current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) # current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) # last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
return (last_trade_price - current_price) / last_trade_price # return (last_trade_price - current_price) / last_trade_price
elif self._position == Positions.Long: # elif self._position == Positions.Long:
current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) # current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) # last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
return (current_price - last_trade_price) / last_trade_price # return (current_price - last_trade_price) / last_trade_price
else: # else:
return 0. # return 0.
def is_tradesignal(self, action: int): # def is_tradesignal(self, action: int):
# trade signal # # trade signal
""" # """
not trade signal is : # not trade signal is :
Action: Neutral, position: Neutral -> Nothing # Action: Neutral, position: Neutral -> Nothing
Action: Long, position: Long -> Hold Long # Action: Long, position: Long -> Hold Long
Action: Short, position: Short -> Hold Short # Action: Short, position: Short -> Hold Short
""" # """
return not ((action == Actions.Neutral.value and self._position == Positions.Neutral) # return not ((action == Actions.Neutral.value and self._position == Positions.Neutral)
or (action == Actions.Short.value and self._position == Positions.Short) # or (action == Actions.Short.value and self._position == Positions.Short)
or (action == Actions.Long.value and self._position == Positions.Long)) # or (action == Actions.Long.value and self._position == Positions.Long))
def _is_trade(self, action: Actions): # def _is_trade(self, action: Actions):
return ((action == Actions.Long.value and self._position == Positions.Short) or # return ((action == Actions.Long.value and self._position == Positions.Short) or
(action == Actions.Short.value and self._position == Positions.Long) or # (action == Actions.Short.value and self._position == Positions.Long) or
(action == Actions.Neutral.value and self._position == Positions.Long) or # (action == Actions.Neutral.value and self._position == Positions.Long) or
(action == Actions.Neutral.value and self._position == Positions.Short) # (action == Actions.Neutral.value and self._position == Positions.Short)
) # )
def is_hold(self, action): # def is_hold(self, action):
return ((action == Actions.Short.value and self._position == Positions.Short) # return ((action == Actions.Short.value and self._position == Positions.Short)
or (action == Actions.Long.value and self._position == Positions.Long)) # or (action == Actions.Long.value and self._position == Positions.Long))
def add_buy_fee(self, price): # def add_buy_fee(self, price):
return price * (1 + self.fee) # return price * (1 + self.fee)
def add_sell_fee(self, price): # def add_sell_fee(self, price):
return price / (1 + self.fee) # return price / (1 + self.fee)
def _update_history(self, info): # def _update_history(self, info):
if not self.history: # if not self.history:
self.history = {key: [] for key in info.keys()} # self.history = {key: [] for key in info.keys()}
for key, value in info.items(): # for key, value in info.items():
self.history[key].append(value) # self.history[key].append(value)
def get_sharpe_ratio(self): # def get_sharpe_ratio(self):
return mean_over_std(self.get_portfolio_log_returns()) # return mean_over_std(self.get_portfolio_log_returns())
def calculate_reward(self, action): # def calculate_reward(self, action):
if self._last_trade_tick is None: # if self._last_trade_tick is None:
return 0. # return 0.
# close long # # close long
if (action == Actions.Short.value or # if (action == Actions.Short.value or
action == Actions.Neutral.value) and self._position == Positions.Long: # action == Actions.Neutral.value) and self._position == Positions.Long:
last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) # last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) # current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
return float(np.log(current_price) - np.log(last_trade_price)) # return float(np.log(current_price) - np.log(last_trade_price))
# close short # # close short
if (action == Actions.Long.value or # if (action == Actions.Long.value or
action == Actions.Neutral.value) and self._position == Positions.Short: # action == Actions.Neutral.value) and self._position == Positions.Short:
last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) # last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) # current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
return float(np.log(last_trade_price) - np.log(current_price)) # return float(np.log(last_trade_price) - np.log(current_price))
return 0. # return 0.
def _update_profit(self, action): # def _update_profit(self, action):
if self._is_trade(action) or self._done: # if self._is_trade(action) or self._done:
pnl = self.get_unrealized_profit() # pnl = self.get_unrealized_profit()
if self._position == Positions.Long: # if self._position == Positions.Long:
self._total_profit = self._total_profit + self._total_profit * pnl # self._total_profit = self._total_profit + self._total_profit * pnl
self._profits.append((self._current_tick, self._total_profit)) # self._profits.append((self._current_tick, self._total_profit))
self.close_trade_profit.append(pnl) # self.close_trade_profit.append(pnl)
if self._position == Positions.Short: # if self._position == Positions.Short:
self._total_profit = self._total_profit + self._total_profit * pnl # self._total_profit = self._total_profit + self._total_profit * pnl
self._profits.append((self._current_tick, self._total_profit)) # self._profits.append((self._current_tick, self._total_profit))
self.close_trade_profit.append(pnl) # self.close_trade_profit.append(pnl)
def most_recent_return(self, action: int): # def most_recent_return(self, action: int):
""" # """
We support Long, Neutral and Short positions. # We support Long, Neutral and Short positions.
Return is generated from rising prices in Long # Return is generated from rising prices in Long
and falling prices in Short positions. # and falling prices in Short positions.
The actions Sell/Buy or Hold during a Long position trigger the sell/buy-fee. # The actions Sell/Buy or Hold during a Long position trigger the sell/buy-fee.
""" # """
# Long positions # # Long positions
if self._position == Positions.Long: # if self._position == Positions.Long:
current_price = self.prices.iloc[self._current_tick].open # current_price = self.prices.iloc[self._current_tick].open
if action == Actions.Short.value or action == Actions.Neutral.value: # if action == Actions.Short.value or action == Actions.Neutral.value:
current_price = self.add_sell_fee(current_price) # current_price = self.add_sell_fee(current_price)
previous_price = self.prices.iloc[self._current_tick - 1].open # previous_price = self.prices.iloc[self._current_tick - 1].open
if (self._position_history[self._current_tick - 1] == Positions.Short # if (self._position_history[self._current_tick - 1] == Positions.Short
or self._position_history[self._current_tick - 1] == Positions.Neutral): # or self._position_history[self._current_tick - 1] == Positions.Neutral):
previous_price = self.add_buy_fee(previous_price) # previous_price = self.add_buy_fee(previous_price)
return np.log(current_price) - np.log(previous_price) # return np.log(current_price) - np.log(previous_price)
# Short positions # # Short positions
if self._position == Positions.Short: # if self._position == Positions.Short:
current_price = self.prices.iloc[self._current_tick].open # current_price = self.prices.iloc[self._current_tick].open
if action == Actions.Long.value or action == Actions.Neutral.value: # if action == Actions.Long.value or action == Actions.Neutral.value:
current_price = self.add_buy_fee(current_price) # current_price = self.add_buy_fee(current_price)
previous_price = self.prices.iloc[self._current_tick - 1].open # previous_price = self.prices.iloc[self._current_tick - 1].open
if (self._position_history[self._current_tick - 1] == Positions.Long # if (self._position_history[self._current_tick - 1] == Positions.Long
or self._position_history[self._current_tick - 1] == Positions.Neutral): # or self._position_history[self._current_tick - 1] == Positions.Neutral):
previous_price = self.add_sell_fee(previous_price) # previous_price = self.add_sell_fee(previous_price)
return np.log(previous_price) - np.log(current_price) # return np.log(previous_price) - np.log(current_price)
return 0 # return 0
def get_portfolio_log_returns(self): # def get_portfolio_log_returns(self):
return self.portfolio_log_returns[1:self._current_tick + 1] # return self.portfolio_log_returns[1:self._current_tick + 1]
def update_portfolio_log_returns(self, action): # def update_portfolio_log_returns(self, action):
self.portfolio_log_returns[self._current_tick] = self.most_recent_return(action) # self.portfolio_log_returns[self._current_tick] = self.most_recent_return(action)
def current_price(self) -> float: # def current_price(self) -> float:
return self.prices.iloc[self._current_tick].open # return self.prices.iloc[self._current_tick].open
def prev_price(self) -> float: # def prev_price(self) -> float:
return self.prices.iloc[self._current_tick - 1].open # return self.prices.iloc[self._current_tick - 1].open
def sharpe_ratio(self) -> float: # def sharpe_ratio(self) -> float:
if len(self.close_trade_profit) == 0: # if len(self.close_trade_profit) == 0:
return 0. # return 0.
returns = np.array(self.close_trade_profit) # returns = np.array(self.close_trade_profit)
reward = (np.mean(returns) - 0. + 1e-9) / (np.std(returns) + 1e-9) # reward = (np.mean(returns) - 0. + 1e-9) / (np.std(returns) + 1e-9)
return reward # return reward
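
The reward arithmetic buried in the commented-out block above is easier to see in isolation. A minimal, self-contained sketch of the fee-adjusted log-return used when closing a long; the function names and the example prices are illustrative, only the 0.0015 fee and the formula come from the code above.

import numpy as np

FEE = 0.0015  # flat fee the environment applies on both entry and exit

def add_buy_fee(price: float) -> float:
    # buying costs slightly more than the quoted open price
    return price * (1 + FEE)

def add_sell_fee(price: float) -> float:
    # selling yields slightly less than the quoted open price
    return price / (1 + FEE)

def close_long_reward(entry_open: float, exit_open: float) -> float:
    # log return between fee-adjusted entry and exit, as in the "close long" branch
    # of calculate_reward(); the "close short" branch is the mirror image
    return float(np.log(add_sell_fee(exit_open)) - np.log(add_buy_fee(entry_open)))

# entering at 100 and exiting at 102 nets roughly +1.7% after both fees
print(close_long_reward(100.0, 102.0))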

View File

@ -1,6 +1,6 @@
import logging import logging
from enum import Enum from enum import Enum
# from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union from typing import Optional
import gym import gym
import numpy as np import numpy as np
@ -44,14 +44,14 @@ class Base5ActionRLEnv(gym.Env):
def __init__(self, df: DataFrame = DataFrame(), prices: DataFrame = DataFrame(), def __init__(self, df: DataFrame = DataFrame(), prices: DataFrame = DataFrame(),
reward_kwargs: dict = {}, window_size=10, starting_point=True, reward_kwargs: dict = {}, window_size=10, starting_point=True,
id: str = 'baseenv-1', seed: int = 1, config: dict = {}): id: str = 'baseenv-1', seed: int = 1, config: dict = {}):
assert df.ndim == 2
self.rl_config = config['freqai']['rl_config'] self.rl_config = config['freqai']['rl_config']
self.id = id self.id = id
self.seed(seed) self.seed(seed)
self.reset_env(df, prices, window_size, reward_kwargs, starting_point) self.reset_env(df, prices, window_size, reward_kwargs, starting_point)
def reset_env(self, df, prices, window_size, reward_kwargs, starting_point=True): def reset_env(self, df: DataFrame, prices: DataFrame, window_size: int,
reward_kwargs: dict, starting_point=True):
self.df = df self.df = df
self.signal_features = self.df self.signal_features = self.df
self.prices = prices self.prices = prices
@ -69,18 +69,18 @@ class Base5ActionRLEnv(gym.Env):
low=-np.inf, high=np.inf, shape=self.shape, dtype=np.float32) low=-np.inf, high=np.inf, shape=self.shape, dtype=np.float32)
# episode # episode
self._start_tick = self.window_size self._start_tick: int = self.window_size
self._end_tick = len(self.prices) - 1 self._end_tick: int = len(self.prices) - 1
self._done = None self._done: bool = False
self._current_tick = None self._current_tick: int = self._start_tick
self._last_trade_tick = None self._last_trade_tick: Optional[int] = None
self._position = Positions.Neutral self._position = Positions.Neutral
self._position_history = None self._position_history: list = [None]
self.total_reward = None self.total_reward: float = 0
self._total_profit = None self._total_profit: float = 0
self._first_rendering = None self._first_rendering: bool = False
self.history = None self.history: dict = {}
self.trade_history = [] self.trade_history: list = []
def seed(self, seed: int = 1): def seed(self, seed: int = 1):
self.np_random, seed = seeding.np_random(seed) self.np_random, seed = seeding.np_random(seed)
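
The hunk above replaces bare None defaults with typed, concretely initialized attributes so mypy can narrow them. Condensed into a standalone sketch; the container class is illustrative, in the commit these attributes sit directly on Base5ActionRLEnv.

from typing import Optional

class EpisodeState:
    # illustrative container for the episode attributes typed in this hunk
    def __init__(self, window_size: int, n_prices: int) -> None:
        self._start_tick: int = window_size
        self._end_tick: int = n_prices - 1
        self._done: bool = False
        self._current_tick: int = self._start_tick
        self._last_trade_tick: Optional[int] = None  # genuinely "no trade yet"
        self._position_history: list = [None]
        self.total_reward: float = 0
        self._total_profit: float = 0
        self._first_rendering: bool = False
        self.history: dict = {}
        self.trade_history: list = []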
@ -125,8 +125,7 @@ class Base5ActionRLEnv(gym.Env):
self.total_reward += step_reward self.total_reward += step_reward
trade_type = None trade_type = None
if self.is_tradesignal(action): # exclude 3 case not trade if self.is_tradesignal(action):
# Update position
""" """
Action: Neutral, position: Long -> Close Long Action: Neutral, position: Long -> Close Long
Action: Neutral, position: Short -> Close Short Action: Neutral, position: Short -> Close Short
@ -223,9 +222,8 @@ class Base5ActionRLEnv(gym.Env):
# trade signal # trade signal
""" """
not trade signal is : not trade signal is :
Action: Neutral, position: Neutral -> Nothing Determine if the signal is non sensical
Action: Long, position: Long -> Hold Long e.g.: agent wants a Actions.Long_exit while it is in a Positions.short
Action: Short, position: Short -> Hold Short
""" """
return not ((action == Actions.Neutral.value and self._position == Positions.Neutral) or return not ((action == Actions.Neutral.value and self._position == Positions.Neutral) or
(action == Actions.Neutral.value and self._position == Positions.Short) or (action == Actions.Neutral.value and self._position == Positions.Short) or
@ -292,7 +290,7 @@ class Base5ActionRLEnv(gym.Env):
def most_recent_return(self, action: int): def most_recent_return(self, action: int):
""" """
We support Long, Neutral and Short positions. Calculate the tick to tick return if in a trade.
Return is generated from rising prices in Long Return is generated from rising prices in Long
and falling prices in Short positions. and falling prices in Short positions.
The actions Sell/Buy or Hold during a Long position trigger the sell/buy-fee. The actions Sell/Buy or Hold during a Long position trigger the sell/buy-fee.
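
The reworded is_tradesignal docstring describes filtering actions that cannot change the current position. A rough, self-contained sketch of that rule for a five-action scheme; the enum layouts and the exact rule table below are assumptions for illustration, the real ones are only partially visible in the hunk.

from enum import Enum

class Actions(Enum):    # assumed five-action layout, for illustration only
    Neutral = 0
    Long_enter = 1
    Long_exit = 2
    Short_enter = 3
    Short_exit = 4

class Positions(Enum):  # assumed layout, for illustration only
    Short = 0
    Long = 1
    Neutral = 0.5

def is_tradesignal(action: int, position: Positions) -> bool:
    # a signal only makes sense if it can change the current position: entering
    # requires being flat, exiting requires holding the matching side, and a
    # Neutral action never changes anything
    nonsense = (
        action == Actions.Neutral.value
        or (action == Actions.Long_enter.value and position != Positions.Neutral)
        or (action == Actions.Short_enter.value and position != Positions.Neutral)
        or (action == Actions.Long_exit.value and position != Positions.Long)
        or (action == Actions.Short_exit.value and position != Positions.Short)
    )
    return not nonsense

# e.g. a Long_exit while short is filtered out, matching the new docstring
assert not is_tradesignal(Actions.Long_exit.value, Positions.Short)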

View File

@ -19,6 +19,7 @@ from typing import Callable
from datetime import datetime, timezone from datetime import datetime, timezone
from stable_baselines3.common.utils import set_random_seed from stable_baselines3.common.utils import set_random_seed
import gym import gym
from pathlib import Path
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
torch.multiprocessing.set_sharing_strategy('file_system') torch.multiprocessing.set_sharing_strategy('file_system')
@ -40,6 +41,8 @@ class BaseReinforcementLearningModel(IFreqaiModel):
self.eval_env: Base5ActionRLEnv = None self.eval_env: Base5ActionRLEnv = None
self.eval_callback: EvalCallback = None self.eval_callback: EvalCallback = None
self.model_type = self.freqai_info['rl_config']['model_type'] self.model_type = self.freqai_info['rl_config']['model_type']
self.rl_config = self.freqai_info['rl_config']
self.continual_retraining = self.rl_config['continual_retraining']
if self.model_type in SB3_MODELS: if self.model_type in SB3_MODELS:
import_str = 'stable_baselines3' import_str = 'stable_baselines3'
elif self.model_type in SB3_CONTRIB_MODELS: elif self.model_type in SB3_CONTRIB_MODELS:
@ -68,7 +71,6 @@ class BaseReinforcementLearningModel(IFreqaiModel):
logger.info("--------------------Starting training " f"{pair} --------------------") logger.info("--------------------Starting training " f"{pair} --------------------")
# filter the features requested by user in the configuration file and elegantly handle NaNs
features_filtered, labels_filtered = dk.filter_features( features_filtered, labels_filtered = dk.filter_features(
unfiltered_dataframe, unfiltered_dataframe,
dk.training_features_list, dk.training_features_list,
@ -78,19 +80,19 @@ class BaseReinforcementLearningModel(IFreqaiModel):
data_dictionary: Dict[str, Any] = dk.make_train_test_datasets( data_dictionary: Dict[str, Any] = dk.make_train_test_datasets(
features_filtered, labels_filtered) features_filtered, labels_filtered)
dk.fit_labels() # useless for now, but just satiating append methods dk.fit_labels() # FIXME useless for now, but just satiating append methods
# normalize all data based on train_dataset only # normalize all data based on train_dataset only
prices_train, prices_test = self.build_ohlc_price_dataframes(dk.data_dictionary, pair, dk) prices_train, prices_test = self.build_ohlc_price_dataframes(dk.data_dictionary, pair, dk)
data_dictionary = dk.normalize_data(data_dictionary) data_dictionary = dk.normalize_data(data_dictionary)
# optional additional data cleaning/analysis # data cleaning/analysis
self.data_cleaning_train(dk) self.data_cleaning_train(dk)
logger.info( logger.info(
f'Training model on {len(dk.data_dictionary["train_features"].columns)}' " features" f'Training model on {len(dk.data_dictionary["train_features"].columns)}'
f' features and {len(data_dictionary["train_features"])} data points'
) )
logger.info(f'Training model on {len(data_dictionary["train_features"])} data points')
self.set_train_and_eval_environments(data_dictionary, prices_train, prices_test, dk) self.set_train_and_eval_environments(data_dictionary, prices_train, prices_test, dk)
@ -100,9 +102,11 @@ class BaseReinforcementLearningModel(IFreqaiModel):
return model return model
def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test, dk): def set_train_and_eval_environments(self, data_dictionary: Dict[str, DataFrame],
prices_train: DataFrame, prices_test: DataFrame,
dk: FreqaiDataKitchen):
""" """
User overrides this as shown here if they are using a custom MyRLEnv User can override this if they are using a custom MyRLEnv
""" """
train_df = data_dictionary["train_features"] train_df = data_dictionary["train_features"]
test_df = data_dictionary["test_features"] test_df = data_dictionary["test_features"]
@ -114,18 +118,22 @@ class BaseReinforcementLearningModel(IFreqaiModel):
reward_kwargs=self.reward_params, config=self.config) reward_kwargs=self.reward_params, config=self.config)
self.eval_env = Monitor(MyRLEnv(df=test_df, prices=prices_test, self.eval_env = Monitor(MyRLEnv(df=test_df, prices=prices_test,
window_size=self.CONV_WIDTH, window_size=self.CONV_WIDTH,
reward_kwargs=self.reward_params, config=self.config), ".") reward_kwargs=self.reward_params, config=self.config),
str(Path(dk.data_path / 'monitor')))
self.eval_callback = EvalCallback(self.eval_env, deterministic=True, self.eval_callback = EvalCallback(self.eval_env, deterministic=True,
render=False, eval_freq=eval_freq, render=False, eval_freq=eval_freq,
best_model_save_path=dk.data_path) best_model_save_path=str(dk.data_path))
else: else:
self.train_env.reset() self.train_env.reset()
self.eval_env.reset() self.eval_env.reset()
self.train_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params) self.train_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params)
self.eval_env.reset_env(test_df, prices_test, self.CONV_WIDTH, self.reward_params) self.eval_env.reset_env(test_df, prices_test, self.CONV_WIDTH, self.reward_params)
# self.eval_callback.eval_env = self.eval_env
# self.eval_callback.best_model_save_path = str(dk.data_path)
# self.eval_callback._init_callback()
self.eval_callback.__init__(self.eval_env, deterministic=True, self.eval_callback.__init__(self.eval_env, deterministic=True,
render=False, eval_freq=eval_freq, render=False, eval_freq=eval_freq,
best_model_save_path=dk.data_path) best_model_save_path=str(dk.data_path))
@abstractmethod @abstractmethod
def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen): def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen):
@ -137,19 +145,20 @@ class BaseReinforcementLearningModel(IFreqaiModel):
return return
def get_state_info(self, pair): def get_state_info(self, pair: str):
open_trades = Trade.get_trades_proxy(is_open=True) open_trades = Trade.get_trades_proxy(is_open=True)
market_side = 0.5 market_side = 0.5
current_profit = 0 current_profit: float = 0
trade_duration = 0 trade_duration = 0
for trade in open_trades: for trade in open_trades:
if trade.pair == pair: if trade.pair == pair:
# FIXME: mypy typing doesnt like that strategy may be "None" (it never will be)
current_value = self.strategy.dp._exchange.get_rate( current_value = self.strategy.dp._exchange.get_rate(
pair, refresh=False, side="exit", is_short=trade.is_short) pair, refresh=False, side="exit", is_short=trade.is_short)
openrate = trade.open_rate openrate = trade.open_rate
now = datetime.now(timezone.utc).timestamp() now = datetime.now(timezone.utc).timestamp()
trade_duration = (now - trade.open_date.timestamp()) / self.base_tf_seconds trade_duration = int((now - trade.open_date.timestamp()) / self.base_tf_seconds)
if 'long' in trade.enter_tag: if 'long' in str(trade.enter_tag):
market_side = 1 market_side = 1
current_profit = (current_value - openrate) / openrate current_profit = (current_value - openrate) / openrate
else: else:
@ -245,8 +254,9 @@ class BaseReinforcementLearningModel(IFreqaiModel):
return return
def make_env(env_id: str, rank: int, seed: int, train_df, price, def make_env(env_id: str, rank: int, seed: int, train_df: DataFrame, price: DataFrame,
reward_params, window_size, monitor=False, config={}) -> Callable: reward_params: Dict[str, int], window_size: int, monitor: bool = False,
config: Dict[str, Any] = {}) -> Callable:
""" """
Utility function for multiprocessed env. Utility function for multiprocessed env.
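
make_env gains full type annotations in this hunk. For context, a self-contained sketch of the multiprocessed-env factory pattern it implements, with CartPole standing in for the trading environment and the worker count chosen arbitrarily.

from typing import Callable

import gym
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.utils import set_random_seed
from stable_baselines3.common.vec_env import SubprocVecEnv

def make_env(env_id: str, rank: int, seed: int, monitor: bool = False) -> Callable:
    def _init() -> gym.Env:
        env = gym.make(env_id)
        env.seed(seed + rank)  # each worker gets its own deterministic seed
        return Monitor(env) if monitor else env
    set_random_seed(seed)
    return _init

if __name__ == "__main__":
    # four parallel workers, each built by the factory above
    vec_env = SubprocVecEnv([make_env("CartPole-v1", rank=i, seed=1) for i in range(4)])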

View File

@ -22,6 +22,12 @@ class ReinforcementLearnerCustomAgent(BaseReinforcementLearningModel):
""" """
User can customize agent by defining the class and using it directly. User can customize agent by defining the class and using it directly.
Here the example is "TDQN" Here the example is "TDQN"
Warning!
This is an advanced example of how a user may create and use a highly
customized model class (which can inherit from existing classes,
similar to how the example below inherits from DQN).
This file is for example purposes only, and should not be run.
""" """
def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen): def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen):
@ -34,7 +40,7 @@ class ReinforcementLearnerCustomAgent(BaseReinforcementLearningModel):
# TDQN is a custom agent defined below # TDQN is a custom agent defined below
model = TDQN(self.policy_type, self.train_env, model = TDQN(self.policy_type, self.train_env,
tensorboard_log=Path(dk.data_path / "tensorboard"), tensorboard_log=str(Path(dk.data_path / "tensorboard")),
policy_kwargs=policy_kwargs, policy_kwargs=policy_kwargs,
**self.freqai_info['model_training_parameters'] **self.freqai_info['model_training_parameters']
) )
@ -217,7 +223,7 @@ class TDQN(DQN):
exploration_initial_eps: float = 1.0, exploration_initial_eps: float = 1.0,
exploration_final_eps: float = 0.05, exploration_final_eps: float = 0.05,
max_grad_norm: float = 10, max_grad_norm: float = 10,
tensorboard_log: Optional[Path] = None, tensorboard_log: Optional[str] = None,
create_eval_env: bool = False, create_eval_env: bool = False,
policy_kwargs: Optional[Dict[str, Any]] = None, policy_kwargs: Optional[Dict[str, Any]] = None,
verbose: int = 1, verbose: int = 1,
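
The Path-to-str changes above exist because stable-baselines3 annotates tensorboard_log as Optional[str]. A tiny helper illustrating the conversion; the helper name is made up.

from pathlib import Path
from typing import Optional

def tb_log_dir(data_path: Path) -> Optional[str]:
    # convert the Path before handing it to the SB3 model constructor
    return str(Path(data_path / "tensorboard"))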

View File

@ -485,6 +485,10 @@ class FreqaiDataDrawer:
f"Unable to load model, ensure model exists at " f"{dk.data_path} " f"Unable to load model, ensure model exists at " f"{dk.data_path} "
) )
# load it into ram if it was loaded from disk
if coin not in self.model_dictionary:
self.model_dictionary[coin] = model
if self.config["freqai"]["feature_parameters"]["principal_component_analysis"]: if self.config["freqai"]["feature_parameters"]["principal_component_analysis"]:
dk.pca = cloudpickle.load( dk.pca = cloudpickle.load(
open(dk.data_path / f"{dk.model_filename}_pca_object.pkl", "rb") open(dk.data_path / f"{dk.model_filename}_pca_object.pkl", "rb")
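
The two added lines keep a disk-loaded model in RAM so a later continual-retraining pass can reuse it. A standalone illustration of that caching idea; the class and names are illustrative, not freqtrade API.

from typing import Any, Callable, Dict

class ModelCache:
    """Illustrative stand-in for the drawer-side model_dictionary."""

    def __init__(self) -> None:
        self.model_dictionary: Dict[str, Any] = {}

    def get(self, coin: str, load_from_disk: Callable[[str], Any]) -> Any:
        if coin not in self.model_dictionary:
            # first access after a restart: deserialize once, then keep it in RAM
            self.model_dictionary[coin] = load_from_disk(coin)
        return self.model_dictionary[coin]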

View File

@ -76,7 +76,8 @@ class ReinforcementLearningExample5ac(IStrategy):
informative[f"%-{coin}pct-change"] = informative["close"].pct_change() informative[f"%-{coin}pct-change"] = informative["close"].pct_change()
informative[f"%-{coin}raw_volume"] = informative["volume"] informative[f"%-{coin}raw_volume"] = informative["volume"]
# The following features are necessary for RL models # FIXME: add these outside the user strategy?
# The following columns are necessary for RL models.
informative[f"%-{coin}raw_close"] = informative["close"] informative[f"%-{coin}raw_close"] = informative["close"]
informative[f"%-{coin}raw_open"] = informative["open"] informative[f"%-{coin}raw_open"] = informative["open"]
informative[f"%-{coin}raw_high"] = informative["high"] informative[f"%-{coin}raw_high"] = informative["high"]

View File

@ -57,9 +57,9 @@ class BaseClassifierModel(IFreqaiModel):
self.data_cleaning_train(dk) self.data_cleaning_train(dk)
logger.info( logger.info(
f'Training model on {len(dk.data_dictionary["train_features"].columns)}' " features" f'Training model on {len(dk.data_dictionary["train_features"].columns)}'
f' features and {len(data_dictionary["train_features"])} data points'
) )
logger.info(f'Training model on {len(data_dictionary["train_features"])} data points')
model = self.fit(data_dictionary) model = self.fit(data_dictionary)

View File

@ -56,9 +56,9 @@ class BaseRegressionModel(IFreqaiModel):
self.data_cleaning_train(dk) self.data_cleaning_train(dk)
logger.info( logger.info(
f'Training model on {len(dk.data_dictionary["train_features"].columns)}' " features" f'Training model on {len(dk.data_dictionary["train_features"].columns)}'
f' features and {len(data_dictionary["train_features"])} data points'
) )
logger.info(f'Training model on {len(data_dictionary["train_features"])} data points')
model = self.fit(data_dictionary) model = self.fit(data_dictionary)

View File

@ -53,9 +53,9 @@ class BaseTensorFlowModel(IFreqaiModel):
self.data_cleaning_train(dk) self.data_cleaning_train(dk)
logger.info( logger.info(
f'Training model on {len(dk.data_dictionary["train_features"].columns)}' " features" f'Training model on {len(dk.data_dictionary["train_features"].columns)}'
f' features and {len(data_dictionary["train_features"])} data points'
) )
logger.info(f'Training model on {len(data_dictionary["train_features"])} data points')
model = self.fit(data_dictionary) model = self.fit(data_dictionary)

View File

@ -1,7 +1,6 @@
import logging import logging
from typing import Any, Dict # , Tuple from typing import Any, Dict
# import numpy.typing as npt
import torch as th import torch as th
from freqtrade.freqai.data_kitchen import FreqaiDataKitchen from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
from freqtrade.freqai.RL.Base5ActionRLEnv import Actions, Base5ActionRLEnv, Positions from freqtrade.freqai.RL.Base5ActionRLEnv import Actions, Base5ActionRLEnv, Positions
@ -22,12 +21,18 @@ class ReinforcementLearner(BaseReinforcementLearningModel):
total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df) total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df)
policy_kwargs = dict(activation_fn=th.nn.ReLU, policy_kwargs = dict(activation_fn=th.nn.ReLU,
net_arch=[256, 256, 128]) net_arch=[512, 512, 256])
if dk.pair not in self.dd.model_dictionary or not self.continual_retraining:
model = self.MODELCLASS(self.policy_type, self.train_env, policy_kwargs=policy_kwargs, model = self.MODELCLASS(self.policy_type, self.train_env, policy_kwargs=policy_kwargs,
tensorboard_log=Path(dk.data_path / "tensorboard"), tensorboard_log=Path(dk.data_path / "tensorboard"),
**self.freqai_info['model_training_parameters'] **self.freqai_info['model_training_parameters']
) )
else:
logger.info('Continual training activated - starting training from previously '
'trained agent.')
model = self.dd.model_dictionary[dk.pair]
model.set_env(self.train_env)
model.learn( model.learn(
total_timesteps=int(total_timesteps), total_timesteps=int(total_timesteps),
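
The new branch above either builds a fresh agent or warm-starts from the one cached in the data drawer. A minimal, runnable illustration of that warm-start pattern, with CartPole standing in for the trading environment; the cache dict and pair name are stand-ins for self.dd.model_dictionary and dk.pair.

import gym
from stable_baselines3 import PPO

model_cache: dict = {}        # plays the role of self.dd.model_dictionary
continual_retraining = True   # rl_config["continual_retraining"]
pair = "BTC/USDT"             # plays the role of dk.pair

train_env = gym.make("CartPole-v1")

if pair not in model_cache or not continual_retraining:
    model = PPO("MlpPolicy", train_env, verbose=0)  # fresh agent for this pair
else:
    model = model_cache[pair]                       # previously trained agent
    model.set_env(train_env)                        # point it at the new training window

model.learn(total_timesteps=1_000)
model_cache[pair] = model                           # cache for the next retraining cycle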