add continual retraining feature, handle mypy typing reqs, improve docstrings

This commit is contained in:
robcaulk 2022-08-24 12:54:02 +02:00
parent b708134c1a
commit c0cee5df07
11 changed files with 387 additions and 362 deletions

View File

@@ -85,12 +85,13 @@
"verbose": 1
},
"rl_config": {
"train_cycles": 10,
"train_cycles": 3,
"eval_cycles": 3,
"thread_count": 4,
"max_trade_duration_candles": 100,
"model_type": "PPO",
"policy_type": "MlpPolicy",
"continual_retraining": true,
"model_reward_parameters": {
"rr": 1,
"profit_aim": 0.02,

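Note on the train_cycles change above: further down in this commit, ReinforcementLearner.fit_rl computes total_timesteps = train_cycles * len(train_df), so lowering the value from 10 to 3 directly shortens each retraining. A minimal sketch of that arithmetic, using a hypothetical dataset size:

# Illustrative only: how train_cycles scales the training budget.
# The 5000-row dataframe length is a hypothetical figure, not from this commit.
train_cycles = 3
len_train_df = 5000
total_timesteps = train_cycles * len_train_df
print(total_timesteps)  # 15000 environment steps per retraining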
View File

@@ -1,330 +1,330 @@
import logging
from enum import Enum
# import logging
# from enum import Enum
import gym
import numpy as np
import pandas as pd
from gym import spaces
from gym.utils import seeding
from pandas import DataFrame
# import gym
# import numpy as np
# import pandas as pd
# from gym import spaces
# from gym.utils import seeding
# from pandas import DataFrame
# from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
# # from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
logger = logging.getLogger(__name__)
# logger = logging.getLogger(__name__)
class Actions(Enum):
Short = 0
Long = 1
Neutral = 2
# class Actions(Enum):
# Short = 0
# Long = 1
# Neutral = 2
class Positions(Enum):
Short = 0
Long = 1
Neutral = 0.5
# class Positions(Enum):
# Short = 0
# Long = 1
# Neutral = 0.5
def opposite(self):
return Positions.Short if self == Positions.Long else Positions.Long
# def opposite(self):
# return Positions.Short if self == Positions.Long else Positions.Long
def mean_over_std(x):
std = np.std(x, ddof=1)
mean = np.mean(x)
return mean / std if std > 0 else 0
# def mean_over_std(x):
# std = np.std(x, ddof=1)
# mean = np.mean(x)
# return mean / std if std > 0 else 0
class Base3ActionRLEnv(gym.Env):
# class Base3ActionRLEnv(gym.Env):
metadata = {'render.modes': ['human']}
# metadata = {'render.modes': ['human']}
def __init__(self, df: DataFrame = DataFrame(), prices: DataFrame = DataFrame(),
reward_kwargs: dict = {}, window_size=10, starting_point=True,
id: str = 'baseenv-1', seed: int = 1):
assert df.ndim == 2
# def __init__(self, df: DataFrame = DataFrame(), prices: DataFrame = DataFrame(),
# reward_kwargs: dict = {}, window_size=10, starting_point=True,
# id: str = 'baseenv-1', seed: int = 1):
# assert df.ndim == 2
self.id = id
self.seed(seed)
self.reset_env(df, prices, window_size, reward_kwargs, starting_point)
# self.id = id
# self.seed(seed)
# self.reset_env(df, prices, window_size, reward_kwargs, starting_point)
def reset_env(self, df, prices, window_size, reward_kwargs, starting_point=True):
self.df = df
self.signal_features = self.df
self.prices = prices
self.window_size = window_size
self.starting_point = starting_point
self.rr = reward_kwargs["rr"]
self.profit_aim = reward_kwargs["profit_aim"]
# def reset_env(self, df, prices, window_size, reward_kwargs, starting_point=True):
# self.df = df
# self.signal_features = self.df
# self.prices = prices
# self.window_size = window_size
# self.starting_point = starting_point
# self.rr = reward_kwargs["rr"]
# self.profit_aim = reward_kwargs["profit_aim"]
self.fee = 0.0015
# self.fee = 0.0015
# # spaces
self.shape = (window_size, self.signal_features.shape[1] + 2)
self.action_space = spaces.Discrete(len(Actions))
self.observation_space = spaces.Box(
low=-np.inf, high=np.inf, shape=self.shape, dtype=np.float32)
# episode
self._start_tick = self.window_size
self._end_tick = len(self.prices) - 1
self._done = None
self._current_tick = None
self._last_trade_tick = None
self._position = Positions.Neutral
self._position_history = None
self.total_reward = None
self._total_profit = None
self._first_rendering = None
self.history = None
self.trade_history = []
def seed(self, seed: int = 1):
self.np_random, seed = seeding.np_random(seed)
return [seed]
def reset(self):
self._done = False
if self.starting_point is True:
self._position_history = (self._start_tick * [None]) + [self._position]
else:
self._position_history = (self.window_size * [None]) + [self._position]
self._current_tick = self._start_tick
self._last_trade_tick = None
self._position = Positions.Neutral
self.total_reward = 0.
self._total_profit = 1. # unit
self._first_rendering = True
self.history = {}
self.trade_history = []
self.portfolio_log_returns = np.zeros(len(self.prices))
self._profits = [(self._start_tick, 1)]
self.close_trade_profit = []
return self._get_observation()
def step(self, action: int):
self._done = False
self._current_tick += 1
if self._current_tick == self._end_tick:
self._done = True
self.update_portfolio_log_returns(action)
self._update_profit(action)
step_reward = self.calculate_reward(action)
self.total_reward += step_reward
trade_type = None
if self.is_tradesignal(action): # exclude 3 case not trade
# Update position
"""
Action: Neutral, position: Long -> Close Long
Action: Neutral, position: Short -> Close Short
Action: Long, position: Neutral -> Open Long
Action: Long, position: Short -> Close Short and Open Long
Action: Short, position: Neutral -> Open Short
Action: Short, position: Long -> Close Long and Open Short
"""
if action == Actions.Neutral.value:
self._position = Positions.Neutral
trade_type = "neutral"
elif action == Actions.Long.value:
self._position = Positions.Long
trade_type = "long"
elif action == Actions.Short.value:
self._position = Positions.Short
trade_type = "short"
else:
print("case not defined")
# Update last trade tick
self._last_trade_tick = self._current_tick
if trade_type is not None:
self.trade_history.append(
{'price': self.current_price(), 'index': self._current_tick,
'type': trade_type})
if self._total_profit < 0.2:
self._done = True
self._position_history.append(self._position)
observation = self._get_observation()
info = dict(
tick=self._current_tick,
total_reward=self.total_reward,
total_profit=self._total_profit,
position=self._position.value
)
self._update_history(info)
return observation, step_reward, self._done, info
def _get_observation(self):
features_window = self.signal_features[(
self._current_tick - self.window_size):self._current_tick]
features_and_state = DataFrame(np.zeros((len(features_window), 2)),
columns=['current_profit_pct', 'position'],
index=features_window.index)
features_and_state['current_profit_pct'] = self.get_unrealized_profit()
features_and_state['position'] = self._position.value
features_and_state = pd.concat([features_window, features_and_state], axis=1)
return features_and_state
def get_unrealized_profit(self):
if self._last_trade_tick is None:
return 0.
if self._position == Positions.Neutral:
return 0.
elif self._position == Positions.Short:
current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
return (last_trade_price - current_price) / last_trade_price
elif self._position == Positions.Long:
current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
return (current_price - last_trade_price) / last_trade_price
else:
return 0.
def is_tradesignal(self, action: int):
# trade signal
"""
not trade signal is :
Action: Neutral, position: Neutral -> Nothing
Action: Long, position: Long -> Hold Long
Action: Short, position: Short -> Hold Short
"""
return not ((action == Actions.Neutral.value and self._position == Positions.Neutral)
or (action == Actions.Short.value and self._position == Positions.Short)
or (action == Actions.Long.value and self._position == Positions.Long))
def _is_trade(self, action: Actions):
return ((action == Actions.Long.value and self._position == Positions.Short) or
(action == Actions.Short.value and self._position == Positions.Long) or
(action == Actions.Neutral.value and self._position == Positions.Long) or
(action == Actions.Neutral.value and self._position == Positions.Short)
)
def is_hold(self, action):
return ((action == Actions.Short.value and self._position == Positions.Short)
or (action == Actions.Long.value and self._position == Positions.Long))
def add_buy_fee(self, price):
return price * (1 + self.fee)
def add_sell_fee(self, price):
return price / (1 + self.fee)
def _update_history(self, info):
if not self.history:
self.history = {key: [] for key in info.keys()}
for key, value in info.items():
self.history[key].append(value)
def get_sharpe_ratio(self):
return mean_over_std(self.get_portfolio_log_returns())
def calculate_reward(self, action):
if self._last_trade_tick is None:
return 0.
# close long
if (action == Actions.Short.value or
action == Actions.Neutral.value) and self._position == Positions.Long:
last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
return float(np.log(current_price) - np.log(last_trade_price))
# close short
if (action == Actions.Long.value or
action == Actions.Neutral.value) and self._position == Positions.Short:
last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
return float(np.log(last_trade_price) - np.log(current_price))
return 0.
def _update_profit(self, action):
if self._is_trade(action) or self._done:
pnl = self.get_unrealized_profit()
if self._position == Positions.Long:
self._total_profit = self._total_profit + self._total_profit * pnl
self._profits.append((self._current_tick, self._total_profit))
self.close_trade_profit.append(pnl)
if self._position == Positions.Short:
self._total_profit = self._total_profit + self._total_profit * pnl
self._profits.append((self._current_tick, self._total_profit))
self.close_trade_profit.append(pnl)
def most_recent_return(self, action: int):
"""
We support Long, Neutral and Short positions.
Return is generated from rising prices in Long
and falling prices in Short positions.
The actions Sell/Buy or Hold during a Long position trigger the sell/buy-fee.
"""
# Long positions
if self._position == Positions.Long:
current_price = self.prices.iloc[self._current_tick].open
if action == Actions.Short.value or action == Actions.Neutral.value:
current_price = self.add_sell_fee(current_price)
previous_price = self.prices.iloc[self._current_tick - 1].open
if (self._position_history[self._current_tick - 1] == Positions.Short
or self._position_history[self._current_tick - 1] == Positions.Neutral):
previous_price = self.add_buy_fee(previous_price)
return np.log(current_price) - np.log(previous_price)
# Short positions
if self._position == Positions.Short:
current_price = self.prices.iloc[self._current_tick].open
if action == Actions.Long.value or action == Actions.Neutral.value:
current_price = self.add_buy_fee(current_price)
previous_price = self.prices.iloc[self._current_tick - 1].open
if (self._position_history[self._current_tick - 1] == Positions.Long
or self._position_history[self._current_tick - 1] == Positions.Neutral):
previous_price = self.add_sell_fee(previous_price)
return np.log(previous_price) - np.log(current_price)
return 0
def get_portfolio_log_returns(self):
return self.portfolio_log_returns[1:self._current_tick + 1]
def update_portfolio_log_returns(self, action):
self.portfolio_log_returns[self._current_tick] = self.most_recent_return(action)
def current_price(self) -> float:
return self.prices.iloc[self._current_tick].open
# # # spaces
# self.shape = (window_size, self.signal_features.shape[1] + 2)
# self.action_space = spaces.Discrete(len(Actions))
# self.observation_space = spaces.Box(
# low=-np.inf, high=np.inf, shape=self.shape, dtype=np.float32)
# # episode
# self._start_tick = self.window_size
# self._end_tick = len(self.prices) - 1
# self._done = None
# self._current_tick = None
# self._last_trade_tick = None
# self._position = Positions.Neutral
# self._position_history = None
# self.total_reward = None
# self._total_profit = None
# self._first_rendering = None
# self.history = None
# self.trade_history = []
# def seed(self, seed: int = 1):
# self.np_random, seed = seeding.np_random(seed)
# return [seed]
# def reset(self):
# self._done = False
# if self.starting_point is True:
# self._position_history = (self._start_tick * [None]) + [self._position]
# else:
# self._position_history = (self.window_size * [None]) + [self._position]
# self._current_tick = self._start_tick
# self._last_trade_tick = None
# self._position = Positions.Neutral
# self.total_reward = 0.
# self._total_profit = 1. # unit
# self._first_rendering = True
# self.history = {}
# self.trade_history = []
# self.portfolio_log_returns = np.zeros(len(self.prices))
# self._profits = [(self._start_tick, 1)]
# self.close_trade_profit = []
# return self._get_observation()
# def step(self, action: int):
# self._done = False
# self._current_tick += 1
# if self._current_tick == self._end_tick:
# self._done = True
# self.update_portfolio_log_returns(action)
# self._update_profit(action)
# step_reward = self.calculate_reward(action)
# self.total_reward += step_reward
# trade_type = None
# if self.is_tradesignal(action): # exclude 3 case not trade
# # Update position
# """
# Action: Neutral, position: Long -> Close Long
# Action: Neutral, position: Short -> Close Short
# Action: Long, position: Neutral -> Open Long
# Action: Long, position: Short -> Close Short and Open Long
# Action: Short, position: Neutral -> Open Short
# Action: Short, position: Long -> Close Long and Open Short
# """
# if action == Actions.Neutral.value:
# self._position = Positions.Neutral
# trade_type = "neutral"
# elif action == Actions.Long.value:
# self._position = Positions.Long
# trade_type = "long"
# elif action == Actions.Short.value:
# self._position = Positions.Short
# trade_type = "short"
# else:
# print("case not defined")
# # Update last trade tick
# self._last_trade_tick = self._current_tick
# if trade_type is not None:
# self.trade_history.append(
# {'price': self.current_price(), 'index': self._current_tick,
# 'type': trade_type})
# if self._total_profit < 0.2:
# self._done = True
# self._position_history.append(self._position)
# observation = self._get_observation()
# info = dict(
# tick=self._current_tick,
# total_reward=self.total_reward,
# total_profit=self._total_profit,
# position=self._position.value
# )
# self._update_history(info)
# return observation, step_reward, self._done, info
# def _get_observation(self):
# features_window = self.signal_features[(
# self._current_tick - self.window_size):self._current_tick]
# features_and_state = DataFrame(np.zeros((len(features_window), 2)),
# columns=['current_profit_pct', 'position'],
# index=features_window.index)
# features_and_state['current_profit_pct'] = self.get_unrealized_profit()
# features_and_state['position'] = self._position.value
# features_and_state = pd.concat([features_window, features_and_state], axis=1)
# return features_and_state
# def get_unrealized_profit(self):
# if self._last_trade_tick is None:
# return 0.
# if self._position == Positions.Neutral:
# return 0.
# elif self._position == Positions.Short:
# current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
# last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
# return (last_trade_price - current_price) / last_trade_price
# elif self._position == Positions.Long:
# current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
# last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
# return (current_price - last_trade_price) / last_trade_price
# else:
# return 0.
# def is_tradesignal(self, action: int):
# # trade signal
# """
# not trade signal is :
# Action: Neutral, position: Neutral -> Nothing
# Action: Long, position: Long -> Hold Long
# Action: Short, position: Short -> Hold Short
# """
# return not ((action == Actions.Neutral.value and self._position == Positions.Neutral)
# or (action == Actions.Short.value and self._position == Positions.Short)
# or (action == Actions.Long.value and self._position == Positions.Long))
# def _is_trade(self, action: Actions):
# return ((action == Actions.Long.value and self._position == Positions.Short) or
# (action == Actions.Short.value and self._position == Positions.Long) or
# (action == Actions.Neutral.value and self._position == Positions.Long) or
# (action == Actions.Neutral.value and self._position == Positions.Short)
# )
# def is_hold(self, action):
# return ((action == Actions.Short.value and self._position == Positions.Short)
# or (action == Actions.Long.value and self._position == Positions.Long))
# def add_buy_fee(self, price):
# return price * (1 + self.fee)
# def add_sell_fee(self, price):
# return price / (1 + self.fee)
# def _update_history(self, info):
# if not self.history:
# self.history = {key: [] for key in info.keys()}
# for key, value in info.items():
# self.history[key].append(value)
# def get_sharpe_ratio(self):
# return mean_over_std(self.get_portfolio_log_returns())
# def calculate_reward(self, action):
# if self._last_trade_tick is None:
# return 0.
# # close long
# if (action == Actions.Short.value or
# action == Actions.Neutral.value) and self._position == Positions.Long:
# last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
# current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
# return float(np.log(current_price) - np.log(last_trade_price))
# # close short
# if (action == Actions.Long.value or
# action == Actions.Neutral.value) and self._position == Positions.Short:
# last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
# current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
# return float(np.log(last_trade_price) - np.log(current_price))
# return 0.
# def _update_profit(self, action):
# if self._is_trade(action) or self._done:
# pnl = self.get_unrealized_profit()
# if self._position == Positions.Long:
# self._total_profit = self._total_profit + self._total_profit * pnl
# self._profits.append((self._current_tick, self._total_profit))
# self.close_trade_profit.append(pnl)
# if self._position == Positions.Short:
# self._total_profit = self._total_profit + self._total_profit * pnl
# self._profits.append((self._current_tick, self._total_profit))
# self.close_trade_profit.append(pnl)
# def most_recent_return(self, action: int):
# """
# We support Long, Neutral and Short positions.
# Return is generated from rising prices in Long
# and falling prices in Short positions.
# The actions Sell/Buy or Hold during a Long position trigger the sell/buy-fee.
# """
# # Long positions
# if self._position == Positions.Long:
# current_price = self.prices.iloc[self._current_tick].open
# if action == Actions.Short.value or action == Actions.Neutral.value:
# current_price = self.add_sell_fee(current_price)
# previous_price = self.prices.iloc[self._current_tick - 1].open
# if (self._position_history[self._current_tick - 1] == Positions.Short
# or self._position_history[self._current_tick - 1] == Positions.Neutral):
# previous_price = self.add_buy_fee(previous_price)
# return np.log(current_price) - np.log(previous_price)
# # Short positions
# if self._position == Positions.Short:
# current_price = self.prices.iloc[self._current_tick].open
# if action == Actions.Long.value or action == Actions.Neutral.value:
# current_price = self.add_buy_fee(current_price)
# previous_price = self.prices.iloc[self._current_tick - 1].open
# if (self._position_history[self._current_tick - 1] == Positions.Long
# or self._position_history[self._current_tick - 1] == Positions.Neutral):
# previous_price = self.add_sell_fee(previous_price)
# return np.log(previous_price) - np.log(current_price)
# return 0
# def get_portfolio_log_returns(self):
# return self.portfolio_log_returns[1:self._current_tick + 1]
# def update_portfolio_log_returns(self, action):
# self.portfolio_log_returns[self._current_tick] = self.most_recent_return(action)
# def current_price(self) -> float:
# return self.prices.iloc[self._current_tick].open
def prev_price(self) -> float:
return self.prices.iloc[self._current_tick - 1].open
# def prev_price(self) -> float:
# return self.prices.iloc[self._current_tick - 1].open
def sharpe_ratio(self) -> float:
if len(self.close_trade_profit) == 0:
return 0.
returns = np.array(self.close_trade_profit)
reward = (np.mean(returns) - 0. + 1e-9) / (np.std(returns) + 1e-9)
return reward
# def sharpe_ratio(self) -> float:
# if len(self.close_trade_profit) == 0:
# return 0.
# returns = np.array(self.close_trade_profit)
# reward = (np.mean(returns) - 0. + 1e-9) / (np.std(returns) + 1e-9)
# return reward
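The environment above folds the exchange fee into the entry and exit prices before taking log returns. A self-contained sketch of that calculation, re-implemented here for illustration with hypothetical prices and the same 0.0015 default fee:

import numpy as np

fee = 0.0015

def add_buy_fee(price: float) -> float:
    # buying costs slightly more than the quoted price
    return price * (1 + fee)

def add_sell_fee(price: float) -> float:
    # selling yields slightly less than the quoted price
    return price / (1 + fee)

entry_open = 100.0    # open price at the last trade tick (hypothetical)
current_open = 102.0  # open price at the current tick (hypothetical)

# reward for closing a long: fee-adjusted exit log-price minus fee-adjusted entry log-price
reward = float(np.log(add_sell_fee(current_open)) - np.log(add_buy_fee(entry_open)))
print(round(reward, 5))  # ~0.01681: a ~2% gross move minus ~0.3% round-trip fees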

View File

@@ -1,6 +1,6 @@
import logging
from enum import Enum
# from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
from typing import Optional
import gym
import numpy as np
@@ -44,14 +44,14 @@ class Base5ActionRLEnv(gym.Env):
def __init__(self, df: DataFrame = DataFrame(), prices: DataFrame = DataFrame(),
reward_kwargs: dict = {}, window_size=10, starting_point=True,
id: str = 'baseenv-1', seed: int = 1, config: dict = {}):
assert df.ndim == 2
self.rl_config = config['freqai']['rl_config']
self.id = id
self.seed(seed)
self.reset_env(df, prices, window_size, reward_kwargs, starting_point)
def reset_env(self, df, prices, window_size, reward_kwargs, starting_point=True):
def reset_env(self, df: DataFrame, prices: DataFrame, window_size: int,
reward_kwargs: dict, starting_point=True):
self.df = df
self.signal_features = self.df
self.prices = prices
@@ -69,18 +69,18 @@ class Base5ActionRLEnv(gym.Env):
low=-np.inf, high=np.inf, shape=self.shape, dtype=np.float32)
# episode
self._start_tick = self.window_size
self._end_tick = len(self.prices) - 1
self._done = None
self._current_tick = None
self._last_trade_tick = None
self._start_tick: int = self.window_size
self._end_tick: int = len(self.prices) - 1
self._done: bool = False
self._current_tick: int = self._start_tick
self._last_trade_tick: Optional[int] = None
self._position = Positions.Neutral
self._position_history = None
self.total_reward = None
self._total_profit = None
self._first_rendering = None
self.history = None
self.trade_history = []
self._position_history: list = [None]
self.total_reward: float = 0
self._total_profit: float = 0
self._first_rendering: bool = False
self.history: dict = {}
self.trade_history: list = []
def seed(self, seed: int = 1):
self.np_random, seed = seeding.np_random(seed)
@@ -125,8 +125,7 @@ class Base5ActionRLEnv(gym.Env):
self.total_reward += step_reward
trade_type = None
if self.is_tradesignal(action): # exclude 3 case not trade
# Update position
if self.is_tradesignal(action):
"""
Action: Neutral, position: Long -> Close Long
Action: Neutral, position: Short -> Close Short
@@ -223,9 +222,8 @@ class Base5ActionRLEnv(gym.Env):
# trade signal
"""
not trade signal is :
Action: Neutral, position: Neutral -> Nothing
Action: Long, position: Long -> Hold Long
Action: Short, position: Short -> Hold Short
Determine if the signal is nonsensical,
e.g. the agent wants an Actions.Long_exit while it is in a Positions.Short
"""
return not ((action == Actions.Neutral.value and self._position == Positions.Neutral) or
(action == Actions.Neutral.value and self._position == Positions.Short) or
@@ -292,7 +290,7 @@ class Base5ActionRLEnv(gym.Env):
def most_recent_return(self, action: int):
"""
We support Long, Neutral and Short positions.
Calculate the tick to tick return if in a trade.
Return is generated from rising prices in Long
and falling prices in Short positions.
The actions Sell/Buy or Hold during a Long position trigger the sell/buy-fee.
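Most of the mypy work in this file follows one pattern: attributes that used to be initialised to None get concrete typed defaults, and only the attribute for which None is a real runtime value keeps an Optional annotation. A minimal standalone sketch of that pattern (the class name _EpisodeState is invented for illustration; the attribute names mirror the diff above):

from typing import Optional

class _EpisodeState:
    def __init__(self, window_size: int, n_prices: int) -> None:
        self._start_tick: int = window_size
        self._end_tick: int = n_prices - 1
        self._done: bool = False
        self._current_tick: int = self._start_tick
        self._last_trade_tick: Optional[int] = None  # genuinely None until a trade opens
        self.total_reward: float = 0
        self._total_profit: float = 0
        self.history: dict = {}
        self.trade_history: list = []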

View File

@@ -19,6 +19,7 @@ from typing import Callable
from datetime import datetime, timezone
from stable_baselines3.common.utils import set_random_seed
import gym
from pathlib import Path
logger = logging.getLogger(__name__)
torch.multiprocessing.set_sharing_strategy('file_system')
@@ -40,6 +41,8 @@ class BaseReinforcementLearningModel(IFreqaiModel):
self.eval_env: Base5ActionRLEnv = None
self.eval_callback: EvalCallback = None
self.model_type = self.freqai_info['rl_config']['model_type']
self.rl_config = self.freqai_info['rl_config']
self.continual_retraining = self.rl_config['continual_retraining']
if self.model_type in SB3_MODELS:
import_str = 'stable_baselines3'
elif self.model_type in SB3_CONTRIB_MODELS:
@@ -68,7 +71,6 @@ class BaseReinforcementLearningModel(IFreqaiModel):
logger.info("--------------------Starting training " f"{pair} --------------------")
# filter the features requested by user in the configuration file and elegantly handle NaNs
features_filtered, labels_filtered = dk.filter_features(
unfiltered_dataframe,
dk.training_features_list,
@@ -78,19 +80,19 @@ class BaseReinforcementLearningModel(IFreqaiModel):
data_dictionary: Dict[str, Any] = dk.make_train_test_datasets(
features_filtered, labels_filtered)
dk.fit_labels() # useless for now, but just satiating append methods
dk.fit_labels() # FIXME useless for now, but just satiating append methods
# normalize all data based on train_dataset only
prices_train, prices_test = self.build_ohlc_price_dataframes(dk.data_dictionary, pair, dk)
data_dictionary = dk.normalize_data(data_dictionary)
# optional additional data cleaning/analysis
# data cleaning/analysis
self.data_cleaning_train(dk)
logger.info(
f'Training model on {len(dk.data_dictionary["train_features"].columns)}' " features"
f'Training model on {len(dk.data_dictionary["train_features"].columns)}'
f' features and {len(data_dictionary["train_features"])} data points'
)
logger.info(f'Training model on {len(data_dictionary["train_features"])} data points')
self.set_train_and_eval_environments(data_dictionary, prices_train, prices_test, dk)
@@ -100,9 +102,11 @@ class BaseReinforcementLearningModel(IFreqaiModel):
return model
def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test, dk):
def set_train_and_eval_environments(self, data_dictionary: Dict[str, DataFrame],
prices_train: DataFrame, prices_test: DataFrame,
dk: FreqaiDataKitchen):
"""
User overrides this as shown here if they are using a custom MyRLEnv
User can override this if they are using a custom MyRLEnv
"""
train_df = data_dictionary["train_features"]
test_df = data_dictionary["test_features"]
@@ -114,18 +118,22 @@ class BaseReinforcementLearningModel(IFreqaiModel):
reward_kwargs=self.reward_params, config=self.config)
self.eval_env = Monitor(MyRLEnv(df=test_df, prices=prices_test,
window_size=self.CONV_WIDTH,
reward_kwargs=self.reward_params, config=self.config), ".")
reward_kwargs=self.reward_params, config=self.config),
str(Path(dk.data_path / 'monitor')))
self.eval_callback = EvalCallback(self.eval_env, deterministic=True,
render=False, eval_freq=eval_freq,
best_model_save_path=dk.data_path)
best_model_save_path=str(dk.data_path))
else:
self.train_env.reset()
self.eval_env.reset()
self.train_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params)
self.eval_env.reset_env(test_df, prices_test, self.CONV_WIDTH, self.reward_params)
# self.eval_callback.eval_env = self.eval_env
# self.eval_callback.best_model_save_path = str(dk.data_path)
# self.eval_callback._init_callback()
self.eval_callback.__init__(self.eval_env, deterministic=True,
render=False, eval_freq=eval_freq,
best_model_save_path=dk.data_path)
best_model_save_path=str(dk.data_path))
@abstractmethod
def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen):
@@ -137,19 +145,20 @@ class BaseReinforcementLearningModel(IFreqaiModel):
return
def get_state_info(self, pair):
def get_state_info(self, pair: str):
open_trades = Trade.get_trades_proxy(is_open=True)
market_side = 0.5
current_profit = 0
current_profit: float = 0
trade_duration = 0
for trade in open_trades:
if trade.pair == pair:
# FIXME: mypy typing doesn't like that strategy may be "None" (it never will be)
current_value = self.strategy.dp._exchange.get_rate(
pair, refresh=False, side="exit", is_short=trade.is_short)
openrate = trade.open_rate
now = datetime.now(timezone.utc).timestamp()
trade_duration = (now - trade.open_date.timestamp()) / self.base_tf_seconds
if 'long' in trade.enter_tag:
trade_duration = int((now - trade.open_date.timestamp()) / self.base_tf_seconds)
if 'long' in str(trade.enter_tag):
market_side = 1
current_profit = (current_value - openrate) / openrate
else:
@@ -245,8 +254,9 @@ class BaseReinforcementLearningModel(IFreqaiModel):
return
def make_env(env_id: str, rank: int, seed: int, train_df, price,
reward_params, window_size, monitor=False, config={}) -> Callable:
def make_env(env_id: str, rank: int, seed: int, train_df: DataFrame, price: DataFrame,
reward_params: Dict[str, int], window_size: int, monitor: bool = False,
config: Dict[str, Any] = {}) -> Callable:
"""
Utility function for multiprocessed env.
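make_env returns a factory rather than an environment so that a vectorized wrapper can spawn one copy per worker. A usage sketch of how such factories are typically consumed with stable-baselines3; the names train_df, prices_train, reward_params, CONV_WIDTH and config are assumed to be in scope as in the model class above, and this wiring is illustrative rather than taken verbatim from the commit:

from stable_baselines3.common.vec_env import SubprocVecEnv

num_cpu = 4  # e.g. rl_config["thread_count"]
env_fns = [
    make_env(f"train_env{i}", rank=i, seed=1, train_df=train_df, price=prices_train,
             reward_params=reward_params, window_size=CONV_WIDTH,
             monitor=True, config=config)
    for i in range(num_cpu)
]
train_env = SubprocVecEnv(env_fns)  # one environment per worker process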

View File

@@ -22,6 +22,12 @@ class ReinforcementLearnerCustomAgent(BaseReinforcementLearningModel):
"""
User can customize agent by defining the class and using it directly.
Here the example is "TDQN"
Warning!
This is an advanced example of how a user may create and use a highly
customized model class (which can inherit from existing classes,
similar to how the example below inherits from DQN).
This file is for example purposes only, and should not be run.
"""
def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen):
@@ -34,7 +40,7 @@ class ReinforcementLearnerCustomAgent(BaseReinforcementLearningModel):
# TDQN is a custom agent defined below
model = TDQN(self.policy_type, self.train_env,
tensorboard_log=Path(dk.data_path / "tensorboard"),
tensorboard_log=str(Path(dk.data_path / "tensorboard")),
policy_kwargs=policy_kwargs,
**self.freqai_info['model_training_parameters']
)
@@ -217,7 +223,7 @@ class TDQN(DQN):
exploration_initial_eps: float = 1.0,
exploration_final_eps: float = 0.05,
max_grad_norm: float = 10,
tensorboard_log: Optional[Path] = None,
tensorboard_log: Optional[str] = None,
create_eval_env: bool = False,
policy_kwargs: Optional[Dict[str, Any]] = None,
verbose: int = 1,
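For the pattern alone, here is a minimal, hypothetical sketch of "customizing an agent by defining the class": it simply subclasses stable-baselines3's DQN and adjusts a default, and is not the TDQN implementation in this file:

from stable_baselines3 import DQN

class MyTweakedDQN(DQN):
    """Hypothetical example: same algorithm, different default learning rate."""

    def __init__(self, policy, env, **kwargs):
        kwargs.setdefault("learning_rate", 1e-4)  # illustrative override only
        super().__init__(policy, env, **kwargs)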

View File

@@ -485,6 +485,10 @@ class FreqaiDataDrawer:
f"Unable to load model, ensure model exists at " f"{dk.data_path} "
)
# load it into ram if it was loaded from disk
if coin not in self.model_dictionary:
self.model_dictionary[coin] = model
if self.config["freqai"]["feature_parameters"]["principal_component_analysis"]:
dk.pca = cloudpickle.load(
open(dk.data_path / f"{dk.model_filename}_pca_object.pkl", "rb")
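The added lines keep a pair's model in the drawer's in-memory dictionary as soon as it has been loaded from disk, which is what later lets continual retraining pick up the existing agent instead of rebuilding one. A standalone sketch of that caching rule, using a plain dict and a hypothetical loader callable:

model_dictionary: dict = {}

def get_model(coin: str, load_from_disk):
    """Return the cached model for coin, loading and caching it on first use."""
    if coin not in model_dictionary:
        # load_from_disk stands in for the drawer's disk read (assumption)
        model_dictionary[coin] = load_from_disk(coin)
    return model_dictionary[coin]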

View File

@@ -76,7 +76,8 @@ class ReinforcementLearningExample5ac(IStrategy):
informative[f"%-{coin}pct-change"] = informative["close"].pct_change()
informative[f"%-{coin}raw_volume"] = informative["volume"]
# The following features are necessary for RL models
# FIXME: add these outside the user strategy?
# The following columns are necessary for RL models.
informative[f"%-{coin}raw_close"] = informative["close"]
informative[f"%-{coin}raw_open"] = informative["open"]
informative[f"%-{coin}raw_high"] = informative["high"]

View File

@@ -57,9 +57,9 @@ class BaseClassifierModel(IFreqaiModel):
self.data_cleaning_train(dk)
logger.info(
f'Training model on {len(dk.data_dictionary["train_features"].columns)}' " features"
f'Training model on {len(dk.data_dictionary["train_features"].columns)}'
f' features and {len(data_dictionary["train_features"])} data points'
)
logger.info(f'Training model on {len(data_dictionary["train_features"])} data points')
model = self.fit(data_dictionary)

View File

@@ -56,9 +56,9 @@ class BaseRegressionModel(IFreqaiModel):
self.data_cleaning_train(dk)
logger.info(
f'Training model on {len(dk.data_dictionary["train_features"].columns)}' " features"
f'Training model on {len(dk.data_dictionary["train_features"].columns)}'
f' features and {len(data_dictionary["train_features"])} data points'
)
logger.info(f'Training model on {len(data_dictionary["train_features"])} data points')
model = self.fit(data_dictionary)

View File

@@ -53,9 +53,9 @@ class BaseTensorFlowModel(IFreqaiModel):
self.data_cleaning_train(dk)
logger.info(
f'Training model on {len(dk.data_dictionary["train_features"].columns)}' " features"
f'Training model on {len(dk.data_dictionary["train_features"].columns)}'
f' features and {len(data_dictionary["train_features"])} data points'
)
logger.info(f'Training model on {len(data_dictionary["train_features"])} data points')
model = self.fit(data_dictionary)

View File

@@ -1,7 +1,6 @@
import logging
from typing import Any, Dict # , Tuple
from typing import Any, Dict
# import numpy.typing as npt
import torch as th
from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
from freqtrade.freqai.RL.Base5ActionRLEnv import Actions, Base5ActionRLEnv, Positions
@@ -22,12 +21,18 @@ class ReinforcementLearner(BaseReinforcementLearningModel):
total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df)
policy_kwargs = dict(activation_fn=th.nn.ReLU,
net_arch=[256, 256, 128])
net_arch=[512, 512, 256])
model = self.MODELCLASS(self.policy_type, self.train_env, policy_kwargs=policy_kwargs,
tensorboard_log=Path(dk.data_path / "tensorboard"),
**self.freqai_info['model_training_parameters']
)
if dk.pair not in self.dd.model_dictionary or not self.continual_retraining:
model = self.MODELCLASS(self.policy_type, self.train_env, policy_kwargs=policy_kwargs,
tensorboard_log=Path(dk.data_path / "tensorboard"),
**self.freqai_info['model_training_parameters']
)
else:
logger.info('Continual training activated - starting training from previously '
'trained agent.')
model = self.dd.model_dictionary[dk.pair]
model.set_env(self.train_env)
model.learn(
total_timesteps=int(total_timesteps),
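The hunk above cuts off inside the learn() call. A hedged sketch of how the method plausibly continues, based on the EvalCallback and best_model_save_path wired up earlier in BaseReinforcementLearningModel; the callback argument and the best-checkpoint reload are assumptions, not verbatim from the commit:

    callback=self.eval_callback,  # assumption: the EvalCallback built in set_train_and_eval_environments
)
# EvalCallback writes best_model.zip under its best_model_save_path (dk.data_path)
best_model_path = Path(dk.data_path) / "best_model.zip"
if best_model_path.is_file():
    model = self.MODELCLASS.load(Path(dk.data_path) / "best_model")
return model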