From c0cee5df07ac18d7f870385586e9007ccc74024b Mon Sep 17 00:00:00 2001
From: robcaulk
Date: Wed, 24 Aug 2022 12:54:02 +0200
Subject: [PATCH] add continual retraining feature, handle mypy typing reqs, improve docstrings

---
 config_examples/config_freqai-rl.example.json | 3 +-
 freqtrade/freqai/RL/Base3ActionRLEnv.py | 618 +++++++++---------
 freqtrade/freqai/RL/Base5ActionRLEnv.py | 38 +-
 .../RL/BaseReinforcementLearningModel.py | 42 +-
 .../ReinforcementLearnerCustomAgent.py | 10 +-
 freqtrade/freqai/data_drawer.py | 4 +
 .../ReinforcementLearningExample5ac.py | 3 +-
 .../prediction_models/BaseClassifierModel.py | 4 +-
 .../prediction_models/BaseRegressionModel.py | 4 +-
 .../prediction_models/BaseTensorFlowModel.py | 4 +-
 .../prediction_models/ReinforcementLearner.py | 19 +-
 11 files changed, 387 insertions(+), 362 deletions(-)
 rename freqtrade/freqai/{prediction_models => RL}/ReinforcementLearnerCustomAgent.py (95%)

diff --git a/config_examples/config_freqai-rl.example.json b/config_examples/config_freqai-rl.example.json
index bb67b44b6..b3f8737be 100644
--- a/config_examples/config_freqai-rl.example.json
+++ b/config_examples/config_freqai-rl.example.json
@@ -85,12 +85,13 @@
         "verbose": 1
     },
     "rl_config": {
-        "train_cycles": 10,
+        "train_cycles": 3,
         "eval_cycles": 3,
         "thread_count": 4,
         "max_trade_duration_candles": 100,
         "model_type": "PPO",
         "policy_type": "MlpPolicy",
+        "continual_retraining": true,
         "model_reward_parameters": {
             "rr": 1,
             "profit_aim": 0.02,
diff --git a/freqtrade/freqai/RL/Base3ActionRLEnv.py b/freqtrade/freqai/RL/Base3ActionRLEnv.py
index df53c729b..cddd2f6f9 100644
--- a/freqtrade/freqai/RL/Base3ActionRLEnv.py
+++ b/freqtrade/freqai/RL/Base3ActionRLEnv.py
@@ -1,330 +1,330 @@
-import logging
-from enum import Enum
+# import logging
+# from enum import Enum

-import gym
-import numpy as np
-import pandas as pd
-from gym import spaces
-from gym.utils import seeding
-from pandas import DataFrame
+# import gym
+# import numpy as np
+# import pandas as pd
+# from gym import spaces
+# from gym.utils import seeding
+# from pandas import DataFrame

-# from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
+# # from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union

-logger = logging.getLogger(__name__)
+# logger = logging.getLogger(__name__)

-class Actions(Enum):
-    Short = 0
-    Long = 1
-    Neutral = 2
+# class Actions(Enum):
+#     Short = 0
+#     Long = 1
+#     Neutral = 2

-class Positions(Enum):
-    Short = 0
-    Long = 1
-    Neutral = 0.5
+# class Positions(Enum):
+#     Short = 0
+#     Long = 1
+#     Neutral = 0.5

-    def opposite(self):
-        return Positions.Short if self == Positions.Long else Positions.Long
+#     def opposite(self):
+#         return Positions.Short if self == Positions.Long else Positions.Long

-def mean_over_std(x):
-    std = np.std(x, ddof=1)
-    mean = np.mean(x)
-    return mean / std if std > 0 else 0
+# def mean_over_std(x):
+#     std = np.std(x, ddof=1)
+#     mean = np.mean(x)
+#     return mean / std if std > 0 else 0

-class Base3ActionRLEnv(gym.Env):
+# class Base3ActionRLEnv(gym.Env):

-    metadata = {'render.modes': ['human']}
+#     metadata = {'render.modes': ['human']}

-    def __init__(self, df: DataFrame = DataFrame(), prices: DataFrame = DataFrame(),
-                 reward_kwargs: dict = {}, window_size=10, starting_point=True,
-                 id: str = 'baseenv-1', seed: int = 1):
-        assert df.ndim == 2
+#     def __init__(self, df: DataFrame = DataFrame(), prices: DataFrame = DataFrame(),
+#                  reward_kwargs: dict = {}, window_size=10, starting_point=True,
+#                  id: str = 'baseenv-1', seed: int = 1):
+#         assert
df.ndim == 2 - self.id = id - self.seed(seed) - self.reset_env(df, prices, window_size, reward_kwargs, starting_point) +# self.id = id +# self.seed(seed) +# self.reset_env(df, prices, window_size, reward_kwargs, starting_point) - def reset_env(self, df, prices, window_size, reward_kwargs, starting_point=True): - self.df = df - self.signal_features = self.df - self.prices = prices - self.window_size = window_size - self.starting_point = starting_point - self.rr = reward_kwargs["rr"] - self.profit_aim = reward_kwargs["profit_aim"] +# def reset_env(self, df, prices, window_size, reward_kwargs, starting_point=True): +# self.df = df +# self.signal_features = self.df +# self.prices = prices +# self.window_size = window_size +# self.starting_point = starting_point +# self.rr = reward_kwargs["rr"] +# self.profit_aim = reward_kwargs["profit_aim"] - self.fee = 0.0015 +# self.fee = 0.0015 - # # spaces - self.shape = (window_size, self.signal_features.shape[1] + 2) - self.action_space = spaces.Discrete(len(Actions)) - self.observation_space = spaces.Box( - low=-np.inf, high=np.inf, shape=self.shape, dtype=np.float32) - - # episode - self._start_tick = self.window_size - self._end_tick = len(self.prices) - 1 - self._done = None - self._current_tick = None - self._last_trade_tick = None - self._position = Positions.Neutral - self._position_history = None - self.total_reward = None - self._total_profit = None - self._first_rendering = None - self.history = None - self.trade_history = [] - - def seed(self, seed: int = 1): - self.np_random, seed = seeding.np_random(seed) - return [seed] - - def reset(self): - - self._done = False - - if self.starting_point is True: - self._position_history = (self._start_tick * [None]) + [self._position] - else: - self._position_history = (self.window_size * [None]) + [self._position] - - self._current_tick = self._start_tick - self._last_trade_tick = None - self._position = Positions.Neutral - - self.total_reward = 0. - self._total_profit = 1. 
# unit - self._first_rendering = True - self.history = {} - self.trade_history = [] - self.portfolio_log_returns = np.zeros(len(self.prices)) - - self._profits = [(self._start_tick, 1)] - self.close_trade_profit = [] - - return self._get_observation() - - def step(self, action: int): - self._done = False - self._current_tick += 1 - - if self._current_tick == self._end_tick: - self._done = True - - self.update_portfolio_log_returns(action) - - self._update_profit(action) - step_reward = self.calculate_reward(action) - self.total_reward += step_reward - - trade_type = None - if self.is_tradesignal(action): # exclude 3 case not trade - # Update position - """ - Action: Neutral, position: Long -> Close Long - Action: Neutral, position: Short -> Close Short - - Action: Long, position: Neutral -> Open Long - Action: Long, position: Short -> Close Short and Open Long - - Action: Short, position: Neutral -> Open Short - Action: Short, position: Long -> Close Long and Open Short - """ - - if action == Actions.Neutral.value: - self._position = Positions.Neutral - trade_type = "neutral" - elif action == Actions.Long.value: - self._position = Positions.Long - trade_type = "long" - elif action == Actions.Short.value: - self._position = Positions.Short - trade_type = "short" - else: - print("case not defined") - - # Update last trade tick - self._last_trade_tick = self._current_tick - - if trade_type is not None: - self.trade_history.append( - {'price': self.current_price(), 'index': self._current_tick, - 'type': trade_type}) - - if self._total_profit < 0.2: - self._done = True - - self._position_history.append(self._position) - observation = self._get_observation() - info = dict( - tick=self._current_tick, - total_reward=self.total_reward, - total_profit=self._total_profit, - position=self._position.value - ) - self._update_history(info) - - return observation, step_reward, self._done, info - - def _get_observation(self): - features_window = self.signal_features[( - self._current_tick - self.window_size):self._current_tick] - features_and_state = DataFrame(np.zeros((len(features_window), 2)), - columns=['current_profit_pct', 'position'], - index=features_window.index) - - features_and_state['current_profit_pct'] = self.get_unrealized_profit() - features_and_state['position'] = self._position.value - features_and_state = pd.concat([features_window, features_and_state], axis=1) - return features_and_state - - def get_unrealized_profit(self): - - if self._last_trade_tick is None: - return 0. - - if self._position == Positions.Neutral: - return 0. - elif self._position == Positions.Short: - current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) - last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) - return (last_trade_price - current_price) / last_trade_price - elif self._position == Positions.Long: - current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) - last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) - return (current_price - last_trade_price) / last_trade_price - else: - return 0. 
- - def is_tradesignal(self, action: int): - # trade signal - """ - not trade signal is : - Action: Neutral, position: Neutral -> Nothing - Action: Long, position: Long -> Hold Long - Action: Short, position: Short -> Hold Short - """ - return not ((action == Actions.Neutral.value and self._position == Positions.Neutral) - or (action == Actions.Short.value and self._position == Positions.Short) - or (action == Actions.Long.value and self._position == Positions.Long)) - - def _is_trade(self, action: Actions): - return ((action == Actions.Long.value and self._position == Positions.Short) or - (action == Actions.Short.value and self._position == Positions.Long) or - (action == Actions.Neutral.value and self._position == Positions.Long) or - (action == Actions.Neutral.value and self._position == Positions.Short) - ) - - def is_hold(self, action): - return ((action == Actions.Short.value and self._position == Positions.Short) - or (action == Actions.Long.value and self._position == Positions.Long)) - - def add_buy_fee(self, price): - return price * (1 + self.fee) - - def add_sell_fee(self, price): - return price / (1 + self.fee) - - def _update_history(self, info): - if not self.history: - self.history = {key: [] for key in info.keys()} - - for key, value in info.items(): - self.history[key].append(value) - - def get_sharpe_ratio(self): - return mean_over_std(self.get_portfolio_log_returns()) - - def calculate_reward(self, action): - - if self._last_trade_tick is None: - return 0. - - # close long - if (action == Actions.Short.value or - action == Actions.Neutral.value) and self._position == Positions.Long: - last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) - current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) - return float(np.log(current_price) - np.log(last_trade_price)) - - # close short - if (action == Actions.Long.value or - action == Actions.Neutral.value) and self._position == Positions.Short: - last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) - current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) - return float(np.log(last_trade_price) - np.log(current_price)) - - return 0. - - def _update_profit(self, action): - if self._is_trade(action) or self._done: - pnl = self.get_unrealized_profit() - - if self._position == Positions.Long: - self._total_profit = self._total_profit + self._total_profit * pnl - self._profits.append((self._current_tick, self._total_profit)) - self.close_trade_profit.append(pnl) - - if self._position == Positions.Short: - self._total_profit = self._total_profit + self._total_profit * pnl - self._profits.append((self._current_tick, self._total_profit)) - self.close_trade_profit.append(pnl) - - def most_recent_return(self, action: int): - """ - We support Long, Neutral and Short positions. - Return is generated from rising prices in Long - and falling prices in Short positions. - The actions Sell/Buy or Hold during a Long position trigger the sell/buy-fee. 
- """ - # Long positions - if self._position == Positions.Long: - current_price = self.prices.iloc[self._current_tick].open - if action == Actions.Short.value or action == Actions.Neutral.value: - current_price = self.add_sell_fee(current_price) - - previous_price = self.prices.iloc[self._current_tick - 1].open - - if (self._position_history[self._current_tick - 1] == Positions.Short - or self._position_history[self._current_tick - 1] == Positions.Neutral): - previous_price = self.add_buy_fee(previous_price) - - return np.log(current_price) - np.log(previous_price) - - # Short positions - if self._position == Positions.Short: - current_price = self.prices.iloc[self._current_tick].open - if action == Actions.Long.value or action == Actions.Neutral.value: - current_price = self.add_buy_fee(current_price) - - previous_price = self.prices.iloc[self._current_tick - 1].open - if (self._position_history[self._current_tick - 1] == Positions.Long - or self._position_history[self._current_tick - 1] == Positions.Neutral): - previous_price = self.add_sell_fee(previous_price) - - return np.log(previous_price) - np.log(current_price) - - return 0 - - def get_portfolio_log_returns(self): - return self.portfolio_log_returns[1:self._current_tick + 1] - - def update_portfolio_log_returns(self, action): - self.portfolio_log_returns[self._current_tick] = self.most_recent_return(action) - - def current_price(self) -> float: - return self.prices.iloc[self._current_tick].open +# # # spaces +# self.shape = (window_size, self.signal_features.shape[1] + 2) +# self.action_space = spaces.Discrete(len(Actions)) +# self.observation_space = spaces.Box( +# low=-np.inf, high=np.inf, shape=self.shape, dtype=np.float32) + +# # episode +# self._start_tick = self.window_size +# self._end_tick = len(self.prices) - 1 +# self._done = None +# self._current_tick = None +# self._last_trade_tick = None +# self._position = Positions.Neutral +# self._position_history = None +# self.total_reward = None +# self._total_profit = None +# self._first_rendering = None +# self.history = None +# self.trade_history = [] + +# def seed(self, seed: int = 1): +# self.np_random, seed = seeding.np_random(seed) +# return [seed] + +# def reset(self): + +# self._done = False + +# if self.starting_point is True: +# self._position_history = (self._start_tick * [None]) + [self._position] +# else: +# self._position_history = (self.window_size * [None]) + [self._position] + +# self._current_tick = self._start_tick +# self._last_trade_tick = None +# self._position = Positions.Neutral + +# self.total_reward = 0. +# self._total_profit = 1. 
# unit +# self._first_rendering = True +# self.history = {} +# self.trade_history = [] +# self.portfolio_log_returns = np.zeros(len(self.prices)) + +# self._profits = [(self._start_tick, 1)] +# self.close_trade_profit = [] + +# return self._get_observation() + +# def step(self, action: int): +# self._done = False +# self._current_tick += 1 + +# if self._current_tick == self._end_tick: +# self._done = True + +# self.update_portfolio_log_returns(action) + +# self._update_profit(action) +# step_reward = self.calculate_reward(action) +# self.total_reward += step_reward + +# trade_type = None +# if self.is_tradesignal(action): # exclude 3 case not trade +# # Update position +# """ +# Action: Neutral, position: Long -> Close Long +# Action: Neutral, position: Short -> Close Short + +# Action: Long, position: Neutral -> Open Long +# Action: Long, position: Short -> Close Short and Open Long + +# Action: Short, position: Neutral -> Open Short +# Action: Short, position: Long -> Close Long and Open Short +# """ + +# if action == Actions.Neutral.value: +# self._position = Positions.Neutral +# trade_type = "neutral" +# elif action == Actions.Long.value: +# self._position = Positions.Long +# trade_type = "long" +# elif action == Actions.Short.value: +# self._position = Positions.Short +# trade_type = "short" +# else: +# print("case not defined") + +# # Update last trade tick +# self._last_trade_tick = self._current_tick + +# if trade_type is not None: +# self.trade_history.append( +# {'price': self.current_price(), 'index': self._current_tick, +# 'type': trade_type}) + +# if self._total_profit < 0.2: +# self._done = True + +# self._position_history.append(self._position) +# observation = self._get_observation() +# info = dict( +# tick=self._current_tick, +# total_reward=self.total_reward, +# total_profit=self._total_profit, +# position=self._position.value +# ) +# self._update_history(info) + +# return observation, step_reward, self._done, info + +# def _get_observation(self): +# features_window = self.signal_features[( +# self._current_tick - self.window_size):self._current_tick] +# features_and_state = DataFrame(np.zeros((len(features_window), 2)), +# columns=['current_profit_pct', 'position'], +# index=features_window.index) + +# features_and_state['current_profit_pct'] = self.get_unrealized_profit() +# features_and_state['position'] = self._position.value +# features_and_state = pd.concat([features_window, features_and_state], axis=1) +# return features_and_state + +# def get_unrealized_profit(self): + +# if self._last_trade_tick is None: +# return 0. + +# if self._position == Positions.Neutral: +# return 0. +# elif self._position == Positions.Short: +# current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) +# last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) +# return (last_trade_price - current_price) / last_trade_price +# elif self._position == Positions.Long: +# current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) +# last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) +# return (current_price - last_trade_price) / last_trade_price +# else: +# return 0. 
+ +# def is_tradesignal(self, action: int): +# # trade signal +# """ +# not trade signal is : +# Action: Neutral, position: Neutral -> Nothing +# Action: Long, position: Long -> Hold Long +# Action: Short, position: Short -> Hold Short +# """ +# return not ((action == Actions.Neutral.value and self._position == Positions.Neutral) +# or (action == Actions.Short.value and self._position == Positions.Short) +# or (action == Actions.Long.value and self._position == Positions.Long)) + +# def _is_trade(self, action: Actions): +# return ((action == Actions.Long.value and self._position == Positions.Short) or +# (action == Actions.Short.value and self._position == Positions.Long) or +# (action == Actions.Neutral.value and self._position == Positions.Long) or +# (action == Actions.Neutral.value and self._position == Positions.Short) +# ) + +# def is_hold(self, action): +# return ((action == Actions.Short.value and self._position == Positions.Short) +# or (action == Actions.Long.value and self._position == Positions.Long)) + +# def add_buy_fee(self, price): +# return price * (1 + self.fee) + +# def add_sell_fee(self, price): +# return price / (1 + self.fee) + +# def _update_history(self, info): +# if not self.history: +# self.history = {key: [] for key in info.keys()} + +# for key, value in info.items(): +# self.history[key].append(value) + +# def get_sharpe_ratio(self): +# return mean_over_std(self.get_portfolio_log_returns()) + +# def calculate_reward(self, action): + +# if self._last_trade_tick is None: +# return 0. + +# # close long +# if (action == Actions.Short.value or +# action == Actions.Neutral.value) and self._position == Positions.Long: +# last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) +# current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) +# return float(np.log(current_price) - np.log(last_trade_price)) + +# # close short +# if (action == Actions.Long.value or +# action == Actions.Neutral.value) and self._position == Positions.Short: +# last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) +# current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) +# return float(np.log(last_trade_price) - np.log(current_price)) + +# return 0. + +# def _update_profit(self, action): +# if self._is_trade(action) or self._done: +# pnl = self.get_unrealized_profit() + +# if self._position == Positions.Long: +# self._total_profit = self._total_profit + self._total_profit * pnl +# self._profits.append((self._current_tick, self._total_profit)) +# self.close_trade_profit.append(pnl) + +# if self._position == Positions.Short: +# self._total_profit = self._total_profit + self._total_profit * pnl +# self._profits.append((self._current_tick, self._total_profit)) +# self.close_trade_profit.append(pnl) + +# def most_recent_return(self, action: int): +# """ +# We support Long, Neutral and Short positions. +# Return is generated from rising prices in Long +# and falling prices in Short positions. +# The actions Sell/Buy or Hold during a Long position trigger the sell/buy-fee. 
+# """ +# # Long positions +# if self._position == Positions.Long: +# current_price = self.prices.iloc[self._current_tick].open +# if action == Actions.Short.value or action == Actions.Neutral.value: +# current_price = self.add_sell_fee(current_price) + +# previous_price = self.prices.iloc[self._current_tick - 1].open + +# if (self._position_history[self._current_tick - 1] == Positions.Short +# or self._position_history[self._current_tick - 1] == Positions.Neutral): +# previous_price = self.add_buy_fee(previous_price) + +# return np.log(current_price) - np.log(previous_price) + +# # Short positions +# if self._position == Positions.Short: +# current_price = self.prices.iloc[self._current_tick].open +# if action == Actions.Long.value or action == Actions.Neutral.value: +# current_price = self.add_buy_fee(current_price) + +# previous_price = self.prices.iloc[self._current_tick - 1].open +# if (self._position_history[self._current_tick - 1] == Positions.Long +# or self._position_history[self._current_tick - 1] == Positions.Neutral): +# previous_price = self.add_sell_fee(previous_price) + +# return np.log(previous_price) - np.log(current_price) + +# return 0 + +# def get_portfolio_log_returns(self): +# return self.portfolio_log_returns[1:self._current_tick + 1] + +# def update_portfolio_log_returns(self, action): +# self.portfolio_log_returns[self._current_tick] = self.most_recent_return(action) + +# def current_price(self) -> float: +# return self.prices.iloc[self._current_tick].open - def prev_price(self) -> float: - return self.prices.iloc[self._current_tick - 1].open +# def prev_price(self) -> float: +# return self.prices.iloc[self._current_tick - 1].open - def sharpe_ratio(self) -> float: - if len(self.close_trade_profit) == 0: - return 0. - returns = np.array(self.close_trade_profit) - reward = (np.mean(returns) - 0. + 1e-9) / (np.std(returns) + 1e-9) - return reward +# def sharpe_ratio(self) -> float: +# if len(self.close_trade_profit) == 0: +# return 0. +# returns = np.array(self.close_trade_profit) +# reward = (np.mean(returns) - 0. 
+ 1e-9) / (np.std(returns) + 1e-9) +# return reward diff --git a/freqtrade/freqai/RL/Base5ActionRLEnv.py b/freqtrade/freqai/RL/Base5ActionRLEnv.py index a14111495..64d7061fc 100644 --- a/freqtrade/freqai/RL/Base5ActionRLEnv.py +++ b/freqtrade/freqai/RL/Base5ActionRLEnv.py @@ -1,6 +1,6 @@ import logging from enum import Enum -# from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union +from typing import Optional import gym import numpy as np @@ -44,14 +44,14 @@ class Base5ActionRLEnv(gym.Env): def __init__(self, df: DataFrame = DataFrame(), prices: DataFrame = DataFrame(), reward_kwargs: dict = {}, window_size=10, starting_point=True, id: str = 'baseenv-1', seed: int = 1, config: dict = {}): - assert df.ndim == 2 self.rl_config = config['freqai']['rl_config'] self.id = id self.seed(seed) self.reset_env(df, prices, window_size, reward_kwargs, starting_point) - def reset_env(self, df, prices, window_size, reward_kwargs, starting_point=True): + def reset_env(self, df: DataFrame, prices: DataFrame, window_size: int, + reward_kwargs: dict, starting_point=True): self.df = df self.signal_features = self.df self.prices = prices @@ -69,18 +69,18 @@ class Base5ActionRLEnv(gym.Env): low=-np.inf, high=np.inf, shape=self.shape, dtype=np.float32) # episode - self._start_tick = self.window_size - self._end_tick = len(self.prices) - 1 - self._done = None - self._current_tick = None - self._last_trade_tick = None + self._start_tick: int = self.window_size + self._end_tick: int = len(self.prices) - 1 + self._done: bool = False + self._current_tick: int = self._start_tick + self._last_trade_tick: Optional[int] = None self._position = Positions.Neutral - self._position_history = None - self.total_reward = None - self._total_profit = None - self._first_rendering = None - self.history = None - self.trade_history = [] + self._position_history: list = [None] + self.total_reward: float = 0 + self._total_profit: float = 0 + self._first_rendering: bool = False + self.history: dict = {} + self.trade_history: list = [] def seed(self, seed: int = 1): self.np_random, seed = seeding.np_random(seed) @@ -125,8 +125,7 @@ class Base5ActionRLEnv(gym.Env): self.total_reward += step_reward trade_type = None - if self.is_tradesignal(action): # exclude 3 case not trade - # Update position + if self.is_tradesignal(action): """ Action: Neutral, position: Long -> Close Long Action: Neutral, position: Short -> Close Short @@ -223,9 +222,8 @@ class Base5ActionRLEnv(gym.Env): # trade signal """ not trade signal is : - Action: Neutral, position: Neutral -> Nothing - Action: Long, position: Long -> Hold Long - Action: Short, position: Short -> Hold Short + Determine if the signal is non sensical + e.g.: agent wants a Actions.Long_exit while it is in a Positions.short """ return not ((action == Actions.Neutral.value and self._position == Positions.Neutral) or (action == Actions.Neutral.value and self._position == Positions.Short) or @@ -292,7 +290,7 @@ class Base5ActionRLEnv(gym.Env): def most_recent_return(self, action: int): """ - We support Long, Neutral and Short positions. + Calculate the tick to tick return if in a trade. Return is generated from rising prices in Long and falling prices in Short positions. The actions Sell/Buy or Hold during a Long position trigger the sell/buy-fee. 
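
The revised is_tradesignal() docstring above describes the check as filtering out
nonsensical agent actions, e.g. an Actions.Long_exit request while the environment is in
Positions.Short. The standalone Python sketch below illustrates that idea for the
five-action environment. It is only an illustration: the Actions values and the exact
branch list are assumptions inferred from the names visible in this diff, and
Base5ActionRLEnv may treat some edge cases differently.

from enum import Enum


class Actions(Enum):
    # assumed ordering of the five actions used by Base5ActionRLEnv
    Neutral = 0
    Long_enter = 1
    Long_exit = 2
    Short_enter = 3
    Short_exit = 4


class Positions(Enum):
    Short = 0
    Long = 1
    Neutral = 0.5


def is_tradesignal(action: int, position: Positions) -> bool:
    """Return False for actions that make no sense in the current position."""
    nonsensical = (
        # Neutral never opens or closes anything, so it is always a hold
        action == Actions.Neutral.value
        # entering a position that is already open
        or (action == Actions.Long_enter.value and position == Positions.Long)
        or (action == Actions.Short_enter.value and position == Positions.Short)
        # exiting a position that is not open
        or (action == Actions.Long_exit.value and position != Positions.Long)
        or (action == Actions.Short_exit.value and position != Positions.Short)
    )
    return not nonsensical


if __name__ == "__main__":
    # the docstring example: a Long_exit while short is filtered out
    assert is_tradesignal(Actions.Long_exit.value, Positions.Short) is False
    assert is_tradesignal(Actions.Long_enter.value, Positions.Neutral) is True
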
diff --git a/freqtrade/freqai/RL/BaseReinforcementLearningModel.py b/freqtrade/freqai/RL/BaseReinforcementLearningModel.py index a9a1377a8..6660709bd 100644 --- a/freqtrade/freqai/RL/BaseReinforcementLearningModel.py +++ b/freqtrade/freqai/RL/BaseReinforcementLearningModel.py @@ -19,6 +19,7 @@ from typing import Callable from datetime import datetime, timezone from stable_baselines3.common.utils import set_random_seed import gym +from pathlib import Path logger = logging.getLogger(__name__) torch.multiprocessing.set_sharing_strategy('file_system') @@ -40,6 +41,8 @@ class BaseReinforcementLearningModel(IFreqaiModel): self.eval_env: Base5ActionRLEnv = None self.eval_callback: EvalCallback = None self.model_type = self.freqai_info['rl_config']['model_type'] + self.rl_config = self.freqai_info['rl_config'] + self.continual_retraining = self.rl_config['continual_retraining'] if self.model_type in SB3_MODELS: import_str = 'stable_baselines3' elif self.model_type in SB3_CONTRIB_MODELS: @@ -68,7 +71,6 @@ class BaseReinforcementLearningModel(IFreqaiModel): logger.info("--------------------Starting training " f"{pair} --------------------") - # filter the features requested by user in the configuration file and elegantly handle NaNs features_filtered, labels_filtered = dk.filter_features( unfiltered_dataframe, dk.training_features_list, @@ -78,19 +80,19 @@ class BaseReinforcementLearningModel(IFreqaiModel): data_dictionary: Dict[str, Any] = dk.make_train_test_datasets( features_filtered, labels_filtered) - dk.fit_labels() # useless for now, but just satiating append methods + dk.fit_labels() # FIXME useless for now, but just satiating append methods # normalize all data based on train_dataset only prices_train, prices_test = self.build_ohlc_price_dataframes(dk.data_dictionary, pair, dk) data_dictionary = dk.normalize_data(data_dictionary) - # optional additional data cleaning/analysis + # data cleaning/analysis self.data_cleaning_train(dk) logger.info( - f'Training model on {len(dk.data_dictionary["train_features"].columns)}' " features" + f'Training model on {len(dk.data_dictionary["train_features"].columns)}' + f' features and {len(data_dictionary["train_features"])} data points' ) - logger.info(f'Training model on {len(data_dictionary["train_features"])} data points') self.set_train_and_eval_environments(data_dictionary, prices_train, prices_test, dk) @@ -100,9 +102,11 @@ class BaseReinforcementLearningModel(IFreqaiModel): return model - def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test, dk): + def set_train_and_eval_environments(self, data_dictionary: Dict[str, DataFrame], + prices_train: DataFrame, prices_test: DataFrame, + dk: FreqaiDataKitchen): """ - User overrides this as shown here if they are using a custom MyRLEnv + User can override this if they are using a custom MyRLEnv """ train_df = data_dictionary["train_features"] test_df = data_dictionary["test_features"] @@ -114,18 +118,22 @@ class BaseReinforcementLearningModel(IFreqaiModel): reward_kwargs=self.reward_params, config=self.config) self.eval_env = Monitor(MyRLEnv(df=test_df, prices=prices_test, window_size=self.CONV_WIDTH, - reward_kwargs=self.reward_params, config=self.config), ".") + reward_kwargs=self.reward_params, config=self.config), + str(Path(dk.data_path / 'monitor'))) self.eval_callback = EvalCallback(self.eval_env, deterministic=True, render=False, eval_freq=eval_freq, - best_model_save_path=dk.data_path) + best_model_save_path=str(dk.data_path)) else: self.train_env.reset() 
self.eval_env.reset() self.train_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params) self.eval_env.reset_env(test_df, prices_test, self.CONV_WIDTH, self.reward_params) + # self.eval_callback.eval_env = self.eval_env + # self.eval_callback.best_model_save_path = str(dk.data_path) + # self.eval_callback._init_callback() self.eval_callback.__init__(self.eval_env, deterministic=True, render=False, eval_freq=eval_freq, - best_model_save_path=dk.data_path) + best_model_save_path=str(dk.data_path)) @abstractmethod def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen): @@ -137,19 +145,20 @@ class BaseReinforcementLearningModel(IFreqaiModel): return - def get_state_info(self, pair): + def get_state_info(self, pair: str): open_trades = Trade.get_trades_proxy(is_open=True) market_side = 0.5 - current_profit = 0 + current_profit: float = 0 trade_duration = 0 for trade in open_trades: if trade.pair == pair: + # FIXME: mypy typing doesnt like that strategy may be "None" (it never will be) current_value = self.strategy.dp._exchange.get_rate( pair, refresh=False, side="exit", is_short=trade.is_short) openrate = trade.open_rate now = datetime.now(timezone.utc).timestamp() - trade_duration = (now - trade.open_date.timestamp()) / self.base_tf_seconds - if 'long' in trade.enter_tag: + trade_duration = int((now - trade.open_date.timestamp()) / self.base_tf_seconds) + if 'long' in str(trade.enter_tag): market_side = 1 current_profit = (current_value - openrate) / openrate else: @@ -245,8 +254,9 @@ class BaseReinforcementLearningModel(IFreqaiModel): return -def make_env(env_id: str, rank: int, seed: int, train_df, price, - reward_params, window_size, monitor=False, config={}) -> Callable: +def make_env(env_id: str, rank: int, seed: int, train_df: DataFrame, price: DataFrame, + reward_params: Dict[str, int], window_size: int, monitor: bool = False, + config: Dict[str, Any] = {}) -> Callable: """ Utility function for multiprocessed env. diff --git a/freqtrade/freqai/prediction_models/ReinforcementLearnerCustomAgent.py b/freqtrade/freqai/RL/ReinforcementLearnerCustomAgent.py similarity index 95% rename from freqtrade/freqai/prediction_models/ReinforcementLearnerCustomAgent.py rename to freqtrade/freqai/RL/ReinforcementLearnerCustomAgent.py index bb16b612b..fcd813ce6 100644 --- a/freqtrade/freqai/prediction_models/ReinforcementLearnerCustomAgent.py +++ b/freqtrade/freqai/RL/ReinforcementLearnerCustomAgent.py @@ -22,6 +22,12 @@ class ReinforcementLearnerCustomAgent(BaseReinforcementLearningModel): """ User can customize agent by defining the class and using it directly. Here the example is "TDQN" + + Warning! + This is an advanced example of how a user may create and use a highly + customized model class (which can inherit from existing classes, + similar to how the example below inherits from DQN). + This file is for example purposes only, and should not be run. 
""" def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen): @@ -34,7 +40,7 @@ class ReinforcementLearnerCustomAgent(BaseReinforcementLearningModel): # TDQN is a custom agent defined below model = TDQN(self.policy_type, self.train_env, - tensorboard_log=Path(dk.data_path / "tensorboard"), + tensorboard_log=str(Path(dk.data_path / "tensorboard")), policy_kwargs=policy_kwargs, **self.freqai_info['model_training_parameters'] ) @@ -217,7 +223,7 @@ class TDQN(DQN): exploration_initial_eps: float = 1.0, exploration_final_eps: float = 0.05, max_grad_norm: float = 10, - tensorboard_log: Optional[Path] = None, + tensorboard_log: Optional[str] = None, create_eval_env: bool = False, policy_kwargs: Optional[Dict[str, Any]] = None, verbose: int = 1, diff --git a/freqtrade/freqai/data_drawer.py b/freqtrade/freqai/data_drawer.py index c37973551..ae3e92f5e 100644 --- a/freqtrade/freqai/data_drawer.py +++ b/freqtrade/freqai/data_drawer.py @@ -485,6 +485,10 @@ class FreqaiDataDrawer: f"Unable to load model, ensure model exists at " f"{dk.data_path} " ) + # load it into ram if it was loaded from disk + if coin not in self.model_dictionary: + self.model_dictionary[coin] = model + if self.config["freqai"]["feature_parameters"]["principal_component_analysis"]: dk.pca = cloudpickle.load( open(dk.data_path / f"{dk.model_filename}_pca_object.pkl", "rb") diff --git a/freqtrade/freqai/example_strats/ReinforcementLearningExample5ac.py b/freqtrade/freqai/example_strats/ReinforcementLearningExample5ac.py index 437b53b05..15a263b94 100644 --- a/freqtrade/freqai/example_strats/ReinforcementLearningExample5ac.py +++ b/freqtrade/freqai/example_strats/ReinforcementLearningExample5ac.py @@ -76,7 +76,8 @@ class ReinforcementLearningExample5ac(IStrategy): informative[f"%-{coin}pct-change"] = informative["close"].pct_change() informative[f"%-{coin}raw_volume"] = informative["volume"] - # The following features are necessary for RL models + # FIXME: add these outside the user strategy? + # The following columns are necessary for RL models. 
informative[f"%-{coin}raw_close"] = informative["close"] informative[f"%-{coin}raw_open"] = informative["open"] informative[f"%-{coin}raw_high"] = informative["high"] diff --git a/freqtrade/freqai/prediction_models/BaseClassifierModel.py b/freqtrade/freqai/prediction_models/BaseClassifierModel.py index 2edbf3b51..042f43199 100644 --- a/freqtrade/freqai/prediction_models/BaseClassifierModel.py +++ b/freqtrade/freqai/prediction_models/BaseClassifierModel.py @@ -57,9 +57,9 @@ class BaseClassifierModel(IFreqaiModel): self.data_cleaning_train(dk) logger.info( - f'Training model on {len(dk.data_dictionary["train_features"].columns)}' " features" + f'Training model on {len(dk.data_dictionary["train_features"].columns)}' + f' features and {len(data_dictionary["train_features"])} data points' ) - logger.info(f'Training model on {len(data_dictionary["train_features"])} data points') model = self.fit(data_dictionary) diff --git a/freqtrade/freqai/prediction_models/BaseRegressionModel.py b/freqtrade/freqai/prediction_models/BaseRegressionModel.py index 2ef175a2e..6ca9ae8cb 100644 --- a/freqtrade/freqai/prediction_models/BaseRegressionModel.py +++ b/freqtrade/freqai/prediction_models/BaseRegressionModel.py @@ -56,9 +56,9 @@ class BaseRegressionModel(IFreqaiModel): self.data_cleaning_train(dk) logger.info( - f'Training model on {len(dk.data_dictionary["train_features"].columns)}' " features" + f'Training model on {len(dk.data_dictionary["train_features"].columns)}' + f' features and {len(data_dictionary["train_features"])} data points' ) - logger.info(f'Training model on {len(data_dictionary["train_features"])} data points') model = self.fit(data_dictionary) diff --git a/freqtrade/freqai/prediction_models/BaseTensorFlowModel.py b/freqtrade/freqai/prediction_models/BaseTensorFlowModel.py index 04eff045f..6a842f007 100644 --- a/freqtrade/freqai/prediction_models/BaseTensorFlowModel.py +++ b/freqtrade/freqai/prediction_models/BaseTensorFlowModel.py @@ -53,9 +53,9 @@ class BaseTensorFlowModel(IFreqaiModel): self.data_cleaning_train(dk) logger.info( - f'Training model on {len(dk.data_dictionary["train_features"].columns)}' " features" + f'Training model on {len(dk.data_dictionary["train_features"].columns)}' + f' features and {len(data_dictionary["train_features"])} data points' ) - logger.info(f'Training model on {len(data_dictionary["train_features"])} data points') model = self.fit(data_dictionary) diff --git a/freqtrade/freqai/prediction_models/ReinforcementLearner.py b/freqtrade/freqai/prediction_models/ReinforcementLearner.py index d3e6bde7c..254fd32b0 100644 --- a/freqtrade/freqai/prediction_models/ReinforcementLearner.py +++ b/freqtrade/freqai/prediction_models/ReinforcementLearner.py @@ -1,7 +1,6 @@ import logging -from typing import Any, Dict # , Tuple +from typing import Any, Dict -# import numpy.typing as npt import torch as th from freqtrade.freqai.data_kitchen import FreqaiDataKitchen from freqtrade.freqai.RL.Base5ActionRLEnv import Actions, Base5ActionRLEnv, Positions @@ -22,12 +21,18 @@ class ReinforcementLearner(BaseReinforcementLearningModel): total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df) policy_kwargs = dict(activation_fn=th.nn.ReLU, - net_arch=[256, 256, 128]) + net_arch=[512, 512, 256]) - model = self.MODELCLASS(self.policy_type, self.train_env, policy_kwargs=policy_kwargs, - tensorboard_log=Path(dk.data_path / "tensorboard"), - **self.freqai_info['model_training_parameters'] - ) + if dk.pair not in self.dd.model_dictionary or not 
self.continual_retraining:
+            model = self.MODELCLASS(self.policy_type, self.train_env, policy_kwargs=policy_kwargs,
+                                    tensorboard_log=Path(dk.data_path / "tensorboard"),
+                                    **self.freqai_info['model_training_parameters']
+                                    )
+        else:
+            logger.info('Continual training activated - starting training from previously '
+                        'trained agent.')
+            model = self.dd.model_dictionary[dk.pair]
+            model.set_env(self.train_env)

         model.learn(
             total_timesteps=int(total_timesteps),
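
Taken together with the data_drawer change above (which now keeps a disk-loaded model in
self.model_dictionary) and the new rl_config["continual_retraining"] flag in the example
config, this branch lets each retraining cycle continue from the previously trained agent
instead of starting from scratch. The sketch below is a minimal, self-contained
illustration of that create-or-reuse pattern; StubModel, StubEnv and get_model_for_pair
are hypothetical stand-ins (not freqtrade or stable-baselines3 APIs) so the example runs
on its own.

from typing import Any, Dict


class StubEnv:
    """Stand-in for a training environment such as Base5ActionRLEnv."""


class StubModel:
    """Stand-in for an SB3 agent such as PPO."""

    def __init__(self) -> None:
        self.env: Any = None
        self.timesteps_trained = 0

    def set_env(self, env: StubEnv) -> None:
        self.env = env

    def learn(self, total_timesteps: int) -> None:
        self.timesteps_trained += total_timesteps


def get_model_for_pair(pair: str,
                       model_dictionary: Dict[str, StubModel],
                       train_env: StubEnv,
                       continual_retraining: bool) -> StubModel:
    """Create a fresh agent, or continue training the cached one."""
    if pair not in model_dictionary or not continual_retraining:
        model = StubModel()              # fresh agent for this pair
    else:
        model = model_dictionary[pair]   # reuse the previously trained agent
    model.set_env(train_env)             # point it at the new training window
    model_dictionary[pair] = model       # keep it in RAM for the next cycle
    return model


if __name__ == "__main__":
    cache: Dict[str, StubModel] = {}
    env = StubEnv()
    m1 = get_model_for_pair("BTC/USDT", cache, env, continual_retraining=True)
    m1.learn(1000)
    m2 = get_model_for_pair("BTC/USDT", cache, env, continual_retraining=True)
    assert m2 is m1 and m2.timesteps_trained == 1000
    m2.learn(1000)
    print(m2.timesteps_trained)  # 2000: training resumed from the cached agent

In the real ReinforcementLearner, the cached object is the stable-baselines3 agent itself
(e.g. PPO), and set_env() points it at the freshly built training environment before
learn() resumes training.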