reuse callback, allow user to access all stable_baselines3 agents via config

2022-08-20 16:35:29 +02:00
parent 4b9499e321
commit 3eb897c2f8
11 changed files with 295 additions and 587 deletions
@@ -266,59 +266,28 @@ class Base5ActionRLEnv(gym.Env):

        # close long
        if action == Actions.Long_exit.value and self._position == Positions.Long:
-            if len(self.close_trade_profit):
-                # aim x2 rw
-                if self.close_trade_profit[-1] > self.profit_aim * self.rr:
-                    last_trade_price = self.add_buy_fee(
-                        self.prices.iloc[self._last_trade_tick].open)
-                    current_price = self.add_sell_fee(
-                        self.prices.iloc[self._current_tick].open)
-                    return float((np.log(current_price) - np.log(last_trade_price)) * 2)
-                # less than aim x1 rw
-                elif self.close_trade_profit[-1] < self.profit_aim * self.rr:
-                    last_trade_price = self.add_buy_fee(
-                        self.prices.iloc[self._last_trade_tick].open
-                    )
-                    current_price = self.add_sell_fee(
-                        self.prices.iloc[self._current_tick].open
-                    )
-                    return float(np.log(current_price) - np.log(last_trade_price))
-                # # less than RR SL x2 neg rw
-                # elif self.close_trade_profit[-1] < (self.profit_aim * -1):
-                #     last_trade_price = self.add_buy_fee(
-                #         self.prices.iloc[self._last_trade_tick].open)
-                #     current_price = self.add_sell_fee(
-                #         self.prices.iloc[self._current_tick].open)
-                #     return float((np.log(current_price) - np.log(last_trade_price)) * 2) * -1
+            last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
+            current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
+            return float(np.log(current_price) - np.log(last_trade_price))
+
+        if action == Actions.Long_exit.value and self._position == Positions.Long:
+            if self.close_trade_profit[-1] > self.profit_aim * self.rr:
+                last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
+                current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
+                return float((np.log(current_price) - np.log(last_trade_price)) * 2)

        # close short
        if action == Actions.Short_exit.value and self._position == Positions.Short:
-            if len(self.close_trade_profit):
-                # aim x2 rw
-                if self.close_trade_profit[-1] > self.profit_aim * self.rr:
-                    last_trade_price = self.add_sell_fee(
-                        self.prices.iloc[self._last_trade_tick].open
-                    )
-                    current_price = self.add_buy_fee(
-                        self.prices.iloc[self._current_tick].open
-                    )
-                    return float((np.log(last_trade_price) - np.log(current_price)) * 2)
-                # less than aim x1 rw
-                elif self.close_trade_profit[-1] < self.profit_aim * self.rr:
-                    last_trade_price = self.add_sell_fee(
-                        self.prices.iloc[self._last_trade_tick].open
-                    )
-                    current_price = self.add_buy_fee(
-                        self.prices.iloc[self._current_tick].open
-                    )
-                    return float(np.log(last_trade_price) - np.log(current_price))
-                # # less than RR SL x2 neg rw
-                # elif self.close_trade_profit[-1] > self.profit_aim * self.rr:
-                #     last_trade_price = self.add_sell_fee(
-                #         self.prices.iloc[self._last_trade_tick].open)
-                #     current_price = self.add_buy_fee(
-                #         self.prices.iloc[self._current_tick].open)
-                #     return float((np.log(last_trade_price) - np.log(current_price)) * 2) * -1
+            last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
+            current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
+            return float(np.log(last_trade_price) - np.log(current_price))
+
+        if action == Actions.Short_exit.value and self._position == Positions.Short:
+            if self.close_trade_profit[-1] > self.profit_aim * self.rr:
+                last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
+                current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
+                return float((np.log(last_trade_price) - np.log(current_price)) * 2)
+
        return 0.

    def _update_profit(self, action):
@@ -11,8 +11,12 @@ from freqtrade.freqai.freqai_interface import IFreqaiModel
 from freqtrade.freqai.RL.Base5ActionRLEnv import Base5ActionRLEnv, Actions, Positions
 from freqtrade.persistence import Trade
 import torch.multiprocessing
+from stable_baselines3.common.callbacks import EvalCallback
 from stable_baselines3.common.monitor import Monitor
 import torch as th
+from typing import Callable
+from stable_baselines3.common.utils import set_random_seed
+import gym
 logger = logging.getLogger(__name__)

 torch.multiprocessing.set_sharing_strategy('file_system')
@@ -25,9 +29,15 @@ class BaseReinforcementLearningModel(IFreqaiModel):

    def __init__(self, **kwargs):
        super().__init__(config=kwargs['config'])
-        th.set_num_threads(self.freqai_info.get('data_kitchen_thread_count', 4))
+        th.set_num_threads(self.freqai_info['rl_config'].get('thread_count', 4))
        self.reward_params = self.freqai_info['rl_config']['model_reward_parameters']
        self.train_env: Base5ActionRLEnv = None
+        self.eval_env: Base5ActionRLEnv = None
+        self.eval_callback: EvalCallback = None
+        mod = __import__('stable_baselines3', fromlist=[
+                         self.freqai_info['rl_config']['model_type']])
+        self.MODELCLASS = getattr(mod, self.freqai_info['rl_config']['model_type'])
+        self.policy_type = self.freqai_info['rl_config']['policy_type']

    def train(
        self, unfiltered_dataframe: DataFrame, pair: str, dk: FreqaiDataKitchen
@@ -67,7 +77,7 @@ class BaseReinforcementLearningModel(IFreqaiModel):
        )
        logger.info(f'Training model on {len(data_dictionary["train_features"])} data points')

-        self.set_train_and_eval_environments(data_dictionary, prices_train, prices_test)
+        self.set_train_and_eval_environments(data_dictionary, prices_train, prices_test, dk)

        model = self.fit_rl(data_dictionary, dk)

@@ -75,13 +85,13 @@ class BaseReinforcementLearningModel(IFreqaiModel):

        return model

-    def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test):
+    def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test, dk):
        """
-        User overrides this in their prediction model if they are custom a MyRLEnv. Othwerwise
-        leaving this will default to Base5ActEnv
+        User overrides this as shown here if they are using a custom MyRLEnv
        """
        train_df = data_dictionary["train_features"]
        test_df = data_dictionary["test_features"]
+        eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df)

        # environments
        if not self.train_env:
@@ -90,11 +100,17 @@ class BaseReinforcementLearningModel(IFreqaiModel):
            self.eval_env = Monitor(MyRLEnv(df=test_df, prices=prices_test,
                                    window_size=self.CONV_WIDTH,
                                    reward_kwargs=self.reward_params), ".")
+            self.eval_callback = EvalCallback(self.eval_env, deterministic=True,
+                                              render=False, eval_freq=eval_freq,
+                                              best_model_save_path=dk.data_path)
        else:
-            self.train_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params)
-            self.eval_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params)
            self.train_env.reset()
            self.eval_env.reset()
+            self.train_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params)
+            self.eval_env.reset_env(test_df, prices_test, self.CONV_WIDTH, self.reward_params)
+            self.eval_callback.__init__(self.eval_env, deterministic=True,
+                                        render=False, eval_freq=eval_freq,
+                                        best_model_save_path=dk.data_path)

    @abstractmethod
    def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen):
@@ -206,16 +222,28 @@ class BaseReinforcementLearningModel(IFreqaiModel):
    # all the other existing fit() functions to include dk argument. For now we instantiate and
    # leave it.
    def fit(self, data_dictionary: Dict[str, Any], pair: str = '') -> Any:
-        """
-        Most regressors use the same function names and arguments e.g. user
-        can drop in LGBMRegressor in place of CatBoostRegressor and all data
-        management will be properly handled by Freqai.
-        :param data_dictionary: Dict = the dictionary constructed by DataHandler to hold
-                                all the training and test data/labels.
-        """
-
        return

+def make_env(env_id: str, rank: int, seed: int, train_df, price,
+             reward_params, window_size, monitor=False) -> Callable:
+    """
+    Utility function for multiprocessed env.
+
+    :param env_id: (str) the environment ID
+    :param num_env: (int) the number of environment you wish to have in subprocesses
+    :param seed: (int) the initial seed for RNG
+    :param rank: (int) index of the subprocess
+    :return: (Callable)
+    """
+    def _init() -> gym.Env:
+
+        env = MyRLEnv(df=train_df, prices=price, window_size=window_size,
+                      reward_kwargs=reward_params, id=env_id, seed=seed + rank)
+        if monitor:
+            env = Monitor(env, ".")
+        return env
+    set_random_seed(seed)
+    return _init

 class MyRLEnv(Base5ActionRLEnv):
    """
@@ -229,24 +257,24 @@ class MyRLEnv(Base5ActionRLEnv):
            return 0.

        # close long
-        if action == Actions.Long_sell.value and self._position == Positions.Long:
+        if action == Actions.Long_exit.value and self._position == Positions.Long:
            last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
            current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
            return float(np.log(current_price) - np.log(last_trade_price))

-        if action == Actions.Long_sell.value and self._position == Positions.Long:
+        if action == Actions.Long_exit.value and self._position == Positions.Long:
            if self.close_trade_profit[-1] > self.profit_aim * self.rr:
                last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
                current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
                return float((np.log(current_price) - np.log(last_trade_price)) * 2)

        # close short
-        if action == Actions.Short_buy.value and self._position == Positions.Short:
+        if action == Actions.Short_exit.value and self._position == Positions.Short:
            last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
            current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
            return float(np.log(last_trade_price) - np.log(current_price))

-        if action == Actions.Short_buy.value and self._position == Positions.Short:
+        if action == Actions.Short_exit.value and self._position == Positions.Short:
            if self.close_trade_profit[-1] > self.profit_aim * self.rr:
                last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
                current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
@@ -1,213 +0,0 @@
-from typing import Any, Dict, List, Optional, Tuple, Type, Union
-
-import gym
-import torch
-import torch as th
-from stable_baselines3 import DQN
-from stable_baselines3.common.buffers import ReplayBuffer
-from stable_baselines3.common.policies import BasePolicy
-from stable_baselines3.common.torch_layers import (BaseFeaturesExtractor,
-                                                   FlattenExtractor)
-from stable_baselines3.common.type_aliases import GymEnv, Schedule
-from stable_baselines3.dqn.policies import (CnnPolicy, DQNPolicy, MlpPolicy,
-                                            QNetwork)
-from torch import nn
-
-
-def create_mlp_(
-    input_dim: int,
-    output_dim: int,
-    net_arch: List[int],
-    activation_fn: Type[nn.Module] = nn.ReLU,
-    squash_output: bool = False,
-) -> List[nn.Module]:
-    dropout = 0.2
-    if len(net_arch) > 0:
-        number_of_neural = net_arch[0]
-
-    modules = [
-        nn.Linear(input_dim, number_of_neural),
-        nn.BatchNorm1d(number_of_neural),
-        nn.LeakyReLU(),
-        nn.Dropout(dropout),
-        nn.Linear(number_of_neural, number_of_neural),
-        nn.BatchNorm1d(number_of_neural),
-        nn.LeakyReLU(),
-        nn.Dropout(dropout),
-        nn.Linear(number_of_neural, number_of_neural),
-        nn.BatchNorm1d(number_of_neural),
-        nn.LeakyReLU(),
-        nn.Dropout(dropout),
-        nn.Linear(number_of_neural, number_of_neural),
-        nn.BatchNorm1d(number_of_neural),
-        nn.LeakyReLU(),
-        nn.Dropout(dropout),
-        nn.Linear(number_of_neural, output_dim)
-    ]
-    return modules
-
-
-class TDQNetwork(QNetwork):
-    def __init__(self,
-                 observation_space: gym.spaces.Space,
-                 action_space: gym.spaces.Space,
-                 features_extractor: nn.Module,
-                 features_dim: int,
-                 net_arch: Optional[List[int]] = None,
-                 activation_fn: Type[nn.Module] = nn.ReLU,
-                 normalize_images: bool = True
-                 ):
-        super().__init__(
-            observation_space=observation_space,
-            action_space=action_space,
-            features_extractor=features_extractor,
-            features_dim=features_dim,
-            net_arch=net_arch,
-            activation_fn=activation_fn,
-            normalize_images=normalize_images
-        )
-        action_dim = self.action_space.n
-        q_net = create_mlp_(self.features_dim, action_dim, self.net_arch, self.activation_fn)
-        self.q_net = nn.Sequential(*q_net).apply(self.init_weights)
-
-    def init_weights(self, m):
-        if type(m) == nn.Linear:
-            torch.nn.init.kaiming_uniform_(m.weight)
-
-
-class TDQNPolicy(DQNPolicy):
-
-    def __init__(
-        self,
-        observation_space: gym.spaces.Space,
-        action_space: gym.spaces.Space,
-        lr_schedule: Schedule,
-        net_arch: Optional[List[int]] = None,
-        activation_fn: Type[nn.Module] = nn.ReLU,
-        features_extractor_class: Type[BaseFeaturesExtractor] = FlattenExtractor,
-        features_extractor_kwargs: Optional[Dict[str, Any]] = None,
-        normalize_images: bool = True,
-        optimizer_class: Type[th.optim.Optimizer] = th.optim.Adam,
-        optimizer_kwargs: Optional[Dict[str, Any]] = None,
-    ):
-        super().__init__(
-            observation_space=observation_space,
-            action_space=action_space,
-            lr_schedule=lr_schedule,
-            net_arch=net_arch,
-            activation_fn=activation_fn,
-            features_extractor_class=features_extractor_class,
-            features_extractor_kwargs=features_extractor_kwargs,
-            normalize_images=normalize_images,
-            optimizer_class=optimizer_class,
-            optimizer_kwargs=optimizer_kwargs
-        )
-
-    @staticmethod
-    def init_weights(module: nn.Module, gain: float = 1) -> None:
-        """
-        Orthogonal initialization (used in PPO and A2C)
-        """
-        if isinstance(module, (nn.Linear, nn.Conv2d)):
-            nn.init.kaiming_uniform_(module.weight)
-            if module.bias is not None:
-                module.bias.data.fill_(0.0)
-
-    def make_q_net(self) -> TDQNetwork:
-        # Make sure we always have separate networks for features extractors etc
-        net_args = self._update_features_extractor(self.net_args, features_extractor=None)
-        return TDQNetwork(**net_args).to(self.device)
-
-
-class TMultiInputPolicy(TDQNPolicy):
-    def __init__(
-        self,
-        observation_space: gym.spaces.Space,
-        action_space: gym.spaces.Space,
-        lr_schedule: Schedule,
-        net_arch: Optional[List[int]] = None,
-        activation_fn: Type[nn.Module] = nn.ReLU,
-        features_extractor_class: Type[BaseFeaturesExtractor] = FlattenExtractor,
-        features_extractor_kwargs: Optional[Dict[str, Any]] = None,
-        normalize_images: bool = True,
-        optimizer_class: Type[th.optim.Optimizer] = th.optim.Adam,
-        optimizer_kwargs: Optional[Dict[str, Any]] = None,
-    ):
-        super().__init__(
-            observation_space,
-            action_space,
-            lr_schedule,
-            net_arch,
-            activation_fn,
-            features_extractor_class,
-            features_extractor_kwargs,
-            normalize_images,
-            optimizer_class,
-            optimizer_kwargs,
-        )
-
-
-class TDQN(DQN):
-
-    policy_aliases: Dict[str, Type[BasePolicy]] = {
-        "MlpPolicy": MlpPolicy,
-        "CnnPolicy": CnnPolicy,
-        "TMultiInputPolicy": TMultiInputPolicy,
-    }
-
-    def __init__(
-        self,
-        policy: Union[str, Type[TDQNPolicy]],
-        env: Union[GymEnv, str],
-        learning_rate: Union[float, Schedule] = 1e-4,
-        buffer_size: int = 1000000,  # 1e6
-        learning_starts: int = 50000,
-        batch_size: int = 32,
-        tau: float = 1.0,
-        gamma: float = 0.99,
-        train_freq: Union[int, Tuple[int, str]] = 4,
-        gradient_steps: int = 1,
-        replay_buffer_class: Optional[ReplayBuffer] = None,
-        replay_buffer_kwargs: Optional[Dict[str, Any]] = None,
-        optimize_memory_usage: bool = False,
-        target_update_interval: int = 10000,
-        exploration_fraction: float = 0.1,
-        exploration_initial_eps: float = 1.0,
-        exploration_final_eps: float = 0.05,
-        max_grad_norm: float = 10,
-        tensorboard_log: Optional[str] = None,
-        create_eval_env: bool = False,
-        policy_kwargs: Optional[Dict[str, Any]] = None,
-        verbose: int = 1,
-        seed: Optional[int] = None,
-        device: Union[th.device, str] = "auto",
-        _init_setup_model: bool = True,
-    ):
-
-        super().__init__(
-            policy=policy,
-            env=env,
-            learning_rate=learning_rate,
-            buffer_size=buffer_size,
-            learning_starts=learning_starts,
-            batch_size=batch_size,
-            tau=tau,
-            gamma=gamma,
-            train_freq=train_freq,
-            gradient_steps=gradient_steps,
-            replay_buffer_class=replay_buffer_class,  # No action noise
-            replay_buffer_kwargs=replay_buffer_kwargs,
-            optimize_memory_usage=optimize_memory_usage,
-            target_update_interval=target_update_interval,
-            exploration_fraction=exploration_fraction,
-            exploration_initial_eps=exploration_initial_eps,
-            exploration_final_eps=exploration_final_eps,
-            max_grad_norm=max_grad_norm,
-            tensorboard_log=tensorboard_log,
-            create_eval_env=create_eval_env,
-            policy_kwargs=policy_kwargs,
-            verbose=verbose,
-            seed=seed,
-            device=device,
-            _init_setup_model=_init_setup_model
-        )