stable/freqtrade/freqai/RL/TDQNagent.py

214 lines
7.5 KiB
Python
Raw Normal View History

2022-08-14 13:24:20 +00:00
from typing import Any, Dict, List, Optional, Tuple, Type, Union
2022-08-13 15:48:58 +00:00
import gym
2022-08-14 13:24:20 +00:00
import torch
import torch as th
2022-08-13 15:48:58 +00:00
from stable_baselines3 import DQN
2022-08-14 13:24:20 +00:00
from stable_baselines3.common.buffers import ReplayBuffer
2022-08-13 15:48:58 +00:00
from stable_baselines3.common.policies import BasePolicy
from stable_baselines3.common.torch_layers import (BaseFeaturesExtractor,
2022-08-14 13:24:20 +00:00
FlattenExtractor)
from stable_baselines3.common.type_aliases import GymEnv, Schedule
from stable_baselines3.dqn.policies import (CnnPolicy, DQNPolicy, MlpPolicy,
2022-08-14 13:24:20 +00:00
QNetwork)
from torch import nn
2022-08-13 15:48:58 +00:00
def create_mlp_(
    input_dim: int,
    output_dim: int,
    net_arch: List[int],
    activation_fn: Type[nn.Module] = nn.ReLU,
    squash_output: bool = False,
) -> List[nn.Module]:
    """
    Build the layer list of a fixed-depth MLP with batch norm and dropout.

    The result is four hidden blocks of
    ``Linear -> BatchNorm1d -> LeakyReLU -> Dropout(0.2)`` followed by a final
    ``Linear`` that projects to ``output_dim``. All hidden blocks share one
    width, taken from ``net_arch[0]``.

    :param input_dim: Number of input features of the first linear layer.
    :param output_dim: Number of output features of the last linear layer.
    :param net_arch: Only the first entry is read (the hidden width); any
        further entries are ignored. If empty, a single
        ``Linear(input_dim, output_dim)`` is returned.
    :param activation_fn: Accepted but currently ignored; ``nn.LeakyReLU`` is
        hard-coded as the activation.
    :param squash_output: Accepted but currently ignored.
    :return: The modules in order, ready for ``nn.Sequential(*modules)``.
    """
    dropout = 0.2
    hidden_blocks = 4

    # Without a hidden width there is nothing to stack; fall back to a plain
    # linear map instead of failing on an unbound local.
    if not net_arch:
        return [nn.Linear(input_dim, output_dim)]

    width = net_arch[0]
    modules: List[nn.Module] = []
    in_features = input_dim
    for _ in range(hidden_blocks):
        modules.extend([
            nn.Linear(in_features, width),
            nn.BatchNorm1d(width),
            nn.LeakyReLU(),
            nn.Dropout(dropout),
        ])
        # Only the first block sees the raw input; later blocks are width -> width.
        in_features = width
    modules.append(nn.Linear(width, output_dim))
    return modules
2022-08-13 15:48:58 +00:00
class TDQNetwork(QNetwork):
    """
    Q-network whose head is the MLP produced by ``create_mlp_``.

    After the parent constructor has run, ``self.q_net`` is reassigned to an
    ``nn.Sequential`` built from ``create_mlp_`` and its linear layers are
    re-initialised with Kaiming-uniform weights.
    """

    def __init__(
        self,
        observation_space: gym.spaces.Space,
        action_space: gym.spaces.Space,
        features_extractor: nn.Module,
        features_dim: int,
        net_arch: Optional[List[int]] = None,
        activation_fn: Type[nn.Module] = nn.ReLU,
        normalize_images: bool = True,
    ):
        """
        :param observation_space: Observation space, forwarded to the parent.
        :param action_space: Action space; its ``n`` attribute is read to size
            the output layer.
        :param features_extractor: Features extractor module, forwarded to the parent.
        :param features_dim: Size of the extracted feature vector.
        :param net_arch: Architecture spec, forwarded to the parent; the value
            stored on ``self.net_arch`` is then passed to ``create_mlp_``.
        :param activation_fn: Forwarded to the parent and to ``create_mlp_``.
        :param normalize_images: Forwarded to the parent.
        """
        super().__init__(
            observation_space=observation_space,
            action_space=action_space,
            features_extractor=features_extractor,
            features_dim=features_dim,
            net_arch=net_arch,
            activation_fn=activation_fn,
            normalize_images=normalize_images,
        )
        action_dim = self.action_space.n
        q_net = create_mlp_(self.features_dim, action_dim, self.net_arch, self.activation_fn)
        # Overwrites whatever q_net the parent constructor assigned.
        self.q_net = nn.Sequential(*q_net).apply(self.init_weights)

    def init_weights(self, m: nn.Module) -> None:
        """
        Apply Kaiming-uniform initialisation to the weight of a linear layer.

        Intended as the callback for ``nn.Module.apply``; modules that are not
        ``nn.Linear`` are left untouched. Biases are not modified here.

        :param m: Module visited by ``apply``.
        """
        if isinstance(m, nn.Linear):
            torch.nn.init.kaiming_uniform_(m.weight)
2022-08-14 13:24:20 +00:00
2022-08-13 15:48:58 +00:00
class TDQNPolicy(DQNPolicy):
    """
    DQN policy whose Q-networks are ``TDQNetwork`` instances.

    The constructor forwards every argument to ``DQNPolicy`` unchanged; the
    only behavioural overrides are ``init_weights`` and ``make_q_net``.
    """

    def __init__(
        self,
        observation_space: gym.spaces.Space,
        action_space: gym.spaces.Space,
        lr_schedule: Schedule,
        net_arch: Optional[List[int]] = None,
        activation_fn: Type[nn.Module] = nn.ReLU,
        features_extractor_class: Type[BaseFeaturesExtractor] = FlattenExtractor,
        features_extractor_kwargs: Optional[Dict[str, Any]] = None,
        normalize_images: bool = True,
        optimizer_class: Type[th.optim.Optimizer] = th.optim.Adam,
        optimizer_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """
        :param observation_space: Observation space.
        :param action_space: Action space.
        :param lr_schedule: Learning-rate schedule.
        :param net_arch: Architecture spec handed on to the Q-network.
        :param activation_fn: Activation function class handed on to the Q-network.
        :param features_extractor_class: Features extractor class to use.
        :param features_extractor_kwargs: Keyword arguments for the features extractor.
        :param normalize_images: Forwarded to the parent policy.
        :param optimizer_class: Optimizer class to use.
        :param optimizer_kwargs: Extra keyword arguments for the optimizer.
        """
        super().__init__(
            observation_space=observation_space,
            action_space=action_space,
            lr_schedule=lr_schedule,
            net_arch=net_arch,
            activation_fn=activation_fn,
            features_extractor_class=features_extractor_class,
            features_extractor_kwargs=features_extractor_kwargs,
            normalize_images=normalize_images,
            optimizer_class=optimizer_class,
            optimizer_kwargs=optimizer_kwargs,
        )

    @staticmethod
    def init_weights(module: nn.Module, gain: float = 1) -> None:
        """
        Kaiming-uniform initialization for linear and 2D convolution layers.

        The weight is initialised with ``nn.init.kaiming_uniform_`` and the
        bias, when present, is filled with zeros. Other module types are left
        untouched.

        :param module: Module to initialise.
        :param gain: Unused; kept so the signature stays call-compatible.
        """
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            nn.init.kaiming_uniform_(module.weight)
            if module.bias is not None:
                module.bias.data.fill_(0.0)

    def make_q_net(self) -> TDQNetwork:
        """
        Create a new ``TDQNetwork`` on this policy's device.

        :return: A freshly constructed Q-network.
        """
        # Make sure we always have separate networks for features extractors etc
        net_args = self._update_features_extractor(self.net_args, features_extractor=None)
        return TDQNetwork(**net_args).to(self.device)
2022-08-14 13:24:20 +00:00
2022-08-13 15:48:58 +00:00
class TMultiInputPolicy(TDQNPolicy):
    """
    Named variant of ``TDQNPolicy`` that adds no behaviour of its own.

    Every constructor argument is handed straight to ``TDQNPolicy``.

    NOTE(review): the default features extractor here is ``FlattenExtractor``;
    confirm that is appropriate for the observation spaces this policy is
    used with.
    """

    def __init__(
        self,
        observation_space: gym.spaces.Space,
        action_space: gym.spaces.Space,
        lr_schedule: Schedule,
        net_arch: Optional[List[int]] = None,
        activation_fn: Type[nn.Module] = nn.ReLU,
        features_extractor_class: Type[BaseFeaturesExtractor] = FlattenExtractor,
        features_extractor_kwargs: Optional[Dict[str, Any]] = None,
        normalize_images: bool = True,
        optimizer_class: Type[th.optim.Optimizer] = th.optim.Adam,
        optimizer_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """
        :param observation_space: Observation space.
        :param action_space: Action space.
        :param lr_schedule: Learning-rate schedule.
        :param net_arch: Architecture spec handed on to the Q-network.
        :param activation_fn: Activation function class.
        :param features_extractor_class: Features extractor class to use.
        :param features_extractor_kwargs: Keyword arguments for the features extractor.
        :param normalize_images: Forwarded to the parent policy.
        :param optimizer_class: Optimizer class to use.
        :param optimizer_kwargs: Extra keyword arguments for the optimizer.
        """
        # Keyword form of the pass-through; parameter names match the parent's.
        super().__init__(
            observation_space=observation_space,
            action_space=action_space,
            lr_schedule=lr_schedule,
            net_arch=net_arch,
            activation_fn=activation_fn,
            features_extractor_class=features_extractor_class,
            features_extractor_kwargs=features_extractor_kwargs,
            normalize_images=normalize_images,
            optimizer_class=optimizer_class,
            optimizer_kwargs=optimizer_kwargs,
        )
2022-08-14 13:24:20 +00:00
2022-08-13 15:48:58 +00:00
class TDQN(DQN):
    """
    ``DQN`` subclass that registers ``TMultiInputPolicy`` as a policy alias.

    The constructor forwards all arguments to ``DQN`` unchanged.
    """

    # String aliases accepted for the ``policy`` argument. "MlpPolicy" and
    # "CnnPolicy" refer to the classes imported from
    # stable_baselines3.dqn.policies; only "TMultiInputPolicy" is defined in
    # this module.
    policy_aliases: Dict[str, Type[BasePolicy]] = {
        "MlpPolicy": MlpPolicy,
        "CnnPolicy": CnnPolicy,
        "TMultiInputPolicy": TMultiInputPolicy,
    }

    def __init__(
        self,
        policy: Union[str, Type[TDQNPolicy]],
        env: Union[GymEnv, str],
        learning_rate: Union[float, Schedule] = 1e-4,
        buffer_size: int = 1000000,  # 1e6
        learning_starts: int = 50000,
        batch_size: int = 32,
        tau: float = 1.0,
        gamma: float = 0.99,
        train_freq: Union[int, Tuple[int, str]] = 4,
        gradient_steps: int = 1,
        replay_buffer_class: Optional[Type[ReplayBuffer]] = None,
        replay_buffer_kwargs: Optional[Dict[str, Any]] = None,
        optimize_memory_usage: bool = False,
        target_update_interval: int = 10000,
        exploration_fraction: float = 0.1,
        exploration_initial_eps: float = 1.0,
        exploration_final_eps: float = 0.05,
        max_grad_norm: float = 10,
        tensorboard_log: Optional[str] = None,
        create_eval_env: bool = False,
        policy_kwargs: Optional[Dict[str, Any]] = None,
        verbose: int = 1,
        seed: Optional[int] = None,
        device: Union[th.device, str] = "auto",
        _init_setup_model: bool = True,
    ):
        """
        Forward every argument to ``DQN.__init__``.

        :param policy: Policy class, or one of the keys of ``policy_aliases``.
        :param env: Environment instance or its registered name.
        :param replay_buffer_class: Replay buffer class (not an instance), or
            ``None``.
        :param replay_buffer_kwargs: Keyword arguments for the replay buffer.

        All remaining parameters are passed through under the same names; see
        the ``DQN`` documentation for their meaning.
        """
        super().__init__(
            policy=policy,
            env=env,
            learning_rate=learning_rate,
            buffer_size=buffer_size,
            learning_starts=learning_starts,
            batch_size=batch_size,
            tau=tau,
            gamma=gamma,
            train_freq=train_freq,
            gradient_steps=gradient_steps,
            replay_buffer_class=replay_buffer_class,
            replay_buffer_kwargs=replay_buffer_kwargs,
            optimize_memory_usage=optimize_memory_usage,
            target_update_interval=target_update_interval,
            exploration_fraction=exploration_fraction,
            exploration_initial_eps=exploration_initial_eps,
            exploration_final_eps=exploration_final_eps,
            max_grad_norm=max_grad_norm,
            tensorboard_log=tensorboard_log,
            create_eval_env=create_eval_env,
            policy_kwargs=policy_kwargs,
            verbose=verbose,
            seed=seed,
            device=device,
            _init_setup_model=_init_setup_model,
        )