reuse callback, allow user to access all stable_baselines3 agents via config
This commit is contained in:
82
freqtrade/freqai/prediction_models/ReinforcementLearner.py
Normal file
82
freqtrade/freqai/prediction_models/ReinforcementLearner.py
Normal file
@@ -0,0 +1,82 @@
|
||||
import logging
|
||||
from typing import Any, Dict # , Tuple
|
||||
|
||||
# import numpy.typing as npt
|
||||
import torch as th
|
||||
import numpy as np
|
||||
from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
|
||||
from freqtrade.freqai.RL.Base5ActionRLEnv import Actions, Base5ActionRLEnv, Positions
|
||||
from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ReinforcementLearner(BaseReinforcementLearningModel):
    """
    User created Reinforcement Learning Model prediction model.
    """

    def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen):
        """
        Train an agent of type ``self.MODELCLASS`` and return the model to use.

        :param data_dictionary: dictionary holding at least "train_features";
            its length scales the number of training timesteps.
        :param dk: data kitchen whose ``data_path`` receives the tensorboard
            logs and is searched for ``best_model.zip``.
        :return: the model loaded from ``best_model`` when ``best_model.zip``
            exists in ``dk.data_path``, otherwise the model as left by ``learn``.
        """
        n_train_rows = len(data_dictionary["train_features"])
        total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * n_train_rows

        # Network layout handed to the policy.
        policy_kwargs = {
            "activation_fn": th.nn.ReLU,
            "net_arch": [256, 256, 128],
        }

        model = self.MODELCLASS(
            self.policy_type,
            self.train_env,
            policy_kwargs=policy_kwargs,
            tensorboard_log=Path(dk.data_path / "tensorboard"),
            **self.freqai_info['model_training_parameters']
        )
        model.learn(total_timesteps=int(total_timesteps), callback=self.eval_callback)

        best_model_file = Path(dk.data_path / "best_model.zip")
        if not best_model_file.is_file():
            logger.info('Couldnt find best model, using final model instead.')
            return model

        logger.info('Callback found a best model.')
        return self.MODELCLASS.load(dk.data_path / "best_model")
|
||||
|
||||
|
||||
class MyRLEnv(Base5ActionRLEnv):
    """
    User can modify any part of the environment by overriding base
    functions
    """

    def calculate_reward(self, action):
        """
        Reward the agent for closing a position; every other step yields 0.

        The reward is the difference of the logs of the fee-adjusted open
        prices at the entry tick and at the current tick. It is doubled when
        the latest entry of ``close_trade_profit`` exceeds ``profit_aim * rr``.

        :param action: action value, compared against ``Actions.<name>.value``
        :return: float reward
        """
        if self._last_trade_tick is None:
            return 0.

        # close long
        if action == Actions.Long_exit.value and self._position == Positions.Long:
            last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
            current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
            log_return = np.log(current_price) - np.log(last_trade_price)
            return float(log_return * self._profit_aim_factor())

        # close short
        if action == Actions.Short_exit.value and self._position == Positions.Short:
            last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
            current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
            log_return = np.log(last_trade_price) - np.log(current_price)
            return float(log_return * self._profit_aim_factor())

        return 0.

    def _profit_aim_factor(self):
        """Return 2 when the latest closed-trade profit beats ``profit_aim * rr``, else 1."""
        # This check previously lived in a second branch with the same
        # condition as a branch that had already returned, so it never ran.
        # NOTE(review): confirm that close_trade_profit[-1] describes the trade
        # being closed at the moment calculate_reward is called.
        profits = self.close_trade_profit
        if len(profits) > 0 and profits[-1] > self.profit_aim * self.rr:
            return 2
        return 1
|
@@ -0,0 +1,255 @@
|
||||
import logging
|
||||
import torch as th
|
||||
from typing import Any, Dict, List, Optional, Tuple, Type, Union
|
||||
from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel
|
||||
from stable_baselines3 import DQN
|
||||
from stable_baselines3.common.buffers import ReplayBuffer
|
||||
from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
|
||||
from pathlib import Path
|
||||
from stable_baselines3.dqn.policies import (CnnPolicy, DQNPolicy, MlpPolicy,
|
||||
QNetwork)
|
||||
from torch import nn
|
||||
import gym
|
||||
from stable_baselines3.common.torch_layers import (BaseFeaturesExtractor,
|
||||
FlattenExtractor)
|
||||
from stable_baselines3.common.type_aliases import GymEnv, Schedule
|
||||
from stable_baselines3.common.policies import BasePolicy
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ReinforcementLearnerCustomAgent(BaseReinforcementLearningModel):
    """
    User can customize agent by defining the class and using it directly.
    Here the example is "TDQN"
    """

    def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen):
        """
        Train the custom ``TDQN`` agent and return the model to use.

        :param data_dictionary: dictionary holding at least "train_features";
            its length scales the number of training timesteps.
        :param dk: data kitchen whose ``data_path`` receives the tensorboard
            logs and is searched for ``best_model.zip``.
        :return: the ``TDQN`` loaded from ``best_model`` when ``best_model.zip``
            exists, otherwise the model as left by ``learn``.
        """
        train_df = data_dictionary["train_features"]
        total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df)

        policy_kwargs = dict(activation_fn=th.nn.ReLU,
                             net_arch=[256, 256, 128])

        # TDQN is a custom agent defined below
        model = TDQN(self.policy_type, self.train_env,
                     tensorboard_log=Path(dk.data_path / "tensorboard"),
                     policy_kwargs=policy_kwargs,
                     **self.freqai_info['model_training_parameters']
                     )

        model.learn(
            total_timesteps=int(total_timesteps),
            callback=self.eval_callback
        )

        if Path(dk.data_path / "best_model.zip").is_file():
            logger.info('Callback found a best model.')
            # The checkpoint was written for a TDQN, so it is loaded with the
            # same class rather than with the config-selected self.MODELCLASS.
            best_model = TDQN.load(dk.data_path / "best_model")
            return best_model

        logger.info('Couldnt find best model, using final model instead.')

        return model
|
||||
|
||||
# User creates their custom agent and networks as shown below
|
||||
|
||||
|
||||
def create_mlp_(
    input_dim: int,
    output_dim: int,
    net_arch: List[int],
    activation_fn: Type[nn.Module] = nn.ReLU,
    squash_output: bool = False,
) -> List[nn.Module]:
    """
    Build the layer list of a fixed-depth MLP with batch norm and dropout.

    The network is four blocks of Linear -> BatchNorm1d -> LeakyReLU ->
    Dropout(0.2), all of width ``net_arch[0]``, followed by one Linear layer
    mapping to ``output_dim``. Only the first entry of ``net_arch`` is used.

    :param input_dim: number of input features of the first layer
    :param output_dim: number of outputs of the last layer
    :param net_arch: hidden sizes; only ``net_arch[0]`` is read. When empty, a
        single ``Linear(input_dim, output_dim)`` is returned.
    :param activation_fn: accepted for signature compatibility, not used
        (the activation is always LeakyReLU)
    :param squash_output: accepted for signature compatibility, not used
    :return: list of modules, ready for ``nn.Sequential(*modules)``
    """
    # Without hidden layers there is no width to read from net_arch, so fall
    # back to a direct linear projection instead of hitting an unbound name.
    if len(net_arch) == 0:
        return [nn.Linear(input_dim, output_dim)]

    dropout = 0.2
    hidden_blocks = 4
    number_of_neural = net_arch[0]

    modules: List[nn.Module] = []
    in_features = input_dim
    for _ in range(hidden_blocks):
        modules += [
            nn.Linear(in_features, number_of_neural),
            nn.BatchNorm1d(number_of_neural),
            nn.LeakyReLU(),
            nn.Dropout(dropout),
        ]
        # Every block after the first maps hidden width to hidden width.
        in_features = number_of_neural
    modules.append(nn.Linear(number_of_neural, output_dim))
    return modules
|
||||
|
||||
|
||||
class TDQNetwork(QNetwork):
    """
    ``QNetwork`` whose ``q_net`` is replaced by the layers from ``create_mlp_``.

    After the parent constructor has run, ``q_net`` is rebuilt as an
    ``nn.Sequential`` of the ``create_mlp_`` modules and ``init_weights`` is
    applied to every sub-module.
    """

    def __init__(self,
                 observation_space: gym.spaces.Space,
                 action_space: gym.spaces.Space,
                 features_extractor: nn.Module,
                 features_dim: int,
                 net_arch: Optional[List[int]] = None,
                 activation_fn: Type[nn.Module] = nn.ReLU,
                 normalize_images: bool = True
                 ):
        super().__init__(
            observation_space=observation_space,
            action_space=action_space,
            features_extractor=features_extractor,
            features_dim=features_dim,
            net_arch=net_arch,
            activation_fn=activation_fn,
            normalize_images=normalize_images
        )
        action_dim = self.action_space.n
        # Overwrite the q_net attribute with the custom architecture.
        q_net = create_mlp_(self.features_dim, action_dim, self.net_arch, self.activation_fn)
        self.q_net = nn.Sequential(*q_net).apply(self.init_weights)

    def init_weights(self, m):
        """Apply Kaiming-uniform initialisation to the weight of linear layers."""
        if isinstance(m, nn.Linear):
            th.nn.init.kaiming_uniform_(m.weight)
|
||||
|
||||
|
||||
class TDQNPolicy(DQNPolicy):
    """
    ``DQNPolicy`` variant that builds ``TDQNetwork`` instances as its q-network.
    """

    def __init__(
        self,
        observation_space: gym.spaces.Space,
        action_space: gym.spaces.Space,
        lr_schedule: Schedule,
        net_arch: Optional[List[int]] = None,
        activation_fn: Type[nn.Module] = nn.ReLU,
        features_extractor_class: Type[BaseFeaturesExtractor] = FlattenExtractor,
        features_extractor_kwargs: Optional[Dict[str, Any]] = None,
        normalize_images: bool = True,
        optimizer_class: Type[th.optim.Optimizer] = th.optim.Adam,
        optimizer_kwargs: Optional[Dict[str, Any]] = None,
    ):
        # All arguments are handed to DQNPolicy unchanged.
        super().__init__(
            observation_space=observation_space,
            action_space=action_space,
            lr_schedule=lr_schedule,
            net_arch=net_arch,
            activation_fn=activation_fn,
            features_extractor_class=features_extractor_class,
            features_extractor_kwargs=features_extractor_kwargs,
            normalize_images=normalize_images,
            optimizer_class=optimizer_class,
            optimizer_kwargs=optimizer_kwargs,
        )

    @staticmethod
    def init_weights(module: nn.Module, gain: float = 1) -> None:
        """
        Kaiming-uniform weight initialisation with zeroed bias.

        Only ``nn.Linear`` and ``nn.Conv2d`` modules are touched; ``gain`` is
        accepted but not used.
        """
        if not isinstance(module, (nn.Linear, nn.Conv2d)):
            return
        nn.init.kaiming_uniform_(module.weight)
        if module.bias is not None:
            module.bias.data.fill_(0.0)

    def make_q_net(self) -> TDQNetwork:
        """Create a ``TDQNetwork`` on ``self.device`` from ``self.net_args``."""
        # Make sure we always have separate networks for features extractors etc
        q_net_kwargs = self._update_features_extractor(self.net_args, features_extractor=None)
        network = TDQNetwork(**q_net_kwargs)
        return network.to(self.device)
|
||||
|
||||
|
||||
class TMultiInputPolicy(TDQNPolicy):
    """
    ``TDQNPolicy`` under a separate name; the constructor only forwards its
    arguments to the parent class.
    """

    def __init__(
        self,
        observation_space: gym.spaces.Space,
        action_space: gym.spaces.Space,
        lr_schedule: Schedule,
        net_arch: Optional[List[int]] = None,
        activation_fn: Type[nn.Module] = nn.ReLU,
        features_extractor_class: Type[BaseFeaturesExtractor] = FlattenExtractor,
        features_extractor_kwargs: Optional[Dict[str, Any]] = None,
        normalize_images: bool = True,
        optimizer_class: Type[th.optim.Optimizer] = th.optim.Adam,
        optimizer_kwargs: Optional[Dict[str, Any]] = None,
    ):
        super().__init__(
            observation_space=observation_space,
            action_space=action_space,
            lr_schedule=lr_schedule,
            net_arch=net_arch,
            activation_fn=activation_fn,
            features_extractor_class=features_extractor_class,
            features_extractor_kwargs=features_extractor_kwargs,
            normalize_images=normalize_images,
            optimizer_class=optimizer_class,
            optimizer_kwargs=optimizer_kwargs,
        )
|
||||
|
||||
|
||||
class TDQN(DQN):
    """
    ``DQN`` subclass that registers ``TMultiInputPolicy`` as a policy alias
    next to ``MlpPolicy`` and ``CnnPolicy``.

    Every constructor argument is forwarded unchanged to ``DQN.__init__``.
    """

    policy_aliases: Dict[str, Type[BasePolicy]] = {
        "MlpPolicy": MlpPolicy,
        "CnnPolicy": CnnPolicy,
        "TMultiInputPolicy": TMultiInputPolicy,
    }

    def __init__(
        self,
        policy: Union[str, Type[TDQNPolicy]],
        env: Union[GymEnv, str],
        learning_rate: Union[float, Schedule] = 1e-4,
        buffer_size: int = 1000000,  # 1e6
        learning_starts: int = 50000,
        batch_size: int = 32,
        tau: float = 1.0,
        gamma: float = 0.99,
        train_freq: Union[int, Tuple[int, str]] = 4,
        gradient_steps: int = 1,
        replay_buffer_class: Optional[ReplayBuffer] = None,
        replay_buffer_kwargs: Optional[Dict[str, Any]] = None,
        optimize_memory_usage: bool = False,
        target_update_interval: int = 10000,
        exploration_fraction: float = 0.1,
        exploration_initial_eps: float = 1.0,
        exploration_final_eps: float = 0.05,
        max_grad_norm: float = 10,
        tensorboard_log: Optional[Path] = None,
        create_eval_env: bool = False,
        policy_kwargs: Optional[Dict[str, Any]] = None,
        verbose: int = 1,
        seed: Optional[int] = None,
        device: Union[th.device, str] = "auto",
        _init_setup_model: bool = True,
    ):
        super().__init__(
            # what to train and where
            policy=policy,
            env=env,
            # optimisation settings
            learning_rate=learning_rate,
            buffer_size=buffer_size,
            learning_starts=learning_starts,
            batch_size=batch_size,
            tau=tau,
            gamma=gamma,
            train_freq=train_freq,
            gradient_steps=gradient_steps,
            # replay buffer
            replay_buffer_class=replay_buffer_class,
            replay_buffer_kwargs=replay_buffer_kwargs,
            optimize_memory_usage=optimize_memory_usage,
            # target network and exploration
            target_update_interval=target_update_interval,
            exploration_fraction=exploration_fraction,
            exploration_initial_eps=exploration_initial_eps,
            exploration_final_eps=exploration_final_eps,
            max_grad_norm=max_grad_norm,
            # bookkeeping
            tensorboard_log=tensorboard_log,
            create_eval_env=create_eval_env,
            policy_kwargs=policy_kwargs,
            verbose=verbose,
            seed=seed,
            device=device,
            _init_setup_model=_init_setup_model,
        )
|
@@ -0,0 +1,84 @@
|
||||
import logging
|
||||
from typing import Any, Dict # , Tuple
|
||||
|
||||
# import numpy.typing as npt
|
||||
import torch as th
|
||||
from stable_baselines3.common.callbacks import EvalCallback
|
||||
from stable_baselines3.common.vec_env import SubprocVecEnv
|
||||
from freqtrade.freqai.RL.BaseReinforcementLearningModel import (BaseReinforcementLearningModel,
|
||||
make_env)
|
||||
from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ReinforcementLearner_multiproc(BaseReinforcementLearningModel):
    """
    User created Reinforcement Learning Model prediction model.
    """

    def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen):
        """
        Train an agent of type ``self.MODELCLASS`` and return the model to use.

        :param data_dictionary: dictionary holding at least "train_features";
            its length scales the number of training timesteps.
        :param dk: data kitchen whose ``data_path`` receives the tensorboard
            logs and is searched for ``best_model.zip``.
        :return: the model loaded from ``best_model`` when ``best_model.zip``
            exists, otherwise the model as left by ``learn``.
        """
        train_df = data_dictionary["train_features"]
        total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df)

        # model arch
        policy_kwargs = dict(activation_fn=th.nn.ReLU,
                             net_arch=[512, 512, 512])

        model = self.MODELCLASS(self.policy_type, self.train_env, policy_kwargs=policy_kwargs,
                                tensorboard_log=Path(dk.data_path / "tensorboard"),
                                **self.freqai_info['model_training_parameters']
                                )

        model.learn(
            total_timesteps=int(total_timesteps),
            callback=self.eval_callback
        )

        if Path(dk.data_path / "best_model.zip").is_file():
            logger.info('Callback found a best model.')
            best_model = self.MODELCLASS.load(dk.data_path / "best_model")
            return best_model

        logger.info('Couldnt find best model, using final model instead.')

        return model

    def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test, dk):
        """
        If user has particular environment configuration needs, they can do that by
        overriding this function. In the present case, the user wants to setup training
        environments for multiple workers.

        On the first call the vectorised train/eval environments and the eval
        callback are created; on later calls the existing ones are reset and
        given the new data.

        :param data_dictionary: holds "train_features" and "test_features"
        :param prices_train: prices for the training environments
        :param prices_test: prices for the evaluation environments
        :param dk: data kitchen; ``data_path`` is where the callback saves the best model
        """
        train_df = data_dictionary["train_features"]
        test_df = data_dictionary["test_features"]
        eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df)

        # environments
        if not self.train_env:
            env_id = "train_env"
            # Keep at least one worker when the configured thread count is below 2.
            num_cpu = max(1, int(self.freqai_info["data_kitchen_thread_count"] / 2))
            self.train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train,
                                            self.reward_params, self.CONV_WIDTH) for i
                                            in range(num_cpu)])

            eval_env_id = 'eval_env'
            self.eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test,
                                           self.reward_params, self.CONV_WIDTH, monitor=True) for i
                                           in range(num_cpu)])
            self.eval_callback = EvalCallback(self.eval_env, deterministic=True,
                                              render=False, eval_freq=eval_freq,
                                              best_model_save_path=dk.data_path)
        else:
            self.train_env.env_method('reset')
            self.eval_env.env_method('reset')
            self.train_env.env_method('reset_env', train_df, prices_train,
                                      self.CONV_WIDTH, self.reward_params)
            # The evaluation environments get the test split, matching how
            # they are built on the first call (previously the train split).
            self.eval_env.env_method('reset_env', test_df, prices_test,
                                     self.CONV_WIDTH, self.reward_params)
            self.eval_callback.__init__(self.eval_env, deterministic=True,
                                        render=False, eval_freq=eval_freq,
                                        best_model_save_path=dk.data_path)
|
@@ -1,104 +0,0 @@
|
||||
import gc
|
||||
import logging
|
||||
from typing import Any, Dict # , Tuple
|
||||
|
||||
import numpy as np
|
||||
# import numpy.typing as npt
|
||||
import torch as th
|
||||
from stable_baselines3 import PPO
|
||||
from stable_baselines3.common.callbacks import EvalCallback
|
||||
from stable_baselines3.common.monitor import Monitor
|
||||
|
||||
from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
|
||||
from freqtrade.freqai.RL.Base3ActionRLEnv import Actions, Base3ActionRLEnv, Positions
|
||||
from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ReinforcementLearningPPO(BaseReinforcementLearningModel):
    """
    User created Reinforcement Learning Model prediction model.
    """

    def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen):
        """
        Train a PPO agent with an evaluation callback and return the best model.

        :param data_dictionary: holds "train_features" and "test_features";
            their lengths scale the training timesteps and the eval frequency.
        :param dk: data kitchen; ``data_path`` is used for logs, tensorboard
            output and the saved best model.
        :return: the model loaded from ``best_model`` in ``dk.data_path``
        """
        train_df = data_dictionary["train_features"]
        test_df = data_dictionary["test_features"]
        eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df)
        total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df)

        path = dk.data_path
        eval_callback = EvalCallback(self.eval_env, best_model_save_path=f"{path}/",
                                     log_path=f"{path}/ppo/logs/", eval_freq=int(eval_freq),
                                     deterministic=True, render=False)

        # model arch
        policy_kwargs = dict(activation_fn=th.nn.ReLU,
                             net_arch=[256, 256, 128])

        model = PPO('MlpPolicy', self.train_env, policy_kwargs=policy_kwargs,
                    tensorboard_log=f"{path}/ppo/tensorboard/",
                    **self.freqai_info['model_training_parameters']
                    )

        model.learn(
            total_timesteps=int(total_timesteps),
            callback=eval_callback
        )

        # Drop the trained model before loading the saved best one.
        del model
        best_model = PPO.load(dk.data_path / "best_model")

        print('Training finished!')
        gc.collect()

        return best_model

    def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test):
        """
        User overrides this as shown here if they are using a custom MyRLEnv

        Creates the train and (Monitor-wrapped) eval environments on the first
        call; on later calls feeds the new data to the existing environments
        and resets them.
        """
        train_df = data_dictionary["train_features"]
        test_df = data_dictionary["test_features"]

        # environments
        if not self.train_env:
            self.train_env = MyRLEnv(df=train_df, prices=prices_train, window_size=self.CONV_WIDTH,
                                     reward_kwargs=self.reward_params)
            self.eval_env = Monitor(MyRLEnv(df=test_df, prices=prices_test,
                                    window_size=self.CONV_WIDTH,
                                    reward_kwargs=self.reward_params), ".")
        else:
            self.train_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params)
            # The evaluation environment gets the test split, matching how it
            # is built on the first call (previously the train split).
            self.eval_env.reset_env(test_df, prices_test, self.CONV_WIDTH, self.reward_params)
            self.train_env.reset()
            self.eval_env.reset()
|
||||
|
||||
|
||||
class MyRLEnv(Base3ActionRLEnv):
    """
    User can override any function in BaseRLEnv and gym.Env
    """

    def calculate_reward(self, action):
        """
        Reward the step that closes a position; every other step yields 0.

        The reward is the difference of the logs of the fee-adjusted open
        prices at the entry tick and at the current tick.

        :param action: action value, compared against ``Actions.<name>.value``
        :return: float reward
        """
        if self._last_trade_tick is None:
            return 0.

        # close long
        leaves_long = action == Actions.Short.value or action == Actions.Neutral.value
        if leaves_long and self._position == Positions.Long:
            entry_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
            exit_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
            return float(np.log(exit_price) - np.log(entry_price))

        # close short
        leaves_short = action == Actions.Long.value or action == Actions.Neutral.value
        if leaves_short and self._position == Positions.Short:
            entry_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
            exit_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
            return float(np.log(entry_price) - np.log(exit_price))

        return 0.
|
@@ -1,132 +0,0 @@
|
||||
import logging
|
||||
from typing import Any, Dict # , Tuple
|
||||
|
||||
import numpy as np
|
||||
# import numpy.typing as npt
|
||||
import torch as th
|
||||
from stable_baselines3.common.monitor import Monitor
|
||||
from typing import Callable
|
||||
from stable_baselines3 import PPO
|
||||
from stable_baselines3.common.callbacks import EvalCallback
|
||||
from stable_baselines3.common.vec_env import SubprocVecEnv
|
||||
from stable_baselines3.common.utils import set_random_seed
|
||||
from freqtrade.freqai.RL.Base3ActionRLEnv import Base3ActionRLEnv, Actions, Positions
|
||||
from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel
|
||||
from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
|
||||
import gym
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def make_env(env_id: str, rank: int, seed: int, train_df, price,
             reward_params, window_size, monitor=False) -> Callable:
    """
    Utility function for multiprocessed env.

    :param env_id: (str) the environment ID, passed to MyRLEnv as ``id``
    :param rank: (int) index of the subprocess, added to ``seed``
    :param seed: (int) the initial seed for RNG
    :param train_df: passed to MyRLEnv as ``df``
    :param price: passed to MyRLEnv as ``prices``
    :param reward_params: passed to MyRLEnv as ``reward_kwargs``
    :param window_size: passed to MyRLEnv as ``window_size``
    :param monitor: when truthy the environment is wrapped in ``Monitor(env, ".")``
    :return: (Callable) zero-argument function that creates the environment
    """
    # Seeding happens once, while the factory is built, not when it is called.
    set_random_seed(seed)

    def _create() -> gym.Env:
        base_env = MyRLEnv(df=train_df, prices=price, window_size=window_size,
                           reward_kwargs=reward_params, id=env_id, seed=seed + rank)
        return Monitor(base_env, ".") if monitor else base_env

    return _create
|
||||
|
||||
|
||||
class ReinforcementLearningPPO_multiproc(BaseReinforcementLearningModel):
    """
    User created Reinforcement Learning Model prediction model.
    """

    def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen):
        """
        Train a PPO agent with an evaluation callback and return the best model.

        :param data_dictionary: holds "train_features" and "test_features";
            their lengths scale the training timesteps and the eval frequency.
        :param dk: data kitchen; ``data_path`` is used for logs, tensorboard
            output and the saved best model.
        :return: the model loaded from ``best_model`` in ``dk.data_path``
        """
        train_df = data_dictionary["train_features"]
        test_df = data_dictionary["test_features"]
        eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df)
        total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df)

        path = dk.data_path
        eval_callback = EvalCallback(self.eval_env, best_model_save_path=f"{path}/",
                                     log_path=f"{path}/ppo/logs/", eval_freq=int(eval_freq),
                                     deterministic=True, render=False)

        # model arch
        policy_kwargs = dict(activation_fn=th.nn.ReLU,
                             net_arch=[512, 512, 512])

        model = PPO('MlpPolicy', self.train_env, policy_kwargs=policy_kwargs,
                    tensorboard_log=f"{path}/ppo/tensorboard/",
                    **self.freqai_info['model_training_parameters']
                    )

        model.learn(
            total_timesteps=int(total_timesteps),
            callback=eval_callback
        )

        best_model = PPO.load(dk.data_path / "best_model")
        print('Training finished!')

        return best_model

    def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test):
        """
        Set up the vectorised train and eval environments built from MyRLEnv.

        On the first call the environments are created through ``make_env``;
        on later calls the existing ones receive the new data and are reset.

        :param data_dictionary: holds "train_features" and "test_features"
        :param prices_train: prices for the training environments
        :param prices_test: prices for the evaluation environments
        """
        train_df = data_dictionary["train_features"]
        test_df = data_dictionary["test_features"]

        # environments
        if not self.train_env:
            env_id = "train_env"
            # Keep at least one worker when the configured thread count is below 2.
            num_cpu = max(1, int(self.freqai_info["data_kitchen_thread_count"] / 2))
            self.train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train,
                                            self.reward_params, self.CONV_WIDTH) for i
                                            in range(num_cpu)])

            eval_env_id = 'eval_env'
            self.eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test,
                                           self.reward_params, self.CONV_WIDTH, monitor=True) for i
                                           in range(num_cpu)])
        else:
            self.train_env.env_method('reset_env', train_df, prices_train,
                                      self.CONV_WIDTH, self.reward_params)
            # The evaluation environments get the test split, matching how
            # they are built on the first call (previously the train split).
            self.eval_env.env_method('reset_env', test_df, prices_test,
                                     self.CONV_WIDTH, self.reward_params)
            self.train_env.env_method('reset')
            self.eval_env.env_method('reset')
|
||||
|
||||
|
||||
class MyRLEnv(Base3ActionRLEnv):
    """
    User can override any function in BaseRLEnv and gym.Env
    """

    def calculate_reward(self, action):
        """
        Reward the step that closes a position; every other step yields 0.

        The reward is the difference of the logs of the fee-adjusted open
        prices at the entry tick and at the current tick.

        :param action: action value, compared against ``Actions.<name>.value``
        :return: float reward
        """
        if self._last_trade_tick is None:
            return 0.

        # close long
        leaves_long = action == Actions.Short.value or action == Actions.Neutral.value
        if leaves_long and self._position == Positions.Long:
            entry_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
            exit_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
            return float(np.log(exit_price) - np.log(entry_price))

        # close short
        leaves_short = action == Actions.Long.value or action == Actions.Neutral.value
        if leaves_short and self._position == Positions.Short:
            entry_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
            exit_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
            return float(np.log(entry_price) - np.log(exit_price))

        return 0.
|
@@ -1,115 +0,0 @@
|
||||
import logging
|
||||
from typing import Any, Dict # Optional
|
||||
import torch as th
|
||||
from stable_baselines3.common.callbacks import EvalCallback
|
||||
from stable_baselines3.common.monitor import Monitor
|
||||
from freqtrade.freqai.RL.Base5ActionRLEnv import Base5ActionRLEnv, Actions, Positions
|
||||
from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel
|
||||
from freqtrade.freqai.RL.TDQNagent import TDQN
|
||||
from stable_baselines3 import DQN
|
||||
from stable_baselines3.common.buffers import ReplayBuffer
|
||||
import numpy as np
|
||||
import gc
|
||||
from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ReinforcementLearningTDQN(BaseReinforcementLearningModel):
    """
    User created Reinforcement Learning Model prediction model.
    """

    def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen):
        """
        Train a TDQN agent with an evaluation callback and return the best model.

        :param data_dictionary: holds "train_features" and "test_features";
            their lengths scale the training timesteps and the eval frequency.
        :param dk: data kitchen; ``data_path`` is used for logs, tensorboard
            output and the saved best model.
        :return: the model loaded through ``DQN.load`` from ``best_model``
        """
        train_df = data_dictionary["train_features"]
        test_df = data_dictionary["test_features"]
        eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df)
        total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df)

        path = dk.data_path
        eval_callback = EvalCallback(self.eval_env, best_model_save_path=f"{path}/",
                                     log_path=f"{path}/tdqn/logs/", eval_freq=int(eval_freq),
                                     deterministic=True, render=False)

        # model arch
        policy_kwargs = dict(activation_fn=th.nn.ReLU,
                             net_arch=[256, 256, 128])

        model = TDQN('TMultiInputPolicy', self.train_env,
                     tensorboard_log=f"{path}/tdqn/tensorboard/",
                     policy_kwargs=policy_kwargs,
                     replay_buffer_class=ReplayBuffer,
                     **self.freqai_info['model_training_parameters']
                     )

        model.learn(
            total_timesteps=int(total_timesteps),
            callback=eval_callback
        )

        # Drop the trained model before loading the saved best one.
        del model
        best_model = DQN.load(dk.data_path / "best_model")

        print('Training finished!')
        gc.collect()
        return best_model

    def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test):
        """
        User overrides this as shown here if they are using a custom MyRLEnv

        Creates the train and (Monitor-wrapped) eval environments on the first
        call; on later calls feeds the new data to the existing environments
        and resets them.
        """
        train_df = data_dictionary["train_features"]
        test_df = data_dictionary["test_features"]

        # environments
        if not self.train_env:
            self.train_env = MyRLEnv(df=train_df, prices=prices_train, window_size=self.CONV_WIDTH,
                                     reward_kwargs=self.reward_params)
            self.eval_env = Monitor(MyRLEnv(df=test_df, prices=prices_test,
                                    window_size=self.CONV_WIDTH,
                                    reward_kwargs=self.reward_params), ".")
        else:
            self.train_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params)
            # The evaluation environment gets the test split, matching how it
            # is built on the first call (previously the train split).
            self.eval_env.reset_env(test_df, prices_test, self.CONV_WIDTH, self.reward_params)
            self.train_env.reset()
            self.eval_env.reset()
|
||||
|
||||
|
||||
# User can inherit and customize 5 action environment
|
||||
class MyRLEnv(Base5ActionRLEnv):
    """
    User can override any function in BaseRLEnv and gym.Env. Here the user
    Adds 5 actions.
    """

    def calculate_reward(self, action):
        """
        Reward the agent for closing a position; every other step yields 0.

        The reward is the difference of the logs of the fee-adjusted open
        prices at the entry tick and at the current tick. It is doubled when
        the latest entry of ``close_trade_profit`` exceeds ``profit_aim * rr``.

        :param action: action value, compared against ``Actions.<name>.value``
        :return: float reward
        """
        if self._last_trade_tick is None:
            return 0.

        # close long
        if action == Actions.Long_sell.value and self._position == Positions.Long:
            last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
            current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
            log_return = np.log(current_price) - np.log(last_trade_price)
            return float(log_return * self._profit_aim_factor())

        # close short
        if action == Actions.Short_buy.value and self._position == Positions.Short:
            last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
            current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
            log_return = np.log(last_trade_price) - np.log(current_price)
            return float(log_return * self._profit_aim_factor())

        return 0.

    def _profit_aim_factor(self):
        """Return 2 when the latest closed-trade profit beats ``profit_aim * rr``, else 1."""
        # This check previously lived in a second branch with the same
        # condition as a branch that had already returned, so it never ran.
        # NOTE(review): confirm that close_trade_profit[-1] describes the trade
        # being closed at the moment calculate_reward is called.
        profits = self.close_trade_profit
        if len(profits) > 0 and profits[-1] > self.profit_aim * self.rr:
            return 2
        return 1
|
@@ -1,148 +0,0 @@
|
||||
import logging
|
||||
from typing import Any, Dict # Optional
|
||||
import torch as th
|
||||
import numpy as np
|
||||
import gym
|
||||
from typing import Callable
|
||||
from stable_baselines3.common.callbacks import EvalCallback
|
||||
# EvalCallback , StopTrainingOnNoModelImprovement, StopTrainingOnRewardThreshold
|
||||
from stable_baselines3.common.monitor import Monitor
|
||||
from stable_baselines3.common.vec_env import SubprocVecEnv
|
||||
from stable_baselines3.common.utils import set_random_seed
|
||||
from stable_baselines3 import DQN
|
||||
from freqtrade.freqai.RL.Base5ActionRLEnv import Base5ActionRLEnv, Actions, Positions
|
||||
from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel
|
||||
from freqtrade.freqai.RL.TDQNagent import TDQN
|
||||
from stable_baselines3.common.buffers import ReplayBuffer
|
||||
from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def make_env(env_id: str, rank: int, seed: int, train_df, price,
             reward_params, window_size, monitor=False) -> Callable:
    """
    Utility function for multiprocessed env.

    :param env_id: (str) the environment ID, passed to MyRLEnv as ``id``
    :param rank: (int) index of the subprocess, added to ``seed``
    :param seed: (int) the initial seed for RNG
    :param train_df: passed to MyRLEnv as ``df``
    :param price: passed to MyRLEnv as ``prices``
    :param reward_params: passed to MyRLEnv as ``reward_kwargs``
    :param window_size: passed to MyRLEnv as ``window_size``
    :param monitor: when truthy the environment is wrapped in ``Monitor(env, ".")``
    :return: (Callable) zero-argument function that creates the environment
    """
    # Seeding happens once, while the factory is built, not when it is called.
    set_random_seed(seed)

    def _create() -> gym.Env:
        base_env = MyRLEnv(df=train_df, prices=price, window_size=window_size,
                           reward_kwargs=reward_params, id=env_id, seed=seed + rank)
        return Monitor(base_env, ".") if monitor else base_env

    return _create
|
||||
|
||||
|
||||
class ReinforcementLearningTDQN_multiproc(BaseReinforcementLearningModel):
    """
    User created Reinforcement Learning Model prediction model.

    Trains a TDQN agent on SubprocVecEnv (multi-process) train and eval
    environments built from MyRLEnv.
    """

    def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen):
        """
        Train the TDQN agent and return the model to use for predictions.

        :param data_dictionary: dictionary containing at least the
            "train_features" and "test_features" entries
        :param dk: data kitchen of the current pair; `dk.data_path` is where
            the evaluation callback writes its best model and logs
        :return: the best model saved by the evaluation callback when one
            exists, otherwise the model as it is at the end of training
        """
        train_df = data_dictionary["train_features"]
        test_df = data_dictionary["test_features"]
        eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df)
        total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df)

        path = dk.data_path

        eval_callback = EvalCallback(self.eval_env, best_model_save_path=f"{path}/",
                                     log_path=f"{path}/tdqn/logs/", eval_freq=int(eval_freq),
                                     deterministic=True, render=False)
        # model arch
        policy_kwargs = dict(activation_fn=th.nn.ReLU,
                             net_arch=[512, 512, 512])

        model = TDQN('TMultiInputPolicy', self.train_env,
                     policy_kwargs=policy_kwargs,
                     tensorboard_log=f"{path}/tdqn/tensorboard/",
                     replay_buffer_class=ReplayBuffer,
                     **self.freqai_info['model_training_parameters']
                     )

        model.learn(
            total_timesteps=int(total_timesteps),
            callback=eval_callback
        )

        logger.info('Training finished!')

        # The callback only writes best_model.zip once an evaluation has run,
        # so fall back to the final model instead of crashing on a missing file.
        # NOTE(review): assumes dk.data_path is a pathlib.Path - confirm.
        best_model_path = dk.data_path / "best_model.zip"
        if best_model_path.is_file():
            logger.info('Callback found a best model.')
            return DQN.load(best_model_path)

        logger.info('Couldnt find best model, using final model instead.')

        return model

    def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test):
        """
        User overrides this in their prediction model if they use a custom MyRLEnv.
        Otherwise, leaving this as is will default to Base5ActEnv.

        On the first call the train and eval vectorized environments are created;
        on later calls the existing environments are reset with the new data.

        :param data_dictionary: dictionary containing at least the
            "train_features" and "test_features" entries
        :param prices_train: prices matching the train features
        :param prices_test: prices matching the test features
        """
        train_df = data_dictionary["train_features"]
        test_df = data_dictionary["test_features"]

        # environments
        if not self.train_env:
            env_id = "train_env"
            # Always spawn at least one worker: a thread count of 1 would otherwise
            # yield zero environments and an unusable SubprocVecEnv.
            num_cpu = max(1, int(self.freqai_info["data_kitchen_thread_count"] / 2))
            self.train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train,
                                                     self.reward_params, self.CONV_WIDTH)
                                            for i in range(num_cpu)])

            eval_env_id = 'eval_env'
            self.eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test,
                                                    self.reward_params, self.CONV_WIDTH,
                                                    monitor=True)
                                           for i in range(num_cpu)])
        else:
            self.train_env.env_method('reset_env', train_df, prices_train,
                                      self.CONV_WIDTH, self.reward_params)
            # The eval env must be refreshed with the test split, mirroring how it
            # was created above; feeding it train data would evaluate on seen data.
            self.eval_env.env_method('reset_env', test_df, prices_test,
                                     self.CONV_WIDTH, self.reward_params)
            self.train_env.env_method('reset')
            self.eval_env.env_method('reset')
|
||||
|
||||
# User can inherit and customize 5 action environment
|
||||
|
||||
|
||||
class MyRLEnv(Base5ActionRLEnv):
    """
    User can override any function in BaseRLEnv and gym.Env. Here the user
    customizes the reward of the 5 action environment.
    """

    def _reward_multiplier(self):
        """
        Return 2 when the last closed trade profit exceeds profit_aim * rr, else 1.

        The length check avoids indexing an empty profit history.
        """
        if (len(self.close_trade_profit) > 0
                and self.close_trade_profit[-1] > self.profit_aim * self.rr):
            return 2
        return 1

    def calculate_reward(self, action):
        """
        Compute the reward for `action` at the current tick.

        Closing a long or a short position is rewarded with the fee-adjusted log
        return between the open price at the last trade tick and the open price
        at the current tick. That return is doubled when the last closed trade
        profit is above `profit_aim * rr`. Every other situation yields 0.

        :param action: action value, compared against `Actions.<name>.value`
        :return: (float) reward
        """
        if self._last_trade_tick is None:
            return 0.

        # close long
        if action == Actions.Long_sell.value and self._position == Positions.Long:
            last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
            current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
            log_return = np.log(current_price) - np.log(last_trade_price)
            # Previously the doubled reward lived in a second, identical `if`
            # that could never be reached; apply the bonus here instead.
            return float(log_return * self._reward_multiplier())

        # close short
        if action == Actions.Short_buy.value and self._position == Positions.Short:
            last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
            current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
            log_return = np.log(last_trade_price) - np.log(current_price)
            return float(log_return * self._reward_multiplier())

        return 0.
|
Reference in New Issue
Block a user