Reinforce training with state info, reinforce prediction with state info, and restructure the config to accommodate all parameters from any user-imported model type. Set the 5-action environment (5Act) as the default env on TDQN. Clean up the example config.
This commit is contained in:
@@ -24,18 +24,16 @@ class ReinforcementLearningPPO(BaseReinforcementLearningModel):
|
||||
def fit_rl(self, data_dictionary: Dict[str, Any], pair: str, dk: FreqaiDataKitchen,
|
||||
prices_train: DataFrame, prices_test: DataFrame):
|
||||
|
||||
agent_params = self.freqai_info['model_training_parameters']
|
||||
reward_params = self.freqai_info['model_reward_parameters']
|
||||
train_df = data_dictionary["train_features"]
|
||||
test_df = data_dictionary["test_features"]
|
||||
eval_freq = agent_params.get("eval_cycles", 4) * len(test_df)
|
||||
total_timesteps = agent_params["train_cycles"] * len(train_df)
|
||||
eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df)
|
||||
total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df)
|
||||
|
||||
# environments
|
||||
train_env = MyRLEnv(df=train_df, prices=prices_train, window_size=self.CONV_WIDTH,
|
||||
reward_kwargs=reward_params)
|
||||
reward_kwargs=self.reward_params)
|
||||
eval = MyRLEnv(df=test_df, prices=prices_test,
|
||||
window_size=self.CONV_WIDTH, reward_kwargs=reward_params)
|
||||
window_size=self.CONV_WIDTH, reward_kwargs=self.reward_params)
|
||||
eval_env = Monitor(eval, ".")
|
||||
|
||||
path = dk.data_path
|
||||
@@ -49,7 +47,7 @@ class ReinforcementLearningPPO(BaseReinforcementLearningModel):
|
||||
|
||||
model = PPO('MlpPolicy', train_env, policy_kwargs=policy_kwargs,
|
||||
tensorboard_log=f"{path}/ppo/tensorboard/", learning_rate=0.00025,
|
||||
gamma=0.9, verbose=1
|
||||
**self.freqai_info['model_training_parameters']
|
||||
)
|
||||
|
||||
model.learn(
|
||||
|
@@ -51,23 +51,20 @@ class ReinforcementLearningPPO_multiproc(BaseReinforcementLearningModel):
|
||||
def fit_rl(self, data_dictionary: Dict[str, Any], pair: str, dk: FreqaiDataKitchen,
|
||||
prices_train: DataFrame, prices_test: DataFrame):
|
||||
|
||||
agent_params = self.freqai_info['model_training_parameters']
|
||||
reward_params = self.freqai_info['model_reward_parameters']
|
||||
train_df = data_dictionary["train_features"]
|
||||
test_df = data_dictionary["test_features"]
|
||||
eval_freq = agent_params.get("eval_cycles", 4) * len(test_df)
|
||||
total_timesteps = agent_params["train_cycles"] * len(train_df)
|
||||
learning_rate = agent_params["learning_rate"]
|
||||
eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df)
|
||||
total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df)
|
||||
|
||||
env_id = "train_env"
|
||||
th.set_num_threads(dk.thread_count)
|
||||
num_cpu = int(dk.thread_count / 2)
|
||||
train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train, reward_params,
|
||||
self.CONV_WIDTH) for i in range(num_cpu)])
|
||||
train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train,
|
||||
self.reward_params, self.CONV_WIDTH) for i in range(num_cpu)])
|
||||
|
||||
eval_env_id = 'eval_env'
|
||||
eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test, reward_params,
|
||||
self.CONV_WIDTH, monitor=True) for i in range(num_cpu)])
|
||||
eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test,
|
||||
self.reward_params, self.CONV_WIDTH, monitor=True) for i in
|
||||
range(num_cpu)])
|
||||
|
||||
path = dk.data_path
|
||||
eval_callback = EvalCallback(eval_env, best_model_save_path=f"{path}/",
|
||||
@@ -80,9 +77,7 @@ class ReinforcementLearningPPO_multiproc(BaseReinforcementLearningModel):
|
||||
|
||||
model = PPO('MlpPolicy', train_env, policy_kwargs=policy_kwargs,
|
||||
tensorboard_log=f"{path}/ppo/tensorboard/",
|
||||
learning_rate=learning_rate,
|
||||
gamma=0.9,
|
||||
verbose=1
|
||||
**self.freqai_info['model_training_parameters']
|
||||
)
|
||||
|
||||
model.learn(
|
||||
|
@@ -3,8 +3,7 @@ from typing import Any, Dict # Optional
|
||||
import torch as th
|
||||
from stable_baselines3.common.callbacks import EvalCallback
|
||||
from stable_baselines3.common.monitor import Monitor
|
||||
# from stable_baselines3.common.vec_env import SubprocVecEnv
|
||||
from freqtrade.freqai.RL.Base3ActionRLEnv import Base3ActionRLEnv, Actions, Positions
|
||||
from freqtrade.freqai.RL.Base5ActionRLEnv import Base5ActionRLEnv, Actions, Positions
|
||||
from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel
|
||||
from freqtrade.freqai.RL.TDQNagent import TDQN
|
||||
from stable_baselines3 import DQN
|
||||
@@ -25,18 +24,16 @@ class ReinforcementLearningTDQN(BaseReinforcementLearningModel):
|
||||
def fit_rl(self, data_dictionary: Dict[str, Any], pair: str, dk: FreqaiDataKitchen,
|
||||
prices_train: DataFrame, prices_test: DataFrame):
|
||||
|
||||
agent_params = self.freqai_info['model_training_parameters']
|
||||
reward_params = self.freqai_info['model_reward_parameters']
|
||||
train_df = data_dictionary["train_features"]
|
||||
test_df = data_dictionary["test_features"]
|
||||
eval_freq = agent_params["eval_cycles"] * len(test_df)
|
||||
total_timesteps = agent_params["train_cycles"] * len(train_df)
|
||||
eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df)
|
||||
total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df)
|
||||
|
||||
# environments
|
||||
train_env = MyRLEnv(df=train_df, prices=prices_train, window_size=self.CONV_WIDTH,
|
||||
reward_kwargs=reward_params)
|
||||
reward_kwargs=self.reward_params)
|
||||
eval = MyRLEnv(df=test_df, prices=prices_test,
|
||||
window_size=self.CONV_WIDTH, reward_kwargs=reward_params)
|
||||
window_size=self.CONV_WIDTH, reward_kwargs=self.reward_params)
|
||||
eval_env = Monitor(eval, ".")
|
||||
eval_env.reset()
|
||||
|
||||
@@ -50,12 +47,10 @@ class ReinforcementLearningTDQN(BaseReinforcementLearningModel):
|
||||
net_arch=[256, 256, 128])
|
||||
|
||||
model = TDQN('TMultiInputPolicy', train_env,
|
||||
policy_kwargs=policy_kwargs,
|
||||
tensorboard_log=f"{path}/tdqn/tensorboard/",
|
||||
learning_rate=0.00025, gamma=0.9,
|
||||
target_update_interval=5000, buffer_size=50000,
|
||||
exploration_initial_eps=1, exploration_final_eps=0.1,
|
||||
replay_buffer_class=ReplayBuffer
|
||||
policy_kwargs=policy_kwargs,
|
||||
replay_buffer_class=ReplayBuffer,
|
||||
**self.freqai_info['model_training_parameters']
|
||||
)
|
||||
|
||||
model.learn(
|
||||
@@ -70,9 +65,11 @@ class ReinforcementLearningTDQN(BaseReinforcementLearningModel):
|
||||
return best_model
|
||||
|
||||
|
||||
class MyRLEnv(Base3ActionRLEnv):
|
||||
# User can inherit and customize 5 action environment
|
||||
class MyRLEnv(Base5ActionRLEnv):
|
||||
"""
|
||||
User can override any function in BaseRLEnv and gym.Env
|
||||
User can override any function in BaseRLEnv and gym.Env. Here the user
|
||||
Adds 5 actions.
|
||||
"""
|
||||
|
||||
def calculate_reward(self, action):
|
||||
@@ -81,55 +78,27 @@ class MyRLEnv(Base3ActionRLEnv):
|
||||
return 0.
|
||||
|
||||
# close long
|
||||
if (action == Actions.Short.value or
|
||||
action == Actions.Neutral.value) and self._position == Positions.Long:
|
||||
if action == Actions.Long_sell.value and self._position == Positions.Long:
|
||||
last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
|
||||
current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
|
||||
return float(np.log(current_price) - np.log(last_trade_price))
|
||||
|
||||
if action == Actions.Long_sell.value and self._position == Positions.Long:
|
||||
if self.close_trade_profit[-1] > self.profit_aim * self.rr:
|
||||
last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
|
||||
current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
|
||||
return float((np.log(current_price) - np.log(last_trade_price)) * 2)
|
||||
|
||||
# close short
|
||||
if (action == Actions.Long.value or
|
||||
action == Actions.Neutral.value) and self._position == Positions.Short:
|
||||
if action == Actions.Short_buy.value and self._position == Positions.Short:
|
||||
last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
|
||||
current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
|
||||
return float(np.log(last_trade_price) - np.log(current_price))
|
||||
|
||||
if action == Actions.Short_buy.value and self._position == Positions.Short:
|
||||
if self.close_trade_profit[-1] > self.profit_aim * self.rr:
|
||||
last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
|
||||
current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
|
||||
return float((np.log(last_trade_price) - np.log(current_price)) * 2)
|
||||
|
||||
return 0.
|
||||
|
||||
# User can inherit and customize 5 action environment
|
||||
# class MyRLEnv(Base5ActionRLEnv):
|
||||
# """
|
||||
# User can override any function in BaseRLEnv and gym.Env. Here the user
|
||||
# Adds 5 actions.
|
||||
# """
|
||||
|
||||
# def calculate_reward(self, action):
|
||||
|
||||
# if self._last_trade_tick is None:
|
||||
# return 0.
|
||||
|
||||
# # close long
|
||||
# if action == Actions.Long_sell.value and self._position == Positions.Long:
|
||||
# last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
|
||||
# current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
|
||||
# return float(np.log(current_price) - np.log(last_trade_price))
|
||||
|
||||
# if action == Actions.Long_sell.value and self._position == Positions.Long:
|
||||
# if self.close_trade_profit[-1] > self.profit_aim * self.rr:
|
||||
# last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
|
||||
# current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
|
||||
# return float((np.log(current_price) - np.log(last_trade_price)) * 2)
|
||||
|
||||
# # close short
|
||||
# if action == Actions.Short_buy.value and self._position == Positions.Short:
|
||||
# last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
|
||||
# current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
|
||||
# return float(np.log(last_trade_price) - np.log(current_price))
|
||||
|
||||
# if action == Actions.Short_buy.value and self._position == Positions.Short:
|
||||
# if self.close_trade_profit[-1] > self.profit_aim * self.rr:
|
||||
# last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
|
||||
# current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
|
||||
# return float((np.log(last_trade_price) - np.log(current_price)) * 2)
|
||||
|
||||
# return 0.
|
||||
|
@@ -10,7 +10,7 @@ from stable_baselines3.common.monitor import Monitor
|
||||
from stable_baselines3.common.vec_env import SubprocVecEnv
|
||||
from stable_baselines3.common.utils import set_random_seed
|
||||
from stable_baselines3 import DQN
|
||||
from freqtrade.freqai.RL.Base3ActionRLEnv import Base3ActionRLEnv, Actions, Positions
|
||||
from freqtrade.freqai.RL.Base5ActionRLEnv import Base5ActionRLEnv, Actions, Positions
|
||||
from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel
|
||||
from freqtrade.freqai.RL.TDQNagent import TDQN
|
||||
from stable_baselines3.common.buffers import ReplayBuffer
|
||||
@@ -50,22 +50,20 @@ class ReinforcementLearningTDQN_multiproc(BaseReinforcementLearningModel):
|
||||
def fit_rl(self, data_dictionary: Dict[str, Any], pair: str, dk: FreqaiDataKitchen,
|
||||
prices_train: DataFrame, prices_test: DataFrame):
|
||||
|
||||
agent_params = self.freqai_info['model_training_parameters']
|
||||
reward_params = self.freqai_info['model_reward_parameters']
|
||||
train_df = data_dictionary["train_features"]
|
||||
test_df = data_dictionary["test_features"]
|
||||
eval_freq = agent_params["eval_cycles"] * len(test_df)
|
||||
total_timesteps = agent_params["train_cycles"] * len(train_df)
|
||||
learning_rate = agent_params["learning_rate"]
|
||||
eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df)
|
||||
total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df)
|
||||
|
||||
env_id = "train_env"
|
||||
num_cpu = int(dk.thread_count / 2)
|
||||
train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train, reward_params,
|
||||
self.CONV_WIDTH) for i in range(num_cpu)])
|
||||
train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train,
|
||||
self.reward_params, self.CONV_WIDTH) for i in range(num_cpu)])
|
||||
|
||||
eval_env_id = 'eval_env'
|
||||
eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test, reward_params,
|
||||
self.CONV_WIDTH, monitor=True) for i in range(num_cpu)])
|
||||
eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test,
|
||||
self.reward_params, self.CONV_WIDTH, monitor=True) for i in
|
||||
range(num_cpu)])
|
||||
|
||||
path = dk.data_path
|
||||
stop_train_callback = StopTrainingOnNoModelImprovement(
|
||||
@@ -91,10 +89,8 @@ class ReinforcementLearningTDQN_multiproc(BaseReinforcementLearningModel):
|
||||
model = TDQN('TMultiInputPolicy', train_env,
|
||||
policy_kwargs=policy_kwargs,
|
||||
tensorboard_log=f"{path}/tdqn/tensorboard/",
|
||||
learning_rate=learning_rate, gamma=0.9,
|
||||
target_update_interval=5000, buffer_size=50000,
|
||||
exploration_initial_eps=1, exploration_final_eps=0.1,
|
||||
replay_buffer_class=ReplayBuffer
|
||||
replay_buffer_class=ReplayBuffer,
|
||||
**self.freqai_info['model_training_parameters']
|
||||
)
|
||||
|
||||
model.learn(
|
||||
@@ -109,9 +105,11 @@ class ReinforcementLearningTDQN_multiproc(BaseReinforcementLearningModel):
|
||||
return best_model
|
||||
|
||||
|
||||
class MyRLEnv(Base3ActionRLEnv):
|
||||
# User can inherit and customize 5 action environment
|
||||
class MyRLEnv(Base5ActionRLEnv):
|
||||
"""
|
||||
User can override any function in BaseRLEnv and gym.Env
|
||||
User can override any function in BaseRLEnv and gym.Env. Here the user
|
||||
Adds 5 actions.
|
||||
"""
|
||||
|
||||
def calculate_reward(self, action):
|
||||
@@ -120,55 +118,27 @@ class MyRLEnv(Base3ActionRLEnv):
|
||||
return 0.
|
||||
|
||||
# close long
|
||||
if (action == Actions.Short.value or
|
||||
action == Actions.Neutral.value) and self._position == Positions.Long:
|
||||
if action == Actions.Long_sell.value and self._position == Positions.Long:
|
||||
last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
|
||||
current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
|
||||
return float(np.log(current_price) - np.log(last_trade_price))
|
||||
|
||||
if action == Actions.Long_sell.value and self._position == Positions.Long:
|
||||
if self.close_trade_profit[-1] > self.profit_aim * self.rr:
|
||||
last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
|
||||
current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
|
||||
return float((np.log(current_price) - np.log(last_trade_price)) * 2)
|
||||
|
||||
# close short
|
||||
if (action == Actions.Long.value or
|
||||
action == Actions.Neutral.value) and self._position == Positions.Short:
|
||||
if action == Actions.Short_buy.value and self._position == Positions.Short:
|
||||
last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
|
||||
current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
|
||||
return float(np.log(last_trade_price) - np.log(current_price))
|
||||
|
||||
if action == Actions.Short_buy.value and self._position == Positions.Short:
|
||||
if self.close_trade_profit[-1] > self.profit_aim * self.rr:
|
||||
last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
|
||||
current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
|
||||
return float((np.log(last_trade_price) - np.log(current_price)) * 2)
|
||||
|
||||
return 0.
|
||||
|
||||
# User can inherit and customize 5 action environment
|
||||
# class MyRLEnv(Base5ActionRLEnv):
|
||||
# """
|
||||
# User can override any function in BaseRLEnv and gym.Env. Here the user
|
||||
# Adds 5 actions.
|
||||
# """
|
||||
|
||||
# def calculate_reward(self, action):
|
||||
|
||||
# if self._last_trade_tick is None:
|
||||
# return 0.
|
||||
|
||||
# # close long
|
||||
# if action == Actions.Long_sell.value and self._position == Positions.Long:
|
||||
# last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
|
||||
# current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
|
||||
# return float(np.log(current_price) - np.log(last_trade_price))
|
||||
|
||||
# if action == Actions.Long_sell.value and self._position == Positions.Long:
|
||||
# if self.close_trade_profit[-1] > self.profit_aim * self.rr:
|
||||
# last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
|
||||
# current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
|
||||
# return float((np.log(current_price) - np.log(last_trade_price)) * 2)
|
||||
|
||||
# # close short
|
||||
# if action == Actions.Short_buy.value and self._position == Positions.Short:
|
||||
# last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
|
||||
# current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
|
||||
# return float(np.log(last_trade_price) - np.log(current_price))
|
||||
|
||||
# if action == Actions.Short_buy.value and self._position == Positions.Short:
|
||||
# if self.close_trade_profit[-1] > self.profit_aim * self.rr:
|
||||
# last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
|
||||
# current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
|
||||
# return float((np.log(last_trade_price) - np.log(current_price)) * 2)
|
||||
|
||||
# return 0.
|
||||
|
Reference in New Issue
Block a user