Reinforcement learning: add trade state info to the training and prediction observations, restructure the config so that any user-imported model type can pass its own parameters, set the 5-action environment as the default for TDQN, and clean up the example config.
parent 7962a1439b
commit 5d4e5e69fe
@@ -8,7 +8,7 @@
"tradable_balance_ratio": 1,
"fiat_display_currency": "USD",
"dry_run": true,
"timeframe": "3m",
"timeframe": "5m",
"dataformat_ohlcv": "json",
"dry_run_wallet": 12000,
"cancel_open_orders_on_exit": true,

@@ -35,7 +35,6 @@
},
"entry_pricing": {
"price_side": "same",
"purge_old_models": true,
"use_order_book": true,
"order_book_top": 1,
"price_last_balance": 0.0,

@@ -56,10 +55,8 @@
],
"freqai": {
"enabled": true,
"startup_candles": 1000,
"model_save_type": "stable_baselines_ppo",
"model_save_type": "stable_baselines_dqn",
"conv_width": 10,
"follow_mode": false,
"purge_old_models": true,
"train_period_days": 10,
"backtest_period_days": 2,

@@ -71,13 +68,9 @@
"ETH/USDT"
],
"include_timeframes": [
"3m",
"15m"
"5m",
"30m"
],
"include_shifted_candles": 0,
"weight_factor": 0.9,
"principal_component_analysis": false,
"use_SVM_to_remove_outliers": false,
"indicator_max_period_candles": 10,
"indicator_periods_candles": [5, 10]
},

@@ -86,16 +79,22 @@
"random_state": 1,
"shuffle": false
},
"model_training_parameters": {
"ent_coef": 0.005,
"learning_rate": 0.000025,
"batch_size": 256,
"eval_cycles" : 5,
"train_cycles" : 15
"model_training_parameters": {
"learning_rate": 0.00025,
"gamma": 0.9,
"target_update_interval": 5000,
"buffer_size": 50000,
"exploration_initial_eps": 1,
"exploration_final_eps": 0.1,
"verbose": 1
},
"model_reward_parameters": {
"rr": 1,
"profit_aim": 0.01
"rl_config": {
"train_cycles": 15,
"eval_cycles": 5,
"model_reward_parameters": {
"rr": 1,
"profit_aim": 0.02
}
}
},
"bot_name": "RL_test",
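To make the restructuring above easier to follow, here is a minimal sketch of how the reorganized freqai section nests after this commit. It is written as a Python dict purely for illustration (the real file is JSON); only the keys visible in the diff are shown, and everything else in the file is omitted.

```python
# Illustrative sketch only: agent hyperparameters now live under
# "model_training_parameters" and are forwarded verbatim to the model
# constructor, while cycle counts and reward shaping move into "rl_config".
freqai_section = {
    "model_save_type": "stable_baselines_dqn",
    "model_training_parameters": {
        "learning_rate": 0.00025,
        "gamma": 0.9,
        "target_update_interval": 5000,
        "buffer_size": 50000,
        "exploration_initial_eps": 1,
        "exploration_final_eps": 0.1,
        "verbose": 1,
    },
    "rl_config": {
        "train_cycles": 15,
        "eval_cycles": 5,
        "model_reward_parameters": {"rr": 1, "profit_aim": 0.02},
    },
}

# How the trainer code in this commit reads these values:
rl_config = freqai_section["rl_config"]
print(rl_config["train_cycles"], rl_config["model_reward_parameters"]["profit_aim"])
```

Because the whole model_training_parameters dict is unpacked into the model constructor, swapping PPO kwargs for DQN kwargs (or any other stable-baselines3 model's kwargs) requires no code change.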
@@ -6,6 +6,7 @@ import gym
import numpy as np
from gym import spaces
from gym.utils import seeding
from pandas import DataFrame

logger = logging.getLogger(__name__)

@@ -35,7 +36,8 @@ class Base3ActionRLEnv(gym.Env):

    metadata = {'render.modes': ['human']}

    def __init__(self, df, prices, reward_kwargs, window_size=10, starting_point=True,
    def __init__(self, df: DataFrame = DataFrame(), prices: DataFrame = DataFrame(),
                 reward_kwargs: dict = {}, window_size=10, starting_point=True,
                 id: str = 'baseenv-1', seed: int = 1):
        assert df.ndim == 2
@@ -6,6 +6,7 @@ import gym
import numpy as np
from gym import spaces
from gym.utils import seeding
from pandas import DataFrame

logger = logging.getLogger(__name__)

@@ -39,7 +40,8 @@ class Base5ActionRLEnv(gym.Env):
    """
    metadata = {'render.modes': ['human']}

    def __init__(self, df, prices, reward_kwargs, window_size=10, starting_point=True,
    def __init__(self, df: DataFrame = DataFrame(), prices: DataFrame = DataFrame(),
                 reward_kwargs: dict = {}, window_size=10, starting_point=True,
                 id: str = 'baseenv-1', seed: int = 1):
        assert df.ndim == 2

@@ -56,7 +58,7 @@
        self.fee = 0.0015

        # # spaces
        self.shape = (window_size, self.signal_features.shape[1])
        self.shape = (window_size, self.signal_features.shape[1] + 2)
        self.action_space = spaces.Discrete(len(Actions))
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=self.shape, dtype=np.float32)

@@ -161,19 +163,26 @@ class Base5ActionRLEnv(gym.Env):
            self._done = True

        self._position_history.append(self._position)
        observation = self._get_observation()

        info = dict(
            tick=self._current_tick,
            total_reward=self.total_reward,
            total_profit=self._total_profit,
            position=self._position.value
        )

        observation = self._get_observation()

        self._update_history(info)

        return observation, step_reward, self._done, info

    def _get_observation(self):
        return self.signal_features[(self._current_tick - self.window_size):self._current_tick]
        features_and_state = self.signal_features[(
            self._current_tick - self.window_size):self._current_tick]
        features_and_state['current_profit_pct'] = self.get_unrealized_profit()
        features_and_state['position'] = self._position.value
        return features_and_state

    def get_unrealized_profit(self):
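To illustrate the widened observation, below is a small self-contained sketch (plain pandas/numpy, not the actual env class) of what the new _get_observation() produces: the rolling feature window plus two state columns, which is why self.shape grows from (window_size, n_features) to (window_size, n_features + 2). The toy data and the literal state values are assumptions.

```python
import numpy as np
import pandas as pd

window_size = 10
n_features = 4
current_tick = 25

# Toy stand-in for self.signal_features (random data, for illustration only).
signal_features = pd.DataFrame(
    np.random.randn(100, n_features),
    columns=[f"feat_{i}" for i in range(n_features)],
)

# Mirrors the new _get_observation(): slice the rolling window, then append
# the unrealized profit and the encoded position as two extra columns.
features_and_state = signal_features[(current_tick - window_size):current_tick].copy()
features_and_state["current_profit_pct"] = 0.013  # stand-in for get_unrealized_profit()
features_and_state["position"] = 1.0              # stand-in for self._position.value

assert features_and_state.shape == (window_size, n_features + 2)
print(features_and_state.shape)  # (10, 6) -> matches the +2 in observation_space
```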
@@ -13,7 +13,7 @@ from freqtrade.persistence import Trade
import torch.multiprocessing
import torch as th
logger = logging.getLogger(__name__)
th.set_num_threads(8)

torch.multiprocessing.set_sharing_strategy('file_system')

@@ -22,6 +22,11 @@ class BaseReinforcementLearningModel(IFreqaiModel):
    User created Reinforcement Learning Model prediction model.
    """

    def __init__(self, **kwargs):
        super().__init__(config=kwargs['config'])
        th.set_num_threads(self.freqai_info.get('data_kitchen_thread_count', 4))
        self.reward_params = self.freqai_info['rl_config']['model_reward_parameters']

    def train(
        self, unfiltered_dataframe: DataFrame, pair: str, dk: FreqaiDataKitchen
    ) -> Any:

@@ -62,12 +67,6 @@ class BaseReinforcementLearningModel(IFreqaiModel):

        model = self.fit_rl(data_dictionary, pair, dk, prices_train, prices_test)

        if pair not in self.dd.historic_predictions:
            self.set_initial_historic_predictions(
                data_dictionary['train_features'], model, dk, pair)

        self.dd.save_historic_predictions_to_disk()

        logger.info(f"--------------------done training {pair}--------------------")

        return model

@@ -127,7 +126,8 @@ class BaseReinforcementLearningModel(IFreqaiModel):
        # optional additional data cleaning/analysis
        self.data_cleaning_predict(dk, filtered_dataframe)

        pred_df = self.rl_model_predict(dk.data_dictionary["prediction_features"], dk, self.model)
        pred_df = self.rl_model_predict(
            dk.data_dictionary["prediction_features"], dk, self.model)
        pred_df.fillna(0, inplace=True)

        return (pred_df, dk.do_predict)

@@ -135,10 +135,13 @@ class BaseReinforcementLearningModel(IFreqaiModel):
    def rl_model_predict(self, dataframe: DataFrame,
                         dk: FreqaiDataKitchen, model: Any) -> DataFrame:

        output = pd.DataFrame(np.full((len(dataframe), 1), 2), columns=dk.label_list)
        output = pd.DataFrame(np.zeros(len(dataframe)), columns=dk.label_list)

        def _predict(window):
            market_side, current_profit, total_profit = self.get_state_info(dk.pair)
            observations = dataframe.iloc[window.index]
            observations['current_profit'] = current_profit
            observations['position'] = market_side
            res, _ = model.predict(observations, deterministic=True)
            return res

@@ -174,29 +177,6 @@ class BaseReinforcementLearningModel(IFreqaiModel):

        return prices_train, prices_test

    def set_initial_historic_predictions(
        self, df: DataFrame, model: Any, dk: FreqaiDataKitchen, pair: str
    ) -> None:

        pred_df = self.rl_model_predict(df, dk, model)
        pred_df.fillna(0, inplace=True)
        self.dd.historic_predictions[pair] = pred_df
        hist_preds_df = self.dd.historic_predictions[pair]

        for label in hist_preds_df.columns:
            if hist_preds_df[label].dtype == object:
                continue
            hist_preds_df[f'{label}_mean'] = 0
            hist_preds_df[f'{label}_std'] = 0

        hist_preds_df['do_predict'] = 0

        if self.freqai_info['feature_parameters'].get('DI_threshold', 0) > 0:
            hist_preds_df['DI_values'] = 0

        for return_str in dk.data['extra_returns_per_train']:
            hist_preds_df[return_str] = 0

    # TODO take care of this appendage. Right now it needs to be called because FreqAI enforces it.
    # But FreqaiRL needs more objects passed to fit() (like DK) and we dont want to go refactor
    # all the other existing fit() functions to include dk argument. For now we instantiate and
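The prediction side mirrors the environment change: every sliding window of features is augmented with the live trade state returned by get_state_info() before it reaches the model. The following is a hedged, freqtrade-free sketch of that rolling pattern; the dummy model, the stubbed state values, and the column names used here are assumptions made so the snippet runs on its own.

```python
import numpy as np
import pandas as pd

WINDOW = 10

class DummyModel:
    """Stand-in for the trained stable-baselines3 agent (illustration only)."""
    def predict(self, obs, deterministic=True):
        return np.array([2]), None  # pretend the agent always chooses action 2

def get_state_info_stub(pair):
    # The real method inspects open Trade objects; here we return fixed values.
    return 1.0, 0.004, 0.02  # market_side, current_profit, total_profit

model = DummyModel()
dataframe = pd.DataFrame(np.random.randn(50, 3), columns=["f0", "f1", "f2"])
output = pd.DataFrame(np.zeros(len(dataframe)), columns=["&-action"])

def _predict(window):
    market_side, current_profit, _total_profit = get_state_info_stub("BTC/USDT")
    observations = dataframe.iloc[window.index].copy()
    observations["current_profit"] = current_profit
    observations["position"] = market_side
    res, _ = model.predict(observations, deterministic=True)
    return float(res[0])  # rolling().apply() needs a scalar

output = output.rolling(window=WINDOW).apply(_predict)
print(output.fillna(0).tail())  # fillna(0) mirrors pred_df.fillna(0, inplace=True)
```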
@@ -24,18 +24,16 @@ class ReinforcementLearningPPO(BaseReinforcementLearningModel):
    def fit_rl(self, data_dictionary: Dict[str, Any], pair: str, dk: FreqaiDataKitchen,
               prices_train: DataFrame, prices_test: DataFrame):

        agent_params = self.freqai_info['model_training_parameters']
        reward_params = self.freqai_info['model_reward_parameters']
        train_df = data_dictionary["train_features"]
        test_df = data_dictionary["test_features"]
        eval_freq = agent_params.get("eval_cycles", 4) * len(test_df)
        total_timesteps = agent_params["train_cycles"] * len(train_df)
        eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df)
        total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df)

        # environments
        train_env = MyRLEnv(df=train_df, prices=prices_train, window_size=self.CONV_WIDTH,
                            reward_kwargs=reward_params)
                            reward_kwargs=self.reward_params)
        eval = MyRLEnv(df=test_df, prices=prices_test,
                       window_size=self.CONV_WIDTH, reward_kwargs=reward_params)
                       window_size=self.CONV_WIDTH, reward_kwargs=self.reward_params)
        eval_env = Monitor(eval, ".")

        path = dk.data_path

@@ -49,7 +47,7 @@ class ReinforcementLearningPPO(BaseReinforcementLearningModel):

        model = PPO('MlpPolicy', train_env, policy_kwargs=policy_kwargs,
                    tensorboard_log=f"{path}/ppo/tensorboard/", learning_rate=0.00025,
                    gamma=0.9, verbose=1
                    **self.freqai_info['model_training_parameters']
                    )

        model.learn(
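The key change in the single-process PPO trainer is that hyperparameters are no longer hard-coded: whatever the user places under model_training_parameters is unpacked straight into the stable-baselines3 constructor. A minimal sketch of that pattern, using a stock gym environment instead of MyRLEnv so it runs without freqtrade:

```python
import gym
from stable_baselines3 import PPO

# Assumption: this dict is what config["freqai"]["model_training_parameters"] holds.
model_training_parameters = {
    "learning_rate": 0.00025,
    "gamma": 0.9,
    "verbose": 1,
}

train_env = gym.make("CartPole-v1")  # stand-in for MyRLEnv(df, prices, ...)

# Any keyword PPO accepts can now be supplied from the config, no code edit needed.
model = PPO("MlpPolicy", train_env, **model_training_parameters)
model.learn(total_timesteps=1_000)  # the real code uses train_cycles * len(train_df)
```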
@@ -51,23 +51,20 @@ class ReinforcementLearningPPO_multiproc(BaseReinforcementLearningModel):
    def fit_rl(self, data_dictionary: Dict[str, Any], pair: str, dk: FreqaiDataKitchen,
               prices_train: DataFrame, prices_test: DataFrame):

        agent_params = self.freqai_info['model_training_parameters']
        reward_params = self.freqai_info['model_reward_parameters']
        train_df = data_dictionary["train_features"]
        test_df = data_dictionary["test_features"]
        eval_freq = agent_params.get("eval_cycles", 4) * len(test_df)
        total_timesteps = agent_params["train_cycles"] * len(train_df)
        learning_rate = agent_params["learning_rate"]
        eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df)
        total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df)

        env_id = "train_env"
        th.set_num_threads(dk.thread_count)
        num_cpu = int(dk.thread_count / 2)
        train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train, reward_params,
                                            self.CONV_WIDTH) for i in range(num_cpu)])
        train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train,
                                            self.reward_params, self.CONV_WIDTH) for i in range(num_cpu)])

        eval_env_id = 'eval_env'
        eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test, reward_params,
                                           self.CONV_WIDTH, monitor=True) for i in range(num_cpu)])
        eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test,
                                           self.reward_params, self.CONV_WIDTH, monitor=True) for i in
                                  range(num_cpu)])

        path = dk.data_path
        eval_callback = EvalCallback(eval_env, best_model_save_path=f"{path}/",

@@ -80,9 +77,7 @@ class ReinforcementLearningPPO_multiproc(BaseReinforcementLearningModel):

        model = PPO('MlpPolicy', train_env, policy_kwargs=policy_kwargs,
                    tensorboard_log=f"{path}/ppo/tensorboard/",
                    learning_rate=learning_rate,
                    gamma=0.9,
                    verbose=1
                    **self.freqai_info['model_training_parameters']
                    )

        model.learn(
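For the multiprocess variant, the structural idea is unchanged: a make_env factory returns callables that SubprocVecEnv runs in separate workers, with num_cpu derived from the data kitchen's thread count. Below is a hedged toy version of that pattern with the freqtrade-specific arguments (df, prices, reward_params, window) dropped so it is self-contained; the real make_env lives in this repo.

```python
import gym
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.utils import set_random_seed
from stable_baselines3.common.vec_env import SubprocVecEnv


def make_env(env_id: str, rank: int, seed: int, monitor: bool = False):
    """Return a thunk, as SubprocVecEnv expects a list of callables."""
    def _init():
        env = gym.make("CartPole-v1")  # stand-in for MyRLEnv(df=..., prices=..., ...)
        if monitor:
            env = Monitor(env, ".")
        return env
    set_random_seed(seed + rank)
    return _init


if __name__ == "__main__":
    thread_count = 8                     # e.g. dk.thread_count in the real code
    num_cpu = int(thread_count / 2)
    train_env = SubprocVecEnv([make_env("train_env", i, 1) for i in range(num_cpu)])
    eval_env = SubprocVecEnv([make_env("eval_env", i, 1, monitor=True)
                              for i in range(num_cpu)])
    print(train_env.num_envs, eval_env.num_envs)
    train_env.close()
    eval_env.close()
```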
@@ -3,8 +3,7 @@ from typing import Any, Dict # Optional
import torch as th
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor
# from stable_baselines3.common.vec_env import SubprocVecEnv
from freqtrade.freqai.RL.Base3ActionRLEnv import Base3ActionRLEnv, Actions, Positions
from freqtrade.freqai.RL.Base5ActionRLEnv import Base5ActionRLEnv, Actions, Positions
from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel
from freqtrade.freqai.RL.TDQNagent import TDQN
from stable_baselines3 import DQN

@@ -25,18 +24,16 @@ class ReinforcementLearningTDQN(BaseReinforcementLearningModel):
    def fit_rl(self, data_dictionary: Dict[str, Any], pair: str, dk: FreqaiDataKitchen,
               prices_train: DataFrame, prices_test: DataFrame):

        agent_params = self.freqai_info['model_training_parameters']
        reward_params = self.freqai_info['model_reward_parameters']
        train_df = data_dictionary["train_features"]
        test_df = data_dictionary["test_features"]
        eval_freq = agent_params["eval_cycles"] * len(test_df)
        total_timesteps = agent_params["train_cycles"] * len(train_df)
        eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df)
        total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df)

        # environments
        train_env = MyRLEnv(df=train_df, prices=prices_train, window_size=self.CONV_WIDTH,
                            reward_kwargs=reward_params)
                            reward_kwargs=self.reward_params)
        eval = MyRLEnv(df=test_df, prices=prices_test,
                       window_size=self.CONV_WIDTH, reward_kwargs=reward_params)
                       window_size=self.CONV_WIDTH, reward_kwargs=self.reward_params)
        eval_env = Monitor(eval, ".")
        eval_env.reset()

@@ -50,12 +47,10 @@ class ReinforcementLearningTDQN(BaseReinforcementLearningModel):
                             net_arch=[256, 256, 128])

        model = TDQN('TMultiInputPolicy', train_env,
                     policy_kwargs=policy_kwargs,
                     tensorboard_log=f"{path}/tdqn/tensorboard/",
                     learning_rate=0.00025, gamma=0.9,
                     target_update_interval=5000, buffer_size=50000,
                     exploration_initial_eps=1, exploration_final_eps=0.1,
                     replay_buffer_class=ReplayBuffer
                     policy_kwargs=policy_kwargs,
                     replay_buffer_class=ReplayBuffer,
                     **self.freqai_info['model_training_parameters']
                     )

        model.learn(

@@ -70,9 +65,11 @@ class ReinforcementLearningTDQN(BaseReinforcementLearningModel):
        return best_model


class MyRLEnv(Base3ActionRLEnv):
# User can inherit and customize 5 action environment
class MyRLEnv(Base5ActionRLEnv):
    """
    User can override any function in BaseRLEnv and gym.Env
    User can override any function in BaseRLEnv and gym.Env. Here the user
    Adds 5 actions.
    """

    def calculate_reward(self, action):

@@ -81,55 +78,27 @@ class MyRLEnv(Base3ActionRLEnv):
            return 0.

        # close long
        if (action == Actions.Short.value or
                action == Actions.Neutral.value) and self._position == Positions.Long:
        if action == Actions.Long_sell.value and self._position == Positions.Long:
            last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
            current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
            return float(np.log(current_price) - np.log(last_trade_price))

        if action == Actions.Long_sell.value and self._position == Positions.Long:
            if self.close_trade_profit[-1] > self.profit_aim * self.rr:
                last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
                current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
                return float((np.log(current_price) - np.log(last_trade_price)) * 2)

        # close short
        if (action == Actions.Long.value or
                action == Actions.Neutral.value) and self._position == Positions.Short:
        if action == Actions.Short_buy.value and self._position == Positions.Short:
            last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
            current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
            return float(np.log(last_trade_price) - np.log(current_price))

        if action == Actions.Short_buy.value and self._position == Positions.Short:
            if self.close_trade_profit[-1] > self.profit_aim * self.rr:
                last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
                current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
                return float((np.log(last_trade_price) - np.log(current_price)) * 2)

        return 0.

# User can inherit and customize 5 action environment
# class MyRLEnv(Base5ActionRLEnv):
# """
# User can override any function in BaseRLEnv and gym.Env. Here the user
# Adds 5 actions.
# """

# def calculate_reward(self, action):

# if self._last_trade_tick is None:
# return 0.

# # close long
# if action == Actions.Long_sell.value and self._position == Positions.Long:
# last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
# current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
# return float(np.log(current_price) - np.log(last_trade_price))

# if action == Actions.Long_sell.value and self._position == Positions.Long:
# if self.close_trade_profit[-1] > self.profit_aim * self.rr:
# last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
# current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
# return float((np.log(current_price) - np.log(last_trade_price)) * 2)

# # close short
# if action == Actions.Short_buy.value and self._position == Positions.Short:
# last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
# current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
# return float(np.log(last_trade_price) - np.log(current_price))

# if action == Actions.Short_buy.value and self._position == Positions.Short:
# if self.close_trade_profit[-1] > self.profit_aim * self.rr:
# last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
# current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
# return float((np.log(last_trade_price) - np.log(current_price)) * 2)

# return 0.
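Since MyRLEnv now inherits from the 5-action base, the reward logic shown in the diff hinges on two cases: closing a long (Long_sell while in Positions.Long) and closing a short (Short_buy while in Positions.Short), each rewarded with the trade's log return and doubled when the realized profit beat profit_aim * rr. The condensed, standalone sketch below folds the doubled branch into the base case; the enum values and prices are toy assumptions, not the repo's definitions.

```python
from enum import Enum

import numpy as np

class Actions(Enum):      # toy stand-ins for the 5-action space
    Neutral = 0
    Long_buy = 1
    Long_sell = 2
    Short_sell = 3
    Short_buy = 4

class Positions(Enum):    # toy stand-ins
    Short = 0
    Long = 1
    Neutral = 0.5

def calculate_reward(action, position, entry_price, current_price,
                     last_close_profit, profit_aim=0.02, rr=1.0):
    # Close long: reward is the log return, doubled if it beat the profit target.
    if action == Actions.Long_sell.value and position == Positions.Long:
        base = float(np.log(current_price) - np.log(entry_price))
        return base * 2 if last_close_profit > profit_aim * rr else base
    # Close short: same idea with the sign flipped.
    if action == Actions.Short_buy.value and position == Positions.Short:
        base = float(np.log(entry_price) - np.log(current_price))
        return base * 2 if last_close_profit > profit_aim * rr else base
    return 0.

print(calculate_reward(Actions.Long_sell.value, Positions.Long,
                       entry_price=100.0, current_price=103.0, last_close_profit=0.03))
```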
@@ -10,7 +10,7 @@ from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.utils import set_random_seed
from stable_baselines3 import DQN
from freqtrade.freqai.RL.Base3ActionRLEnv import Base3ActionRLEnv, Actions, Positions
from freqtrade.freqai.RL.Base5ActionRLEnv import Base5ActionRLEnv, Actions, Positions
from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel
from freqtrade.freqai.RL.TDQNagent import TDQN
from stable_baselines3.common.buffers import ReplayBuffer

@@ -50,22 +50,20 @@ class ReinforcementLearningTDQN_multiproc(BaseReinforcementLearningModel):
    def fit_rl(self, data_dictionary: Dict[str, Any], pair: str, dk: FreqaiDataKitchen,
               prices_train: DataFrame, prices_test: DataFrame):

        agent_params = self.freqai_info['model_training_parameters']
        reward_params = self.freqai_info['model_reward_parameters']
        train_df = data_dictionary["train_features"]
        test_df = data_dictionary["test_features"]
        eval_freq = agent_params["eval_cycles"] * len(test_df)
        total_timesteps = agent_params["train_cycles"] * len(train_df)
        learning_rate = agent_params["learning_rate"]
        eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df)
        total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df)

        env_id = "train_env"
        num_cpu = int(dk.thread_count / 2)
        train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train, reward_params,
                                            self.CONV_WIDTH) for i in range(num_cpu)])
        train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train,
                                            self.reward_params, self.CONV_WIDTH) for i in range(num_cpu)])

        eval_env_id = 'eval_env'
        eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test, reward_params,
                                           self.CONV_WIDTH, monitor=True) for i in range(num_cpu)])
        eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test,
                                           self.reward_params, self.CONV_WIDTH, monitor=True) for i in
                                  range(num_cpu)])

        path = dk.data_path
        stop_train_callback = StopTrainingOnNoModelImprovement(

@@ -91,10 +89,8 @@ class ReinforcementLearningTDQN_multiproc(BaseReinforcementLearningModel):
        model = TDQN('TMultiInputPolicy', train_env,
                     policy_kwargs=policy_kwargs,
                     tensorboard_log=f"{path}/tdqn/tensorboard/",
                     learning_rate=learning_rate, gamma=0.9,
                     target_update_interval=5000, buffer_size=50000,
                     exploration_initial_eps=1, exploration_final_eps=0.1,
                     replay_buffer_class=ReplayBuffer
                     replay_buffer_class=ReplayBuffer,
                     **self.freqai_info['model_training_parameters']
                     )

        model.learn(

@@ -109,9 +105,11 @@ class ReinforcementLearningTDQN_multiproc(BaseReinforcementLearningModel):
        return best_model


class MyRLEnv(Base3ActionRLEnv):
# User can inherit and customize 5 action environment
class MyRLEnv(Base5ActionRLEnv):
    """
    User can override any function in BaseRLEnv and gym.Env
    User can override any function in BaseRLEnv and gym.Env. Here the user
    Adds 5 actions.
    """

    def calculate_reward(self, action):

@@ -120,55 +118,27 @@ class MyRLEnv(Base3ActionRLEnv):
            return 0.

        # close long
        if (action == Actions.Short.value or
                action == Actions.Neutral.value) and self._position == Positions.Long:
        if action == Actions.Long_sell.value and self._position == Positions.Long:
            last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
            current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
            return float(np.log(current_price) - np.log(last_trade_price))

        if action == Actions.Long_sell.value and self._position == Positions.Long:
            if self.close_trade_profit[-1] > self.profit_aim * self.rr:
                last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
                current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
                return float((np.log(current_price) - np.log(last_trade_price)) * 2)

        # close short
        if (action == Actions.Long.value or
                action == Actions.Neutral.value) and self._position == Positions.Short:
        if action == Actions.Short_buy.value and self._position == Positions.Short:
            last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
            current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
            return float(np.log(last_trade_price) - np.log(current_price))

        if action == Actions.Short_buy.value and self._position == Positions.Short:
            if self.close_trade_profit[-1] > self.profit_aim * self.rr:
                last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
                current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
                return float((np.log(last_trade_price) - np.log(current_price)) * 2)

        return 0.

# User can inherit and customize 5 action environment
# class MyRLEnv(Base5ActionRLEnv):
# """
# User can override any function in BaseRLEnv and gym.Env. Here the user
# Adds 5 actions.
# """

# def calculate_reward(self, action):

# if self._last_trade_tick is None:
# return 0.

# # close long
# if action == Actions.Long_sell.value and self._position == Positions.Long:
# last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
# current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
# return float(np.log(current_price) - np.log(last_trade_price))

# if action == Actions.Long_sell.value and self._position == Positions.Long:
# if self.close_trade_profit[-1] > self.profit_aim * self.rr:
# last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
# current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
# return float((np.log(current_price) - np.log(last_trade_price)) * 2)

# # close short
# if action == Actions.Short_buy.value and self._position == Positions.Short:
# last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
# current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
# return float(np.log(last_trade_price) - np.log(current_price))

# if action == Actions.Short_buy.value and self._position == Positions.Short:
# if self.close_trade_profit[-1] > self.profit_aim * self.rr:
# last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
# current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
# return float((np.log(last_trade_price) - np.log(current_price)) * 2)

# return 0.
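Putting the multiprocess TDQN pieces together: the training budget now comes from rl_config (train_cycles and eval_cycles scaled by the dataset lengths) and the remaining hyperparameters reach the agent via dict unpacking. In the hedged sketch below, stable-baselines3's stock DQN and a toy environment stand in for the custom TDQN class and MyRLEnv so the snippet has no freqtrade dependency.

```python
import gym
from stable_baselines3 import DQN
from stable_baselines3.common.buffers import ReplayBuffer

# Assumption: these dicts mirror the config sections introduced in this commit.
rl_config = {"train_cycles": 15, "eval_cycles": 5}
model_training_parameters = {
    "learning_rate": 0.00025,
    "gamma": 0.9,
    "target_update_interval": 5000,
    "buffer_size": 50000,
    "exploration_initial_eps": 1,
    "exploration_final_eps": 0.1,
    "verbose": 1,
}

train_df_len, test_df_len = 5000, 1000  # stand-ins for len(train_df) / len(test_df)
total_timesteps = rl_config["train_cycles"] * train_df_len
eval_freq = rl_config["eval_cycles"] * test_df_len

env = gym.make("CartPole-v1")           # stand-in for MyRLEnv
model = DQN(
    "MlpPolicy",                        # the real trainer uses 'TMultiInputPolicy' with TDQN
    env,
    replay_buffer_class=ReplayBuffer,
    **model_training_parameters,
)
model.learn(total_timesteps=1_000)      # shortened here; the real run uses total_timesteps
print(total_timesteps, eval_freq)
```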