Reinforce training and prediction with state info; restructure the config to accommodate all parameters from any user-imported model type. Set the 5-action environment (5Act) as the default env on TDQN. Clean up the example config.

This commit is contained in:
robcaulk 2022-08-18 13:02:47 +02:00
parent 7962a1439b
commit 5d4e5e69fe
8 changed files with 114 additions and 192 deletions

View File

@ -8,7 +8,7 @@
"tradable_balance_ratio": 1, "tradable_balance_ratio": 1,
"fiat_display_currency": "USD", "fiat_display_currency": "USD",
"dry_run": true, "dry_run": true,
"timeframe": "3m", "timeframe": "5m",
"dataformat_ohlcv": "json", "dataformat_ohlcv": "json",
"dry_run_wallet": 12000, "dry_run_wallet": 12000,
"cancel_open_orders_on_exit": true, "cancel_open_orders_on_exit": true,
@ -35,7 +35,6 @@
}, },
"entry_pricing": { "entry_pricing": {
"price_side": "same", "price_side": "same",
"purge_old_models": true,
"use_order_book": true, "use_order_book": true,
"order_book_top": 1, "order_book_top": 1,
"price_last_balance": 0.0, "price_last_balance": 0.0,
@ -56,10 +55,8 @@
], ],
"freqai": { "freqai": {
"enabled": true, "enabled": true,
"startup_candles": 1000, "model_save_type": "stable_baselines_dqn",
"model_save_type": "stable_baselines_ppo",
"conv_width": 10, "conv_width": 10,
"follow_mode": false,
"purge_old_models": true, "purge_old_models": true,
"train_period_days": 10, "train_period_days": 10,
"backtest_period_days": 2, "backtest_period_days": 2,
@ -71,13 +68,9 @@
"ETH/USDT" "ETH/USDT"
], ],
"include_timeframes": [ "include_timeframes": [
"3m", "5m",
"15m" "30m"
], ],
"include_shifted_candles": 0,
"weight_factor": 0.9,
"principal_component_analysis": false,
"use_SVM_to_remove_outliers": false,
"indicator_max_period_candles": 10, "indicator_max_period_candles": 10,
"indicator_periods_candles": [5, 10] "indicator_periods_candles": [5, 10]
}, },
@ -87,15 +80,21 @@
"shuffle": false "shuffle": false
}, },
"model_training_parameters": { "model_training_parameters": {
"ent_coef": 0.005, "learning_rate": 0.00025,
"learning_rate": 0.000025, "gamma": 0.9,
"batch_size": 256, "target_update_interval": 5000,
"eval_cycles" : 5, "buffer_size": 50000,
"train_cycles" : 15 "exploration_initial_eps":1,
"exploration_final_eps": 0.1,
"verbose": 1
}, },
"rl_config": {
"train_cycles": 15,
"eval_cycles": 5,
"model_reward_parameters": { "model_reward_parameters": {
"rr": 1, "rr": 1,
"profit_aim": 0.01 "profit_aim": 0.02
}
} }
}, },
"bot_name": "RL_test", "bot_name": "RL_test",

View File

@ -6,6 +6,7 @@ import gym
import numpy as np import numpy as np
from gym import spaces from gym import spaces
from gym.utils import seeding from gym.utils import seeding
from pandas import DataFrame
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -35,7 +36,8 @@ class Base3ActionRLEnv(gym.Env):
metadata = {'render.modes': ['human']} metadata = {'render.modes': ['human']}
def __init__(self, df, prices, reward_kwargs, window_size=10, starting_point=True, def __init__(self, df: DataFrame = DataFrame(), prices: DataFrame = DataFrame(),
reward_kwargs: dict = {}, window_size=10, starting_point=True,
id: str = 'baseenv-1', seed: int = 1): id: str = 'baseenv-1', seed: int = 1):
assert df.ndim == 2 assert df.ndim == 2

View File

@ -6,6 +6,7 @@ import gym
import numpy as np import numpy as np
from gym import spaces from gym import spaces
from gym.utils import seeding from gym.utils import seeding
from pandas import DataFrame
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -39,7 +40,8 @@ class Base5ActionRLEnv(gym.Env):
""" """
metadata = {'render.modes': ['human']} metadata = {'render.modes': ['human']}
def __init__(self, df, prices, reward_kwargs, window_size=10, starting_point=True, def __init__(self, df: DataFrame = DataFrame(), prices: DataFrame = DataFrame(),
reward_kwargs: dict = {}, window_size=10, starting_point=True,
id: str = 'baseenv-1', seed: int = 1): id: str = 'baseenv-1', seed: int = 1):
assert df.ndim == 2 assert df.ndim == 2
@ -56,7 +58,7 @@ class Base5ActionRLEnv(gym.Env):
self.fee = 0.0015 self.fee = 0.0015
# # spaces # # spaces
self.shape = (window_size, self.signal_features.shape[1]) self.shape = (window_size, self.signal_features.shape[1] + 2)
self.action_space = spaces.Discrete(len(Actions)) self.action_space = spaces.Discrete(len(Actions))
self.observation_space = spaces.Box( self.observation_space = spaces.Box(
low=-np.inf, high=np.inf, shape=self.shape, dtype=np.float32) low=-np.inf, high=np.inf, shape=self.shape, dtype=np.float32)
@ -161,19 +163,26 @@ class Base5ActionRLEnv(gym.Env):
self._done = True self._done = True
self._position_history.append(self._position) self._position_history.append(self._position)
observation = self._get_observation()
info = dict( info = dict(
tick=self._current_tick, tick=self._current_tick,
total_reward=self.total_reward, total_reward=self.total_reward,
total_profit=self._total_profit, total_profit=self._total_profit,
position=self._position.value position=self._position.value
) )
observation = self._get_observation()
self._update_history(info) self._update_history(info)
return observation, step_reward, self._done, info return observation, step_reward, self._done, info
def _get_observation(self): def _get_observation(self):
return self.signal_features[(self._current_tick - self.window_size):self._current_tick] features_and_state = self.signal_features[(
self._current_tick - self.window_size):self._current_tick]
features_and_state['current_profit_pct'] = self.get_unrealized_profit()
features_and_state['position'] = self._position.value
return features_and_state
def get_unrealized_profit(self): def get_unrealized_profit(self):

View File

@ -13,7 +13,7 @@ from freqtrade.persistence import Trade
import torch.multiprocessing import torch.multiprocessing
import torch as th import torch as th
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
th.set_num_threads(8)
torch.multiprocessing.set_sharing_strategy('file_system') torch.multiprocessing.set_sharing_strategy('file_system')
@ -22,6 +22,11 @@ class BaseReinforcementLearningModel(IFreqaiModel):
User created Reinforcement Learning Model prediction model. User created Reinforcement Learning Model prediction model.
""" """
def __init__(self, **kwargs):
super().__init__(config=kwargs['config'])
th.set_num_threads(self.freqai_info.get('data_kitchen_thread_count', 4))
self.reward_params = self.freqai_info['rl_config']['model_reward_parameters']
def train( def train(
self, unfiltered_dataframe: DataFrame, pair: str, dk: FreqaiDataKitchen self, unfiltered_dataframe: DataFrame, pair: str, dk: FreqaiDataKitchen
) -> Any: ) -> Any:
@ -62,12 +67,6 @@ class BaseReinforcementLearningModel(IFreqaiModel):
model = self.fit_rl(data_dictionary, pair, dk, prices_train, prices_test) model = self.fit_rl(data_dictionary, pair, dk, prices_train, prices_test)
if pair not in self.dd.historic_predictions:
self.set_initial_historic_predictions(
data_dictionary['train_features'], model, dk, pair)
self.dd.save_historic_predictions_to_disk()
logger.info(f"--------------------done training {pair}--------------------") logger.info(f"--------------------done training {pair}--------------------")
return model return model
@ -127,7 +126,8 @@ class BaseReinforcementLearningModel(IFreqaiModel):
# optional additional data cleaning/analysis # optional additional data cleaning/analysis
self.data_cleaning_predict(dk, filtered_dataframe) self.data_cleaning_predict(dk, filtered_dataframe)
pred_df = self.rl_model_predict(dk.data_dictionary["prediction_features"], dk, self.model) pred_df = self.rl_model_predict(
dk.data_dictionary["prediction_features"], dk, self.model)
pred_df.fillna(0, inplace=True) pred_df.fillna(0, inplace=True)
return (pred_df, dk.do_predict) return (pred_df, dk.do_predict)
@ -135,10 +135,13 @@ class BaseReinforcementLearningModel(IFreqaiModel):
def rl_model_predict(self, dataframe: DataFrame, def rl_model_predict(self, dataframe: DataFrame,
dk: FreqaiDataKitchen, model: Any) -> DataFrame: dk: FreqaiDataKitchen, model: Any) -> DataFrame:
output = pd.DataFrame(np.full((len(dataframe), 1), 2), columns=dk.label_list) output = pd.DataFrame(np.zeros(len(dataframe)), columns=dk.label_list)
def _predict(window): def _predict(window):
market_side, current_profit, total_profit = self.get_state_info(dk.pair)
observations = dataframe.iloc[window.index] observations = dataframe.iloc[window.index]
observations['current_profit'] = current_profit
observations['position'] = market_side
res, _ = model.predict(observations, deterministic=True) res, _ = model.predict(observations, deterministic=True)
return res return res
@ -174,29 +177,6 @@ class BaseReinforcementLearningModel(IFreqaiModel):
return prices_train, prices_test return prices_train, prices_test
def set_initial_historic_predictions(
self, df: DataFrame, model: Any, dk: FreqaiDataKitchen, pair: str
) -> None:
pred_df = self.rl_model_predict(df, dk, model)
pred_df.fillna(0, inplace=True)
self.dd.historic_predictions[pair] = pred_df
hist_preds_df = self.dd.historic_predictions[pair]
for label in hist_preds_df.columns:
if hist_preds_df[label].dtype == object:
continue
hist_preds_df[f'{label}_mean'] = 0
hist_preds_df[f'{label}_std'] = 0
hist_preds_df['do_predict'] = 0
if self.freqai_info['feature_parameters'].get('DI_threshold', 0) > 0:
hist_preds_df['DI_values'] = 0
for return_str in dk.data['extra_returns_per_train']:
hist_preds_df[return_str] = 0
# TODO take care of this appendage. Right now it needs to be called because FreqAI enforces it. # TODO take care of this appendage. Right now it needs to be called because FreqAI enforces it.
# But FreqaiRL needs more objects passed to fit() (like DK) and we dont want to go refactor # But FreqaiRL needs more objects passed to fit() (like DK) and we dont want to go refactor
# all the other existing fit() functions to include dk argument. For now we instantiate and # all the other existing fit() functions to include dk argument. For now we instantiate and

View File

@ -24,18 +24,16 @@ class ReinforcementLearningPPO(BaseReinforcementLearningModel):
def fit_rl(self, data_dictionary: Dict[str, Any], pair: str, dk: FreqaiDataKitchen, def fit_rl(self, data_dictionary: Dict[str, Any], pair: str, dk: FreqaiDataKitchen,
prices_train: DataFrame, prices_test: DataFrame): prices_train: DataFrame, prices_test: DataFrame):
agent_params = self.freqai_info['model_training_parameters']
reward_params = self.freqai_info['model_reward_parameters']
train_df = data_dictionary["train_features"] train_df = data_dictionary["train_features"]
test_df = data_dictionary["test_features"] test_df = data_dictionary["test_features"]
eval_freq = agent_params.get("eval_cycles", 4) * len(test_df) eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df)
total_timesteps = agent_params["train_cycles"] * len(train_df) total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df)
# environments # environments
train_env = MyRLEnv(df=train_df, prices=prices_train, window_size=self.CONV_WIDTH, train_env = MyRLEnv(df=train_df, prices=prices_train, window_size=self.CONV_WIDTH,
reward_kwargs=reward_params) reward_kwargs=self.reward_params)
eval = MyRLEnv(df=test_df, prices=prices_test, eval = MyRLEnv(df=test_df, prices=prices_test,
window_size=self.CONV_WIDTH, reward_kwargs=reward_params) window_size=self.CONV_WIDTH, reward_kwargs=self.reward_params)
eval_env = Monitor(eval, ".") eval_env = Monitor(eval, ".")
path = dk.data_path path = dk.data_path
@ -49,7 +47,7 @@ class ReinforcementLearningPPO(BaseReinforcementLearningModel):
model = PPO('MlpPolicy', train_env, policy_kwargs=policy_kwargs, model = PPO('MlpPolicy', train_env, policy_kwargs=policy_kwargs,
tensorboard_log=f"{path}/ppo/tensorboard/", learning_rate=0.00025, tensorboard_log=f"{path}/ppo/tensorboard/", learning_rate=0.00025,
gamma=0.9, verbose=1 **self.freqai_info['model_training_parameters']
) )
model.learn( model.learn(

View File

@ -51,23 +51,20 @@ class ReinforcementLearningPPO_multiproc(BaseReinforcementLearningModel):
def fit_rl(self, data_dictionary: Dict[str, Any], pair: str, dk: FreqaiDataKitchen, def fit_rl(self, data_dictionary: Dict[str, Any], pair: str, dk: FreqaiDataKitchen,
prices_train: DataFrame, prices_test: DataFrame): prices_train: DataFrame, prices_test: DataFrame):
agent_params = self.freqai_info['model_training_parameters']
reward_params = self.freqai_info['model_reward_parameters']
train_df = data_dictionary["train_features"] train_df = data_dictionary["train_features"]
test_df = data_dictionary["test_features"] test_df = data_dictionary["test_features"]
eval_freq = agent_params.get("eval_cycles", 4) * len(test_df) eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df)
total_timesteps = agent_params["train_cycles"] * len(train_df) total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df)
learning_rate = agent_params["learning_rate"]
env_id = "train_env" env_id = "train_env"
th.set_num_threads(dk.thread_count)
num_cpu = int(dk.thread_count / 2) num_cpu = int(dk.thread_count / 2)
train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train, reward_params, train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train,
self.CONV_WIDTH) for i in range(num_cpu)]) self.reward_params, self.CONV_WIDTH) for i in range(num_cpu)])
eval_env_id = 'eval_env' eval_env_id = 'eval_env'
eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test, reward_params, eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test,
self.CONV_WIDTH, monitor=True) for i in range(num_cpu)]) self.reward_params, self.CONV_WIDTH, monitor=True) for i in
range(num_cpu)])
path = dk.data_path path = dk.data_path
eval_callback = EvalCallback(eval_env, best_model_save_path=f"{path}/", eval_callback = EvalCallback(eval_env, best_model_save_path=f"{path}/",
@ -80,9 +77,7 @@ class ReinforcementLearningPPO_multiproc(BaseReinforcementLearningModel):
model = PPO('MlpPolicy', train_env, policy_kwargs=policy_kwargs, model = PPO('MlpPolicy', train_env, policy_kwargs=policy_kwargs,
tensorboard_log=f"{path}/ppo/tensorboard/", tensorboard_log=f"{path}/ppo/tensorboard/",
learning_rate=learning_rate, **self.freqai_info['model_training_parameters']
gamma=0.9,
verbose=1
) )
model.learn( model.learn(

View File

@ -3,8 +3,7 @@ from typing import Any, Dict # Optional
import torch as th import torch as th
from stable_baselines3.common.callbacks import EvalCallback from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor from stable_baselines3.common.monitor import Monitor
# from stable_baselines3.common.vec_env import SubprocVecEnv from freqtrade.freqai.RL.Base5ActionRLEnv import Base5ActionRLEnv, Actions, Positions
from freqtrade.freqai.RL.Base3ActionRLEnv import Base3ActionRLEnv, Actions, Positions
from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel
from freqtrade.freqai.RL.TDQNagent import TDQN from freqtrade.freqai.RL.TDQNagent import TDQN
from stable_baselines3 import DQN from stable_baselines3 import DQN
@ -25,18 +24,16 @@ class ReinforcementLearningTDQN(BaseReinforcementLearningModel):
def fit_rl(self, data_dictionary: Dict[str, Any], pair: str, dk: FreqaiDataKitchen, def fit_rl(self, data_dictionary: Dict[str, Any], pair: str, dk: FreqaiDataKitchen,
prices_train: DataFrame, prices_test: DataFrame): prices_train: DataFrame, prices_test: DataFrame):
agent_params = self.freqai_info['model_training_parameters']
reward_params = self.freqai_info['model_reward_parameters']
train_df = data_dictionary["train_features"] train_df = data_dictionary["train_features"]
test_df = data_dictionary["test_features"] test_df = data_dictionary["test_features"]
eval_freq = agent_params["eval_cycles"] * len(test_df) eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df)
total_timesteps = agent_params["train_cycles"] * len(train_df) total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df)
# environments # environments
train_env = MyRLEnv(df=train_df, prices=prices_train, window_size=self.CONV_WIDTH, train_env = MyRLEnv(df=train_df, prices=prices_train, window_size=self.CONV_WIDTH,
reward_kwargs=reward_params) reward_kwargs=self.reward_params)
eval = MyRLEnv(df=test_df, prices=prices_test, eval = MyRLEnv(df=test_df, prices=prices_test,
window_size=self.CONV_WIDTH, reward_kwargs=reward_params) window_size=self.CONV_WIDTH, reward_kwargs=self.reward_params)
eval_env = Monitor(eval, ".") eval_env = Monitor(eval, ".")
eval_env.reset() eval_env.reset()
@ -50,12 +47,10 @@ class ReinforcementLearningTDQN(BaseReinforcementLearningModel):
net_arch=[256, 256, 128]) net_arch=[256, 256, 128])
model = TDQN('TMultiInputPolicy', train_env, model = TDQN('TMultiInputPolicy', train_env,
policy_kwargs=policy_kwargs,
tensorboard_log=f"{path}/tdqn/tensorboard/", tensorboard_log=f"{path}/tdqn/tensorboard/",
learning_rate=0.00025, gamma=0.9, policy_kwargs=policy_kwargs,
target_update_interval=5000, buffer_size=50000, replay_buffer_class=ReplayBuffer,
exploration_initial_eps=1, exploration_final_eps=0.1, **self.freqai_info['model_training_parameters']
replay_buffer_class=ReplayBuffer
) )
model.learn( model.learn(
@ -70,9 +65,11 @@ class ReinforcementLearningTDQN(BaseReinforcementLearningModel):
return best_model return best_model
class MyRLEnv(Base3ActionRLEnv): # User can inherit and customize 5 action environment
class MyRLEnv(Base5ActionRLEnv):
""" """
User can override any function in BaseRLEnv and gym.Env User can override any function in BaseRLEnv and gym.Env. Here the user
Adds 5 actions.
""" """
def calculate_reward(self, action): def calculate_reward(self, action):
@ -81,55 +78,27 @@ class MyRLEnv(Base3ActionRLEnv):
return 0. return 0.
# close long # close long
if (action == Actions.Short.value or if action == Actions.Long_sell.value and self._position == Positions.Long:
action == Actions.Neutral.value) and self._position == Positions.Long:
last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
return float(np.log(current_price) - np.log(last_trade_price)) return float(np.log(current_price) - np.log(last_trade_price))
if action == Actions.Long_sell.value and self._position == Positions.Long:
if self.close_trade_profit[-1] > self.profit_aim * self.rr:
last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
return float((np.log(current_price) - np.log(last_trade_price)) * 2)
# close short # close short
if (action == Actions.Long.value or if action == Actions.Short_buy.value and self._position == Positions.Short:
action == Actions.Neutral.value) and self._position == Positions.Short:
last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
return float(np.log(last_trade_price) - np.log(current_price)) return float(np.log(last_trade_price) - np.log(current_price))
if action == Actions.Short_buy.value and self._position == Positions.Short:
if self.close_trade_profit[-1] > self.profit_aim * self.rr:
last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
return float((np.log(last_trade_price) - np.log(current_price)) * 2)
return 0. return 0.
# User can inherit and customize 5 action environment
# class MyRLEnv(Base5ActionRLEnv):
# """
# User can override any function in BaseRLEnv and gym.Env. Here the user
# Adds 5 actions.
# """
# def calculate_reward(self, action):
# if self._last_trade_tick is None:
# return 0.
# # close long
# if action == Actions.Long_sell.value and self._position == Positions.Long:
# last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
# current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
# return float(np.log(current_price) - np.log(last_trade_price))
# if action == Actions.Long_sell.value and self._position == Positions.Long:
# if self.close_trade_profit[-1] > self.profit_aim * self.rr:
# last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
# current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
# return float((np.log(current_price) - np.log(last_trade_price)) * 2)
# # close short
# if action == Actions.Short_buy.value and self._position == Positions.Short:
# last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
# current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
# return float(np.log(last_trade_price) - np.log(current_price))
# if action == Actions.Short_buy.value and self._position == Positions.Short:
# if self.close_trade_profit[-1] > self.profit_aim * self.rr:
# last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
# current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
# return float((np.log(last_trade_price) - np.log(current_price)) * 2)
# return 0.

View File

@ -10,7 +10,7 @@ from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import SubprocVecEnv from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.utils import set_random_seed from stable_baselines3.common.utils import set_random_seed
from stable_baselines3 import DQN from stable_baselines3 import DQN
from freqtrade.freqai.RL.Base3ActionRLEnv import Base3ActionRLEnv, Actions, Positions from freqtrade.freqai.RL.Base5ActionRLEnv import Base5ActionRLEnv, Actions, Positions
from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel
from freqtrade.freqai.RL.TDQNagent import TDQN from freqtrade.freqai.RL.TDQNagent import TDQN
from stable_baselines3.common.buffers import ReplayBuffer from stable_baselines3.common.buffers import ReplayBuffer
@ -50,22 +50,20 @@ class ReinforcementLearningTDQN_multiproc(BaseReinforcementLearningModel):
def fit_rl(self, data_dictionary: Dict[str, Any], pair: str, dk: FreqaiDataKitchen, def fit_rl(self, data_dictionary: Dict[str, Any], pair: str, dk: FreqaiDataKitchen,
prices_train: DataFrame, prices_test: DataFrame): prices_train: DataFrame, prices_test: DataFrame):
agent_params = self.freqai_info['model_training_parameters']
reward_params = self.freqai_info['model_reward_parameters']
train_df = data_dictionary["train_features"] train_df = data_dictionary["train_features"]
test_df = data_dictionary["test_features"] test_df = data_dictionary["test_features"]
eval_freq = agent_params["eval_cycles"] * len(test_df) eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df)
total_timesteps = agent_params["train_cycles"] * len(train_df) total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df)
learning_rate = agent_params["learning_rate"]
env_id = "train_env" env_id = "train_env"
num_cpu = int(dk.thread_count / 2) num_cpu = int(dk.thread_count / 2)
train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train, reward_params, train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train,
self.CONV_WIDTH) for i in range(num_cpu)]) self.reward_params, self.CONV_WIDTH) for i in range(num_cpu)])
eval_env_id = 'eval_env' eval_env_id = 'eval_env'
eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test, reward_params, eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test,
self.CONV_WIDTH, monitor=True) for i in range(num_cpu)]) self.reward_params, self.CONV_WIDTH, monitor=True) for i in
range(num_cpu)])
path = dk.data_path path = dk.data_path
stop_train_callback = StopTrainingOnNoModelImprovement( stop_train_callback = StopTrainingOnNoModelImprovement(
@ -91,10 +89,8 @@ class ReinforcementLearningTDQN_multiproc(BaseReinforcementLearningModel):
model = TDQN('TMultiInputPolicy', train_env, model = TDQN('TMultiInputPolicy', train_env,
policy_kwargs=policy_kwargs, policy_kwargs=policy_kwargs,
tensorboard_log=f"{path}/tdqn/tensorboard/", tensorboard_log=f"{path}/tdqn/tensorboard/",
learning_rate=learning_rate, gamma=0.9, replay_buffer_class=ReplayBuffer,
target_update_interval=5000, buffer_size=50000, **self.freqai_info['model_training_parameters']
exploration_initial_eps=1, exploration_final_eps=0.1,
replay_buffer_class=ReplayBuffer
) )
model.learn( model.learn(
@ -109,9 +105,11 @@ class ReinforcementLearningTDQN_multiproc(BaseReinforcementLearningModel):
return best_model return best_model
class MyRLEnv(Base3ActionRLEnv): # User can inherit and customize 5 action environment
class MyRLEnv(Base5ActionRLEnv):
""" """
User can override any function in BaseRLEnv and gym.Env User can override any function in BaseRLEnv and gym.Env. Here the user
Adds 5 actions.
""" """
def calculate_reward(self, action): def calculate_reward(self, action):
@ -120,55 +118,27 @@ class MyRLEnv(Base3ActionRLEnv):
return 0. return 0.
# close long # close long
if (action == Actions.Short.value or if action == Actions.Long_sell.value and self._position == Positions.Long:
action == Actions.Neutral.value) and self._position == Positions.Long:
last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
return float(np.log(current_price) - np.log(last_trade_price)) return float(np.log(current_price) - np.log(last_trade_price))
if action == Actions.Long_sell.value and self._position == Positions.Long:
if self.close_trade_profit[-1] > self.profit_aim * self.rr:
last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
return float((np.log(current_price) - np.log(last_trade_price)) * 2)
# close short # close short
if (action == Actions.Long.value or if action == Actions.Short_buy.value and self._position == Positions.Short:
action == Actions.Neutral.value) and self._position == Positions.Short:
last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
return float(np.log(last_trade_price) - np.log(current_price)) return float(np.log(last_trade_price) - np.log(current_price))
if action == Actions.Short_buy.value and self._position == Positions.Short:
if self.close_trade_profit[-1] > self.profit_aim * self.rr:
last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
return float((np.log(last_trade_price) - np.log(current_price)) * 2)
return 0. return 0.
# User can inherit and customize 5 action environment
# class MyRLEnv(Base5ActionRLEnv):
# """
# User can override any function in BaseRLEnv and gym.Env. Here the user
# Adds 5 actions.
# """
# def calculate_reward(self, action):
# if self._last_trade_tick is None:
# return 0.
# # close long
# if action == Actions.Long_sell.value and self._position == Positions.Long:
# last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
# current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
# return float(np.log(current_price) - np.log(last_trade_price))
# if action == Actions.Long_sell.value and self._position == Positions.Long:
# if self.close_trade_profit[-1] > self.profit_aim * self.rr:
# last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
# current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
# return float((np.log(current_price) - np.log(last_trade_price)) * 2)
# # close short
# if action == Actions.Short_buy.value and self._position == Positions.Short:
# last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
# current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
# return float(np.log(last_trade_price) - np.log(current_price))
# if action == Actions.Short_buy.value and self._position == Positions.Short:
# if self.close_trade_profit[-1] > self.profit_aim * self.rr:
# last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
# current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
# return float((np.log(last_trade_price) - np.log(current_price)) * 2)
# return 0.