From 94cfc8e63febe0590bae324f932cde390fc3a7a2 Mon Sep 17 00:00:00 2001
From: robcaulk
Date: Thu, 25 Aug 2022 11:46:18 +0200
Subject: [PATCH] fix multiproc callback, add continual learning to multiproc,
 fix totalprofit bug in env, set eval_freq automatically, improve default
 reward

---
 config_examples/config_freqai-rl.example.json | 14 ++---
 freqtrade/freqai/RL/Base3ActionRLEnv.py       |  2 +
 freqtrade/freqai/RL/Base5ActionRLEnv.py       |  7 +--
 .../RL/BaseReinforcementLearningModel.py      | 24 ++++----
 .../prediction_models/ReinforcementLearner.py | 16 +++---
 .../ReinforcementLearner_multiproc.py         | 57 +++++++++----------
 6 files changed, 58 insertions(+), 62 deletions(-)

diff --git a/config_examples/config_freqai-rl.example.json b/config_examples/config_freqai-rl.example.json
index b3f8737be..e8852a0cf 100644
--- a/config_examples/config_freqai-rl.example.json
+++ b/config_examples/config_freqai-rl.example.json
@@ -56,9 +56,9 @@
     "freqai": {
         "enabled": true,
         "model_save_type": "stable_baselines",
-        "conv_width": 10,
+        "conv_width": 4,
         "purge_old_models": true,
-        "train_period_days": 10,
+        "train_period_days": 5,
         "backtest_period_days": 2,
         "identifier": "unique-id",
         "data_kitchen_thread_count": 2,
@@ -72,7 +72,7 @@
                 "30m"
             ],
             "indicator_max_period_candles": 10,
-            "indicator_periods_candles": [5, 10]
+            "indicator_periods_candles": [5]
         },
         "data_split_parameters": {
             "test_size": 0.5,
@@ -85,13 +85,13 @@
             "verbose": 1
         },
         "rl_config": {
-            "train_cycles": 3,
-            "eval_cycles": 3,
+            "train_cycles": 6,
             "thread_count": 4,
-            "max_trade_duration_candles": 100,
+            "max_trade_duration_candles": 300,
             "model_type": "PPO",
             "policy_type": "MlpPolicy",
-            "continual_retraining": true,
+            "continual_learning": false,
+            "max_training_drawdown_pct": 0.5,
             "model_reward_parameters": {
                 "rr": 1,
                 "profit_aim": 0.02,
diff --git a/freqtrade/freqai/RL/Base3ActionRLEnv.py b/freqtrade/freqai/RL/Base3ActionRLEnv.py
index cddd2f6f9..fe51d3b13 100644
--- a/freqtrade/freqai/RL/Base3ActionRLEnv.py
+++ b/freqtrade/freqai/RL/Base3ActionRLEnv.py
@@ -1,3 +1,5 @@
+# Example of a 3 action environment.
+
 # import logging
 # from enum import Enum
 
diff --git a/freqtrade/freqai/RL/Base5ActionRLEnv.py b/freqtrade/freqai/RL/Base5ActionRLEnv.py
index 9f7c52c9c..b93d6e6ff 100644
--- a/freqtrade/freqai/RL/Base5ActionRLEnv.py
+++ b/freqtrade/freqai/RL/Base5ActionRLEnv.py
@@ -77,8 +77,7 @@ class Base5ActionRLEnv(gym.Env):
         self._position = Positions.Neutral
         self._position_history: list = [None]
         self.total_reward: float = 0
-        self._total_profit: float = 0
-        self._first_rendering: bool = False
+        self._total_profit: float = 1
         self.history: dict = {}
         self.trade_history: list = []
 
@@ -101,7 +100,6 @@ class Base5ActionRLEnv(gym.Env):
 
         self.total_reward = 0.
         self._total_profit = 1.  # unit
-        self._first_rendering = True
         self.history = {}
         self.trade_history = []
         self.portfolio_log_returns = np.zeros(len(self.prices))
@@ -165,7 +163,7 @@ class Base5ActionRLEnv(gym.Env):
                     {'price': self.current_price(), 'index': self._current_tick,
                      'type': trade_type})
 
-        if self._total_profit < 0.5:
+        if self._total_profit < 1 - self.rl_config.get('max_training_drawdown_pct', 0.8):
            self._done = True
 
        self._position_history.append(self._position)
@@ -293,7 +291,6 @@ class Base5ActionRLEnv(gym.Env):
         return 0.
 
     def _update_profit(self, action):
-        # if self._is_trade(action) or self._done:
         if self._is_trade(action) or self._done:
             pnl = self.get_unrealized_profit()
 
diff --git a/freqtrade/freqai/RL/BaseReinforcementLearningModel.py b/freqtrade/freqai/RL/BaseReinforcementLearningModel.py
index 84d19f269..7a524ba87 100644
--- a/freqtrade/freqai/RL/BaseReinforcementLearningModel.py
+++ b/freqtrade/freqai/RL/BaseReinforcementLearningModel.py
@@ -23,8 +23,8 @@ logger = logging.getLogger(__name__)
 
 torch.multiprocessing.set_sharing_strategy('file_system')
 
-SB3_MODELS = ['PPO', 'A2C', 'DQN', 'TD3', 'SAC']
-SB3_CONTRIB_MODELS = ['TRPO', 'ARS']
+SB3_MODELS = ['PPO', 'A2C', 'DQN']
+SB3_CONTRIB_MODELS = ['TRPO', 'ARS', 'RecurrentPPO', 'MaskablePPO']
 
 
 class BaseReinforcementLearningModel(IFreqaiModel):
@@ -41,7 +41,7 @@ class BaseReinforcementLearningModel(IFreqaiModel):
         self.eval_callback: EvalCallback = None
         self.model_type = self.freqai_info['rl_config']['model_type']
         self.rl_config = self.freqai_info['rl_config']
-        self.continual_retraining = self.rl_config.get('continual_retraining', False)
+        self.continual_learning = self.rl_config.get('continual_learning', False)
         if self.model_type in SB3_MODELS:
             import_str = 'stable_baselines3'
         elif self.model_type in SB3_CONTRIB_MODELS:
@@ -109,7 +109,6 @@ class BaseReinforcementLearningModel(IFreqaiModel):
         """
         train_df = data_dictionary["train_features"]
         test_df = data_dictionary["test_features"]
-        eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df)
         self.train_env = MyRLEnv(df=train_df, prices=prices_train, window_size=self.CONV_WIDTH,
                                  reward_kwargs=self.reward_params, config=self.config)
         self.eval_env = Monitor(MyRLEnv(df=test_df, prices=prices_test,
@@ -117,7 +116,7 @@ class BaseReinforcementLearningModel(IFreqaiModel):
                                         window_size=self.CONV_WIDTH,
                                         reward_kwargs=self.reward_params, config=self.config))
         self.eval_callback = EvalCallback(self.eval_env, deterministic=True,
-                                          render=False, eval_freq=eval_freq,
+                                          render=False, eval_freq=len(train_df),
                                           best_model_save_path=str(dk.data_path))
 
     @abstractmethod
@@ -138,6 +137,8 @@ class BaseReinforcementLearningModel(IFreqaiModel):
         for trade in open_trades:
             if trade.pair == pair:
                 # FIXME: mypy typing doesnt like that strategy may be "None" (it never will be)
+                # FIXME: get_rate and trade_duration shouldn't work with backtesting,
+                # we need to use candle dates and prices to compute that.
                 current_value = self.strategy.dp._exchange.get_rate(
                     pair, refresh=False, side="exit", is_short=trade.is_short)
                 openrate = trade.open_rate
@@ -256,7 +257,7 @@ def make_env(env_id: str, rank: int, seed: int, train_df: DataFrame, price: Data
         env = MyRLEnv(df=train_df, prices=price, window_size=window_size,
                       reward_kwargs=reward_params, id=env_id, seed=seed + rank, config=config)
         if monitor:
-            env = Monitor(env, ".")
+            env = Monitor(env)
         return env
     set_random_seed(seed)
     return _init
@@ -272,18 +273,19 @@ class MyRLEnv(Base5ActionRLEnv):
 
         # first, penalize if the action is not valid
         if not self._is_valid(action):
-            return -15
+            return -2
 
         pnl = self.get_unrealized_profit()
         rew = np.sign(pnl) * (pnl + 1)
         factor = 100
         # reward agent for entering trades
-        if action in (Actions.Long_enter.value, Actions.Short_enter.value):
+        if action in (Actions.Long_enter.value, Actions.Short_enter.value) \
+                and self._position == Positions.Neutral:
             return 25
 
         # discourage agent from not entering trades
         if action == Actions.Neutral.value and self._position == Positions.Neutral:
-            return -15
+            return -1
 
         max_trade_duration = self.rl_config.get('max_trade_duration_candles', 300)
         trade_duration = self._current_tick - self._last_trade_tick
@@ -294,8 +296,8 @@
                 factor *= 0.5
 
         # discourage sitting in position
-        if self._position in (Positions.Short, Positions.Long):
-            return -50 * trade_duration / max_trade_duration
+        if self._position in (Positions.Short, Positions.Long) and action == Actions.Neutral.value:
+            return -1 * trade_duration / max_trade_duration
 
         # close long
         if action == Actions.Long_exit.value and self._position == Positions.Long:
diff --git a/freqtrade/freqai/prediction_models/ReinforcementLearner.py b/freqtrade/freqai/prediction_models/ReinforcementLearner.py
index 2d1cafab5..36cc821e4 100644
--- a/freqtrade/freqai/prediction_models/ReinforcementLearner.py
+++ b/freqtrade/freqai/prediction_models/ReinforcementLearner.py
@@ -27,7 +27,7 @@ class ReinforcementLearner(BaseReinforcementLearningModel):
         policy_kwargs = dict(activation_fn=th.nn.ReLU,
                              net_arch=[512, 512, 256])
-        if dk.pair not in self.dd.model_dictionary or not self.continual_retraining:
+        if dk.pair not in self.dd.model_dictionary or not self.continual_learning:
             model = self.MODELCLASS(self.policy_type, self.train_env, policy_kwargs=policy_kwargs,
                                     tensorboard_log=Path(dk.data_path / "tensorboard"),
                                     **self.freqai_info['model_training_parameters']
                                     )
@@ -61,7 +61,6 @@ class ReinforcementLearner(BaseReinforcementLearningModel):
         """
         train_df = data_dictionary["train_features"]
         test_df = data_dictionary["test_features"]
-        eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df)
 
         self.train_env = MyRLEnv(df=train_df, prices=prices_train, window_size=self.CONV_WIDTH,
                                  reward_kwargs=self.reward_params, config=self.config)
@@ -69,7 +68,7 @@ class ReinforcementLearner(BaseReinforcementLearningModel):
                                         window_size=self.CONV_WIDTH,
                                         reward_kwargs=self.reward_params, config=self.config))
         self.eval_callback = EvalCallback(self.eval_env, deterministic=True,
-                                          render=False, eval_freq=eval_freq,
+                                          render=False, eval_freq=len(train_df),
                                           best_model_save_path=str(dk.data_path))
 
 
@@ -83,18 +82,19 @@ class MyRLEnv(Base5ActionRLEnv):
 
         # first, penalize if the action is not valid
         if not self._is_valid(action):
-            return -15
+            return -2
 
         pnl = self.get_unrealized_profit()
         rew = np.sign(pnl) * (pnl + 1)
         factor = 100
         # reward agent for entering trades
-        if action in (Actions.Long_enter.value, Actions.Short_enter.value):
+        if action in (Actions.Long_enter.value, Actions.Short_enter.value) \
+                and self._position == Positions.Neutral:
             return 25
 
         # discourage agent from not entering trades
         if action == Actions.Neutral.value and self._position == Positions.Neutral:
-            return -15
+            return -1
 
         max_trade_duration = self.rl_config.get('max_trade_duration_candles', 300)
         trade_duration = self._current_tick - self._last_trade_tick
@@ -105,8 +105,8 @@ class MyRLEnv(Base5ActionRLEnv):
                 factor *= 0.5
 
         # discourage sitting in position
-        if self._position in (Positions.Short, Positions.Long):
-            return -50 * trade_duration / max_trade_duration
+        if self._position in (Positions.Short, Positions.Long) and action == Actions.Neutral.value:
+            return -1 * trade_duration / max_trade_duration
 
         # close long
         if action == Actions.Long_exit.value and self._position == Positions.Long:
diff --git a/freqtrade/freqai/prediction_models/ReinforcementLearner_multiproc.py b/freqtrade/freqai/prediction_models/ReinforcementLearner_multiproc.py
index 3a4c245aa..7e8141b23 100644
--- a/freqtrade/freqai/prediction_models/ReinforcementLearner_multiproc.py
+++ b/freqtrade/freqai/prediction_models/ReinforcementLearner_multiproc.py
@@ -26,12 +26,19 @@ class ReinforcementLearner_multiproc(BaseReinforcementLearningModel):
 
         # model arch
         policy_kwargs = dict(activation_fn=th.nn.ReLU,
-                             net_arch=[512, 512, 256])
+                             net_arch=[256, 256])
 
-        model = self.MODELCLASS(self.policy_type, self.train_env, policy_kwargs=policy_kwargs,
-                                tensorboard_log=Path(dk.full_path / "tensorboard"),
-                                **self.freqai_info['model_training_parameters']
-                                )
+        if dk.pair not in self.dd.model_dictionary or not self.continual_learning:
+            model = self.MODELCLASS(self.policy_type, self.train_env, policy_kwargs=policy_kwargs,
+                                    tensorboard_log=Path(dk.full_path / "tensorboard"),
+                                    **self.freqai_info['model_training_parameters']
+                                    )
+        else:
+            logger.info('Continual training activated - starting training from previously '
+                        'trained agent.')
+            model = self.dd.model_dictionary[dk.pair]
+            model.tensorboard_log = Path(dk.data_path / "tensorboard")
+            model.set_env(self.train_env)
 
         model.learn(
             total_timesteps=int(total_timesteps),
@@ -57,30 +64,18 @@ class ReinforcementLearner_multiproc(BaseReinforcementLearningModel):
         test_df = data_dictionary["test_features"]
         eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df)
 
-        # environments
-        if not self.train_env:
-            env_id = "train_env"
-            num_cpu = int(self.freqai_info["rl_config"]["thread_count"] / 2)
-            self.train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train,
-                                                     self.reward_params, self.CONV_WIDTH,
-                                                     config=self.config) for i
-                                                     in range(num_cpu)])
+        env_id = "train_env"
+        num_cpu = int(self.freqai_info["rl_config"]["thread_count"] / 2)
+        self.train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train,
+                                                 self.reward_params, self.CONV_WIDTH,
+                                                 config=self.config) for i
+                                                 in range(num_cpu)])
 
-            eval_env_id = 'eval_env'
-            self.eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test,
-                                                    self.reward_params, self.CONV_WIDTH, monitor=True,
-                                                    config=self.config) for i
-                                                    in range(num_cpu)])
-            self.eval_callback = EvalCallback(self.eval_env, deterministic=True,
-                                              render=False, eval_freq=eval_freq,
-                                              best_model_save_path=dk.data_path)
-        else:
-            self.train_env.env_method('reset')
-            self.eval_env.env_method('reset')
-            self.train_env.env_method('reset_env', train_df, prices_train,
-                                      self.CONV_WIDTH, self.reward_params)
-            self.eval_env.env_method('reset_env', train_df, prices_train,
-                                     self.CONV_WIDTH, self.reward_params)
-            self.eval_callback.__init__(self.eval_env, deterministic=True,
-                                        render=False, eval_freq=eval_freq,
-                                        best_model_save_path=dk.data_path)
+        eval_env_id = 'eval_env'
+        self.eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test,
+                                                self.reward_params, self.CONV_WIDTH, monitor=True,
+                                                config=self.config) for i
+                                                in range(num_cpu)])
+        self.eval_callback = EvalCallback(self.eval_env, deterministic=True,
+                                          render=False, eval_freq=eval_freq,
+                                          best_model_save_path=dk.data_path)
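
For context on the "set eval_freq automatically" part of this change: stable-baselines3's
EvalCallback runs an evaluation every eval_freq calls to env.step(), so passing
eval_freq=len(train_df) (as the base model and ReinforcementLearner now do) gives roughly one
evaluation per pass over the training window. A minimal sketch of that behaviour, using a
stand-in gym environment and a hypothetical train_window in place of len(train_df):

    # Illustration only, not part of the patch: with eval_freq=train_window the
    # callback evaluates about once per pass over a training window of that length.
    import gym
    from stable_baselines3 import PPO
    from stable_baselines3.common.callbacks import EvalCallback
    from stable_baselines3.common.monitor import Monitor

    train_window = 1000                          # stand-in for len(train_df)
    train_env = gym.make("CartPole-v1")          # stand-in for the FreqAI train env
    eval_env = Monitor(gym.make("CartPole-v1"))  # stand-in for the FreqAI eval env

    eval_callback = EvalCallback(eval_env, deterministic=True, render=False,
                                 eval_freq=train_window,
                                 best_model_save_path="./best_model")

    model = PPO("MlpPolicy", train_env, verbose=0)
    model.learn(total_timesteps=5 * train_window, callback=eval_callback)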