diff --git a/config_examples/config_freqai-rl.example.json b/config_examples/config_freqai-rl.example.json
index 1f12cbc6c..ccc977705 100644
--- a/config_examples/config_freqai-rl.example.json
+++ b/config_examples/config_freqai-rl.example.json
@@ -61,7 +61,7 @@
         "train_period_days": 10,
         "backtest_period_days": 2,
         "identifier": "unique-id",
-        "data_kitchen_thread_count": 4,
+        "data_kitchen_thread_count": 2,
         "feature_parameters": {
             "include_corr_pairlist": [
                 "BTC/USDT",
diff --git a/freqtrade/freqai/RL/Base5ActionRLEnv.py b/freqtrade/freqai/RL/Base5ActionRLEnv.py
index d7ceb5ff3..bf3f0df33 100644
--- a/freqtrade/freqai/RL/Base5ActionRLEnv.py
+++ b/freqtrade/freqai/RL/Base5ActionRLEnv.py
@@ -7,7 +7,7 @@ import numpy as np
 from gym import spaces
 from gym.utils import seeding
 from pandas import DataFrame
-
+import pandas as pd
 
 
 logger = logging.getLogger(__name__)
@@ -47,6 +47,9 @@ class Base5ActionRLEnv(gym.Env):
         self.id = id
         self.seed(seed)
+        self.reset_env(df, prices, window_size, reward_kwargs, starting_point)
+
+    def reset_env(self, df, prices, window_size, reward_kwargs, starting_point=True):
         self.df = df
         self.signal_features = self.df
         self.prices = prices
@@ -178,10 +181,15 @@ class Base5ActionRLEnv(gym.Env):
         return observation, step_reward, self._done, info
 
     def _get_observation(self):
-        features_and_state = self.signal_features[(
+        features_window = self.signal_features[(
             self._current_tick - self.window_size):self._current_tick]
+        features_and_state = DataFrame(np.zeros((len(features_window), 2)),
+                                       columns=['current_profit_pct', 'position'],
+                                       index=features_window.index)
+
         features_and_state['current_profit_pct'] = self.get_unrealized_profit()
         features_and_state['position'] = self._position.value
+        features_and_state = pd.concat([features_window, features_and_state], axis=1)
         return features_and_state
 
     def get_unrealized_profit(self):
diff --git a/freqtrade/freqai/RL/BaseReinforcementLearningModel.py b/freqtrade/freqai/RL/BaseReinforcementLearningModel.py
index 395b2a1a6..9c7b1e4b4 100644
--- a/freqtrade/freqai/RL/BaseReinforcementLearningModel.py
+++ b/freqtrade/freqai/RL/BaseReinforcementLearningModel.py
@@ -8,9 +8,10 @@ from pandas import DataFrame
 from abc import abstractmethod
 from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
 from freqtrade.freqai.freqai_interface import IFreqaiModel
-from freqtrade.freqai.RL.Base3ActionRLEnv import Base3ActionRLEnv, Actions, Positions
+from freqtrade.freqai.RL.Base5ActionRLEnv import Base5ActionRLEnv, Actions, Positions
 from freqtrade.persistence import Trade
 import torch.multiprocessing
+from stable_baselines3.common.monitor import Monitor
 import torch as th
 
 logger = logging.getLogger(__name__)
@@ -26,6 +27,7 @@ class BaseReinforcementLearningModel(IFreqaiModel):
         super().__init__(config=kwargs['config'])
         th.set_num_threads(self.freqai_info.get('data_kitchen_thread_count', 4))
         self.reward_params = self.freqai_info['rl_config']['model_reward_parameters']
+        self.train_env: Base5ActionRLEnv = None
 
     def train(
         self, unfiltered_dataframe: DataFrame, pair: str, dk: FreqaiDataKitchen
@@ -65,15 +67,37 @@ class BaseReinforcementLearningModel(IFreqaiModel):
         )
 
         logger.info(f'Training model on {len(data_dictionary["train_features"])} data points')
-        model = self.fit_rl(data_dictionary, pair, dk, prices_train, prices_test)
+        self.set_train_and_eval_environments(data_dictionary, prices_train, prices_test)
+
+        model = self.fit_rl(data_dictionary, dk)
 
         logger.info(f"--------------------done training {pair}--------------------")
 
         return model
 
+    def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test):
+        """
+        User overrides this in their prediction model if they are using a custom MyRLEnv.
+        Otherwise, leaving this as-is will default to Base5ActionRLEnv.
+        """
+        train_df = data_dictionary["train_features"]
+        test_df = data_dictionary["test_features"]
+
+        # environments
+        if not self.train_env:
+            self.train_env = MyRLEnv(df=train_df, prices=prices_train, window_size=self.CONV_WIDTH,
+                                     reward_kwargs=self.reward_params)
+            self.eval_env = Monitor(MyRLEnv(df=test_df, prices=prices_test,
+                                            window_size=self.CONV_WIDTH,
+                                            reward_kwargs=self.reward_params), ".")
+        else:
+            self.train_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params)
+            self.eval_env.reset_env(test_df, prices_test, self.CONV_WIDTH, self.reward_params)
+            self.train_env.reset()
+            self.eval_env.reset()
+
     @abstractmethod
-    def fit_rl(self, data_dictionary: Dict[str, Any], pair: str, dk: FreqaiDataKitchen,
-               prices_train: DataFrame, prices_test: DataFrame):
+    def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen):
         """
         Agent customizations and abstract Reinforcement Learning customizations
         go in here. Abstract method, so this function must be overridden by
@@ -193,66 +217,39 @@ class BaseReinforcementLearningModel(IFreqaiModel):
         return
 
 
-class MyRLEnv(Base3ActionRLEnv):
+class MyRLEnv(Base5ActionRLEnv):
+    """
+    User can override any function in BaseRLEnv and gym.Env. Here the user
+    overrides calculate_reward() for the 5 action environment.
+    """
 
-    def step(self, action):
-        self._done = False
-        self._current_tick += 1
+    def calculate_reward(self, action):
 
-        if self._current_tick == self._end_tick:
-            self._done = True
+        if self._last_trade_tick is None:
+            return 0.
 
-        self.update_portfolio_log_returns(action)
+        # close long
+        if action == Actions.Long_sell.value and self._position == Positions.Long:
+            if self.close_trade_profit[-1] > self.profit_aim * self.rr:
+                last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
+                current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
+                return float((np.log(current_price) - np.log(last_trade_price)) * 2)
 
-        self._update_profit(action)
-        step_reward = self._calculate_reward(action)
-        self.total_reward += step_reward
+        if action == Actions.Long_sell.value and self._position == Positions.Long:
+            last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
+            current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
+            return float(np.log(current_price) - np.log(last_trade_price))
 
-        trade_type = None
-        if self.is_tradesignal(action):  # exclude 3 case not trade
-            # Update position
-            """
-            Action: Neutral, position: Long -> Close Long
-            Action: Neutral, position: Short -> Close Short
+        # close short
+        if action == Actions.Short_buy.value and self._position == Positions.Short:
+            if self.close_trade_profit[-1] > self.profit_aim * self.rr:
+                last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
+                current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
+                return float((np.log(last_trade_price) - np.log(current_price)) * 2)
 
-            Action: Long, position: Neutral -> Open Long
-            Action: Long, position: Short -> Close Short and Open Long
+        if action == Actions.Short_buy.value and self._position == Positions.Short:
+            last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
+            current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
+            return float(np.log(last_trade_price) - np.log(current_price))
-            Action: Short, position: Neutral -> Open Short
-            Action: Short, position: Long -> Close Long and Open Short
-            """
-
-            if action == Actions.Neutral.value:
-                self._position = Positions.Neutral
-                trade_type = "neutral"
-            elif action == Actions.Long.value:
-                self._position = Positions.Long
-                trade_type = "long"
-            elif action == Actions.Short.value:
-                self._position = Positions.Short
-                trade_type = "short"
-            else:
-                print("case not defined")
-
-            # Update last trade tick
-            self._last_trade_tick = self._current_tick
-
-            if trade_type is not None:
-                self.trade_history.append(
-                    {'price': self.current_price(), 'index': self._current_tick,
-                     'type': trade_type})
-
-        if self._total_profit < 0.2:
-            self._done = True
-
-        self._position_history.append(self._position)
-        observation = self._get_observation()
-        info = dict(
-            tick=self._current_tick,
-            total_reward=self.total_reward,
-            total_profit=self._total_profit,
-            position=self._position.value
-        )
-        self._update_history(info)
-
-        return observation, step_reward, self._done, info
+        return 0.
diff --git a/freqtrade/freqai/prediction_models/ReinforcementLearningPPO_multiproc.py b/freqtrade/freqai/prediction_models/ReinforcementLearningPPO_multiproc.py
index 337e94607..5fa24a599 100644
--- a/freqtrade/freqai/prediction_models/ReinforcementLearningPPO_multiproc.py
+++ b/freqtrade/freqai/prediction_models/ReinforcementLearningPPO_multiproc.py
@@ -3,9 +3,7 @@ from typing import Any, Dict # , Tuple
 
 import numpy as np
 # import numpy.typing as npt
-# import pandas as pd
 import torch as th
-# from pandas import DataFrame
 from stable_baselines3.common.monitor import Monitor
 from typing import Callable
 from stable_baselines3 import PPO
@@ -16,7 +14,6 @@ from freqtrade.freqai.RL.Base3ActionRLEnv import Base3ActionRLEnv, Actions, Posi
 from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel
 from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
 import gym
-from pandas import DataFrame
 
 
 logger = logging.getLogger(__name__)
@@ -48,26 +45,15 @@ class ReinforcementLearningPPO_multiproc(BaseReinforcementLearningModel):
     User created Reinforcement Learning Model prediction model.
""" - def fit_rl(self, data_dictionary: Dict[str, Any], pair: str, dk: FreqaiDataKitchen, - prices_train: DataFrame, prices_test: DataFrame): + def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen): train_df = data_dictionary["train_features"] test_df = data_dictionary["test_features"] eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df) total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df) - env_id = "train_env" - num_cpu = int(dk.thread_count / 2) - train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train, - self.reward_params, self.CONV_WIDTH) for i in range(num_cpu)]) - - eval_env_id = 'eval_env' - eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test, - self.reward_params, self.CONV_WIDTH, monitor=True) for i in - range(num_cpu)]) - path = dk.data_path - eval_callback = EvalCallback(eval_env, best_model_save_path=f"{path}/", + eval_callback = EvalCallback(self.eval_env, best_model_save_path=f"{path}/", log_path=f"{path}/ppo/logs/", eval_freq=int(eval_freq), deterministic=True, render=False) @@ -75,7 +61,7 @@ class ReinforcementLearningPPO_multiproc(BaseReinforcementLearningModel): policy_kwargs = dict(activation_fn=th.nn.ReLU, net_arch=[512, 512, 512]) - model = PPO('MlpPolicy', train_env, policy_kwargs=policy_kwargs, + model = PPO('MlpPolicy', self.train_env, policy_kwargs=policy_kwargs, tensorboard_log=f"{path}/ppo/tensorboard/", **self.freqai_info['model_training_parameters'] ) @@ -87,10 +73,37 @@ class ReinforcementLearningPPO_multiproc(BaseReinforcementLearningModel): best_model = PPO.load(dk.data_path / "best_model") print('Training finished!') - eval_env.close() return best_model + def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test): + """ + User overrides this in their prediction model if they are custom a MyRLEnv. 
+        Otherwise, leaving this as-is will default to Base5ActionRLEnv.
+        """
+        train_df = data_dictionary["train_features"]
+        test_df = data_dictionary["test_features"]
+
+        # environments
+        if not self.train_env:
+            env_id = "train_env"
+            num_cpu = int(self.freqai_info["data_kitchen_thread_count"] / 2)
+            self.train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train,
+                                            self.reward_params, self.CONV_WIDTH) for i
+                                            in range(num_cpu)])
+
+            eval_env_id = 'eval_env'
+            self.eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test,
+                                           self.reward_params, self.CONV_WIDTH, monitor=True) for i
+                                           in range(num_cpu)])
+        else:
+            self.train_env.env_method('reset_env', train_df, prices_train,
+                                      self.CONV_WIDTH, self.reward_params)
+            self.eval_env.env_method('reset_env', test_df, prices_test,
+                                     self.CONV_WIDTH, self.reward_params)
+            self.train_env.env_method('reset')
+            self.eval_env.env_method('reset')
+
 
 class MyRLEnv(Base3ActionRLEnv):
     """
diff --git a/freqtrade/freqai/prediction_models/ReinforcementLearningTDQN.py b/freqtrade/freqai/prediction_models/ReinforcementLearningTDQN.py
index 3a57142cf..3c4ac6bdb 100644
--- a/freqtrade/freqai/prediction_models/ReinforcementLearningTDQN.py
+++ b/freqtrade/freqai/prediction_models/ReinforcementLearningTDQN.py
@@ -9,8 +9,7 @@ from freqtrade.freqai.RL.TDQNagent import TDQN
 from stable_baselines3 import DQN
 from stable_baselines3.common.buffers import ReplayBuffer
 import numpy as np
-from pandas import DataFrame
-
+import gc
 from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
 
 logger = logging.getLogger(__name__)
@@ -21,24 +20,15 @@ class ReinforcementLearningTDQN(BaseReinforcementLearningModel):
     User created Reinforcement Learning Model prediction model.
     """
 
-    def fit_rl(self, data_dictionary: Dict[str, Any], pair: str, dk: FreqaiDataKitchen,
-               prices_train: DataFrame, prices_test: DataFrame):
+    def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen):
 
         train_df = data_dictionary["train_features"]
         test_df = data_dictionary["test_features"]
         eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df)
         total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df)
 
-        # environments
-        train_env = MyRLEnv(df=train_df, prices=prices_train, window_size=self.CONV_WIDTH,
-                            reward_kwargs=self.reward_params)
-        eval = MyRLEnv(df=test_df, prices=prices_test,
-                       window_size=self.CONV_WIDTH, reward_kwargs=self.reward_params)
-        eval_env = Monitor(eval, ".")
-        eval_env.reset()
-
         path = dk.data_path
-        eval_callback = EvalCallback(eval_env, best_model_save_path=f"{path}/",
+        eval_callback = EvalCallback(self.eval_env, best_model_save_path=f"{path}/",
                                      log_path=f"{path}/tdqn/logs/", eval_freq=int(eval_freq),
                                      deterministic=True, render=False)
@@ -46,7 +36,7 @@
         policy_kwargs = dict(activation_fn=th.nn.ReLU,
                              net_arch=[256, 256, 128])
 
-        model = TDQN('TMultiInputPolicy', train_env,
+        model = TDQN('TMultiInputPolicy', self.train_env,
                      tensorboard_log=f"{path}/tdqn/tensorboard/",
                      policy_kwargs=policy_kwargs,
                      replay_buffer_class=ReplayBuffer,
@@ -58,12 +48,33 @@
             callback=eval_callback
         )
 
+        del model
        best_model = DQN.load(dk.data_path / "best_model")
         print('Training finished!')
-
+        gc.collect()
         return best_model
 
+    def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test):
+        """
+        User overrides this as shown here if they are using a custom MyRLEnv.
+        """
+        train_df = data_dictionary["train_features"]
data_dictionary["train_features"] + test_df = data_dictionary["test_features"] + + # environments + if not self.train_env: + self.train_env = MyRLEnv(df=train_df, prices=prices_train, window_size=self.CONV_WIDTH, + reward_kwargs=self.reward_params) + self.eval_env = Monitor(MyRLEnv(df=test_df, prices=prices_test, + window_size=self.CONV_WIDTH, + reward_kwargs=self.reward_params), ".") + else: + self.train_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params) + self.eval_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params) + self.train_env.reset() + self.eval_env.reset() + # User can inherit and customize 5 action environment class MyRLEnv(Base5ActionRLEnv): diff --git a/freqtrade/freqai/prediction_models/ReinforcementLearningTDQN_multiproc.py b/freqtrade/freqai/prediction_models/ReinforcementLearningTDQN_multiproc.py index bf9e03b7f..8634fd958 100644 --- a/freqtrade/freqai/prediction_models/ReinforcementLearningTDQN_multiproc.py +++ b/freqtrade/freqai/prediction_models/ReinforcementLearningTDQN_multiproc.py @@ -4,8 +4,8 @@ import torch as th import numpy as np import gym from typing import Callable -from stable_baselines3.common.callbacks import ( - EvalCallback, StopTrainingOnNoModelImprovement, StopTrainingOnRewardThreshold) +from stable_baselines3.common.callbacks import EvalCallback +# EvalCallback , StopTrainingOnNoModelImprovement, StopTrainingOnRewardThreshold from stable_baselines3.common.monitor import Monitor from stable_baselines3.common.vec_env import SubprocVecEnv from stable_baselines3.common.utils import set_random_seed @@ -15,7 +15,6 @@ from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcement from freqtrade.freqai.RL.TDQNagent import TDQN from stable_baselines3.common.buffers import ReplayBuffer from freqtrade.freqai.data_kitchen import FreqaiDataKitchen -from pandas import DataFrame logger = logging.getLogger(__name__) @@ -47,46 +46,23 @@ class ReinforcementLearningTDQN_multiproc(BaseReinforcementLearningModel): User created Reinforcement Learning Model prediction model. 
""" - def fit_rl(self, data_dictionary: Dict[str, Any], pair: str, dk: FreqaiDataKitchen, - prices_train: DataFrame, prices_test: DataFrame): + def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen): train_df = data_dictionary["train_features"] test_df = data_dictionary["test_features"] eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df) total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df) - env_id = "train_env" - num_cpu = int(dk.thread_count / 2) - train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train, - self.reward_params, self.CONV_WIDTH) for i in range(num_cpu)]) - - eval_env_id = 'eval_env' - eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test, - self.reward_params, self.CONV_WIDTH, monitor=True) for i in - range(num_cpu)]) - path = dk.data_path - stop_train_callback = StopTrainingOnNoModelImprovement( - max_no_improvement_evals=5, - min_evals=10, - verbose=2 - ) - callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-200, verbose=2) - eval_callback = EvalCallback( - eval_env, best_model_save_path=f"{path}/", - log_path=f"{path}/tdqn/logs/", - eval_freq=int(eval_freq), - deterministic=True, - render=True, - callback_after_eval=stop_train_callback, - callback_on_new_best=callback_on_best, - verbose=2 - ) + + eval_callback = EvalCallback(self.eval_env, best_model_save_path=f"{path}/", + log_path=f"{path}/tdqn/logs/", eval_freq=int(eval_freq), + deterministic=True, render=False) # model arch policy_kwargs = dict(activation_fn=th.nn.ReLU, net_arch=[512, 512, 512]) - model = TDQN('TMultiInputPolicy', train_env, + model = TDQN('TMultiInputPolicy', self.train_env, policy_kwargs=policy_kwargs, tensorboard_log=f"{path}/tdqn/tensorboard/", replay_buffer_class=ReplayBuffer, @@ -100,12 +76,40 @@ class ReinforcementLearningTDQN_multiproc(BaseReinforcementLearningModel): best_model = DQN.load(dk.data_path / "best_model.zip") print('Training finished!') - eval_env.close() return best_model + def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test): + """ + User overrides this in their prediction model if they are custom a MyRLEnv. Othwerwise + leaving this will default to Base5ActEnv + """ + train_df = data_dictionary["train_features"] + test_df = data_dictionary["test_features"] + + # environments + if not self.train_env: + env_id = "train_env" + num_cpu = int(self.freqai_info["data_kitchen_thread_count"] / 2) + self.train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train, + self.reward_params, self.CONV_WIDTH) for i + in range(num_cpu)]) + + eval_env_id = 'eval_env' + self.eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test, + self.reward_params, self.CONV_WIDTH, monitor=True) for i + in range(num_cpu)]) + else: + self.train_env.env_method('reset_env', train_df, prices_train, + self.CONV_WIDTH, self.reward_params) + self.eval_env.env_method('reset_env', train_df, prices_train, + self.CONV_WIDTH, self.reward_params) + self.train_env.env_method('reset') + self.eval_env.env_method('reset') # User can inherit and customize 5 action environment + + class MyRLEnv(Base5ActionRLEnv): """ User can override any function in BaseRLEnv and gym.Env. Here the user