diff --git a/freqtrade/freqai/prediction_models/ReinforcementLearning.py b/freqtrade/freqai/prediction_models/ReinforcementLearning.py
index e208707eb..5783baba8 100644
--- a/freqtrade/freqai/prediction_models/ReinforcementLearning.py
+++ b/freqtrade/freqai/prediction_models/ReinforcementLearning.py
@@ -5,14 +5,23 @@ import numpy as np
 import numpy.typing as npt
 import pandas as pd
 from pandas import DataFrame
-from stable_baselines.common.callbacks import CallbackList, CheckpointCallback, EvalCallback

 from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
 from freqtrade.freqai.freqai_interface import IFreqaiModel
 from freqtrade.freqai.prediction_models.RL.RLPrediction_agent import RLPrediction_agent
+from freqtrade.freqai.prediction_models.RL.RLPrediction_agent_v2 import TDQN
 #from freqtrade.freqai.prediction_models.RL.RLPrediction_env import GymAnytrading
 from freqtrade.freqai.prediction_models.RL.RLPrediction_env import DEnv
 from freqtrade.persistence import Trade
+from stable_baselines3.common.vec_env import SubprocVecEnv
+from stable_baselines3.common.monitor import Monitor
+
+import torch as th
+from stable_baselines3.common.callbacks import CallbackList, CheckpointCallback, EvalCallback, StopTrainingOnRewardThreshold
+from stable_baselines3.common.buffers import ReplayBuffer
+from stable_baselines3 import PPO
+
+


 logger = logging.getLogger(__name__)
@@ -74,47 +83,127 @@ class ReinforcementLearningModel(IFreqaiModel):

     def fit(self, data_dictionary: Dict[str, Any], pair: str = ''):

-        train_df = data_dictionary["train_features"]
-        # train_labels = data_dictionary["train_labels"]
-        test_df = data_dictionary["test_features"]
-        # test_labels = data_dictionary["test_labels"]
+        # train_df = data_dictionary["train_features"]
+        # # train_labels = data_dictionary["train_labels"]
+        # test_df = data_dictionary["test_features"]
+        # # test_labels = data_dictionary["test_labels"]

-        # sep = '/'
-        # coin = pair.split(sep, 1)[0]
-        # price = train_df[f"%-{coin}raw_price_{self.config['timeframe']}"]
-        # price.reset_index(inplace=True, drop=True)
-        # price = price.to_frame()
-        price = self.dd.historic_data[pair][f"{self.config['timeframe']}"].tail(len(train_df.index))
+        # # sep = '/'
+        # # coin = pair.split(sep, 1)[0]
+        # # price = train_df[f"%-{coin}raw_price_{self.config['timeframe']}"]
+        # # price.reset_index(inplace=True, drop=True)
+        # # price = price.to_frame()
+        # price = self.dd.historic_data[pair][f"{self.config['timeframe']}"].tail(len(train_df.index))
+        # price_test = self.dd.historic_data[pair][f"{self.config['timeframe']}"].tail(len(test_df.index))
+
+        # #train_env = GymAnytrading(train_df, price, self.CONV_WIDTH)
+
+        # agent_params = self.freqai_info['model_training_parameters']
+        # reward_params = self.freqai_info['model_reward_parameters']

-        model_name = 'ppo'
+        # train_env = DEnv(df=train_df, prices=price, window_size=self.CONV_WIDTH, reward_kwargs=reward_params)
+        # #eval_env = DEnv(df=test_df, prices=price_test, window_size=self.CONV_WIDTH, reward_kwargs=reward_params)
+
+        # #env_instance = SubprocVecEnv([DEnv(df=train_df, prices=price, window_size=self.CONV_WIDTH, reward_kwargs=reward_params)])
+        # #train_env.reset()
+        # #eval_env.reset()
+
+        # # model
+
+        # #policy_kwargs = dict(net_arch=[512, 512, 512])
+        # policy_kwargs = dict(activation_fn=th.nn.Tanh,
+        #                      net_arch=[256, 256, 256])
+        # agent = RLPrediction_agent(train_env)
+        # #eval_agent = RLPrediction_agent(eval_env)

-        #env_instance = GymAnytrading(train_df, price, self.CONV_WIDTH)
+        # # PPO
+        # model_name = 'ppo'
+        # model = agent.get_model(model_name, model_kwargs=agent_params, policy_kwargs=policy_kwargs)
+        # trained_model = agent.train_model(model=model,
+        #                                   tb_log_name=model_name,
+        #                                   model_kwargs=agent_params,
+        #                                   train_df=train_df,
+        #                                   test_df=test_df,
+        #                                   price=price,
+        #                                   price_test=price_test,
+        #                                   window_size=self.CONV_WIDTH)
+
+
+        # # best_model = eval_agent.train_model(model=model,
+        # #                                   tb_log_name=model_name,
+        # #                                   model_kwargs=agent_params,
+        # #                                   eval=eval_env)
+
+
+        # # TDQN
+        # # model_name = 'TDQN'
+        # # model = TDQN('TMultiInputPolicy', train_env, policy_kwargs=policy_kwargs, tensorboard_log='./tensorboard_log/',
+        # #              learning_rate=agent_params["learning_rate"], gamma=0.9,
+        # #              target_update_interval=5000, buffer_size=50000,
+        # #              exploration_initial_eps=1, exploration_final_eps=0.1,
+        # #              replay_buffer_class=ReplayBuffer
+        # #              )
+
+        # # trained_model = agent.train_model(model=model,
+        # #                                   tb_log_name=model_name,
+        # #                                   model_kwargs=agent_params)
+        # #model.learn(
+        # #    total_timesteps=5000,
+        # #    callback=callback
+        # #    )

         agent_params = self.freqai_info['model_training_parameters']
         reward_params = self.freqai_info['model_reward_parameters']
+        train_df = data_dictionary["train_features"]
+        test_df = data_dictionary["test_features"]

-        env_instance = DEnv(df=train_df, prices=price, window_size=self.CONV_WIDTH, reward_kwargs=reward_params)
-        agent = RLPrediction_agent(env_instance)
+        # price data for model training and evaluation
+        price = self.dd.historic_data[pair][f"{self.config['timeframe']}"].tail(len(train_df.index))
+        price_test = self.dd.historic_data[pair][f"{self.config['timeframe']}"].tail(len(test_df.index))
+
+        # environments
+        train_env = DEnv(df=train_df, prices=price, window_size=self.CONV_WIDTH, reward_kwargs=reward_params)
+        eval = DEnv(df=test_df, prices=price_test, window_size=self.CONV_WIDTH, reward_kwargs=reward_params)
+        eval_env = Monitor(eval, ".")
+        eval_env.reset()

-        # checkpoint_callback = CheckpointCallback(save_freq=1000, save_path='./logs/')
-        # eval_callback = EvalCallback(test_df, best_model_save_path='./models/',
-        #                              log_path='./logs/', eval_freq=10000,
-        #                              deterministic=True, render=False)
+        # this should be in config - TODO
+        agent_type = 'tdqn'

-        # #Create the callback list
-        # callback = CallbackList([checkpoint_callback, eval_callback])
+        path = self.dk.data_path
+        eval_callback = EvalCallback(eval_env, best_model_save_path=f"{path}/",
+                                     log_path=f"{path}/{agent_type}/logs/", eval_freq=10000,
+                                     deterministic=True, render=False)

-        model = agent.get_model(model_name, model_kwargs=agent_params)
-        trained_model = agent.train_model(model=model,
-                                          tb_log_name=model_name,
-                                          model_kwargs=agent_params)
-        #eval_callback=callback)
+        # model arch
+        policy_kwargs = dict(activation_fn=th.nn.Tanh,
+                             net_arch=[512, 512, 512])
+
+
+        if agent_type == 'tdqn':
+            model = TDQN('TMultiInputPolicy', train_env, policy_kwargs=policy_kwargs, tensorboard_log=f"{path}/{agent_type}/tensorboard/",
+                         learning_rate=0.00025, gamma=0.9,
+                         target_update_interval=5000, buffer_size=50000,
+                         exploration_initial_eps=1, exploration_final_eps=0.1,
+                         replay_buffer_class=ReplayBuffer
+                         )
+        elif agent_type == 'ppo':
+            model = PPO('MultiInputPolicy', train_env, policy_kwargs=policy_kwargs, tensorboard_log=f"{path}/{agent_type}/tensorboard/",
+                        learning_rate=0.00025, gamma=0.9
+                        )
+
+        model.learn(
+            total_timesteps=agent_params["total_timesteps"],
+            callback=eval_callback
+        )

         print('Training finished!')

-        return trained_model
+        return model
+
+

     def get_state_info(self, pair):
         open_trades = Trade.get_trades(trade_filter=Trade.is_open.is_(True))
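For reference, below is a minimal, self-contained sketch (not part of the diff) of the stable_baselines3 wiring the new fit() relies on: an evaluation environment wrapped in Monitor, handed to an EvalCallback, which is then passed to model.learn(). It assumes gym's CartPole-v1 and a PPO MlpPolicy as stand-ins for the custom DEnv and TDQN classes; the save paths and timestep count are illustrative only.

import gym

from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor

# training env and Monitor-wrapped evaluation env (DEnv in the real model)
train_env = gym.make("CartPole-v1")
eval_env = Monitor(gym.make("CartPole-v1"), ".")  # Monitor records episode rewards/lengths

# evaluate every eval_freq steps and keep the best checkpoint, as in the diff above
eval_callback = EvalCallback(eval_env,
                             best_model_save_path="./models/",
                             log_path="./logs/",
                             eval_freq=10000,
                             deterministic=True,
                             render=False)

model = PPO("MlpPolicy", train_env, learning_rate=0.00025, gamma=0.9)
model.learn(total_timesteps=50_000, callback=eval_callback)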