reuse callback, allow user to access all stable_baselines3 agents via config
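This change drops the per-agent save types (stable_baselines_ppo, stable_baselines_dqn) in favour of a single "stable_baselines" model_save_type and resolves the concrete agent class at runtime from rl_config.model_type, so any algorithm exported at the top level of stable_baselines3 (PPO, A2C, DQN, ...) can be selected from the config. A minimal sketch of that lookup, equivalent to the __import__/getattr calls in the diff below (the "PPO" value is just the configured example):

    import importlib

    model_type = "PPO"  # read from freqai.rl_config.model_type in the config
    sb3 = importlib.import_module("stable_baselines3")
    MODELCLASS = getattr(sb3, model_type)  # e.g. stable_baselines3.PPO or stable_baselines3.DQN
    # MODELCLASS is then used both to build the agent (MODELCLASS(policy_type, env, ...))
    # and to reload saved models (MODELCLASS.load(path)) in FreqaiDataDrawer.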
This commit is contained in:
		| @@ -55,7 +55,7 @@ | ||||
|     ], | ||||
|     "freqai": { | ||||
|         "enabled": true, | ||||
|         "model_save_type": "stable_baselines_dqn", | ||||
|         "model_save_type": "stable_baselines", | ||||
|         "conv_width": 10, | ||||
|         "purge_old_models": true, | ||||
|         "train_period_days": 10, | ||||
| @@ -85,8 +85,11 @@ | ||||
|             "verbose": 1 | ||||
|         }, | ||||
|         "rl_config": { | ||||
|             "train_cycles": 15, | ||||
|             "eval_cycles": 5, | ||||
|             "train_cycles": 10, | ||||
|             "eval_cycles": 3, | ||||
|             "thread_count": 4, | ||||
|             "model_type": "PPO", | ||||
|             "policy_type": "MlpPolicy", | ||||
|             "model_reward_parameters": { | ||||
|                 "rr": 1, | ||||
|                 "profit_aim": 0.02 | ||||
|   | ||||
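For a rough sense of scale, the training code in this commit derives its step counts directly from these settings; the row counts below are hypothetical, while the formulas mirror fit_rl and set_train_and_eval_environments:

    train_rows, test_rows = 5000, 1000           # hypothetical dataframe sizes
    train_cycles, eval_cycles = 10, 3            # values from the rl_config above
    total_timesteps = train_cycles * train_rows  # 50_000 steps passed to model.learn()
    eval_freq = eval_cycles * test_rows          # evaluate every 3_000 steps via EvalCallback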
| @@ -266,59 +266,28 @@ class Base5ActionRLEnv(gym.Env): | ||||
|  | ||||
|         # close long | ||||
|         if action == Actions.Long_exit.value and self._position == Positions.Long: | ||||
|             if len(self.close_trade_profit): | ||||
|                 # aim x2 rw | ||||
|                 if self.close_trade_profit[-1] > self.profit_aim * self.rr: | ||||
|                     last_trade_price = self.add_buy_fee( | ||||
|                         self.prices.iloc[self._last_trade_tick].open) | ||||
|                     current_price = self.add_sell_fee( | ||||
|                         self.prices.iloc[self._current_tick].open) | ||||
|                     return float((np.log(current_price) - np.log(last_trade_price)) * 2) | ||||
|                 # less than aim x1 rw | ||||
|                 elif self.close_trade_profit[-1] < self.profit_aim * self.rr: | ||||
|                     last_trade_price = self.add_buy_fee( | ||||
|                         self.prices.iloc[self._last_trade_tick].open | ||||
|                     ) | ||||
|                     current_price = self.add_sell_fee( | ||||
|                         self.prices.iloc[self._current_tick].open | ||||
|                     ) | ||||
|                     return float(np.log(current_price) - np.log(last_trade_price)) | ||||
|                 # # less than RR SL x2 neg rw | ||||
|                 # elif self.close_trade_profit[-1] < (self.profit_aim * -1): | ||||
|                 #     last_trade_price = self.add_buy_fee( | ||||
|                 #         self.prices.iloc[self._last_trade_tick].open) | ||||
|                 #     current_price = self.add_sell_fee( | ||||
|                 #         self.prices.iloc[self._current_tick].open) | ||||
|                 #     return float((np.log(current_price) - np.log(last_trade_price)) * 2) * -1 | ||||
|             last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|             current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) | ||||
|             return float(np.log(current_price) - np.log(last_trade_price)) | ||||
|  | ||||
|         if action == Actions.Long_exit.value and self._position == Positions.Long: | ||||
|             if self.close_trade_profit[-1] > self.profit_aim * self.rr: | ||||
|                 last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|                 current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) | ||||
|                 return float((np.log(current_price) - np.log(last_trade_price)) * 2) | ||||
|  | ||||
|         # close short | ||||
|         if action == Actions.Short_exit.value and self._position == Positions.Short: | ||||
|             if len(self.close_trade_profit): | ||||
|                 # aim x2 rw | ||||
|                 if self.close_trade_profit[-1] > self.profit_aim * self.rr: | ||||
|                     last_trade_price = self.add_sell_fee( | ||||
|                         self.prices.iloc[self._last_trade_tick].open | ||||
|                     ) | ||||
|                     current_price = self.add_buy_fee( | ||||
|                         self.prices.iloc[self._current_tick].open | ||||
|                     ) | ||||
|                     return float((np.log(last_trade_price) - np.log(current_price)) * 2) | ||||
|                 # less than aim x1 rw | ||||
|                 elif self.close_trade_profit[-1] < self.profit_aim * self.rr: | ||||
|                     last_trade_price = self.add_sell_fee( | ||||
|                         self.prices.iloc[self._last_trade_tick].open | ||||
|                     ) | ||||
|                     current_price = self.add_buy_fee( | ||||
|                         self.prices.iloc[self._current_tick].open | ||||
|                     ) | ||||
|                     return float(np.log(last_trade_price) - np.log(current_price)) | ||||
|                 # # less than RR SL x2 neg rw | ||||
|                 # elif self.close_trade_profit[-1] > self.profit_aim * self.rr: | ||||
|                 #     last_trade_price = self.add_sell_fee( | ||||
|                 #         self.prices.iloc[self._last_trade_tick].open) | ||||
|                 #     current_price = self.add_buy_fee( | ||||
|                 #         self.prices.iloc[self._current_tick].open) | ||||
|                 #     return float((np.log(last_trade_price) - np.log(current_price)) * 2) * -1 | ||||
|             last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|             current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) | ||||
|             return float(np.log(last_trade_price) - np.log(current_price)) | ||||
|  | ||||
|         if action == Actions.Short_exit.value and self._position == Positions.Short: | ||||
|             if self.close_trade_profit[-1] > self.profit_aim * self.rr: | ||||
|                 last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|                 current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) | ||||
|                 return float((np.log(last_trade_price) - np.log(current_price)) * 2) | ||||
|  | ||||
|         return 0. | ||||
|  | ||||
|     def _update_profit(self, action): | ||||
|   | ||||
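The simplified branches above all reduce to the fee-adjusted log return of the position being closed. A worked example, assuming add_buy_fee/add_sell_fee adjust the open price by the exchange fee (prices and fee are hypothetical):

    import numpy as np

    fee = 0.001
    last_trade_price = 100.0 * (1 + fee)   # hypothetical long entry with buy fee added
    current_price = 102.0 * (1 - fee)      # hypothetical exit with sell fee subtracted
    reward = float(np.log(current_price) - np.log(last_trade_price))  # ~0.0178 for closing the long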
| @@ -11,8 +11,12 @@ from freqtrade.freqai.freqai_interface import IFreqaiModel | ||||
| from freqtrade.freqai.RL.Base5ActionRLEnv import Base5ActionRLEnv, Actions, Positions | ||||
| from freqtrade.persistence import Trade | ||||
| import torch.multiprocessing | ||||
| from stable_baselines3.common.callbacks import EvalCallback | ||||
| from stable_baselines3.common.monitor import Monitor | ||||
| import torch as th | ||||
| from typing import Callable | ||||
| from stable_baselines3.common.utils import set_random_seed | ||||
| import gym | ||||
| logger = logging.getLogger(__name__) | ||||
|  | ||||
| torch.multiprocessing.set_sharing_strategy('file_system') | ||||
| @@ -25,9 +29,15 @@ class BaseReinforcementLearningModel(IFreqaiModel): | ||||
|  | ||||
|     def __init__(self, **kwargs): | ||||
|         super().__init__(config=kwargs['config']) | ||||
|         th.set_num_threads(self.freqai_info.get('data_kitchen_thread_count', 4)) | ||||
|         th.set_num_threads(self.freqai_info['rl_config'].get('thread_count', 4)) | ||||
|         self.reward_params = self.freqai_info['rl_config']['model_reward_parameters'] | ||||
|         self.train_env: Base5ActionRLEnv = None | ||||
|         self.eval_env: Base5ActionRLEnv = None | ||||
|         self.eval_callback: EvalCallback = None | ||||
|         mod = __import__('stable_baselines3', fromlist=[ | ||||
|                          self.freqai_info['rl_config']['model_type']]) | ||||
|         self.MODELCLASS = getattr(mod, self.freqai_info['rl_config']['model_type']) | ||||
|         self.policy_type = self.freqai_info['rl_config']['policy_type'] | ||||
|  | ||||
|     def train( | ||||
|         self, unfiltered_dataframe: DataFrame, pair: str, dk: FreqaiDataKitchen | ||||
| @@ -67,7 +77,7 @@ class BaseReinforcementLearningModel(IFreqaiModel): | ||||
|         ) | ||||
|         logger.info(f'Training model on {len(data_dictionary["train_features"])} data points') | ||||
|  | ||||
|         self.set_train_and_eval_environments(data_dictionary, prices_train, prices_test) | ||||
|         self.set_train_and_eval_environments(data_dictionary, prices_train, prices_test, dk) | ||||
|  | ||||
|         model = self.fit_rl(data_dictionary, dk) | ||||
|  | ||||
| @@ -75,13 +85,13 @@ class BaseReinforcementLearningModel(IFreqaiModel): | ||||
|  | ||||
|         return model | ||||
|  | ||||
|     def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test): | ||||
|     def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test, dk): | ||||
|         """ | ||||
|         User overrides this in their prediction model if they are custom a MyRLEnv. Othwerwise | ||||
|         leaving this will default to Base5ActEnv | ||||
|         User overrides this as shown here if they are using a custom MyRLEnv | ||||
|         """ | ||||
|         train_df = data_dictionary["train_features"] | ||||
|         test_df = data_dictionary["test_features"] | ||||
|         eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df) | ||||
|  | ||||
|         # environments | ||||
|         if not self.train_env: | ||||
| @@ -90,11 +100,17 @@ class BaseReinforcementLearningModel(IFreqaiModel): | ||||
|             self.eval_env = Monitor(MyRLEnv(df=test_df, prices=prices_test, | ||||
|                                     window_size=self.CONV_WIDTH, | ||||
|                                     reward_kwargs=self.reward_params), ".") | ||||
|             self.eval_callback = EvalCallback(self.eval_env, deterministic=True, | ||||
|                                               render=False, eval_freq=eval_freq, | ||||
|                                               best_model_save_path=dk.data_path) | ||||
|         else: | ||||
|             self.train_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params) | ||||
|             self.eval_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params) | ||||
|             self.train_env.reset() | ||||
|             self.eval_env.reset() | ||||
|             self.train_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params) | ||||
|             self.eval_env.reset_env(test_df, prices_test, self.CONV_WIDTH, self.reward_params) | ||||
|             self.eval_callback.__init__(self.eval_env, deterministic=True, | ||||
|                                         render=False, eval_freq=eval_freq, | ||||
|                                         best_model_save_path=dk.data_path) | ||||
|  | ||||
|     @abstractmethod | ||||
|     def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen): | ||||
| @@ -206,16 +222,28 @@ class BaseReinforcementLearningModel(IFreqaiModel): | ||||
|     # all the other existing fit() functions to include dk argument. For now we instantiate and | ||||
|     # leave it. | ||||
|     def fit(self, data_dictionary: Dict[str, Any], pair: str = '') -> Any: | ||||
|         """ | ||||
|         Most regressors use the same function names and arguments e.g. user | ||||
|         can drop in LGBMRegressor in place of CatBoostRegressor and all data | ||||
|         management will be properly handled by Freqai. | ||||
|         :param data_dictionary: Dict = the dictionary constructed by DataHandler to hold | ||||
|                                 all the training and test data/labels. | ||||
|         """ | ||||
|  | ||||
|         return | ||||
|  | ||||
| def make_env(env_id: str, rank: int, seed: int, train_df, price, | ||||
|              reward_params, window_size, monitor=False) -> Callable: | ||||
|     """ | ||||
|     Utility function for multiprocessed env. | ||||
|  | ||||
|     :param env_id: (str) the environment ID | ||||
|     :param num_env: (int) the number of environments you wish to have in subprocesses | ||||
|     :param seed: (int) the initial seed for RNG | ||||
|     :param rank: (int) index of the subprocess | ||||
|     :return: (Callable) | ||||
|     """ | ||||
|     def _init() -> gym.Env: | ||||
|  | ||||
|         env = MyRLEnv(df=train_df, prices=price, window_size=window_size, | ||||
|                       reward_kwargs=reward_params, id=env_id, seed=seed + rank) | ||||
|         if monitor: | ||||
|             env = Monitor(env, ".") | ||||
|         return env | ||||
|     set_random_seed(seed) | ||||
|     return _init | ||||
|  | ||||
| class MyRLEnv(Base5ActionRLEnv): | ||||
|     """ | ||||
| @@ -229,24 +257,24 @@ class MyRLEnv(Base5ActionRLEnv): | ||||
|             return 0. | ||||
|  | ||||
|         # close long | ||||
|         if action == Actions.Long_sell.value and self._position == Positions.Long: | ||||
|         if action == Actions.Long_exit.value and self._position == Positions.Long: | ||||
|             last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|             current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) | ||||
|             return float(np.log(current_price) - np.log(last_trade_price)) | ||||
|  | ||||
|         if action == Actions.Long_sell.value and self._position == Positions.Long: | ||||
|         if action == Actions.Long_exit.value and self._position == Positions.Long: | ||||
|             if self.close_trade_profit[-1] > self.profit_aim * self.rr: | ||||
|                 last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|                 current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) | ||||
|                 return float((np.log(current_price) - np.log(last_trade_price)) * 2) | ||||
|  | ||||
|         # close short | ||||
|         if action == Actions.Short_buy.value and self._position == Positions.Short: | ||||
|         if action == Actions.Short_exit.value and self._position == Positions.Short: | ||||
|             last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|             current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) | ||||
|             return float(np.log(last_trade_price) - np.log(current_price)) | ||||
|  | ||||
|         if action == Actions.Short_buy.value and self._position == Positions.Short: | ||||
|         if action == Actions.Short_exit.value and self._position == Positions.Short: | ||||
|             if self.close_trade_profit[-1] > self.profit_aim * self.rr: | ||||
|                 last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|                 current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) | ||||
|   | ||||
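make_env above returns a picklable initializer rather than an environment, which is the form SubprocVecEnv expects when spawning worker processes; ReinforcementLearner_multiproc later in this commit uses it exactly that way. A condensed sketch, assuming the dataframes and reward parameters from set_train_and_eval_environments are in scope and with an illustrative worker count:

    from stable_baselines3.common.vec_env import SubprocVecEnv

    num_cpu = 4  # illustrative; the multiproc model derives this from data_kitchen_thread_count
    train_env = SubprocVecEnv([
        make_env("train_env", i, seed=1, train_df=train_df, price=prices_train,
                 reward_params=reward_params, window_size=10)
        for i in range(num_cpu)
    ])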
| @@ -471,12 +471,11 @@ class FreqaiDataDrawer: | ||||
|         elif model_type == 'keras': | ||||
|             from tensorflow import keras | ||||
|             model = keras.models.load_model(dk.data_path / f"{dk.model_filename}_model.h5") | ||||
|         elif model_type == 'stable_baselines_ppo': | ||||
|             from stable_baselines3.ppo.ppo import PPO | ||||
|             model = PPO.load(dk.data_path / f"{dk.model_filename}_model") | ||||
|         elif model_type == 'stable_baselines_dqn': | ||||
|             from stable_baselines3 import DQN | ||||
|             model = DQN.load(dk.data_path / f"{dk.model_filename}_model") | ||||
|         elif model_type == 'stable_baselines': | ||||
|             mod = __import__('stable_baselines3', fromlist=[ | ||||
|                              self.freqai_info['rl_config']['model_type']]) | ||||
|             MODELCLASS = getattr(mod, self.freqai_info['rl_config']['model_type']) | ||||
|             model = MODELCLASS.load(dk.data_path / f"{dk.model_filename}_model") | ||||
|  | ||||
|         if Path(dk.data_path / f"{dk.model_filename}_svm_model.joblib").is_file(): | ||||
|             dk.svm_model = load(dk.data_path / f"{dk.model_filename}_svm_model.joblib") | ||||
|   | ||||
freqtrade/freqai/prediction_models/ReinforcementLearner.py (new file, 82 lines added)
| @@ -0,0 +1,82 @@ | ||||
| import logging | ||||
| from typing import Any, Dict  # , Tuple | ||||
|  | ||||
| # import numpy.typing as npt | ||||
| import torch as th | ||||
| import numpy as np | ||||
| from freqtrade.freqai.data_kitchen import FreqaiDataKitchen | ||||
| from freqtrade.freqai.RL.Base5ActionRLEnv import Actions, Base5ActionRLEnv, Positions | ||||
| from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel | ||||
| from pathlib import Path | ||||
|  | ||||
| logger = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| class ReinforcementLearner(BaseReinforcementLearningModel): | ||||
|     """ | ||||
|     User created Reinforcement Learning Model prediction model. | ||||
|     """ | ||||
|  | ||||
|     def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen): | ||||
|  | ||||
|         train_df = data_dictionary["train_features"] | ||||
|         total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df) | ||||
|  | ||||
|         policy_kwargs = dict(activation_fn=th.nn.ReLU, | ||||
|                              net_arch=[256, 256, 128]) | ||||
|  | ||||
|         model = self.MODELCLASS(self.policy_type, self.train_env, policy_kwargs=policy_kwargs, | ||||
|                                 tensorboard_log=Path(dk.data_path / "tensorboard"), | ||||
|                                 **self.freqai_info['model_training_parameters'] | ||||
|                                 ) | ||||
|  | ||||
|         model.learn( | ||||
|             total_timesteps=int(total_timesteps), | ||||
|             callback=self.eval_callback | ||||
|         ) | ||||
|  | ||||
|         if Path(dk.data_path / "best_model.zip").is_file(): | ||||
|             logger.info('Callback found a best model.') | ||||
|             best_model = self.MODELCLASS.load(dk.data_path / "best_model") | ||||
|             return best_model | ||||
|  | ||||
|         logger.info("Couldn't find best model, using final model instead.") | ||||
|  | ||||
|         return model | ||||
|  | ||||
|  | ||||
| class MyRLEnv(Base5ActionRLEnv): | ||||
|     """ | ||||
|     User can modify any part of the environment by overriding base | ||||
|     functions | ||||
|     """ | ||||
|     def calculate_reward(self, action): | ||||
|  | ||||
|         if self._last_trade_tick is None: | ||||
|             return 0. | ||||
|  | ||||
|         # close long | ||||
|         if action == Actions.Long_exit.value and self._position == Positions.Long: | ||||
|             last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|             current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) | ||||
|             return float(np.log(current_price) - np.log(last_trade_price)) | ||||
|  | ||||
|         if action == Actions.Long_exit.value and self._position == Positions.Long: | ||||
|             if self.close_trade_profit[-1] > self.profit_aim * self.rr: | ||||
|                 last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|                 current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) | ||||
|                 return float((np.log(current_price) - np.log(last_trade_price)) * 2) | ||||
|  | ||||
|         # close short | ||||
|         if action == Actions.Short_exit.value and self._position == Positions.Short: | ||||
|             last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|             current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) | ||||
|             return float(np.log(last_trade_price) - np.log(current_price)) | ||||
|  | ||||
|         if action == Actions.Short_exit.value and self._position == Positions.Short: | ||||
|             if self.close_trade_profit[-1] > self.profit_aim * self.rr: | ||||
|                 last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|                 current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) | ||||
|                 return float((np.log(last_trade_price) - np.log(current_price)) * 2) | ||||
|  | ||||
|         return 0. | ||||
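Because fit_rl expands **self.freqai_info['model_training_parameters'] into the agent constructor, the keys of that config block must be valid keyword arguments of the chosen stable_baselines3 class. A hypothetical block for the PPO example (values are placeholders, not tuned recommendations):

    # assumed contents of "model_training_parameters" in the freqai config
    model_training_parameters = {
        "learning_rate": 3e-4,  # accepted by stable_baselines3 PPO
        "gamma": 0.9,
        "verbose": 1,           # matches the "verbose": 1 shown in the config hunk above
    }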
| @@ -1,17 +1,59 @@ | ||||
| from typing import Any, Dict, List, Optional, Tuple, Type, Union | ||||
|  | ||||
| import gym | ||||
| import torch | ||||
| import logging | ||||
| import torch as th | ||||
| from typing import Any, Dict, List, Optional, Tuple, Type, Union | ||||
| from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel | ||||
| from stable_baselines3 import DQN | ||||
| from stable_baselines3.common.buffers import ReplayBuffer | ||||
| from stable_baselines3.common.policies import BasePolicy | ||||
| from stable_baselines3.common.torch_layers import (BaseFeaturesExtractor, | ||||
|                                                    FlattenExtractor) | ||||
| from stable_baselines3.common.type_aliases import GymEnv, Schedule | ||||
| from freqtrade.freqai.data_kitchen import FreqaiDataKitchen | ||||
| from pathlib import Path | ||||
| from stable_baselines3.dqn.policies import (CnnPolicy, DQNPolicy, MlpPolicy, | ||||
|                                             QNetwork) | ||||
| from torch import nn | ||||
| import gym | ||||
| from stable_baselines3.common.torch_layers import (BaseFeaturesExtractor, | ||||
|                                                    FlattenExtractor) | ||||
| from stable_baselines3.common.type_aliases import GymEnv, Schedule | ||||
| from stable_baselines3.common.policies import BasePolicy | ||||
|  | ||||
| logger = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| class ReinforcementLearnerCustomAgent(BaseReinforcementLearningModel): | ||||
|     """ | ||||
|     User can customize agent by defining the class and using it directly. | ||||
|     Here the example is "TDQN" | ||||
|     """ | ||||
|  | ||||
|     def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen): | ||||
|  | ||||
|         train_df = data_dictionary["train_features"] | ||||
|         total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df) | ||||
|  | ||||
|         policy_kwargs = dict(activation_fn=th.nn.ReLU, | ||||
|                              net_arch=[256, 256, 128]) | ||||
|  | ||||
|         # TDQN is a custom agent defined below | ||||
|         model = TDQN(self.policy_type, self.train_env, | ||||
|                      tensorboard_log=Path(dk.data_path / "tensorboard"), | ||||
|                      policy_kwargs=policy_kwargs, | ||||
|                      **self.freqai_info['model_training_parameters'] | ||||
|                      ) | ||||
|  | ||||
|         model.learn( | ||||
|             total_timesteps=int(total_timesteps), | ||||
|             callback=self.eval_callback | ||||
|         ) | ||||
|  | ||||
|         if Path(dk.data_path / "best_model.zip").is_file(): | ||||
|             logger.info('Callback found a best model.') | ||||
|             best_model = self.MODELCLASS.load(dk.data_path / "best_model") | ||||
|             return best_model | ||||
|  | ||||
|         logger.info("Couldn't find best model, using final model instead.") | ||||
|  | ||||
|         return model | ||||
|  | ||||
| # User creates their custom agent and networks as shown below | ||||
|  | ||||
|  | ||||
| def create_mlp_( | ||||
| @@ -72,7 +114,7 @@ class TDQNetwork(QNetwork): | ||||
|  | ||||
|     def init_weights(self, m): | ||||
|         if type(m) == nn.Linear: | ||||
|             torch.nn.init.kaiming_uniform_(m.weight) | ||||
|             th.nn.init.kaiming_uniform_(m.weight) | ||||
|  | ||||
|  | ||||
| class TDQNPolicy(DQNPolicy): | ||||
| @@ -175,7 +217,7 @@ class TDQN(DQN): | ||||
|         exploration_initial_eps: float = 1.0, | ||||
|         exploration_final_eps: float = 0.05, | ||||
|         max_grad_norm: float = 10, | ||||
|         tensorboard_log: Optional[str] = None, | ||||
|         tensorboard_log: Optional[Path] = None, | ||||
|         create_eval_env: bool = False, | ||||
|         policy_kwargs: Optional[Dict[str, Any]] = None, | ||||
|         verbose: int = 1, | ||||
| @@ -0,0 +1,84 @@ | ||||
| import logging | ||||
| from typing import Any, Dict  # , Tuple | ||||
|  | ||||
| # import numpy.typing as npt | ||||
| import torch as th | ||||
| from stable_baselines3.common.callbacks import EvalCallback | ||||
| from stable_baselines3.common.vec_env import SubprocVecEnv | ||||
| from freqtrade.freqai.RL.BaseReinforcementLearningModel import (BaseReinforcementLearningModel, | ||||
|                                                                 make_env) | ||||
| from freqtrade.freqai.data_kitchen import FreqaiDataKitchen | ||||
|  | ||||
| from pathlib import Path | ||||
|  | ||||
| logger = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| class ReinforcementLearner_multiproc(BaseReinforcementLearningModel): | ||||
|     """ | ||||
|     User created Reinforcement Learning Model prediction model. | ||||
|     """ | ||||
|  | ||||
|     def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen): | ||||
|  | ||||
|         train_df = data_dictionary["train_features"] | ||||
|         total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df) | ||||
|  | ||||
|         # model arch | ||||
|         policy_kwargs = dict(activation_fn=th.nn.ReLU, | ||||
|                              net_arch=[512, 512, 512]) | ||||
|  | ||||
|         model = self.MODELCLASS(self.policy_type, self.train_env, policy_kwargs=policy_kwargs, | ||||
|                                 tensorboard_log=Path(dk.data_path / "tensorboard"), | ||||
|                                 **self.freqai_info['model_training_parameters'] | ||||
|                                 ) | ||||
|  | ||||
|         model.learn( | ||||
|             total_timesteps=int(total_timesteps), | ||||
|             callback=self.eval_callback | ||||
|         ) | ||||
|  | ||||
|         if Path(dk.data_path / "best_model.zip").is_file(): | ||||
|             logger.info('Callback found a best model.') | ||||
|             best_model = self.MODELCLASS.load(dk.data_path / "best_model") | ||||
|             return best_model | ||||
|  | ||||
|         logger.info("Couldn't find best model, using final model instead.") | ||||
|  | ||||
|         return model | ||||
|  | ||||
|     def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test, dk): | ||||
|         """ | ||||
|         If user has particular environment configuration needs, they can do that by | ||||
|         overriding this function. In the present case, the user wants to set up training | ||||
|         environments for multiple workers. | ||||
|         """ | ||||
|         train_df = data_dictionary["train_features"] | ||||
|         test_df = data_dictionary["test_features"] | ||||
|         eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df) | ||||
|  | ||||
|         # environments | ||||
|         if not self.train_env: | ||||
|             env_id = "train_env" | ||||
|             num_cpu = int(self.freqai_info["data_kitchen_thread_count"] / 2) | ||||
|             self.train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train, | ||||
|                                             self.reward_params, self.CONV_WIDTH) for i | ||||
|                                             in range(num_cpu)]) | ||||
|  | ||||
|             eval_env_id = 'eval_env' | ||||
|             self.eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test, | ||||
|                                            self.reward_params, self.CONV_WIDTH, monitor=True) for i | ||||
|                                            in range(num_cpu)]) | ||||
|             self.eval_callback = EvalCallback(self.eval_env, deterministic=True, | ||||
|                                               render=False, eval_freq=eval_freq, | ||||
|                                               best_model_save_path=dk.data_path) | ||||
|         else: | ||||
|             self.train_env.env_method('reset') | ||||
|             self.eval_env.env_method('reset') | ||||
|             self.train_env.env_method('reset_env', train_df, prices_train, | ||||
|                                       self.CONV_WIDTH, self.reward_params) | ||||
|             self.eval_env.env_method('reset_env', train_df, prices_train, | ||||
|                                      self.CONV_WIDTH, self.reward_params) | ||||
|             self.eval_callback.__init__(self.eval_env, deterministic=True, | ||||
|                                         render=False, eval_freq=eval_freq, | ||||
|                                         best_model_save_path=dk.data_path) | ||||
| @@ -1,104 +0,0 @@ | ||||
| import gc | ||||
| import logging | ||||
| from typing import Any, Dict  # , Tuple | ||||
|  | ||||
| import numpy as np | ||||
| # import numpy.typing as npt | ||||
| import torch as th | ||||
| from stable_baselines3 import PPO | ||||
| from stable_baselines3.common.callbacks import EvalCallback | ||||
| from stable_baselines3.common.monitor import Monitor | ||||
|  | ||||
| from freqtrade.freqai.data_kitchen import FreqaiDataKitchen | ||||
| from freqtrade.freqai.RL.Base3ActionRLEnv import Actions, Base3ActionRLEnv, Positions | ||||
| from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel | ||||
|  | ||||
|  | ||||
| logger = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| class ReinforcementLearningPPO(BaseReinforcementLearningModel): | ||||
|     """ | ||||
|     User created Reinforcement Learning Model prediction model. | ||||
|     """ | ||||
|  | ||||
|     def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen): | ||||
|  | ||||
|         train_df = data_dictionary["train_features"] | ||||
|         test_df = data_dictionary["test_features"] | ||||
|         eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df) | ||||
|         total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df) | ||||
|  | ||||
|         path = dk.data_path | ||||
|         eval_callback = EvalCallback(self.eval_env, best_model_save_path=f"{path}/", | ||||
|                                      log_path=f"{path}/ppo/logs/", eval_freq=int(eval_freq), | ||||
|                                      deterministic=True, render=False) | ||||
|  | ||||
|         # model arch | ||||
|         policy_kwargs = dict(activation_fn=th.nn.ReLU, | ||||
|                              net_arch=[256, 256, 128]) | ||||
|  | ||||
|         model = PPO('MlpPolicy', self.train_env, policy_kwargs=policy_kwargs, | ||||
|                     tensorboard_log=f"{path}/ppo/tensorboard/", | ||||
|                     **self.freqai_info['model_training_parameters'] | ||||
|                     ) | ||||
|  | ||||
|         model.learn( | ||||
|             total_timesteps=int(total_timesteps), | ||||
|             callback=eval_callback | ||||
|         ) | ||||
|  | ||||
|         del model | ||||
|         best_model = PPO.load(dk.data_path / "best_model") | ||||
|  | ||||
|         print('Training finished!') | ||||
|         gc.collect() | ||||
|  | ||||
|         return best_model | ||||
|  | ||||
|     def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test): | ||||
|         """ | ||||
|         User overrides this as shown here if they are using a custom MyRLEnv | ||||
|         """ | ||||
|         train_df = data_dictionary["train_features"] | ||||
|         test_df = data_dictionary["test_features"] | ||||
|  | ||||
|         # environments | ||||
|         if not self.train_env: | ||||
|             self.train_env = MyRLEnv(df=train_df, prices=prices_train, window_size=self.CONV_WIDTH, | ||||
|                                      reward_kwargs=self.reward_params) | ||||
|             self.eval_env = Monitor(MyRLEnv(df=test_df, prices=prices_test, | ||||
|                                     window_size=self.CONV_WIDTH, | ||||
|                                     reward_kwargs=self.reward_params), ".") | ||||
|         else: | ||||
|             self.train_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params) | ||||
|             self.eval_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params) | ||||
|             self.train_env.reset() | ||||
|             self.eval_env.reset() | ||||
|  | ||||
|  | ||||
| class MyRLEnv(Base3ActionRLEnv): | ||||
|     """ | ||||
|     User can override any function in BaseRLEnv and gym.Env | ||||
|     """ | ||||
|  | ||||
|     def calculate_reward(self, action): | ||||
|  | ||||
|         if self._last_trade_tick is None: | ||||
|             return 0. | ||||
|  | ||||
|         # close long | ||||
|         if (action == Actions.Short.value or | ||||
|                 action == Actions.Neutral.value) and self._position == Positions.Long: | ||||
|             last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|             current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) | ||||
|             return float(np.log(current_price) - np.log(last_trade_price)) | ||||
|  | ||||
|         # close short | ||||
|         if (action == Actions.Long.value or | ||||
|                 action == Actions.Neutral.value) and self._position == Positions.Short: | ||||
|             last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|             current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) | ||||
|             return float(np.log(last_trade_price) - np.log(current_price)) | ||||
|  | ||||
|         return 0. | ||||
| @@ -1,132 +0,0 @@ | ||||
| import logging | ||||
| from typing import Any, Dict  # , Tuple | ||||
|  | ||||
| import numpy as np | ||||
| # import numpy.typing as npt | ||||
| import torch as th | ||||
| from stable_baselines3.common.monitor import Monitor | ||||
| from typing import Callable | ||||
| from stable_baselines3 import PPO | ||||
| from stable_baselines3.common.callbacks import EvalCallback | ||||
| from stable_baselines3.common.vec_env import SubprocVecEnv | ||||
| from stable_baselines3.common.utils import set_random_seed | ||||
| from freqtrade.freqai.RL.Base3ActionRLEnv import Base3ActionRLEnv, Actions, Positions | ||||
| from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel | ||||
| from freqtrade.freqai.data_kitchen import FreqaiDataKitchen | ||||
| import gym | ||||
|  | ||||
| logger = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| def make_env(env_id: str, rank: int, seed: int, train_df, price, | ||||
|              reward_params, window_size, monitor=False) -> Callable: | ||||
|     """ | ||||
|     Utility function for multiprocessed env. | ||||
|  | ||||
|     :param env_id: (str) the environment ID | ||||
|     :param num_env: (int) the number of environment you wish to have in subprocesses | ||||
|     :param seed: (int) the inital seed for RNG | ||||
|     :param rank: (int) index of the subprocess | ||||
|     :return: (Callable) | ||||
|     """ | ||||
|     def _init() -> gym.Env: | ||||
|  | ||||
|         env = MyRLEnv(df=train_df, prices=price, window_size=window_size, | ||||
|                       reward_kwargs=reward_params, id=env_id, seed=seed + rank) | ||||
|         if monitor: | ||||
|             env = Monitor(env, ".") | ||||
|         return env | ||||
|     set_random_seed(seed) | ||||
|     return _init | ||||
|  | ||||
|  | ||||
| class ReinforcementLearningPPO_multiproc(BaseReinforcementLearningModel): | ||||
|     """ | ||||
|     User created Reinforcement Learning Model prediction model. | ||||
|     """ | ||||
|  | ||||
|     def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen): | ||||
|  | ||||
|         train_df = data_dictionary["train_features"] | ||||
|         test_df = data_dictionary["test_features"] | ||||
|         eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df) | ||||
|         total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df) | ||||
|  | ||||
|         path = dk.data_path | ||||
|         eval_callback = EvalCallback(self.eval_env, best_model_save_path=f"{path}/", | ||||
|                                      log_path=f"{path}/ppo/logs/", eval_freq=int(eval_freq), | ||||
|                                      deterministic=True, render=False) | ||||
|  | ||||
|         # model arch | ||||
|         policy_kwargs = dict(activation_fn=th.nn.ReLU, | ||||
|                              net_arch=[512, 512, 512]) | ||||
|  | ||||
|         model = PPO('MlpPolicy', self.train_env, policy_kwargs=policy_kwargs, | ||||
|                     tensorboard_log=f"{path}/ppo/tensorboard/", | ||||
|                     **self.freqai_info['model_training_parameters'] | ||||
|                     ) | ||||
|  | ||||
|         model.learn( | ||||
|             total_timesteps=int(total_timesteps), | ||||
|             callback=eval_callback | ||||
|         ) | ||||
|  | ||||
|         best_model = PPO.load(dk.data_path / "best_model") | ||||
|         print('Training finished!') | ||||
|  | ||||
|         return best_model | ||||
|  | ||||
|     def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test): | ||||
|         """ | ||||
|         User overrides this in their prediction model if they are custom a MyRLEnv. Othwerwise | ||||
|         leaving this will default to Base5ActEnv | ||||
|         """ | ||||
|         train_df = data_dictionary["train_features"] | ||||
|         test_df = data_dictionary["test_features"] | ||||
|  | ||||
|         # environments | ||||
|         if not self.train_env: | ||||
|             env_id = "train_env" | ||||
|             num_cpu = int(self.freqai_info["data_kitchen_thread_count"] / 2) | ||||
|             self.train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train, | ||||
|                                             self.reward_params, self.CONV_WIDTH) for i | ||||
|                                             in range(num_cpu)]) | ||||
|  | ||||
|             eval_env_id = 'eval_env' | ||||
|             self.eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test, | ||||
|                                            self.reward_params, self.CONV_WIDTH, monitor=True) for i | ||||
|                                            in range(num_cpu)]) | ||||
|         else: | ||||
|             self.train_env.env_method('reset_env', train_df, prices_train, | ||||
|                                       self.CONV_WIDTH, self.reward_params) | ||||
|             self.eval_env.env_method('reset_env', train_df, prices_train, | ||||
|                                      self.CONV_WIDTH, self.reward_params) | ||||
|             self.train_env.env_method('reset') | ||||
|             self.eval_env.env_method('reset') | ||||
|  | ||||
|  | ||||
| class MyRLEnv(Base3ActionRLEnv): | ||||
|     """ | ||||
|     User can override any function in BaseRLEnv and gym.Env | ||||
|     """ | ||||
|  | ||||
|     def calculate_reward(self, action): | ||||
|  | ||||
|         if self._last_trade_tick is None: | ||||
|             return 0. | ||||
|  | ||||
|         # close long | ||||
|         if (action == Actions.Short.value or | ||||
|                 action == Actions.Neutral.value) and self._position == Positions.Long: | ||||
|             last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|             current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) | ||||
|             return float(np.log(current_price) - np.log(last_trade_price)) | ||||
|  | ||||
|         # close short | ||||
|         if (action == Actions.Long.value or | ||||
|                 action == Actions.Neutral.value) and self._position == Positions.Short: | ||||
|             last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|             current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) | ||||
|             return float(np.log(last_trade_price) - np.log(current_price)) | ||||
|  | ||||
|         return 0. | ||||
| @@ -1,115 +0,0 @@ | ||||
| import logging | ||||
| from typing import Any, Dict  # Optional | ||||
| import torch as th | ||||
| from stable_baselines3.common.callbacks import EvalCallback | ||||
| from stable_baselines3.common.monitor import Monitor | ||||
| from freqtrade.freqai.RL.Base5ActionRLEnv import Base5ActionRLEnv, Actions, Positions | ||||
| from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel | ||||
| from freqtrade.freqai.RL.TDQNagent import TDQN | ||||
| from stable_baselines3 import DQN | ||||
| from stable_baselines3.common.buffers import ReplayBuffer | ||||
| import numpy as np | ||||
| import gc | ||||
| from freqtrade.freqai.data_kitchen import FreqaiDataKitchen | ||||
|  | ||||
| logger = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| class ReinforcementLearningTDQN(BaseReinforcementLearningModel): | ||||
|     """ | ||||
|     User created Reinforcement Learning Model prediction model. | ||||
|     """ | ||||
|  | ||||
|     def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen): | ||||
|  | ||||
|         train_df = data_dictionary["train_features"] | ||||
|         test_df = data_dictionary["test_features"] | ||||
|         eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df) | ||||
|         total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df) | ||||
|  | ||||
|         path = dk.data_path | ||||
|         eval_callback = EvalCallback(self.eval_env, best_model_save_path=f"{path}/", | ||||
|                                      log_path=f"{path}/tdqn/logs/", eval_freq=int(eval_freq), | ||||
|                                      deterministic=True, render=False) | ||||
|  | ||||
|         # model arch | ||||
|         policy_kwargs = dict(activation_fn=th.nn.ReLU, | ||||
|                              net_arch=[256, 256, 128]) | ||||
|  | ||||
|         model = TDQN('TMultiInputPolicy', self.train_env, | ||||
|                      tensorboard_log=f"{path}/tdqn/tensorboard/", | ||||
|                      policy_kwargs=policy_kwargs, | ||||
|                      replay_buffer_class=ReplayBuffer, | ||||
|                      **self.freqai_info['model_training_parameters'] | ||||
|                      ) | ||||
|  | ||||
|         model.learn( | ||||
|             total_timesteps=int(total_timesteps), | ||||
|             callback=eval_callback | ||||
|         ) | ||||
|  | ||||
|         del model | ||||
|         best_model = DQN.load(dk.data_path / "best_model") | ||||
|  | ||||
|         print('Training finished!') | ||||
|         gc.collect() | ||||
|         return best_model | ||||
|  | ||||
|     def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test): | ||||
|         """ | ||||
|         User overrides this as shown here if they are using a custom MyRLEnv | ||||
|         """ | ||||
|         train_df = data_dictionary["train_features"] | ||||
|         test_df = data_dictionary["test_features"] | ||||
|  | ||||
|         # environments | ||||
|         if not self.train_env: | ||||
|             self.train_env = MyRLEnv(df=train_df, prices=prices_train, window_size=self.CONV_WIDTH, | ||||
|                                      reward_kwargs=self.reward_params) | ||||
|             self.eval_env = Monitor(MyRLEnv(df=test_df, prices=prices_test, | ||||
|                                     window_size=self.CONV_WIDTH, | ||||
|                                     reward_kwargs=self.reward_params), ".") | ||||
|         else: | ||||
|             self.train_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params) | ||||
|             self.eval_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params) | ||||
|             self.train_env.reset() | ||||
|             self.eval_env.reset() | ||||
|  | ||||
|  | ||||
| # User can inherit and customize 5 action environment | ||||
| class MyRLEnv(Base5ActionRLEnv): | ||||
|     """ | ||||
|     User can override any function in BaseRLEnv and gym.Env. Here the user | ||||
|     Adds 5 actions. | ||||
|     """ | ||||
|  | ||||
|     def calculate_reward(self, action): | ||||
|  | ||||
|         if self._last_trade_tick is None: | ||||
|             return 0. | ||||
|  | ||||
|         # close long | ||||
|         if action == Actions.Long_sell.value and self._position == Positions.Long: | ||||
|             last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|             current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) | ||||
|             return float(np.log(current_price) - np.log(last_trade_price)) | ||||
|  | ||||
|         if action == Actions.Long_sell.value and self._position == Positions.Long: | ||||
|             if self.close_trade_profit[-1] > self.profit_aim * self.rr: | ||||
|                 last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|                 current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) | ||||
|                 return float((np.log(current_price) - np.log(last_trade_price)) * 2) | ||||
|  | ||||
|         # close short | ||||
|         if action == Actions.Short_buy.value and self._position == Positions.Short: | ||||
|             last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|             current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) | ||||
|             return float(np.log(last_trade_price) - np.log(current_price)) | ||||
|  | ||||
|         if action == Actions.Short_buy.value and self._position == Positions.Short: | ||||
|             if self.close_trade_profit[-1] > self.profit_aim * self.rr: | ||||
|                 last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|                 current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) | ||||
|                 return float((np.log(last_trade_price) - np.log(current_price)) * 2) | ||||
|  | ||||
|         return 0. | ||||
| @@ -1,148 +0,0 @@ | ||||
| import logging | ||||
| from typing import Any, Dict  # Optional | ||||
| import torch as th | ||||
| import numpy as np | ||||
| import gym | ||||
| from typing import Callable | ||||
| from stable_baselines3.common.callbacks import EvalCallback | ||||
| # EvalCallback , StopTrainingOnNoModelImprovement, StopTrainingOnRewardThreshold | ||||
| from stable_baselines3.common.monitor import Monitor | ||||
| from stable_baselines3.common.vec_env import SubprocVecEnv | ||||
| from stable_baselines3.common.utils import set_random_seed | ||||
| from stable_baselines3 import DQN | ||||
| from freqtrade.freqai.RL.Base5ActionRLEnv import Base5ActionRLEnv, Actions, Positions | ||||
| from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel | ||||
| from freqtrade.freqai.RL.TDQNagent import TDQN | ||||
| from stable_baselines3.common.buffers import ReplayBuffer | ||||
| from freqtrade.freqai.data_kitchen import FreqaiDataKitchen | ||||
|  | ||||
| logger = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| def make_env(env_id: str, rank: int, seed: int, train_df, price, | ||||
|              reward_params, window_size, monitor=False) -> Callable: | ||||
|     """ | ||||
|     Utility function for multiprocessed env. | ||||
|  | ||||
|     :param env_id: (str) the environment ID | ||||
|     :param num_env: (int) the number of environment you wish to have in subprocesses | ||||
|     :param seed: (int) the inital seed for RNG | ||||
|     :param rank: (int) index of the subprocess | ||||
|     :return: (Callable) | ||||
|     """ | ||||
|     def _init() -> gym.Env: | ||||
|  | ||||
|         env = MyRLEnv(df=train_df, prices=price, window_size=window_size, | ||||
|                       reward_kwargs=reward_params, id=env_id, seed=seed + rank) | ||||
|         if monitor: | ||||
|             env = Monitor(env, ".") | ||||
|         return env | ||||
|     set_random_seed(seed) | ||||
|     return _init | ||||
|  | ||||
|  | ||||
| class ReinforcementLearningTDQN_multiproc(BaseReinforcementLearningModel): | ||||
|     """ | ||||
|     User created Reinforcement Learning Model prediction model. | ||||
|     """ | ||||
|  | ||||
|     def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen): | ||||
|  | ||||
|         train_df = data_dictionary["train_features"] | ||||
|         test_df = data_dictionary["test_features"] | ||||
|         eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df) | ||||
|         total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df) | ||||
|  | ||||
|         path = dk.data_path | ||||
|  | ||||
|         eval_callback = EvalCallback(self.eval_env, best_model_save_path=f"{path}/", | ||||
|                                      log_path=f"{path}/tdqn/logs/", eval_freq=int(eval_freq), | ||||
|                                      deterministic=True, render=False) | ||||
|         # model arch | ||||
|         policy_kwargs = dict(activation_fn=th.nn.ReLU, | ||||
|                              net_arch=[512, 512, 512]) | ||||
|  | ||||
|         model = TDQN('TMultiInputPolicy', self.train_env, | ||||
|                      policy_kwargs=policy_kwargs, | ||||
|                      tensorboard_log=f"{path}/tdqn/tensorboard/", | ||||
|                      replay_buffer_class=ReplayBuffer, | ||||
|                      **self.freqai_info['model_training_parameters'] | ||||
|                      ) | ||||
|  | ||||
|         model.learn( | ||||
|             total_timesteps=int(total_timesteps), | ||||
|             callback=eval_callback | ||||
|         ) | ||||
|  | ||||
|         best_model = DQN.load(dk.data_path / "best_model.zip") | ||||
|         print('Training finished!') | ||||
|  | ||||
|         return best_model | ||||
|  | ||||
|     def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test): | ||||
|         """ | ||||
|         User overrides this in their prediction model if they are custom a MyRLEnv. Othwerwise | ||||
|         leaving this will default to Base5ActEnv | ||||
|         """ | ||||
|         train_df = data_dictionary["train_features"] | ||||
|         test_df = data_dictionary["test_features"] | ||||
|  | ||||
|         # environments | ||||
|         if not self.train_env: | ||||
|             env_id = "train_env" | ||||
|             num_cpu = int(self.freqai_info["data_kitchen_thread_count"] / 2) | ||||
|             self.train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train, | ||||
|                                             self.reward_params, self.CONV_WIDTH) for i | ||||
|                                             in range(num_cpu)]) | ||||
|  | ||||
|             eval_env_id = 'eval_env' | ||||
|             self.eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test, | ||||
|                                            self.reward_params, self.CONV_WIDTH, monitor=True) for i | ||||
|                                            in range(num_cpu)]) | ||||
|         else: | ||||
|             self.train_env.env_method('reset_env', train_df, prices_train, | ||||
|                                       self.CONV_WIDTH, self.reward_params) | ||||
|             self.eval_env.env_method('reset_env', train_df, prices_train, | ||||
|                                      self.CONV_WIDTH, self.reward_params) | ||||
|             self.train_env.env_method('reset') | ||||
|             self.eval_env.env_method('reset') | ||||
|  | ||||
| # User can inherit and customize 5 action environment | ||||
|  | ||||
|  | ||||
| class MyRLEnv(Base5ActionRLEnv): | ||||
|     """ | ||||
|     User can override any function in BaseRLEnv and gym.Env. Here the user | ||||
|     Adds 5 actions. | ||||
|     """ | ||||
|  | ||||
|     def calculate_reward(self, action): | ||||
|  | ||||
|         if self._last_trade_tick is None: | ||||
|             return 0. | ||||
|  | ||||
|         # close long | ||||
|         if action == Actions.Long_sell.value and self._position == Positions.Long: | ||||
|             last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|             current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) | ||||
|             return float(np.log(current_price) - np.log(last_trade_price)) | ||||
|  | ||||
|         if action == Actions.Long_sell.value and self._position == Positions.Long: | ||||
|             if self.close_trade_profit[-1] > self.profit_aim * self.rr: | ||||
|                 last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|                 current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open) | ||||
|                 return float((np.log(current_price) - np.log(last_trade_price)) * 2) | ||||
|  | ||||
|         # close short | ||||
|         if action == Actions.Short_buy.value and self._position == Positions.Short: | ||||
|             last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|             current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) | ||||
|             return float(np.log(last_trade_price) - np.log(current_price)) | ||||
|  | ||||
|         if action == Actions.Short_buy.value and self._position == Positions.Short: | ||||
|             if self.close_trade_profit[-1] > self.profit_aim * self.rr: | ||||
|                 last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open) | ||||
|                 current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open) | ||||
|                 return float((np.log(last_trade_price) - np.log(current_price)) * 2) | ||||
|  | ||||
|         return 0. | ||||