persist a single training environment.

robcaulk 2022-08-18 16:07:19 +02:00
parent 5d4e5e69fe
commit f95602f6bd
6 changed files with 162 additions and 129 deletions
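Note: the changes below make the gym training and eval environments persistent attributes of the model (self.train_env / self.eval_env). They are built once on the first training cycle and re-primed with fresh data via a new reset_env() method on every later cycle, instead of being reconstructed each time. A minimal, self-contained sketch of that pattern; SketchEnv and SketchModel are hypothetical stand-ins, not the actual freqtrade classes:

class SketchEnv:
    """Hypothetical stand-in for Base5ActionRLEnv."""

    def __init__(self, df, prices, window_size, reward_kwargs):
        self.reset_env(df, prices, window_size, reward_kwargs)

    def reset_env(self, df, prices, window_size, reward_kwargs):
        # swap in the new training window without reallocating the object
        self.df = df
        self.prices = prices
        self.window_size = window_size
        self.reward_kwargs = reward_kwargs


class SketchModel:
    """Hypothetical stand-in for BaseReinforcementLearningModel."""

    def __init__(self):
        self.train_env = None  # persisted between training cycles

    def set_train_env(self, df, prices, window_size, reward_kwargs):
        if not self.train_env:
            # first cycle: construct the environment once
            self.train_env = SketchEnv(df, prices, window_size, reward_kwargs)
        else:
            # later cycles: reuse the same object, just swap the data
            self.train_env.reset_env(df, prices, window_size, reward_kwargs)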

View File

@@ -61,7 +61,7 @@
     "train_period_days": 10,
     "backtest_period_days": 2,
     "identifier": "unique-id",
-    "data_kitchen_thread_count": 4,
+    "data_kitchen_thread_count": 2,
     "feature_parameters": {
         "include_corr_pairlist": [
             "BTC/USDT",

View File

@@ -7,7 +7,7 @@ import numpy as np
 from gym import spaces
 from gym.utils import seeding
 from pandas import DataFrame
+import pandas as pd

 logger = logging.getLogger(__name__)
@@ -47,6 +47,9 @@ class Base5ActionRLEnv(gym.Env):
         self.id = id
         self.seed(seed)
+        self.reset_env(df, prices, window_size, reward_kwargs, starting_point)
+
+    def reset_env(self, df, prices, window_size, reward_kwargs, starting_point=True):
         self.df = df
         self.signal_features = self.df
         self.prices = prices
@@ -178,10 +181,15 @@ class Base5ActionRLEnv(gym.Env):
         return observation, step_reward, self._done, info

     def _get_observation(self):
-        features_and_state = self.signal_features[(
+        features_window = self.signal_features[(
             self._current_tick - self.window_size):self._current_tick]
+        features_and_state = DataFrame(np.zeros((len(features_window), 2)),
+                                       columns=['current_profit_pct', 'position'],
+                                       index=features_window.index)
         features_and_state['current_profit_pct'] = self.get_unrealized_profit()
         features_and_state['position'] = self._position.value
+        features_and_state = pd.concat([features_window, features_and_state], axis=1)
         return features_and_state

     def get_unrealized_profit(self):
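Note: with this change, _get_observation() returns the window of raw features joined column-wise with two state columns, current_profit_pct and position. A small runnable illustration of the resulting frame; the feature names and values here are made up:

import numpy as np
import pandas as pd

window_size = 3
features_window = pd.DataFrame(np.random.rand(window_size, 2),
                               columns=['feat_a', 'feat_b'])  # hypothetical features

state = pd.DataFrame(np.zeros((len(features_window), 2)),
                     columns=['current_profit_pct', 'position'],
                     index=features_window.index)
state['current_profit_pct'] = 0.01  # stand-in for get_unrealized_profit()
state['position'] = 1               # stand-in for self._position.value

observation = pd.concat([features_window, state], axis=1)
print(observation.shape)  # (3, 4): window rows by (features + 2 state columns)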

View File

@@ -8,9 +8,10 @@ from pandas import DataFrame
 from abc import abstractmethod
 from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
 from freqtrade.freqai.freqai_interface import IFreqaiModel
-from freqtrade.freqai.RL.Base3ActionRLEnv import Base3ActionRLEnv, Actions, Positions
+from freqtrade.freqai.RL.Base5ActionRLEnv import Base5ActionRLEnv, Actions, Positions
 from freqtrade.persistence import Trade
 import torch.multiprocessing
+from stable_baselines3.common.monitor import Monitor
 import torch as th

 logger = logging.getLogger(__name__)
@@ -26,6 +27,7 @@ class BaseReinforcementLearningModel(IFreqaiModel):
         super().__init__(config=kwargs['config'])
         th.set_num_threads(self.freqai_info.get('data_kitchen_thread_count', 4))
         self.reward_params = self.freqai_info['rl_config']['model_reward_parameters']
+        self.train_env: Base5ActionRLEnv = None

     def train(
         self, unfiltered_dataframe: DataFrame, pair: str, dk: FreqaiDataKitchen
@@ -65,15 +67,37 @@ class BaseReinforcementLearningModel(IFreqaiModel):
         )

         logger.info(f'Training model on {len(data_dictionary["train_features"])} data points')

-        model = self.fit_rl(data_dictionary, pair, dk, prices_train, prices_test)
+        self.set_train_and_eval_environments(data_dictionary, prices_train, prices_test)
+
+        model = self.fit_rl(data_dictionary, dk)

         logger.info(f"--------------------done training {pair}--------------------")

         return model

+    def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test):
+        """
+        User overrides this in their prediction model if they are using a custom MyRLEnv.
+        Otherwise, leaving this as-is defaults to the Base5ActionRLEnv.
+        """
+        train_df = data_dictionary["train_features"]
+        test_df = data_dictionary["test_features"]
+
+        # environments
+        if not self.train_env:
+            self.train_env = MyRLEnv(df=train_df, prices=prices_train, window_size=self.CONV_WIDTH,
+                                     reward_kwargs=self.reward_params)
+            self.eval_env = Monitor(MyRLEnv(df=test_df, prices=prices_test,
+                                            window_size=self.CONV_WIDTH,
+                                            reward_kwargs=self.reward_params), ".")
+        else:
+            self.train_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params)
+            self.eval_env.reset_env(test_df, prices_test, self.CONV_WIDTH, self.reward_params)
+            self.train_env.reset()
+            self.eval_env.reset()
+
     @abstractmethod
-    def fit_rl(self, data_dictionary: Dict[str, Any], pair: str, dk: FreqaiDataKitchen,
-               prices_train: DataFrame, prices_test: DataFrame):
+    def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen):
         """
         Agent customizations and abstract Reinforcement Learning customizations
         go in here. Abstract method, so this function must be overridden by
@@ -193,66 +217,39 @@ class BaseReinforcementLearningModel(IFreqaiModel):
         return


-class MyRLEnv(Base3ActionRLEnv):
-
-    def step(self, action):
-        self._done = False
-        self._current_tick += 1
-
-        if self._current_tick == self._end_tick:
-            self._done = True
-
-        self.update_portfolio_log_returns(action)
-
-        self._update_profit(action)
-        step_reward = self._calculate_reward(action)
-        self.total_reward += step_reward
-
-        trade_type = None
-        if self.is_tradesignal(action):  # exclude 3 case not trade
-            # Update position
-            """
-            Action: Neutral, position: Long ->  Close Long
-            Action: Neutral, position: Short -> Close Short
-
-            Action: Long, position: Neutral -> Open Long
-            Action: Long, position: Short -> Close Short and Open Long
-
-            Action: Short, position: Neutral -> Open Short
-            Action: Short, position: Long -> Close Long and Open Short
-            """
-            if action == Actions.Neutral.value:
-                self._position = Positions.Neutral
-                trade_type = "neutral"
-            elif action == Actions.Long.value:
-                self._position = Positions.Long
-                trade_type = "long"
-            elif action == Actions.Short.value:
-                self._position = Positions.Short
-                trade_type = "short"
-            else:
-                print("case not defined")
-
-            # Update last trade tick
-            self._last_trade_tick = self._current_tick
-
-            if trade_type is not None:
-                self.trade_history.append(
-                    {'price': self.current_price(), 'index': self._current_tick,
-                     'type': trade_type})
-
-        if self._total_profit < 0.2:
-            self._done = True
-
-        self._position_history.append(self._position)
-        observation = self._get_observation()
-        info = dict(
-            tick=self._current_tick,
-            total_reward=self.total_reward,
-            total_profit=self._total_profit,
-            position=self._position.value
-        )
-        self._update_history(info)
-
-        return observation, step_reward, self._done, info
+class MyRLEnv(Base5ActionRLEnv):
+    """
+    User can override any function in BaseRLEnv and gym.Env. Here the user
+    adds 5 actions.
+    """
+
+    def calculate_reward(self, action):
+
+        if self._last_trade_tick is None:
+            return 0.
+
+        # close long
+        if action == Actions.Long_sell.value and self._position == Positions.Long:
+            last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
+            current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
+            return float(np.log(current_price) - np.log(last_trade_price))
+
+        if action == Actions.Long_sell.value and self._position == Positions.Long:
+            if self.close_trade_profit[-1] > self.profit_aim * self.rr:
+                last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
+                current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
+                return float((np.log(current_price) - np.log(last_trade_price)) * 2)
+
+        # close short
+        if action == Actions.Short_buy.value and self._position == Positions.Short:
+            last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
+            current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
+            return float(np.log(last_trade_price) - np.log(current_price))
+
+        if action == Actions.Short_buy.value and self._position == Positions.Short:
+            if self.close_trade_profit[-1] > self.profit_aim * self.rr:
+                last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
+                current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
+                return float((np.log(last_trade_price) - np.log(current_price)) * 2)
+
+        return 0.
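Note: the calculate_reward() added above pays the log return of the trade being closed, with a doubled reward in the branches where the last close_trade_profit exceeds profit_aim * rr. A worked example of that arithmetic with illustrative numbers; the 0.1% fee applied by the add_buy_fee()/add_sell_fee() stand-ins is an assumption of this sketch:

import numpy as np

entry_price = 100.0 * (1 + 0.001)  # stand-in for add_buy_fee(open price)
exit_price = 103.0 * (1 - 0.001)   # stand-in for add_sell_fee(open price)

base_reward = float(np.log(exit_price) - np.log(entry_price))

profit_aim, rr = 0.02, 1.0         # hypothetical model_reward_parameters
last_close_trade_profit = 0.028
factor = 2 if last_close_trade_profit > profit_aim * rr else 1

print(round(base_reward, 5), round(base_reward * factor, 5))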

View File

@@ -3,9 +3,7 @@ from typing import Any, Dict  # , Tuple

 import numpy as np
 # import numpy.typing as npt
-# import pandas as pd
 import torch as th
-# from pandas import DataFrame
 from stable_baselines3.common.monitor import Monitor
 from typing import Callable
 from stable_baselines3 import PPO
@@ -16,7 +14,6 @@ from freqtrade.freqai.RL.Base3ActionRLEnv import Base3ActionRLEnv, Actions, Positions
 from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel
 from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
 import gym
-from pandas import DataFrame

 logger = logging.getLogger(__name__)
@@ -48,26 +45,15 @@ class ReinforcementLearningPPO_multiproc(BaseReinforcementLearningModel):
     User created Reinforcement Learning Model prediction model.
     """

-    def fit_rl(self, data_dictionary: Dict[str, Any], pair: str, dk: FreqaiDataKitchen,
-               prices_train: DataFrame, prices_test: DataFrame):
+    def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen):

         train_df = data_dictionary["train_features"]
         test_df = data_dictionary["test_features"]
         eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df)
         total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df)

-        env_id = "train_env"
-        num_cpu = int(dk.thread_count / 2)
-        train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train,
-                                   self.reward_params, self.CONV_WIDTH) for i in range(num_cpu)])
-
-        eval_env_id = 'eval_env'
-        eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test,
-                                  self.reward_params, self.CONV_WIDTH, monitor=True) for i in
-                                  range(num_cpu)])
-
         path = dk.data_path
-        eval_callback = EvalCallback(eval_env, best_model_save_path=f"{path}/",
+        eval_callback = EvalCallback(self.eval_env, best_model_save_path=f"{path}/",
                                      log_path=f"{path}/ppo/logs/", eval_freq=int(eval_freq),
                                      deterministic=True, render=False)
@@ -75,7 +61,7 @@ class ReinforcementLearningPPO_multiproc(BaseReinforcementLearningModel):
         policy_kwargs = dict(activation_fn=th.nn.ReLU,
                              net_arch=[512, 512, 512])

-        model = PPO('MlpPolicy', train_env, policy_kwargs=policy_kwargs,
+        model = PPO('MlpPolicy', self.train_env, policy_kwargs=policy_kwargs,
                     tensorboard_log=f"{path}/ppo/tensorboard/",
                     **self.freqai_info['model_training_parameters']
                     )
@@ -87,10 +73,37 @@ class ReinforcementLearningPPO_multiproc(BaseReinforcementLearningModel):

         best_model = PPO.load(dk.data_path / "best_model")

         print('Training finished!')
-        eval_env.close()

         return best_model

+    def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test):
+        """
+        User overrides this in their prediction model if they are using a custom MyRLEnv.
+        Otherwise, leaving this as-is defaults to the Base5ActionRLEnv.
+        """
+        train_df = data_dictionary["train_features"]
+        test_df = data_dictionary["test_features"]
+
+        # environments
+        if not self.train_env:
+            env_id = "train_env"
+            num_cpu = int(self.freqai_info["data_kitchen_thread_count"] / 2)
+            self.train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train,
+                                            self.reward_params, self.CONV_WIDTH) for i
+                                            in range(num_cpu)])
+
+            eval_env_id = 'eval_env'
+            self.eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test,
+                                           self.reward_params, self.CONV_WIDTH, monitor=True) for i
+                                           in range(num_cpu)])
+        else:
+            self.train_env.env_method('reset_env', train_df, prices_train,
+                                      self.CONV_WIDTH, self.reward_params)
+            self.eval_env.env_method('reset_env', test_df, prices_test,
+                                     self.CONV_WIDTH, self.reward_params)
+            self.train_env.env_method('reset')
+            self.eval_env.env_method('reset')
+

 class MyRLEnv(Base3ActionRLEnv):
     """

View File

@@ -9,8 +9,7 @@ from freqtrade.freqai.RL.TDQNagent import TDQN
 from stable_baselines3 import DQN
 from stable_baselines3.common.buffers import ReplayBuffer
 import numpy as np
-from pandas import DataFrame
+import gc
 from freqtrade.freqai.data_kitchen import FreqaiDataKitchen

 logger = logging.getLogger(__name__)
@@ -21,24 +20,15 @@ class ReinforcementLearningTDQN(BaseReinforcementLearningModel):
     User created Reinforcement Learning Model prediction model.
     """

-    def fit_rl(self, data_dictionary: Dict[str, Any], pair: str, dk: FreqaiDataKitchen,
-               prices_train: DataFrame, prices_test: DataFrame):
+    def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen):

         train_df = data_dictionary["train_features"]
         test_df = data_dictionary["test_features"]
         eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df)
         total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df)

-        # environments
-        train_env = MyRLEnv(df=train_df, prices=prices_train, window_size=self.CONV_WIDTH,
-                            reward_kwargs=self.reward_params)
-        eval = MyRLEnv(df=test_df, prices=prices_test,
-                       window_size=self.CONV_WIDTH, reward_kwargs=self.reward_params)
-        eval_env = Monitor(eval, ".")
-        eval_env.reset()
-
         path = dk.data_path
-        eval_callback = EvalCallback(eval_env, best_model_save_path=f"{path}/",
+        eval_callback = EvalCallback(self.eval_env, best_model_save_path=f"{path}/",
                                      log_path=f"{path}/tdqn/logs/", eval_freq=int(eval_freq),
                                      deterministic=True, render=False)
@@ -46,7 +36,7 @@ class ReinforcementLearningTDQN(BaseReinforcementLearningModel):
         policy_kwargs = dict(activation_fn=th.nn.ReLU,
                              net_arch=[256, 256, 128])

-        model = TDQN('TMultiInputPolicy', train_env,
+        model = TDQN('TMultiInputPolicy', self.train_env,
                      tensorboard_log=f"{path}/tdqn/tensorboard/",
                      policy_kwargs=policy_kwargs,
                      replay_buffer_class=ReplayBuffer,
@@ -58,12 +48,33 @@ class ReinforcementLearningTDQN(BaseReinforcementLearningModel):
                     callback=eval_callback
                     )

+        del model
         best_model = DQN.load(dk.data_path / "best_model")

         print('Training finished!')
+        gc.collect()

         return best_model

+    def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test):
+        """
+        User overrides this as shown here if they are using a custom MyRLEnv.
+        """
+        train_df = data_dictionary["train_features"]
+        test_df = data_dictionary["test_features"]
+
+        # environments
+        if not self.train_env:
+            self.train_env = MyRLEnv(df=train_df, prices=prices_train, window_size=self.CONV_WIDTH,
+                                     reward_kwargs=self.reward_params)
+            self.eval_env = Monitor(MyRLEnv(df=test_df, prices=prices_test,
+                                            window_size=self.CONV_WIDTH,
+                                            reward_kwargs=self.reward_params), ".")
+        else:
+            self.train_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params)
+            self.eval_env.reset_env(test_df, prices_test, self.CONV_WIDTH, self.reward_params)
+            self.train_env.reset()
+            self.eval_env.reset()
+

 # User can inherit and customize 5 action environment
 class MyRLEnv(Base5ActionRLEnv):
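Note: in this model the persisted eval environment is a Monitor-wrapped MyRLEnv, and reset_env() is called directly on the wrapper. That works because gym.Wrapper forwards unknown attribute lookups to the wrapped environment, at least in the gym versions in use at the time; treat that forwarding as an assumption of this sketch:

import gym
import numpy as np
from gym import spaces
from stable_baselines3.common.monitor import Monitor


class TinyEnv(gym.Env):
    """Made-up stand-in for MyRLEnv."""
    observation_space = spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32)
    action_space = spaces.Discrete(2)

    def reset_env(self, df):
        self.df = df

    def reset(self):
        return np.zeros(1, dtype=np.float32)

    def step(self, action):
        return np.zeros(1, dtype=np.float32), 0.0, True, {}


eval_env = Monitor(TinyEnv(), ".")
eval_env.reset_env("new-test-window")  # forwarded through Monitor to TinyEnv
eval_env.reset()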

View File

@@ -4,8 +4,8 @@ import torch as th
 import numpy as np
 import gym
 from typing import Callable
-from stable_baselines3.common.callbacks import (
-    EvalCallback, StopTrainingOnNoModelImprovement, StopTrainingOnRewardThreshold)
+from stable_baselines3.common.callbacks import EvalCallback
+# EvalCallback, StopTrainingOnNoModelImprovement, StopTrainingOnRewardThreshold
 from stable_baselines3.common.monitor import Monitor
 from stable_baselines3.common.vec_env import SubprocVecEnv
 from stable_baselines3.common.utils import set_random_seed
@@ -15,7 +15,6 @@ from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel
 from freqtrade.freqai.RL.TDQNagent import TDQN
 from stable_baselines3.common.buffers import ReplayBuffer
 from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
-from pandas import DataFrame

 logger = logging.getLogger(__name__)
@@ -47,46 +46,23 @@ class ReinforcementLearningTDQN_multiproc(BaseReinforcementLearningModel):
     User created Reinforcement Learning Model prediction model.
     """

-    def fit_rl(self, data_dictionary: Dict[str, Any], pair: str, dk: FreqaiDataKitchen,
-               prices_train: DataFrame, prices_test: DataFrame):
+    def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen):

         train_df = data_dictionary["train_features"]
         test_df = data_dictionary["test_features"]
         eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df)
         total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df)

-        env_id = "train_env"
-        num_cpu = int(dk.thread_count / 2)
-        train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train,
-                                   self.reward_params, self.CONV_WIDTH) for i in range(num_cpu)])
-
-        eval_env_id = 'eval_env'
-        eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test,
-                                  self.reward_params, self.CONV_WIDTH, monitor=True) for i in
-                                  range(num_cpu)])
-
         path = dk.data_path
-        stop_train_callback = StopTrainingOnNoModelImprovement(
-            max_no_improvement_evals=5,
-            min_evals=10,
-            verbose=2
-        )
-
-        callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-200, verbose=2)
-
-        eval_callback = EvalCallback(
-            eval_env, best_model_save_path=f"{path}/",
-            log_path=f"{path}/tdqn/logs/",
-            eval_freq=int(eval_freq),
-            deterministic=True,
-            render=True,
-            callback_after_eval=stop_train_callback,
-            callback_on_new_best=callback_on_best,
-            verbose=2
-        )
+        eval_callback = EvalCallback(self.eval_env, best_model_save_path=f"{path}/",
+                                     log_path=f"{path}/tdqn/logs/", eval_freq=int(eval_freq),
+                                     deterministic=True, render=False)

         # model arch
         policy_kwargs = dict(activation_fn=th.nn.ReLU,
                              net_arch=[512, 512, 512])

-        model = TDQN('TMultiInputPolicy', train_env,
+        model = TDQN('TMultiInputPolicy', self.train_env,
                      policy_kwargs=policy_kwargs,
                      tensorboard_log=f"{path}/tdqn/tensorboard/",
                      replay_buffer_class=ReplayBuffer,
@@ -100,12 +76,40 @@ class ReinforcementLearningTDQN_multiproc(BaseReinforcementLearningModel):

         best_model = DQN.load(dk.data_path / "best_model.zip")

         print('Training finished!')
-        eval_env.close()

         return best_model

+    def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test):
+        """
+        User overrides this in their prediction model if they are using a custom MyRLEnv.
+        Otherwise, leaving this as-is defaults to the Base5ActionRLEnv.
+        """
+        train_df = data_dictionary["train_features"]
+        test_df = data_dictionary["test_features"]
+
+        # environments
+        if not self.train_env:
+            env_id = "train_env"
+            num_cpu = int(self.freqai_info["data_kitchen_thread_count"] / 2)
+            self.train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train,
+                                            self.reward_params, self.CONV_WIDTH) for i
+                                            in range(num_cpu)])
+
+            eval_env_id = 'eval_env'
+            self.eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test,
+                                           self.reward_params, self.CONV_WIDTH, monitor=True) for i
+                                           in range(num_cpu)])
+        else:
+            self.train_env.env_method('reset_env', train_df, prices_train,
+                                      self.CONV_WIDTH, self.reward_params)
+            self.eval_env.env_method('reset_env', test_df, prices_test,
+                                     self.CONV_WIDTH, self.reward_params)
+            self.train_env.env_method('reset')
+            self.eval_env.env_method('reset')
+

 # User can inherit and customize 5 action environment
 class MyRLEnv(Base5ActionRLEnv):
     """
     User can override any function in BaseRLEnv and gym.Env. Here the user
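Note: make_env() is called in the multiprocess models above but defined outside this excerpt. A hypothetical re-creation with the same call shape, just to show how per-worker environments get built and optionally Monitor-wrapped; StubEnv is a placeholder and the real make_env may differ:

from typing import Callable

import gym
import numpy as np
from gym import spaces
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.utils import set_random_seed
from stable_baselines3.common.vec_env import SubprocVecEnv


class StubEnv(gym.Env):
    """Placeholder for the MyRLEnv actually constructed by make_env()."""
    observation_space = spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32)
    action_space = spaces.Discrete(2)

    def __init__(self, df, prices, window_size, reward_kwargs):
        self.df, self.prices = df, prices
        self.window_size, self.reward_kwargs = window_size, reward_kwargs

    def reset(self):
        return np.zeros(1, dtype=np.float32)

    def step(self, action):
        return np.zeros(1, dtype=np.float32), 0.0, True, {}


def make_env_sketch(env_id: str, rank: int, seed: int, df, prices,
                    reward_params, window_size, monitor=False) -> Callable:
    # env_id and rank could feed per-worker seeding or logging in a real factory
    def _init() -> gym.Env:
        env = StubEnv(df, prices, window_size, reward_params)
        if monitor:
            return Monitor(env, ".")
        return env
    set_random_seed(seed)
    return _init


if __name__ == "__main__":
    num_cpu = 2
    train_env = SubprocVecEnv([make_env_sketch("train_env", i, 1, None, None, {}, 10)
                               for i in range(num_cpu)])
    train_env.close()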