fix persist a single training environment for PPO

This commit is contained in:
sonnhfit 2022-08-19 01:49:11 +07:00 committed by robcaulk
parent f95602f6bd
commit 4baa36bdcf
3 changed files with 51 additions and 25 deletions

View File

@ -82,10 +82,6 @@
"model_training_parameters": { "model_training_parameters": {
"learning_rate": 0.00025, "learning_rate": 0.00025,
"gamma": 0.9, "gamma": 0.9,
"target_update_interval": 5000,
"buffer_size": 50000,
"exploration_initial_eps":1,
"exploration_final_eps": 0.1,
"verbose": 1 "verbose": 1
}, },
"rl_config": { "rl_config": {

View File

@ -1,13 +1,16 @@
import logging import logging
from enum import Enum from enum import Enum
# from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
import gym import gym
import numpy as np import numpy as np
import pandas as pd
from gym import spaces from gym import spaces
from gym.utils import seeding from gym.utils import seeding
from pandas import DataFrame from pandas import DataFrame
# from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -43,6 +46,9 @@ class Base3ActionRLEnv(gym.Env):
self.id = id self.id = id
self.seed(seed) self.seed(seed)
self.reset_env(df, prices, window_size, reward_kwargs, starting_point)
def reset_env(self, df, prices, window_size, reward_kwargs, starting_point=True):
self.df = df self.df = df
self.signal_features = self.df self.signal_features = self.df
self.prices = prices self.prices = prices
@ -54,7 +60,7 @@ class Base3ActionRLEnv(gym.Env):
self.fee = 0.0015 self.fee = 0.0015
# # spaces # # spaces
self.shape = (window_size, self.signal_features.shape[1]) self.shape = (window_size, self.signal_features.shape[1] + 2)
self.action_space = spaces.Discrete(len(Actions)) self.action_space = spaces.Discrete(len(Actions))
self.observation_space = spaces.Box( self.observation_space = spaces.Box(
low=-np.inf, high=np.inf, shape=self.shape, dtype=np.float32) low=-np.inf, high=np.inf, shape=self.shape, dtype=np.float32)
@ -165,7 +171,16 @@ class Base3ActionRLEnv(gym.Env):
return observation, step_reward, self._done, info return observation, step_reward, self._done, info
def _get_observation(self): def _get_observation(self):
return self.signal_features[(self._current_tick - self.window_size):self._current_tick] features_window = self.signal_features[(
self._current_tick - self.window_size):self._current_tick]
features_and_state = DataFrame(np.zeros((len(features_window), 2)),
columns=['current_profit_pct', 'position'],
index=features_window.index)
features_and_state['current_profit_pct'] = self.get_unrealized_profit()
features_and_state['position'] = self._position.value
features_and_state = pd.concat([features_window, features_and_state], axis=1)
return features_and_state
def get_unrealized_profit(self): def get_unrealized_profit(self):
@ -307,7 +322,7 @@ class Base3ActionRLEnv(gym.Env):
def prev_price(self) -> float: def prev_price(self) -> float:
return self.prices.iloc[self._current_tick - 1].open return self.prices.iloc[self._current_tick - 1].open
def sharpe_ratio(self): def sharpe_ratio(self) -> float:
if len(self.close_trade_profit) == 0: if len(self.close_trade_profit) == 0:
return 0. return 0.
returns = np.array(self.close_trade_profit) returns = np.array(self.close_trade_profit)

View File

@ -1,16 +1,17 @@
import gc
import logging import logging
from typing import Any, Dict # , Tuple from typing import Any, Dict # , Tuple
import numpy as np import numpy as np
# import numpy.typing as npt # import numpy.typing as npt
import torch as th import torch as th
from pandas import DataFrame
from stable_baselines3 import PPO from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import EvalCallback from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor from stable_baselines3.common.monitor import Monitor
from freqtrade.freqai.RL.Base3ActionRLEnv import Base3ActionRLEnv, Actions, Positions
from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel
from freqtrade.freqai.data_kitchen import FreqaiDataKitchen from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
from freqtrade.freqai.RL.Base3ActionRLEnv import Actions, Base3ActionRLEnv, Positions
from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -21,23 +22,15 @@ class ReinforcementLearningPPO(BaseReinforcementLearningModel):
User created Reinforcement Learning Model prediction model. User created Reinforcement Learning Model prediction model.
""" """
def fit_rl(self, data_dictionary: Dict[str, Any], pair: str, dk: FreqaiDataKitchen, def fit_rl(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen):
prices_train: DataFrame, prices_test: DataFrame):
train_df = data_dictionary["train_features"] train_df = data_dictionary["train_features"]
test_df = data_dictionary["test_features"] test_df = data_dictionary["test_features"]
eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df) eval_freq = self.freqai_info["rl_config"]["eval_cycles"] * len(test_df)
total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df) total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df)
# environments
train_env = MyRLEnv(df=train_df, prices=prices_train, window_size=self.CONV_WIDTH,
reward_kwargs=self.reward_params)
eval = MyRLEnv(df=test_df, prices=prices_test,
window_size=self.CONV_WIDTH, reward_kwargs=self.reward_params)
eval_env = Monitor(eval, ".")
path = dk.data_path path = dk.data_path
eval_callback = EvalCallback(eval_env, best_model_save_path=f"{path}/", eval_callback = EvalCallback(self.eval_env, best_model_save_path=f"{path}/",
log_path=f"{path}/ppo/logs/", eval_freq=int(eval_freq), log_path=f"{path}/ppo/logs/", eval_freq=int(eval_freq),
deterministic=True, render=False) deterministic=True, render=False)
@ -45,8 +38,8 @@ class ReinforcementLearningPPO(BaseReinforcementLearningModel):
policy_kwargs = dict(activation_fn=th.nn.ReLU, policy_kwargs = dict(activation_fn=th.nn.ReLU,
net_arch=[256, 256, 128]) net_arch=[256, 256, 128])
model = PPO('MlpPolicy', train_env, policy_kwargs=policy_kwargs, model = PPO('MlpPolicy', self.train_env, policy_kwargs=policy_kwargs,
tensorboard_log=f"{path}/ppo/tensorboard/", learning_rate=0.00025, tensorboard_log=f"{path}/ppo/tensorboard/",
**self.freqai_info['model_training_parameters'] **self.freqai_info['model_training_parameters']
) )
@ -55,12 +48,34 @@ class ReinforcementLearningPPO(BaseReinforcementLearningModel):
callback=eval_callback callback=eval_callback
) )
del model
best_model = PPO.load(dk.data_path / "best_model") best_model = PPO.load(dk.data_path / "best_model")
print('Training finished!') print('Training finished!')
gc.collect()
return best_model return best_model
def set_train_and_eval_environments(self, data_dictionary, prices_train, prices_test):
"""
User overrides this as shown here if they are using a custom MyRLEnv
"""
train_df = data_dictionary["train_features"]
test_df = data_dictionary["test_features"]
# environments
if not self.train_env:
self.train_env = MyRLEnv(df=train_df, prices=prices_train, window_size=self.CONV_WIDTH,
reward_kwargs=self.reward_params)
self.eval_env = Monitor(MyRLEnv(df=test_df, prices=prices_test,
window_size=self.CONV_WIDTH,
reward_kwargs=self.reward_params), ".")
else:
self.train_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params)
self.eval_env.reset_env(train_df, prices_train, self.CONV_WIDTH, self.reward_params)
self.train_env.reset()
self.eval_env.reset()
class MyRLEnv(Base3ActionRLEnv): class MyRLEnv(Base3ActionRLEnv):
""" """