From b26ed7dea4564d55b112cc50ce96e08983913bf2 Mon Sep 17 00:00:00 2001
From: robcaulk
Date: Tue, 23 Aug 2022 14:58:38 +0200
Subject: [PATCH] fix generic reward, add time duration to reward

---
 config_examples/config_freqai-rl.example.json |  1 +
 freqtrade/freqai/RL/Base5ActionRLEnv.py       | 27 ++++-------------
 .../RL/BaseReinforcementLearningModel.py      | 25 +++++++++-------
 .../prediction_models/ReinforcementLearner.py | 29 +++++++++++--------
 .../ReinforcementLearner_multiproc.py         |  6 ++--
 5 files changed, 43 insertions(+), 45 deletions(-)

diff --git a/config_examples/config_freqai-rl.example.json b/config_examples/config_freqai-rl.example.json
index 07ddb04d3..bb67b44b6 100644
--- a/config_examples/config_freqai-rl.example.json
+++ b/config_examples/config_freqai-rl.example.json
@@ -88,6 +88,7 @@
             "train_cycles": 10,
             "eval_cycles": 3,
             "thread_count": 4,
+            "max_trade_duration_candles": 100,
             "model_type": "PPO",
             "policy_type": "MlpPolicy",
             "model_reward_parameters": {
diff --git a/freqtrade/freqai/RL/Base5ActionRLEnv.py b/freqtrade/freqai/RL/Base5ActionRLEnv.py
index 2b1c4f975..a14111495 100644
--- a/freqtrade/freqai/RL/Base5ActionRLEnv.py
+++ b/freqtrade/freqai/RL/Base5ActionRLEnv.py
@@ -8,6 +8,7 @@ from gym import spaces
 from gym.utils import seeding
 from pandas import DataFrame
 import pandas as pd
+from abc import abstractmethod
 
 
 logger = logging.getLogger(__name__)
@@ -265,28 +266,12 @@ class Base5ActionRLEnv(gym.Env):
     def get_sharpe_ratio(self):
         return mean_over_std(self.get_portfolio_log_returns())
 
+    @abstractmethod
     def calculate_reward(self, action):
-
-        if self._last_trade_tick is None:
-            return 0.
-
-        # close long
-        if action == Actions.Long_exit.value and self._position == Positions.Long:
-            last_trade_price = self.add_entry_fee(self.prices.iloc[self._last_trade_tick].open)
-            current_price = self.add_exit_fee(self.prices.iloc[self._current_tick].open)
-            factor = 1
-            if self.close_trade_profit and self.close_trade_profit[-1] > self.profit_aim * self.rr:
-                factor = self.rl_config['model_reward_parameters'].get('win_reward_factor', 2)
-            return float((np.log(current_price) - np.log(last_trade_price)) * factor)
-
-        # close short
-        if action == Actions.Short_exit.value and self._position == Positions.Short:
-            last_trade_price = self.add_exit_fee(self.prices.iloc[self._last_trade_tick].open)
-            current_price = self.add_entry_fee(self.prices.iloc[self._current_tick].open)
-            factor = 1
-            if self.close_trade_profit and self.close_trade_profit[-1] > self.profit_aim * self.rr:
-                factor = self.rl_config['model_reward_parameters'].get('win_reward_factor', 2)
-            return float(np.log(last_trade_price) - np.log(current_price) * factor)
+        """
+        Reward is created by BaseReinforcementLearningModel and can
+        be inherited/edited by the user-made ReinforcementLearner file.
+        """
 
         return 0.
 
diff --git a/freqtrade/freqai/RL/BaseReinforcementLearningModel.py b/freqtrade/freqai/RL/BaseReinforcementLearningModel.py
index 6a15b96f9..a9a1377a8 100644
--- a/freqtrade/freqai/RL/BaseReinforcementLearningModel.py
+++ b/freqtrade/freqai/RL/BaseReinforcementLearningModel.py
@@ -270,7 +270,7 @@ def make_env(env_id: str, rank: int, seed: int, train_df, price,
 class MyRLEnv(Base5ActionRLEnv):
     """
     User can override any function in BaseRLEnv and gym.Env. Here the user
-    Adds 5 actions.
+    sets a custom reward based on profit and trade duration.
     """
 
     def calculate_reward(self, action):
@@ -278,22 +278,27 @@ class MyRLEnv(Base5ActionRLEnv):
         if self._last_trade_tick is None:
             return 0.
 
+        pnl = self.get_unrealized_profit()
+        max_trade_duration = self.rl_config['max_trade_duration_candles']
+        trade_duration = self._current_tick - self._last_trade_tick
+
+        factor = 1
+        if trade_duration <= max_trade_duration:
+            factor *= 1.5
+        elif trade_duration > max_trade_duration:
+            factor *= 0.5
+
         # close long
         if action == Actions.Long_exit.value and self._position == Positions.Long:
-            last_trade_price = self.add_entry_fee(self.prices.iloc[self._last_trade_tick].open)
-            current_price = self.add_exit_fee(self.prices.iloc[self._current_tick].open)
-            factor = 1
             if self.close_trade_profit and self.close_trade_profit[-1] > self.profit_aim * self.rr:
-                factor = self.rl_config['model_reward_parameters'].get('win_reward_factor', 2)
-            return float((np.log(current_price) - np.log(last_trade_price)) * factor)
+                factor *= self.rl_config['model_reward_parameters'].get('win_reward_factor', 2)
+            return float(pnl * factor)
 
         # close short
         if action == Actions.Short_exit.value and self._position == Positions.Short:
-            last_trade_price = self.add_exit_fee(self.prices.iloc[self._last_trade_tick].open)
-            current_price = self.add_entry_fee(self.prices.iloc[self._current_tick].open)
             factor = 1
             if self.close_trade_profit and self.close_trade_profit[-1] > self.profit_aim * self.rr:
-                factor = self.rl_config['model_reward_parameters'].get('win_reward_factor', 2)
-            return float(np.log(last_trade_price) - np.log(current_price) * factor)
+                factor *= self.rl_config['model_reward_parameters'].get('win_reward_factor', 2)
+            return float(pnl * factor)
 
         return 0.
diff --git a/freqtrade/freqai/prediction_models/ReinforcementLearner.py b/freqtrade/freqai/prediction_models/ReinforcementLearner.py
index 5f22971e1..d3e6bde7c 100644
--- a/freqtrade/freqai/prediction_models/ReinforcementLearner.py
+++ b/freqtrade/freqai/prediction_models/ReinforcementLearner.py
@@ -3,7 +3,6 @@ from typing import Any, Dict  # , Tuple
 # import numpy.typing as npt
 import torch as th
-import numpy as np
 
 from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
 from freqtrade.freqai.RL.Base5ActionRLEnv import Actions, Base5ActionRLEnv, Positions
 from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel
@@ -47,30 +46,36 @@ class ReinforcementLearner(BaseReinforcementLearningModel):
 
 class MyRLEnv(Base5ActionRLEnv):
     """
-    User can modify any part of the environment by overriding base
-    functions
+    User can override any function in BaseRLEnv and gym.Env. Here the user
+    sets a custom reward based on profit and trade duration.
     """
+
     def calculate_reward(self, action):
 
         if self._last_trade_tick is None:
             return 0.
 
+        pnl = self.get_unrealized_profit()
+        max_trade_duration = self.rl_config['max_trade_duration_candles']
+        trade_duration = self._current_tick - self._last_trade_tick
+
+        factor = 1
+        if trade_duration <= max_trade_duration:
+            factor *= 1.5
+        elif trade_duration > max_trade_duration:
+            factor *= 0.5
+
         # close long
         if action == Actions.Long_exit.value and self._position == Positions.Long:
-            last_trade_price = self.add_entry_fee(self.prices.iloc[self._last_trade_tick].open)
-            current_price = self.add_exit_fee(self.prices.iloc[self._current_tick].open)
-            factor = 1
             if self.close_trade_profit and self.close_trade_profit[-1] > self.profit_aim * self.rr:
-                factor = self.rl_config['model_reward_parameters'].get('win_reward_factor', 2)
-            return float((np.log(current_price) - np.log(last_trade_price)) * factor)
+                factor *= self.rl_config['model_reward_parameters'].get('win_reward_factor', 2)
+            return float(pnl * factor)
 
         # close short
         if action == Actions.Short_exit.value and self._position == Positions.Short:
-            last_trade_price = self.add_exit_fee(self.prices.iloc[self._last_trade_tick].open)
-            current_price = self.add_entry_fee(self.prices.iloc[self._current_tick].open)
             factor = 1
             if self.close_trade_profit and self.close_trade_profit[-1] > self.profit_aim * self.rr:
-                factor = self.rl_config['model_reward_parameters'].get('win_reward_factor', 2)
-            return float(np.log(last_trade_price) - np.log(current_price) * factor)
+                factor *= self.rl_config['model_reward_parameters'].get('win_reward_factor', 2)
+            return float(pnl * factor)
 
         return 0.
diff --git a/freqtrade/freqai/prediction_models/ReinforcementLearner_multiproc.py b/freqtrade/freqai/prediction_models/ReinforcementLearner_multiproc.py
index ee9a407c9..96d42ae66 100644
--- a/freqtrade/freqai/prediction_models/ReinforcementLearner_multiproc.py
+++ b/freqtrade/freqai/prediction_models/ReinforcementLearner_multiproc.py
@@ -62,12 +62,14 @@ class ReinforcementLearner_multiproc(BaseReinforcementLearningModel):
         env_id = "train_env"
         num_cpu = int(self.freqai_info["data_kitchen_thread_count"] / 2)
         self.train_env = SubprocVecEnv([make_env(env_id, i, 1, train_df, prices_train,
-                                                 self.reward_params, self.CONV_WIDTH, config=self.config) for i
+                                                 self.reward_params, self.CONV_WIDTH,
+                                                 config=self.config) for i
                                         in range(num_cpu)])
 
         eval_env_id = 'eval_env'
         self.eval_env = SubprocVecEnv([make_env(eval_env_id, i, 1, test_df, prices_test,
-                                                self.reward_params, self.CONV_WIDTH, monitor=True, config=self.config) for i
+                                                self.reward_params, self.CONV_WIDTH, monitor=True,
+                                                config=self.config) for i
                                        in range(num_cpu)])
 
         self.eval_callback = EvalCallback(self.eval_env, deterministic=True,
                                           render=False, eval_freq=eval_freq,
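
A minimal user-side sketch (not part of the diff above), assuming the patched Base5ActionRLEnv and the new max_trade_duration_candles key shown in config_freqai-rl.example.json: with calculate_reward() now abstract, a user-made prediction-model file supplies its own environment reward. The class below mirrors the profit/duration shaping introduced by this patch; merging the long and short exit branches and collapsing the duration scaling into one expression (the patch keeps a separate factor = 1 reset in the short branch) are simplifications for illustration only.

# Sketch only: a user-made override of the now-abstract calculate_reward(),
# mirroring the MyRLEnv added in this patch (names come from the patched files).
from freqtrade.freqai.RL.Base5ActionRLEnv import Actions, Base5ActionRLEnv, Positions


class MyRLEnv(Base5ActionRLEnv):
    """
    Reward unrealized profit on trade exit, scaled by how long the trade
    was held relative to rl_config['max_trade_duration_candles'].
    """

    def calculate_reward(self, action):

        # No trade has been opened yet: nothing to reward.
        if self._last_trade_tick is None:
            return 0.

        pnl = self.get_unrealized_profit()
        max_trade_duration = self.rl_config['max_trade_duration_candles']
        trade_duration = self._current_tick - self._last_trade_tick

        # Favor trades closed within the configured duration, penalise
        # trades held past it (1.5 / 0.5 are the values used in the patch).
        factor = 1.5 if trade_duration <= max_trade_duration else 0.5

        closing_long = (action == Actions.Long_exit.value
                        and self._position == Positions.Long)
        closing_short = (action == Actions.Short_exit.value
                         and self._position == Positions.Short)

        if closing_long or closing_short:
            # Boost wins that beat the profit target, as in the patch.
            if self.close_trade_profit and self.close_trade_profit[-1] > self.profit_aim * self.rr:
                factor *= self.rl_config['model_reward_parameters'].get('win_reward_factor', 2)
            return float(pnl * factor)

        return 0.

As in the patched freqtrade/freqai/prediction_models/ReinforcementLearner.py, the override lives at module level next to the user's ReinforcementLearner subclass, and the duration scaling only takes effect once max_trade_duration_candles is present in the RL section of the config, as added to the example config above.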