restructure RL so that user can customize environment

2022-08-15 10:26:44 +02:00
parent ecd1f55abc
commit 91683e1dca
13 changed files with 882 additions and 1904 deletions
@@ -0,0 +1,318 @@
 import logging
 from enum import Enum
 # from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
 import gym
 import numpy as np
 from gym import spaces
 from gym.utils import seeding
 logger = logging.getLogger(__name__)
 class Actions(Enum):
    """Actions the agent can choose; ``step`` compares the raw action to these values."""
    # The number of members also sizes the discrete action space of the environment.
    Short = 0
    Long = 1
    Neutral = 2
 class Positions(Enum):
    """Market position currently held by the environment."""
    Short = 0
    Long = 1
    Neutral = 0.5

    def opposite(self):
        """Return ``Short`` for ``Long``; every other position maps to ``Long``."""
        if self == Positions.Long:
            return Positions.Short
        return Positions.Long
 def mean_over_std(x):
    """Return mean(x) divided by the sample std (ddof=1); 0 when the std is not positive."""
    spread = np.std(x, ddof=1)
    if spread > 0:
        return np.mean(x) / spread
    return 0
 class BaseRLEnv(gym.Env):
    metadata = {'render.modes': ['human']}
    def __init__(self, df, prices, reward_kwargs, window_size=10, starting_point=True, ):
        """
        Set up the action/observation spaces and the episode bookkeeping.

        :param df: 2-dimensional feature table; ``window_size`` consecutive rows form
            one observation
        :param prices: price table; ``.iloc[tick].open`` is read when pricing trades
        :param reward_kwargs: mapping that must contain ``"rr"`` and ``"profit_aim"``
        :param window_size: number of feature rows per observation
        :param starting_point: flag consulted by ``reset`` when seeding the position history
        """
        assert df.ndim == 2
        self.seed()
        self.df = df
        self.signal_features = self.df
        self.prices = prices
        self.window_size = window_size
        self.starting_point = starting_point
        self.rr = reward_kwargs["rr"]
        self.profit_aim = reward_kwargs["profit_aim"]
        self.fee = 0.0015
        # # spaces
        self.shape = (window_size, self.signal_features.shape[1])
        self.action_space = spaces.Discrete(len(Actions))
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=self.shape, dtype=np.float32)
        # episode
        self._start_tick = self.window_size
        self._end_tick = len(self.prices) - 1
        self._done = None
        self._current_tick = None
        self._last_trade_tick = None
        self._position = Positions.Neutral
        self._position_history = None
        self.total_reward = None
        self._total_profit = None
        self._first_rendering = None
        self.history = None
        self.trade_history = []
        self.r_t_change = 0.
        self.returns_report = []
        # ``reset`` recreates these per-episode buffers; defining them here as well keeps
        # accessors such as ``sharpe_ratio`` from raising AttributeError before the
        # first reset.
        self.portfolio_log_returns = np.zeros(len(self.prices))
        self._profits = [(self._start_tick, 1)]
        self.close_trade_profit = []
    def seed(self, seed: int = 1):
        """Create the environment's random generator and return the seed used, in a list."""
        rng, used_seed = seeding.np_random(seed)
        self.np_random = rng
        return [used_seed]
    def reset(self):
        """
        Start a new episode and return its first observation.

        The position is reset to Neutral *before* the position history is seeded, so the
        history never starts with the position left over from the previous episode.
        """
        self._done = False
        self._position = Positions.Neutral
        if self.starting_point is True:
            self._position_history = (self._start_tick * [None]) + [self._position]
        else:
            self._position_history = (self.window_size * [None]) + [self._position]
        self._current_tick = self._start_tick
        self._last_trade_tick = None
        self.total_reward = 0.
        self._total_profit = 1.  # unit
        self._first_rendering = True
        self.history = {}
        self.trade_history = []
        self.portfolio_log_returns = np.zeros(len(self.prices))
        self._profits = [(self._start_tick, 1)]
        self.close_trade_profit = []
        self.r_t_change = 0.
        self.returns_report = []
        return self._get_observation()
    def step(self, action: int):
        """
        Advance the environment by one tick.

        Returns and profit are booked, and the reward is computed, against the position
        held before ``action`` is applied; the position is switched afterwards.

        :param action: integer code of an ``Actions`` member
        :return: ``(observation, step_reward, done, info)``; ``done`` is set on reaching
            the end tick or once the total profit drops below 0.2
        """
        self._done = False
        self._current_tick += 1
        if self._current_tick == self._end_tick:
            self._done = True
        self.update_portfolio_log_returns(action)
        self._update_profit(action)
        step_reward = self.calculate_reward(action)
        self.total_reward += step_reward
        trade_type = None
        if self.is_tradesignal(action):  # exclude 3 case not trade
            # Position transitions:
            #   Action: Neutral, position: Long    -> Close Long
            #   Action: Neutral, position: Short   -> Close Short
            #   Action: Long,    position: Neutral -> Open Long
            #   Action: Long,    position: Short   -> Close Short and Open Long
            #   Action: Short,   position: Neutral -> Open Short
            #   Action: Short,   position: Long    -> Close Long and Open Short
            if action == Actions.Neutral.value:
                self._position = Positions.Neutral
                trade_type = "neutral"
            elif action == Actions.Long.value:
                self._position = Positions.Long
                trade_type = "long"
            elif action == Actions.Short.value:
                self._position = Positions.Short
                trade_type = "short"
            else:
                logger.warning("case not defined: action=%s", action)
            if trade_type is not None:
                # Only a recognized action counts as a trade, so only then does the
                # last trade tick move.
                self._last_trade_tick = self._current_tick
                self.trade_history.append(
                    {'price': self.current_price(), 'index': self._current_tick,
                     'type': trade_type})
        if self._total_profit < 0.2:
            self._done = True
        self._position_history.append(self._position)
        observation = self._get_observation()
        info = dict(
            tick=self._current_tick,
            total_reward=self.total_reward,
            total_profit=self._total_profit,
            position=self._position.value
        )
        self._update_history(info)
        return observation, step_reward, self._done, info
    def _get_observation(self):
        """Return the ``window_size`` feature rows that precede the current tick."""
        window_end = self._current_tick
        window_start = window_end - self.window_size
        return self.signal_features[window_start:window_end]
    def get_unrealized_profit(self):
        """
        Fee-adjusted fractional profit of the open position relative to its entry price.

        :return: 0. when no trade has happened yet or the position is Neutral
        """
        if self._last_trade_tick is None or self._position == Positions.Neutral:
            return 0.
        if self._position == Positions.Short:
            exit_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
            entry_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
            return (entry_price - exit_price) / entry_price
        if self._position == Positions.Long:
            exit_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
            entry_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
            return (exit_price - entry_price) / entry_price
        return 0.
    def is_tradesignal(self, action: int):
        """
        Tell whether ``action`` changes the current position.

        The three non-trade cases are:
        Action: Neutral, position: Neutral -> Nothing
        Action: Long, position: Long -> Hold Long
        Action: Short, position: Short -> Hold Short
        """
        unchanged_cases = (
            (Actions.Neutral.value, Positions.Neutral),
            (Actions.Short.value, Positions.Short),
            (Actions.Long.value, Positions.Long),
        )
        return not any(action == act and self._position == pos
                       for act, pos in unchanged_cases)
    def _is_trade(self, action: Actions):
        """Tell whether ``action`` closes the currently open Long or Short position."""
        if self._position == Positions.Short:
            return action in (Actions.Long.value, Actions.Neutral.value)
        if self._position == Positions.Long:
            return action in (Actions.Short.value, Actions.Neutral.value)
        return False
    def is_hold(self, action):
        """Tell whether ``action`` keeps an already open Long or Short position."""
        if action == Actions.Short.value:
            return self._position == Positions.Short
        if action == Actions.Long.value:
            return self._position == Positions.Long
        return False
    def add_buy_fee(self, price):
        """Return ``price`` scaled up by the fee, i.e. ``price * (1 + fee)``."""
        fee_factor = 1 + self.fee
        return price * fee_factor
    def add_sell_fee(self, price):
        """Return ``price`` scaled down by the fee, i.e. ``price / (1 + fee)``."""
        fee_factor = 1 + self.fee
        return price / fee_factor
    def _update_history(self, info):
        """Append each value of ``info`` to the per-key lists kept in ``self.history``."""
        if not self.history:
            # first record of the episode: one empty list per info key
            self.history = {field: [] for field in info}
        for field in info:
            self.history[field].append(info[field])
    def get_sharpe_ratio(self):
        """Return mean over std of the portfolio log returns recorded so far."""
        log_returns = self.get_portfolio_log_returns()
        return mean_over_std(log_returns)
    def calculate_reward(self, action):
        """
        Reward for ``action``: the fee-adjusted log return realized when the action closes
        the open position, otherwise 0.

        :param action: integer code of an ``Actions`` member
        :return: float reward
        """
        if self._last_trade_tick is None:
            return 0.
        closing_long = (self._position == Positions.Long
                        and action in (Actions.Short.value, Actions.Neutral.value))
        if closing_long:
            entry_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
            exit_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
            return float(np.log(exit_price) - np.log(entry_price))
        closing_short = (self._position == Positions.Short
                         and action in (Actions.Long.value, Actions.Neutral.value))
        if closing_short:
            entry_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
            exit_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
            return float(np.log(entry_price) - np.log(exit_price))
        return 0.
    def _update_profit(self, action):
        """
        Compound the unrealized profit into the running total when ``action`` closes the
        position or the episode is done, and log it per tick and per closed trade.
        """
        if not (self._is_trade(action) or self._done):
            return
        pnl = self.get_unrealized_profit()
        if self._position in (Positions.Long, Positions.Short):
            self._total_profit += self._total_profit * pnl
            self._profits.append((self._current_tick, self._total_profit))
            self.close_trade_profit.append(pnl)
    def most_recent_return(self, action: int):
        """
        Log return of the last tick for the position currently held.

        Long positions earn on rising prices, Short positions on falling prices and a
        Neutral position yields 0. A fee is applied to the current price when ``action``
        leaves the position, and to the previous price when the position was entered on
        the previous tick.
        """
        if self._position not in (Positions.Long, Positions.Short):
            return 0
        tick = self._current_tick
        now_price = self.prices.iloc[tick].open
        before_price = self.prices.iloc[tick - 1].open
        prior_position = self._position_history[tick - 1]
        goes_neutral = action == Actions.Neutral.value
        if self._position == Positions.Long:
            if goes_neutral or action == Actions.Short.value:
                now_price = self.add_sell_fee(now_price)
            if prior_position in (Positions.Short, Positions.Neutral):
                before_price = self.add_buy_fee(before_price)
            return np.log(now_price) - np.log(before_price)
        # Short position
        if goes_neutral or action == Actions.Long.value:
            now_price = self.add_buy_fee(now_price)
        if prior_position in (Positions.Long, Positions.Neutral):
            before_price = self.add_sell_fee(before_price)
        return np.log(before_price) - np.log(now_price)
    def get_portfolio_log_returns(self):
        """Return the log returns stored for ticks 1 through the current tick, inclusive."""
        stop = self._current_tick + 1
        return self.portfolio_log_returns[1:stop]
    def update_portfolio_log_returns(self, action):
        """Store the most recent return for ``action`` at the current tick."""
        latest_return = self.most_recent_return(action)
        self.portfolio_log_returns[self._current_tick] = latest_return
    def current_price(self) -> float:
        """Return the open price at the current tick."""
        row = self.prices.iloc[self._current_tick]
        return row.open
    def prev_price(self) -> float:
        """Return the open price one tick before the current tick."""
        row = self.prices.iloc[self._current_tick - 1]
        return row.open
    def sharpe_ratio(self):
        """
        Mean over std of the closed-trade profits, both offset by 1e-9 so the ratio stays
        defined for a zero std; 0. when no trade has been closed yet.
        """
        if len(self.close_trade_profit) > 0:
            trade_returns = np.array(self.close_trade_profit)
            eps = 1e-9
            return (np.mean(trade_returns) - 0. + eps) / (np.std(trade_returns) + eps)
        return 0.
@@ -0,0 +1,230 @@
 import logging
 from typing import Any, Dict, Tuple
 import numpy as np
 import numpy.typing as npt
 import pandas as pd
 from pandas import DataFrame
 from abc import abstractmethod
 from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
 from freqtrade.freqai.freqai_interface import IFreqaiModel
 from freqtrade.freqai.RL.BaseRLEnv import BaseRLEnv, Actions, Positions
 from freqtrade.persistence import Trade
 logger = logging.getLogger(__name__)
 class BaseReinforcementLearningModel(IFreqaiModel):
    """
    User created Reinforcement Learning Model prediction model.
    """
    def train(
        self, unfiltered_dataframe: DataFrame, pair: str, dk: FreqaiDataKitchen
    ) -> Any:
        """
        Filter and normalize the training data, fit a model on it and, for a pair without
        historic predictions yet, seed them from the training features.
        :param unfiltered_dataframe: Full dataframe for the current training period
        :param pair: pair the model is trained for
        :param dk: datakitchen used to filter, split and normalize the data
        :returns:
        :model: Trained model which can be used to inference (self.predict)
        """
        logger.info(f"--------------------Starting training {pair} --------------------")
        # keep only the features requested in the configuration; NaNs are handled by dk
        features, labels = dk.filter_features(
            unfiltered_dataframe,
            dk.training_features_list,
            dk.label_list,
            training_filter=True,
        )
        data: Dict[str, Any] = dk.make_train_test_datasets(features, labels)
        dk.fit_labels()  # useless for now, but just satiating append methods
        # normalization is derived from the train dataset only
        data = dk.normalize_data(data)
        # optional additional data cleaning/analysis
        self.data_cleaning_train(dk)
        n_features = len(dk.data_dictionary["train_features"].columns)
        logger.info(f"Training model on {n_features} features")
        n_points = len(data["train_features"])
        logger.info(f"Training model on {n_points} data points")
        model = self.fit(data, pair)
        is_new_pair = pair not in self.dd.historic_predictions
        if is_new_pair:
            self.set_initial_historic_predictions(data['train_features'], model, dk, pair)
        self.dd.save_historic_predictions_to_disk()
        logger.info(f"--------------------done training {pair}--------------------")
        return model
    @abstractmethod
    def fit(self, data_dictionary: Dict[str, Any], pair: str = ''):
        """
        Hook for the agent setup and the reinforcement learning customizations.
        Abstract: the user class has to override it. ``train`` uses its return value
        as the trained model.
        :param data_dictionary: datasets produced by ``train``
        :param pair: pair being trained
        """
        return None
    def get_state_info(self, pair):
        """
        Summarize the trading state of ``pair`` from the trade database.

        :param pair: pair whose trades are inspected
        :return: ``(market_side, current_profit, total_profit)``. ``market_side`` is 0.5
            without an open trade on the pair, 1 when the open trade's enter tag contains
            'long' and 0 otherwise. ``current_profit`` is ``open_trade_value / open_rate - 1``
            of the open trade. ``total_profit`` sums ``close_profit`` over the pair's
            closed trades.
        """
        market_side = 0.5
        current_profit = 0
        open_trades = Trade.get_trades(trade_filter=Trade.is_open.is_(True))
        for trade in open_trades:
            if trade.pair != pair:
                continue
            # A missing enter tag would make the ``in`` test raise TypeError; treat it
            # like any tag without 'long'.
            enter_tag = trade.enter_tag or ''
            market_side = 1 if 'long' in enter_tag else 0
            current_profit = trade.open_trade_value / trade.open_rate - 1
        closed_trades = Trade.get_trades(
            trade_filter=[Trade.is_open.is_(False), Trade.pair == pair])
        # NOTE(review): close_profit is assumed nullable -- confirm against the Trade
        # model. Unset values are skipped instead of breaking the sum.
        total_profit = sum(
            trade.close_profit for trade in closed_trades if trade.close_profit is not None)
        return market_side, current_profit, total_profit
    def predict(
        self, unfiltered_dataframe: DataFrame, dk: FreqaiDataKitchen, first: bool = False
    ) -> Tuple[DataFrame, npt.NDArray[np.int_]]:
        """
        Filter and normalize the prediction features, then run the model over them.
        :param unfiltered_dataframe: Full dataframe for the current backtest period.
        :param dk: datakitchen holding the feature list and normalization metadata
        :param first: not used by this implementation
        :return:
        :pred_df: dataframe containing the predictions, NaNs replaced by 0
        :do_predict: ``dk.do_predict`` as left by the datakitchen after cleaning
        """
        dk.find_features(unfiltered_dataframe)
        features, _ = dk.filter_features(
            unfiltered_dataframe, dk.training_features_list, training_filter=False
        )
        features = dk.normalize_data_from_metadata(features)
        dk.data_dictionary["prediction_features"] = features
        # optional additional data cleaning/analysis
        self.data_cleaning_predict(dk, features)
        # read the features back from dk rather than reusing the local reference
        model_input = dk.data_dictionary["prediction_features"]
        predictions = self.rl_model_predict(model_input, dk, self.model)
        predictions.fillna(0, inplace=True)
        return predictions, dk.do_predict
    def rl_model_predict(self, dataframe: DataFrame,
                         dk: FreqaiDataKitchen, model: Any) -> DataFrame:
        """
        Run ``model`` over rolling windows of ``CONV_WIDTH`` rows of ``dataframe``.
        :param dataframe: features to predict on
        :param dk: datakitchen providing the label column name(s)
        :param model: object whose ``predict(observations, deterministic=True)`` returns
            the result first
        :return: frame with one row per input row holding the rolling results
        """
        # placeholder frame filled with 2; rolling over it only supplies window indices
        placeholder = pd.DataFrame(np.full((len(dataframe), 1), 2), columns=dk.label_list)

        def _predict(window):
            window_rows = dataframe.iloc[window.index]
            result, _ = model.predict(window_rows, deterministic=True)
            return result

        return placeholder.rolling(window=self.CONV_WIDTH).apply(_predict)
    def set_initial_historic_predictions(
        self, df: DataFrame, model: Any, dk: FreqaiDataKitchen, pair: str
    ) -> None:
        """
        Store the model's predictions on ``df`` as the historic predictions of ``pair``
        and add the zero-filled bookkeeping columns next to them.
        :param df: features to predict on
        :param model: trained model handed to ``rl_model_predict``
        :param dk: datakitchen providing labels and the extra return names
        :param pair: pair the predictions belong to
        """
        initial_preds = self.rl_model_predict(df, dk, model)
        initial_preds.fillna(0, inplace=True)
        self.dd.historic_predictions[pair] = initial_preds
        hist_preds_df = self.dd.historic_predictions[pair]
        for label in hist_preds_df.columns:
            # object-typed columns get no mean/std companions
            if hist_preds_df[label].dtype != object:
                hist_preds_df[f'{label}_mean'] = 0
                hist_preds_df[f'{label}_std'] = 0
        hist_preds_df['do_predict'] = 0
        di_threshold = self.freqai_info['feature_parameters'].get('DI_threshold', 0)
        if di_threshold > 0:
            hist_preds_df['DI_values'] = 0
        for return_str in dk.data['extra_returns_per_train']:
            hist_preds_df[return_str] = 0
 class MyRLEnv(BaseRLEnv):
    """Environment built on ``BaseRLEnv`` that provides its own ``step``."""

    def step(self, action):
        """
        Advance the environment by one tick.

        Returns and profit are booked, and the reward is computed, against the position
        held before ``action`` is applied; the position is switched afterwards.

        :param action: integer code of an ``Actions`` member
        :return: ``(observation, step_reward, done, info)``; ``done`` is set on reaching
            the end tick or once the total profit drops below 0.2
        """
        self._done = False
        self._current_tick += 1
        if self._current_tick == self._end_tick:
            self._done = True
        self.update_portfolio_log_returns(action)
        self._update_profit(action)
        # BaseRLEnv names the reward hook ``calculate_reward`` (no leading underscore);
        # the underscored name does not exist there and raised AttributeError.
        step_reward = self.calculate_reward(action)
        self.total_reward += step_reward
        trade_type = None
        if self.is_tradesignal(action):  # exclude 3 case not trade
            # Position transitions:
            #   Action: Neutral, position: Long    -> Close Long
            #   Action: Neutral, position: Short   -> Close Short
            #   Action: Long,    position: Neutral -> Open Long
            #   Action: Long,    position: Short   -> Close Short and Open Long
            #   Action: Short,   position: Neutral -> Open Short
            #   Action: Short,   position: Long    -> Close Long and Open Short
            if action == Actions.Neutral.value:
                self._position = Positions.Neutral
                trade_type = "neutral"
            elif action == Actions.Long.value:
                self._position = Positions.Long
                trade_type = "long"
            elif action == Actions.Short.value:
                self._position = Positions.Short
                trade_type = "short"
            else:
                logger.warning("case not defined: action=%s", action)
            if trade_type is not None:
                # Only a recognized action counts as a trade, so only then does the
                # last trade tick move.
                self._last_trade_tick = self._current_tick
                self.trade_history.append(
                    {'price': self.current_price(), 'index': self._current_tick,
                     'type': trade_type})
        if self._total_profit < 0.2:
            self._done = True
        self._position_history.append(self._position)
        observation = self._get_observation()
        info = dict(
            tick=self._current_tick,
            total_reward=self.total_reward,
            total_profit=self._total_profit,
            position=self._position.value
        )
        self._update_history(info)
        return observation, step_reward, self._done, info
@@ -6,11 +6,10 @@ import torch as th
 from stable_baselines3 import DQN
 from stable_baselines3.common.buffers import ReplayBuffer
 from stable_baselines3.common.policies import BasePolicy
-from stable_baselines3.common.torch_layers import (BaseFeaturesExtractor, CombinedExtractor,
+from stable_baselines3.common.torch_layers import (BaseFeaturesExtractor,
                                                   FlattenExtractor)
 from stable_baselines3.common.type_aliases import GymEnv, Schedule
-#from stable_baselines3.common.policies import register_policy
+from stable_baselines3.dqn.policies import (CnnPolicy, DQNPolicy, MlpPolicy,
 from stable_baselines3.dqn.policies import (CnnPolicy, DQNPolicy, MlpPolicy, MultiInputPolicy,
                                            QNetwork)
 from torch import nn
@@ -47,16 +46,17 @@ def create_mlp_(
    ]
    return modules
 class TDQNetwork(QNetwork):
    def __init__(self,
-        observation_space: gym.spaces.Space,
+                 observation_space: gym.spaces.Space,
-        action_space: gym.spaces.Space,
+                 action_space: gym.spaces.Space,
-        features_extractor: nn.Module,
+                 features_extractor: nn.Module,
-        features_dim: int,
+                 features_dim: int,
-        net_arch: Optional[List[int]] = None,
+                 net_arch: Optional[List[int]] = None,
-        activation_fn: Type[nn.Module] = nn.ReLU,
+                 activation_fn: Type[nn.Module] = nn.ReLU,
-        normalize_images: bool = True
+                 normalize_images: bool = True
-    ):
+                 ):
        super().__init__(
            observation_space=observation_space,
            action_space=action_space,
@@ -211,10 +211,3 @@ class TDQN(DQN):
            device=device,
            _init_setup_model=_init_setup_model
        )
 # try:
 #     register_policy("TMultiInputPolicy", TMultiInputPolicy)
 # except:
 #     print("already registered")
@@ -1,139 +0,0 @@
 # common library
 import gym
 import numpy as np
 from stable_baselines3 import A2C, DDPG, PPO, SAC, TD3
 from stable_baselines3.common.callbacks import (BaseCallback, CallbackList, CheckpointCallback,
                                                EvalCallback, StopTrainingOnRewardThreshold)
 from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
 from freqtrade.freqai.prediction_models.RL import config
 #from freqtrade.freqai.prediction_models.RL.RLPrediction_agent_v2 import TDQN
 from freqtrade.freqai.prediction_models.RL.RLPrediction_env import DEnv
 # from stable_baselines3.common.vec_env import DummyVecEnv
 # from meta.env_stock_trading.env_stock_trading import StockTradingEnv
 # RL models from stable-baselines
 MODELS = {"a2c": A2C, "ddpg": DDPG, "td3": TD3, "sac": SAC, "ppo": PPO}
 MODEL_KWARGS = {x: config.__dict__[f"{x.upper()}_PARAMS"] for x in MODELS.keys()}
 NOISE = {
    "normal": NormalActionNoise,
    "ornstein_uhlenbeck": OrnsteinUhlenbeckActionNoise,
 }
 class TensorboardCallback(BaseCallback):
    """
    Custom callback for plotting additional values in tensorboard.
    """
    def __init__(self, verbose=0):
        super().__init__(verbose)

    def _on_step(self) -> bool:
        """
        Record the first entry of the step's rewards under ``train/reward``.

        ``self.locals["rewards"]`` is tried first, ``self.locals["reward"]`` is the
        fallback. Only lookup failures trigger the fallback, so KeyboardInterrupt and
        SystemExit are no longer swallowed.
        :return: always True
        """
        try:
            reward = self.locals["rewards"][0]
        except (KeyError, IndexError, TypeError):
            reward = self.locals["reward"][0]
        self.logger.record(key="train/reward", value=reward)
        return True
 class RLPrediction_agent:
    """Provides implementations for DRL algorithms
    Based on:
    https://github.com/AI4Finance-Foundation/FinRL-Meta/blob/master/agents/stablebaselines3_models.py
    Attributes
    ----------
        env: gym environment class
            user-defined class
        freqai_info: dict
            configuration; train_model() reads 'model_reward_parameters' from it
    Methods
    -------
        get_model()
            setup DRL algorithms
        train_model()
            train DRL algorithms in a train dataset
            and output the trained model
    """
    def __init__(self, env, freqai_info=None):
        self.env = env
        # train_model() reads this attribute, which was never assigned before; the
        # optional argument keeps existing ``RLPrediction_agent(env)`` calls working.
        self.freqai_info = {} if freqai_info is None else freqai_info

    def get_model(
        self,
        model_name,
        policy="MlpPolicy",
        policy_kwargs=None,
        model_kwargs=None,
        reward_kwargs=None,
        verbose=1,
        seed=None
    ):
        """
        Build the stable-baselines model registered under ``model_name``.

        :raises NotImplementedError: when ``model_name`` is not a key of MODELS
        """
        if model_name not in MODELS:
            raise NotImplementedError("NotImplementedError")
        if model_kwargs is None:
            model_kwargs = MODEL_KWARGS[model_name]
        # Work on a copy: replacing the noise name with a noise object must not leak
        # into the shared MODEL_KWARGS defaults or into the caller's dict.
        model_kwargs = dict(model_kwargs)
        if "action_noise" in model_kwargs:
            n_actions = self.env.action_space.shape[-1]
            model_kwargs["action_noise"] = NOISE[model_kwargs["action_noise"]](
                mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions)
            )
        print(model_kwargs)
        # NOTE(review): model_kwargs is prepared but not forwarded to the constructor,
        # matching the previous behavior.
        model = MODELS[model_name](
            policy=policy,
            env=self.env,
            tensorboard_log=f"{config.TENSORBOARD_LOG_DIR}/{model_name}",
            verbose=verbose,
            policy_kwargs=policy_kwargs,
            seed=seed
        )
        return model

    def train_model(self, model, tb_log_name, model_kwargs, train_df, test_df, price,
                    price_test, window_size):
        """
        Train ``model`` for ``model_kwargs["total_timesteps"]`` steps with checkpointing
        and periodic evaluation on an environment built from the test data.

        :return: the trained model
        """
        reward_params = self.freqai_info['model_reward_parameters']
        # NOTE(review): train_env is constructed but not handed to the model, which keeps
        # the env it was created with in get_model().
        train_env = DEnv(df=train_df, prices=price, window_size=window_size,
                         reward_kwargs=reward_params)
        eval_env = DEnv(df=test_df, prices=price_test, window_size=window_size,
                        reward_kwargs=reward_params)
        checkpoint_callback = CheckpointCallback(save_freq=1000, save_path='./logs/')
        eval_callback = EvalCallback(eval_env, best_model_save_path='./logs/best_model',
                                     log_path='./logs/results', eval_freq=500)
        # Create the callback list
        callback = CallbackList([checkpoint_callback, eval_callback])
        model = model.learn(
            total_timesteps=model_kwargs["total_timesteps"],
            tb_log_name=tb_log_name,
            callback=callback,
        )
        return model
@@ -1,513 +0,0 @@
 import logging
 import random
 from collections import deque
 from enum import Enum
 from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
 import gym
 import matplotlib.pylab as plt
 import numpy as np
 import pandas as pd
 from gym import spaces
 from gym.utils import seeding
 logger = logging.getLogger(__name__)
 class Actions(Enum):
    """Actions the agent can choose; ``step`` compares the raw action to these values."""
    # The number of members also sizes the discrete action space of the environment.
    Short = 0
    Long = 1
    Neutral = 2
 class Positions(Enum):
    """Market position currently held by the environment."""
    Short = 0
    Long = 1
    Neutral = 0.5

    def opposite(self):
        """Return ``Short`` for ``Long``; every other position maps to ``Long``."""
        if self == Positions.Long:
            return Positions.Short
        return Positions.Long
 def mean_over_std(x):
    """Return mean(x) divided by the sample std (ddof=1); 0 when the std is not positive."""
    spread = np.std(x, ddof=1)
    if spread > 0:
        return np.mean(x) / spread
    return 0
 class DEnv(gym.Env):
    metadata = {'render.modes': ['human']}
    def __init__(self, df, prices, reward_kwargs, window_size=10, starting_point=True, ):
        """
        Set up the action/observation spaces and the episode bookkeeping.

        :param df: 2-dimensional feature table; ``window_size`` consecutive rows form
            one observation
        :param prices: price table; ``.iloc[tick].open`` is read when pricing trades
        :param reward_kwargs: mapping that must contain ``"rr"`` and ``"profit_aim"``
        :param window_size: number of feature rows per observation
        :param starting_point: flag consulted by ``reset`` when seeding the position history
        """
        assert df.ndim == 2
        self.seed()
        self.df = df
        self.signal_features = self.df
        self.prices = prices
        self.window_size = window_size
        self.starting_point = starting_point
        self.rr = reward_kwargs["rr"]
        self.profit_aim = reward_kwargs["profit_aim"]
        self.fee = 0.0015
        # # spaces
        self.shape = (window_size, self.signal_features.shape[1])
        self.action_space = spaces.Discrete(len(Actions))
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=self.shape, dtype=np.float32)
        # episode
        self._start_tick = self.window_size
        self._end_tick = len(self.prices) - 1
        self._done = None
        self._current_tick = None
        self._last_trade_tick = None
        self._position = Positions.Neutral
        self._position_history = None
        self.total_reward = None
        self._total_profit = None
        self._first_rendering = None
        self.history = None
        self.trade_history = []
        self.r_t_change = 0.
        self.returns_report = []
        # ``reset`` recreates these per-episode buffers; defining them here as well keeps
        # code that reads them from raising AttributeError before the first reset.
        self.portfolio_log_returns = np.zeros(len(self.prices))
        self._profits = [(self._start_tick, 1)]
        self.close_trade_profit = []
    def seed(self, seed=None):
        """Create the environment's random generator and return the seed used, in a list."""
        rng, used_seed = seeding.np_random(seed)
        self.np_random = rng
        return [used_seed]
    def reset(self):
        """
        Start a new episode and return its first observation.

        The position is reset to Neutral *before* the position history is seeded, so the
        history never starts with the position left over from the previous episode.
        """
        self._done = False
        self._position = Positions.Neutral
        if self.starting_point == True:  # noqa: E712 - kept to preserve truthy-int behavior
            self._position_history = (self._start_tick * [None]) + [self._position]
        else:
            self._position_history = (self.window_size * [None]) + [self._position]
        self._current_tick = self._start_tick
        self._last_trade_tick = None
        self.total_reward = 0.
        self._total_profit = 1.  # unit
        self._first_rendering = True
        self.history = {}
        self.trade_history = []
        self.portfolio_log_returns = np.zeros(len(self.prices))
        self._profits = [(self._start_tick, 1)]
        self.close_trade_profit = []
        self.r_t_change = 0.
        self.returns_report = []
        return self._get_observation()
    def step(self, action):
        """
        Advance the environment by one tick.

        Returns and profit are booked, and the reward is computed, against the position
        held before ``action`` is applied; the position is switched afterwards.

        :param action: integer code of an ``Actions`` member
        :return: ``(observation, step_reward, done, info)``; ``done`` is set on reaching
            the end tick or once the total profit drops below 0.2
        """
        self._done = False
        self._current_tick += 1
        if self._current_tick == self._end_tick:
            self._done = True
        self.update_portfolio_log_returns(action)
        self._update_profit(action)
        step_reward = self._calculate_reward(action)
        self.total_reward += step_reward
        trade_type = None
        if self.is_tradesignal(action):  # exclude 3 case not trade
            # Position transitions:
            #   Action: Neutral, position: Long    -> Close Long
            #   Action: Neutral, position: Short   -> Close Short
            #   Action: Long,    position: Neutral -> Open Long
            #   Action: Long,    position: Short   -> Close Short and Open Long
            #   Action: Short,   position: Neutral -> Open Short
            #   Action: Short,   position: Long    -> Close Long and Open Short
            if action == Actions.Neutral.value:
                self._position = Positions.Neutral
                trade_type = "neutral"
            elif action == Actions.Long.value:
                self._position = Positions.Long
                trade_type = "long"
            elif action == Actions.Short.value:
                self._position = Positions.Short
                trade_type = "short"
            else:
                logger.warning("case not defined: action=%s", action)
            if trade_type is not None:
                # Only a recognized action counts as a trade, so only then does the
                # last trade tick move.
                self._last_trade_tick = self._current_tick
                self.trade_history.append(
                    {'price': self.current_price(), 'index': self._current_tick,
                     'type': trade_type})
        if self._total_profit < 0.2:
            self._done = True
        self._position_history.append(self._position)
        observation = self._get_observation()
        info = dict(
            tick=self._current_tick,
            total_reward=self.total_reward,
            total_profit=self._total_profit,
            position=self._position.value
        )
        self._update_history(info)
        return observation, step_reward, self._done, info
    # def processState(self, state):
    #     return state.to_numpy()
    # def convert_mlp_Policy(self, obs_):
    #     pass
    def _get_observation(self):
        """Return the ``window_size`` feature rows that precede the current tick."""
        window_end = self._current_tick
        window_start = window_end - self.window_size
        return self.signal_features[window_start:window_end]
    def get_unrealized_profit(self):
        """
        Fee-adjusted fractional profit of the open position relative to its entry price.

        :return: 0. when no trade has happened yet or the position is Neutral
        """
        # identity test for None instead of ``== None``
        if self._last_trade_tick is None:
            return 0.
        if self._position == Positions.Neutral:
            return 0.
        if self._position == Positions.Short:
            current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
            last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
            return (last_trade_price - current_price) / last_trade_price
        if self._position == Positions.Long:
            current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
            last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
            return (current_price - last_trade_price) / last_trade_price
        return 0.
    def is_tradesignal(self, action):
        """
        Tell whether `action` changes the current position.

        The only combinations that are NOT trade signals are:
        Action: Neutral, position: Neutral -> Nothing
        Action: Long, position: Long -> Hold Long
        Action: Short, position: Short -> Hold Short
        """
        no_change_pairs = (
            (Actions.Neutral.value, Positions.Neutral),
            (Actions.Short.value, Positions.Short),
            (Actions.Long.value, Positions.Long),
        )
        keeps_position = any(
            action == act and self._position == pos for act, pos in no_change_pairs
        )
        return not keeps_position
    def _is_trade(self, action: Actions):
        """
        Tell whether `action` closes the position that is currently open.

        A Long is closed by a Short or Neutral action, a Short by a Long or
        Neutral action. With a Neutral position nothing can be closed.
        """
        if self._position == Positions.Long:
            return action in (Actions.Short.value, Actions.Neutral.value)
        if self._position == Positions.Short:
            return action in (Actions.Long.value, Actions.Neutral.value)
        return False
    def is_hold(self, action):
        """Tell whether `action` simply keeps the already open Short or Long position."""
        keeps_short = action == Actions.Short.value and self._position == Positions.Short
        keeps_long = action == Actions.Long.value and self._position == Positions.Long
        return keeps_short or keeps_long
    def add_buy_fee(self, price):
        return price * (1 + self.fee)
    def add_sell_fee(self, price):
        return price / (1 + self.fee)
    def _update_history(self, info):
        if not self.history:
            self.history = {key: [] for key in info.keys()}
        for key, value in info.items():
            self.history[key].append(value)
    # def render(self, mode='human'):
    #     def _plot_position(position, tick):
    #         color = None
    #         if position == Positions.Short:
    #             color = 'red'
    #         elif position == Positions.Long:
    #             color = 'green'
    #         if color:
    #             plt.scatter(tick, self.prices.loc[tick].open, color=color)
    #     if self._first_rendering:
    #         self._first_rendering = False
    #         plt.cla()
    #         plt.plot(self.prices)
    #         start_position = self._position_history[self._start_tick]
    #         _plot_position(start_position, self._start_tick)
    #     plt.cla()
    #     plt.plot(self.prices)
    #     _plot_position(self._position, self._current_tick)
    #     plt.suptitle("Total Reward: %.6f" % self.total_reward + ' ~ ' + "Total Profit: %.6f" % self._total_profit)
    #     plt.pause(0.01)
    # def render_all(self):
    #     plt.figure()
    #     window_ticks = np.arange(len(self._position_history))
    #     plt.plot(self.prices['open'], alpha=0.5)
    #     short_ticks = []
    #     long_ticks = []
    #     neutral_ticks = []
    #     for i, tick in enumerate(window_ticks):
    #         if self._position_history[i] == Positions.Short:
    #             short_ticks.append(tick - 1)
    #         elif self._position_history[i] == Positions.Long:
    #             long_ticks.append(tick - 1)
    #         elif self._position_history[i] == Positions.Neutral:
    #             neutral_ticks.append(tick - 1)
    #     plt.plot(neutral_ticks, self.prices.loc[neutral_ticks].open,
    #              'o', color='grey', ms=3, alpha=0.1)
    #     plt.plot(short_ticks, self.prices.loc[short_ticks].open,
    #              'o', color='r', ms=3, alpha=0.8)
    #     plt.plot(long_ticks, self.prices.loc[long_ticks].open,
    #              'o', color='g', ms=3, alpha=0.8)
    #     plt.suptitle("Generalising")
    #     fig = plt.gcf()
    #     fig.set_size_inches(15, 10)
    # def close_trade_report(self):
    #     small_trade = 0
    #     positive_big_trade = 0
    #     negative_big_trade = 0
    #     small_profit = 0.003
    #     for i in self.close_trade_profit:
    #         if i < small_profit and i > -small_profit:
    #             small_trade+=1
    #         elif i > small_profit:
    #             positive_big_trade += 1
    #         elif i < -small_profit:
    #             negative_big_trade += 1
    #     print(f"small trade={small_trade/len(self.close_trade_profit)}; positive_big_trade={positive_big_trade/len(self.close_trade_profit)}; negative_big_trade={negative_big_trade/len(self.close_trade_profit)}")
    # def report(self):
    #     # get total trade
    #     long_trade = 0
    #     short_trade = 0
    #     neutral_trade = 0
    #     for trade in self.trade_history:
    #         if trade['type'] == 'long':
    #             long_trade += 1
    #         elif trade['type'] == 'short':
    #             short_trade += 1
    #         else:
    #             neutral_trade += 1
    #     negative_trade = 0
    #     positive_trade = 0
    #     for tr in self.close_trade_profit:
    #         if tr < 0.:
    #             negative_trade += 1
    #         if tr > 0.:
    #             positive_trade += 1
    #     total_trade_lr = negative_trade+positive_trade
    #     total_trade = long_trade + short_trade
    #     sharp_ratio = self.sharpe_ratio()
    #     sharp_log = self.get_sharpe_ratio()
    #     from tabulate import tabulate
    #     headers = ["Performance", ""]
    #     performanceTable = [["Total Trade", "{0:.2f}".format(total_trade)],
    #                      ["Total reward", "{0:.3f}".format(self.total_reward)],
    #                      ["Start profit(unit)", "{0:.2f}".format(1.)],
    #                      ["End profit(unit)", "{0:.3f}".format(self._total_profit)],
    #                      ["Sharp ratio", "{0:.3f}".format(sharp_ratio)],
    #                      ["Sharp log", "{0:.3f}".format(sharp_log)],
    #                      # ["Sortino ratio", "{0:.2f}".format(0) + '%'],
    #                      ["winrate", "{0:.2f}".format(positive_trade*100/total_trade_lr) + '%']
    #                      ]
    #     tabulation = tabulate(performanceTable, headers, tablefmt="fancy_grid", stralign="center")
    #     print(tabulation)
    #     result = {
    #         "Start": "{0:.2f}".format(1.),
    #         "End": "{0:.2f}".format(self._total_profit),
    #         "Sharp": "{0:.3f}".format(sharp_ratio),
    #         "Winrate": "{0:.2f}".format(positive_trade*100/total_trade_lr)
    #     }
    #     return result
    # def close(self):
    #     plt.close()
    def get_sharpe_ratio(self):
        """Return mean over standard deviation of the portfolio log returns recorded so far."""
        log_returns = self.get_portfolio_log_returns()
        return mean_over_std(log_returns)
    # def save_rendering(self, filepath):
    #     plt.savefig(filepath)
    # def pause_rendering(self):
    #     plt.show()
    def _calculate_reward(self, action):
        # rw = self.transaction_profit_reward(action)
        #rw = self.reward_rr_profit_config(action)
        rw = self.profit_only_when_close_reward(action)
        #rw = self.profit_only_when_close_reward_aim(action)
        return rw
    def _update_profit(self, action):
        """
        Realise the open position's profit when a trade closes or the episode ends.

        The fractional profit is compounded into `_total_profit`, the new total
        is logged in `_profits` together with the tick, and the fractional
        profit itself is stored in `close_trade_profit`. Nothing is recorded
        while the position is Neutral.
        """
        if not (self._is_trade(action) or self._done):
            return

        pnl = self.get_unrealized_profit()
        # Long and Short are booked identically; only the sign of pnl differs.
        if self._position in (Positions.Long, Positions.Short):
            self._total_profit = self._total_profit + self._total_profit * pnl
            self._profits.append((self._current_tick, self._total_profit))
            self.close_trade_profit.append(pnl)
    def most_recent_return(self, action):
        """
        Return the log return of the open price over the last tick.

        A Long position earns log(current) - log(previous), a Short position
        the opposite, and a Neutral position returns 0. The current price is
        fee-adjusted when `action` leaves the position; the previous price is
        fee-adjusted when `_position_history` shows a different position one
        tick earlier.
        """
        if self._position not in (Positions.Long, Positions.Short):
            return 0

        tick = self._current_tick
        price_now = self.prices.iloc[tick].open
        price_before = self.prices.iloc[tick - 1].open
        held_before = self._position_history[tick - 1]

        if self._position == Positions.Long:
            if action in (Actions.Short.value, Actions.Neutral.value):
                price_now = self.add_sell_fee(price_now)
            if held_before in (Positions.Short, Positions.Neutral):
                price_before = self.add_buy_fee(price_before)
            return np.log(price_now) - np.log(price_before)

        # Short position: fees and the sign of the return are mirrored.
        if action in (Actions.Long.value, Actions.Neutral.value):
            price_now = self.add_buy_fee(price_now)
        if held_before in (Positions.Long, Positions.Neutral):
            price_before = self.add_sell_fee(price_before)
        return np.log(price_before) - np.log(price_now)
    def get_portfolio_log_returns(self):
        return self.portfolio_log_returns[1:self._current_tick + 1]
    # def get_trading_log_return(self):
    #     return self.portfolio_log_returns[self._start_tick:]
    def update_portfolio_log_returns(self, action):
        """Store the most recent log return at the current tick's slot."""
        tick_return = self.most_recent_return(action)
        self.portfolio_log_returns[self._current_tick] = tick_return
    def current_price(self) -> float:
        return self.prices.iloc[self._current_tick].open
    def prev_price(self) -> float:
        return self.prices.iloc[self._current_tick-1].open
    def sharpe_ratio(self):
        if len(self.close_trade_profit) == 0:
            return 0.
        returns = np.array(self.close_trade_profit)
        reward = (np.mean(returns) - 0. + 1e-9) / (np.std(returns) + 1e-9)
        return reward
    # def get_bnh_log_return(self):
    #     return np.diff(np.log(self.prices['open'][self._start_tick:]))
    def transaction_profit_reward(self, action):
        """
        Return a reward in [0, 1] based on the relative price change.

        When `action` closes the open position the reward is the fee-adjusted
        relative profit since `_last_trade_tick`; otherwise it is the relative
        change of the open price over the last tick, signed by the position
        (+1 Long, -1 Short, 0 Neutral). The result is clipped to [0, 1], so
        losses yield 0.
        """
        previous_open = self.prev_price()
        current_open = self.current_price()
        direction = {Positions.Long: 1, Positions.Short: -1}.get(self._position, 0)

        closing_long = (self._position == Positions.Long
                        and action in (Actions.Short.value, Actions.Neutral.value))
        closing_short = (self._position == Positions.Short
                         and action in (Actions.Long.value, Actions.Neutral.value))

        if closing_long:
            exit_price = self.add_sell_fee(current_open)
            entry_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
            raw_reward = direction * (exit_price - entry_price) / entry_price
        elif closing_short:
            exit_price = self.add_buy_fee(current_open)
            entry_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
            raw_reward = direction * (exit_price - entry_price) / entry_price
        else:
            raw_reward = direction * (current_open - previous_open) / previous_open

        return np.clip(raw_reward, 0, 1)
    def profit_only_when_close_reward_aim(self, action):
        if self._last_trade_tick == None:
            return 0.
        # close long
        if (action == Actions.Short.value or action == Actions.Neutral.value) and self._position == Positions.Long:
            last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
            current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
            return float(np.log(current_price) - np.log(last_trade_price))
        if (action == Actions.Short.value or action == Actions.Neutral.value) and self._position == Positions.Long:
            if self.close_trade_profit[-1] > self.profit_aim * self.rr:
                last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
                current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
                return float((np.log(current_price) - np.log(last_trade_price)) * 2)
        # close short
        if (action == Actions.Long.value or action == Actions.Neutral.value) and self._position == Positions.Short:
            last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
            current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
            return float(np.log(last_trade_price) - np.log(current_price))
        if (action == Actions.Long.value or action == Actions.Neutral.value) and self._position == Positions.Short:
            if self.close_trade_profit[-1] > self.profit_aim * self.rr:
                last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
                current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
                return float((np.log(last_trade_price) - np.log(current_price)) * 2)
        return 0.
    def profit_only_when_close_reward(self, action):
        if self._last_trade_tick == None:
            return 0.
        # close long
        if (action == Actions.Short.value or action == Actions.Neutral.value) and self._position == Positions.Long:
            last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
            current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
            return float(np.log(current_price) - np.log(last_trade_price))
        # close short
        if (action == Actions.Long.value or action == Actions.Neutral.value) and self._position == Positions.Short:
            last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
            current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
            return float(np.log(last_trade_price) - np.log(current_price))
        return 0.
@@ -1,671 +0,0 @@
 import logging
 import random
 from collections import deque
 from enum import Enum
 #from sklearn.decomposition import PCA, KernelPCA
 from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
 import gym
 import matplotlib.pylab as plt
 import numpy as np
 import pandas as pd
 from gym import spaces
 from gym.utils import seeding
 logger = logging.getLogger(__name__)
 # from bokeh.io import output_notebook
 # from bokeh.plotting import figure, show
 # from bokeh.models import (
 #     CustomJS,
 #     ColumnDataSource,
 #     NumeralTickFormatter,
 #     Span,
 #     HoverTool,
 #     Range1d,
 #     DatetimeTickFormatter,
 #     Scatter,
 #     Label, LabelSet
 # )
 class Actions(Enum):
    """
    Discrete actions the agent can choose; the values index the action space.

    In `DEnv.step`, when the action counts as a trade signal, `Long_buy` sets
    the position to Long, `Short_buy` sets it to Short, and `Neutral`,
    `Long_sell` and `Short_sell` all set it to Neutral.
    """
    Neutral = 0
    Long_buy = 1
    Long_sell = 2
    Short_buy = 3
    Short_sell = 4
 class Positions(Enum):
    """Market position currently held by the agent."""
    Short = 0
    Long = 1
    Neutral = 0.5

    def opposite(self):
        """Return Short for Long; every other position (Short, Neutral) maps to Long."""
        if self == Positions.Long:
            return Positions.Short
        return Positions.Long
 def mean_over_std(x):
    """Return mean(x) / sample std(x) (ddof=1), or 0 when the std is not positive."""
    spread = np.std(x, ddof=1)
    centre = np.mean(x)
    if spread > 0:
        return centre / spread
    return 0
 class DEnv(gym.Env):
    metadata = {'render.modes': ['human']}
    def __init__(self, df, prices, reward_kwargs, window_size=10, starting_point=True):
        """
        Set up the trading environment.

        :param df: 2-dimensional feature table; a window of its rows forms the observation
        :param prices: price table whose `open` column is used for all price look-ups
        :param reward_kwargs: mapping that must provide the keys "rr" and "profit_aim"
        :param window_size: number of feature rows per observation
        :param starting_point: flag read by `reset` when it pads the position history
        """
        assert df.ndim == 2
        self.seed()

        # data
        self.df = self.signal_features = df
        self.prices = prices
        self.window_size = window_size
        self.starting_point = starting_point

        # reward configuration
        self.rr = reward_kwargs["rr"]
        self.profit_aim = reward_kwargs["profit_aim"]
        self.fee = 0.0015

        # spaces
        n_features = df.shape[1]
        self.shape = (window_size, n_features)
        self.action_space = spaces.Discrete(len(Actions))
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=self.shape, dtype=np.float32)

        # episode state; most of it is filled in by `reset`
        self._start_tick = window_size
        self._end_tick = len(prices) - 1
        self._done = self._current_tick = self._last_trade_tick = None
        self._position = Positions.Neutral
        self._position_history = None
        self.total_reward = self._total_profit = None
        self._first_rendering = None
        self.history = None
        self.trade_history = []
        self.r_t_change = 0.
        self.returns_report = []
    def seed(self, seed=None):
        """Create the environment's random generator and return the seed used, wrapped in a list."""
        rng, used_seed = seeding.np_random(seed)
        self.np_random = rng
        return [used_seed]
    def reset(self):
        """
        Start a new episode and return the first observation.

        Re-initialises all per-episode state: tick counters, position, reward
        and profit totals, and the history buffers.
        """
        self._done = False

        # Go flat BEFORE seeding the position history. Previously the history
        # was built first, so the position left over from the previous episode
        # ended up as the first recorded entry of the new one.
        self._position = Positions.Neutral
        padding = self._start_tick if self.starting_point else self.window_size
        self._position_history = (padding * [None]) + [self._position]

        self._current_tick = self._start_tick
        self._last_trade_tick = None

        self.total_reward = 0.
        self._total_profit = 1.  # unit
        self._first_rendering = True

        self.history = {}
        self.trade_history = []
        self.portfolio_log_returns = np.zeros(len(self.prices))
        self._profits = [(self._start_tick, 1)]
        self.close_trade_profit = []
        self.r_t_change = 0.
        self.returns_report = []

        return self._get_observation()
    def step(self, action):
        """
        Advance one tick, apply `action` and return the transition.

        :param action: integer value of one of the `Actions` members
        :return: tuple (observation, step_reward, done, info) where `info`
            holds the tick, total reward, total profit and position value

        The episode ends when the last tick is reached or when the total
        profit drops below 0.2.
        """
        self._done = False
        self._current_tick += 1
        if self._current_tick == self._end_tick:
            self._done = True

        # Returns, profit and reward are computed against the position held
        # BEFORE this action is applied.
        self.update_portfolio_log_returns(action)
        self._update_profit(action)
        step_reward = self._calculate_reward(action)
        self.total_reward += step_reward

        if self.is_tradesignal(action):
            # Update position:
            #   Neutral / Long_sell / Short_sell -> Neutral
            #   Long_buy -> Long, Short_buy -> Short
            trade_type = None
            if action == Actions.Neutral.value:
                self._position = Positions.Neutral
                trade_type = "neutral"
            elif action == Actions.Long_buy.value:
                self._position = Positions.Long
                trade_type = "long"
            elif action == Actions.Short_buy.value:
                self._position = Positions.Short
                trade_type = "short"
            elif action == Actions.Long_sell.value:
                self._position = Positions.Neutral
                trade_type = "neutral"
            elif action == Actions.Short_sell.value:
                self._position = Positions.Neutral
                trade_type = "neutral"
            else:
                print("case not defined")

            # Update last trade tick
            self._last_trade_tick = self._current_tick

            if trade_type is not None:
                self.trade_history.append(
                    {'price': self.current_price(), 'index': self._current_tick, 'type': trade_type})

        if self._total_profit < 0.2:
            self._done = True

        self._position_history.append(self._position)
        observation = self._get_observation()
        info = dict(
            tick=self._current_tick,
            total_reward=self.total_reward,
            total_profit=self._total_profit,
            position=self._position.value,
        )
        self._update_history(info)

        return observation, step_reward, self._done, info
    # def processState(self, state):
    #     return state.to_numpy()
    # def convert_mlp_Policy(self, obs_):
    #     pass
    def _get_observation(self):
        return self.signal_features[(self._current_tick - self.window_size):self._current_tick]
    def get_unrealized_profit(self):
        if self._last_trade_tick == None:
            return 0.
        if self._position == Positions.Neutral:
            return 0.
        elif self._position == Positions.Short:
            current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
            last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
            return  (last_trade_price - current_price)/last_trade_price
        elif self._position == Positions.Long:
            current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
            last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
            return (current_price - last_trade_price)/last_trade_price
        else:
            return 0.
    def is_tradesignal(self, action):
        """
        Tell whether `action` is treated as a trade signal.

        Not a trade signal:
        Action: Neutral, position: Neutral -> Nothing
        Action: any buy/sell action, position: Long or Short -> position is kept
        Everything else counts as a trade signal.
        """
        buy_sell_actions = (
            Actions.Short_buy.value,
            Actions.Short_sell.value,
            Actions.Long_buy.value,
            Actions.Long_sell.value,
        )
        if self._position == Positions.Neutral:
            ignored = action == Actions.Neutral.value
        elif self._position in (Positions.Short, Positions.Long):
            ignored = action in buy_sell_actions
        else:
            ignored = False
        return not ignored
    def _is_trade(self, action: Actions):
        """
        Tell whether `action` closes the position that is currently open.

        A Short is closed by `Long_buy` or `Neutral`, a Long by `Short_buy` or
        `Neutral` (all compared by integer value).
        """
        return ((action == Actions.Long_buy.value and self._position == Positions.Short) or
        (action == Actions.Short_buy.value and self._position == Positions.Long) or
        (action == Actions.Neutral.value and self._position == Positions.Long) or
        (action == Actions.Neutral.value and self._position == Positions.Short) or
        # The members are now reached through the class. The previous
        # `Actions.Neutral.Short_sell` spelling went through another member,
        # which raises AttributeError on Python 3.11.
        # NOTE(review): these two clauses compare against the enum member, not
        # its `.value`, so they never match an integer action. That is kept
        # as-is because `is_tradesignal` does not treat these combinations as
        # trades - confirm the intended behaviour before adding `.value`.
        (action == Actions.Short_sell and self._position == Positions.Long) or
        (action == Actions.Long_sell and self._position == Positions.Short)
        )
    def is_hold(self, action):
        """
        Tell whether `action` keeps the already open Short or Long position.

        This enum has no `Short` / `Long` members, so the previous
        `Actions.Short.value` / `Actions.Long.value` look-ups raised
        AttributeError on every call. They are mapped to `Short_buy` /
        `Long_buy`, the same mapping `most_recent_return` uses.
        """
        keeps_short = action == Actions.Short_buy.value and self._position == Positions.Short
        keeps_long = action == Actions.Long_buy.value and self._position == Positions.Long
        return keeps_short or keeps_long
    def add_buy_fee(self, price):
        return price * (1 + self.fee)
    def add_sell_fee(self, price):
        return price / (1 + self.fee)
    def _update_history(self, info):
        if not self.history:
            self.history = {key: [] for key in info.keys()}
        for key, value in info.items():
            self.history[key].append(value)
    # def render(self, mode='human'):
    #     def _plot_position(position, tick):
    #         color = None
    #         if position == Positions.Short:
    #             color = 'red'
    #         elif position == Positions.Long:
    #             color = 'green'
    #         if color:
    #             plt.scatter(tick, self.prices.loc[tick].open, color=color)
    #     if self._first_rendering:
    #         self._first_rendering = False
    #         plt.cla()
    #         plt.plot(self.prices)
    #         start_position = self._position_history[self._start_tick]
    #         _plot_position(start_position, self._start_tick)
    #     plt.cla()
    #     plt.plot(self.prices)
    #     _plot_position(self._position, self._current_tick)
    #     plt.suptitle("Total Reward: %.6f" % self.total_reward + ' ~ ' + "Total Profit: %.6f" % self._total_profit)
    #     plt.pause(0.01)
    # def render_all(self):
    #     plt.figure()
    #     window_ticks = np.arange(len(self._position_history))
    #     plt.plot(self.prices['open'], alpha=0.5)
    #     short_ticks = []
    #     long_ticks = []
    #     neutral_ticks = []
    #     for i, tick in enumerate(window_ticks):
    #         if self._position_history[i] == Positions.Short:
    #             short_ticks.append(tick - 1)
    #         elif self._position_history[i] == Positions.Long:
    #             long_ticks.append(tick - 1)
    #         elif self._position_history[i] == Positions.Neutral:
    #             neutral_ticks.append(tick - 1)
    #     plt.plot(neutral_ticks, self.prices.loc[neutral_ticks].open,
    #              'o', color='grey', ms=3, alpha=0.1)
    #     plt.plot(short_ticks, self.prices.loc[short_ticks].open,
    #              'o', color='r', ms=3, alpha=0.8)
    #     plt.plot(long_ticks, self.prices.loc[long_ticks].open,
    #              'o', color='g', ms=3, alpha=0.8)
    #     plt.suptitle("Generalising")
    #     fig = plt.gcf()
    #     fig.set_size_inches(15, 10)
    # def close_trade_report(self):
    #     small_trade = 0
    #     positive_big_trade = 0
    #     negative_big_trade = 0
    #     small_profit = 0.003
    #     for i in self.close_trade_profit:
    #         if i < small_profit and i > -small_profit:
    #             small_trade+=1
    #         elif i > small_profit:
    #             positive_big_trade += 1
    #         elif i < -small_profit:
    #             negative_big_trade += 1
    #     print(f"small trade={small_trade/len(self.close_trade_profit)}; positive_big_trade={positive_big_trade/len(self.close_trade_profit)}; negative_big_trade={negative_big_trade/len(self.close_trade_profit)}")
    # def report(self):
    #     # get total trade
    #     long_trade = 0
    #     short_trade = 0
    #     neutral_trade = 0
    #     for trade in self.trade_history:
    #         if trade['type'] == 'long':
    #             long_trade += 1
    #         elif trade['type'] == 'short':
    #             short_trade += 1
    #         else:
    #             neutral_trade += 1
    #     negative_trade = 0
    #     positive_trade = 0
    #     for tr in self.close_trade_profit:
    #         if tr < 0.:
    #             negative_trade += 1
    #         if tr > 0.:
    #             positive_trade += 1
    #     total_trade_lr = negative_trade+positive_trade
    #     total_trade = long_trade + short_trade
    #     sharp_ratio = self.sharpe_ratio()
    #     sharp_log = self.get_sharpe_ratio()
    #     from tabulate import tabulate
    #     headers = ["Performance", ""]
    #     performanceTable = [["Total Trade", "{0:.2f}".format(total_trade)],
    #                      ["Total reward", "{0:.3f}".format(self.total_reward)],
    #                      ["Start profit(unit)", "{0:.2f}".format(1.)],
    #                      ["End profit(unit)", "{0:.3f}".format(self._total_profit)],
    #                      ["Sharp ratio", "{0:.3f}".format(sharp_ratio)],
    #                      ["Sharp log", "{0:.3f}".format(sharp_log)],
    #                      # ["Sortino ratio", "{0:.2f}".format(0) + '%'],
    #                      ["winrate", "{0:.2f}".format(positive_trade*100/total_trade_lr) + '%']
    #                      ]
    #     tabulation = tabulate(performanceTable, headers, tablefmt="fancy_grid", stralign="center")
    #     print(tabulation)
    #     result = {
    #         "Start": "{0:.2f}".format(1.),
    #         "End": "{0:.2f}".format(self._total_profit),
    #         "Sharp": "{0:.3f}".format(sharp_ratio),
    #         "Winrate": "{0:.2f}".format(positive_trade*100/total_trade_lr)
    #     }
    #     return result
    # def close(self):
    #     plt.close()
    def get_sharpe_ratio(self):
        """Return mean over standard deviation of the portfolio log returns recorded so far."""
        log_returns = self.get_portfolio_log_returns()
        return mean_over_std(log_returns)
    # def save_rendering(self, filepath):
    #     plt.savefig(filepath)
    # def pause_rendering(self):
    #     plt.show()
    def _calculate_reward(self, action):
        """
        Return the step reward for `action`.

        Delegates to `profit_only_when_close_reward_aim`; swap the call below to
        use another scheme such as `transaction_profit_reward` or
        `profit_only_when_close_reward`.
        """
        return self.profit_only_when_close_reward_aim(action)
    def _update_profit(self, action):
        """
        Realise the open position's profit when a trade closes or the episode ends.

        The fractional profit is compounded into `_total_profit`, the new total
        is logged in `_profits` together with the tick, and the fractional
        profit itself is stored in `close_trade_profit`. Nothing is recorded
        while the position is Neutral.
        """
        if not (self._is_trade(action) or self._done):
            return

        pnl = self.get_unrealized_profit()
        # Long and Short are booked identically; only the sign of pnl differs.
        if self._position in (Positions.Long, Positions.Short):
            self._total_profit = self._total_profit + self._total_profit * pnl
            self._profits.append((self._current_tick, self._total_profit))
            self.close_trade_profit.append(pnl)
    def most_recent_return(self, action):
        """
        Return the log return of the open price over the last tick.

        A Long position earns log(current) - log(previous), a Short position
        the opposite, and a Neutral position returns 0. The current price is
        fee-adjusted when `action` is `Neutral` or the opposite side's buy
        action; the previous price is fee-adjusted when `_position_history`
        shows a different position one tick earlier.
        """
        if self._position not in (Positions.Long, Positions.Short):
            return 0

        tick = self._current_tick
        price_now = self.prices.iloc[tick].open
        price_before = self.prices.iloc[tick - 1].open
        held_before = self._position_history[tick - 1]

        if self._position == Positions.Long:
            if action in (Actions.Short_buy.value, Actions.Neutral.value):
                price_now = self.add_sell_fee(price_now)
            if held_before in (Positions.Short, Positions.Neutral):
                price_before = self.add_buy_fee(price_before)
            return np.log(price_now) - np.log(price_before)

        # Short position: fees and the sign of the return are mirrored.
        if action in (Actions.Long_buy.value, Actions.Neutral.value):
            price_now = self.add_buy_fee(price_now)
        if held_before in (Positions.Long, Positions.Neutral):
            price_before = self.add_sell_fee(price_before)
        return np.log(price_before) - np.log(price_now)
    def get_portfolio_log_returns(self):
        return self.portfolio_log_returns[1:self._current_tick + 1]
    def get_trading_log_return(self):
        return self.portfolio_log_returns[self._start_tick:]
    def update_portfolio_log_returns(self, action):
        """Store the most recent log return at the current tick's slot."""
        tick_return = self.most_recent_return(action)
        self.portfolio_log_returns[self._current_tick] = tick_return
    def current_price(self) -> float:
        return self.prices.iloc[self._current_tick].open
    def prev_price(self) -> float:
        return self.prices.iloc[self._current_tick-1].open
    def sharpe_ratio(self):
        if len(self.close_trade_profit) == 0:
            return 0.
        returns = np.array(self.close_trade_profit)
        reward = (np.mean(returns) - 0. + 1e-9) / (np.std(returns) + 1e-9)
        return reward
    def get_bnh_log_return(self):
        return np.diff(np.log(self.prices['open'][self._start_tick:]))
    def transaction_profit_reward(self, action):
        """
        Reward based on the relative price change in the direction of the position.

        When ``action`` closes the open position, the change is measured from
        the last trade tick to now, with fees applied to both prices. Otherwise
        it is the one-tick change of the open price. The result is clipped to
        the range [0, 1], so negative rewards are returned as 0.
        """
        previous_open = self.prev_price()
        current_open = self.current_price()

        # +1 while long, -1 while short, 0 otherwise
        if self._position == Positions.Long:
            direction = 1
        elif self._position == Positions.Short:
            direction = -1
        else:
            direction = 0

        # close long
        exits_long = action == Actions.Short.value or action == Actions.Neutral.value
        if exits_long and self._position == Positions.Long:
            exit_price = self.add_sell_fee(self.current_price())
            entry_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
            return np.clip(direction*(exit_price - entry_price)/entry_price, 0, 1)

        # close short
        exits_short = action == Actions.Long.value or action == Actions.Neutral.value
        if exits_short and self._position == Positions.Short:
            exit_price = self.add_buy_fee(self.current_price())
            entry_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
            return np.clip(direction*(exit_price - entry_price)/entry_price, 0, 1)

        # position is held (or there is none): one-tick relative change
        return np.clip(direction*(current_open - previous_open)/previous_open, 0, 1)
    def profit_only_when_close_reward(self, action):
        """
        Reward the log return of a trade only on the step that closes it.

        :param action: action value chosen by the agent for this step
        :return: fee-adjusted log return when ``action`` closes the open
            position (Long_sell while long, Short_buy while short), 0. otherwise
            and 0. when no trade has been opened yet
        """
        # identity check: `is None` is the correct test for the None singleton
        if self._last_trade_tick is None:
            return 0.
        # close long
        if action == Actions.Long_sell.value and self._position == Positions.Long:
            last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
            current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
            return float(np.log(current_price) - np.log(last_trade_price))
        # close short
        if action == Actions.Short_buy.value and self._position == Positions.Short:
            last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
            current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
            return float(np.log(last_trade_price) - np.log(current_price))
        return 0.
    def profit_only_when_close_reward_aim(self, action):
        """
        Reward the log return of a closed trade, doubled when the profit target is beaten.

        The reward is non-zero only on the step whose ``action`` closes the
        open position (Long_sell while long, Short_buy while short). It is the
        fee-adjusted log return between the last trade tick and the current
        tick. If the most recent entry of ``close_trade_profit`` exceeds
        ``profit_aim * rr`` the reward is multiplied by 2.

        The doubling branches used to sit behind an identical condition that
        had already returned, so they could never run.

        :param action: action value chosen by the agent for this step
        :return: reward as a float; 0. when nothing is closed or no trade exists
        """
        if self._last_trade_tick is None:
            return 0.

        closes_long = action == Actions.Long_sell.value and self._position == Positions.Long
        closes_short = (not closes_long and action == Actions.Short_buy.value
                        and self._position == Positions.Short)
        if not (closes_long or closes_short):
            return 0.

        entry_open = self.prices.iloc[self._last_trade_tick].open
        exit_open = self.prices.iloc[self._current_tick].open
        if closes_long:
            # bought at entry, sold now
            reward = np.log(self.add_sell_fee(exit_open)) - np.log(self.add_buy_fee(entry_open))
        else:
            # sold at entry, bought back now
            reward = np.log(self.add_sell_fee(entry_open)) - np.log(self.add_buy_fee(exit_open))

        # guard against an empty profit history before looking at the last entry
        if (len(self.close_trade_profit) > 0
                and self.close_trade_profit[-1] > self.profit_aim * self.rr):
            reward = reward * 2
        return float(reward)
    def reward_rr_profit_config(self, action):
        """
        Grade the last closed trade against the configured risk/reward target.

        The last entry of ``close_trade_profit`` is compared with
        ``target = profit_aim * rr`` and mapped to a raw grade that depends on
        the current position and on ``action``:

        * profit above ``target``            -> large positive grade
        * profit between 0.01 and ``target`` -> -1
        * profit below ``-target``           -> largest penalty
        * any other negative profit          -> smaller penalty

        Changes from the earlier version: the ``rw =- -25`` typo (which
        produced +25) is now -25; the "below -target" case is tested before
        the generic "< 0" case so it can be reached; the Short branch uses the
        same small-win bounds as the Long branch.

        NOTE(review): the final ``np.clip(rw, 0, 1)`` is kept from the original,
        so every penalty is returned as 0 and every positive grade as 1.
        Confirm whether the clip is intended.

        :param action: action value chosen by the agent for this step
        :return: graded reward clipped to [0, 1]
        """
        rw = 0.
        if len(self.close_trade_profit) > 0:
            last_profit = self.close_trade_profit[-1]
            target = self.profit_aim * self.rr

            # per action: (above target, small win, loss, loss beyond -target)
            if self._position == Positions.Long:
                grade_table = (
                    (Actions.Short_buy.value, (15, -1, -10, -15)),
                    (Actions.Long_sell.value, (20, -1, -15, -25)),
                )
            elif self._position == Positions.Short:
                grade_table = (
                    (Actions.Long_buy.value, (15, -1, -10, -25)),
                    (Actions.Short_sell.value, (20, -1, -15, -25)),
                )
            else:
                grade_table = ()

            # Neutral and any unlisted action keep rw at 0
            for action_value, (win, small_win, loss, heavy_loss) in grade_table:
                if action != action_value:
                    continue
                if last_profit > target:
                    rw = win
                elif 0.01 < last_profit < target:
                    rw = small_win
                elif last_profit < -target:
                    rw = heavy_loss
                elif last_profit < 0:
                    rw = loss
        return np.clip(rw, 0, 1)
@@ -1,37 +0,0 @@
 # Output directory names
 DATA_SAVE_DIR = "datasets"
 TRAINED_MODEL_DIR = "trained_models"
 TENSORBOARD_LOG_DIR = "tensorboard_log"
 RESULTS_DIR = "results"
 # Hyperparameter sets, one dictionary per agent type
 A2C_PARAMS = dict(n_steps=5, ent_coef=0.01, learning_rate=0.0007)
 PPO_PARAMS = dict(
    n_steps=2048,
    ent_coef=0.01,
    learning_rate=0.00025,
    batch_size=64,
 )
 DDPG_PARAMS = dict(batch_size=128, buffer_size=50000, learning_rate=0.001)
 TD3_PARAMS = dict(
    batch_size=100,
    buffer_size=1000000,
    learning_rate=0.001,
 )
 SAC_PARAMS = dict(
    batch_size=64,
    buffer_size=100000,
    learning_rate=0.0001,
    learning_starts=100,
    ent_coef="auto_0.1",
 )
 ERL_PARAMS = dict(
    learning_rate=3e-5,
    batch_size=2048,
    gamma=0.985,
    seed=312,
    net_dimension=512,
    target_step=5000,
    eval_gap=30,
 )
 RLlib_PARAMS = dict(lr=5e-5, train_batch_size=500, gamma=0.99)
@@ -1,253 +0,0 @@
 import logging
 from typing import Any, Dict, Tuple
 #from matplotlib.colors import DivergingNorm
 from pandas import DataFrame
 import pandas as pd
 from freqtrade.exceptions import OperationalException
 from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
 import tensorflow as tf
 from freqtrade.freqai.prediction_models.BaseTensorFlowModel import BaseTensorFlowModel
 from freqtrade.freqai.freqai_interface import IFreqaiModel
 from tensorflow.keras.layers import Input, Conv1D, Dense, MaxPooling1D, Flatten, Dropout
 from tensorflow.keras.models import Model
 import numpy as np
 import copy
 from keras.layers import *
 import random
 # Module-level logger named after this module.
 logger = logging.getLogger(__name__)
 # tf.config.run_functions_eagerly(True)
 # tf.data.experimental.enable_debug_mode()
 import os
 # Process-wide environment settings.
 # NOTE(review): presumably these lower TensorFlow's native log verbosity and
 # hide CUDA devices; they are assigned after `tensorflow` has already been
 # imported in this module - confirm they still take effect at this point.
 os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
 os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
 # Module constants; neither is referenced by the code visible in this module.
 MAX_EPOCHS = 10
 LOOKBACK = 8
 class RLPredictionModel_v2(IFreqaiModel):
    """
    User created prediction model. The class overrides fit(), predict() and
    set_initial_historic_predictions().

    NOTE(review): fit() currently ends with `return model` although no `model`
    is ever assigned, so calling it raises NameError. `self.CONV_WIDTH` and
    `self.model` are read but not set anywhere in this class - presumably they
    come from the base class; confirm.
    """
    def fit(self, data_dictionary: Dict, pair) -> Any:
        """
        User sets up the training and test data to fit their desired model here
        :params:
        :data_dictionary: the dictionary constructed by DataHandler to hold
        all the training and test data/labels.
        :pair: not used by the active code in this method
        :raises OperationalException: if more than one label column is present
        """
        train_df = data_dictionary["train_features"]
        train_labels = data_dictionary["train_labels"]
        test_df = data_dictionary["test_features"]
        test_labels = data_dictionary["test_labels"]
        n_labels = len(train_labels.columns)
        # only a single target column is supported
        if n_labels > 1:
            raise OperationalException(
                "Neural Net not yet configured for multi-targets. Please "
                " reduce number of targets to 1 in strategy."
            )
        n_features = len(data_dictionary["train_features"].columns)
        BATCH_SIZE = self.freqai_info.get("batch_size", 64)
        # NOTE(review): input_dims and w1 are built but never used below
        input_dims = [BATCH_SIZE, self.CONV_WIDTH, n_features]
        w1 = WindowGenerator(
            input_width=self.CONV_WIDTH,
            label_width=1,
            shift=1,
            train_df=train_df,
            val_df=test_df,
            train_labels=train_labels,
            val_labels=test_labels,
            batch_size=BATCH_SIZE,
        )
        # Disabled experiments with gym environments / agents, kept for reference:
        # train_agent()
        #pair = self.dd.historical_data[pair]
        #gym_env = FreqtradeEnv(data=train_df, prices=0.01, windows_size=100, pair=pair, stake_amount=100)
        # sep = '/'
        # coin = pair.split(sep, 1)[0]
        # # df1 = train_df.filter(regex='price')
        # # df2 = df1.filter(regex='raw')
        # # df3 = df2.filter(regex=f"{coin}")
        # # print(df3)
        # price = train_df[f"%-{coin}raw_price_5m"]
        # gym_env = RLPrediction_GymAnytrading(signal_features=train_df, prices=price, window_size=100)
        # sac = RLPrediction_Agent(gym_env)
        # print(sac)
        # return 0
        # NOTE(review): `model` is undefined at this point (NameError at runtime)
        return model
    def predict(
        self, unfiltered_dataframe: DataFrame, dk: FreqaiDataKitchen, first=True
    ) -> Tuple[DataFrame, DataFrame]:
        """
        Filter the prediction features data and predict with it.
        :param: unfiltered_dataframe: Full dataframe for the current backtest period.
        :param: dk: data kitchen used to filter, normalize and denormalize the data
        :param: first: when True, predict over windows built from the full
        dataframe; otherwise feed the features to the model as a single batch
        :return:
        :predictions: dataframe of denormalized predictions, one column per label
        :do_predict: array of ones with one entry per prediction row
        """
        dk.find_features(unfiltered_dataframe)
        filtered_dataframe, _ = dk.filter_features(
            unfiltered_dataframe, dk.training_features_list, training_filter=False
        )
        filtered_dataframe = dk.normalize_data_from_metadata(filtered_dataframe)
        dk.data_dictionary["prediction_features"] = filtered_dataframe
        # optional additional data cleaning/analysis
        self.data_cleaning_predict(dk, filtered_dataframe)
        if first:
            full_df = dk.data_dictionary["prediction_features"]
            w1 = WindowGenerator(
                input_width=self.CONV_WIDTH,
                label_width=1,
                shift=1,
                test_df=full_df,
                batch_size=len(full_df),
            )
            predictions = self.model.predict(w1.inference)
            # if there are fewer predictions than do_predict entries, drop the
            # leading do_predict entries so both have the same length
            len_diff = len(dk.do_predict) - len(predictions)
            if len_diff > 0:
                dk.do_predict = dk.do_predict[len_diff:]
        else:
            data = dk.data_dictionary["prediction_features"]
            # add a leading axis so the features form a single batch
            data = tf.expand_dims(data, axis=0)
            predictions = self.model(data, training=False)
        # keep column 0 of the model output
        predictions = predictions[:, 0]
        pred_df = DataFrame(predictions, columns=dk.label_list)
        pred_df = dk.denormalize_labels_from_metadata(pred_df)
        # NOTE(review): returns a fresh array of ones rather than dk.do_predict
        return (pred_df, np.ones(len(pred_df)))
    def set_initial_historic_predictions(
        self, df: DataFrame, model: Any, dk: FreqaiDataKitchen, pair: str
    ) -> None:
        """Intentionally a no-op; the earlier implementation is kept commented out below."""
        pass
        # w1 = WindowGenerator(
        #     input_width=self.CONV_WIDTH, label_width=1, shift=1, test_df=df, batch_size=len(df)
        # )
        # trained_predictions = model.predict(w1.inference)
        # #trained_predictions = trained_predictions[:, 0, 0]
        # trained_predictions = trained_predictions[:, 0]
        # n_lost_points = len(df) - len(trained_predictions)
        # pred_df = DataFrame(trained_predictions, columns=dk.label_list)
        # zeros_df = DataFrame(np.zeros((n_lost_points, len(dk.label_list))), columns=dk.label_list)
        # pred_df = pd.concat([zeros_df, pred_df], axis=0)
        # pred_df = dk.denormalize_labels_from_metadata(pred_df)
        # self.dd.historic_predictions[pair] = DataFrame()
        # self.dd.historic_predictions[pair] = copy.deepcopy(pred_df)
 class WindowGenerator:
    """
    Build sliding-window datasets from train/validation/test frames.

    Each window spans ``input_width + shift`` consecutive rows. Datasets are
    created lazily through the ``train``, ``val``, ``test`` and ``inference``
    properties.
    """
    def __init__(
        self,
        input_width,
        label_width,
        shift,
        train_df=None,
        val_df=None,
        test_df=None,
        train_labels=None,
        val_labels=None,
        test_labels=None,
        batch_size=None,
    ):
        # Window geometry.
        self.input_width = input_width
        self.label_width = label_width
        self.shift = shift
        self.total_window_size = input_width + shift
        self.input_slice = slice(0, input_width)
        window_positions = np.arange(self.total_window_size)
        self.input_indices = window_positions[self.input_slice]
        # Raw features and labels, kept as passed in.
        self.train_df, self.val_df, self.test_df = train_df, val_df, test_df
        self.train_labels = train_labels
        self.val_labels = val_labels
        self.test_labels = test_labels
        self.batch_size = batch_size
    def make_dataset(self, data, labels=None):
        """Return an unshuffled, stride-1 windowed dataset over ``data`` (and ``labels`` if given)."""
        features = np.array(data, dtype=np.float32)
        targets = None if labels is None else np.array(labels, dtype=np.float32)
        return tf.keras.preprocessing.timeseries_dataset_from_array(
            data=features,
            targets=targets,
            sequence_length=self.total_window_size,
            sequence_stride=1,
            sampling_rate=1,
            shuffle=False,
            batch_size=self.batch_size,
        )
    @property
    def train(self):
        """Windowed dataset over the training features and labels."""
        return self.make_dataset(self.train_df, self.train_labels)
    @property
    def val(self):
        """Windowed dataset over the validation features and labels."""
        return self.make_dataset(self.val_df, self.val_labels)
    @property
    def test(self):
        """Windowed dataset over the test features and labels."""
        return self.make_dataset(self.test_df, self.test_labels)
    @property
    def inference(self):
        """Windowed dataset over the test features without labels."""
        return self.make_dataset(self.test_df)
    @property
    def example(self):
        """Return the first batch of the ``train`` dataset, cached after the first access."""
        cached = getattr(self, "_example", None)
        if cached is not None:
            return cached
        # Nothing cached yet: take the first batch of `.train` and remember it
        first_batch = next(iter(self.train))
        self._example = first_batch
        return first_batch
@@ -1,273 +0,0 @@
 import logging
 from typing import Any, Dict, Tuple
 import numpy as np
 import numpy.typing as npt
 import pandas as pd
 import torch as th
 from pandas import DataFrame
 from stable_baselines3 import PPO
 from stable_baselines3.common.buffers import ReplayBuffer
 from stable_baselines3.common.callbacks import EvalCallback
 from stable_baselines3.common.monitor import Monitor
 from stable_baselines3.common.vec_env import SubprocVecEnv
 from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
 from freqtrade.freqai.freqai_interface import IFreqaiModel
 from freqtrade.freqai.prediction_models.RL.RLPrediction_agent_TDQN import TDQN
 from freqtrade.freqai.prediction_models.RL.RLPrediction_env_TDQN_5ac import DEnv
 #from freqtrade.freqai.prediction_models.RL.RLPrediction_env_TDQN_3ac import DEnv
 from freqtrade.persistence import Trade
 logger = logging.getLogger(__name__)
 class ReinforcementLearning(IFreqaiModel):
    """
    User created Reinforcement Learning prediction model.
    """
    def train(
        self, unfiltered_dataframe: DataFrame, pair: str, dk: FreqaiDataKitchen
    ) -> Any:
        """
        Filter the training data and train a model on it. Relies on the
        datakitchen for filtering, splitting and normalizing the data.
        :param unfiltered_dataframe: Full dataframe for the current training period
        :param pair: pair the model is trained for
        :param dk: datakitchen holding the data of the current pair
        :returns:
        :model: Trained model which can be used to inference (self.predict)
        """
        logger.info(f"--------------------Starting training {pair} --------------------")

        # keep only the requested features/labels (training filter enabled)
        features, labels = dk.filter_features(
            unfiltered_dataframe,
            dk.training_features_list,
            dk.label_list,
            training_filter=True,
        )
        dataset: Dict[str, Any] = dk.make_train_test_datasets(features, labels)
        dk.fit_labels()  # useless for now, but just satiating append methods

        # normalization, followed by optional extra cleaning/analysis
        dataset = dk.normalize_data(dataset)
        self.data_cleaning_train(dk)

        n_features = len(dk.data_dictionary["train_features"].columns)
        logger.info(f'Training model on {n_features} features')
        n_points = len(dataset["train_features"])
        logger.info(f'Training model on {n_points} data points')

        trained = self.fit(dataset, pair)

        if pair not in self.dd.historic_predictions:
            self.set_initial_historic_predictions(
                dataset['train_features'], trained, dk, pair)
        self.dd.save_historic_predictions_to_disk()

        logger.info(f"--------------------done training {pair}--------------------")
        return trained
    def fit(self, data_dictionary: Dict[str, Any], pair: str = ''):
        """
        Build the training/evaluation environments and train the agent.
        :param data_dictionary: must provide "train_features" and "test_features"
        :param pair: pair whose historic price data feeds the environments
        :returns: the agent after ``learn`` has finished
        """
        agent_params = self.freqai_info['model_training_parameters']
        reward_params = self.freqai_info['model_reward_parameters']
        train_df = data_dictionary["train_features"]
        test_df = data_dictionary["test_features"]

        # evaluation interval and training length are given in passes over the data
        eval_freq = agent_params["eval_cycles"] * len(test_df)
        total_timesteps = agent_params["train_cycles"] * len(train_df)

        # price data for model training and evaluation: last len(df) candles
        price = self.dd.historic_data[pair][f"{self.config['timeframe']}"].tail(
            len(train_df.index))
        price_test = self.dd.historic_data[pair][f"{self.config['timeframe']}"].tail(
            len(test_df.index))

        # environments
        train_env = DEnv(df=train_df, prices=price, window_size=self.CONV_WIDTH,
                         reward_kwargs=reward_params)
        raw_eval_env = DEnv(df=test_df, prices=price_test, window_size=self.CONV_WIDTH,
                            reward_kwargs=reward_params)
        eval_env = Monitor(raw_eval_env, ".")
        eval_env.reset()

        # this should be in config - TODO
        agent_type = 'tdqn'
        path = self.dk.data_path
        tensorboard_dir = f"{path}/{agent_type}/tensorboard/"

        eval_callback = EvalCallback(
            eval_env,
            best_model_save_path=f"{path}/",
            log_path=f"{path}/{agent_type}/logs/",
            eval_freq=int(eval_freq),
            deterministic=True,
            render=False,
        )

        # model arch
        policy_kwargs = dict(activation_fn=th.nn.ReLU, net_arch=[256, 256, 128])

        if agent_type == 'tdqn':
            model = TDQN(
                'TMultiInputPolicy', train_env,
                policy_kwargs=policy_kwargs,
                tensorboard_log=tensorboard_dir,
                learning_rate=0.00025, gamma=0.9,
                target_update_interval=5000, buffer_size=50000,
                exploration_initial_eps=1, exploration_final_eps=0.1,
                replay_buffer_class=ReplayBuffer,
            )
        elif agent_type == 'ppo':
            model = PPO(
                'MultiInputPolicy', train_env,
                policy_kwargs=policy_kwargs,
                tensorboard_log=tensorboard_dir,
                learning_rate=0.00025, gamma=0.9,
            )

        model.learn(total_timesteps=int(total_timesteps), callback=eval_callback)
        print('Training finished!')
        return model
    def get_state_info(self, pair):
        """
        Summarize the trade state of ``pair``.
        :returns: tuple ``(market_side, current_profit, total_profit)``.
        ``market_side`` is 0.5 when no open trade of the pair exists, 1 when
        the open trade's enter tag contains 'long' and 0 otherwise.
        ``current_profit`` is ``open_trade_value / open_rate - 1`` of the open
        trade (0 without one). ``total_profit`` sums ``close_profit`` over the
        closed trades of the pair.
        """
        market_side = 0.5
        current_profit = 0
        for trade in Trade.get_trades(trade_filter=Trade.is_open.is_(True)):
            if trade.pair != pair:
                continue
            trade_value = trade.open_trade_value
            entry_rate = trade.open_rate
            market_side = 1 if 'long' in trade.enter_tag else 0
            current_profit = trade_value / entry_rate - 1

        closed_trades = Trade.get_trades(
            trade_filter=[Trade.is_open.is_(False), Trade.pair == pair])
        total_profit = sum(trade.close_profit for trade in closed_trades)
        return market_side, current_profit, total_profit
    def predict(
        self, unfiltered_dataframe: DataFrame, dk: FreqaiDataKitchen, first: bool = False
    ) -> Tuple[DataFrame, npt.NDArray[np.int_]]:
        """
        Filter the prediction features data and predict with it.
        :param: unfiltered_dataframe: Full dataframe for the current backtest period.
        :return:
        :pred_df: dataframe containing the predictions, NaNs replaced by 0
        :do_predict: ``dk.do_predict`` as it stands after the data cleaning step
        """
        dk.find_features(unfiltered_dataframe)
        features, _ = dk.filter_features(
            unfiltered_dataframe, dk.training_features_list, training_filter=False
        )
        features = dk.normalize_data_from_metadata(features)
        dk.data_dictionary["prediction_features"] = features

        # optional additional data cleaning/analysis
        self.data_cleaning_predict(dk, features)

        predictions = self.rl_model_predict(
            dk.data_dictionary["prediction_features"], dk, self.model)
        predictions.fillna(0, inplace=True)
        return (predictions, dk.do_predict)
    def rl_model_predict(self, dataframe: DataFrame,
                         dk: FreqaiDataKitchen, model: Any) -> DataFrame:
        """
        Run ``model.predict`` over rolling windows of ``CONV_WIDTH`` rows.
        :returns: dataframe with the columns of ``dk.label_list`` holding the
        result of the rolling apply
        """
        placeholder = pd.DataFrame(np.full((len(dataframe), 1), 2), columns=dk.label_list)

        def _act_on_window(window):
            # the window's index selects the matching observation rows
            window_obs = dataframe.iloc[window.index]
            chosen_action, _ = model.predict(window_obs, deterministic=True)
            return chosen_action

        return placeholder.rolling(window=self.CONV_WIDTH).apply(_act_on_window)
    def set_initial_historic_predictions(
        self, df: DataFrame, model: Any, dk: FreqaiDataKitchen, pair: str
    ) -> None:
        """
        Store the model's predictions on ``df`` as the historic predictions of
        ``pair`` and add the zero-initialized bookkeeping columns.
        """
        predictions = self.rl_model_predict(df, dk, model)
        predictions.fillna(0, inplace=True)
        self.dd.historic_predictions[pair] = predictions
        history = self.dd.historic_predictions[pair]

        # mean/std placeholders for every column that is not of object dtype
        for label in history.columns:
            if history[label].dtype != object:
                history[f'{label}_mean'] = 0
                history[f'{label}_std'] = 0

        history['do_predict'] = 0
        if self.freqai_info['feature_parameters'].get('DI_threshold', 0) > 0:
            history['DI_values'] = 0
        for return_str in dk.data['extra_returns_per_train']:
            history[return_str] = 0
@@ -0,0 +1,155 @@
 import logging
 from typing import Any, Dict  # , Tuple
 import numpy as np
 # import numpy.typing as npt
 # import pandas as pd
 import torch as th
 # from pandas import DataFrame
 from stable_baselines3 import PPO
 from stable_baselines3.common.callbacks import EvalCallback
 from stable_baselines3.common.monitor import Monitor
 # from stable_baselines3.common.vec_env import SubprocVecEnv
 from freqtrade.freqai.RL.BaseRLEnv import BaseRLEnv, Actions, Positions
 from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel
 logger = logging.getLogger(__name__)
 class ReinforcementLearningPPO(BaseReinforcementLearningModel):
    """
    User created Reinforcement Learning Model prediction model.
    Trains a stable-baselines3 PPO agent on MyRLEnv.
    """
    def fit(self, data_dictionary: Dict[str, Any], pair: str = ''):
        """
        Build the training/evaluation environments and train a PPO agent.
        :param data_dictionary: must provide "train_features" and "test_features"
        :param pair: pair whose historic price data feeds the environments
        :returns: the PPO model after ``learn`` has finished
        """
        agent_params = self.freqai_info['model_training_parameters']
        reward_params = self.freqai_info['model_reward_parameters']
        train_df = data_dictionary["train_features"]
        test_df = data_dictionary["test_features"]
        # evaluation interval and training length are given in passes over the data
        eval_freq = agent_params["eval_cycles"] * len(test_df)
        total_timesteps = agent_params["train_cycles"] * len(train_df)
        # price data for model training and evaluation
        price = self.dd.historic_data[pair][f"{self.config['timeframe']}"].tail(len(train_df.index))
        price_test = self.dd.historic_data[pair][f"{self.config['timeframe']}"].tail(
            len(test_df.index))
        # environments
        train_env = MyRLEnv(df=train_df, prices=price, window_size=self.CONV_WIDTH,
                            reward_kwargs=reward_params)
        # named raw_eval_env so the builtin `eval` is not shadowed
        raw_eval_env = MyRLEnv(df=test_df, prices=price_test,
                               window_size=self.CONV_WIDTH, reward_kwargs=reward_params)
        eval_env = Monitor(raw_eval_env, ".")
        eval_env.reset()
        path = self.dk.data_path
        eval_callback = EvalCallback(eval_env, best_model_save_path=f"{path}/",
                                     log_path=f"{path}/ppo/logs/", eval_freq=int(eval_freq),
                                     deterministic=True, render=False)
        # model arch
        policy_kwargs = dict(activation_fn=th.nn.ReLU,
                             net_arch=[256, 256, 128])
        # The environment exposes a single Box observation space, which calls
        # for 'MlpPolicy'; 'MultiInputPolicy' is meant for Dict observation spaces.
        model = PPO('MlpPolicy', train_env, policy_kwargs=policy_kwargs,
                    tensorboard_log=f"{path}/ppo/tensorboard/", learning_rate=0.00025, gamma=0.9
                    )
        model.learn(
            total_timesteps=int(total_timesteps),
            callback=eval_callback
        )
        logger.info('Training finished!')
        return model
 class MyRLEnv(BaseRLEnv):
    """
    User can override any function in BaseRLEnv and gym.Env.
    This environment customizes ``step`` and the reward function.
    """
    def step(self, action):
        """
        Advance the environment by one tick and apply ``action``.
        :param action: action value (see ``Actions``) chosen by the agent
        :returns: tuple ``(observation, step_reward, done, info)``
        """
        self._done = False
        self._current_tick += 1
        if self._current_tick == self._end_tick:
            self._done = True
        self.update_portfolio_log_returns(action)
        self._update_profit(action)
        # Use the reward defined in this class. The earlier call to
        # `self._calculate_reward` bypassed the `calculate_reward` override below.
        step_reward = self.calculate_reward(action)
        self.total_reward += step_reward
        trade_type = None
        if self.is_tradesignal(action):
            # Action: Neutral, position: Long ->  Close Long
            # Action: Neutral, position: Short -> Close Short
            # Action: Long, position: Neutral -> Open Long
            # Action: Long, position: Short -> Close Short and Open Long
            # Action: Short, position: Neutral -> Open Short
            # Action: Short, position: Long -> Close Long and Open Short
            if action == Actions.Neutral.value:
                self._position = Positions.Neutral
                trade_type = "neutral"
            elif action == Actions.Long.value:
                self._position = Positions.Long
                trade_type = "long"
            elif action == Actions.Short.value:
                self._position = Positions.Short
                trade_type = "short"
            else:
                print("case not defined")
            # Update last trade tick
            self._last_trade_tick = self._current_tick
            if trade_type is not None:
                self.trade_history.append(
                    {'price': self.current_price(), 'index': self._current_tick,
                     'type': trade_type})
        # end the episode once total profit drops below 0.2
        if self._total_profit < 0.2:
            self._done = True
        self._position_history.append(self._position)
        observation = self._get_observation()
        info = dict(
            tick=self._current_tick,
            total_reward=self.total_reward,
            total_profit=self._total_profit,
            position=self._position.value
        )
        self._update_history(info)
        return observation, step_reward, self._done, info
    def calculate_reward(self, action):
        """
        Reward the fee-adjusted log return of a trade on the step that closes it.
        :param action: action value chosen by the agent for this step
        :returns: log return as a float when ``action`` closes the open
            position, 0. otherwise and 0. when no trade has been opened yet
        """
        if self._last_trade_tick is None:
            return 0.
        # close long
        if (action == Actions.Short.value or
                action == Actions.Neutral.value) and self._position == Positions.Long:
            last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
            current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
            return float(np.log(current_price) - np.log(last_trade_price))
        # close short
        if (action == Actions.Long.value or
                action == Actions.Neutral.value) and self._position == Positions.Short:
            last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
            current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
            return float(np.log(last_trade_price) - np.log(current_price))
        return 0.
@@ -0,0 +1,168 @@
 import logging
 from typing import Any, Dict, Optional
 import numpy as np
 import torch as th
 from stable_baselines3.common.callbacks import EvalCallback
 from stable_baselines3.common.monitor import Monitor
 # from stable_baselines3.common.vec_env import SubprocVecEnv
 from freqtrade.freqai.RL.BaseRLEnv import BaseRLEnv, Actions, Positions
 from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel
 from freqtrade.freqai.RL.TDQNagent import TDQN
 from stable_baselines3.common.buffers import ReplayBuffer
 logger = logging.getLogger(__name__)
 class ReinforcementLearningPPO(BaseReinforcementLearningModel):
    """
    User created Reinforcement Learning Model prediction model.

    NOTE(review): the class name says PPO, but the agent built in `fit` is
    TDQN - confirm which one is intended.
    """
    def fit(self, data_dictionary: Dict[str, Any], pair: str = ''):
        """
        Train a TDQN agent on the training features and evaluate it
        periodically on the test features.

        :param data_dictionary: must contain "train_features" and
            "test_features"; each is used both as the environment dataframe
            and to size the training / evaluation cycles.
        :param pair: key into `self.dd.historic_data` selecting the price data.
        :return: the trained TDQN model.
        """
        agent_params = self.freqai_info['model_training_parameters']
        reward_params = self.freqai_info['model_reward_parameters']
        train_df = data_dictionary["train_features"]
        test_df = data_dictionary["test_features"]
        # One "cycle" is a full pass over the respective dataframe.
        eval_freq = agent_params["eval_cycles"] * len(test_df)
        total_timesteps = agent_params["train_cycles"] * len(train_df)
        # price data for model training and evaluation
        # NOTE(review): both slices are taken from the tail of the same
        # history, so they overlap - confirm this is intended.
        historic_prices = self.dd.historic_data[pair][f"{self.config['timeframe']}"]
        price = historic_prices.tail(len(train_df.index))
        price_test = historic_prices.tail(len(test_df.index))
        # environments
        train_env = MyRLEnv(df=train_df, prices=price, window_size=self.CONV_WIDTH,
                            reward_kwargs=reward_params)
        # Named `raw_eval_env` rather than `eval` to avoid shadowing the builtin.
        raw_eval_env = MyRLEnv(df=test_df, prices=price_test,
                               window_size=self.CONV_WIDTH, reward_kwargs=reward_params)
        eval_env = Monitor(raw_eval_env, ".")
        eval_env.reset()
        path = self.dk.data_path
        eval_callback = EvalCallback(eval_env, best_model_save_path=f"{path}/",
                                     log_path=f"{path}/tdqn/logs/", eval_freq=int(eval_freq),
                                     deterministic=True, render=False)
        # model arch
        policy_kwargs = dict(activation_fn=th.nn.ReLU,
                             net_arch=[256, 256, 128])
        # `replay_buffer_class` must be the buffer class itself; calling
        # typing.Optional(...) raises TypeError at runtime.
        model = TDQN('TMultiInputPolicy', train_env,
                     policy_kwargs=policy_kwargs,
                     tensorboard_log=f"{path}/tdqn/tensorboard/",
                     learning_rate=0.00025, gamma=0.9,
                     target_update_interval=5000, buffer_size=50000,
                     exploration_initial_eps=1, exploration_final_eps=0.1,
                     replay_buffer_class=ReplayBuffer
                     )
        model.learn(
            total_timesteps=int(total_timesteps),
            callback=eval_callback
        )
        logger.info('Training finished!')
        return model
 class MyRLEnv(BaseRLEnv):
    """
    User can override any function in BaseRLEnv and gym.Env
    """
    def step(self, action):
        """
        Advance the environment by one tick and apply `action`.

        :param action: value of one of the Actions enum members
        :return: tuple (observation, step_reward, done, info)
        """
        self._done = False
        self._current_tick += 1
        if self._current_tick == self._end_tick:
            self._done = True
        self.update_portfolio_log_returns(action)
        self._update_profit(action)
        # Call the reward function defined on this class so that the
        # user-customized reward below is the one actually used.
        step_reward = self.calculate_reward(action)
        self.total_reward += step_reward
        trade_type = None
        if self.is_tradesignal(action):
            # Action: Neutral, position: Long ->  Close Long
            # Action: Neutral, position: Short -> Close Short
            # Action: Long, position: Neutral -> Open Long
            # Action: Long, position: Short -> Close Short and Open Long
            # Action: Short, position: Neutral -> Open Short
            # Action: Short, position: Long -> Close Long and Open Short
            if action == Actions.Neutral.value:
                self._position = Positions.Neutral
                trade_type = "neutral"
            elif action == Actions.Long.value:
                self._position = Positions.Long
                trade_type = "long"
            elif action == Actions.Short.value:
                self._position = Positions.Short
                trade_type = "short"
            else:
                logger.warning("case not defined")
            # Update last trade tick
            self._last_trade_tick = self._current_tick
            if trade_type is not None:
                self.trade_history.append(
                    {'price': self.current_price(), 'index': self._current_tick,
                     'type': trade_type})
        # End the episode early once total profit falls below 0.2.
        if self._total_profit < 0.2:
            self._done = True
        self._position_history.append(self._position)
        observation = self._get_observation()
        info = dict(
            tick=self._current_tick,
            total_reward=self.total_reward,
            total_profit=self._total_profit,
            position=self._position.value
        )
        self._update_history(info)
        return observation, step_reward, self._done, info

    def _profit_aim_factor(self):
        """
        Return 2 when the most recent entry of `close_trade_profit` exceeds
        `profit_aim * rr`, otherwise 1.
        """
        # NOTE(review): `close_trade_profit` is expected to be maintained by
        # BaseRLEnv; read defensively so a missing or empty record simply
        # yields no bonus - confirm against the base class.
        profits = getattr(self, "close_trade_profit", None)
        if profits is None or len(profits) == 0:
            return 1
        return 2 if profits[-1] > self.profit_aim * self.rr else 1

    def calculate_reward(self, action):
        """
        Reward for the given action: the fee-adjusted log return of a trade
        that this action closes, doubled when the profit aim was exceeded;
        otherwise 0.

        :param action: value of one of the Actions enum members
        :return: float reward for this step
        """
        if self._last_trade_tick is None:
            return 0.
        # close long (going short or neutral while holding a long)
        if (action == Actions.Short.value or
                action == Actions.Neutral.value) and self._position == Positions.Long:
            last_trade_price = self.add_buy_fee(self.prices.iloc[self._last_trade_tick].open)
            current_price = self.add_sell_fee(self.prices.iloc[self._current_tick].open)
            return float((np.log(current_price) - np.log(last_trade_price))
                         * self._profit_aim_factor())
        # close short (going long or neutral while holding a short)
        if (action == Actions.Long.value or
                action == Actions.Neutral.value) and self._position == Positions.Short:
            last_trade_price = self.add_sell_fee(self.prices.iloc[self._last_trade_tick].open)
            current_price = self.add_buy_fee(self.prices.iloc[self._current_tick].open)
            return float((np.log(last_trade_price) - np.log(current_price))
                         * self._profit_aim_factor())
        return 0.