Merge pull request #7289 from freqtrade/feat/freqai-rl-dev

Add reinforcement learning module to FreqAI
2022-11-27 17:15:21 +01:00 · 2022-11-27 17:15:21 +01:00 · f4025ee5de
commit f4025ee5de
parent 2219d2f491 732757e087
27 changed files with 1874 additions and 43 deletions
--- a/build_helpers/publish_docker_arm64.sh
+++ b/build_helpers/publish_docker_arm64.sh
@ -7,11 +7,13 @@ export DOCKER_BUILDKIT=1
 TAG=$(echo "${BRANCH_NAME}" | sed -e "s/\//_/g")
 TAG_PLOT=${TAG}_plot
 TAG_FREQAI=${TAG}_freqai
+TAG_FREQAI_RL=${TAG_FREQAI}rl
 TAG_PI="${TAG}_pi"

 TAG_ARM=${TAG}_arm
 TAG_PLOT_ARM=${TAG_PLOT}_arm
 TAG_FREQAI_ARM=${TAG_FREQAI}_arm
+TAG_FREQAI_RL_ARM=${TAG_FREQAI_RL}_arm
 CACHE_IMAGE=freqtradeorg/freqtrade_cache

 echo "Running for ${TAG}"
@ -41,9 +43,11 @@ docker tag freqtrade:$TAG_ARM ${CACHE_IMAGE}:$TAG_ARM

 docker build --cache-from freqtrade:${TAG_ARM} --build-arg sourceimage=${CACHE_IMAGE} --build-arg sourcetag=${TAG_ARM} -t freqtrade:${TAG_PLOT_ARM} -f docker/Dockerfile.plot .
 docker build --cache-from freqtrade:${TAG_ARM} --build-arg sourceimage=${CACHE_IMAGE} --build-arg sourcetag=${TAG_ARM} -t freqtrade:${TAG_FREQAI_ARM} -f docker/Dockerfile.freqai .
+docker build --cache-from freqtrade:${TAG_ARM} --build-arg sourceimage=${CACHE_IMAGE} --build-arg sourcetag=${TAG_ARM} -t freqtrade:${TAG_FREQAI_RL_ARM} -f docker/Dockerfile.freqai_rl .

 docker tag freqtrade:$TAG_PLOT_ARM ${CACHE_IMAGE}:$TAG_PLOT_ARM
 docker tag freqtrade:$TAG_FREQAI_ARM ${CACHE_IMAGE}:$TAG_FREQAI_ARM
+docker tag freqtrade:$TAG_FREQAI_RL_ARM ${CACHE_IMAGE}:$TAG_FREQAI_RL_ARM

 # Run backtest
 docker run --rm -v $(pwd)/config_examples/config_bittrex.example.json:/freqtrade/config.json:ro -v $(pwd)/tests:/tests freqtrade:${TAG_ARM} backtesting --datadir /tests/testdata --strategy-path /tests/strategy/strats/ --strategy StrategyTestV3
@ -58,6 +62,7 @@ docker images
 # docker push ${IMAGE_NAME}
 docker push ${CACHE_IMAGE}:$TAG_PLOT_ARM
 docker push ${CACHE_IMAGE}:$TAG_FREQAI_ARM
+docker push ${CACHE_IMAGE}:$TAG_FREQAI_RL_ARM
 docker push ${CACHE_IMAGE}:$TAG_ARM

 # Create multi-arch image
@ -74,6 +79,9 @@ docker manifest push -p ${IMAGE_NAME}:${TAG_PLOT}
 docker manifest create ${IMAGE_NAME}:${TAG_FREQAI} ${CACHE_IMAGE}:${TAG_FREQAI_ARM} ${CACHE_IMAGE}:${TAG_FREQAI}
 docker manifest push -p ${IMAGE_NAME}:${TAG_FREQAI}

+docker manifest create ${IMAGE_NAME}:${TAG_FREQAI_RL} ${CACHE_IMAGE}:${TAG_FREQAI_RL_ARM} ${CACHE_IMAGE}:${TAG_FREQAI_RL}
+docker manifest push -p ${IMAGE_NAME}:${TAG_FREQAI_RL}
+
 # Tag as latest for develop builds
 if [ "${TAG}" = "develop" ]; then
    docker manifest create ${IMAGE_NAME}:latest ${CACHE_IMAGE}:${TAG_ARM} ${IMAGE_NAME}:${TAG_PI} ${CACHE_IMAGE}:${TAG}
--- a/build_helpers/publish_docker_multi.sh
+++ b/build_helpers/publish_docker_multi.sh
@ -6,6 +6,7 @@
 TAG=$(echo "${BRANCH_NAME}" | sed -e "s/\//_/g")
 TAG_PLOT=${TAG}_plot
 TAG_FREQAI=${TAG}_freqai
+TAG_FREQAI_RL=${TAG_FREQAI}rl
 TAG_PI="${TAG}_pi"

 PI_PLATFORM="linux/arm/v7"
@ -51,9 +52,11 @@ docker tag freqtrade:$TAG ${CACHE_IMAGE}:$TAG

 docker build --cache-from freqtrade:${TAG} --build-arg sourceimage=${CACHE_IMAGE} --build-arg sourcetag=${TAG} -t freqtrade:${TAG_PLOT} -f docker/Dockerfile.plot .
 docker build --cache-from freqtrade:${TAG} --build-arg sourceimage=${CACHE_IMAGE} --build-arg sourcetag=${TAG} -t freqtrade:${TAG_FREQAI} -f docker/Dockerfile.freqai .
+docker build --cache-from freqtrade:${TAG_FREQAI} --build-arg sourceimage=${CACHE_IMAGE} --build-arg sourcetag=${TAG_FREQAI} -t freqtrade:${TAG_FREQAI_RL} -f docker/Dockerfile.freqai_rl .

 docker tag freqtrade:$TAG_PLOT ${CACHE_IMAGE}:$TAG_PLOT
 docker tag freqtrade:$TAG_FREQAI ${CACHE_IMAGE}:$TAG_FREQAI
+docker tag freqtrade:$TAG_FREQAI_RL ${CACHE_IMAGE}:$TAG_FREQAI_RL

 # Run backtest
 docker run --rm -v $(pwd)/config_examples/config_bittrex.example.json:/freqtrade/config.json:ro -v $(pwd)/tests:/tests freqtrade:${TAG} backtesting --datadir /tests/testdata --strategy-path /tests/strategy/strats/ --strategy StrategyTestV3
@ -68,6 +71,7 @@ docker images
 docker push ${CACHE_IMAGE}
 docker push ${CACHE_IMAGE}:$TAG_PLOT
 docker push ${CACHE_IMAGE}:$TAG_FREQAI
+docker push ${CACHE_IMAGE}:$TAG_FREQAI_RL
 docker push ${CACHE_IMAGE}:$TAG


--- a/docker/Dockerfile.freqai_rl
+++ b/docker/Dockerfile.freqai_rl
@ -0,0 +1,8 @@
+ARG sourceimage=freqtradeorg/freqtrade
+ARG sourcetag=develop_freqai
+FROM ${sourceimage}:${sourcetag}
+
+# Install dependencies
+COPY requirements-freqai.txt requirements-freqai-rl.txt /freqtrade/
+
+RUN pip install -r requirements-freqai-rl.txt --user --no-cache-dir
--- a/docs/freqai-parameter-table.md
+++ b/docs/freqai-parameter-table.md
@ -4,9 +4,11 @@ The table below will list all configuration parameters available for FreqAI. Som

 Mandatory parameters are marked as **Required** and have to be set in one of the suggested ways.

+### General configuration parameters
+
 |  Parameter | Description |
 |------------|-------------|
-|  |  **General configuration parameters**
+|  |  **General configuration parameters within the `config.freqai` tree**
 | `freqai` | **Required.** <br> The parent dictionary containing all the parameters for controlling FreqAI. <br> **Datatype:** Dictionary.
 | `train_period_days` | **Required.** <br> Number of days to use for the training data (width of the sliding window). <br> **Datatype:** Positive integer.
 | `backtest_period_days` | **Required.** <br> Number of days to inference from the trained model before sliding the `train_period_days` window defined above, and retraining the model during backtesting (more info [here](freqai-running.md#backtesting)). This can be fractional days, but beware that the provided `timerange` will be divided by this number to yield the number of trainings necessary to complete the backtest. <br> **Datatype:** Float.
@ -19,7 +21,13 @@ Mandatory parameters are marked as **Required** and have to be set in one of the
 | `follow_mode` | Use a `follower` that will look for models associated with a specific `identifier` and load those for inferencing. A `follower` will **not** train new models. <br> **Datatype:** Boolean. <br> Default: `False`.
 | `continual_learning` | Use the final state of the most recently trained model as starting point for the new model, allowing for incremental learning (more information can be found [here](freqai-running.md#continual-learning)). <br> **Datatype:** Boolean. <br> Default: `False`.
 | `write_metrics_to_disk` | Collect train timings, inference timings and cpu usage in json file. <br> **Datatype:** Boolean. <br> Default: `False`
-|  |  **Feature parameters**
+| `data_kitchen_thread_count` | <br> Designate the number of threads you want to use for data processing (outlier methods, normalization, etc.). This has no impact on the number of threads used for training. If user does not set it (default), FreqAI will use max number of threads - 2 (leaving 1 physical core available for Freqtrade bot and FreqUI) <br> **Datatype:** Positive integer.
+
+### Feature parameters
+
+|  Parameter | Description |
+|------------|-------------|
+|  |  **Feature parameters within the `freqai.feature_parameters` sub dictionary**
 | `feature_parameters` | A dictionary containing the parameters used to engineer the feature set. Details and examples are shown [here](freqai-feature-engineering.md). <br> **Datatype:** Dictionary.
 | `include_timeframes` | A list of timeframes that all indicators in `populate_any_indicators` will be created for. The list is added as features to the base indicators dataset. <br> **Datatype:** List of timeframes (strings).
 | `include_corr_pairlist` | A list of correlated coins that FreqAI will add as additional features to all `pair_whitelist` coins. All indicators set in `populate_any_indicators` during feature engineering (see details [here](freqai-feature-engineering.md)) will be created for each correlated coin. The correlated coins features are added to the base indicators dataset. <br> **Datatype:** List of assets (strings).
@ -38,16 +46,48 @@ Mandatory parameters are marked as **Required** and have to be set in one of the
 | `noise_standard_deviation` | If set, FreqAI adds noise to the training features with the aim of preventing overfitting. FreqAI generates random deviates from a gaussian distribution with a standard deviation of `noise_standard_deviation` and adds them to all data points. `noise_standard_deviation` should be kept relative to the normalized space, i.e., between -1 and 1. In other words, since data in FreqAI is always normalized to be between -1 and 1, `noise_standard_deviation: 0.05` would result in 32% of the data being randomly increased/decreased by more than 2.5% (i.e., the percent of data falling within the first standard deviation). <br> **Datatype:** Integer. <br> Default: `0`.
 | `outlier_protection_percentage` | Enable to prevent outlier detection methods from discarding too much data. If more than `outlier_protection_percentage` % of points are detected as outliers by the SVM or DBSCAN, FreqAI will log a warning message and ignore outlier detection, i.e., the original dataset will be kept intact. If the outlier protection is triggered, no predictions will be made based on the training dataset. <br> **Datatype:** Float. <br> Default: `30`.
 | `reverse_train_test_order` | Split the feature dataset (see below) and use the latest data split for training and test on historical split of the data. This allows the model to be trained up to the most recent data point, while avoiding overfitting. However, you should be careful to understand the unorthodox nature of this parameter before employing it. <br> **Datatype:** Boolean. <br> Default: `False` (no reversal).
-|  |  **Data split parameters**
+
+### Data split parameters
+
+|  Parameter | Description |
+|------------|-------------|
+|  |  **Data split parameters within the `freqai.data_split_parameters` sub dictionary**
 | `data_split_parameters` | Include any additional parameters available from Scikit-learn `test_train_split()`, which are shown [here](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) (external website). <br> **Datatype:** Dictionary.
 | `test_size` | The fraction of data that should be used for testing instead of training. <br> **Datatype:** Positive float < 1.
 | `shuffle` | Shuffle the training data points during training. Typically, to not remove the chronological order of data in time-series forecasting, this is set to `False`. <br> **Datatype:** Boolean. <br> Default: `False`.
-|  |  **Model training parameters**
+
+### Model training parameters
+
+|  Parameter | Description |
+|------------|-------------|
+|  |  **Model training parameters within the `freqai.model_training_parameters` sub dictionary**
 | `model_training_parameters` | A flexible dictionary that includes all parameters available by the selected model library. For example, if you use `LightGBMRegressor`, this dictionary can contain any parameter available by the `LightGBMRegressor` [here](https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRegressor.html) (external website). If you select a different model, this dictionary can contain any parameter from that model. A list of the currently available models can be found [here](freqai-configuration.md#using-different-prediction-models).  <br> **Datatype:** Dictionary.
 | `n_estimators` | The number of boosted trees to fit in the training of the model. <br> **Datatype:** Integer.
 | `learning_rate` | Boosting learning rate during training of the model. <br> **Datatype:** Float.
 | `n_jobs`, `thread_count`, `task_type` | Set the number of threads for parallel processing and the `task_type` (`gpu` or `cpu`). Different model libraries use different parameter names. <br> **Datatype:** Float.
+
+### Reinforcement Learning parameters
+
+|  Parameter | Description |
+|------------|-------------|
+|  |  **Reinforcement Learning Parameters within the `freqai.rl_config` sub dictionary**
+| `rl_config` | A dictionary containing the control parameters for a Reinforcement Learning model. <br> **Datatype:** Dictionary.
+| `train_cycles` | Training time steps will be set based on `train_cycles` * number of training data points. <br> **Datatype:** Integer.
+| `cpu_count` | Number of processors to dedicate to the Reinforcement Learning training process. <br> **Datatype:** int.
+| `max_trade_duration_candles`| Guides the agent training to keep trades below desired length. Example usage shown in `prediction_models/ReinforcementLearner.py` within the customizable `calculate_reward()` function. <br> **Datatype:** int.
+| `model_type` | Model string from stable_baselines3 or SBcontrib. Available strings include: `'TRPO', 'ARS', 'RecurrentPPO', 'MaskablePPO', 'PPO', 'A2C', 'DQN'`. User should ensure that `model_training_parameters` match those available to the corresponding stable_baselines3 model by visiting their documentation. [PPO doc](https://stable-baselines3.readthedocs.io/en/master/modules/ppo.html) (external website) <br> **Datatype:** string.
+| `policy_type` | One of the available policy types from stable_baselines3 <br> **Datatype:** string.
+| `max_training_drawdown_pct` | The maximum drawdown that the agent is allowed to experience during training. <br> **Datatype:** float. <br> Default: 0.8
+| `cpu_count` | Number of threads/cpus to dedicate to the Reinforcement Learning training process (depending on if `ReinforcementLearning_multiproc` is selected or not). Recommended to leave this untouched, by default, this value is set to the total number of physical cores minus 1. <br> **Datatype:** int. 
+| `model_reward_parameters` | Parameters used inside the customizable `calculate_reward()` function in `ReinforcementLearner.py` <br> **Datatype:** Dictionary.
+| `add_state_info` | Tell FreqAI to include state information in the feature set for training and inferencing. The current state variables include trade duration, current profit, trade position. This is only available in dry/live runs, and is automatically switched to false for backtesting. <br> **Datatype:** bool. <br> Default: `False`.
+| `net_arch` | Network architecture which is well described in [`stable_baselines3` doc](https://stable-baselines3.readthedocs.io/en/master/guide/custom_policy.html#examples). In summary: `[<shared layers>, dict(vf=[<non-shared value network layers>], pi=[<non-shared policy network layers>])]`. By default this is set to `[128, 128]`, which defines 2 shared hidden layers with 128 units each.
+
+### Additional parameters
+
+|  Parameter | Description |
+|------------|-------------|
 |  |  **Extraneous parameters**
-| `keras` | If the selected model makes use of Keras (typical for Tensorflow-based prediction models), this flag needs to be activated so that the model save/loading follows Keras standards. <br> **Datatype:** Boolean. <br> Default: `False`.
-| `conv_width` | The width of a convolutional neural network input tensor. This replaces the need for shifting candles (`include_shifted_candles`) by feeding in historical data points as the second dimension of the tensor. Technically, this parameter can also be used for regressors, but it only adds computational overhead and does not change the model training/prediction. <br> **Datatype:** Integer. <br> Default: `2`.
-| `reduce_df_footprint` | Recast all numeric columns to float32/int32, with the objective of reducing ram/disk usage and decreasing train/inference timing. This parameter is set in the main level of the Freqtrade configuration file (not inside FreqAI). <br> **Datatype:** Boolean. <br> Default: `False`.
+| `freqai.keras` | If the selected model makes use of Keras (typical for Tensorflow-based prediction models), this flag needs to be activated so that the model save/loading follows Keras standards. <br> **Datatype:** Boolean. <br> Default: `False`.
+| `freqai.conv_width` | The width of a convolutional neural network input tensor. This replaces the need for shifting candles (`include_shifted_candles`) by feeding in historical data points as the second dimension of the tensor. Technically, this parameter can also be used for regressors, but it only adds computational overhead and does not change the model training/prediction. <br> **Datatype:** Integer. <br> Default: `2`.
+| `reduce_df_footprint` | Recast all numeric columns to float32/int32, with the objective of reducing ram/disk usage and decreasing train/inference timing. This parameter is set in the main level of the Freqtrade configuration file (not inside FreqAI). <br> **Datatype:** Boolean. <br> Default: `False`.
--- a/docs/freqai-reinforcement-learning.md
+++ b/docs/freqai-reinforcement-learning.md
@ -0,0 +1,259 @@
+# Reinforcement Learning
+
+!!! Note "Installation size"
+    Reinforcement learning dependencies include large packages such as `torch`, which should be explicitly requested during `./setup.sh -i` by answering "y" to the question "Do you also want dependencies for freqai-rl (~700mb additional space required) [y/N]?".  
+    Users who prefer docker should ensure they use the docker image appended with `_freqairl`.
+
+## Background and terminology
+
+### What is RL and why does FreqAI need it?
+
+Reinforcement learning involves two important components, the *agent* and the training *environment*. During agent training, the agent moves through historical data candle by candle, always taking one of a set of actions (long entry, long exit, short entry, short exit, neutral). During this training process, the environment tracks the performance of these actions and rewards the agent according to a custom, user-made `calculate_reward()` function (we offer a default reward for users to build on if they wish; [details here](#creating-a-custom-reward-function)). The reward is used to train weights in a neural network.
+
+A second important component of the FreqAI RL implementation is the use of *state* information. State information is fed into the network at each step, including current profit, current position, and current trade duration. These are used to train the agent in the training environment, and to reinforce the agent in dry/live (this functionality is not available in backtesting). *FreqAI + Freqtrade is a perfect match for this reinforcing mechanism since this information is readily available in live deployments.*
+
+Reinforcement learning is a natural progression for FreqAI, since it adds a new layer of adaptivity and market reactivity that Classifiers and Regressors cannot match. However, Classifiers and Regressors have strengths that RL does not have such as robust predictions. Improperly trained RL agents may find "cheats" and "tricks" to maximize reward without actually winning any trades. For this reason, RL is more complex and demands a higher level of understanding than typical Classifiers and Regressors.
+
+### The RL interface
+
+With the current framework, we aim to expose the training environment via the common "prediction model" file, which is a user inherited `BaseReinforcementLearner` object (e.g. `freqai/prediction_models/ReinforcementLearner`). Inside this user class, the RL environment is available and customized via `MyRLEnv` as [shown below](#creating-a-custom-reward-function).
+
+We envision the majority of users focusing their effort on creative design of the `calculate_reward()` function ([details here](#creating-a-custom-reward-function)), while leaving the rest of the environment untouched. Other users may not touch the environment at all, and they will only play with the configuration settings and the powerful feature engineering that already exists in FreqAI. Meanwhile, we enable advanced users to create their own model classes entirely.
+
+The framework is built on stable_baselines3 (torch) and OpenAI gym for the base environment class. But generally speaking, the model class is well isolated. Thus, the addition of competing libraries can be easily integrated into the existing framework. The environment, however, inherits from `gym.Env`, which means that it is necessary to write an entirely new environment in order to switch to a different library.
+
+### Important considerations
+
+As explained above, the agent is "trained" in an artificial trading "environment". In our case, that environment may seem quite similar to a real Freqtrade backtesting environment, but it is *NOT*. In fact, the RL trading environment is much more simplified. It does not incorporate any of the complicated strategy logic, such as callbacks like `custom_exit`, `custom_stoploss`, leverage controls, etc. The RL environment is instead a very "raw" representation of the true market, where the agent has free will to learn the policy (read: stoploss, take profit, etc.) which is enforced by the `calculate_reward()`. Thus, it is important to consider that the agent training environment is not identical to the real world.
+
+## Running Reinforcement Learning
+
+Setting up and running a Reinforcement Learning model is the same as running a Regressor or Classifier. The same two flags, `--freqaimodel` and `--strategy`, must be defined on the command line:
+
+```bash
+freqtrade trade --freqaimodel ReinforcementLearner --strategy MyRLStrategy --config config.json
+```
+
+where `ReinforcementLearner` will use the templated `ReinforcementLearner` from `freqai/prediction_models/ReinforcementLearner` (or a custom user defined one located in `user_data/freqaimodels`). The strategy, on the other hand, follows the same base [feature engineering](freqai-feature-engineering.md) with `populate_any_indicators` as a typical Regressor:
+
+```python
+    def populate_any_indicators(
+        self, pair, df, tf, informative=None, set_generalized_indicators=False
+    ):
+
+        if informative is None:
+            informative = self.dp.get_pair_dataframe(pair, tf)
+
+        # first loop is automatically duplicating indicators for time periods
+        for t in self.freqai_info["feature_parameters"]["indicator_periods_candles"]:
+
+            t = int(t)
+            informative[f"%-{pair}rsi-period_{t}"] = ta.RSI(informative, timeperiod=t)
+            informative[f"%-{pair}mfi-period_{t}"] = ta.MFI(informative, timeperiod=t)
+            informative[f"%-{pair}adx-period_{t}"] = ta.ADX(informative, window=t)
+
+        # The following raw price values are necessary for RL models
+        informative[f"%-{pair}raw_close"] = informative["close"]
+        informative[f"%-{pair}raw_open"] = informative["open"]
+        informative[f"%-{pair}raw_high"] = informative["high"]
+        informative[f"%-{pair}raw_low"] = informative["low"]
+
+        indicators = [col for col in informative if col.startswith("%")]
+        # This loop duplicates and shifts all indicators to add a sense of recency to data
+        for n in range(self.freqai_info["feature_parameters"]["include_shifted_candles"] + 1):
+            if n == 0:
+                continue
+            informative_shift = informative[indicators].shift(n)
+            informative_shift = informative_shift.add_suffix("_shift-" + str(n))
+            informative = pd.concat((informative, informative_shift), axis=1)
+
+        df = merge_informative_pair(df, informative, self.config["timeframe"], tf, ffill=True)
+        skip_columns = [
+            (s + "_" + tf) for s in ["date", "open", "high", "low", "close", "volume"]
+        ]
+        df = df.drop(columns=skip_columns)
+
+        # Add generalized indicators here (because in live, it will call this
+        # function to populate indicators during training). Notice how we ensure not to
+        # add them multiple times
+        if set_generalized_indicators:
+
+            # For RL, there are no direct targets to set. This is filler (neutral)
+            # until the agent sends an action.
+            df["&-action"] = 0
+
+        return df
+```
+
+Most of the function remains the same as for typical Regressors, however, the function above shows how the strategy must pass the raw price data to the agent so that it has access to raw OHLCV in the training environment:
+
+```python
+        # The following features are necessary for RL models
+        informative[f"%-{pair}raw_close"] = informative["close"]
+        informative[f"%-{pair}raw_open"] = informative["open"]
+        informative[f"%-{pair}raw_high"] = informative["high"]
+        informative[f"%-{pair}raw_low"] = informative["low"]
+```
+
+Finally, there is no explicit "label" to make - instead you need to assign the `&-action` column which will contain the agent's actions when accessed in `populate_entry/exit_trends()`. In the present example, the neutral action is set to 0. This value should align with the environment used. FreqAI provides two environments, both of which use 0 as the neutral action.
+
+After users realize there are no labels to set, they will soon understand that the agent is making its "own" entry and exit decisions. This makes strategy construction rather simple. The entry and exit signals come from the agent in the form of an integer - which are used directly to decide entries and exits in the strategy:
+
+```python
+    def populate_entry_trend(self, df: DataFrame, metadata: dict) -> DataFrame:
+
+        enter_long_conditions = [df["do_predict"] == 1, df["&-action"] == 1]
+
+        if enter_long_conditions:
+            df.loc[
+                reduce(lambda x, y: x & y, enter_long_conditions), ["enter_long", "enter_tag"]
+            ] = (1, "long")
+
+        enter_short_conditions = [df["do_predict"] == 1, df["&-action"] == 3]
+
+        if enter_short_conditions:
+            df.loc[
+                reduce(lambda x, y: x & y, enter_short_conditions), ["enter_short", "enter_tag"]
+            ] = (1, "short")
+
+        return df
+
+    def populate_exit_trend(self, df: DataFrame, metadata: dict) -> DataFrame:
+        exit_long_conditions = [df["do_predict"] == 1, df["&-action"] == 2]
+        if exit_long_conditions:
+            df.loc[reduce(lambda x, y: x & y, exit_long_conditions), "exit_long"] = 1
+
+        exit_short_conditions = [df["do_predict"] == 1, df["&-action"] == 4]
+        if exit_short_conditions:
+            df.loc[reduce(lambda x, y: x & y, exit_short_conditions), "exit_short"] = 1
+
+        return df
+```
+
+It is important to consider that `&-action` depends on which environment you choose to use. The example above shows 5 actions, where 0 is neutral, 1 is enter long, 2 is exit long, 3 is enter short and 4 is exit short.
+
+## Configuring the Reinforcement Learner
+
+In order to configure the `Reinforcement Learner` the following dictionary must exist in the `freqai` config:
+
+```json
+        "rl_config": {
+            "train_cycles": 25,
+            "add_state_info": true,
+            "max_trade_duration_candles": 300,
+            "max_training_drawdown_pct": 0.02,
+            "cpu_count": 8,
+            "model_type": "PPO",
+            "policy_type": "MlpPolicy",
+            "model_reward_parameters": {
+                "rr": 1,
+                "profit_aim": 0.025
+            }
+        }
+```
+
+Parameter details can be found [here](freqai-parameter-table.md), but in general the `train_cycles` decides how many times the agent should cycle through the candle data in its artificial environment to train weights in the model. `model_type` is a string which selects one of the available models in [stable_baselines](https://stable-baselines3.readthedocs.io/en/master/)(external link).
+
+!!! Note
+    If you would like to experiment with `continual_learning`, then you should set that value to `true` in the main `freqai` configuration dictionary. This will tell the Reinforcement Learning library to continue training new models from the final state of previous models, instead of retraining new models from scratch each time a retrain is initiated.
+
+!!! Note
+    Remember that the general `model_training_parameters` dictionary should contain all the model hyperparameter customizations for the particular `model_type`. For example, `PPO` parameters can be found [here](https://stable-baselines3.readthedocs.io/en/master/modules/ppo.html).
+
+## Creating a custom reward function
+
+As you begin to modify the strategy and the prediction model, you will quickly realize some important differences between the Reinforcement Learner and the Regressors/Classifiers. Firstly, the strategy does not set a target value (no labels!). Instead, you set the `calculate_reward()` function inside the `MyRLEnv` class (see below). A default `calculate_reward()` is provided inside `prediction_models/ReinforcementLearner.py` to demonstrate the necessary building blocks for creating rewards, but users are encouraged to create their own custom reinforcement learning model class (see below) and save it to `user_data/freqaimodels`. It is inside the `calculate_reward()` where creative theories about the market can be expressed. For example, you can reward your agent when it makes a winning trade, and penalize the agent when it makes a losing trade. Or perhaps, you wish to reward the agent for entering trades, and penalize the agent for sitting in trades too long. Below we show examples of how these rewards are all calculated:
+
+```python
+    from freqtrade.freqai.prediction_models.ReinforcementLearner import ReinforcementLearner
+    from freqtrade.freqai.RL.Base5ActionRLEnv import Base5ActionRLEnv
+
+    class MyCoolRLModel(ReinforcementLearner):
+        """
+        User created RL prediction model. 
+
+        Save this file to `freqtrade/user_data/freqaimodels`
+
+        then use it with:
+
+        freqtrade trade --freqaimodel MyCoolRLModel --config config.json --strategy SomeCoolStrat
+        
+        Here the users can override any of the functions 
+        available in the `IFreqaiModel` inheritance tree. Most importantly for RL, this 
+        is where the user overrides `MyRLEnv` (see below), to define custom
+        `calculate_reward()` function, or to override any other parts of the environment.
+        
+        This class also allows users to override any other part of the IFreqaiModel tree.
+        For example, the user can override `def fit()` or `def train()` or `def predict()` 
+        to take fine-tuned control over these processes.
+
+        Another common override may be `def data_cleaning_predict()` where the user can
+        take fine-tuned control over the data handling pipeline.
+        """
+        class MyRLEnv(Base5ActionRLEnv):
+            """
+            User made custom environment. This class inherits from BaseEnvironment and gym.env.
+            Users can override any functions from those parent classes. Here is an example
+            of a user customized `calculate_reward()` function.
+            """
+            def calculate_reward(self, action: int) -> float:
+                # first, penalize if the action is not valid
+                if not self._is_valid(action):
+                    return -2
+                pnl = self.get_unrealized_profit()
+
+                factor = 100
+                # reward agent for entering trades
+                if action in (Actions.Long_enter.value, Actions.Short_enter.value) \
+                        and self._position == Positions.Neutral:
+                    return 25
+                # discourage agent from not entering trades
+                if action == Actions.Neutral.value and self._position == Positions.Neutral:
+                    return -1
+                max_trade_duration = self.rl_config.get('max_trade_duration_candles', 300)
+                trade_duration = self._current_tick - self._last_trade_tick
+                if trade_duration <= max_trade_duration:
+                    factor *= 1.5
+                elif trade_duration > max_trade_duration:
+                    factor *= 0.5
+                # discourage sitting in position
+                if self._position in (Positions.Short, Positions.Long) and \
+                action == Actions.Neutral.value:
+                    return -1 * trade_duration / max_trade_duration
+                # close long
+                if action == Actions.Long_exit.value and self._position == Positions.Long:
+                    if pnl > self.profit_aim * self.rr:
+                        factor *= self.rl_config['model_reward_parameters'].get('win_reward_factor', 2)
+                    return float(pnl * factor)
+                # close short
+                if action == Actions.Short_exit.value and self._position == Positions.Short:
+                    if pnl > self.profit_aim * self.rr:
+                        factor *= self.rl_config['model_reward_parameters'].get('win_reward_factor', 2)
+                    return float(pnl * factor)
+                return 0.
+```
+
+### Using Tensorboard
+
+Reinforcement Learning models benefit from tracking training metrics. FreqAI has integrated Tensorboard to allow users to track training and evaluation performance across all coins and across all retrainings. Tensorboard is activated via the following command:
+
+```bash
+cd freqtrade
+tensorboard --logdir user_data/models/unique-id
+```
+
+where `unique-id` is the `identifier` set in the `freqai` configuration file. This command must be run in a separate shell to view the output in your browser at 127.0.0.1:6006 (6006 is the default port used by Tensorboard).
+
+![tensorboard](assets/tensorboard.jpg)
+
+### Choosing a base environment
+
+FreqAI provides two base environments, `Base4ActionRLEnv` and `Base5ActionRLEnv`. As the names imply, the environments are customized for agents that can select from 4 or 5 actions. In the `Base4ActionRLEnv`, the agent can enter long, enter short, hold neutral, or exit position. Meanwhile, in the `Base5ActionRLEnv`, the agent has the same actions as Base4, but instead of a single exit action, it separates exit long and exit short. The main changes stemming from the environment selection include:
+
+* the actions available in the `calculate_reward`
+* the actions consumed by the user strategy
+
+Both of the FreqAI provided environments inherit from an action/position agnostic environment object called the `BaseEnvironment`, which contains all shared logic. The architecture is designed to be easily customized. The simplest customization is the `calculate_reward()` (see details [here](#creating-a-custom-reward-function)). However, the customizations can be further extended into any of the functions inside the environment. You can do this by simply overriding those functions inside your `MyRLEnv` in the prediction model file. Or for more advanced customizations, it is encouraged to create an entirely new environment inherited from `BaseEnvironment`.
+
+!!! Note
+    FreqAI does not provide by default, a long-only training environment. However, creating one should be as simple as copy-pasting one of the built in environments and removing the `short` actions (and all associated references to those).
--- a/freqtrade/constants.py
+++ b/freqtrade/constants.py
@ -578,9 +578,26 @@ CONF_SCHEMA = {
                    },
                },
                "model_training_parameters": {
+                    "type": "object"
+                },
+                "rl_config": {
                    "type": "object",
                    "properties": {
-                        "n_estimators": {"type": "integer", "default": 1000}
+                        "train_cycles": {"type": "integer"},
+                        "max_trade_duration_candles": {"type": "integer"},
+                        "add_state_info": {"type": "boolean", "default": False},
+                        "max_training_drawdown_pct": {"type": "number", "default": 0.02},
+                        "cpu_count": {"type": "integer", "default": 1},
+                        "model_type": {"type": "string", "default": "PPO"},
+                        "policy_type": {"type": "string", "default": "MlpPolicy"},
+                        "net_arch": {"type": "array", "default": [128, 128]},
+                        "model_reward_parameters": {
+                            "type": "object",
+                            "properties": {
+                                "rr": {"type": "number", "default": 1},
+                                "profit_aim": {"type": "number", "default": 0.025}
+                            }
+                        }
                    },
                },
            },
--- a/freqtrade/freqai/RL/Base4ActionRLEnv.py
+++ b/freqtrade/freqai/RL/Base4ActionRLEnv.py
@ -0,0 +1,135 @@
+import logging
+from enum import Enum
+
+from gym import spaces
+
+from freqtrade.freqai.RL.BaseEnvironment import BaseEnvironment, Positions
+
+
+logger = logging.getLogger(__name__)
+
+
+class Actions(Enum):
+    """
+    Discrete actions of the 4 action environment. The integer values are
+    the action ids that `Base4ActionRLEnv.step()` compares against.
+    """
+    Neutral = 0
+    Exit = 1
+    Long_enter = 2
+    Short_enter = 3
+
+
+class Base4ActionRLEnv(BaseEnvironment):
+    """
+    Base class for a 4 action environment
+    """
+
+    def set_action_space(self):
+        """
+        Define the discrete action space with one entry per member of `Actions`.
+        """
+        self.action_space = spaces.Discrete(len(Actions))
+
+    def step(self, action: int):
+        """
+        Logic for a single step (incrementing one candle in time)
+        by the agent
+        :param: action: int = the action type that the agent plans
+            to take for the current step.
+        :returns:
+            observation = current state of environment
+            step_reward = the reward from `calculate_reward()`
+            _done = if the agent "died" or if the candles finished
+            info = dict passed back to openai gym lib
+        """
+        self._done = False
+        self._current_tick += 1
+
+        if self._current_tick == self._end_tick:
+            self._done = True
+
+        self._update_unrealized_total_profit()
+
+        step_reward = self.calculate_reward(action)
+        self.total_reward += step_reward
+
+        trade_type = None
+        if self.is_tradesignal(action):
+            """
+            Action: Neutral, position: Long ->  Close Long
+            Action: Neutral, position: Short -> Close Short
+
+            Action: Long, position: Neutral -> Open Long
+            Action: Long, position: Short -> Close Short and Open Long
+
+            Action: Short, position: Neutral -> Open Short
+            Action: Short, position: Long -> Close Long and Open Short
+            """
+
+            if action == Actions.Neutral.value:
+                self._position = Positions.Neutral
+                trade_type = "neutral"
+                self._last_trade_tick = None
+            elif action == Actions.Long_enter.value:
+                self._position = Positions.Long
+                trade_type = "long"
+                self._last_trade_tick = self._current_tick
+            elif action == Actions.Short_enter.value:
+                self._position = Positions.Short
+                trade_type = "short"
+                self._last_trade_tick = self._current_tick
+            elif action == Actions.Exit.value:
+                self._update_total_profit()
+                self._position = Positions.Neutral
+                trade_type = "neutral"
+                self._last_trade_tick = None
+            else:
+                print("case not defined")
+
+            if trade_type is not None:
+                self.trade_history.append(
+                    {'price': self.current_price(), 'index': self._current_tick,
+                     'type': trade_type})
+
+        if self._total_profit < 1 - self.rl_config.get('max_training_drawdown_pct', 0.8):
+            self._done = True
+
+        self._position_history.append(self._position)
+
+        info = dict(
+            tick=self._current_tick,
+            total_reward=self.total_reward,
+            total_profit=self._total_profit,
+            position=self._position.value
+        )
+
+        observation = self._get_observation()
+
+        self._update_history(info)
+
+        return observation, step_reward, self._done, info
+
+    def is_tradesignal(self, action: int) -> bool:
+        """
+        Determine if the signal is a trade signal
+        e.g.: agent wants a Actions.Long_exit while it is in a Positions.short
+        """
+        return not ((action == Actions.Neutral.value and self._position == Positions.Neutral) or
+                    (action == Actions.Neutral.value and self._position == Positions.Short) or
+                    (action == Actions.Neutral.value and self._position == Positions.Long) or
+                    (action == Actions.Short_enter.value and self._position == Positions.Short) or
+                    (action == Actions.Short_enter.value and self._position == Positions.Long) or
+                    (action == Actions.Exit.value and self._position == Positions.Neutral) or
+                    (action == Actions.Long_enter.value and self._position == Positions.Long) or
+                    (action == Actions.Long_enter.value and self._position == Positions.Short))
+
+    def _is_valid(self, action: int) -> bool:
+        """
+        Determine if the signal is valid.
+        e.g.: agent wants a Actions.Long_exit while it is in a Positions.short
+        """
+        # Agent should only try to exit if it is in position
+        if action == Actions.Exit.value:
+            if self._position not in (Positions.Short, Positions.Long):
+                return False
+
+        # Agent should only try to enter if it is not in position
+        if action in (Actions.Short_enter.value, Actions.Long_enter.value):
+            if self._position != Positions.Neutral:
+                return False
+
+        return True
--- a/freqtrade/freqai/RL/Base5ActionRLEnv.py
+++ b/freqtrade/freqai/RL/Base5ActionRLEnv.py
@ -0,0 +1,145 @@
+import logging
+from enum import Enum
+
+from gym import spaces
+
+from freqtrade.freqai.RL.BaseEnvironment import BaseEnvironment, Positions
+
+
+logger = logging.getLogger(__name__)
+
+
+class Actions(Enum):
+    """
+    Discrete actions of the 5 action environment. The integer values are
+    the action ids that `Base5ActionRLEnv.step()` compares against.
+    """
+    Neutral = 0
+    Long_enter = 1
+    Long_exit = 2
+    Short_enter = 3
+    Short_exit = 4
+
+
+class Base5ActionRLEnv(BaseEnvironment):
+    """
+    Base class for a 5 action environment
+    """
+
+    def set_action_space(self):
+        """
+        Define the discrete action space with one entry per member of `Actions`.
+        """
+        self.action_space = spaces.Discrete(len(Actions))
+
+    def step(self, action: int):
+        """
+        Logic for a single step (incrementing one candle in time)
+        by the agent
+        :param: action: int = the action type that the agent plans
+            to take for the current step.
+        :returns:
+            observation = current state of environment
+            step_reward = the reward from `calculate_reward()`
+            _done = if the agent "died" or if the candles finished
+            info = dict passed back to openai gym lib
+        """
+        self._done = False
+        self._current_tick += 1
+
+        if self._current_tick == self._end_tick:
+            self._done = True
+
+        self._update_unrealized_total_profit()
+        step_reward = self.calculate_reward(action)
+        self.total_reward += step_reward
+
+        trade_type = None
+        if self.is_tradesignal(action):
+            """
+            Action: Neutral, position: Long ->  Close Long
+            Action: Neutral, position: Short -> Close Short
+
+            Action: Long, position: Neutral -> Open Long
+            Action: Long, position: Short -> Close Short and Open Long
+
+            Action: Short, position: Neutral -> Open Short
+            Action: Short, position: Long -> Close Long and Open Short
+            """
+
+            if action == Actions.Neutral.value:
+                self._position = Positions.Neutral
+                trade_type = "neutral"
+                self._last_trade_tick = None
+            elif action == Actions.Long_enter.value:
+                self._position = Positions.Long
+                trade_type = "long"
+                self._last_trade_tick = self._current_tick
+            elif action == Actions.Short_enter.value:
+                self._position = Positions.Short
+                trade_type = "short"
+                self._last_trade_tick = self._current_tick
+            elif action == Actions.Long_exit.value:
+                self._update_total_profit()
+                self._position = Positions.Neutral
+                trade_type = "neutral"
+                self._last_trade_tick = None
+            elif action == Actions.Short_exit.value:
+                self._update_total_profit()
+                self._position = Positions.Neutral
+                trade_type = "neutral"
+                self._last_trade_tick = None
+            else:
+                print("case not defined")
+
+            if trade_type is not None:
+                self.trade_history.append(
+                    {'price': self.current_price(), 'index': self._current_tick,
+                     'type': trade_type})
+
+        if (self._total_profit < self.max_drawdown or
+                self._total_unrealized_profit < self.max_drawdown):
+            self._done = True
+
+        self._position_history.append(self._position)
+
+        info = dict(
+            tick=self._current_tick,
+            total_reward=self.total_reward,
+            total_profit=self._total_profit,
+            position=self._position.value
+        )
+
+        observation = self._get_observation()
+
+        self._update_history(info)
+
+        return observation, step_reward, self._done, info
+
+    def is_tradesignal(self, action: int) -> bool:
+        """
+        Determine if the signal is a trade signal
+        e.g.: agent wants a Actions.Long_exit while it is in a Positions.short
+        """
+        return not ((action == Actions.Neutral.value and self._position == Positions.Neutral) or
+                    (action == Actions.Neutral.value and self._position == Positions.Short) or
+                    (action == Actions.Neutral.value and self._position == Positions.Long) or
+                    (action == Actions.Short_enter.value and self._position == Positions.Short) or
+                    (action == Actions.Short_enter.value and self._position == Positions.Long) or
+                    (action == Actions.Short_exit.value and self._position == Positions.Long) or
+                    (action == Actions.Short_exit.value and self._position == Positions.Neutral) or
+                    (action == Actions.Long_enter.value and self._position == Positions.Long) or
+                    (action == Actions.Long_enter.value and self._position == Positions.Short) or
+                    (action == Actions.Long_exit.value and self._position == Positions.Short) or
+                    (action == Actions.Long_exit.value and self._position == Positions.Neutral))
+
+    def _is_valid(self, action: int) -> bool:
+        # trade signal
+        """
+        Determine if the signal is valid.
+        e.g.: agent wants a Actions.Long_exit while it is in a Positions.short
+        """
+        # Agent should only try to exit if it is in position
+        if action in (Actions.Short_exit.value, Actions.Long_exit.value):
+            if self._position not in (Positions.Short, Positions.Long):
+                return False
+
+        # Agent should only try to enter if it is not in position
+        if action in (Actions.Short_enter.value, Actions.Long_enter.value):
+            if self._position != Positions.Neutral:
+                return False
+
+        return True
--- a/freqtrade/freqai/RL/BaseEnvironment.py
+++ b/freqtrade/freqai/RL/BaseEnvironment.py
@ -0,0 +1,302 @@
+import logging
+from abc import abstractmethod
+from enum import Enum
+from typing import Optional
+
+import gym
+import numpy as np
+import pandas as pd
+from gym import spaces
+from gym.utils import seeding
+from pandas import DataFrame
+
+from freqtrade.data.dataprovider import DataProvider
+
+
+logger = logging.getLogger(__name__)
+
+
+class Positions(Enum):
+    """
+    Market position held by the agent. The numeric value is what
+    `BaseEnvironment._get_observation()` writes to the `position` state column.
+    """
+    Short = 0
+    Long = 1
+    Neutral = 0.5
+
+    def opposite(self):
+        """
+        Return Positions.Short for Positions.Long. Every other member,
+        including Positions.Neutral, maps to Positions.Long.
+        """
+        return Positions.Short if self == Positions.Long else Positions.Long
+
+
+class BaseEnvironment(gym.Env):
+    """
+    Base class for environments. This class is agnostic to action count.
+    Inherited classes customize this to include varying action counts/types,
+    See RL/Base5ActionRLEnv.py and RL/Base4ActionRLEnv.py
+    """
+
+    def __init__(self, df: DataFrame = DataFrame(), prices: DataFrame = DataFrame(),
+                 reward_kwargs: dict = {}, window_size=10, starting_point=True,
+                 id: str = 'baseenv-1', seed: int = 1, config: dict = {},
+                 dp: Optional[DataProvider] = None):
+        """
+        Initializes the training/eval environment.
+        :param df: dataframe of features
+        :param prices: dataframe of prices to be used in the training environment
+        :param window_size: size of window (temporal) to pass to the agent
+        :param reward_kwargs: extra config settings assigned by user in `rl_config`.
+            `reset_env()` reads the `rr` and `profit_aim` keys from it.
+        :param starting_point: start at edge of window or not
+        :param id: string id of the environment (used in backend for multiprocessed env)
+        :param seed: Sets the seed of the environment higher in the gym.Env object
+        :param config: Typical user configuration file. Must contain
+            `config['freqai']['rl_config']` and `config['stake_amount']`.
+        :param dp: dataprovider from freqtrade
+        :raises KeyError: if `config` or `reward_kwargs` lack the keys listed above
+            (this includes the empty default dictionaries)
+        """
+        self.config = config
+        self.rl_config = config['freqai']['rl_config']
+        self.add_state_info = self.rl_config.get('add_state_info', False)
+        self.id = id
+        self.seed(seed)
+        self.reset_env(df, prices, window_size, reward_kwargs, starting_point)
+        # total profit starts at 1; this is the floor compared against it in `step()`
+        self.max_drawdown = 1 - self.rl_config.get('max_training_drawdown_pct', 0.8)
+        # profits are compounded across trades only for an 'unlimited' stake amount
+        self.compound_trades = config['stake_amount'] == 'unlimited'
+        # fee precedence: explicit config value, then the exchange fee of the first
+        # whitelisted pair, then a hard-coded fallback
+        if self.config.get('fee', None) is not None:
+            self.fee = self.config['fee']
+        elif dp is not None:
+            self.fee = dp._exchange.get_fee(symbol=dp.current_whitelist()[0])  # type: ignore
+        else:
+            self.fee = 0.0015
+
+    def reset_env(self, df: DataFrame, prices: DataFrame, window_size: int,
+                  reward_kwargs: dict, starting_point=True):
+        """
+        (Re)initializes the data, the action/observation spaces and the episode
+        state of the environment. Called from `__init__()`.
+        :param df: dataframe of features
+        :param prices: dataframe of prices to be used in the training environment
+        :param window_size: size of window (temporal) to pass to the agent
+        :param reward_kwargs: extra config settings assigned by user in `rl_config`,
+            must contain the keys `rr` and `profit_aim`
+        :param starting_point: start at edge of window or not
+        """
+        self.df = df
+        self.signal_features = self.df
+        self.prices = prices
+        self.window_size = window_size
+        self.starting_point = starting_point
+        self.rr = reward_kwargs["rr"]
+        self.profit_aim = reward_kwargs["profit_aim"]
+
+        # spaces
+        if self.add_state_info:
+            # three extra columns are appended in `_get_observation()`:
+            # current_profit_pct, position and trade_duration
+            self.total_features = self.signal_features.shape[1] + 3
+        else:
+            self.total_features = self.signal_features.shape[1]
+        self.shape = (window_size, self.total_features)
+        self.set_action_space()
+        self.observation_space = spaces.Box(
+            low=-1, high=1, shape=self.shape, dtype=np.float32)
+
+        # episode
+        # the first `window_size` candles are only used to fill the first observation
+        self._start_tick: int = self.window_size
+        self._end_tick: int = len(self.prices) - 1
+        self._done: bool = False
+        self._current_tick: int = self._start_tick
+        self._last_trade_tick: Optional[int] = None
+        self._position = Positions.Neutral
+        self._position_history: list = [None]
+        self.total_reward: float = 0
+        # profits are tracked as a multiple of the starting balance
+        self._total_profit: float = 1
+        self._total_unrealized_profit: float = 1
+        self.history: dict = {}
+        self.trade_history: list = []
+
+    @abstractmethod
+    def set_action_space(self):
+        """
+        Set `self.action_space`. Unique to the environment action count,
+        so it must be implemented by the inheriting environment.
+        """
+
+    def seed(self, seed: int = 1):
+        """
+        Create the environment's random number generator with gym's
+        `seeding.np_random()` and store it in `self.np_random`.
+        :param seed: seed passed to `seeding.np_random()`
+        :return: single element list holding the seed returned by gym
+        """
+        self.np_random, seed = seeding.np_random(seed)
+        return [seed]
+
+    def reset(self):
+
+        self._done = False
+
+        if self.starting_point is True:
+            self._position_history = (self._start_tick * [None]) + [self._position]
+        else:
+            self._position_history = (self.window_size * [None]) + [self._position]
+
+        self._current_tick = self._start_tick
+        self._last_trade_tick = None
+        self._position = Positions.Neutral
+
+        self.total_reward = 0.
+        self._total_profit = 1.  # unit
+        self.history = {}
+        self.trade_history = []
+        self.portfolio_log_returns = np.zeros(len(self.prices))
+
+        self._profits = [(self._start_tick, 1)]
+        self.close_trade_profit = []
+        self._total_unrealized_profit = 1
+
+        return self._get_observation()
+
+    @abstractmethod
+    def step(self, action: int):
+        """
+        Step depends on action types, this must be implemented by the
+        inheriting environment.
+        :param action: int = the action chosen by the agent for the current step
+        """
+        return
+
+    def _get_observation(self):
+        """
+        Build the observation for the current tick: the `window_size` feature
+        rows that precede `_current_tick`, optionally extended with state info.
+        This may or may not be independent of action types, user can inherit
+        this in their custom "MyRLEnv"
+        :return: DataFrame of the feature window. If `add_state_info` is set, the
+            columns `current_profit_pct`, `position` and `trade_duration` are appended.
+        """
+        # the window ends just before the current tick (slice end is exclusive)
+        features_window = self.signal_features[(
+            self._current_tick - self.window_size):self._current_tick]
+        if self.add_state_info:
+            features_and_state = DataFrame(np.zeros((len(features_window), 3)),
+                                           columns=['current_profit_pct',
+                                                    'position',
+                                                    'trade_duration'],
+                                           index=features_window.index)
+
+            # the same scalar state is written to every row of the window
+            features_and_state['current_profit_pct'] = self.get_unrealized_profit()
+            features_and_state['position'] = self._position.value
+            features_and_state['trade_duration'] = self.get_trade_duration()
+            features_and_state = pd.concat([features_window, features_and_state], axis=1)
+            return features_and_state
+        else:
+            return features_window
+
+    def get_trade_duration(self):
+        """
+        Get the trade duration if the agent is in a trade
+        """
+        if self._last_trade_tick is None:
+            return 0
+        else:
+            return self._current_tick - self._last_trade_tick
+
+    def get_unrealized_profit(self):
+        """
+        Get the unrealized profit if the agent is in a trade
+        """
+        if self._last_trade_tick is None:
+            return 0.
+
+        if self._position == Positions.Neutral:
+            return 0.
+        elif self._position == Positions.Short:
+            current_price = self.add_exit_fee(self.prices.iloc[self._current_tick].open)
+            last_trade_price = self.add_entry_fee(self.prices.iloc[self._last_trade_tick].open)
+            return (last_trade_price - current_price) / last_trade_price
+        elif self._position == Positions.Long:
+            current_price = self.add_entry_fee(self.prices.iloc[self._current_tick].open)
+            last_trade_price = self.add_exit_fee(self.prices.iloc[self._last_trade_tick].open)
+            return (current_price - last_trade_price) / last_trade_price
+        else:
+            return 0.
+
+    @abstractmethod
+    def is_tradesignal(self, action: int) -> bool:
+        """
+        Determine if the signal is a trade signal. This is
+        unique to the actions in the environment, and therefore must be
+        implemented by the inheriting environment.
+        :param action: int = the action chosen by the agent
+        """
+        return True
+
+    def _is_valid(self, action: int) -> bool:
+        """
+        Determine if the signal is valid. This is unique to the actions in
+        the environment, so inheriting environments are expected to override it.
+        This base implementation accepts every action.
+        :param action: int = the action chosen by the agent
+        """
+        return True
+
+    def add_entry_fee(self, price):
+        """
+        Return `price` increased by the fee, i.e. multiplied by (1 + fee).
+        """
+        return price * (1 + self.fee)
+
+    def add_exit_fee(self, price):
+        """
+        Return `price` reduced by the fee, i.e. divided by (1 + fee).
+        """
+        return price / (1 + self.fee)
+
+    def _update_history(self, info):
+        if not self.history:
+            self.history = {key: [] for key in info.keys()}
+
+        for key, value in info.items():
+            self.history[key].append(value)
+
+    @abstractmethod
+    def calculate_reward(self, action: int) -> float:
+        """
+        Reward function, to be implemented by the inheriting environment. This is
+        the one function that users will likely wish to inject their own
+        creativity into.
+        :param action: int = The action made by the agent for the current candle.
+        :return:
+        float = the reward to give to the agent for current step (used for optimization
+            of weights in NN)
+        """
+
+    def _update_unrealized_total_profit(self):
+        """
+        Update the unrealized total profit incase of episode end.
+        """
+        if self._position in (Positions.Long, Positions.Short):
+            pnl = self.get_unrealized_profit()
+            if self.compound_trades:
+                # assumes unit stake and compounding
+                unrl_profit = self._total_profit * (1 + pnl)
+            else:
+                # assumes unit stake and no compounding
+                unrl_profit = self._total_profit + pnl
+            self._total_unrealized_profit = unrl_profit
+
+    def _update_total_profit(self):
+        pnl = self.get_unrealized_profit()
+        if self.compound_trades:
+            # assumes unit stake and compounding
+            self._total_profit = self._total_profit * (1 + pnl)
+        else:
+            # assumes unit stake and no compounding
+            self._total_profit += pnl
+
+    def current_price(self) -> float:
+        """
+        Return the open price of the candle at the current tick.
+        """
+        return self.prices.iloc[self._current_tick].open
+
+    # Keeping around incase we want to start building more complex environment
+    # templates in the future.
+    # def most_recent_return(self):
+    #     """
+    #     Calculate the tick to tick return if in a trade.
+    #     Return is generated from rising prices in Long
+    #     and falling prices in Short positions.
+    #     The actions Sell/Buy or Hold during a Long position trigger the sell/buy-fee.
+    #     """
+    #     # Long positions
+    #     if self._position == Positions.Long:
+    #         current_price = self.prices.iloc[self._current_tick].open
+    #         previous_price = self.prices.iloc[self._current_tick - 1].open
+
+    #         if (self._position_history[self._current_tick - 1] == Positions.Short
+    #                 or self._position_history[self._current_tick - 1] == Positions.Neutral):
+    #             previous_price = self.add_entry_fee(previous_price)
+
+    #         return np.log(current_price) - np.log(previous_price)
+
+    #     # Short positions
+    #     if self._position == Positions.Short:
+    #         current_price = self.prices.iloc[self._current_tick].open
+    #         previous_price = self.prices.iloc[self._current_tick - 1].open
+    #         if (self._position_history[self._current_tick - 1] == Positions.Long
+    #                 or self._position_history[self._current_tick - 1] == Positions.Neutral):
+    #             previous_price = self.add_exit_fee(previous_price)
+
+    #         return np.log(previous_price) - np.log(current_price)
+
+    #     return 0
+
+    # def update_portfolio_log_returns(self, action):
+    #     self.portfolio_log_returns[self._current_tick] = self.most_recent_return(action)
--- a/freqtrade/freqai/RL/BaseReinforcementLearningModel.py
+++ b/freqtrade/freqai/RL/BaseReinforcementLearningModel.py
@ -0,0 +1,395 @@
+import importlib
+import logging
+from abc import abstractmethod
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Callable, Dict, Optional, Tuple, Type, Union
+
+import gym
+import numpy as np
+import numpy.typing as npt
+import pandas as pd
+import torch as th
+import torch.multiprocessing
+from pandas import DataFrame
+from stable_baselines3.common.callbacks import EvalCallback
+from stable_baselines3.common.monitor import Monitor
+from stable_baselines3.common.utils import set_random_seed
+from stable_baselines3.common.vec_env import SubprocVecEnv
+
+from freqtrade.exceptions import OperationalException
+from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
+from freqtrade.freqai.freqai_interface import IFreqaiModel
+from freqtrade.freqai.RL.Base5ActionRLEnv import Actions, Base5ActionRLEnv
+from freqtrade.freqai.RL.BaseEnvironment import Positions
+from freqtrade.persistence import Trade
+
+
+logger = logging.getLogger(__name__)
+
+torch.multiprocessing.set_sharing_strategy('file_system')
+
+SB3_MODELS = ['PPO', 'A2C', 'DQN']
+SB3_CONTRIB_MODELS = ['TRPO', 'ARS', 'RecurrentPPO', 'MaskablePPO']
+
+
+class BaseReinforcementLearningModel(IFreqaiModel):
+    """
+    User created Reinforcement Learning Model prediction class
+    """
+
+    def __init__(self, **kwargs) -> None:
+        super().__init__(config=kwargs['config'])
+        self.max_threads = min(self.freqai_info['rl_config'].get(
+            'cpu_count', 1), max(int(self.max_system_threads / 2), 1))
+        th.set_num_threads(self.max_threads)
+        self.reward_params = self.freqai_info['rl_config']['model_reward_parameters']
+        self.train_env: Union[SubprocVecEnv, gym.Env] = None
+        self.eval_env: Union[SubprocVecEnv, gym.Env] = None
+        self.eval_callback: Optional[EvalCallback] = None
+        self.model_type = self.freqai_info['rl_config']['model_type']
+        self.rl_config = self.freqai_info['rl_config']
+        self.continual_learning = self.freqai_info.get('continual_learning', False)
+        if self.model_type in SB3_MODELS:
+            import_str = 'stable_baselines3'
+        elif self.model_type in SB3_CONTRIB_MODELS:
+            import_str = 'sb3_contrib'
+        else:
+            raise OperationalException(f'{self.model_type} not available in stable_baselines3 or '
+                                       f'sb3_contrib. please choose one of {SB3_MODELS} or '
+                                       f'{SB3_CONTRIB_MODELS}')
+
+        mod = importlib.import_module(import_str, self.model_type)
+        self.MODELCLASS = getattr(mod, self.model_type)
+        self.policy_type = self.freqai_info['rl_config']['policy_type']
+        self.unset_outlier_removal()
+        self.net_arch = self.rl_config.get('net_arch', [128, 128])
+
+    def unset_outlier_removal(self):
+        """
+        If user has activated any function that may remove training points, this
+        function will set them to false and warn them
+        """
+        if self.ft_params.get('use_SVM_to_remove_outliers', False):
+            self.ft_params.update({'use_SVM_to_remove_outliers': False})
+            logger.warning('User tried to use SVM with RL. Deactivating SVM.')
+        if self.ft_params.get('use_DBSCAN_to_remove_outliers', False):
+            self.ft_params.update({'use_DBSCAN_to_remove_outliers': False})
+            logger.warning('User tried to use DBSCAN with RL. Deactivating DBSCAN.')
+        if self.freqai_info['data_split_parameters'].get('shuffle', False):
+            self.freqai_info['data_split_parameters'].update({'shuffle': False})
+            logger.warning('User tried to shuffle training data. Setting shuffle to False')
+
+    def train(
+        self, unfiltered_df: DataFrame, pair: str, dk: FreqaiDataKitchen, **kwargs
+    ) -> Any:
+        """
+        Filter the training data and train a model to it. Train makes heavy use of the datakitchen
+        for storing, saving, loading, and analyzing the data.
+        :param unfiltered_df: Full dataframe for the current training period
+        :param pair: pair the model is trained for
+        :param dk: FreqaiDataKitchen = the datakitchen for the current pair
+        :returns:
+        :model: Trained model which can be used to inference (self.predict)
+        """
+
+        logger.info("--------------------Starting training " f"{pair} --------------------")
+
+        features_filtered, labels_filtered = dk.filter_features(
+            unfiltered_df,
+            dk.training_features_list,
+            dk.label_list,
+            training_filter=True,
+        )
+
+        data_dictionary: Dict[str, Any] = dk.make_train_test_datasets(
+            features_filtered, labels_filtered)
+        dk.fit_labels()  # FIXME useless for now, but just satiating append methods
+
+        # build the price dataframes for the environments, then
+        # normalize all data based on train_dataset only
+        prices_train, prices_test = self.build_ohlc_price_dataframes(dk.data_dictionary, pair, dk)
+        data_dictionary = dk.normalize_data(data_dictionary)
+
+        # data cleaning/analysis
+        self.data_cleaning_train(dk)
+
+        logger.info(
+            f'Training model on {len(dk.data_dictionary["train_features"].columns)}'
+            f' features and {len(data_dictionary["train_features"])} data points'
+        )
+
+        # the environments have to exist before `fit()` is called
+        self.set_train_and_eval_environments(data_dictionary, prices_train, prices_test, dk)
+
+        model = self.fit(data_dictionary, dk)
+
+        logger.info(f"--------------------done training {pair}--------------------")
+
+        return model
+
+    def set_train_and_eval_environments(self, data_dictionary: Dict[str, DataFrame],
+                                        prices_train: DataFrame, prices_test: DataFrame,
+                                        dk: FreqaiDataKitchen):
+        """
+        User can override this if they are using a custom MyRLEnv
+        :param data_dictionary: dict = common data dictionary containing train and test
+            features/labels/weights.
+        :param prices_train/test: DataFrame = dataframe comprised of the prices to be used in the
+            environment during training or testing
+        :param dk: FreqaiDataKitchen = the datakitchen for the current pair
+        """
+        train_df = data_dictionary["train_features"]
+        test_df = data_dictionary["test_features"]
+
+        self.train_env = self.MyRLEnv(df=train_df,
+                                      prices=prices_train,
+                                      window_size=self.CONV_WIDTH,
+                                      reward_kwargs=self.reward_params,
+                                      config=self.config,
+                                      dp=self.data_provider)
+        self.eval_env = Monitor(self.MyRLEnv(df=test_df,
+                                             prices=prices_test,
+                                             window_size=self.CONV_WIDTH,
+                                             reward_kwargs=self.reward_params,
+                                             config=self.config,
+                                             dp=self.data_provider))
+        self.eval_callback = EvalCallback(self.eval_env, deterministic=True,
+                                          render=False, eval_freq=len(train_df),
+                                          best_model_save_path=str(dk.data_path))
+
+    @abstractmethod
+    def fit(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen, **kwargs):
+        """
+        Abstract training hook: agent construction and any Reinforcement Learning
+        customizations belong in the overriding implementation. A user class must
+        override this method; the base implementation does nothing.
+        :param data_dictionary: dict = common data dictionary for the current training run
+        :param dk: FreqaiDataKitchen = the datakitchen for the current pair
+        """
+        return None
+
+    def get_state_info(self, pair: str) -> Tuple[float, float, int]:
+        """
+        State info during dry/live (not backtesting) which is fed back
+        into the model.
+        :param pair: str = COIN/STAKE to get the environment information for
+        :return:
+        :market_side: float = 0.5 when there is no open trade for the pair (neutral),
+            0 for an open short and 1 for an open long
+        :current_profit: float = unrealized profit ratio of the current trade
+        :trade_duration: int = the number of candles that the trade has
+            been open for
+        If no exchange is available, an error is logged and (0, 0, 0) is returned.
+        """
+        open_trades = Trade.get_trades_proxy(is_open=True)
+        market_side = 0.5
+        current_profit: float = 0
+        trade_duration = 0
+        for trade in open_trades:
+            if trade.pair != pair:
+                continue
+
+            # data_provider is Optional and may itself be None, so guard both levels
+            exchange = getattr(self.data_provider, '_exchange', None)
+            if exchange is None:
+                logger.error('No exchange available.')
+                return 0, 0, 0
+
+            current_rate = exchange.get_rate(
+                pair, refresh=False, side="exit", is_short=trade.is_short)
+
+            now = datetime.now(timezone.utc).timestamp()
+            trade_duration = int((now - trade.open_date_utc.timestamp()) / self.base_tf_seconds)
+            current_profit = trade.calc_profit_ratio(current_rate)
+            # report the side of the open trade instead of always reporting neutral
+            market_side = 0 if trade.is_short else 1
+
+        return market_side, current_profit, int(trade_duration)
+
+    def predict(
+        self, unfiltered_df: DataFrame, dk: FreqaiDataKitchen, **kwargs
+    ) -> Tuple[DataFrame, npt.NDArray[np.int_]]:
+        """
+        Filter and normalize the prediction features, then run the RL model on them.
+        :param unfiltered_df: Full dataframe for the current backtest period.
+        :param dk: FreqaiDataKitchen = the datakitchen for the current pair
+        :return:
+        :pred_df: dataframe containing the predictions, with NaN values replaced by 0
+        :do_predict: np.array of 1s and 0s to indicate places where freqai needed to remove
+        data (NaNs) or felt uncertain about data (PCA and DI index)
+        """
+
+        dk.find_features(unfiltered_df)
+        features, _ = dk.filter_features(
+            unfiltered_df, dk.training_features_list, training_filter=False)
+        dk.data_dictionary["prediction_features"] = dk.normalize_data_from_metadata(features)
+
+        # optional additional data cleaning/analysis
+        self.data_cleaning_predict(dk)
+
+        # re-read the features, the cleaning step works on the data kitchen
+        prediction_features = dk.data_dictionary["prediction_features"]
+        pred_df = self.rl_model_predict(prediction_features, dk, self.model)
+        pred_df.fillna(0, inplace=True)
+
+        return pred_df, dk.do_predict
+
+    def rl_model_predict(self, dataframe: DataFrame,
+                         dk: FreqaiDataKitchen, model: Any) -> DataFrame:
+        """
+        A helper function to make predictions in the Reinforcement learning module.
+        The model is called once per rolling window of CONV_WIDTH feature rows.
+        :param dataframe: DataFrame = the dataframe of features to make the predictions on
+        :param dk: FreqaiDatakitchen = data kitchen for the current pair
+        :param model: Any = the trained model used to inference the features.
+        :return: DataFrame = one row per feature row, with dk.label_list as columns
+        """
+        # zero-filled carrier frame: only its index is used to drive the rolling windows.
+        # NOTE(review): np.zeros(...) is 1-D, so this presumably assumes dk.label_list
+        # holds a single label - confirm.
+        output = pd.DataFrame(np.zeros(len(dataframe)), columns=dk.label_list)
+
+        def _predict(window):
+            # the labels of the current window are used as positions into the features
+            observations = dataframe.iloc[window.index]
+            if self.live and self.rl_config.get('add_state_info', False):
+                # in dry/live mode, append the current trade state to the observation
+                market_side, current_profit, trade_duration = self.get_state_info(dk.pair)
+                observations['current_profit_pct'] = current_profit
+                observations['position'] = market_side
+                observations['trade_duration'] = trade_duration
+            res, _ = model.predict(observations, deterministic=True)
+            return res
+
+        # NOTE(review): rows before the first complete window are presumably left as NaN
+        # by rolling() - confirm against the pandas min_periods default.
+        output = output.rolling(window=self.CONV_WIDTH).apply(_predict)
+
+        return output
+
+    def build_ohlc_price_dataframes(self, data_dictionary: dict,
+                                    pair: str, dk: FreqaiDataKitchen) -> Tuple[DataFrame,
+                                                                               DataFrame]:
+        """
+        Builds the train prices and test prices for the environment.
+        The raw price feature columns of the pair are selected from the train and test
+        features, renamed to open/low/high/close and given a fresh 0-based index.
+        :param data_dictionary: dict = must contain "train_features" and "test_features"
+        :param pair: str = pair whose raw price columns are extracted (':' is stripped)
+        :param dk: FreqaiDataKitchen = the datakitchen for the current pair (unused here)
+        :return: tuple of (prices_train, prices_test)
+        :raises OperationalException: if no raw price column is found in the train features
+        """
+
+        pair = pair.replace(':', '')
+        train_df = data_dictionary["train_features"]
+        test_df = data_dictionary["test_features"]
+
+        # price data for model training and evaluation
+        tf = self.config['timeframe']
+        rename_dict = {f'%-{pair}raw_open_{tf}': 'open', f'%-{pair}raw_low_{tf}': 'low',
+                       f'%-{pair}raw_high_{tf}': 'high', f'%-{pair}raw_close_{tf}': 'close'}
+        ohlc_list = list(rename_dict)
+
+        prices_train = train_df.filter(ohlc_list, axis=1)
+        if prices_train.empty:
+            raise OperationalException('Reinforcement learning module didnt find the raw prices '
+                                       'assigned in populate_any_indicators. Please assign them '
+                                       'with:\n'
+                                       'informative[f"%-{pair}raw_close"] = informative["close"]\n'
+                                       'informative[f"%-{pair}raw_open"] = informative["open"]\n'
+                                       'informative[f"%-{pair}raw_high"] = informative["high"]\n'
+                                       'informative[f"%-{pair}raw_low"] = informative["low"]\n')
+        # rename() and reset_index() return new frames, so the result has to be kept
+        prices_train = prices_train.rename(columns=rename_dict).reset_index(drop=True)
+
+        prices_test = test_df.filter(ohlc_list, axis=1)
+        prices_test = prices_test.rename(columns=rename_dict).reset_index(drop=True)
+
+        return prices_train, prices_test
+
+    def load_model_from_disk(self, dk: FreqaiDataKitchen) -> Any:
+        """
+        Can be used by user if they are trying to limit_ram_usage *and*
+        perform continual learning.
+        For now, this is unused.
+        :param dk: FreqaiDataKitchen = provides data_path and model_filename of the model
+        :return: the loaded model, or None if no saved model file was found
+        """
+        model_stem = dk.data_path / f"{dk.model_filename}_model"
+        # models are saved as "<model_filename>_model.zip"; the bare name is accepted as well
+        candidates = (Path(f"{model_stem}.zip"), Path(model_stem))
+        if not any(candidate.is_file() for candidate in candidates):
+            logger.info('No model file on disk to continue learning from.')
+            return None
+
+        return self.MODELCLASS.load(model_stem)
+
+    def _on_stop(self):
+        """
+        Hook called on bot shutdown. Closes the training environment first and the
+        evaluation environment second (skipping any that is falsy), so that
+        SubprocVecEnv subprocesses are shut down cleanly.
+        """
+
+        for env_attribute in ("train_env", "eval_env"):
+            environment = getattr(self, env_attribute)
+            if environment:
+                environment.close()
+
+    # Nested class which can be overridden by user to customize further
+    class MyRLEnv(Base5ActionRLEnv):
+        """
+        Default environment of the model. Any function of BaseRLEnv and gym.Env can be
+        overridden by the user; this default only defines a reward built from profit
+        and trade duration.
+        """
+
+        def calculate_reward(self, action: int) -> float:
+            """
+            An example reward function. This is the one function that users will likely
+            wish to inject their own creativity into.
+            :param action: int = The action made by the agent for the current candle.
+            :return:
+            float = the reward to give to the agent for current step (used for optimization
+                of weights in NN)
+            """
+            # invalid actions are penalized before anything else is looked at
+            if not self._is_valid(action):
+                return -2
+
+            pnl = self.get_unrealized_profit()
+            factor = 100.
+
+            if self._position == Positions.Neutral:
+                # reward agent for entering trades
+                if action in (Actions.Long_enter.value, Actions.Short_enter.value):
+                    return 25
+                # discourage agent from not entering trades
+                if action == Actions.Neutral.value:
+                    return -1
+
+            max_trade_duration = self.rl_config.get('max_trade_duration_candles', 300)
+            trade_duration = 0
+            if self._last_trade_tick:
+                trade_duration = self._current_tick - self._last_trade_tick
+
+            # trades within the maximum duration get a larger factor than overlong ones
+            if trade_duration > max_trade_duration:
+                factor *= 0.5
+            elif trade_duration <= max_trade_duration:
+                factor *= 1.5
+
+            # discourage sitting in position
+            holding_position = self._position in (Positions.Short, Positions.Long)
+            if holding_position and action == Actions.Neutral.value:
+                return -1 * trade_duration / max_trade_duration
+
+            # closing a long and closing a short are rewarded by the same rule
+            closes_long = action == Actions.Long_exit.value and self._position == Positions.Long
+            closes_short = (action == Actions.Short_exit.value
+                            and self._position == Positions.Short)
+            if closes_long or closes_short:
+                if pnl > self.profit_aim * self.rr:
+                    factor *= self.rl_config['model_reward_parameters'].get('win_reward_factor', 2)
+                return float(pnl * factor)
+
+            return 0.
+
+
+def make_env(MyRLEnv: Type[gym.Env], env_id: str, rank: int,
+             seed: int, train_df: DataFrame, price: DataFrame,
+             reward_params: Dict[str, int], window_size: int, monitor: bool = False,
+             config: Dict[str, Any] = {}) -> Callable:
+    """
+    Utility function for multiprocessed env: seeds the RNG and returns a factory which
+    builds a single environment.
+
+    :param MyRLEnv: (Type[gym.Env]) the environment class to instantiate
+    :param env_id: (str) the environment ID
+    :param rank: (int) index of the subprocess, added to the seed given to the environment
+    :param seed: (int) the initial seed for RNG
+    :param train_df: (DataFrame) passed to the environment as `df`
+    :param price: (DataFrame) passed to the environment as `prices`
+    :param reward_params: (dict) passed to the environment as `reward_kwargs`
+    :param window_size: (int) passed through to the environment
+    :param monitor: (bool) wrap the created environment in Monitor when True
+    :param config: (dict) passed through to the environment; the default is one shared
+        dict object, so it should not be mutated
+    :return: (Callable) zero-argument function which creates the environment
+    """
+    set_random_seed(seed)
+
+    def _init() -> gym.Env:
+        created_env = MyRLEnv(df=train_df, prices=price, window_size=window_size,
+                              reward_kwargs=reward_params, id=env_id, seed=seed + rank,
+                              config=config)
+        return Monitor(created_env) if monitor else created_env
+
+    return _init
--- a/freqtrade/freqai/RL/init.py
+++ b/freqtrade/freqai/RL/init.py
--- a/freqtrade/freqai/data_drawer.py
+++ b/freqtrade/freqai/data_drawer.py
@ -1,4 +1,5 @@
 import collections
+import importlib
 import logging
 import re
 import shutil
@ -98,6 +99,12 @@ class FreqaiDataDrawer:
        self.empty_pair_dict: pair_info = {
                "model_filename": "", "trained_timestamp": 0,
                "data_path": "", "extras": {}}
+        if 'Reinforcement' in self.config['freqaimodel']:
+            self.model_type = 'stable_baselines'
+            logger.warning('User passed a ReinforcementLearner model, FreqAI will '
+                           'now use stable_baselines3 to save models.')
+        else:
+            self.model_type = self.freqai_info.get('model_save_type', 'joblib')

    def update_metric_tracker(self, metric: str, value: float, pair: str) -> None:
        """
@ -476,10 +483,12 @@ class FreqaiDataDrawer:
        save_path = Path(dk.data_path)

        # Save the trained model
-        if not dk.keras:
+        if self.model_type == 'joblib':
            dump(model, save_path / f"{dk.model_filename}_model.joblib")
-        else:
+        elif self.model_type == 'keras':
            model.save(save_path / f"{dk.model_filename}_model.h5")
+        elif 'stable_baselines' in self.model_type:
+            model.save(save_path / f"{dk.model_filename}_model.zip")

        if dk.svm_model is not None:
            dump(dk.svm_model, save_path / f"{dk.model_filename}_svm_model.joblib")
@ -506,11 +515,10 @@ class FreqaiDataDrawer:
                dk.pca, open(dk.data_path / f"{dk.model_filename}_pca_object.pkl", "wb")
            )

-        # if self.live:
-        # store as much in ram as possible to increase performance
        self.model_dictionary[coin] = model
        self.pair_dict[coin]["model_filename"] = dk.model_filename
        self.pair_dict[coin]["data_path"] = str(dk.data_path)
+
        if coin not in self.meta_data_dictionary:
            self.meta_data_dictionary[coin] = {}
        self.meta_data_dictionary[coin]["train_df"] = dk.data_dictionary["train_features"]
@ -542,14 +550,6 @@ class FreqaiDataDrawer:
        if dk.live:
            dk.model_filename = self.pair_dict[coin]["model_filename"]
            dk.data_path = Path(self.pair_dict[coin]["data_path"])
-            if self.freqai_info.get("follow_mode", False):
-                # follower can be on a different system which is rsynced from the leader:
-                dk.data_path = Path(
-                    self.config["user_data_dir"]
-                    / "models"
-                    / dk.data_path.parts[-2]
-                    / dk.data_path.parts[-1]
-                )

        if coin in self.meta_data_dictionary:
            dk.data = self.meta_data_dictionary[coin]["meta_data"]
@ -568,12 +568,16 @@ class FreqaiDataDrawer:
        # try to access model in memory instead of loading object from disk to save time
        if dk.live and coin in self.model_dictionary:
            model = self.model_dictionary[coin]
-        elif not dk.keras:
+        elif self.model_type == 'joblib':
            model = load(dk.data_path / f"{dk.model_filename}_model.joblib")
-        else:
+        elif self.model_type == 'keras':
            from tensorflow import keras
-
            model = keras.models.load_model(dk.data_path / f"{dk.model_filename}_model.h5")
+        elif self.model_type == 'stable_baselines':
+            mod = importlib.import_module(
+                'stable_baselines3', self.freqai_info['rl_config']['model_type'])
+            MODELCLASS = getattr(mod, self.freqai_info['rl_config']['model_type'])
+            model = MODELCLASS.load(dk.data_path / f"{dk.model_filename}_model")

        if Path(dk.data_path / f"{dk.model_filename}_svm_model.joblib").is_file():
            dk.svm_model = load(dk.data_path / f"{dk.model_filename}_svm_model.joblib")
@ -583,6 +587,10 @@ class FreqaiDataDrawer:
                f"Unable to load model, ensure model exists at " f"{dk.data_path} "
            )

+        # load it into ram if it was loaded from disk
+        if coin not in self.model_dictionary:
+            self.model_dictionary[coin] = model
+
        if self.config["freqai"]["feature_parameters"]["principal_component_analysis"]:
            dk.pca = cloudpickle.load(
                open(dk.data_path / f"{dk.model_filename}_pca_object.pkl", "rb")
--- a/freqtrade/freqai/data_kitchen.py
+++ b/freqtrade/freqai/data_kitchen.py
@ -9,6 +9,7 @@ from typing import Any, Dict, List, Tuple
 import numpy as np
 import numpy.typing as npt
 import pandas as pd
+import psutil
 from pandas import DataFrame
 from scipy import stats
 from sklearn import linear_model
@ -102,7 +103,10 @@ class FreqaiDataKitchen:
                )

        self.data['extra_returns_per_train'] = self.freqai_config.get('extra_returns_per_train', {})
-        self.thread_count = self.freqai_config.get("data_kitchen_thread_count", -1)
+        if not self.freqai_config.get("data_kitchen_thread_count", 0):
+            self.thread_count = max(int(psutil.cpu_count() * 2 - 2), 1)
+        else:
+            self.thread_count = self.freqai_config["data_kitchen_thread_count"]
        self.train_dates: DataFrame = pd.DataFrame()
        self.unique_classes: Dict[str, list] = {}
        self.unique_class_list: list = []
--- a/freqtrade/freqai/freqai_interface.py
+++ b/freqtrade/freqai/freqai_interface.py
@ -5,15 +5,17 @@ from abc import ABC, abstractmethod
 from collections import deque
 from datetime import datetime, timezone
 from pathlib import Path
-from typing import Any, Dict, List, Literal, Tuple
+from typing import Any, Dict, List, Literal, Optional, Tuple

 import numpy as np
 import pandas as pd
+import psutil
 from numpy.typing import NDArray
 from pandas import DataFrame

 from freqtrade.configuration import TimeRange
 from freqtrade.constants import Config
+from freqtrade.data.dataprovider import DataProvider
 from freqtrade.enums import RunMode
 from freqtrade.exceptions import OperationalException
 from freqtrade.exchange import timeframe_to_seconds
@ -98,6 +100,8 @@ class IFreqaiModel(ABC):
        self.get_corr_dataframes: bool = True
        self._threads: List[threading.Thread] = []
        self._stop_event = threading.Event()
+        self.data_provider: Optional[DataProvider] = None
+        self.max_system_threads = max(int(psutil.cpu_count() * 2 - 2), 1)

        record_params(config, self.full_path)

@ -126,6 +130,7 @@ class IFreqaiModel(ABC):

        self.live = strategy.dp.runmode in (RunMode.DRY_RUN, RunMode.LIVE)
        self.dd.set_pair_dict_info(metadata)
+        self.data_provider = strategy.dp

        if self.live:
            self.inference_timer('start')
@ -164,6 +169,13 @@ class IFreqaiModel(ABC):
        self.model = None
        self.dk = None

+    def _on_stop(self):
+        """
+        No-op shutdown hook. Subclasses override it to release their own resources
+        when SIGINT is sent; shutdown() calls it after setting the stop event.
+        """
+        pass
+
    def shutdown(self):
        """
        Cleans up threads on Shutdown, set stop event. Join threads to wait
@ -172,6 +184,9 @@ class IFreqaiModel(ABC):
        logger.info("Stopping FreqAI")
        self._stop_event.set()

+        self.data_provider = None
+        self._on_stop()
+
        logger.info("Waiting on Training iteration")
        for _thread in self._threads:
            _thread.join()
--- a/freqtrade/freqai/prediction_models/ReinforcementLearner.py
+++ b/freqtrade/freqai/prediction_models/ReinforcementLearner.py
@ -0,0 +1,141 @@
+import logging
+from pathlib import Path
+from typing import Any, Dict
+
+import torch as th
+
+from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
+from freqtrade.freqai.RL.Base5ActionRLEnv import Actions, Base5ActionRLEnv, Positions
+from freqtrade.freqai.RL.BaseReinforcementLearningModel import BaseReinforcementLearningModel
+
+
+logger = logging.getLogger(__name__)
+
+
+class ReinforcementLearner(BaseReinforcementLearningModel):
+    """
+    Reinforcement Learning Model prediction model.
+
+    Users can inherit from this class to make their own RL model with custom
+    environment/training controls. Define the file as follows:
+
+    ```
+    from freqtrade.freqai.prediction_models.ReinforcementLearner import ReinforcementLearner
+
+    class MyCoolRLModel(ReinforcementLearner):
+    ```
+
+    Save the file to `user_data/freqaimodels`, then run it with:
+
+    freqtrade trade --freqaimodel MyCoolRLModel --config config.json --strategy SomeCoolStrat
+
+    Here the users can override any of the functions
+    available in the `IFreqaiModel` inheritance tree. Most importantly for RL, this
+    is where the user overrides `MyRLEnv` (see below), to define custom
+    `calculate_reward()` function, or to override any other parts of the environment.
+
+    This class also allows users to override any other part of the IFreqaiModel tree.
+    For example, the user can override `def fit()` or `def train()` or `def predict()`
+    to take fine-tuned control over these processes.
+
+    Another common override may be `def data_cleaning_predict()` where the user can
+    take fine-tuned control over the data handling pipeline.
+    """
+
+    def fit(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen, **kwargs):
+        """
+        User customizable fit method
+        :param data_dictionary: dict = common data dictionary containing all train/test
+            features/labels/weights.
+        :param dk: FreqaiDatakitchen = data kitchen for current pair.
+        :return:
+        model Any = trained model to be used for inference in dry/live/backtesting.
+            This is the best model saved by the evaluation callback if one exists on
+            disk, otherwise the model as it is after the final training step.
+        """
+        train_df = data_dictionary["train_features"]
+        total_timesteps = self.freqai_info["rl_config"]["train_cycles"] * len(train_df)
+
+        policy_kwargs = dict(activation_fn=th.nn.ReLU,
+                             net_arch=self.net_arch)
+
+        if dk.pair not in self.dd.model_dictionary or not self.continual_learning:
+            # training parameters are optional; fall back to the model class defaults
+            model = self.MODELCLASS(self.policy_type, self.train_env, policy_kwargs=policy_kwargs,
+                                    tensorboard_log=Path(
+                                        dk.full_path / "tensorboard" / dk.pair.split('/')[0]),
+                                    **self.freqai_info.get('model_training_parameters', {})
+                                    )
+        else:
+            logger.info('Continual training activated - starting training from previously '
+                        'trained agent.')
+            model = self.dd.model_dictionary[dk.pair]
+            model.set_env(self.train_env)
+
+        model.learn(
+            total_timesteps=int(total_timesteps),
+            callback=self.eval_callback
+        )
+
+        if Path(dk.data_path / "best_model.zip").is_file():
+            logger.info('Callback found a best model.')
+            best_model = self.MODELCLASS.load(dk.data_path / "best_model")
+            return best_model
+
+        logger.info('Couldnt find best model, using final model instead.')
+
+        return model
+
+    class MyRLEnv(Base5ActionRLEnv):
+        """
+        User can override any function in BaseRLEnv and gym.Env. Here the user
+        sets a custom reward based on profit and trade duration.
+        """
+
+        def calculate_reward(self, action: int) -> float:
+            """
+            An example reward function. This is the one function that users will likely
+            wish to inject their own creativity into.
+            :param action: int = The action made by the agent for the current candle.
+            :return:
+            float = the reward to give to the agent for current step (used for optimization
+                of weights in NN)
+            """
+            # first, penalize if the action is not valid
+            if not self._is_valid(action):
+                return -2
+
+            pnl = self.get_unrealized_profit()
+            factor = 100.
+
+            # reward agent for entering trades
+            if (action in (Actions.Long_enter.value, Actions.Short_enter.value)
+                    and self._position == Positions.Neutral):
+                return 25
+            # discourage agent from not entering trades
+            if action == Actions.Neutral.value and self._position == Positions.Neutral:
+                return -1
+
+            max_trade_duration = self.rl_config.get('max_trade_duration_candles', 300)
+            # same guard as the default environment of the base model: without a
+            # recorded trade tick there is no duration to measure
+            if self._last_trade_tick:
+                trade_duration = self._current_tick - self._last_trade_tick
+            else:
+                trade_duration = 0
+
+            if trade_duration <= max_trade_duration:
+                factor *= 1.5
+            elif trade_duration > max_trade_duration:
+                factor *= 0.5
+
+            # discourage sitting in position
+            if (self._position in (Positions.Short, Positions.Long) and
+                    action == Actions.Neutral.value):
+                return -1 * trade_duration / max_trade_duration
+
+            # close long
+            if action == Actions.Long_exit.value and self._position == Positions.Long:
+                if pnl > self.profit_aim * self.rr:
+                    factor *= self.rl_config['model_reward_parameters'].get('win_reward_factor', 2)
+                return float(pnl * factor)
+
+            # close short
+            if action == Actions.Short_exit.value and self._position == Positions.Short:
+                if pnl > self.profit_aim * self.rr:
+                    factor *= self.rl_config['model_reward_parameters'].get('win_reward_factor', 2)
+                return float(pnl * factor)
+
+            return 0.
--- a/freqtrade/freqai/prediction_models/ReinforcementLearner_multiproc.py
+++ b/freqtrade/freqai/prediction_models/ReinforcementLearner_multiproc.py
@ -0,0 +1,51 @@
+import logging
+from typing import Any, Dict  # , Tuple
+
+# import numpy.typing as npt
+from pandas import DataFrame
+from stable_baselines3.common.callbacks import EvalCallback
+from stable_baselines3.common.vec_env import SubprocVecEnv
+
+from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
+from freqtrade.freqai.prediction_models.ReinforcementLearner import ReinforcementLearner
+from freqtrade.freqai.RL.BaseReinforcementLearningModel import make_env
+
+
+logger = logging.getLogger(__name__)
+
+
+class ReinforcementLearner_multiproc(ReinforcementLearner):
+    """
+    Demonstration of how to build vectorized environments
+    """
+
+    def set_train_and_eval_environments(self, data_dictionary: Dict[str, Any],
+                                        prices_train: DataFrame, prices_test: DataFrame,
+                                        dk: FreqaiDataKitchen):
+        """
+        Build vectorized (one subprocess per thread) train and evaluation environments
+        plus the evaluation callback, and store them on the instance.
+        User can override this if they are using a custom MyRLEnv
+        :param data_dictionary: dict = common data dictionary containing train and test
+            features/labels/weights.
+        :param prices_train/test: DataFrame = dataframe comprised of the prices to be used in
+            the environment during training or testing
+        :param dk: FreqaiDataKitchen = the datakitchen for the current pair
+        """
+        train_df = data_dictionary["train_features"]
+        test_df = data_dictionary["test_features"]
+        num_envs = self.max_threads
+
+        env_id = "train_env"
+        self.train_env = SubprocVecEnv([make_env(self.MyRLEnv, env_id, i, 1, train_df, prices_train,
+                                        self.reward_params, self.CONV_WIDTH, monitor=True,
+                                        config=self.config) for i
+                                        in range(num_envs)])
+
+        eval_env_id = 'eval_env'
+        self.eval_env = SubprocVecEnv([make_env(self.MyRLEnv, eval_env_id, i, 1,
+                                                test_df, prices_test,
+                                                self.reward_params, self.CONV_WIDTH, monitor=True,
+                                                config=self.config) for i
+                                       in range(num_envs)])
+
+        # every step of a vectorized environment advances all sub-environments at once,
+        # so the evaluation frequency is scaled down by the number of environments
+        eval_freq = max(len(train_df) // num_envs, 1)
+        self.eval_callback = EvalCallback(self.eval_env, deterministic=True,
+                                          render=False, eval_freq=eval_freq,
+                                          best_model_save_path=str(dk.data_path))
--- a/mkdocs.yml
+++ b/mkdocs.yml
@ -29,6 +29,7 @@ nav:
        - Parameter table: freqai-parameter-table.md
        - Feature engineering: freqai-feature-engineering.md
        - Running FreqAI: freqai-running.md
+        - Reinforcement Learning: freqai-reinforcement-learning.md
        - Developer guide: freqai-developers.md
    - Short / Leverage: leverage.md
    - Utility Sub-commands: utils.md
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@ -3,6 +3,7 @@
 -r requirements-plot.txt
 -r requirements-hyperopt.txt
 -r requirements-freqai.txt
+-r requirements-freqai-rl.txt
 -r docs/requirements-docs.txt

 coveralls==3.3.1
--- a/requirements-freqai-rl.txt
+++ b/requirements-freqai-rl.txt
@ -0,0 +1,8 @@
+# Include all requirements to run the bot.
+-r requirements-freqai.txt
+
+# Required for freqai-rl
+torch==1.12.1
+stable-baselines3==1.6.1
+gym==0.21
+sb3-contrib==1.6.1
--- a/setup.py
+++ b/setup.py
@ -15,6 +15,14 @@ freqai = [
    'scikit-learn',
    'catboost; platform_machine != "aarch64"',
    'lightgbm',
+    'xgboost'
+]
+
+freqai_rl = [
+    'torch',
+    'stable-baselines3',
+    'gym==0.21',
+    'sb3-contrib'
 ]

 develop = [
@ -36,7 +44,7 @@ jupyter = [
    'nbconvert',
 ]

-all_extra = plot + develop + jupyter + hyperopt + freqai
+all_extra = plot + develop + jupyter + hyperopt + freqai + freqai_rl

 setup(
    tests_require=[
@ -90,6 +98,7 @@ setup(
        'jupyter': jupyter,
        'hyperopt': hyperopt,
        'freqai': freqai,
+        'freqai_rl': freqai_rl,
        'all': all_extra,
    },
 )
--- a/setup.sh
+++ b/setup.sh
@ -78,14 +78,21 @@ function updateenv() {
    fi

    REQUIREMENTS_FREQAI=""
+    REQUIREMENTS_FREQAI_RL=""
    read -p "Do you want to install dependencies for freqai [y/N]? "
    dev=$REPLY
    if [[ $REPLY =~ ^[Yy]$ ]]
    then
        REQUIREMENTS_FREQAI="-r requirements-freqai.txt --use-pep517"
+        read -p "Do you also want dependencies for freqai-rl (~700mb additional space required) [y/N]? "
+        dev=$REPLY
+        if [[ $REPLY =~ ^[Yy]$ ]]
+        then
+            REQUIREMENTS_FREQAI="-r requirements-freqai-rl.txt"
+        fi
    fi

-    ${PYTHON} -m pip install --upgrade -r ${REQUIREMENTS} ${REQUIREMENTS_HYPEROPT} ${REQUIREMENTS_PLOT} ${REQUIREMENTS_FREQAI}
+    ${PYTHON} -m pip install --upgrade -r ${REQUIREMENTS} ${REQUIREMENTS_HYPEROPT} ${REQUIREMENTS_PLOT} ${REQUIREMENTS_FREQAI} ${REQUIREMENTS_FREQAI_RL}
    if [ $? -ne 0 ]; then
        echo "Failed installing dependencies"
        exit 1
--- a/tests/freqai/conftest.py
+++ b/tests/freqai/conftest.py
@ -27,10 +27,9 @@ def freqai_conf(default_conf, tmpdir):
            "timerange": "20180110-20180115",
            "freqai": {
                "enabled": True,
-                "startup_candles": 10000,
                "purge_old_models": True,
                "train_period_days": 2,
-                "backtest_period_days": 2,
+                "backtest_period_days": 10,
                "live_retrain_hours": 0,
                "expiration_hours": 1,
                "identifier": "uniqe-id100",
@ -58,6 +57,30 @@ def freqai_conf(default_conf, tmpdir):
    return freqaiconf


+def make_rl_config(conf):
+    """
+    Turn a freqai test configuration into a reinforcement learning one: sets the RL test
+    strategy, the model training parameters and the rl_config section.
+    The passed configuration is modified in place and returned.
+    """
+    conf["strategy"] = "freqai_rl_test_strat"
+
+    freqai_section = conf["freqai"]
+    freqai_section["model_training_parameters"] = {
+        "learning_rate": 0.00025,
+        "gamma": 0.9,
+        "verbose": 1,
+    }
+
+    reward_parameters = {
+        "rr": 1,
+        "profit_aim": 0.02,
+        "win_reward_factor": 2,
+    }
+    freqai_section["rl_config"] = {
+        "train_cycles": 1,
+        "thread_count": 2,
+        "max_trade_duration_candles": 300,
+        "model_type": "PPO",
+        "policy_type": "MlpPolicy",
+        "max_training_drawdown_pct": 0.5,
+        "net_arch": [32, 32],
+        "model_reward_parameters": reward_parameters,
+    }
+
+    return conf
+
+
 def get_patched_data_kitchen(mocker, freqaiconf):
    dk = FreqaiDataKitchen(freqaiconf)
    return dk
--- a/tests/freqai/test_freqai_interface.py
+++ b/tests/freqai/test_freqai_interface.py
@ -13,8 +13,8 @@ from freqtrade.freqai.utils import download_all_data_for_training, get_required_
 from freqtrade.optimize.backtesting import Backtesting
 from freqtrade.persistence import Trade
 from freqtrade.plugins.pairlistmanager import PairListManager
-from tests.conftest import get_patched_exchange, log_has_re
-from tests.freqai.conftest import get_patched_freqai_strategy
+from tests.conftest import create_mock_trades, get_patched_exchange, log_has_re
+from tests.freqai.conftest import get_patched_freqai_strategy, make_rl_config


 def is_arm() -> bool:
@ -32,11 +32,17 @@ def is_mac() -> bool:
    ('XGBoostRegressor', False, True, False),
    ('XGBoostRFRegressor', False, False, False),
    ('CatboostRegressor', False, False, False),
+    ('ReinforcementLearner', False, True, False),
+    ('ReinforcementLearner_multiproc', False, False, False),
+    ('ReinforcementLearner_test_4ac', False, False, False)
    ])
 def test_extract_data_and_train_model_Standard(mocker, freqai_conf, model, pca, dbscan, float32):
    if is_arm() and model == 'CatboostRegressor':
        pytest.skip("CatBoost is not supported on ARM")

+    if is_mac() and 'Reinforcement' in model:
+        pytest.skip("Reinforcement learning module not available on intel based Mac OS")
+
    model_save_ext = 'joblib'
    freqai_conf.update({"freqaimodel": model})
    freqai_conf.update({"timerange": "20180110-20180130"})
@ -45,6 +51,26 @@ def test_extract_data_and_train_model_Standard(mocker, freqai_conf, model, pca,
    freqai_conf['freqai']['feature_parameters'].update({"use_DBSCAN_to_remove_outliers": dbscan})
    freqai_conf.update({"reduce_df_footprint": float32})

+    if 'ReinforcementLearner' in model:
+        model_save_ext = 'zip'
+        freqai_conf = make_rl_config(freqai_conf)
+        # test the RL guardrails
+        freqai_conf['freqai']['feature_parameters'].update({"use_SVM_to_remove_outliers": True})
+        freqai_conf['freqai']['data_split_parameters'].update({'shuffle': True})
+
+    if 'test_4ac' in model:
+        freqai_conf["freqaimodel_path"] = str(Path(__file__).parents[1] / "freqai" / "test_models")
+
+    if 'ReinforcementLearner' in model:
+        model_save_ext = 'zip'
+        freqai_conf = make_rl_config(freqai_conf)
+        # test the RL guardrails
+        freqai_conf['freqai']['feature_parameters'].update({"use_SVM_to_remove_outliers": True})
+        freqai_conf['freqai']['data_split_parameters'].update({'shuffle': True})
+
+    if 'test_4ac' in model:
+        freqai_conf["freqaimodel_path"] = str(Path(__file__).parents[1] / "freqai" / "test_models")
+
    strategy = get_patched_freqai_strategy(mocker, freqai_conf)
    exchange = get_patched_exchange(mocker, freqai_conf)
    strategy.dp = DataProvider(freqai_conf, exchange)
@ -52,6 +78,7 @@ def test_extract_data_and_train_model_Standard(mocker, freqai_conf, model, pca,
    freqai = strategy.freqai
    freqai.live = True
    freqai.dk = FreqaiDataKitchen(freqai_conf)
+    freqai.dk.set_paths('ADA/BTC', 10000)
    timerange = TimeRange.parse_timerange("20180110-20180130")
    freqai.dd.load_all_pair_histories(timerange, freqai.dk)

@ -165,25 +192,35 @@ def test_extract_data_and_train_model_Classifiers(mocker, freqai_conf, model):
@pytest.mark.parametrize(
    "model, num_files, strat",
    [
-        ("LightGBMRegressor", 6, "freqai_test_strat"),
-        ("XGBoostRegressor", 6, "freqai_test_strat"),
-        ("CatboostRegressor", 6, "freqai_test_strat"),
-        ("XGBoostClassifier", 6, "freqai_test_classifier"),
-        ("LightGBMClassifier", 6, "freqai_test_classifier"),
-        ("CatboostClassifier", 6, "freqai_test_classifier")
+        ("LightGBMRegressor", 2, "freqai_test_strat"),
+        ("XGBoostRegressor", 2, "freqai_test_strat"),
+        ("CatboostRegressor", 2, "freqai_test_strat"),
+        ("ReinforcementLearner", 3, "freqai_rl_test_strat"),
+        ("XGBoostClassifier", 2, "freqai_test_classifier"),
+        ("LightGBMClassifier", 2, "freqai_test_classifier"),
+        ("CatboostClassifier", 2, "freqai_test_classifier")
    ],
    )
 def test_start_backtesting(mocker, freqai_conf, model, num_files, strat, caplog):
    freqai_conf.get("freqai", {}).update({"save_backtest_models": True})
    freqai_conf['runmode'] = RunMode.BACKTEST
-    Trade.use_db = False
    if is_arm() and "Catboost" in model:
        pytest.skip("CatBoost is not supported on ARM")

+    if is_mac() and 'Reinforcement' in model:
+        pytest.skip("Reinforcement learning module not available on intel based Mac OS")
+    Trade.use_db = False
+
    freqai_conf.update({"freqaimodel": model})
    freqai_conf.update({"timerange": "20180120-20180130"})
    freqai_conf.update({"strategy": strat})

+    if 'ReinforcementLearner' in model:
+        freqai_conf = make_rl_config(freqai_conf)
+
+    if 'test_4ac' in model:
+        freqai_conf["freqaimodel_path"] = str(Path(__file__).parents[1] / "freqai" / "test_models")
+
    strategy = get_patched_freqai_strategy(mocker, freqai_conf)
    exchange = get_patched_exchange(mocker, freqai_conf)
    strategy.dp = DataProvider(freqai_conf, exchange)
@ -207,6 +244,7 @@ def test_start_backtesting(mocker, freqai_conf, model, num_files, strat, caplog)
    model_folders = [x for x in freqai.dd.full_path.iterdir() if x.is_dir()]

    assert len(model_folders) == num_files
+    Trade.use_db = True
    assert log_has_re(
        "Removed features ",
        caplog,
@ -267,7 +305,7 @@ def test_start_backtesting_from_existing_folder(mocker, freqai_conf, caplog):
    freqai.start_backtesting(df, metadata, freqai.dk)
    model_folders = [x for x in freqai.dd.full_path.iterdir() if x.is_dir()]

-    assert len(model_folders) == 6
+    assert len(model_folders) == 2

    # without deleting the existing folder structure, re-run

@ -295,7 +333,7 @@ def test_start_backtesting_from_existing_folder(mocker, freqai_conf, caplog):

    path = (freqai.dd.full_path / freqai.dk.backtest_predictions_folder)
    prediction_files = [x for x in path.iterdir() if x.is_file()]
-    assert len(prediction_files) == 5
+    assert len(prediction_files) == 1

    shutil.rmtree(Path(freqai.dk.full_path))

@ -473,3 +511,43 @@ def test_download_all_data_for_training(mocker, freqai_conf, caplog, tmpdir):
        "Downloading",
        caplog,
    )
+
+
+@pytest.mark.usefixtures("init_persistence")
+@pytest.mark.parametrize('dp_exists', [(False), (True)])
+def test_get_state_info(mocker, freqai_conf, dp_exists, caplog, tickers):
+
+    if is_mac():
+        pytest.skip("Reinforcement learning module not available on intel based Mac OS")
+
+    freqai_conf.update({"freqaimodel": "ReinforcementLearner"})
+    freqai_conf.update({"timerange": "20180110-20180130"})
+    freqai_conf.update({"strategy": "freqai_rl_test_strat"})
+    freqai_conf = make_rl_config(freqai_conf)
+    freqai_conf['entry_pricing']['price_side'] = 'same'
+    freqai_conf['exit_pricing']['price_side'] = 'same'
+
+    strategy = get_patched_freqai_strategy(mocker, freqai_conf)
+    exchange = get_patched_exchange(mocker, freqai_conf)
+    ticker_mock = MagicMock(return_value=tickers()['ETH/BTC'])
+    mocker.patch("freqtrade.exchange.Exchange.fetch_ticker", ticker_mock)
+    strategy.dp = DataProvider(freqai_conf, exchange)
+
+    if not dp_exists:
+        strategy.dp._exchange = None
+
+    strategy.freqai_info = freqai_conf.get("freqai", {})
+    freqai = strategy.freqai
+    freqai.data_provider = strategy.dp
+    freqai.live = True
+
+    Trade.use_db = True
+    create_mock_trades(MagicMock(return_value=0.0025), False, True)
+    freqai.get_state_info("ADA/BTC")
+    freqai.get_state_info("ETH/BTC")
+
+    if not dp_exists:
+        assert log_has_re(
+            "No exchange available",
+            caplog,
+        )
--- a/tests/freqai/test_models/ReinforcementLearner_test_4ac.py
+++ b/tests/freqai/test_models/ReinforcementLearner_test_4ac.py
@ -0,0 +1,66 @@
+import logging
+
+import numpy as np
+
+from freqtrade.freqai.prediction_models.ReinforcementLearner import ReinforcementLearner
+from freqtrade.freqai.RL.Base4ActionRLEnv import Actions, Base4ActionRLEnv, Positions
+
+
+logger = logging.getLogger(__name__)
+
+
+class ReinforcementLearner_test_4ac(ReinforcementLearner):
+    """
+    Reinforcement learning model used by the tests, built on the 4 action environment.
+    """
+
+    class MyRLEnv(Base4ActionRLEnv):
+        """
+        Environment with a custom reward derived from the unrealized profit and
+        from how long the current trade has been open.
+        """
+
+        def calculate_reward(self, action: int) -> float:
+            """
+            Compute the reward handed to the agent for taking `action`.
+            :param action: int = value of the action chosen by the agent
+            :return: reward for that action given the current position
+            """
+            # an invalid action is penalized before anything else is evaluated
+            if not self._is_valid(action):
+                return -2
+
+            unrealized = self.get_unrealized_profit()
+            base_reward = np.sign(unrealized) * (unrealized + 1)
+            is_neutral = self._position == Positions.Neutral
+            in_trade = self._position in (Positions.Short, Positions.Long)
+
+            # entering a trade from a neutral position is rewarded
+            if is_neutral and action in (Actions.Long_enter.value, Actions.Short_enter.value):
+                return 25
+            # staying neutral while out of the market is discouraged
+            if is_neutral and action == Actions.Neutral.value:
+                return -1
+
+            max_candles = self.rl_config.get('max_trade_duration_candles', 300)
+            held_for = self._current_tick - self._last_trade_tick  # type: ignore
+
+            # trades within the duration limit are weighted more than overlong ones
+            scale = 100.
+            if held_for <= max_candles:
+                scale *= 1.5
+            elif held_for > max_candles:
+                scale *= 0.5
+
+            # sitting in an open position is penalized in proportion to its age
+            if in_trade and action == Actions.Neutral.value:
+                return -1 * held_for / max_candles
+
+            # closing a long or a short pays the scaled reward, boosted when the
+            # unrealized profit exceeds profit_aim * rr
+            if in_trade and action == Actions.Exit.value:
+                if unrealized > self.profit_aim * self.rr:
+                    scale *= self.rl_config['model_reward_parameters'].get('win_reward_factor', 2)
+                return float(base_reward * scale)
+
+            return 0.
--- a/tests/rpc/test_rpc_apiserver.py
+++ b/tests/rpc/test_rpc_apiserver.py
@ -1461,6 +1461,7 @@ def test_api_strategies(botclient, tmpdir):
        'StrategyTestV3',
        'StrategyTestV3CustomEntryPrice',
        'StrategyTestV3Futures',
+        'freqai_rl_test_strat',
        'freqai_test_classifier',
        'freqai_test_multimodel_classifier_strat',
        'freqai_test_multimodel_strat',
--- a/tests/strategy/strats/freqai_rl_test_strat.py
+++ b/tests/strategy/strats/freqai_rl_test_strat.py
@ -0,0 +1,105 @@
+import logging
+from functools import reduce
+
+import pandas as pd
+import talib.abstract as ta
+from pandas import DataFrame
+
+from freqtrade.strategy import IStrategy, merge_informative_pair
+
+
+logger = logging.getLogger(__name__)
+
+
+class freqai_rl_test_strat(IStrategy):
+    """
+    Test strategy - used for testing the FreqAI reinforcement learning functionalities.
+    Do not use in production.
+    """
+
+    minimal_roi = {"0": 0.1, "240": -1}
+
+    process_only_new_candles = True
+    stoploss = -0.05
+    use_exit_signal = True
+    startup_candle_count: int = 30
+    can_short = False
+
+    def populate_any_indicators(
+        self, pair, df, tf, informative=None, set_generalized_indicators=False
+    ):
+        """
+        Create the "%"-prefixed feature columns for `pair` on timeframe `tf` and
+        merge them into `df`.
+        :param pair: pair the features are computed for
+        :param df: dataframe the features are merged into
+        :param tf: timeframe of the informative data
+        :param informative: informative dataframe, fetched through self.dp if None
+        :param set_generalized_indicators: if True, also add the "&-action" column
+        :return: `df` with the feature columns merged in
+        """
+        if informative is None:
+            informative = self.dp.get_pair_dataframe(pair, tf)
+        feature_params = self.freqai_info["feature_parameters"]
+
+        # one RSI feature per configured indicator period
+        for period in map(int, feature_params["indicator_periods_candles"]):
+            informative[f"%-{pair}rsi-period_{period}"] = ta.RSI(informative, timeperiod=period)
+
+        # The raw price columns are necessary for RL models.
+        for price in ("close", "open", "high", "low"):
+            informative[f"%-{pair}raw_{price}"] = informative[price]
+
+        # Duplicate and shift all features to add a sense of recency to the data
+        feature_cols = [col for col in informative if col.startswith("%")]
+        for shift in range(1, feature_params["include_shifted_candles"] + 1):
+            shifted = informative[feature_cols].shift(shift).add_suffix("_shift-" + str(shift))
+            informative = pd.concat((informative, shifted), axis=1)
+
+        df = merge_informative_pair(df, informative, self.config["timeframe"], tf, ffill=True)
+        # drop the merged date/OHLCV columns that carry the timeframe suffix
+        merged_ohlcv = [
+            (name + "_" + tf) for name in ["date", "open", "high", "low", "close", "volume"]
+        ]
+        df = df.drop(columns=merged_ohlcv)
+
+        # Generalized indicators are added here (because in live, this function is
+        # called to populate indicators during training). The flag ensures they are
+        # not added multiple times.
+        if set_generalized_indicators:
+            # For RL, there are no direct targets to set. This is filler (neutral)
+            # until the agent sends an action.
+            df["&-action"] = 0
+
+        return df
+
+    def populate_indicators(self, dataframe: DataFrame, metadata: dict) -> DataFrame:
+        """
+        Hand the dataframe to FreqAI and return what it produces.
+        """
+        return self.freqai.start(dataframe, metadata, self)
+
+    def populate_entry_trend(self, df: DataFrame, metadata: dict) -> DataFrame:
+        """
+        Set the entry signals: action 1 enters long and action 3 enters short,
+        in both cases only on rows where "do_predict" equals 1.
+        """
+        entries = ((1, "enter_long", "long"), (3, "enter_short", "short"))
+        for action, column, tag in entries:
+            conditions = [df["do_predict"] == 1, df["&-action"] == action]
+            if conditions:
+                df.loc[reduce(lambda x, y: x & y, conditions), [column, "enter_tag"]] = (1, tag)
+
+        return df
+
+    def populate_exit_trend(self, df: DataFrame, metadata: dict) -> DataFrame:
+        """
+        Set the exit signals: action 2 exits long and action 4 exits short,
+        in both cases only on rows where "do_predict" equals 1.
+        """
+        for action, column in ((2, "exit_long"), (4, "exit_short")):
+            conditions = [df["do_predict"] == 1, df["&-action"] == action]
+            if conditions:
+                df.loc[reduce(lambda x, y: x & y, conditions), column] = 1
+
+        return df
--- a/tests/strategy/test_strategy_loading.py
+++ b/tests/strategy/test_strategy_loading.py
@ -34,7 +34,7 @@ def test_search_all_strategies_no_failed():
    directory = Path(__file__).parent / "strats"
    strategies = StrategyResolver._search_all_objects(directory, enum_failed=False)
    assert isinstance(strategies, list)
-    assert len(strategies) == 11
+    assert len(strategies) == 12
    assert isinstance(strategies[0], dict)


@ -42,10 +42,10 @@ def test_search_all_strategies_with_failed():
    directory = Path(__file__).parent / "strats"
    strategies = StrategyResolver._search_all_objects(directory, enum_failed=True)
    assert isinstance(strategies, list)
-    assert len(strategies) == 12
+    assert len(strategies) == 13
    # with enum_failed=True search_all_objects() shall find 2 good strategies
    # and 1 which fails to load
-    assert len([x for x in strategies if x['class'] is not None]) == 11
+    assert len([x for x in strategies if x['class'] is not None]) == 12

    assert len([x for x in strategies if x['class'] is None]) == 1