Merge branch 'develop' into backtest_live_models

This commit is contained in:
Wagner Costa Santos 2022-09-25 20:05:26 -03:00
commit f3f3917da3
18 changed files with 1282 additions and 69 deletions

View File

@ -272,6 +272,16 @@ jobs:
pip install pyaml
python build_helpers/pre_commit_update.py
pre-commit:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: "3.10"
- uses: pre-commit/action@v3.0.0
docs_check:
runs-on: ubuntu-20.04
steps:
@ -302,7 +312,7 @@ jobs:
# Notify only once - when CI completes (and after deploy) in case it's successfull
notify-complete:
needs: [ build_linux, build_macos, build_windows, docs_check, mypy_version_check ]
needs: [ build_linux, build_macos, build_windows, docs_check, mypy_version_check, pre-commit ]
runs-on: ubuntu-20.04
# Discord notification can't handle schedule events
if: (github.event_name != 'schedule')
@ -327,7 +337,7 @@ jobs:
webhookUrl: ${{ secrets.DISCORD_WEBHOOK }}
deploy:
needs: [ build_linux, build_macos, build_windows, docs_check, mypy_version_check ]
needs: [ build_linux, build_macos, build_windows, docs_check, mypy_version_check, pre-commit ]
runs-on: ubuntu-20.04
if: (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'release') && github.repository == 'freqtrade/freqtrade'

View File

@ -26,7 +26,7 @@ usage: freqtrade download-data [-h] [-v] [--logfile FILE] [-V] [-c PATH]
[--timerange TIMERANGE] [--dl-trades]
[--exchange EXCHANGE]
[-t TIMEFRAMES [TIMEFRAMES ...]] [--erase]
[--data-format-ohlcv {json,jsongz,hdf5}]
[--data-format-ohlcv {json,jsongz,hdf5,feather,parquet}]
[--data-format-trades {json,jsongz,hdf5}]
[--trading-mode {spot,margin,futures}]
[--prepend]
@ -55,7 +55,7 @@ optional arguments:
list. Default: `1m 5m`.
--erase Clean all existing data for the selected
exchange/pairs/timeframes.
--data-format-ohlcv {json,jsongz,hdf5}
--data-format-ohlcv {json,jsongz,hdf5,feather,parquet}
Storage format for downloaded candle (OHLCV) data.
(default: `json`).
--data-format-trades {json,jsongz,hdf5}
@ -76,7 +76,7 @@ Common arguments:
`userdir/config.json` or `config.json` whichever
exists). Multiple --config options may be used. Can be
set to `-` to read config from stdin.
-d PATH, --datadir PATH
-d PATH, --datadir PATH, --data-dir PATH
Path to directory with historical backtesting data.
--userdir PATH, --user-data-dir PATH
Path to userdata directory.
@ -179,9 +179,11 @@ freqtrade download-data --exchange binance --pairs ETH/USDT XRP/USDT BTC/USDT --
Freqtrade currently supports 3 data-formats for both OHLCV and trades data:
* `json` (plain "text" json files)
* `jsongz` (a gzip-zipped version of json files)
* `hdf5` (a high performance datastore)
* `json` - plain "text" json files
* `jsongz` - a gzip-zipped version of json files
* `hdf5` - a high performance datastore
* `feather` - a dataformat based on Apache Arrow
* `parquet` - columnar datastore
By default, OHLCV data is stored as `json` data, while trades data is stored as `jsongz` data.
@ -200,38 +202,74 @@ If the default data-format has been changed during download, then the keys `data
!!! Note
You can convert between data-formats using the [convert-data](#sub-command-convert-data) and [convert-trade-data](#sub-command-convert-trade-data) methods.
#### Dataformat comparison
The following comparisons have been made with the following data, and by using the linux `time` command.
```
Found 6 pair / timeframe combinations.
+----------+-------------+--------+---------------------+---------------------+
| Pair | Timeframe | Type | From | To |
|----------+-------------+--------+---------------------+---------------------|
| BTC/USDT | 5m | spot | 2017-08-17 04:00:00 | 2022-09-13 19:25:00 |
| ETH/USDT | 1m | spot | 2017-08-17 04:00:00 | 2022-09-13 19:26:00 |
| BTC/USDT | 1m | spot | 2017-08-17 04:00:00 | 2022-09-13 19:30:00 |
| XRP/USDT | 5m | spot | 2018-05-04 08:10:00 | 2022-09-13 19:15:00 |
| XRP/USDT | 1m | spot | 2018-05-04 08:11:00 | 2022-09-13 19:22:00 |
| ETH/USDT | 5m | spot | 2017-08-17 04:00:00 | 2022-09-13 19:20:00 |
+----------+-------------+--------+---------------------+---------------------+
```
Timings have been taken in a not very scientific way with the following command, which forces reading the data into memory.
``` bash
time freqtrade list-data --show-timerange --data-format-ohlcv <dataformat>
```
| Format | Size | timing |
|------------|-------------|-------------|
| `json` | 149Mb | 25.6s |
| `jsongz` | 39Mb | 27s |
| `hdf5` | 145Mb | 3.9s |
| `feather` | 72Mb | 3.5s |
| `parquet` | 83Mb | 3.8s |
Size has been taken from the BTC/USDT 1m spot combination for the timerange specified above.
To have a best performance/size mix, we recommend the use of either feather or parquet.
#### Sub-command convert data
```
usage: freqtrade convert-data [-h] [-v] [--logfile FILE] [-V] [-c PATH]
[-d PATH] [--userdir PATH]
[-p PAIRS [PAIRS ...]] --format-from
{json,jsongz,hdf5} --format-to
{json,jsongz,hdf5} [--erase]
[-t {1m,3m,5m,15m,30m,1h,2h,4h,6h,8h,12h,1d,3d,1w,2w,1M,1y} [{1m,3m,5m,15m,30m,1h,2h,4h,6h,8h,12h,1d,3d,1w,2w,1M,1y} ...]]
{json,jsongz,hdf5,feather,parquet} --format-to
{json,jsongz,hdf5,feather,parquet} [--erase]
[--exchange EXCHANGE]
[-t TIMEFRAMES [TIMEFRAMES ...]]
[--trading-mode {spot,margin,futures}]
[--candle-types {spot,,futures,mark,index,premiumIndex,funding_rate} [{spot,,futures,mark,index,premiumIndex,funding_rate} ...]]
[--candle-types {spot,futures,mark,index,premiumIndex,funding_rate} [{spot,futures,mark,index,premiumIndex,funding_rate} ...]]
optional arguments:
-h, --help show this help message and exit
-p PAIRS [PAIRS ...], --pairs PAIRS [PAIRS ...]
Limit command to these pairs. Pairs are space-
separated.
--format-from {json,jsongz,hdf5}
--format-from {json,jsongz,hdf5,feather,parquet}
Source format for data conversion.
--format-to {json,jsongz,hdf5}
--format-to {json,jsongz,hdf5,feather,parquet}
Destination format for data conversion.
--erase Clean all existing data for the selected
exchange/pairs/timeframes.
-t {1m,3m,5m,15m,30m,1h,2h,4h,6h,8h,12h,1d,3d,1w,2w,1M,1y} [{1m,3m,5m,15m,30m,1h,2h,4h,6h,8h,12h,1d,3d,1w,2w,1M,1y} ...], --timeframes {1m,3m,5m,15m,30m,1h,2h,4h,6h,8h,12h,1d,3d,1w,2w,1M,1y} [{1m,3m,5m,15m,30m,1h,2h,4h,6h,8h,12h,1d,3d,1w,2w,1M,1y} ...]
Specify which tickers to download. Space-separated
list. Default: `1m 5m`.
--exchange EXCHANGE Exchange name (default: `bittrex`). Only valid if no
config is provided.
--trading-mode {spot,margin,futures}
-t TIMEFRAMES [TIMEFRAMES ...], --timeframes TIMEFRAMES [TIMEFRAMES ...]
Specify which tickers to download. Space-separated
list. Default: `1m 5m`.
--trading-mode {spot,margin,futures}, --tradingmode {spot,margin,futures}
Select Trading mode
--candle-types {spot,,futures,mark,index,premiumIndex,funding_rate} [{spot,,futures,mark,index,premiumIndex,funding_rate} ...]
--candle-types {spot,futures,mark,index,premiumIndex,funding_rate} [{spot,futures,mark,index,premiumIndex,funding_rate} ...]
Select candle type to use
Common arguments:
@ -245,7 +283,7 @@ Common arguments:
`userdir/config.json` or `config.json` whichever
exists). Multiple --config options may be used. Can be
set to `-` to read config from stdin.
-d PATH, --datadir PATH
-d PATH, --datadir PATH, --data-dir PATH
Path to directory with historical backtesting data.
--userdir PATH, --user-data-dir PATH
Path to userdata directory.
@ -267,20 +305,24 @@ freqtrade convert-data --format-from json --format-to jsongz --datadir ~/.freqtr
usage: freqtrade convert-trade-data [-h] [-v] [--logfile FILE] [-V] [-c PATH]
[-d PATH] [--userdir PATH]
[-p PAIRS [PAIRS ...]] --format-from
{json,jsongz,hdf5} --format-to
{json,jsongz,hdf5} [--erase]
{json,jsongz,hdf5,feather,parquet}
--format-to
{json,jsongz,hdf5,feather,parquet}
[--erase] [--exchange EXCHANGE]
optional arguments:
-h, --help show this help message and exit
-p PAIRS [PAIRS ...], --pairs PAIRS [PAIRS ...]
Show profits for only these pairs. Pairs are space-
Limit command to these pairs. Pairs are space-
separated.
--format-from {json,jsongz,hdf5}
--format-from {json,jsongz,hdf5,feather,parquet}
Source format for data conversion.
--format-to {json,jsongz,hdf5}
--format-to {json,jsongz,hdf5,feather,parquet}
Destination format for data conversion.
--erase Clean all existing data for the selected
exchange/pairs/timeframes.
--exchange EXCHANGE Exchange name (default: `bittrex`). Only valid if no
config is provided.
Common arguments:
-v, --verbose Verbose mode (-vv for more, -vvv to get all messages).
@ -293,7 +335,7 @@ Common arguments:
`userdir/config.json` or `config.json` whichever
exists). Multiple --config options may be used. Can be
set to `-` to read config from stdin.
-d PATH, --datadir PATH
-d PATH, --datadir PATH, --data-dir PATH
Path to directory with historical backtesting data.
--userdir PATH, --user-data-dir PATH
Path to userdata directory.
@ -318,9 +360,9 @@ This command will allow you to repeat this last step for additional timeframes w
usage: freqtrade trades-to-ohlcv [-h] [-v] [--logfile FILE] [-V] [-c PATH]
[-d PATH] [--userdir PATH]
[-p PAIRS [PAIRS ...]]
[-t {1m,3m,5m,15m,30m,1h,2h,4h,6h,8h,12h,1d,3d,1w,2w,1M,1y} [{1m,3m,5m,15m,30m,1h,2h,4h,6h,8h,12h,1d,3d,1w,2w,1M,1y} ...]]
[-t TIMEFRAMES [TIMEFRAMES ...]]
[--exchange EXCHANGE]
[--data-format-ohlcv {json,jsongz,hdf5}]
[--data-format-ohlcv {json,jsongz,hdf5,feather,parquet}]
[--data-format-trades {json,jsongz,hdf5}]
optional arguments:
@ -328,12 +370,12 @@ optional arguments:
-p PAIRS [PAIRS ...], --pairs PAIRS [PAIRS ...]
Limit command to these pairs. Pairs are space-
separated.
-t {1m,3m,5m,15m,30m,1h,2h,4h,6h,8h,12h,1d,3d,1w,2w,1M,1y} [{1m,3m,5m,15m,30m,1h,2h,4h,6h,8h,12h,1d,3d,1w,2w,1M,1y} ...], --timeframes {1m,3m,5m,15m,30m,1h,2h,4h,6h,8h,12h,1d,3d,1w,2w,1M,1y} [{1m,3m,5m,15m,30m,1h,2h,4h,6h,8h,12h,1d,3d,1w,2w,1M,1y} ...]
-t TIMEFRAMES [TIMEFRAMES ...], --timeframes TIMEFRAMES [TIMEFRAMES ...]
Specify which tickers to download. Space-separated
list. Default: `1m 5m`.
--exchange EXCHANGE Exchange name (default: `bittrex`). Only valid if no
config is provided.
--data-format-ohlcv {json,jsongz,hdf5}
--data-format-ohlcv {json,jsongz,hdf5,feather,parquet}
Storage format for downloaded candle (OHLCV) data.
(default: `json`).
--data-format-trades {json,jsongz,hdf5}
@ -351,7 +393,7 @@ Common arguments:
`userdir/config.json` or `config.json` whichever
exists). Multiple --config options may be used. Can be
set to `-` to read config from stdin.
-d PATH, --datadir PATH
-d PATH, --datadir PATH, --data-dir PATH
Path to directory with historical backtesting data.
--userdir PATH, --user-data-dir PATH
Path to userdata directory.
@ -371,7 +413,7 @@ You can get a list of downloaded data using the `list-data` sub-command.
```
usage: freqtrade list-data [-h] [-v] [--logfile FILE] [-V] [-c PATH] [-d PATH]
[--userdir PATH] [--exchange EXCHANGE]
[--data-format-ohlcv {json,jsongz,hdf5}]
[--data-format-ohlcv {json,jsongz,hdf5,feather,parquet}]
[-p PAIRS [PAIRS ...]]
[--trading-mode {spot,margin,futures}]
[--show-timerange]
@ -380,13 +422,13 @@ optional arguments:
-h, --help show this help message and exit
--exchange EXCHANGE Exchange name (default: `bittrex`). Only valid if no
config is provided.
--data-format-ohlcv {json,jsongz,hdf5}
--data-format-ohlcv {json,jsongz,hdf5,feather,parquet}
Storage format for downloaded candle (OHLCV) data.
(default: `json`).
-p PAIRS [PAIRS ...], --pairs PAIRS [PAIRS ...]
Limit command to these pairs. Pairs are space-
separated.
--trading-mode {spot,margin,futures}
--trading-mode {spot,margin,futures}, --tradingmode {spot,margin,futures}
Select Trading mode
--show-timerange Show timerange available for available data. (May take
a while to calculate).
@ -402,7 +444,7 @@ Common arguments:
`userdir/config.json` or `config.json` whichever
exists). Multiple --config options may be used. Can be
set to `-` to read config from stdin.
-d PATH, --datadir PATH
-d PATH, --datadir PATH, --data-dir PATH
Path to directory with historical backtesting data.
--userdir PATH, --user-data-dir PATH
Path to userdata directory.

View File

@ -109,8 +109,8 @@ Mandatory parameters are marked as **Required**, which means that they are requi
| `indicator_max_period_candles` | **No longer used**. User must use the strategy set `startup_candle_count` which defines the maximum *period* used in `populate_any_indicators()` for indicator creation (timeframe independent). FreqAI uses this information in combination with the maximum timeframe to calculate how many data points it should download so that the first data point does not have a NaN <br> **Datatype:** positive integer.
| `indicator_periods_candles` | Calculate indicators for `indicator_periods_candles` time periods and add them to the feature set. <br> **Datatype:** List of positive integers.
| `stratify_training_data` | This value is used to indicate the grouping of the data. For example, 2 would set every 2nd data point into a separate dataset to be pulled from during training/testing. See details about how it works [here](#stratifying-the-data-for-training-and-testing-the-model) <br> **Datatype:** Positive integer.
| `principal_component_analysis` | Automatically reduce the dimensionality of the data set using Principal Component Analysis. See details about how it works [here](#reducing-data-dimensionality-with-principal-component-analysis)
| `plot_feature_importance` | Create an interactive feature importance plot for each model.<br> **Datatype:** Boolean.<br> **Datatype:** Boolean, defaults to `False`
| `principal_component_analysis` | Automatically reduce the dimensionality of the data set using Principal Component Analysis. See details about how it works [here](#reducing-data-dimensionality-with-principal-component-analysis) <br> **Datatype:** Boolean. defaults to `false`.
| `plot_feature_importances` | Create a feature importance plot for each model for the top/bottom `plot_feature_importances` number of features.<br> **Datatype:** Integer, defaults to `0`.
| `DI_threshold` | Activates the Dissimilarity Index for outlier detection when > 0. See details about how it works [here](#removing-outliers-with-the-dissimilarity-index). <br> **Datatype:** Positive float (typically < 1).
| `use_SVM_to_remove_outliers` | Train a support vector machine to detect and remove outliers from the training data set, as well as from incoming data points. See details about how it works [here](#removing-outliers-using-a-support-vector-machine-svm). <br> **Datatype:** Boolean.
| `svm_params` | All parameters available in Sklearn's `SGDOneClassSVM()`. See details about some select parameters [here](#removing-outliers-using-a-support-vector-machine-svm). <br> **Datatype:** Dictionary.

View File

@ -34,6 +34,7 @@ dependencies:
- schedule
- python-dateutil
- joblib
- pyarrow
# ============================

View File

@ -440,7 +440,7 @@ AVAILABLE_CLI_OPTIONS = {
"dataformat_trades": Arg(
'--data-format-trades',
help='Storage format for downloaded trades data. (default: `jsongz`).',
choices=constants.AVAILABLE_DATAHANDLERS,
choices=constants.AVAILABLE_DATAHANDLERS_TRADES,
),
"show_timerange": Arg(
'--show-timerange',

View File

@ -36,7 +36,8 @@ AVAILABLE_PAIRLISTS = ['StaticPairList', 'VolumePairList',
'PrecisionFilter', 'PriceFilter', 'RangeStabilityFilter',
'ShuffleFilter', 'SpreadFilter', 'VolatilityFilter']
AVAILABLE_PROTECTIONS = ['CooldownPeriod', 'LowProfitPairs', 'MaxDrawdown', 'StoplossGuard']
AVAILABLE_DATAHANDLERS = ['json', 'jsongz', 'hdf5']
AVAILABLE_DATAHANDLERS_TRADES = ['json', 'jsongz', 'hdf5']
AVAILABLE_DATAHANDLERS = AVAILABLE_DATAHANDLERS_TRADES + ['feather', 'parquet']
BACKTEST_BREAKDOWNS = ['day', 'week', 'month']
BACKTEST_CACHE_AGE = ['none', 'day', 'week', 'month']
BACKTEST_CACHE_DEFAULT = 'day'
@ -434,7 +435,7 @@ CONF_SCHEMA = {
},
'dataformat_trades': {
'type': 'string',
'enum': AVAILABLE_DATAHANDLERS,
'enum': AVAILABLE_DATAHANDLERS_TRADES,
'default': 'jsongz'
},
'position_adjustment_enable': {'type': 'boolean'},

View File

@ -0,0 +1,130 @@
import logging
from typing import Optional
from pandas import DataFrame, read_feather, to_datetime
from freqtrade.configuration import TimeRange
from freqtrade.constants import DEFAULT_DATAFRAME_COLUMNS, TradeList
from freqtrade.enums import CandleType
from .idatahandler import IDataHandler
logger = logging.getLogger(__name__)
class FeatherDataHandler(IDataHandler):
_columns = DEFAULT_DATAFRAME_COLUMNS
def ohlcv_store(
self, pair: str, timeframe: str, data: DataFrame, candle_type: CandleType) -> None:
"""
Store data in json format "values".
format looks as follows:
[[<date>,<open>,<high>,<low>,<close>]]
:param pair: Pair - used to generate filename
:param timeframe: Timeframe - used to generate filename
:param data: Dataframe containing OHLCV data
:param candle_type: Any of the enum CandleType (must match trading mode!)
:return: None
"""
filename = self._pair_data_filename(self._datadir, pair, timeframe, candle_type)
self.create_dir_if_needed(filename)
data.reset_index(drop=True).loc[:, self._columns].to_feather(
filename, compression_level=9, compression='lz4')
def _ohlcv_load(self, pair: str, timeframe: str,
timerange: Optional[TimeRange], candle_type: CandleType
) -> DataFrame:
"""
Internal method used to load data for one pair from disk.
Implements the loading and conversion to a Pandas dataframe.
Timerange trimming and dataframe validation happens outside of this method.
:param pair: Pair to load data
:param timeframe: Timeframe (e.g. "5m")
:param timerange: Limit data to be loaded to this timerange.
Optionally implemented by subclasses to avoid loading
all data where possible.
:param candle_type: Any of the enum CandleType (must match trading mode!)
:return: DataFrame with ohlcv data, or empty DataFrame
"""
filename = self._pair_data_filename(
self._datadir, pair, timeframe, candle_type=candle_type)
if not filename.exists():
# Fallback mode for 1M files
filename = self._pair_data_filename(
self._datadir, pair, timeframe, candle_type=candle_type, no_timeframe_modify=True)
if not filename.exists():
return DataFrame(columns=self._columns)
pairdata = read_feather(filename)
pairdata.columns = self._columns
pairdata = pairdata.astype(dtype={'open': 'float', 'high': 'float',
'low': 'float', 'close': 'float', 'volume': 'float'})
pairdata['date'] = to_datetime(pairdata['date'],
unit='ms',
utc=True,
infer_datetime_format=True)
return pairdata
def ohlcv_append(
self,
pair: str,
timeframe: str,
data: DataFrame,
candle_type: CandleType
) -> None:
"""
Append data to existing data structures
:param pair: Pair
:param timeframe: Timeframe this ohlcv data is for
:param data: Data to append.
:param candle_type: Any of the enum CandleType (must match trading mode!)
"""
raise NotImplementedError()
def trades_store(self, pair: str, data: TradeList) -> None:
"""
Store trades data (list of Dicts) to file
:param pair: Pair - used for filename
:param data: List of Lists containing trade data,
column sequence as in DEFAULT_TRADES_COLUMNS
"""
# filename = self._pair_trades_filename(self._datadir, pair)
raise NotImplementedError()
# array = pa.array(data)
# array
# feather.write_feather(data, filename)
def trades_append(self, pair: str, data: TradeList):
"""
Append data to existing files
:param pair: Pair - used for filename
:param data: List of Lists containing trade data,
column sequence as in DEFAULT_TRADES_COLUMNS
"""
raise NotImplementedError()
def _trades_load(self, pair: str, timerange: Optional[TimeRange] = None) -> TradeList:
"""
Load a pair from file, either .json.gz or .json
# TODO: respect timerange ...
:param pair: Load trades for this pair
:param timerange: Timerange to load trades for - currently not implemented
:return: List of trades
"""
raise NotImplementedError()
# filename = self._pair_trades_filename(self._datadir, pair)
# tradesdata = misc.file_load_json(filename)
# if not tradesdata:
# return []
# return tradesdata
@classmethod
def _get_file_extension(cls):
return "feather"

View File

@ -81,6 +81,7 @@ class HDF5DataHandler(IDataHandler):
raise ValueError("Wrong dataframe format")
pairdata = pairdata.astype(dtype={'open': 'float', 'high': 'float',
'low': 'float', 'close': 'float', 'volume': 'float'})
pairdata = pairdata.reset_index(drop=True)
return pairdata
def ohlcv_append(

View File

@ -375,6 +375,12 @@ def get_datahandlerclass(datatype: str) -> Type[IDataHandler]:
elif datatype == 'hdf5':
from .hdf5datahandler import HDF5DataHandler
return HDF5DataHandler
elif datatype == 'feather':
from .featherdatahandler import FeatherDataHandler
return FeatherDataHandler
elif datatype == 'parquet':
from .parquetdatahandler import ParquetDataHandler
return ParquetDataHandler
else:
raise ValueError(f"No datahandler for datatype {datatype} available.")

View File

@ -0,0 +1,129 @@
import logging
from typing import Optional
from pandas import DataFrame, read_parquet, to_datetime
from freqtrade.configuration import TimeRange
from freqtrade.constants import DEFAULT_DATAFRAME_COLUMNS, TradeList
from freqtrade.enums import CandleType
from .idatahandler import IDataHandler
logger = logging.getLogger(__name__)
class ParquetDataHandler(IDataHandler):
_columns = DEFAULT_DATAFRAME_COLUMNS
def ohlcv_store(
self, pair: str, timeframe: str, data: DataFrame, candle_type: CandleType) -> None:
"""
Store data in json format "values".
format looks as follows:
[[<date>,<open>,<high>,<low>,<close>]]
:param pair: Pair - used to generate filename
:param timeframe: Timeframe - used to generate filename
:param data: Dataframe containing OHLCV data
:param candle_type: Any of the enum CandleType (must match trading mode!)
:return: None
"""
filename = self._pair_data_filename(self._datadir, pair, timeframe, candle_type)
self.create_dir_if_needed(filename)
data.reset_index(drop=True).loc[:, self._columns].to_parquet(filename)
def _ohlcv_load(self, pair: str, timeframe: str,
timerange: Optional[TimeRange], candle_type: CandleType
) -> DataFrame:
"""
Internal method used to load data for one pair from disk.
Implements the loading and conversion to a Pandas dataframe.
Timerange trimming and dataframe validation happens outside of this method.
:param pair: Pair to load data
:param timeframe: Timeframe (e.g. "5m")
:param timerange: Limit data to be loaded to this timerange.
Optionally implemented by subclasses to avoid loading
all data where possible.
:param candle_type: Any of the enum CandleType (must match trading mode!)
:return: DataFrame with ohlcv data, or empty DataFrame
"""
filename = self._pair_data_filename(
self._datadir, pair, timeframe, candle_type=candle_type)
if not filename.exists():
# Fallback mode for 1M files
filename = self._pair_data_filename(
self._datadir, pair, timeframe, candle_type=candle_type, no_timeframe_modify=True)
if not filename.exists():
return DataFrame(columns=self._columns)
pairdata = read_parquet(filename)
pairdata.columns = self._columns
pairdata = pairdata.astype(dtype={'open': 'float', 'high': 'float',
'low': 'float', 'close': 'float', 'volume': 'float'})
pairdata['date'] = to_datetime(pairdata['date'],
unit='ms',
utc=True,
infer_datetime_format=True)
return pairdata
def ohlcv_append(
self,
pair: str,
timeframe: str,
data: DataFrame,
candle_type: CandleType
) -> None:
"""
Append data to existing data structures
:param pair: Pair
:param timeframe: Timeframe this ohlcv data is for
:param data: Data to append.
:param candle_type: Any of the enum CandleType (must match trading mode!)
"""
raise NotImplementedError()
def trades_store(self, pair: str, data: TradeList) -> None:
"""
Store trades data (list of Dicts) to file
:param pair: Pair - used for filename
:param data: List of Lists containing trade data,
column sequence as in DEFAULT_TRADES_COLUMNS
"""
# filename = self._pair_trades_filename(self._datadir, pair)
raise NotImplementedError()
# array = pa.array(data)
# array
# feather.write_feather(data, filename)
def trades_append(self, pair: str, data: TradeList):
"""
Append data to existing files
:param pair: Pair - used for filename
:param data: List of Lists containing trade data,
column sequence as in DEFAULT_TRADES_COLUMNS
"""
raise NotImplementedError()
def _trades_load(self, pair: str, timerange: Optional[TimeRange] = None) -> TradeList:
"""
Load a pair from file, either .json.gz or .json
# TODO: respect timerange ...
:param pair: Load trades for this pair
:param timerange: Timerange to load trades for - currently not implemented
:return: List of trades
"""
raise NotImplementedError()
# filename = self._pair_trades_filename(self._datadir, pair)
# tradesdata = misc.file_load_json(filename)
# if not tradesdata:
# return []
# return tradesdata
@classmethod
def _get_file_extension(cls):
return "parquet"

View File

@ -313,6 +313,7 @@ class FreqaiDataDrawer:
"""
dk.find_features(dataframe)
dk.find_labels(dataframe)
full_labels = dk.label_list + dk.unique_class_list
@ -376,7 +377,27 @@ class FreqaiDataDrawer:
if self.config.get("freqai", {}).get("purge_old_models", False):
self.purge_old_models()
# Functions pulled back from FreqaiDataKitchen because they relied on DataDrawer
def save_metadata(self, dk: FreqaiDataKitchen) -> None:
"""
Saves only metadata for backtesting studies if user prefers
not to save model data. This saves tremendous amounts of space
for users generating huge studies.
This is only active when `save_backtest_models`: false (not default)
"""
if not dk.data_path.is_dir():
dk.data_path.mkdir(parents=True, exist_ok=True)
save_path = Path(dk.data_path)
dk.data["data_path"] = str(dk.data_path)
dk.data["model_filename"] = str(dk.model_filename)
dk.data["training_features_list"] = list(dk.data_dictionary["train_features"].columns)
dk.data["label_list"] = dk.label_list
with open(save_path / f"{dk.model_filename}_metadata.json", "w") as fp:
rapidjson.dump(dk.data, fp, default=self.np_encoder, number_mode=rapidjson.NM_NATIVE)
return
def save_data(self, model: Any, coin: str, dk: FreqaiDataKitchen) -> None:
"""

View File

@ -858,7 +858,7 @@ class FreqaiDataKitchen:
inlier_metric = pd.DataFrame(
data=inliers.sum(axis=1) / no_prev_pts,
columns=['inlier_metric'],
columns=['%-inlier_metric'],
index=compute_df.index
)
@ -908,11 +908,14 @@ class FreqaiDataKitchen:
"""
column_names = dataframe.columns
features = [c for c in column_names if "%" in c]
labels = [c for c in column_names if "&" in c]
if not features:
raise OperationalException("Could not find any features!")
self.training_features_list = features
def find_labels(self, dataframe: DataFrame) -> None:
column_names = dataframe.columns
labels = [c for c in column_names if "&" in c]
self.label_list = labels
def check_if_pred_in_training_spaces(self) -> None:
@ -1233,7 +1236,8 @@ class FreqaiDataKitchen:
def get_unique_classes_from_labels(self, dataframe: DataFrame) -> None:
self.find_features(dataframe)
# self.find_features(dataframe)
self.find_labels(dataframe)
for key in self.label_list:
if dataframe[key].dtype == object:

View File

@ -0,0 +1,783 @@
import logging
import shutil
import threading
import time
from abc import ABC, abstractmethod
from collections import deque
from datetime import datetime, timezone
from pathlib import Path
from threading import Lock
from typing import Any, Dict, List, Tuple
import numpy as np
import pandas as pd
from numpy.typing import NDArray
from pandas import DataFrame
from freqtrade.configuration import TimeRange
from freqtrade.constants import DATETIME_PRINT_FORMAT, Config
from freqtrade.enums import RunMode
from freqtrade.exceptions import OperationalException
from freqtrade.exchange import timeframe_to_seconds
from freqtrade.freqai.data_drawer import FreqaiDataDrawer
from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
from freqtrade.freqai.utils import plot_feature_importance
from freqtrade.strategy.interface import IStrategy
pd.options.mode.chained_assignment = None
logger = logging.getLogger(__name__)
class IFreqaiModel(ABC):
"""
Class containing all tools for training and prediction in the strategy.
Base*PredictionModels inherit from this class.
Record of contribution:
FreqAI was developed by a group of individuals who all contributed specific skillsets to the
project.
Conception and software development:
Robert Caulk @robcaulk
Theoretical brainstorming:
Elin Törnquist @th0rntwig
Code review, software architecture brainstorming:
@xmatthias
Beta testing and bug reporting:
@bloodhunter4rc, Salah Lamkadem @ikonx, @ken11o2, @longyu, @paranoidandy, @smidelis, @smarm
Juha Nykänen @suikula, Wagner Costa @wagnercosta, Johan Vlugt @Jooopieeert
"""
def __init__(self, config: Config) -> None:
self.config = config
self.assert_config(self.config)
self.freqai_info: Dict[str, Any] = config["freqai"]
self.data_split_parameters: Dict[str, Any] = config.get("freqai", {}).get(
"data_split_parameters", {})
self.model_training_parameters: Dict[str, Any] = config.get("freqai", {}).get(
"model_training_parameters", {})
self.retrain = False
self.first = True
self.set_full_path()
self.follow_mode: bool = self.freqai_info.get("follow_mode", False)
self.save_backtest_models: bool = self.freqai_info.get("save_backtest_models", True)
if self.save_backtest_models:
logger.info('Backtesting module configured to save all models.')
self.dd = FreqaiDataDrawer(Path(self.full_path), self.config, self.follow_mode)
self.identifier: str = self.freqai_info.get("identifier", "no_id_provided")
self.scanning = False
self.ft_params = self.freqai_info["feature_parameters"]
self.keras: bool = self.freqai_info.get("keras", False)
if self.keras and self.ft_params.get("DI_threshold", 0):
self.ft_params["DI_threshold"] = 0
logger.warning("DI threshold is not configured for Keras models yet. Deactivating.")
self.CONV_WIDTH = self.freqai_info.get("conv_width", 2)
if self.ft_params.get("inlier_metric_window", 0):
self.CONV_WIDTH = self.ft_params.get("inlier_metric_window", 0) * 2
self.pair_it = 0
self.pair_it_train = 0
self.total_pairs = len(self.config.get("exchange", {}).get("pair_whitelist"))
self.train_queue = self._set_train_queue()
self.last_trade_database_summary: DataFrame = {}
self.current_trade_database_summary: DataFrame = {}
self.analysis_lock = Lock()
self.inference_time: float = 0
self.train_time: float = 0
self.begin_time: float = 0
self.begin_time_train: float = 0
self.base_tf_seconds = timeframe_to_seconds(self.config['timeframe'])
self.continual_learning = self.freqai_info.get('continual_learning', False)
self._threads: List[threading.Thread] = []
self._stop_event = threading.Event()
def __getstate__(self):
"""
Return an empty state to be pickled in hyperopt
"""
return ({})
def assert_config(self, config: Config) -> None:
if not config.get("freqai", {}):
raise OperationalException("No freqai parameters found in configuration file.")
def start(self, dataframe: DataFrame, metadata: dict, strategy: IStrategy) -> DataFrame:
"""
Entry point to the FreqaiModel from a specific pair, it will train a new model if
necessary before making the prediction.
:param dataframe: Full dataframe coming from strategy - it contains entire
backtesting timerange + additional historical data necessary to train
the model.
:param metadata: pair metadata coming from strategy.
:param strategy: Strategy to train on
"""
self.live = strategy.dp.runmode in (RunMode.DRY_RUN, RunMode.LIVE)
self.dd.set_pair_dict_info(metadata)
if self.live:
self.inference_timer('start')
self.dk = FreqaiDataKitchen(self.config, self.live, metadata["pair"])
dk = self.start_live(dataframe, metadata, strategy, self.dk)
# For backtesting, each pair enters and then gets trained for each window along the
# sliding window defined by "train_period_days" (training window) and "live_retrain_hours"
# (backtest window, i.e. window immediately following the training window).
# FreqAI slides the window and sequentially builds the backtesting results before returning
# the concatenated results for the full backtesting period back to the strategy.
elif not self.follow_mode:
self.dk = FreqaiDataKitchen(self.config, self.live, metadata["pair"])
if(self.dk.backtest_live_models):
logger.info(
f"Backtesting {len(self.dk.backtesting_timeranges)} timeranges (Live Models)")
else:
logger.info(f"Training {len(self.dk.training_timeranges)} timeranges")
dataframe = self.dk.use_strategy_to_populate_indicators(
strategy, prediction_dataframe=dataframe, pair=metadata["pair"]
)
dk = self.start_backtesting(dataframe, metadata, self.dk)
# else:
# dk = self.start_backtesting_live_models(dataframe, metadata, self.dk)
dataframe = dk.remove_features_from_df(dk.return_dataframe)
self.clean_up()
if self.live:
self.inference_timer('stop')
return dataframe
def clean_up(self):
"""
Objects that should be handled by GC already between coins, but
are explicitly shown here to help demonstrate the non-persistence of these
objects.
"""
self.model = None
self.dk = None
def shutdown(self):
"""
Cleans up threads on Shutdown, set stop event. Join threads to wait
for current training iteration.
"""
logger.info("Stopping FreqAI")
self._stop_event.set()
logger.info("Waiting on Training iteration")
for _thread in self._threads:
_thread.join()
def start_scanning(self, *args, **kwargs) -> None:
"""
Start `self._start_scanning` in a separate thread
"""
_thread = threading.Thread(target=self._start_scanning, args=args, kwargs=kwargs)
self._threads.append(_thread)
_thread.start()
def _start_scanning(self, strategy: IStrategy) -> None:
"""
Function designed to constantly scan pairs for retraining on a separate thread (intracandle)
to improve model youth. This function is agnostic to data preparation/collection/storage,
it simply trains on what ever data is available in the self.dd.
:param strategy: IStrategy = The user defined strategy class
"""
while not self._stop_event.is_set():
time.sleep(1)
pair = self.train_queue[0]
# ensure pair is avaialble in dp
if pair not in strategy.dp.current_whitelist():
self.train_queue.popleft()
logger.warning(f'{pair} not in current whitelist, removing from train queue.')
continue
(_, trained_timestamp, _) = self.dd.get_pair_dict_info(pair)
dk = FreqaiDataKitchen(self.config, self.live, pair)
dk.set_paths(pair, trained_timestamp)
(
retrain,
new_trained_timerange,
data_load_timerange,
) = dk.check_if_new_training_required(trained_timestamp)
dk.set_paths(pair, new_trained_timerange.stopts)
if retrain:
self.train_timer('start')
try:
self.extract_data_and_train_model(
new_trained_timerange, pair, strategy, dk, data_load_timerange
)
except Exception as msg:
logger.warning(f'Training {pair} raised exception {msg}, skipping.')
self.train_timer('stop')
# only rotate the queue after the first has been trained.
self.train_queue.rotate(-1)
self.dd.save_historic_predictions_to_disk()
def start_backtesting(
self, dataframe: DataFrame, metadata: dict, dk: FreqaiDataKitchen
) -> FreqaiDataKitchen:
"""
The main broad execution for backtesting. For backtesting, each pair enters and then gets
trained for each window along the sliding window defined by "train_period_days"
(training window) and "backtest_period_days" (backtest window, i.e. window immediately
following the training window). FreqAI slides the window and sequentially builds
the backtesting results before returning the concatenated results for the full
backtesting period back to the strategy.
:param dataframe: DataFrame = strategy passed dataframe
:param metadata: Dict = pair metadata
:param dk: FreqaiDataKitchen = Data management/analysis tool associated to present pair only
:return:
FreqaiDataKitchen = Data management/analysis tool associated to present pair only
"""
self.pair_it += 1
train_it = 0
# Loop enforcing the sliding window training/backtesting paradigm
# tr_train is the training time range e.g. 1 historical month
# tr_backtest is the backtesting time range e.g. the week directly
# following tr_train. Both of these windows slide through the
# entire backtest
for tr_train, tr_backtest in zip(dk.training_timeranges, dk.backtesting_timeranges):
pair = metadata["pair"]
(_, _, _) = self.dd.get_pair_dict_info(pair)
train_it += 1
total_trains = len(dk.backtesting_timeranges)
self.training_timerange = tr_train
dataframe_train = dk.slice_dataframe(tr_train, dataframe)
dataframe_backtest = dk.slice_dataframe(tr_backtest, dataframe)
trained_timestamp = tr_train
tr_train_startts_str = datetime.fromtimestamp(
tr_train.startts,
tz=timezone.utc).strftime(DATETIME_PRINT_FORMAT)
tr_train_stopts_str = datetime.fromtimestamp(
tr_train.stopts,
tz=timezone.utc).strftime(DATETIME_PRINT_FORMAT)
if not dk.backtest_live_models:
logger.info(
f"Training {pair}, {self.pair_it}/{self.total_pairs} pairs"
f" from {tr_train_startts_str}"
f" to {tr_train_stopts_str}, {train_it}/{total_trains} "
"trains"
)
timestamp_model_id = int(trained_timestamp.stopts)
if dk.backtest_live_models:
timestamp_model_id = int(tr_backtest.startts)
dk.data_path = Path(
dk.full_path / f"sub-train-{pair.split('/')[0]}_{timestamp_model_id}"
)
dk.set_new_model_names(pair, timestamp_model_id)
if dk.check_if_backtest_prediction_exists():
self.dd.load_metadata(dk)
if not dk.backtest_live_models:
self.check_if_feature_list_matches_strategy(dataframe_train, dk)
append_df = dk.get_backtesting_prediction()
dk.append_predictions(append_df)
else:
if not self.model_exists(dk):
if dk.backtest_live_models:
raise OperationalException(
"Training models is not allowed "
"in backtest_live_models backtesting "
"mode"
)
dk.find_features(dataframe_train)
self.model = self.train(dataframe_train, pair, dk)
self.dd.pair_dict[pair]["trained_timestamp"] = int(
trained_timestamp.stopts)
if self.save_backtest_models:
logger.info('Saving backtest model to disk.')
self.dd.save_data(self.model, pair, dk)
else:
self.model = self.dd.load_data(pair, dk)
self.check_if_feature_list_matches_strategy(dataframe_train, dk)
pred_df, do_preds = self.predict(dataframe_backtest, dk)
append_df = dk.get_predictions_to_append(pred_df, do_preds)
dk.append_predictions(append_df)
dk.save_backtesting_prediction(append_df)
dk.fill_predictions(dataframe)
return dk
def start_live(
self, dataframe: DataFrame, metadata: dict, strategy: IStrategy, dk: FreqaiDataKitchen
) -> FreqaiDataKitchen:
"""
The main broad execution for dry/live. This function will check if a retraining should be
performed, and if so, retrain and reset the model.
:param dataframe: DataFrame = strategy passed dataframe
:param metadata: Dict = pair metadata
:param strategy: IStrategy = currently employed strategy
dk: FreqaiDataKitchen = Data management/analysis tool associated to present pair only
:returns:
dk: FreqaiDataKitchen = Data management/analysis tool associated to present pair only
"""
# update follower
if self.follow_mode:
self.dd.update_follower_metadata()
# get the model metadata associated with the current pair
(_, trained_timestamp, return_null_array) = self.dd.get_pair_dict_info(metadata["pair"])
# if the metadata doesn't exist, the follower returns null arrays to strategy
if self.follow_mode and return_null_array:
logger.info("Returning null array from follower to strategy")
self.dd.return_null_values_to_strategy(dataframe, dk)
return dk
# append the historic data once per round
if self.dd.historic_data:
self.dd.update_historic_data(strategy, dk)
logger.debug(f'Updating historic data on pair {metadata["pair"]}')
if not self.follow_mode:
(_, new_trained_timerange, data_load_timerange) = dk.check_if_new_training_required(
trained_timestamp
)
dk.set_paths(metadata["pair"], new_trained_timerange.stopts)
# load candle history into memory if it is not yet.
if not self.dd.historic_data:
self.dd.load_all_pair_histories(data_load_timerange, dk)
if not self.scanning:
self.scanning = True
self.start_scanning(strategy)
elif self.follow_mode:
dk.set_paths(metadata["pair"], trained_timestamp)
logger.info(
"FreqAI instance set to follow_mode, finding existing pair "
f"using { self.identifier }"
)
# load the model and associated data into the data kitchen
self.model = self.dd.load_data(metadata["pair"], dk)
with self.analysis_lock:
dataframe = self.dk.use_strategy_to_populate_indicators(
strategy, prediction_dataframe=dataframe, pair=metadata["pair"]
)
if not self.model:
logger.warning(
f"No model ready for {metadata['pair']}, returning null values to strategy."
)
self.dd.return_null_values_to_strategy(dataframe, dk)
return dk
# ensure user is feeding the correct indicators to the model
self.check_if_feature_list_matches_strategy(dataframe, dk)
self.build_strategy_return_arrays(dataframe, dk, metadata["pair"], trained_timestamp)
return dk
def build_strategy_return_arrays(
self, dataframe: DataFrame, dk: FreqaiDataKitchen, pair: str, trained_timestamp: int
) -> None:
# hold the historical predictions in memory so we are sending back
# correct array to strategy
if pair not in self.dd.model_return_values:
# first predictions are made on entire historical candle set coming from strategy. This
# allows FreqUI to show full return values.
pred_df, do_preds = self.predict(dataframe, dk)
if pair not in self.dd.historic_predictions:
self.set_initial_historic_predictions(pred_df, dk, pair)
self.dd.set_initial_return_values(pair, pred_df)
dk.return_dataframe = self.dd.attach_return_values_to_return_dataframe(pair, dataframe)
return
elif self.dk.check_if_model_expired(trained_timestamp):
pred_df = DataFrame(np.zeros((2, len(dk.label_list))), columns=dk.label_list)
do_preds = np.ones(2, dtype=np.int_) * 2
dk.DI_values = np.zeros(2)
logger.warning(
f"Model expired for {pair}, returning null values to strategy. Strategy "
"construction should take care to consider this event with "
"prediction == 0 and do_predict == 2"
)
else:
# remaining predictions are made only on the most recent candles for performance and
# historical accuracy reasons.
pred_df, do_preds = self.predict(dataframe.iloc[-self.CONV_WIDTH:], dk, first=False)
if self.freqai_info.get('fit_live_predictions_candles', 0) and self.live:
self.fit_live_predictions(dk, pair)
self.dd.append_model_predictions(pair, pred_df, do_preds, dk, len(dataframe))
dk.return_dataframe = self.dd.attach_return_values_to_return_dataframe(pair, dataframe)
return
def check_if_feature_list_matches_strategy(
self, dataframe: DataFrame, dk: FreqaiDataKitchen
) -> None:
"""
Ensure user is passing the proper feature set if they are reusing an `identifier` pointing
to a folder holding existing models.
:param dataframe: DataFrame = strategy provided dataframe
:param dk: FreqaiDataKitchen = non-persistent data container/analyzer for
current coin/bot loop
"""
dk.find_features(dataframe)
if "training_features_list_raw" in dk.data:
feature_list = dk.data["training_features_list_raw"]
else:
feature_list = dk.data['training_features_list']
if dk.training_features_list != feature_list:
raise OperationalException(
"Trying to access pretrained model with `identifier` "
"but found different features furnished by current strategy."
"Change `identifier` to train from scratch, or ensure the"
"strategy is furnishing the same features as the pretrained"
"model. In case of --strategy-list, please be aware that FreqAI "
"requires all strategies to maintain identical "
"populate_any_indicator() functions"
)
def data_cleaning_train(self, dk: FreqaiDataKitchen) -> None:
"""
Base data cleaning method for train.
Functions here improve/modify the input data by identifying outliers,
computing additional metrics, adding noise, reducing dimensionality etc.
"""
ft_params = self.freqai_info["feature_parameters"]
if ft_params.get('inlier_metric_window', 0):
dk.compute_inlier_metric(set_='train')
if self.freqai_info["data_split_parameters"]["test_size"] > 0:
dk.compute_inlier_metric(set_='test')
if ft_params.get(
"principal_component_analysis", False
):
dk.principal_component_analysis()
if ft_params.get("use_SVM_to_remove_outliers", False):
dk.use_SVM_to_remove_outliers(predict=False)
if ft_params.get("DI_threshold", 0):
dk.data["avg_mean_dist"] = dk.compute_distances()
if ft_params.get("use_DBSCAN_to_remove_outliers", False):
if dk.pair in self.dd.old_DBSCAN_eps:
eps = self.dd.old_DBSCAN_eps[dk.pair]
else:
eps = None
dk.use_DBSCAN_to_remove_outliers(predict=False, eps=eps)
self.dd.old_DBSCAN_eps[dk.pair] = dk.data['DBSCAN_eps']
if self.freqai_info["feature_parameters"].get('noise_standard_deviation', 0):
dk.add_noise_to_training_features()
def data_cleaning_predict(self, dk: FreqaiDataKitchen, dataframe: DataFrame) -> None:
"""
Base data cleaning method for predict.
Functions here are complementary to the functions of data_cleaning_train.
"""
ft_params = self.freqai_info["feature_parameters"]
if ft_params.get('inlier_metric_window', 0):
dk.compute_inlier_metric(set_='predict')
if ft_params.get(
"principal_component_analysis", False
):
dk.pca_transform(self.dk.data_dictionary['prediction_features'])
if ft_params.get("use_SVM_to_remove_outliers", False):
dk.use_SVM_to_remove_outliers(predict=True)
if ft_params.get("DI_threshold", 0):
dk.check_if_pred_in_training_spaces()
if ft_params.get("use_DBSCAN_to_remove_outliers", False):
dk.use_DBSCAN_to_remove_outliers(predict=True)
def model_exists(
self,
dk: FreqaiDataKitchen,
scanning: bool = False,
) -> bool:
"""
Given a pair and path, check if a model already exists
:param pair: pair e.g. BTC/USD
:param path: path to model
:return:
:boolean: whether the model file exists or not.
"""
path_to_modelfile = Path(dk.data_path / f"{dk.model_filename}_model.joblib")
file_exists = path_to_modelfile.is_file()
if file_exists and not scanning:
logger.info("Found model at %s", dk.data_path / dk.model_filename)
elif not scanning:
logger.info("Could not find model at %s", dk.data_path / dk.model_filename)
return file_exists
def set_full_path(self) -> None:
self.full_path = Path(
self.config["user_data_dir"] / "models" / f"{self.freqai_info['identifier']}"
)
self.full_path.mkdir(parents=True, exist_ok=True)
shutil.copy(
self.config["config_files"][0],
Path(self.full_path, Path(self.config["config_files"][0]).name),
)
def extract_data_and_train_model(
self,
new_trained_timerange: TimeRange,
pair: str,
strategy: IStrategy,
dk: FreqaiDataKitchen,
data_load_timerange: TimeRange,
):
"""
Retrieve data and train model.
:param new_trained_timerange: TimeRange = the timerange to train the model on
:param metadata: dict = strategy provided metadata
:param strategy: IStrategy = user defined strategy object
:param dk: FreqaiDataKitchen = non-persistent data container for current coin/loop
:param data_load_timerange: TimeRange = the amount of data to be loaded
for populate_any_indicators
(larger than new_trained_timerange so that
new_trained_timerange does not contain any NaNs)
"""
corr_dataframes, base_dataframes = self.dd.get_base_and_corr_dataframes(
data_load_timerange, pair, dk
)
with self.analysis_lock:
unfiltered_dataframe = dk.use_strategy_to_populate_indicators(
strategy, corr_dataframes, base_dataframes, pair
)
unfiltered_dataframe = dk.slice_dataframe(new_trained_timerange, unfiltered_dataframe)
# find the features indicated by strategy and store in datakitchen
dk.find_features(unfiltered_dataframe)
model = self.train(unfiltered_dataframe, pair, dk)
self.dd.pair_dict[pair]["trained_timestamp"] = new_trained_timerange.stopts
dk.set_new_model_names(pair, int(new_trained_timerange.stopts))
self.dd.save_data(model, pair, dk)
if self.freqai_info["feature_parameters"].get("plot_feature_importance", False):
plot_feature_importance(model, pair, dk)
if self.freqai_info.get("purge_old_models", False):
self.dd.purge_old_models()
def set_initial_historic_predictions(
self, pred_df: DataFrame, dk: FreqaiDataKitchen, pair: str
) -> None:
"""
This function is called only if the datadrawer failed to load an
existing set of historic predictions. In this case, it builds
the structure and sets fake predictions off the first training
data. After that, FreqAI will append new real predictions to the
set of historic predictions.
These values are used to generate live statistics which can be used
in the strategy for adaptive values. E.g. &*_mean/std are quantities
that can computed based on live predictions from the set of historical
predictions. Those values can be used in the user strategy to better
assess prediction rarity, and thus wait for probabilistically favorable
entries relative to the live historical predictions.
If the user reuses an identifier on a subsequent instance,
this function will not be called. In that case, "real" predictions
will be appended to the loaded set of historic predictions.
:param: df: DataFrame = the dataframe containing the training feature data
:param: model: Any = A model which was `fit` using a common library such as
catboost or lightgbm
:param: dk: FreqaiDataKitchen = object containing methods for data analysis
:param: pair: str = current pair
"""
self.dd.historic_predictions[pair] = pred_df
hist_preds_df = self.dd.historic_predictions[pair]
for label in hist_preds_df.columns:
if hist_preds_df[label].dtype == object:
continue
hist_preds_df[f'{label}_mean'] = 0
hist_preds_df[f'{label}_std'] = 0
hist_preds_df['do_predict'] = 0
if self.freqai_info['feature_parameters'].get('DI_threshold', 0) > 0:
hist_preds_df['DI_values'] = 0
for return_str in dk.data['extra_returns_per_train']:
hist_preds_df[return_str] = 0
# # for keras type models, the conv_window needs to be prepended so
# # viewing is correct in frequi
if self.freqai_info.get('keras', False) or self.ft_params.get('inlier_metric_window', 0):
n_lost_points = self.freqai_info.get('conv_width', 2)
zeros_df = DataFrame(np.zeros((n_lost_points, len(hist_preds_df.columns))),
columns=hist_preds_df.columns)
self.dd.historic_predictions[pair] = pd.concat(
[zeros_df, hist_preds_df], axis=0, ignore_index=True)
def fit_live_predictions(self, dk: FreqaiDataKitchen, pair: str) -> None:
"""
Fit the labels with a gaussian distribution
"""
import scipy as spy
# add classes from classifier label types if used
full_labels = dk.label_list + dk.unique_class_list
num_candles = self.freqai_info.get("fit_live_predictions_candles", 100)
dk.data["labels_mean"], dk.data["labels_std"] = {}, {}
for label in full_labels:
if self.dd.historic_predictions[dk.pair][label].dtype == object:
continue
f = spy.stats.norm.fit(self.dd.historic_predictions[dk.pair][label].tail(num_candles))
dk.data["labels_mean"][label], dk.data["labels_std"][label] = f[0], f[1]
return
def inference_timer(self, do='start'):
"""
Timer designed to track the cumulative time spent in FreqAI for one pass through
the whitelist. This will check if the time spent is more than 1/4 the time
of a single candle, and if so, it will warn the user of degraded performance
"""
if do == 'start':
self.pair_it += 1
self.begin_time = time.time()
elif do == 'stop':
end = time.time()
self.inference_time += (end - self.begin_time)
if self.pair_it == self.total_pairs:
logger.info(
f'Total time spent inferencing pairlist {self.inference_time:.2f} seconds')
if self.inference_time > 0.25 * self.base_tf_seconds:
logger.warning("Inference took over 25% of the candle time. Reduce pairlist to"
" avoid blinding open trades and degrading performance.")
self.pair_it = 0
self.inference_time = 0
return
def train_timer(self, do='start'):
"""
Timer designed to track the cumulative time spent training the full pairlist in
FreqAI.
"""
if do == 'start':
self.pair_it_train += 1
self.begin_time_train = time.time()
elif do == 'stop':
end = time.time()
self.train_time += (end - self.begin_time_train)
if self.pair_it_train == self.total_pairs:
logger.info(
f'Total time spent training pairlist {self.train_time:.2f} seconds')
self.pair_it_train = 0
self.train_time = 0
return
def get_init_model(self, pair: str) -> Any:
if pair not in self.dd.model_dictionary or not self.continual_learning:
init_model = None
else:
init_model = self.dd.model_dictionary[pair]
return init_model
def _set_train_queue(self):
"""
Sets train queue from existing train timestamps if they exist
otherwise it sets the train queue based on the provided whitelist.
"""
current_pairlist = self.config.get("exchange", {}).get("pair_whitelist")
if not self.dd.pair_dict:
logger.info('Set fresh train queue from whitelist. '
f'Queue: {current_pairlist}')
return deque(current_pairlist)
best_queue = deque()
pair_dict_sorted = sorted(self.dd.pair_dict.items(),
key=lambda k: k[1]['trained_timestamp'])
for pair in pair_dict_sorted:
if pair[0] in current_pairlist:
best_queue.append(pair[0])
for pair in current_pairlist:
if pair not in best_queue:
best_queue.appendleft(pair)
logger.info('Set existing queue from trained timestamps. '
f'Best approximation queue: {best_queue}')
return best_queue
# Following methods which are overridden by user made prediction models.
# See freqai/prediction_models/CatboostPredictionModel.py for an example.
@abstractmethod
def train(self, unfiltered_df: DataFrame, pair: str,
dk: FreqaiDataKitchen, **kwargs) -> Any:
"""
Filter the training data and train a model to it. Train makes heavy use of the datahandler
for storing, saving, loading, and analyzing the data.
:param unfiltered_df: Full dataframe for the current training period
:param metadata: pair metadata from strategy.
:return: Trained model which can be used to inference (self.predict)
"""
@abstractmethod
def fit(self, data_dictionary: Dict[str, Any], dk: FreqaiDataKitchen, **kwargs) -> Any:
"""
Most regressors use the same function names and arguments e.g. user
can drop in LGBMRegressor in place of CatBoostRegressor and all data
management will be properly handled by Freqai.
:param data_dictionary: Dict = the dictionary constructed by DataHandler to hold
all the training and test data/labels.
"""
return
@abstractmethod
def predict(
self, unfiltered_df: DataFrame, dk: FreqaiDataKitchen, **kwargs
) -> Tuple[DataFrame, NDArray[np.int_]]:
"""
Filter the prediction features data and predict with it.
:param unfiltered_df: Full dataframe for the current backtest period.
:param dk: FreqaiDataKitchen = Data management/analysis tool associated to present pair only
:param first: boolean = whether this is the first prediction or not.
:return:
:predictions: np.array of predictions
:do_predict: np.array of 1s and 0s to indicate places where freqai needed to remove
data (NaNs) or felt uncertain about data (i.e. SVM and/or DI index)
"""

View File

@ -92,6 +92,7 @@ class IFreqaiModel(ABC):
self.begin_time_train: float = 0
self.base_tf_seconds = timeframe_to_seconds(self.config['timeframe'])
self.continual_learning = self.freqai_info.get('continual_learning', False)
self.plot_features = self.ft_params.get("plot_feature_importances", 0)
self._threads: List[threading.Thread] = []
self._stop_event = threading.Event()
@ -143,8 +144,6 @@ class IFreqaiModel(ABC):
strategy, prediction_dataframe=dataframe, pair=metadata["pair"]
)
dk = self.start_backtesting(dataframe, metadata, self.dk)
# else:
# dk = self.start_backtesting_live_models(dataframe, metadata, self.dk)
dataframe = dk.remove_features_from_df(dk.return_dataframe)
self.clean_up()
@ -268,8 +267,8 @@ class IFreqaiModel(ABC):
if not dk.backtest_live_models:
logger.info(
f"Training {pair}, {self.pair_it}/{self.total_pairs} pairs"
f" from {tr_train_startts_str}"
f" to {tr_train_stopts_str}, {train_it}/{total_trains} "
f" from {tr_train_startts_str} "
f"to {tr_train_stopts_str}, {train_it}/{total_trains} "
"trains"
)
@ -285,9 +284,7 @@ class IFreqaiModel(ABC):
if dk.check_if_backtest_prediction_exists():
self.dd.load_metadata(dk)
if not dk.backtest_live_models:
self.check_if_feature_list_matches_strategy(dataframe_train, dk)
append_df = dk.get_backtesting_prediction()
dk.append_predictions(append_df)
else:
@ -299,24 +296,29 @@ class IFreqaiModel(ABC):
"mode"
)
dk.find_features(dataframe_train)
dk.find_labels(dataframe_train)
self.model = self.train(dataframe_train, pair, dk)
self.dd.pair_dict[pair]["trained_timestamp"] = int(
trained_timestamp.stopts)
if self.plot_features:
plot_feature_importance(self.model, pair, dk, self.plot_features)
if self.save_backtest_models:
logger.info('Saving backtest model to disk.')
self.dd.save_data(self.model, pair, dk)
else:
logger.info('Saving metadata to disk.')
self.dd.save_metadata(dk)
else:
self.model = self.dd.load_data(pair, dk)
self.check_if_feature_list_matches_strategy(dataframe_train, dk)
# self.check_if_feature_list_matches_strategy(dataframe_train, dk)
pred_df, do_preds = self.predict(dataframe_backtest, dk)
append_df = dk.get_predictions_to_append(pred_df, do_preds)
dk.append_predictions(append_df)
dk.save_backtesting_prediction(append_df)
dk.fill_predictions(dataframe)
return dk
def start_live(
@ -388,8 +390,7 @@ class IFreqaiModel(ABC):
self.dd.return_null_values_to_strategy(dataframe, dk)
return dk
# ensure user is feeding the correct indicators to the model
self.check_if_feature_list_matches_strategy(dataframe, dk)
dk.find_labels(dataframe)
self.build_strategy_return_arrays(dataframe, dk, metadata["pair"], trained_timestamp)
@ -508,7 +509,7 @@ class IFreqaiModel(ABC):
if ft_params.get(
"principal_component_analysis", False
):
dk.pca_transform(self.dk.data_dictionary['prediction_features'])
dk.pca_transform(dk.data_dictionary['prediction_features'])
if ft_params.get("use_SVM_to_remove_outliers", False):
dk.use_SVM_to_remove_outliers(predict=True)
@ -519,11 +520,10 @@ class IFreqaiModel(ABC):
if ft_params.get("use_DBSCAN_to_remove_outliers", False):
dk.use_DBSCAN_to_remove_outliers(predict=True)
def model_exists(
self,
dk: FreqaiDataKitchen,
scanning: bool = False,
) -> bool:
# ensure user is feeding the correct indicators to the model
self.check_if_feature_list_matches_strategy(dk.data_dictionary['prediction_features'], dk)
def model_exists(self, dk: FreqaiDataKitchen) -> bool:
"""
Given a pair and path, check if a model already exists
:param pair: pair e.g. BTC/USD
@ -533,9 +533,9 @@ class IFreqaiModel(ABC):
"""
path_to_modelfile = Path(dk.data_path / f"{dk.model_filename}_model.joblib")
file_exists = path_to_modelfile.is_file()
if file_exists and not scanning:
if file_exists:
logger.info("Found model at %s", dk.data_path / dk.model_filename)
elif not scanning:
else:
logger.info("Could not find model at %s", dk.data_path / dk.model_filename)
return file_exists
@ -582,6 +582,7 @@ class IFreqaiModel(ABC):
# find the features indicated by strategy and store in datakitchen
dk.find_features(unfiltered_dataframe)
dk.find_labels(unfiltered_dataframe)
model = self.train(unfiltered_dataframe, pair, dk)
@ -589,8 +590,8 @@ class IFreqaiModel(ABC):
dk.set_new_model_names(pair, int(new_trained_timerange.stopts))
self.dd.save_data(model, pair, dk)
if self.freqai_info["feature_parameters"].get("plot_feature_importance", False):
plot_feature_importance(model, pair, dk)
if self.plot_features:
plot_feature_importance(model, pair, dk, self.plot_features)
if self.freqai_info.get("purge_old_models", False):
self.dd.purge_old_models()

View File

@ -170,7 +170,7 @@ def plot_feature_importance(model: Any, pair: str, dk: FreqaiDataKitchen,
# Data preparation
fi_df = pd.DataFrame({
"feature_names": np.array(dk.training_features_list),
"feature_names": np.array(dk.data_dictionary['train_features'].columns),
"feature_importance": np.array(feature_importance)
})
fi_df_top = fi_df.nlargest(count_max, "feature_importance")[::-1]

View File

@ -21,6 +21,7 @@ jinja2==3.1.2
tables==3.7.0
blosc==1.10.6
joblib==1.2.0
pyarrow==9.0.0
# find first, C search in arrays
py_find_1st==1.1.5

View File

@ -8,13 +8,11 @@ hyperopt = [
'scikit-learn',
'scikit-optimize>=0.7.0',
'filelock',
'joblib',
'progressbar2',
]
freqai = [
'scikit-learn',
'joblib',
'catboost; platform_machine != "aarch64"',
'lightgbm',
]
@ -74,6 +72,8 @@ setup(
'pandas',
'tables',
'blosc',
'joblib',
'pyarrow',
'fastapi',
'uvicorn',
'psutil',

View File

@ -9,9 +9,11 @@ from pandas import DataFrame
from freqtrade.configuration import TimeRange
from freqtrade.constants import AVAILABLE_DATAHANDLERS
from freqtrade.data.history.featherdatahandler import FeatherDataHandler
from freqtrade.data.history.hdf5datahandler import HDF5DataHandler
from freqtrade.data.history.idatahandler import IDataHandler, get_datahandler, get_datahandlerclass
from freqtrade.data.history.jsondatahandler import JsonDataHandler, JsonGzDataHandler
from freqtrade.data.history.parquetdatahandler import ParquetDataHandler
from freqtrade.enums import CandleType, TradingMode
from tests.conftest import log_has
@ -152,6 +154,15 @@ def test_jsondatahandler_ohlcv_load(testdatadir, caplog):
assert df.columns.equals(df1.columns)
@pytest.mark.parametrize('datahandler', ['feather', 'parquet'])
def test_datahandler_trades_not_supported(datahandler, testdatadir, ):
dh = get_datahandler(testdatadir, datahandler)
with pytest.raises(NotImplementedError):
dh.trades_load('UNITTEST/ETH')
with pytest.raises(NotImplementedError):
dh.trades_store('UNITTEST/ETH', MagicMock())
def test_jsondatahandler_trades_load(testdatadir, caplog):
dh = JsonGzDataHandler(testdatadir)
logmsg = "Old trades format detected - converting"
@ -312,6 +323,67 @@ def test_hdf5datahandler_ohlcv_load_and_resave(
assert ohlcv.empty
@pytest.mark.parametrize('pair,timeframe,candle_type,candle_append,startdt,enddt', [
# Data goes from 2018-01-10 - 2018-01-30
('UNITTEST/BTC', '5m', 'spot', '', '2018-01-15', '2018-01-19'),
# Mark data goes from to 2021-11-15 2021-11-19
('UNITTEST/USDT', '1h', 'mark', '-mark', '2021-11-16', '2021-11-18'),
])
@pytest.mark.parametrize('datahandler', ['hdf5', 'feather', 'parquet'])
def test_generic_datahandler_ohlcv_load_and_resave(
datahandler,
testdatadir,
tmpdir,
pair,
timeframe,
candle_type,
candle_append,
startdt, enddt
):
tmpdir1 = Path(tmpdir)
tmpdir2 = tmpdir1
if candle_type not in ('', 'spot'):
tmpdir2 = tmpdir1 / 'futures'
tmpdir2.mkdir()
# Load data from one common file
dhbase = get_datahandler(testdatadir, 'json')
ohlcv = dhbase._ohlcv_load(pair, timeframe, None, candle_type=candle_type)
assert isinstance(ohlcv, DataFrame)
assert len(ohlcv) > 0
# Get data to test
dh = get_datahandler(testdatadir, datahandler)
file = tmpdir2 / f"UNITTEST_NEW-{timeframe}{candle_append}.{dh._get_file_extension()}"
assert not file.is_file()
dh1 = get_datahandler(tmpdir1, datahandler)
dh1.ohlcv_store('UNITTEST/NEW', timeframe, ohlcv, candle_type=candle_type)
assert file.is_file()
assert not ohlcv[ohlcv['date'] < startdt].empty
timerange = TimeRange.parse_timerange(f"{startdt.replace('-', '')}-{enddt.replace('-', '')}")
ohlcv = dhbase.ohlcv_load(pair, timeframe, timerange=timerange, candle_type=candle_type)
if datahandler == 'hdf5':
ohlcv1 = dh1._ohlcv_load('UNITTEST/NEW', timeframe, timerange, candle_type=candle_type)
if candle_type == 'mark':
ohlcv1['volume'] = 0.0
else:
ohlcv1 = dh1.ohlcv_load('UNITTEST/NEW', timeframe,
timerange=timerange, candle_type=candle_type)
assert len(ohlcv) == len(ohlcv1)
assert ohlcv.equals(ohlcv1)
assert ohlcv[ohlcv['date'] < startdt].empty
assert ohlcv[ohlcv['date'] > enddt].empty
# Try loading inexisting file
ohlcv = dh.ohlcv_load('UNITTEST/NONEXIST', timeframe, candle_type=candle_type)
assert ohlcv.empty
def test_hdf5datahandler_ohlcv_purge(mocker, testdatadir):
mocker.patch.object(Path, "exists", MagicMock(return_value=False))
unlinkmock = mocker.patch.object(Path, "unlink", MagicMock())
@ -330,13 +402,24 @@ def test_gethandlerclass():
cl = get_datahandlerclass('json')
assert cl == JsonDataHandler
assert issubclass(cl, IDataHandler)
cl = get_datahandlerclass('jsongz')
assert cl == JsonGzDataHandler
assert issubclass(cl, IDataHandler)
assert issubclass(cl, JsonDataHandler)
cl = get_datahandlerclass('hdf5')
assert cl == HDF5DataHandler
assert issubclass(cl, IDataHandler)
cl = get_datahandlerclass('feather')
assert cl == FeatherDataHandler
assert issubclass(cl, IDataHandler)
cl = get_datahandlerclass('parquet')
assert cl == ParquetDataHandler
assert issubclass(cl, IDataHandler)
with pytest.raises(ValueError, match=r"No datahandler for .*"):
get_datahandlerclass('DeadBeef')