From 3c0d2c446d6c886c9eccc3de5ab42cec4943ffb4 Mon Sep 17 00:00:00 2001 From: Matthias Date: Mon, 19 Sep 2022 20:23:20 +0200 Subject: [PATCH 01/16] Add Feather datahandler (no trade mode yet) --- freqtrade/constants.py | 2 +- freqtrade/data/history/featherdatahandler.py | 133 +++++++++++++++++++ freqtrade/data/history/idatahandler.py | 3 + 3 files changed, 137 insertions(+), 1 deletion(-) create mode 100644 freqtrade/data/history/featherdatahandler.py diff --git a/freqtrade/constants.py b/freqtrade/constants.py index fe17b40bc..5727aff0a 100644 --- a/freqtrade/constants.py +++ b/freqtrade/constants.py @@ -36,7 +36,7 @@ AVAILABLE_PAIRLISTS = ['StaticPairList', 'VolumePairList', 'PrecisionFilter', 'PriceFilter', 'RangeStabilityFilter', 'ShuffleFilter', 'SpreadFilter', 'VolatilityFilter'] AVAILABLE_PROTECTIONS = ['CooldownPeriod', 'LowProfitPairs', 'MaxDrawdown', 'StoplossGuard'] -AVAILABLE_DATAHANDLERS = ['json', 'jsongz', 'hdf5'] +AVAILABLE_DATAHANDLERS = ['json', 'jsongz', 'hdf5', 'feather'] BACKTEST_BREAKDOWNS = ['day', 'week', 'month'] BACKTEST_CACHE_AGE = ['none', 'day', 'week', 'month'] BACKTEST_CACHE_DEFAULT = 'day' diff --git a/freqtrade/data/history/featherdatahandler.py b/freqtrade/data/history/featherdatahandler.py new file mode 100644 index 000000000..dfb818ca8 --- /dev/null +++ b/freqtrade/data/history/featherdatahandler.py @@ -0,0 +1,133 @@ +import logging +from typing import Optional + +from pandas import DataFrame, read_feather, to_datetime + +from freqtrade.configuration import TimeRange +from freqtrade.constants import DEFAULT_DATAFRAME_COLUMNS, TradeList +from freqtrade.enums import CandleType + +from .idatahandler import IDataHandler + + +logger = logging.getLogger(__name__) + + +class FeatherDataHandler(IDataHandler): + + _columns = DEFAULT_DATAFRAME_COLUMNS + + def ohlcv_store( + self, pair: str, timeframe: str, data: DataFrame, candle_type: CandleType) -> None: + """ + Store data in json format "values". + format looks as follows: + [[,,,,]] + :param pair: Pair - used to generate filename + :param timeframe: Timeframe - used to generate filename + :param data: Dataframe containing OHLCV data + :param candle_type: Any of the enum CandleType (must match trading mode!) + :return: None + """ + filename = self._pair_data_filename(self._datadir, pair, timeframe, candle_type) + self.create_dir_if_needed(filename) + + data.reset_index(drop=True).loc[:, self._columns].to_feather( + filename, compression_level=9, compression='lz4') + + def _ohlcv_load(self, pair: str, timeframe: str, + timerange: Optional[TimeRange], candle_type: CandleType + ) -> DataFrame: + """ + Internal method used to load data for one pair from disk. + Implements the loading and conversion to a Pandas dataframe. + Timerange trimming and dataframe validation happens outside of this method. + :param pair: Pair to load data + :param timeframe: Timeframe (e.g. "5m") + :param timerange: Limit data to be loaded to this timerange. + Optionally implemented by subclasses to avoid loading + all data where possible. + :param candle_type: Any of the enum CandleType (must match trading mode!) 
+ :return: DataFrame with ohlcv data, or empty DataFrame + """ + filename = self._pair_data_filename( + self._datadir, pair, timeframe, candle_type=candle_type) + if not filename.exists(): + # Fallback mode for 1M files + filename = self._pair_data_filename( + self._datadir, pair, timeframe, candle_type=candle_type, no_timeframe_modify=True) + if not filename.exists(): + return DataFrame(columns=self._columns) + try: + pairdata = read_feather(filename) + pairdata.columns = self._columns + except ValueError: + logger.error(f"Could not load data for {pair}.") + return DataFrame(columns=self._columns) + pairdata = pairdata.astype(dtype={'open': 'float', 'high': 'float', + 'low': 'float', 'close': 'float', 'volume': 'float'}) + pairdata['date'] = to_datetime(pairdata['date'], + unit='ms', + utc=True, + infer_datetime_format=True) + return pairdata + + def ohlcv_append( + self, + pair: str, + timeframe: str, + data: DataFrame, + candle_type: CandleType + ) -> None: + """ + Append data to existing data structures + :param pair: Pair + :param timeframe: Timeframe this ohlcv data is for + :param data: Data to append. + :param candle_type: Any of the enum CandleType (must match trading mode!) + """ + raise NotImplementedError() + + def trades_store(self, pair: str, data: TradeList) -> None: + """ + Store trades data (list of Dicts) to file + :param pair: Pair - used for filename + :param data: List of Lists containing trade data, + column sequence as in DEFAULT_TRADES_COLUMNS + """ + # filename = self._pair_trades_filename(self._datadir, pair) + + raise NotImplementedError() + # array = pa.array(data) + # array + # feather.write_feather(data, filename) + + def trades_append(self, pair: str, data: TradeList): + """ + Append data to existing files + :param pair: Pair - used for filename + :param data: List of Lists containing trade data, + column sequence as in DEFAULT_TRADES_COLUMNS + """ + raise NotImplementedError() + + def _trades_load(self, pair: str, timerange: Optional[TimeRange] = None) -> TradeList: + """ + Load a pair from file, either .json.gz or .json + # TODO: respect timerange ... 
+ :param pair: Load trades for this pair + :param timerange: Timerange to load trades for - currently not implemented + :return: List of trades + """ + raise NotImplementedError() + # filename = self._pair_trades_filename(self._datadir, pair) + # tradesdata = misc.file_load_json(filename) + + # if not tradesdata: + # return [] + + # return tradesdata + + @classmethod + def _get_file_extension(cls): + return "feather" diff --git a/freqtrade/data/history/idatahandler.py b/freqtrade/data/history/idatahandler.py index 8c1823c00..c98fd362f 100644 --- a/freqtrade/data/history/idatahandler.py +++ b/freqtrade/data/history/idatahandler.py @@ -375,6 +375,9 @@ def get_datahandlerclass(datatype: str) -> Type[IDataHandler]: elif datatype == 'hdf5': from .hdf5datahandler import HDF5DataHandler return HDF5DataHandler + elif datatype == 'feather': + from .featherdatahandler import FeatherDataHandler + return FeatherDataHandler else: raise ValueError(f"No datahandler for datatype {datatype} available.") From dc2b93228bb0febb3f65ed7238e29b6c9b772d19 Mon Sep 17 00:00:00 2001 From: Matthias Date: Tue, 20 Sep 2022 13:42:15 +0000 Subject: [PATCH 02/16] Add ParquetDataHandler --- freqtrade/constants.py | 2 +- freqtrade/data/history/idatahandler.py | 3 + freqtrade/data/history/parquetdatahandler.py | 132 +++++++++++++++++++ 3 files changed, 136 insertions(+), 1 deletion(-) create mode 100644 freqtrade/data/history/parquetdatahandler.py diff --git a/freqtrade/constants.py b/freqtrade/constants.py index 5727aff0a..76b37b2d8 100644 --- a/freqtrade/constants.py +++ b/freqtrade/constants.py @@ -36,7 +36,7 @@ AVAILABLE_PAIRLISTS = ['StaticPairList', 'VolumePairList', 'PrecisionFilter', 'PriceFilter', 'RangeStabilityFilter', 'ShuffleFilter', 'SpreadFilter', 'VolatilityFilter'] AVAILABLE_PROTECTIONS = ['CooldownPeriod', 'LowProfitPairs', 'MaxDrawdown', 'StoplossGuard'] -AVAILABLE_DATAHANDLERS = ['json', 'jsongz', 'hdf5', 'feather'] +AVAILABLE_DATAHANDLERS = ['json', 'jsongz', 'hdf5', 'feather', 'parquet'] BACKTEST_BREAKDOWNS = ['day', 'week', 'month'] BACKTEST_CACHE_AGE = ['none', 'day', 'week', 'month'] BACKTEST_CACHE_DEFAULT = 'day' diff --git a/freqtrade/data/history/idatahandler.py b/freqtrade/data/history/idatahandler.py index c98fd362f..eb5ad3621 100644 --- a/freqtrade/data/history/idatahandler.py +++ b/freqtrade/data/history/idatahandler.py @@ -378,6 +378,9 @@ def get_datahandlerclass(datatype: str) -> Type[IDataHandler]: elif datatype == 'feather': from .featherdatahandler import FeatherDataHandler return FeatherDataHandler + elif datatype == 'parquet': + from .parquetdatahandler import ParquetDataHandler + return ParquetDataHandler else: raise ValueError(f"No datahandler for datatype {datatype} available.") diff --git a/freqtrade/data/history/parquetdatahandler.py b/freqtrade/data/history/parquetdatahandler.py new file mode 100644 index 000000000..283d90ec0 --- /dev/null +++ b/freqtrade/data/history/parquetdatahandler.py @@ -0,0 +1,132 @@ +import logging +from typing import Optional + +from pandas import DataFrame, read_parquet, to_datetime + +from freqtrade.configuration import TimeRange +from freqtrade.constants import DEFAULT_DATAFRAME_COLUMNS, TradeList +from freqtrade.enums import CandleType + +from .idatahandler import IDataHandler + + +logger = logging.getLogger(__name__) + + +class ParquetDataHandler(IDataHandler): + + _columns = DEFAULT_DATAFRAME_COLUMNS + + def ohlcv_store( + self, pair: str, timeframe: str, data: DataFrame, candle_type: CandleType) -> None: + """ + Store data in json format "values". 
+ format looks as follows: + [[,,,,]] + :param pair: Pair - used to generate filename + :param timeframe: Timeframe - used to generate filename + :param data: Dataframe containing OHLCV data + :param candle_type: Any of the enum CandleType (must match trading mode!) + :return: None + """ + filename = self._pair_data_filename(self._datadir, pair, timeframe, candle_type) + self.create_dir_if_needed(filename) + + data.reset_index(drop=True).loc[:, self._columns].to_parquet(filename) + + def _ohlcv_load(self, pair: str, timeframe: str, + timerange: Optional[TimeRange], candle_type: CandleType + ) -> DataFrame: + """ + Internal method used to load data for one pair from disk. + Implements the loading and conversion to a Pandas dataframe. + Timerange trimming and dataframe validation happens outside of this method. + :param pair: Pair to load data + :param timeframe: Timeframe (e.g. "5m") + :param timerange: Limit data to be loaded to this timerange. + Optionally implemented by subclasses to avoid loading + all data where possible. + :param candle_type: Any of the enum CandleType (must match trading mode!) + :return: DataFrame with ohlcv data, or empty DataFrame + """ + filename = self._pair_data_filename( + self._datadir, pair, timeframe, candle_type=candle_type) + if not filename.exists(): + # Fallback mode for 1M files + filename = self._pair_data_filename( + self._datadir, pair, timeframe, candle_type=candle_type, no_timeframe_modify=True) + if not filename.exists(): + return DataFrame(columns=self._columns) + try: + pairdata = read_parquet(filename) + pairdata.columns = self._columns + except ValueError: + logger.error(f"Could not load data for {pair}.") + return DataFrame(columns=self._columns) + pairdata = pairdata.astype(dtype={'open': 'float', 'high': 'float', + 'low': 'float', 'close': 'float', 'volume': 'float'}) + pairdata['date'] = to_datetime(pairdata['date'], + unit='ms', + utc=True, + infer_datetime_format=True) + return pairdata + + def ohlcv_append( + self, + pair: str, + timeframe: str, + data: DataFrame, + candle_type: CandleType + ) -> None: + """ + Append data to existing data structures + :param pair: Pair + :param timeframe: Timeframe this ohlcv data is for + :param data: Data to append. + :param candle_type: Any of the enum CandleType (must match trading mode!) + """ + raise NotImplementedError() + + def trades_store(self, pair: str, data: TradeList) -> None: + """ + Store trades data (list of Dicts) to file + :param pair: Pair - used for filename + :param data: List of Lists containing trade data, + column sequence as in DEFAULT_TRADES_COLUMNS + """ + # filename = self._pair_trades_filename(self._datadir, pair) + + raise NotImplementedError() + # array = pa.array(data) + # array + # feather.write_feather(data, filename) + + def trades_append(self, pair: str, data: TradeList): + """ + Append data to existing files + :param pair: Pair - used for filename + :param data: List of Lists containing trade data, + column sequence as in DEFAULT_TRADES_COLUMNS + """ + raise NotImplementedError() + + def _trades_load(self, pair: str, timerange: Optional[TimeRange] = None) -> TradeList: + """ + Load a pair from file, either .json.gz or .json + # TODO: respect timerange ... 
+ :param pair: Load trades for this pair + :param timerange: Timerange to load trades for - currently not implemented + :return: List of trades + """ + raise NotImplementedError() + # filename = self._pair_trades_filename(self._datadir, pair) + # tradesdata = misc.file_load_json(filename) + + # if not tradesdata: + # return [] + + # return tradesdata + + @classmethod + def _get_file_extension(cls): + return "parquet" From 044891f5433735d0fc38e808a6e527113e4d67f6 Mon Sep 17 00:00:00 2001 From: Matthias Date: Fri, 23 Sep 2022 07:02:18 +0200 Subject: [PATCH 03/16] Add conditional formats depending on mode --- freqtrade/commands/cli_options.py | 2 +- freqtrade/constants.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/freqtrade/commands/cli_options.py b/freqtrade/commands/cli_options.py index f383f0768..e50fb86d8 100644 --- a/freqtrade/commands/cli_options.py +++ b/freqtrade/commands/cli_options.py @@ -440,7 +440,7 @@ AVAILABLE_CLI_OPTIONS = { "dataformat_trades": Arg( '--data-format-trades', help='Storage format for downloaded trades data. (default: `jsongz`).', - choices=constants.AVAILABLE_DATAHANDLERS, + choices=constants.AVAILABLE_DATAHANDLERS_TRADES, ), "show_timerange": Arg( '--show-timerange', diff --git a/freqtrade/constants.py b/freqtrade/constants.py index 76b37b2d8..4c2bd6e18 100644 --- a/freqtrade/constants.py +++ b/freqtrade/constants.py @@ -36,7 +36,8 @@ AVAILABLE_PAIRLISTS = ['StaticPairList', 'VolumePairList', 'PrecisionFilter', 'PriceFilter', 'RangeStabilityFilter', 'ShuffleFilter', 'SpreadFilter', 'VolatilityFilter'] AVAILABLE_PROTECTIONS = ['CooldownPeriod', 'LowProfitPairs', 'MaxDrawdown', 'StoplossGuard'] -AVAILABLE_DATAHANDLERS = ['json', 'jsongz', 'hdf5', 'feather', 'parquet'] +AVAILABLE_DATAHANDLERS_TRADES = ['json', 'jsongz', 'hdf5'] +AVAILABLE_DATAHANDLERS = AVAILABLE_DATAHANDLERS_TRADES + ['feather', 'parquet'] BACKTEST_BREAKDOWNS = ['day', 'week', 'month'] BACKTEST_CACHE_AGE = ['none', 'day', 'week', 'month'] BACKTEST_CACHE_DEFAULT = 'day' @@ -434,7 +435,7 @@ CONF_SCHEMA = { }, 'dataformat_trades': { 'type': 'string', - 'enum': AVAILABLE_DATAHANDLERS, + 'enum': AVAILABLE_DATAHANDLERS_TRADES, 'default': 'jsongz' }, 'position_adjustment_enable': {'type': 'boolean'}, From 983a16d9370759d5105ba2fa3d31d9c9a2be158f Mon Sep 17 00:00:00 2001 From: Matthias Date: Fri, 23 Sep 2022 07:02:28 +0200 Subject: [PATCH 04/16] Rudimentary "not implemented" test --- tests/data/test_datahandler.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/data/test_datahandler.py b/tests/data/test_datahandler.py index 6d9f72a81..090ea3eba 100644 --- a/tests/data/test_datahandler.py +++ b/tests/data/test_datahandler.py @@ -152,6 +152,15 @@ def test_jsondatahandler_ohlcv_load(testdatadir, caplog): assert df.columns.equals(df1.columns) +@pytest.mark.parametrize('datahandler', ['feather', 'parquet']) +def test_datahandler_trades_not_supported(datahandler, testdatadir, ): + dh = get_datahandler(testdatadir, datahandler) + with pytest.raises(NotImplementedError): + dh.trades_load('UNITTEST/ETH') + with pytest.raises(NotImplementedError): + dh.trades_store('UNITTEST/ETH', MagicMock()) + + def test_jsondatahandler_trades_load(testdatadir, caplog): dh = JsonGzDataHandler(testdatadir) logmsg = "Old trades format detected - converting" From 5fb56b09f2eddad2ca5852983b8286d08fd29111 Mon Sep 17 00:00:00 2001 From: Matthias Date: Fri, 23 Sep 2022 07:20:09 +0200 Subject: [PATCH 05/16] Test Feather/parquet datahandler init --- tests/data/test_datahandler.py | 13 
+++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/data/test_datahandler.py b/tests/data/test_datahandler.py index 090ea3eba..737f1f59f 100644 --- a/tests/data/test_datahandler.py +++ b/tests/data/test_datahandler.py @@ -9,9 +9,11 @@ from pandas import DataFrame from freqtrade.configuration import TimeRange from freqtrade.constants import AVAILABLE_DATAHANDLERS +from freqtrade.data.history.featherdatahandler import FeatherDataHandler from freqtrade.data.history.hdf5datahandler import HDF5DataHandler from freqtrade.data.history.idatahandler import IDataHandler, get_datahandler, get_datahandlerclass from freqtrade.data.history.jsondatahandler import JsonDataHandler, JsonGzDataHandler +from freqtrade.data.history.parquetdatahandler import ParquetDataHandler from freqtrade.enums import CandleType, TradingMode from tests.conftest import log_has @@ -339,13 +341,24 @@ def test_gethandlerclass(): cl = get_datahandlerclass('json') assert cl == JsonDataHandler assert issubclass(cl, IDataHandler) + cl = get_datahandlerclass('jsongz') assert cl == JsonGzDataHandler assert issubclass(cl, IDataHandler) assert issubclass(cl, JsonDataHandler) + cl = get_datahandlerclass('hdf5') assert cl == HDF5DataHandler assert issubclass(cl, IDataHandler) + + cl = get_datahandlerclass('feather') + assert cl == FeatherDataHandler + assert issubclass(cl, IDataHandler) + + cl = get_datahandlerclass('parquet') + assert cl == ParquetDataHandler + assert issubclass(cl, IDataHandler) + with pytest.raises(ValueError, match=r"No datahandler for .*"): get_datahandlerclass('DeadBeef') From 0bbb6faeba7520ef533d4dbbe0c0bd95acd5304c Mon Sep 17 00:00:00 2001 From: Matthias Date: Fri, 23 Sep 2022 07:30:11 +0200 Subject: [PATCH 06/16] Add generic datahandler test --- tests/data/test_datahandler.py | 55 ++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/tests/data/test_datahandler.py b/tests/data/test_datahandler.py index 737f1f59f..07c9a20df 100644 --- a/tests/data/test_datahandler.py +++ b/tests/data/test_datahandler.py @@ -323,6 +323,61 @@ def test_hdf5datahandler_ohlcv_load_and_resave( assert ohlcv.empty +@pytest.mark.parametrize('pair,timeframe,candle_type,candle_append,startdt,enddt', [ + # Data goes from 2018-01-10 - 2018-01-30 + ('UNITTEST/BTC', '5m', 'spot', '', '2018-01-15', '2018-01-19'), + # Mark data goes from to 2021-11-15 2021-11-19 + ('UNITTEST/USDT', '1h', 'mark', '-mark', '2021-11-16', '2021-11-18'), +]) +@pytest.mark.parametrize('datahandler', ['hdf5', 'feather', 'parquet']) +def test_generic_datahandler_ohlcv_load_and_resave( + datahandler, + testdatadir, + tmpdir, + pair, + timeframe, + candle_type, + candle_append, + startdt, enddt +): + tmpdir1 = Path(tmpdir) + tmpdir2 = tmpdir1 + if candle_type not in ('', 'spot'): + tmpdir2 = tmpdir1 / 'futures' + tmpdir2.mkdir() + # Load data from one common file + dhbase = get_datahandler(testdatadir, 'json') + ohlcv = dhbase._ohlcv_load(pair, timeframe, None, candle_type=candle_type) + assert isinstance(ohlcv, DataFrame) + assert len(ohlcv) > 0 + + # Get data to test + dh = get_datahandler(testdatadir, datahandler) + + file = tmpdir2 / f"UNITTEST_NEW-{timeframe}{candle_append}.{dh._get_file_extension()}" + assert not file.is_file() + + dh1 = get_datahandler(tmpdir1, datahandler) + dh1.ohlcv_store('UNITTEST/NEW', timeframe, ohlcv, candle_type=candle_type) + assert file.is_file() + + assert not ohlcv[ohlcv['date'] < startdt].empty + + timerange = TimeRange.parse_timerange(f"{startdt.replace('-', '')}-{enddt.replace('-', '')}") + + 
ohlcv = dhbase.ohlcv_load(pair, timeframe, timerange=timerange, candle_type=candle_type) + ohlcv1 = dh1.ohlcv_load('UNITTEST/NEW', timeframe, timerange=timerange, candle_type=candle_type) + + assert len(ohlcv) == len(ohlcv1) + assert ohlcv.equals(ohlcv1) + assert ohlcv[ohlcv['date'] < startdt].empty + assert ohlcv[ohlcv['date'] > enddt].empty + + # Try loading inexisting file + ohlcv = dh.ohlcv_load('UNITTEST/NONEXIST', timeframe, candle_type=candle_type) + assert ohlcv.empty + + def test_hdf5datahandler_ohlcv_purge(mocker, testdatadir): mocker.patch.object(Path, "exists", MagicMock(return_value=False)) unlinkmock = mocker.patch.object(Path, "unlink", MagicMock()) From 48352b8a375dae287ca85f0aae47eb875bee10d2 Mon Sep 17 00:00:00 2001 From: Matthias Date: Fri, 23 Sep 2022 14:49:17 +0000 Subject: [PATCH 07/16] Update hdf5 handler to reset index on load --- freqtrade/data/history/hdf5datahandler.py | 1 + tests/data/test_datahandler.py | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/freqtrade/data/history/hdf5datahandler.py b/freqtrade/data/history/hdf5datahandler.py index 01b7af7e7..fd46115de 100644 --- a/freqtrade/data/history/hdf5datahandler.py +++ b/freqtrade/data/history/hdf5datahandler.py @@ -81,6 +81,7 @@ class HDF5DataHandler(IDataHandler): raise ValueError("Wrong dataframe format") pairdata = pairdata.astype(dtype={'open': 'float', 'high': 'float', 'low': 'float', 'close': 'float', 'volume': 'float'}) + pairdata = pairdata.reset_index(drop=True) return pairdata def ohlcv_append( diff --git a/tests/data/test_datahandler.py b/tests/data/test_datahandler.py index 07c9a20df..8e1b0050a 100644 --- a/tests/data/test_datahandler.py +++ b/tests/data/test_datahandler.py @@ -366,7 +366,13 @@ def test_generic_datahandler_ohlcv_load_and_resave( timerange = TimeRange.parse_timerange(f"{startdt.replace('-', '')}-{enddt.replace('-', '')}") ohlcv = dhbase.ohlcv_load(pair, timeframe, timerange=timerange, candle_type=candle_type) - ohlcv1 = dh1.ohlcv_load('UNITTEST/NEW', timeframe, timerange=timerange, candle_type=candle_type) + if datahandler == 'hdf5': + ohlcv1 = dh1._ohlcv_load('UNITTEST/NEW', timeframe, timerange, candle_type=candle_type) + if candle_type == 'mark': + ohlcv1['volume'] = 0.0 + else: + ohlcv1 = dh1.ohlcv_load('UNITTEST/NEW', timeframe, + timerange=timerange, candle_type=candle_type) assert len(ohlcv) == len(ohlcv1) assert ohlcv.equals(ohlcv1) From 7e1e388b9ce492d390671df30b70ceabd5415e06 Mon Sep 17 00:00:00 2001 From: Matthias Date: Fri, 23 Sep 2022 18:24:30 +0200 Subject: [PATCH 08/16] Add feather/parquet docs --- docs/data-download.md | 44 ++++++++++++++++++-- freqtrade/data/history/featherdatahandler.py | 9 ++-- freqtrade/data/history/parquetdatahandler.py | 9 ++-- 3 files changed, 47 insertions(+), 15 deletions(-) diff --git a/docs/data-download.md b/docs/data-download.md index 2b76d4f74..60e3f5efe 100644 --- a/docs/data-download.md +++ b/docs/data-download.md @@ -179,9 +179,11 @@ freqtrade download-data --exchange binance --pairs ETH/USDT XRP/USDT BTC/USDT -- Freqtrade currently supports 3 data-formats for both OHLCV and trades data: -* `json` (plain "text" json files) -* `jsongz` (a gzip-zipped version of json files) -* `hdf5` (a high performance datastore) +* `json` - plain "text" json files +* `jsongz` - a gzip-zipped version of json files +* `hdf5` - a high performance datastore +* `feather` - a dataformat based on Apache Arrow +* `parquet` - columnar datastore By default, OHLCV data is stored as `json` data, while trades data is stored as `jsongz` 
data. @@ -200,6 +202,42 @@ If the default data-format has been changed during download, then the keys `data !!! Note You can convert between data-formats using the [convert-data](#sub-command-convert-data) and [convert-trade-data](#sub-command-convert-trade-data) methods. +#### Dataformat comparison + +The following comparisons have been made with the following data, and by using the linux `time` command. + +``` +Found 6 pair / timeframe combinations. ++----------+-------------+--------+---------------------+---------------------+ +| Pair | Timeframe | Type | From | To | +|----------+-------------+--------+---------------------+---------------------| +| BTC/USDT | 5m | spot | 2017-08-17 04:00:00 | 2022-09-13 19:25:00 | +| ETH/USDT | 1m | spot | 2017-08-17 04:00:00 | 2022-09-13 19:26:00 | +| BTC/USDT | 1m | spot | 2017-08-17 04:00:00 | 2022-09-13 19:30:00 | +| XRP/USDT | 5m | spot | 2018-05-04 08:10:00 | 2022-09-13 19:15:00 | +| XRP/USDT | 1m | spot | 2018-05-04 08:11:00 | 2022-09-13 19:22:00 | +| ETH/USDT | 5m | spot | 2017-08-17 04:00:00 | 2022-09-13 19:20:00 | ++----------+-------------+--------+---------------------+---------------------+ +``` + +Timings have been taken in a not very scientific way with the following command, which forces reading the data into memory. + +``` bash +time freqtrade list-data --show-timerange --data-format-ohlcv +``` + +| Format | Size | timing | +|------------|-------------|-------------| +| `json` | 149Mb | 25.6s | +| `jsongz` | 39Mb | 27s | +| `hdf5` | 145Mb | 3.9s | +| `feather` | 72Mb | 3.5s | +| `parquet` | 83Mb | 3.8s | + +Size has been taken from the BTC/USDT 1m spot combination for the timerange specified above. + +To have a best performance/size mix, we recommend the use of either feather or parquet. + #### Sub-command convert data ``` diff --git a/freqtrade/data/history/featherdatahandler.py b/freqtrade/data/history/featherdatahandler.py index dfb818ca8..22a6805e7 100644 --- a/freqtrade/data/history/featherdatahandler.py +++ b/freqtrade/data/history/featherdatahandler.py @@ -58,12 +58,9 @@ class FeatherDataHandler(IDataHandler): self._datadir, pair, timeframe, candle_type=candle_type, no_timeframe_modify=True) if not filename.exists(): return DataFrame(columns=self._columns) - try: - pairdata = read_feather(filename) - pairdata.columns = self._columns - except ValueError: - logger.error(f"Could not load data for {pair}.") - return DataFrame(columns=self._columns) + + pairdata = read_feather(filename) + pairdata.columns = self._columns pairdata = pairdata.astype(dtype={'open': 'float', 'high': 'float', 'low': 'float', 'close': 'float', 'volume': 'float'}) pairdata['date'] = to_datetime(pairdata['date'], diff --git a/freqtrade/data/history/parquetdatahandler.py b/freqtrade/data/history/parquetdatahandler.py index 283d90ec0..57581861d 100644 --- a/freqtrade/data/history/parquetdatahandler.py +++ b/freqtrade/data/history/parquetdatahandler.py @@ -57,12 +57,9 @@ class ParquetDataHandler(IDataHandler): self._datadir, pair, timeframe, candle_type=candle_type, no_timeframe_modify=True) if not filename.exists(): return DataFrame(columns=self._columns) - try: - pairdata = read_parquet(filename) - pairdata.columns = self._columns - except ValueError: - logger.error(f"Could not load data for {pair}.") - return DataFrame(columns=self._columns) + + pairdata = read_parquet(filename) + pairdata.columns = self._columns pairdata = pairdata.astype(dtype={'open': 'float', 'high': 'float', 'low': 'float', 'close': 'float', 'volume': 'float'}) pairdata['date'] = 
to_datetime(pairdata['date'], From 4576d291a95e8703f7206f0441e0961be276ad2b Mon Sep 17 00:00:00 2001 From: Matthias Date: Fri, 23 Sep 2022 18:25:26 +0200 Subject: [PATCH 09/16] Update data command outputs --- docs/data-download.md | 64 +++++++++++++++++++++++-------------------- 1 file changed, 34 insertions(+), 30 deletions(-) diff --git a/docs/data-download.md b/docs/data-download.md index 60e3f5efe..700ca04f4 100644 --- a/docs/data-download.md +++ b/docs/data-download.md @@ -26,7 +26,7 @@ usage: freqtrade download-data [-h] [-v] [--logfile FILE] [-V] [-c PATH] [--timerange TIMERANGE] [--dl-trades] [--exchange EXCHANGE] [-t TIMEFRAMES [TIMEFRAMES ...]] [--erase] - [--data-format-ohlcv {json,jsongz,hdf5}] + [--data-format-ohlcv {json,jsongz,hdf5,feather,parquet}] [--data-format-trades {json,jsongz,hdf5}] [--trading-mode {spot,margin,futures}] [--prepend] @@ -55,7 +55,7 @@ optional arguments: list. Default: `1m 5m`. --erase Clean all existing data for the selected exchange/pairs/timeframes. - --data-format-ohlcv {json,jsongz,hdf5} + --data-format-ohlcv {json,jsongz,hdf5,feather,parquet} Storage format for downloaded candle (OHLCV) data. (default: `json`). --data-format-trades {json,jsongz,hdf5} @@ -76,7 +76,7 @@ Common arguments: `userdir/config.json` or `config.json` whichever exists). Multiple --config options may be used. Can be set to `-` to read config from stdin. - -d PATH, --datadir PATH + -d PATH, --datadir PATH, --data-dir PATH Path to directory with historical backtesting data. --userdir PATH, --user-data-dir PATH Path to userdata directory. @@ -244,32 +244,32 @@ To have a best performance/size mix, we recommend the use of either feather or p usage: freqtrade convert-data [-h] [-v] [--logfile FILE] [-V] [-c PATH] [-d PATH] [--userdir PATH] [-p PAIRS [PAIRS ...]] --format-from - {json,jsongz,hdf5} --format-to - {json,jsongz,hdf5} [--erase] - [-t {1m,3m,5m,15m,30m,1h,2h,4h,6h,8h,12h,1d,3d,1w,2w,1M,1y} [{1m,3m,5m,15m,30m,1h,2h,4h,6h,8h,12h,1d,3d,1w,2w,1M,1y} ...]] + {json,jsongz,hdf5,feather,parquet} --format-to + {json,jsongz,hdf5,feather,parquet} [--erase] [--exchange EXCHANGE] + [-t TIMEFRAMES [TIMEFRAMES ...]] [--trading-mode {spot,margin,futures}] - [--candle-types {spot,,futures,mark,index,premiumIndex,funding_rate} [{spot,,futures,mark,index,premiumIndex,funding_rate} ...]] + [--candle-types {spot,futures,mark,index,premiumIndex,funding_rate} [{spot,futures,mark,index,premiumIndex,funding_rate} ...]] optional arguments: -h, --help show this help message and exit -p PAIRS [PAIRS ...], --pairs PAIRS [PAIRS ...] Limit command to these pairs. Pairs are space- separated. - --format-from {json,jsongz,hdf5} + --format-from {json,jsongz,hdf5,feather,parquet} Source format for data conversion. - --format-to {json,jsongz,hdf5} + --format-to {json,jsongz,hdf5,feather,parquet} Destination format for data conversion. --erase Clean all existing data for the selected exchange/pairs/timeframes. - -t {1m,3m,5m,15m,30m,1h,2h,4h,6h,8h,12h,1d,3d,1w,2w,1M,1y} [{1m,3m,5m,15m,30m,1h,2h,4h,6h,8h,12h,1d,3d,1w,2w,1M,1y} ...], --timeframes {1m,3m,5m,15m,30m,1h,2h,4h,6h,8h,12h,1d,3d,1w,2w,1M,1y} [{1m,3m,5m,15m,30m,1h,2h,4h,6h,8h,12h,1d,3d,1w,2w,1M,1y} ...] - Specify which tickers to download. Space-separated - list. Default: `1m 5m`. --exchange EXCHANGE Exchange name (default: `bittrex`). Only valid if no config is provided. - --trading-mode {spot,margin,futures} + -t TIMEFRAMES [TIMEFRAMES ...], --timeframes TIMEFRAMES [TIMEFRAMES ...] + Specify which tickers to download. Space-separated + list. 
Default: `1m 5m`. + --trading-mode {spot,margin,futures}, --tradingmode {spot,margin,futures} Select Trading mode - --candle-types {spot,,futures,mark,index,premiumIndex,funding_rate} [{spot,,futures,mark,index,premiumIndex,funding_rate} ...] + --candle-types {spot,futures,mark,index,premiumIndex,funding_rate} [{spot,futures,mark,index,premiumIndex,funding_rate} ...] Select candle type to use Common arguments: @@ -283,7 +283,7 @@ Common arguments: `userdir/config.json` or `config.json` whichever exists). Multiple --config options may be used. Can be set to `-` to read config from stdin. - -d PATH, --datadir PATH + -d PATH, --datadir PATH, --data-dir PATH Path to directory with historical backtesting data. --userdir PATH, --user-data-dir PATH Path to userdata directory. @@ -305,20 +305,24 @@ freqtrade convert-data --format-from json --format-to jsongz --datadir ~/.freqtr usage: freqtrade convert-trade-data [-h] [-v] [--logfile FILE] [-V] [-c PATH] [-d PATH] [--userdir PATH] [-p PAIRS [PAIRS ...]] --format-from - {json,jsongz,hdf5} --format-to - {json,jsongz,hdf5} [--erase] + {json,jsongz,hdf5,feather,parquet} + --format-to + {json,jsongz,hdf5,feather,parquet} + [--erase] [--exchange EXCHANGE] optional arguments: -h, --help show this help message and exit -p PAIRS [PAIRS ...], --pairs PAIRS [PAIRS ...] - Show profits for only these pairs. Pairs are space- + Limit command to these pairs. Pairs are space- separated. - --format-from {json,jsongz,hdf5} + --format-from {json,jsongz,hdf5,feather,parquet} Source format for data conversion. - --format-to {json,jsongz,hdf5} + --format-to {json,jsongz,hdf5,feather,parquet} Destination format for data conversion. --erase Clean all existing data for the selected exchange/pairs/timeframes. + --exchange EXCHANGE Exchange name (default: `bittrex`). Only valid if no + config is provided. Common arguments: -v, --verbose Verbose mode (-vv for more, -vvv to get all messages). @@ -331,7 +335,7 @@ Common arguments: `userdir/config.json` or `config.json` whichever exists). Multiple --config options may be used. Can be set to `-` to read config from stdin. - -d PATH, --datadir PATH + -d PATH, --datadir PATH, --data-dir PATH Path to directory with historical backtesting data. --userdir PATH, --user-data-dir PATH Path to userdata directory. @@ -356,9 +360,9 @@ This command will allow you to repeat this last step for additional timeframes w usage: freqtrade trades-to-ohlcv [-h] [-v] [--logfile FILE] [-V] [-c PATH] [-d PATH] [--userdir PATH] [-p PAIRS [PAIRS ...]] - [-t {1m,3m,5m,15m,30m,1h,2h,4h,6h,8h,12h,1d,3d,1w,2w,1M,1y} [{1m,3m,5m,15m,30m,1h,2h,4h,6h,8h,12h,1d,3d,1w,2w,1M,1y} ...]] + [-t TIMEFRAMES [TIMEFRAMES ...]] [--exchange EXCHANGE] - [--data-format-ohlcv {json,jsongz,hdf5}] + [--data-format-ohlcv {json,jsongz,hdf5,feather,parquet}] [--data-format-trades {json,jsongz,hdf5}] optional arguments: @@ -366,12 +370,12 @@ optional arguments: -p PAIRS [PAIRS ...], --pairs PAIRS [PAIRS ...] Limit command to these pairs. Pairs are space- separated. - -t {1m,3m,5m,15m,30m,1h,2h,4h,6h,8h,12h,1d,3d,1w,2w,1M,1y} [{1m,3m,5m,15m,30m,1h,2h,4h,6h,8h,12h,1d,3d,1w,2w,1M,1y} ...], --timeframes {1m,3m,5m,15m,30m,1h,2h,4h,6h,8h,12h,1d,3d,1w,2w,1M,1y} [{1m,3m,5m,15m,30m,1h,2h,4h,6h,8h,12h,1d,3d,1w,2w,1M,1y} ...] + -t TIMEFRAMES [TIMEFRAMES ...], --timeframes TIMEFRAMES [TIMEFRAMES ...] Specify which tickers to download. Space-separated list. Default: `1m 5m`. --exchange EXCHANGE Exchange name (default: `bittrex`). Only valid if no config is provided. 
- --data-format-ohlcv {json,jsongz,hdf5} + --data-format-ohlcv {json,jsongz,hdf5,feather,parquet} Storage format for downloaded candle (OHLCV) data. (default: `json`). --data-format-trades {json,jsongz,hdf5} @@ -389,7 +393,7 @@ Common arguments: `userdir/config.json` or `config.json` whichever exists). Multiple --config options may be used. Can be set to `-` to read config from stdin. - -d PATH, --datadir PATH + -d PATH, --datadir PATH, --data-dir PATH Path to directory with historical backtesting data. --userdir PATH, --user-data-dir PATH Path to userdata directory. @@ -409,7 +413,7 @@ You can get a list of downloaded data using the `list-data` sub-command. ``` usage: freqtrade list-data [-h] [-v] [--logfile FILE] [-V] [-c PATH] [-d PATH] [--userdir PATH] [--exchange EXCHANGE] - [--data-format-ohlcv {json,jsongz,hdf5}] + [--data-format-ohlcv {json,jsongz,hdf5,feather,parquet}] [-p PAIRS [PAIRS ...]] [--trading-mode {spot,margin,futures}] [--show-timerange] @@ -418,13 +422,13 @@ optional arguments: -h, --help show this help message and exit --exchange EXCHANGE Exchange name (default: `bittrex`). Only valid if no config is provided. - --data-format-ohlcv {json,jsongz,hdf5} + --data-format-ohlcv {json,jsongz,hdf5,feather,parquet} Storage format for downloaded candle (OHLCV) data. (default: `json`). -p PAIRS [PAIRS ...], --pairs PAIRS [PAIRS ...] Limit command to these pairs. Pairs are space- separated. - --trading-mode {spot,margin,futures} + --trading-mode {spot,margin,futures}, --tradingmode {spot,margin,futures} Select Trading mode --show-timerange Show timerange available for available data. (May take a while to calculate). @@ -440,7 +444,7 @@ Common arguments: `userdir/config.json` or `config.json` whichever exists). Multiple --config options may be used. Can be set to `-` to read config from stdin. - -d PATH, --datadir PATH + -d PATH, --datadir PATH, --data-dir PATH Path to directory with historical backtesting data. --userdir PATH, --user-data-dir PATH Path to userdata directory. 
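
The Feather and Parquet handlers introduced above delegate the actual serialization to pandas' Arrow-backed readers and writers, which is why the next patch adds `pyarrow` as a dependency. The snippet below is a minimal, self-contained sketch of the store/load round-trip that `ohlcv_store()` and `_ohlcv_load()` perform; the file path and candle values are illustrative only, and the real handlers derive the filename from pair, timeframe and candle type via `_pair_data_filename()`.

```python
from pathlib import Path

import pandas as pd

# DEFAULT_DATAFRAME_COLUMNS from freqtrade/constants.py
columns = ["date", "open", "high", "low", "close", "volume"]

# One hypothetical 5m candle, with the date as epoch milliseconds.
data = pd.DataFrame(
    [[1510603200000, 0.001, 0.0011, 0.0009, 0.00105, 1234.0]],
    columns=columns,
)

filename = Path("/tmp/XRP_USDT-5m.feather")  # illustrative path only

# Store: drop the index and keep only the known OHLCV columns,
# mirroring FeatherDataHandler.ohlcv_store().
data.reset_index(drop=True).loc[:, columns].to_feather(
    filename, compression_level=9, compression="lz4")

# Load: read back, restore float dtypes and parse the date column to UTC,
# mirroring FeatherDataHandler._ohlcv_load(). For parquet, swap in
# to_parquet() / read_parquet() - the rest of the logic is identical.
pairdata = pd.read_feather(filename)
pairdata.columns = columns
pairdata = pairdata.astype({col: "float" for col in columns[1:]})
pairdata["date"] = pd.to_datetime(pairdata["date"], unit="ms", utc=True)
print(pairdata)
```
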
From 7c093388e7d01ddc859d23ca29f5aa4fb3ac4336 Mon Sep 17 00:00:00 2001 From: Matthias Date: Fri, 23 Sep 2022 19:36:23 +0200 Subject: [PATCH 10/16] Add pyarrow dependency --- environment.yml | 1 + requirements.txt | 2 ++ setup.py | 4 ++-- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/environment.yml b/environment.yml index d6d85de9d..5298b2baa 100644 --- a/environment.yml +++ b/environment.yml @@ -34,6 +34,7 @@ dependencies: - schedule - python-dateutil - joblib + - pyarrow # ============================ diff --git a/requirements.txt b/requirements.txt index 690e33a09..c12d3fb08 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,6 +21,7 @@ jinja2==3.1.2 tables==3.7.0 blosc==1.10.6 joblib==1.2.0 +pyarrow==9.0.0 # find first, C search in arrays py_find_1st==1.1.5 @@ -54,3 +55,4 @@ schedule==1.1.0 #WS Messages websockets==10.3 janus==1.0.0 + diff --git a/setup.py b/setup.py index 2e6e354b0..cdd461f3f 100644 --- a/setup.py +++ b/setup.py @@ -8,13 +8,11 @@ hyperopt = [ 'scikit-learn', 'scikit-optimize>=0.7.0', 'filelock', - 'joblib', 'progressbar2', ] freqai = [ 'scikit-learn', - 'joblib', 'catboost; platform_machine != "aarch64"', 'lightgbm', ] @@ -74,6 +72,8 @@ setup( 'pandas', 'tables', 'blosc', + 'joblib', + 'pyarrow' 'fastapi', 'uvicorn', 'psutil', From 2fffe7c5ddaee734aad278afe6470601d48bf371 Mon Sep 17 00:00:00 2001 From: Matthias Date: Fri, 23 Sep 2022 20:03:33 +0200 Subject: [PATCH 11/16] Fix missing comma --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index cdd461f3f..1547b7974 100644 --- a/setup.py +++ b/setup.py @@ -73,7 +73,7 @@ setup( 'tables', 'blosc', 'joblib', - 'pyarrow' + 'pyarrow', 'fastapi', 'uvicorn', 'psutil', From e63f9e1c14ede90ef004cb410ed0f77f9490ddde Mon Sep 17 00:00:00 2001 From: Matthias Date: Sat, 24 Sep 2022 16:16:47 +0200 Subject: [PATCH 12/16] Use pre-commit in Ci to check for all things --- .github/workflows/ci.yml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 91d53044d..cb8084e59 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -272,6 +272,11 @@ jobs: pip install pyaml python build_helpers/pre_commit_update.py + pre-commit: + runs-on: ubuntu-22.04 + steps: + - uses: pre-commit/action@v3.0.0 + docs_check: runs-on: ubuntu-20.04 steps: @@ -302,7 +307,7 @@ jobs: # Notify only once - when CI completes (and after deploy) in case it's successfull notify-complete: - needs: [ build_linux, build_macos, build_windows, docs_check, mypy_version_check ] + needs: [ build_linux, build_macos, build_windows, docs_check, mypy_version_check, pre-commit ] runs-on: ubuntu-20.04 # Discord notification can't handle schedule events if: (github.event_name != 'schedule') @@ -327,7 +332,7 @@ jobs: webhookUrl: ${{ secrets.DISCORD_WEBHOOK }} deploy: - needs: [ build_linux, build_macos, build_windows, docs_check, mypy_version_check ] + needs: [ build_linux, build_macos, build_windows, docs_check, mypy_version_check, pre-commit ] runs-on: ubuntu-20.04 if: (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'release') && github.repository == 'freqtrade/freqtrade' From 1bd742f7e986238e1c4d011b4cee71e4218c60d6 Mon Sep 17 00:00:00 2001 From: Matthias Date: Sat, 24 Sep 2022 16:31:29 +0200 Subject: [PATCH 13/16] Properly setup pre-commit job --- .github/workflows/ci.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml 
b/.github/workflows/ci.yml index cb8084e59..b677d924f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -275,7 +275,12 @@ jobs: pre-commit: runs-on: ubuntu-22.04 steps: - - uses: pre-commit/action@v3.0.0 + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v4 + with: + python-version: "3.10" + - uses: pre-commit/action@v3.0.0 docs_check: runs-on: ubuntu-20.04 From 873d2a5069b35ff5b5ff35368999f014590ba98d Mon Sep 17 00:00:00 2001 From: robcaulk Date: Sun, 25 Sep 2022 11:18:10 +0200 Subject: [PATCH 14/16] no model save backtest, plot features backtest, ensure inlier plays nice, doc --- docs/freqai.md | 2 +- freqtrade/freqai/data_drawer.py | 23 ++++++++++++++- freqtrade/freqai/data_kitchen.py | 10 +++++-- freqtrade/freqai/freqai_interface.py | 43 ++++++++++++++-------------- freqtrade/freqai/utils.py | 2 +- 5 files changed, 52 insertions(+), 28 deletions(-) diff --git a/docs/freqai.md b/docs/freqai.md index a03162b45..a186ce01a 100644 --- a/docs/freqai.md +++ b/docs/freqai.md @@ -110,7 +110,7 @@ Mandatory parameters are marked as **Required**, which means that they are requi | `indicator_periods_candles` | Calculate indicators for `indicator_periods_candles` time periods and add them to the feature set.
**Datatype:** List of positive integers. | `stratify_training_data` | This value is used to indicate the grouping of the data. For example, 2 would set every 2nd data point into a separate dataset to be pulled from during training/testing. See details about how it works [here](#stratifying-the-data-for-training-and-testing-the-model)
**Datatype:** Positive integer. | `principal_component_analysis` | Automatically reduce the dimensionality of the data set using Principal Component Analysis. See details about how it works [here](#reducing-data-dimensionality-with-principal-component-analysis) -| `plot_feature_importance` | Create an interactive feature importance plot for each model.
**Datatype:** Boolean.
**Datatype:** Boolean, defaults to `False` +| `plot_feature_importances` | Create a feature importance plot for each model for the top/bottom `plot_feature_importances` number of features.
**Datatype:** Boolean.
**Datatype:** Boolean, defaults to `0` | `DI_threshold` | Activates the Dissimilarity Index for outlier detection when > 0. See details about how it works [here](#removing-outliers-with-the-dissimilarity-index).
**Datatype:** Positive float (typically < 1). | `use_SVM_to_remove_outliers` | Train a support vector machine to detect and remove outliers from the training data set, as well as from incoming data points. See details about how it works [here](#removing-outliers-using-a-support-vector-machine-svm).
**Datatype:** Boolean. | `svm_params` | All parameters available in Sklearn's `SGDOneClassSVM()`. See details about some select parameters [here](#removing-outliers-using-a-support-vector-machine-svm).
**Datatype:** Dictionary. diff --git a/freqtrade/freqai/data_drawer.py b/freqtrade/freqai/data_drawer.py index 7f4459fa5..e6a39b6e7 100644 --- a/freqtrade/freqai/data_drawer.py +++ b/freqtrade/freqai/data_drawer.py @@ -313,6 +313,7 @@ class FreqaiDataDrawer: """ dk.find_features(dataframe) + dk.find_labels(dataframe) full_labels = dk.label_list + dk.unique_class_list @@ -376,7 +377,27 @@ class FreqaiDataDrawer: if self.config.get("freqai", {}).get("purge_old_models", False): self.purge_old_models() - # Functions pulled back from FreqaiDataKitchen because they relied on DataDrawer + def save_metaddata(self, dk: FreqaiDataKitchen) -> None: + """ + Saves only metadata for backtesting studies if user prefers + not to save model data. This saves tremendous amounts of space + for users generating huge studies. + This is only active when `save_backtest_models`: false (not default) + """ + if not dk.data_path.is_dir(): + dk.data_path.mkdir(parents=True, exist_ok=True) + + save_path = Path(dk.data_path) + + dk.data["data_path"] = str(dk.data_path) + dk.data["model_filename"] = str(dk.model_filename) + dk.data["training_features_list"] = list(dk.data_dictionary["train_features"].columns) + dk.data["label_list"] = dk.label_list + + with open(save_path / f"{dk.model_filename}_metadata.json", "w") as fp: + rapidjson.dump(dk.data, fp, default=self.np_encoder, number_mode=rapidjson.NM_NATIVE) + + return def save_data(self, model: Any, coin: str, dk: FreqaiDataKitchen) -> None: """ diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py index 752cd0e45..f4fa4e5fd 100644 --- a/freqtrade/freqai/data_kitchen.py +++ b/freqtrade/freqai/data_kitchen.py @@ -831,7 +831,7 @@ class FreqaiDataKitchen: inlier_metric = pd.DataFrame( data=inliers.sum(axis=1) / no_prev_pts, - columns=['inlier_metric'], + columns=['%-inlier_metric'], index=compute_df.index ) @@ -881,11 +881,14 @@ class FreqaiDataKitchen: """ column_names = dataframe.columns features = [c for c in column_names if "%" in c] - labels = [c for c in column_names if "&" in c] if not features: raise OperationalException("Could not find any features!") self.training_features_list = features + + def find_labels(self, dataframe: DataFrame) -> None: + column_names = dataframe.columns + labels = [c for c in column_names if "&" in c] self.label_list = labels def check_if_pred_in_training_spaces(self) -> None: @@ -1206,7 +1209,8 @@ class FreqaiDataKitchen: def get_unique_classes_from_labels(self, dataframe: DataFrame) -> None: - self.find_features(dataframe) + # self.find_features(dataframe) + self.find_labels(dataframe) for key in self.label_list: if dataframe[key].dtype == object: diff --git a/freqtrade/freqai/freqai_interface.py b/freqtrade/freqai/freqai_interface.py index e0a45fb38..988aae4f5 100644 --- a/freqtrade/freqai/freqai_interface.py +++ b/freqtrade/freqai/freqai_interface.py @@ -92,6 +92,7 @@ class IFreqaiModel(ABC): self.begin_time_train: float = 0 self.base_tf_seconds = timeframe_to_seconds(self.config['timeframe']) self.continual_learning = self.freqai_info.get('continual_learning', False) + self.plot_features = self.ft_params.get("plot_feature_importances", 0) self._threads: List[threading.Thread] = [] self._stop_event = threading.Event() @@ -278,22 +279,24 @@ class IFreqaiModel(ABC): append_df = dk.get_backtesting_prediction() dk.append_predictions(append_df) else: - if not self.model_exists( - pair, dk, trained_timestamp=trained_timestamp_int - ): + if not self.model_exists(dk): dk.find_features(dataframe_train) + 
dk.find_labels(dataframe_train) self.model = self.train(dataframe_train, pair, dk) self.dd.pair_dict[pair]["trained_timestamp"] = int( trained_timestamp.stopts) - + if self.plot_features: + plot_feature_importance(self.model, pair, dk, self.plot_features) if self.save_backtest_models: logger.info('Saving backtest model to disk.') self.dd.save_data(self.model, pair, dk) + else: + logger.info('Saving metadata to disk.') + self.dd.save_metaddata(dk) else: self.model = self.dd.load_data(pair, dk) - self.check_if_feature_list_matches_strategy(dataframe_train, dk) - + # self.check_if_feature_list_matches_strategy(dataframe_train, dk) pred_df, do_preds = self.predict(dataframe_backtest, dk) append_df = dk.get_predictions_to_append(pred_df, do_preds) dk.append_predictions(append_df) @@ -372,8 +375,7 @@ class IFreqaiModel(ABC): self.dd.return_null_values_to_strategy(dataframe, dk) return dk - # ensure user is feeding the correct indicators to the model - self.check_if_feature_list_matches_strategy(dataframe, dk) + dk.find_labels(dataframe) self.build_strategy_return_arrays(dataframe, dk, metadata["pair"], trained_timestamp) @@ -492,7 +494,7 @@ class IFreqaiModel(ABC): if ft_params.get( "principal_component_analysis", False ): - dk.pca_transform(self.dk.data_dictionary['prediction_features']) + dk.pca_transform(dk.data_dictionary['prediction_features']) if ft_params.get("use_SVM_to_remove_outliers", False): dk.use_SVM_to_remove_outliers(predict=True) @@ -503,14 +505,10 @@ class IFreqaiModel(ABC): if ft_params.get("use_DBSCAN_to_remove_outliers", False): dk.use_DBSCAN_to_remove_outliers(predict=True) - def model_exists( - self, - pair: str, - dk: FreqaiDataKitchen, - trained_timestamp: int = None, - model_filename: str = "", - scanning: bool = False, - ) -> bool: + # ensure user is feeding the correct indicators to the model + self.check_if_feature_list_matches_strategy(dk.data_dictionary['prediction_features'], dk) + + def model_exists(self, dk: FreqaiDataKitchen) -> bool: """ Given a pair and path, check if a model already exists :param pair: pair e.g. BTC/USD @@ -518,11 +516,11 @@ class IFreqaiModel(ABC): :return: :boolean: whether the model file exists or not. 
""" - path_to_modelfile = Path(dk.data_path / f"{model_filename}_model.joblib") + path_to_modelfile = Path(dk.data_path / f"{dk.model_filename}_model.joblib") file_exists = path_to_modelfile.is_file() - if file_exists and not scanning: + if file_exists: logger.info("Found model at %s", dk.data_path / dk.model_filename) - elif not scanning: + else: logger.info("Could not find model at %s", dk.data_path / dk.model_filename) return file_exists @@ -569,6 +567,7 @@ class IFreqaiModel(ABC): # find the features indicated by strategy and store in datakitchen dk.find_features(unfiltered_dataframe) + dk.find_labels(unfiltered_dataframe) model = self.train(unfiltered_dataframe, pair, dk) @@ -576,8 +575,8 @@ class IFreqaiModel(ABC): dk.set_new_model_names(pair, new_trained_timerange) self.dd.save_data(model, pair, dk) - if self.freqai_info["feature_parameters"].get("plot_feature_importance", False): - plot_feature_importance(model, pair, dk) + if self.plot_features: + plot_feature_importance(model, pair, dk, self.plot_features) if self.freqai_info.get("purge_old_models", False): self.dd.purge_old_models() diff --git a/freqtrade/freqai/utils.py b/freqtrade/freqai/utils.py index f6358925c..22bc1e06e 100644 --- a/freqtrade/freqai/utils.py +++ b/freqtrade/freqai/utils.py @@ -170,7 +170,7 @@ def plot_feature_importance(model: Any, pair: str, dk: FreqaiDataKitchen, # Data preparation fi_df = pd.DataFrame({ - "feature_names": np.array(dk.training_features_list), + "feature_names": np.array(dk.data_dictionary['train_features'].columns), "feature_importance": np.array(feature_importance) }) fi_df_top = fi_df.nlargest(count_max, "feature_importance")[::-1] From adb5b98a3d282b92c35fde3467a49ac7ca55cd01 Mon Sep 17 00:00:00 2001 From: Matthias Date: Sun, 25 Sep 2022 19:29:20 +0200 Subject: [PATCH 15/16] Fix pre-commit lineending --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index c12d3fb08..d4d15b198 100644 --- a/requirements.txt +++ b/requirements.txt @@ -55,4 +55,3 @@ schedule==1.1.0 #WS Messages websockets==10.3 janus==1.0.0 - From 48e89e68b90d942dcbc8d42e7863378400430f1f Mon Sep 17 00:00:00 2001 From: robcaulk Date: Sun, 25 Sep 2022 20:22:19 +0200 Subject: [PATCH 16/16] fix typos --- docs/freqai.md | 4 ++-- freqtrade/freqai/data_drawer.py | 2 +- freqtrade/freqai/freqai_interface.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/freqai.md b/docs/freqai.md index a186ce01a..e840e7136 100644 --- a/docs/freqai.md +++ b/docs/freqai.md @@ -109,8 +109,8 @@ Mandatory parameters are marked as **Required**, which means that they are requi | `indicator_max_period_candles` | **No longer used**. User must use the strategy set `startup_candle_count` which defines the maximum *period* used in `populate_any_indicators()` for indicator creation (timeframe independent). FreqAI uses this information in combination with the maximum timeframe to calculate how many data points it should download so that the first data point does not have a NaN
**Datatype:** positive integer. | `indicator_periods_candles` | Calculate indicators for `indicator_periods_candles` time periods and add them to the feature set.
**Datatype:** List of positive integers. | `stratify_training_data` | This value is used to indicate the grouping of the data. For example, 2 would set every 2nd data point into a separate dataset to be pulled from during training/testing. See details about how it works [here](#stratifying-the-data-for-training-and-testing-the-model)
**Datatype:** Positive integer. -| `principal_component_analysis` | Automatically reduce the dimensionality of the data set using Principal Component Analysis. See details about how it works [here](#reducing-data-dimensionality-with-principal-component-analysis) -| `plot_feature_importances` | Create a feature importance plot for each model for the top/bottom `plot_feature_importances` number of features.
**Datatype:** Boolean.
**Datatype:** Boolean, defaults to `0` +| `principal_component_analysis` | Automatically reduce the dimensionality of the data set using Principal Component Analysis. See details about how it works [here](#reducing-data-dimensionality-with-principal-component-analysis)
**Datatype:** Boolean. defaults to `false`. +| `plot_feature_importances` | Create a feature importance plot for each model for the top/bottom `plot_feature_importances` number of features.
**Datatype:** Integer, defaults to `0`. | `DI_threshold` | Activates the Dissimilarity Index for outlier detection when > 0. See details about how it works [here](#removing-outliers-with-the-dissimilarity-index).
**Datatype:** Positive float (typically < 1). | `use_SVM_to_remove_outliers` | Train a support vector machine to detect and remove outliers from the training data set, as well as from incoming data points. See details about how it works [here](#removing-outliers-using-a-support-vector-machine-svm).
**Datatype:** Boolean. | `svm_params` | All parameters available in Sklearn's `SGDOneClassSVM()`. See details about some select parameters [here](#removing-outliers-using-a-support-vector-machine-svm).
**Datatype:** Dictionary. diff --git a/freqtrade/freqai/data_drawer.py b/freqtrade/freqai/data_drawer.py index e6a39b6e7..1839724f8 100644 --- a/freqtrade/freqai/data_drawer.py +++ b/freqtrade/freqai/data_drawer.py @@ -377,7 +377,7 @@ class FreqaiDataDrawer: if self.config.get("freqai", {}).get("purge_old_models", False): self.purge_old_models() - def save_metaddata(self, dk: FreqaiDataKitchen) -> None: + def save_metadata(self, dk: FreqaiDataKitchen) -> None: """ Saves only metadata for backtesting studies if user prefers not to save model data. This saves tremendous amounts of space diff --git a/freqtrade/freqai/freqai_interface.py b/freqtrade/freqai/freqai_interface.py index 988aae4f5..d9f917338 100644 --- a/freqtrade/freqai/freqai_interface.py +++ b/freqtrade/freqai/freqai_interface.py @@ -292,7 +292,7 @@ class IFreqaiModel(ABC): self.dd.save_data(self.model, pair, dk) else: logger.info('Saving metadata to disk.') - self.dd.save_metaddata(dk) + self.dd.save_metadata(dk) else: self.model = self.dd.load_data(pair, dk)
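
Taken together, the final patches make `plot_feature_importances` an integer giving the number of top/bottom features to plot per trained model, and skip writing model files during backtesting when `save_backtest_models` is false (only metadata is persisted via `save_metadata()`). The sketch below shows how these settings would sit in a FreqAI configuration and the top-N ranking the plotting helper performs; the option values, feature names and importances are made up for illustration.

```python
import numpy as np
import pandas as pd

# Illustrative FreqAI configuration fragment (example values, not defaults).
freqai_settings = {
    "freqai": {
        "save_backtest_models": False,       # backtests persist only metadata per model
        "feature_parameters": {
            "principal_component_analysis": False,
            "plot_feature_importances": 25,  # plot the 25 most/least important features
        },
    }
}

# Sketch of the ranking done in plot_feature_importance(): take the feature
# importances of a trained model and keep the top `count_max` entries.
count_max = freqai_settings["freqai"]["feature_parameters"]["plot_feature_importances"]
rng = np.random.default_rng(0)
fi_df = pd.DataFrame({
    "feature_names": [f"%-fake_feature_{i}" for i in range(100)],
    "feature_importance": rng.random(100),
})
fi_df_top = fi_df.nlargest(count_max, "feature_importance")[::-1]
print(fi_df_top)
```
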