From 3c0d2c446d6c886c9eccc3de5ab42cec4943ffb4 Mon Sep 17 00:00:00 2001 From: Matthias Date: Mon, 19 Sep 2022 20:23:20 +0200 Subject: [PATCH 01/11] Add Feather datahandler (no trade mode yet) --- freqtrade/constants.py | 2 +- freqtrade/data/history/featherdatahandler.py | 133 +++++++++++++++++++ freqtrade/data/history/idatahandler.py | 3 + 3 files changed, 137 insertions(+), 1 deletion(-) create mode 100644 freqtrade/data/history/featherdatahandler.py diff --git a/freqtrade/constants.py b/freqtrade/constants.py index fe17b40bc..5727aff0a 100644 --- a/freqtrade/constants.py +++ b/freqtrade/constants.py @@ -36,7 +36,7 @@ AVAILABLE_PAIRLISTS = ['StaticPairList', 'VolumePairList', 'PrecisionFilter', 'PriceFilter', 'RangeStabilityFilter', 'ShuffleFilter', 'SpreadFilter', 'VolatilityFilter'] AVAILABLE_PROTECTIONS = ['CooldownPeriod', 'LowProfitPairs', 'MaxDrawdown', 'StoplossGuard'] -AVAILABLE_DATAHANDLERS = ['json', 'jsongz', 'hdf5'] +AVAILABLE_DATAHANDLERS = ['json', 'jsongz', 'hdf5', 'feather'] BACKTEST_BREAKDOWNS = ['day', 'week', 'month'] BACKTEST_CACHE_AGE = ['none', 'day', 'week', 'month'] BACKTEST_CACHE_DEFAULT = 'day' diff --git a/freqtrade/data/history/featherdatahandler.py b/freqtrade/data/history/featherdatahandler.py new file mode 100644 index 000000000..dfb818ca8 --- /dev/null +++ b/freqtrade/data/history/featherdatahandler.py @@ -0,0 +1,133 @@ +import logging +from typing import Optional + +from pandas import DataFrame, read_feather, to_datetime + +from freqtrade.configuration import TimeRange +from freqtrade.constants import DEFAULT_DATAFRAME_COLUMNS, TradeList +from freqtrade.enums import CandleType + +from .idatahandler import IDataHandler + + +logger = logging.getLogger(__name__) + + +class FeatherDataHandler(IDataHandler): + + _columns = DEFAULT_DATAFRAME_COLUMNS + + def ohlcv_store( + self, pair: str, timeframe: str, data: DataFrame, candle_type: CandleType) -> None: + """ + Store data in json format "values". 
+ format looks as follows: + [[,,,,]] + :param pair: Pair - used to generate filename + :param timeframe: Timeframe - used to generate filename + :param data: Dataframe containing OHLCV data + :param candle_type: Any of the enum CandleType (must match trading mode!) + :return: None + """ + filename = self._pair_data_filename(self._datadir, pair, timeframe, candle_type) + self.create_dir_if_needed(filename) + + data.reset_index(drop=True).loc[:, self._columns].to_feather( + filename, compression_level=9, compression='lz4') + + def _ohlcv_load(self, pair: str, timeframe: str, + timerange: Optional[TimeRange], candle_type: CandleType + ) -> DataFrame: + """ + Internal method used to load data for one pair from disk. + Implements the loading and conversion to a Pandas dataframe. + Timerange trimming and dataframe validation happens outside of this method. + :param pair: Pair to load data + :param timeframe: Timeframe (e.g. "5m") + :param timerange: Limit data to be loaded to this timerange. + Optionally implemented by subclasses to avoid loading + all data where possible. + :param candle_type: Any of the enum CandleType (must match trading mode!) 
+ :return: DataFrame with ohlcv data, or empty DataFrame + """ + filename = self._pair_data_filename( + self._datadir, pair, timeframe, candle_type=candle_type) + if not filename.exists(): + # Fallback mode for 1M files + filename = self._pair_data_filename( + self._datadir, pair, timeframe, candle_type=candle_type, no_timeframe_modify=True) + if not filename.exists(): + return DataFrame(columns=self._columns) + try: + pairdata = read_feather(filename) + pairdata.columns = self._columns + except ValueError: + logger.error(f"Could not load data for {pair}.") + return DataFrame(columns=self._columns) + pairdata = pairdata.astype(dtype={'open': 'float', 'high': 'float', + 'low': 'float', 'close': 'float', 'volume': 'float'}) + pairdata['date'] = to_datetime(pairdata['date'], + unit='ms', + utc=True, + infer_datetime_format=True) + return pairdata + + def ohlcv_append( + self, + pair: str, + timeframe: str, + data: DataFrame, + candle_type: CandleType + ) -> None: + """ + Append data to existing data structures + :param pair: Pair + :param timeframe: Timeframe this ohlcv data is for + :param data: Data to append. + :param candle_type: Any of the enum CandleType (must match trading mode!) 
+ """ + raise NotImplementedError() + + def trades_store(self, pair: str, data: TradeList) -> None: + """ + Store trades data (list of Dicts) to file + :param pair: Pair - used for filename + :param data: List of Lists containing trade data, + column sequence as in DEFAULT_TRADES_COLUMNS + """ + # filename = self._pair_trades_filename(self._datadir, pair) + + raise NotImplementedError() + # array = pa.array(data) + # array + # feather.write_feather(data, filename) + + def trades_append(self, pair: str, data: TradeList): + """ + Append data to existing files + :param pair: Pair - used for filename + :param data: List of Lists containing trade data, + column sequence as in DEFAULT_TRADES_COLUMNS + """ + raise NotImplementedError() + + def _trades_load(self, pair: str, timerange: Optional[TimeRange] = None) -> TradeList: + """ + Load a pair from file, either .json.gz or .json + # TODO: respect timerange ... + :param pair: Load trades for this pair + :param timerange: Timerange to load trades for - currently not implemented + :return: List of trades + """ + raise NotImplementedError() + # filename = self._pair_trades_filename(self._datadir, pair) + # tradesdata = misc.file_load_json(filename) + + # if not tradesdata: + # return [] + + # return tradesdata + + @classmethod + def _get_file_extension(cls): + return "feather" diff --git a/freqtrade/data/history/idatahandler.py b/freqtrade/data/history/idatahandler.py index 8c1823c00..c98fd362f 100644 --- a/freqtrade/data/history/idatahandler.py +++ b/freqtrade/data/history/idatahandler.py @@ -375,6 +375,9 @@ def get_datahandlerclass(datatype: str) -> Type[IDataHandler]: elif datatype == 'hdf5': from .hdf5datahandler import HDF5DataHandler return HDF5DataHandler + elif datatype == 'feather': + from .featherdatahandler import FeatherDataHandler + return FeatherDataHandler else: raise ValueError(f"No datahandler for datatype {datatype} available.") From dc2b93228bb0febb3f65ed7238e29b6c9b772d19 Mon Sep 17 00:00:00 2001 From: 
Matthias Date: Tue, 20 Sep 2022 13:42:15 +0000 Subject: [PATCH 02/11] Add ParquetDataHandler --- freqtrade/constants.py | 2 +- freqtrade/data/history/idatahandler.py | 3 + freqtrade/data/history/parquetdatahandler.py | 132 +++++++++++++++++++ 3 files changed, 136 insertions(+), 1 deletion(-) create mode 100644 freqtrade/data/history/parquetdatahandler.py diff --git a/freqtrade/constants.py b/freqtrade/constants.py index 5727aff0a..76b37b2d8 100644 --- a/freqtrade/constants.py +++ b/freqtrade/constants.py @@ -36,7 +36,7 @@ AVAILABLE_PAIRLISTS = ['StaticPairList', 'VolumePairList', 'PrecisionFilter', 'PriceFilter', 'RangeStabilityFilter', 'ShuffleFilter', 'SpreadFilter', 'VolatilityFilter'] AVAILABLE_PROTECTIONS = ['CooldownPeriod', 'LowProfitPairs', 'MaxDrawdown', 'StoplossGuard'] -AVAILABLE_DATAHANDLERS = ['json', 'jsongz', 'hdf5', 'feather'] +AVAILABLE_DATAHANDLERS = ['json', 'jsongz', 'hdf5', 'feather', 'parquet'] BACKTEST_BREAKDOWNS = ['day', 'week', 'month'] BACKTEST_CACHE_AGE = ['none', 'day', 'week', 'month'] BACKTEST_CACHE_DEFAULT = 'day' diff --git a/freqtrade/data/history/idatahandler.py b/freqtrade/data/history/idatahandler.py index c98fd362f..eb5ad3621 100644 --- a/freqtrade/data/history/idatahandler.py +++ b/freqtrade/data/history/idatahandler.py @@ -378,6 +378,9 @@ def get_datahandlerclass(datatype: str) -> Type[IDataHandler]: elif datatype == 'feather': from .featherdatahandler import FeatherDataHandler return FeatherDataHandler + elif datatype == 'parquet': + from .parquetdatahandler import ParquetDataHandler + return ParquetDataHandler else: raise ValueError(f"No datahandler for datatype {datatype} available.") diff --git a/freqtrade/data/history/parquetdatahandler.py b/freqtrade/data/history/parquetdatahandler.py new file mode 100644 index 000000000..283d90ec0 --- /dev/null +++ b/freqtrade/data/history/parquetdatahandler.py @@ -0,0 +1,132 @@ +import logging +from typing import Optional + +from pandas import DataFrame, read_parquet, to_datetime + 
+from freqtrade.configuration import TimeRange +from freqtrade.constants import DEFAULT_DATAFRAME_COLUMNS, TradeList +from freqtrade.enums import CandleType + +from .idatahandler import IDataHandler + + +logger = logging.getLogger(__name__) + + +class ParquetDataHandler(IDataHandler): + + _columns = DEFAULT_DATAFRAME_COLUMNS + + def ohlcv_store( + self, pair: str, timeframe: str, data: DataFrame, candle_type: CandleType) -> None: + """ + Store data in json format "values". + format looks as follows: + [[,,,,]] + :param pair: Pair - used to generate filename + :param timeframe: Timeframe - used to generate filename + :param data: Dataframe containing OHLCV data + :param candle_type: Any of the enum CandleType (must match trading mode!) + :return: None + """ + filename = self._pair_data_filename(self._datadir, pair, timeframe, candle_type) + self.create_dir_if_needed(filename) + + data.reset_index(drop=True).loc[:, self._columns].to_parquet(filename) + + def _ohlcv_load(self, pair: str, timeframe: str, + timerange: Optional[TimeRange], candle_type: CandleType + ) -> DataFrame: + """ + Internal method used to load data for one pair from disk. + Implements the loading and conversion to a Pandas dataframe. + Timerange trimming and dataframe validation happens outside of this method. + :param pair: Pair to load data + :param timeframe: Timeframe (e.g. "5m") + :param timerange: Limit data to be loaded to this timerange. + Optionally implemented by subclasses to avoid loading + all data where possible. + :param candle_type: Any of the enum CandleType (must match trading mode!) 
+ :return: DataFrame with ohlcv data, or empty DataFrame + """ + filename = self._pair_data_filename( + self._datadir, pair, timeframe, candle_type=candle_type) + if not filename.exists(): + # Fallback mode for 1M files + filename = self._pair_data_filename( + self._datadir, pair, timeframe, candle_type=candle_type, no_timeframe_modify=True) + if not filename.exists(): + return DataFrame(columns=self._columns) + try: + pairdata = read_parquet(filename) + pairdata.columns = self._columns + except ValueError: + logger.error(f"Could not load data for {pair}.") + return DataFrame(columns=self._columns) + pairdata = pairdata.astype(dtype={'open': 'float', 'high': 'float', + 'low': 'float', 'close': 'float', 'volume': 'float'}) + pairdata['date'] = to_datetime(pairdata['date'], + unit='ms', + utc=True, + infer_datetime_format=True) + return pairdata + + def ohlcv_append( + self, + pair: str, + timeframe: str, + data: DataFrame, + candle_type: CandleType + ) -> None: + """ + Append data to existing data structures + :param pair: Pair + :param timeframe: Timeframe this ohlcv data is for + :param data: Data to append. + :param candle_type: Any of the enum CandleType (must match trading mode!) 
+ """ + raise NotImplementedError() + + def trades_store(self, pair: str, data: TradeList) -> None: + """ + Store trades data (list of Dicts) to file + :param pair: Pair - used for filename + :param data: List of Lists containing trade data, + column sequence as in DEFAULT_TRADES_COLUMNS + """ + # filename = self._pair_trades_filename(self._datadir, pair) + + raise NotImplementedError() + # array = pa.array(data) + # array + # feather.write_feather(data, filename) + + def trades_append(self, pair: str, data: TradeList): + """ + Append data to existing files + :param pair: Pair - used for filename + :param data: List of Lists containing trade data, + column sequence as in DEFAULT_TRADES_COLUMNS + """ + raise NotImplementedError() + + def _trades_load(self, pair: str, timerange: Optional[TimeRange] = None) -> TradeList: + """ + Load a pair from file, either .json.gz or .json + # TODO: respect timerange ... + :param pair: Load trades for this pair + :param timerange: Timerange to load trades for - currently not implemented + :return: List of trades + """ + raise NotImplementedError() + # filename = self._pair_trades_filename(self._datadir, pair) + # tradesdata = misc.file_load_json(filename) + + # if not tradesdata: + # return [] + + # return tradesdata + + @classmethod + def _get_file_extension(cls): + return "parquet" From 044891f5433735d0fc38e808a6e527113e4d67f6 Mon Sep 17 00:00:00 2001 From: Matthias Date: Fri, 23 Sep 2022 07:02:18 +0200 Subject: [PATCH 03/11] Add conditional formats depending on mode --- freqtrade/commands/cli_options.py | 2 +- freqtrade/constants.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/freqtrade/commands/cli_options.py b/freqtrade/commands/cli_options.py index f383f0768..e50fb86d8 100644 --- a/freqtrade/commands/cli_options.py +++ b/freqtrade/commands/cli_options.py @@ -440,7 +440,7 @@ AVAILABLE_CLI_OPTIONS = { "dataformat_trades": Arg( '--data-format-trades', help='Storage format for downloaded trades data. 
(default: `jsongz`).', - choices=constants.AVAILABLE_DATAHANDLERS, + choices=constants.AVAILABLE_DATAHANDLERS_TRADES, ), "show_timerange": Arg( '--show-timerange', diff --git a/freqtrade/constants.py b/freqtrade/constants.py index 76b37b2d8..4c2bd6e18 100644 --- a/freqtrade/constants.py +++ b/freqtrade/constants.py @@ -36,7 +36,8 @@ AVAILABLE_PAIRLISTS = ['StaticPairList', 'VolumePairList', 'PrecisionFilter', 'PriceFilter', 'RangeStabilityFilter', 'ShuffleFilter', 'SpreadFilter', 'VolatilityFilter'] AVAILABLE_PROTECTIONS = ['CooldownPeriod', 'LowProfitPairs', 'MaxDrawdown', 'StoplossGuard'] -AVAILABLE_DATAHANDLERS = ['json', 'jsongz', 'hdf5', 'feather', 'parquet'] +AVAILABLE_DATAHANDLERS_TRADES = ['json', 'jsongz', 'hdf5'] +AVAILABLE_DATAHANDLERS = AVAILABLE_DATAHANDLERS_TRADES + ['feather', 'parquet'] BACKTEST_BREAKDOWNS = ['day', 'week', 'month'] BACKTEST_CACHE_AGE = ['none', 'day', 'week', 'month'] BACKTEST_CACHE_DEFAULT = 'day' @@ -434,7 +435,7 @@ CONF_SCHEMA = { }, 'dataformat_trades': { 'type': 'string', - 'enum': AVAILABLE_DATAHANDLERS, + 'enum': AVAILABLE_DATAHANDLERS_TRADES, 'default': 'jsongz' }, 'position_adjustment_enable': {'type': 'boolean'}, From 983a16d9370759d5105ba2fa3d31d9c9a2be158f Mon Sep 17 00:00:00 2001 From: Matthias Date: Fri, 23 Sep 2022 07:02:28 +0200 Subject: [PATCH 04/11] Rudimentary "not implemented" test --- tests/data/test_datahandler.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/data/test_datahandler.py b/tests/data/test_datahandler.py index 6d9f72a81..090ea3eba 100644 --- a/tests/data/test_datahandler.py +++ b/tests/data/test_datahandler.py @@ -152,6 +152,15 @@ def test_jsondatahandler_ohlcv_load(testdatadir, caplog): assert df.columns.equals(df1.columns) +@pytest.mark.parametrize('datahandler', ['feather', 'parquet']) +def test_datahandler_trades_not_supported(datahandler, testdatadir, ): + dh = get_datahandler(testdatadir, datahandler) + with pytest.raises(NotImplementedError): + 
dh.trades_load('UNITTEST/ETH') + with pytest.raises(NotImplementedError): + dh.trades_store('UNITTEST/ETH', MagicMock()) + + def test_jsondatahandler_trades_load(testdatadir, caplog): dh = JsonGzDataHandler(testdatadir) logmsg = "Old trades format detected - converting" From 5fb56b09f2eddad2ca5852983b8286d08fd29111 Mon Sep 17 00:00:00 2001 From: Matthias Date: Fri, 23 Sep 2022 07:20:09 +0200 Subject: [PATCH 05/11] Test Feather/parquet datahandler init --- tests/data/test_datahandler.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/data/test_datahandler.py b/tests/data/test_datahandler.py index 090ea3eba..737f1f59f 100644 --- a/tests/data/test_datahandler.py +++ b/tests/data/test_datahandler.py @@ -9,9 +9,11 @@ from pandas import DataFrame from freqtrade.configuration import TimeRange from freqtrade.constants import AVAILABLE_DATAHANDLERS +from freqtrade.data.history.featherdatahandler import FeatherDataHandler from freqtrade.data.history.hdf5datahandler import HDF5DataHandler from freqtrade.data.history.idatahandler import IDataHandler, get_datahandler, get_datahandlerclass from freqtrade.data.history.jsondatahandler import JsonDataHandler, JsonGzDataHandler +from freqtrade.data.history.parquetdatahandler import ParquetDataHandler from freqtrade.enums import CandleType, TradingMode from tests.conftest import log_has @@ -339,13 +341,24 @@ def test_gethandlerclass(): cl = get_datahandlerclass('json') assert cl == JsonDataHandler assert issubclass(cl, IDataHandler) + cl = get_datahandlerclass('jsongz') assert cl == JsonGzDataHandler assert issubclass(cl, IDataHandler) assert issubclass(cl, JsonDataHandler) + cl = get_datahandlerclass('hdf5') assert cl == HDF5DataHandler assert issubclass(cl, IDataHandler) + + cl = get_datahandlerclass('feather') + assert cl == FeatherDataHandler + assert issubclass(cl, IDataHandler) + + cl = get_datahandlerclass('parquet') + assert cl == ParquetDataHandler + assert issubclass(cl, IDataHandler) + with 
pytest.raises(ValueError, match=r"No datahandler for .*"): get_datahandlerclass('DeadBeef') From 0bbb6faeba7520ef533d4dbbe0c0bd95acd5304c Mon Sep 17 00:00:00 2001 From: Matthias Date: Fri, 23 Sep 2022 07:30:11 +0200 Subject: [PATCH 06/11] Add generic datahandler test --- tests/data/test_datahandler.py | 55 ++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/tests/data/test_datahandler.py b/tests/data/test_datahandler.py index 737f1f59f..07c9a20df 100644 --- a/tests/data/test_datahandler.py +++ b/tests/data/test_datahandler.py @@ -323,6 +323,61 @@ def test_hdf5datahandler_ohlcv_load_and_resave( assert ohlcv.empty +@pytest.mark.parametrize('pair,timeframe,candle_type,candle_append,startdt,enddt', [ + # Data goes from 2018-01-10 - 2018-01-30 + ('UNITTEST/BTC', '5m', 'spot', '', '2018-01-15', '2018-01-19'), + # Mark data goes from to 2021-11-15 2021-11-19 + ('UNITTEST/USDT', '1h', 'mark', '-mark', '2021-11-16', '2021-11-18'), +]) +@pytest.mark.parametrize('datahandler', ['hdf5', 'feather', 'parquet']) +def test_generic_datahandler_ohlcv_load_and_resave( + datahandler, + testdatadir, + tmpdir, + pair, + timeframe, + candle_type, + candle_append, + startdt, enddt +): + tmpdir1 = Path(tmpdir) + tmpdir2 = tmpdir1 + if candle_type not in ('', 'spot'): + tmpdir2 = tmpdir1 / 'futures' + tmpdir2.mkdir() + # Load data from one common file + dhbase = get_datahandler(testdatadir, 'json') + ohlcv = dhbase._ohlcv_load(pair, timeframe, None, candle_type=candle_type) + assert isinstance(ohlcv, DataFrame) + assert len(ohlcv) > 0 + + # Get data to test + dh = get_datahandler(testdatadir, datahandler) + + file = tmpdir2 / f"UNITTEST_NEW-{timeframe}{candle_append}.{dh._get_file_extension()}" + assert not file.is_file() + + dh1 = get_datahandler(tmpdir1, datahandler) + dh1.ohlcv_store('UNITTEST/NEW', timeframe, ohlcv, candle_type=candle_type) + assert file.is_file() + + assert not ohlcv[ohlcv['date'] < startdt].empty + + timerange = 
TimeRange.parse_timerange(f"{startdt.replace('-', '')}-{enddt.replace('-', '')}") + + ohlcv = dhbase.ohlcv_load(pair, timeframe, timerange=timerange, candle_type=candle_type) + ohlcv1 = dh1.ohlcv_load('UNITTEST/NEW', timeframe, timerange=timerange, candle_type=candle_type) + + assert len(ohlcv) == len(ohlcv1) + assert ohlcv.equals(ohlcv1) + assert ohlcv[ohlcv['date'] < startdt].empty + assert ohlcv[ohlcv['date'] > enddt].empty + + # Try loading inexisting file + ohlcv = dh.ohlcv_load('UNITTEST/NONEXIST', timeframe, candle_type=candle_type) + assert ohlcv.empty + + def test_hdf5datahandler_ohlcv_purge(mocker, testdatadir): mocker.patch.object(Path, "exists", MagicMock(return_value=False)) unlinkmock = mocker.patch.object(Path, "unlink", MagicMock()) From 48352b8a375dae287ca85f0aae47eb875bee10d2 Mon Sep 17 00:00:00 2001 From: Matthias Date: Fri, 23 Sep 2022 14:49:17 +0000 Subject: [PATCH 07/11] Update hdf5 handler to reset index on load --- freqtrade/data/history/hdf5datahandler.py | 1 + tests/data/test_datahandler.py | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/freqtrade/data/history/hdf5datahandler.py b/freqtrade/data/history/hdf5datahandler.py index 01b7af7e7..fd46115de 100644 --- a/freqtrade/data/history/hdf5datahandler.py +++ b/freqtrade/data/history/hdf5datahandler.py @@ -81,6 +81,7 @@ class HDF5DataHandler(IDataHandler): raise ValueError("Wrong dataframe format") pairdata = pairdata.astype(dtype={'open': 'float', 'high': 'float', 'low': 'float', 'close': 'float', 'volume': 'float'}) + pairdata = pairdata.reset_index(drop=True) return pairdata def ohlcv_append( diff --git a/tests/data/test_datahandler.py b/tests/data/test_datahandler.py index 07c9a20df..8e1b0050a 100644 --- a/tests/data/test_datahandler.py +++ b/tests/data/test_datahandler.py @@ -366,7 +366,13 @@ def test_generic_datahandler_ohlcv_load_and_resave( timerange = TimeRange.parse_timerange(f"{startdt.replace('-', '')}-{enddt.replace('-', '')}") ohlcv = 
dhbase.ohlcv_load(pair, timeframe, timerange=timerange, candle_type=candle_type) - ohlcv1 = dh1.ohlcv_load('UNITTEST/NEW', timeframe, timerange=timerange, candle_type=candle_type) + if datahandler == 'hdf5': + ohlcv1 = dh1._ohlcv_load('UNITTEST/NEW', timeframe, timerange, candle_type=candle_type) + if candle_type == 'mark': + ohlcv1['volume'] = 0.0 + else: + ohlcv1 = dh1.ohlcv_load('UNITTEST/NEW', timeframe, + timerange=timerange, candle_type=candle_type) assert len(ohlcv) == len(ohlcv1) assert ohlcv.equals(ohlcv1) From 7e1e388b9ce492d390671df30b70ceabd5415e06 Mon Sep 17 00:00:00 2001 From: Matthias Date: Fri, 23 Sep 2022 18:24:30 +0200 Subject: [PATCH 08/11] Add feather/parquet docs --- docs/data-download.md | 44 ++++++++++++++++++-- freqtrade/data/history/featherdatahandler.py | 9 ++-- freqtrade/data/history/parquetdatahandler.py | 9 ++-- 3 files changed, 47 insertions(+), 15 deletions(-) diff --git a/docs/data-download.md b/docs/data-download.md index 2b76d4f74..60e3f5efe 100644 --- a/docs/data-download.md +++ b/docs/data-download.md @@ -179,9 +179,11 @@ freqtrade download-data --exchange binance --pairs ETH/USDT XRP/USDT BTC/USDT -- Freqtrade currently supports 3 data-formats for both OHLCV and trades data: -* `json` (plain "text" json files) -* `jsongz` (a gzip-zipped version of json files) -* `hdf5` (a high performance datastore) +* `json` - plain "text" json files +* `jsongz` - a gzip-zipped version of json files +* `hdf5` - a high performance datastore +* `feather` - a dataformat based on Apache Arrow +* `parquet` - columnar datastore By default, OHLCV data is stored as `json` data, while trades data is stored as `jsongz` data. @@ -200,6 +202,42 @@ If the default data-format has been changed during download, then the keys `data !!! Note You can convert between data-formats using the [convert-data](#sub-command-convert-data) and [convert-trade-data](#sub-command-convert-trade-data) methods. 
+#### Dataformat comparison + +The comparisons below were made with the following data, using the Linux `time` command. + +``` +Found 6 pair / timeframe combinations. ++----------+-------------+--------+---------------------+---------------------+ +| Pair | Timeframe | Type | From | To | +|----------+-------------+--------+---------------------+---------------------| +| BTC/USDT | 5m | spot | 2017-08-17 04:00:00 | 2022-09-13 19:25:00 | +| ETH/USDT | 1m | spot | 2017-08-17 04:00:00 | 2022-09-13 19:26:00 | +| BTC/USDT | 1m | spot | 2017-08-17 04:00:00 | 2022-09-13 19:30:00 | +| XRP/USDT | 5m | spot | 2018-05-04 08:10:00 | 2022-09-13 19:15:00 | +| XRP/USDT | 1m | spot | 2018-05-04 08:11:00 | 2022-09-13 19:22:00 | +| ETH/USDT | 5m | spot | 2017-08-17 04:00:00 | 2022-09-13 19:20:00 | ++----------+-------------+--------+---------------------+---------------------+ +``` + +Timings were taken in a not very scientific way with the following command (run once per data format), which forces reading the data into memory. + +``` bash +time freqtrade list-data --show-timerange --data-format-ohlcv <dataformat> +``` + +| Format | Size | Timing | +|------------|-------------|-------------| +| `json` | 149MB | 25.6s | +| `jsongz` | 39MB | 27s | +| `hdf5` | 145MB | 3.9s | +| `feather` | 72MB | 3.5s | +| `parquet` | 83MB | 3.8s | + +Size has been taken from the BTC/USDT 1m spot combination for the timerange specified above. + +For the best performance/size mix, we recommend using either feather or parquet. 
+ #### Sub-command convert data ``` diff --git a/freqtrade/data/history/featherdatahandler.py b/freqtrade/data/history/featherdatahandler.py index dfb818ca8..22a6805e7 100644 --- a/freqtrade/data/history/featherdatahandler.py +++ b/freqtrade/data/history/featherdatahandler.py @@ -58,12 +58,9 @@ class FeatherDataHandler(IDataHandler): self._datadir, pair, timeframe, candle_type=candle_type, no_timeframe_modify=True) if not filename.exists(): return DataFrame(columns=self._columns) - try: - pairdata = read_feather(filename) - pairdata.columns = self._columns - except ValueError: - logger.error(f"Could not load data for {pair}.") - return DataFrame(columns=self._columns) + + pairdata = read_feather(filename) + pairdata.columns = self._columns pairdata = pairdata.astype(dtype={'open': 'float', 'high': 'float', 'low': 'float', 'close': 'float', 'volume': 'float'}) pairdata['date'] = to_datetime(pairdata['date'], diff --git a/freqtrade/data/history/parquetdatahandler.py b/freqtrade/data/history/parquetdatahandler.py index 283d90ec0..57581861d 100644 --- a/freqtrade/data/history/parquetdatahandler.py +++ b/freqtrade/data/history/parquetdatahandler.py @@ -57,12 +57,9 @@ class ParquetDataHandler(IDataHandler): self._datadir, pair, timeframe, candle_type=candle_type, no_timeframe_modify=True) if not filename.exists(): return DataFrame(columns=self._columns) - try: - pairdata = read_parquet(filename) - pairdata.columns = self._columns - except ValueError: - logger.error(f"Could not load data for {pair}.") - return DataFrame(columns=self._columns) + + pairdata = read_parquet(filename) + pairdata.columns = self._columns pairdata = pairdata.astype(dtype={'open': 'float', 'high': 'float', 'low': 'float', 'close': 'float', 'volume': 'float'}) pairdata['date'] = to_datetime(pairdata['date'], From 4576d291a95e8703f7206f0441e0961be276ad2b Mon Sep 17 00:00:00 2001 From: Matthias Date: Fri, 23 Sep 2022 18:25:26 +0200 Subject: [PATCH 09/11] Update data command outputs --- 
docs/data-download.md | 64 +++++++++++++++++++++++-------------------- 1 file changed, 34 insertions(+), 30 deletions(-) diff --git a/docs/data-download.md b/docs/data-download.md index 60e3f5efe..700ca04f4 100644 --- a/docs/data-download.md +++ b/docs/data-download.md @@ -26,7 +26,7 @@ usage: freqtrade download-data [-h] [-v] [--logfile FILE] [-V] [-c PATH] [--timerange TIMERANGE] [--dl-trades] [--exchange EXCHANGE] [-t TIMEFRAMES [TIMEFRAMES ...]] [--erase] - [--data-format-ohlcv {json,jsongz,hdf5}] + [--data-format-ohlcv {json,jsongz,hdf5,feather,parquet}] [--data-format-trades {json,jsongz,hdf5}] [--trading-mode {spot,margin,futures}] [--prepend] @@ -55,7 +55,7 @@ optional arguments: list. Default: `1m 5m`. --erase Clean all existing data for the selected exchange/pairs/timeframes. - --data-format-ohlcv {json,jsongz,hdf5} + --data-format-ohlcv {json,jsongz,hdf5,feather,parquet} Storage format for downloaded candle (OHLCV) data. (default: `json`). --data-format-trades {json,jsongz,hdf5} @@ -76,7 +76,7 @@ Common arguments: `userdir/config.json` or `config.json` whichever exists). Multiple --config options may be used. Can be set to `-` to read config from stdin. - -d PATH, --datadir PATH + -d PATH, --datadir PATH, --data-dir PATH Path to directory with historical backtesting data. --userdir PATH, --user-data-dir PATH Path to userdata directory. 
@@ -244,32 +244,32 @@ To have a best performance/size mix, we recommend the use of either feather or p usage: freqtrade convert-data [-h] [-v] [--logfile FILE] [-V] [-c PATH] [-d PATH] [--userdir PATH] [-p PAIRS [PAIRS ...]] --format-from - {json,jsongz,hdf5} --format-to - {json,jsongz,hdf5} [--erase] - [-t {1m,3m,5m,15m,30m,1h,2h,4h,6h,8h,12h,1d,3d,1w,2w,1M,1y} [{1m,3m,5m,15m,30m,1h,2h,4h,6h,8h,12h,1d,3d,1w,2w,1M,1y} ...]] + {json,jsongz,hdf5,feather,parquet} --format-to + {json,jsongz,hdf5,feather,parquet} [--erase] [--exchange EXCHANGE] + [-t TIMEFRAMES [TIMEFRAMES ...]] [--trading-mode {spot,margin,futures}] - [--candle-types {spot,,futures,mark,index,premiumIndex,funding_rate} [{spot,,futures,mark,index,premiumIndex,funding_rate} ...]] + [--candle-types {spot,futures,mark,index,premiumIndex,funding_rate} [{spot,futures,mark,index,premiumIndex,funding_rate} ...]] optional arguments: -h, --help show this help message and exit -p PAIRS [PAIRS ...], --pairs PAIRS [PAIRS ...] Limit command to these pairs. Pairs are space- separated. - --format-from {json,jsongz,hdf5} + --format-from {json,jsongz,hdf5,feather,parquet} Source format for data conversion. - --format-to {json,jsongz,hdf5} + --format-to {json,jsongz,hdf5,feather,parquet} Destination format for data conversion. --erase Clean all existing data for the selected exchange/pairs/timeframes. - -t {1m,3m,5m,15m,30m,1h,2h,4h,6h,8h,12h,1d,3d,1w,2w,1M,1y} [{1m,3m,5m,15m,30m,1h,2h,4h,6h,8h,12h,1d,3d,1w,2w,1M,1y} ...], --timeframes {1m,3m,5m,15m,30m,1h,2h,4h,6h,8h,12h,1d,3d,1w,2w,1M,1y} [{1m,3m,5m,15m,30m,1h,2h,4h,6h,8h,12h,1d,3d,1w,2w,1M,1y} ...] - Specify which tickers to download. Space-separated - list. Default: `1m 5m`. --exchange EXCHANGE Exchange name (default: `bittrex`). Only valid if no config is provided. - --trading-mode {spot,margin,futures} + -t TIMEFRAMES [TIMEFRAMES ...], --timeframes TIMEFRAMES [TIMEFRAMES ...] + Specify which tickers to download. Space-separated + list. Default: `1m 5m`. 
+ --trading-mode {spot,margin,futures}, --tradingmode {spot,margin,futures} Select Trading mode - --candle-types {spot,,futures,mark,index,premiumIndex,funding_rate} [{spot,,futures,mark,index,premiumIndex,funding_rate} ...] + --candle-types {spot,futures,mark,index,premiumIndex,funding_rate} [{spot,futures,mark,index,premiumIndex,funding_rate} ...] Select candle type to use Common arguments: @@ -283,7 +283,7 @@ Common arguments: `userdir/config.json` or `config.json` whichever exists). Multiple --config options may be used. Can be set to `-` to read config from stdin. - -d PATH, --datadir PATH + -d PATH, --datadir PATH, --data-dir PATH Path to directory with historical backtesting data. --userdir PATH, --user-data-dir PATH Path to userdata directory. @@ -305,20 +305,24 @@ freqtrade convert-data --format-from json --format-to jsongz --datadir ~/.freqtr usage: freqtrade convert-trade-data [-h] [-v] [--logfile FILE] [-V] [-c PATH] [-d PATH] [--userdir PATH] [-p PAIRS [PAIRS ...]] --format-from - {json,jsongz,hdf5} --format-to - {json,jsongz,hdf5} [--erase] + {json,jsongz,hdf5,feather,parquet} + --format-to + {json,jsongz,hdf5,feather,parquet} + [--erase] [--exchange EXCHANGE] optional arguments: -h, --help show this help message and exit -p PAIRS [PAIRS ...], --pairs PAIRS [PAIRS ...] - Show profits for only these pairs. Pairs are space- + Limit command to these pairs. Pairs are space- separated. - --format-from {json,jsongz,hdf5} + --format-from {json,jsongz,hdf5,feather,parquet} Source format for data conversion. - --format-to {json,jsongz,hdf5} + --format-to {json,jsongz,hdf5,feather,parquet} Destination format for data conversion. --erase Clean all existing data for the selected exchange/pairs/timeframes. + --exchange EXCHANGE Exchange name (default: `bittrex`). Only valid if no + config is provided. Common arguments: -v, --verbose Verbose mode (-vv for more, -vvv to get all messages). 
@@ -331,7 +335,7 @@ Common arguments: `userdir/config.json` or `config.json` whichever exists). Multiple --config options may be used. Can be set to `-` to read config from stdin. - -d PATH, --datadir PATH + -d PATH, --datadir PATH, --data-dir PATH Path to directory with historical backtesting data. --userdir PATH, --user-data-dir PATH Path to userdata directory. @@ -356,9 +360,9 @@ This command will allow you to repeat this last step for additional timeframes w usage: freqtrade trades-to-ohlcv [-h] [-v] [--logfile FILE] [-V] [-c PATH] [-d PATH] [--userdir PATH] [-p PAIRS [PAIRS ...]] - [-t {1m,3m,5m,15m,30m,1h,2h,4h,6h,8h,12h,1d,3d,1w,2w,1M,1y} [{1m,3m,5m,15m,30m,1h,2h,4h,6h,8h,12h,1d,3d,1w,2w,1M,1y} ...]] + [-t TIMEFRAMES [TIMEFRAMES ...]] [--exchange EXCHANGE] - [--data-format-ohlcv {json,jsongz,hdf5}] + [--data-format-ohlcv {json,jsongz,hdf5,feather,parquet}] [--data-format-trades {json,jsongz,hdf5}] optional arguments: @@ -366,12 +370,12 @@ optional arguments: -p PAIRS [PAIRS ...], --pairs PAIRS [PAIRS ...] Limit command to these pairs. Pairs are space- separated. - -t {1m,3m,5m,15m,30m,1h,2h,4h,6h,8h,12h,1d,3d,1w,2w,1M,1y} [{1m,3m,5m,15m,30m,1h,2h,4h,6h,8h,12h,1d,3d,1w,2w,1M,1y} ...], --timeframes {1m,3m,5m,15m,30m,1h,2h,4h,6h,8h,12h,1d,3d,1w,2w,1M,1y} [{1m,3m,5m,15m,30m,1h,2h,4h,6h,8h,12h,1d,3d,1w,2w,1M,1y} ...] + -t TIMEFRAMES [TIMEFRAMES ...], --timeframes TIMEFRAMES [TIMEFRAMES ...] Specify which tickers to download. Space-separated list. Default: `1m 5m`. --exchange EXCHANGE Exchange name (default: `bittrex`). Only valid if no config is provided. - --data-format-ohlcv {json,jsongz,hdf5} + --data-format-ohlcv {json,jsongz,hdf5,feather,parquet} Storage format for downloaded candle (OHLCV) data. (default: `json`). --data-format-trades {json,jsongz,hdf5} @@ -389,7 +393,7 @@ Common arguments: `userdir/config.json` or `config.json` whichever exists). Multiple --config options may be used. Can be set to `-` to read config from stdin. 
- -d PATH, --datadir PATH + -d PATH, --datadir PATH, --data-dir PATH Path to directory with historical backtesting data. --userdir PATH, --user-data-dir PATH Path to userdata directory. @@ -409,7 +413,7 @@ You can get a list of downloaded data using the `list-data` sub-command. ``` usage: freqtrade list-data [-h] [-v] [--logfile FILE] [-V] [-c PATH] [-d PATH] [--userdir PATH] [--exchange EXCHANGE] - [--data-format-ohlcv {json,jsongz,hdf5}] + [--data-format-ohlcv {json,jsongz,hdf5,feather,parquet}] [-p PAIRS [PAIRS ...]] [--trading-mode {spot,margin,futures}] [--show-timerange] @@ -418,13 +422,13 @@ optional arguments: -h, --help show this help message and exit --exchange EXCHANGE Exchange name (default: `bittrex`). Only valid if no config is provided. - --data-format-ohlcv {json,jsongz,hdf5} + --data-format-ohlcv {json,jsongz,hdf5,feather,parquet} Storage format for downloaded candle (OHLCV) data. (default: `json`). -p PAIRS [PAIRS ...], --pairs PAIRS [PAIRS ...] Limit command to these pairs. Pairs are space- separated. - --trading-mode {spot,margin,futures} + --trading-mode {spot,margin,futures}, --tradingmode {spot,margin,futures} Select Trading mode --show-timerange Show timerange available for available data. (May take a while to calculate). @@ -440,7 +444,7 @@ Common arguments: `userdir/config.json` or `config.json` whichever exists). Multiple --config options may be used. Can be set to `-` to read config from stdin. - -d PATH, --datadir PATH + -d PATH, --datadir PATH, --data-dir PATH Path to directory with historical backtesting data. --userdir PATH, --user-data-dir PATH Path to userdata directory. 
From 7c093388e7d01ddc859d23ca29f5aa4fb3ac4336 Mon Sep 17 00:00:00 2001 From: Matthias Date: Fri, 23 Sep 2022 19:36:23 +0200 Subject: [PATCH 10/11] Add pyarrow dependency --- environment.yml | 1 + requirements.txt | 2 ++ setup.py | 4 ++-- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/environment.yml b/environment.yml index d6d85de9d..5298b2baa 100644 --- a/environment.yml +++ b/environment.yml @@ -34,6 +34,7 @@ dependencies: - schedule - python-dateutil - joblib + - pyarrow # ============================ diff --git a/requirements.txt b/requirements.txt index 690e33a09..c12d3fb08 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,6 +21,7 @@ jinja2==3.1.2 tables==3.7.0 blosc==1.10.6 joblib==1.2.0 +pyarrow==9.0.0 # find first, C search in arrays py_find_1st==1.1.5 @@ -54,3 +55,4 @@ schedule==1.1.0 #WS Messages websockets==10.3 janus==1.0.0 + diff --git a/setup.py b/setup.py index 2e6e354b0..cdd461f3f 100644 --- a/setup.py +++ b/setup.py @@ -8,13 +8,11 @@ hyperopt = [ 'scikit-learn', 'scikit-optimize>=0.7.0', 'filelock', - 'joblib', 'progressbar2', ] freqai = [ 'scikit-learn', - 'joblib', 'catboost; platform_machine != "aarch64"', 'lightgbm', ] @@ -74,6 +72,8 @@ setup( 'pandas', 'tables', 'blosc', + 'joblib', + 'pyarrow' 'fastapi', 'uvicorn', 'psutil', From 2fffe7c5ddaee734aad278afe6470601d48bf371 Mon Sep 17 00:00:00 2001 From: Matthias Date: Fri, 23 Sep 2022 20:03:33 +0200 Subject: [PATCH 11/11] Fix missing comma --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index cdd461f3f..1547b7974 100644 --- a/setup.py +++ b/setup.py @@ -73,7 +73,7 @@ setup( 'tables', 'blosc', 'joblib', - 'pyarrow' + 'pyarrow', 'fastapi', 'uvicorn', 'psutil',