Merge pull request #6753 from freqtrade/download_prepend

Download prepend
2022-05-01 15:15:16 +02:00
parent d5fc923dcb 2cedbe5704
commit 910addd02b
10 changed files with 109 additions and 33 deletions
--- a/docs/data-download.md
+++ b/docs/data-download.md
@@ -30,6 +30,7 @@ usage: freqtrade download-data [-h] [-v] [--logfile FILE] [-V] [-c PATH]
                               [--data-format-ohlcv {json,jsongz,hdf5}]
                               [--data-format-trades {json,jsongz,hdf5}]
                               [--trading-mode {spot,margin,futures}]
+                               [--prepend]

 optional arguments:
  -h, --help            show this help message and exit
@@ -62,6 +63,7 @@ optional arguments:
                        `jsongz`).
  --trading-mode {spot,margin,futures}
                        Select Trading mode
+  --prepend             Allow data prepending.

 Common arguments:
  -v, --verbose         Verbose mode (-vv for more, -vvv to get all messages).
@@ -157,10 +159,21 @@ freqtrade download-data --exchange binance --pairs .*/USDT
 - To change the exchange used to download the historical data from, please use a different configuration file (you'll probably need to adjust rate limits etc.)
 - To use `pairs.json` from some other directory, use `--pairs-file some_other_dir/pairs.json`.
 - To download historical candle (OHLCV) data for only 10 days, use `--days 10` (defaults to 30 days).
- To download historical candle (OHLCV) data from a fixed starting point, use `--timerange 20200101-` - which will download all data from January 1st, 2020. Eventually set end dates are ignored.
+- To download historical candle (OHLCV) data from a fixed starting point, use `--timerange 20200101-` - which will download all data from January 1st, 2020.
 - Use `--timeframes` to specify what timeframe download the historical candle (OHLCV) data for. Default is `--timeframes 1m 5m` which will download 1-minute and 5-minute data.
 - To use exchange, timeframe and list of pairs as defined in your configuration file, use the `-c/--config` option. With this, the script uses the whitelist defined in the config as the list of currency pairs to download data for and does not require the pairs.json file. You can combine `-c/--config` with most other options.

+#### Download additional data before the current timerange
+
+Assume you downloaded all data from 2022 (`--timerange 20220101-`), but you'd now also like to backtest with earlier data.
+You can do so by using the `--prepend` flag, combined with `--timerange` - specifying an end-date.
+
+``` bash
+freqtrade download-data --exchange binance --pairs ETH/USDT XRP/USDT BTC/USDT --prepend --timerange 20210101-20220101
+```
+
+!!! Note
+    In this mode, if data already exists, Freqtrade ignores the end-date you specify and uses the start of the existing data as the end-date instead.

 ### Data format

--- a/freqtrade/commands/arguments.py
+++ b/freqtrade/commands/arguments.py
@@ -72,7 +72,8 @@ ARGS_LIST_DATA = ["exchange", "dataformat_ohlcv", "pairs", "trading_mode"]

 ARGS_DOWNLOAD_DATA = ["pairs", "pairs_file", "days", "new_pairs_days", "include_inactive",
                      "timerange", "download_trades", "exchange", "timeframes",
-                      "erase", "dataformat_ohlcv", "dataformat_trades", "trading_mode"]
+                      "erase", "dataformat_ohlcv", "dataformat_trades", "trading_mode",
+                      "prepend_data"]

 ARGS_PLOT_DATAFRAME = ["pairs", "indicators1", "indicators2", "plot_limit",
                       "db_url", "trade_source", "export", "exportfilename",
--- a/freqtrade/commands/cli_options.py
+++ b/freqtrade/commands/cli_options.py
@@ -443,6 +443,11 @@ AVAILABLE_CLI_OPTIONS = {
        default=['1m', '5m'],
        nargs='+',
    ),
+    "prepend_data": Arg(
+        '--prepend',
+        help='Allow data prepending.',
+        action='store_true',
+    ),
    "erase": Arg(
        '--erase',
        help='Clean all existing data for the selected exchange/pairs/timeframes.',
--- a/freqtrade/commands/data_commands.py
+++ b/freqtrade/commands/data_commands.py
@@ -85,6 +85,7 @@ def start_download_data(args: Dict[str, Any]) -> None:
                new_pairs_days=config['new_pairs_days'],
                erase=bool(config.get('erase')), data_format=config['dataformat_ohlcv'],
                trading_mode=config.get('trading_mode', 'spot'),
+                prepend=config.get('prepend_data', False)
            )

    except KeyboardInterrupt:
--- a/freqtrade/configuration/configuration.py
+++ b/freqtrade/configuration/configuration.py
@@ -393,6 +393,8 @@ class Configuration:
        self._args_to_config(config, argname='trade_source',
                             logstring='Using trades from: {}')

+        self._args_to_config(config, argname='prepend_data',
+                             logstring='Prepend detected. Allowing data prepending.')
        self._args_to_config(config, argname='erase',
                             logstring='Erase detected. Deleting existing data.')

--- a/freqtrade/data/history/history_utils.py
+++ b/freqtrade/data/history/history_utils.py
@@ -139,8 +139,9 @@ def _load_cached_data_for_updating(
    timeframe: str,
    timerange: Optional[TimeRange],
    data_handler: IDataHandler,
-    candle_type: CandleType
-) -> Tuple[DataFrame, Optional[int]]:
+    candle_type: CandleType,
+    prepend: bool = False,
+) -> Tuple[DataFrame, Optional[int], Optional[int]]:
    """
    Load cached data to download more data.
    If timerange is passed in, checks whether data from an before the stored data will be
@@ -150,9 +151,12 @@ def _load_cached_data_for_updating(
    Note: Only used by download_pair_history().
    """
    start = None
+    end = None
    if timerange:
        if timerange.starttype == 'date':
            start = datetime.fromtimestamp(timerange.startts, tz=timezone.utc)
+        if timerange.stoptype == 'date':
+            end = datetime.fromtimestamp(timerange.stopts, tz=timezone.utc)

    # Intentionally don't pass timerange in - since we need to load the full dataset.
    data = data_handler.ohlcv_load(pair, timeframe=timeframe,
@@ -160,14 +164,17 @@ def _load_cached_data_for_updating(
                                   drop_incomplete=True, warn_no_data=False,
                                   candle_type=candle_type)
    if not data.empty:
-        if start and start < data.iloc[0]['date']:
+        if not prepend and start and start < data.iloc[0]['date']:
            # Earlier data than existing data requested, redownload all
            data = DataFrame(columns=DEFAULT_DATAFRAME_COLUMNS)
        else:
-            start = data.iloc[-1]['date']
-
+            if prepend:
+                end = data.iloc[0]['date']
+            else:
+                start = data.iloc[-1]['date']
    start_ms = int(start.timestamp() * 1000) if start else None
-    return data, start_ms
+    end_ms = int(end.timestamp() * 1000) if end else None
+    return data, start_ms, end_ms


 def _download_pair_history(pair: str, *,
@@ -180,6 +187,7 @@ def _download_pair_history(pair: str, *,
                           timerange: Optional[TimeRange] = None,
                           candle_type: CandleType,
                           erase: bool = False,
+                           prepend: bool = False,
                           ) -> bool:
    """
    Download latest candles from the exchange for the pair and timeframe passed in parameters
@@ -187,8 +195,6 @@ def _download_pair_history(pair: str, *,
    exists in a cache. If timerange starts earlier than the data in the cache,
    the full data will be redownloaded

-    Based on @Rybolov work: https://github.com/rybolov/freqtrade-data
-
    :param pair: pair to download
    :param timeframe: Timeframe (e.g "5m")
    :param timerange: range of time to download
@@ -203,14 +209,17 @@ def _download_pair_history(pair: str, *,
            if data_handler.ohlcv_purge(pair, timeframe, candle_type=candle_type):
                logger.info(f'Deleting existing data for pair {pair}, {timeframe}, {candle_type}.')

-        logger.info(
-            f'Download history data for pair: "{pair}" ({process}), timeframe: {timeframe}, '
-            f'candle type: {candle_type} and store in {datadir}.'
-        )
+        data, since_ms, until_ms = _load_cached_data_for_updating(
+            pair, timeframe, timerange,
+            data_handler=data_handler,
+            candle_type=candle_type,
+            prepend=prepend)

-        data, since_ms = _load_cached_data_for_updating(pair, timeframe, timerange,
-                                                        data_handler=data_handler,
-                                                        candle_type=candle_type)
+        logger.info(f'({process}) - Download history data for "{pair}", {timeframe}, '
+                    f'{candle_type} and store in {datadir}.'
+                    f'From {format_ms_time(since_ms) if since_ms else "start"} to '
+                    f'{format_ms_time(until_ms) if until_ms else "now"}'
+                    )

        logger.debug("Current Start: %s",
                     f"{data.iloc[0]['date']:%Y-%m-%d %H:%M:%S}" if not data.empty else 'None')
@@ -225,6 +234,7 @@ def _download_pair_history(pair: str, *,
                                                   days=-new_pairs_days).int_timestamp * 1000,
                                               is_new_pair=data.empty,
                                               candle_type=candle_type,
+                                               until_ms=until_ms if until_ms else None
                                               )
        # TODO: Maybe move parsing to exchange class (?)
        new_dataframe = ohlcv_to_dataframe(new_data, timeframe, pair,
@@ -257,6 +267,7 @@ def refresh_backtest_ohlcv_data(exchange: Exchange, pairs: List[str], timeframes
                                timerange: Optional[TimeRange] = None,
                                new_pairs_days: int = 30, erase: bool = False,
                                data_format: str = None,
+                                prepend: bool = False,
                                ) -> List[str]:
    """
    Refresh stored ohlcv data for backtesting and hyperopt operations.
@@ -280,7 +291,7 @@ def refresh_backtest_ohlcv_data(exchange: Exchange, pairs: List[str], timeframes
                                   timerange=timerange, data_handler=data_handler,
                                   timeframe=str(timeframe), new_pairs_days=new_pairs_days,
                                   candle_type=candle_type,
-                                   erase=erase)
+                                   erase=erase, prepend=prepend)
        if trading_mode == 'futures':
            # Predefined candletype (and timeframe) depending on exchange
            # Downloads what is necessary to backtest based on futures data.
@@ -294,7 +305,7 @@ def refresh_backtest_ohlcv_data(exchange: Exchange, pairs: List[str], timeframes
                                       timerange=timerange, data_handler=data_handler,
                                       timeframe=str(tf_mark), new_pairs_days=new_pairs_days,
                                       candle_type=funding_candle_type,
-                                       erase=erase)
+                                       erase=erase, prepend=prepend)

    return pairs_not_available

@@ -312,8 +323,9 @@ def _download_trades_history(exchange: Exchange,
    try:

        until = None
-        if (timerange and timerange.starttype == 'date'):
-            since = timerange.startts * 1000
+        if timerange:
+            if timerange.starttype == 'date':
+                since = timerange.startts * 1000
            if timerange.stoptype == 'date':
                until = timerange.stopts * 1000
        else:
--- a/freqtrade/exchange/binance.py
+++ b/freqtrade/exchange/binance.py
@@ -95,6 +95,7 @@ class Binance(Exchange):
    async def _async_get_historic_ohlcv(self, pair: str, timeframe: str,
                                        since_ms: int, candle_type: CandleType,
                                        is_new_pair: bool = False, raise_: bool = False,
+                                        until_ms: int = None
                                        ) -> Tuple[str, str, str, List]:
        """
        Overwrite to introduce "fast new pair" functionality by detecting the pair's listing date
@@ -115,7 +116,8 @@ class Binance(Exchange):
            since_ms=since_ms,
            is_new_pair=is_new_pair,
            raise_=raise_,
-            candle_type=candle_type
+            candle_type=candle_type,
+            until_ms=until_ms,
        )

    def funding_fee_cutoff(self, open_date: datetime):
--- a/freqtrade/exchange/exchange.py
+++ b/freqtrade/exchange/exchange.py
@@ -1645,7 +1645,8 @@ class Exchange:

    def get_historic_ohlcv(self, pair: str, timeframe: str,
                           since_ms: int, candle_type: CandleType,
-                           is_new_pair: bool = False) -> List:
+                           is_new_pair: bool = False,
+                           until_ms: int = None) -> List:
        """
        Get candle history using asyncio and returns the list of candles.
        Handles all async work for this.
@@ -1653,13 +1654,14 @@ class Exchange:
        :param pair: Pair to download
        :param timeframe: Timeframe to get data for
        :param since_ms: Timestamp in milliseconds to get history from
+        :param until_ms: Timestamp in milliseconds to get history up to
        :param candle_type: '', mark, index, premiumIndex, or funding_rate
        :return: List with candle (OHLCV) data
        """
        pair, _, _, data = self.loop.run_until_complete(
            self._async_get_historic_ohlcv(pair=pair, timeframe=timeframe,
-                                           since_ms=since_ms, is_new_pair=is_new_pair,
-                                           candle_type=candle_type))
+                                           since_ms=since_ms, until_ms=until_ms,
+                                           is_new_pair=is_new_pair, candle_type=candle_type))
        logger.info(f"Downloaded data for {pair} with length {len(data)}.")
        return data

@@ -1680,6 +1682,7 @@ class Exchange:
    async def _async_get_historic_ohlcv(self, pair: str, timeframe: str,
                                        since_ms: int, candle_type: CandleType,
                                        is_new_pair: bool = False, raise_: bool = False,
+                                        until_ms: int = None
                                        ) -> Tuple[str, str, str, List]:
        """
        Download historic ohlcv
@@ -1695,7 +1698,7 @@ class Exchange:
        )
        input_coroutines = [self._async_get_candle_history(
            pair, timeframe, candle_type, since) for since in
-            range(since_ms, arrow.utcnow().int_timestamp * 1000, one_call)]
+            range(since_ms, until_ms or (arrow.utcnow().int_timestamp * 1000), one_call)]

        data: List = []
        # Chunk requests into batches of 100 to avoid overwelming ccxt Throttling
--- a/tests/data/test_history.py
+++ b/tests/data/test_history.py
@@ -149,8 +149,8 @@ def test_load_data_with_new_pair_1min(ohlcv_history_list, mocker, caplog,
    load_pair_history(datadir=tmpdir1, timeframe='1m', pair='MEME/BTC', candle_type=candle_type)
    assert file.is_file()
    assert log_has_re(
-        r'Download history data for pair: "MEME/BTC" \(0/1\), timeframe: 1m, '
-        r'candle type: spot and store in .*', caplog
+        r'\(0/1\) - Download history data for "MEME/BTC", 1m, '
+        r'spot and store in .*', caplog
    )


@@ -223,42 +223,65 @@ def test_load_cached_data_for_updating(mocker, testdatadir) -> None:
    # timeframe starts earlier than the cached data
    # should fully update data
    timerange = TimeRange('date', None, test_data[0][0] / 1000 - 1, 0)
-    data, start_ts = _load_cached_data_for_updating(
+    data, start_ts, end_ts = _load_cached_data_for_updating(
        'UNITTEST/BTC', '1m', timerange, data_handler, CandleType.SPOT)
    assert data.empty
    assert start_ts == test_data[0][0] - 1000
+    assert end_ts is None
+
+    # timeframe starts earlier than the cached data - prepending
+
+    timerange = TimeRange('date', None, test_data[0][0] / 1000 - 1, 0)
+    data, start_ts, end_ts = _load_cached_data_for_updating(
+        'UNITTEST/BTC', '1m', timerange, data_handler, CandleType.SPOT, True)
+    assert_frame_equal(data, test_data_df.iloc[:-1])
+    assert start_ts == test_data[0][0] - 1000
+    assert end_ts == test_data[0][0]

    # timeframe starts in the center of the cached data
    # should return the cached data w/o the last item
    timerange = TimeRange('date', None, test_data[0][0] / 1000 + 1, 0)
-    data, start_ts = _load_cached_data_for_updating(
+    data, start_ts, end_ts = _load_cached_data_for_updating(
        'UNITTEST/BTC', '1m', timerange, data_handler, CandleType.SPOT)

    assert_frame_equal(data, test_data_df.iloc[:-1])
    assert test_data[-2][0] <= start_ts < test_data[-1][0]
+    assert end_ts is None

    # timeframe starts after the cached data
    # should return the cached data w/o the last item
    timerange = TimeRange('date', None, test_data[-1][0] / 1000 + 100, 0)
-    data, start_ts = _load_cached_data_for_updating(
+    data, start_ts, end_ts = _load_cached_data_for_updating(
        'UNITTEST/BTC', '1m', timerange, data_handler, CandleType.SPOT)
    assert_frame_equal(data, test_data_df.iloc[:-1])
    assert test_data[-2][0] <= start_ts < test_data[-1][0]
+    assert end_ts is None

    # no datafile exist
    # should return timestamp start time
    timerange = TimeRange('date', None, now_ts - 10000, 0)
-    data, start_ts = _load_cached_data_for_updating(
+    data, start_ts, end_ts = _load_cached_data_for_updating(
        'NONEXIST/BTC', '1m', timerange, data_handler, CandleType.SPOT)
    assert data.empty
    assert start_ts == (now_ts - 10000) * 1000
+    assert end_ts is None
+
+    # no datafile exist
+    # should return timestamp start and end time
+    timerange = TimeRange('date', 'date', now_ts - 1000000, now_ts - 100000)
+    data, start_ts, end_ts = _load_cached_data_for_updating(
+        'NONEXIST/BTC', '1m', timerange, data_handler, CandleType.SPOT)
+    assert data.empty
+    assert start_ts == (now_ts - 1000000) * 1000
+    assert end_ts == (now_ts - 100000) * 1000

    # no datafile exist, no timeframe is set
    # should return an empty array and None
-    data, start_ts = _load_cached_data_for_updating(
+    data, start_ts, end_ts = _load_cached_data_for_updating(
        'NONEXIST/BTC', '1m', None, data_handler, CandleType.SPOT)
    assert data.empty
    assert start_ts is None
+    assert end_ts is None


@pytest.mark.parametrize('candle_type,subdir,file_tail', [
--- a/tests/exchange/test_exchange.py
+++ b/tests/exchange/test_exchange.py
@@ -1983,6 +1983,20 @@ async def test__async_get_historic_ohlcv(default_conf, mocker, caplog, exchange_
    assert exchange._api_async.fetch_ohlcv.call_count > 200
    assert res[0] == ohlcv[0]

+    exchange._api_async.fetch_ohlcv.reset_mock()
+    end_ts = 1_500_500_000_000
+    start_ts = 1_500_000_000_000
+    respair, restf, _, res = await exchange._async_get_historic_ohlcv(
+        pair, "5m", since_ms=start_ts, candle_type=candle_type, is_new_pair=False,
+        until_ms=end_ts
+        )
+    # Required candles
+    candles = (end_ts - start_ts) / 300_000
+    exp = candles // exchange.ohlcv_candle_limit('5m') + 1
+
+    # Depending on the exchange, this should be called between 1 and 6 times.
+    assert exchange._api_async.fetch_ohlcv.call_count == exp
+

@pytest.mark.parametrize('candle_type', [CandleType.FUTURES, CandleType.MARK, CandleType.SPOT])
 def test_refresh_latest_ohlcv(mocker, default_conf, caplog, candle_type) -> None: