stable/freqtrade/data/history/history_utils.py

425 lines
18 KiB
Python
Raw Normal View History

import logging
import operator
from datetime import datetime, timezone
2018-12-15 12:54:35 +00:00
from pathlib import Path
2019-12-27 09:25:17 +00:00
from typing import Dict, List, Optional, Tuple
import arrow
2018-12-15 13:28:37 +00:00
from pandas import DataFrame
from freqtrade.configuration import TimeRange
from freqtrade.constants import DEFAULT_DATAFRAME_COLUMNS
2020-09-28 17:39:41 +00:00
from freqtrade.data.converter import (clean_ohlcv_dataframe, ohlcv_to_dataframe,
trades_remove_duplicates, trades_to_ohlcv)
from freqtrade.data.history.idatahandler import IDataHandler, get_datahandler
from freqtrade.exceptions import OperationalException
2019-12-27 09:25:17 +00:00
from freqtrade.exchange import Exchange
2020-04-01 18:31:21 +00:00
from freqtrade.misc import format_ms_time
2020-09-28 17:39:41 +00:00
logger = logging.getLogger(__name__)
def load_pair_history(pair: str,
                      timeframe: str,
                      datadir: Path, *,
                      timerange: Optional[TimeRange] = None,
                      fill_up_missing: bool = True,
                      drop_incomplete: bool = True,
                      startup_candles: int = 0,
                      data_format: str = None,
                      data_handler: IDataHandler = None,
                      ) -> DataFrame:
    """
    Load the stored ohlcv history of a single pair through a data handler.

    The handler is resolved via get_datahandler() from datadir, data_format and
    data_handler; all remaining arguments are forwarded to its ohlcv_load().

    :param pair: Pair to load data for
    :param timeframe: Timeframe (e.g. "5m")
    :param datadir: Path to the data storage location.
    :param timerange: Limit data to be loaded to this timerange
    :param fill_up_missing: Forwarded to the handler as `fill_missing`
    :param drop_incomplete: Forwarded to the handler as `drop_incomplete`
    :param startup_candles: Additional candles to load at the start of the period
    :param data_format: Format of the data, handed to get_datahandler() together
                        with data_handler.
    :param data_handler: Data handler to use, handed to get_datahandler().
    :return: DataFrame returned by the handler's ohlcv_load()
    """
    handler = get_datahandler(datadir, data_format, data_handler)
    load_options = dict(
        pair=pair,
        timeframe=timeframe,
        timerange=timerange,
        fill_missing=fill_up_missing,
        drop_incomplete=drop_incomplete,
        startup_candles=startup_candles,
    )
    return handler.ohlcv_load(**load_options)
def load_data(datadir: Path,
              timeframe: str,
              pairs: List[str], *,
              timerange: Optional[TimeRange] = None,
              fill_up_missing: bool = True,
              startup_candles: int = 0,
              fail_without_data: bool = False,
              data_format: str = 'json',
              ) -> Dict[str, DataFrame]:
    """
    Load ohlcv history for several pairs, keeping only pairs that have data.

    :param datadir: Path to the data storage location.
    :param timeframe: Timeframe (e.g. "5m")
    :param pairs: List of pairs to load
    :param timerange: Limit data to be loaded to this timerange
    :param fill_up_missing: Passed on to load_pair_history()
    :param startup_candles: Additional candles to load at the start of the period
    :param fail_without_data: Raise OperationalException if no pair yielded data.
    :param data_format: Data format which should be used. Defaults to json
    :return: dict(<pair>:<Dataframe>) containing only non-empty dataframes
    :raises OperationalException: if fail_without_data is set and nothing was loaded
    """
    if startup_candles > 0 and timerange:
        logger.info(f'Using indicator startup period: {startup_candles} ...')

    # One handler instance is shared by all pairs.
    handler = get_datahandler(datadir, data_format)

    loaded: Dict[str, DataFrame] = {}
    for pair in pairs:
        candles = load_pair_history(
            pair=pair,
            timeframe=timeframe,
            datadir=datadir,
            timerange=timerange,
            fill_up_missing=fill_up_missing,
            startup_candles=startup_candles,
            data_handler=handler,
        )
        if candles.empty:
            # Pairs without data are left out of the result.
            continue
        loaded[pair] = candles

    if fail_without_data and not loaded:
        raise OperationalException("No data found. Terminating.")
    return loaded
def refresh_data(datadir: Path,
                 timeframe: str,
                 pairs: List[str],
                 exchange: Exchange,
                 data_format: Optional[str] = None,
                 timerange: Optional[TimeRange] = None,
                 ) -> None:
    """
    Refresh ohlcv history data for a list of pairs.

    Each pair is handed to _download_pair_history(); its boolean result is not
    inspected here.

    :param datadir: Path to the data storage location.
    :param timeframe: Timeframe (e.g. "5m")
    :param pairs: List of pairs to load
    :param exchange: Exchange object
    :param data_format: dataformat to use
    :param timerange: Limit data to be loaded to this timerange
    """
    data_handler = get_datahandler(datadir, data_format)
    total = len(pairs)
    # Count from 1 so the progress shown in the log reads "1/N" .. "N/N",
    # consistent with refresh_backtest_ohlcv_data().
    for idx, pair in enumerate(pairs, start=1):
        _download_pair_history(pair=pair, process=f'{idx}/{total}',
                               timeframe=timeframe, datadir=datadir,
                               timerange=timerange, exchange=exchange,
                               data_handler=data_handler)
2019-12-27 09:12:56 +00:00
def _load_cached_data_for_updating(pair: str, timeframe: str, timerange: Optional[TimeRange],
                                   data_handler: IDataHandler) -> Tuple[DataFrame, Optional[int]]:
    """
    Load the stored candles and work out where a download has to resume.

    If timerange carries a start date which lies before the first stored candle,
    the stored data is discarded (an empty DataFrame is returned) so everything
    gets downloaded again from the requested start.
    Otherwise the download resumes at the last stored candle to avoid data gaps.

    Note: Only used by download_pair_history().

    :return: Tuple of (stored data, resume timestamp in ms or None)
    """
    requested_start = None
    if timerange and timerange.starttype == 'date':
        requested_start = datetime.fromtimestamp(timerange.startts, tz=timezone.utc)

    # The timerange is deliberately not applied here - the full dataset is needed.
    cached = data_handler.ohlcv_load(
        pair,
        timeframe=timeframe,
        timerange=None,
        fill_missing=False,
        drop_incomplete=True,
        warn_no_data=False,
    )

    resume_from = requested_start
    if not cached.empty:
        if requested_start and requested_start < cached.iloc[0]['date']:
            # Data older than what is stored was requested: start over.
            cached = DataFrame(columns=DEFAULT_DATAFRAME_COLUMNS)
        else:
            resume_from = cached.iloc[-1]['date']

    if resume_from:
        return cached, int(resume_from.timestamp() * 1000)
    return cached, None
def _download_pair_history(pair: str, *,
                           datadir: Path,
                           exchange: Exchange,
                           timeframe: str = '5m',
                           process: str = '',
                           new_pairs_days: int = 30,
                           data_handler: IDataHandler = None,
                           timerange: Optional[TimeRange] = None,
                           candle_type: Optional[str] = "") -> bool:
    """
    Download latest candles from the exchange for the pair and timeframe passed in parameters
    The data is downloaded starting from the last correct data that
    exists in a cache. If timerange starts earlier than the data in the cache,
    the full data will be redownloaded

    Based on @Rybolov work: https://github.com/rybolov/freqtrade-data

    :param pair: pair to download
    :param datadir: Path to the data storage location.
    :param exchange: Exchange object used to fetch the candles
    :param timeframe: Timeframe (e.g "5m")
    :param process: Progress string (e.g. "3/10"), only used for log output
    :param new_pairs_days: Number of days to go back when no start point is known
    :param data_handler: Data handler to use, resolved through get_datahandler()
    :param timerange: range of time to download
    :param candle_type: Forwarded to the exchange download and to ohlcv_store()
    :return: bool with success state (any exception is logged and yields False)
    """
    # Local import: DataFrame.append() was deprecated in pandas 1.4 and removed in 2.0,
    # concat() is the supported replacement and works on older versions as well.
    from pandas import concat

    data_handler = get_datahandler(datadir, data_handler=data_handler)

    try:
        logger.info(
            f'Download history data for pair: "{pair}" ({process}), timeframe: {timeframe} '
            f'and store in {datadir}.'
        )

        data, since_ms = _load_cached_data_for_updating(pair, timeframe, timerange,
                                                        data_handler=data_handler)

        logger.debug("Current Start: %s",
                     f"{data.iloc[0]['date']:%Y-%m-%d %H:%M:%S}" if not data.empty else 'None')
        logger.debug("Current End: %s",
                     f"{data.iloc[-1]['date']:%Y-%m-%d %H:%M:%S}" if not data.empty else 'None')

        if not since_ms:
            # No start point known - default to new_pairs_days before now.
            since_ms = arrow.utcnow().shift(days=-new_pairs_days).int_timestamp * 1000

        new_data = exchange.get_historic_ohlcv(pair=pair,
                                               timeframe=timeframe,
                                               since_ms=since_ms,
                                               is_new_pair=data.empty,
                                               candle_type=candle_type,
                                               )
        # TODO: Maybe move parsing to exchange class (?)
        new_dataframe = ohlcv_to_dataframe(new_data, timeframe, pair,
                                           fill_missing=False, drop_incomplete=True)
        if data.empty:
            data = new_dataframe
        else:
            # Run cleaning again to ensure there were no duplicate candles
            # Especially between existing and new data.
            data = clean_ohlcv_dataframe(concat([data, new_dataframe], axis=0), timeframe, pair,
                                         fill_missing=False, drop_incomplete=False)

        logger.debug("New Start: %s",
                     f"{data.iloc[0]['date']:%Y-%m-%d %H:%M:%S}" if not data.empty else 'None')
        logger.debug("New End: %s",
                     f"{data.iloc[-1]['date']:%Y-%m-%d %H:%M:%S}" if not data.empty else 'None')

        data_handler.ohlcv_store(pair, timeframe, data=data, candle_type=candle_type)
        return True

    except Exception:
        # Deliberate best-effort: a failing pair must not abort the caller's loop.
        logger.exception(
            f'Failed to download history data for pair: "{pair}", timeframe: {timeframe}.'
        )
        return False
def refresh_backtest_ohlcv_data(exchange: Exchange, pairs: List[str], timeframes: List[str],
                                datadir: Path, timerange: Optional[TimeRange] = None,
                                new_pairs_days: int = 30, erase: bool = False,
                                data_format: str = None,
                                candle_type: Optional[str] = "") -> List[str]:
    """
    Download (or update) stored ohlcv data for every pair / timeframe combination.
    Used by freqtrade download-data subcommand.

    Pairs which are not part of exchange.markets are skipped and reported back.
    With erase set, stored data of a pair / timeframe is purged before downloading.

    :return: List of pairs that are not available.
    """
    handler = get_datahandler(datadir, data_format)
    unavailable: List[str] = []
    total = len(pairs)

    for position, pair in enumerate(pairs, start=1):
        if pair not in exchange.markets:
            unavailable.append(pair)
            logger.info(f"Skipping pair {pair}...")
            continue

        progress = f'{position}/{total}'
        for timeframe in timeframes:
            # The purge is only attempted when erase is requested.
            if erase and handler.ohlcv_purge(pair, timeframe, candle_type=candle_type):
                logger.info(
                    f'Deleting existing data for pair {pair}, interval {timeframe}.')

            logger.info(f'Downloading pair {pair}, interval {timeframe}.')
            _download_pair_history(
                pair=pair,
                process=progress,
                datadir=datadir,
                exchange=exchange,
                timerange=timerange,
                data_handler=handler,
                timeframe=str(timeframe),
                new_pairs_days=new_pairs_days,
                candle_type=candle_type,
            )
    return unavailable
2019-12-25 15:34:27 +00:00
def _download_trades_history(exchange: Exchange,
                             pair: str, *,
                             new_pairs_days: int = 30,
                             timerange: Optional[TimeRange] = None,
                             data_handler: IDataHandler
                             ) -> bool:
    """
    Download trade history from the exchange.
    Appends to previously downloaded trades data.

    :param exchange: Exchange object used to fetch the trades
    :param pair: Pair to download trades for
    :param new_pairs_days: Number of days to go back when timerange has no start date
    :param timerange: Optional range; start / stop are only used when of type 'date'
    :param data_handler: Data handler used to load and store the trades
    :return: bool with success state (any exception is logged and yields False)
    """
    try:
        until = None
        if timerange and timerange.starttype == 'date':
            # NOTE(review): "* 1000" presumably converts seconds to milliseconds.
            since = timerange.startts * 1000
            if timerange.stoptype == 'date':
                until = timerange.stopts * 1000
        else:
            since = arrow.utcnow().shift(days=-new_pairs_days).int_timestamp * 1000

        trades = data_handler.trades_load(pair)

        # TradesList columns are defined in constants.DEFAULT_TRADES_COLUMNS
        # DEFAULT_TRADES_COLUMNS: 0 -> timestamp
        # DEFAULT_TRADES_COLUMNS: 1 -> id

        if trades and since < trades[0][0]:
            # since is before the first trade
            logger.info(f"Start earlier than available data. Redownloading trades for {pair}...")
            trades = []

        from_id = trades[-1][1] if trades else None
        if trades and since < trades[-1][0]:
            # Reset since to the last available point
            # - 5 seconds (to ensure we're getting all trades)
            since = trades[-1][0] - (5 * 1000)
            logger.info(f"Using last trade date -5s - Downloading trades for {pair} "
                        f"since: {format_ms_time(since)}.")

        logger.debug(f"Current Start: {format_ms_time(trades[0][0]) if trades else 'None'}")
        logger.debug(f"Current End: {format_ms_time(trades[-1][0]) if trades else 'None'}")
        logger.info(f"Current Amount of trades: {len(trades)}")

        new_trades = exchange.get_historic_trades(pair=pair,
                                                  since=since,
                                                  until=until,
                                                  from_id=from_id,
                                                  )
        # Only element 1 of the result is consumed here
        # (presumably a (pair, trades) tuple - verify against the exchange class).
        trades.extend(new_trades[1])
        # Remove duplicates to make sure we're not storing data we don't need
        trades = trades_remove_duplicates(trades)
        data_handler.trades_store(pair, data=trades)

        # The list may still be empty (nothing stored, nothing downloaded); guard the
        # index access so a successful run is not reported as a failure.
        logger.debug(f"New Start: {format_ms_time(trades[0][0]) if trades else 'None'}")
        logger.debug(f"New End: {format_ms_time(trades[-1][0]) if trades else 'None'}")
        logger.info(f"New Amount of trades: {len(trades)}")
        return True

    except Exception:
        # Deliberate best-effort: a failing pair must not abort the caller's loop.
        logger.exception(
            f'Failed to download historic trades for pair: "{pair}". '
        )
        return False
def refresh_backtest_trades_data(exchange: Exchange, pairs: List[str], datadir: Path,
                                 timerange: TimeRange, new_pairs_days: int = 30,
                                 erase: bool = False, data_format: str = 'jsongz') -> List[str]:
    """
    Download (or update) stored trades data for every given pair.
    Used by freqtrade download-data subcommand.

    Pairs which are not part of exchange.markets are skipped and reported back.
    With erase set, stored trades of a pair are purged before downloading.

    :return: List of pairs that are not available.
    """
    handler = get_datahandler(datadir, data_format=data_format)
    unavailable: List[str] = []

    for pair in pairs:
        if pair not in exchange.markets:
            unavailable.append(pair)
            logger.info(f"Skipping pair {pair}...")
            continue

        # The purge is only attempted when erase is requested.
        if erase and handler.trades_purge(pair):
            logger.info(f'Deleting existing data for pair {pair}.')

        logger.info(f'Downloading trades for pair {pair}.')
        _download_trades_history(
            exchange=exchange,
            pair=pair,
            new_pairs_days=new_pairs_days,
            timerange=timerange,
            data_handler=handler,
        )
    return unavailable
def convert_trades_to_ohlcv(
    pairs: List[str],
    timeframes: List[str],
    datadir: Path,
    timerange: TimeRange,
    erase: bool = False,
    data_format_ohlcv: str = 'json',
    data_format_trades: str = 'jsongz',
    candle_type: Optional[str] = ""
) -> None:
    """
    Build ohlcv data from stored trades and store it, for every pair / timeframe.

    Trades are read with the handler for data_format_trades, the resulting candles
    are written with the handler for data_format_ohlcv. With erase set, stored ohlcv
    data of a pair / timeframe is purged first. A ValueError raised while converting
    or storing is logged and the next timeframe is processed.

    Note: timerange is accepted but not used by this function.
    """
    trades_handler = get_datahandler(datadir, data_format=data_format_trades)
    ohlcv_handler = get_datahandler(datadir, data_format=data_format_ohlcv)

    for pair in pairs:
        # Trades are loaded once per pair and reused for all timeframes.
        pair_trades = trades_handler.trades_load(pair)
        for timeframe in timeframes:
            if erase and ohlcv_handler.ohlcv_purge(pair, timeframe, candle_type=candle_type):
                logger.info(f'Deleting existing data for pair {pair}, interval {timeframe}.')
            try:
                candles = trades_to_ohlcv(pair_trades, timeframe)
                ohlcv_handler.ohlcv_store(pair, timeframe, data=candles,
                                          candle_type=candle_type)
            except ValueError:
                logger.exception(f'Could not convert {pair} to OHLCV.')
def get_timerange(data: Dict[str, DataFrame]) -> Tuple[datetime, datetime]:
    """
    Determine the overall date span covered by the given backtest data.

    :param data: dictionary with preprocessed backtesting data; each dataframe
                 needs a 'date' column
    :return: tuple containing min_date, max_date - the earliest first date and the
             latest last date found over all dataframes
    """
    first_dates = []
    last_dates = []
    for frame in data.values():
        dates = frame['date']
        first_dates.append(dates.min().to_pydatetime())
        last_dates.append(dates.max().to_pydatetime())
    return min(first_dates), max(last_dates)
def validate_backtest_data(data: DataFrame, pair: str, min_date: datetime,
                           max_date: datetime, timeframe_min: int) -> bool:
    """
    Check preprocessed backtesting data for missing candles and warn about them.

    :param data: preprocessed backtesting data (as DataFrame)
    :param pair: pair used for log output.
    :param min_date: start-date of the data
    :param max_date: end-date of the data
    :param timeframe_min: Timeframe in minutes
    :return: True if fewer rows than expected were found, False otherwise
    """
    # Whole minutes between both dates, divided by the timeframe length.
    span_minutes = (max_date - min_date).total_seconds() // 60
    expected_frames = int(span_minutes // timeframe_min)
    dflen = len(data)

    if dflen >= expected_frames:
        return False

    logger.warning("%s has missing frames: expected %s, got %s, that's %s missing values",
                   pair, expected_frames, dflen, expected_frames - dflen)
    return True