Merge pull request #1936 from freqtrade/fix/validate_dataframe

Properly warn if data is incomplete
Matthias 2019-06-24 06:50:48 +02:00 committed by GitHub
commit 1f8dc7f845
15 changed files with 74 additions and 59 deletions

View File

@@ -112,7 +112,7 @@ pair = "XLM/BTC" # Make sure to use a pair that exists on that exchange!
raw = ct.fetch_ohlcv(pair, timeframe=timeframe)
# convert to dataframe
df1 = parse_ticker_dataframe(raw, timeframe, drop_incomplete=False)
df1 = parse_ticker_dataframe(raw, timeframe, pair=pair, drop_incomplete=False)
print(df1["date"].tail(1))
print(datetime.utcnow())

View File

@@ -10,13 +10,14 @@ from pandas import DataFrame, to_datetime
logger = logging.getLogger(__name__)
def parse_ticker_dataframe(ticker: list, ticker_interval: str, *,
def parse_ticker_dataframe(ticker: list, ticker_interval: str, pair: str, *,
fill_missing: bool = True,
drop_incomplete: bool = True) -> DataFrame:
"""
Converts a ticker-list (format ccxt.fetch_ohlcv) to a Dataframe
:param ticker: ticker list, as returned by exchange.async_get_candle_history
:param ticker_interval: ticker_interval (e.g. 5m). Used to fill up any missing data
:param pair: Pair this data is for (used to warn if fillup was necessary)
:param fill_missing: fill up missing candles with empty (0-volume) candles
(see ohlcv_fill_up_missing_data for details)
:param drop_incomplete: Drop the last candle of the dataframe, assuming it's incomplete
@@ -51,12 +52,12 @@ def parse_ticker_dataframe(ticker: list, ticker_interval: str, *,
logger.debug('Dropping last candle')
if fill_missing:
return ohlcv_fill_up_missing_data(frame, ticker_interval)
return ohlcv_fill_up_missing_data(frame, ticker_interval, pair)
else:
return frame
def ohlcv_fill_up_missing_data(dataframe: DataFrame, ticker_interval: str) -> DataFrame:
def ohlcv_fill_up_missing_data(dataframe: DataFrame, ticker_interval: str, pair: str) -> DataFrame:
"""
Fills up missing data with 0-volume rows,
using the previous close as price for "open", "high", "low" and "close"; volume is set to 0
@@ -84,7 +85,10 @@ def ohlcv_fill_up_missing_data(dataframe: DataFrame, ticker_interval: str) -> Da
'low': df['close'],
})
df.reset_index(inplace=True)
logger.debug(f"Missing data fillup: before: {len(dataframe)} - after: {len(df)}")
len_before = len(dataframe)
len_after = len(df)
if len_before != len_after:
logger.info(f"Missing data fillup for {pair}: before: {len_before} - after: {len_after}")
return df
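
For reference, the snippet below (illustrative only, not part of the changeset) shows how the reworked converter is called once the pair keyword exists; the candle values and the XLM/BTC pair are made up, and the module path freqtrade.data.converter is assumed from this codebase.

from freqtrade.data.converter import parse_ticker_dataframe

# Three hand-made 5m OHLCV rows in ccxt's [timestamp_ms, open, high, low, close, volume]
# layout; the candle at 13:30 is deliberately missing so fill-up has something to do.
ticks = [
    [1560000000000, 0.010, 0.011, 0.009, 0.010, 12.3],   # 2019-06-08 13:20 UTC
    [1560000300000, 0.010, 0.012, 0.010, 0.011, 8.1],    # 2019-06-08 13:25 UTC
    [1560000900000, 0.011, 0.011, 0.010, 0.010, 4.2],    # 2019-06-08 13:35 UTC
]

df = parse_ticker_dataframe(ticks, "5m", pair="XLM/BTC",
                            fill_missing=True, drop_incomplete=False)
# With this change the fill-up is reported at INFO level and names the pair, e.g.
# "Missing data fillup for XLM/BTC: before: 3 - after: 4"
print(len(df))  # 4 rows: the gap at 13:30 was filled with a 0-volume candle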

View File

@@ -116,7 +116,7 @@ def load_pair_history(pair: str,
logger.warning('Missing data at end for pair %s, data ends at %s',
pair,
arrow.get(pairdata[-1][0] // 1000).strftime('%Y-%m-%d %H:%M:%S'))
return parse_ticker_dataframe(pairdata, ticker_interval,
return parse_ticker_dataframe(pairdata, ticker_interval, pair=pair,
fill_missing=fill_up_missing,
drop_incomplete=drop_incomplete)
else:
@@ -286,12 +286,13 @@ def get_timeframe(data: Dict[str, DataFrame]) -> Tuple[arrow.Arrow, arrow.Arrow]
max(timeframe, key=operator.itemgetter(1))[1]
def validate_backtest_data(data: Dict[str, DataFrame], min_date: datetime,
def validate_backtest_data(data: DataFrame, pair: str, min_date: datetime,
max_date: datetime, ticker_interval_mins: int) -> bool:
"""
Validates preprocessed backtesting data for missing values and shows warnings about it.
:param data: dictionary with preprocessed backtesting data
:param data: preprocessed backtesting data (as DataFrame)
:param pair: pair used for log output.
:param min_date: start-date of the data
:param max_date: end-date of the data
:param ticker_interval_mins: ticker interval in minutes
@@ -299,8 +300,7 @@ def validate_backtest_data(data: Dict[str, DataFrame], min_date: datetime,
# total difference in minutes / interval-minutes
expected_frames = int((max_date - min_date).total_seconds() // 60 // ticker_interval_mins)
found_missing = False
for pair, df in data.items():
dflen = len(df)
dflen = len(data)
if dflen < expected_frames:
found_missing = True
logger.warning("%s has missing frames: expected %s, got %s, that's %s missing values",
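
Since validate_backtest_data no longer iterates over the dict itself, callers now pass a single DataFrame plus its pair name. A schematic caller-side loop under that assumption is shown below; warn_about_gaps is a made-up helper name, while the imports mirror those used elsewhere in this PR.

from freqtrade.data import history
from freqtrade.exchange import timeframe_to_minutes


def warn_about_gaps(data, ticker_interval="5m"):
    """Report pairs with missing candles; data is a pair -> DataFrame dict
    such as the one returned by history.load_data()."""
    min_date, max_date = history.get_timeframe(data)
    for pair, frame in data.items():
        # The reworked function takes one DataFrame plus its pair name and
        # returns True when frames are missing (it also logs a warning itself).
        if history.validate_backtest_data(frame, pair, min_date, max_date,
                                          timeframe_to_minutes(ticker_interval)):
            print(f"{pair} has gaps between {min_date} and {max_date}")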

View File

@@ -581,7 +581,7 @@ class Exchange(object):
self._pairs_last_refresh_time[(pair, ticker_interval)] = ticks[-1][0] // 1000
# keeping parsed dataframe in cache
self._klines[(pair, ticker_interval)] = parse_ticker_dataframe(
ticks, ticker_interval, fill_missing=True,
ticks, ticker_interval, pair=pair, fill_missing=True,
drop_incomplete=self._ohlcv_partial_candle)
return tickers

View File

@@ -349,7 +349,7 @@ class Backtesting(object):
row = ticker[pair][indexes[pair]]
except IndexError:
# missing Data for one pair at the end.
# Warnings for this are shown by `validate_backtest_data`
# Warnings for this are shown during data loading
continue
# Waits until the time-counter reaches the start of the data for this pair.
@@ -420,20 +420,19 @@ class Backtesting(object):
max_open_trades = 0
all_results = {}
for strat in self.strategylist:
logger.info("Running backtesting for Strategy %s", strat.get_strategy_name())
self._set_strategy(strat)
min_date, max_date = history.get_timeframe(data)
# Validate dataframe for missing values (mainly at start and end, as fillup is called)
history.validate_backtest_data(data, min_date, max_date,
timeframe_to_minutes(self.ticker_interval))
logger.info(
'Backtesting with data from %s up to %s (%s days)..',
min_date.isoformat(),
max_date.isoformat(),
(max_date - min_date).days
)
for strat in self.strategylist:
logger.info("Running backtesting for Strategy %s", strat.get_strategy_name())
self._set_strategy(strat)
# need to reprocess data every time to populate signals
preprocessed = self.strategy.tickerdata_to_dataframe(data)
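
Put back together, the reordered start() reads roughly like the sketch below; this is a simplified paraphrase of the hunk above, not the actual method, and run_all_strategies plus the backtesting argument (standing in for self) are made up for illustration.

import logging
from freqtrade.data import history

logger = logging.getLogger(__name__)


def run_all_strategies(backtesting, data):
    # The timeframe is computed and logged once, before the strategy loop; the
    # explicit validate_backtest_data() call is gone because gaps are now
    # reported while the data is loaded and filled up.
    min_date, max_date = history.get_timeframe(data)
    logger.info('Backtesting with data from %s up to %s (%s days)..',
                min_date.isoformat(), max_date.isoformat(),
                (max_date - min_date).days)
    for strat in backtesting.strategylist:
        logger.info("Running backtesting for Strategy %s", strat.get_strategy_name())
        backtesting._set_strategy(strat)
        # signals must be repopulated for every strategy
        preprocessed = backtesting.strategy.tickerdata_to_dataframe(data)
        ...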

View File

@@ -19,8 +19,7 @@ from skopt import Optimizer
from skopt.space import Dimension
from freqtrade.arguments import Arguments
from freqtrade.data.history import load_data, get_timeframe, validate_backtest_data
from freqtrade.exchange import timeframe_to_minutes
from freqtrade.data.history import load_data, get_timeframe
from freqtrade.optimize.backtesting import Backtesting
from freqtrade.resolvers.hyperopt_resolver import HyperOptResolver
@@ -281,9 +280,7 @@ class Hyperopt(Backtesting):
return
min_date, max_date = get_timeframe(data)
# Validate dataframe for missing values (mainly at start and end, as fillup is called)
validate_backtest_data(data, min_date, max_date,
timeframe_to_minutes(self.ticker_interval))
logger.info(
'Hyperopting with data from %s up to %s (%s days)..',
min_date.isoformat(),

View File

@@ -674,7 +674,7 @@ def ticker_history_list():
@pytest.fixture
def ticker_history(ticker_history_list):
return parse_ticker_dataframe(ticker_history_list, "5m", fill_missing=True)
return parse_ticker_dataframe(ticker_history_list, "5m", pair="UNITTEST/BTC", fill_missing=True)
@pytest.fixture
@@ -879,7 +879,8 @@ def tickers():
@pytest.fixture
def result():
with Path('freqtrade/tests/testdata/UNITTEST_BTC-1m.json').open('r') as data_file:
return parse_ticker_dataframe(json.load(data_file), '1m', fill_missing=True)
return parse_ticker_dataframe(json.load(data_file), '1m', pair="UNITTEST/BTC",
fill_missing=True)
# FIX:
# Create a fixture/function

View File

@@ -15,7 +15,8 @@ def test_parse_ticker_dataframe(ticker_history_list, caplog):
caplog.set_level(logging.DEBUG)
# Test file with BV data
dataframe = parse_ticker_dataframe(ticker_history_list, '5m', fill_missing=True)
dataframe = parse_ticker_dataframe(ticker_history_list, '5m',
pair="UNITTEST/BTC", fill_missing=True)
assert dataframe.columns.tolist() == columns
assert log_has('Parsing tickerlist to dataframe', caplog.record_tuples)
@@ -27,18 +28,19 @@ def test_ohlcv_fill_up_missing_data(caplog):
pair='UNITTEST/BTC',
fill_up_missing=False)
caplog.set_level(logging.DEBUG)
data2 = ohlcv_fill_up_missing_data(data, '1m')
data2 = ohlcv_fill_up_missing_data(data, '1m', 'UNITTEST/BTC')
assert len(data2) > len(data)
# Column names should not change
assert (data.columns == data2.columns).all()
assert log_has(f"Missing data fillup: before: {len(data)} - after: {len(data2)}",
assert log_has(f"Missing data fillup for UNITTEST/BTC: before: "
f"{len(data)} - after: {len(data2)}",
caplog.record_tuples)
# Test fillup actually fixes invalid backtest data
min_date, max_date = get_timeframe({'UNITTEST/BTC': data})
assert validate_backtest_data({'UNITTEST/BTC': data}, min_date, max_date, 1)
assert not validate_backtest_data({'UNITTEST/BTC': data2}, min_date, max_date, 1)
assert validate_backtest_data(data, 'UNITTEST/BTC', min_date, max_date, 1)
assert not validate_backtest_data(data2, 'UNITTEST/BTC', min_date, max_date, 1)
def test_ohlcv_fill_up_missing_data2(caplog):
@@ -78,10 +80,10 @@ def test_ohlcv_fill_up_missing_data2(caplog):
]
# Generate test-data without filling missing
data = parse_ticker_dataframe(ticks, ticker_interval, fill_missing=False)
data = parse_ticker_dataframe(ticks, ticker_interval, pair="UNITTEST/BTC", fill_missing=False)
assert len(data) == 3
caplog.set_level(logging.DEBUG)
data2 = ohlcv_fill_up_missing_data(data, ticker_interval)
data2 = ohlcv_fill_up_missing_data(data, ticker_interval, "UNITTEST/BTC")
assert len(data2) == 4
# 3rd candle has been filled
row = data2.loc[2, :]
@@ -94,7 +96,8 @@ def test_ohlcv_fill_up_missing_data2(caplog):
# Column names should not change
assert (data.columns == data2.columns).all()
assert log_has(f"Missing data fillup: before: {len(data)} - after: {len(data2)}",
assert log_has(f"Missing data fillup for UNITTEST/BTC: before: "
f"{len(data)} - after: {len(data2)}",
caplog.record_tuples)
@@ -134,12 +137,14 @@ def test_ohlcv_drop_incomplete(caplog):
]
]
caplog.set_level(logging.DEBUG)
data = parse_ticker_dataframe(ticks, ticker_interval, fill_missing=False, drop_incomplete=False)
data = parse_ticker_dataframe(ticks, ticker_interval, pair="UNITTEST/BTC",
fill_missing=False, drop_incomplete=False)
assert len(data) == 4
assert not log_has("Dropping last candle", caplog.record_tuples)
# Drop last candle
data = parse_ticker_dataframe(ticks, ticker_interval, fill_missing=False, drop_incomplete=True)
data = parse_ticker_dataframe(ticks, ticker_interval, pair="UNITTEST/BTC",
fill_missing=False, drop_incomplete=True)
assert len(data) == 3
assert log_has("Dropping last candle", caplog.record_tuples)

View File

@@ -555,8 +555,8 @@ def test_validate_backtest_data_warn(default_conf, mocker, caplog) -> None:
)
min_date, max_date = history.get_timeframe(data)
caplog.clear()
assert history.validate_backtest_data(data, min_date, max_date,
timeframe_to_minutes('1m'))
assert history.validate_backtest_data(data['UNITTEST/BTC'], 'UNITTEST/BTC',
min_date, max_date, timeframe_to_minutes('1m'))
assert len(caplog.record_tuples) == 1
assert log_has(
"UNITTEST/BTC has missing frames: expected 14396, got 13680, that's 716 missing values",
@@ -579,6 +579,6 @@ def test_validate_backtest_data(default_conf, mocker, caplog) -> None:
min_date, max_date = history.get_timeframe(data)
caplog.clear()
assert not history.validate_backtest_data(data, min_date, max_date,
timeframe_to_minutes('5m'))
assert not history.validate_backtest_data(data['UNITTEST/BTC'], 'UNITTEST/BTC',
min_date, max_date, timeframe_to_minutes('5m'))
assert len(caplog.record_tuples) == 0

View File

@@ -263,7 +263,7 @@ def mocked_load_data(datadir, pairs=[], ticker_interval='0m', refresh_pairs=Fals
hz = 0.1
base = 0.001
ETHBTC = [
NEOBTC = [
[
ticker_start_time.shift(minutes=(x * ticker_interval_in_minute)).timestamp * 1000,
math.sin(x * hz) / 1000 + base,
@@ -285,8 +285,8 @@ def mocked_load_data(datadir, pairs=[], ticker_interval='0m', refresh_pairs=Fals
123.45
] for x in range(0, 500)]
pairdata = {'NEO/BTC': parse_ticker_dataframe(ETHBTC, '1h', fill_missing=True),
'LTC/BTC': parse_ticker_dataframe(LTCBTC, '1h', fill_missing=True)}
pairdata = {'NEO/BTC': parse_ticker_dataframe(NEOBTC, '1h', pair="NEO/BTC", fill_missing=True),
'LTC/BTC': parse_ticker_dataframe(LTCBTC, '1h', pair="LTC/BTC", fill_missing=True)}
return pairdata

View File

@@ -73,7 +73,8 @@ def load_data_test(what):
pair[x][5] # Keep old volume
] for x in range(0, datalen)
]
return {'UNITTEST/BTC': parse_ticker_dataframe(data, '1m', fill_missing=True)}
return {'UNITTEST/BTC': parse_ticker_dataframe(data, '1m', pair="UNITTEST/BTC",
fill_missing=True)}
def simple_backtest(config, contour, num_results, mocker) -> None:
@@ -102,7 +103,8 @@ def simple_backtest(config, contour, num_results, mocker) -> None:
def mocked_load_data(datadir, pairs=[], ticker_interval='0m', refresh_pairs=False,
timerange=None, exchange=None, live=False):
tickerdata = history.load_tickerdata_file(datadir, 'UNITTEST/BTC', '1m', timerange=timerange)
pairdata = {'UNITTEST/BTC': parse_ticker_dataframe(tickerdata, '1m', fill_missing=True)}
pairdata = {'UNITTEST/BTC': parse_ticker_dataframe(tickerdata, '1m', pair="UNITTEST/BTC",
fill_missing=True)}
return pairdata
@@ -350,7 +352,8 @@ def test_tickerdata_to_dataframe_bt(default_conf, mocker) -> None:
patch_exchange(mocker)
timerange = TimeRange(None, 'line', 0, -100)
tick = history.load_tickerdata_file(None, 'UNITTEST/BTC', '1m', timerange=timerange)
tickerlist = {'UNITTEST/BTC': parse_ticker_dataframe(tick, '1m', fill_missing=True)}
tickerlist = {'UNITTEST/BTC': parse_ticker_dataframe(tick, '1m', pair="UNITTEST/BTC",
fill_missing=True)}
backtesting = Backtesting(default_conf)
data = backtesting.strategy.tickerdata_to_dataframe(tickerlist)

View File

@@ -427,7 +427,8 @@ def test_has_space(hyperopt):
def test_populate_indicators(hyperopt) -> None:
tick = load_tickerdata_file(None, 'UNITTEST/BTC', '1m')
tickerlist = {'UNITTEST/BTC': parse_ticker_dataframe(tick, '1m', fill_missing=True)}
tickerlist = {'UNITTEST/BTC': parse_ticker_dataframe(tick, '1m', pair="UNITTEST/BTC",
fill_missing=True)}
dataframes = hyperopt.strategy.tickerdata_to_dataframe(tickerlist)
dataframe = hyperopt.custom_hyperopt.populate_indicators(dataframes['UNITTEST/BTC'],
{'pair': 'UNITTEST/BTC'})
@@ -440,7 +441,8 @@ def test_populate_indicators(hyperopt) -> None:
def test_buy_strategy_generator(hyperopt) -> None:
tick = load_tickerdata_file(None, 'UNITTEST/BTC', '1m')
tickerlist = {'UNITTEST/BTC': parse_ticker_dataframe(tick, '1m', fill_missing=True)}
tickerlist = {'UNITTEST/BTC': parse_ticker_dataframe(tick, '1m', pair="UNITTEST/BTC",
fill_missing=True)}
dataframes = hyperopt.strategy.tickerdata_to_dataframe(tickerlist)
dataframe = hyperopt.custom_hyperopt.populate_indicators(dataframes['UNITTEST/BTC'],
{'pair': 'UNITTEST/BTC'})

View File

@@ -10,7 +10,8 @@ from freqtrade.strategy.default_strategy import DefaultStrategy
@pytest.fixture
def result():
with open('freqtrade/tests/testdata/ETH_BTC-1m.json') as data_file:
return parse_ticker_dataframe(json.load(data_file), '1m', fill_missing=True)
return parse_ticker_dataframe(json.load(data_file), '1m', pair="UNITTEST/BTC",
fill_missing=True)
def test_default_strategy_structure():

View File

@@ -111,7 +111,8 @@ def test_tickerdata_to_dataframe(default_conf) -> None:
timerange = TimeRange(None, 'line', 0, -100)
tick = load_tickerdata_file(None, 'UNITTEST/BTC', '1m', timerange=timerange)
tickerlist = {'UNITTEST/BTC': parse_ticker_dataframe(tick, '1m', fill_missing=True)}
tickerlist = {'UNITTEST/BTC': parse_ticker_dataframe(tick, '1m', pair="UNITTEST/BTC",
fill_missing=True)}
data = strategy.tickerdata_to_dataframe(tickerlist)
assert len(data['UNITTEST/BTC']) == 102 # partial candle was removed

View File

@@ -17,7 +17,8 @@ def test_shorten_date() -> None:
def test_datesarray_to_datetimearray(ticker_history_list):
dataframes = parse_ticker_dataframe(ticker_history_list, "5m", fill_missing=True)
dataframes = parse_ticker_dataframe(ticker_history_list, "5m", pair="UNITTEST/BTC",
fill_missing=True)
dates = datesarray_to_datetimearray(dataframes['date'])
assert isinstance(dates[0], datetime.datetime)
@@ -34,7 +35,8 @@ def test_datesarray_to_datetimearray(ticker_history_list):
def test_common_datearray(default_conf) -> None:
strategy = DefaultStrategy(default_conf)
tick = load_tickerdata_file(None, 'UNITTEST/BTC', '1m')
tickerlist = {'UNITTEST/BTC': parse_ticker_dataframe(tick, "1m", fill_missing=True)}
tickerlist = {'UNITTEST/BTC': parse_ticker_dataframe(tick, "1m", pair="UNITTEST/BTC",
fill_missing=True)}
dataframes = strategy.tickerdata_to_dataframe(tickerlist)
dates = common_datearray(dataframes)