Merge pull request #6707 from koradiyakaushal/develop

Ref: timeseries friendly merge_ordered in merge_informative_pair func…
This commit is contained in:
Matthias 2022-04-23 14:22:23 +02:00 committed by GitHub
commit b1ca47e3d6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 25 additions and 4 deletions

View File

@ -56,12 +56,18 @@ def merge_informative_pair(dataframe: pd.DataFrame, informative: pd.DataFrame,
# Combine the 2 dataframes # Combine the 2 dataframes
# all indicators on the informative sample MUST be calculated before this point # all indicators on the informative sample MUST be calculated before this point
dataframe = pd.merge(dataframe, informative, left_on='date', if ffill:
right_on=date_merge, how='left') # https://pandas.pydata.org/docs/user_guide/merging.html#timeseries-friendly-merging
# merge_ordered - ffill method is 2.5x faster than separate ffill()
dataframe = pd.merge_ordered(dataframe, informative, fill_method="ffill", left_on='date',
right_on=date_merge, how='left')
else:
dataframe = pd.merge(dataframe, informative, left_on='date',
right_on=date_merge, how='left')
dataframe = dataframe.drop(date_merge, axis=1) dataframe = dataframe.drop(date_merge, axis=1)
if ffill: # if ffill:
dataframe = dataframe.ffill() # dataframe = dataframe.ffill()
return dataframe return dataframe

View File

@ -68,6 +68,21 @@ def test_merge_informative_pair():
assert result.iloc[7]['date_1h'] == result.iloc[4]['date'] assert result.iloc[7]['date_1h'] == result.iloc[4]['date']
assert result.iloc[8]['date_1h'] == result.iloc[4]['date'] assert result.iloc[8]['date_1h'] == result.iloc[4]['date']
informative = generate_test_data('1h', 40)
result = merge_informative_pair(data, informative, '15m', '1h', ffill=False)
# First 3 rows are empty
assert result.iloc[0]['date_1h'] is pd.NaT
assert result.iloc[1]['date_1h'] is pd.NaT
assert result.iloc[2]['date_1h'] is pd.NaT
# Without ffill, only row 3 contains the starting date (0:00); rows 4-6 stay NaT
assert result.iloc[3]['date_1h'] == result.iloc[0]['date']
assert result.iloc[4]['date_1h'] is pd.NaT
assert result.iloc[5]['date_1h'] is pd.NaT
assert result.iloc[6]['date_1h'] is pd.NaT
# Row 7 contains the next hourly date (the date of original row 4); row 8 stays NaT
assert result.iloc[7]['date_1h'] == result.iloc[4]['date']
assert result.iloc[8]['date_1h'] is pd.NaT
def test_merge_informative_pair_same(): def test_merge_informative_pair_same():
data = generate_test_data('15m', 40) data = generate_test_data('15m', 40)