Merge branch 'develop' into dev-merge-rl

This commit is contained in:
robcaulk
2022-09-22 19:46:50 +02:00
121 changed files with 1525 additions and 564 deletions

View File

@@ -16,6 +16,7 @@ from numpy.typing import NDArray
from pandas import DataFrame
from freqtrade.configuration import TimeRange
from freqtrade.constants import Config
from freqtrade.data.history import load_pair_history
from freqtrade.exceptions import OperationalException
from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
@@ -27,9 +28,7 @@ logger = logging.getLogger(__name__)
class pair_info(TypedDict):
model_filename: str
first: bool
trained_timestamp: int
priority: int
data_path: str
extras: dict
@@ -58,7 +57,7 @@ class FreqaiDataDrawer:
Juha Nykänen @suikula, Wagner Costa @wagnercosta, Johan Vlugt @Jooopieeert
"""
def __init__(self, full_path: Path, config: dict, follow_mode: bool = False):
def __init__(self, full_path: Path, config: Config, follow_mode: bool = False):
self.config = config
self.freqai_info = config.get("freqai", {})
@@ -91,7 +90,7 @@ class FreqaiDataDrawer:
self.old_DBSCAN_eps: Dict[str, float] = {}
self.empty_pair_dict: pair_info = {
"model_filename": "", "trained_timestamp": 0,
"priority": 1, "first": True, "data_path": "", "extras": {}}
"data_path": "", "extras": {}}
self.limit_ram_use = self.freqai_info.get('limit_ram_usage', False)
def load_drawer_from_disk(self):
@@ -217,7 +216,6 @@ class FreqaiDataDrawer:
self.pair_dict[pair] = self.empty_pair_dict.copy()
model_filename = ""
trained_timestamp = 0
self.pair_dict[pair]["priority"] = len(self.pair_dict)
if not data_path_set and self.follow_mode:
logger.warning(
@@ -237,18 +235,9 @@ class FreqaiDataDrawer:
return
else:
self.pair_dict[metadata["pair"]] = self.empty_pair_dict.copy()
self.pair_dict[metadata["pair"]]["priority"] = len(self.pair_dict)
return
def pair_to_end_of_training_queue(self, pair: str) -> None:
    """
    Move `pair` to the back of the training queue.
    While holding `pair_dict_lock`, the priority of every entry in `pair_dict`
    is lowered by one, then `pair` is given a priority equal to the number of
    entries in `pair_dict` (i.e. the last position).
    :param pair: key of the entry in `pair_dict` to send to the end of the queue
    """
    with self.pair_dict_lock:
        # every pair advances one position
        for entry in self.pair_dict.values():
            entry["priority"] -= 1
        # the requested pair takes the last position
        self.pair_dict[pair]["priority"] = len(self.pair_dict)
def set_initial_return_values(self, pair: str, pred_df: DataFrame) -> None:
"""
Set the initial return values to the historical predictions dataframe. This avoids needing
@@ -356,7 +345,7 @@ class FreqaiDataDrawer:
for dir in model_folders:
result = pattern.match(str(dir.name))
if result is None:
break
continue
coin = result.group(1)
timestamp = result.group(2)

View File

@@ -18,6 +18,7 @@ from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from freqtrade.configuration import TimeRange
from freqtrade.constants import Config
from freqtrade.exceptions import OperationalException
from freqtrade.exchange import timeframe_to_seconds
from freqtrade.strategy.interface import IStrategy
@@ -57,7 +58,7 @@ class FreqaiDataKitchen:
def __init__(
self,
config: Dict[str, Any],
config: Config,
live: bool = False,
pair: str = "",
):
@@ -774,12 +775,22 @@ class FreqaiDataKitchen:
def compute_inlier_metric(self, set_='train') -> None:
"""
Compute inlier metric from backwards distance distributions.
This metric defines how well features from a timepoint fit
into previous timepoints.
"""
def normalise(dataframe: DataFrame, key: str) -> DataFrame:
    """
    Min-max scale `dataframe` using bounds kept in `self.data`.
    When the enclosing `set_` is 'train' the bounds are taken from `dataframe`
    itself and stored under f'{key}_min' / f'{key}_max'; for any other set the
    previously stored bounds are read back and reused.
    :param dataframe: values to scale
    :param key: name used to build the `self.data` keys for the bounds
    :return: (dataframe - min) / (max - min)
    """
    if set_ != 'train':
        # reuse the bounds recorded for this key
        lower = self.data[f'{key}_min']
        upper = self.data[f'{key}_max']
    else:
        lower = dataframe.min()
        upper = dataframe.max()
        self.data[f'{key}_min'] = lower
        self.data[f'{key}_max'] = upper
    span = upper - lower
    return (dataframe - lower) / span
no_prev_pts = self.freqai_config["feature_parameters"]["inlier_metric_window"]
if set_ == 'train':
@@ -824,7 +835,12 @@ class FreqaiDataKitchen:
inliers = pd.DataFrame(index=distances.index)
for key in distances.keys():
current_distances = distances[key].dropna()
fit_params = stats.weibull_min.fit(current_distances)
current_distances = normalise(current_distances, key)
if set_ == 'train':
fit_params = stats.weibull_min.fit(current_distances)
self.data[f'{key}_fit_params'] = fit_params
else:
fit_params = self.data[f'{key}_fit_params']
quantiles = stats.weibull_min.cdf(current_distances, *fit_params)
df_inlier = pd.DataFrame(

View File

@@ -3,6 +3,7 @@ import shutil
import threading
import time
from abc import ABC, abstractmethod
from collections import deque
from datetime import datetime, timezone
from pathlib import Path
from threading import Lock
@@ -14,12 +15,13 @@ from numpy.typing import NDArray
from pandas import DataFrame
from freqtrade.configuration import TimeRange
from freqtrade.constants import DATETIME_PRINT_FORMAT
from freqtrade.constants import DATETIME_PRINT_FORMAT, Config
from freqtrade.enums import RunMode
from freqtrade.exceptions import OperationalException
from freqtrade.exchange import timeframe_to_seconds
from freqtrade.freqai.data_drawer import FreqaiDataDrawer
from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
from freqtrade.freqai.utils import plot_feature_importance
from freqtrade.strategy.interface import IStrategy
@@ -50,7 +52,7 @@ class IFreqaiModel(ABC):
Juha Nykänen @suikula, Wagner Costa @wagnercosta, Johan Vlugt @Jooopieeert
"""
def __init__(self, config: Dict[str, Any]) -> None:
def __init__(self, config: Config) -> None:
self.config = config
self.assert_config(self.config)
@@ -80,6 +82,7 @@ class IFreqaiModel(ABC):
self.pair_it = 0
self.pair_it_train = 0
self.total_pairs = len(self.config.get("exchange", {}).get("pair_whitelist"))
self.train_queue = self._set_train_queue()
self.last_trade_database_summary: DataFrame = {}
self.current_trade_database_summary: DataFrame = {}
self.analysis_lock = Lock()
@@ -101,7 +104,7 @@ class IFreqaiModel(ABC):
return ({})
self.strategy: Optional[IStrategy] = None
def assert_config(self, config: Dict[str, Any]) -> None:
def assert_config(self, config: Config) -> None:
if not config.get("freqai", {}):
raise OperationalException("No freqai parameters found in configuration file.")
@@ -184,29 +187,40 @@ class IFreqaiModel(ABC):
"""
while not self._stop_event.is_set():
time.sleep(1)
for pair in self.config.get("exchange", {}).get("pair_whitelist"):
pair = self.train_queue[0]
(_, trained_timestamp, _) = self.dd.get_pair_dict_info(pair)
# ensure pair is available in dp
if pair not in strategy.dp.current_whitelist():
self.train_queue.popleft()
logger.warning(f'{pair} not in current whitelist, removing from train queue.')
continue
if self.dd.pair_dict[pair]["priority"] != 1:
continue
dk = FreqaiDataKitchen(self.config, self.live, pair)
dk.set_paths(pair, trained_timestamp)
(
retrain,
new_trained_timerange,
data_load_timerange,
) = dk.check_if_new_training_required(trained_timestamp)
dk.set_paths(pair, new_trained_timerange.stopts)
(_, trained_timestamp, _) = self.dd.get_pair_dict_info(pair)
if retrain:
self.train_timer('start')
dk = FreqaiDataKitchen(self.config, self.live, pair)
dk.set_paths(pair, trained_timestamp)
(
retrain,
new_trained_timerange,
data_load_timerange,
) = dk.check_if_new_training_required(trained_timestamp)
dk.set_paths(pair, new_trained_timerange.stopts)
if retrain:
self.train_timer('start')
try:
self.extract_data_and_train_model(
new_trained_timerange, pair, strategy, dk, data_load_timerange
)
self.train_timer('stop')
except Exception as msg:
logger.warning(f'Training {pair} raised exception {msg}, skipping.')
self.dd.save_historic_predictions_to_disk()
self.train_timer('stop')
# only rotate the queue after the first has been trained.
self.train_queue.rotate(-1)
self.dd.save_historic_predictions_to_disk()
def start_backtesting(
self, dataframe: DataFrame, metadata: dict, dk: FreqaiDataKitchen
@@ -561,11 +575,11 @@ class IFreqaiModel(ABC):
self.dd.pair_dict[pair]["trained_timestamp"] = new_trained_timerange.stopts
dk.set_new_model_names(pair, new_trained_timerange)
self.dd.pair_dict[pair]["first"] = False
if self.dd.pair_dict[pair]["priority"] == 1 and self.scanning:
self.dd.pair_to_end_of_training_queue(pair)
self.dd.save_data(model, pair, dk)
if self.freqai_info["feature_parameters"].get("plot_feature_importance", False):
plot_feature_importance(model, pair, dk)
if self.freqai_info.get("purge_old_models", False):
self.dd.purge_old_models()
@@ -689,6 +703,32 @@ class IFreqaiModel(ABC):
return init_model
def _set_train_queue(self):
    """
    Build the initial training queue.
    If `self.dd.pair_dict` is empty, the queue is simply the configured
    whitelist. Otherwise whitelisted pairs found in `pair_dict` are queued in
    ascending order of their 'trained_timestamp', and whitelisted pairs that
    have no entry yet are pushed to the front of the queue.
    :return: deque of pair names
    """
    whitelist = self.config.get("exchange", {}).get("pair_whitelist")

    if not self.dd.pair_dict:
        logger.info('Set fresh train queue from whitelist. '
                    f'Queue: {whitelist}')
        return deque(whitelist)

    # smallest trained_timestamp first
    by_timestamp = sorted(self.dd.pair_dict.items(),
                          key=lambda item: item[1]['trained_timestamp'])
    best_queue = deque(name for name, _ in by_timestamp if name in whitelist)

    # whitelisted pairs without a pair_dict entry go to the front
    for name in whitelist:
        if name not in best_queue:
            best_queue.appendleft(name)

    logger.info('Set existing queue from trained timestamps. '
                f'Best approximation queue: {best_queue}')
    return best_queue
# Following methods which are overridden by user made prediction models.
# See freqai/prediction_models/CatboostPredictionModel.py for an example.

View File

@@ -0,0 +1,85 @@
import logging
from typing import Any, Dict, Tuple
import numpy as np
import numpy.typing as npt
import pandas as pd
from pandas import DataFrame
from pandas.api.types import is_integer_dtype
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from freqtrade.freqai.base_models.BaseClassifierModel import BaseClassifierModel
from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
logger = logging.getLogger(__name__)
class XGBoostClassifier(BaseClassifierModel):
    """
    Classification model built on `XGBClassifier`.
    Extends BaseClassifierModel: `fit()` trains the classifier on the supplied
    data dictionary, and `predict()` post-processes the parent class'
    predictions so that encoded classes are mapped back to the original labels.
    """

    def fit(self, data_dictionary: Dict, dk: FreqaiDataKitchen, **kwargs) -> Any:
        """
        Train an XGBClassifier on the training split of `data_dictionary`.
        :param data_dictionary: dictionary providing "train_features",
            "train_labels", "train_weights" and, unless the configured
            test_size is 0, "test_features" and "test_labels".
        :param dk: data kitchen of the current pair; `dk.pair` is used to look
            up the initial model.
        :return: the fitted XGBClassifier
        """
        encoder = LabelEncoder()

        train_features = data_dictionary["train_features"].to_numpy()
        train_labels = data_dictionary["train_labels"].to_numpy()[:, 0]
        # non-integer labels are encoded to int64 class ids
        if not is_integer_dtype(train_labels):
            train_labels = pd.Series(encoder.fit_transform(train_labels), dtype="int64")

        eval_set = None
        split_params = self.freqai_info.get('data_split_parameters', {})
        if split_params.get('test_size', 0.1) != 0:
            eval_features = data_dictionary["test_features"].to_numpy()
            eval_labels = data_dictionary["test_labels"].to_numpy()[:, 0]
            # reuse the encoder state from the training labels
            if not is_integer_dtype(eval_labels):
                eval_labels = pd.Series(encoder.transform(eval_labels), dtype="int64")
            eval_set = [(eval_features, eval_labels)]

        sample_weights = data_dictionary["train_weights"]
        starting_model = self.get_init_model(dk.pair)

        classifier = XGBClassifier(**self.model_training_parameters)
        classifier.fit(X=train_features, y=train_labels, eval_set=eval_set,
                       sample_weight=sample_weights, xgb_model=starting_model)
        return classifier

    def predict(
        self, unfiltered_df: DataFrame, dk: FreqaiDataKitchen, **kwargs
    ) -> Tuple[DataFrame, npt.NDArray[np.int_]]:
        """
        Run the parent class' prediction and translate encoded classes back
        to the original label names.
        :param unfiltered_df: dataframe handed through to the parent `predict()`
        :param dk: data kitchen; `dk.label_list`, `dk.data['labels_std']` are
            read and `dk.do_predict` is overwritten with the parent's result.
        :return: tuple of (prediction dataframe, dk.do_predict)
        """
        (pred_df, dk.do_predict) = super().predict(unfiltered_df, dk, **kwargs)

        encoder = LabelEncoder()
        target = dk.label_list[0]
        original_names = list(dk.data['labels_std'].keys())
        encoded_names = encoder.fit_transform(original_names).tolist()

        # decode the predicted class column, then the per-class columns
        pred_df[target] = encoder.inverse_transform(pred_df[target])
        pred_df = pred_df.rename(columns=dict(zip(encoded_names, original_names)))

        return (pred_df, dk.do_predict)

View File

@@ -1,19 +1,25 @@
import logging
from datetime import datetime, timezone
from typing import Any
import numpy as np
import pandas as pd
from freqtrade.configuration import TimeRange
from freqtrade.constants import Config
from freqtrade.data.dataprovider import DataProvider
from freqtrade.data.history.history_utils import refresh_backtest_ohlcv_data
from freqtrade.exceptions import OperationalException
from freqtrade.exchange import timeframe_to_seconds
from freqtrade.exchange.exchange import market_is_active
from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
from freqtrade.plugins.pairlist.pairlist_helpers import dynamic_expand_pairlist
logger = logging.getLogger(__name__)
def download_all_data_for_training(dp: DataProvider, config: dict) -> None:
def download_all_data_for_training(dp: DataProvider, config: Config) -> None:
"""
Called only once upon start of bot to download the necessary data for
populating indicators and training the model.
@@ -47,9 +53,7 @@ def download_all_data_for_training(dp: DataProvider, config: dict) -> None:
)
def get_required_data_timerange(
config: dict
) -> TimeRange:
def get_required_data_timerange(config: Config) -> TimeRange:
"""
Used to compute the required data download time range
for auto data-download in FreqAI
@@ -86,7 +90,7 @@ def get_required_data_timerange(
# Keep below for when we wish to download heterogeneously lengthed data for FreqAI.
# def download_all_data_for_training(dp: DataProvider, config: dict) -> None:
# def download_all_data_for_training(dp: DataProvider, config: Config) -> None:
# """
# Called only once upon start of bot to download the necessary data for
# populating indicators and training a FreqAI model.
@@ -132,3 +136,58 @@ def get_required_data_timerange(
# trading_mode=config.get("trading_mode", "spot"),
# prepend=config.get("prepend_data", False),
# )
def plot_feature_importance(model: Any, pair: str, dk: FreqaiDataKitchen,
                            count_max: int = 25) -> None:
    """
    Plot Best and worst features by importance for a single sub-train.
    One HTML plot is stored per label via `store_plot_file`.
    :param model: Any = A model which was `fit` using a common library
                        such as catboost, lightgbm or xgboost
    :param pair: str = pair e.g. BTC/USD
    :param dk: FreqaiDataKitchen = non-persistent data container for current coin/loop
    :param count_max: int = the amount of features to be loaded per column
    """
    from freqtrade.plot.plotting import go, make_subplots, store_plot_file

    # Map each label to the estimator that predicts it
    if 'FreqaiMultiOutputRegressor' in str(model.__class__):
        models = dict(zip(dk.label_list, model.estimators_))
    else:
        models = {dk.label_list[0]: model}

    def add_feature_trace(fig, fi_df, col):
        # horizontal bar chart of importances in the requested subplot column
        return fig.add_trace(
            go.Bar(
                x=fi_df["feature_importance"],
                y=fi_df["feature_names"],
                orientation='h', showlegend=False
            ), row=1, col=col
        )

    for label, mdl in models.items():
        model_class = str(mdl.__class__)
        if "catboost.core" in model_class:
            feature_importance = mdl.get_feature_importance()
        # Each substring must be tested against the class name explicitly:
        # `"lightgbm.sklearn" or "xgb" in ...` is always truthy and would make
        # the unsupported-model branch below unreachable.
        elif "lightgbm.sklearn" in model_class or "xgb" in model_class:
            feature_importance = mdl.feature_importances_
        else:
            logger.info('Model type not support for generating feature importances.')
            return

        # Data preparation
        fi_df = pd.DataFrame({
            "feature_names": np.array(dk.training_features_list),
            "feature_importance": np.array(feature_importance)
        })
        fi_df_top = fi_df.nlargest(count_max, "feature_importance")[::-1]
        fi_df_worst = fi_df.nsmallest(count_max, "feature_importance")[::-1]

        # Plotting
        fig = make_subplots(rows=1, cols=2, horizontal_spacing=0.5)
        fig = add_feature_trace(fig, fi_df_top, 1)
        fig = add_feature_trace(fig, fi_df_worst, 2)
        fig.update_layout(title_text=f"Best and worst features by importance {pair}")

        # strip two FreqAI specific characters from the label for the file name
        file_label = label.replace('&', '').replace('%', '')
        store_plot_file(fig, f"{dk.model_filename}-{file_label}.html", dk.data_path)