Merge branch 'develop' of github.com:lolongcovas/freqtrade into feature/training_data_slice_normalization

This commit is contained in:
longyu
2022-09-30 19:57:30 +02:00
174 changed files with 5153 additions and 1737 deletions

View File

@@ -1,4 +1,5 @@
import logging
from time import time
from typing import Any, Tuple
import numpy as np
@@ -32,7 +33,9 @@ class BaseClassifierModel(IFreqaiModel):
:model: Trained model which can be used to inference (self.predict)
"""
logger.info("-------------------- Starting training " f"{pair} --------------------")
logger.info(f"-------------------- Starting training {pair} --------------------")
start_time = time()
# filter the features requested by user in the configuration file and elegantly handle NaNs
features_filtered, labels_filtered = dk.filter_features(
@@ -45,10 +48,10 @@ class BaseClassifierModel(IFreqaiModel):
start_date = unfiltered_df["date"].iloc[0].strftime("%Y-%m-%d")
end_date = unfiltered_df["date"].iloc[-1].strftime("%Y-%m-%d")
logger.info(f"-------------------- Training on data from {start_date} to "
f"{end_date}--------------------")
f"{end_date} --------------------")
# split data into train/test data.
data_dictionary = dk.make_train_test_datasets(features_filtered, labels_filtered)
if not self.freqai_info.get('fit_live_predictions', 0) or not self.live:
if not self.freqai_info.get("fit_live_predictions", 0) or not self.live:
dk.fit_labels()
# normalize all data based on train_dataset only
data_dictionary = dk.normalize_data(data_dictionary)
@@ -57,13 +60,16 @@ class BaseClassifierModel(IFreqaiModel):
self.data_cleaning_train(dk)
logger.info(
f'Training model on {len(dk.data_dictionary["train_features"].columns)}' " features"
f"Training model on {len(dk.data_dictionary['train_features'].columns)} features"
)
logger.info(f'Training model on {len(data_dictionary["train_features"])} data points')
logger.info(f"Training model on {len(data_dictionary['train_features'])} data points")
model = self.fit(data_dictionary, dk)
logger.info(f"--------------------done training {pair}--------------------")
end_time = time()
logger.info(f"-------------------- Done training {pair} "
f"({end_time - start_time:.2f} secs) --------------------")
return model

View File

@@ -1,4 +1,5 @@
import logging
from time import time
from typing import Any, Tuple
import numpy as np
@@ -31,7 +32,9 @@ class BaseRegressionModel(IFreqaiModel):
:model: Trained model which can be used to inference (self.predict)
"""
logger.info("-------------------- Starting training " f"{pair} --------------------")
logger.info(f"-------------------- Starting training {pair} --------------------")
start_time = time()
# filter the features requested by user in the configuration file and elegantly handle NaNs
features_filtered, labels_filtered = dk.filter_features(
@@ -44,10 +47,10 @@ class BaseRegressionModel(IFreqaiModel):
start_date = unfiltered_df["date"].iloc[0].strftime("%Y-%m-%d")
end_date = unfiltered_df["date"].iloc[-1].strftime("%Y-%m-%d")
logger.info(f"-------------------- Training on data from {start_date} to "
f"{end_date}--------------------")
f"{end_date} --------------------")
# split data into train/test data.
data_dictionary = dk.make_train_test_datasets(features_filtered, labels_filtered)
if not self.freqai_info.get('fit_live_predictions', 0) or not self.live:
if not self.freqai_info.get("fit_live_predictions", 0) or not self.live:
dk.fit_labels()
# normalize all data based on train_dataset only
data_dictionary = dk.normalize_data(data_dictionary)
@@ -56,13 +59,16 @@ class BaseRegressionModel(IFreqaiModel):
self.data_cleaning_train(dk)
logger.info(
f'Training model on {len(dk.data_dictionary["train_features"].columns)}' " features"
f"Training model on {len(dk.data_dictionary['train_features'].columns)} features"
)
logger.info(f'Training model on {len(data_dictionary["train_features"])} data points')
logger.info(f"Training model on {len(data_dictionary['train_features'])} data points")
model = self.fit(data_dictionary, dk)
logger.info(f"--------------------done training {pair}--------------------")
end_time = time()
logger.info(f"-------------------- Done training {pair} "
f"({end_time - start_time:.2f} secs) --------------------")
return model

View File

@@ -1,4 +1,5 @@
import logging
from time import time
from typing import Any
from pandas import DataFrame
@@ -28,7 +29,9 @@ class BaseTensorFlowModel(IFreqaiModel):
:model: Trained model which can be used to inference (self.predict)
"""
logger.info("-------------------- Starting training " f"{pair} --------------------")
logger.info(f"-------------------- Starting training {pair} --------------------")
start_time = time()
# filter the features requested by user in the configuration file and elegantly handle NaNs
features_filtered, labels_filtered = dk.filter_features(
@@ -41,10 +44,10 @@ class BaseTensorFlowModel(IFreqaiModel):
start_date = unfiltered_df["date"].iloc[0].strftime("%Y-%m-%d")
end_date = unfiltered_df["date"].iloc[-1].strftime("%Y-%m-%d")
logger.info(f"-------------------- Training on data from {start_date} to "
f"{end_date}--------------------")
f"{end_date} --------------------")
# split data into train/test data.
data_dictionary = dk.make_train_test_datasets(features_filtered, labels_filtered)
if not self.freqai_info.get('fit_live_predictions', 0) or not self.live:
if not self.freqai_info.get("fit_live_predictions", 0) or not self.live:
dk.fit_labels()
# normalize all data based on train_dataset only
data_dictionary = dk.normalize_data(data_dictionary)
@@ -53,12 +56,15 @@ class BaseTensorFlowModel(IFreqaiModel):
self.data_cleaning_train(dk)
logger.info(
f'Training model on {len(dk.data_dictionary["train_features"].columns)}' " features"
f"Training model on {len(dk.data_dictionary['train_features'].columns)} features"
)
logger.info(f'Training model on {len(data_dictionary["train_features"])} data points')
logger.info(f"Training model on {len(data_dictionary['train_features'])} data points")
model = self.fit(data_dictionary, dk)
logger.info(f"--------------------done training {pair}--------------------")
end_time = time()
logger.info(f"-------------------- Done training {pair} "
f"({end_time - start_time:.2f} secs) --------------------")
return model

View File

@@ -1,4 +1,3 @@
from joblib import Parallel
from sklearn.multioutput import MultiOutputRegressor, _fit_estimator
from sklearn.utils.fixes import delayed

View File

@@ -16,6 +16,7 @@ from numpy.typing import NDArray
from pandas import DataFrame
from freqtrade.configuration import TimeRange
from freqtrade.constants import Config
from freqtrade.data.history import load_pair_history
from freqtrade.exceptions import OperationalException
from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
@@ -27,9 +28,7 @@ logger = logging.getLogger(__name__)
class pair_info(TypedDict):
model_filename: str
first: bool
trained_timestamp: int
priority: int
data_path: str
extras: dict
@@ -58,7 +57,7 @@ class FreqaiDataDrawer:
Juha Nykänen @suikula, Wagner Costa @wagnercosta, Johan Vlugt @Jooopieeert
"""
def __init__(self, full_path: Path, config: dict, follow_mode: bool = False):
def __init__(self, full_path: Path, config: Config, follow_mode: bool = False):
self.config = config
self.freqai_info = config.get("freqai", {})
@@ -91,7 +90,7 @@ class FreqaiDataDrawer:
self.old_DBSCAN_eps: Dict[str, float] = {}
self.empty_pair_dict: pair_info = {
"model_filename": "", "trained_timestamp": 0,
"priority": 1, "first": True, "data_path": "", "extras": {}}
"data_path": "", "extras": {}}
def load_drawer_from_disk(self):
"""
@@ -216,7 +215,6 @@ class FreqaiDataDrawer:
self.pair_dict[pair] = self.empty_pair_dict.copy()
model_filename = ""
trained_timestamp = 0
self.pair_dict[pair]["priority"] = len(self.pair_dict)
if not data_path_set and self.follow_mode:
logger.warning(
@@ -236,18 +234,9 @@ class FreqaiDataDrawer:
return
else:
self.pair_dict[metadata["pair"]] = self.empty_pair_dict.copy()
self.pair_dict[metadata["pair"]]["priority"] = len(self.pair_dict)
return
def pair_to_end_of_training_queue(self, pair: str) -> None:
# march all pairs up in the queue
with self.pair_dict_lock:
for p in self.pair_dict:
self.pair_dict[p]["priority"] -= 1
# send pair to end of queue
self.pair_dict[pair]["priority"] = len(self.pair_dict)
def set_initial_return_values(self, pair: str, pred_df: DataFrame) -> None:
"""
Set the initial return values to the historical predictions dataframe. This avoids needing
@@ -324,6 +313,7 @@ class FreqaiDataDrawer:
"""
dk.find_features(dataframe)
dk.find_labels(dataframe)
full_labels = dk.label_list + dk.unique_class_list
@@ -387,7 +377,27 @@ class FreqaiDataDrawer:
if self.config.get("freqai", {}).get("purge_old_models", False):
self.purge_old_models()
# Functions pulled back from FreqaiDataKitchen because they relied on DataDrawer
def save_metadata(self, dk: FreqaiDataKitchen) -> None:
"""
Saves only metadata for backtesting studies if user prefers
not to save model data. This saves tremendous amounts of space
for users generating huge studies.
This is only active when `save_backtest_models`: false (not default)
"""
if not dk.data_path.is_dir():
dk.data_path.mkdir(parents=True, exist_ok=True)
save_path = Path(dk.data_path)
dk.data["data_path"] = str(dk.data_path)
dk.data["model_filename"] = str(dk.model_filename)
dk.data["training_features_list"] = list(dk.data_dictionary["train_features"].columns)
dk.data["label_list"] = dk.label_list
with open(save_path / f"{dk.model_filename}_metadata.json", "w") as fp:
rapidjson.dump(dk.data, fp, default=self.np_encoder, number_mode=rapidjson.NM_NATIVE)
return
def save_data(self, model: Any, coin: str, dk: FreqaiDataKitchen) -> None:
"""
@@ -441,6 +451,16 @@ class FreqaiDataDrawer:
return
def load_metadata(self, dk: FreqaiDataKitchen) -> None:
"""
Load only metadata into datakitchen to increase performance during
presaved backtesting (prediction file loading).
"""
with open(dk.data_path / f"{dk.model_filename}_metadata.json", "r") as fp:
dk.data = json.load(fp)
dk.training_features_list = dk.data["training_features_list"]
dk.label_list = dk.data["label_list"]
def load_data(self, coin: str, dk: FreqaiDataKitchen) -> Any:
"""
loads all data required to make a prediction on a sub-train time range

View File

@@ -18,6 +18,7 @@ from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from freqtrade.configuration import TimeRange
from freqtrade.constants import Config
from freqtrade.exceptions import OperationalException
from freqtrade.exchange import timeframe_to_seconds
from freqtrade.strategy.interface import IStrategy
@@ -57,7 +58,7 @@ class FreqaiDataKitchen:
def __init__(
self,
config: Dict[str, Any],
config: Config,
live: bool = False,
pair: str = "",
):
@@ -465,27 +466,6 @@ class FreqaiDataKitchen:
return df
def remove_training_from_backtesting(
self
) -> DataFrame:
"""
Function which takes the backtesting time range and
remove training data from dataframe, keeping only the
startup_candle_count candles
"""
startup_candle_count = self.config.get('startup_candle_count', 0)
tf = self.config['timeframe']
tr = self.config["timerange"]
backtesting_timerange = TimeRange.parse_timerange(tr)
if startup_candle_count > 0 and backtesting_timerange:
backtesting_timerange.subtract_start(timeframe_to_seconds(tf) * startup_candle_count)
start = datetime.fromtimestamp(backtesting_timerange.startts, tz=timezone.utc)
df = self.return_dataframe
df = df.loc[df["date"] >= start, :]
return df
def principal_component_analysis(self) -> None:
"""
Performs Principal Component Analysis on the data for dimensionality reduction
@@ -774,12 +754,22 @@ class FreqaiDataKitchen:
def compute_inlier_metric(self, set_='train') -> None:
"""
Compute inlier metric from backwards distance distributions.
This metric defines how well features from a timepoint fit
into previous timepoints.
"""
def normalise(dataframe: DataFrame, key: str) -> DataFrame:
if set_ == 'train':
min_value = dataframe.min()
max_value = dataframe.max()
self.data[f'{key}_min'] = min_value
self.data[f'{key}_max'] = max_value
else:
min_value = self.data[f'{key}_min']
max_value = self.data[f'{key}_max']
return (dataframe - min_value) / (max_value - min_value)
no_prev_pts = self.freqai_config["feature_parameters"]["inlier_metric_window"]
if set_ == 'train':
@@ -824,7 +814,12 @@ class FreqaiDataKitchen:
inliers = pd.DataFrame(index=distances.index)
for key in distances.keys():
current_distances = distances[key].dropna()
fit_params = stats.weibull_min.fit(current_distances)
current_distances = normalise(current_distances, key)
if set_ == 'train':
fit_params = stats.weibull_min.fit(current_distances)
self.data[f'{key}_fit_params'] = fit_params
else:
fit_params = self.data[f'{key}_fit_params']
quantiles = stats.weibull_min.cdf(current_distances, *fit_params)
df_inlier = pd.DataFrame(
@@ -836,7 +831,7 @@ class FreqaiDataKitchen:
inlier_metric = pd.DataFrame(
data=inliers.sum(axis=1) / no_prev_pts,
columns=['inlier_metric'],
columns=['%-inlier_metric'],
index=compute_df.index
)
@@ -900,11 +895,14 @@ class FreqaiDataKitchen:
"""
column_names = dataframe.columns
features = [c for c in column_names if "%" in c]
labels = [c for c in column_names if "&" in c]
if not features:
raise OperationalException("Could not find any features!")
self.training_features_list = features
def find_labels(self, dataframe: DataFrame) -> None:
column_names = dataframe.columns
labels = [c for c in column_names if "&" in c]
self.label_list = labels
def check_if_pred_in_training_spaces(self) -> None:
@@ -992,8 +990,6 @@ class FreqaiDataKitchen:
to_keep = [col for col in dataframe.columns if not col.startswith("&")]
self.return_dataframe = pd.concat([dataframe[to_keep], self.full_df], axis=1)
self.return_dataframe = self.remove_training_from_backtesting()
self.full_df = DataFrame()
return
@@ -1227,7 +1223,8 @@ class FreqaiDataKitchen:
def get_unique_classes_from_labels(self, dataframe: DataFrame) -> None:
self.find_features(dataframe)
# self.find_features(dataframe)
self.find_labels(dataframe)
for key in self.label_list:
if dataframe[key].dtype == object:

View File

@@ -3,6 +3,7 @@ import shutil
import threading
import time
from abc import ABC, abstractmethod
from collections import deque
from datetime import datetime, timezone
from pathlib import Path
from threading import Lock
@@ -14,12 +15,13 @@ from numpy.typing import NDArray
from pandas import DataFrame
from freqtrade.configuration import TimeRange
from freqtrade.constants import DATETIME_PRINT_FORMAT
from freqtrade.constants import DATETIME_PRINT_FORMAT, Config
from freqtrade.enums import RunMode
from freqtrade.exceptions import OperationalException
from freqtrade.exchange import timeframe_to_seconds
from freqtrade.freqai.data_drawer import FreqaiDataDrawer
from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
from freqtrade.freqai.utils import plot_feature_importance
from freqtrade.strategy.interface import IStrategy
@@ -50,7 +52,7 @@ class IFreqaiModel(ABC):
Juha Nykänen @suikula, Wagner Costa @wagnercosta, Johan Vlugt @Jooopieeert
"""
def __init__(self, config: Dict[str, Any]) -> None:
def __init__(self, config: Config) -> None:
self.config = config
self.assert_config(self.config)
@@ -63,7 +65,7 @@ class IFreqaiModel(ABC):
self.first = True
self.set_full_path()
self.follow_mode: bool = self.freqai_info.get("follow_mode", False)
self.save_backtest_models: bool = self.freqai_info.get("save_backtest_models", False)
self.save_backtest_models: bool = self.freqai_info.get("save_backtest_models", True)
if self.save_backtest_models:
logger.info('Backtesting module configured to save all models.')
self.dd = FreqaiDataDrawer(Path(self.full_path), self.config, self.follow_mode)
@@ -80,6 +82,7 @@ class IFreqaiModel(ABC):
self.pair_it = 0
self.pair_it_train = 0
self.total_pairs = len(self.config.get("exchange", {}).get("pair_whitelist"))
self.train_queue = self._set_train_queue()
self.last_trade_database_summary: DataFrame = {}
self.current_trade_database_summary: DataFrame = {}
self.analysis_lock = Lock()
@@ -89,6 +92,7 @@ class IFreqaiModel(ABC):
self.begin_time_train: float = 0
self.base_tf_seconds = timeframe_to_seconds(self.config['timeframe'])
self.continual_learning = self.freqai_info.get('continual_learning', False)
self.plot_features = self.ft_params.get("plot_feature_importances", 0)
self._threads: List[threading.Thread] = []
self._stop_event = threading.Event()
@@ -99,7 +103,7 @@ class IFreqaiModel(ABC):
"""
return ({})
def assert_config(self, config: Dict[str, Any]) -> None:
def assert_config(self, config: Config) -> None:
if not config.get("freqai", {}):
raise OperationalException("No freqai parameters found in configuration file.")
@@ -181,29 +185,40 @@ class IFreqaiModel(ABC):
"""
while not self._stop_event.is_set():
time.sleep(1)
for pair in self.config.get("exchange", {}).get("pair_whitelist"):
pair = self.train_queue[0]
(_, trained_timestamp, _) = self.dd.get_pair_dict_info(pair)
# ensure pair is avaialble in dp
if pair not in strategy.dp.current_whitelist():
self.train_queue.popleft()
logger.warning(f'{pair} not in current whitelist, removing from train queue.')
continue
if self.dd.pair_dict[pair]["priority"] != 1:
continue
dk = FreqaiDataKitchen(self.config, self.live, pair)
dk.set_paths(pair, trained_timestamp)
(
retrain,
new_trained_timerange,
data_load_timerange,
) = dk.check_if_new_training_required(trained_timestamp)
dk.set_paths(pair, new_trained_timerange.stopts)
(_, trained_timestamp, _) = self.dd.get_pair_dict_info(pair)
if retrain:
self.train_timer('start')
dk = FreqaiDataKitchen(self.config, self.live, pair)
dk.set_paths(pair, trained_timestamp)
(
retrain,
new_trained_timerange,
data_load_timerange,
) = dk.check_if_new_training_required(trained_timestamp)
dk.set_paths(pair, new_trained_timerange.stopts)
if retrain:
self.train_timer('start')
try:
self.extract_data_and_train_model(
new_trained_timerange, pair, strategy, dk, data_load_timerange
)
self.train_timer('stop')
except Exception as msg:
logger.warning(f'Training {pair} raised exception {msg}, skipping.')
self.dd.save_historic_predictions_to_disk()
self.train_timer('stop')
# only rotate the queue after the first has been trained.
self.train_queue.rotate(-1)
self.dd.save_historic_predictions_to_disk()
def start_backtesting(
self, dataframe: DataFrame, metadata: dict, dk: FreqaiDataKitchen
@@ -230,7 +245,8 @@ class IFreqaiModel(ABC):
# following tr_train. Both of these windows slide through the
# entire backtest
for tr_train, tr_backtest in zip(dk.training_timeranges, dk.backtesting_timeranges):
(_, _, _) = self.dd.get_pair_dict_info(metadata["pair"])
pair = metadata["pair"]
(_, _, _) = self.dd.get_pair_dict_info(pair)
train_it += 1
total_trains = len(dk.backtesting_timeranges)
self.training_timerange = tr_train
@@ -245,40 +261,42 @@ class IFreqaiModel(ABC):
tr_train.stopts,
tz=timezone.utc).strftime(DATETIME_PRINT_FORMAT)
logger.info(
f"Training {metadata['pair']}, {self.pair_it}/{self.total_pairs} pairs"
f"Training {pair}, {self.pair_it}/{self.total_pairs} pairs"
f" from {tr_train_startts_str} to {tr_train_stopts_str}, {train_it}/{total_trains} "
"trains"
)
trained_timestamp_int = int(trained_timestamp.stopts)
dk.data_path = Path(
dk.full_path
/
f"sub-train-{metadata['pair'].split('/')[0]}_{trained_timestamp_int}"
dk.full_path / f"sub-train-{pair.split('/')[0]}_{trained_timestamp_int}"
)
dk.set_new_model_names(metadata["pair"], trained_timestamp)
dk.set_new_model_names(pair, trained_timestamp)
if dk.check_if_backtest_prediction_exists():
self.dd.load_metadata(dk)
self.check_if_feature_list_matches_strategy(dataframe_train, dk)
append_df = dk.get_backtesting_prediction()
dk.append_predictions(append_df)
else:
if not self.model_exists(
metadata["pair"], dk, trained_timestamp=trained_timestamp_int
):
if not self.model_exists(dk):
dk.find_features(dataframe_train)
self.model = self.train(dataframe_train, metadata["pair"], dk)
self.dd.pair_dict[metadata["pair"]]["trained_timestamp"] = int(
dk.find_labels(dataframe_train)
self.model = self.train(dataframe_train, pair, dk)
self.dd.pair_dict[pair]["trained_timestamp"] = int(
trained_timestamp.stopts)
if self.plot_features:
plot_feature_importance(self.model, pair, dk, self.plot_features)
if self.save_backtest_models:
logger.info('Saving backtest model to disk.')
self.dd.save_data(self.model, metadata["pair"], dk)
self.dd.save_data(self.model, pair, dk)
else:
logger.info('Saving metadata to disk.')
self.dd.save_metadata(dk)
else:
self.model = self.dd.load_data(metadata["pair"], dk)
self.check_if_feature_list_matches_strategy(dataframe_train, dk)
self.model = self.dd.load_data(pair, dk)
# self.check_if_feature_list_matches_strategy(dataframe_train, dk)
pred_df, do_preds = self.predict(dataframe_backtest, dk)
append_df = dk.get_predictions_to_append(pred_df, do_preds)
dk.append_predictions(append_df)
@@ -357,8 +375,7 @@ class IFreqaiModel(ABC):
self.dd.return_null_values_to_strategy(dataframe, dk)
return dk
# ensure user is feeding the correct indicators to the model
self.check_if_feature_list_matches_strategy(dataframe, dk)
dk.find_labels(dataframe)
self.build_strategy_return_arrays(dataframe, dk, metadata["pair"], trained_timestamp)
@@ -416,14 +433,16 @@ class IFreqaiModel(ABC):
if "training_features_list_raw" in dk.data:
feature_list = dk.data["training_features_list_raw"]
else:
feature_list = dk.training_features_list
feature_list = dk.data['training_features_list']
if dk.training_features_list != feature_list:
raise OperationalException(
"Trying to access pretrained model with `identifier` "
"but found different features furnished by current strategy."
"Change `identifier` to train from scratch, or ensure the"
"strategy is furnishing the same features as the pretrained"
"model"
"model. In case of --strategy-list, please be aware that FreqAI "
"requires all strategies to maintain identical "
"populate_any_indicator() functions"
)
def data_cleaning_train(self, dk: FreqaiDataKitchen) -> None:
@@ -475,7 +494,7 @@ class IFreqaiModel(ABC):
if ft_params.get(
"principal_component_analysis", False
):
dk.pca_transform(self.dk.data_dictionary['prediction_features'])
dk.pca_transform(dk.data_dictionary['prediction_features'])
if ft_params.get("use_SVM_to_remove_outliers", False):
dk.use_SVM_to_remove_outliers(predict=True)
@@ -486,14 +505,10 @@ class IFreqaiModel(ABC):
if ft_params.get("use_DBSCAN_to_remove_outliers", False):
dk.use_DBSCAN_to_remove_outliers(predict=True)
def model_exists(
self,
pair: str,
dk: FreqaiDataKitchen,
trained_timestamp: int = None,
model_filename: str = "",
scanning: bool = False,
) -> bool:
# ensure user is feeding the correct indicators to the model
self.check_if_feature_list_matches_strategy(dk.data_dictionary['prediction_features'], dk)
def model_exists(self, dk: FreqaiDataKitchen) -> bool:
"""
Given a pair and path, check if a model already exists
:param pair: pair e.g. BTC/USD
@@ -501,11 +516,11 @@ class IFreqaiModel(ABC):
:return:
:boolean: whether the model file exists or not.
"""
path_to_modelfile = Path(dk.data_path / f"{model_filename}_model.joblib")
path_to_modelfile = Path(dk.data_path / f"{dk.model_filename}_model.joblib")
file_exists = path_to_modelfile.is_file()
if file_exists and not scanning:
if file_exists:
logger.info("Found model at %s", dk.data_path / dk.model_filename)
elif not scanning:
else:
logger.info("Could not find model at %s", dk.data_path / dk.model_filename)
return file_exists
@@ -552,16 +567,17 @@ class IFreqaiModel(ABC):
# find the features indicated by strategy and store in datakitchen
dk.find_features(unfiltered_dataframe)
dk.find_labels(unfiltered_dataframe)
model = self.train(unfiltered_dataframe, pair, dk)
self.dd.pair_dict[pair]["trained_timestamp"] = new_trained_timerange.stopts
dk.set_new_model_names(pair, new_trained_timerange)
self.dd.pair_dict[pair]["first"] = False
if self.dd.pair_dict[pair]["priority"] == 1 and self.scanning:
self.dd.pair_to_end_of_training_queue(pair)
self.dd.save_data(model, pair, dk)
if self.plot_features:
plot_feature_importance(model, pair, dk, self.plot_features)
if self.freqai_info.get("purge_old_models", False):
self.dd.purge_old_models()
@@ -685,6 +701,32 @@ class IFreqaiModel(ABC):
return init_model
def _set_train_queue(self):
"""
Sets train queue from existing train timestamps if they exist
otherwise it sets the train queue based on the provided whitelist.
"""
current_pairlist = self.config.get("exchange", {}).get("pair_whitelist")
if not self.dd.pair_dict:
logger.info('Set fresh train queue from whitelist. '
f'Queue: {current_pairlist}')
return deque(current_pairlist)
best_queue = deque()
pair_dict_sorted = sorted(self.dd.pair_dict.items(),
key=lambda k: k[1]['trained_timestamp'])
for pair in pair_dict_sorted:
if pair[0] in current_pairlist:
best_queue.append(pair[0])
for pair in current_pairlist:
if pair not in best_queue:
best_queue.appendleft(pair)
logger.info('Set existing queue from trained timestamps. '
f'Best approximation queue: {best_queue}')
return best_queue
# Following methods which are overridden by user made prediction models.
# See freqai/prediction_models/CatboostPredictionModel.py for an example.

View File

@@ -1,19 +1,25 @@
import logging
from datetime import datetime, timezone
from typing import Any
import numpy as np
import pandas as pd
from freqtrade.configuration import TimeRange
from freqtrade.constants import Config
from freqtrade.data.dataprovider import DataProvider
from freqtrade.data.history.history_utils import refresh_backtest_ohlcv_data
from freqtrade.exceptions import OperationalException
from freqtrade.exchange import timeframe_to_seconds
from freqtrade.exchange.exchange import market_is_active
from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
from freqtrade.plugins.pairlist.pairlist_helpers import dynamic_expand_pairlist
logger = logging.getLogger(__name__)
def download_all_data_for_training(dp: DataProvider, config: dict) -> None:
def download_all_data_for_training(dp: DataProvider, config: Config) -> None:
"""
Called only once upon start of bot to download the necessary data for
populating indicators and training the model.
@@ -47,9 +53,7 @@ def download_all_data_for_training(dp: DataProvider, config: dict) -> None:
)
def get_required_data_timerange(
config: dict
) -> TimeRange:
def get_required_data_timerange(config: Config) -> TimeRange:
"""
Used to compute the required data download time range
for auto data-download in FreqAI
@@ -86,7 +90,7 @@ def get_required_data_timerange(
# Keep below for when we wish to download heterogeneously lengthed data for FreqAI.
# def download_all_data_for_training(dp: DataProvider, config: dict) -> None:
# def download_all_data_for_training(dp: DataProvider, config: Config) -> None:
# """
# Called only once upon start of bot to download the necessary data for
# populating indicators and training a FreqAI model.
@@ -132,3 +136,58 @@ def get_required_data_timerange(
# trading_mode=config.get("trading_mode", "spot"),
# prepend=config.get("prepend_data", False),
# )
def plot_feature_importance(model: Any, pair: str, dk: FreqaiDataKitchen,
count_max: int = 25) -> None:
"""
Plot Best and worst features by importance for a single sub-train.
:param model: Any = A model which was `fit` using a common library
such as catboost or lightgbm
:param pair: str = pair e.g. BTC/USD
:param dk: FreqaiDataKitchen = non-persistent data container for current coin/loop
:param count_max: int = the amount of features to be loaded per column
"""
from freqtrade.plot.plotting import go, make_subplots, store_plot_file
# Extract feature importance from model
models = {}
if 'FreqaiMultiOutputRegressor' in str(model.__class__):
for estimator, label in zip(model.estimators_, dk.label_list):
models[label] = estimator
else:
models[dk.label_list[0]] = model
for label in models:
mdl = models[label]
if "catboost.core" in str(mdl.__class__):
feature_importance = mdl.get_feature_importance()
elif "lightgbm.sklearn" or "xgb" in str(mdl.__class__):
feature_importance = mdl.feature_importances_
else:
logger.info('Model type not support for generating feature importances.')
return
# Data preparation
fi_df = pd.DataFrame({
"feature_names": np.array(dk.data_dictionary['train_features'].columns),
"feature_importance": np.array(feature_importance)
})
fi_df_top = fi_df.nlargest(count_max, "feature_importance")[::-1]
fi_df_worst = fi_df.nsmallest(count_max, "feature_importance")[::-1]
# Plotting
def add_feature_trace(fig, fi_df, col):
return fig.add_trace(
go.Bar(
x=fi_df["feature_importance"],
y=fi_df["feature_names"],
orientation='h', showlegend=False
), row=1, col=col
)
fig = make_subplots(rows=1, cols=2, horizontal_spacing=0.5)
fig = add_feature_trace(fig, fi_df_top, 1)
fig = add_feature_trace(fig, fi_df_worst, 2)
fig.update_layout(title_text=f"Best and worst features by importance {pair}")
label = label.replace('&', '').replace('%', '') # escape two FreqAI specific characters
store_plot_file(fig, f"{dk.model_filename}-{label}.html", dk.data_path)