diff --git a/config_examples/config_freqai.example.json b/config_examples/config_freqai.example.json
index 65a93379e..147567f17 100644
--- a/config_examples/config_freqai.example.json
+++ b/config_examples/config_freqai.example.json
@@ -73,7 +73,8 @@
                 10,
                 20
             ],
-            "plot_feature_importances": 0
+            "plot_feature_importances": 0,
+            "data_normalization": "legacy"
         },
         "data_split_parameters": {
             "test_size": 0.33,
diff --git a/freqtrade/freqai/data_drawer.py b/freqtrade/freqai/data_drawer.py
index 14986d854..4b44fd8aa 100644
--- a/freqtrade/freqai/data_drawer.py
+++ b/freqtrade/freqai/data_drawer.py
@@ -427,6 +427,9 @@ class FreqaiDataDrawer:
         with (save_path / f"{dk.model_filename}_metadata.json").open("w") as fp:
             rapidjson.dump(dk.data, fp, default=self.np_encoder, number_mode=rapidjson.NM_NATIVE)
 
+        with (save_path / f"{dk.model_filename}_metadata.pkl").open("wb") as fp:
+            cloudpickle.dump(dk.pkl_data, fp)
+
         return
 
     def save_data(self, model: Any, coin: str, dk: FreqaiDataKitchen) -> None:
@@ -456,10 +459,14 @@
         dk.data["model_filename"] = str(dk.model_filename)
         dk.data["training_features_list"] = dk.training_features_list
         dk.data["label_list"] = dk.label_list
-        # store the metadata
+        # store the json metadata
        with (save_path / f"{dk.model_filename}_metadata.json").open("w") as fp:
             rapidjson.dump(dk.data, fp, default=self.np_encoder, number_mode=rapidjson.NM_NATIVE)
 
+        # store the pickle metadata
+        with (save_path / f"{dk.model_filename}_metadata.pkl").open("wb") as fp:
+            cloudpickle.dump(dk.pkl_data, fp)
+
         # save the train data to file so we can check preds for area of applicability later
         dk.data_dictionary["train_features"].to_pickle(
             save_path / f"{dk.model_filename}_trained_df.pkl"
@@ -486,6 +493,16 @@
 
         return
 
+    def load_pickle_metadata(self, dk: FreqaiDataKitchen) -> None:
+        pickle_file_path = dk.data_path / f"{dk.model_filename}_metadata.pkl"
+        exists = pickle_file_path.is_file()
+        # Check if the metadata pickle file exists before attempting to read it.
+        # This is for backward compatibility with models generated before the
+        # pickle metadata feature was implemented.
+        if exists:
+            with pickle_file_path.open("rb") as fp:
+                dk.pkl_data = cloudpickle.load(fp)
+
     def load_metadata(self, dk: FreqaiDataKitchen) -> None:
         """
         Load only metadata into datakitchen to increase performance during
@@ -496,6 +513,8 @@
         dk.training_features_list = dk.data["training_features_list"]
         dk.label_list = dk.data["label_list"]
 
+        self.load_pickle_metadata(dk)
+
     def load_data(self, coin: str, dk: FreqaiDataKitchen) -> Any:
         """
         loads all data required to make a prediction on a sub-train time range
@@ -517,6 +536,8 @@
         with (dk.data_path / f"{dk.model_filename}_metadata.json").open("r") as fp:
             dk.data = rapidjson.load(fp, number_mode=rapidjson.NM_NATIVE)
 
+        self.load_pickle_metadata(dk)
+
         dk.data_dictionary["train_features"] = pd.read_pickle(
             dk.data_path / f"{dk.model_filename}_trained_df.pkl"
         )
diff --git a/freqtrade/freqai/data_kitchen.py b/freqtrade/freqai/data_kitchen.py
index 52d487b08..2cde31441 100644
--- a/freqtrade/freqai/data_kitchen.py
+++ b/freqtrade/freqai/data_kitchen.py
@@ -25,6 +25,7 @@
 from freqtrade.constants import Config
 from freqtrade.data.converter import reduce_dataframe_footprint
 from freqtrade.exceptions import OperationalException
 from freqtrade.exchange import timeframe_to_seconds
+from freqtrade.freqai.normalization import Normalization, normalization_factory
 from freqtrade.strategy import merge_informative_pair
 from freqtrade.strategy.interface import IStrategy
 
@@ -68,6 +69,7 @@
         pair: str = "",
     ):
         self.data: Dict[str, Any] = {}
+        self.pkl_data: Dict[str, Any] = {}
         self.data_dictionary: Dict[str, DataFrame] = {}
         self.config = config
         self.freqai_config: Dict[str, Any] = config["freqai"]
@@ -109,6 +111,8 @@
         self.unique_classes: Dict[str, list] = {}
         self.unique_class_list: list = []
         self.backtest_live_models_data: Dict[str, Any] = {}
+        self.normalizer: Normalization = normalization_factory(config, self.data, self.pkl_data,
+                                                               self.unique_class_list)
 
     def set_paths(
         self,
@@ -308,105 +312,16 @@
         return self.data_dictionary
 
     def normalize_data(self, data_dictionary: Dict) -> Dict[Any, Any]:
-        """
-        Normalize all data in the data_dictionary according to the training dataset
-        :param data_dictionary: dictionary containing the cleaned and
-        split training/test data/labels
-        :returns:
-        :data_dictionary: updated dictionary with standardized values.
- """ - - # standardize the data by training stats - train_max = data_dictionary["train_features"].max() - train_min = data_dictionary["train_features"].min() - data_dictionary["train_features"] = ( - 2 * (data_dictionary["train_features"] - train_min) / (train_max - train_min) - 1 - ) - data_dictionary["test_features"] = ( - 2 * (data_dictionary["test_features"] - train_min) / (train_max - train_min) - 1 - ) - - for item in train_max.keys(): - self.data[item + "_max"] = train_max[item] - self.data[item + "_min"] = train_min[item] - - for item in data_dictionary["train_labels"].keys(): - if data_dictionary["train_labels"][item].dtype == object: - continue - train_labels_max = data_dictionary["train_labels"][item].max() - train_labels_min = data_dictionary["train_labels"][item].min() - data_dictionary["train_labels"][item] = ( - 2 - * (data_dictionary["train_labels"][item] - train_labels_min) - / (train_labels_max - train_labels_min) - - 1 - ) - if self.freqai_config.get('data_split_parameters', {}).get('test_size', 0.1) != 0: - data_dictionary["test_labels"][item] = ( - 2 - * (data_dictionary["test_labels"][item] - train_labels_min) - / (train_labels_max - train_labels_min) - - 1 - ) - - self.data[f"{item}_max"] = train_labels_max - self.data[f"{item}_min"] = train_labels_min - return data_dictionary + return self.normalizer.normalize_data(data_dictionary) def normalize_single_dataframe(self, df: DataFrame) -> DataFrame: - - train_max = df.max() - train_min = df.min() - df = ( - 2 * (df - train_min) / (train_max - train_min) - 1 - ) - - for item in train_max.keys(): - self.data[item + "_max"] = train_max[item] - self.data[item + "_min"] = train_min[item] - - return df + return self.normalizer.normalize_single_dataframe(df) def normalize_data_from_metadata(self, df: DataFrame) -> DataFrame: - """ - Normalize a set of data using the mean and standard deviation from - the associated training data. - :param df: Dataframe to be standardized - """ - - train_max = [None] * len(df.keys()) - train_min = [None] * len(df.keys()) - - for i, item in enumerate(df.keys()): - train_max[i] = self.data[f"{item}_max"] - train_min[i] = self.data[f"{item}_min"] - - train_max_series = pd.Series(train_max, index=df.keys()) - train_min_series = pd.Series(train_min, index=df.keys()) - - df = ( - 2 * (df - train_min_series) / (train_max_series - train_min_series) - 1 - ) - - return df + return self.normalizer.normalize_data_from_metadata(df) def denormalize_labels_from_metadata(self, df: DataFrame) -> DataFrame: - """ - Denormalize a set of data using the mean and standard deviation from - the associated training data. 
-        :param df: Dataframe of predictions to be denormalized
-        """
-
-        for label in df.columns:
-            if df[label].dtype == object or label in self.unique_class_list:
-                continue
-            df[label] = (
-                (df[label] + 1)
-                * (self.data[f"{label}_max"] - self.data[f"{label}_min"])
-                / 2
-            ) + self.data[f"{label}_min"]
-
-        return df
+        return self.normalizer.denormalize_labels_from_metadata(df)
 
     def split_timerange(
         self, tr: str, train_split: int = 28, bt_split: float = 7
@@ -524,7 +439,7 @@
             columns=["PC" + str(i) for i in range(0, n_keep_components)],
             index=self.data_dictionary["train_features"].index,
         )
-        # normalsing transformed training features
+        # normalizing transformed training features
         self.data_dictionary["train_features"] = self.normalize_single_dataframe(
             self.data_dictionary["train_features"])
 
diff --git a/freqtrade/freqai/normalization.py b/freqtrade/freqai/normalization.py
new file mode 100644
index 000000000..641bfcdaa
--- /dev/null
+++ b/freqtrade/freqai/normalization.py
@@ -0,0 +1,272 @@
+from abc import ABC, abstractmethod
+from typing import Any, Dict, Type, TypeVar
+
+import pandas as pd
+from pandas import DataFrame
+from sklearn.preprocessing import MinMaxScaler, QuantileTransformer, StandardScaler
+
+from freqtrade.constants import Config
+from freqtrade.exceptions import OperationalException
+
+
+TransformerType = TypeVar('TransformerType', MinMaxScaler, StandardScaler, QuantileTransformer)
+
+
+def normalization_factory(
+    config: Config,
+    meta_data: Dict[str, Any],
+    pickle_meta_data: Dict[str, Any],
+    unique_class_list: list
+) -> 'Normalization':
+    """Return the Normalization subclass selected by the 'data_normalization' setting."""
+    freqai_config: Dict[str, Any] = config["freqai"]
+    norm_config_id = freqai_config["feature_parameters"].get("data_normalization", "legacy")
+    norm_id = norm_config_id.lower()
+    if norm_id == "legacy":
+        return LegacyNormalization(config, meta_data, pickle_meta_data, unique_class_list)
+    elif norm_id == "standard":
+        return StandardNormalization(config, meta_data, pickle_meta_data, unique_class_list)
+    elif norm_id == "minmax":
+        return MinMaxNormalization(config, meta_data, pickle_meta_data, unique_class_list)
+    elif norm_id == "quantile":
+        return QuantileNormalization(config, meta_data, pickle_meta_data, unique_class_list)
+    else:
+        raise OperationalException(f"Invalid data normalization identifier '{norm_config_id}'")
+
+
+class Normalization(ABC):
+    def __init__(
+        self,
+        config: Config,
+        meta_data: Dict[str, Any],
+        pickle_meta_data: Dict[str, Any],
+        unique_class_list: list
+    ):
+        self.freqai_config: Dict[str, Any] = config["freqai"]
+        self.data: Dict[str, Any] = meta_data
+        self.pkl_data: Dict[str, Any] = pickle_meta_data
+        self.unique_class_list: list = unique_class_list
+
+    @abstractmethod
+    def normalize_data(self, data_dictionary: Dict) -> Dict[Any, Any]:
+        """Normalize train/test features and labels in the data_dictionary."""
+
+    @abstractmethod
+    def normalize_single_dataframe(self, df: DataFrame) -> DataFrame:
+        """Fit the normalization on df and return the normalized dataframe."""
+
+    @abstractmethod
+    def normalize_data_from_metadata(self, df: DataFrame) -> DataFrame:
+        """Normalize df using the statistics saved during training."""
+
+    @abstractmethod
+    def denormalize_labels_from_metadata(self, df: DataFrame) -> DataFrame:
+        """Map predicted labels in df back to their original scale."""
+
+
+class LegacyNormalization(Normalization):
+    def normalize_data(self, data_dictionary: Dict) -> Dict[Any, Any]:
+        """
+        Normalize all data in the data_dictionary according to the training dataset
+        :param data_dictionary: dictionary containing the cleaned and
+        split training/test data/labels
+        :returns:
+        :data_dictionary: updated dictionary with normalized values.
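+
+        Note: the legacy scheme min-max scales every feature and label column
+        to the range [-1, 1] using the training extrema,
+        x' = 2 * (x - min) / (max - min) - 1,
+        and stores the per-column "<col>_max"/"<col>_min" values in self.data.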
+ """ + + # standardize the data by training stats + train_max = data_dictionary["train_features"].max() + train_min = data_dictionary["train_features"].min() + data_dictionary["train_features"] = ( + 2 * (data_dictionary["train_features"] - train_min) / (train_max - train_min) - 1 + ) + data_dictionary["test_features"] = ( + 2 * (data_dictionary["test_features"] - train_min) / (train_max - train_min) - 1 + ) + + for item in train_max.keys(): + self.data[item + "_max"] = train_max[item] + self.data[item + "_min"] = train_min[item] + + for item in data_dictionary["train_labels"].keys(): + if data_dictionary["train_labels"][item].dtype == object: + continue + train_labels_max = data_dictionary["train_labels"][item].max() + train_labels_min = data_dictionary["train_labels"][item].min() + data_dictionary["train_labels"][item] = ( + 2 + * (data_dictionary["train_labels"][item] - train_labels_min) + / (train_labels_max - train_labels_min) + - 1 + ) + if self.freqai_config.get('data_split_parameters', {}).get('test_size', 0.1) != 0: + data_dictionary["test_labels"][item] = ( + 2 + * (data_dictionary["test_labels"][item] - train_labels_min) + / (train_labels_max - train_labels_min) + - 1 + ) + + self.data[f"{item}_max"] = train_labels_max + self.data[f"{item}_min"] = train_labels_min + return data_dictionary + + def normalize_single_dataframe(self, df: DataFrame) -> DataFrame: + + train_max = df.max() + train_min = df.min() + df = ( + 2 * (df - train_min) / (train_max - train_min) - 1 + ) + + for item in train_max.keys(): + self.data[item + "_max"] = train_max[item] + self.data[item + "_min"] = train_min[item] + + return df + + def normalize_data_from_metadata(self, df: DataFrame) -> DataFrame: + """ + Normalize a set of data using the mean and standard deviation from + the associated training data. + :param df: Dataframe to be standardized + """ + + train_max = [None] * len(df.keys()) + train_min = [None] * len(df.keys()) + + for i, item in enumerate(df.keys()): + train_max[i] = self.data[f"{item}_max"] + train_min[i] = self.data[f"{item}_min"] + + train_max_series = pd.Series(train_max, index=df.keys()) + train_min_series = pd.Series(train_min, index=df.keys()) + + df = ( + 2 * (df - train_min_series) / (train_max_series - train_min_series) - 1 + ) + + return df + + def denormalize_labels_from_metadata(self, df: DataFrame) -> DataFrame: + """ + Denormalize a set of data using the mean and standard deviation from + the associated training data. + :param df: Dataframe of predictions to be denormalized + """ + + for label in df.columns: + if df[label].dtype == object or label in self.unique_class_list: + continue + df[label] = ( + (df[label] + 1) + * (self.data[f"{label}_max"] - self.data[f"{label}_min"]) + / 2 + ) + self.data[f"{label}_min"] + + return df + + +class SKLearnNormalization(Normalization): + def __init__(self, + config: Config, + meta_data: Dict[str, Any], + pickle_meta_data: Dict[str, Any], + unique_class_list: list, + transformer: TransformerType): + super().__init__(config, meta_data, pickle_meta_data, unique_class_list) + self.transformer = transformer + + def normalize_data(self, data_dictionary: Dict) -> Dict[Any, Any]: + """ + Normalize all data in the data_dictionary according to the training dataset + :param data_dictionary: dictionary containing the cleaned and + split training/test data/labels + :returns: + :data_dictionary: updated dictionary with standardized values. 
+ """ + + # standardize the data by training stats + for column in data_dictionary["train_features"].columns: + scaler = self.transformer() + data_dictionary["train_features"][column] = \ + scaler.fit_transform(data_dictionary["train_features"][[column]]) + data_dictionary["test_features"][column] = \ + scaler.transform(data_dictionary["test_features"][[column]]) + self.pkl_data[column + "_scaler"] = scaler + + for column in data_dictionary["train_labels"].columns: + if data_dictionary["train_labels"][column].dtype == object: + continue + scaler = self.transformer() + data_dictionary["train_labels"][column] = \ + scaler.fit_transform(data_dictionary["train_labels"][[column]]) + + if self.freqai_config.get('data_split_parameters', {}).get('test_size', 0.1) != 0: + data_dictionary["test_labels"][column] = \ + scaler.transform(data_dictionary["test_labels"][[column]]) + + self.pkl_data[column + "_scaler"] = scaler + return data_dictionary + + def normalize_single_dataframe(self, df: DataFrame) -> DataFrame: + for column in df.columns: + scaler = self.transformer() + df[column] = scaler.fit_transform(df[[column]]) + self.pkl_data[column + "_scaler"] = scaler + + return df + + def normalize_data_from_metadata(self, df: DataFrame) -> DataFrame: + """ + Normalize a set of data using the mean and standard deviation from + the associated training data. + :param df: Dataframe to be standardized + """ + + for column in df.columns: + df[column] = self.pkl_data[column + "_scaler"].transform(df[[column]]) + + return df + + def denormalize_labels_from_metadata(self, df: DataFrame) -> DataFrame: + """ + Denormalize a set of data using the mean and standard deviation from + the associated training data. + :param df: Dataframe of predictions to be denormalized + """ + + for column in df.columns: + if df[column].dtype == object or column in self.unique_class_list: + continue + df[column] = self.pkl_data[column + "_scaler"].inverse_transform(df[[column]]) + + return df + + +class StandardNormalization(SKLearnNormalization): + def __init__(self, + config: Config, + meta_data: Dict[str, Any], + pickle_meta_data: Dict[str, Any], + unique_class_list: list): + super().__init__(config, meta_data, pickle_meta_data, unique_class_list, StandardScaler) + + +class MinMaxNormalization(SKLearnNormalization): + def __init__(self, + config: Config, + meta_data: Dict[str, Any], + pickle_meta_data: Dict[str, Any], + unique_class_list: list): + super().__init__(config, meta_data, pickle_meta_data, unique_class_list, MinMaxScaler) + + +class QuantileNormalization(SKLearnNormalization): + def __init__(self, + config: Config, + meta_data: Dict[str, Any], + pickle_meta_data: Dict[str, Any], + unique_class_list: list): + super().__init__(config, meta_data, pickle_meta_data, unique_class_list, + QuantileTransformer) + + diff --git a/tests/freqai/conftest.py b/tests/freqai/conftest.py index e140ee80b..d351ee5fb 100644 --- a/tests/freqai/conftest.py +++ b/tests/freqai/conftest.py @@ -142,7 +142,7 @@ def make_unfiltered_dataframe(mocker, freqai_conf): return freqai, unfiltered_dataframe -def make_data_dictionary(mocker, freqai_conf): +def make_data_dictionary(mocker, freqai_conf, normalized=True): freqai_conf.update({"timerange": "20180110-20180130"}) strategy = get_patched_freqai_strategy(mocker, freqai_conf) @@ -181,7 +181,8 @@ def make_data_dictionary(mocker, freqai_conf): data_dictionary = freqai.dk.make_train_test_datasets(features_filtered, labels_filtered) - data_dictionary = freqai.dk.normalize_data(data_dictionary) + if 
normalized: + data_dictionary = freqai.dk.normalize_data(data_dictionary) return freqai diff --git a/tests/freqai/test_normalization.py b/tests/freqai/test_normalization.py new file mode 100644 index 000000000..36294215e --- /dev/null +++ b/tests/freqai/test_normalization.py @@ -0,0 +1,107 @@ +import pytest + +from freqtrade.exceptions import OperationalException +from freqtrade.freqai.normalization import (LegacyNormalization, MinMaxNormalization, + QuantileNormalization, StandardNormalization) +from tests.freqai.conftest import make_data_dictionary + + +def test_default_normalization_is_legacy(mocker, freqai_conf): + freqai_1st = make_data_dictionary(mocker, freqai_conf, normalized=False) + data_dict_1st = freqai_1st.dk.data_dictionary + freqai_1st.dk.normalize_data(data_dict_1st) + + freqai_conf["freqai"]["feature_parameters"]["data_normalization"] = "legacy" + freqai_2nd = make_data_dictionary(mocker, freqai_conf, normalized=False) + data_dict_2nd = freqai_2nd.dk.data_dictionary + + assert not freqai_1st.dk.data_dictionary['train_features'].equals( + freqai_2nd.dk.data_dictionary['train_features']), "raw data is equal to normalized data" + + freqai_2nd.dk.normalize_data(data_dict_2nd) + + assert freqai_1st.dk.data_dictionary['train_features'].equals( + freqai_2nd.dk.data_dictionary['train_features']), \ + "explicit\\implicit legacy normalization mismatch" + + +def test_legacy_normalization_add_max_min_columns(mocker, freqai_conf): + freqai_conf["freqai"]["feature_parameters"]["data_normalization"] = "legacy" + freqai = make_data_dictionary(mocker, freqai_conf, normalized=False) + data_dict = freqai.dk.data_dictionary + freqai.dk.normalize_data(data_dict) + + assert any('_max' in entry for entry in freqai.dk.data.keys()) + assert any('_min' in entry for entry in freqai.dk.data.keys()) + + +def test_standard_normalization_dont_add_max_min_columns(mocker, freqai_conf): + freqai_conf["freqai"]["feature_parameters"]["data_normalization"] = "standard" + freqai = make_data_dictionary(mocker, freqai_conf, normalized=False) + data_dict = freqai.dk.data_dictionary + freqai.dk.normalize_data(data_dict) + assert all(not entry.endswith('_max') for entry in freqai.dk.data.keys()) + assert all(not entry.endswith('_min') for entry in freqai.dk.data.keys()) + + +def test_legacy_and_standard_normalization_difference(mocker, freqai_conf): + freqai_conf["freqai"]["feature_parameters"]["data_normalization"] = "legacy" + freqai_1st = make_data_dictionary(mocker, freqai_conf, normalized=False) + data_dict_1st = freqai_1st.dk.data_dictionary + freqai_1st.dk.normalize_data(data_dict_1st) + + freqai_conf["freqai"]["feature_parameters"]["data_normalization"] = "standard" + freqai_2nd = make_data_dictionary(mocker, freqai_conf, normalized=False) + data_dict_2nd = freqai_2nd.dk.data_dictionary + freqai_2nd.dk.normalize_data(data_dict_2nd) + + assert not freqai_1st.dk.data_dictionary['train_features'].equals( + freqai_2nd.dk.data_dictionary['train_features']), \ + "legacy and standard normalization produce same features" + + +@pytest.mark.parametrize( + "config_id, norm_class", + [ + ("legacy", LegacyNormalization), + ("standard", StandardNormalization), + ("minmax", MinMaxNormalization), + ("quantile", QuantileNormalization), + ], +) +def test_normalization_class(config_id, norm_class, mocker, freqai_conf): + freqai_conf["freqai"]["feature_parameters"]["data_normalization"] = config_id + freqai = make_data_dictionary(mocker, freqai_conf) + assert type(freqai.dk.normalizer) == norm_class + + +def 
+def test_invalid_normalization_id(mocker, freqai_conf):
+    freqai_conf["freqai"]["feature_parameters"]["data_normalization"] = "not_a_norm_id"
+    with pytest.raises(OperationalException, match="Invalid data normalization identifier"):
+        make_data_dictionary(mocker, freqai_conf)
+
+
+@pytest.mark.parametrize(
+    "config_id",
+    [
+        "legacy",
+        "standard",
+        "minmax",
+        "quantile",
+    ],
+)
+def test_denormalization(config_id, mocker, freqai_conf):
+    freqai_conf["freqai"]["feature_parameters"]["data_normalization"] = config_id
+    freqai_1st = make_data_dictionary(mocker, freqai_conf)
+    data_dict_1st = freqai_1st.dk.data_dictionary
+
+    freqai_2nd = make_data_dictionary(mocker, freqai_conf, normalized=False)
+    data_dict_2nd = freqai_2nd.dk.data_dictionary
+
+    denorm_labels = freqai_1st.dk.denormalize_labels_from_metadata(
+        data_dict_1st["train_labels"]).round(9)
+    assert denorm_labels.equals(data_dict_2nd['train_labels'].round(9)), \
+        "raw labels data isn't the same as denormalized labels"
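+
+
+# Minimal usage sketch (illustrative only, not an automated test): the active
+# normalizer is selected purely from the config, so switching schemes is a
+# one-line change:
+#
+#     freqai_conf["freqai"]["feature_parameters"]["data_normalization"] = "quantile"
+#     freqai = make_data_dictionary(mocker, freqai_conf)
+#     # map the normalized train labels back to their original scale
+#     raw_labels = freqai.dk.denormalize_labels_from_metadata(
+#         freqai.dk.data_dictionary["train_labels"])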