Add additional data normalization methods to freqai module, including StandardScaler, MinMaxScaler, and QuantileTransformer. Add support for pickle metadata, normalization_factory, and unit tests.

This commit is contained in:
Zohar Kol 2023-03-29 17:16:20 +03:00
parent 8a49d62068
commit 4aa9284737
6 changed files with 415 additions and 98 deletions

View File

@ -73,7 +73,8 @@
10, 10,
20 20
], ],
"plot_feature_importances": 0 "plot_feature_importances": 0,
"data_normalization": "legacy"
}, },
"data_split_parameters": { "data_split_parameters": {
"test_size": 0.33, "test_size": 0.33,

View File

@ -427,6 +427,9 @@ class FreqaiDataDrawer:
with (save_path / f"{dk.model_filename}_metadata.json").open("w") as fp: with (save_path / f"{dk.model_filename}_metadata.json").open("w") as fp:
rapidjson.dump(dk.data, fp, default=self.np_encoder, number_mode=rapidjson.NM_NATIVE) rapidjson.dump(dk.data, fp, default=self.np_encoder, number_mode=rapidjson.NM_NATIVE)
with (save_path / f"{dk.model_filename}_metadata.pkl").open("wb") as fp:
cloudpickle.dump(dk.pkl_data, fp)
return return
def save_data(self, model: Any, coin: str, dk: FreqaiDataKitchen) -> None: def save_data(self, model: Any, coin: str, dk: FreqaiDataKitchen) -> None:
@ -456,10 +459,14 @@ class FreqaiDataDrawer:
dk.data["model_filename"] = str(dk.model_filename) dk.data["model_filename"] = str(dk.model_filename)
dk.data["training_features_list"] = dk.training_features_list dk.data["training_features_list"] = dk.training_features_list
dk.data["label_list"] = dk.label_list dk.data["label_list"] = dk.label_list
# store the metadata # store the json metadata
with (save_path / f"{dk.model_filename}_metadata.json").open("w") as fp: with (save_path / f"{dk.model_filename}_metadata.json").open("w") as fp:
rapidjson.dump(dk.data, fp, default=self.np_encoder, number_mode=rapidjson.NM_NATIVE) rapidjson.dump(dk.data, fp, default=self.np_encoder, number_mode=rapidjson.NM_NATIVE)
# store the pickle metadata
with (save_path / f"{dk.model_filename}_metadata.pkl").open("wb") as fp:
cloudpickle.dump(dk.pkl_data, fp)
# save the train data to file so we can check preds for area of applicability later # save the train data to file so we can check preds for area of applicability later
dk.data_dictionary["train_features"].to_pickle( dk.data_dictionary["train_features"].to_pickle(
save_path / f"{dk.model_filename}_trained_df.pkl" save_path / f"{dk.model_filename}_trained_df.pkl"
@ -486,6 +493,16 @@ class FreqaiDataDrawer:
return return
def load_pickle_metadata(self, dk: FreqaiDataKitchen) -> None:
    """
    Load the pickled metadata stored next to a saved model into the datakitchen.

    The file ``<model_filename>_metadata.pkl`` is looked up in ``dk.data_path``.
    If it does not exist, ``dk.pkl_data`` is left untouched.

    :param dk: datakitchen whose ``pkl_data`` dictionary receives the loaded entries
    """
    pickle_file_path = dk.data_path / f"{dk.model_filename}_metadata.pkl"
    # Models generated before the pickle metadata feature was implemented have
    # no such file; skip loading for backward compatibility.
    if not pickle_file_path.is_file():
        return

    # NOTE: unpickling executes arbitrary code; this file must only ever come
    # from a trusted, locally written model directory.
    with pickle_file_path.open("rb") as fp:
        loaded = cloudpickle.load(fp)

    # Refill the existing dictionary instead of rebinding dk.pkl_data: other
    # objects that were handed this dictionary at construction time (e.g. the
    # datakitchen's normalizer) keep a reference to it and would otherwise
    # never see the loaded entries.
    dk.pkl_data.clear()
    dk.pkl_data.update(loaded)
def load_metadata(self, dk: FreqaiDataKitchen) -> None: def load_metadata(self, dk: FreqaiDataKitchen) -> None:
""" """
Load only metadata into datakitchen to increase performance during Load only metadata into datakitchen to increase performance during
@ -496,6 +513,8 @@ class FreqaiDataDrawer:
dk.training_features_list = dk.data["training_features_list"] dk.training_features_list = dk.data["training_features_list"]
dk.label_list = dk.data["label_list"] dk.label_list = dk.data["label_list"]
self.load_pickle_metadata(dk)
def load_data(self, coin: str, dk: FreqaiDataKitchen) -> Any: def load_data(self, coin: str, dk: FreqaiDataKitchen) -> Any:
""" """
loads all data required to make a prediction on a sub-train time range loads all data required to make a prediction on a sub-train time range
@ -517,6 +536,8 @@ class FreqaiDataDrawer:
with (dk.data_path / f"{dk.model_filename}_metadata.json").open("r") as fp: with (dk.data_path / f"{dk.model_filename}_metadata.json").open("r") as fp:
dk.data = rapidjson.load(fp, number_mode=rapidjson.NM_NATIVE) dk.data = rapidjson.load(fp, number_mode=rapidjson.NM_NATIVE)
self.load_pickle_metadata(dk)
dk.data_dictionary["train_features"] = pd.read_pickle( dk.data_dictionary["train_features"] = pd.read_pickle(
dk.data_path / f"{dk.model_filename}_trained_df.pkl" dk.data_path / f"{dk.model_filename}_trained_df.pkl"
) )

View File

@ -25,6 +25,7 @@ from freqtrade.constants import Config
from freqtrade.data.converter import reduce_dataframe_footprint from freqtrade.data.converter import reduce_dataframe_footprint
from freqtrade.exceptions import OperationalException from freqtrade.exceptions import OperationalException
from freqtrade.exchange import timeframe_to_seconds from freqtrade.exchange import timeframe_to_seconds
from freqtrade.freqai.normalization import Normalization, normalization_factory
from freqtrade.strategy import merge_informative_pair from freqtrade.strategy import merge_informative_pair
from freqtrade.strategy.interface import IStrategy from freqtrade.strategy.interface import IStrategy
@ -68,6 +69,7 @@ class FreqaiDataKitchen:
pair: str = "", pair: str = "",
): ):
self.data: Dict[str, Any] = {} self.data: Dict[str, Any] = {}
self.pkl_data: Dict[str, Any] = {}
self.data_dictionary: Dict[str, DataFrame] = {} self.data_dictionary: Dict[str, DataFrame] = {}
self.config = config self.config = config
self.freqai_config: Dict[str, Any] = config["freqai"] self.freqai_config: Dict[str, Any] = config["freqai"]
@ -109,6 +111,8 @@ class FreqaiDataKitchen:
self.unique_classes: Dict[str, list] = {} self.unique_classes: Dict[str, list] = {}
self.unique_class_list: list = [] self.unique_class_list: list = []
self.backtest_live_models_data: Dict[str, Any] = {} self.backtest_live_models_data: Dict[str, Any] = {}
self.normalizer: Normalization = normalization_factory(config, self.data, self.pkl_data,
self.unique_class_list)
def set_paths( def set_paths(
self, self,
@ -308,105 +312,16 @@ class FreqaiDataKitchen:
return self.data_dictionary return self.data_dictionary
def normalize_data(self, data_dictionary: Dict) -> Dict[Any, Any]: def normalize_data(self, data_dictionary: Dict) -> Dict[Any, Any]:
""" return self.normalizer.normalize_data(data_dictionary)
Normalize all data in the data_dictionary according to the training dataset
:param data_dictionary: dictionary containing the cleaned and
split training/test data/labels
:returns:
:data_dictionary: updated dictionary with standardized values.
"""
# standardize the data by training stats
train_max = data_dictionary["train_features"].max()
train_min = data_dictionary["train_features"].min()
data_dictionary["train_features"] = (
2 * (data_dictionary["train_features"] - train_min) / (train_max - train_min) - 1
)
data_dictionary["test_features"] = (
2 * (data_dictionary["test_features"] - train_min) / (train_max - train_min) - 1
)
for item in train_max.keys():
self.data[item + "_max"] = train_max[item]
self.data[item + "_min"] = train_min[item]
for item in data_dictionary["train_labels"].keys():
if data_dictionary["train_labels"][item].dtype == object:
continue
train_labels_max = data_dictionary["train_labels"][item].max()
train_labels_min = data_dictionary["train_labels"][item].min()
data_dictionary["train_labels"][item] = (
2
* (data_dictionary["train_labels"][item] - train_labels_min)
/ (train_labels_max - train_labels_min)
- 1
)
if self.freqai_config.get('data_split_parameters', {}).get('test_size', 0.1) != 0:
data_dictionary["test_labels"][item] = (
2
* (data_dictionary["test_labels"][item] - train_labels_min)
/ (train_labels_max - train_labels_min)
- 1
)
self.data[f"{item}_max"] = train_labels_max
self.data[f"{item}_min"] = train_labels_min
return data_dictionary
def normalize_single_dataframe(self, df: DataFrame) -> DataFrame: def normalize_single_dataframe(self, df: DataFrame) -> DataFrame:
return self.normalizer.normalize_single_dataframe(df)
train_max = df.max()
train_min = df.min()
df = (
2 * (df - train_min) / (train_max - train_min) - 1
)
for item in train_max.keys():
self.data[item + "_max"] = train_max[item]
self.data[item + "_min"] = train_min[item]
return df
def normalize_data_from_metadata(self, df: DataFrame) -> DataFrame: def normalize_data_from_metadata(self, df: DataFrame) -> DataFrame:
""" return self.normalizer.normalize_data_from_metadata(df)
Normalize a set of data using the mean and standard deviation from
the associated training data.
:param df: Dataframe to be standardized
"""
train_max = [None] * len(df.keys())
train_min = [None] * len(df.keys())
for i, item in enumerate(df.keys()):
train_max[i] = self.data[f"{item}_max"]
train_min[i] = self.data[f"{item}_min"]
train_max_series = pd.Series(train_max, index=df.keys())
train_min_series = pd.Series(train_min, index=df.keys())
df = (
2 * (df - train_min_series) / (train_max_series - train_min_series) - 1
)
return df
def denormalize_labels_from_metadata(self, df: DataFrame) -> DataFrame: def denormalize_labels_from_metadata(self, df: DataFrame) -> DataFrame:
""" return self.normalizer.denormalize_labels_from_metadata(df)
Denormalize a set of data using the mean and standard deviation from
the associated training data.
:param df: Dataframe of predictions to be denormalized
"""
for label in df.columns:
if df[label].dtype == object or label in self.unique_class_list:
continue
df[label] = (
(df[label] + 1)
* (self.data[f"{label}_max"] - self.data[f"{label}_min"])
/ 2
) + self.data[f"{label}_min"]
return df
def split_timerange( def split_timerange(
self, tr: str, train_split: int = 28, bt_split: float = 7 self, tr: str, train_split: int = 28, bt_split: float = 7
@ -524,7 +439,7 @@ class FreqaiDataKitchen:
columns=["PC" + str(i) for i in range(0, n_keep_components)], columns=["PC" + str(i) for i in range(0, n_keep_components)],
index=self.data_dictionary["train_features"].index, index=self.data_dictionary["train_features"].index,
) )
# normalsing transformed training features # normalizing transformed training features
self.data_dictionary["train_features"] = self.normalize_single_dataframe( self.data_dictionary["train_features"] = self.normalize_single_dataframe(
self.data_dictionary["train_features"]) self.data_dictionary["train_features"])

View File

@ -0,0 +1,272 @@
from abc import ABC, abstractmethod
from typing import Any, Dict, Type, TypeVar

import pandas as pd
from pandas import DataFrame
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer, StandardScaler

from freqtrade.constants import Config
from freqtrade.exceptions import OperationalException
TransformerType = TypeVar('TransformerType', MinMaxScaler, StandardScaler, QuantileTransformer)
def normalization_factory(
    config: Config,
    meta_data: Dict[str, Any],
    pickle_meta_data: Dict[str, Any],
    unique_class_list: list
) -> "Normalization":
    """
    Build the normalization strategy selected in the freqai configuration.

    The strategy is chosen by ``config["freqai"]["feature_parameters"]["data_normalization"]``
    (case-insensitive). When the key is absent, "legacy" is used.

    :param config: full configuration; must contain ``freqai.feature_parameters``
    :param meta_data: metadata dictionary handed to the normalizer
    :param pickle_meta_data: pickle metadata dictionary handed to the normalizer
    :param unique_class_list: list of class labels handed to the normalizer
    :returns: an instance of the selected Normalization subclass
    :raises OperationalException: if the configured identifier is not recognised
    """
    freqai_config: Dict[str, Any] = config["freqai"]
    norm_config_id = freqai_config["feature_parameters"].get("data_normalization", "legacy")

    # Resolved at call time, so the classes may be defined further down the module.
    normalizers = {
        "legacy": LegacyNormalization,
        "standard": StandardNormalization,
        "minmax": MinMaxNormalization,
        "quantile": QuantileNormalization,
    }

    # str() so that a non-string value (e.g. null in the config file) is reported
    # as an invalid identifier instead of failing with an AttributeError.
    normalizer_class = normalizers.get(str(norm_config_id).lower())
    if normalizer_class is None:
        raise OperationalException(f"Invalid data normalization identifier '{norm_config_id}'")

    return normalizer_class(config, meta_data, pickle_meta_data, unique_class_list)
class Normalization(ABC):
    """
    Abstract interface shared by all data normalization strategies.

    The dictionaries and the class list passed to the constructor are stored
    by reference (not copied).
    """

    def __init__(
        self,
        config: Config,
        meta_data: Dict[str, Any],
        pickle_meta_data: Dict[str, Any],
        unique_class_list: list
    ):
        """
        :param config: configuration; only its "freqai" section is kept
        :param meta_data: metadata dictionary, exposed as ``self.data``
        :param pickle_meta_data: pickle metadata dictionary, exposed as ``self.pkl_data``
        :param unique_class_list: list of class labels, exposed as ``self.unique_class_list``
        """
        self.unique_class_list: list = unique_class_list
        self.pkl_data: Dict[str, Any] = pickle_meta_data
        self.data: Dict[str, Any] = meta_data
        self.freqai_config: Dict[str, Any] = config["freqai"]

    @abstractmethod
    def normalize_data(self, data_dictionary: Dict) -> Dict[Any, Any]:
        """
        Normalize the contents of a data dictionary.
        :param data_dictionary: dictionary holding the data to normalize
        :returns: the data dictionary
        """

    @abstractmethod
    def normalize_single_dataframe(self, df: DataFrame) -> DataFrame:
        """
        Normalize a single dataframe.
        :param df: dataframe to normalize
        :returns: the normalized dataframe
        """

    @abstractmethod
    def normalize_data_from_metadata(self, df: DataFrame) -> DataFrame:
        """
        Normalize a dataframe using previously stored metadata.
        :param df: dataframe to normalize
        :returns: the normalized dataframe
        """

    @abstractmethod
    def denormalize_labels_from_metadata(self, df: DataFrame) -> DataFrame:
        """
        Undo the normalization of label columns using previously stored metadata.
        :param df: dataframe to denormalize
        :returns: the denormalized dataframe
        """
class LegacyNormalization(Normalization):
    """
    Min-max normalization that maps each column linearly so that the training
    minimum becomes -1 and the training maximum becomes 1.

    The per-column bounds are stored in ``self.data`` under the keys
    ``"<column>_max"`` and ``"<column>_min"``.
    """

    @staticmethod
    def _rescale(values, lower, upper):
        """Map ``values`` linearly so that ``lower`` -> -1 and ``upper`` -> 1."""
        return 2 * (values - lower) / (upper - lower) - 1

    def _store_bounds(self, lower, upper) -> None:
        """Record the per-column bounds of a feature set in ``self.data``."""
        for item in upper.keys():
            self.data[item + "_max"] = upper[item]
            self.data[item + "_min"] = lower[item]

    def normalize_data(self, data_dictionary: Dict) -> Dict[Any, Any]:
        """
        Normalize all data in the data_dictionary according to the training dataset
        :param data_dictionary: dictionary containing the cleaned and
                                split training/test data/labels
        :returns:
        :data_dictionary: updated dictionary with normalized values.
        """
        # features: scale train and test by the min/max of the training set
        train_features = data_dictionary["train_features"]
        train_max = train_features.max()
        train_min = train_features.min()
        data_dictionary["train_features"] = self._rescale(train_features, train_min, train_max)
        data_dictionary["test_features"] = self._rescale(
            data_dictionary["test_features"], train_min, train_max
        )
        self._store_bounds(train_min, train_max)

        # test labels are only rescaled when a test split was requested
        has_test_split = (
            self.freqai_config.get('data_split_parameters', {}).get('test_size', 0.1) != 0
        )

        train_labels = data_dictionary["train_labels"]
        for item in train_labels.keys():
            # object-typed label columns are left untouched
            if train_labels[item].dtype == object:
                continue
            train_labels_max = train_labels[item].max()
            train_labels_min = train_labels[item].min()
            train_labels[item] = self._rescale(
                train_labels[item], train_labels_min, train_labels_max
            )
            if has_test_split:
                data_dictionary["test_labels"][item] = self._rescale(
                    data_dictionary["test_labels"][item], train_labels_min, train_labels_max
                )

            self.data[f"{item}_max"] = train_labels_max
            self.data[f"{item}_min"] = train_labels_min

        return data_dictionary

    def normalize_single_dataframe(self, df: DataFrame) -> DataFrame:
        """
        Normalize a dataframe by its own column-wise min/max and store those
        bounds in ``self.data``.
        :param df: Dataframe to be normalized
        :returns: new dataframe with the normalized values
        """
        train_max = df.max()
        train_min = df.min()
        normalized = self._rescale(df, train_min, train_max)
        self._store_bounds(train_min, train_max)
        return normalized

    def normalize_data_from_metadata(self, df: DataFrame) -> DataFrame:
        """
        Normalize a set of data using the min and max stored in ``self.data``
        for each column of the dataframe.
        :param df: Dataframe to be normalized
        :returns: new dataframe with the normalized values
        :raises KeyError: if no bounds are stored for one of the columns
        """
        columns = df.keys()
        train_max_series = pd.Series(
            [self.data[f"{item}_max"] for item in columns], index=columns
        )
        train_min_series = pd.Series(
            [self.data[f"{item}_min"] for item in columns], index=columns
        )
        return self._rescale(df, train_min_series, train_max_series)

    def denormalize_labels_from_metadata(self, df: DataFrame) -> DataFrame:
        """
        Denormalize a set of data using the min and max stored in ``self.data``
        for each label. The dataframe is modified in place.
        :param df: Dataframe of predictions to be denormalized
        :returns: the same dataframe with denormalized label columns
        """
        for label in df.columns:
            # object-typed columns and class labels were never normalized
            if df[label].dtype == object or label in self.unique_class_list:
                continue
            df[label] = (
                (df[label] + 1)
                * (self.data[f"{label}_max"] - self.data[f"{label}_min"])
                / 2
            ) + self.data[f"{label}_min"]

        return df
class SKLearnNormalization(Normalization):
    """
    Normalization backed by a scikit-learn transformer class.

    A separate transformer instance is fitted for every column and stored in
    ``self.pkl_data`` under the key ``"<column>_scaler"``.
    """

    def __init__(self,
                 config: Config,
                 meta_data: Dict[str, Any],
                 pickle_meta_data: Dict[str, Any],
                 unique_class_list: list,
                 transformer: Type[TransformerType]):
        """
        :param transformer: transformer *class* (not an instance); it is called
                            without arguments to create one scaler per column
        """
        super().__init__(config, meta_data, pickle_meta_data, unique_class_list)
        self.transformer = transformer

    def normalize_data(self, data_dictionary: Dict) -> Dict[Any, Any]:
        """
        Normalize all data in the data_dictionary according to the training dataset
        :param data_dictionary: dictionary containing the cleaned and
                                split training/test data/labels
        :returns:
        :data_dictionary: updated dictionary with normalized values.
        """
        # Test data is only transformed when a test split was requested. The
        # same condition guards features and labels: a fitted scikit-learn
        # transformer cannot be applied to an empty test set.
        has_test_split = (
            self.freqai_config.get('data_split_parameters', {}).get('test_size', 0.1) != 0
        )

        train_features = data_dictionary["train_features"]
        for column in train_features.columns:
            scaler = self.transformer()
            train_features[column] = scaler.fit_transform(train_features[[column]])
            if has_test_split:
                data_dictionary["test_features"][column] = \
                    scaler.transform(data_dictionary["test_features"][[column]])
            self.pkl_data[column + "_scaler"] = scaler

        train_labels = data_dictionary["train_labels"]
        for column in train_labels.columns:
            # object-typed label columns are left untouched
            if train_labels[column].dtype == object:
                continue
            scaler = self.transformer()
            train_labels[column] = scaler.fit_transform(train_labels[[column]])
            if has_test_split:
                data_dictionary["test_labels"][column] = \
                    scaler.transform(data_dictionary["test_labels"][[column]])

            self.pkl_data[column + "_scaler"] = scaler

        return data_dictionary

    def normalize_single_dataframe(self, df: DataFrame) -> DataFrame:
        """
        Fit one scaler per column of the dataframe, normalize the column in
        place and store the scaler in ``self.pkl_data``.
        :param df: Dataframe to be normalized
        :returns: the same dataframe with normalized columns
        """
        for column in df.columns:
            scaler = self.transformer()
            df[column] = scaler.fit_transform(df[[column]])
            self.pkl_data[column + "_scaler"] = scaler

        return df

    def normalize_data_from_metadata(self, df: DataFrame) -> DataFrame:
        """
        Normalize a set of data using the per-column scalers stored in
        ``self.pkl_data``. The dataframe is modified in place.
        :param df: Dataframe to be normalized
        :returns: the same dataframe with normalized columns
        :raises KeyError: if no scaler is stored for one of the columns
        """
        for column in df.columns:
            df[column] = self.pkl_data[column + "_scaler"].transform(df[[column]])

        return df

    def denormalize_labels_from_metadata(self, df: DataFrame) -> DataFrame:
        """
        Denormalize a set of data using the per-column scalers stored in
        ``self.pkl_data``. The dataframe is modified in place.
        :param df: Dataframe of predictions to be denormalized
        :returns: the same dataframe with denormalized label columns
        """
        for column in df.columns:
            # object-typed columns and class labels were never normalized
            if df[column].dtype == object or column in self.unique_class_list:
                continue
            df[column] = self.pkl_data[column + "_scaler"].inverse_transform(df[[column]])

        return df
class StandardNormalization(SKLearnNormalization):
    """Normalization strategy that uses scikit-learn's ``StandardScaler`` per column."""

    def __init__(self,
                 config: Config,
                 meta_data: Dict[str, Any],
                 pickle_meta_data: Dict[str, Any],
                 unique_class_list: list):
        super().__init__(
            config=config,
            meta_data=meta_data,
            pickle_meta_data=pickle_meta_data,
            unique_class_list=unique_class_list,
            transformer=StandardScaler,
        )
class MinMaxNormalization(SKLearnNormalization):
    """Normalization strategy that uses scikit-learn's ``MinMaxScaler`` per column."""

    def __init__(self,
                 config: Config,
                 meta_data: Dict[str, Any],
                 pickle_meta_data: Dict[str, Any],
                 unique_class_list: list):
        super().__init__(
            config=config,
            meta_data=meta_data,
            pickle_meta_data=pickle_meta_data,
            unique_class_list=unique_class_list,
            transformer=MinMaxScaler,
        )
class QuantileNormalization(SKLearnNormalization):
    """Normalization strategy that uses scikit-learn's ``QuantileTransformer`` per column."""

    def __init__(self,
                 config: Config,
                 meta_data: Dict[str, Any],
                 pickle_meta_data: Dict[str, Any],
                 unique_class_list: list):
        super().__init__(
            config=config,
            meta_data=meta_data,
            pickle_meta_data=pickle_meta_data,
            unique_class_list=unique_class_list,
            transformer=QuantileTransformer,
        )

View File

@ -142,7 +142,7 @@ def make_unfiltered_dataframe(mocker, freqai_conf):
return freqai, unfiltered_dataframe return freqai, unfiltered_dataframe
def make_data_dictionary(mocker, freqai_conf): def make_data_dictionary(mocker, freqai_conf, normalized=True):
freqai_conf.update({"timerange": "20180110-20180130"}) freqai_conf.update({"timerange": "20180110-20180130"})
strategy = get_patched_freqai_strategy(mocker, freqai_conf) strategy = get_patched_freqai_strategy(mocker, freqai_conf)
@ -181,7 +181,8 @@ def make_data_dictionary(mocker, freqai_conf):
data_dictionary = freqai.dk.make_train_test_datasets(features_filtered, labels_filtered) data_dictionary = freqai.dk.make_train_test_datasets(features_filtered, labels_filtered)
data_dictionary = freqai.dk.normalize_data(data_dictionary) if normalized:
data_dictionary = freqai.dk.normalize_data(data_dictionary)
return freqai return freqai

View File

@ -0,0 +1,107 @@
import pytest
from freqtrade.exceptions import OperationalException
from freqtrade.freqai.normalization import (LegacyNormalization, MinMaxNormalization,
QuantileNormalization, StandardNormalization)
from tests.freqai.conftest import make_data_dictionary
def test_default_normalization_is_legacy(mocker, freqai_conf):
    """
    Normalizing with the configuration as provided by the fixture must give the
    same train features as normalizing with "data_normalization" set to "legacy".
    """
    # first run: configuration untouched
    implicit = make_data_dictionary(mocker, freqai_conf, normalized=False)
    implicit.dk.normalize_data(implicit.dk.data_dictionary)

    # second run: normalization requested explicitly, not yet applied
    freqai_conf["freqai"]["feature_parameters"]["data_normalization"] = "legacy"
    explicit = make_data_dictionary(mocker, freqai_conf, normalized=False)
    explicit_dict = explicit.dk.data_dictionary

    # sanity check: normalized features must differ from the raw ones
    matches_raw = implicit.dk.data_dictionary['train_features'].equals(
        explicit.dk.data_dictionary['train_features'])
    assert not matches_raw, "raw data is equal to normalized data"

    explicit.dk.normalize_data(explicit_dict)

    matches_normalized = implicit.dk.data_dictionary['train_features'].equals(
        explicit.dk.data_dictionary['train_features'])
    assert matches_normalized, "explicit\\implicit legacy normalization mismatch"
def test_legacy_normalization_add_max_min_columns(mocker, freqai_conf):
    """Legacy normalization must leave "_max" and "_min" entries in the datakitchen metadata."""
    freqai_conf["freqai"]["feature_parameters"]["data_normalization"] = "legacy"
    freqai = make_data_dictionary(mocker, freqai_conf, normalized=False)

    freqai.dk.normalize_data(freqai.dk.data_dictionary)

    for marker in ('_max', '_min'):
        assert any(marker in entry for entry in freqai.dk.data.keys())
def test_standard_normalization_dont_add_max_min_columns(mocker, freqai_conf):
    """Standard normalization must not add any "*_max" / "*_min" metadata entries."""
    freqai_conf["freqai"]["feature_parameters"]["data_normalization"] = "standard"
    freqai = make_data_dictionary(mocker, freqai_conf, normalized=False)

    freqai.dk.normalize_data(freqai.dk.data_dictionary)

    for suffix in ('_max', '_min'):
        assert all(not entry.endswith(suffix) for entry in freqai.dk.data.keys())
def test_legacy_and_standard_normalization_difference(mocker, freqai_conf):
    """Legacy and standard normalization must produce different train features."""
    freqai_conf["freqai"]["feature_parameters"]["data_normalization"] = "legacy"
    legacy = make_data_dictionary(mocker, freqai_conf, normalized=False)
    legacy.dk.normalize_data(legacy.dk.data_dictionary)

    freqai_conf["freqai"]["feature_parameters"]["data_normalization"] = "standard"
    standard = make_data_dictionary(mocker, freqai_conf, normalized=False)
    standard.dk.normalize_data(standard.dk.data_dictionary)

    identical = legacy.dk.data_dictionary['train_features'].equals(
        standard.dk.data_dictionary['train_features'])
    assert not identical, "legacy and standard normalization produce same features"
@pytest.mark.parametrize(
    "config_id, norm_class",
    [
        ("legacy", LegacyNormalization),
        ("standard", StandardNormalization),
        ("minmax", MinMaxNormalization),
        ("quantile", QuantileNormalization),
    ],
)
def test_normalization_class(config_id, norm_class, mocker, freqai_conf):
    """Each configuration identifier must select exactly the matching normalizer class."""
    freqai_conf["freqai"]["feature_parameters"]["data_normalization"] = config_id
    freqai = make_data_dictionary(mocker, freqai_conf)
    # Identity comparison on purpose: isinstance() would also accept subclasses.
    assert type(freqai.dk.normalizer) is norm_class
def test_assertion_invalid_normalization_id(mocker, freqai_conf):
    """An unknown normalization identifier must be rejected with an OperationalException."""
    freqai_conf["freqai"]["feature_parameters"]["data_normalization"] = "not_a_norm_id"
    # "^" anchors the pattern, so this checks that the message starts with the text.
    with pytest.raises(OperationalException, match=r"^Invalid data normalization identifier"):
        make_data_dictionary(mocker, freqai_conf)
@pytest.mark.parametrize(
    "config_id",
    ["legacy", "standard", "minmax", "quantile"],
)
def test_denormalization(config_id, mocker, freqai_conf):
    """Denormalizing normalized train labels must give back the raw labels (to 9 decimals)."""
    freqai_conf["freqai"]["feature_parameters"]["data_normalization"] = config_id

    # one run with normalization applied, one with the raw data for comparison
    normalized_run = make_data_dictionary(mocker, freqai_conf)
    normalized_dict = normalized_run.dk.data_dictionary
    raw_run = make_data_dictionary(mocker, freqai_conf, normalized=False)
    raw_dict = raw_run.dk.data_dictionary

    restored = normalized_run.dk.denormalize_labels_from_metadata(normalized_dict["train_labels"])
    denorm_labels = restored.round(9)
    expected_labels = raw_dict['train_labels'].round(9)

    assert denorm_labels.equals(expected_labels), \
        "raw labels data isn't the same as denormalized labels"