Reorder data cleaning

th0rntwig 2022-11-04 11:21:22 +01:00
parent c2130ed3dd
commit e08cd6ebff
5 changed files with 55 additions and 24 deletions

View File

@@ -496,6 +496,10 @@ class FreqaiDataDrawer:
             save_path / f"{dk.model_filename}_trained_df.pkl"
         )
+        dk.data_dictionary["train_features_no_transf"].to_pickle(
+            save_path / f"{dk.model_filename}_trained_df_no_transf.pkl"
+        )
         dk.data_dictionary["train_dates"].to_pickle(
             save_path / f"{dk.model_filename}_trained_dates_df.pkl"
         )
@@ -513,6 +517,8 @@ class FreqaiDataDrawer:
         if coin not in self.meta_data_dictionary:
             self.meta_data_dictionary[coin] = {}
         self.meta_data_dictionary[coin]["train_df"] = dk.data_dictionary["train_features"]
+        self.meta_data_dictionary[coin]["train_df_no_transf"] = \
+            dk.data_dictionary["train_features_no_transf"]
         self.meta_data_dictionary[coin]["meta_data"] = dk.data
         self.save_drawer_to_disk()
@@ -553,6 +559,8 @@ class FreqaiDataDrawer:
         if coin in self.meta_data_dictionary:
             dk.data = self.meta_data_dictionary[coin]["meta_data"]
             dk.data_dictionary["train_features"] = self.meta_data_dictionary[coin]["train_df"]
+            dk.data_dictionary["train_features_no_transf"] = \
+                self.meta_data_dictionary[coin]["train_df_no_transf"]
         else:
             with open(dk.data_path / f"{dk.model_filename}_metadata.json", "r") as fp:
                 dk.data = rapidjson.load(fp, number_mode=rapidjson.NM_NATIVE)
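The drawer changes above persist and cache the untouched ("no_transf") training features next to the model-ready ones, so later cleaning steps can reload the raw feature space. A minimal standalone sketch of that round trip, assuming only pandas and an illustrative save path (none of the names below come from the commit):

from pathlib import Path

import pandas as pd

# Illustrative path and frames only; the real code uses dk.model_filename and save_path.
save_path = Path("example_models")
save_path.mkdir(exist_ok=True)

train_features = pd.DataFrame({"pc1": [0.3, -0.1]})             # e.g. PCA components
train_features_no_transf = pd.DataFrame({"rsi": [55.0, 61.2]})  # raw features

train_features.to_pickle(save_path / "example_trained_df.pkl")
train_features_no_transf.to_pickle(save_path / "example_trained_df_no_transf.pkl")

# Reloading mirrors how the drawer repopulates dk.data_dictionary from disk or cache.
restored = pd.read_pickle(save_path / "example_trained_df_no_transf.pkl")
assert restored.equals(train_features_no_transf)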

View File

@@ -273,11 +273,15 @@ class FreqaiDataKitchen:
         test_labels: DataFrame,
         train_weights: Any,
         test_weights: Any,
+        train_df_no_transf: DataFrame = DataFrame(),
+        test_df_no_transf: DataFrame = DataFrame()
     ) -> Dict:

         self.data_dictionary = {
             "train_features": train_df,
+            "train_features_no_transf": train_df_no_transf,
             "test_features": test_df,
+            "test_features_no_transf": test_df_no_transf,
             "train_labels": train_labels,
             "test_labels": test_labels,
             "train_weights": train_weights,
@@ -289,7 +293,7 @@ class FreqaiDataKitchen:
     def normalize_data(self, data_dictionary: Dict) -> Dict[Any, Any]:
         """
-        Normalize all data in the data_dictionary according to the training dataset
+        Normalize all data in the data_dictionary according to the training dataset.
         :param data_dictionary: dictionary containing the cleaned and
         split training/test data/labels
         :returns:
@@ -495,6 +499,9 @@ class FreqaiDataKitchen:
         """
         from sklearn.decomposition import PCA  # avoid importing if we dont need it

+        self.data_dictionary["train_features_no_transf"] = self.data_dictionary["train_features"]
+        self.data["training_features_list_no_transf"] = \
+            self.data_dictionary["train_features"].columns
         pca = PCA(0.999)
         pca = pca.fit(self.data_dictionary["train_features"])
@@ -520,6 +527,7 @@ class FreqaiDataKitchen:
         self.training_features_list = self.data_dictionary["train_features"].columns

         if self.freqai_config.get('data_split_parameters', {}).get('test_size', 0.1) != 0:
+            self.data_dictionary["test_features_no_transf"] = self.data_dictionary["test_features"]
             test_components = pca.transform(self.data_dictionary["test_features"])
             self.data_dictionary["test_features"] = pd.DataFrame(
                 data=test_components,
@@ -545,6 +553,8 @@ class FreqaiDataKitchen:
         Use an existing pca transform to transform data into components
         :param filtered_dataframe: DataFrame = the cleaned dataframe
         """
+        self.data_dictionary["prediction_features_no_transf"] = \
+            self.data_dictionary["prediction_features"]
         pca_components = self.pca.transform(filtered_dataframe)
         self.data_dictionary["prediction_features"] = pd.DataFrame(
             data=pca_components,
@@ -559,7 +569,8 @@ class FreqaiDataKitchen:
         """
         Compute distances between each training point and every other training
         point. This metric defines the neighborhood of trained data and is used
-        for prediction confidence in the Dissimilarity Index
+        for prediction confidence in the Dissimilarity Index.
+        Calculations are done on the data before any transformation (e.g., PCA).
         """
         # logger.info("computing average mean distance for all training points")
         pairwise = pairwise_distances(
@@ -586,7 +597,7 @@ class FreqaiDataKitchen:
     def use_SVM_to_remove_outliers(self, predict: bool) -> None:
         """
         Build/inference a Support Vector Machine to detect outliers
-        in training data and prediction
+        in training data and prediction, before any transformation (e.g., PCA).
         :param predict: bool = If true, inference an existing SVM model, else construct one
         """
@@ -669,7 +680,7 @@ class FreqaiDataKitchen:
     def use_DBSCAN_to_remove_outliers(self, predict: bool, eps=None) -> None:
         """
-        Use DBSCAN to cluster training data and remove "noisy" data (read outliers).
+        Use DBSCAN to cluster training data and remove outliers before transformation (e.g., PCA).
         User controls this via the config param `DBSCAN_outlier_pct` which indicates the
         pct of training data that they want to be considered outliers.
         :param predict: bool = If False (training), iterate to find the best hyper parameters
@@ -682,7 +693,10 @@ class FreqaiDataKitchen:
         if predict:
             if not self.data['DBSCAN_eps']:
                 return
-            train_ft_df = self.data_dictionary['train_features']
+            if self.data_dictionary["train_features_no_transf"].empty:
+                train_ft_df = self.data_dictionary['train_features']
+            else:
+                train_ft_df = self.data_dictionary["train_features_no_transf"]
             pred_ft_df = self.data_dictionary['prediction_features']
             num_preds = len(pred_ft_df)
             df = pd.concat([train_ft_df, pred_ft_df], axis=0, ignore_index=True)
@@ -773,7 +787,8 @@ class FreqaiDataKitchen:
     def compute_inlier_metric(self, set_='train') -> None:
         """
-        Compute inlier metric from backwards distance distributions.
+        Compute inlier metric from backwards distance distributions before any transformation
+        (e.g., PCA).
         This metric defines how well features from a timepoint fit
         into previous timepoints.
         """
@@ -918,9 +933,13 @@ class FreqaiDataKitchen:
         and avoid making predictions on any points that are too far away
         from the training data set.
         """
+        if self.data_dictionary["train_features_no_transf"].empty:
+            train_features = self.data_dictionary["train_features"]
+        else:
+            train_features = self.data_dictionary["train_features_no_transf"]
         distance = pairwise_distances(
-            self.data_dictionary["train_features"],
+            train_features,
             self.data_dictionary["prediction_features"],
             n_jobs=self.thread_count,
         )
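The kitchen changes keep a pre-transformation copy of the features ("*_no_transf") and let the distance-based checks fall back to it when it is populated. A short standalone sketch of the underlying idea, using scikit-learn directly rather than the FreqaiDataKitchen API (the 0.999 variance threshold mirrors the diff, the random data and shapes are placeholders):

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.metrics import pairwise_distances

rng = np.random.default_rng(7)
train_features_no_transf = pd.DataFrame(rng.normal(size=(200, 12)))

# PCA keeps the components explaining 99.9% of the variance, as in the diff above.
pca = PCA(0.999).fit(train_features_no_transf)
train_features = pd.DataFrame(pca.transform(train_features_no_transf))

# Distances (the basis of the Dissimilarity Index) can differ between the raw and
# the reduced space, which is why the commit evaluates them on the raw copy.
raw_mean_dist = pairwise_distances(train_features_no_transf).mean()
pca_mean_dist = pairwise_distances(train_features).mean()
print(f"mean pairwise distance: raw={raw_mean_dist:.3f}, after PCA={pca_mean_dist:.3f}")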

View File

@@ -465,17 +465,9 @@ class IFreqaiModel(ABC):
             if self.freqai_info["data_split_parameters"]["test_size"] > 0:
                 dk.compute_inlier_metric(set_='test')

-        if ft_params.get(
-            "principal_component_analysis", False
-        ):
-            dk.principal_component_analysis()
-
         if ft_params.get("use_SVM_to_remove_outliers", False):
             dk.use_SVM_to_remove_outliers(predict=False)

-        if ft_params.get("DI_threshold", 0):
-            dk.data["avg_mean_dist"] = dk.compute_distances()
-
         if ft_params.get("use_DBSCAN_to_remove_outliers", False):
             if dk.pair in self.dd.old_DBSCAN_eps:
                 eps = self.dd.old_DBSCAN_eps[dk.pair]
@@ -484,6 +476,14 @@ class IFreqaiModel(ABC):
             dk.use_DBSCAN_to_remove_outliers(predict=False, eps=eps)
             self.dd.old_DBSCAN_eps[dk.pair] = dk.data['DBSCAN_eps']

+        if ft_params.get("DI_threshold", 0):
+            dk.data["avg_mean_dist"] = dk.compute_distances()
+
+        if ft_params.get(
+            "principal_component_analysis", False
+        ):
+            dk.principal_component_analysis()
+
         if self.freqai_info["feature_parameters"].get('noise_standard_deviation', 0):
             dk.add_noise_to_training_features()
@@ -500,19 +500,19 @@ class IFreqaiModel(ABC):
         if ft_params.get('inlier_metric_window', 0):
             dk.compute_inlier_metric(set_='predict')

-        if ft_params.get(
-            "principal_component_analysis", False
-        ):
-            dk.pca_transform(dk.data_dictionary['prediction_features'])
-
         if ft_params.get("use_SVM_to_remove_outliers", False):
             dk.use_SVM_to_remove_outliers(predict=True)

+        if ft_params.get("use_DBSCAN_to_remove_outliers", False):
+            dk.use_DBSCAN_to_remove_outliers(predict=True)
+
         if ft_params.get("DI_threshold", 0):
             dk.check_if_pred_in_training_spaces()

-        if ft_params.get("use_DBSCAN_to_remove_outliers", False):
-            dk.use_DBSCAN_to_remove_outliers(predict=True)
+        if ft_params.get(
+            "principal_component_analysis", False
+        ):
+            dk.pca_transform(dk.data_dictionary['prediction_features'])

     def model_exists(self, dk: FreqaiDataKitchen) -> bool:
         """

View File

@@ -157,5 +157,5 @@ def test_make_train_test_datasets(mocker, freqai_conf):
     data_dictionary = freqai.dk.make_train_test_datasets(features_filtered, labels_filtered)
     assert data_dictionary
-    assert len(data_dictionary) == 7
+    assert len(data_dictionary) == 9
     assert len(data_dictionary['train_features'].index) == 1916
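The expected length moves from 7 to 9 because the data_dictionary gains the two "*_no_transf" entries from the kitchen diff. Roughly, assuming the remaining keys are the weight and date entries referenced elsewhere in this commit (the values here are placeholders):

import pandas as pd

empty = pd.DataFrame()
data_dictionary = {
    "train_features": empty,
    "train_features_no_transf": empty,   # new in this commit
    "test_features": empty,
    "test_features_no_transf": empty,    # new in this commit
    "train_labels": empty,
    "test_labels": empty,
    "train_weights": [],
    "test_weights": [],
    "train_dates": empty,
}
assert len(data_dictionary) == 9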

View File

@@ -338,7 +338,11 @@ def test_follow_mode(mocker, freqai_conf):
     freqai.dd.load_all_pair_histories(timerange, freqai.dk)

     df = strategy.dp.get_pair_dataframe('ADA/BTC', '5m')

+    # import pytest
+    # pytest.set_trace()
+    freqai.dk.build_data_dictionary(
+        [], [], [], [], [], []
+    )
     freqai.start_live(df, metadata, strategy, freqai.dk)

     assert len(freqai.dk.return_dataframe.index) == 5702
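The new call exercises the widened build_data_dictionary signature: six positional arguments as before, with the two "*_no_transf" frames defaulting to empty DataFrames. A simplified, standalone sketch of that shape (it mirrors the kitchen diff rather than quoting it):

from typing import Any, Dict

from pandas import DataFrame


def build_data_dictionary(train_df: DataFrame, test_df: DataFrame,
                          train_labels: DataFrame, test_labels: DataFrame,
                          train_weights: Any, test_weights: Any,
                          train_df_no_transf: DataFrame = DataFrame(),
                          test_df_no_transf: DataFrame = DataFrame()) -> Dict:
    # The empty-frame defaults mirror the diff; an empty frame signals "no raw copy
    # kept yet", which is what the .empty checks in the kitchen test for.
    return {
        "train_features": train_df,
        "train_features_no_transf": train_df_no_transf,
        "test_features": test_df,
        "test_features_no_transf": test_df_no_transf,
        "train_labels": train_labels,
        "test_labels": test_labels,
        "train_weights": train_weights,
        "test_weights": test_weights,
    }


# Existing six-argument call sites, like the test above, keep working unchanged.
d = build_data_dictionary(DataFrame(), DataFrame(), DataFrame(), DataFrame(), [], [])
assert d["train_features_no_transf"].empty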