Reorder data cleaning
This commit is contained in:
parent
c2130ed3dd
commit
e08cd6ebff
@ -496,6 +496,10 @@ class FreqaiDataDrawer:
|
|||||||
save_path / f"{dk.model_filename}_trained_df.pkl"
|
save_path / f"{dk.model_filename}_trained_df.pkl"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
dk.data_dictionary["train_features_no_transf"].to_pickle(
|
||||||
|
save_path / f"{dk.model_filename}_trained_df_no_transf.pkl"
|
||||||
|
)
|
||||||
|
|
||||||
dk.data_dictionary["train_dates"].to_pickle(
|
dk.data_dictionary["train_dates"].to_pickle(
|
||||||
save_path / f"{dk.model_filename}_trained_dates_df.pkl"
|
save_path / f"{dk.model_filename}_trained_dates_df.pkl"
|
||||||
)
|
)
|
||||||
@ -513,6 +517,8 @@ class FreqaiDataDrawer:
|
|||||||
if coin not in self.meta_data_dictionary:
|
if coin not in self.meta_data_dictionary:
|
||||||
self.meta_data_dictionary[coin] = {}
|
self.meta_data_dictionary[coin] = {}
|
||||||
self.meta_data_dictionary[coin]["train_df"] = dk.data_dictionary["train_features"]
|
self.meta_data_dictionary[coin]["train_df"] = dk.data_dictionary["train_features"]
|
||||||
|
self.meta_data_dictionary[coin]["train_df_no_transf"] = \
|
||||||
|
dk.data_dictionary["train_features_no_transf"]
|
||||||
self.meta_data_dictionary[coin]["meta_data"] = dk.data
|
self.meta_data_dictionary[coin]["meta_data"] = dk.data
|
||||||
self.save_drawer_to_disk()
|
self.save_drawer_to_disk()
|
||||||
|
|
||||||
@ -553,6 +559,8 @@ class FreqaiDataDrawer:
|
|||||||
if coin in self.meta_data_dictionary:
|
if coin in self.meta_data_dictionary:
|
||||||
dk.data = self.meta_data_dictionary[coin]["meta_data"]
|
dk.data = self.meta_data_dictionary[coin]["meta_data"]
|
||||||
dk.data_dictionary["train_features"] = self.meta_data_dictionary[coin]["train_df"]
|
dk.data_dictionary["train_features"] = self.meta_data_dictionary[coin]["train_df"]
|
||||||
|
dk.data_dictionary["train_features_no_transf"] = \
|
||||||
|
self.meta_data_dictionary[coin]["train_df_no_transf"]
|
||||||
else:
|
else:
|
||||||
with open(dk.data_path / f"{dk.model_filename}_metadata.json", "r") as fp:
|
with open(dk.data_path / f"{dk.model_filename}_metadata.json", "r") as fp:
|
||||||
dk.data = rapidjson.load(fp, number_mode=rapidjson.NM_NATIVE)
|
dk.data = rapidjson.load(fp, number_mode=rapidjson.NM_NATIVE)
|
||||||
|
@ -273,11 +273,15 @@ class FreqaiDataKitchen:
|
|||||||
test_labels: DataFrame,
|
test_labels: DataFrame,
|
||||||
train_weights: Any,
|
train_weights: Any,
|
||||||
test_weights: Any,
|
test_weights: Any,
|
||||||
|
train_df_no_transf: DataFrame = DataFrame(),
|
||||||
|
test_df_no_transf: DataFrame = DataFrame()
|
||||||
) -> Dict:
|
) -> Dict:
|
||||||
|
|
||||||
self.data_dictionary = {
|
self.data_dictionary = {
|
||||||
"train_features": train_df,
|
"train_features": train_df,
|
||||||
|
"train_features_no_transf": train_df_no_transf,
|
||||||
"test_features": test_df,
|
"test_features": test_df,
|
||||||
|
"test_features_no_transf": test_df_no_transf,
|
||||||
"train_labels": train_labels,
|
"train_labels": train_labels,
|
||||||
"test_labels": test_labels,
|
"test_labels": test_labels,
|
||||||
"train_weights": train_weights,
|
"train_weights": train_weights,
|
||||||
@ -289,7 +293,7 @@ class FreqaiDataKitchen:
|
|||||||
|
|
||||||
def normalize_data(self, data_dictionary: Dict) -> Dict[Any, Any]:
|
def normalize_data(self, data_dictionary: Dict) -> Dict[Any, Any]:
|
||||||
"""
|
"""
|
||||||
Normalize all data in the data_dictionary according to the training dataset
|
Normalize all data in the data_dictionary according to the training dataset.
|
||||||
:param data_dictionary: dictionary containing the cleaned and
|
:param data_dictionary: dictionary containing the cleaned and
|
||||||
split training/test data/labels
|
split training/test data/labels
|
||||||
:returns:
|
:returns:
|
||||||
@ -495,6 +499,9 @@ class FreqaiDataKitchen:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
from sklearn.decomposition import PCA # avoid importing if we don't need it
|
from sklearn.decomposition import PCA # avoid importing if we don't need it
|
||||||
|
self.data_dictionary["train_features_no_transf"] = self.data_dictionary["train_features"]
|
||||||
|
self.data["training_features_list_no_transf"] = \
|
||||||
|
self.data_dictionary["train_features"].columns
|
||||||
|
|
||||||
pca = PCA(0.999)
|
pca = PCA(0.999)
|
||||||
pca = pca.fit(self.data_dictionary["train_features"])
|
pca = pca.fit(self.data_dictionary["train_features"])
|
||||||
@ -520,6 +527,7 @@ class FreqaiDataKitchen:
|
|||||||
self.training_features_list = self.data_dictionary["train_features"].columns
|
self.training_features_list = self.data_dictionary["train_features"].columns
|
||||||
|
|
||||||
if self.freqai_config.get('data_split_parameters', {}).get('test_size', 0.1) != 0:
|
if self.freqai_config.get('data_split_parameters', {}).get('test_size', 0.1) != 0:
|
||||||
|
self.data_dictionary["test_features_no_transf"] = self.data_dictionary["test_features"]
|
||||||
test_components = pca.transform(self.data_dictionary["test_features"])
|
test_components = pca.transform(self.data_dictionary["test_features"])
|
||||||
self.data_dictionary["test_features"] = pd.DataFrame(
|
self.data_dictionary["test_features"] = pd.DataFrame(
|
||||||
data=test_components,
|
data=test_components,
|
||||||
@ -545,6 +553,8 @@ class FreqaiDataKitchen:
|
|||||||
Use an existing pca transform to transform data into components
|
Use an existing pca transform to transform data into components
|
||||||
:param filtered_dataframe: DataFrame = the cleaned dataframe
|
:param filtered_dataframe: DataFrame = the cleaned dataframe
|
||||||
"""
|
"""
|
||||||
|
self.data_dictionary["prediction_features_no_transf"] = \
|
||||||
|
self.data_dictionary["prediction_features"]
|
||||||
pca_components = self.pca.transform(filtered_dataframe)
|
pca_components = self.pca.transform(filtered_dataframe)
|
||||||
self.data_dictionary["prediction_features"] = pd.DataFrame(
|
self.data_dictionary["prediction_features"] = pd.DataFrame(
|
||||||
data=pca_components,
|
data=pca_components,
|
||||||
@ -559,7 +569,8 @@ class FreqaiDataKitchen:
|
|||||||
"""
|
"""
|
||||||
Compute distances between each training point and every other training
|
Compute distances between each training point and every other training
|
||||||
point. This metric defines the neighborhood of trained data and is used
|
point. This metric defines the neighborhood of trained data and is used
|
||||||
for prediction confidence in the Dissimilarity Index
|
for prediction confidence in the Dissimilarity Index.
|
||||||
|
Calculations are done on non-transformed (e.g., PCA) data.
|
||||||
"""
|
"""
|
||||||
# logger.info("computing average mean distance for all training points")
|
# logger.info("computing average mean distance for all training points")
|
||||||
pairwise = pairwise_distances(
|
pairwise = pairwise_distances(
|
||||||
@ -586,7 +597,7 @@ class FreqaiDataKitchen:
|
|||||||
def use_SVM_to_remove_outliers(self, predict: bool) -> None:
|
def use_SVM_to_remove_outliers(self, predict: bool) -> None:
|
||||||
"""
|
"""
|
||||||
Build/inference a Support Vector Machine to detect outliers
|
Build/inference a Support Vector Machine to detect outliers
|
||||||
in training data and prediction
|
in training data and prediction, before any transformation (e.g., PCA).
|
||||||
:param predict: bool = If true, inference an existing SVM model, else construct one
|
:param predict: bool = If true, inference an existing SVM model, else construct one
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@ -669,7 +680,7 @@ class FreqaiDataKitchen:
|
|||||||
|
|
||||||
def use_DBSCAN_to_remove_outliers(self, predict: bool, eps=None) -> None:
|
def use_DBSCAN_to_remove_outliers(self, predict: bool, eps=None) -> None:
|
||||||
"""
|
"""
|
||||||
Use DBSCAN to cluster training data and remove "noisy" data (read outliers).
|
Use DBSCAN to cluster training data and remove outliers before transformation (e.g., PCA).
|
||||||
User controls this via the config param `DBSCAN_outlier_pct` which indicates the
|
User controls this via the config param `DBSCAN_outlier_pct` which indicates the
|
||||||
pct of training data that they want to be considered outliers.
|
pct of training data that they want to be considered outliers.
|
||||||
:param predict: bool = If False (training), iterate to find the best hyper parameters
|
:param predict: bool = If False (training), iterate to find the best hyper parameters
|
||||||
@ -682,7 +693,10 @@ class FreqaiDataKitchen:
|
|||||||
if predict:
|
if predict:
|
||||||
if not self.data['DBSCAN_eps']:
|
if not self.data['DBSCAN_eps']:
|
||||||
return
|
return
|
||||||
|
if self.data_dictionary["train_features_no_transf"].empty:
|
||||||
train_ft_df = self.data_dictionary['train_features']
|
train_ft_df = self.data_dictionary['train_features']
|
||||||
|
else:
|
||||||
|
train_ft_df = self.data_dictionary["train_features_no_transf"]
|
||||||
pred_ft_df = self.data_dictionary['prediction_features']
|
pred_ft_df = self.data_dictionary['prediction_features']
|
||||||
num_preds = len(pred_ft_df)
|
num_preds = len(pred_ft_df)
|
||||||
df = pd.concat([train_ft_df, pred_ft_df], axis=0, ignore_index=True)
|
df = pd.concat([train_ft_df, pred_ft_df], axis=0, ignore_index=True)
|
||||||
@ -773,7 +787,8 @@ class FreqaiDataKitchen:
|
|||||||
|
|
||||||
def compute_inlier_metric(self, set_='train') -> None:
|
def compute_inlier_metric(self, set_='train') -> None:
|
||||||
"""
|
"""
|
||||||
Compute inlier metric from backwards distance distributions.
|
Compute inlier metric from backwards distance distributions before any transformation
|
||||||
|
(e.g., PCA).
|
||||||
This metric defines how well features from a timepoint fit
|
This metric defines how well features from a timepoint fit
|
||||||
into previous timepoints.
|
into previous timepoints.
|
||||||
"""
|
"""
|
||||||
@ -918,9 +933,13 @@ class FreqaiDataKitchen:
|
|||||||
and avoid making predictions on any points that are too far away
|
and avoid making predictions on any points that are too far away
|
||||||
from the training data set.
|
from the training data set.
|
||||||
"""
|
"""
|
||||||
|
if self.data_dictionary["train_features_no_transf"].empty:
|
||||||
|
train_features = self.data_dictionary["train_features"]
|
||||||
|
else:
|
||||||
|
train_features = self.data_dictionary["train_features_no_transf"]
|
||||||
|
|
||||||
distance = pairwise_distances(
|
distance = pairwise_distances(
|
||||||
self.data_dictionary["train_features"],
|
train_features,
|
||||||
self.data_dictionary["prediction_features"],
|
self.data_dictionary["prediction_features"],
|
||||||
n_jobs=self.thread_count,
|
n_jobs=self.thread_count,
|
||||||
)
|
)
|
||||||
|
@ -465,17 +465,9 @@ class IFreqaiModel(ABC):
|
|||||||
if self.freqai_info["data_split_parameters"]["test_size"] > 0:
|
if self.freqai_info["data_split_parameters"]["test_size"] > 0:
|
||||||
dk.compute_inlier_metric(set_='test')
|
dk.compute_inlier_metric(set_='test')
|
||||||
|
|
||||||
if ft_params.get(
|
|
||||||
"principal_component_analysis", False
|
|
||||||
):
|
|
||||||
dk.principal_component_analysis()
|
|
||||||
|
|
||||||
if ft_params.get("use_SVM_to_remove_outliers", False):
|
if ft_params.get("use_SVM_to_remove_outliers", False):
|
||||||
dk.use_SVM_to_remove_outliers(predict=False)
|
dk.use_SVM_to_remove_outliers(predict=False)
|
||||||
|
|
||||||
if ft_params.get("DI_threshold", 0):
|
|
||||||
dk.data["avg_mean_dist"] = dk.compute_distances()
|
|
||||||
|
|
||||||
if ft_params.get("use_DBSCAN_to_remove_outliers", False):
|
if ft_params.get("use_DBSCAN_to_remove_outliers", False):
|
||||||
if dk.pair in self.dd.old_DBSCAN_eps:
|
if dk.pair in self.dd.old_DBSCAN_eps:
|
||||||
eps = self.dd.old_DBSCAN_eps[dk.pair]
|
eps = self.dd.old_DBSCAN_eps[dk.pair]
|
||||||
@ -484,6 +476,14 @@ class IFreqaiModel(ABC):
|
|||||||
dk.use_DBSCAN_to_remove_outliers(predict=False, eps=eps)
|
dk.use_DBSCAN_to_remove_outliers(predict=False, eps=eps)
|
||||||
self.dd.old_DBSCAN_eps[dk.pair] = dk.data['DBSCAN_eps']
|
self.dd.old_DBSCAN_eps[dk.pair] = dk.data['DBSCAN_eps']
|
||||||
|
|
||||||
|
if ft_params.get("DI_threshold", 0):
|
||||||
|
dk.data["avg_mean_dist"] = dk.compute_distances()
|
||||||
|
|
||||||
|
if ft_params.get(
|
||||||
|
"principal_component_analysis", False
|
||||||
|
):
|
||||||
|
dk.principal_component_analysis()
|
||||||
|
|
||||||
if self.freqai_info["feature_parameters"].get('noise_standard_deviation', 0):
|
if self.freqai_info["feature_parameters"].get('noise_standard_deviation', 0):
|
||||||
dk.add_noise_to_training_features()
|
dk.add_noise_to_training_features()
|
||||||
|
|
||||||
@ -500,19 +500,19 @@ class IFreqaiModel(ABC):
|
|||||||
if ft_params.get('inlier_metric_window', 0):
|
if ft_params.get('inlier_metric_window', 0):
|
||||||
dk.compute_inlier_metric(set_='predict')
|
dk.compute_inlier_metric(set_='predict')
|
||||||
|
|
||||||
if ft_params.get(
|
|
||||||
"principal_component_analysis", False
|
|
||||||
):
|
|
||||||
dk.pca_transform(dk.data_dictionary['prediction_features'])
|
|
||||||
|
|
||||||
if ft_params.get("use_SVM_to_remove_outliers", False):
|
if ft_params.get("use_SVM_to_remove_outliers", False):
|
||||||
dk.use_SVM_to_remove_outliers(predict=True)
|
dk.use_SVM_to_remove_outliers(predict=True)
|
||||||
|
|
||||||
|
if ft_params.get("use_DBSCAN_to_remove_outliers", False):
|
||||||
|
dk.use_DBSCAN_to_remove_outliers(predict=True)
|
||||||
|
|
||||||
if ft_params.get("DI_threshold", 0):
|
if ft_params.get("DI_threshold", 0):
|
||||||
dk.check_if_pred_in_training_spaces()
|
dk.check_if_pred_in_training_spaces()
|
||||||
|
|
||||||
if ft_params.get("use_DBSCAN_to_remove_outliers", False):
|
if ft_params.get(
|
||||||
dk.use_DBSCAN_to_remove_outliers(predict=True)
|
"principal_component_analysis", False
|
||||||
|
):
|
||||||
|
dk.pca_transform(dk.data_dictionary['prediction_features'])
|
||||||
|
|
||||||
def model_exists(self, dk: FreqaiDataKitchen) -> bool:
|
def model_exists(self, dk: FreqaiDataKitchen) -> bool:
|
||||||
"""
|
"""
|
||||||
|
@ -157,5 +157,5 @@ def test_make_train_test_datasets(mocker, freqai_conf):
|
|||||||
data_dictionary = freqai.dk.make_train_test_datasets(features_filtered, labels_filtered)
|
data_dictionary = freqai.dk.make_train_test_datasets(features_filtered, labels_filtered)
|
||||||
|
|
||||||
assert data_dictionary
|
assert data_dictionary
|
||||||
assert len(data_dictionary) == 7
|
assert len(data_dictionary) == 9
|
||||||
assert len(data_dictionary['train_features'].index) == 1916
|
assert len(data_dictionary['train_features'].index) == 1916
|
||||||
|
@ -338,7 +338,11 @@ def test_follow_mode(mocker, freqai_conf):
|
|||||||
freqai.dd.load_all_pair_histories(timerange, freqai.dk)
|
freqai.dd.load_all_pair_histories(timerange, freqai.dk)
|
||||||
|
|
||||||
df = strategy.dp.get_pair_dataframe('ADA/BTC', '5m')
|
df = strategy.dp.get_pair_dataframe('ADA/BTC', '5m')
|
||||||
|
# import pytest
|
||||||
|
# pytest.set_trace()
|
||||||
|
freqai.dk.build_data_dictionary(
|
||||||
|
[], [], [], [], [], []
|
||||||
|
)
|
||||||
freqai.start_live(df, metadata, strategy, freqai.dk)
|
freqai.start_live(df, metadata, strategy, freqai.dk)
|
||||||
|
|
||||||
assert len(freqai.dk.return_dataframe.index) == 5702
|
assert len(freqai.dk.return_dataframe.index) == 5702
|
||||||
|
Loading…
Reference in New Issue
Block a user