Reorder data cleaning
This commit is contained in:
parent
c2130ed3dd
commit
e08cd6ebff
@ -496,6 +496,10 @@ class FreqaiDataDrawer:
|
||||
save_path / f"{dk.model_filename}_trained_df.pkl"
|
||||
)
|
||||
|
||||
dk.data_dictionary["train_features_no_transf"].to_pickle(
|
||||
save_path / f"{dk.model_filename}_trained_df_no_transf.pkl"
|
||||
)
|
||||
|
||||
dk.data_dictionary["train_dates"].to_pickle(
|
||||
save_path / f"{dk.model_filename}_trained_dates_df.pkl"
|
||||
)
|
||||
@ -513,6 +517,8 @@ class FreqaiDataDrawer:
|
||||
if coin not in self.meta_data_dictionary:
|
||||
self.meta_data_dictionary[coin] = {}
|
||||
self.meta_data_dictionary[coin]["train_df"] = dk.data_dictionary["train_features"]
|
||||
self.meta_data_dictionary[coin]["train_df_no_transf"] = \
|
||||
dk.data_dictionary["train_features_no_transf"]
|
||||
self.meta_data_dictionary[coin]["meta_data"] = dk.data
|
||||
self.save_drawer_to_disk()
|
||||
|
||||
@ -553,6 +559,8 @@ class FreqaiDataDrawer:
|
||||
if coin in self.meta_data_dictionary:
|
||||
dk.data = self.meta_data_dictionary[coin]["meta_data"]
|
||||
dk.data_dictionary["train_features"] = self.meta_data_dictionary[coin]["train_df"]
|
||||
dk.data_dictionary["train_features_no_transf"] = \
|
||||
self.meta_data_dictionary[coin]["train_df_no_transf"]
|
||||
else:
|
||||
with open(dk.data_path / f"{dk.model_filename}_metadata.json", "r") as fp:
|
||||
dk.data = rapidjson.load(fp, number_mode=rapidjson.NM_NATIVE)
|
||||
|
@ -273,11 +273,15 @@ class FreqaiDataKitchen:
|
||||
test_labels: DataFrame,
|
||||
train_weights: Any,
|
||||
test_weights: Any,
|
||||
train_df_no_transf: DataFrame = DataFrame(),
|
||||
test_df_no_transf: DataFrame = DataFrame()
|
||||
) -> Dict:
|
||||
|
||||
self.data_dictionary = {
|
||||
"train_features": train_df,
|
||||
"train_features_no_transf": train_df_no_transf,
|
||||
"test_features": test_df,
|
||||
"test_features_no_transf": test_df_no_transf,
|
||||
"train_labels": train_labels,
|
||||
"test_labels": test_labels,
|
||||
"train_weights": train_weights,
|
||||
@ -289,7 +293,7 @@ class FreqaiDataKitchen:
|
||||
|
||||
def normalize_data(self, data_dictionary: Dict) -> Dict[Any, Any]:
|
||||
"""
|
||||
Normalize all data in the data_dictionary according to the training dataset
|
||||
Normalize all data in the data_dictionary according to the training dataset.
|
||||
:param data_dictionary: dictionary containing the cleaned and
|
||||
split training/test data/labels
|
||||
:returns:
|
||||
@ -495,6 +499,9 @@ class FreqaiDataKitchen:
|
||||
"""
|
||||
|
||||
from sklearn.decomposition import PCA # avoid importing if we don't need it
|
||||
self.data_dictionary["train_features_no_transf"] = self.data_dictionary["train_features"]
|
||||
self.data["training_features_list_no_transf"] = \
|
||||
self.data_dictionary["train_features"].columns
|
||||
|
||||
pca = PCA(0.999)
|
||||
pca = pca.fit(self.data_dictionary["train_features"])
|
||||
@ -520,6 +527,7 @@ class FreqaiDataKitchen:
|
||||
self.training_features_list = self.data_dictionary["train_features"].columns
|
||||
|
||||
if self.freqai_config.get('data_split_parameters', {}).get('test_size', 0.1) != 0:
|
||||
self.data_dictionary["test_features_no_transf"] = self.data_dictionary["test_features"]
|
||||
test_components = pca.transform(self.data_dictionary["test_features"])
|
||||
self.data_dictionary["test_features"] = pd.DataFrame(
|
||||
data=test_components,
|
||||
@ -545,6 +553,8 @@ class FreqaiDataKitchen:
|
||||
Use an existing pca transform to transform data into components
|
||||
:param filtered_dataframe: DataFrame = the cleaned dataframe
|
||||
"""
|
||||
self.data_dictionary["prediction_features_no_transf"] = \
|
||||
self.data_dictionary["prediction_features"]
|
||||
pca_components = self.pca.transform(filtered_dataframe)
|
||||
self.data_dictionary["prediction_features"] = pd.DataFrame(
|
||||
data=pca_components,
|
||||
@ -559,7 +569,8 @@ class FreqaiDataKitchen:
|
||||
"""
|
||||
Compute distances between each training point and every other training
|
||||
point. This metric defines the neighborhood of trained data and is used
|
||||
for prediction confidence in the Dissimilarity Index
|
||||
for prediction confidence in the Dissimilarity Index.
|
||||
Calculations are done on the data before any transformation (e.g., PCA) is applied.
|
||||
"""
|
||||
# logger.info("computing average mean distance for all training points")
|
||||
pairwise = pairwise_distances(
|
||||
@ -586,7 +597,7 @@ class FreqaiDataKitchen:
|
||||
def use_SVM_to_remove_outliers(self, predict: bool) -> None:
|
||||
"""
|
||||
Build/inference a Support Vector Machine to detect outliers
|
||||
in training data and prediction
|
||||
in training data and prediction, before any transformation (e.g., PCA).
|
||||
:param predict: bool = If True, run inference with an existing SVM model, else construct one
|
||||
"""
|
||||
|
||||
@ -669,7 +680,7 @@ class FreqaiDataKitchen:
|
||||
|
||||
def use_DBSCAN_to_remove_outliers(self, predict: bool, eps=None) -> None:
|
||||
"""
|
||||
Use DBSCAN to cluster training data and remove "noisy" data (read outliers).
|
||||
Use DBSCAN to cluster training data and remove outliers before transformation (e.g., PCA).
|
||||
User controls this via the config param `DBSCAN_outlier_pct` which indicates the
|
||||
pct of training data that they want to be considered outliers.
|
||||
:param predict: bool = If False (training), iterate to find the best hyper parameters
|
||||
@ -682,7 +693,10 @@ class FreqaiDataKitchen:
|
||||
if predict:
|
||||
if not self.data['DBSCAN_eps']:
|
||||
return
|
||||
train_ft_df = self.data_dictionary['train_features']
|
||||
if self.data_dictionary["train_features_no_transf"].empty:
|
||||
train_ft_df = self.data_dictionary['train_features']
|
||||
else:
|
||||
train_ft_df = self.data_dictionary["train_features_no_transf"]
|
||||
pred_ft_df = self.data_dictionary['prediction_features']
|
||||
num_preds = len(pred_ft_df)
|
||||
df = pd.concat([train_ft_df, pred_ft_df], axis=0, ignore_index=True)
|
||||
@ -773,7 +787,8 @@ class FreqaiDataKitchen:
|
||||
|
||||
def compute_inlier_metric(self, set_='train') -> None:
|
||||
"""
|
||||
Compute inlier metric from backwards distance distributions.
|
||||
Compute inlier metric from backwards distance distributions before any transformation
|
||||
(e.g., PCA).
|
||||
This metric defines how well features from a timepoint fit
|
||||
into previous timepoints.
|
||||
"""
|
||||
@ -918,9 +933,13 @@ class FreqaiDataKitchen:
|
||||
and avoid making predictions on any points that are too far away
|
||||
from the training data set.
|
||||
"""
|
||||
if self.data_dictionary["train_features_no_transf"].empty:
|
||||
train_features = self.data_dictionary["train_features"]
|
||||
else:
|
||||
train_features = self.data_dictionary["train_features_no_transf"]
|
||||
|
||||
distance = pairwise_distances(
|
||||
self.data_dictionary["train_features"],
|
||||
train_features,
|
||||
self.data_dictionary["prediction_features"],
|
||||
n_jobs=self.thread_count,
|
||||
)
|
||||
|
@ -465,17 +465,9 @@ class IFreqaiModel(ABC):
|
||||
if self.freqai_info["data_split_parameters"]["test_size"] > 0:
|
||||
dk.compute_inlier_metric(set_='test')
|
||||
|
||||
if ft_params.get(
|
||||
"principal_component_analysis", False
|
||||
):
|
||||
dk.principal_component_analysis()
|
||||
|
||||
if ft_params.get("use_SVM_to_remove_outliers", False):
|
||||
dk.use_SVM_to_remove_outliers(predict=False)
|
||||
|
||||
if ft_params.get("DI_threshold", 0):
|
||||
dk.data["avg_mean_dist"] = dk.compute_distances()
|
||||
|
||||
if ft_params.get("use_DBSCAN_to_remove_outliers", False):
|
||||
if dk.pair in self.dd.old_DBSCAN_eps:
|
||||
eps = self.dd.old_DBSCAN_eps[dk.pair]
|
||||
@ -484,6 +476,14 @@ class IFreqaiModel(ABC):
|
||||
dk.use_DBSCAN_to_remove_outliers(predict=False, eps=eps)
|
||||
self.dd.old_DBSCAN_eps[dk.pair] = dk.data['DBSCAN_eps']
|
||||
|
||||
if ft_params.get("DI_threshold", 0):
|
||||
dk.data["avg_mean_dist"] = dk.compute_distances()
|
||||
|
||||
if ft_params.get(
|
||||
"principal_component_analysis", False
|
||||
):
|
||||
dk.principal_component_analysis()
|
||||
|
||||
if self.freqai_info["feature_parameters"].get('noise_standard_deviation', 0):
|
||||
dk.add_noise_to_training_features()
|
||||
|
||||
@ -500,19 +500,19 @@ class IFreqaiModel(ABC):
|
||||
if ft_params.get('inlier_metric_window', 0):
|
||||
dk.compute_inlier_metric(set_='predict')
|
||||
|
||||
if ft_params.get(
|
||||
"principal_component_analysis", False
|
||||
):
|
||||
dk.pca_transform(dk.data_dictionary['prediction_features'])
|
||||
|
||||
if ft_params.get("use_SVM_to_remove_outliers", False):
|
||||
dk.use_SVM_to_remove_outliers(predict=True)
|
||||
|
||||
if ft_params.get("use_DBSCAN_to_remove_outliers", False):
|
||||
dk.use_DBSCAN_to_remove_outliers(predict=True)
|
||||
|
||||
if ft_params.get("DI_threshold", 0):
|
||||
dk.check_if_pred_in_training_spaces()
|
||||
|
||||
if ft_params.get("use_DBSCAN_to_remove_outliers", False):
|
||||
dk.use_DBSCAN_to_remove_outliers(predict=True)
|
||||
if ft_params.get(
|
||||
"principal_component_analysis", False
|
||||
):
|
||||
dk.pca_transform(dk.data_dictionary['prediction_features'])
|
||||
|
||||
def model_exists(self, dk: FreqaiDataKitchen) -> bool:
|
||||
"""
|
||||
|
@ -157,5 +157,5 @@ def test_make_train_test_datasets(mocker, freqai_conf):
|
||||
data_dictionary = freqai.dk.make_train_test_datasets(features_filtered, labels_filtered)
|
||||
|
||||
assert data_dictionary
|
||||
assert len(data_dictionary) == 7
|
||||
assert len(data_dictionary) == 9
|
||||
assert len(data_dictionary['train_features'].index) == 1916
|
||||
|
@ -338,7 +338,11 @@ def test_follow_mode(mocker, freqai_conf):
|
||||
freqai.dd.load_all_pair_histories(timerange, freqai.dk)
|
||||
|
||||
df = strategy.dp.get_pair_dataframe('ADA/BTC', '5m')
|
||||
|
||||
# import pytest
|
||||
# pytest.set_trace()
|
||||
freqai.dk.build_data_dictionary(
|
||||
[], [], [], [], [], []
|
||||
)
|
||||
freqai.start_live(df, metadata, strategy, freqai.dk)
|
||||
|
||||
assert len(freqai.dk.return_dataframe.index) == 5702
|
||||
|
Loading…
Reference in New Issue
Block a user