Check for constant columns

This commit is contained in:
th0rntwig 2022-09-07 17:47:27 +02:00
parent c08c82bc40
commit 047ded1baa

View File

@ -184,7 +184,7 @@ class FreqaiDataKitchen:
def filter_features( def filter_features(
self, self,
unfiltered_dataframe: DataFrame, unfiltered_df: DataFrame,
training_feature_list: List, training_feature_list: List,
label_list: List = list(), label_list: List = list(),
training_filter: bool = True, training_filter: bool = True,
@ -195,31 +195,36 @@ class FreqaiDataKitchen:
0s in the prediction dataset. However, prediction dataset do_predict will reflect any 0s in the prediction dataset. However, prediction dataset do_predict will reflect any
row that had a NaN and will shield user from that prediction. row that had a NaN and will shield user from that prediction.
:params: :params:
:unfiltered_dataframe: the full dataframe for the present training period :unfiltered_df: the full dataframe for the present training period
:training_feature_list: list, the training feature list constructed by :training_feature_list: list, the training feature list constructed by
self.build_feature_list() according to user specified parameters in the configuration file. self.build_feature_list() according to user specified parameters in the configuration file.
:labels: the labels for the dataset :labels: the labels for the dataset
:training_filter: boolean which lets the function know if it is training data or :training_filter: boolean which lets the function know if it is training data or
prediction data to be filtered. prediction data to be filtered.
:returns: :returns:
:filtered_dataframe: dataframe cleaned of NaNs and only containing the user :filtered_df: dataframe cleaned of NaNs and only containing the user
requested feature set. requested feature set.
:labels: labels cleaned of NaNs. :labels: labels cleaned of NaNs.
""" """
filtered_dataframe = unfiltered_dataframe.filter(training_feature_list, axis=1) filtered_df = unfiltered_df.filter(training_feature_list, axis=1)
filtered_dataframe = filtered_dataframe.replace([np.inf, -np.inf], np.nan) filtered_df = filtered_df.replace([np.inf, -np.inf], np.nan)
drop_index = pd.isnull(filtered_dataframe).any(1) # get the rows that have NaNs, const_cols = filtered_df[:, filtered_df.nunique(axis=0) == len(filtered_df.index)].columns
if const_cols:
filtered_df = filtered_df.filter(filtered_df.columns.difference(const_cols))
logger.warning(f"Removed features {const_cols} with constant values.")
drop_index = pd.isnull(filtered_df).any(1) # get the rows that have NaNs,
drop_index = drop_index.replace(True, 1).replace(False, 0) # pep8 requirement. drop_index = drop_index.replace(True, 1).replace(False, 0) # pep8 requirement.
if (training_filter): if (training_filter):
# we don't care about total row number (total no. datapoints) in training, we only care # we don't care about total row number (total no. datapoints) in training, we only care
# about removing any row with NaNs # about removing any row with NaNs
# if labels has multiple columns (user wants to train multiple models), we detect here # if labels has multiple columns (user wants to train multiple models), we detect here
labels = unfiltered_dataframe.filter(label_list, axis=1) labels = unfiltered_df.filter(label_list, axis=1)
drop_index_labels = pd.isnull(labels).any(1) drop_index_labels = pd.isnull(labels).any(1)
drop_index_labels = drop_index_labels.replace(True, 1).replace(False, 0) drop_index_labels = drop_index_labels.replace(True, 1).replace(False, 0)
dates = unfiltered_dataframe['date'] dates = unfiltered_df['date']
filtered_dataframe = filtered_dataframe[ filtered_df = filtered_df[
(drop_index == 0) & (drop_index_labels == 0) (drop_index == 0) & (drop_index_labels == 0)
] # dropping values ] # dropping values
labels = labels[ labels = labels[
@ -229,13 +234,13 @@ class FreqaiDataKitchen:
(drop_index == 0) & (drop_index_labels == 0) (drop_index == 0) & (drop_index_labels == 0)
] ]
logger.info( logger.info(
f"dropped {len(unfiltered_dataframe) - len(filtered_dataframe)} training points" f"dropped {len(unfiltered_df) - len(filtered_df)} training points"
f" due to NaNs in populated dataset {len(unfiltered_dataframe)}." f" due to NaNs in populated dataset {len(unfiltered_df)}."
) )
if (1 - len(filtered_dataframe) / len(unfiltered_dataframe)) > 0.1 and self.live: if (1 - len(filtered_df) / len(unfiltered_df)) > 0.1 and self.live:
worst_indicator = str(unfiltered_dataframe.count().idxmin()) worst_indicator = str(unfiltered_df.count().idxmin())
logger.warning( logger.warning(
f" {(1 - len(filtered_dataframe)/len(unfiltered_dataframe)) * 100:.0f} percent " f" {(1 - len(filtered_df)/len(unfiltered_df)) * 100:.0f} percent "
" of training data dropped due to NaNs, model may perform inconsistent " " of training data dropped due to NaNs, model may perform inconsistent "
f"with expectations. Verify {worst_indicator}" f"with expectations. Verify {worst_indicator}"
) )
@ -244,9 +249,9 @@ class FreqaiDataKitchen:
else: else:
# we are backtesting so we need to preserve row number to send back to strategy, # we are backtesting so we need to preserve row number to send back to strategy,
# so now we use do_predict to avoid any prediction based on a NaN # so now we use do_predict to avoid any prediction based on a NaN
drop_index = pd.isnull(filtered_dataframe).any(1) drop_index = pd.isnull(filtered_df).any(1)
self.data["filter_drop_index_prediction"] = drop_index self.data["filter_drop_index_prediction"] = drop_index
filtered_dataframe.fillna(0, inplace=True) filtered_df.fillna(0, inplace=True)
# replacing all NaNs with zeros to avoid issues in 'prediction', but any prediction # replacing all NaNs with zeros to avoid issues in 'prediction', but any prediction
# that was based on a single NaN is ultimately protected from buys with do_predict # that was based on a single NaN is ultimately protected from buys with do_predict
drop_index = ~drop_index drop_index = ~drop_index
@ -255,11 +260,11 @@ class FreqaiDataKitchen:
logger.info( logger.info(
"dropped %s of %s prediction data points due to NaNs.", "dropped %s of %s prediction data points due to NaNs.",
len(self.do_predict) - self.do_predict.sum(), len(self.do_predict) - self.do_predict.sum(),
len(filtered_dataframe), len(filtered_df),
) )
labels = [] labels = []
return filtered_dataframe, labels return filtered_df, labels
def build_data_dictionary( def build_data_dictionary(
self, self,