Check for constant columns
This commit is contained in:
parent
c08c82bc40
commit
047ded1baa
@ -184,7 +184,7 @@ class FreqaiDataKitchen:
|
||||
|
||||
def filter_features(
|
||||
self,
|
||||
unfiltered_dataframe: DataFrame,
|
||||
unfiltered_df: DataFrame,
|
||||
training_feature_list: List,
|
||||
label_list: List = list(),
|
||||
training_filter: bool = True,
|
||||
@ -195,31 +195,36 @@ class FreqaiDataKitchen:
|
||||
0s in the prediction dataset. However, prediction dataset do_predict will reflect any
|
||||
row that had a NaN and will shield user from that prediction.
|
||||
:params:
|
||||
:unfiltered_dataframe: the full dataframe for the present training period
|
||||
:unfiltered_df: the full dataframe for the present training period
|
||||
:training_feature_list: list, the training feature list constructed by
|
||||
self.build_feature_list() according to user specified parameters in the configuration file.
|
||||
:labels: the labels for the dataset
|
||||
:training_filter: boolean which lets the function know if it is training data or
|
||||
prediction data to be filtered.
|
||||
:returns:
|
||||
:filtered_dataframe: dataframe cleaned of NaNs and only containing the user
|
||||
:filtered_df: dataframe cleaned of NaNs and only containing the user
|
||||
requested feature set.
|
||||
:labels: labels cleaned of NaNs.
|
||||
"""
|
||||
filtered_dataframe = unfiltered_dataframe.filter(training_feature_list, axis=1)
|
||||
filtered_dataframe = filtered_dataframe.replace([np.inf, -np.inf], np.nan)
|
||||
filtered_df = unfiltered_df.filter(training_feature_list, axis=1)
|
||||
filtered_df = filtered_df.replace([np.inf, -np.inf], np.nan)
|
||||
|
||||
drop_index = pd.isnull(filtered_dataframe).any(1) # get the rows that have NaNs,
|
||||
const_cols = filtered_df[:, filtered_df.nunique(axis=0) == len(filtered_df.index)].columns
|
||||
if const_cols:
|
||||
filtered_df = filtered_df.filter(filtered_df.columns.difference(const_cols))
|
||||
logger.warning(f"Removed features {const_cols} with constant values.")
|
||||
|
||||
drop_index = pd.isnull(filtered_df).any(1) # get the rows that have NaNs,
|
||||
drop_index = drop_index.replace(True, 1).replace(False, 0) # pep8 requirement.
|
||||
if (training_filter):
|
||||
# we don't care about total row number (total no. datapoints) in training, we only care
|
||||
# about removing any row with NaNs
|
||||
# if labels has multiple columns (user wants to train multiple modelEs), we detect here
|
||||
labels = unfiltered_dataframe.filter(label_list, axis=1)
|
||||
labels = unfiltered_df.filter(label_list, axis=1)
|
||||
drop_index_labels = pd.isnull(labels).any(1)
|
||||
drop_index_labels = drop_index_labels.replace(True, 1).replace(False, 0)
|
||||
dates = unfiltered_dataframe['date']
|
||||
filtered_dataframe = filtered_dataframe[
|
||||
dates = unfiltered_df['date']
|
||||
filtered_df = filtered_df[
|
||||
(drop_index == 0) & (drop_index_labels == 0)
|
||||
] # dropping values
|
||||
labels = labels[
|
||||
@ -229,13 +234,13 @@ class FreqaiDataKitchen:
|
||||
(drop_index == 0) & (drop_index_labels == 0)
|
||||
]
|
||||
logger.info(
|
||||
f"dropped {len(unfiltered_dataframe) - len(filtered_dataframe)} training points"
|
||||
f" due to NaNs in populated dataset {len(unfiltered_dataframe)}."
|
||||
f"dropped {len(unfiltered_df) - len(filtered_df)} training points"
|
||||
f" due to NaNs in populated dataset {len(unfiltered_df)}."
|
||||
)
|
||||
if (1 - len(filtered_dataframe) / len(unfiltered_dataframe)) > 0.1 and self.live:
|
||||
worst_indicator = str(unfiltered_dataframe.count().idxmin())
|
||||
if (1 - len(filtered_df) / len(unfiltered_df)) > 0.1 and self.live:
|
||||
worst_indicator = str(unfiltered_df.count().idxmin())
|
||||
logger.warning(
|
||||
f" {(1 - len(filtered_dataframe)/len(unfiltered_dataframe)) * 100:.0f} percent "
|
||||
f" {(1 - len(filtered_df)/len(unfiltered_df)) * 100:.0f} percent "
|
||||
" of training data dropped due to NaNs, model may perform inconsistent "
|
||||
f"with expectations. Verify {worst_indicator}"
|
||||
)
|
||||
@ -244,9 +249,9 @@ class FreqaiDataKitchen:
|
||||
else:
|
||||
# we are backtesting so we need to preserve row number to send back to strategy,
|
||||
# so now we use do_predict to avoid any prediction based on a NaN
|
||||
drop_index = pd.isnull(filtered_dataframe).any(1)
|
||||
drop_index = pd.isnull(filtered_df).any(1)
|
||||
self.data["filter_drop_index_prediction"] = drop_index
|
||||
filtered_dataframe.fillna(0, inplace=True)
|
||||
filtered_df.fillna(0, inplace=True)
|
||||
# replacing all NaNs with zeros to avoid issues in 'prediction', but any prediction
|
||||
# that was based on a single NaN is ultimately protected from buys with do_predict
|
||||
drop_index = ~drop_index
|
||||
@ -255,11 +260,11 @@ class FreqaiDataKitchen:
|
||||
logger.info(
|
||||
"dropped %s of %s prediction data points due to NaNs.",
|
||||
len(self.do_predict) - self.do_predict.sum(),
|
||||
len(filtered_dataframe),
|
||||
len(filtered_df),
|
||||
)
|
||||
labels = []
|
||||
|
||||
return filtered_dataframe, labels
|
||||
return filtered_df, labels
|
||||
|
||||
def build_data_dictionary(
|
||||
self,
|
||||
|
Loading…
Reference in New Issue
Block a user