Check for constant columns
This commit is contained in:
parent
c08c82bc40
commit
047ded1baa
@ -184,7 +184,7 @@ class FreqaiDataKitchen:
|
|||||||
|
|
||||||
def filter_features(
|
def filter_features(
|
||||||
self,
|
self,
|
||||||
unfiltered_dataframe: DataFrame,
|
unfiltered_df: DataFrame,
|
||||||
training_feature_list: List,
|
training_feature_list: List,
|
||||||
label_list: List = list(),
|
label_list: List = list(),
|
||||||
training_filter: bool = True,
|
training_filter: bool = True,
|
||||||
@ -195,31 +195,36 @@ class FreqaiDataKitchen:
|
|||||||
0s in the prediction dataset. However, prediction dataset do_predict will reflect any
|
0s in the prediction dataset. However, prediction dataset do_predict will reflect any
|
||||||
row that had a NaN and will shield user from that prediction.
|
row that had a NaN and will shield user from that prediction.
|
||||||
:params:
|
:params:
|
||||||
:unfiltered_dataframe: the full dataframe for the present training period
|
:unfiltered_df: the full dataframe for the present training period
|
||||||
:training_feature_list: list, the training feature list constructed by
|
:training_feature_list: list, the training feature list constructed by
|
||||||
self.build_feature_list() according to user specified parameters in the configuration file.
|
self.build_feature_list() according to user specified parameters in the configuration file.
|
||||||
:labels: the labels for the dataset
|
:labels: the labels for the dataset
|
||||||
:training_filter: boolean which lets the function know if it is training data or
|
:training_filter: boolean which lets the function know if it is training data or
|
||||||
prediction data to be filtered.
|
prediction data to be filtered.
|
||||||
:returns:
|
:returns:
|
||||||
:filtered_dataframe: dataframe cleaned of NaNs and only containing the user
|
:filtered_df: dataframe cleaned of NaNs and only containing the user
|
||||||
requested feature set.
|
requested feature set.
|
||||||
:labels: labels cleaned of NaNs.
|
:labels: labels cleaned of NaNs.
|
||||||
"""
|
"""
|
||||||
filtered_dataframe = unfiltered_dataframe.filter(training_feature_list, axis=1)
|
filtered_df = unfiltered_df.filter(training_feature_list, axis=1)
|
||||||
filtered_dataframe = filtered_dataframe.replace([np.inf, -np.inf], np.nan)
|
filtered_df = filtered_df.replace([np.inf, -np.inf], np.nan)
|
||||||
|
|
||||||
drop_index = pd.isnull(filtered_dataframe).any(1) # get the rows that have NaNs,
|
const_cols = filtered_df[:, filtered_df.nunique(axis=0) == len(filtered_df.index)].columns
|
||||||
|
if const_cols:
|
||||||
|
filtered_df = filtered_df.filter(filtered_df.columns.difference(const_cols))
|
||||||
|
logger.warning(f"Removed features {const_cols} with constant values.")
|
||||||
|
|
||||||
|
drop_index = pd.isnull(filtered_df).any(1) # get the rows that have NaNs,
|
||||||
drop_index = drop_index.replace(True, 1).replace(False, 0) # pep8 requirement.
|
drop_index = drop_index.replace(True, 1).replace(False, 0) # pep8 requirement.
|
||||||
if (training_filter):
|
if (training_filter):
|
||||||
# we don't care about total row number (total no. datapoints) in training, we only care
|
# we don't care about total row number (total no. datapoints) in training, we only care
|
||||||
# about removing any row with NaNs
|
# about removing any row with NaNs
|
||||||
# if labels has multiple columns (user wants to train multiple modelEs), we detect here
|
# if labels has multiple columns (user wants to train multiple modelEs), we detect here
|
||||||
labels = unfiltered_dataframe.filter(label_list, axis=1)
|
labels = unfiltered_df.filter(label_list, axis=1)
|
||||||
drop_index_labels = pd.isnull(labels).any(1)
|
drop_index_labels = pd.isnull(labels).any(1)
|
||||||
drop_index_labels = drop_index_labels.replace(True, 1).replace(False, 0)
|
drop_index_labels = drop_index_labels.replace(True, 1).replace(False, 0)
|
||||||
dates = unfiltered_dataframe['date']
|
dates = unfiltered_df['date']
|
||||||
filtered_dataframe = filtered_dataframe[
|
filtered_df = filtered_df[
|
||||||
(drop_index == 0) & (drop_index_labels == 0)
|
(drop_index == 0) & (drop_index_labels == 0)
|
||||||
] # dropping values
|
] # dropping values
|
||||||
labels = labels[
|
labels = labels[
|
||||||
@ -229,13 +234,13 @@ class FreqaiDataKitchen:
|
|||||||
(drop_index == 0) & (drop_index_labels == 0)
|
(drop_index == 0) & (drop_index_labels == 0)
|
||||||
]
|
]
|
||||||
logger.info(
|
logger.info(
|
||||||
f"dropped {len(unfiltered_dataframe) - len(filtered_dataframe)} training points"
|
f"dropped {len(unfiltered_df) - len(filtered_df)} training points"
|
||||||
f" due to NaNs in populated dataset {len(unfiltered_dataframe)}."
|
f" due to NaNs in populated dataset {len(unfiltered_df)}."
|
||||||
)
|
)
|
||||||
if (1 - len(filtered_dataframe) / len(unfiltered_dataframe)) > 0.1 and self.live:
|
if (1 - len(filtered_df) / len(unfiltered_df)) > 0.1 and self.live:
|
||||||
worst_indicator = str(unfiltered_dataframe.count().idxmin())
|
worst_indicator = str(unfiltered_df.count().idxmin())
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f" {(1 - len(filtered_dataframe)/len(unfiltered_dataframe)) * 100:.0f} percent "
|
f" {(1 - len(filtered_df)/len(unfiltered_df)) * 100:.0f} percent "
|
||||||
" of training data dropped due to NaNs, model may perform inconsistent "
|
" of training data dropped due to NaNs, model may perform inconsistent "
|
||||||
f"with expectations. Verify {worst_indicator}"
|
f"with expectations. Verify {worst_indicator}"
|
||||||
)
|
)
|
||||||
@ -244,9 +249,9 @@ class FreqaiDataKitchen:
|
|||||||
else:
|
else:
|
||||||
# we are backtesting so we need to preserve row number to send back to strategy,
|
# we are backtesting so we need to preserve row number to send back to strategy,
|
||||||
# so now we use do_predict to avoid any prediction based on a NaN
|
# so now we use do_predict to avoid any prediction based on a NaN
|
||||||
drop_index = pd.isnull(filtered_dataframe).any(1)
|
drop_index = pd.isnull(filtered_df).any(1)
|
||||||
self.data["filter_drop_index_prediction"] = drop_index
|
self.data["filter_drop_index_prediction"] = drop_index
|
||||||
filtered_dataframe.fillna(0, inplace=True)
|
filtered_df.fillna(0, inplace=True)
|
||||||
# replacing all NaNs with zeros to avoid issues in 'prediction', but any prediction
|
# replacing all NaNs with zeros to avoid issues in 'prediction', but any prediction
|
||||||
# that was based on a single NaN is ultimately protected from buys with do_predict
|
# that was based on a single NaN is ultimately protected from buys with do_predict
|
||||||
drop_index = ~drop_index
|
drop_index = ~drop_index
|
||||||
@ -255,11 +260,11 @@ class FreqaiDataKitchen:
|
|||||||
logger.info(
|
logger.info(
|
||||||
"dropped %s of %s prediction data points due to NaNs.",
|
"dropped %s of %s prediction data points due to NaNs.",
|
||||||
len(self.do_predict) - self.do_predict.sum(),
|
len(self.do_predict) - self.do_predict.sum(),
|
||||||
len(filtered_dataframe),
|
len(filtered_df),
|
||||||
)
|
)
|
||||||
labels = []
|
labels = []
|
||||||
|
|
||||||
return filtered_dataframe, labels
|
return filtered_df, labels
|
||||||
|
|
||||||
def build_data_dictionary(
|
def build_data_dictionary(
|
||||||
self,
|
self,
|
||||||
|
Loading…
Reference in New Issue
Block a user