bugfix skip test split when empty

Author: Yinon Polak, 2023-03-28 14:40:23 +03:00
parent 8903ba5d89
commit 026b6a39a9
5 changed files with 28 additions and 15 deletions


@@ -97,7 +97,7 @@ class BasePyTorchClassifier(BasePyTorchModel):
         """
         target_column_name = dk.label_list[0]
-        for split in ["train", "test"]:
+        for split in self.splits:
             label_df = data_dictionary[f"{split}_labels"]
             self.assert_valid_class_names(label_df[target_column_name], class_names)
             label_df[target_column_name] = list(


@@ -22,6 +22,8 @@ class BasePyTorchModel(IFreqaiModel):
         super().__init__(config=kwargs["config"])
         self.dd.model_type = "pytorch"
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        test_size = self.freqai_info.get('data_split_parameters', {}).get('test_size')
+        self.splits = ["train", "test"] if test_size != 0 else ["train"]

     def train(
         self, unfiltered_df: DataFrame, pair: str, dk: FreqaiDataKitchen, **kwargs
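For reference, a minimal standalone sketch of how the new splits list resolves from the config; the freqai_info dict below is a hypothetical stand-in for the user configuration:

    # Hypothetical config excerpt: a test_size of 0 means no test data is produced.
    freqai_info = {"data_split_parameters": {"test_size": 0}}

    test_size = freqai_info.get("data_split_parameters", {}).get("test_size")
    splits = ["train", "test"] if test_size != 0 else ["train"]
    # splits == ["train"]; with an unset or non-zero test_size it is ["train", "test"]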


@@ -76,5 +76,5 @@ class PyTorchMLPClassifier(BasePyTorchClassifier):
             squeeze_target_tensor=True,
             **self.trainer_kwargs,
         )
-        trainer.fit(data_dictionary)
+        trainer.fit(data_dictionary, self.splits)
         return trainer


@@ -72,5 +72,5 @@ class PyTorchMLPRegressor(BasePyTorchRegressor):
             target_tensor_type=torch.float,
             **self.trainer_kwargs,
         )
-        trainer.fit(data_dictionary)
+        trainer.fit(data_dictionary, self.splits)
         return trainer


@@ -1,7 +1,7 @@
 import logging
 import math
 from pathlib import Path
-from typing import Any, Dict, Optional
+from typing import Any, Dict, List, Optional

 import pandas as pd
 import torch
@@ -43,7 +43,6 @@ class PyTorchModelTrainer:
             self.optimizer.step(). used to calculate n_epochs.
         :param batch_size: The size of the batches to use during training.
         :param max_n_eval_batches: The maximum number batches to use for evaluation.
         """
         self.model = model
         self.optimizer = optimizer
@@ -58,21 +57,27 @@ class PyTorchModelTrainer:
         if init_model:
             self.load_from_checkpoint(init_model)

-    def fit(self, data_dictionary: Dict[str, pd.DataFrame]):
+    def fit(self, data_dictionary: Dict[str, pd.DataFrame], splits: List[str]):
         """
+        :param data_dictionary: the dictionary constructed by DataHandler to hold
+        all the training and test data/labels.
+        :param splits: splits to use in training, splits must contain "train",
+        optional "test" could be added by setting freqai.data_split_parameters.test_size > 0
+        in the config file.
+
         - Calculates the predicted output for the batch using the PyTorch model.
         - Calculates the loss between the predicted and actual output using a loss function.
         - Computes the gradients of the loss with respect to the model's parameters using
           backpropagation.
         - Updates the model's parameters using an optimizer.
         """
-        data_loaders_dictionary = self.create_data_loaders_dictionary(data_dictionary)
+        data_loaders_dictionary = self.create_data_loaders_dictionary(data_dictionary, splits)
         epochs = self.calc_n_epochs(
             n_obs=len(data_dictionary["train_features"]),
             batch_size=self.batch_size,
             n_iters=self.max_iters
         )
-        for epoch in range(epochs):
+        for epoch in range(1, epochs+1):
             # training
             losses = []
             for i, batch_data in enumerate(data_loaders_dictionary["train"]):
@@ -87,13 +92,18 @@ class PyTorchModelTrainer:
                 self.optimizer.step()
                 losses.append(loss.item())
             train_loss = sum(losses) / len(losses)
+            log_message = f"epoch {epoch}/{epochs}: train loss {train_loss:.4f}"

             # evaluation
-            test_loss = self.estimate_loss(data_loaders_dictionary, self.max_n_eval_batches, "test")
-            logger.info(
-                f"epoch {epoch}/{epochs}:"
-                f" train loss {train_loss:.4f} ; test loss {test_loss:.4f}"
-            )
+            if "test" in splits:
+                test_loss = self.estimate_loss(
+                    data_loaders_dictionary,
+                    self.max_n_eval_batches,
+                    "test"
+                )
+                log_message += f" ; test loss {test_loss:.4f}"
+
+            logger.info(log_message)

     @torch.no_grad()
     def estimate_loss(
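To illustrate the new logging path, a small sketch of how the per-epoch message is assembled with and without a test split (the numbers are made up):

    epoch, epochs, train_loss, test_loss = 3, 10, 0.0123, 0.0150
    splits = ["train"]  # or ["train", "test"]

    log_message = f"epoch {epoch}/{epochs}: train loss {train_loss:.4f}"
    if "test" in splits:
        log_message += f" ; test loss {test_loss:.4f}"
    print(log_message)
    # ["train"]          -> epoch 3/10: train loss 0.0123
    # ["train", "test"]  -> epoch 3/10: train loss 0.0123 ; test loss 0.0150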
@@ -122,13 +132,14 @@ class PyTorchModelTrainer:

     def create_data_loaders_dictionary(
             self,
-            data_dictionary: Dict[str, pd.DataFrame]
+            data_dictionary: Dict[str, pd.DataFrame],
+            splits: List[str]
     ) -> Dict[str, DataLoader]:
         """
         Converts the input data to PyTorch tensors using a data loader.
         """
         data_loader_dictionary = {}
-        for split in ["train", "test"]:
+        for split in splits:
             x = torch.from_numpy(data_dictionary[f"{split}_features"].values).float()
             y = torch.from_numpy(data_dictionary[f"{split}_labels"].values)\
                 .to(self.target_tensor_type)
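The same splits argument controls which data loaders get built, so an empty test split is simply never touched. A minimal standalone sketch of that effect with dummy data, independent of the trainer class and its batch_size handling:

    import numpy as np
    import pandas as pd
    import torch
    from torch.utils.data import DataLoader, TensorDataset

    # Hypothetical data dictionary: only the train split is populated, as when test_size == 0.
    data_dictionary = {
        "train_features": pd.DataFrame(np.random.rand(64, 4)),
        "train_labels": pd.DataFrame(np.random.rand(64, 1)),
    }
    splits = ["train"]

    data_loader_dictionary = {}
    for split in splits:
        x = torch.from_numpy(data_dictionary[f"{split}_features"].values).float()
        y = torch.from_numpy(data_dictionary[f"{split}_labels"].values).float()
        data_loader_dictionary[split] = DataLoader(TensorDataset(x, y), batch_size=16)

    # No "test" key is created, so nothing ever indexes into empty test data.
    print(list(data_loader_dictionary))  # ['train']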