Bugfix: skip the test split when it is empty (test_size = 0)

This commit is contained in:
Yinon Polak 2023-03-28 14:40:23 +03:00
parent 8903ba5d89
commit 026b6a39a9
5 changed files with 28 additions and 15 deletions

View File

@ -97,7 +97,7 @@ class BasePyTorchClassifier(BasePyTorchModel):
"""
target_column_name = dk.label_list[0]
for split in ["train", "test"]:
for split in self.splits:
label_df = data_dictionary[f"{split}_labels"]
self.assert_valid_class_names(label_df[target_column_name], class_names)
label_df[target_column_name] = list(

View File

@ -22,6 +22,8 @@ class BasePyTorchModel(IFreqaiModel):
super().__init__(config=kwargs["config"])
self.dd.model_type = "pytorch"
self.device = "cuda" if torch.cuda.is_available() else "cpu"
test_size = self.freqai_info.get('data_split_parameters', {}).get('test_size')
self.splits = ["train", "test"] if test_size != 0 else ["train"]
def train(
self, unfiltered_df: DataFrame, pair: str, dk: FreqaiDataKitchen, **kwargs

View File

@ -76,5 +76,5 @@ class PyTorchMLPClassifier(BasePyTorchClassifier):
squeeze_target_tensor=True,
**self.trainer_kwargs,
)
trainer.fit(data_dictionary)
trainer.fit(data_dictionary, self.splits)
return trainer

View File

@ -72,5 +72,5 @@ class PyTorchMLPRegressor(BasePyTorchRegressor):
target_tensor_type=torch.float,
**self.trainer_kwargs,
)
trainer.fit(data_dictionary)
trainer.fit(data_dictionary, self.splits)
return trainer

View File

@ -1,7 +1,7 @@
import logging
import math
from pathlib import Path
from typing import Any, Dict, Optional
from typing import Any, Dict, List, Optional
import pandas as pd
import torch
@ -43,7 +43,6 @@ class PyTorchModelTrainer:
self.optimizer.step(). used to calculate n_epochs.
:param batch_size: The size of the batches to use during training.
:param max_n_eval_batches: The maximum number of batches to use for evaluation.
"""
self.model = model
self.optimizer = optimizer
@ -58,21 +57,27 @@ class PyTorchModelTrainer:
if init_model:
self.load_from_checkpoint(init_model)
def fit(self, data_dictionary: Dict[str, pd.DataFrame]):
def fit(self, data_dictionary: Dict[str, pd.DataFrame], splits: List[str]):
"""
:param data_dictionary: the dictionary constructed by DataHandler to hold
all the training and test data/labels.
:param splits: the splits to use in training. Must contain "train"; the
optional "test" split is added by setting freqai.data_split_parameters.test_size > 0
in the config file.
- Calculates the predicted output for the batch using the PyTorch model.
- Calculates the loss between the predicted and actual output using a loss function.
- Computes the gradients of the loss with respect to the model's parameters using
backpropagation.
- Updates the model's parameters using an optimizer.
"""
data_loaders_dictionary = self.create_data_loaders_dictionary(data_dictionary)
data_loaders_dictionary = self.create_data_loaders_dictionary(data_dictionary, splits)
epochs = self.calc_n_epochs(
n_obs=len(data_dictionary["train_features"]),
batch_size=self.batch_size,
n_iters=self.max_iters
)
for epoch in range(epochs):
for epoch in range(1, epochs+1):
# training
losses = []
for i, batch_data in enumerate(data_loaders_dictionary["train"]):
@ -87,13 +92,18 @@ class PyTorchModelTrainer:
self.optimizer.step()
losses.append(loss.item())
train_loss = sum(losses) / len(losses)
log_message = f"epoch {epoch}/{epochs}: train loss {train_loss:.4f}"
# evaluation
test_loss = self.estimate_loss(data_loaders_dictionary, self.max_n_eval_batches, "test")
logger.info(
f"epoch {epoch}/{epochs}:"
f" train loss {train_loss:.4f} ; test loss {test_loss:.4f}"
if "test" in splits:
test_loss = self.estimate_loss(
data_loaders_dictionary,
self.max_n_eval_batches,
"test"
)
log_message += f" ; test loss {test_loss:.4f}"
logger.info(log_message)
@torch.no_grad()
def estimate_loss(
@ -122,13 +132,14 @@ class PyTorchModelTrainer:
def create_data_loaders_dictionary(
self,
data_dictionary: Dict[str, pd.DataFrame]
data_dictionary: Dict[str, pd.DataFrame],
splits: List[str]
) -> Dict[str, DataLoader]:
"""
Converts the input data to PyTorch tensors using a data loader.
"""
data_loader_dictionary = {}
for split in ["train", "test"]:
for split in splits:
x = torch.from_numpy(data_dictionary[f"{split}_features"].values).float()
y = torch.from_numpy(data_dictionary[f"{split}_labels"].values)\
.to(self.target_tensor_type)