feat/336 : check if pandas df and drop_last default to True

dreamquark-ai · Nov 12, 2021 · a0fd306 · a0fd306
1 parent 233f74e
commit a0fd306
Show file tree

Hide file tree

Showing 4 changed files with 24 additions and 10 deletions.
diff --git a/pytorch_tabnet/abstract_model.py b/pytorch_tabnet/abstract_model.py
@@ -13,6 +13,7 @@
  create_dataloaders,
  define_device,
  ComplexEncoder,
+ check_input
 )
 from pytorch_tabnet.callbacks import (
  CallbackContainer,
@@ -22,7 +23,7 @@
 )
 from pytorch_tabnet.metrics import MetricContainer, check_metrics
 from sklearn.base import BaseEstimator
-from sklearn.utils import check_array
+
 from torch.utils.data import DataLoader
 import io
 import json
@@ -115,7 +116,7 @@ def fit(
  batch_size=1024,
  virtual_batch_size=128,
  num_workers=0,
- drop_last=False,
+ drop_last=True,
  callbacks=None,
  pin_memory=True,
  from_unsupervised=None,
@@ -182,7 +183,7 @@ def fit(
  else:
  self.loss_fn = loss_fn
 
- check_array(X_train)
+ check_input(X_train)
 
  self.update_fit_params(
  X_train,

diff --git a/pytorch_tabnet/pretraining.py b/pytorch_tabnet/pretraining.py
@@ -1,12 +1,12 @@
 import torch
 import numpy as np
-from sklearn.utils import check_array
 from torch.utils.data import DataLoader
 from pytorch_tabnet import tab_network
 from pytorch_tabnet.utils import (
  create_explain_matrix,
  filter_weights,
- PredictDataset
+ PredictDataset,
+ check_input
 )
 from torch.nn.utils import clip_grad_norm_
 from pytorch_tabnet.pretraining_utils import (
@@ -55,7 +55,7 @@ def fit(
  batch_size=1024,
  virtual_batch_size=128,
  num_workers=0,
- drop_last=False,
+ drop_last=True,
  callbacks=None,
  pin_memory=True,
  ):
@@ -118,7 +118,7 @@ def fit(
  else:
  self.loss_fn = loss_fn
 
- check_array(X_train)
+ check_input(X_train)
 
  self.update_fit_params(
  weights,

diff --git a/pytorch_tabnet/pretraining_utils.py b/pytorch_tabnet/pretraining_utils.py
@@ -2,8 +2,8 @@
 from pytorch_tabnet.utils import (
  create_sampler,
  PredictDataset,
+ check_input
 )
-from sklearn.utils import check_array
 
 
 def create_dataloaders(
@@ -93,7 +93,7 @@ def validate_eval_set(eval_set, eval_name, X_train):
  ), "eval_set and eval_name have not the same length"
 
  for set_nb, X in enumerate(eval_set):
- check_array(X)
+ check_input(X)
  msg = (
  f"Number of columns is different between eval set {set_nb}"
  + f"({X.shape[1]}) and X_train ({X_train.shape[1]})"

diff --git a/pytorch_tabnet/utils.py b/pytorch_tabnet/utils.py
@@ -5,6 +5,7 @@
 import scipy
 import json
 from sklearn.utils import check_array
+import pandas as pd
 
 
 class TorchDataset(Dataset):
@@ -271,7 +272,7 @@ def validate_eval_set(eval_set, eval_name, X_train, y_train):
  len(elem) == 2 for elem in eval_set
  ), "Each tuple of eval_set need to have two elements"
  for name, (X, y) in zip(eval_name, eval_set):
- check_array(X)
+ check_input(X)
  msg = (
  f"Dimension mismatch between X_{name} "
  + f"{X.shape} and X_train {X_train.shape}"
@@ -337,3 +338,15 @@ def default(self, obj):
  return int(obj)
  # Let the base class default method raise the TypeError
  return json.JSONEncoder.default(self, obj)
+
+
def check_input(X):
    """
    Validate an input array before it is used for fitting or evaluation.

    Pandas objects are rejected up front with an explicit message; any
    other input is handed to scikit-learn's ``check_array`` for the usual
    array validation.

    Parameters
    ----------
    X : array-like
        Input data to validate.

    Raises
    ------
    ValueError
        If ``X`` is a ``pandas.DataFrame`` or ``pandas.Series``.
        ``check_array`` may raise its own errors for other invalid inputs.
    """
    if isinstance(X, (pd.DataFrame, pd.Series)):
        err_message = "Pandas DataFrame are not supported: apply X.values when calling fit"
        # Instantiate the exception: `raise(ValueError, msg)` would try to
        # raise a tuple, which fails with an unrelated TypeError on Python 3
        # and hides the intended message from the user.
        raise ValueError(err_message)
    check_array(X)