# Source code for skrobot.tasks.train_task

import os, joblib, copy

import pandas as pd

import numpy as np

from . import BaseTask

class TrainTask(BaseTask):
  """
  The :class:`.TrainTask` class can be used to fit a scikit-learn estimator/pipeline on train data.
  """

  def __init__(self, estimator, train_data_set, estimator_params=None, field_delimiter=',', feature_columns='all', id_column='id', label_column='label', random_seed=42):
    """
    This is the constructor method and can be used to create a new object instance of :class:`.TrainTask` class.

    :param estimator: It can be either an estimator (e.g., LogisticRegression) or a pipeline ending with an estimator.
    :type estimator: scikit-learn {estimator, pipeline}

    :param train_data_set: The input train data set. It can be either a URL, a disk file path (string or path-like object) or a pandas DataFrame.
    :type train_data_set: {str, os.PathLike, pandas DataFrame}

    :param estimator_params: The parameters to override in the provided estimator/pipeline. It defaults to None.
    :type estimator_params: dict, optional

    :param field_delimiter: The separation delimiter (comma for CSV, tab for TSV, etc.) used in the input train data set file. It defaults to ','.
    :type field_delimiter: str, optional

    :param feature_columns: Either 'all' to use from the input train data set file all the columns or a list of column names to select specific columns. It defaults to 'all'.
    :type feature_columns: {str, list}, optional

    :param id_column: The name of the column in the input train data set file containing the sample IDs. It defaults to 'id'.
    :type id_column: str, optional

    :param label_column: The name of the column in the input train data set file containing the ground truth labels. It defaults to 'label'.
    :type label_column: str, optional

    :param random_seed: The random seed used in the random number generator. It can be used to reproduce the output. It defaults to 42.
    :type random_seed: int, optional
    """
    # Snapshot the call arguments first, while locals() still holds only
    # ``self`` and the constructor parameters. The deep copy decouples the
    # task from later mutations of the caller's objects.
    # NOTE(review): run() reads self.estimator, self.train_data_set, etc.,
    # which are never assigned here, so BaseTask presumably exposes each
    # argument as an attribute -- confirm against BaseTask.
    arguments = copy.deepcopy(locals())

    super(TrainTask, self).__init__(TrainTask.__name__, arguments)

  def run(self, output_directory):
    """
    Run the task.

    The fitted estimator/pipeline is returned as a result and also stored in a *trained_model.pkl* pickle file under the output directory path.

    :param output_directory: The output directory path under which task-related generated files are stored.
    :type output_directory: str

    :return: The task's result. Specifically, a dictionary whose ``'estimator'`` key holds the fitted estimator/pipeline.
    :rtype: dict
    """

    # Strings (URLs / file paths) and path-like objects are read from
    # storage; anything else is treated as an in-memory DataFrame.
    if isinstance(self.train_data_set, (str, os.PathLike)):
      data_set_data_frame = pd.read_csv(self.train_data_set, delimiter=self.field_delimiter)
    else:
      # Work on a copy so the caller's DataFrame is left untouched.
      data_set_data_frame = self.train_data_set.copy()

      data_set_data_frame.reset_index(inplace=True, drop=True)

    y = data_set_data_frame[self.label_column]

    # Neither the labels nor the sample IDs are features.
    X = data_set_data_frame.drop(columns=[self.label_column, self.id_column])

    # Test for the 'all' sentinel explicitly: comparing an array-like column
    # selection (NumPy array, pandas Index) against a string is elementwise
    # and cannot be used as a plain boolean.
    use_all_columns = isinstance(self.feature_columns, str) and self.feature_columns == 'all'

    if not use_all_columns:
      X = X[self.feature_columns]

    # Seeds NumPy's global random number generator before fitting.
    np.random.seed(self.random_seed)

    estimator = self._build_estimator()

    estimator.fit(X, y)

    joblib.dump(estimator, os.path.join(output_directory, 'trained_model.pkl'))

    return { 'estimator': estimator }

  def _build_estimator(self):
    """
    Return a deep copy of the configured estimator/pipeline with any parameter overrides applied.

    The copy keeps the object stored on the task unfitted and unmodified.
    """
    estimator = copy.deepcopy(self.estimator)

    if self.estimator_params:
      estimator.set_params(**self.estimator_params)

    return estimator