Source code for skrobot.tasks.train_task

import os, joblib, copy

import pandas as pd

import numpy as np

from . import BaseTask

class TrainTask(BaseTask):
    """Task that fits a scikit-learn estimator/pipeline on a train data set."""

    def __init__(self, estimator, train_data_set, estimator_params=None, field_delimiter=',', feature_columns='all', id_column='id', label_column='label', random_seed=42):
        """
        Create a new :class:`.TrainTask` instance.

        :param estimator: An estimator (e.g., LogisticRegression) or a pipeline
            ending with an estimator.
        :type estimator: scikit-learn {estimator, pipeline}

        :param train_data_set: The input train data set: a URL, a disk file
            path, or a pandas DataFrame.
        :type train_data_set: {str, pandas DataFrame}

        :param estimator_params: Parameters to override in the provided
            estimator/pipeline. Defaults to None.
        :type estimator_params: dict, optional

        :param field_delimiter: Separation delimiter (comma for CSV, tab for
            TSV, etc.) of the input train data set file. Defaults to ','.
        :type field_delimiter: str, optional

        :param feature_columns: Either 'all' to use every column of the input
            train data set, or a list of column names to select specific
            columns. Defaults to 'all'.
        :type feature_columns: {str, list}, optional

        :param id_column: Name of the column holding the sample IDs.
            Defaults to 'id'.
        :type id_column: str, optional

        :param label_column: Name of the column holding the ground truth
            labels. Defaults to 'label'.
        :type label_column: str, optional

        :param random_seed: Seed for the random number generator, for
            reproducibility. Defaults to 42.
        :type random_seed: int, optional
        """
        # Snapshot the constructor arguments; BaseTask exposes them as
        # instance attributes (self.estimator, self.train_data_set, ...).
        arguments = copy.deepcopy(locals())

        super().__init__(TrainTask.__name__, arguments)

    def run(self, output_directory):
        """
        Run the task.

        The fitted estimator/pipeline is returned as a result and also stored
        in a *trained_model.pkl* pickle file under the output directory path.

        :param output_directory: The output directory path under which
            task-related generated files are stored.
        :type output_directory: str

        :return: The task's result. Specifically, the fitted
            estimator/pipeline.
        :rtype: dict
        """
        # A string is treated as a URL/path; anything else as a DataFrame.
        if isinstance(self.train_data_set, str):
            frame = pd.read_csv(self.train_data_set, delimiter=self.field_delimiter)
        else:
            frame = self.train_data_set.copy()
            frame.reset_index(inplace=True, drop=True)

        y = frame[self.label_column]

        # Features: everything except the ID and label columns, optionally
        # narrowed to an explicit column selection.
        X = frame.drop(columns=[self.label_column, self.id_column])
        if self.feature_columns != 'all':
            X = X[self.feature_columns]

        np.random.seed(self.random_seed)

        model = self._build_estimator()
        model.fit(X, y)

        joblib.dump(model, os.path.join(output_directory, 'trained_model.pkl'))

        return { 'estimator': model }

    def _build_estimator(self):
        # Work on a deep copy so the caller-supplied estimator is never
        # mutated; apply any parameter overrides on the copy.
        model = copy.deepcopy(self.estimator)
        if self.estimator_params:
            model.set_params(**self.estimator_params)
        return model