# Source code for skrobot.tasks.prediction_task

import os, copy

import pandas as pd

from . import BaseTask

class PredictionTask(BaseTask):
  """
  The :class:`.PredictionTask` class can be used to predict new data using a scikit-learn estimator/pipeline.

  For every sample of the input data set the task produces the sample ID, a binary class label (obtained by thresholding the positive-class probability) and the positive-class probability itself.
  """
  def __init__ (self, estimator, data_set, field_delimiter=',', feature_columns='all', id_column='id', prediction_column='prediction', threshold=0.5):
    """
    This is the constructor method and can be used to create a new object instance of :class:`.PredictionTask` class.

    :param estimator: It can be either an estimator (e.g., LogisticRegression) or a pipeline ending with an estimator. The estimator needs to be able to predict probabilities through a ``predict_proba`` method.
    :type estimator: scikit-learn {estimator, pipeline}

    :param data_set: The input data set. It can be either a URL, a disk file path or a pandas DataFrame.
    :type data_set: {str, pandas DataFrame}

    :param field_delimiter: The separation delimiter (comma for CSV, tab for TSV, etc.) used in the input data set file. It defaults to ','.
    :type field_delimiter: str, optional

    :param feature_columns: Either 'all' to use from the input data set file all the columns or a list of column names to select specific columns. It defaults to 'all'.
    :type feature_columns: {str, list}, optional

    :param id_column: The name of the column in the input data set file containing the sample IDs. It defaults to 'id'.
    :type id_column: str, optional

    :param prediction_column: The name of the column for the predicted binary class labels. It defaults to 'prediction'.
    :type prediction_column: str, optional

    :param threshold: The threshold to use for converting the predicted probability into a binary class label. A sample is labelled 1 when its positive-class probability is greater than or equal to the threshold. It defaults to 0.5.
    :type threshold: float, optional
    """

    # This must stay the first statement: locals() is snapshotted before any
    # other local name exists, so it holds only ``self`` and the parameters.
    # The deep copy detaches the snapshot from objects the caller still owns.
    arguments = copy.deepcopy(locals())

    # NOTE(review): BaseTask is presumably what exposes these arguments as
    # instance attributes (self.estimator, self.data_set, ...) -- confirm.
    super(PredictionTask, self).__init__(PredictionTask.__name__, arguments)

  def run(self, output_directory):
    """
    Run the task.

    The predictions are returned as a result and also stored in a *predictions.csv* CSV file under the output directory path.

    :param output_directory: The output directory path under which task-related generated files are stored.
    :type output_directory: str

    :return: The task's result. Specifically, the predictions for the input data set, containing the sample IDs, the predicted binary class labels, and the predicted probabilities for the positive class.
    :rtype: pandas DataFrame
    """

    if isinstance(self.data_set, str):
      # A string is handed to pandas as-is (file path or URL).
      data_set_data_frame = pd.read_csv(self.data_set, delimiter=self.field_delimiter)
    else:
      # Work on a copy so the caller's DataFrame is never modified.
      data_set_data_frame = self.data_set.copy()

      # Use a fresh positional index instead of whatever index the caller's frame carried.
      data_set_data_frame.reset_index(inplace=True, drop=True)

    ids = data_set_data_frame[self.id_column]

    # The ID column identifies samples; it is never used as a feature.
    X = data_set_data_frame.drop(columns=[self.id_column])

    # Only the literal string 'all' means "keep every column". The isinstance
    # guard keeps the check well-defined when an array-like of column names
    # is passed, where a bare ``!=`` would compare element-wise.
    use_all_columns = isinstance(self.feature_columns, str) and self.feature_columns == 'all'

    if not use_all_columns:
      X = X[self.feature_columns]

    # Run inference once and derive both output columns from the same
    # probabilities, instead of calling predict_proba separately for each.
    probabilities = self._calculate_probability(self.estimator, X)

    labels = (probabilities >= self.threshold).astype(int)

    predictions = pd.DataFrame({ self.id_column : ids, self.prediction_column : labels, "probability" : probabilities })

    predictions.to_csv(os.path.join(output_directory, 'predictions.csv'), index=False)

    return predictions

  def _calculate_y_hat_for_threshold (self, estimator, X, threshold):
    """
    Return binary class labels (0/1 integers) for ``X``.

    A sample is labelled 1 when the second column of ``estimator.predict_proba(X)`` is greater than or equal to ``threshold``.
    """
    y_proba = estimator.predict_proba(X)

    y_hat = y_proba[:, 1] >= threshold

    return y_hat.astype(int)

  def _calculate_probability(self, estimator, X):
    """
    Return the second column of ``estimator.predict_proba(X)``, i.e. the probability that is reported as the positive-class probability.
    """
    y_proba = estimator.predict_proba(X)

    return y_proba[:, 1]