Source code for skrobot.tasks.prediction_task

import os, copy

import pandas as pd

from . import BaseTask

class PredictionTask(BaseTask):
    """
    The :class:`.PredictionTask` class can be used to predict new data using a scikit-learn estimator/pipeline.
    """

    def __init__(self, estimator, data_set, field_delimiter=',', feature_columns='all', id_column='id', prediction_column='prediction', threshold=0.5):
        """
        This is the constructor method and can be used to create a new object instance of :class:`.PredictionTask` class.

        :param estimator: It can be either an estimator (e.g., LogisticRegression) or a pipeline ending with an estimator. The estimator needs to be able to predict probabilities through a ``predict_proba`` method.
        :type estimator: scikit-learn {estimator, pipeline}

        :param data_set: The input data set. It can be either a URL, a disk file path or a pandas DataFrame.
        :type data_set: {str, pandas DataFrame}

        :param field_delimiter: The separation delimiter (comma for CSV, tab for TSV, etc.) used in the input data set file. It defaults to ','.
        :type field_delimiter: str, optional

        :param feature_columns: Either 'all' to use from the input data set file all the columns or a list of column names to select specific columns. It defaults to 'all'.
        :type feature_columns: {str, list}, optional

        :param id_column: The name of the column in the input data set file containing the sample IDs. It defaults to 'id'.
        :type id_column: str, optional

        :param prediction_column: The name of the column for the predicted binary class labels. It defaults to 'prediction'.
        :type prediction_column: str, optional

        :param threshold: The threshold to use for converting the predicted probability into a binary class label. A probability greater than or equal to the threshold yields label 1. It defaults to 0.5.
        :type threshold: float, optional
        """

        # Must stay the first statement: locals() has to contain only the
        # constructor arguments (plus ``self``) when it is snapshotted.
        arguments = copy.deepcopy(locals())

        # NOTE: keep the explicit two-argument super(); the zero-argument form
        # would add a ``__class__`` entry to the locals() snapshot above.
        super(PredictionTask, self).__init__(PredictionTask.__name__, arguments)

    def run(self, output_directory):
        """
        Run the task.

        The predictions are returned as a result and also stored in a *predictions.csv* CSV file under the output directory path.

        :param output_directory: The output directory path under which task-related generated files are stored.
        :type output_directory: str

        :return: The task's result. Specifically, the predictions for the input data set, containing the sample IDs, the predicted binary class labels, and the predicted probabilities for the positive class.
        :rtype: pandas DataFrame
        """

        # NOTE(review): the ``self.*`` attributes read below are not assigned in
        # this class; presumably BaseTask.__init__ exposes the captured
        # constructor arguments as attributes -- confirm against BaseTask.
        if isinstance(self.data_set, str):
            data_set_data_frame = pd.read_csv(self.data_set, delimiter=self.field_delimiter)
        else:
            # Work on a copy with a fresh 0..n-1 index so the caller's frame is
            # untouched and the result is not indexed by the caller's labels.
            data_set_data_frame = self.data_set.copy().reset_index(drop=True)

        ids = data_set_data_frame[self.id_column]
        X = data_set_data_frame.drop(columns=[self.id_column])

        # Explicit string check: comparing an array-like column selection to
        # 'all' with ``!=`` is element-wise and has no single truth value.
        use_all_columns = isinstance(self.feature_columns, str) and self.feature_columns == 'all'
        if not use_all_columns:
            X = X[self.feature_columns]

        # Run inference once and derive the labels from the same probabilities
        # that are reported, instead of calling predict_proba twice.
        probabilities = self._calculate_probability(self.estimator, X)

        predictions = pd.DataFrame({
            self.id_column: ids,
            self.prediction_column: self._apply_threshold(probabilities, self.threshold),
            "probability": probabilities,
        })

        predictions.to_csv(os.path.join(output_directory, 'predictions.csv'), index=False)

        return predictions

    @staticmethod
    def _apply_threshold(probabilities, threshold):
        """
        Convert probabilities into integer 0/1 labels (1 when ``probability >= threshold``).
        """
        return (probabilities >= threshold).astype(int)

    def _calculate_y_hat_for_threshold(self, estimator, X, threshold):
        """
        Predict integer 0/1 labels for ``X`` by thresholding the second column of ``estimator.predict_proba(X)``.
        """
        return self._apply_threshold(self._calculate_probability(estimator, X), threshold)

    def _calculate_probability(self, estimator, X):
        """
        Return the second column (class index 1) of ``estimator.predict_proba(X)``.
        """
        y_proba = estimator.predict_proba(X)

        return y_proba[:, 1]