# Source code for skrobot.tasks.feature_selection_cross_validation_task

import os, copy

import pandas as pd

import numpy as np

from sklearn.feature_selection import RFECV

from sklearn.pipeline import Pipeline

from . import BaseCrossValidationTask

class FeatureSelectionCrossValidationTask(BaseCrossValidationTask):
    """
    The :class:`.FeatureSelectionCrossValidationTask` class can be used to perform feature selection with Recursive Feature Elimination using a scikit-learn estimator on some data.

    A scikit-learn preprocessor can be used on the input train data set before feature selection runs.

    It can support both stratified k-fold cross-validation as well as cross-validation with user-defined folds.

    By default, stratified k-fold cross-validation is used with the default parameters of :meth:`.stratified_folds` method.
    """

    def __init__(self, estimator, train_data_set, estimator_params=None, field_delimiter=',', preprocessor=None, preprocessor_params=None, min_features_to_select=1, scoring='f1', feature_columns='all', id_column='id', label_column='label', random_seed=42, verbose=3, n_jobs=1):
        """
        This is the constructor method and can be used to create a new object instance of :class:`.FeatureSelectionCrossValidationTask` class.

        :param estimator: An estimator (e.g., LogisticRegression). It needs to provide feature importances through either a ``coef_`` or a ``feature_importances_`` attribute.
        :type estimator: scikit-learn estimator

        :param train_data_set: The input train data set. It can be either a URL, a disk file path (string or path-like object) or a pandas DataFrame.
        :type train_data_set: {str, os.PathLike, pandas DataFrame}

        :param estimator_params: The parameters to override in the provided estimator. It defaults to None.
        :type estimator_params: dict, optional

        :param field_delimiter: The separation delimiter (comma for CSV, tab for TSV, etc.) used in the input train data set file. It defaults to ','.
        :type field_delimiter: str, optional

        :param preprocessor: The preprocessor you want to run on the input train data set before feature selection. You can set for example a scikit-learn ColumnTransformer, OneHotEncoder, etc. It defaults to None.
        :type preprocessor: scikit-learn preprocessor, optional

        :param preprocessor_params: The parameters to override in the provided preprocessor. It defaults to None.
        :type preprocessor_params: dict, optional

        :param min_features_to_select: The minimum number of features to be selected. This number of features will always be scored. It defaults to 1.
        :type min_features_to_select: int, optional

        :param scoring: A single scikit-learn scorer string (e.g., 'f1') or a callable that is built with scikit-learn ``make_scorer``. Note that when using custom scorers, each scorer should return a single value. It defaults to 'f1'.
        :type scoring: {str, callable}, optional

        :param feature_columns: Either 'all' to use from the input train data set file all the columns or a list of column names to select specific columns. It defaults to 'all'.
        :type feature_columns: {str, list}, optional

        :param id_column: The name of the column in the input train data set file containing the sample IDs. It defaults to 'id'.
        :type id_column: str, optional

        :param label_column: The name of the column in the input train data set file containing the ground truth labels. It defaults to 'label'.
        :type label_column: str, optional

        :param random_seed: The random seed used in the random number generator. It can be used to reproduce the output. It defaults to 42.
        :type random_seed: int, optional

        :param verbose: Controls the verbosity of output. The higher, the more messages. It defaults to 3.
        :type verbose: int, optional

        :param n_jobs: Number of jobs to run in parallel. -1 means using all processors. It defaults to 1.
        :type n_jobs: int, optional
        """

        # NOTE(review): locals() is evaluated here, so this call must remain the
        # first statement -- any local defined before it would leak into the
        # mapping handed to the base class. Presumably the base class turns that
        # mapping into the instance attributes read by run() (self.estimator,
        # self.train_data_set, ...); confirm against BaseCrossValidationTask.
        super(FeatureSelectionCrossValidationTask, self).__init__(FeatureSelectionCrossValidationTask.__name__, locals())

    def run(self, output_directory):
        """
        Run the task.

        The selected features are returned as a result and also stored in a *features_selected.txt* file under the output directory path.

        The loaded data is kept on the instance as ``train_data_set_data_frame`` and NumPy's global random generator is seeded with ``random_seed`` before the selection is fitted.

        :param output_directory: The output directory path under which task-related generated files are stored.
        :type output_directory: str

        :return: The task's result. Specifically, the selected features, which can be either column names from the input train data set or column indexes from the preprocessed data set, depending on whether a ``preprocessor`` was used or not.
        :rtype: list
        """

        # Strings (file paths / URLs) and path-like objects are read from disk or
        # network; anything else is treated as an in-memory DataFrame.
        if isinstance(self.train_data_set, (str, os.PathLike)):
            self.train_data_set_data_frame = pd.read_csv(self.train_data_set, delimiter=self.field_delimiter)
        else:
            # Work on a copy so the caller's DataFrame is never mutated, and
            # normalise its index to 0..n-1 (a freshly read CSV already has one).
            self.train_data_set_data_frame = self.train_data_set.copy()
            self.train_data_set_data_frame.reset_index(inplace=True, drop=True)

        y = self.train_data_set_data_frame[self.label_column]

        # The label and sample-ID columns are never candidate features.
        X = self.train_data_set_data_frame.drop(columns=[self.label_column, self.id_column])

        if self.feature_columns != 'all':
            X = X[self.feature_columns]

        np.random.seed(self.random_seed)

        # Recursive feature elimination, dropping one feature per iteration.
        model = RFECV(self._build_estimator(),
                      min_features_to_select=self.min_features_to_select,
                      step=1,
                      cv=self._build_cv_splits(X, y),
                      scoring=self.scoring,
                      verbose=self.verbose,
                      n_jobs=self.n_jobs)

        if self.preprocessor:
            model = Pipeline(steps=[('preprocessor', self._build_preprocessor()), ('selection', model)])

        model.fit(X, y)

        if self.preprocessor:
            # The preprocessor output is not mapped back to input column names
            # here, so the positional indexes of the retained columns are reported.
            features_selected = np.nonzero(model.named_steps.selection.support_)[0].tolist()
        else:
            features_selected = X.columns[model.support_].values.tolist()

        # One selected feature per line, no trailing newline.
        with open(os.path.join(output_directory, 'features_selected.txt'), "w") as f:
            f.write('\n'.join(map(str, features_selected)))

        return features_selected

    def _build_preprocessor(self):
        """Return a deep copy of the configured preprocessor with ``preprocessor_params`` applied, leaving the user's object untouched."""
        preprocessor = copy.deepcopy(self.preprocessor)

        if self.preprocessor_params:
            preprocessor.set_params(**self.preprocessor_params)

        return preprocessor

    def _build_estimator(self):
        """Return a deep copy of the configured estimator with ``estimator_params`` applied, leaving the user's object untouched."""
        estimator = copy.deepcopy(self.estimator)

        if self.estimator_params:
            estimator.set_params(**self.estimator_params)

        return estimator