# Source code for skrobot.tasks.hyperparameters_search_cross_validation_task
import os, copy
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from . import BaseCrossValidationTask
class HyperParametersSearchCrossValidationTask(BaseCrossValidationTask):
    """
    The :class:`.HyperParametersSearchCrossValidationTask` class can be used to search the best hyperparameters of a scikit-learn estimator/pipeline on some data.

    **Cross-Validation**

    It can support both stratified k-fold cross-validation as well as cross-validation with user-defined folds.

    By default, stratified k-fold cross-validation is used with the default parameters of :meth:`.stratified_folds` method.

    **Search**

    It can support both grid search as well as random search.

    By default, grid search is used.
    """

    def __init__(self, estimator, search_params, train_data_set, estimator_params=None, field_delimiter=',', scorers=['roc_auc', 'average_precision', 'f1', 'precision', 'recall', 'accuracy'], feature_columns='all', id_column='id', label_column='label', objective_score='f1', random_seed=42, verbose=3, n_jobs=1, return_train_score=True):
        """
        This is the constructor method and can be used to create a new object instance of :class:`.HyperParametersSearchCrossValidationTask` class.

        :param estimator: It can be either an estimator (e.g., LogisticRegression) or a pipeline ending with an estimator.
        :type estimator: scikit-learn {estimator, pipeline}

        :param search_params: Dictionary with hyperparameters names as keys and lists of hyperparameter settings to try as values, or a list of such dictionaries, in which case the grids spanned by each dictionary in the list are explored. This enables searching over any sequence of hyperparameter settings.
        :type search_params: {dict, list of dictionaries}

        :param train_data_set: The input train data set. It can be either a URL, a disk file path or a pandas DataFrame.
        :type train_data_set: {str, pandas DataFrame}

        :param estimator_params: The parameters to override in the provided estimator/pipeline. It defaults to None.
        :type estimator_params: dict, optional

        :param field_delimiter: The separation delimiter (comma for CSV, tab for TSV, etc.) used in the input train data set file. It defaults to ','.
        :type field_delimiter: str, optional

        :param scorers: Multiple metrics to evaluate the predictions on the hold-out data. Either give a list of (unique) strings or a dict with names as keys and callables as values. The callables should be scorers built using scikit-learn ``make_scorer``. Note that when using custom scorers, each scorer should return a single value. A list argument is copied, so later changes to the caller's list do not affect the task. It defaults to ['roc_auc', 'average_precision', 'f1', 'precision', 'recall', 'accuracy'].
        :type scorers: {list, dict}, optional

        :param feature_columns: Either 'all' to use from the input train data set file all the columns or a list of column names to select specific columns. It defaults to 'all'.
        :type feature_columns: {str, list}, optional

        :param id_column: The name of the column in the input train data set file containing the sample IDs. It defaults to 'id'.
        :type id_column: str, optional

        :param label_column: The name of the column in the input train data set file containing the ground truth labels. It defaults to 'label'.
        :type label_column: str, optional

        :param objective_score: The scorer that would be used to find the best hyperparameters for refitting the best estimator/pipeline at the end. It defaults to 'f1'.
        :type objective_score: str, optional

        :param random_seed: The random seed used in the random number generator. It can be used to reproduce the output. It defaults to 42.
        :type random_seed: int, optional

        :param verbose: Controls the verbosity of output. The higher, the more messages. It defaults to 3.
        :type verbose: int, optional

        :param n_jobs: Number of jobs to run in parallel. -1 means using all processors. It defaults to 1.
        :type n_jobs: int, optional

        :param return_train_score: If False, training scores will not be computed and returned. Computing training scores is used to get insights on how different parameter settings impact the overfitting/underfitting trade-off. It defaults to True.
        :type return_train_score: bool, optional
        """
        # The default list in the signature is created once and would otherwise
        # be shared by every instance; rebinding to a copy keeps each task's
        # scorers independent without changing the public default.
        if isinstance(scorers, list):
            scorers = list(scorers)

        # locals() is forwarded as the task's arguments, so no additional local
        # names may be introduced above this call.
        super(HyperParametersSearchCrossValidationTask, self).__init__(HyperParametersSearchCrossValidationTask.__name__, locals())

        # Grid search is the default search method.
        self.grid_search()

        # Process-wide pandas option: do not truncate long cell values
        # (presumably so hyperparameter settings stay readable in the
        # search results table -- confirm).
        pd.set_option('display.max_colwidth', None)

    def grid_search(self):
        """
        Optional method.

        Use the grid search method when searching the best hyperparameters.

        :return: The object instance itself.
        :rtype: :class:`.HyperParametersSearchCrossValidationTask`
        """
        # locals() holds only ``self`` here; it is passed through the same
        # helper as in random_search() so both methods record their options
        # the same way.
        options = self._filter_arguments(locals())

        self._update_arguments({'search_options': options})
        self._update_arguments({'search_method': 'grid'})

        return self

    def random_search(self, n_iters=200):
        """
        Optional method.

        Use the random search method when searching the best hyperparameters.

        :param n_iters: Number of hyperparameter settings that are sampled. ``n_iters`` trades off runtime vs quality of the solution. It defaults to 200.
        :type n_iters: int, optional

        :return: The object instance itself.
        :rtype: :class:`.HyperParametersSearchCrossValidationTask`
        """
        # Must be the first statement: locals() has to contain only the
        # method's own arguments (``self`` and ``n_iters``).
        options = self._filter_arguments(locals())

        self._update_arguments({'search_options': options})
        self._update_arguments({'search_method': 'random'})

        return self

    def run(self, output_directory):
        """
        Run the task.

        The search results (``search_results``) are also stored in a *search_results.html* file as a static HTML table under the output directory path. The output directory is created if it does not exist yet.

        :param output_directory: The output directory path under which task-related generated files are stored.
        :type output_directory: str

        :return: The task's result. Specifically, **1)** ``best_estimator``: The estimator/pipeline that was chosen by the search, i.e. estimator/pipeline which gave best score on the hold-out data. **2)** ``best_params``: The hyperparameters setting that gave the best results on the hold-out data. **3)** ``best_score``: Mean cross-validated score of the ``best_estimator``. **4)** ``search_results``: Metrics measured for each of the hyperparameters setting in the search. **5)** ``best_index``: The index (of the ``search_results``) which corresponds to the best candidate hyperparameters setting.
        :rtype: dict
        """
        # Make sure the report can be written *before* the (potentially very
        # long) search starts, instead of failing after all the work is done.
        # An empty path means the current working directory; nothing to create.
        if output_directory:
            os.makedirs(output_directory, exist_ok=True)

        if isinstance(self.train_data_set, str):
            self.train_data_set_data_frame = pd.read_csv(self.train_data_set, delimiter=self.field_delimiter)
        else:
            # Work on a copy with a fresh 0..n-1 index so the caller's
            # DataFrame is left untouched.
            self.train_data_set_data_frame = self.train_data_set.copy()
            self.train_data_set_data_frame.reset_index(inplace=True, drop=True)

        y = self.train_data_set_data_frame[self.label_column]
        X = self.train_data_set_data_frame.drop(columns=[self.label_column, self.id_column])

        # Only the literal string 'all' keeps every column. Checking the type
        # first avoids an element-wise comparison (and an ambiguous truth
        # value) when the selection is a NumPy array or a pandas Index.
        use_all_columns = isinstance(self.feature_columns, str) and self.feature_columns == 'all'
        if not use_all_columns:
            X = X[self.feature_columns]

        # Seed NumPy's global generator for reproducibility.
        np.random.seed(self.random_seed)

        # _build_cv_splits is not defined in this class (presumably provided
        # by BaseCrossValidationTask -- confirm).
        search = self._build_search_method(self._build_cv_splits(X, y))
        search.fit(X, y)

        # reset_index() exposes the candidate position as an 'index' column.
        cv_results = pd.DataFrame(search.cv_results_).reset_index()
        # scikit-learn labels hold-out metrics '*_test_*'; they are reported
        # here as validation metrics.
        cv_results.columns = cv_results.columns.str.replace('_test_', '_validation_')
        cv_results.to_html(os.path.join(output_directory, 'search_results.html'), index=False)

        return {
            'best_estimator': search.best_estimator_,
            'best_params': search.best_params_,
            'best_index': search.best_index_,
            'best_score': search.best_score_,
            'search_results': cv_results
        }

    def _build_search_method(self, cv):
        """
        Build the (unfitted) scikit-learn search object for the selected search method.

        :param cv: The cross-validation splitting strategy, forwarded as the ``cv`` argument of the search object.

        :return: A ``RandomizedSearchCV`` when the search method is 'random', otherwise a ``GridSearchCV``.
        """
        # Keyword arguments shared by both search flavours.
        shared_options = {
            'cv': cv,
            'scoring': self.scorers,
            'refit': self.objective_score,
            'return_train_score': self.return_train_score,
            'n_jobs': self.n_jobs,
            'verbose': self.verbose
        }

        estimator = self._build_estimator()

        if self.search_method == 'random':
            return RandomizedSearchCV(estimator, self.search_params, n_iter=self.search_options['n_iters'], random_state=self.random_seed, **shared_options)

        return GridSearchCV(estimator, self.search_params, **shared_options)

    def _build_estimator(self):
        """
        Return a deep copy of the configured estimator/pipeline with ``estimator_params`` applied.

        The copy keeps the estimator object supplied by the user unmodified.

        :return: The estimator/pipeline to hand over to the search object.
        """
        estimator = copy.deepcopy(self.estimator)

        if self.estimator_params:
            estimator.set_params(**self.estimator_params)

        return estimator