Source code for skrobot.tasks.deep_feature_synthesis_task

import os, copy

import pandas as pd

import featuretools as ft

from . import BaseTask

class DeepFeatureSynthesisTask(BaseTask):
    """
    The :class:`.DeepFeatureSynthesisTask` class wraps the Featuretools
    ``dfs`` function. It can be used to automate feature engineering and to
    create features from temporal and relational datasets.
    """

    def __init__(self, entities=None, relationships=None, entityset=None,
                 target_entity=None, cutoff_time=None, instance_ids=None,
                 agg_primitives=None, trans_primitives=None,
                 groupby_trans_primitives=None, allowed_paths=None,
                 max_depth=2, ignore_entities=None, ignore_variables=None,
                 primitive_options=None, seed_features=None,
                 drop_contains=None, drop_exact=None, where_primitives=None,
                 max_features=-1, save_progress=None, training_window=None,
                 approximate=None, chunk_size=None, n_jobs=1,
                 dask_kwargs=None, verbose=False, return_variable_types=None,
                 progress_callback=None, include_cutoff_time=True,
                 export_feature_graphs=False,
                 export_feature_information=False, label_column='label'):
        """
        Create a new instance of the :class:`.DeepFeatureSynthesisTask` class.

        Every argument that is not described below is forwarded unchanged to
        ``featuretools.dfs`` when the task runs; those arguments are
        documented here:
        https://featuretools.alteryx.com/en/stable/generated/featuretools.dfs.html#featuretools.dfs

        :param export_feature_graphs: Whether the task exports the feature computation graphs. It defaults to False.
        :type export_feature_graphs: bool, optional

        :param export_feature_information: Whether the task exports the feature information. It defaults to False.
        :type export_feature_information: bool, optional

        :param label_column: The name of the column containing the ground truth labels. It defaults to 'label'.
        :type label_column: str, optional
        """

        # locals() has to be captured before any other local name is bound,
        # so that the snapshot holds only ``self`` and the constructor
        # arguments. The explicit two-argument super() call below is kept on
        # purpose: the zero-argument form would add an implicit ``__class__``
        # entry to locals().
        arguments = copy.deepcopy(locals())

        # NOTE(review): run() reads every argument back as ``self.<name>``, so
        # the base class presumably turns the entries into attributes -
        # confirm against BaseTask.
        super(DeepFeatureSynthesisTask, self).__init__(
            DeepFeatureSynthesisTask.__name__,
            arguments
        )

    def run(self, output_directory):
        """
        Run the task.

        The synthesized data set is written to *synthesized_dataset.csv* and
        the feature definitions are written to *feature_definitions.txt*,
        both under the output directory path.

        When ``export_feature_information`` is enabled, a static HTML table
        describing the features is written to *feature_information.html*
        under the output directory path. When ``export_feature_graphs`` is
        enabled, one PNG file per feature is written under the
        *feature_graphs* sub-directory of the output directory path.

        :param output_directory: The output directory path under which task-related generated files are stored.
        :type output_directory: str

        :return: The task's result. Specifically, **1)** ``synthesized_dataset``: The synthesized output data set as a pandas DataFrame. **2)** ``feature_definitions``: The definitions of the features in the synthesized output data set.
        :rtype: dict
        """

        # Names of the instance attributes handed over to featuretools.dfs,
        # each one as the keyword argument of the same name.
        dfs_argument_names = (
            'entities', 'relationships', 'entityset', 'target_entity',
            'cutoff_time', 'instance_ids', 'agg_primitives',
            'trans_primitives', 'groupby_trans_primitives', 'allowed_paths',
            'max_depth', 'ignore_entities', 'ignore_variables',
            'primitive_options', 'seed_features', 'drop_contains',
            'drop_exact', 'where_primitives', 'max_features',
            'save_progress', 'training_window', 'approximate', 'chunk_size',
            'n_jobs', 'dask_kwargs', 'verbose', 'return_variable_types',
            'progress_callback', 'include_cutoff_time',
        )
        dfs_arguments = {name: getattr(self, name) for name in dfs_argument_names}

        feature_matrix, feature_defs = ft.dfs(**dfs_arguments)

        label = self.label_column

        # Columns derived from the label are removed, while the label column
        # itself stays in the data set. The match is a plain substring test,
        # so every other column whose name contains the label text is dropped.
        derived_from_label = [
            column for column in feature_matrix
            if column != label and label in column
        ]

        kept_columns = [
            column for column in feature_matrix
            if column not in derived_from_label
        ]
        feature_matrix = feature_matrix[kept_columns]

        # The feature definitions lose the label-derived features and also the
        # label itself, although the label column is kept in the data set.
        kept_feature_defs = []
        for feature in feature_defs:
            feature_name = feature.get_name()
            if feature_name == label or feature_name in derived_from_label:
                continue
            kept_feature_defs.append(feature)
        feature_defs = kept_feature_defs

        # The dtypes are read before the index is reset further below.
        column_types = feature_matrix.dtypes

        def name_of(feature):
            return feature.get_name()

        def type_of(feature):
            # NOTE(review): assumes get_name() always equals a column name of
            # the data set - confirm for multi-output features.
            return column_types[feature.get_name()]

        features_df = pd.DataFrame({
            'feature_id': range(len(feature_defs)),
            'feature_def': feature_defs
        })
        features_df['feature_name'] = features_df['feature_def'].apply(name_of)
        features_df['feature_type'] = features_df['feature_def'].apply(type_of)

        if self.export_feature_graphs:
            # One graph per feature; the file is named after the feature id.
            graphs_directory = os.path.join(output_directory, 'feature_graphs')
            for feature_id, feature in zip(features_df['feature_id'], features_df['feature_def']):
                graph_file = os.path.join(graphs_directory, f'{feature_id}.png')
                ft.graph_feature(feature, to_file=graph_file, description=True)

        if self.export_feature_information:
            features_df['feature_description'] = features_df['feature_def'].apply(ft.describe_feature)

            # The feature objects themselves are left out of the HTML table.
            information_df = features_df.drop(columns=['feature_def'])
            information_file = os.path.join(output_directory, 'feature_information.html')
            information_df.to_html(information_file, index=False)

        # The index becomes a regular column, so it ends up both in the CSV
        # file and in the returned DataFrame.
        feature_matrix.reset_index(inplace=True)

        dataset_file = os.path.join(output_directory, 'synthesized_dataset.csv')
        feature_matrix.to_csv(dataset_file, index=False)

        definitions_file = os.path.join(output_directory, 'feature_definitions.txt')
        ft.save_features(feature_defs, definitions_file)

        return {
            'synthesized_dataset': feature_matrix,
            'feature_definitions': feature_defs
        }