import os, copy
import pandas as pd
import featuretools as ft
from . import BaseTask
class DeepFeatureSynthesisTask(BaseTask):
    """
    The :class:`.DeepFeatureSynthesisTask` class is a wrapper for Featuretools. It can be used to automate feature engineering and create features from temporal and relational datasets.
    """

    def __init__(self, entities=None,
                 relationships=None,
                 entityset=None,
                 target_entity=None,
                 cutoff_time=None,
                 instance_ids=None,
                 agg_primitives=None,
                 trans_primitives=None,
                 groupby_trans_primitives=None,
                 allowed_paths=None,
                 max_depth=2,
                 ignore_entities=None,
                 ignore_variables=None,
                 primitive_options=None,
                 seed_features=None,
                 drop_contains=None,
                 drop_exact=None,
                 where_primitives=None,
                 max_features=-1,
                 save_progress=None,
                 training_window=None,
                 approximate=None,
                 chunk_size=None,
                 n_jobs=1,
                 dask_kwargs=None,
                 verbose=False,
                 return_variable_types=None,
                 progress_callback=None,
                 include_cutoff_time=True,
                 export_feature_graphs=False,
                 export_feature_information=False,
                 label_column='label'):
        """
        This is the constructor method and can be used to create a new object instance of :class:`.DeepFeatureSynthesisTask` class.

        Most of the arguments are documented here: https://featuretools.alteryx.com/en/stable/generated/featuretools.dfs.html#featuretools.dfs

        :param export_feature_graphs: If this task will export feature computation graphs. It defaults to False.
        :type export_feature_graphs: bool, optional
        :param export_feature_information: If this task will export feature information. The feature definitions can be used to recalculate features for a different data set. It defaults to False.
        :type export_feature_information: bool, optional
        :param label_column: The name of the column containing the ground truth labels. It defaults to 'label'.
        :type label_column: str, optional
        """
        # Snapshot all constructor arguments (locals() at this point holds only
        # the parameters plus self) and hand them to BaseTask, which exposes
        # them as instance attributes used later in run().
        arguments = copy.deepcopy(locals())
        super(DeepFeatureSynthesisTask, self).__init__(DeepFeatureSynthesisTask.__name__, arguments)

    def run(self, output_directory):
        """
        Run the task.

        The synthesized output data set is returned as a result and also stored in a *synthesized_dataset.csv* file under the output directory path.
        The features information are stored in a *feature_information.html* file as a static HTML table under the output directory path.
        The feature computation graphs are stored as PNG files under the output directory path.
        Also, the feature definitions are stored in a *feature_definitions.txt* file under the output directory path.

        :param output_directory: The output directory path under which task-related generated files are stored.
        :type output_directory: str

        :return: The task's result. Specifically, **1)** ``synthesized_dataset``: The synthesized output data set as a pandas DataFrame. **2)** ``feature_definitions``: The definitions of features in the synthesized output data set. The feature definitions can be used to recalculate features for a different data set.
        :rtype: dict
        """
        # Delegate the actual feature synthesis to Featuretools, forwarding
        # every DFS-related argument captured by the constructor.
        synthesized_dataset, feature_defs = ft.dfs(
            entities=self.entities,
            relationships=self.relationships,
            entityset=self.entityset,
            target_entity=self.target_entity,
            cutoff_time=self.cutoff_time,
            instance_ids=self.instance_ids,
            agg_primitives=self.agg_primitives,
            trans_primitives=self.trans_primitives,
            groupby_trans_primitives=self.groupby_trans_primitives,
            allowed_paths=self.allowed_paths,
            max_depth=self.max_depth,
            ignore_entities=self.ignore_entities,
            ignore_variables=self.ignore_variables,
            primitive_options=self.primitive_options,
            seed_features=self.seed_features,
            drop_contains=self.drop_contains,
            drop_exact=self.drop_exact,
            where_primitives=self.where_primitives,
            max_features=self.max_features,
            save_progress=self.save_progress,
            training_window=self.training_window,
            approximate=self.approximate,
            chunk_size=self.chunk_size,
            n_jobs=self.n_jobs,
            dask_kwargs=self.dask_kwargs,
            verbose=self.verbose,
            return_variable_types=self.return_variable_types,
            progress_callback=self.progress_callback,
            include_cutoff_time=self.include_cutoff_time
        )
        # Drop any synthesized column *derived from* the label column (its name
        # contains the label column's name but is not the label itself) to
        # avoid leaking label information into the feature set. The label
        # column itself is kept.
        label_related_columns_to_drop = [
            column for column in synthesized_dataset
            if column != self.label_column and self.label_column in column
        ]
        synthesized_dataset = synthesized_dataset[[o for o in synthesized_dataset if o not in label_related_columns_to_drop]]
        # Keep the feature definitions consistent with the retained columns.
        feature_defs = [o for o in feature_defs if o.get_name() != self.label_column and o.get_name() not in label_related_columns_to_drop]
        features_df = pd.DataFrame({'feature_id': range(len(feature_defs)), 'feature_def': feature_defs})
        features_df['feature_name'] = features_df['feature_def'].apply(lambda o: o.get_name())
        features_df['feature_type'] = features_df['feature_def'].apply(lambda o: synthesized_dataset.dtypes[o.get_name()])
        if self.export_feature_graphs:
            # graphviz does not create missing parent directories, so ensure
            # the feature_graphs subdirectory exists before rendering PNGs.
            os.makedirs(os.path.join(output_directory, 'feature_graphs'), exist_ok=True)
            features_df.apply(lambda o: ft.graph_feature(o['feature_def'], to_file=os.path.join(output_directory, 'feature_graphs', f'{o["feature_id"]}.png'), description=True), axis=1)
        if self.export_feature_information:
            features_df['feature_description'] = features_df['feature_def'].apply(lambda o: ft.describe_feature(o))
            # The raw feature objects are not meaningful in an HTML table.
            features_df.drop(columns=['feature_def'], inplace=True)
            features_df.to_html(os.path.join(output_directory, 'feature_information.html'), index=False)
        # Persist the synthesized data set (with the DFS index restored as a
        # regular column) and the reusable feature definitions.
        synthesized_dataset.reset_index(inplace=True)
        synthesized_dataset.to_csv(os.path.join(output_directory, 'synthesized_dataset.csv'), index=False)
        ft.save_features(feature_defs, os.path.join(output_directory, 'feature_definitions.txt'))
        return {'synthesized_dataset': synthesized_dataset, 'feature_definitions': feature_defs}