Analysis using Matplotlib

../_images/analysis_using_matplotlib-1.png
import itertools

import matplotlib.pyplot
import pandas
import subrela.analysis
import subrela.clustering
import subrela.plot.matplotlib
import subrela.records


def read_dataset(path):
    """Return a small hard-coded dataset as a pandas DataFrame.

    This is a dummy: 'path' is accepted but not used. Replace the body with
    code that reads the actual dataset from 'path'.
    """
    columns = {
        'feature_A': [0, 0],
        'feature_B': [-5, -1],
        'feature_C': [-5, 1],
        'feature_D': [6, -2],
        'feature_E': [6, 2],
        'target': [0.2, 0.7],
    }
    return pandas.DataFrame(columns)


def perform_regression(dataset, features, target):
    """Return a placeholder score for the feature subset 'features'.

    The columns named in 'features' and the 'target' column are extracted
    from 'dataset' as arrays, but the dummy score below does not use them:
    it is 0.1 times the sum of (column position of the feature - 1) over
    the given features.
    """
    X = dataset[features].to_numpy()
    y = dataset[target].to_numpy()

    # This is a dummy. You should perform a regression using 'X' and 'y'.
    column_names = dataset.columns.to_list()
    offsets = [column_names.index(name) - 1 for name in features]
    return 0.1 * sum(offsets)


# prepare a dataset
features = ['feature_A', 'feature_B', 'feature_C', 'feature_D', 'feature_E']
dataset = read_dataset('/path/to/dataset/file')

# clustering (4.5 is the value handed to get_groups to form the groups)
Z = subrela.clustering.get_clusters(dataset[features].to_numpy())
groups = subrela.clustering.get_groups(Z, 4.5)

# evaluate scores for feature subsets: one tuple of booleans per subset,
# skipping the first combination, in which no features are used
flags = list(itertools.islice(
    itertools.product([False, True], repeat=len(features)), 1, None))
subset_scores = [
    perform_regression(
        dataset, list(itertools.compress(features, selected)), 'target')
    for selected in flags
]
s = subrela.records.from_arrays(flags, subset_scores)

# evaluate relevance scores
srs = subrela.analysis.get_strong_relevance_scores(
    s, Z, clusters=groups, descendants=True)
weak_scores_per_group = [
    subrela.analysis.get_weak_relevance_scores(s, Z, group)
    for group in groups
]
wrs = pandas.concat(weak_scores_per_group)

# prepare data for plots
dendrogram_data = subrela.plot.get_dendrogram_data(
    Z, labels=features, groups=groups)
leaf_data, node_data, tree_data, cut_data = dendrogram_data
trace_data = subrela.plot.get_trace_data(node_data, cut_data, wrs, tol=0.1)

# make a figure: two side-by-side panels drawn from the same dendrogram data
_, axes = matplotlib.pyplot.subplots(nrows=1, ncols=2, figsize=(8, 4))
sr_ax, wr_ax = axes
panels = [
    (sr_ax, 'strong relevance', srs, False),
    (wr_ax, 'weak relevance', wrs, True),
]
for ax, title, scores, with_trace in panels:
    ax.set_title(title)
    ax.invert_yaxis()
    subrela.plot.matplotlib.draw_dendrogram(
        ax, leaf_data, tree_data, cut_data, orientation='horizontal')
    subrela.plot.matplotlib.draw_node_info(
        ax, node_data, scores['relevance_score'],
        formatter='{:.1f}'.format, orientation='horizontal')
    # only the weak-relevance panel gets the trace overlay
    if with_trace:
        subrela.plot.matplotlib.draw_trace(
            ax, trace_data, orientation='horizontal')
    ax.set_xlim(left=0)
matplotlib.pyplot.show()