# Source code for visualime.feature_selection

from typing import Any, Callable, Dict, List, Optional, Tuple

import numpy as np
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import lars_path

from ._models import LINEAR_MODEL_TYPES, MODEL_TYPE_PARAMS_DOC, instantiate_model
from .lime import SAMPLES_PREDICTIONS_LABEL_IDX_DOC
from .metrics import DISTANCES_KERNEL_DOC, cosine_distance, exponential_kernel


def _get_num_segments(
    samples: np.ndarray, num_segments_to_select: Optional[int]
) -> Tuple[int, int]:
    """Determine the total number of segments and how many of them to select.

    Parameters
    ----------
    samples : np.ndarray
        Array whose second axis (``samples.shape[1]``) is taken as the
        total number of segments.

    num_segments_to_select : int, optional
        The requested number of segments. ``None`` and ``0`` both mean
        "select all segments".

    Returns
    -------
    tuple of (int, int)
        The total number of segments and the validated number of segments
        to select.

    Raises
    ------
    ValueError
        If `num_segments_to_select` is negative or exceeds the total number
        of segments.
    """
    num_segments = samples.shape[1]

    # Falsy values (None, 0) fall back to selecting every segment.
    if not num_segments_to_select:
        return num_segments, num_segments

    # A negative count is never meaningful; fail early with a clear message
    # instead of letting it propagate into the selection routines.
    if num_segments_to_select < 0:
        raise ValueError(
            f"Number of features to select ({num_segments_to_select}) "
            f"cannot be negative"
        )

    if num_segments_to_select > num_segments:
        raise ValueError(
            f"Number of features to select ({num_segments_to_select}) cannot exceed "
            f"number of features in data ({num_segments})"
        )

    return num_segments, num_segments_to_select


def select_by_weight(
    samples: np.ndarray,
    predictions: np.ndarray,
    label_idx: int,
    model_type: LINEAR_MODEL_TYPES = "bayesian_ridge",
    model_params: Optional[Dict[str, Any]] = None,
    distances: Optional[np.ndarray] = None,
    kernel: Callable[[np.ndarray], np.ndarray] = exponential_kernel,
    num_segments_to_select: Optional[int] = None,
) -> List[int]:
    # NOTE: The public docstring is attached to ``select_by_weight.__doc__``
    # directly after this definition because it interpolates shared
    # parameter documentation.

    # Only the validated number of segments to select is needed here.
    _, num_segments_to_select = _get_num_segments(samples, num_segments_to_select)

    # Each sample is weighted by the kernel applied to its distance; when no
    # distances are passed, they are computed with `cosine_distance`.
    if distances is None:
        distances = cosine_distance(samples)
    sample_weight = kernel(distances)

    linear_model = instantiate_model(model_type=model_type, model_params=model_params)

    # With the threshold at -inf, no feature is excluded by the threshold,
    # so `max_features` alone limits how many segments are kept.
    selector = SelectFromModel(
        estimator=linear_model, threshold=-np.inf, max_features=num_segments_to_select
    )
    selector.fit(X=samples, y=predictions[:, label_idx], sample_weight=sample_weight)

    # `tolist()` yields plain Python ints, matching the `List[int]` annotation
    # and the element type returned by `forward_selection`.
    return selector.get_support(indices=True).tolist()


# The docstring is assigned after the definition because it is an f-string
# that interpolates shared parameter documentation, which a regular
# docstring literal cannot do.
select_by_weight.__doc__ = f"""Select the `num_segments_to_select` segments with the highest weight.

    Parameters
    ----------
    {SAMPLES_PREDICTIONS_LABEL_IDX_DOC}

    {MODEL_TYPE_PARAMS_DOC}

        It is generally advisable to use the same model as for the final
        :meth:`visualime.lime.weigh_segments` function.

    {DISTANCES_KERNEL_DOC}

    num_segments_to_select : int, optional
        The number of segments to select. If not given, select all segments.

    Returns
    -------
    list of ints
        List of the indices of the selected segments.
        The segment indices are in ascending order.
"""


def forward_selection(
    samples: np.ndarray,
    predictions: np.ndarray,
    label_idx: int,
    model_type: LINEAR_MODEL_TYPES = "ridge",
    model_params: Optional[Dict[str, Any]] = None,
    distances: Optional[np.ndarray] = None,
    kernel: Callable[[np.ndarray], np.ndarray] = exponential_kernel,
    num_segments_to_select: Optional[int] = None,
) -> List[int]:
    # NOTE: The public docstring is attached to ``forward_selection.__doc__``
    # directly after this definition because it interpolates shared
    # parameter documentation.
    num_segments, num_segments_to_select = _get_num_segments(
        samples, num_segments_to_select
    )

    # Each sample is weighted by the kernel applied to its distance; when no
    # distances are passed, they are computed with `cosine_distance`.
    if distances is None:
        distances = cosine_distance(samples)
    sample_weight = kernel(distances)

    # TODO: Understand and account for the implications of regularization
    linear_model = instantiate_model(model_type=model_type, model_params=model_params)

    # The regression target does not change between evaluations.
    target = predictions[:, label_idx]

    # TODO: Wait for https://github.com/scikit-learn/scikit-learn/issues/25236
    def score(current_features: List[int], next_feature_idx: int) -> float:
        """Fit the model on the candidate feature set and return its score."""
        candidate_samples = samples[:, current_features + [next_feature_idx]]
        linear_model.fit(candidate_samples, target, sample_weight=sample_weight)
        return float(
            linear_model.score(candidate_samples, target, sample_weight=sample_weight)
        )

    selected_segments: List[int] = []
    # Candidates are kept in ascending index order so that ties between
    # equally scoring segments are resolved deterministically: `max` returns
    # the first maximal element it encounters.
    remaining_segments = list(range(num_segments))
    for _ in range(num_segments_to_select):
        segment_with_highest_score = max(
            remaining_segments,
            key=lambda segment_idx: score(selected_segments, segment_idx),
        )
        selected_segments.append(segment_with_highest_score)
        remaining_segments.remove(segment_with_highest_score)

    return selected_segments


# The docstring is assigned after the definition because it is an f-string
# that interpolates shared parameter documentation, which a regular
# docstring literal cannot do.
forward_selection.__doc__ = f"""Select `num_segments_to_select` through forward selection.

    Parameters
    ----------
    {SAMPLES_PREDICTIONS_LABEL_IDX_DOC}

    {MODEL_TYPE_PARAMS_DOC}

        It is generally advisable to use the same model as for the final
        :meth:`visualime.lime.weigh_segments` function.

    {DISTANCES_KERNEL_DOC}

    num_segments_to_select : int, optional
        The number of segments to select. If not given, select all segments.

    Returns
    -------
    list of ints
        List of the indices of the selected segments.
        The segments are ordered as they were selected.
"""


def lars_selection(
    samples: np.ndarray,
    predictions: np.ndarray,
    label_idx: int,
    num_segments_to_select: Optional[int] = None,
) -> List[int]:
    # NOTE: The public docstring is attached to ``lars_selection.__doc__``
    # directly after this definition because it interpolates shared
    # parameter documentation.

    # Only the validated number of segments to select is needed here.
    _, num_segments_to_select = _get_num_segments(samples, num_segments_to_select)

    _, _, coefs, num_of_iterations = lars_path(
        samples, predictions[:, label_idx], return_path=True, return_n_iter=True
    )

    # Transpose once so that each row holds the coefficients of one path step.
    coef_path = coefs.T

    # Walk the path backwards from the last iteration and stop at the first
    # step whose number of nonzero coefficients fits the requested budget.
    # Index 0 (the start of the path) is not visited.
    for iteration in range(num_of_iterations, 0, -1):
        segments_with_nonzero_coefficients = coef_path[iteration].nonzero()[0]
        if len(segments_with_nonzero_coefficients) <= num_segments_to_select:
            break
    else:
        # Reached when no visited step satisfied the budget, including the
        # case where the path has no iterations at all.
        raise RuntimeError(
            f"Could not find subset of {num_segments_to_select} features"
        )

    # `tolist()` yields plain Python ints, matching the `List[int]` annotation.
    return segments_with_nonzero_coefficients.tolist()


# The docstring is assigned after the definition because it is an f-string
# that interpolates shared parameter documentation, which a regular
# docstring literal cannot do.
lars_selection.__doc__ = f"""Select up to `num_segments_to_select` segments using the LARS path method.

    Parameters
    ----------
    {SAMPLES_PREDICTIONS_LABEL_IDX_DOC}

    num_segments_to_select : int, optional
        The maximum number of segments to select.
        If not given, this value is set to the total number of segments.

    Returns
    -------
    list of ints
        List of the indices of the selected segments.
        The segment indices are in ascending order.
"""