# Source code for visualime.feature_selection

from typing import Any, Callable, Dict, List, Optional, Tuple

import numpy as np
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import lars_path

from ._models import LINEAR_MODEL_TYPES, MODEL_TYPE_PARAMS_DOC, instantiate_model
from .lime import SAMPLES_PREDICTIONS_LABEL_IDX_DOC
from .metrics import DISTANCES_KERNEL_DOC, cosine_distance, exponential_kernel


def _get_num_segments(
    samples: np.ndarray, num_segments_to_select: Optional[int]
) -> Tuple[int, int]:
    """Resolve the total segment count and the number of segments to select.

    The total is the size of the second axis of `samples`. A falsy
    `num_segments_to_select` (``None`` or ``0``) is replaced by that total.

    Returns
    -------
    tuple of (int, int)
        The total number of segments and the resolved number to select.

    Raises
    ------
    ValueError
        If the resolved number to select is larger than the total.
    """
    total = samples.shape[1]
    requested = num_segments_to_select if num_segments_to_select else total

    too_many = requested > total
    if too_many:
        raise ValueError(
            f"Number of features to select ({requested}) cannot exceed "
            f"number of features in data ({total})"
        )

    return total, requested


def select_by_weight(
    samples: np.ndarray,
    predictions: np.ndarray,
    label_idx: int,
    model_type: LINEAR_MODEL_TYPES = "bayesian_ridge",
    model_params: Optional[Dict[str, Any]] = None,
    distances: Optional[np.ndarray] = None,
    kernel: Callable[[np.ndarray], np.ndarray] = exponential_kernel,
    num_segments_to_select: Optional[int] = None,
) -> List[int]:
    # The documentation is attached below via ``__doc__`` because it is an
    # f-string assembled from shared snippets, which cannot be a literal docstring.
    _, num_segments_to_select = _get_num_segments(samples, num_segments_to_select)

    if distances is None:
        distances = cosine_distance(samples)

    sample_weight = kernel(distances)

    linear_model = instantiate_model(model_type=model_type, model_params=model_params)

    # threshold=-inf switches off the importance threshold, so `max_features`
    # alone decides how many segments are kept.
    selector = SelectFromModel(
        estimator=linear_model, threshold=-np.inf, max_features=num_segments_to_select
    )
    selector.fit(X=samples, y=predictions[:, label_idx], sample_weight=sample_weight)

    # `get_support(indices=True)` yields the positions of the selected features
    # in ascending index order; `.tolist()` converts them to built-in ints so
    # the result matches the annotated `List[int]`.
    return selector.get_support(indices=True).tolist()


select_by_weight.__doc__ = f"""Select the `num_segments_to_select` segments with the highest weight.

    Parameters
    ----------
    {SAMPLES_PREDICTIONS_LABEL_IDX_DOC}

    {MODEL_TYPE_PARAMS_DOC}

        It is generally advisable to use the same model as for the final
        :meth:`visualime.lime.weigh_segments` function.

    {DISTANCES_KERNEL_DOC}

    num_segments_to_select : int, optional
        The number of segments to select. If not given, select all segments.

    Returns
    -------
    list of ints
        List of the indices of the selected segments.
        The segment indices are in ascending order.

    Raises
    ------
    ValueError
        If `num_segments_to_select` exceeds the number of segments in `samples`.
    """
def forward_selection(
    samples: np.ndarray,
    predictions: np.ndarray,
    label_idx: int,
    model_type: LINEAR_MODEL_TYPES = "ridge",
    model_params: Optional[Dict[str, Any]] = None,
    distances: Optional[np.ndarray] = None,
    kernel: Callable[[np.ndarray], np.ndarray] = exponential_kernel,
    num_segments_to_select: Optional[int] = None,
) -> List[int]:
    # The documentation is attached below via ``__doc__`` because it is an
    # f-string assembled from shared snippets, which cannot be a literal docstring.
    num_segments, num_segments_to_select = _get_num_segments(
        samples, num_segments_to_select
    )

    if distances is None:
        distances = cosine_distance(samples)

    sample_weight = kernel(distances)

    # The regression target is the same for every candidate fit.
    target = predictions[:, label_idx]

    # TODO: Understand and account for the implications of regularization
    linear_model = instantiate_model(model_type=model_type, model_params=model_params)

    # TODO: Wait for https://github.com/scikit-learn/scikit-learn/issues/25236
    def score(current_features: List[int], next_feature_idx: int) -> float:
        """Fit on the current features plus one candidate and return the score."""
        columns = current_features + [next_feature_idx]
        linear_model.fit(samples[:, columns], target, sample_weight=sample_weight)
        # Index again instead of reusing the training matrix: list indexing
        # returns a fresh copy, so scoring is unaffected should the estimator
        # have modified its training input in place (e.g. `copy_X=False`).
        return float(
            linear_model.score(samples[:, columns], target, sample_weight=sample_weight)
        )

    selected_segments: List[int] = []
    # Candidates are kept in ascending index order; `max` returns the first
    # maximum it encounters, so ties go to the lowest segment index.
    remaining_segments = list(range(num_segments))
    for _ in range(num_segments_to_select):
        segment_with_highest_score = max(
            remaining_segments,
            key=lambda segment_idx: score(selected_segments, segment_idx),
        )
        selected_segments.append(segment_with_highest_score)
        remaining_segments.remove(segment_with_highest_score)

    return selected_segments


forward_selection.__doc__ = f"""Select `num_segments_to_select` through forward selection.

    Parameters
    ----------
    {SAMPLES_PREDICTIONS_LABEL_IDX_DOC}

    {MODEL_TYPE_PARAMS_DOC}

        It is generally advisable to use the same model as for the final
        :meth:`visualime.lime.weigh_segments` function.

    {DISTANCES_KERNEL_DOC}

    num_segments_to_select : int, optional
        The number of segments to select. If not given, select all segments.

    Returns
    -------
    list of ints
        List of the indices of the selected segments.
        The segments are ordered as they were selected.

    Raises
    ------
    ValueError
        If `num_segments_to_select` exceeds the number of segments in `samples`.
    """
def lars_selection(
    samples: np.ndarray,
    predictions: np.ndarray,
    label_idx: int,
    num_segments_to_select: Optional[int] = None,
) -> List[int]:
    # The documentation is attached below via ``__doc__`` because it is an
    # f-string assembled from shared snippets, which cannot be a literal docstring.
    _, num_segments_to_select = _get_num_segments(samples, num_segments_to_select)

    _, _, coefs, num_of_iterations = lars_path(
        samples, predictions[:, label_idx], return_path=True, return_n_iter=True
    )

    # Transpose once so that each row holds the coefficients of one path step.
    coefs_per_step = coefs.T

    # Walk the path backwards from its last step (index 0 is never visited)
    # and stop at the first step that uses few enough segments.
    for iteration in range(num_of_iterations, 0, -1):
        segments_with_nonzero_coefficients = coefs_per_step[iteration].nonzero()[0]
        if len(segments_with_nonzero_coefficients) <= num_segments_to_select:
            break
    else:
        # Reached when no visited step qualified, including when the path has
        # no iterations at all.
        raise RuntimeError(
            f"Could not find subset of {num_segments_to_select} features"
        )

    # `.tolist()` converts the NumPy indices to built-in ints so the result
    # matches the annotated `List[int]`.
    return segments_with_nonzero_coefficients.tolist()


lars_selection.__doc__ = f"""Select up to `num_segments_to_select` segments using the LARS path method.

    Parameters
    ----------
    {SAMPLES_PREDICTIONS_LABEL_IDX_DOC}

    num_segments_to_select : int, optional
        The maximum number of segments to select.
        If not given, this value is set to the total number of segments.

    Returns
    -------
    list of ints
        List of the indices of the selected segments.
        The segment indices are in ascending order.

    Raises
    ------
    ValueError
        If `num_segments_to_select` exceeds the number of segments in `samples`.
    RuntimeError
        If no step of the computed LARS path (its starting point excluded)
        has at most `num_segments_to_select` nonzero coefficients.
    """