Skip to content

capturegraph.scheduling.void_and_cluster #

Void & Cluster Scheduling Algorithm#

A generic implementation of the Void & Cluster importance sampling algorithm. This module selects a subset of candidate items that maximize diversity relative to a set of existing items, using a user-provided distance metric.

select_sessions(potential_sessions, previous_sessions, distance_fn, energy_fn=gaussian(sigma=1), energy_mode='sum', selections=10) #

Select sessions using Void & Cluster importance sampling.

The algorithm iteratively selects points that maximize diversity: 1. Find the "void" (point with lowest energy from existing/selected points). 2. Add it to the selection. 3. Find the "cluster" (point with highest energy, i.e., too similar). 4. Remove the cluster if different from the void. 5. Repeat until desired count is reached.

Parameters:

Name Type Description Default
potential_sessions List[Dict[Any]]

List of candidate sessions to select from. Each session must have attributes compatible with the distance_fn.

required
previous_sessions List[Dict[Any]]

List of already-captured sessions. New selections will be spread away from these.

required
distance_fn Callable[[Dict[Any], Dict[Any]], float]

Function (session_a, session_b) -> float returning statistical distance between two sessions.

required
energy_fn Callable[[float], float]

Function distance -> energy that converts distance to coverage energy. Default is Gaussian with sigma=1.

gaussian(sigma=1)
energy_mode str

How to combine energies from multiple sources. - "sum": Total energy is sum of all contributions. - "max": Total energy is maximum contribution.

'sum'
selections int

Number of sessions to select.

10

Returns:

Type Description
List[Dict[Any]]

List of selected session Dicts from potential_sessions.

Raises:

Type Description
ValueError

If energy_mode is not "sum" or "max".

Source code in capturegraph-lib/capturegraph/scheduling/void_and_cluster.py
def select_sessions(
    potential_sessions: cg.List[cg.Dict[Any]],
    previous_sessions: cg.List[cg.Dict[Any]],
    distance_fn: Callable[[cg.Dict[Any], cg.Dict[Any]], float],
    energy_fn: Callable[[float], float] = gaussian(sigma=1),
    energy_mode: str = "sum",
    selections: int = 10,
) -> cg.List[cg.Dict[Any]]:
    """Select sessions using Void & Cluster importance sampling.

    The algorithm iteratively selects points that maximize diversity:
    1. Find the "void" (point with lowest energy from existing/selected points).
    2. Add it to the selection.
    3. Find the "cluster" (point with highest energy, i.e., too similar).
    4. Remove the cluster if different from the void.
    5. Repeat until desired count is reached.

    Args:
        potential_sessions: List of candidate sessions to select from.
            Each session must have attributes compatible with the distance_fn.
        previous_sessions: List of already-captured sessions. New selections
            will be spread away from these.
        distance_fn: Function `(session_a, session_b) -> float` returning
            statistical distance between two sessions.
        energy_fn: Function `distance -> energy` that converts distance to
            coverage energy. Default is Gaussian with sigma=1.
        energy_mode: How to combine energies from multiple sources.
            - "sum": Total energy is sum of all contributions.
            - "max": Total energy is maximum contribution.
        selections: Number of sessions to select.

    Returns:
        List of selected session Dicts from potential_sessions.

    Raises:
        ValueError: If energy_mode is not "sum" or "max".
    """

    if energy_mode == "sum":
        nil_value = 0
    elif energy_mode == "max":
        nil_value = -np.inf
    else:
        raise ValueError(f"Unknown energy mode: {energy_mode}")

    selections = min(selections, len(potential_sessions))

    def rand_argmin(arr: np.ndarray) -> int:
        return np.random.choice(np.flatnonzero(arr == np.min(arr)))

    def rand_argmax(arr: np.ndarray) -> int:
        return np.random.choice(np.flatnonzero(arr == np.max(arr)))

    # Compute distance matrices then convert to energy
    potential_to_existing = energy_fn(
        batch_matrix(distance_fn, potential_sessions, previous_sessions)
    )
    potential_to_potential = energy_fn(
        batch_matrix(distance_fn, potential_sessions, potential_sessions)
    )

    np.fill_diagonal(potential_to_potential, nil_value)

    if len(previous_sessions) == 0:
        base_energy = np.full(len(potential_sessions), nil_value)
    elif energy_mode == "sum":
        base_energy = np.sum(potential_to_existing, axis=1)
    elif energy_mode == "max":
        base_energy = np.max(potential_to_existing, axis=1)

    selected = np.zeros(len(potential_sessions), dtype=bool)

    while np.sum(selected) < selections:
        # Calculate energy from already-selected sessions
        if np.sum(selected) == 0:
            # First selection: pick the point furthest from existing sessions
            void = rand_argmin(base_energy)
            selected[void] = True
            continue
        elif energy_mode == "sum":
            selected_energy = np.sum(potential_to_potential[:, selected], axis=1)
            energy = base_energy + selected_energy
        elif energy_mode == "max":
            selected_energy = np.max(potential_to_potential[:, selected], axis=1)
            energy = np.maximum(base_energy, selected_energy)

        energy[selected] = np.inf
        void = rand_argmin(energy)
        selected[void] = True

        if energy_mode == "sum":
            cluster_energy = np.sum(potential_to_potential[:, selected], axis=1)
            new_energy = base_energy + cluster_energy
        elif energy_mode == "max":
            cluster_energy = np.max(potential_to_potential[:, selected], axis=1)
            new_energy = np.maximum(base_energy, cluster_energy)

        new_energy[selected] = -np.inf
        cluster = rand_argmax(new_energy)

        if cluster != void:
            selected[cluster] = False

    return cg.List([potential_sessions[int(i)] for i in np.where(selected)[0]])