# Source code for yamle.utils.tuning_utils

import os
from typing import Any, Dict, List, Optional

import matplotlib.pyplot as plt
import natsort
import numpy as np
import pandas as pd

from yamle.evaluation.metrics.algorithmic import (
    METRIC_TENDENCY,
    MIN_TENDENCY,
    parse_metric,
)



[docs]
def sample_initial_random_configs(
    config_space: Dict[str, Any], n_samples: int = 10
) -> List[Dict[str, Any]]:
    """Draw ``n_samples`` configurations from ``config_space``.

    Args:
        config_space: Mapping from hyperparameter name to its specification.
            A specification exposing a ``sample`` attribute is called (with no
            arguments) to obtain a value; anything else is treated as a
            constant and copied into every configuration unchanged.
        n_samples: Number of configurations to generate.

    Returns:
        A list with ``n_samples`` freshly built dictionaries, each holding one
        value per key of ``config_space``.
    """
    return [
        {
            name: spec.sample() if hasattr(spec, "sample") else spec
            for name, spec in config_space.items()
        }
        for _ in range(n_samples)
    ]




[docs]
def best_config_to_command_arguments(
    best_config: Dict[str, Any],
    omit: Optional[List[str]] = None,
) -> str:
    """Convert the best config to command arguments.

    Only entries whose key starts with ``"config_"`` are converted; the prefix
    is stripped once, so a name that itself contains ``config_`` (for example
    ``config_model_config_path``) keeps the rest of its name intact.

    Args:
        best_config: Mapping of result keys to values. Entries without the
            ``"config_"`` prefix are ignored.
        omit: Argument names (without the prefix) that must not be emitted.
            Defaults to ``["label", "no_evaluation", "no_saving"]``.

    Returns:
        A string of ``--name value `` fragments in the iteration order of
        ``best_config``. Every fragment, including the last, ends with a space.
        List values produce one fragment per element. The string is empty when
        nothing is emitted.
    """
    prefix = "config_"
    if omit is None:
        # Built per call so the default is never shared between invocations.
        omit = ["label", "no_evaluation", "no_saving"]

    fragments: List[str] = []
    for key, value in best_config.items():
        if not key.startswith(prefix):
            continue
        # Strip only the leading prefix; `str.replace` would also remove any
        # later occurrence of "config_" inside the argument name.
        name = key[len(prefix):]
        if name in omit:
            continue
        values = value if isinstance(value, list) else [value]
        fragments.extend(f"--{name} {item} " for item in values)

    return "".join(fragments)




[docs]
def plot_different_runs_and_metrics(results_df: pd.DataFrame, save_path: str) -> None:
    """Plot each run's curve for every logged metric, one PDF per metric.

    Each distinct ``trial_id`` is drawn in its own colour against the row index
    within that run (labelled "Epoch"). The run with the best final value,
    according to ``METRIC_TENDENCY``, is redrawn as a thick dashed line in that
    run's colour. Figures are written to ``<save_path>/individual_runs``.

    Args:
        results_df: Results table. It must contain a ``trial_id`` column, a
            ``config_trainer_epochs`` column, and the metric columns. A column
            counts as a metric when ``parse_metric(column)`` is a key of
            ``METRIC_TENDENCY``.
        save_path: Directory under which the ``individual_runs`` folder is
            created.

    Notes:
        NaN and infinite metric values are ignored when computing the y-axis
        limits. Runs whose final value is NaN are never selected as best.
        If no run ends on a non-NaN value, no run is highlighted and the
        title reports ``None`` as the best run id. An empty table produces
        no output.
    """
    unique_runs = results_df["trial_id"].unique()
    if len(unique_runs) == 0:
        # Nothing to plot; also avoids calling max() on an empty column below.
        return

    # Metric columns are those whose parsed name is known to METRIC_TENDENCY.
    unique_metrics = [
        column
        for column in results_df.columns
        if parse_metric(column) in METRIC_TENDENCY
    ]
    colors = plt.cm.rainbow(np.linspace(0, 1, len(unique_runs)))

    save_path = os.path.join(save_path, "individual_runs")
    os.makedirs(save_path, exist_ok=True)

    # The largest configured epoch count is the reference for a "full" run.
    epochs = max(results_df["config_trainer_epochs"].values)

    # Slice the table once per run instead of once per (metric, run) pair.
    run_frames = [results_df[results_df["trial_id"] == run] for run in unique_runs]

    for metric in unique_metrics:
        lower_is_better = METRIC_TENDENCY[parse_metric(metric)] == MIN_TENDENCY
        fig, ax = plt.subplots()

        best_value = None
        best_id = None
        best_index = None
        full_training = 0  # Number of runs that logged `epochs` rows
        ymin = float("inf")
        ymax = float("-inf")

        for i, (run, run_df) in enumerate(zip(unique_runs, run_frames)):
            values = np.asarray(run_df[metric].values, dtype=float)
            ax.plot(np.arange(len(values)), values, color=colors[i])

            if len(values) == epochs:
                full_training += 1
            if values.size == 0:
                continue

            # Only finite samples may drive the axis limits: NaN/Inf limits
            # are rejected by matplotlib.
            finite = values[np.isfinite(values)]
            if finite.size:
                ymin = min(ymin, finite.min())
                ymax = max(ymax, finite.max())

            last = values[-1]
            if np.isnan(last):
                # Comparisons with NaN are always False, so a NaN candidate
                # could never be replaced once selected.
                continue
            if best_value is None:
                is_better = True
            elif lower_is_better:
                is_better = last < best_value
            else:
                is_better = last > best_value
            if is_better:
                best_value = last
                best_id = run
                best_index = i

        # Highlight the best run with a thick dashed line in its own colour.
        if best_index is not None:
            best_values = run_frames[best_index][metric].values
            ax.plot(
                np.arange(len(best_values)),
                best_values,
                color=colors[best_index],
                linestyle="--",
                linewidth=4,
            )

        ax.set_xlabel("Epoch")
        ax.set_ylabel(metric)
        ax.grid()
        if ymin <= ymax:
            # At least one finite value was seen: pad the range by 10 %.
            margin = (ymax - ymin) * 0.1
            ax.set_ylim(ymin - margin, ymax + margin)
        ax.set_title(
            f"Individual runs, Full training: {full_training}, All runs: {len(unique_runs)}, Best run id: {best_id}",
            fontsize=10,
        )
        fig.savefig(
            os.path.join(save_path, f"{metric}_individual_runs.pdf"),
            bbox_inches="tight",
        )
        # Close this exact figure; calling plt.clf() after plt.close() would
        # implicitly create a new empty figure and leave it open.
        plt.close(fig)




[docs]
def plot_different_runs_and_metric_config_combinations(
    results_df: pd.DataFrame, save_path: str
) -> None:
    """Plot the final metric value of the runs against every varied hyperparameter.

    For each column whose name contains ``"config_"`` and which takes more
    than one distinct value, and for each metric column, the last logged
    metric value of every run is grouped by the run's last value of that
    hyperparameter. The mean of each group is plotted with its standard
    deviation as an error bar. One PDF per (hyperparameter, metric) pair is
    written to ``<save_path>/metric_config_combinations/<config_key>/``.

    Args:
        results_df: Results table with a ``trial_id`` column, hyperparameter
            columns, and metric columns. A column counts as a metric when
            ``parse_metric(column)`` is a key of ``METRIC_TENDENCY``.
        save_path: Directory under which the output folders are created.

    Notes:
        Hyperparameters whose distinct values are all ``float`` instances are
        placed on a numeric x-axis. All others are naturally sorted and
        placed at integer positions labelled with their values; in that case
        each point is annotated with the number of non-NaN runs and the
        number of NaN runs. NaN metric values are excluded from the mean and
        standard deviation.
    """
    unique_runs = results_df["trial_id"].unique()
    # Metric columns are those whose parsed name is known to METRIC_TENDENCY.
    unique_metrics = [
        column
        for column in results_df.columns
        if parse_metric(column) in METRIC_TENDENCY
    ]
    unique_config_keys = [
        column for column in results_df.columns if "config_" in column
    ]

    save_path = os.path.join(save_path, "metric_config_combinations")
    os.makedirs(save_path, exist_ok=True)

    # Slice the table once per run instead of once per (config, metric, run).
    run_frames = [results_df[results_df["trial_id"] == run] for run in unique_runs]

    for config_key in unique_config_keys:
        config_values = results_df[config_key].unique()
        # A single distinct value means the hyperparameter was held constant.
        if len(config_values) == 1:
            continue

        is_float = all(isinstance(value, float) for value in config_values)

        config_path = os.path.join(save_path, config_key)
        os.makedirs(config_path, exist_ok=True)

        for metric in unique_metrics:
            # Group the final metric value of every run by its config value.
            results = {}
            for run_df in run_frames:
                run_last_value = run_df[metric].values[-1]
                run_config_value = run_df[config_key].values[-1]
                results.setdefault(run_config_value, []).append(run_last_value)

            if not is_float:
                # Non-float values may be strings, so they are naturally
                # sorted and later placed at integer x positions.
                results = {
                    key: results[key] for key in natsort.natsorted(results.keys())
                }
            labels = list(results.keys())

            x = []
            y = []
            yerr = []
            counts = []
            nan_counts = []
            for position, (config_value, config_results) in enumerate(
                results.items()
            ):
                valid = [value for value in config_results if not np.isnan(value)]
                # An all-NaN group has no mean; use NaN directly rather than
                # letting np.mean([]) emit a RuntimeWarning.
                y.append(np.mean(valid) if valid else np.nan)
                yerr.append(np.std(valid) if len(valid) > 1 else 0)
                counts.append(len(valid))
                nan_counts.append(len(config_results) - len(valid))
                x.append(config_value if is_float else position)

            fig, ax = plt.subplots()
            ax.errorbar(x, y, yerr=yerr, fmt="o")
            if not is_float:
                # Show how many runs back each point and how many were NaN.
                for xi, yi, count, nan_count in zip(x, y, counts, nan_counts):
                    ax.annotate(f"{count}, NaN: {nan_count}", (xi, yi))
                ax.set_xticks(x, labels)

            ax.set_xlabel(config_key)
            ax.set_ylabel(metric)
            ax.set_title(f"{config_key} vs {metric}")
            ax.grid()
            fig.savefig(
                os.path.join(config_path, f"{metric}_vs_{config_key}.pdf"),
                bbox_inches="tight",
            )
            # Close this exact figure; calling plt.clf() after plt.close()
            # would implicitly create a new empty figure and leave it open.
            plt.close(fig)




[docs]
def plot_different_metrics_and_trial_id(
    results_df: pd.DataFrame, save_path: str
) -> None:
    """Scatter the final value of each metric against the trial id.

    For every metric column, the last logged value of each run is plotted
    against the run's ``trial_id``, and a least-squares straight line is drawn
    in red when one can be fitted. One PDF per metric is written to
    ``<save_path>/metric_trial_id``.

    Args:
        results_df: Results table with a ``trial_id`` column and metric
            columns. A column counts as a metric when ``parse_metric(column)``
            is a key of ``METRIC_TENDENCY``.
        save_path: Directory under which the ``metric_trial_id`` folder is
            created.

    Notes:
        The trend line is fitted only on points whose trial id and metric
        value are both finite numbers. It is skipped when fewer than two
        distinct trial ids remain, or when the ids or values are not numeric.
    """
    unique_runs = results_df["trial_id"].unique()
    # Metric columns are those whose parsed name is known to METRIC_TENDENCY.
    unique_metrics = [
        column
        for column in results_df.columns
        if parse_metric(column) in METRIC_TENDENCY
    ]
    save_path = os.path.join(save_path, "metric_trial_id")
    os.makedirs(save_path, exist_ok=True)

    # Slice the table once per run instead of once per (metric, run) pair.
    run_frames = [results_df[results_df["trial_id"] == run] for run in unique_runs]

    x = list(unique_runs)
    x_arr = np.asarray(x)
    # Integer/unsigned/float ids can be regressed on; anything else cannot.
    x_is_numeric = x_arr.dtype.kind in "iuf"

    for metric in unique_metrics:
        y = [run_df[metric].values[-1] for run_df in run_frames]

        fig, ax = plt.subplots()
        ax.scatter(x, y)
        ax.set_xlabel("Trial id")

        # Best-fit line, fitted only on usable points: np.polyfit cannot
        # handle NaN samples and needs at least two distinct x values.
        y_arr = np.asarray(y)
        if x_is_numeric and y_arr.dtype.kind in "iuf":
            x_num = x_arr.astype(float)
            y_num = y_arr.astype(float)
            usable = np.isfinite(x_num) & np.isfinite(y_num)
            if np.unique(x_num[usable]).size >= 2:
                m, b = np.polyfit(x_num[usable], y_num[usable], 1)
                ax.plot(x, m * x_num + b, color="red")

        ax.set_ylabel(metric)
        ax.grid()
        fig.savefig(
            os.path.join(save_path, f"{metric}_vs_trial_id.pdf"), bbox_inches="tight"
        )
        plt.close(fig)