
Ablation Runner

Ablation study runner for feature, architecture, data size, and hyperparameter studies.

pitch_sequencing.evaluation.ablation

Ablation study runner for feature, architecture, data size, and hyperparameter studies.

AblationRunner

Run ablation studies with MLflow logging.

Source code in src/pitch_sequencing/evaluation/ablation.py
class AblationRunner:
    """Run ablation studies with MLflow logging."""

    def __init__(
        self,
        ablation_config: AblationConfig,
        data_config: DataConfig,
        models_config_dir: Optional[str] = None,
    ):
        if models_config_dir is None:
            models_config_dir = str(get_default_config("models"))
        self.cfg = ablation_config
        self.data_cfg = data_config
        self.models_config_dir = models_config_dir

    def _load_and_prepare(self):
        """Load data and return (df, encoders, norm_stats)."""
        df = load_pitch_data(self.data_cfg.data_path)
        df, encoders = encode_categoricals(
            df, [c for c in self.data_cfg.categorical_features if c in df.columns]
        )
        df, norm_stats = normalize_numericals(df, self.data_cfg.numerical_features)
        return df, encoders, norm_stats

    def _get_model_config(self, model_name: str) -> dict:
        config_path = os.path.join(
            self.models_config_dir,
            f"{model_name.replace('logistic_regression', 'logistic')}.yaml",
        )
        if not os.path.exists(config_path):
            config_path = os.path.join(self.models_config_dir, f"{model_name}.yaml")
        return load_config(config_path) if os.path.exists(config_path) else {}

    def _train_and_evaluate(self, model_name, model_cfg, X, y, n_folds=3):
        """Train model with CV and return list of accuracy scores."""
        folds = create_splits(X, y, n_folds=n_folds, random_state=self.data_cfg.random_state)
        scores = []
        for train_idx, test_idx in folds:
            model = get_model(model_name, model_cfg)
            X_train, y_train = X[train_idx], y[train_idx]
            X_test, y_test = X[test_idx], y[test_idx]
            model.fit(X_train, y_train, X_val=X_test, y_val=y_test)
            y_pred = model.predict(X_test)
            scores.append(compute_metrics(y_test, y_pred)["accuracy"])
        return scores

    def run_feature_ablation(self, model_name: Optional[str] = None) -> pd.DataFrame:
        """Remove each feature group one-at-a-time, measure accuracy drop."""
        model_name = model_name or self.cfg.default_model
        model_cfg = self._get_model_config(model_name)
        df, encoders, norm_stats = self._load_and_prepare()

        mlflow.set_tracking_uri(f"file://{os.path.abspath('experiments')}")
        mlflow.set_experiment(f"ablation_feature_{model_name}")

        # Determine base model type
        temp_model = get_model(model_name, model_cfg)
        is_sequence = temp_model.model_type == "sequence"

        results = []
        for group_name, features in self.cfg.feature_groups.items():
            print(f"  Feature group: {group_name}")

            if is_sequence:
                # For sequence models, select which features to include
                seq_features = []
                for f in features:
                    enc_col = f"{f}_enc"
                    if enc_col in df.columns:
                        seq_features.append(enc_col)
                    elif f in df.columns:
                        seq_features.append(f)
                if not seq_features:
                    continue
                X, y, _ = create_sequences(
                    df,
                    window_size=self.data_cfg.window_size,
                    feature_cols=seq_features,
                    target_col=f"{self.data_cfg.target_col}_enc",
                )
            else:
                tab_features = []
                for f in features:
                    enc_col = f"{f}_enc"
                    if enc_col in df.columns:
                        tab_features.append(enc_col)
                    elif f in df.columns:
                        tab_features.append(f)
                if not tab_features:
                    continue
                X = df[tab_features].values
                y = df[f"{self.data_cfg.target_col}_enc"].values

            with mlflow.start_run(run_name=f"feature_{group_name}"):
                scores = self._train_and_evaluate(model_name, model_cfg, X, y)
                mean, ci_low, ci_high = bootstrap_confidence_interval(scores)
                mlflow.log_metric("accuracy_mean", mean)
                mlflow.log_param("feature_group", group_name)

            results.append({
                "variant": group_name,
                "accuracy": mean,
                "ci_low": ci_low,
                "ci_high": ci_high,
            })

        return pd.DataFrame(results)

    def run_architecture_ablation(self, model_name: Optional[str] = None) -> pd.DataFrame:
        """For neural models, test architecture variants."""
        model_name = model_name or self.cfg.default_model
        base_cfg = self._get_model_config(model_name)
        df, encoders, norm_stats = self._load_and_prepare()

        mlflow.set_tracking_uri(f"file://{os.path.abspath('experiments')}")
        mlflow.set_experiment(f"ablation_architecture_{model_name}")

        variants = self.cfg.architecture_variants.get(model_name, {})
        if not variants:
            print(f"No architecture variants defined for {model_name}")
            return pd.DataFrame()

        X, y, _ = create_sequences(
            df,
            window_size=self.data_cfg.window_size,
            feature_cols=self.data_cfg.sequence_features,
            target_col=f"{self.data_cfg.target_col}_enc",
        )

        results = []
        for param_name, param_values in variants.items():
            for val in param_values:
                variant_cfg = dict(base_cfg)
                variant_cfg[param_name] = val
                variant_cfg["epochs"] = min(variant_cfg.get("epochs", 10), 10)
                label = f"{param_name}={val}"
                print(f"  Architecture variant: {label}")

                with mlflow.start_run(run_name=f"arch_{label}"):
                    scores = self._train_and_evaluate(model_name, variant_cfg, X, y)
                    mean, ci_low, ci_high = bootstrap_confidence_interval(scores)
                    mlflow.log_metric("accuracy_mean", mean)
                    mlflow.log_param("variant", label)

                results.append({
                    "variant": label,
                    "accuracy": mean,
                    "ci_low": ci_low,
                    "ci_high": ci_high,
                })

        return pd.DataFrame(results)

    def run_data_ablation(self, model_name: Optional[str] = None) -> pd.DataFrame:
        """Train on varying fractions of data, plot learning curves."""
        model_name = model_name or self.cfg.default_model
        model_cfg = self._get_model_config(model_name)
        df, encoders, norm_stats = self._load_and_prepare()

        mlflow.set_tracking_uri(f"file://{os.path.abspath('experiments')}")
        mlflow.set_experiment(f"ablation_data_{model_name}")

        temp_model = get_model(model_name, model_cfg)
        is_sequence = temp_model.model_type == "sequence"

        if is_sequence:
            X, y, _ = create_sequences(
                df,
                window_size=self.data_cfg.window_size,
                feature_cols=self.data_cfg.sequence_features,
                target_col=f"{self.data_cfg.target_col}_enc",
            )
        else:
            tab_features = []
            for col in self.data_cfg.tabular_features:
                enc_col = f"{col}_enc"
                if enc_col in df.columns:
                    tab_features.append(enc_col)
                elif col in df.columns:
                    tab_features.append(col)
            X = df[tab_features].values
            y = df[f"{self.data_cfg.target_col}_enc"].values

        results = []
        for frac in self.cfg.data_fractions:
            n_samples = int(len(X) * frac)
            X_sub, y_sub = X[:n_samples], y[:n_samples]
            label = f"{frac*100:.0f}%"
            print(f"  Data fraction: {label} ({n_samples} samples)")

            with mlflow.start_run(run_name=f"data_{label}"):
                scores = self._train_and_evaluate(model_name, model_cfg, X_sub, y_sub)
                mean, ci_low, ci_high = bootstrap_confidence_interval(scores)
                mlflow.log_metric("accuracy_mean", mean)
                mlflow.log_param("data_fraction", frac)

            results.append({
                "variant": label,
                "accuracy": mean,
                "ci_low": ci_low,
                "ci_high": ci_high,
                "n_samples": n_samples,
            })

        return pd.DataFrame(results)

    def run_hyperparameter_sensitivity(
        self, model_name: Optional[str] = None, param_name: Optional[str] = None, values: Optional[list] = None
    ) -> pd.DataFrame:
        """Vary one hyperparameter, keep others at defaults."""
        model_name = model_name or self.cfg.default_model
        base_cfg = self._get_model_config(model_name)
        df, encoders, norm_stats = self._load_and_prepare()

        mlflow.set_tracking_uri(f"file://{os.path.abspath('experiments')}")
        mlflow.set_experiment(f"ablation_hyperparam_{model_name}")

        if param_name is None and values is None:
            # Run all configured sensitivity studies
            all_results = []
            for pname, pvalues in self.cfg.hyperparameter_sensitivity.items():
                for val in pvalues:
                    variant_cfg = dict(base_cfg)
                    variant_cfg[pname] = val
                    variant_cfg["epochs"] = min(variant_cfg.get("epochs", 10), 10)
                    label = f"{pname}={val}"
                    print(f"  Hyperparam: {label}")

                    temp_model = get_model(model_name, variant_cfg)
                    is_sequence = temp_model.model_type == "sequence"
                    if is_sequence:
                        ws = variant_cfg.get("window_size", self.data_cfg.window_size)
                        if pname == "window_size":
                            ws = val
                        X, y, _ = create_sequences(
                            df, window_size=ws,
                            feature_cols=self.data_cfg.sequence_features,
                            target_col=f"{self.data_cfg.target_col}_enc",
                        )
                    else:
                        tab_features = []
                        for col in self.data_cfg.tabular_features:
                            enc_col = f"{col}_enc"
                            if enc_col in df.columns:
                                tab_features.append(enc_col)
                            elif col in df.columns:
                                tab_features.append(col)
                        X = df[tab_features].values
                        y = df[f"{self.data_cfg.target_col}_enc"].values

                    with mlflow.start_run(run_name=f"hp_{label}"):
                        scores = self._train_and_evaluate(model_name, variant_cfg, X, y)
                        mean, ci_low, ci_high = bootstrap_confidence_interval(scores)
                        mlflow.log_metric("accuracy_mean", mean)

                    all_results.append({
                        "variant": label,
                        "accuracy": mean,
                        "ci_low": ci_low,
                        "ci_high": ci_high,
                    })
            return pd.DataFrame(all_results)

        # Single param sweep
        results = []
        for val in values:
            variant_cfg = dict(base_cfg)
            variant_cfg[param_name] = val
            label = f"{param_name}={val}"
            print(f"  Hyperparam: {label}")

            temp_model = get_model(model_name, variant_cfg)
            is_sequence = temp_model.model_type == "sequence"
            if is_sequence:
                X, y, _ = create_sequences(
                    df,
                    window_size=self.data_cfg.window_size,
                    feature_cols=self.data_cfg.sequence_features,
                    target_col=f"{self.data_cfg.target_col}_enc",
                )
            else:
                tab_features = []
                for col in self.data_cfg.tabular_features:
                    enc_col = f"{col}_enc"
                    if enc_col in df.columns:
                        tab_features.append(enc_col)
                    elif col in df.columns:
                        tab_features.append(col)
                X = df[tab_features].values
                y = df[f"{self.data_cfg.target_col}_enc"].values

            with mlflow.start_run(run_name=f"hp_{label}"):
                scores = self._train_and_evaluate(model_name, variant_cfg, X, y)
                mean, ci_low, ci_high = bootstrap_confidence_interval(scores)
                mlflow.log_metric("accuracy_mean", mean)

            results.append({
                "variant": label,
                "accuracy": mean,
                "ci_low": ci_low,
                "ci_high": ci_high,
            })

        return pd.DataFrame(results)
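
Example: a minimal usage sketch. The attribute names the runner reads (default_model, feature_groups, data_fractions, architecture_variants, hyperparameter_sensitivity, data_path, window_size) come from the source above, but the import path and constructors for AblationConfig and DataConfig are assumptions; adapt them to wherever the config classes actually live in the project.

from pitch_sequencing.evaluation.ablation import AblationRunner
# Assumed import path for the config classes -- not confirmed by this page.
from pitch_sequencing.config import AblationConfig, DataConfig

ablation_cfg = AblationConfig(
    default_model="lstm",                                # any name registered with get_model()
    feature_groups={                                     # illustrative groups only
        "count": ["balls", "strikes"],
        "prev_pitch": ["prev_pitch_type"],
    },
    data_fractions=[0.1, 0.25, 0.5, 1.0],
    architecture_variants={"lstm": {"hidden_size": [32, 64, 128]}},
    hyperparameter_sensitivity={"learning_rate": [1e-4, 1e-3, 1e-2]},
)
data_cfg = DataConfig(data_path="data/pitches.parquet")  # illustrative path

runner = AblationRunner(ablation_cfg, data_cfg)          # models_config_dir defaults to get_default_config("models")
feature_df = runner.run_feature_ablation()               # DataFrame: variant, accuracy, ci_low, ci_high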

run_architecture_ablation(model_name=None)

For neural models, test architecture variants.

Source code in src/pitch_sequencing/evaluation/ablation.py
def run_architecture_ablation(self, model_name: Optional[str] = None) -> pd.DataFrame:
    """For neural models, test architecture variants."""
    model_name = model_name or self.cfg.default_model
    base_cfg = self._get_model_config(model_name)
    df, encoders, norm_stats = self._load_and_prepare()

    mlflow.set_tracking_uri(f"file://{os.path.abspath('experiments')}")
    mlflow.set_experiment(f"ablation_architecture_{model_name}")

    variants = self.cfg.architecture_variants.get(model_name, {})
    if not variants:
        print(f"No architecture variants defined for {model_name}")
        return pd.DataFrame()

    X, y, _ = create_sequences(
        df,
        window_size=self.data_cfg.window_size,
        feature_cols=self.data_cfg.sequence_features,
        target_col=f"{self.data_cfg.target_col}_enc",
    )

    results = []
    for param_name, param_values in variants.items():
        for val in param_values:
            variant_cfg = dict(base_cfg)
            variant_cfg[param_name] = val
            variant_cfg["epochs"] = min(variant_cfg.get("epochs", 10), 10)
            label = f"{param_name}={val}"
            print(f"  Architecture variant: {label}")

            with mlflow.start_run(run_name=f"arch_{label}"):
                scores = self._train_and_evaluate(model_name, variant_cfg, X, y)
                mean, ci_low, ci_high = bootstrap_confidence_interval(scores)
                mlflow.log_metric("accuracy_mean", mean)
                mlflow.log_param("variant", label)

            results.append({
                "variant": label,
                "accuracy": mean,
                "ci_low": ci_low,
                "ci_high": ci_high,
            })

    return pd.DataFrame(results)
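
Example sketch: sweep the architecture variants configured for a sequence model and pick the best one. Assumes the runner built in the constructor sketch above and that ablation_cfg.architecture_variants has an entry for "lstm"; the model name and selection logic are illustrative.

arch_df = runner.run_architecture_ablation("lstm")
if not arch_df.empty:
    best = arch_df.loc[arch_df["accuracy"].idxmax()]
    print(f"Best variant: {best['variant']} "
          f"(accuracy {best['accuracy']:.3f}, CI [{best['ci_low']:.3f}, {best['ci_high']:.3f}])")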

run_data_ablation(model_name=None)

Train on varying fractions of data, plot learning curves.

Source code in src/pitch_sequencing/evaluation/ablation.py
def run_data_ablation(self, model_name: Optional[str] = None) -> pd.DataFrame:
    """Train on varying fractions of data, plot learning curves."""
    model_name = model_name or self.cfg.default_model
    model_cfg = self._get_model_config(model_name)
    df, encoders, norm_stats = self._load_and_prepare()

    mlflow.set_tracking_uri(f"file://{os.path.abspath('experiments')}")
    mlflow.set_experiment(f"ablation_data_{model_name}")

    temp_model = get_model(model_name, model_cfg)
    is_sequence = temp_model.model_type == "sequence"

    if is_sequence:
        X, y, _ = create_sequences(
            df,
            window_size=self.data_cfg.window_size,
            feature_cols=self.data_cfg.sequence_features,
            target_col=f"{self.data_cfg.target_col}_enc",
        )
    else:
        tab_features = []
        for col in self.data_cfg.tabular_features:
            enc_col = f"{col}_enc"
            if enc_col in df.columns:
                tab_features.append(enc_col)
            elif col in df.columns:
                tab_features.append(col)
        X = df[tab_features].values
        y = df[f"{self.data_cfg.target_col}_enc"].values

    results = []
    for frac in self.cfg.data_fractions:
        n_samples = int(len(X) * frac)
        X_sub, y_sub = X[:n_samples], y[:n_samples]
        label = f"{frac*100:.0f}%"
        print(f"  Data fraction: {label} ({n_samples} samples)")

        with mlflow.start_run(run_name=f"data_{label}"):
            scores = self._train_and_evaluate(model_name, model_cfg, X_sub, y_sub)
            mean, ci_low, ci_high = bootstrap_confidence_interval(scores)
            mlflow.log_metric("accuracy_mean", mean)
            mlflow.log_param("data_fraction", frac)

        results.append({
            "variant": label,
            "accuracy": mean,
            "ci_low": ci_low,
            "ci_high": ci_high,
            "n_samples": n_samples,
        })

    return pd.DataFrame(results)
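
Example sketch: turn the returned DataFrame into a learning curve. The columns (n_samples, accuracy, ci_low, ci_high) are taken from the source above; the use of matplotlib and the model name are assumptions.

import matplotlib.pyplot as plt

data_df = runner.run_data_ablation("lstm")
plt.errorbar(
    data_df["n_samples"],
    data_df["accuracy"],
    yerr=[data_df["accuracy"] - data_df["ci_low"], data_df["ci_high"] - data_df["accuracy"]],
    marker="o",
)
plt.xlabel("training samples")
plt.ylabel("accuracy")
plt.title("Data-size ablation")
plt.show()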

run_feature_ablation(model_name=None)

Remove each feature group one-at-a-time, measure accuracy drop.

Source code in src/pitch_sequencing/evaluation/ablation.py
def run_feature_ablation(self, model_name: Optional[str] = None) -> pd.DataFrame:
    """Remove each feature group one-at-a-time, measure accuracy drop."""
    model_name = model_name or self.cfg.default_model
    model_cfg = self._get_model_config(model_name)
    df, encoders, norm_stats = self._load_and_prepare()

    mlflow.set_tracking_uri(f"file://{os.path.abspath('experiments')}")
    mlflow.set_experiment(f"ablation_feature_{model_name}")

    # Determine base model type
    temp_model = get_model(model_name, model_cfg)
    is_sequence = temp_model.model_type == "sequence"

    results = []
    for group_name, features in self.cfg.feature_groups.items():
        print(f"  Feature group: {group_name}")

        if is_sequence:
            # For sequence models, select which features to include
            seq_features = []
            for f in features:
                enc_col = f"{f}_enc"
                if enc_col in df.columns:
                    seq_features.append(enc_col)
                elif f in df.columns:
                    seq_features.append(f)
            if not seq_features:
                continue
            X, y, _ = create_sequences(
                df,
                window_size=self.data_cfg.window_size,
                feature_cols=seq_features,
                target_col=f"{self.data_cfg.target_col}_enc",
            )
        else:
            tab_features = []
            for f in features:
                enc_col = f"{f}_enc"
                if enc_col in df.columns:
                    tab_features.append(enc_col)
                elif f in df.columns:
                    tab_features.append(f)
            if not tab_features:
                continue
            X = df[tab_features].values
            y = df[f"{self.data_cfg.target_col}_enc"].values

        with mlflow.start_run(run_name=f"feature_{group_name}"):
            scores = self._train_and_evaluate(model_name, model_cfg, X, y)
            mean, ci_low, ci_high = bootstrap_confidence_interval(scores)
            mlflow.log_metric("accuracy_mean", mean)
            mlflow.log_param("feature_group", group_name)

        results.append({
            "variant": group_name,
            "accuracy": mean,
            "ci_low": ci_low,
            "ci_high": ci_high,
        })

    return pd.DataFrame(results)
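
Example sketch: rank feature groups by the cross-validated accuracy each achieves, using the columns built in the source above. The model name is illustrative and the runner comes from the constructor sketch earlier on this page.

feat_df = runner.run_feature_ablation("transformer")
print(feat_df.sort_values("accuracy", ascending=False))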

run_hyperparameter_sensitivity(model_name=None, param_name=None, values=None)

Vary one hyperparameter, keep others at defaults.

Source code in src/pitch_sequencing/evaluation/ablation.py
def run_hyperparameter_sensitivity(
    self, model_name: Optional[str] = None, param_name: Optional[str] = None, values: Optional[list] = None
) -> pd.DataFrame:
    """Vary one hyperparameter, keep others at defaults."""
    model_name = model_name or self.cfg.default_model
    base_cfg = self._get_model_config(model_name)
    df, encoders, norm_stats = self._load_and_prepare()

    mlflow.set_tracking_uri(f"file://{os.path.abspath('experiments')}")
    mlflow.set_experiment(f"ablation_hyperparam_{model_name}")

    if param_name is None and values is None:
        # Run all configured sensitivity studies
        all_results = []
        for pname, pvalues in self.cfg.hyperparameter_sensitivity.items():
            for val in pvalues:
                variant_cfg = dict(base_cfg)
                variant_cfg[pname] = val
                variant_cfg["epochs"] = min(variant_cfg.get("epochs", 10), 10)
                label = f"{pname}={val}"
                print(f"  Hyperparam: {label}")

                temp_model = get_model(model_name, variant_cfg)
                is_sequence = temp_model.model_type == "sequence"
                if is_sequence:
                    ws = variant_cfg.get("window_size", self.data_cfg.window_size)
                    if pname == "window_size":
                        ws = val
                    X, y, _ = create_sequences(
                        df, window_size=ws,
                        feature_cols=self.data_cfg.sequence_features,
                        target_col=f"{self.data_cfg.target_col}_enc",
                    )
                else:
                    tab_features = []
                    for col in self.data_cfg.tabular_features:
                        enc_col = f"{col}_enc"
                        if enc_col in df.columns:
                            tab_features.append(enc_col)
                        elif col in df.columns:
                            tab_features.append(col)
                    X = df[tab_features].values
                    y = df[f"{self.data_cfg.target_col}_enc"].values

                with mlflow.start_run(run_name=f"hp_{label}"):
                    scores = self._train_and_evaluate(model_name, variant_cfg, X, y)
                    mean, ci_low, ci_high = bootstrap_confidence_interval(scores)
                    mlflow.log_metric("accuracy_mean", mean)

                all_results.append({
                    "variant": label,
                    "accuracy": mean,
                    "ci_low": ci_low,
                    "ci_high": ci_high,
                })
        return pd.DataFrame(all_results)

    # Single param sweep
    results = []
    for val in values:
        variant_cfg = dict(base_cfg)
        variant_cfg[param_name] = val
        label = f"{param_name}={val}"
        print(f"  Hyperparam: {label}")

        temp_model = get_model(model_name, variant_cfg)
        is_sequence = temp_model.model_type == "sequence"
        if is_sequence:
            X, y, _ = create_sequences(
                df,
                window_size=self.data_cfg.window_size,
                feature_cols=self.data_cfg.sequence_features,
                target_col=f"{self.data_cfg.target_col}_enc",
            )
        else:
            tab_features = []
            for col in self.data_cfg.tabular_features:
                enc_col = f"{col}_enc"
                if enc_col in df.columns:
                    tab_features.append(enc_col)
                elif col in df.columns:
                    tab_features.append(col)
            X = df[tab_features].values
            y = df[f"{self.data_cfg.target_col}_enc"].values

        with mlflow.start_run(run_name=f"hp_{label}"):
            scores = self._train_and_evaluate(model_name, variant_cfg, X, y)
            mean, ci_low, ci_high = bootstrap_confidence_interval(scores)
            mlflow.log_metric("accuracy_mean", mean)

        results.append({
            "variant": label,
            "accuracy": mean,
            "ci_low": ci_low,
            "ci_high": ci_high,
        })

    return pd.DataFrame(results)
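
Example sketch of both call modes: with no arguments, every entry in ablation_cfg.hyperparameter_sensitivity is swept; passing param_name and values sweeps a single parameter against the model's default config. The parameter name and values below are illustrative.

# Sweep everything configured in ablation_cfg.hyperparameter_sensitivity.
all_hp_df = runner.run_hyperparameter_sensitivity("lstm")

# Sweep one parameter explicitly.
lr_df = runner.run_hyperparameter_sensitivity(
    "lstm", param_name="learning_rate", values=[1e-4, 3e-4, 1e-3]
)
print(lr_df)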