Skip to content

API Reference

Auto-generated API documentation from source code docstrings.

Top-Level Package

pitch_sequencing

Baseball pitch sequence prediction and analysis.

__version__ = '0.1.0' module-attribute

MODEL_REGISTRY = {'logistic_regression': LogisticRegressionModel, 'random_forest': RandomForestModel, 'hmm': HMMModel, 'autogluon': AutoGluonModel, 'lstm': LSTMModel, 'cnn1d': CNN1DModel, 'transformer': TransformerModel} module-attribute

DataConfig dataclass

Source code in src/pitch_sequencing/config.py
@dataclass
class DataConfig:
    """Data-related settings: file locations, column names, split and feature lists.

    Every field has a default, so ``DataConfig()`` is valid on its own.
    List-valued fields use ``default_factory`` so each instance owns its own
    list objects rather than sharing one mutable default.
    """

    # Path defaults are computed at instantiation time by module-level helpers.
    data_path: str = field(default_factory=_default_data_path)
    hmm_data_path: str = field(default_factory=_default_hmm_path)

    # Column names.
    target_col: str = "PitchType"
    outcome_col: str = "Outcome"

    # Splitting / reproducibility / windowing parameters.
    test_size: float = 0.2
    n_folds: int = 5
    random_state: int = 42
    window_size: int = 8

    # Feature-name groups.
    tabular_features: List[str] = field(
        default_factory=lambda: [
            "Balls",
            "Strikes",
            "PitcherType",
            "PitchNumber",
            "AtBatNumber",
            "RunnersOn",
            "ScoreDiff",
            "PreviousPitchType",
        ]
    )
    sequence_features: List[str] = field(
        default_factory=lambda: [
            "PitchType_enc",
            "Balls",
            "Strikes",
            "PitcherType_enc",
            "PitchNumber",
            "RunnersOn",
            "ScoreDiff",
        ]
    )
    categorical_features: List[str] = field(
        default_factory=lambda: [
            "PitchType",
            "PitcherType",
            "PreviousPitchType",
            "Outcome",
        ]
    )
    numerical_features: List[str] = field(
        default_factory=lambda: [
            "Balls",
            "Strikes",
            "PitchNumber",
            "AtBatNumber",
            "ScoreDiff",
        ]
    )

    @classmethod
    def from_yaml(cls, path: str) -> "DataConfig":
        """Build a DataConfig from the mapping returned by ``load_config(path)``.

        Args:
            path: Location handed unchanged to ``load_config``.

        Returns:
            A DataConfig in which every loaded key that names a dataclass
            field overrides that field's default. Keys that are not fields
            are ignored; fields absent from the mapping keep their defaults.
        """
        loaded = load_config(path)
        overrides = {}
        for key, value in loaded.items():
            # Only forward keys the constructor knows about; anything else
            # would raise TypeError in __init__.
            if key in cls.__dataclass_fields__:
                overrides[key] = value
        return cls(**overrides)

ModelConfig dataclass

Source code in src/pitch_sequencing/config.py
@dataclass
class ModelConfig:
    """Which model to build, plus a free-form mapping of its hyperparameters."""

    # Model identifier; "lstm" unless overridden.
    model_type: str = "lstm"
    # default_factory gives every instance its own empty dict.
    hyperparameters: Dict[str, Any] = field(default_factory=dict)

    @classmethod
    def from_yaml(cls, path: str) -> "ModelConfig":
        """Build a ModelConfig from the mapping returned by ``load_config(path)``.

        Args:
            path: Location handed unchanged to ``load_config``.

        Returns:
            A ModelConfig whose ``model_type`` is the mapping's ``"model_type"``
            entry (``"lstm"`` if absent) and whose ``hyperparameters`` is the
            same mapping object with that entry removed.
        """
        settings = load_config(path)
        # pop() strips the selector in place so that only the remaining
        # entries are treated as hyperparameters.
        selected_type = settings.pop("model_type", "lstm")
        return cls(hyperparameters=settings, model_type=selected_type)

get_model(name, config=None)

Instantiate a model by registry name.

Parameters:

Name Type Description Default
name

Key in MODEL_REGISTRY (e.g. 'lstm', 'random_forest').

required
config

Optional dict of hyperparameters.

None

Returns:

Type Description

Instance of the model class.

Source code in src/pitch_sequencing/models/__init__.py
def get_model(name, config=None):
    """Look up ``name`` in MODEL_REGISTRY and construct the registered class.

    Args:
        name: Registry key (e.g. 'lstm', 'random_forest').
        config: Optional dict of hyperparameters; passed to the model class
            as its single positional argument (``None`` when omitted).

    Returns:
        A new instance of the class registered under ``name``.

    Raises:
        ValueError: If ``name`` is not a key of MODEL_REGISTRY. The message
            lists the available keys.
    """
    if name in MODEL_REGISTRY:
        model_cls = MODEL_REGISTRY[name]
        return model_cls(config)

    available = list(MODEL_REGISTRY.keys())
    raise ValueError(f"Unknown model '{name}'. Available: {available}")

load_pitch_data(path, filter_none_prev=True)

Load the main pitch dataset.

Parameters:

Name Type Description Default
path str

Path to baseball_pitch_data.csv.

required
filter_none_prev bool

If True, drop rows where PreviousPitchType is 'None'.

True

Returns:

Type Description
DataFrame

DataFrame with pitch data.

Source code in src/pitch_sequencing/data/loader.py
def load_pitch_data(path: str, filter_none_prev: bool = True) -> pd.DataFrame:
    """Load the main pitch dataset.

    Args:
        path: Path to baseball_pitch_data.csv.
        filter_none_prev: If True, drop rows that have no previous pitch,
            i.e. rows whose PreviousPitchType is the text 'None' or is
            missing (NaN) after parsing.

    Returns:
        DataFrame with pitch data. When filtering is applied the index is
        reset to a fresh 0..n-1 range.

    Raises:
        KeyError: If filtering is requested and the file has no
            PreviousPitchType column.
    """
    df = pd.read_csv(path)
    if filter_none_prev:
        prev_pitch = df["PreviousPitchType"]
        # read_csv's default NA markers include the literal text "None", so
        # the placeholder normally arrives as NaN rather than as a string.
        # NaN != "None" evaluates to True, so comparing against the string
        # alone would keep exactly the rows this filter is meant to remove.
        has_previous = prev_pitch.notna() & (prev_pitch != "None")
        df = df[has_previous].reset_index(drop=True)
    return df

create_sequences(df, window_size=8, feature_cols=None, target_col='PitchType_enc')

Create sliding-window sequences respecting game boundaries.

Game boundaries are detected via drops in AtBatNumber (the raw column, AtBatNumber_raw, is preferred when present), falling back to PitchNumber drops when no AtBatNumber column is available. The function expects that categorical columns have already been encoded (e.g. PitchType_enc, PitcherType_enc).

Parameters:

Name Type Description Default
df DataFrame

DataFrame with encoded features.

required
window_size int

Number of previous timesteps per sample.

8
feature_cols Optional[List[str]]

Columns to include as features in each timestep.

None
target_col str

Column to predict.

'PitchType_enc'

Returns:

Type Description
Tuple[ndarray, ndarray, List[int]]

(X, y, game_starts) where X has shape (n_samples, window_size, n_features), y has shape (n_samples,), and game_starts lists the indices where new games start.

Source code in src/pitch_sequencing/data/loader.py
def create_sequences(
    df: pd.DataFrame,
    window_size: int = 8,
    feature_cols: Optional[List[str]] = None,
    target_col: str = "PitchType_enc",
) -> Tuple[np.ndarray, np.ndarray, List[int]]:
    """Create sliding-window sequences respecting game boundaries.

    A game boundary is any row whose boundary-column value is lower than the
    previous row's; row 0 always counts as a boundary. The boundary column is
    the first one present among ``AtBatNumber_raw``, ``AtBatNumber``,
    ``PitchNumber_raw`` and ``PitchNumber``. The function expects that
    categorical columns have already been encoded (e.g. PitchType_enc,
    PitcherType_enc).

    Sample ``k`` uses the ``window_size`` rows immediately before some row
    ``i`` as input and row ``i``'s target as label. A sample is dropped when
    any of those ``window_size + 1`` rows other than the first is a game
    boundary, i.e. when the sample would mix rows from two games.

    Args:
        df: DataFrame with encoded features.
        window_size: Number of previous timesteps per sample.
        feature_cols: Columns to include as features in each timestep.
            Defaults to the seven encoded/numeric pitch columns listed below.
        target_col: Column to predict.

    Returns:
        (X, y, game_starts) where X is float32 with shape
        (n_samples, window_size, n_features), y is int64 with shape
        (n_samples,), and game_starts is the ascending list of row indices
        where new games start. X keeps its 3-D shape even when n_samples is 0.

    Raises:
        ValueError: If window_size is negative.
        KeyError: If a feature column, the target column, or every candidate
            boundary column is missing from ``df``.
    """
    if window_size < 0:
        raise ValueError(f"window_size must be non-negative, got {window_size}")

    if feature_cols is None:
        feature_cols = [
            "PitchType_enc", "Balls", "Strikes", "PitcherType_enc",
            "PitchNumber", "RunnersOn", "ScoreDiff",
        ]

    features = df[feature_cols].values
    targets = df[target_col].to_numpy()
    n_rows = len(features)

    # Prefer AtBatNumber (raw if available) for boundary detection and fall
    # back to PitchNumber; a missing "PitchNumber" surfaces as KeyError.
    boundary_name = next(
        (
            name
            for name in ("AtBatNumber_raw", "AtBatNumber", "PitchNumber_raw")
            if name in df.columns
        ),
        "PitchNumber",
    )
    boundary_col = df[boundary_name].values

    if n_rows:
        # Prepending a value one above the first entry forces a negative
        # difference at index 0, so the first row is always a game start.
        is_start = np.diff(boundary_col, prepend=boundary_col[0] + 1) < 0
    else:
        # Nothing to diff; boundary_col[0] would raise on an empty frame.
        is_start = np.zeros(0, dtype=bool)
    game_starts = np.flatnonzero(is_start).tolist()

    # starts_so_far[i] counts game starts in rows 0..i. A sample with target
    # row i covers rows i-window_size..i and stays inside one game exactly
    # when no start falls in (i-window_size, i], i.e. when the two running
    # counts are equal.
    starts_so_far = np.cumsum(is_start)
    target_idx = np.arange(window_size, n_rows)
    same_game = starts_so_far[target_idx] == starts_so_far[target_idx - window_size]
    target_idx = target_idx[same_game]

    # Row indices of each window: target-window_size .. target-1.
    window_idx = target_idx[:, None] + np.arange(-window_size, 0)[None, :]

    # Fancy indexing keeps the (n_samples, window_size, ...) layout even when
    # no sample survives, unlike stacking an empty Python list.
    X = np.asarray(features, dtype=np.float32)[window_idx]
    y = np.asarray(targets[target_idx], dtype=np.int64)
    return X, y, game_starts

generate_dataset(num_games=3000, at_bats_per_game=35, seed=42)

Generate the main pitch dataset by simulating full games.

Source code in src/pitch_sequencing/data/simulator.py
def generate_dataset(num_games: int = 3000, at_bats_per_game: int = 35, seed: int = 42) -> pd.DataFrame:
    """Simulate full games and return every recorded pitch as one DataFrame.

    The module-level ``random`` generator is seeded with ``seed`` before any
    draw. Each game picks one pitcher type from PITCHER_ARCHETYPES, builds a
    fresh BaseballPitchSimulator, and starts with a score difference of 0.

    Args:
        num_games: Number of games to simulate.
        at_bats_per_game: Number of at-bats simulated in each game.
        seed: Seed passed to ``random.seed``.

    Returns:
        DataFrame with columns Balls, Strikes, PitchType, Outcome,
        PitcherType, PitchNumber, AtBatNumber, RunnersOn, ScoreDiff and
        PreviousPitchType, one row per recorded pitch in simulation order.
    """
    random.seed(seed)
    archetype_names = list(PITCHER_ARCHETYPES.keys())
    column_names = [
        "Balls", "Strikes", "PitchType", "Outcome",
        "PitcherType", "PitchNumber", "AtBatNumber",
        "RunnersOn", "ScoreDiff",
    ]
    rows = []

    for _game in range(num_games):
        pitcher_type = random.choice(archetype_names)
        simulator = BaseballPitchSimulator(pitcher_type=pitcher_type)
        score_diff = 0

        for at_bat_num in range(1, at_bats_per_game + 1):
            # Draw order matters for reproducibility: runners first, then the
            # score-change roll, then (only if it hits) the score delta.
            runners_on = random.random() < 0.35
            if random.random() < 0.15:
                score_diff += random.choice([-1, 1, 1, 2])
            # Clamp the running score difference to [-8, 8].
            if score_diff > 8:
                score_diff = 8
            elif score_diff < -8:
                score_diff = -8

            at_bat = simulator.simulate_at_bat(runners_on=runners_on, score_diff=score_diff)
            # The final element of the at-bat result is not recorded.
            # NOTE(review): presumably a terminal marker - confirm in the simulator.
            # pitch_count is read after the at-bat has been simulated.
            rows.extend(
                [
                    balls, strikes, pitch_type, outcome,
                    pitcher_type, simulator.pitch_count,
                    at_bat_num, int(runners_on), score_diff,
                ]
                for (balls, strikes), pitch_type, outcome in at_bat[:-1]
            )

    df = pd.DataFrame(rows, columns=column_names)
    # The shift runs over the whole frame, so only the very first row gets
    # the "None" placeholder; every other row takes the preceding row's pitch.
    previous_pitch = df["PitchType"].shift(1)
    df["PreviousPitchType"] = previous_pitch.fillna("None")
    return df