API Reference¶

Auto-generated API documentation from source code docstrings.

Top-Level Package¶

`pitch_sequencing` ¶

Baseball pitch sequence prediction and analysis.

`__version__ = '0.1.0'` `module-attribute` ¶

`MODEL_REGISTRY = {'logistic_regression': LogisticRegressionModel, 'random_forest': RandomForestModel, 'hmm': HMMModel, 'autogluon': AutoGluonModel, 'lstm': LSTMModel, 'cnn1d': CNN1DModel, 'transformer': TransformerModel}` `module-attribute` ¶

`DataConfig` `dataclass` ¶

Source code in src/pitch_sequencing/config.py

@dataclass
class DataConfig:
    """Data-related settings: file locations, split parameters and feature lists.

    Every field has a default, so ``DataConfig()`` works without arguments.
    The list-valued fields use ``default_factory`` so that each instance gets
    its own list object rather than sharing one.
    """

    # File locations; the defaults are computed by module-level helpers.
    data_path: str = field(default_factory=_default_data_path)
    hmm_data_path: str = field(default_factory=_default_hmm_path)

    # Column names (going by the ``_col`` suffix) for the label and the outcome.
    target_col: str = "PitchType"
    outcome_col: str = "Outcome"

    # Split / evaluation parameters. Presumably a hold-out fraction, a CV fold
    # count and an RNG seed -- confirm against the code that consumes them.
    test_size: float = 0.2
    n_folds: int = 5
    random_state: int = 42

    # Same default as the ``window_size`` argument of ``create_sequences``.
    window_size: int = 8

    # Feature groups, each a list of column names.
    tabular_features: List[str] = field(
        default_factory=lambda: [
            "Balls",
            "Strikes",
            "PitcherType",
            "PitchNumber",
            "AtBatNumber",
            "RunnersOn",
            "ScoreDiff",
            "PreviousPitchType",
        ]
    )
    sequence_features: List[str] = field(
        default_factory=lambda: [
            "PitchType_enc",
            "Balls",
            "Strikes",
            "PitcherType_enc",
            "PitchNumber",
            "RunnersOn",
            "ScoreDiff",
        ]
    )
    categorical_features: List[str] = field(
        default_factory=lambda: [
            "PitchType",
            "PitcherType",
            "PreviousPitchType",
            "Outcome",
        ]
    )
    numerical_features: List[str] = field(
        default_factory=lambda: [
            "Balls",
            "Strikes",
            "PitchNumber",
            "AtBatNumber",
            "ScoreDiff",
        ]
    )

    @classmethod
    def from_yaml(cls, path: str) -> "DataConfig":
        """Build a config from the mapping that ``load_config(path)`` returns.

        Keys that are not fields of this dataclass are ignored; fields that
        are absent from the mapping keep their defaults.

        Args:
            path: Location of the configuration file handed to ``load_config``.

        Returns:
            A new ``DataConfig`` populated from the recognised keys.
        """
        known_fields = cls.__dataclass_fields__
        overrides = {}
        for key, value in load_config(path).items():
            if key in known_fields:
                overrides[key] = value
        return cls(**overrides)

`ModelConfig` `dataclass` ¶

Source code in src/pitch_sequencing/config.py

@dataclass
class ModelConfig:
    """Which model to build, plus a free-form mapping of its hyperparameters."""

    # Defaults to "lstm"; 'lstm' is one of the keys listed in MODEL_REGISTRY.
    model_type: str = "lstm"
    # default_factory gives every instance its own dict.
    hyperparameters: Dict[str, Any] = field(default_factory=dict)

    @classmethod
    def from_yaml(cls, path: str) -> "ModelConfig":
        """Build a config from the mapping that ``load_config(path)`` returns.

        The ``model_type`` entry (default ``"lstm"``) selects the model; every
        remaining entry is passed through as a hyperparameter.

        Args:
            path: Location of the configuration file handed to ``load_config``.

        Returns:
            A new ``ModelConfig``.
        """
        settings = load_config(path)
        # pop() removes the key in place, so whatever is left in ``settings``
        # is exactly the hyperparameter mapping.
        chosen_type = settings.pop("model_type", "lstm")
        return cls(hyperparameters=settings, model_type=chosen_type)

`get_model(name, config=None)` ¶

Instantiate a model by registry name.

Parameters:

Name	Type	Description	Default
`name`		Key in MODEL_REGISTRY (e.g. 'lstm', 'random_forest').	required
`config`		Optional dict of hyperparameters.	`None`

Returns:

Type	Description
	Instance of the model class.

Source code in src/pitch_sequencing/models/__init__.py

def get_model(name, config=None):
    """Look up a model class in MODEL_REGISTRY and construct it.

    Args:
        name: Registry key of the model (e.g. 'lstm', 'random_forest').
        config: Optional dict of hyperparameters; passed to the model class
            as its only positional argument.

    Returns:
        Instance of the model class.

    Raises:
        ValueError: If ``name`` is not a key of MODEL_REGISTRY.
    """
    if name in MODEL_REGISTRY:
        model_cls = MODEL_REGISTRY[name]
        return model_cls(config)
    raise ValueError(f"Unknown model '{name}'. Available: {list(MODEL_REGISTRY.keys())}")

`load_pitch_data(path, filter_none_prev=True)` ¶

Load the main pitch dataset.

Parameters:

Name	Type	Description	Default
`path`	`str`	Path to baseball_pitch_data.csv.	required
`filter_none_prev`	`bool`	If True, drop rows where PreviousPitchType is 'None'.	`True`

Returns:

Type	Description
`DataFrame`	DataFrame with pitch data.

Source code in src/pitch_sequencing/data/loader.py

def load_pitch_data(path: str, filter_none_prev: bool = True) -> pd.DataFrame:
    """Load the main pitch dataset.

    Args:
        path: Path to baseball_pitch_data.csv.
        filter_none_prev: If True, drop rows that have no previous pitch, i.e.
            rows whose PreviousPitchType is the string 'None' or is missing.

    Returns:
        DataFrame with pitch data. When rows are filtered the index is reset
        to 0..n-1.

    Raises:
        KeyError: If filtering is requested but the file has no
            PreviousPitchType column.
    """
    df = pd.read_csv(path)
    if filter_none_prev:
        prev_pitch = df["PreviousPitchType"]
        # Depending on the pandas version, read_csv may parse the text "None"
        # as a missing value (NaN). NaN != "None" evaluates to True, so a
        # plain string comparison would keep exactly the rows this filter is
        # meant to drop. Check for missing values as well.
        has_previous = prev_pitch.notna() & (prev_pitch != "None")
        df = df[has_previous].reset_index(drop=True)
    return df

`create_sequences(df, window_size=8, feature_cols=None, target_col='PitchType_enc')` ¶

Create sliding-window sequences respecting game boundaries.

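Game boundaries are detected as drops in the AtBatNumber column (AtBatNumber_raw is preferred when present); if neither exists, PitchNumber_raw and then PitchNumber are used instead. The first row is always treated as a game start. The function expects that categorical columns have already been encoded (e.g. PitchType_enc, PitcherType_enc).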

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	DataFrame with encoded features.	required
`window_size`	`int`	Number of previous timesteps per sample.	`8`
`feature_cols`	`Optional[List[str]]`	Columns to include as features in each timestep.	`None`
`target_col`	`str`	Column to predict.	`'PitchType_enc'`

Returns:

Type	Description
`Tuple[ndarray, ndarray, List[int]]`	(X, y, game_starts) where X has shape (n_samples, window_size, n_features), y has shape (n_samples,), and game_starts lists the indices where new games start.

Source code in src/pitch_sequencing/data/loader.py

def create_sequences(
    df: pd.DataFrame,
    window_size: int = 8,
    feature_cols: Optional[List[str]] = None,
    target_col: str = "PitchType_enc",
) -> Tuple[np.ndarray, np.ndarray, List[int]]:
    """Create sliding-window sequences respecting game boundaries.

    Each sample consists of the ``window_size`` rows preceding row ``i`` as
    input and the target value of row ``i`` as label. A sample is discarded
    when its window-plus-target span crosses into a new game.

    Game boundaries are detected as drops in a counter column. The first
    column present in ``df`` is used, in this order: ``AtBatNumber_raw``,
    ``AtBatNumber``, ``PitchNumber_raw``, ``PitchNumber``. Row 0 is always
    treated as a game start. The function expects that categorical columns
    have already been encoded (e.g. PitchType_enc, PitcherType_enc).

    Args:
        df: DataFrame with encoded features.
        window_size: Number of previous timesteps per sample.
        feature_cols: Columns to include as features in each timestep.
            Defaults to the seven standard sequence features.
        target_col: Column to predict.

    Returns:
        (X, y, game_starts) where X is float32 with shape
        (n_samples, window_size, n_features), y is int64 with shape
        (n_samples,), and game_starts lists the row indices where new games
        start. When no sample qualifies (including an empty ``df``), X and y
        are empty but keep those shapes.

    Raises:
        ValueError: If ``window_size`` is negative.
        KeyError: If a feature/target column or every boundary column is
            missing from ``df``.
    """
    if window_size < 0:
        raise ValueError(f"window_size must be non-negative, got {window_size}")

    if feature_cols is None:
        feature_cols = [
            "PitchType_enc", "Balls", "Strikes", "PitcherType_enc",
            "PitchNumber", "RunnersOn", "ScoreDiff",
        ]

    features = np.asarray(df[feature_cols].values, dtype=np.float32)
    targets = df[target_col].values
    n_rows, n_features = features.shape

    if n_rows == 0:
        # Nothing to window; also avoids indexing boundary_col[0] below.
        empty_X = np.empty((0, window_size, n_features), dtype=np.float32)
        return empty_X, np.empty((0,), dtype=np.int64), []

    # Prefer AtBatNumber (raw, then as-is) and fall back to PitchNumber.
    # If none exists, the final lookup of "PitchNumber" raises KeyError.
    preferred = ("AtBatNumber_raw", "AtBatNumber", "PitchNumber_raw")
    boundary_name = next((c for c in preferred if c in df.columns), "PitchNumber")
    boundary_col = df[boundary_name].values

    # A new game starts wherever the counter drops. Prepending first+1 makes
    # the first difference negative, so row 0 always counts as a start.
    is_start = np.diff(boundary_col, prepend=boundary_col[0] + 1) < 0
    start_idx = np.flatnonzero(is_start)

    # starts_before[k] = number of game starts among rows [0, k).
    starts_before = np.concatenate(([0], np.cumsum(is_start)))

    # Sample i uses rows i-window_size .. i-1 as input and row i as target.
    # It stays inside one game iff no start lies in rows i-window_size+1 .. i
    # (a start exactly on the first window row is fine).
    target_idx = np.arange(window_size, n_rows)
    starts_inside = (
        starts_before[target_idx + 1] - starts_before[target_idx - window_size + 1]
    )
    keep = target_idx[starts_inside == 0]

    # Row indices of every kept window, shape (n_samples, window_size).
    window_idx = keep[:, None] - window_size + np.arange(window_size)
    X = features[window_idx]
    y = np.asarray(targets[keep], dtype=np.int64)
    return X, y, start_idx.tolist()

`generate_dataset(num_games=3000, at_bats_per_game=35, seed=42)` ¶

Generate the main pitch dataset by simulating full games.

Source code in src/pitch_sequencing/data/simulator.py

def generate_dataset(num_games: int = 3000, at_bats_per_game: int = 35, seed: int = 42) -> pd.DataFrame:
    """Simulate full games and collect every recorded pitch into one table.

    Seeds the module-level ``random`` generator, then for each game picks a
    pitcher archetype at random and simulates ``at_bats_per_game`` at-bats
    with a fresh ``BaseballPitchSimulator``.

    Args:
        num_games: Number of games to simulate.
        at_bats_per_game: Number of at-bats simulated in each game.
        seed: Value passed to ``random.seed``.

    Returns:
        DataFrame with one row per recorded pitch and the columns Balls,
        Strikes, PitchType, Outcome, PitcherType, PitchNumber, AtBatNumber,
        RunnersOn, ScoreDiff and PreviousPitchType.
    """
    random.seed(seed)
    archetype_names = list(PITCHER_ARCHETYPES.keys())
    column_names = [
        "Balls", "Strikes", "PitchType", "Outcome",
        "PitcherType", "PitchNumber", "AtBatNumber",
        "RunnersOn", "ScoreDiff",
    ]
    rows = []

    for _game in range(num_games):
        pitcher_type = random.choice(archetype_names)
        sim = BaseballPitchSimulator(pitcher_type=pitcher_type)
        margin = 0  # running score differential for this game

        for at_bat_num in range(1, at_bats_per_game + 1):
            # Runners are on base in 35% of at-bats.
            runners_on = random.random() < 0.35
            # 15% of at-bats move the score; the step is drawn from
            # [-1, 1, 1, 2], so increases are more likely than decreases.
            if random.random() < 0.15:
                margin += random.choice([-1, 1, 1, 2])
            # Keep the differential within [-8, 8].
            margin = max(-8, min(8, margin))

            events = sim.simulate_at_bat(runners_on=runners_on, score_diff=margin)
            # The final element is skipped -- presumably a terminal marker
            # rather than a pitch; confirm against simulate_at_bat.
            for (balls, strikes), pitch_type, outcome in events[:-1]:
                # NOTE(review): pitch_count is read after the at-bat has been
                # simulated, so every pitch of an at-bat may record the same
                # PitchNumber -- confirm this is intended.
                rows.append([
                    balls, strikes, pitch_type, outcome,
                    pitcher_type, sim.pitch_count,
                    at_bat_num, int(runners_on), margin,
                ])

    df = pd.DataFrame(rows, columns=column_names)
    # NOTE(review): the shift runs over the whole table, so only the very
    # first row gets "None"; the first pitch of each later game inherits the
    # last pitch of the previous game.
    previous_pitch = df["PitchType"].shift(1)
    df["PreviousPitchType"] = previous_pitch.fillna("None")
    return df

API Reference¶

Top-Level Package¶

pitch_sequencing ¶

__version__ = '0.1.0' module-attribute ¶

MODEL_REGISTRY = {'logistic_regression': LogisticRegressionModel, 'random_forest': RandomForestModel, 'hmm': HMMModel, 'autogluon': AutoGluonModel, 'lstm': LSTMModel, 'cnn1d': CNN1DModel, 'transformer': TransformerModel} module-attribute ¶

DataConfig dataclass ¶

ModelConfig dataclass ¶

get_model(name, config=None) ¶

load_pitch_data(path, filter_none_prev=True) ¶

create_sequences(df, window_size=8, feature_cols=None, target_col='PitchType_enc') ¶

generate_dataset(num_games=3000, at_bats_per_game=35, seed=42) ¶

`pitch_sequencing` ¶

`__version__ = '0.1.0'` `module-attribute` ¶

`MODEL_REGISTRY = {'logistic_regression': LogisticRegressionModel, 'random_forest': RandomForestModel, 'hmm': HMMModel, 'autogluon': AutoGluonModel, 'lstm': LSTMModel, 'cnn1d': CNN1DModel, 'transformer': TransformerModel}` `module-attribute` ¶

`DataConfig` `dataclass` ¶

`ModelConfig` `dataclass` ¶

`get_model(name, config=None)` ¶

`load_pitch_data(path, filter_none_prev=True)` ¶

`create_sequences(df, window_size=8, feature_cols=None, target_col='PitchType_enc')` ¶

`generate_dataset(num_games=3000, at_bats_per_game=35, seed=42)` ¶