Skip to content

Data Simulator

Synthetic baseball pitch data generation.

pitch_sequencing.data.simulator

Baseball pitch simulator with pitcher archetypes, sequence strategies, and fatigue.

generate_dataset(num_games=3000, at_bats_per_game=35, seed=42)

Generate the main pitch dataset by simulating full games.

Source code in src/pitch_sequencing/data/simulator.py
def generate_dataset(num_games: int = 3000, at_bats_per_game: int = 35, seed: int = 42) -> pd.DataFrame:
    """Simulate full games and collect every recorded pitch into one DataFrame.

    For each game a pitcher archetype is drawn at random from
    ``PITCHER_ARCHETYPES`` and a fresh ``BaseballPitchSimulator`` is created.
    For each at-bat a runners-on flag is drawn (35% chance) and the score
    differential takes a random step with 15% probability, clamped to
    ``[-8, 8]``, before the at-bat itself is simulated.

    Args:
        num_games: Number of games to simulate.
        at_bats_per_game: Number of at-bats simulated in every game.
        seed: Seed passed to ``random.seed``; this reseeds Python's global
            ``random`` module state.

    Returns:
        A DataFrame with one row per recorded pitch and the columns
        ``Balls``, ``Strikes``, ``PitchType``, ``Outcome``, ``PitcherType``,
        ``PitchNumber``, ``AtBatNumber``, ``RunnersOn``, ``ScoreDiff`` and
        ``PreviousPitchType``. ``PreviousPitchType`` is the ``PitchType``
        column shifted down by one row over the whole frame, so it carries
        across at-bat and game boundaries; only the very first row is
        ``"None"``.
    """
    column_names = [
        "Balls", "Strikes", "PitchType", "Outcome",
        "PitcherType", "PitchNumber", "AtBatNumber",
        "RunnersOn", "ScoreDiff",
    ]

    random.seed(seed)
    archetype_names = [*PITCHER_ARCHETYPES.keys()]
    rows = []

    for _game in range(num_games):
        pitcher_type = random.choice(archetype_names)
        sim = BaseballPitchSimulator(pitcher_type=pitcher_type)
        score_diff = 0

        for at_bat_num in range(1, at_bats_per_game + 1):
            # The order of these draws is part of the seeded random stream;
            # reordering them would change the generated dataset.
            runners_on = random.random() < 0.35
            if random.random() < 0.15:
                score_diff += random.choice([-1, 1, 1, 2])
            # Keep the differential inside [-8, 8].
            score_diff = min(max(score_diff, -8), 8)

            at_bat = sim.simulate_at_bat(runners_on=runners_on, score_diff=score_diff)
            # The final element of the at-bat is skipped.
            # NOTE(review): presumably a terminal marker rather than a pitch -
            # confirm against simulate_at_bat.
            for (balls, strikes), pitch_type, outcome in at_bat[:-1]:
                # pitch_count is read after the at-bat has been simulated.
                rows.append([
                    balls, strikes, pitch_type, outcome,
                    pitcher_type, sim.pitch_count,
                    at_bat_num, int(runners_on), score_diff,
                ])

    df = pd.DataFrame(rows, columns=column_names)
    previous_pitch = df["PitchType"].shift(1)
    df["PreviousPitchType"] = previous_pitch.fillna("None")
    return df

generate_hmm_sequences(num_sequences=2500, sequence_length=100, seed=42)

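Generate the HMM synthetic pitch sequences dataset: fixed-length sequences sampled from a first-order Markov chain over four pitch types (Fastball, Curveball, Slider, Changeup).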

Source code in src/pitch_sequencing/data/simulator.py
def generate_hmm_sequences(
    num_sequences: int = 2500,
    sequence_length: int = 100,
    seed: int = 42,
) -> pd.DataFrame:
    """Generate the HMM synthetic pitch sequences dataset.

    Every sequence is a first-order Markov chain over four pitch types
    (Fastball, Curveball, Slider, Changeup): the first pitch is drawn
    uniformly, and each later pitch is drawn from the row of a fixed
    transition matrix selected by the previous pitch.

    Args:
        num_sequences: Number of sequences (rows) to generate.
        sequence_length: Number of pitches per sequence (columns); must be
            at least 1.
        seed: Seed passed to ``np.random.seed``; this reseeds NumPy's global
            legacy random state.

    Returns:
        A DataFrame with ``num_sequences`` rows and columns ``Pitch_1`` to
        ``Pitch_<sequence_length>``, holding pitch-type names.

    Raises:
        ValueError: If ``sequence_length`` is less than 1.
    """
    # A sequence always contains its start pitch, so a length below 1 cannot
    # match the column labels. Fail early with a clear message, and before
    # touching the global RNG, instead of crashing inside the DataFrame
    # constructor.
    if sequence_length < 1:
        raise ValueError(
            f"sequence_length must be at least 1, got {sequence_length}"
        )

    pitch_types = {0: "Fastball", 1: "Curveball", 2: "Slider", 3: "Changeup"}
    num_pitches = len(pitch_types)

    # Row i holds P(next pitch | current pitch == i); every row sums to 1.
    transition_matrix = np.array([
        [0.15, 0.20, 0.30, 0.35],
        [0.45, 0.10, 0.25, 0.20],
        [0.35, 0.30, 0.10, 0.25],
        [0.50, 0.15, 0.25, 0.10],
    ])

    np.random.seed(seed)
    sequences = []
    for _ in range(num_sequences):
        current = np.random.randint(num_pitches)
        seq = [current]
        for _ in range(sequence_length - 1):
            current = np.random.choice(num_pitches, p=transition_matrix[current])
            seq.append(current)
        sequences.append(seq)

    columns = [f"Pitch_{i}" for i in range(1, sequence_length + 1)]
    df = pd.DataFrame(sequences, columns=columns)
    # Map the integer state codes to their pitch-type names.
    return df.replace(pitch_types)