import sqlite3
import pandas as pd
import re
import numpy as np
import requests
import os
import shutil
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from matplotlib import rcParams
import matplotlib.dates as mdates
from functions import *
plt.rcParams.update({'axes.titlepad': 20, 'font.size': 12, 'axes.titlesize':20})
colors = [(0/255,107/255,164/255), (255/255, 128/255, 14/255), 'red', 'green', '#9E80BA', '#8EDB8E', '#58517A']
Ncolors = 10
color_map = plt.cm.Blues_r(np.linspace(0.2, 0.5, Ncolors))
#color_map = plt.cm.tab20c_r(np.linspace(0.2, 0.5, Ncolors))
game_log = pd.read_csv('./data/game_log.csv', low_memory=False)
park_c = pd.read_csv('./data/park_codes.csv')
person_c = pd.read_csv('./data/person_codes.csv')
team_c = pd.read_csv('./data/team_codes.csv')
print("reading done")
game_log.csv: one line per game with stats like date, teams, player names, game stats for victory and loosing team
print(game_log.shape)
display(game_log.iloc[:3,[0,1,2,3,4,5,6,7,8,20,80]])
park_codes.csv: stadium information like city, state, league
print(park_c.shape)
display(park_c[:3])
person_codes.csv: personal information on name and role (player, coach, etc)
print(person_c.shape)
display(person_c[:3])
team_codes.csv: teams information like ID, league, city, dates for the start and end of the team
print(team_c.shape)
display(team_c[:3])
game_log["h_league"].value_counts()
We will create a new database so the first thing to do is to delete it if it exists on disk
try:
os.remove("./data/mlb.db")
print("file removed")
except:
print("file not present")
conn = sqlite3.connect('./data/mlb.db')
game_log.to_sql('game_log', conn, index=False)
person_c.to_sql('person_codes', conn, index=False)
team_c.to_sql('team_codes', conn, index=False)
park_c.to_sql('park_codes', conn, index=False)
conn.close()
database_file = './data/mlb.db'
print("databases created")
q=""" ALTER TABLE game_log
ADD COLUMN game_id text
"""
run_command(database_file, q)
q=""" UPDATE game_log
set game_id = h_name || date || number_of_game
"""
run_command(database_file, q)
q = "select game_id from game_log LIMIT 10"
df = run_query(database_file, q)
display(df[:3])
A few facts about the data to take into account for the normalization decisions:
key aspects of normalization:
q=""" CREATE TABLE IF NOT EXISTS person (person_id text ,
first_name text,
last_name text,
PRIMARY KEY (person_id))
"""
run_command(database_file, q)
q=""" INSERT OR IGNORE INTO person
SELECT id, first, last
FROM person_codes
"""
run_command(database_file, q)
q="select * from person"
df = run_query(database_file, q)
display(df[:2])
q=""" CREATE TABLE IF NOT EXISTS park (park_id text,
name text,
nickname text,
city text,
state text,
notes text,
PRIMARY KEY (park_id))
"""
run_command(database_file, q)
q=""" INSERT OR IGNORE INTO park
select park_id,
name,
aka,
city,
state,
notes
from park_codes
"""
run_command(database_file, q)
q="select * from park"
df = run_query(database_file, q)
display(df[:2])
q=""" CREATE TABLE IF NOT EXISTS league (league_id text,
name text,
PRIMARY KEY (league_id))
"""
run_command(database_file, q)
q=""" INSERT OR IGNORE INTO league VALUES ('NL', 'National League'),
('UA', 'Unions Association'),
('PL', 'Players League'),
('AA', 'Double-A League'),
('AL', 'American League'),
('FL', 'Florida League')
"""
run_command(database_file, q)
q="select * from league"
df = run_query(database_file, q)
display(df[:2])
q=""" CREATE TABLE IF NOT EXISTS appearance_type (appearance_type_id text PRIMARY KEY,
name text,
category text)
"""
run_command(database_file, q)
#this information is in the original tables, but to make it simpler it is loaded from a file
conn = sqlite3.connect('./data/mlb.db')
df = pd.read_csv('./data/appearance_type.csv')
df.to_sql('appearance_type_full', conn, index=False)
conn.close()
q="INSERT OR IGNORE INTO appearance_type select * from appearance_type_full"
run_command(database_file, q)
q="select * from appearance_type"
df = run_query(database_file, q)
display(df[:2])
q=""" CREATE TABLE IF NOT EXISTS team (team_id text PRIMARY KEY,
league_id text,
city text,
nickname text,
franch_id text,
FOREIGN KEY (league_id) REFERENCES league(league_id))
"""
run_command(database_file, q)
q=""" INSERT OR IGNORE INTO team
SELECT team_id,
league,
city,
nickname,
franch_id
FROM team_codes
"""
run_command(database_file, q)
q="select * from team"
df = run_query(database_file, q)
display(df[:2])
q=""" CREATE TABLE IF NOT EXISTS game (game_id text PRIMARY KEY,
date text,
number_of_game integer,
park_id text,
length_outs float,
day BOOLEAN,
completion text,
forfeit text,
protest text,
attendance float,
length_minutes float,
additional_info text,
acquisition_info text,
FOREIGN KEY (park_id) REFERENCES park(park_id))
"""
run_command(database_file, q)
q=""" INSERT OR IGNORE INTO game
SELECT game_id,
date,
number_of_game,
park_id,
length_outs,
CASE WHEN 'day' THEN 1 ELSE 0 END as day,
completion,
forefeit,
protest,
attendance,
length_minutes,
additional_info,
acquisition_info
FROM game_log
"""
run_command(database_file, q)
q="select * from game"
df = run_query(database_file, q)
display(df.iloc[:2,:-1])
q="""CREATE TABLE IF NOT EXISTS team_appearance (team_id text,
game_id text,
home BOOLEAN,
league_id text,
score INTEGER,
line_score TEXT,
at_bats ,
hits REAL,
doubles REAL,
triples REAL,
homeruns REAL,
rbi REAL,
sacrifice_hits REAL,
sacrifice_flies REAL,
hit_by_pitch REAL,
walks REAL,
intentional_walks REAL,
strikeouts REAL,
stolen_bases REAL,
caught_stealing REAL,
grounded_into_double REAL,
first_catcher_interference REAL,
left_on_base REAL,
pitchers_used REAL,
individual_earned_runs REAL,
team_earned_runs REAL,
wild_pitches REAL,
balks REAL,
putouts REAL,
assists REAL,
errors REAL,
passed_balls REAL,
double_plays REAL,
triple_plays REAL,
PRIMARY KEY (team_id, game_id),
FOREIGN KEY (team_id) REFERENCES team(team_id),
FOREIGN KEY (game_id) REFERENCES game(game_id),
FOREIGN KEY (league_id) REFERENCES league(league_id))
"""
run_command(database_file, q)
#for each line in game_log there will be two entrie in team_apperance
#one for the home and one for the away team
q=""" INSERT OR IGNORE INTO team_appearance
SELECT h_name,
game_id,
1 AS home,
h_league,
h_score,
h_line_score,
h_at_bats,
h_hits,
h_doubles,
h_triples,
h_homeruns,
h_rbi,
h_sacrifice_hits,
h_sacrifice_flies,
h_hit_by_pitch,
h_walks,
h_intentional_walks,
h_strikeouts,
h_stolen_bases,
h_caught_stealing,
h_grounded_into_double,
h_first_catcher_interference,
h_left_on_base,
h_pitchers_used,
h_individual_earned_runs,
h_team_earned_runs,
h_wild_pitches,
h_balks,
h_putouts,
h_assists,
h_errors,
h_passed_balls,
h_double_plays,
h_triple_plays
from game_log
UNION
SELECT v_name,
game_id,
0 AS home,
v_league,
v_score,
v_line_score,
v_at_bats,
v_hits,
v_doubles,
v_triples,
v_homeruns,
v_rbi,
v_sacrifice_hits,
v_sacrifice_flies,
v_hit_by_pitch,
v_walks,
v_intentional_walks,
v_strikeouts,
v_stolen_bases,
v_caught_stealing,
v_grounded_into_double,
v_first_catcher_interference,
v_left_on_base,
v_pitchers_used,
v_individual_earned_runs,
v_team_earned_runs,
v_wild_pitches,
v_balks,
v_putouts,
v_assists,
v_errors,
v_passed_balls,
v_double_plays,
v_triple_plays
from game_log
"""
run_command(database_file, q)
q="select * from team_appearance"
df = run_query(database_file, q)
display(df.iloc[:2,:9])
q=""" CREATE TABLE IF NOT EXISTS person_appearance (appearance_id INTEGER,
game_id text,
team_id integer,
person_id text,
appearance_type_id text,
PRIMARY KEY (appearance_id),
FOREIGN KEY (person_id) REFERENCES person(person_id),
FOREIGN KEY (team_id) REFERENCES team(team_id),
FOREIGN KEY (game_id) REFERENCES game(game_id),
FOREIGN KEY (appearance_type_id) REFERENCES appearance_type(appearance_type_id))
"""
run_command(database_file, q)
print("table created")
insert into person_appearance first part with explicit query, below with for loop for players
q=""" INSERT OR IGNORE INTO person_appearance(game_id, team_id, person_id, appearance_type_id)
SELECT game_id,
NULL,
lf_umpire_id,
"ULF"
FROM game_log
WHERE lf_umpire_id IS NOT NULL
UNION
SELECT game_id,
NULL,
rf_umpire_id,
"URF"
FROM game_log
WHERE rf_umpire_id IS NOT NULL
UNION
SELECT game_id,
NULL,
hp_umpire_id,
"UHP"
FROM game_log
WHERE hp_umpire_id IS NOT NULL
UNION
SELECT game_id,
NULL,
[1b_umpire_id],
"U1B"
FROM game_log
WHERE [1b_umpire_id] IS NOT NULL
UNION
SELECT game_id,
NULL,
[2b_umpire_id],
"U2B"
FROM game_log
WHERE [2b_umpire_id] IS NOT NULL
UNION
SELECT game_id,
NULL,
[3b_umpire_id],
"U3B"
FROM game_log
WHERE [3b_umpire_id] IS NOT NULL
UNION
SELECT game_id,
v_name,
v_manager_id,
"MM"
FROM game_log
WHERE v_manager_id IS NOT NULL
UNION
SELECT game_id,
h_name,
h_manager_id,
"MM"
FROM game_log
WHERE h_manager_id IS NOT NULL
UNION
SELECT game_id,
CASE WHEN h_score > v_score THEN h_name ELSE v_name END,
winning_pitcher_id,
"AWP"
FROM game_log
WHERE winning_pitcher_id IS NOT NULL
UNION
SELECT game_id,
CASE WHEN h_score < v_score THEN h_name ELSE v_name END,
losing_pitcher_id,
"ALP"
FROM game_log
WHERE losing_pitcher_id IS NOT NULL
UNION
SELECT game_id,
CASE WHEN h_score > v_score THEN h_name ELSE v_name END,
saving_pitcher_id,
"ASP"
FROM game_log
WHERE saving_pitcher_id IS NOT NULL
UNION
SELECT game_id,
CASE WHEN h_score > v_score THEN h_name ELSE v_name END,
winning_rbi_batter_id,
"AWB"
FROM game_log
WHERE winning_rbi_batter_id IS NOT NULL
UNION
SELECT game_id,
h_name,
h_starting_pitcher_id,
"PSP"
FROM game_log
WHERE h_starting_pitcher_id IS NOT NULL
UNION
SELECT game_id,
v_name,
v_starting_pitcher_id,
"PSP"
FROM game_log
WHERE v_starting_pitcher_id IS NOT NULL
"""
run_command(database_file, q)
print("insert done")
insert player positions with for loop, each line in game log will have multiple entries plus offensive and defensive positions must be joined
template = """ INSERT INTO person_appearance (game_id,
team_id,
person_id,
appearance_type_id)
SELECT game_id,
{hv}_name,
{hv}_player_{num}_id,
"O{num}"
FROM game_log
WHERE {hv}_player_{num}_id IS NOT NULL
UNION
SELECT game_id,
{hv}_name,
{hv}_player_{num}_id,
"D" || CAST({hv}_player_{num}_def_pos AS INT)
FROM game_log
WHERE {hv}_player_{num}_id IS NOT NULL;
"""
#run_command(c1)
#run_command(c2)
for hv in ["h","v"]:
for num in range(1,10):
query_vars = {
"hv": hv,
"num": num
}
run_command(database_file, template.format(**query_vars))
print("insert done")
q="""select appearance_type_id, count(*) as count
from person_appearance
group by appearance_type_id
order by 1"""
df = run_query(database_file, q)
display(df[:3])
q="drop table game_log"
run_command(database_file, q)
q="drop table park_codes"
run_command(database_file, q)
q="drop table team_codes"
run_command(database_file, q)
q="drop table person_codes"
run_command(database_file, q)
print("done!")