import sqlite3
import pandas as pd
import re
import numpy as np
import requests
import os
import shutil
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from matplotlib import rcParams
import matplotlib.dates as mdates
from functions import *
plt.rcParams.update({'axes.titlepad': 20, 'font.size': 12, 'axes.titlesize':20})
colors = [(0/255,107/255,164/255), (255/255, 128/255, 14/255), 'red', 'green', '#9E80BA', '#8EDB8E', '#58517A']
Ncolors = 10
color_map = plt.cm.Blues_r(np.linspace(0.2, 0.5, Ncolors))
#color_map = plt.cm.tab20c_r(np.linspace(0.2, 0.5, Ncolors))
We will modify the original database so the first thing to do is to copy the original into a new file
try:
os.remove("./data/nominations.db")
print("file removed")
except:
print("file not present")
shutil.copy("./data/nominations_original.db", "./data/nominations.db")
print("copy created")
conn = sqlite3.connect('./data/nominations.db')
query = 'PRAGMA table_info(nominations)'
schema = conn.execute(query).fetchall()
display(schema)
query = 'Select * from nominations'
df = pd.read_sql_query(query, conn)
display(df[:3])
Created a Host column in the nominations table would generate lots of redundant data. We'll create a ceremonies table that contains the Year and Host for each ceremony and then set up a one-to-many relationship between ceremonies and nominations.
query = """CREATE table IF NOT EXISTS ceremonies (id integer PRIMARY KEY,
year integer,
Host text)"""
conn.execute(query);
years_hosts = [(2010, "Steve Martin"),
(2009, "Hugh Jackman"),
(2008, "Jon Stewart"),
(2007, "Ellen DeGeneres"),
(2006, "Jon Stewart"),
(2005, "Chris Rock"),
(2004, "Billy Crystal"),
(2003, "Steve Martin"),
(2002, "Whoopi Goldberg"),
(2001, "Steve Martin"),
(2000, "Billy Crystal")]
query = 'Insert OR IGNORE INTO ceremonies (year, host) VALUES (?, ?)'
conn.executemany(query, years_hosts);
print("Insert finished")
#print(conn.execute('select * from ceremonies LIMIT 10').fetchall())
#print(conn.execute('PRAGMA table_info(ceremonies)').fetchall())
print('Schema for the newly created cerimonies table:')
query = 'PRAGMA table_info(ceremonies)'
df = pd.read_sql_query(query, conn)
display(df[:3])
print("\n\n First rows:")
query = 'select * from ceremonies LIMIT 10'
df = pd.read_sql_query(query, conn)
display(df[:3])
For example, since the ceremonies table only contains the id values 1 to 10, inserting a row into nominations while specifying that the ceremony_id value be 11 will work and no error will be returned. This is problematic because if we try to actually join that row with the ceremonies table, the results set will be empty since the id value 11 doesn't map to any row in the ceremonies table.
conn.execute('PRAGMA foreign_keys=ON');
SQLite does not allow removal of columns so we need to create a new table with the additional column and delete the original one.
#create table nominations_two with foreign key
query = '''CREATE TABLE IF NOT EXISTS nominations_two(id integer primary key,
category text,
nominee text,
movie text,
character text,
won integer,
ceremony_id integer,
FOREIGN KEY(ceremony_id) REFERENCES ceremonies(id) )'''
conn.execute(query)
print("table created")
insert values:
query = '''select nominations.category, nominations.nominee,
nominations.movie, nominations.character,
nominations.won,
ceremonies.id
from nominations,
ceremonies
where nominations.year == ceremonies.year'''
#inner join ceremonies on nominations.year==ceremonies.year'''
joined_nominations = conn.execute(query).fetchall()
query = 'INSERT OR IGNORE into nominations_two (category, nominee, movie, character, won, ceremony_id) VALUES (?, ? ,? ,? ,?, ?)'
conn.executemany(query, joined_nominations)
print("INSERT finished")
query = 'select * from nominations_two limit 10'
df = pd.read_sql_query(query, conn)
display(df[:2])
conn.execute('DROP table nominations')
conn.execute('Alter table nominations_two RENAME TO nominations');
#create table movies, actors and movies_actors
query = 'CREATE TABLE movies (id integer primary key, movie text)'
conn.execute(query)
query = 'CREATE TABLE actors (id integer primary key, actor text)'
conn.execute(query)
query = '''CREATE TABLE movies_actors (id integer primary key,
movie_id integer,
actor_id integer,
FOREIGN KEY (movie_id) REFERENCES movies(id),
FOREIGN KEY (actor_id) REFERENCES actors(id))'''
conn.execute(query)
print("tables created")
populate actors
query = 'select nominations.id, nominations.nominee from nominations'
actors = conn.execute(query).fetchall()
query = 'INSERT OR IGNORE INTO actors (id, actor) VALUES (?, ?)'
conn.executemany(query, actors)
print(conn.execute('select * from actors limit 2').fetchall())
populate movies
query = 'select distinct nominations.movie from nominations'
result = conn.execute(query).fetchall()
query = 'insert OR IGNORE into movies (movie) VALUES(?)'
conn.executemany(query, result)
print(conn.execute('select * from movies limit 2').fetchall())
populate movies_actors
query = 'select movies.id, nominations.id as actors_id from nominations inner join movies on nominations.movie==movies.movie limit 12'
result = conn.execute(query).fetchall()
query = 'insert OR IGNORE into movies_actors (movie_id, actor_id) VALUES(?,?)'
conn.executemany(query, result)
print(conn.execute('select * from movies_actors limit 15').fetchall())
query = '''select movie_id, count(*)
from movies_actors
group by movie_id
ORDER by 2 DESC LIMIT 1'''
df = pd.read_sql_query(query, conn)
display(df[:10])
query = '''select *
from actors, movies, movies_actors
where movies.id = (select movie_id from movies_actors group by movie_id ORDER by count(*) DESC LIMIT 1)
and movies.id = movies_actors.movie_id
and actors.id = movies_actors.actor_id '''
df = pd.read_sql_query(query, conn)
display(df[:3])