Project: Split table and Create relations Between new Tables (Academy Awards)


Questions to address:

  • Normalize a single table into multiple tables and create relations between the new tables.
  • Create a cerimonies table with a foreign key to ceremony_id in nominations and remove year from nominations
  • Create and populate tables movies, actors and Join table movies_actors
  • Select the movies with the most number of actors nominated using the new tables


Tools:

  • insert multiple rows into a table
    • query = 'Insert into table (col1, col2) VALUES (?, ?)'
    • conn.executemany(query, conn)
  • PRAGMA foreign_keys=ON
  • DROP table table_1)
  • conn.execute('Alter table table_1 RENAME TO table_11');


load defaults

In [1]:
import sqlite3
import pandas as pd
import re
import numpy as np
import requests 
import os
import shutil

%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from matplotlib import rcParams
import matplotlib.dates as mdates

from functions import *

plt.rcParams.update({'axes.titlepad': 20, 'font.size': 12, 'axes.titlesize':20})

colors = [(0/255,107/255,164/255), (255/255, 128/255, 14/255), 'red', 'green', '#9E80BA', '#8EDB8E', '#58517A']
Ncolors = 10
color_map = plt.cm.Blues_r(np.linspace(0.2, 0.5, Ncolors))
#color_map = plt.cm.tab20c_r(np.linspace(0.2, 0.5, Ncolors))


Dataset: Academy Awards for different years

We will modify the original database so the first thing to do is to copy the original into a new file

In [130]:
try:
    os.remove("./data/nominations.db")
    print("file removed")
except:
    print("file not present")
    
shutil.copy("./data/nominations_original.db", "./data/nominations.db")
print("copy created")
file removed
copy created
In [131]:
conn = sqlite3.connect('./data/nominations.db')
query = 'PRAGMA table_info(nominations)'
schema = conn.execute(query).fetchall()
display(schema)

query = 'Select * from nominations'
df = pd.read_sql_query(query, conn)
  
display(df[:3])
[(0, 'Year', 'INTEGER', 0, None, 0),
 (1, 'Category', 'TEXT', 0, None, 0),
 (2, 'Nominee', 'TEXT', 0, None, 0),
 (3, 'Won', 'INTEGER', 0, None, 0),
 (4, 'Movie', 'TEXT', 0, None, 0),
 (5, 'Character', 'TEXT', 0, None, 0)]
Year Category Nominee Won Movie Character
0 2010 Actor -- Leading Role Javier Bardem 0 Biutiful Uxbal
1 2010 Actor -- Leading Role Jeff Bridges 0 True Grit Rooster Cogburn
2 2010 Actor -- Leading Role Jesse Eisenberg 0 The Social Network Mark Zuckerberg


Q1: Create a new table with information about award cerimonies

Created a Host column in the nominations table would generate lots of redundant data. We'll create a ceremonies table that contains the Year and Host for each ceremony and then set up a one-to-many relationship between ceremonies and nominations.

In [132]:
query = """CREATE table IF NOT EXISTS ceremonies (id integer PRIMARY KEY, 
                                                  year integer,
                                                  Host text)"""
conn.execute(query);
In [133]:
years_hosts = [(2010, "Steve Martin"),
               (2009, "Hugh Jackman"),
               (2008, "Jon Stewart"),
               (2007, "Ellen DeGeneres"),
               (2006, "Jon Stewart"),
               (2005, "Chris Rock"),
               (2004, "Billy Crystal"),
               (2003, "Steve Martin"),
               (2002, "Whoopi Goldberg"),
               (2001, "Steve Martin"),
               (2000, "Billy Crystal")]
query = 'Insert OR IGNORE INTO ceremonies (year, host) VALUES (?, ?)'
conn.executemany(query, years_hosts);

print("Insert finished")
Insert finished


Q2: Check if the upload worked

In [134]:
#print(conn.execute('select * from ceremonies LIMIT 10').fetchall())
#print(conn.execute('PRAGMA table_info(ceremonies)').fetchall())

print('Schema for the newly created cerimonies table:')
query = 'PRAGMA table_info(ceremonies)'
df = pd.read_sql_query(query, conn)
display(df[:3])

print("\n\n First rows:")
query = 'select * from ceremonies LIMIT 10'
df = pd.read_sql_query(query, conn)
display(df[:3])
Schema for the newly created cerimonies table:
cid name type notnull dflt_value pk
0 0 id integer 0 None 1
1 1 year integer 0 None 0
2 2 Host text 0 None 0

 First rows:
id year Host
0 1 2010 Steve Martin
1 2 2009 Hugh Jackman
2 3 2008 Jon Stewart


Q3: Turn on key constraints to make sure that we have consistent foreign keys between different tables

For example, since the ceremonies table only contains the id values 1 to 10, inserting a row into nominations while specifying that the ceremony_id value be 11 will work and no error will be returned. This is problematic because if we try to actually join that row with the ceremonies table, the results set will be empty since the id value 11 doesn't map to any row in the ceremonies table.

In [135]:
conn.execute('PRAGMA foreign_keys=ON');
  • The above query needs to be run every time we connect to a database to insert foreign keys.


Q4: Remove year column from nominations and add a foreign key to the cerimonies table (ceremony_id)

SQLite does not allow removal of columns so we need to create a new table with the additional column and delete the original one.

In [136]:
#create table nominations_two with foreign key
query = '''CREATE TABLE IF NOT EXISTS nominations_two(id integer primary key, 
                                        category text,
                                        nominee text,
                                        movie text,
                                        character text, 
                                        won integer,
                                        ceremony_id integer,
                                        FOREIGN KEY(ceremony_id) REFERENCES ceremonies(id) )'''
conn.execute(query)
print("table created")
table created

insert values:

In [137]:
query = '''select nominations.category, nominations.nominee,
                  nominations.movie, nominations.character,
                  nominations.won,
                  ceremonies.id
             from nominations, 
                  ceremonies
            where nominations.year == ceremonies.year'''
#inner join ceremonies on nominations.year==ceremonies.year'''

joined_nominations = conn.execute(query).fetchall()

query = 'INSERT OR IGNORE into nominations_two (category, nominee, movie, character, won, ceremony_id) VALUES (?, ? ,? ,? ,?, ?)'
conn.executemany(query, joined_nominations)
print("INSERT finished")
INSERT finished
In [138]:
query = 'select * from nominations_two limit 10'    
df = pd.read_sql_query(query, conn)
display(df[:2])
id category nominee movie character won ceremony_id
0 1 Actor -- Leading Role Javier Bardem Biutiful Uxbal 0 1
1 2 Actor -- Leading Role Jeff Bridges True Grit Rooster Cogburn 0 1
In [139]:
conn.execute('DROP table nominations')
conn.execute('Alter table nominations_two RENAME TO nominations');


Q5: Create table movies, actors and Join table movies_actors

In [140]:
#create table movies, actors and movies_actors
query = 'CREATE TABLE movies (id integer primary key, movie text)'
conn.execute(query)
query = 'CREATE TABLE actors (id integer primary key, actor text)'
conn.execute(query)
query = '''CREATE TABLE movies_actors (id integer primary key,
                                       movie_id integer,
                                       actor_id integer,
                                       FOREIGN KEY (movie_id) REFERENCES movies(id),                              
                                       FOREIGN KEY (actor_id) REFERENCES actors(id))'''
conn.execute(query)
print("tables created")
tables created


Q6: Populate tables movies, actors and movies_actors

populate actors

In [141]:
query = 'select nominations.id, nominations.nominee from nominations'
actors = conn.execute(query).fetchall()
query = 'INSERT OR IGNORE INTO actors (id, actor) VALUES (?, ?)'
conn.executemany(query, actors)
print(conn.execute('select * from actors limit 2').fetchall())
[(1, 'Javier Bardem'), (2, 'Jeff Bridges')]

populate movies

In [142]:
query = 'select distinct nominations.movie from nominations'
result = conn.execute(query).fetchall()
query = 'insert OR IGNORE into movies (movie) VALUES(?)'
conn.executemany(query, result)
print(conn.execute('select * from movies limit 2').fetchall())
[(1, 'Biutiful '), (2, 'True Grit ')]

populate movies_actors

In [143]:
query = 'select movies.id, nominations.id as actors_id from nominations inner join movies on nominations.movie==movies.movie limit 12'
result = conn.execute(query).fetchall()
query = 'insert OR IGNORE into movies_actors (movie_id, actor_id) VALUES(?,?)'
conn.executemany(query, result)
print(conn.execute('select * from movies_actors limit 15').fetchall())
[(1, 1, 1), (2, 2, 2), (3, 3, 3), (4, 4, 4), (5, 5, 5), (6, 6, 6), (7, 7, 7), (8, 8, 8), (9, 9, 9), (10, 4, 10), (11, 9, 11), (12, 10, 12)]


Q7: Select the movie with the most number of actors nominated using the new tables

In [144]:
query = '''select movie_id, count(*)
             from movies_actors 
         group by movie_id 
         ORDER by 2 DESC LIMIT 1'''
df = pd.read_sql_query(query, conn)
display(df[:10])
movie_id count(*)
0 4 2
In [145]:
query = '''select * 
             from actors, movies, movies_actors 
            where movies.id = (select movie_id from movies_actors group by movie_id ORDER by count(*) DESC LIMIT 1)
              and movies.id = movies_actors.movie_id 
              and actors.id = movies_actors.actor_id '''
df = pd.read_sql_query(query, conn)
display(df[:3])
id actor id movie id movie_id actor_id
0 4 Colin Firth 4 The King's Speech 4 4 4
1 10 Geoffrey Rush 4 The King's Speech 10 4 10
In [ ]: