import sqlite3
import pandas as pd
import re
import numpy as np
import requests
import os
import shutil
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from matplotlib import rcParams
import matplotlib.dates as mdates
from functions import *
plt.rcParams.update({'axes.titlepad': 20, 'font.size': 12, 'axes.titlesize':20})
colors = [(0/255,107/255,164/255), (255/255, 128/255, 14/255), 'red', 'green', '#9E80BA', '#8EDB8E', '#58517A']
Ncolors = 10
color_map = plt.cm.Blues_r(np.linspace(0.2, 0.5, Ncolors))
#color_map = plt.cm.tab20c_r(np.linspace(0.2, 0.5, Ncolors))
df = pd.read_csv('./data/academy_awards.csv', encoding='ISO-8859-1')
display(df.iloc[:5,:7])
df['Year'] = df['Year'].str[0:4].astype(np.int64)
#select only years after 2000
later_than_2000 = df[df['Year']>2000].copy()
display(later_than_2000.iloc[:3,:7])
#select only the following categories
award_categories = ['Actor -- Leading Role', 'Actor -- Supporting Role', 'Actress -- Leading Role', 'Actress -- Supporting Role']
nominations = later_than_2000.loc[later_than_2000['Category'].isin(award_categories),:].copy()
display(nominations.iloc[:3,:7])
replace_dict = {'YES': 1, 'NO': 0}
nominations['Won?'] = nominations['Won?'].map(replace_dict)
display(nominations.iloc[:2,:7])
nominations['Won'] = nominations['Won?']
cols_to_drop = ['Won?', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10']
nominations.drop(labels=cols_to_drop, axis=1, inplace=True)
display(nominations[:3])
final_nominations = nominations.reset_index(drop=True).copy()
display(nominations[:3])
additional_info_one = final_nominations['Additional Info'].str.rstrip("'}")
additional_info_two = additional_info_one.str.split("{'")
additional_info_two = additional_info_two.str
movie_names = additional_info_two[0]
characters = additional_info_two[1]
final_nominations['Movie'] = movie_names
final_nominations['Character'] = characters
final_nominations = final_nominations.drop('Additional Info', axis=1)
display(final_nominations[:3])
To create a new database file, we use the sqlite3 library to connect to a file path that doesn't exist yet. If Python can't find the file we specified, it will create it for us and treat it as a SQLite database file.
conn = sqlite3.connect('./data/nominations.db')
final_nominations.to_sql('nominations', conn, index=False, if_exists='replace')
conn.close()
print('table loaded')
conn = sqlite3.connect('./data/nominations.db')
query = 'pragma table_info(nominations)'
schema = conn.execute(query).fetchall()
display(schema)
query = 'select * from nominations LIMIT 10'
df = pd.read_sql_query(query, conn)
display(df[:3])
conn.close()
Future steps, explore academy_awards.csv and fix the rest of the dataset: