import pandas as pd
import re
import numpy as np
import requests
from bs4 import BeautifulSoup
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from matplotlib import rcParams
import matplotlib.dates as mdates
from functions import *
# Notebook-wide matplotlib defaults: extra padding above titles, larger fonts.
plt.rcParams.update({
    'axes.titlepad': 20,
    'font.size': 12,
    'axes.titlesize': 20,
})

# Categorical palette: two RGB triples scaled to the 0-1 range, followed by
# named and hex colors.
colors = [
    tuple(channel / 255 for channel in (0, 107, 164)),
    tuple(channel / 255 for channel in (255, 128, 14)),
    'red',
    'green',
    '#9E80BA',
    '#8EDB8E',
    '#58517A',
]

# Sequential palette: Ncolors shades sampled evenly from the 0.2-0.5 range of
# the reversed Blues colormap (plt.cm.tab20c_r was tried as an alternative).
Ncolors = 10
shade_positions = np.linspace(0.2, 0.5, Ncolors)
color_map = plt.cm.Blues_r(shade_positions)

# Load the score-comparison dataset and preview the first 4 rows / 10 columns.
movies = pd.read_csv('./data/fandango_score_comparison.csv')
display(movies.iloc[:4, :10])
The dataset contains user and critic scores from RottenTomatoes, Metacritic, IMDB, and Fandango. Each service has ratings on a different scale:
To make it easier to compare scores across services:
# Load the 2016-17 ratings file.
df = pd.read_csv('./data/movie_ratings_16_17.csv')

# Keep only the title, the year and the Fandango rating.
selection = ['movie', 'year', 'fandango']
new_fandango = df[selection]

# Preview the first three rows of the selection.
display(new_fandango.head(3))
import matplotlib.pyplot as plt
%matplotlib inline
# Overlay the two rating distributions on the same axes. Both histograms are
# drawn semi-transparent and labelled so the second one does not hide the
# first and the legend tells the two series apart.
movies['Metacritic_norm_round'].hist(alpha=0.6, label='Metacritic_norm_round')
movies['Fandango_Stars'].hist(alpha=0.6, label='Fandango_Stars')
plt.legend()
plt.show()
import numpy as np
# Pull out the two rating columns once so each statistic reads as a one-liner.
fandango_scores = movies['Fandango_Stars']
metacritic_scores = movies['Metacritic_norm_round']

# Central tendency of each column.
mean_Fandango = fandango_scores.mean()
mean_Metacritic = metacritic_scores.mean()
median_Fandango = fandango_scores.median()
median_Metacritic = metacritic_scores.median()

# Spread of each column, computed through numpy's std.
std_dev_Fandango = np.std(fandango_scores)
std_dev_Metacritic = np.std(metacritic_scores)

# Report each statistic side by side, rounded to two decimals.
mean_report = (mean_Fandango, mean_Metacritic)
median_report = (median_Fandango, median_Metacritic)
std_dev_report = (std_dev_Fandango, std_dev_Metacritic)
print("Mean Fandango: %0.2f, Mean Metacritic: %0.2f" % mean_report)
print("Median Fandango: %0.2f, Median Metacritic: %0.2f" % median_report)
print("Std_dev Fandango: %0.2f, Std_dev Metacritic: %0.2f" % std_dev_report)
The Fandango mean is higher than its median, which suggests the distribution is skewed towards high values.
The Fandango standard deviation is lower than Metacritic's, i.e. the Fandango distribution is narrower ("thinner").
These stats seem to indicate that Fandango ratings are biased high.
# Scatter Fandango stars against the normalized Metacritic score, with a
# 1-to-1 reference line: points above the line are movies that Fandango rates
# higher than Metacritic.
fig = plt.figure(figsize=(6, 5))
ax = plt.subplot()
# The empty label keeps the scatter points out of the legend.
ax.scatter(movies['Metacritic_norm_round'], movies['Fandango_Stars'], label='')

# show 1-to-1 line
xx = np.arange(0, 5, 0.1)
ax.plot(xx, xx, label='1-to-1 line')

# Use the same range on both axes so the 1-to-1 line is a true diagonal.
# (The original set the x-limits twice and never set the y-limits.)
ax.set_xlim(0., 5.5)
ax.set_ylim(0., 5.5)

ax.legend()
ax.set_ylabel('Fandango Rating')
ax.set_xlabel('Metacritic Rating')
plt.show()
# Absolute gap between the normalized Metacritic score and the Fandango stars.
rating_gap = movies['Metacritic_norm_round'] - movies['Fandango_Stars']
movies['fm_diff'] = rating_gap.abs()

# Rank movies by that gap, largest first, and renumber the rows from zero.
sorted_movies = (
    movies
    .sort_values('fm_diff', axis=0, ascending=False)
    .reset_index(drop=True)
)

# Show the five movies with the largest disagreement.
print(sorted_movies.loc[:, ['FILM', 'fm_diff']].head(5))
# Import from the public scipy.stats namespace; the scipy.stats.stats
# submodule is a deprecated private alias.
from scipy.stats import pearsonr

# Pearson correlation coefficient (and its p-value) between the Fandango stars
# and the normalized Metacritic score.
r, p_value = pearsonr(movies['Fandango_Stars'], movies['Metacritic_norm_round'])
print(r)
# Import from the public scipy.stats namespace; the scipy.stats.stats
# submodule is a deprecated private alias.
from scipy.stats import linregress

# Least-squares fit of Fandango stars (y) on the normalized Metacritic score (x).
slope, intercept, r_value, p_value, stderr_slope = linregress(
    movies['Metacritic_norm_round'], movies['Fandango_Stars']
)

# Plot the raw points together with the fitted line.
fig = plt.figure(figsize=(5, 5))
plt.scatter(movies['Metacritic_norm_round'], movies['Fandango_Stars'])
xx = np.arange(0, 5, 0.1)
yy = xx * slope + intercept
plt.plot(xx, yy)

# Same range on both axes so the slope can be read visually.
plt.xlim(0, 5.5)
plt.ylim(0, 5.5)
plt.show()
# Fandango-related columns of the original (old) dataset.
old_selection = ['FILM', 'Fandango_Stars', 'Fandango_Ratingvalue', 'Fandango_votes', 'Fandango_Difference']

# Take an explicit copy: adding columns to a bare selection of `movies`
# triggers pandas' SettingWithCopyWarning and may not behave as intended.
old_fandango = movies[old_selection].copy()

# FILM values look like "Title (YYYY)". Split on the LAST "(" so titles that
# themselves contain parentheses are parsed correctly, and strip the space
# left between the title and the year.
old_fandango['movie'] = old_fandango['FILM'].apply(
    lambda film: film.rsplit('(', 1)[0].strip()
)
old_fandango['year'] = old_fandango['FILM'].apply(
    lambda film: int(film.rsplit('(', 1)[1].replace(')', ''))
)
display(old_fandango[:3])
# Movies in the old dataset with more than 30 Fandango votes.
is_popular = old_fandango['Fandango_votes'] > 30
old_popular = old_fandango[is_popular]

# Share of the old dataset that passes the vote threshold.
popular_fraction = float(len(old_popular) / len(old_fandango))
print("Fraction of popular movies in the old dataset: %0.1f" % popular_fraction)

# Number of movies per release year in each dataset (old first, then new).
for ratings in (old_popular, new_fandango):
    print(ratings['year'].value_counts())
# One sample per year: 2015 releases from the old dataset and 2016 releases
# from the new one.
fandango_2015 = old_popular[old_popular['year'] == 2015]
fandango_2016 = new_fandango[new_fandango['year'] == 2016]

# Preview the first three rows of each sample.
for yearly_sample in (fandango_2015, fandango_2016):
    display(yearly_sample.head(3))
# Percentage of movies at each star rating, ordered by rating, for each year.
print('2015 ratings:')
share_2015 = fandango_2015['Fandango_Stars'].value_counts(normalize=True)
print(share_2015.sort_index() * 100.)

print('\n2016 ratings:')
share_2016 = fandango_2016['fandango'].value_counts(normalize=True)
print(share_2016.sort_index() * 100.)
plt.style.use('fivethirtyeight')

# Draw both kernel density estimates on the same axes: the first call returns
# the axes, the second one is pointed at it explicitly.
kde_ax = fandango_2015['Fandango_Stars'].plot.kde(label='2015', legend=True, figsize=(5, 4))
fandango_2016['fandango'].plot.kde(ax=kde_ax, label='2016', legend=True)

# y > 1 lifts the title slightly above the axes.
kde_ax.set_title("Fandango's ratings (2015 vs 2016)", y=1.02)
kde_ax.set_xlabel('Stars')

# Ratings start at 0 and end at 5; put a tick on every half star.
kde_ax.set_xlim(0, 5)
kde_ax.set_xticks(np.arange(0, 5.1, .5))
plt.show()
# Rating columns of the two yearly samples.
stars_2015 = fandango_2015['Fandango_Stars']
stars_2016 = fandango_2016['fandango']

# Mean, median and mode for each year. Series.mode() returns a Series of
# modes, so take its first entry.
mean_2015, mean_2016 = stars_2015.mean(), stars_2016.mean()
median_2015, median_2016 = stars_2015.median(), stars_2016.median()
mode_2015 = stars_2015.mode().iloc[0]
mode_2016 = stars_2016.mode().iloc[0]

# One column per year, one row per statistic.
summary = pd.DataFrame(
    {
        '2015': [mean_2015, median_2015, mode_2015],
        '2016': [mean_2016, median_2016, mode_2016],
    },
    index=['mean', 'median', 'mode'],
)
display(summary)
plt.style.use('fivethirtyeight')

# Grouped bars: the 2015 bars are centered on each tick and the 2016 bars
# start at the tick (align='edge'), so the two sets sit side by side.
bar_ax = summary['2015'].plot.bar(color='#0066FF', align='center', label='2015', width=.25)
summary['2016'].plot.bar(
    ax=bar_ax,
    color='#CC0000',
    align='edge',
    label='2016',
    width=.25,
    rot=0,
    figsize=(5, 4),
)

# y > 1 lifts the title slightly above the axes.
bar_ax.set_title('Summary statistics: 2015 vs 2016', y=1.02)

# Star scale on the y axis, with a tick on every half star.
bar_ax.set_ylim(0, 5.5)
bar_ax.set_yticks(np.arange(0, 5.1, .5))
bar_ax.set_ylabel('Stars')

# Fully transparent legend frame, centered along the top edge.
bar_ax.legend(framealpha=0, loc='upper center')
plt.show()