import pandas as pd
import re
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from matplotlib import rcParams
from functions import *
# Matplotlib defaults applied to every figure created after this point.
plt.rcParams.update({'axes.titlepad': 20, 'font.size': 12, 'axes.titlesize': 20})

# Plot palette: two RGB tuples (components scaled to 0-1) and two named colors.
colors = [(0/255, 107/255, 164/255), (255/255, 128/255, 14/255), 'red', 'green']

# Ncolors evenly spaced samples taken from the 0.2-0.5 range of the Blues_r colormap.
Ncolors = 10
color_map = plt.cm.Blues_r(np.linspace(0.2, 0.5, Ncolors))

# Load the survey and keep only rows that have a RespondentID.
star_wars = pd.read_csv("./data/star_wars.csv", encoding="ISO-8859-1")
# .copy() detaches the filtered rows from the frame read from disk, so the
# column assignments made further down operate on an independent DataFrame
# instead of a slice (the pattern that triggers SettingWithCopyWarning).
star_wars = star_wars[pd.notnull(star_wars['RespondentID'])].copy()

# Preview the last six columns of the first three rows.
display(star_wars.iloc[:3, -6:])
There are several other columns containing answers to questions about the Star Wars movies. For some questions, the respondent had to check one or more boxes.
# The two Yes/No survey questions, stored as text in the raw data.
column_one = 'Have you seen any of the 6 films in the Star Wars franchise?'
column_two = 'Do you consider yourself to be a fan of the Star Wars film franchise?'

# Translate the text answers to booleans. Any value that is not a key of
# yes_no (a missing answer included) comes out of map() as NaN.
yes_no = {'Yes': True, 'No': False}
for question in (column_one, column_two):
    star_wars[question] = star_wars[question].map(yes_no)

# Show how many respondents fall under each answer.
for question in (column_one, column_two):
    print(star_wars[question].value_counts())
Columns 3 through 8 (the slice `3:9`) represent a single checkbox question: "Which of the following Star Wars films have you seen?"
For each column, if the value in a cell is the name of the movie, that means the respondent saw the movie. If the value is NaN, the respondent either didn't answer or didn't see the movie.
Convert the values using the `map` function and a dictionary such as: { "Star Wars: Episode I The Phantom Menace": True, NaN: False}
Next, rename the columns to seen1 to seen6
# Movie titles exactly as they appear in the checkbox columns, in episode order.
answers = [
    'Star Wars: Episode I The Phantom Menace',
    'Star Wars: Episode II Attack of the Clones',
    'Star Wars: Episode III Revenge of the Sith',
    'Star Wars: Episode IV A New Hope',
    'Star Wars: Episode V The Empire Strikes Back',
    'Star Wars: Episode VI Return of the Jedi',
]

# Column positions 3-8 hold one checkbox per movie, in the same order as answers.
seen_names = {}
for episode, title in enumerate(answers, start=1):
    column = star_wars.columns[episode + 2]
    # The movie title becomes True and NaN becomes False; any other value
    # is not a key of the mapping and therefore comes out as NaN.
    star_wars[column] = star_wars[column].map({title: True, np.nan: False})
    seen_names[column] = 'seen' + str(episode)

# Rename the six checkbox columns to seen1 ... seen6 in one pass.
star_wars = star_wars.rename(columns=seen_names)

display(star_wars.iloc[:3, 3:9])
Columns 9 through 14 (the slice `9:15`) ask the respondent to rank the Star Wars movies (1 = favorite, 6 = least favorite).
# Column positions 9-14 hold the rank each respondent gave to a movie.
ranking_source = star_wars.columns[9:15]

# Cast the ranks to float so they can be averaged.
star_wars[ranking_source] = star_wars[ranking_source].astype(float)

# Rename the six rank columns to ranking_1 ... ranking_6, keeping their order.
star_wars = star_wars.rename(
    columns={
        old_name: 'ranking_' + str(number)
        for number, old_name in enumerate(ranking_source, start=1)
    }
)

display(star_wars.iloc[:3, 9:15])
import matplotlib.pyplot as plt
%matplotlib inline
# Mean of each ranking_1 ... ranking_6 column across all respondents.
cols = ['ranking_' + str(idx) for idx in range(1, 7)]
rankings = star_wars[cols].mean()

# Horizontal bar chart with one bar per ranking column.
fig, ax = plt.subplots(figsize=(5, 5))
bar_pos = np.arange(len(rankings)) + 0.75
bar_widths = rankings
ax.barh(bar_pos, bar_widths, height=0.5, color=colors[0])
ax.set_yticks(bar_pos)
ax.set_yticklabels(cols)

# Strip the frame and the tick marks so only bars and labels remain.
for spine in ax.spines.values():
    spine.set_visible(False)
ax.tick_params(left=False, right=False, top=False, bottom=False)

plt.show()
# Per-column sum of seen1 ... seen6 divided by the number of rows in the frame.
cols = ['seen' + str(idx) for idx in range(1, 7)]
seen_count = star_wars[cols].sum() / len(star_wars)

# Horizontal bar chart with one bar per seen column.
fig, ax = plt.subplots(figsize=(5, 5))
bar_pos = np.arange(len(seen_count)) + 0.75
bar_widths = seen_count
ax.barh(bar_pos, bar_widths, height=0.5, color=colors[0])
ax.set_yticks(bar_pos)
ax.set_yticklabels(cols)

# Strip the frame and the tick marks so only bars and labels remain.
for spine in ax.spines.values():
    spine.set_visible(False)
ax.tick_params(left=False, right=False, top=False, bottom=False)

plt.show()
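There are several columns that segment our data into two groups. Here are a few examples: `Gender` (Male or Female) and `Do you consider yourself to be a fan of the Star Wars film franchise?` (True or False). Below, the rankings and seen counts are recomputed separately for each gender.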
def ranks_and_seen_count(gender, data=None):
    """Return mean rankings and seen counts for one gender group.

    Parameters
    ----------
    gender
        Value matched against the ``'Gender'`` column (e.g. ``'Male'`` or
        ``'Female'``). A value that matches no row selects an empty group
        instead of failing on an unbound local variable.
    data : pandas.DataFrame, optional
        Frame containing ``'Gender'``, ``'ranking_1'``..``'ranking_6'`` and
        ``'seen1'``..``'seen6'``. When omitted, the module-level
        ``star_wars`` frame is used.

    Returns
    -------
    tuple
        ``(rankings, seen_count)``: the per-column mean of the ranking
        columns and the per-column sum of the seen columns, both restricted
        to rows whose ``'Gender'`` equals ``gender``.
    """
    if data is None:
        data = star_wars

    # One filter covers every gender value; no per-value branch is needed.
    df = data[data['Gender'] == gender]

    ranking_cols = ['ranking_' + str(idx) for idx in range(1, 7)]
    seen_cols = ['seen' + str(idx) for idx in range(1, 7)]

    rankings = df[ranking_cols].mean()
    seen_count = df[seen_cols].sum()
    return rankings, seen_count
# Print, for each gender, the columns with the largest and smallest mean
# ranking and seen count. The stats are computed once per gender; the
# original recomputed the Female group a second time before its "lowest" line.
# NOTE: rankings holds mean ranks and the survey text defines 1 as favorite,
# so the "highest" ranking column is the least-liked movie of the group.
for gender, highest_label, lowest_label in (
    ('Male', 'highest male counts', '\nlowest male counts'),
    ('Female', '\nhighest Female counts', '\nlowest Female counts'),
):
    rankings, seen_count = ranks_and_seen_count(gender)

    print(highest_label)
    print(rankings.idxmax(), ':', rankings.max(), ' ',
          seen_count.idxmax(), ':', seen_count.max())

    print(lowest_label)
    print(rankings.idxmin(), ':', rankings.min(), ' ',
          seen_count.idxmin(), ':', seen_count.min())
Potential next steps: