import pandas as pd
import re
import numpy as np
import requests
from bs4 import BeautifulSoup
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from matplotlib import rcParams
import matplotlib.dates as mdates
from functions import *
# Global matplotlib defaults: title padding and font sizes.
plt.rcParams.update({
    'axes.titlepad': 20,
    'font.size': 12,
    'axes.titlesize': 20,
})

# Qualitative palette: two RGB triples scaled to the 0-1 range, followed by
# named and hex colors.
colors = [
    tuple(channel / 255 for channel in (0, 107, 164)),
    tuple(channel / 255 for channel in (255, 128, 14)),
    'red',
    'green',
    '#9E80BA',
    '#8EDB8E',
    '#58517A',
]

# Sequential palette: Ncolors shades sampled from the reversed Blues colormap.
Ncolors = 10
color_map = plt.cm.Blues_r(np.linspace(0.2, 0.5, Ncolors))
# Alternative palette, kept for reference:
#color_map = plt.cm.tab20c_r(np.linspace(0.2, 0.5, Ncolors))
# Load the Jeopardy data set and preview its first three rows.
jeopardy = pd.read_csv('./data/jeopardy.csv')
display(jeopardy.head(3))

# Remove a single leading space from every column name (presumably some
# headers in the CSV start with one -- confirm against the file).
new_columns = [re.sub('^ ', '', column_name) for column_name in jeopardy.columns]
jeopardy.columns = new_columns
print(jeopardy.columns.values)
def normalize(string):
    """Return *string* lower-cased with common punctuation removed.

    The characters ``: ; , ' " . ! ?`` are deleted; every other character,
    including whitespace, is kept as is.
    """
    lowered = string.lower()
    return re.sub('[:;,\'\".!?]', '', lowered)
# Build the normalized text columns from their raw counterparts.
for source_col, target_col in (('Question', 'clean_question'),
                               ('Answer', 'clean_answer')):
    jeopardy.loc[:, target_col] = jeopardy[source_col].apply(normalize)
display(jeopardy[:1])
def normalize(string):
    """Convert a dollar-value cell such as ``'$1,200'`` to an integer.

    For string input the characters ``$ : ; , ' " . ! ?`` are stripped before
    conversion. Anything that cannot be interpreted as an integer (for
    example the text ``'None'``, an empty string, ``None`` or NaN) yields 0.
    Numeric input is passed straight to ``int``.
    """
    if isinstance(string, str):
        string = re.sub('[$:;,\'\".!?]', '', string)
    try:
        return int(string)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None; ValueError: non-numeric text or NaN;
        # OverflowError: infinity. All are treated as "no value".
        return 0
# Remove the dollar sign from 'Value' and store the amount as an integer.
jeopardy['clean_value'] = jeopardy['Value'].apply(normalize)
# Parse 'Air Date' into datetime values.
jeopardy['Air Date'] = pd.to_datetime(jeopardy['Air Date'])
Decide whether to study past questions, general knowledge, or not study it all:
def funct(series):
    """Return the fraction of answer words that also appear in the question.

    ``series`` is a row-like object providing the strings ``'clean_answer'``
    and ``'clean_question'``. Returns ``0`` when no answer words remain after
    dropping the article "the"; otherwise the number of remaining answer
    words found in the question divided by the number of remaining words.
    """
    # Split question and answer into lists of words.
    split_question = series['clean_question'].split(' ')
    # Drop every occurrence of the article "the" regardless of casing. The
    # previous check looked for 'The', which never matches lower-cased text,
    # so the article was counted as a shared word.
    split_answer = [
        word for word in series['clean_answer'].split(' ')
        if word.lower() != 'the'
    ]
    if not split_answer:
        return 0
    # Count the answer words that the question also contains.
    question_words = set(split_question)
    match_count = sum(1 for word in split_answer if word in question_words)
    return match_count / len(split_answer)
# Score every row, then report the mean overlap between answer and question.
answer_in_question = jeopardy.apply(funct, axis=1)
print("Average fraction of words shared by question and answer: %0.2f" % answer_in_question.mean())
# Share of answers that have at least one word in common with their question.
has_common_word = answer_in_question > 0.
fraction = int(has_common_word.sum()) / len(answer_in_question)
print("Fraction of answers with at least one word in the question: %0.2f" % fraction)
# For each question, measure which share of its long words (more than six
# characters) has already been seen earlier in the iteration, including
# earlier in the same question.
question_overlap = []
terms_used = set()
# Iterate the column directly: iterrows() would build a full Series per row
# just to read this one field.
for question in jeopardy['clean_question']:
    long_words = [word for word in question.split(' ') if len(word) > 6]
    match_count = 0
    for word in long_words:
        if word in terms_used:
            match_count += 1
        # terms_used is a set, so re-adding a known word changes nothing.
        terms_used.add(word)
    if long_words:
        # Fraction of this question's long words that have occurred before.
        match_count = match_count / len(long_words)
    question_overlap.append(match_count)
jeopardy['question_overlap'] = question_overlap
print("Average fraction of reoccuring words: %0.2f" % jeopardy['question_overlap'].mean())
def funct(row):
    """Flag a question as high value: 1 if 'clean_value' exceeds 800, else 0."""
    return 1 if row['clean_value'] > 800 else 0
# Row-wise flag: 1 for questions worth more than 800, 0 otherwise.
jeopardy['high_value'] = jeopardy.apply(funct, axis='columns')
def funct_word(word):
    """Count the questions containing *word*, split by value class.

    Reads the 'clean_question' and 'high_value' columns of the module-level
    ``jeopardy`` DataFrame. A question counts when *word* is one of its
    space-separated tokens.

    Returns a ``(high_count, low_count)`` tuple.
    """
    high_count = 0
    low_count = 0
    # Walk only the two columns that are needed; iterrows() would build a
    # Series for every row on every call.
    for question, high_value in zip(jeopardy['clean_question'],
                                    jeopardy['high_value']):
        if word not in question.split(' '):
            continue
        if high_value == 1:
            high_count += 1
        else:
            low_count += 1
    return high_count, low_count
# Only a small subset of the collected terms is analysed because counting is
# slow. NOTE: terms_used is a set, so which terms end up in this subset is
# arbitrary and may differ between interpreter runs.
comparison_terms = list(terms_used)[:19]
# For each term: (number of high-value questions, number of low-value
# questions) in which it appears.
observed_counts = [funct_word(term) for term in comparison_terms]
print("Number of occurences in high and low value questions")
# Pair each term with its counts directly instead of rebuilding a list from
# the whole set on every iteration.
for term, (high_count, low_count) in zip(comparison_terms, observed_counts):
    print("%s - %d | %d" % (term, high_count, low_count))
from scipy.stats import chisquare
# Number of questions in each value class across the whole data set.
high_value_count = int((jeopardy['high_value'] == 1).sum())
low_value_count = int((jeopardy['high_value'] == 0).sum())
n_questions = jeopardy.shape[0]

chi_squared = []
for high_observed, low_observed in observed_counts:
    # Share of all questions in which this term appears.
    total = high_observed + low_observed
    total_prop = total / n_questions
    # Expected counts per class if the term were spread in proportion to
    # the class sizes.
    expected = [total_prop * high_value_count, total_prop * low_value_count]
    observed = [high_observed, low_observed]
    chi, p_value = chisquare(observed, expected)
    chi_squared.append([chi, p_value])

# Report the p-value next to the term it belongs to.
for term, (_, term_p_value) in zip(comparison_terms, chi_squared):
    print("%s - p_value: %0.2f" % (term, term_p_value))
Potential next steps: