import pandas as pd
import re
import numpy as np
from IPython.display import display
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from matplotlib import rcParams
from functions import *
plt.rcParams.update({'axes.titlepad': 20, 'font.size': 12, 'axes.titlesize':20})
colors = [(0/255,107/255,164/255), (255/255, 128/255, 14/255), 'red', 'green']
The dataset consists of 8 files that need to be cleaned and combined:
All files will end up in a single dictionary named 'data':
data_files = ["ap_2010.csv", "class_size.csv", "demographics.csv",
"graduation.csv", "hs_directory.csv", "sat_results.csv"]
data = {}
for f in data_files:
    d = pd.read_csv("./data/schools/{0}".format(f))
    data[f.replace(".csv", "")] = d
all_survey = pd.read_csv("data/schools/survey_all.txt", delimiter="\t", encoding='windows-1252')
d75_survey = pd.read_csv("data/schools/survey_d75.txt", delimiter="\t", encoding='windows-1252')
survey = pd.concat([all_survey, d75_survey], axis=0, sort=True)
survey["DBN"] = survey["dbn"]
survey_fields = ["DBN", "rr_s", "rr_t", "rr_p", "N_s", "N_t", "N_p", "saf_p_11",
"com_p_11", "eng_p_11", "aca_p_11", "saf_t_11", "com_t_11",
"eng_t_11", "aca_t_11", "saf_s_11", "com_s_11", "eng_s_11",
"aca_s_11", "saf_tot_11", "com_tot_11", "eng_tot_11", "aca_tot_11"]
survey = survey.loc[:,survey_fields]
data["survey"] = survey
data_sets = ["ap_2010", "class_size", "demographics", "graduation", "hs_directory", "sat_results", "survey"]
print("reading done")
DataFrame snippets:
for df in data_sets:
    display(data[df].iloc[:3,:5])
'hs_directory' and 'class_size' don't have a 'DBN' column yet, so we construct one for each:
data["hs_directory"]["DBN"] = data["hs_directory"]["dbn"]
def pad_csd(num):
    #zero-pad the community school district number to two digits
    string_representation = str(num)
    if len(string_representation) > 1:
        return string_representation
    else:
        return "0" + string_representation
data["class_size"]["padded_csd"] = data["class_size"]["CSD"].apply(pad_csd)
data["class_size"]["DBN"] = data["class_size"]["padded_csd"] + data["class_size"]["SCHOOL CODE"]
print("DBN columns added")
The 'sat_results', 'ap_2010' and 'hs_directory' data contain number columns stored as strings that need to be converted to numeric (errors="coerce" returns NaN for values that can't be parsed):
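A quick illustration of the coercion behavior (a toy example, not part of the dataset):
#non-numeric strings become NaN instead of raising an error
print(pd.to_numeric(pd.Series(["380", "s"]), errors="coerce"))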
#sat_results
cols = ['SAT Math Avg. Score', 'SAT Critical Reading Avg. Score', 'SAT Writing Avg. Score']
for c in cols:
    data["sat_results"][c] = pd.to_numeric(data["sat_results"][c], errors="coerce")
data['sat_results']['sat_score'] = data['sat_results'][cols[0]] + data['sat_results'][cols[1]] + data['sat_results'][cols[2]]
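Equivalently, the three components can be summed in one call; skipna=False keeps the NaN-propagating behaviour of the '+' chain (a sketch):
data['sat_results']['sat_score'] = data['sat_results'][cols].sum(axis=1, skipna=False)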
#ap_2010
cols = ['AP Test Takers ', 'Total Exams Taken', 'Number of Exams with scores 3 4 or 5']
for col in cols:
    data["ap_2010"][col] = pd.to_numeric(data["ap_2010"][col], errors="coerce")
#hs_directory
def find_lat(loc):
    #pull the "(lat, lon)" pair out of the free-text Location 1 field
    coords = re.findall(r"\(.+, .+\)", loc)
    lat = coords[0].split(",")[0].replace("(", "")
    return lat

def find_lon(loc):
    coords = re.findall(r"\(.+, .+\)", loc)
    lon = coords[0].split(",")[1].replace(")", "").strip()
    return lon
data["hs_directory"]["lat"] = data["hs_directory"]["Location 1"].apply(find_lat)
data["hs_directory"]["lon"] = data["hs_directory"]["Location 1"].apply(find_lon)
data["hs_directory"]["lat"] = pd.to_numeric(data["hs_directory"]["lat"], errors="coerce")
data["hs_directory"]["lon"] = pd.to_numeric(data["hs_directory"]["lon"], errors="coerce")
print("conversion to numeric done")
class_size = data["class_size"]
class_size = class_size[class_size["GRADE "] == "09-12"]
class_size = class_size[class_size["PROGRAM TYPE"] == "GEN ED"]
#aggregate rows that share a DBN (one row per grade/program/core subject) by taking the mean of each numeric column
class_size = class_size.groupby("DBN").agg(np.mean)
#Reset the index to make DBN a column again.
class_size.reset_index(inplace=True)
data["class_size"] = class_size
data["demographics"] = data["demographics"][data["demographics"]["schoolyear"] == 20112012]
data["graduation"] = data["graduation"][data["graduation"]["Cohort"] == "2006"]
data["graduation"] = data["graduation"][data["graduation"]["Demographic"] == "Total Cohort"]
print("data sets condensed")
Combine all files:
#Both the ap_2010 and the graduation data sets have many missing DBN values,
#so we'll use a left join when we merge the sat_results data set with them.
combined = data["sat_results"]
combined = combined.merge(data["ap_2010"], on="DBN", how="left")
combined = combined.merge(data["graduation"], on="DBN", how="left")
#Because these files contain information that's more valuable to our analysis and
#also have fewer missing DBN values, we'll use the inner join type.
to_merge = ["class_size", "demographics", "survey", "hs_directory"]
for m in to_merge:
    combined = combined.merge(data[m], on="DBN", how="inner")
combined = combined.fillna(combined.mean())
combined = combined.fillna(0)
print("datasets combined")
def get_first_two_chars(dbn):
    return dbn[0:2]
combined["school_dist"] = combined["DBN"].apply(get_first_two_chars)
correlations = combined.corr()
correlations = correlations["sat_score"]
#keep correlations with a meaningful magnitude, and drop variables that correlate almost perfectly with sat_score (its own component scores)
print(correlations[((correlations>0.4) | (correlations<-0.4)) & (correlations<0.97)])
fig, ax = plt.subplots(figsize=(5,10))
field_names = ["rr_s", "rr_t", "rr_p", "N_s", "N_t", "N_p",
"saf_p_11", "com_p_11", "eng_p_11", "aca_p_11",
"saf_t_11", "com_t_11", "eng_t_11", "aca_t_11",
"saf_s_11", "com_s_11", "eng_s_11", "aca_s_11",
"saf_tot_11", "com_tot_11", "eng_tot_11", "aca_tot_11"]
fields = ['StudentResponse_Rt', 'TeacherResponse_Rt', 'ParentResponse_Rt',
'N_student_resp', 'N_teacher_resp', 'N_parent_res',
'Safety_Respect_parents', 'Communication_parents',
'Engagement_parents','Academic_expectations_parents',
'Safety_Respect_teachers','Communication_teachers',
'Engagement_teachers','Academic_expectations_teachers',
'Safety_Respect_students','Communication_students',
'Engagement_students','Academic_expectations_students',
'Safety_Respect_total','Communication_total',
'Engagement_total','Academic_expectations_total']
bar_pos = np.arange(0,len(fields),1.) + 1.
bar_widths = correlations[field_names]
#sort fields according to bar_widths
combined_list = sorted(zip(bar_widths, fields))
fields = [x for y, x in combined_list]
bar_widths = sorted(bar_widths)
ax.barh(bar_pos, bar_widths, height=0.5)
ax.set_ylim(0,len(fields)+1)
ax.set_yticks(bar_pos)
ax.set_yticklabels(fields)
for key,spine in ax.spines.items():
spine.set_visible(False)
ax.tick_params(left=False, right=False, top=False, bottom=False)
ax.axhline(15.5)
ax.text(0.75, 0.69, 'r_value>0.25', fontsize=12, transform=ax.transAxes)
plt.show()
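An alternative to the zip-and-sort idiom is to sort the correlation Series directly; a sketch that plots the raw column names (build a name-to-label map before sorting if the readable labels are needed):
#sorted_corr = correlations[field_names].sort_values()
#ax.barh(np.arange(len(sorted_corr)) + 1., sorted_corr, height=0.5)
#ax.set_yticks(np.arange(len(sorted_corr)) + 1.)
#ax.set_yticklabels(sorted_corr.index)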
from scipy.stats import linregress
fig, ax = plt.subplots(figsize=(5,5))
ax.scatter(combined['saf_s_11'], combined['sat_score'], s=5, color='blue')
ax.set_xlabel('Safety and Respect (students)')
ax.set_ylabel('SAT scores')
slope,intercept,r_value,p_value,stderr_slope = linregress(combined['saf_s_11'], combined['sat_score'])
xx = np.arange(min(combined['saf_s_11']), max(combined['saf_s_11']), (max(combined['saf_s_11'])-min(combined['saf_s_11']))/100.)
ax.plot(xx,xx*slope+intercept, color="#849AB8", linewidth=2, alpha=0.7)
ax.text(0.5, 0.9, 'r_value=%0.2f' % (r_value), fontsize=15, transform=ax.transAxes)
plt.show()
This allows us to look at the geographical distribution of SAT scores and the percentage of English language learners. We can test the hypothesis that students learning English are likely to live in immigrant communities with lower socio-economic backgrounds and to have lower SAT scores.
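Before mapping, we can check the hypothesized relationship directly (a sketch):
print(combined['ell_percent'].corr(combined['sat_score']))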
groups = combined.groupby(combined['school_dist'])
districts = groups.agg(np.mean)
districts.reset_index(inplace=True)
from mpl_toolkits.basemap import Basemap
fig = plt.figure(figsize=(5,5))
m = Basemap(projection='merc', llcrnrlat=40.496044, urcrnrlat=40.915256,
llcrnrlon=-74.255735, urcrnrlon=-73.700272, resolution='h')
m.fillcontinents(color='#F0E0BB', lake_color='#BBDDF0')
m.drawmapboundary(fill_color='#C2DCEA')
m.drawcoastlines(color='#6D5F47', linewidth=.4)
m.drawrivers(color='#6D5F47', linewidth=.4)
longitudes = districts['lon'].tolist()
latitudes = districts['lat'].tolist()
#zorder=2 draws the points on top of the continents, which is where we want them
#latlon=True means we pass latitude/longitude rather than x/y plot coordinates
#c colors each point by another variable, automatically scaled to the 0-1 range
m.scatter(longitudes, latitudes, s=50, zorder=2, latlon=True, c=districts['sat_score'], cmap='summer')
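#a colorbar would make the sat_score color scale readable (hypothetical addition;
#Basemap.colorbar attaches to the most recently drawn mappable by default)
m.colorbar(location='right')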
plt.show()
fig = plt.figure(figsize=(5,5))
m = Basemap(projection='merc', llcrnrlat=40.496044, urcrnrlat=40.915256,
llcrnrlon=-74.255735, urcrnrlon=-73.700272, resolution='h')
m.fillcontinents(color='#F0E0BB', lake_color='#BBDDF0')
m.drawmapboundary(fill_color='#C2DCEA')
m.drawcoastlines(color='#6D5F47', linewidth=.4)
m.drawrivers(color='#6D5F47', linewidth=.4)
longitudes = districts['lon'].tolist()
latitudes = districts['lat'].tolist()
m.scatter(longitudes, latitudes, s=50, zorder=2, latlon=True, c=districts['ell_percent'], cmap='summer')
plt.show()
fig, ax = plt.subplots(figsize=(5,5))
field_names = ['white_per', 'asian_per', 'black_per', 'hispanic_per']
bar_widths = correlations[field_names]
bar_pos = np.arange(0,len(field_names),1.0)+1.
ax.barh(bar_pos, bar_widths, height=0.5)
ax.set_ylim(0+0.5, (len(field_names)+1)-0.5)
ax.set_xlim(-0.5, 0.7)
ax.set_yticks(bar_pos)
ax.set_yticklabels(field_names)
for key,spine in ax.spines.items():
spine.set_visible(False)
ax.tick_params(left=False, right=False, top=False, bottom=False)
fig, ax = plt.subplots(figsize=(5,5))
ax.scatter(combined['hispanic_per'], combined['sat_score'], s=5, color='blue')
The correlation between hispanic_per and sat_score isn't obvious from the scatter plot. It appears to be driven by very high scores at low Hispanic percentages and very low scores near 100% Hispanic:
print(combined[combined['hispanic_per']>95][['SCHOOL NAME','sat_score']])
print(combined[(combined['hispanic_per']<10) & (combined['sat_score']>1800)][['SCHOOL NAME']])
fig, ax = plt.subplots(figsize=(5,2))
field_names=['male_per', 'female_per']
bar_widths=correlations[field_names]
bar_pos=np.arange(0,len(field_names),1)-0.5
ax.barh(bar_pos, bar_widths, height=0.5)
ax.set_yticks(bar_pos)
ax.set_yticklabels(field_names)
for key,spine in ax.spines.items():
spine.set_visible(False)
ax.tick_params(right=False, top=False)
plt.show()
fig, ax = plt.subplots(figsize=(5,5))
ax.scatter(combined['female_per'], combined['sat_score'], s=5, color='blue')
plt.show()
Schools that are 100% female perform about average.
Between 50% and 80% female, some schools reach very high SAT scores.
The highest achievers are schools near an even split (roughly 40-50% female).
sel = (combined['female_per']>60) & (combined['sat_score']>1700)
print(combined[sel]['SCHOOL NAME'])
combined['ap_per'] = combined['AP Test Takers ']/combined['total_enrollment']
fig, ax = plt.subplots(figsize=(5,5))
ax.scatter(combined['ap_per'], combined['sat_score'], s=5, color='blue')
plt.show()
sel = (combined['ap_per']>0.7)
print(combined[sel]['SCHOOL NAME'])
Potential next steps: