import pandas as pd
import re
import numpy as np
from IPython.display import display
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from matplotlib import rcParams
from functions import *
# Global matplotlib defaults: title padding 20, base font size 12, title size 20.
plt.rcParams.update({'axes.titlepad': 20, 'font.size': 12, 'axes.titlesize': 20})

# Plot palette: two RGB tuples (0-255 channels scaled into 0-1) and two named colors.
colors = [
    (0/255, 107/255, 164/255),
    (255/255, 128/255, 14/255),
    'red',
    'green',
]

# Load the dataset, discard every row containing a missing value, and rename
# the 'Median' column to 'Median_salary'. Original index labels are kept.
recent_grads = (
    pd.read_csv('./data/recent-grads.csv')
    .dropna(axis=0)
    .rename(columns={'Median': 'Median_salary'})
)
print("reading done")
# Print all column names on a single line, each followed by " | ",
# without a trailing newline.
print("".join("%s | " % name for name in recent_grads.columns), end='')

# Columns shown in the three-row preview below.
columns_list = [
    'Rank', 'Major_code', 'Major', 'Total', 'Men', 'Women', 'Major_category',
    'Sample_size', 'Employed', 'Full_time', 'Part_time', 'Median_salary',
]
display(recent_grads[columns_list].head(3))

# Row count (one row per major) followed by the per-category breakdown.
n_majors = recent_grads.shape[0]
print("Number of Majors: %d\n" % n_majors)
print("Number of Majors per area of study:")
category_counts = recent_grads['Major_category'].value_counts()
print(category_counts)
# Bar chart: number of majors in each major category.
# The counts are computed once; the category labels are the index of that result.
table_data = recent_grads['Major_category'].value_counts()
columns = table_data.index

fig = plt.figure(figsize=(15, 7))
ax = plt.subplot()

bar_width = 0.4
# One bar position per category, shifted right by 0.1.
index = np.arange(len(columns)) + 0.1
ax.bar(index, table_data, bar_width, color=colors[0])

# Put a tick under every bar and label it with the category name, rotated
# 90 degrees.
ax.set_xticks(index)
ax.set_xticklabels(columns, rotation=90)

# Hide the top and right frame lines.
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

plt.show()
from scipy.stats import linregress
# Scatter plot of median salary against sample size, with a fitted line.
ax = recent_grads.plot.scatter(
    x='Sample_size',
    y='Median_salary',
    title='Salary vs Number of Students',
    figsize=(7, 5),
)

# Linear regression of Median_salary on Sample_size.
sample_size = recent_grads["Sample_size"]
slope, intercept, r_value, p_value, stderr_slope = linregress(
    sample_size, recent_grads["Median_salary"]
)

# x grid from 0 up to (excluding) the largest sample size, in steps of 100.
xx = np.arange(0, sample_size.max(), 100.)
ax.plot(xx, slope * xx + intercept, color="#849AB8", linewidth=2, alpha=0.7)

# Annotate the fitted equation and the correlation coefficient in axes
# coordinates, so the text position does not depend on the data range.
annotations = (
    (0.9, 'y=%0.2fx+%0.2e' % (slope, intercept)),
    (0.8, 'r_value=%0.2f' % (r_value)),
)
for y_pos, label in annotations:
    ax.text(0.5, y_pos, label, fontsize=15, transform=ax.transAxes)

plt.show()
# Scatter plot of median salary against the Full_time column, with a fitted line.
ax = recent_grads.plot.scatter(
    x='Full_time',
    y='Median_salary',
    title='Salary vs Number Full Time Jobs',
    figsize=(7, 5),
)

# Linear regression of Median_salary on Full_time.
full_time = recent_grads["Full_time"]
slope, intercept, r_value, p_value, stderr_slope = linregress(
    full_time, recent_grads["Median_salary"]
)

# x grid from 0 up to (excluding) the largest Full_time value, in steps of 100.
xx = np.arange(0, full_time.max(), 100.)
ax.plot(xx, slope * xx + intercept, color="#849AB8", linewidth=2, alpha=0.7)

# Fitted equation and correlation coefficient, placed in axes coordinates.
annotations = (
    (0.9, 'y=%0.2fx+%0.2e' % (slope, intercept)),
    (0.8, 'r_value=%0.2f' % (r_value)),
)
for y_pos, label in annotations:
    ax.text(0.5, y_pos, label, fontsize=15, transform=ax.transAxes)

plt.show()
# Report the major with the highest median salary.
# Boolean mask selecting the row(s) whose median salary equals the maximum.
highest_salary = recent_grads['Median_salary'].max()
max_salary_major = recent_grads['Median_salary'] == highest_salary

# Take the first matching row by POSITION. Plain `[0]` on these columns is a
# label lookup on the integer index, so it only works while a top-salary row
# happens to carry index label 0 and raises KeyError otherwise (for example
# after rows are dropped, sorted or re-indexed). `.iloc[0]` has no such
# dependency on the labels.
top_rows = recent_grads[max_salary_major]
print("Major with highest salary: %s\nHighest Salary: %d\nNumber of Students: %d" %
      (top_rows['Major'].iloc[0], highest_salary, top_rows['Total'].iloc[0]))
# Unemployment rate against the ShareWomen column.
ax = recent_grads.plot.scatter(x='ShareWomen', y='Unemployment_rate')
plt.show()

# Median salary against the ShareWomen column, with a fitted regression line.
ax = recent_grads.plot.scatter(x='ShareWomen', y='Median_salary')
share_women = recent_grads["ShareWomen"]
slope, intercept, r_value, p_value, stderr_slope = linregress(
    share_women, recent_grads["Median_salary"]
)

# x grid from 0 up to (excluding) the largest ShareWomen value, with a step
# of one hundredth of that maximum.
share_max = share_women.max()
xx = np.arange(0, share_max, share_max / 100.)
ax.plot(xx, slope * xx + intercept, color="#849AB8", linewidth=2, alpha=0.7)

# Correlation coefficient, placed in axes coordinates.
ax.text(0.5, 0.9, 'r_value=%0.2f' % (r_value), fontsize=15, transform=ax.transAxes)
plt.show()
# One histogram per column, each shown as its own figure.
# Each entry is (column name, number of bins, (lower, upper) value range).
histogram_specs = (
    ('Median_salary', 20, (0, 120000)),
    ('Unemployment_rate', 12, (0., 0.18)),
    ('ShareWomen', 10, (0, 1)),
)
for column, n_bins, value_range in histogram_specs:
    recent_grads[column].hist(bins=n_bins, range=value_range)
    plt.show()
from pandas.plotting import scatter_matrix
# Pairwise scatter-plot matrix of the three columns below.
matrix_columns = ['Median_salary', 'Unemployment_rate', 'ShareWomen']
matrix_data = recent_grads[matrix_columns]
scatter_matrix(matrix_data, figsize=(10, 10))
plt.show()
# Hexbin plots of median salary against each x column below. Each call gets
# the same grid size and sharex=False; all are displayed by one plt.show().
for x_column in ('Sample_size', 'ShareWomen', 'Unemployment_rate'):
    recent_grads.plot.hexbin(x=x_column, y='Median_salary', gridsize=25, sharex=False)
plt.show()
# Rank the majors by median salary, highest first, and compare the ten
# best-paid with the ten worst-paid.
sorted_grads = recent_grads.sort_values('Median_salary', ascending=False)
top_ten = sorted_grads.head(10)
bottom_ten = sorted_grads.tail(10)

# ShareWomen per major, on a fixed 0-1 y axis so both charts are comparable.
title = 'ShareWomen in Majors with '
for subset, suffix in ((top_ten, 'High-Salary'), (bottom_ten, 'Low-Salary')):
    subset.plot.bar(x='Major', y='ShareWomen', ylim=[0, 1],
                    title=title + suffix, color=colors[0])
plt.show()

# Values of the Men and Women columns per major, drawn as grouped bars.
top_ten.plot.bar(x='Major', y=['Men', 'Women'],
                 title='Number of Men and Women in High-Salary Majors')
bottom_ten.plot.bar(x='Major', y=['Men', 'Women'],
                    title='Number of Men and Women in Low-Salary Majors')
plt.show()

# Unemployment rate per major, on a fixed 0-0.2 y axis.
title = 'Unemployment rate in Major with '
for subset, suffix in ((top_ten, 'High-Salary'), (bottom_ten, 'Low-Salary')):
    subset.plot.bar(x='Major', y='Unemployment_rate', ylim=[0, 0.2],
                    title=title + suffix, color=colors[0])
plt.show()
# Box plot of each column below, one figure per column.
for column in ('Median_salary', 'Unemployment_rate'):
    sorted_grads[column].plot.box()
    plt.show()