import pandas as pd
import re
import numpy as np
import requests
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from matplotlib import rcParams
import matplotlib.dates as mdates
from functions import *
# Global matplotlib defaults: extra padding above titles and larger fonts.
plt.rcParams['axes.titlepad'] = 20
plt.rcParams['font.size'] = 12
plt.rcParams['axes.titlesize'] = 20

# Palette: two RGB triples scaled to the 0-1 range, then named / hex colors.
colors = [
    tuple(channel / 255 for channel in (0, 107, 164)),
    tuple(channel / 255 for channel in (255, 128, 14)),
    'red',
    'green',
    '#9E80BA',
    '#8EDB8E',
    '#58517A',
]

# Sample the reversed Blues colormap at Ncolors evenly spaced positions
# between 0.2 and 0.5.
# Alternative palette: plt.cm.tab20c_r(np.linspace(0.2, 0.5, Ncolors))
Ncolors = 10
shade_positions = np.linspace(0.2, 0.5, Ncolors)
color_map = plt.cm.Blues_r(shade_positions)

# Load the survey and preview the first 3 rows and first 8 columns.
df = pd.read_csv('./data/2017-fCC-New-Coders-Survey-Data.csv')
display(df.iloc[:3, :8])
Do we have new developers interested in web development?
# Tabulate how often each distinct JobRoleInterest answer occurs, as a
# two-column frame ('Job Title', 'Frequency') with a fresh 0..n-1 index.
df1 = (
    df['JobRoleInterest']
    .value_counts()
    .rename_axis('Job Title')
    .reset_index(name='Frequency')
)
display(df1.head(9))
Are people interested in only one subject, or can they be interested in more than one?
# Compare the summed answer frequencies with the number of rows in the column.
total_answers = df1['Frequency'].sum()
total_people = len(df['JobRoleInterest'])
print('Total answers=%d, Total people=%d' % (total_answers, total_people))
How many people are interested in at least one of the two subjects we teach (web and mobile development)?
# Share of respondents whose job-role answer mentions web or mobile
# development.  The denominator is every row of the JobRoleInterest column
# as it stands when this code runs.
total_rows = len(df['JobRoleInterest'])

# Case-sensitive substring match on the answer text; summing the matching
# frequencies replaces the row-by-row loop.
mentions_web = df1['Job Title'].str.contains('Web', regex=False, na=False)
number_of_web = df1.loc[mentions_web, 'Frequency'].sum()
print("People interested in web: %0.1f%%" % (100.*int(number_of_web)/total_rows))

# The mobile tally gets its own name instead of reusing number_of_web.
mentions_mobile = df1['Job Title'].str.contains('Mobile', regex=False, na=False)
number_of_mobile = df1.loc[mentions_mobile, 'Frequency'].sum()
print("People interested in mobile: %0.1f%%" % (100.*int(number_of_mobile)/total_rows))
# Discard respondents who gave no job-role answer (mutates df in place).
df.dropna(subset=['JobRoleInterest'], inplace=True)
print('Total people=%d' % len(df['JobRoleInterest']))

# Frequency table of respondents per country of residence.
df1 = (
    df['CountryLive']
    .value_counts()
    .rename_axis('CountryLive')
    .reset_index(name='Frequency')
)
print(df1.head(5))

# Restrict the analysis to four countries, working on an independent copy.
country_list = ['United States of America', 'India', 'United Kingdom', 'Canada']
lives_in_selected_country = df['CountryLive'].isin(country_list)
final_df = df[lives_in_selected_country].copy()
display(final_df[['Age', 'CountryLive']].head(3))
# Compute the money spent per month of programming.
# Respondents reporting 0 months are counted as 1 month so the division below
# never divides by zero.  The result is assigned back to the column: calling
# replace(..., inplace=True) on a column selection is chained in-place
# mutation, which does not update final_df under pandas Copy-on-Write.
final_df['MonthsProgramming'] = final_df['MonthsProgramming'].replace(0, 1)
final_df['MoneyPerMonth'] = final_df['MoneyForLearning'] / final_df['MonthsProgramming']

# Keep only rows that have a job-role answer, a computable monthly spend and
# a country of residence.
final_df = final_df.dropna(
    subset=['JobRoleInterest', 'MoneyPerMonth', 'CountryLive']
).copy()
print('Number of students willing to pay: %d' % len(final_df))
# Group by country and print the mean money spent per month.
print(final_df.groupby(['CountryLive'])['MoneyPerMonth'].mean())

# Fix the box order explicitly so the short tick labels always line up with
# the right box; without `order`, seaborn draws string categories in their
# order of appearance in the data, which the hard-coded labels silently
# depended on.
country_labels = {
    'United States of America': 'US',
    'United Kingdom': 'UK',
    'India': 'India',
    'Canada': 'Canada',
}
sns.boxplot(y='MoneyPerMonth', x='CountryLive', data=final_df,
            order=list(country_labels))
plt.title('Money Spent Per Month Per Country\n(Distributions)', fontsize=16)
plt.ylabel('Money per month (US dollars)')
plt.xlabel('Country')
# Short labels avoid tick-label overlap.
plt.xticks(range(len(country_labels)), list(country_labels.values()))
plt.show()
# Keep only those participants who spend less than 20000 per month.
final_df = final_df[final_df['MoneyPerMonth'] < 20000]
print(final_df.groupby(['CountryLive'])['MoneyPerMonth'].mean())

# Fix the box order explicitly so the short tick labels always line up with
# the right box; without `order`, seaborn draws string categories in their
# order of appearance in the data, which can change after filtering.
country_labels = {
    'United States of America': 'US',
    'United Kingdom': 'UK',
    'India': 'India',
    'Canada': 'Canada',
}
sns.boxplot(y='MoneyPerMonth', x='CountryLive', data=final_df,
            order=list(country_labels))
plt.title('Money Spent Per Month Per Country\n(Distributions)', fontsize=16)
plt.ylabel('Money per month (US dollars)')
plt.xlabel('Country')
# Short labels avoid tick-label overlap.
plt.xticks(range(len(country_labels)), list(country_labels.values()))
plt.show()
Remove India's outliers: MoneyPerMonth >= 2500
# Respondents in India spending 2500 or more per month are treated as outliers.
lives_in_india = final_df['CountryLive'] == 'India'
spends_2500_or_more = final_df['MoneyPerMonth'] >= 2500
india_outliers = final_df.loc[lives_in_india & spends_2500_or_more]
# Remove them by row label.
final_df = final_df.drop(index=india_outliers.index)
Remove Canada's outliers: MoneyPerMonth > 4500
# Respondents in Canada spending more than 4500 per month are treated as outliers.
lives_in_canada = final_df['CountryLive'] == 'Canada'
spends_over_4500 = final_df['MoneyPerMonth'] > 4500
canada_outliers = final_df.loc[lives_in_canada & spends_over_4500]
# Remove them by row label.
final_df = final_df.drop(index=canada_outliers.index)
Remove the US outliers (MoneyPerMonth >= 6000): respondents who did not attend a bootcamp, and respondents who had been programming for 3 months or less (one of buyers)
# US respondents spending 6000 or more per month are the candidate outliers.
lives_in_us = final_df['CountryLive'] == 'United States of America'
spends_6000_or_more = final_df['MoneyPerMonth'] >= 6000
us_outliers = final_df.loc[lives_in_us & spends_6000_or_more]

# First removal: candidates whose AttendedBootcamp value is 0.
no_bootcamp = us_outliers.loc[us_outliers['AttendedBootcamp'] == 0]
final_df = final_df.drop(index=no_bootcamp.index)

# Second removal: of the candidates still present, those with 3 or fewer
# months of programming.
still_present = us_outliers.drop(index=no_bootcamp.index)
less_than_3_months = still_present.loc[still_present['MonthsProgramming'] <= 3]
final_df = final_df.drop(index=less_than_3_months.index)
# Mean monthly spend per country, and each country's share of the remaining
# respondents expressed as a percentage.
money_spent = final_df.groupby('CountryLive')['MoneyPerMonth'].mean()
population = final_df['CountryLive'].value_counts(normalize=True).mul(100)

# Print both series, then their element-wise product (aligned on country).
for summary in (money_spent, population, population.mul(money_spent)):
    print(summary)