import numpy as np
import pandas as pd
import seaborn as sns
import re
import requests
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from matplotlib import rcParams
import matplotlib.dates as mdates
from datetime import datetime
from IPython.display import display, Math
from functions import *
plt.rcParams.update({'axes.titlepad': 20, 'font.size': 14, 'axes.titlesize':20})
colors = [(0/255,107/255,164/255), (255/255, 128/255, 14/255), 'red', 'green', '#9E80BA', '#8EDB8E', '#58517A']
Ncolors = 10
color_map = plt.cm.Blues_r(np.linspace(0.2, 0.5, Ncolors))
#color_map = plt.cm.tab20c_r(np.linspace(0.2, 0.5, Ncolors))
#specific to this project
import pyspark
from pyspark.sql import SQLContext
print("Defaults Loaded")
#sc.stop()
sc = pyspark.SparkContext()
sqlContext = SQLContext(sc)
df = sqlContext.read.json("./data/census_2010.json")
df.registerTempTable('census2010')
df = sqlContext.read.json('./data/census_1980.json')
df.registerTempTable('census1980')
df = sqlContext.read.json('./data/census_1990.json')
df.registerTempTable('census1990')
df = sqlContext.read.json('./data/census_2000.json')
df.registerTempTable('census2000')
tables = sqlContext.tableNames()
print(tables)
print top 5 rows from the 2010 Census
query = 'select * from census2010'
df = sqlContext.sql(query)
df.show(5)
print summary statistics for the number of males and females in the 2010 census
query = 'select males, females from census2010'
df = sqlContext.sql(query)
df.describe().show()
print top 2 rows for the totals of different years
query = """select c1.age,
c1.total as total_2010,
c2.total as total_2000,
c3.total as total_1990,
c4.total as total_1980
from census2010 c1
inner join census2000 c2 on c1.age=c2.age
inner join census1990 c3 on c1.age=c3.age
inner join census1980 c4 on c1.age=c4.age
"""
df = sqlContext.sql(query)
df.show(2)
pandas_df = df.toPandas()
sns.set_style()
plt_color=['blue', 'green', 'orange', 'red']
fig, ax = plt.subplots(figsize=(7,7))
ax.plot(pandas_df['age'], pandas_df['total_1980'], label='1980', color=plt_color[0])
ax.plot(pandas_df['age'], pandas_df['total_1990'], label='1990', color=plt_color[1])
ax.plot(pandas_df['age'], pandas_df['total_2000'], label='2000', color=plt_color[2])
ax.plot(pandas_df['age'], pandas_df['total_2010'], label='2010', color=plt_color[3])
ax.set_xlabel('Age'), ax.set_ylabel('Number of People')
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
mean_Age = sum(pandas_df['total_1980']*pandas_df['age'])/sum(pandas_df['total_1980'])
ax.axvline(mean_Age, alpha=0.3, color=plt_color[0])
mean_Age = sum(pandas_df['total_1990']*pandas_df['age'])/sum(pandas_df['total_1980'])
ax.axvline(mean_Age, alpha=0.3, color=plt_color[1])
mean_Age = sum(pandas_df['total_2000']*pandas_df['age'])/sum(pandas_df['total_1980'])
ax.axvline(mean_Age, alpha=0.3, color=plt_color[2])
mean_Age = sum(pandas_df['total_2010']*pandas_df['age'])/sum(pandas_df['total_1980'])
ax.axvline(mean_Age, alpha=0.3, color=plt_color[3])
plt.legend()
plt.show()
overall the population is getting older, with two peaks of people born in 1960 and 1990
query="""select sum(c1.total) as total_2010,
sum(c2.total) as total_2000,
sum(c3.total) as total_1990,
sum(c4.total) as total_1980
from census2010 c1
inner join census2000 c2 on c2.age=c1.age
inner join census1990 c3 on c3.age=c1.age
inner join census1980 c4 on c4.age=c1.age"""
df = sqlContext.sql(query)
df.show()
pandas_df = df.toPandas()
sns.set_style()
ylim=[0,400]
year = np.array([1980, 1990, 2000, 2010])
pop = np.array([pandas_df['total_1980'], pandas_df['total_1990'],
pandas_df['total_2000'], pandas_df['total_2010']])
fig, ax = plt.subplots(figsize=(7,7))
ax.plot(year, pop/1e6, label='pop evo')
ax.set_xlabel('Year'), ax.set_ylabel('Millions of People')
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.set_ylim(ylim)
plt.legend()
plt.show()
the US population is still growing significantly