Project: Spark (USCensus)¶

Questions to address:¶

How does the number of people of different ages varied between 1980 and 2010 in the US?
How is the total number of people evolving with time?

Tools:¶

use sqlContext.read.json to read data and registerTempTable() to register them and create a local database
use SQl queries to explore the data and merge census from different years
df.toPandas() to convert data to pandas dataframe and do plots

load defaults¶

import numpy as np
import pandas as pd
import seaborn as sns
import re
import requests 

%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from matplotlib import rcParams
import matplotlib.dates as mdates
from datetime import datetime
from IPython.display import display, Math

from functions import *

plt.rcParams.update({'axes.titlepad': 20, 'font.size': 14, 'axes.titlesize':20})

colors = [(0/255,107/255,164/255), (255/255, 128/255, 14/255), 'red', 'green', '#9E80BA', '#8EDB8E', '#58517A']
Ncolors = 10
color_map = plt.cm.Blues_r(np.linspace(0.2, 0.5, Ncolors))
#color_map = plt.cm.tab20c_r(np.linspace(0.2, 0.5, Ncolors))


#specific to this project
import pyspark
from pyspark.sql import SQLContext

print("Defaults Loaded")

Defaults Loaded

#sc.stop()
sc = pyspark.SparkContext()
sqlContext = SQLContext(sc)

df = sqlContext.read.json("./data/census_2010.json")
df.registerTempTable('census2010')
df = sqlContext.read.json('./data/census_1980.json')
df.registerTempTable('census1980')
df = sqlContext.read.json('./data/census_1990.json')
df.registerTempTable('census1990')
df = sqlContext.read.json('./data/census_2000.json')
df.registerTempTable('census2000')

tables = sqlContext.tableNames()
print(tables)

['census1980', 'census1990', 'census2000', 'census2010']

print top 5 rows from the 2010 Census

query = 'select * from census2010'
df = sqlContext.sql(query)
df.show(5)

+---+-------+-------+-------+----+
|age|females|  males|  total|year|
+---+-------+-------+-------+----+
|  0|1994141|2085528|4079669|2010|
|  1|1997991|2087350|4085341|2010|
|  2|2000746|2088549|4089295|2010|
|  3|2002756|2089465|4092221|2010|
|  4|2004366|2090436|4094802|2010|
+---+-------+-------+-------+----+
only showing top 5 rows

print summary statistics for the number of males and females in the 2010 census

query = 'select males, females from census2010'
df = sqlContext.sql(query)
df.describe().show()

+-------+------------------+-----------------+
|summary|             males|          females|
+-------+------------------+-----------------+
|  count|               101|              101|
|   mean|1520095.3168316833|1571460.287128713|
| stddev|  818587.208016823|748671.0493484351|
|    min|              4612|            25673|
|    max|           2285990|          2331572|
+-------+------------------+-----------------+

print top 2 rows for the totals of different years

query = """select c1.age, 
                  c1.total as total_2010, 
                  c2.total  as total_2000,
                  c3.total  as total_1990,
                  c4.total  as total_1980
             from census2010 c1 
       inner join census2000 c2 on c1.age=c2.age
       inner join census1990 c3 on c1.age=c3.age
       inner join census1980 c4 on c1.age=c4.age
       """
df = sqlContext.sql(query)
df.show(2)

+---+----------+----------+----------+----------+
|age|total_2010|total_2000|total_1990|total_1980|
+---+----------+----------+----------+----------+
|  0|   4079669|   3733034|   3857376|   3438584|
|  1|   4085341|   3825896|   3854707|   3367035|
+---+----------+----------+----------+----------+
only showing top 2 rows

Question 1: How does the number of people of different ages varied between 1980 and 2010 in the US?¶

pandas_df = df.toPandas()
sns.set_style() 

plt_color=['blue', 'green', 'orange', 'red']

fig, ax = plt.subplots(figsize=(7,7))
ax.plot(pandas_df['age'], pandas_df['total_1980'], label='1980', color=plt_color[0])
ax.plot(pandas_df['age'], pandas_df['total_1990'], label='1990', color=plt_color[1])
ax.plot(pandas_df['age'], pandas_df['total_2000'], label='2000', color=plt_color[2])
ax.plot(pandas_df['age'], pandas_df['total_2010'], label='2010', color=plt_color[3])

ax.set_xlabel('Age'), ax.set_ylabel('Number of People')
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

mean_Age = sum(pandas_df['total_1980']*pandas_df['age'])/sum(pandas_df['total_1980'])
ax.axvline(mean_Age, alpha=0.3, color=plt_color[0])
mean_Age = sum(pandas_df['total_1990']*pandas_df['age'])/sum(pandas_df['total_1980'])
ax.axvline(mean_Age, alpha=0.3, color=plt_color[1])
mean_Age = sum(pandas_df['total_2000']*pandas_df['age'])/sum(pandas_df['total_1980'])
ax.axvline(mean_Age, alpha=0.3, color=plt_color[2])
mean_Age = sum(pandas_df['total_2010']*pandas_df['age'])/sum(pandas_df['total_1980'])
ax.axvline(mean_Age, alpha=0.3, color=plt_color[3])

plt.legend()
plt.show()

overall the population is getting older, with two peaks of people born in 1960 and 1990

Question 2 - How is the total number of people evolving with time?¶

query="""select sum(c1.total) as total_2010, 
                sum(c2.total) as total_2000, 
                sum(c3.total) as total_1990,
                sum(c4.total) as total_1980
           from census2010 c1
     inner join census2000 c2 on c2.age=c1.age
     inner join census1990 c3 on c3.age=c1.age
     inner join census1980 c4 on c4.age=c1.age"""
df = sqlContext.sql(query)
df.show()

+----------+----------+----------+----------+
|total_2010|total_2000|total_1990|total_1980|
+----------+----------+----------+----------+
| 312247116| 284594395| 254506647| 230176361|
+----------+----------+----------+----------+

pandas_df = df.toPandas()
sns.set_style() 

ylim=[0,400]

year = np.array([1980, 1990, 2000, 2010])
pop = np.array([pandas_df['total_1980'], pandas_df['total_1990'], 
                pandas_df['total_2000'], pandas_df['total_2010']])

fig, ax = plt.subplots(figsize=(7,7))
ax.plot(year, pop/1e6, label='pop evo')

ax.set_xlabel('Year'), ax.set_ylabel('Millions of People')
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

ax.set_ylim(ylim)

plt.legend()
plt.show()

the US population is still growing significantly