import sqlite3
import pandas as pd
import re
import numpy as np
import requests
import os
import shutil
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from matplotlib import rcParams
import matplotlib.dates as mdates
from functions import *
plt.rcParams.update({'axes.titlepad': 20, 'font.size': 12, 'axes.titlesize':20})
colors = [(0/255,107/255,164/255), (255/255, 128/255, 14/255), 'red', 'green', '#9E80BA', '#8EDB8E', '#58517A']
Ncolors = 10
color_map = plt.cm.Blues_r(np.linspace(0.2, 0.5, Ncolors))
#color_map = plt.cm.tab20c_r(np.linspace(0.2, 0.5, Ncolors))
We will modify the original database so the first thing to do is to copy the original into a new file
try:
os.remove("./data/factbook.db")
print("file removed")
except:
print("file not present")
shutil.copy("./data/factbook_original.db", "./data/factbook.db")
print("copy created")
import sqlite3
import pandas as pd
from IPython.display import display
conn = sqlite3.connect('./data/factbook.db')
df = pd.read_csv('./data/cities.csv')
df.columns=["id", "name", "population", "capital", "facts_id"]
df.to_sql('cities', conn, if_exists='replace', index=False)
query = 'select * from cities'
df = pd.read_sql_query(query, conn)
display(df[:3])
The INNER JOIN only includes rows where there is a match of IDs between the two tables
query = '''select * from facts
INNER JOIN cities ON facts.id = cities.facts_id
Limit 10'''
df = pd.read_sql_query(query, conn)
display(df.iloc[:4,[0,5,8,9,10,13,14,15,16,17]])
query = """select f.name as country, c.name as capital_city from facts f
INNER JOIN cities c on f.id = c.facts_id
where c.capital=1"""
df = pd.read_sql_query(query, conn)
display(df.iloc[:3])
query = """select f.name as country, f.population from facts f
LEFT JOIN cities c on f.id = c.facts_id
where c.name IS NULL"""
df = pd.read_sql_query(query, conn)
display(df[:9])
query = """select c.name as capital_city,
f.name as country,
c.population as population
from facts f
Inner Join (
select * from cities
where capital = 1 and population>5000000
) c on f.id = c.facts_id
order by 3 desc"""
df = pd.read_sql_query(query, conn)
display(df[:9])
Get coordinates from './data/worldcities.csv' and combine dataframes
world_cities = pd.read_csv('./data/worldcities.csv', encoding='Latin-1')
cols = ['city','lat','lng','country','population']
cities = world_cities[cols]
capitals = cities.loc[world_cities['capital']=='primary']
#strip the spaces to be able to merge dataframes
df.loc[:, 'capital_city'] = df['capital_city'].apply(lambda x: x.strip())
combined_df = df.merge(capitals, left_on='capital_city', right_on='city', how='left')
combined_df['country'] = combined_df['country_x']
combined_df['population'] = combined_df['population_x']
combined_df.drop(['country_x', 'country_y', 'population_x', 'population_y'], axis=1, inplace=True)
#display(combined_df[:])
from mpl_toolkits.basemap import Basemap
fig, ax = plt.subplots(figsize=(10,10))
m = Basemap(projection='merc', llcrnrlat=-60, urcrnrlat=80, llcrnrlon=-180, urcrnrlon=180, resolution='l')
m.drawcoastlines()
m.fillcontinents(color='#F0E0BB', lake_color='#BBDDF0')
m.drawmapboundary(fill_color='#BBDDF0')
m.drawcountries(linewidth=0.5)
longitudes = combined_df['lng'].tolist()
latitudes = combined_df['lat'].tolist()
population = combined_df['population']
re_scaled_pop = convert_range_01(population)
m.scatter(longitudes, latitudes, s=(re_scaled_pop+0.1)*200, zorder=2, latlon=True, c=population, cmap='summer')
plt.show()
query = """select f.name as country,
c.population as urban_pop,
f.population as total_pop,
CAST(c.population as float)/CAST(f.population as float) as urban_pct
from facts f
Inner join (
select facts_id,
sum(population) as population
from cities
group by facts_id
) c on f.id = c.facts_id
where urban_pct > 0.0 and urban_pct<1.0
ORDER by 4 DESC"""
df = pd.read_sql_query(query, conn)
display(df[:2])
fig = plt.figure(figsize=(7,7))
ax = plt.subplot()
ax.hist(df['urban_pct'])
for key, spine in ax.spines.items():
spine.set_visible(False)
ax.tick_params(left=False, bottom=False)
ax.set_xlabel('urban percentage'), ax.set_ylabel('Number of Countries')
plt.show()