import sqlite3
import pandas as pd
import re
import numpy as np
import requests
import os
import shutil
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from matplotlib import rcParams
import matplotlib.dates as mdates
from functions import *
plt.rcParams.update({'axes.titlepad': 20, 'font.size': 12, 'axes.titlesize':20})
colors = [(0/255,107/255,164/255), (255/255, 128/255, 14/255), 'red', 'green', '#9E80BA', '#8EDB8E', '#58517A']
Ncolors = 10
color_map = plt.cm.Blues_r(np.linspace(0.2, 0.5, Ncolors))
#color_map = plt.cm.tab20c_r(np.linspace(0.2, 0.5, Ncolors))
def run_query(q):
with sqlite3.connect('./data/chinook.db') as conn:
return pd.read_sql(q, conn)
def run_command(c):
with sqlite3.connect('./data/chinook.db') as conn:
conn.isolation_level = None
conn.execute(c)
def show_tables():
query = """
SELECT name,
type
FROM sqlite_master
WHERE type IN ("table","view");"""
return run_query(query)
We will modify the original database so the first thing to do is to copy the original into a new file
try:
os.remove("./data/chinook.db")
print("file removed")
except:
print("file not present")
shutil.copy("./data/chinook_original.db", "./data/chinook.db")
print("copy created")
df = show_tables()
display(df)
Query to return each genre with the number of tracks sold (absolute numbers and percentages)
q = """
WITH USA_invoice_lines AS
(
SELECT * from invoice_line il
INNER JOIN invoice i on i.invoice_id = il.invoice_id
where i.billing_country = 'USA'
),
track_genre_quantity AS
(
SELECT il.*,
t.name as track_name,
g.name as genre
from USA_invoice_lines il
INNER JOIN track t on t.track_id = il.track_id
INNER JOIN genre g on g.genre_id = t.genre_id
)
select genre, count(*) as tracks_sold
from track_genre_quantity
group by 1
order by 2 desc
"""
df = run_query(q)
df.set_index('genre', inplace=True)
display(df[:3])
df.plot(kind='bar');
Best sells: Rock, then Alternative & Punk, then Blues, then POP, then Hip-Hop
q = """
WITH customer_totals AS
(
select c.customer_id,
c.support_rep_id,
c.first_name || " " || c.last_name as name,
SUM(il.quantity) as total_number,
SUM(i.total) as total
from customer c
INNER JOIN invoice i ON i.customer_id = c.customer_id
INNER JOIN invoice_line il ON il.invoice_id = i.invoice_id
group by c.customer_id
)
select e.first_name || " " || e.last_name as name,
SUM(ct.total) as total,
e.birthdate,
e.hire_date,
e.country
from customer_totals ct
INNER JOIN employee e ON e.employee_id = ct.support_rep_id
group by e.employee_id
order by 2 desc
"""
df = run_query(q)
display(df)
df.set_index('name', inplace=True)
df['total'].plot(kind='bar');
present_time = pd.Timestamp.now()
df['hire_date'] = pd.DatetimeIndex(df['hire_date'])
df['days_in_company'] = (present_time-df['hire_date']).dt.days
df.plot.scatter(x='days_in_company', y='total');
Count number of Costumers per country
q = """
CREATE VIEW IF NOT EXISTS country_counts AS
select c.customer_id,
CASE
WHEN count(*) > 1 THEN country
ELSE "Other"
END AS country,
count(*) as count
from customer c
group by c.country
order by 3 desc """
run_command(q)
q = """
CREATE VIEW IF NOT EXISTS country_counts_with_Other AS
select country,
CASE
WHEN count > 1 THEN count
ELSE SUM(count)
END AS count
from country_counts
group by country
order by 2 DESC """
run_command(q)
#just to put Other at the bottom of the list
q = """
CREATE VIEW IF NOT EXISTS final_country_counts AS
SELECT country,
count
from
(
select cc_other.*,
CASE
WHEN cc_other.country = "Other" THEN 1
ELSE 0
END as sorted
from country_counts_with_Other cc_other
)
ORDER BY sorted ASC """
run_command(q)
q = """ select * from final_country_counts"""
df = run_query(q)
display(df.iloc[[0,1,2,3,9],:])
Create a new table with costumer_id, original_country and country (other if it was other in previous query)
q = """
CREATE VIEW IF NOT EXISTS country_list_other AS
select c.customer_id,
c.country as original_country,
c.country
from customer c
INNER JOIN final_country_counts fcc ON fcc.country = c.country
UNION ALL
select c.customer_id,
c.country as original_country,
CASE
WHEN 1 THEN "Other"
END as country
from customer c
where c.country NOT IN (select country from final_country_counts)
"""
run_command(q)
q="select * from country_list_other"
df = run_query(q)
display(df.iloc[41:46,:])
Compute total sales per country
#Main query
q="""
CREATE VIEW IF NOT EXISTS sales as
select co.country,
fcc.count as N_clients,
SUM(i.total) as total,
SUM(i.total)/fcc.count as average_per_client,
SUM(i.total)/count(*) as average_per_order
from country_list_other co
LEFT JOIN invoice i ON i.customer_id = co.customer_id
LEFT JOIN final_country_counts fcc ON fcc.country = co.country
group by co.country
order by 2 DESC"""
run_command(q)
#Put Other in the end
q="""
CREATE VIEW IF NOT EXISTS total_sales as
select country,
N_clients,
total as total_sales,
average_per_client,
average_per_order
from
(
select *,
CASE
WHEN s.country = "Other" THEN 1
ELSE 0
END AS sort
from sales s
)
order by sort
"""
run_command(q)
q="""select * from total_sales"""
df = run_query(q)
display(df.iloc[[0,1,2,3,9],:])
Plot histograms for 'N_clients', 'total_sales', 'average_per_client' and 'average_per_order' for different countries
import matplotlib.pyplot as plt
import numpy as np
cats = ['N_clients','total_sales','average_per_client','average_per_order']
fig = plt.figure(figsize=(12,12))
bar_positions = np.arange(0,len(df),1.) + 1.
tick_positions = range(1,len(df)+1)
color_map = plt.cm.Blues
bar_colors = [color_map(i) for i in np.linspace(0.9, 0.3, len(df))]
for sp, element in enumerate(cats):
ax = fig.add_subplot(2,2,sp+1)
bar_heights = df[element].values
ax.bar(bar_positions,bar_heights,width=0.5,color=bar_colors)
ax.set_xticks(tick_positions)
ax.set_xticklabels(df['country'], rotation=90)
ax.set_xlabel('country')
ax.set_title(element, fontsize=14)
for key,spine in ax.spines.items():
spine.set_visible(False)
ax.tick_params(bottom=False, top=False, left=False, right=False)
plt.tight_layout()
q= """
WITH invoice_first_album as
(
select il.invoice_id,
t.album_id
from invoice_line il
INNER JOIN track t ON t.track_id = il.track_id
group by il.invoice_id
)
select AP as album_purchase,
Count(*) as Invoice_Number,
CAST(Count(*) AS float)/(select count(*) from invoice_first_album) as percentage
from
(
select *,
CASE WHEN
(
select t.track_id
from track t
where t.album_id = ia.album_id
EXCEPT
select il.track_id
from invoice_line il
where il.invoice_id = ia.invoice_id
) IS NULL
AND
(
select il.track_id
from invoice_line il
where il.invoice_id = ia.invoice_id
EXCEPT
select t.track_id
from track t
where t.album_id = ia.album_id
) IS NULL
then "yes"
ELSE "no"
END as AP
from invoice_first_album ia
)
group by AP
"""
df = run_query(q)
display(df)