import numpy as np
import pandas as pd
import seaborn as sns
import re
import requests
import math
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from matplotlib import rcParams
import matplotlib.dates as mdates
from datetime import datetime
from IPython.display import display, Math
from functions import *
# Plot defaults. Matplotlib 3.6 renamed its bundled seaborn style sheet to
# 'seaborn-v0_8' and later removed the old 'seaborn' alias, so prefer the new
# name when this Matplotlib ships it and fall back to the old one otherwise.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
plt.rcParams.update({'axes.titlepad': 20, 'font.size': 12, 'axes.titlesize': 20})
# Shared palette: two RGB tuples followed by named and hex colors.
colors = [(0/255, 107/255, 164/255), (255/255, 128/255, 14/255), 'red', 'green', '#9E80BA', '#8EDB8E', '#58517A']
Ncolors = 10
# Ncolors evenly spaced samples from the 0.2-0.5 range of the reversed Blues colormap.
color_map = plt.cm.Blues_r(np.linspace(0.2, 0.5, Ncolors))
#color_map = plt.cm.tab20c_r(np.linspace(0.2, 0.5, Ncolors))
#specific to this project
from sklearn.cluster import KMeans
print("Defaults Loaded")
# Load the voting data from 114_congress.csv, preview the first three rows
# and count the rows per party.
votes = pd.read_csv('./data/114_congress.csv')
display(votes.head(3))
print(votes['party'].value_counts())

# Fit a two-cluster k-means model on every column from the fourth onward.
# fit_transform returns, for each senator, the Euclidean distance to the
# first and to the second cluster center.
kmeans_model = KMeans(n_clusters=2, random_state=1)
senator_distances = kmeans_model.fit_transform(votes.iloc[:, 3:])

# Compare the cluster assigned to each row with the party recorded in the data.
labels = kmeans_model.labels_
display(pd.crosstab(labels, votes['party']))
# Some Democrats voted alongside the Republicans in some votes; identify these outliers:
# Democrats that k-means placed in cluster 1.
# NOTE(review): cluster 1 is presumed to be the Republican-dominated cluster;
# k-means label numbering is arbitrary, so confirm against the crosstab of
# labels vs. party before relying on this.
democratic_outliers = votes[(labels == 1) & (votes['party'] == 'D')]
sel = votes['party'] == 'D'
print("Average vote from Democrats in first votes:")
# The first nine columns include non-numeric ones (e.g. 'party').
# numeric_only=True averages just the numeric columns; without it
# pandas >= 2.0 raises TypeError instead of silently skipping the strings.
print(votes[sel].iloc[:, :9].mean(numeric_only=True))
print(democratic_outliers.iloc[:, :9])
# Plot each senator by distance to the first vs. second cluster center,
# colored by assigned cluster.
plt.scatter(senator_distances[:, 0], senator_distances[:, 1], c=labels)
plt.show()
# Find the rows that are furthest away from the other cluster:
# Score each row by the sum of its cubed distances to the cluster centers;
# cubing lets the larger distance dominate the score.
extremism = (senator_distances ** 3).sum(axis=1)
votes['extremism'] = extremism
votes.sort_values(by='extremism', ascending=False, inplace=True)
# Ten highest scores: the first nine columns plus the new last column.
print(votes.iloc[:10, [*range(9), -1]])
# Load the NBA data from nba_2013.csv and preview the first three rows,
# limited to the first twenty columns.
nba = pd.read_csv("./data/nba_2013.csv")
display(nba.head(3).iloc[:, :20])
# Point guards are crucial because they create scoring opportunities. Let's look at what types of point guards exist and group similar point guards together.
# Keep only the point guards; copy so the derived columns are added to an
# independent frame rather than to a slice of `nba`.
point_guards = nba[nba['pos'] == 'PG'].copy()
# Points per game.
point_guards['ppg'] = point_guards['pts'] / point_guards['g']
# Drop players with no turnovers so the ratio below never divides by zero.
# The filtered rows are copied for the same reason as above: assigning a new
# column to the un-copied selection triggers pandas' SettingWithCopyWarning.
point_guards = point_guards.loc[point_guards['tov'] != 0].copy()
# Assist-to-turnover ratio.
point_guards['atr'] = point_guards['ast'] / point_guards['tov']
plt.scatter(point_guards['ppg'], point_guards['atr'], c='y')
plt.title("Point Guards")
plt.xlabel('Points Per Game', fontsize=13)
plt.ylabel('Assist Turnover Ratio', fontsize=13)
plt.show()
# The plot suggests maybe 5 clusters (definitely at least 3).
# Number of clusters used for the point-guard k-means model and its plot.
num_clusters = 5
# Visualizing clusters
def visualize_clusters(df, num_clusters):
    """Scatter-plot 'ppg' against 'atr', drawing each cluster in its own color.

    Args:
        df: DataFrame with 'cluster', 'ppg' and 'atr' columns.
        num_clusters: Number of cluster labels to draw; labels
            0 .. num_clusters - 1 are looked up in df['cluster'].
    """
    # Local palette; named differently from the module-level `colors` list
    # so it no longer shadows it.
    palette = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
    for n in range(num_clusters):
        clustered_df = df[df['cluster'] == n]
        # (n - 1) keeps the existing color assignment (cluster 0 -> 'k',
        # 1 -> 'b', ...); the modulo wraps around the palette so nine or
        # more clusters reuse colors instead of raising IndexError.
        plt.scatter(clustered_df['ppg'], clustered_df['atr'],
                    c=palette[(n - 1) % len(palette)])
    # Axis labels are the same for every cluster, so set them once.
    plt.xlabel('Points Per Game', fontsize=13)
    plt.ylabel('Assist Turnover Ratio', fontsize=13)
    plt.show()
# Cluster the point guards on the two plotted features. random_state is
# pinned (as for the senators model) so the centroid initialisation, and
# with it the cluster numbering and plot colors, is the same on every run.
kmeans = KMeans(n_clusters=num_clusters, random_state=1)
kmeans.fit(point_guards[['ppg', 'atr']])
# Store the cluster assigned to each player, then plot the clusters.
point_guards['cluster'] = kmeans.labels_
visualize_clusters(point_guards, num_clusters)