import numpy as np
import pandas as pd
import seaborn as sns
import re
import requests
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from matplotlib import rcParams
import matplotlib.dates as mdates
from datetime import datetime
from IPython.display import display, Math
from functions import *
plt.rcParams.update({'axes.titlepad': 20, 'font.size': 12, 'axes.titlesize':20})
colors = [(0/255,107/255,164/255), (255/255, 128/255, 14/255), 'red', 'green', '#9E80BA', '#8EDB8E', '#58517A']
Ncolors = 10
color_map = plt.cm.Blues_r(np.linspace(0.2, 0.5, Ncolors))
#color_map = plt.cm.tab20c_r(np.linspace(0.2, 0.5, Ncolors))
#specific to this project
import csv
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
print("Defaults Loaded")
# Read in the training and test data
with open("./data/text_train.csv", 'r') as file:
    reviews = list(csv.reader(file))
with open("./data/text_test.csv", 'r') as file:
    test = list(csv.reader(file))
# Generate word counts from the text using a vectorizer; other vectorizers are
# available and each accepts many options. Here stop_words='english' drops common
# English words and max_df=.05 drops words appearing in more than 5% of documents.
vectorizer = CountVectorizer(stop_words='english', max_df=.05)
train_features = vectorizer.fit_transform([r[0] for r in reviews])
test_features = vectorizer.transform([r[0] for r in test])
test_target = [int(r[1]) for r in test]
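# As noted above, other vectorizers are available. A minimal sketch (an aside,
# not used below) with TfidfVectorizer, which reweights raw counts by inverse
# document frequency:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=.05)
tfidf_train = tfidf_vectorizer.fit_transform([r[0] for r in reviews])
tfidf_test = tfidf_vectorizer.transform([r[0] for r in test])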
# Fit a Naive Bayes model to the training data
# train the model using the word counts we computed and the existing classifications in the training set
nb = MultinomialNB()
nb.fit(train_features, [int(r[1]) for r in reviews])
# Now we can use the model to predict classifications for our test features
predictions = nb.predict(test_features)
# Compute the accuracy (note the y_true, y_pred argument order)
accuracy = metrics.accuracy_score(test_target, predictions)
print("Accuracy: {:0.3f}".format(accuracy))
# Generate the ROC curve using scikit-learn
fpr, tpr, thresholds = metrics.roc_curve(test_target, predictions, pos_label=1)
# Measure the area under the curve
# The closer to 1 it is, the "better" the predictions
print(f"AUC of the predictions: {metrics.auc(fpr, tpr):0.3f}")
submissions = pd.read_csv("./data/sel_hn_stories.csv")
submissions.columns = ["submission_time", "upvotes", "url", "headline"]
submissions = submissions.dropna()
display(submissions[:5])
# Create a list of cleaned individual words from each headline
tokenized_headlines = []
headlines = submissions['headline']
for element in headlines:
    tokenized_headlines.append(element.split())
punctuation = [",", ":", ";", ".", "'", '"', "’", "?", "/", "-", "+", "&", "(", ")"]
clean_tokenized = []
for element in tokenized_headlines:
    clean_token = []
    for token in element:
        for punc in punctuation:
            token = token.replace(punc, "")
        clean_token.append(token.lower())
    clean_tokenized.append(clean_token)
print(clean_tokenized[:3])
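# Aside: the nested replace() loop above can be written more compactly with the
# re module (imported earlier), building one character class from the same
# punctuation list; stored in a separate variable to leave the original as-is:
punc_re = re.compile("[" + re.escape("".join(punctuation)) + "]")
clean_tokenized_re = [[punc_re.sub("", token).lower() for token in element]
                      for element in tokenized_headlines]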
# Keep tokens that appear two or more times and create a DataFrame for the
# bag of words, initialized with zeros
unique_tokens = []
single_tokens = []
for element in clean_tokenized:
    for item in element:
        # tokens that occur only once don't add much signal
        if item not in single_tokens:
            single_tokens.append(item)
        # only add a token to unique_tokens when it appears a second time
        elif item not in unique_tokens:
            unique_tokens.append(item)
counts = pd.DataFrame(0, index=np.arange(len(clean_tokenized)), columns=unique_tokens)
display(counts.iloc[:3,:9])
# Create the bag-of-words model by filling in the counts
for idx, element in enumerate(clean_tokenized):
    for item in element:
        if item in unique_tokens:
            # .loc avoids chained indexing (counts.iloc[idx][item]),
            # which may not write through reliably
            counts.loc[idx, item] += 1
# Remove words that appear fewer than 5 times or more than 100 times in total
word_counts = counts.sum(axis=0)
counts = counts.loc[:,(word_counts>=5) & (word_counts<=100)]
display(counts.iloc[:3,:9])
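# Aside: CountVectorizer can build a similar bag of words in a few lines. Note
# the caveat: its min_df/max_df filter on *document* frequency, whereas the
# filter above uses total occurrence counts, so the vocabulary may differ:
cv = CountVectorizer(min_df=5, max_df=100)
headline_counts = cv.fit_transform(submissions["headline"])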
# Split into train and test sets, then fit, predict, and compute the error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
X_train, X_test, y_train, y_test = train_test_split(counts, submissions["upvotes"], test_size=0.2, random_state=1)
clf = LinearRegression()
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
mse = ((y_test-predictions)**2).sum()/len(y_test)
print("RMSE LinearRegression: {:0.3f}".format(np.sqrt(mse)))
clf = RandomForestRegressor(n_estimators=150, random_state=1)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
mse = ((y_test-predictions)**2).sum()/len(y_test)
print("RMSE RandomForest: {:0.3f}".format(np.sqrt(mse)))