import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import string
import myML_functions as myML_functions
# Import the lemmatizer from nltk
import gensim
from gensim import corpora
import pyLDAvis.gensim
from sklearn.metrics import classification_report, confusion_matrix, r2_score, roc_auc_score
Flag emails containing specific words as true fraud
# Load the cleaned e-mail data set (keep_default_na=False is forwarded to read_csv).
df = pd.read_csv('./data/Fraud/chapter_4/enron_emails_clean.csv', keep_default_na=False)
searchfor = ['enron stock', 'sell stock', 'stock bonus', 'sell enron stock']
# The phrases are joined with '|' so a single str.contains call matches any of them.
fraud_pattern = '|'.join(searchfor)
mentions_fraud_terms = df['clean_content'].str.contains(fraud_pattern)
# 1 where the cleaned content matched at least one phrase, 0 otherwise.
df['flag'] = np.where(mentions_fraud_terms == True, 1, 0)
# Number of e-mails per flag value.
count = df['flag'].value_counts()
Tokenize and clean text
# Shared normalisers used by the word cleaning function.
lemma = WordNetLemmatizer()
porter = PorterStemmer()
# Stopwords to exclude: the NLTK English list plus e-mail specific tokens.
stop = set(stopwords.words('english')) | {
    "to", "cc", "subject", "http", "from", "sent",
    "ect", "u", "fwd", "www", "com",
}
# Punctuation characters to exclude.
exclude = set(string.punctuation)
# Define word cleaning function
def clean(text, stop):
    """Filter and normalise the tokens of one e-mail.

    Relies on the module-level ``exclude`` set, ``lemma`` lemmatizer and
    ``porter`` stemmer.

    Args:
        text: Iterable of string tokens (the caller passes a lower-cased,
            whitespace-split e-mail body).
        stop: Collection of tokens to discard.

    Returns:
        A list with the surviving tokens, lemmatized and then stemmed, in
        their original order.
    """
    # A token is dropped when it is itself a member of `exclude`, is a
    # stopword, or consists only of digits. Membership in `exclude` is tested
    # on the whole token, so punctuation attached to a word is not stripped.
    return [
        porter.stem(lemma.lemmatize(token))
        for token in text
        if token not in exclude
        and token not in stop
        and not token.isdigit()
    ]
# Clean the emails in df: strip trailing whitespace, lower-case, split on
# whitespace, then run every token list through clean().
# text_clean is created here; the original loop appended to a list that was
# never initialised.
text_clean = [
    clean(text.rstrip().lower().split(), stop)
    for text in df['clean_content']
]
Topic Modelling with LDA
# Build the token <-> id mapping from the cleaned e-mails.
dictionary = corpora.Dictionary(text_clean)
# Prune the vocabulary (no_below=5, keep_n=50000; see gensim's filter_extremes
# documentation for the exact semantics and the remaining defaults).
dictionary.filter_extremes(no_below=5, keep_n=50000)
# Bag-of-words representation of every e-mail.
corpus = [dictionary.doc2bow(text) for text in text_clean]
# NOTE(review): no random_state is passed, so topic numbering presumably varies
# between runs -- confirm before relying on a fixed topic id downstream.
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=4, id2word=dictionary, passes=5)
topics = ldamodel.print_topics(num_words=5)
# The loop body was missing in the original; show each topic's top words.
for topic in topics:
    print(topic)
Flag fraud based on topic inspection
# Prepare the pyLDAvis data for the fitted model. sort_topics=False is passed
# explicitly; presumably this keeps the displayed topic order aligned with the
# model's own topic ids -- confirm against the pyLDAvis documentation.
lda_display = pyLDAvis.gensim.prepare(
    ldamodel,
    corpus,
    dictionary,
    sort_topics=False,
)
def get_topic_details(ldamodel, corpus):
    """Return the dominant topic and its score for every document.

    Args:
        ldamodel: Object for which ``ldamodel[corpus]`` yields, per document,
            an iterable of ``(topic_num, prop_topic)`` pairs.
        corpus: The documents to look up in ``ldamodel``.

    Returns:
        A float-valued DataFrame with columns ``'Dominant_Topic'`` and
        ``'% Score'``, one row per document that has at least one topic.
        Documents with no topic pairs are skipped, so the result can be
        shorter than ``corpus``. An empty corpus gives an empty frame that
        still carries both columns.
    """
    columns = ['Dominant_Topic', '% Score']
    dominant = []
    for row in ldamodel[corpus]:
        row = list(row)
        if not row:
            continue
        # max() returns the first pair with the highest proportion, which is
        # the same pair a stable descending sort would place first.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        dominant.append((topic_num, prop_topic))
    # Build the frame once instead of growing it row by row with
    # DataFrame.append, which was removed in pandas 2.0. dtype=float keeps the
    # topic id as a float (e.g. 3.0), as the row-wise append produced.
    return pd.DataFrame(dominant, columns=columns, dtype=float)
# Put each e-mail's cleaned tokens next to its dominant-topic details.
contents = pd.DataFrame({'Original text': text_clean})
dominant_topics = get_topic_details(ldamodel, corpus)
topic_details = pd.concat([dominant_topics, contents], axis=1)
# Create flag for text highest associated with topic 3
is_topic_three = topic_details['Dominant_Topic'] == 3.0
topic_details['flag'] = np.where(is_topic_three, 1, 0)
# Compare the keyword-based flag (treated as truth) with the topic-based flag.
y_true = df['flag']
y_predict = topic_details['flag']
# Each metric is computed and printed in turn.
r2 = r2_score(y_true, y_predict)
print(f"R2: {r2:0.3f}")
auc = roc_auc_score(y_true, y_predict)
print(f"Roc_Auc: {auc:0.3f}\n")
report = classification_report(y_true, y_predict)
print('Classifcation report:\n', report)
# Labels handed to the project's confusion-matrix plotting helper.
class_names = np.array(['Non-Fraud', 'Fraud'])
myML_functions.plot_confusion_matrix(y_true, y_predict, classes=class_names)