import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import string
import myML_functions  # local helper module providing plot_confusion_matrix
import gensim
from gensim import corpora
import pyLDAvis.gensim
from sklearn.metrics import classification_report, confusion_matrix, r2_score, roc_auc_score
Flag emails containing specific words as true fraud
df = pd.read_csv('./data/Fraud/chapter_4/enron_emails_clean.csv', keep_default_na=False)
searchfor = ['enron stock', 'sell stock', 'stock bonus', 'sell enron stock']
# '|'.join(searchfor) builds a regex alternation that matches any of the terms
df['flag'] = np.where(df['clean_content'].str.contains('|'.join(searchfor)), 1, 0)
count = df['flag'].value_counts()
print(count)
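As a quick check on the matching logic, the joined pattern flags any row containing at least one of the search terms. A minimal sketch on two made-up strings (not from the data):

toy = pd.Series(['please sell enron stock today', 'lunch meeting at noon'])
print(toy.str.contains('|'.join(searchfor)))
# 0     True   <- contains 'enron stock'
# 1    False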
Tokenize and clean text
lemma = WordNetLemmatizer()
porter = PorterStemmer()
# Define stopwords to exclude
stop = set(stopwords.words('english'))
stop.update(("to","cc","subject","http","from","sent", "ect", "u", "fwd", "www", "com"))
# Define punctuations to exclude
exclude = set(string.punctuation)
# Define word cleaning function
def clean(text, stop):
    # 1 - Remove punctuation tokens (text arrives already tokenized)
    punc_free = [i for i in text if i not in exclude]
    # 2 - Remove all stopwords and digits
    stop_free = [i for i in punc_free if (i not in stop) and (not i.isdigit())]
    # 3 - Lemmatize words
    normalized = [lemma.lemmatize(i) for i in stop_free]
    # 4 - Stem words
    cleaned_text = [porter.stem(token) for token in normalized]
    return cleaned_text
# Clean the emails in df and collect the results
text_clean = []
for text in df['clean_content']:
    text = text.rstrip().lower().split()
    text_clean.append(clean(text, stop))
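A minimal sanity check of the cleaning pipeline on an invented sentence (both the input and the expected output are illustrative, not from the dataset):

sample = "they sold the enron stocks on friday !".rstrip().lower().split()
print(clean(sample, stop))
# e.g. ['sold', 'enron', 'stock', 'friday']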
Topic Modelling with LDA
# Build the gensim dictionary, drop tokens appearing in fewer than 5 documents,
# and encode each cleaned email as a bag-of-words vector
dictionary = corpora.Dictionary(text_clean)
dictionary.filter_extremes(no_below=5, keep_n=50000)
corpus = [dictionary.doc2bow(text) for text in text_clean]
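Each bag-of-words entry is a list of (token_id, count) pairs; a quick probe with hand-picked tokens (the ids shown are illustrative, and tokens removed by filter_extremes are silently dropped):

print(dictionary.doc2bow(['stock', 'stock', 'sell']))
# e.g. [(11, 1), (42, 2)] — ids depend on the fitted dictionary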
# Fit a 4-topic LDA model and print the top 5 words per topic
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=4, id2word=dictionary, passes=5)
topics = ldamodel.print_topics(num_words=5)
for topic in topics:
print(topic)
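The choice of num_topics=4 is a judgment call; one way to sanity-check it is gensim's CoherenceModel. A sketch assuming the variables defined above and the common 'c_v' measure:

from gensim.models import CoherenceModel

coherence = CoherenceModel(model=ldamodel, texts=text_clean,
                           dictionary=dictionary, coherence='c_v')
print(f"Coherence (c_v): {coherence.get_coherence():0.3f}")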
Flag fraud based on topic inspection
lda_display = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)
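Depending on the pyLDAvis version, the gensim bridge may live under pyLDAvis.gensim_models rather than pyLDAvis.gensim, and inline rendering in Jupyter needs to be switched on once. A hedged sketch (the output filename is arbitrary):

import pyLDAvis
pyLDAvis.enable_notebook()                          # render inline in the notebook
pyLDAvis.save_html(lda_display, 'lda_enron.html')   # or persist a standalone page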
def get_topic_details(ldamodel, corpus):
    # For each document, keep the topic with the highest probability
    rows = []
    for row in ldamodel[corpus]:
        row = sorted(row, key=lambda x: x[1], reverse=True)
        topic_num, prop_topic = row[0]  # dominant topic and its score
        rows.append([topic_num, prop_topic])
    return pd.DataFrame(rows, columns=['Dominant_Topic', '% Score'])
contents = pd.DataFrame({'Original text': text_clean})
topic_details = pd.concat([get_topic_details(ldamodel, corpus), contents], axis=1)
# Flag documents whose dominant topic is topic 3
topic_details['flag'] = np.where((topic_details['Dominant_Topic'] == 3.0), 1, 0)
print(topic_details.head())
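The flag above fires on any document whose dominant topic is 3, no matter how weak the dominance. A stricter variant could also require the topic probability to clear a cutoff; the 0.5 threshold here is an arbitrary illustration, not from the source:

# Hypothetical stricter flag: dominant topic 3 AND probability above 0.5
topic_details['flag_strict'] = np.where(
    (topic_details['Dominant_Topic'] == 3.0) & (topic_details['% Score'] > 0.5), 1, 0)
print(topic_details['flag_strict'].value_counts())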
# Evaluate the topic-based flag against the keyword-based ground truth
y_true = df['flag']
y_predict = topic_details['flag']
print(f"R2: {r2_score(y_true, y_predict):0.3f}")
print(f"Roc_Auc: {roc_auc_score(y_true, y_predict):0.3f}\n")
print('Classification report:\n', classification_report(y_true, y_predict))
class_names = np.array(['Non-Fraud', 'Fraud'])
myML_functions.plot_confusion_matrix(y_true, y_predict, classes=class_names)
plt.show()