#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Nov 16 15:08:53 2018
@author: huwei
Project 3 Natural Language Processing with Women’s Clothing E-Commerce dataset
Data source: https://www.kaggle.com/nicapotato/womens-ecommerce-clothing-reviews
"""
# General
import pandas as pd
import numpy as np
import time
import os
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim # don't skip this
import matplotlib.pyplot as plt
import seaborn as sns
import eli5
# Preprocessing
import string
from nltk.tokenize import RegexpTokenizer
# nltk.download('stopwords')
from nltk.corpus import stopwords
# Regular expression operations
import re
#from nltk.stem import PorterStemmer
# Modeling
import statsmodels.api as sm
# Give a sentiment intensity score to sentences
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment.util import *
from nltk.util import ngrams
from collections import Counter
from gensim.models import word2vec
from sklearn import linear_model, metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold
#import nltk
#nltk.download('punkt')
from nltk.tokenize import word_tokenize
# Warning
import warnings
warnings.filterwarnings('ignore')
from gensim import corpora, models, similarities, matutils
from gensim.corpora import Dictionary
import logging
logging.basicConfig(format = '%(asctime)s : %(levelname)s : %(message)s', level = logging.INFO)
from nltk.stem.lancaster import LancasterStemmer
# Vs. nltk.stem.PorterStemmer
from nltk.stem.porter import PorterStemmer
##############################################################################
# Self-defined functions
##############################################################################
# Define prediction error calculation function
def prediction_error(y, y_hat):
    """
    Calculate the prediction error rate.
    Arguments:
    y -- np.array / pd.Series
        The original values of the binary dependent variable;
    y_hat -- np.array / pd.Series
        The predicted values of the binary dependent variable;
    Return:
    error -- float
        The share of observations where y and y_hat disagree;
    """
    error = float(sum(y != y_hat)) / len(y)
    return error
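# Illustrative sanity check for prediction_error (hypothetical labels, not from the dataset):
# prediction_error(np.array([1, 0, 1, 1]), np.array([1, 1, 1, 0]))  # 2 of 4 disagree -> 0.5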
# Generate cross-validation error
def logitCV(X, y, Kfolds = 10):
    '''
    Calculate the K-fold cross-validation error of a logistic regression.
    Arguments:
    X -- np.array
        The explanatory variables;
    y -- np.array
        The binary dependent variable;
    Kfolds -- integer
        The number of subsets in the cross validation, default is 10;
    Return:
    cv_error -- float
        Cross-validation prediction error;
    '''
    # Initialize the container for the out-of-sample prediction errors
    errors = np.zeros((Kfolds,), dtype = 'float32')
    # Split the data into 'Kfolds' train/test index pairs
    # (sklearn.cross_validation was removed; use sklearn.model_selection.KFold)
    splits = list(KFold(n_splits = Kfolds, shuffle = True, random_state = 1).split(X))
    for i in range(Kfolds):
        training_idx, test_idx = splits[i]
        # Logistic regression
        clf = linear_model.LogisticRegression()
        # Fit the model on the K-1 training folds
        clf.fit(X[training_idx], y[training_idx])
        # Predict on the held-out fold
        y_hat = clf.predict(X[test_idx])
        errors[i] = prediction_error(y[test_idx], y_hat)
    cv_error = np.mean(errors)
    return cv_error
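# =============================================================================
# # Minimal usage sketch for logitCV on synthetic data (illustrative only;
# # the names X_demo / y_demo are hypothetical and not used elsewhere):
# X_demo = np.random.rand(100, 5)
# y_demo = np.random.randint(0, 2, size=100)
# print(logitCV(X_demo, y_demo, Kfolds=5))
# =============================================================================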
# Define the cross validation function for LDA
def ldaCV(texts, sentiment, n_topics):
    """
    Cross-validate LDA by using the document-topic proportions as features
    in a logistic regression on the sentiment label.
    Arguments:
    texts -- list
        The preprocessed texts;
    sentiment -- pandas.core.series.Series
        The sentiment of the texts;
    n_topics -- numpy.ndarray
        The array of candidate numbers of topics for LDA;
    Return:
    n_topics[idx] -- numpy.int64
        Number of topics for the LDA with the lowest cross-validation error;
    best_lda -- gensim.models.ldamodel.LdaModel
        The LDA model with the lowest cross-validation error;
    cv_errors -- numpy.ndarray
        The array of cross-validation errors for the different numbers of topics;
    """
    # Create dictionary
    dictionary = Dictionary(texts)
    # Convert texts to BoW format corpus
    corpus = [dictionary.doc2bow(text) for text in texts]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    cv_errors = np.zeros((len(n_topics),), dtype = 'float32')
    lda = []
    for i in range(len(n_topics)):
        lda.append(models.LdaModel(corpus_tfidf, id2word = dictionary, num_topics = n_topics[i]))
        corpus_lda = lda[i][corpus_tfidf]
        # Dense document-topic matrix; pad columns to the largest candidate so shapes are comparable
        matrix_lda = np.transpose(matutils.corpus2dense(corpus_lda, num_terms = n_topics[-1]))
        cv_errors[i] = logitCV(matrix_lda, sentiment)
        print('Number of topics:', n_topics[i], '; Cross-validation error:', cv_errors[i])
    # Find the index of the smallest cross-validation error
    val, idx = min((val, idx) for (idx, val) in enumerate(cv_errors))
    # Return the optimal number of topics and the corresponding LDA model
    return n_topics[idx], lda[idx], cv_errors
#
def preprocessing(data):
    """
    Preprocess the textual data.
    Arguments:
    data -- pandas.core.series.Series
        The text data which need to be preprocessed;
    Return:
    words -- list
        List of all the preprocessed words in data;
    """
    tokenizer = RegexpTokenizer(r'\w+')
    stop_words = set(stopwords.words('english'))
    # Use a local Porter stemmer instead of relying on the module-level `st` defined later
    stemmer = PorterStemmer()
    txt = data.str.lower().str.cat(sep=' ')             #1 lowercase and join into one string
    words = tokenizer.tokenize(txt)                     #2 tokenize on word characters
    words = [w for w in words if not w in stop_words]   #3 remove stopwords
    words = [stemmer.stem(w) for w in words]            #4 stem
    return words
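# =============================================================================
# # Example call for preprocessing (hypothetical reviews, illustrative only):
# # preprocessing(pd.Series(['I love this dress!', 'Runs small.']))
# # -> ['love', 'dress', 'run', 'small']
# =============================================================================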
def get_ngrams(text, n):
    """
    Return the n-grams generated from a sequence of items, as a list of strings.
    For example:
    >>> from nltk.util import ngrams
    >>> list(ngrams([1,2,3,4,5], 3))
    [(1, 2, 3), (2, 3, 4), (3, 4, 5)]
    Arguments:
    text -- list
        The tokenized textual data;
    n -- int
        The order n of the n-grams;
    Return -- list
        List of all the n-grams of the data, joined as strings;
    """
    n_grams = ngrams(text, n)
    return [' '.join(grams) for grams in n_grams]
def gramfreq(text, n, num):
    """
    Get the most frequent n-grams in the text data.
    Arguments:
    text -- list
        The tokenized textual data;
    n -- int
        The order n of the n-grams;
    num -- int
        The number of top n-grams to return;
    Return -- pandas.DataFrame
        The `num` most frequent n-grams and their frequencies;
    """
    # Extracting n-grams
    result = get_ngrams(text, n)
    # Counting n-grams
    result_count = Counter(result)
    # Converting the result to a data frame (the n-grams themselves are kept in the index)
    df = pd.DataFrame.from_dict(result_count, orient='index')
    df = df.rename(columns={0: 'frequency'})
    return df.sort_values(["frequency"], ascending=[0])[:num]
def gram_table(data, gram, length):
    """
    Build a table of the `length` most frequent n-grams in `data`
    for each n-gram order listed in `gram` (e.g. [1, 2, 3]).
    """
    out = pd.DataFrame(index=None)
    words = preprocessing(data)
    for i in gram:
        table = pd.DataFrame(gramfreq(words, i, length).reset_index())
        table.columns = ["{}-Gram".format(i), "Occurrence"]
        out = pd.concat([out, table], axis=1)
    return out
##############################################################################
path_save_fig = os.path.join(os.getcwd(), 'Images')
path_save_data = os.path.join(os.getcwd(), 'Data')
##############################################################################
#### Read Data
##############################################################################
df = pd.read_csv(os.path.join(path_save_data,'Womens Clothing E-Commerce Reviews.csv')).fillna(' ')
print(df.columns)
print(df.index)
print(df.shape)
##############################################################################
#### Data Preprocessing
##############################################################################
# .copy() avoids a SettingWithCopyWarning when adding the index column below
pdtextpreprocess = df[['Title', 'Review Text', 'Rating']].copy()
pdtextpreprocess['index'] = pdtextpreprocess.index
documents = pdtextpreprocess['Review Text']
# print
print(documents.shape)
word_tokenize(documents[0])
PorterStemmer().stem(word_tokenize(documents[0])[0])
type(stopwords)
#
#####################
# Text tokenized -- Unigram
print('Text Tokenization...')
t1 = time.time()
text_tokenized = [[word.lower() for word in word_tokenize(document)] \
for document in documents]
print('Time used: %s seconds' % (time.time()-t1))
#####################
# Remove stopwords
print('Removing Stopwords...')
t1 = time.time()
# Build the stopword set once instead of reloading it for every word
stop_words = set(stopwords.words('english'))
text_filtered_stopwords = [[word for word in document if not word in stop_words]
                           for document in text_tokenized]
print('Time used: %s seconds' % (time.time()-t1))
print('Length: %s' % len(text_filtered_stopwords))
#####################
# Removing English Punctuation
print('Removing English Punctuation...')
t1 = time.time()
english_punctuations = string.punctuation
texts_filtered = [[word for word in document if not word in english_punctuations] for document in text_filtered_stopwords]
print('Time used: %s seconds' % (time.time()-t1))
print('Length %s' % len(texts_filtered))
#####################
# Stemming
print('Stemming...')
st = PorterStemmer()
t1 = time.time()
texts_stemmed = [[st.stem(word) for word in document] for document in texts_filtered]
print('Time used: %s seconds' % (time.time()-t1))
print('Length %s' % len(texts_stemmed))
print(texts_filtered[0])
print(texts_stemmed[0])
# list of all stems
all_stems = sum(texts_stemmed, [])
type(all_stems)
len(all_stems)
# Count how often each stem occurs with a Counter (much faster than calling list.count per stem)
stem_counts = Counter(all_stems)
# set of all stems that appear only once
once_stems = set(stem for stem, count in stem_counts.items() if count == 1)
type(once_stems)
len(once_stems)
#####################
# Removing low-frequency stems (those whose occurrence equals one)
texts = [[stem for stem in text if stem not in once_stems] for text in texts_stemmed]
type(texts)
len(texts)
#####################
# save
with open(os.path.join(path_save_data,'Preprocessed Review Text.txt'), 'w') as f:
    for text in texts:
        for item in text:
            f.write("%s " % item)
        f.write("\n")
# Read
# =============================================================================
# texts = []
# with open(os.path.join(path_save_data,'Preprocessed Review Text.txt'), 'r') as f:
#     for line in f:
#         text = line.split()
#         texts.append(text)
# =============================================================================
# =============================================================================
# # Or you can use pickle
# import pickle
#
# with open('Preprocessed Review Text.p', 'wb') as fp:
#     pickle.dump(texts, fp)
#
# with open('Preprocessed Review Text.p', 'rb') as fp:
#     itemlist = pickle.load(fp)
# =============================================================================
##############################################################################
# Extracting topics and calculating texts correlation
##############################################################################
# Create dictionary
dct = Dictionary(texts)
# convert text to BoW format corpus
corpus = [dct.doc2bow(text) for text in texts]
#
for cp in corpus[:1]:
    for id, freq in cp:
        print(dct[id], ',', freq)
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
type(corpus_tfidf)
for cp in corpus_tfidf[:1]:
    for id, freq in cp:
        print(dct[id], ',', freq)
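# Note: the BoW entries above are raw (token_id, count) pairs, while the tf-idf
# entries carry the same token ids re-weighted, so frequent but uninformative
# stems are down-weighted before the LSI/LDA models below.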
# =============================================================================
# for doc in corpus_tfidf:
#     print(doc, "\n")
# =============================================================================
# Latent Semantic Indexing
# Training lsi model
lsi = models.LsiModel(corpus_tfidf, id2word = dct, num_topics = 100)
lsi.print_topics(10)
# Map the document to the topic space to see the correlation between the document and topic
corpus_lsi = lsi[corpus_tfidf]
# =============================================================================
# for doc in corpus_lsi[:10]:
#     print(doc)
# =============================================================================
query = "good dress"
# Change query word to vector
query_bow = dct.doc2bow(query.lower().split())
print(query_bow)
# Mapping query word to 100 dimensional topic space with LSI model
query_lsi = lsi[query_bow]
for idx, val in enumerate(query_lsi):
    print(val[0], ',', val[1])
# Calculate the cosine similarity/correlation degree btw documents and query word
index = similarities.MatrixSimilarity(lsi[corpus])
sims = index[query_lsi]
print(list(enumerate(sims)))
# Output sorted results
sort_sims = sorted(enumerate(sims), key = lambda item:-item[-1])
top10 = sort_sims[:10]
top10doc = [texts[j[0]] for j in top10]
print(top10doc)
############################
# Train lda model
lda = models.LdaModel(corpus_tfidf, id2word = dct, num_topics = 100)
# Compute Perplexity
print('\nPerplexity: ', lda.log_perplexity(corpus_tfidf))
# a measure of how good the model is. lower the better.
# Compute Coherence Score
coherence_model_lda = models.CoherenceModel(model=lda, texts=texts, dictionary=dct, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)
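# Unlike perplexity, higher c_v coherence generally indicates more
# human-interpretable topics.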
# =============================================================================
# # Visualize the topics
# pyLDAvis.enable_notebook()
# vis = pyLDAvis.gensim.prepare(lda, corpus_tfidf, dct)
# vis
# =============================================================================
###########################
# Cross-validation of LDA
sentiment = df['Recommended IND']
#
f, axes = plt.subplots(1,1, figsize=(14,4), sharex=False)
sns.countplot(x='Recommended IND', data=df,order=sentiment.value_counts().index)
axes.set_title("Frequency Distribution for\nRecommended IND")
axes.set_ylabel("Occurrence")
axes.set_xlabel('Recommended IND')
fig = plt.gcf()
fig.savefig(os.path.join(path_save_fig,'RecommendedIND.png'))
plt.show()
#
print('Cross-Validation...')
t1 = time.time()
[best_topic, best_lda, cv_errors] = ldaCV(texts, sentiment, n_topics = np.linspace(5,100,20).astype(int))
print('Time used: %s seconds' % (time.time()-t1))
# Save model to disk
best_lda.save(os.path.join(path_save_data,'best_lda'))
# =============================================================================
# # Load a potentially pretrained model from disk.
# best_lda = LdaModel.load(os.path.join(path_save_data,'best_lda'))
# =============================================================================
# Save figure
plt.figure()
plt.plot(np.linspace(5,100,20).astype(int), cv_errors, linewidth = 6)
plt.axvline(x = best_topic, color = 'r', linestyle = '--')
plt.xlabel('# of Topics', fontsize = 18)
plt.ylabel('CV Error', fontsize = 18)
plt.title('Cross-validation errors for LDA', fontsize = 18)
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
fig.savefig(os.path.join(path_save_fig, 'cv_errors.png'))
#######################
# Gram Table
# Recommended reviews (Recommended IND == 1)
Recommended_gramtab = gram_table(data=documents[df["Recommended IND"].astype(int) == 1], gram=[1, 2, 3], length=20)
Recommended_gramtab.to_csv(os.path.join(path_save_data, 'Recommended_Gram_Tab.csv'))
# Not recommended reviews (Recommended IND == 0)
NotRecommended_gramtab = gram_table(data=documents[df["Recommended IND"].astype(int) == 0], gram=[1, 2, 3], length=20)
NotRecommended_gramtab.to_csv(os.path.join(path_save_data, 'NotRecommended_Gram_Tab.csv'))
##########################################################
# Creating Bigrams and Trigrams Models, higher threshold fewer phrases
##########################################################
print('Creating Bigrams Model...')
# Build the bigram models
t1 = time.time()
bigram = models.Phrases(texts, min_count=1, threshold=1)
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = models.phrases.Phraser(bigram)
print('Time used: %s seconds' % (time.time()-t1))
# See example
print(bigram_mod[texts[9]])
texts_bigram = bigram_mod[texts]
#####################
# save
with open(os.path.join(path_save_data,'Preprocessed Review Text Bigram.txt'), 'w') as f:
    for text in texts_bigram:
        for item in text:
            f.write("%s " % item)
        f.write("\n")
# Read
# =============================================================================
# texts_bigram = []
# with open(os.path.join(path_save_data,'Preprocessed Review Text Bigram.txt'), 'r') as f:
#     for line in f:
#         text = line.split()
#         texts_bigram.append(text)
# =============================================================================
#
print('Cross-Validation...')
t1 = time.time()
n_topics = np.linspace(5,100,20).astype(int)
[best_topic_bi, best_lda_bi, cv_errors_bi] = ldaCV(texts_bigram, sentiment, n_topics)
print('Time used: %s seconds' % (time.time()-t1))
# Save model to disk
best_lda_bi.save(os.path.join(path_save_data,'best_lda_bigram'))
# Save figure
plt.figure()
plt.plot(n_topics, cv_errors_bi, linewidth = 6)
plt.axvline(x = best_topic_bi, color = 'r', linestyle = '--')
plt.xlabel('# of Topics', fontsize = 18)
plt.ylabel('CV Error', fontsize = 18)
plt.title('Cross-validation errors for LDA-Bigram', fontsize = 18)
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.show()
fig.savefig(os.path.join(path_save_fig, 'cv_errors_bigram.png'))
#######################
# Build the trigram models
t1 = time.time()
trigram = models.Phrases(bigram[texts], threshold=1)
# Faster way to get a sentence clubbed as a trigram/bigram
trigram_mod = models.phrases.Phraser(trigram)
print('Time used: %s seconds' % (time.time()-t1))
# See example
print(trigram_mod[bigram_mod[texts[9]]])
texts_trigram = trigram_mod[bigram_mod[texts]]
#####################
# save
with open(os.path.join(path_save_data,'Preprocessed Review Text Trigram.txt'), 'w') as f:
    for text in texts_trigram:
        for item in text:
            f.write("%s " % item)
        f.write("\n")
# Read
# =============================================================================
# texts_trigram = []
# with open(os.path.join(path_save_data,'Preprocessed Review Text Trigram.txt'), 'r') as f:
#     for line in f:
#         text = line.split()
#         texts_trigram.append(text)
# =============================================================================
#
print('Cross-Validation...')
t1 = time.time()
n_topics = np.linspace(5,100,20).astype(int)
[best_topic_tri, best_lda_tri, cv_errors_tri] = ldaCV(texts_trigram, sentiment, n_topics)
print('Time used: %s seconds' % (time.time()-t1))
# Save model to disk
best_lda_tri.save(os.path.join(path_save_data,'best_lda_trigram'))
# Save figure
plt.figure()
plt.plot(n_topics, cv_errors_tri, linewidth = 6)
plt.axvline(x = best_topic_tri, color = 'r', linestyle = '--')
plt.xlabel('# of Topics', fontsize = 18)
plt.ylabel('CV Error', fontsize = 18)
plt.title('Cross-validation errors for LDA-Trigram', fontsize = 18)
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.show()
fig.savefig(os.path.join(path_save_fig, 'cv_errors_trigram.png'))
#############################
# Using TfidfVectorizer (scipy sparse output) to get unigram textual features
# (set ngram_range=(1, 3) to also include bigram and trigram features)
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=10000)
# TfidfVectorizer expects raw strings, so re-join the preprocessed token lists
docs = [' '.join(text) for text in texts]
word_vectorizer.fit(docs)
X = word_vectorizer.transform(docs)
y = df["Recommended IND"].copy()
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.20, random_state=23, stratify=y)
model = linear_model.LogisticRegression()
model.fit(X_train, y_train)
print("Train Set Accuracy: {}".format(metrics.accuracy_score(y_train, model.predict(X_train))))
print("Train Set ROC AUC: {}\n".format(metrics.roc_auc_score(y_train, model.predict_proba(X_train)[:, 1])))
print("Validation Set Accuracy: {}".format(metrics.accuracy_score(y_valid, model.predict(X_valid))))
print("Validation Set ROC AUC: {}".format(metrics.roc_auc_score(y_valid, model.predict_proba(X_valid)[:, 1])))
target_names = ["Not Recommended","Recommended"]
from IPython.display import display, HTML, Image
display(eli5.show_weights(model, vec=word_vectorizer, top=100,
                          target_names=target_names))
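# =============================================================================
# # Optional: K-fold CV error of the tf-idf logistic regression via the
# # logitCV helper above (sketch only; LogisticRegression accepts the sparse X):
# print('Tf-idf CV error:', logitCV(X, np.asarray(y), Kfolds=5))
# =============================================================================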