WordEmbedder.py
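'''
Utilities built on top of pre-trained GloVe word embeddings (loaded through
gluonnlp): word-vector lookup, nearest-neighbour search, word analogies,
cosine similarity, and helpers for turning a tokenized dataset into an
embedding matrix and integer sequences for downstream models.
'''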
import warnings
warnings.filterwarnings('ignore')

from mxnet import nd
import gluonnlp as nlp
import numpy as np

nlp.utils.check_version('0.7.0')

# prefix of the pre-trained GloVe (6B-token) embedding source; the dimension is appended later
EMB_PATH = 'glove.6B.'

class Embedder():
    def __init__(self, dimensions=100):  # takes 50, 100, 200 or 300
        self.__d = dimensions  # size of the embedding of each word
        self.__emb_path = EMB_PATH + str(self.__d) + 'd'
        temp = nlp.embedding.create('glove', source=self.__emb_path)
        embedder = nlp.Vocab(nlp.data.Counter(temp.idx_to_token))
        embedder.set_embedding(temp)
        self.__embedder = embedder  # vocabulary with attached embeddings
        self.__emb_mapper = self.__embedder.embedding  # maps words to their embeddings

    def __norm_vecs_by_row(self, x):
        # normalise every row vector to unit length (epsilon avoids division by zero)
        return x / nd.sqrt(nd.sum(x * x, axis=1) + 1E-10).reshape((-1, 1))

    def __cos_sim(self, x, y):
        # cosine of the angle between two vectors
        return nd.dot(x, y) / (nd.norm(x) * nd.norm(y))

    def get_embedder(self):
        '''
        Returns the embedding lookup that maps words to their vectors.
        Eg.
            emb = Embedder(dimensions=50)
            embedding = emb.get_embedder()
            embedding['beautiful']
        Returns...
            the 50-dimensional embedding of the word 'beautiful' as an NDArray
        '''
        return self.__emb_mapper

    def most_similar_to(self, word, k=5):
        '''
        Returns the top k words most similar to the argument.
        Eg.
            emb = Embedder(dimensions=50)
            print(emb.most_similar_to('baby'))
        Returns...
            ['babies', 'boy', 'girl', 'newborn', 'pregnant']
        '''
        vec = self.__emb_mapper[word].reshape((-1, 1))
        emb_vecs = self.__norm_vecs_by_row(self.__emb_mapper.idx_to_vec)
        dot_product = nd.dot(emb_vecs, vec)
        # ask for k+1 neighbours, because the closest match is the input word itself
        indices = nd.topk(dot_product.reshape((len(self.__embedder), )), k=k + 1, ret_typ='indices')
        indices = [int(i.asscalar()) for i in indices]
        # Drop the input word and return the remaining k tokens.
        return self.__embedder.to_tokens(indices[1:])

    def get_top_k_by_analogy(self, word1, word2, word3, k=1):
        '''
        Returns the word(s) that complete the analogy word1 -> word2 ; word3 -> ?
        Analogy refers to:
            king->queen ; man->woman
            good->better ; bad->worse
            do->did ; go->went
        Eg.
            emb = Embedder(dimensions=50)
            print(emb.get_top_k_by_analogy('good', 'best', 'bad'))
        Returns...
            ['worst']
        A list is returned because you can ask for the top k analogies.
        '''
        word_vecs = self.__emb_mapper[word1, word2, word3]
        # vector arithmetic: word2 - word1 + word3 approximates the missing word
        word_diff = (word_vecs[1] - word_vecs[0] + word_vecs[2]).reshape((-1, 1))
        vocab_vecs = self.__norm_vecs_by_row(self.__emb_mapper.idx_to_vec)
        dot_product = nd.dot(vocab_vecs, word_diff)
        indices = nd.topk(dot_product.reshape((len(self.__embedder), )), k=k, ret_typ='indices')
        indices = [int(i.asscalar()) for i in indices]
        return self.__embedder.to_tokens(indices)

    def cosine_similarity(self, word1, word2):
        '''
        Returns the cosine of the angle between the embeddings of the two words.
        Eg.
            emb = Embedder(dimensions=50)
            print(emb.cosine_similarity('good', 'bad'))
        Returns...
            0.79648924
        '''
        x = self.__emb_mapper[word1]
        y = self.__emb_mapper[word2]
        return np.squeeze(self.__cos_sim(x, y).asnumpy())

    def cosine_sim_analogy(self, word1, word2, word3, word4):
        '''
        Returns a correctness score for the analogy word1 -> word2 ; word3 -> word4.
        Scores closer to 1 indicate a better analogy.
        Eg.
            emb = Embedder(dimensions=50)
            print(emb.cosine_sim_analogy('man', 'woman', 'son', 'daughter'))
        Returns...
            0.9658341
        '''
        words = [word1, word2, word3, word4]
        vecs = self.__emb_mapper[words]
        return np.squeeze(self.__cos_sim(vecs[1] - vecs[0] + vecs[2], vecs[3]).asnumpy())

    def get_embedding_matrix(self, vocab, num_words, oov_token='<oov>'):
        '''
        Parameters:
            vocab (list): top {num_words} words in the tokenized dataset
            num_words (int): size of the vocabulary
        Returns:
            embedding_matrix, idx_to_token (dict), token_to_idx (dict)
        When using pre-trained word embeddings, the Embedding layer from
        tensorflow.keras.layers expects a 'weights' parameter; we pass
        weights=[embedding_matrix] for that purpose (see the sketch after this class).
        This function builds the embedding_matrix for your vocabulary.
        Eg.
            emb = Embedder(dimensions=50)
            embedding_matrix, id_to_token, token_to_id = emb.get_embedding_matrix(vocabulary, num_words=5000)
        '''
        token_to_idx = {}
        idx_to_token = {}
        embedding_matrix = []
        if oov_token not in vocab:
            # reserve index 0 for the out-of-vocabulary token
            embedding_matrix.append(self.__emb_mapper[oov_token].asnumpy())
            token_to_idx[oov_token] = 0
            idx_to_token[0] = oov_token
            for i in range(num_words - 1):
                word = vocab[i].lower()
                token_to_idx[word] = i + 1
                idx_to_token[i + 1] = word
                embedding_matrix.append(self.__emb_mapper[word].asnumpy())
        else:
            for i in range(num_words):
                word = vocab[i].lower()
                token_to_idx[word] = i
                idx_to_token[i] = word
                embedding_matrix.append(self.__emb_mapper[word].asnumpy())
        embedding_matrix = np.asarray(embedding_matrix)
        return (
            embedding_matrix,
            idx_to_token,
            token_to_idx
        )
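
# --- Illustrative sketch (not in the original module) ---------------------
# The get_embedding_matrix docstring above mentions feeding the returned matrix
# to a tensorflow.keras Embedding layer via weights=[embedding_matrix]. The
# helper below is a minimal, hedged sketch of that pattern; its name is made up
# for illustration, and it assumes TensorFlow/Keras is installed.
def build_keras_embedding_layer(embedding_matrix, trainable=False):
    from tensorflow.keras.layers import Embedding  # local import: optional dependency
    num_words, dimensions = embedding_matrix.shape
    return Embedding(
        input_dim=num_words,         # rows of the matrix = vocabulary size
        output_dim=dimensions,       # columns = embedding dimensionality
        weights=[embedding_matrix],  # initialise with the pre-trained GloVe vectors
        trainable=trainable,         # keep the pre-trained embeddings frozen by default
    )
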
def generate_vocabulary(tokenized_sentences):
    '''
    Parameters:
        tokenized_sentences : list of tokenized sentences of the dataset
    Returns:
        vocab -> list of words in the dataset sorted from highest occurrence to lowest
        vocab_dict -> dictionary that maps words to the count of their occurrences in the dataset
    '''
    vocab_dict = dict()
    # count the occurrences of every word
    for sentence in tokenized_sentences:
        for word in sentence:
            vocab_dict[word] = vocab_dict.get(word, 0) + 1
    # sort vocab_dict from highest frequency to lowest
    vocab_dict = {k: v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1], reverse=True)}
    vocab = [word for word in vocab_dict]
    return vocab, vocab_dict

def token_seq_to_num_seq(token_sequences, token_to_idx, oov_token):
    '''
    Parameters:
        token_sequences -> list of word sequences generated by tokenizing the sentences
        token_to_idx -> dictionary that maps tokens to IDs
        oov_token -> token used for out-of-vocabulary words
    Returns:
        num_sequences -> list of number sequences generated by mapping tokens to their IDs
    Eg.
        emb = Embedder(dimensions=50)
        embedding_matrix, id2token, token2id = emb.get_embedding_matrix(vocab, num_words)
        token_seq_to_num_seq(token_sequences, token2id, oov_token='<oov>')
    '''
    num_sequences = []
    for sequence in token_sequences:
        # unknown tokens fall back to the ID of the oov token (0 if that is missing too)
        num_sequences.append(np.asarray([token_to_idx.get(token.lower(), token_to_idx.get(oov_token, 0)) for token in sequence]))
    return num_sequences

def num_seq_to_token_seq(num_sequences, idx_to_token):
    '''
    Parameters:
        num_sequences -> list of sequences represented using the IDs of the tokens
        idx_to_token -> dictionary that maps IDs -> tokens
    Returns:
        token_sequences -> list of token sequences for the corresponding number sequences
    Eg.
        X = token_seq_to_num_seq(token_sequences, token2id, oov_token='<oov>')
        num_seq_to_token_seq(X, id2token)
    '''
    token_sequences = []
    for sequence in num_sequences:
        token_sequences.append([idx_to_token[i] for i in sequence])
    return token_sequences
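
# --- Hedged end-to-end example (not in the original module) ---------------
# A minimal walk-through of the pieces above, assuming the GloVe vectors that
# gluonnlp downloads on first use are available. The toy sentences are
# illustrative only.
if __name__ == '__main__':
    sentences = [['the', 'baby', 'is', 'beautiful'],
                 ['a', 'good', 'baby', 'and', 'a', 'bad', 'baby']]

    emb = Embedder(dimensions=50)
    print(emb.most_similar_to('baby'))                       # nearest neighbours of 'baby'
    print(emb.cosine_similarity('good', 'bad'))              # similarity of two words
    print(emb.get_top_k_by_analogy('good', 'best', 'bad'))   # analogy completion

    # build a vocabulary and an embedding matrix for the toy dataset
    vocab, vocab_dict = generate_vocabulary(sentences)
    embedding_matrix, id_to_token, token_to_id = emb.get_embedding_matrix(vocab, num_words=len(vocab))

    # round-trip between token sequences and integer sequences
    num_seqs = token_seq_to_num_seq(sentences, token_to_id, oov_token='<oov>')
    print(num_seqs)
    print(num_seq_to_token_seq(num_seqs, id_to_token))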