-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtext_mining_toolbox2.py
204 lines (169 loc) · 8.08 KB
/
text_mining_toolbox2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
#Author: Dimitry Slavin
#Name: "text_mining_toolbox2.py"
#Date Created: Monday, August 31, 2015
#General Purpose: A set of functions that make it easier to manipulate text (keyword extraction, keyword labeling, conditional filtration)
#A brief description of each function is provided below the function declaration.
def ngrams(text, n):
    """Return every n-gram of text as a list of token lists.

    text is split on single space characters (so consecutive spaces yield
    empty-string tokens), and each n-gram is a list of n consecutive tokens.
    An empty list is returned when text has fewer than n tokens.
    """
    tokens = text.split(' ')
    windowCount = len(tokens) - n + 1
    return [tokens[start:start + n] for start in range(windowCount)]
def flattenList(inputList):
    """Return a new list with the items of every iterable in inputList, in order.

    Only one level of nesting is removed.
    """
    flattened = []
    for parent in inputList:
        flattened.extend(parent)
    return flattened
def getFrequentGrams(textList, n):
    """Count the n-grams that occur across all strings in textList.

    Each n-gram is re-joined with single spaces. The result is the pandas
    Series produced by value_counts(): index = n-gram string, value = number
    of occurrences, most frequent first.
    """
    import pandas as pd
    tokenGrams = flattenList(ngrams(text, n) for text in textList)
    phrases = [' '.join(tokens) for tokens in tokenGrams]
    return pd.Series(phrases).value_counts()
#TEXT EXTRACTION
#--------------------------------------------------------------------------------------------------------------------------------------------------------
def getHelper(text, expression):
    """Return all non-overlapping matches of the regex expression in text.

    This is re.findall: with no capture group (or exactly one) the result is
    a list of strings; with several groups it is a list of tuples.
    """
    import re
    return re.findall(expression, text)
def getEmails(text):
    """Return a list of the e-mail addresses found in text.

    An address is a local part (letters, digits, '_', '.', '+', '-'), an '@',
    and a domain made of two or more dot-separated labels of letters, digits
    and hyphens.
    """
    # The domain is matched label by label instead of with a character class
    # that contains '.', so punctuation that merely follows an address
    # ("write to bob@example.com.") is not captured as part of it.
    expression = r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+)"
    return getHelper(text, expression)
def getHandles(text):
    """Return a list of the @handles found in text.

    A handle is '@' followed by two or more characters, the first a letter
    and the rest letters, digits or underscores. It must sit at the start of
    the text or directly after a character other than a letter, digit, '-',
    '_' or '.'.
    """
    handlePattern = r'(?<=^|(?<=[^a-zA-Z0-9-_\.]))(@[A-Za-z]+[A-Za-z0-9_]+)'
    return getHelper(text, handlePattern)
def getHashtags(text):
    """Return a list of the #hashtags found in text.

    A hashtag is '#' followed by two or more characters, the first a letter
    and the rest letters, digits or underscores. It must sit at the start of
    the text or directly after a character other than a letter, digit, '-',
    '_' or '.'.
    """
    hashtagPattern = r'(?<=^|(?<=[^a-zA-Z0-9-_\.]))(#[A-Za-z]+[A-Za-z0-9_]+)'
    return getHelper(text, hashtagPattern)
def getURLs(text):
    """Return a list of the URLs found in text.

    The pattern (case-insensitive) accepts candidates that start with
    "http://" / "https://", a "www." style prefix, or a domain-like token
    followed by '/'.
    """
    urlPattern = r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))'
    urls = []
    # The pattern has several capture groups, so each match arrives as a
    # tuple; the first group spans the whole URL.
    for groups in getHelper(text, urlPattern):
        urls.append(groups[0])
    return urls
def prepExpressionList(inputList):
    """Wrap every regex in inputList with "standalone token" lookarounds.

    The returned expressions only match where the original expression is
    neither directly preceded nor directly followed by a letter, digit or
    dash. Start and end of the string satisfy the lookarounds too.
    """
    notPrecededBy = r'(?<![a-zA-Z0-9-])'
    notFollowedBy = r'(?![a-zA-Z0-9-])'
    wrapped = []
    for tag in inputList:
        wrapped.append(notPrecededBy + tag + notFollowedBy)
    return wrapped
def getExpressions(text, inputList, prep=True):
    """Return every match in text of the regexes in inputList.

    inputList may be a single regex string or a list of them. With prep=True
    each regex is first wrapped by prepExpressionList so it only matches as a
    standalone token. Matches are grouped by expression, in the order the
    expressions were given.
    """
    # Accept a bare string as shorthand for a one-element list.
    patterns = [inputList] if isinstance(inputList, str) else inputList
    if prep:
        patterns = prepExpressionList(patterns)
    found = []
    for pattern in patterns:
        found += getHelper(text, pattern)
    return found
#LABELING
#--------------------------------------------------------------------------------------------------------------------------------------------------------
def labelHelper(text, expression, label):
    """Return text with every match of the regex expression replaced by label.

    label is handed to re.sub as the replacement, so backslash escapes and
    group references inside it are processed by the re module.
    """
    import re
    return re.sub(expression, label, text)
def labelURLs(text, label='{URL}'):
    """Return text with every URL replaced by label.

    The pattern (case-insensitive) accepts candidates that start with
    "http://" / "https://", a "www." style prefix, or a domain-like token
    followed by '/'.
    """
    urlPattern = r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))'
    return labelHelper(text, urlPattern, label)
def labelEmails(text, label='{EMAIL}'):
    """Return text with every e-mail address replaced by label.

    An address is a local part (letters, digits, '_', '.', '+', '-'), an '@',
    and a domain made of two or more dot-separated labels of letters, digits
    and hyphens.
    """
    # The domain is matched label by label instead of with a character class
    # that contains '.', so a sentence-ending period after an address
    # ("write to bob@example.com.") stays in the text instead of being
    # swallowed by the label.
    expression = r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+)"
    return labelHelper(text, expression, label)
def labelHandles(text, label='{HANDLE}'):
    """Return text with every @handle replaced by label.

    A handle is '@' followed by two or more characters, the first a letter
    and the rest letters, digits or underscores. It must sit at the start of
    the text or directly after a character other than a letter, digit, '-',
    '_' or '.'.
    """
    handlePattern = r'(?<=^|(?<=[^a-zA-Z0-9-_\.]))(@[A-Za-z]+[A-Za-z0-9_]+)'
    return labelHelper(text, handlePattern, label)
def labelHashtags(text, label='{HASHTAG}'):
    """Return text with every #hashtag replaced by label.

    A hashtag is '#' followed by two or more characters, the first a letter
    and the rest letters, digits or underscores. It must sit at the start of
    the text or directly after a character other than a letter, digit, '-',
    '_' or '.'.
    """
    hashtagPattern = r'(?<=^|(?<=[^a-zA-Z0-9-_\.]))(#[A-Za-z]+[A-Za-z0-9_]+)'
    return labelHelper(text, hashtagPattern, label)
def labelExpressions(text, inputList, label, prep=True):
    """Return text with every match of the regexes in inputList replaced by label.

    inputList may be a single regex string or a list of them. With prep=True
    each regex is first wrapped by prepExpressionList so it only matches as a
    standalone token. The expressions are applied one after another, each on
    the output of the previous substitution.
    """
    # Accept a bare string as shorthand for a one-element list.
    patterns = [inputList] if isinstance(inputList, str) else inputList
    if prep:
        patterns = prepExpressionList(patterns)
    labeled = text
    for pattern in patterns:
        labeled = labelHelper(labeled, pattern, label)
    return labeled
def labelStopWords(text, label='{SW}'):
    """Return text with every English stopword replaced by label.

    The stopword list is NLTK's 'english' stopwords corpus. Each word is
    matched through labelExpressions with its default standalone-token
    preparation.
    """
    from nltk.corpus import stopwords
    englishStopwords = stopwords.words('english')
    return labelExpressions(text, englishStopwords, label)
#FILTRATION
#--------------------------------------------------------------------------------------------------------------------------------------------------------
def filterByHelper(matchObjLists, method='AND'):
    """Combine several per-item match lists into one list of booleans.

    matchObjLists holds one list per condition, each with one truthy/falsy
    entry per item (all lists are expected to have the same length).
    Position i of the result is True when, for method 'OR', any condition
    matched item i; for every other method value (the default 'AND'), when
    all conditions matched item i.

    An empty matchObjLists yields an empty list instead of raising
    IndexError.
    """
    combine = any if method == 'OR' else all
    # zip(*...) regroups the per-condition lists into one column per item.
    return [combine(column) for column in zip(*matchObjLists)]
def filterByWordCount(textList, inputList, index=False, invert=False):
    """Keep the strings of textList whose word count is one of inputList.

    Words are counted by splitting on single space characters.

    textList  -- iterable of strings to filter.
    inputList -- an int or an iterable of ints: the accepted word counts.
                 An empty iterable accepts nothing.
    index     -- when False, return the kept strings; otherwise return
                 (position, string) tuples, position being the index in
                 textList.
    invert    -- when truthy, keep the strings that do NOT have an accepted
                 word count.
    """
    import itertools
    if isinstance(inputList, int):  # a bare int is shorthand for [int]
        inputList = [inputList]
    acceptedCounts = set(inputList)
    # Materialize once so one-shot iterables survive the two passes below.
    texts = list(textList)
    # Each text is split exactly once, however many counts were requested.
    condition = [len(textString.split(' ')) in acceptedCounts for textString in texts]
    if invert:
        condition = [not keep for keep in condition]
    # Identity check kept on purpose: any value other than False selects the
    # indexed output, exactly as before.
    selected = texts if index is False else enumerate(texts)
    return list(itertools.compress(selected, condition))
def filterByExpression(textList, inputList, prep=True, method='AND', index=False, invert=False):
    """Keep the strings of textList that match the regexes in inputList.

    textList  -- list of strings to filter.
    inputList -- a regex string or a list of regex strings.
    prep      -- when truthy, wrap each regex with prepExpressionList so it
                 only matches as a standalone token.
    method    -- passed to filterByHelper: 'OR' keeps a string matched by any
                 expression, the default 'AND' requires every expression to
                 match.
    index     -- when False, return the kept strings; otherwise return
                 (position, string) tuples.
    invert    -- when truthy, keep the strings that fail the condition.
    """
    import itertools
    import re
    # Accept a bare string as shorthand for a one-element list.
    patterns = [inputList] if isinstance(inputList, str) else inputList
    if prep:
        patterns = prepExpressionList(patterns)
    # One row per expression, one search result (match object or None) per text.
    matchGrid = [
        [re.search(pattern, textString) for textString in textList]
        for pattern in patterns
    ]
    keep = filterByHelper(matchGrid, method=method)
    if invert:
        keep = [not flag for flag in keep]
    selected = textList if index is False else enumerate(textList)
    return list(itertools.compress(selected, keep))