-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathtag.py
50 lines (43 loc) · 1.25 KB
/
tag.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import sys
import os
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import pos_tag
"""
Input : Folder containing all the cleaned data
Output : Tagged output for each file in input
"""
# Output location prefix: tag() writes each "<name>_features" file under it.
PREPROCESSED_DATA = "preprocessed/"
# Stop-word lookup table (word -> 1). Starts empty and is filled in place by
# get_stop_words(); tag() only tests membership on its keys.
stop_words = {}
def tag(path, filename):
    """POS-tag every non-blank line of a file and write the tagged tokens out.

    Each line of ``path`` is split on whitespace and passed to ``pos_tag``.
    Tokens whose lowercase form is a key of the module-level ``stop_words``
    are dropped; the remaining ones are rendered as ``"token:TAG"`` strings.
    For every input line that has at least one token, the ``str()`` of that
    list followed by two newlines is written; if all tokens were stop words
    only the two newlines are written. Lines with no tokens are skipped and
    produce no output at all.

    Args:
        path: Path of the input file to read.
        filename: Name used to build the output path, which is
            ``PREPROCESSED_DATA + filename.strip() + "_features"``.
    """
    print("Tagging " + path)
    out_path = PREPROCESSED_DATA + filename.strip() + "_features"
    # Context managers guarantee both handles are flushed and closed, even if
    # tagging raises part-way through the file. The output file is opened
    # first, matching the original open order.
    with open(out_path, 'w') as out_file, open(path, 'r') as in_file:
        for line in in_file:
            tokens = line.split()
            if not tokens:
                # Whitespace-only line: nothing to tag, nothing to write.
                continue
            features = [
                tok + ":" + pos
                for tok, pos in pos_tag(tokens)
                if tok.lower() not in stop_words
            ]
            if features:
                out_file.write(str(features) + '\n\n')
            else:
                # Every token was a stop word: keep the blank-line separator.
                out_file.write('\n\n')
def get_stop_words():
    """Return the module-level ``stop_words`` table, filling it on first use.

    If the table is still empty, every entry of ``stopwords.words('english')``
    is inserted as a key with the value ``1``. If it already has entries it is
    returned unchanged.

    Returns:
        The shared ``stop_words`` dict (the same object on every call).
    """
    if not stop_words:
        # Update in place so other code holding the module-level dict sees it.
        stop_words.update(dict.fromkeys(stopwords.words('english'), 1))
    return stop_words
# Root folder of the cleaned data, taken from the first command-line argument.
CLEANED_DATA_DIR = sys.argv[1]

# Fill the stop-word table once, before any file is tagged.
get_stop_words()

# os.walk visits the root folder and all of its subfolders recursively.
for root, dirs, files in os.walk(CLEANED_DATA_DIR):
    for name in files:
        absolute_path = os.path.join(root, name)
        # Skip anything that is not a regular file, and macOS Finder metadata.
        if not os.path.isfile(absolute_path) or name == ".DS_Store":
            continue
        tag(absolute_path, name)