-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathextract-proper-nouns
executable file
·60 lines (46 loc) · 1.46 KB
/
extract-proper-nouns
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/usr/bin/env python3
import sys
import os.path
import nltk
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordTokenizer
#
# Parses arguments
#
def parse_arguments(argv):
    """Return the input file path taken from the command line.

    argv is the full argument vector (program name first). Exactly one
    argument, the path of the text file to analyse, is expected.

    Exits with status 1 on a usage error and with status 2 when the path
    does not exist. Diagnostics go to stderr so that stdout only ever
    carries the extracted proper nouns.
    """
    if len(argv) != 2:
        print('Usage:', argv[0], '<text file>', file=sys.stderr)
        sys.exit(1)

    filename = argv[1]
    if not os.path.exists(filename):
        print(filename + ': no such file', file=sys.stderr)
        sys.exit(2)
    return filename


#
# Extract proper nouns
#
def extract_proper_nouns(tagged_sentence):
    """Yield the proper nouns found in one POS-tagged sentence.

    tagged_sentence is an iterable of (word, tag) pairs, as produced by
    nltk's pos_tag. Each run of consecutive words tagged 'NNP' is yielded
    as a single space-joined string.

    The first word of the sentence is always discarded to reduce the false
    positive rate. For example, in the following text, onomatopoeias are
    tagged as NNP: "Bang! Ssssssss! It exploded.".
    """
    tokens = iter(tagged_sentence)
    next(tokens, None)  # skip the first word of the sentence

    candidate = []
    for word, pos in tokens:
        if pos == 'NNP':
            candidate.append(word)
        elif candidate:
            # A non-NNP token closes the current run.
            yield ' '.join(candidate)
            candidate = []

    # The sentence may end while a run is still open.
    if candidate:
        yield ' '.join(candidate)


def main(argv):
    """Print, one per line, the proper nouns found in the file named in argv."""
    filename = parse_arguments(argv)

    try:
        with open(filename, "r") as file:
            text = file.read().replace('\n', ' ').strip()
    except OSError as err:
        # The path exists but cannot be read (directory, permissions, ...).
        print(filename + ': ' + str(err.strerror), file=sys.stderr)
        sys.exit(2)

    # First, the punkt tokenizer divides our text in sentences.
    # Each sentence is then tokenized and POS tagged.
    #
    # sent_tokenize is used rather than loading the punkt pickle by path:
    # it picks the punkt resource matching the installed nltk version.
    for sentence in nltk.sent_tokenize(text):
        tagged_sentence = pos_tag(word_tokenize(sentence))
        for proper_noun in extract_proper_nouns(tagged_sentence):
            print(proper_noun)


if __name__ == "__main__":
    main(sys.argv)