-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathcrawl.py
110 lines (92 loc) · 3.48 KB
/
crawl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# Import the necessary methods from tweepy library
import sys
from tweepy import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy import API
from tweepy import Cursor
from tweepy import TweepError
import warnings
warnings.filterwarnings("ignore")
# Variables that contains the user credentials to access Twitter API
# # keys from "Twitter Tweet Summarization" app
# SECURITY NOTE(review): these credentials are hard-coded and committed to
# source control. They should be considered compromised: revoke them and load
# replacements from environment variables or an untracked config file.
access_token = "288597754-rJGehtrfHILQLoIzdhUjnNhbqkpxPvzrBOUaJQGl"
access_token_secret = "BEwxd4UbWgVBWFz3tGsOdXYEbDNT5bK7XQNKrdcmdjML7"
consumer_key = "TsD81Kd8J93gtVcsPRwfmDXFh"
consumer_secret = "daPxbl9oexqmsBoEp6nZ764Ro0j0jUE9pzSMXIT4xoVCnUi8ff"
# Output path prefix for per-query tweet files (first CLI argument).
# NOTE(review): the __main__ block also reads sys.argv[1] as the topics-file
# path, so one argument serves two roles - confirm this is intentional.
DATA_FOLDER = sys.argv[1]
class StdOutListener(StreamListener):
    """Minimal tweepy stream listener that echoes every payload to stdout."""

    def on_data(self, data):
        # Print the raw payload; returning True keeps the stream connected.
        print(data)
        return True

    def on_error(self, status):
        # Report the error status code; default (None) return lets tweepy
        # apply its standard error handling.
        print(status)
def query_through_stream(topic):
    """Open a live Twitter stream filtered on *topic*.

    Relies on the module-level globals `auth` (OAuthHandler) and `l`
    (StdOutListener) set up in the __main__ block.
    """
    Stream(auth, l).filter(track=[topic])
def query_through_search(query):
    """Download up to 500 English tweets matching *query* and write them to
    the file DATA_FOLDER + query, one tweet per paragraph (blank-line
    separated, embedded newlines flattened to spaces).

    Pages backwards through the Search API via max_id and stops when the cap
    is reached, no more results come back, or the API errors. Only tweets
    from accounts with more than 200 followers are kept, and duplicate texts
    are skipped.
    """
    api = API(auth)  # `auth` is the module-level OAuthHandler from __main__
    seen_texts = set()          # texts already written, for de-duplication
    max_tweets = 500            # overall cap per query
    tweet_per_query = 100       # page size (Search API maximum)
    tweet_count = 0
    max_id = -1                 # next page fetches ids < max_id; -1 = unset
    since_id = None             # lower id bound; never set here, kept for
                                # symmetry with max_id paging

    # Context manager so the output file is always closed (the original
    # leaked the handle on every call).
    with open(DATA_FOLDER + query, 'w') as out:
        while tweet_count < max_tweets:
            try:
                # One parameterized call replaces four near-identical
                # api.search invocations.
                kwargs = dict(q=query, count=tweet_per_query, lang="en",
                              result_type="mixed", locale="en")
                if max_id > 0:
                    kwargs['max_id'] = str(max_id - 1)
                if since_id:
                    kwargs['since_id'] = since_id
                new_tweets = api.search(**kwargs)
                if not new_tweets:
                    print("No more tweets found")
                    break
                lowest_id = None  # smallest tweet id on this page
                for tweet in new_tweets:
                    if tweet.user.followers_count > 200 and tweet.text not in seen_texts:
                        # Keep the text as str: the original encoded to
                        # UTF-8 bytes first, which breaks the str
                        # concatenation below on Python 3.
                        tweet_text = tweet.text.strip().replace('\n', " ")
                        seen_texts.add(tweet.text)
                        out.write(tweet_text + '\n\n')
                        tweet_count += 1
                    # Track the minimum id over ALL tweets on the page so
                    # paging advances even when no tweet passes the filter
                    # (the original could leave max_id as None).
                    lowest_id = tweet.id if lowest_id is None else min(lowest_id, tweet.id)
                    if tweet_count == max_tweets:
                        break
                max_id = lowest_id
            except TweepError as e:
                print("some error : " + str(e))
                break
def isEnglish(s):
    """Return True if *s* contains only ASCII characters, else False.

    Accepts either bytes or str. The original called s.decode('ascii')
    unconditionally, which raises an uncaught AttributeError for a Python 3
    str (str has no .decode); this version handles both types.
    """
    try:
        if isinstance(s, bytes):
            s.decode('ascii')
        else:
            s.encode('ascii')
    except (UnicodeDecodeError, UnicodeEncodeError):
        return False
    return True
if __name__ == '__main__':
    # This handles Twitter authentication and the connection to the Twitter
    # Streaming/Search APIs.
    l = StdOutListener()
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    # NOTE(review): argv[1] is also consumed as DATA_FOLDER at import time,
    # so the topics file path doubles as the output-path prefix - confirm.
    TOPICS = sys.argv[1]
    # Context manager so the topics file is closed (original leaked it).
    with open(TOPICS, 'r') as topics_file:
        for topic in topics_file:
            if isEnglish(topic):
                # Pass the stripped line as str: the original re-encoded it
                # to UTF-8 bytes, which makes the str concatenation
                # DATA_FOLDER + query inside query_through_search fail on
                # Python 3 (and is a no-op for these ASCII-only lines on
                # Python 2).
                query_through_search(topic.strip())
    # query_through_stream("Scandal")