#!/usr/bin/env python
# encoding: utf-8
"""
How to use:
* Go to https://developers.facebook.com/tools/explorer and click 'Get User Access Token'
* Then select 'manage_pages' and 'read_page_mailboxes'
* Switch to a page that you want to scrape
* Get the page_id and the token and pass as parameters to this script
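
Example invocation (placeholder page ID, file name, and token):
    python run.py 123456789012345 messages.csv <access_token> --since 1514764800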
"""
import argparse
import concurrent.futures
import csv
import datetime
import re

import requests
import unidecode
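
# Maximum number of worker threads used for concurrent Graph API requests.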
CONNECTIONS = 100


class FBScraper:
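    """Scrapes a Facebook Page's conversations into a CSV file via the Graph API."""
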
def __init__(self, page, output, token, since=None, until=None, folder=None):
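        """Stores the scraper configuration and builds the initial conversations URL."""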
self.token = token
self.output = output
self.since = since
self.until = until
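        # An optional mailbox folder (e.g. page_done) restricts which conversations are listed.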
folder_param = ('&folder=' + folder) if folder is not None else ''
self.uri = self.build_url('{}/conversations?fields=participants,link&limit=400{}', page, folder_param)

    def build_url(self, endpoint, *params):
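        """Formats a Graph API endpoint and appends the access token."""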
buildres = "https://graph.facebook.com/v3.1/" + endpoint.format(*params) + '&access_token={}'.format(self.token)
print("URL: ", buildres)
return buildres

    def scrape_thread(self, url, lst):
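        """Recursively follows the message pagination of one thread, appending rows to lst."""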
if self.since:
            matches = re.findall(r'&until=(\d+)', url)
if matches and int(matches[0]) <= self.since:
return lst
messages = requests.get(url).json()
for m in messages['data']:
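            # Graph API timestamps look like 2018-01-01T12:00:00+0000; convert to a UTC epoch for comparison.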
time = datetime.datetime.strptime(m['created_time'], '%Y-%m-%dT%H:%M:%S+0000').replace(tzinfo=datetime.timezone.utc).timestamp()
if self.since and time < self.since:
continue
if self.until and time > self.until:
continue
lst.append({
'time': m['created_time'].replace('+0000', '').replace('T', ' '),
                'message': m.get('message', ''),
'attachments': m.get('attachments', {}).get('data', [{}])[0].get('image_data', {}).get('url', ''),
'shares': m.get('shares', {}).get('data', [{}])[0].get('name', ''),
'from_id': m['from']['id']
})
# if messages['data']:
# print(' +', len(messages['data']))
        next_url = messages.get('paging', {}).get('next', '')
        if next_url:
            self.scrape_thread(next_url, lst)
return lst

    def get_messages(self, t):
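        """Fetches every message in conversation t and resolves sender IDs to names."""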
extra_params = (('&since=' + str(self.since)) if self.since else '') + (('&until=' + str(self.until)) if self.until else '')
url = self.build_url('{}/messages?fields=from,created_time,message,shares,attachments&limit=400' + extra_params, t['id'])
thread = self.scrape_thread(url, [])
if thread:
print(
thread[0]['time'],
t['id'].ljust(20),
str(len(thread)).rjust(3) + ' from',
unidecode.unidecode(t['participants']['data'][0]['name'])
)
id_map = {p['id']: p['name'] for p in t['participants']['data']}
for message in thread:
message['from'] = id_map[message['from_id']]
return [{
# 'page_id': t['participants']['data'][1]['id'],
# 'page_name': t['participants']['data'][1]['name'],
# 'user_id': t['participants']['data'][0]['id'],
# 'user_name': t['participants']['data'][0]['name'],
'url': t['link'],
}] + list(reversed(thread))
return []

    def scrape_thread_list(self, threads, count):
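        """Scrapes one page of conversations concurrently, then recurses into the next page (at most count pages)."""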
with concurrent.futures.ThreadPoolExecutor(max_workers=CONNECTIONS) as executor:
futures = (executor.submit(self.get_messages, conv) for conv in threads['data'])
for future in concurrent.futures.as_completed(futures):
messages = future.result()
for message in messages:
self.writer.writerow(message)
        next_url = threads.get('paging', {}).get('next', '')
        if next_url and count > 1:
            self.scrape_thread_list(requests.get(next_url).json(), count - 1)

    def run(self):
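        """Fetches the conversation list and writes all messages to the output CSV."""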
        threads = requests.get(self.uri).json()
        if 'error' in threads:
            print(threads)
            return
        fieldnames = ['from_id', 'from', 'time', 'message', 'attachments', 'shares', 'url']
        with open(self.output, 'w', newline='', encoding='utf-8') as output:
            self.writer = csv.DictWriter(output, dialect='excel', fieldnames=fieldnames, extrasaction='ignore', quoting=csv.QUOTE_NONNUMERIC)
            self.writer.writeheader()
            self.scrape_thread_list(threads, 20)


def main():
"""
Main method
"""
parser = argparse.ArgumentParser(description='Process some integers.')
parser.add_argument('page', metavar='page_id', type=int, nargs=1, help='Facebook Page ID')
parser.add_argument('output', metavar='output_file', type=str, nargs=1, help='CSV Output File')
parser.add_argument('token', metavar='access_token', type=str, nargs=1, help='Access Token')
parser.add_argument('--folder', metavar='folder', type=str, nargs='?', help='Get messages in a specific folder eg: page_done')
parser.add_argument('--since', metavar='since_epoch', type=int, nargs='?', help='Filter messages from after this time')
parser.add_argument('--until', metavar='until_epoch', type=int, nargs='?', help='Filter messages from before this time')
args = parser.parse_args()
print("pageid: ", args.page[0])
print("csv: ", args.output[0])
print("token: ", args.token[0])
print("since: ", args.since)
print("until: ", args.until)
FBScraper(args.page[0], args.output[0], args.token[0], args.since, args.until, args.folder).run()


if __name__ == "__main__":
main()