# scraper.py

import logging
import re
import sqlite3
import pendulum
import requests
from bs4 import BeautifulSoup

import db

# Strips anything that is not a digit (used to turn a day like '1st,' into '1').
non_numbers = re.compile('[^0-9]')
# Strips double quotes from scraped href attributes.
no_quotes = re.compile('"')

# Regexes matching the start of each English month name, keyed by month number.
months = {}
months[1] = re.compile('^jan', re.IGNORECASE)
months[2] = re.compile('^feb', re.IGNORECASE)
months[3] = re.compile('^mar', re.IGNORECASE)
months[4] = re.compile('^apr', re.IGNORECASE)
months[5] = re.compile('^may', re.IGNORECASE)
months[6] = re.compile('^jun', re.IGNORECASE)
months[7] = re.compile('^jul', re.IGNORECASE)
months[8] = re.compile('^aug', re.IGNORECASE)
months[9] = re.compile('^sep', re.IGNORECASE)
months[10] = re.compile('^oct', re.IGNORECASE)
months[11] = re.compile('^nov', re.IGNORECASE)
months[12] = re.compile('^dec', re.IGNORECASE)

# Logging setup, adapted from https://docs.python.org/3/howto/logging-cookbook.html
logger = logging.getLogger('tos_scraper')
logger.setLevel(logging.DEBUG)

# Everything goes to the log file; only warnings and above reach the console.
fh = logging.FileHandler('scraper.log')
fh.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
ch.setLevel(logging.WARNING)

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fh.setFormatter(formatter)
ch.setFormatter(formatter)
logger.addHandler(fh)
logger.addHandler(ch)
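
# With this formatter, a console warning would look roughly like:
# 2020-09-10 12:00:00,000 - tos_scraper - WARNING - Caught exception ...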


def convert_article_date(article_date: list):
    """Given a date as a list like:

        ['January', '1st,', '2000']

    ...extract the year, month, and day.

    Args:
        article_date (list): the date split into [month, day, year] strings

    Returns:
        tuple: (year, month, day)
    """
    for m, regex in months.items():
        if regex.match(article_date[0]):
            month = m
    day = int(non_numbers.sub('', article_date[1]))
    year = int(article_date[2])
    return (year, month, day)
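

# Worked example (illustrative): convert_article_date(['January', '1st,', '2000'])
# returns (2000, 1, 1); the month regex matches 'January', and non_numbers
# strips the ordinal suffix and trailing comma from '1st,'.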


def get_article_url(article):
    """Gets the url of a given `article`.

    Args:
        article (bs4.element.Tag): the article in the page

    Returns:
        str: the url
    """
    url = no_quotes.sub('', article.a['href'])
    return f'https://treeofsavior.com{url}'
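

# For a hypothetical scraped anchor whose href value is "/news/view/123"
# (quotes included), get_article_url would strip the quotes and return
# 'https://treeofsavior.com/news/view/123'. The path is only illustrative.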


def get_news():
    """Scrapes the news listing page and returns a list of article dicts
    with 'url', 'title', and 'date' keys."""
    # For offline testing against a saved copy of the page:
    # with open('example.html', 'r') as example:
    #     soup = BeautifulSoup(example, 'html.parser')
    page = requests.get('https://treeofsavior.com/page/news/')
    soup = BeautifulSoup(page.text, 'html.parser')
    news = soup.find_all('div', 'news_box')
    all_news = []
    for news_article in news:
        article = {}
        inner = news_article.find('div', 'box_inner')
        article_date = inner.find('div', 'date')
        try:
            article_date = article_date.string.lstrip().split()
            article['url'] = get_article_url(news_article)
            article['title'] = news_article.h3.string
            a_date = convert_article_date(article_date)
            today = pendulum.today()
            if db.check_if_entry_exists(article['url']):
                # Already seen: reuse the publish time recorded earlier.
                article['date'] = db.get_entry_time(article['url'])
            else:
                if today == pendulum.datetime(*a_date, tz=today.tz):
                    # Published today: stamp it with the current UTC hour.
                    article['date'] = pendulum.datetime(
                        *a_date,
                        pendulum.now(tz='UTC').hour,
                        tz='UTC'
                    )
                else:
                    # If the article date doesn't match up with today's date,
                    # do not modify the publish time. This is the fallback for
                    # migrating setups, or for resuming after a long pause.
                    article['date'] = pendulum.datetime(
                        *a_date,
                        0,
                        tz='UTC'
                    )
                db.add_entry(
                    article['url'],
                    article['date']
                )
            all_news.append(article)
        except AttributeError as e:
            logger.warning(
                f'Caught exception {e} from {article}, inner {inner}'
            )
    db.purge_old()
    return all_news
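

# A minimal usage sketch, assuming the companion `db` module (which provides
# check_if_entry_exists, get_entry_time, add_entry and purge_old, as used
# above) is already set up. Running it fetches the live news page, so treat
# it as illustrative rather than a test.
if __name__ == '__main__':
    for entry in get_news():
        print(entry['date'], entry['title'], entry['url'])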