This repository has been archived by the owner on Mar 28, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparser.py
167 lines (141 loc) · 5.22 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
from gunpla import Gunpla
from HTMLParser import HTMLParser
import datetime
import requests
__author__ = 'porthunt'
'''
Parses the website information.
'''
class LinksParser(HTMLParser):
    '''
    Collects the text of selected product-page elements and turns it
    into gunpla records.

    The HTMLParser callbacks record every text node found inside a
    <div> or <tr> element carrying one of the attributes listed in
    _TARGETS; parse_gunpla() converts that text into a dict, and
    run_one() / run_list() fetch pages and store the results through
    the project's Gunpla class.
    '''

    # Value returned by parse_gunpla() for products that are not gunpla.
    # Kept as a plain string so existing callers comparing against the
    # literal keep working.
    NOT_GUNPLA = 'Not a gunpla model.'

    # (attribute name, attribute value) pairs marking elements to record.
    # The values are copied verbatim from the original code; the odd
    # spellings presumably mirror the ids used by the scraped site.
    _TARGETS = frozenset([
        ('class', 'TextBreak'),            # Name
        ('id', 'masterBody_trMaker'),      # Manufacturer
        ('id', 'masterBody_trScale'),      # Scale
        ('id', 'masterBody_trSerieshin'),  # Series
        ('id', 'masterBody_trSalseDate'),  # Release Date
    ])

    # Tags whose nesting depth is tracked while recording.
    _TRACKED_TAGS = ('div', 'tr')

    # Grade markers looked up in the product name as '(<grade>'.
    _GRADES = ('FG', 'SD', 'HG', 'MG', 'RG', 'PG', 'RE/100')

    # Stored info at least this old (in seconds) gets refreshed.
    # NOTE(review): 86400 seconds is one day, while the user-facing
    # messages in run_one() say "one month" -- confirm which one is
    # intended. Value and messages are left as they were.
    _MAX_AGE_SECONDS = 86400

    # Upper bound on fetch attempts per URL in run_one().
    _MAX_ATTEMPTS = 3

    def __init__(self):
        '''Start with recording switched off and no collected text.'''
        HTMLParser.__init__(self)
        self.recording = 0  # nesting depth inside a recorded element
        self.data = []      # text nodes collected so far

    def handle_starttag(self, tag, attributes):
        '''
        Start recording at a target <div>/<tr>, or go one level deeper
        when a tracked tag opens while already recording.
        '''
        if tag not in self._TRACKED_TAGS:
            return
        if self.recording:
            self.recording += 1
            return
        # Any matching attribute qualifies, whatever its position.
        if any((name, value) in self._TARGETS
               for name, value in attributes):
            self.recording = 1

    def handle_endtag(self, tag):
        '''Leave one nesting level when a tracked tag closes.'''
        if tag in self._TRACKED_TAGS and self.recording:
            self.recording -= 1

    def handle_data(self, data):
        '''Keep the text node if it is inside a recorded element.'''
        if self.recording:
            self.data.append(data)

    def parse_gunpla(self, parser_data):
        '''
        Parses the item information. Verifies if the
        item is a gunpla and formats the item info.

        parser_data is the list of raw text nodes collected by the
        parser. Returns NOT_GUNPLA ('Not a gunpla model.') when there
        is no usable text or the first entry lacks 'Gundam Model Kits';
        otherwise a dict with 'name' and 'grade' plus whichever of
        'manufacture', 'scale', 'series' and 'release date' are present.
        A label whose value is missing is skipped instead of raising
        IndexError.
        '''
        fields = []
        for raw in parser_data:
            text = raw.replace('\r\n\t\t\t\t', '').strip()
            # Drop blanks and the separators that arrive as own nodes.
            if text not in ('', ':', ','):
                fields.append(text)

        if not fields or 'Gundam Model Kits' not in fields[0]:
            return self.NOT_GUNPLA

        preorder = 'Pre-order' in fields[-1]
        gunpla = {'name': fields[0]}
        total = len(fields)

        for idx, label in enumerate(fields):
            # The one or two entries after a label hold its value.
            first = fields[idx + 1] if idx + 1 < total else None
            second = fields[idx + 2] if idx + 2 < total else None

            if label == 'Manufacturer':
                if first is not None:
                    gunpla['manufacture'] = first
            elif label == 'Scale':
                # If the entry two places on is already the 'Original'
                # label (or is missing), the scale directly follows the
                # label; otherwise it is the second entry.
                if second is None or second == 'Original':
                    if first is not None:
                        gunpla['scale'] = first
                else:
                    gunpla['scale'] = second
            elif label == 'Original':
                if first is not None:
                    gunpla['series'] = first
            elif label == 'Release Date':
                if preorder:
                    gunpla['release date'] = 'Pre-Order'
                elif first is not None:
                    gunpla['release date'] = first

        gunpla['grade'] = ''
        for grade in self._GRADES:
            # As before, the last matching grade in _GRADES wins.
            if '(' + grade in gunpla['name']:
                gunpla['grade'] = grade
        return gunpla

    def run_one(self, url):
        '''
        Returns the item of the given URL.

        Fetches the page, parses it and inserts the item through Gunpla
        when find() reports nothing stored, or updates it when the
        stored entry is at least _MAX_AGE_SECONDS old. Returns the
        Gunpla object, or None when the page cannot be fetched, has no
        recorded data, or is not a gunpla model.
        '''
        # Strip first: lines read from a file end with a newline that
        # would otherwise end up inside the item id.
        url = url.strip()
        gunpla_id = url.split('/')[-1]

        page = None
        # Bounded retry; a non-200 answer used to be retried forever.
        for _ in range(self._MAX_ATTEMPTS):
            try:
                page = requests.get(url)
            except requests.ConnectionError:
                print('Connection aborted.')
                return None
            if page.status_code == 200:
                break
        if page is None or page.status_code != 200:
            print('Could not fetch {}.'.format(gunpla_id))
            return None

        # A fresh parser per page so text from earlier pages cannot mix
        # into this item.
        parser = LinksParser()
        parser.feed(page.text)
        if not parser.data:  # check if it parsed nothing
            print('Item {} doesn\'t exist.'.format(gunpla_id))
            return None

        product = parser.parse_gunpla(parser.data)
        if product == self.NOT_GUNPLA:
            print('{} is not a gunpla model.'.format(gunpla_id))
            return None

        gunpla = Gunpla(gunpla_id, product)
        # Presumably find() returns the stored entry, or None when the
        # item is not on the database yet -- verify against Gunpla.
        stored = gunpla.find()
        if stored is None:
            gunpla.insert()
            return gunpla

        age = datetime.datetime.now() - stored.added
        if age.total_seconds() >= self._MAX_AGE_SECONDS:
            try:
                gunpla.update()
                print('Gunpla updated because the info '
                      'was more than one month old.\n')
            except Exception:
                # Best effort: report and still return the parsed item.
                print('Error updating gunpla info.')
        else:
            print('Info is recent. Update was not made.\n')
        return gunpla

    def run_list(self, file_path, extension):
        '''
        Runs a list of URLs inside a file.

        Reads one URL per line from file_path + extension and writes
        each item's summary to file_path + '_results' + extension.
        Blank lines and URLs for which run_one() returns None are
        skipped.
        '''
        source_path = file_path + extension
        results_path = file_path + '_results' + extension
        with open(source_path, 'r') as url_list:
            with open(results_path, 'w') as url_list_results:
                for line in url_list:
                    if not line.strip():
                        continue
                    gunpla = self.run_one(line)
                    if gunpla is None:
                        continue
                    summary = gunpla.summary()
                    print(summary)
                    url_list_results.write(summary)
                    print('==================')
                    url_list_results.write('\n==================\n')