from urllib.request import Request, urlopen
import urllib.parse
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import json


# ~~ Define user-defined exceptions ~~ #
class Error(Exception):
    """Base class for other exceptions"""
    pass


class PatentClassError(Error):
    """Raised when the patent input is not a string"""
    pass


class NoPatentsError(Error):
    """Raised when there are no patents to scrape"""
    pass


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
# Create scraper class
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
class scraper_class:
    """
    Google Patents scraper class used to scrape data from 'https://patents.google.com/'

    There are two primary ways to use the class:

        (1) Add a list of patents to the class and scrape them all at once

            scraper = scraper_class()   # <- Initialize class

            # ~ Add patents to list ~ #
            scraper.add_patents('US2668287A')
            scraper.add_patents('US266827A')

            # ~ Scrape all patents ~ #
            scraper.scrape_all_patents()

            # ~ Get results of scrape ~ #
            patent_1_parsed = scraper.parsed_patents['US2668287A']
            patent_2_parsed = scraper.parsed_patents['US266827A']

        (2) Scrape each patent individually

            scraper = scraper_class()   # <- Initialize class

            # ~~ Scrape patents individually ~~ #
            patent_1 = 'US2668287A'
            patent_2 = 'US266827A'
            err_1, soup_1, url_1 = scraper.request_single_patent(patent_1)
            err_2, soup_2, url_2 = scraper.request_single_patent(patent_2)

            # ~ Parse results of scrape ~ #
            patent_1_parsed = scraper.get_scraped_data(soup_1, patent_1, url_1)
            patent_2_parsed = scraper.get_scraped_data(soup_2, patent_2, url_2)

    Attributes:
        - list_of_patents (list) : patents to be scraped
        - scrape_status   (dict) : status of request using patent
        - parsed_patents  (dict) : result of parsing patent html
    """
    def __init__(self):
        self.list_of_patents = []
        self.scrape_status = {}
        self.parsed_patents = {}

    def add_patents(self, patent):
        """Append patent to patent list attribute self.list_of_patents

        Inputs:
            - patent (str) : patent number
        """
        # ~ Check if patent is a string ~ #
        if not isinstance(patent, str):
            raise PatentClassError("'patent' variable must be a string")
        # ~ Append patent to list to be scraped ~ #
        else:
            self.list_of_patents.append(patent)

    def delete_patents(self, patent):
        """Remove patent from patent list attribute self.list_of_patents

        Inputs:
            - patent (str) : patent number
        """
        # ~ Check if patent is in list ~ #
        if patent in self.list_of_patents:
            self.list_of_patents.remove(patent)
        else:
            print('Patent {0} not in patent list'.format(patent))

    def add_scrape_status(self, patent, success_value):
        """Add status of scrape to dictionary self.scrape_status"""
        self.scrape_status[patent] = success_value

    def request_single_patent(self, patent, url=False):
        """Retrieves Google patent data and parses the returned html using BeautifulSoup

        Returns:
            - Status of scrape    <- String
            - Html of patent      <- BS4 object
            - Url of patent page  <- String

        Inputs:
            - patent (str) : if url == False then patent is a patent number,
                             elif url == True then patent is a Google patent url
            - url   (bool) : determines whether patent is treated as a patent number
                             or a Google patent url
        """
        try:
            if not url:
                url = 'https://patents.google.com/patent/{0}/en?'.format(patent)
            else:
                url = patent
            req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
            webpage = urlopen(req).read()
            soup = BeautifulSoup(webpage, features="lxml")
            return ('Success', soup, url)
        except HTTPError as e:
            print('Patent: {0}, Error Status Code : {1}'.format(patent, e.code))
            return (e.code, '', url)

    def scrap_single_patent(self, patent, url=False):
        """Scrape and parse a single patent without adding it to self.list_of_patents

        Inputs:
            - patent (str) : patent number, or a Google patent url if url == True
            - url   (bool) : whether patent is a url rather than a patent number
        """
        error_status, soup, url_used = self.request_single_patent(patent, url)
        if error_status == 'Success':
            parsed_patent = self.get_scraped_data(soup, patent, url_used)
        else:
            parsed_patent = None
            print("Scrape failed: check the patent number, or pass a Google patent "
                  "link in place of the patent number together with url=True")
        return parsed_patent

    def parse_citation(self, single_citation):
        """Parses a patent citation, returning results as a dictionary

        Returns (variables returned in dictionary, following are key names):
            - patent_number (str) : patent number
            - priority_date (str) : priority date of patent
            - pub_date      (str) : publication date of patent

        Inputs:
            - single_citation (bs4 element) : citation row from the citation section of google patent html
        """
        # ~ Get patent number ~ #
        try:
            patent_number = single_citation.find('span', itemprop='publicationNumber').get_text()
        except:
            patent_number = ''
        # ~ Get priority date ~ #
        try:
            priority_date = single_citation.find('td', itemprop='priorityDate').get_text()
        except:
            priority_date = ''
        # ~ Get publication date ~ #
        try:
            pub_date = single_citation.find('td', itemprop='publicationDate').get_text()
        except:
            pub_date = ''
        return ({'patent_number': patent_number,
                 'priority_date': priority_date,
                 'pub_date': pub_date})
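
    # For reference, parse_citation expects citation rows shaped roughly like the
    # sketch below. This is inferred from the itemprop selectors used above, not
    # copied from actual Google Patents markup, so treat it as an assumption:
    #
    #   <tr itemprop="backwardReferences">
    #       <td><span itemprop="publicationNumber">US1234567A</span></td>
    #       <td itemprop="priorityDate">1950-01-01</td>
    #       <td itemprop="publicationDate">1954-02-02</td>
    #   </tr>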

    def clean_scrapped(self, string, patent):
        """Strip the patent number, 'Google Patents' boilerplate, and newlines from scraped text"""
        string = string.replace('{}'.format(patent), '')
        string = string.replace('Google Patents', '')
        string = string.replace(';\n', ' ')
        string = string.replace('\n', ' ')
        return string

    def process_patent_html(self, soup, patent):
        """Parse patent html using the BeautifulSoup module

        Returns (variables returned in dictionary, following are key names):
            - title          (json) : patent title
            - abstract       (json) : patent abstract
            - claims         (json) : patent claims text
            - classification (json) : classification descriptions

        (Parsing of inventors, assignees, dates, and citations is kept in the
        disabled block below and is not currently returned.)

        Inputs:
            - soup (bs4 object) : parsed google patent html
            - patent     (str)  : patent number
        """
        # Abstract #
        try:
            abstract = ""
            for x in soup.select("[class~=abstract]"):
                abstract += " " + x.get_text()
            abstract = self.clean_scrapped(abstract, patent)
        except Exception as e:
            print(e)
            abstract = ""
        # Title #
        try:
            title = self.clean_scrapped(soup.find('title').get_text(), patent)
        except Exception as e:
            print(e)
            title = ""
        # Claims #
        try:
            claims = ""
            for x in soup.select(".claim-text", limit=120):
                claims += " " + x.get_text()
            claims = self.clean_scrapped(claims, patent)
        except Exception as e:
            print(e)
            claims = ""
        # Classification #
        try:
            classification = ""
            for x in soup.find_all('span', itemprop='Description'):
                classification += " " + x.get_text()
            classification = self.clean_scrapped(classification, patent)
        except Exception as e:
            print(e)
            classification = ""
"""
try:
inventor_name = [{'inventor_name':x.get_text()} for x in soup.find_all('dd',itemprop='inventor')]
except:
inventor_name = []
# Assignee #
try:
assignee_name_orig = [{'assignee_name':x.get_text()} for x in soup.find_all('dd',itemprop='assigneeOriginal')]
except:
assignee_name_orig = []
try:
assignee_name_current = [{'assignee_name':x.get_text()} for x in soup.find_all('dd',itemprop='assigneeCurrent')]
except:
assignee_name_current = []
# Publication Date #
try:
pub_date = soup.find('dd',itemprop='publicationDate').get_text()
except:
pub_date = ''
# Application Number #
try:
application_number = soup.find('dd',itemprop="applicationNumber").get_text()
except:
application_number = ''
# Filing Date #
try:
filing_date = soup.find('dd',itemprop='filingDate').get_text()
except:
filing_date = ''
# Loop through all events #
list_of_application_events = soup.find_all('dd',itemprop='events')
priority_date = ''
grant_date = ''
for app_event in list_of_application_events:
# Get information #
try:
title_info = app_event.find('span',itemprop='type').get_text()
timeevent = app_event.find('time',itemprop='date').get_text()
if title_info == 'priority':
priority_date = timeevent
if title_info == 'granted':
grant_date = timeevent
if title_info == 'publication' and pub_date=='':
pub_date = timeevent
except:
continue
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
# Citations
#
# All citations are of the same format
# -Find all citations
# -If there are any citations, parse each individually using "parse_citation"
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
# ~~~ Forward Citations (No Family to Family) ~~~ #
found_forward_cites_orig=soup.find_all('tr', itemprop="forwardReferencesOrig")
forward_cites_no_family=[]
if len(found_forward_cites_orig)>0:
for citation in found_forward_cites_orig:
forward_cites_no_family.append(self.parse_citation(citation))
# ~~~ Forward Citations (Yes Family to Family) ~~~ #
found_forward_cites_family=soup.find_all('tr', itemprop="forwardReferencesFamily")
forward_cites_yes_family=[]
if len(found_forward_cites_family)>0:
for citation in found_forward_cites_family:
forward_cites_yes_family.append(self.parse_citation(citation))
# ~~~ Backward Citations (No Family to Family) ~~~ #
found_backward_cites_orig = soup.find_all('tr', itemprop='backwardReferences')
backward_cites_no_family=[]
if len(found_backward_cites_orig)>0:
for citation in found_backward_cites_orig:
backward_cites_no_family.append(self.parse_citation(citation))
# ~~~ Backward Citations (Yes Family to Family) ~~~ #
found_backward_cites_family = soup.find_all('tr', itemprop='backwardReferencesFamily')
backward_cites_yes_family=[]
if len(found_backward_cites_family)>0:
for citation in found_backward_cites_family:
backward_cites_yes_family.append(self.parse_citation(citation))
"""
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
        # Return data as a dictionary
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
        return ({'title': json.dumps(title),
                 'abstract': json.dumps(abstract),
                 'claims': json.dumps(claims),
                 'classification': json.dumps(classification)})
                 # 'inventor_name': json.dumps(inventor_name),
                 # 'assignee_name_orig': json.dumps(assignee_name_orig),
                 # 'assignee_name_current': json.dumps(assignee_name_current),
                 # 'pub_date': pub_date,
                 # 'priority_date': priority_date,
                 # 'grant_date': grant_date,
                 # 'filing_date': filing_date,
                 # 'forward_cite_no_family': json.dumps(forward_cites_no_family),
                 # 'forward_cite_yes_family': json.dumps(forward_cites_yes_family),
                 # 'backward_cite_no_family': json.dumps(backward_cites_no_family),
                 # 'backward_cite_yes_family': json.dumps(backward_cites_yes_family)})

    def get_scraped_data(self, soup, patent, url):
        """Parse a single patent's html and attach the patent number and url to the result"""
        # ~~ Parse individual patent ~~ #
        parsing_individ_patent = self.process_patent_html(soup, patent)
        # ~~ Add url + patent to dictionary ~~ #
        parsing_individ_patent['url'] = url
        parsing_individ_patent['patent'] = patent
        # ~~ Return patent info ~~ #
        return (parsing_individ_patent)

    def scrape_all_patents(self):
        """Scrapes all patents in list self.list_of_patents using "request_single_patent".

        If you want to scrape a single patent without adding it to the class variable,
        use the "request_single_patent" method directly. See the docstring at the top
        of the class for an example.
        """
        # ~ Check if there are any patents ~ #
        if len(self.list_of_patents) == 0:
            raise NoPatentsError("no patents to scrape: add a patent number using scraper.add_patents('<patent_number>')")
        # ~ Loop through list of patents and scrape them ~ #
        else:
            for patent in self.list_of_patents:
                error_status, soup, url = self.request_single_patent(patent)
                # ~ Add scrape status variable ~ #
                self.add_scrape_status(patent, error_status)
                if error_status == 'Success':
                    self.parsed_patents[patent] = self.get_scraped_data(soup, patent, url)
                else:
                    self.parsed_patents[patent] = {}
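

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
# Minimal usage sketch. Assumes network access to patents.google.com; the
# patent number below is the example from the class docstring, not a
# required input.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
if __name__ == '__main__':
    scraper = scraper_class()

    # (1) Batch mode: queue patents, scrape them all, read the results
    scraper.add_patents('US2668287A')
    scraper.scrape_all_patents()
    print(scraper.scrape_status['US2668287A'])          # 'Success' or an HTTP error code
    print(scraper.parsed_patents['US2668287A'].keys())  # title/abstract/claims/classification/url/patent

    # (2) One-off mode: scrape a single patent without touching the queue
    parsed = scraper.scrap_single_patent('US2668287A')
    if parsed is not None:
        print(parsed['title'])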