-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcormenHTML2PDF.py
51 lines (31 loc) · 1.16 KB
/
cormenHTML2PDF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import pdfkit
import urllib.request as urlreq
import re
from bs4 import BeautifulSoup
from PyPDF2 import PdfFileReader, PdfFileWriter
# Script: download every page linked from the online table of contents,
# render each page to its own PDF with pdfkit, then concatenate those PDFs
# (in link order) into a single output file.
filename = 'Introduction to Algorithms Cormen 3rd Edition.pdf'
url = "http://staff.ustc.edu.cn/~csli/graduate/algorithms/book6/toc.htm"
baseurl = "http://staff.ustc.edu.cn/~csli/graduate/algorithms/book6/"

# Fetch the table of contents; the context manager closes the HTTP response
# even if reading or decoding fails.
with urlreq.urlopen(url) as website:
    html = website.read().decode('utf-8')
soup = BeautifulSoup(html, features='lxml')

chaps2write = []   # href of every chapter page, in TOC order
chap_pdfs = []     # matching per-chapter PDF file names
for link in soup.findAll('a'):
    href = link.get('href')
    if not href:
        # Anchors without an href (e.g. named anchors) would otherwise be
        # turned into the literal string 'None' and requested as a page.
        continue
    chapname = str(href)
    # Drops the last four characters of the href before adding '.pdf'
    # (presumably a '.htm' suffix, as in the TOC URL -- confirm for all links).
    pdfname = chapname[:-4] + '.pdf'
    chaps2write.append(chapname)
    chap_pdfs.append(pdfname)
    pdfkit.from_url(baseurl + chapname, pdfname)

# Merge approach from:
# https://stackoverflow.com/questions/3444645/merge-pdf-files
inpstream = []
try:
    for pdfname in chap_pdfs:
        inpstream.append(open(pdfname, 'rb'))

    writer = PdfFileWriter()
    for reader in map(PdfFileReader, inpstream):
        for n in range(reader.getNumPages()):
            writer.addPage(reader.getPage(n))

    # The output file is opened only now, after every chapter was rendered
    # and read, so a failure earlier on cannot truncate an existing book.
    # The input streams must stay open until the writer has finished.
    with open(filename, 'wb') as outstream:
        writer.write(outstream)
finally:
    # Close every chapter file that was opened, on success or failure.
    for inps in inpstream:
        inps.close()

print('\n\n' + "Done.....We have the new book\n\n")