-
Notifications
You must be signed in to change notification settings - Fork 0
/
ClimbFromLXFtoPDF.py
109 lines (99 loc) · 3.08 KB
/
ClimbFromLXFtoPDF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/env python3
# -*- coding:UTF-8 -*-
# author DZG 2017-02-15
import requests
from bs4 import BeautifulSoup as bs
import pdfkit
import time
import os
# Root page of the tutorial to export. The commented-out line is another wiki
# URL on the same host, kept as an alternative target.
#total_url = "http://www.liaoxuefeng.com/wiki/001434446689867b27157e896e74d51a89c25cc8b43bdb3000"
total_url = 'http://www.liaoxuefeng.com/wiki/0013739516305929606dd18361248578c67b8067c8c017b000'

# strftime pattern for the start/end timestamps printed by the script.
ISOTIMEFORMAT = '%Y-%m-%d %X'

# Name of the combined PDF handed to save_pdf().
# NOTE(review): the host name is "liaoxuefeng"; the characters in this file
# name may be a typo -- confirm before renaming the output file.
file_name_all = "廖雪松Git教程.pdf"

# Minimal HTML page wrapped around each extracted article body; the
# {content} placeholder is filled through str.format(). The closing tags are
# emitted in nesting order (</body> before </html>) so the page is well formed.
html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
</head>
<body>
{content}
</body>
</html>
"""
def parse_url_to_html(url, name):
    """Download one tutorial page and save its article as a standalone HTML file.

    The article body (element with class ``x-wiki-content``) is extracted,
    site-relative image sources are rewritten to absolute URLs, the page
    title is prepended as a centered ``<h1>``, and the result is wrapped in
    ``html_template`` and written to ``name`` as UTF-8.

    Args:
        url: Address of the page to fetch.
        name: Path of the HTML file to write.

    Returns:
        ``name``, unchanged, so callers can collect the written file paths.

    Raises:
        requests.RequestException: The request failed, timed out, or the
            server answered with an HTTP error status.
        IndexError: The page lacks the expected article body or title.
    """
    # A timeout keeps one stalled connection from hanging the whole run, and
    # raise_for_status() stops an HTTP error page from being parsed as an
    # article.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = bs(response.content, 'lxml')

    # Images referenced with a site-relative path would not resolve from a
    # local HTML file, so prefix them with the site origin.
    for img in soup.find_all("img"):
        src = img.get("src")
        if src and src.startswith('/'):
            img.attrs["src"] = "http://www.liaoxuefeng.com" + src

    # Article body and title; fail with a descriptive message instead of a
    # bare index error when the page layout is not the expected one.
    body = soup.find(class_="x-wiki-content")
    content_area = soup.find(class_="x-content")
    heading = content_area.find("h4") if content_area is not None else None
    if body is None or heading is None:
        raise IndexError(
            "expected article body/title elements not found in " + str(url)
        )
    title = heading.text

    # Put the title at the very front of the body, centered.
    center_tag = soup.new_tag("center")
    title_tag = soup.new_tag("h1")
    title_tag.string = title
    center_tag.append(title_tag)
    body.insert(0, center_tag)

    html = html_template.format(content=str(body))
    with open(name, "wb") as f:
        f.write(html.encode("utf-8"))
    return name
def get_url_list(url):
    """Collect the absolute URLs of the pages linked from the tutorial menu.

    Args:
        url: Address of the tutorial's root page.

    Returns:
        A list of absolute URLs, one per menu entry whose link is a
        site-relative path, in menu order. Entries without a link, or whose
        link is not site-relative, are skipped rather than recorded as empty
        strings, because an empty URL cannot be fetched later.

    Raises:
        requests.RequestException: The request failed, timed out, or the
            server answered with an HTTP error status.
        IndexError: Fewer than two ``uk-nav uk-nav-side`` elements exist.
    """
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = bs(response.content, 'lxml')

    # Index 1 selects the second matching element -- presumably the first
    # one is a different menu; confirm against the live page.
    menu = soup.find_all(class_="uk-nav uk-nav-side")[1]

    urls = []
    for li in menu.find_all("li"):
        anchor = li.a
        href = anchor.get("href") if anchor is not None else None
        if href and href.startswith('/'):
            urls.append("http://www.liaoxuefeng.com" + href)
    return urls
def save_pdf(htmls, file_name):
    """Convert the given HTML file(s) into a PDF with pdfkit.

    Args:
        htmls: Input handed unchanged to ``pdfkit.from_file`` (``main1``
            supplies a list of HTML file paths).
        file_name: Output path handed to ``pdfkit.from_file``.
    """
    # The same margin is applied on all four sides of the page.
    page_margin = '0.75in'

    # Extra header and cookie entries forwarded through the options.
    # NOTE(review): the cookie names/values look like placeholders --
    # confirm whether they are needed.
    extra_headers = [
        ('Accept-Encoding', 'gzip')
    ]
    cookies = [
        ('cookie-name1', 'cookie-value1'),
        ('cookie-name2', 'cookie-value2'),
    ]

    options = {
        'page-size': 'Letter',
        'margin-top': page_margin,
        'margin-right': page_margin,
        'margin-bottom': page_margin,
        'margin-left': page_margin,
        'encoding': "UTF-8",
        'custom-header': extra_headers,
        'cookie': cookies,
        'outline-depth': 10,
    }
    pdfkit.from_file(htmls, file_name, options=options)
def main1():
    """Fetch every menu page, save each as HTML, and build the combined PDF.

    Each page is written to ``<index>.html``, where the index is the page's
    position in the list returned by ``get_url_list``.

    Returns:
        The list of HTML file names that were written, in page order.
    """
    chapter_urls = get_url_list(total_url)
    html_files = []
    for position, chapter_url in enumerate(chapter_urls):
        target = str(position) + ".html"
        html_files.append(parse_url_to_html(chapter_url, target))
    save_pdf(html_files, file_name_all)
    return html_files
def main2(htmls):
    """Delete the intermediate HTML files.

    Args:
        htmls: Iterable of file paths to remove. Removal stops at the first
            path that cannot be deleted, and that error propagates.
    """
    for leftover in htmls:
        # os.unlink is the same operation as os.remove.
        os.unlink(leftover)
if __name__ == '__main__':
    # Script entry point: print a start timestamp, build the PDF, delete the
    # intermediate HTML files, then print an end timestamp and the elapsed
    # wall-clock time in seconds.
    print('开始:', time.strftime(ISOTIMEFORMAT, time.localtime()))
    start = time.time()
    hs = main1()
    main2(hs)
    total_time = time.time() - start
    print('结束:', time.strftime(ISOTIMEFORMAT, time.localtime()))
    # total_time is a float; it must be converted before concatenation,
    # otherwise the final print raises TypeError.
    print("总耗时:" + str(total_time) + "秒")