-
Notifications
You must be signed in to change notification settings - Fork 3
/
crawlar.js
104 lines (81 loc) · 2.24 KB
/
crawlar.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
var https = require('https')
,request = require('request')
,cheerio = require('cheerio')
,path = require('path')
,fs = require('fs')
,$ = require('jquery')
,url = 'https://www.zhihu.com/question/35846840';
/**
 * Collect the image URLs found in the answer bodies of a page and start a
 * download for each of them.
 *
 * Images are selected with `.zm-item-rich-text img`; each `src` is normalised
 * by `addHttpsForArray`, turned into a local file name by
 * `parseUrlForFileName` and handed to `downloadImg`.
 *
 * NOTE(review): only images present in the HTML passed in are processed.
 * Presumably more answers sit behind the `.zu-button-more` button; nothing
 * here loads them.
 *
 * @param {string} html - Raw HTML of the page to scan.
 * @returns {undefined} Downloads are started asynchronously; nothing is returned.
 */
function filterHtml(html){
  var _ = cheerio.load(html);
  var imgData = [];

  _('.zm-item-rich-text img').each(function(){
    var imgSrc = _(this).attr('src');
    // attr() yields undefined for an <img> without a src attribute; skipping
    // such entries keeps the string handling further down from throwing.
    if (typeof imgSrc === 'string' && imgSrc !== '') {
      imgData.push(imgSrc);
    }
  });

  // forEach rather than map: the loop is run only for its side effects.
  addHttpsForArray(imgData).forEach(function(item){
    var filename = parseUrlForFileName(item);
    downloadImg(item, filename, function(err){
      if (err) {
        console.log(err);
        // Do not report success for a download that failed.
        return;
      }
      console.log(filename + ' done');
    });
  });
}
// Get each answer's author name and use it as the folder name.
// TODO: not implemented yet - the body is empty and no caller is visible in this file.
function createFileName(){
}
/**
 * Derive the local file name to use when downloading a URL.
 *
 * The query string and fragment are removed before taking the last path
 * segment, so `.../a.jpg?size=big#top` yields `a.jpg` instead of a name
 * containing `?` or `#`.
 *
 * @param {string} address - URL (or path) of the file to download.
 * @returns {string} Last path segment of `address`, without query or fragment.
 */
function parseUrlForFileName(address) {
  // Everything from the first "?" or "#" onwards is not part of the path.
  var pathPart = address.split(/[?#]/)[0];
  return path.basename(pathPart);
}
/**
 * Make a scheme-less URL absolute by prefixing it with `https:`.
 *
 * A value that already starts with a URI scheme (`https:`, `http:`,
 * `data:`, ...) is returned unchanged. Non-string values are returned as-is.
 *
 * @param {string} string - URL to normalise, e.g. `//host/path/img.jpg`.
 * @returns {string} The URL with an `https:` prefix when it had no scheme.
 */
function addHttpsForString(string){
  if (typeof string !== 'string') {
    return string;
  }
  // Test for a leading "scheme:" instead of searching for "https" anywhere:
  // a substring search misfires on "http://..." and on paths containing "https".
  if (/^[a-z][a-z0-9+.\-]*:/i.test(string)) {
    return string;
  }
  // The colon is required: "https" + "//host" would give "https//host".
  return 'https:' + string;
}
/**
 * Prefix every scheme-less URL in an array with `https:`.
 *
 * The array is modified in place and the same array is returned. Entries
 * that already start with a URI scheme (`https:`, `http:`, `data:`, ...)
 * and entries that are not strings are left untouched.
 *
 * @param {Array} arr - URLs to normalise, e.g. `//host/path/img.jpg`.
 * @returns {Array} The same `arr` instance, after normalisation.
 */
function addHttpsForArray(arr){
  // Leading "scheme:" check; searching for "https" anywhere in the value
  // misfires on "http://..." and on paths that merely contain "https".
  var hasScheme = /^[a-z][a-z0-9+.\-]*:/i;
  for( var i=0; i<arr.length; i++ ){
    var entry = arr[i];
    // Skip non-strings (e.g. undefined from a missing src) instead of throwing.
    if( typeof entry === 'string' && !hasScheme.test(entry) ) {
      arr[i] = "https:" + entry;
    }
  }
  return arr;
}
/**
 * Download a file into the local `images/` folder.
 *
 * A HEAD request is issued first; when it succeeds the resource is fetched
 * and piped into `images/<filename>`. `callback` is invoked exactly once:
 * with no argument after the file stream has closed, or with the error when
 * the HEAD request, the download or the file write fails.
 *
 * NOTE(review): the `images/` directory is not created here - presumably it
 * must already exist; confirm.
 *
 * @param {string} url - Address of the file to download.
 * @param {string} filename - File name to create inside `images/`.
 * @param {function(Error=)} callback - Completion handler.
 * @returns {undefined}
 */
function downloadImg(url, filename, callback){
  var finished = false;

  // Several events (error followed by close) can fire for one download;
  // make sure the caller hears about the outcome only once.
  function finish(err){
    if (finished) {
      return;
    }
    finished = true;
    if (typeof callback === 'function') {
      callback(err);
    }
  }

  request.head(url, function(err){
    if (err) {
      console.log('err: '+ err);
      // Report the failure instead of leaving the caller waiting forever.
      finish(err);
      return;
    }

    var target = fs.createWriteStream('images/'+filename);
    // Without an 'error' listener a failed write would be thrown as an
    // unhandled 'error' event and take the process down.
    target.on('error', finish);
    target.on('close', function(){
      finish();
    });

    request(url)
      .on('error', function(downloadErr){
        finish(downloadErr);
        // pipe() does not end the destination on a source error; close the file ourselves.
        target.end();
      })
      .pipe(target);
  });
}
// Entry point: fetch the page at `url` and pass the complete HTML to filterHtml.
https.get(url,function(res){
  // Decode at the stream level. Appending raw Buffer chunks to a string
  // converts each chunk on its own, which corrupts any multi-byte UTF-8
  // character that happens to be split across two chunks.
  res.setEncoding('utf8');
  var html = '';
  res.on('data',function(chunk){
    html += chunk;
  });
  res.on('end',function(){
    filterHtml(html);
  });
}).on('error',function(err){
  // Log the cause as well; a bare "Error" gives no clue what went wrong.
  console.log('Error', err);
});