Skip to content

Commit

Permalink
增量爬
Browse files Browse the repository at this point in the history
  • Loading branch information
alsotang committed Nov 5, 2014
1 parent db3c835 commit f5fc5f8
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 43 deletions.
90 changes: 47 additions & 43 deletions crawler.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,64 +4,68 @@ var _ = require('lodash');
var cheerio = require('cheerio');
var model = require('./model');
var Post = model.Post;
var eventproxy = require('eventproxy');

// Work queue created with async.queue; the trailing argument 3 is the queue's
// concurrency setting. Each task is a postInfo object, and the worker signals
// completion or failure through `callback`.
// NOTE(review): this span appears to be a rendered diff with the +/- markers and
// indentation stripped, so statements from the previous and the current
// implementation are interleaved and the span does not parse as one function as
// shown — confirm against the real crawler.js before relying on these notes.
var q = async.queue(function (task, callback) {
var postInfo = task;
// Request postInfo.author_url with a desktop-Chrome User-Agent header.
superagent.get(postInfo.author_url)
.set('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36')
.end(function (err, res) {
if (err) {
return callback(err);
}
// Parse the response body and read the text of the `.loc` element, removing
// the literal label prefix and surrounding whitespace.
var $ = cheerio.load(res.text);
var location = $('.loc').text().replace('常居: \n', '').trim();
// Look for an already-stored Post with the same url.
Post.findOne({url: postInfo.url}).exec(function (err, post) {
if (err) {
return callback(err);
}
// No stored post with this url: build one from postInfo plus the parsed
// location and save it.
if (!post) {
postInfo.author_location = location;
post = new Post({
url: postInfo.url,
title: postInfo.title,
imgs: postInfo.imgs,
author: postInfo.author,
author_url: postInfo.author_url,
author_location: location,
});
post.save(function (err) {
if (err) {
return callback(err);
}
console.log('got %s', postInfo.title);
callback(null);
});
} else {
callback(null);
// Per-task eventproxy instance; ep.fail(callback) hands any error reported
// through an ep.done(...) wrapper to the queue callback.
var ep = new eventproxy();
ep.fail(callback);

// Skip posts that have already been crawled.
Post.findOne({url: postInfo.url}, ep.done(function (post) {
if (post) {
return ep.emit('got_author');
}
ep.emit('fetch_author');
}));

// Runs once 'fetch_author' has been emitted, i.e. when no stored post was found.
ep.all('fetch_author', function () {
superagent.get(postInfo.author_url)
.set('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36')
.end(ep.done(function (res) {
// Any non-200 response: log the URL and finish the task without saving.
// NOTE(review): the logged code is the literal 403, not res.status.
if (res.status !== 200) {
console.error(403, postInfo.author_url);
ep.emit('got_author');
return;
}
});
});
// Parse the response body and read the text of the `.loc` element, removing
// the literal label prefix and surrounding whitespace.
var $ = cheerio.load(res.text);
var location = $('.loc').text().replace('常居: \n', '').trim();
var post = new Post({
url: postInfo.url,
title: postInfo.title,
imgs: postInfo.imgs,
author: postInfo.author,
author_url: postInfo.author_url,
author_location: location,
});
// Save the new post, log its title, then signal that the task is finished.
post.save(ep.done(function () {
console.log('got %s', postInfo.title);
ep.emit('got_author');
}));
}));
});

// 'got_author' marks the end of the task on every non-error path; invoke the
// queue callback with no error.
ep.all('got_author', function () {
callback();
});
}, 3);

// Requests the duoshuo listPosts.json endpoint for thread_key=haixiuzu (page=1,
// limit=100), decodes every entry of the response's `parentPosts`, and pushes
// each decoded object onto the queue `q`. Takes no arguments and returns nothing;
// errors are written to console.error.
// NOTE(review): this span appears to be a rendered diff with the +/- markers and
// indentation stripped — two `.end(` calls and two `q.push(` calls are shown, so
// lines from the previous and the current implementation are interleaved and the
// span does not parse as shown. Confirm against the real crawler.js.
function fetchHaixiuzu() {
// Errors reported through ep.done(...) wrappers are logged here.
var ep = new eventproxy();
ep.fail(function (err) {
console.error(err);
});
superagent.get('https://database.duoshuo.com/api/threads/listPosts.json?thread_key=haixiuzu&page=1&limit=100')
.set('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36')
.end(function (err, res) {
if (err) {
return console.error('fetchHaixiuzu', err);
}
.end(ep.done(function (res) {
// The response body is JSON; `parentPosts` is iterated by key with for...in.
var json = JSON.parse(res.text);
var parentPosts = json.parentPosts;
for (var postId in parentPosts) {
var postInfo = parentPosts[postId];
// Each entry's `message` field is base64 text that is decoded and parsed as JSON.
// NOTE(review): `new Buffer(...)` is deprecated in current Node.js releases.
postInfo = JSON.parse(new Buffer(postInfo.message, 'base64'));
q.push(postInfo, function (err) {
if (err) {
return console.error(err);
}
});
// The per-task callback does nothing on success; errors go to ep.fail above.
q.push(postInfo, ep.done(function () {}));
}
});
}));
}

exports.start = function () {
Expand Down
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
"dependencies": {
"async": "^0.9.0",
"cheerio": "^0.17.0",
"eventproxy": "^0.3.1",
"express": "^4.10.1",
"express-handlebars": "^1.1.0",
"lodash": "^2.4.1",
Expand Down

0 comments on commit f5fc5f8

Please sign in to comment.