-
Notifications
You must be signed in to change notification settings - Fork 31
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #5 from rockdai/v2
v2
- Loading branch information
Showing 10 changed files with 243 additions and 119 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,2 @@ | ||
node_modules | ||
.DS_Store |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,19 @@ | ||
var config = { | ||
mongodb_url: process.env.MONGOHQ_URL || 'mongodb://localhost/haixiu', | ||
port: process.env.PORT || 3000, | ||
douban_cookie: 'viewed="3590768_1016272"; ll="108296"; bid="hWcTdvDyOYI"; ct=y; _ga=GA1.2.1167184662.1412271513; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1415179087%2C%22https%3A%2F%2Fwww.google.com%2F%22%5D; __utmt=1; as="http://www.douban.com/"; dbcl2="105659582:Xb5v0vuj9OM"; ck="GKUf"; _pk_id.100001.8cb4=d9658059b6096e90.1412332268.27.1415180767.1415172287.; _pk_ses.100001.8cb4=*; push_noty_num=0; push_doumail_num=0; __utma=30149280.1167184662.1412271513.1415166517.1415179088.44; __utmb=30149280.29.10.1415179088; __utmc=30149280; __utmz=30149280.1415162297.42.25.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); __utmv=30149280.10565', | ||
gaid: 'UA-26476625-4', | ||
/**! | ||
* haixiu - config.js | ||
* | ||
*/ | ||
|
||
'use strict'; | ||
|
||
/** | ||
* Module dependencies. | ||
*/ | ||
let config = { | ||
mongodb_url: process.env.MONGOHQ_URL || 'mongodb://127.0.0.1/haixiu', | ||
port: process.env.PORT || 27017, | ||
apikey: process.env.DB_APIKEY || '', | ||
groupName: 'haixiuzu', | ||
fetchPage: 20, // 抓取最新20页数据 | ||
}; | ||
|
||
exports = module.exports = config; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,80 +1,69 @@ | ||
var superagent = require('superagent'); | ||
var async = require('async'); | ||
var _ = require('lodash'); | ||
var cheerio = require('cheerio'); | ||
var model = require('./model'); | ||
var Post = model.Post; | ||
var eventproxy = require('eventproxy'); | ||
var config = require('./config'); | ||
/**! | ||
* haixiu - crawler.js | ||
* | ||
*/ | ||
|
||
var q = async.queue(function (task, callback) { | ||
var postInfo = task; | ||
var ep = new eventproxy(); | ||
ep.fail(callback); | ||
'use strict'; | ||
|
||
// 如果帖子已经抓取过就不再抓取 | ||
Post.findOne({url: postInfo.url}, ep.done(function (post) { | ||
if (post) { | ||
return ep.emit('got_author'); | ||
} | ||
ep.emit('fetch_author'); | ||
})); | ||
/** | ||
* Module dependencies. | ||
*/ | ||
const Douban = require('./lib/douban'); | ||
const config = require('./config'); | ||
const model = require('./model'); | ||
const _ = require('lodash'); | ||
const co = require('co'); | ||
|
||
const DB = new Douban({ | ||
apikey: config.apikey, | ||
}); | ||
|
||
ep.all('fetch_author', function () { | ||
superagent.get(postInfo.author_url) | ||
.set('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36') | ||
.set('Cookie', config.douban_cookie) | ||
.end(ep.done(function (res) { | ||
if (res.status !== 200) { | ||
console.error(403, postInfo.author_url); | ||
ep.emit('got_author'); | ||
return; | ||
} | ||
var $ = cheerio.load(res.text); | ||
var location = $('.loc').text().replace('常居: \n', '').trim(); | ||
var post = new Post({ | ||
url: postInfo.url, | ||
title: postInfo.title, | ||
imgs: postInfo.imgs, | ||
author: postInfo.author, | ||
author_url: postInfo.author_url, | ||
author_location: location, | ||
}); | ||
post.save(ep.done(function () { | ||
console.log('got %s', postInfo.title); | ||
ep.emit('got_author'); | ||
})); | ||
})); | ||
}); | ||
const Post = model.Post; | ||
|
||
function onerror(err) { | ||
console.error(err.stack); | ||
console.log(err); | ||
} | ||
|
||
ep.all('got_author', function () { | ||
callback(); | ||
}); | ||
// 并发数 | ||
}, 1); | ||
function* handleTopic(topic) { | ||
topic = topic || {}; | ||
let topicId = topic.id; | ||
let imgs = _.pluck(topic.photos, 'alt'); | ||
|
||
let exists = yield Post.findOne({id: topicId}).exec(); | ||
if (exists) { | ||
imgs = _.union(imgs, exists.imgs); | ||
} | ||
let post = { | ||
id: topicId, | ||
url: `http://www.douban.com/group/topic/${topicId}/`, | ||
title: topic.title, | ||
imgs: imgs, | ||
author_id: topic.authorInfo.id, | ||
author_name: topic.authorInfo.name, | ||
author_url: topic.authorInfo.alt, | ||
author_location: topic.authorInfo.loc_name || '', | ||
update_at: new Date(), | ||
}; | ||
return yield Post.update({id: topicId}, post, {upsert: true}).exec(); | ||
} | ||
|
||
function fetchHaixiuzu() { | ||
var ep = new eventproxy(); | ||
ep.on('error',function (err) { | ||
console.error(err); | ||
}); | ||
superagent.get('https://database.duoshuo.com/api/threads/listPosts.json?thread_key=haixiuzu&page=1&limit=100') | ||
.set('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36') | ||
.set('Cookie', config.douban_cookie) | ||
.end(ep.done(function (res) { | ||
var json = JSON.parse(res.text); | ||
var parentPosts = json.parentPosts; | ||
for (var postId in parentPosts) { | ||
var postInfo = parentPosts[postId]; | ||
postInfo = JSON.parse(new Buffer(postInfo.message, 'base64')); | ||
q.push(postInfo, ep.done(function () {})); | ||
co(function* () { | ||
for (let page = 1; page <= config.fetchPage; page++) { | ||
let topics = DB.groupTopic(config.groupName, page); | ||
for (let i = 0; i < topics.length; i++) { | ||
let topic = topics[i]; | ||
topic.authorInfo = DB.user((topic.author || {}).id); | ||
yield handleTopic(topic); | ||
} | ||
})); | ||
} | ||
}).catch(onerror); | ||
} | ||
|
||
exports.start = function () { | ||
fetchHaixiuzu(); | ||
|
||
// 每分钟运行一次 | ||
setInterval(fetchHaixiuzu, 60 * 1000); | ||
// 每10分钟运行一次 | ||
setInterval(fetchHaixiuzu, 10 * 60 * 1000); | ||
}; |
Oops, something went wrong.