-
Notifications
You must be signed in to change notification settings - Fork 31
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
巴思
committed
Nov 10, 2015
1 parent
c744633
commit c5bbd3f
Showing
8 changed files
with
232 additions
and
117 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,21 @@ | ||
var config = { | ||
mongodb_url: process.env.MONGOHQ_URL || 'mongodb://localhost/haixiu', | ||
port: process.env.PORT || 3000, | ||
douban_cookie: 'viewed="3590768_1016272"; ll="108296"; bid="hWcTdvDyOYI"; ct=y; _ga=GA1.2.1167184662.1412271513; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1415179087%2C%22https%3A%2F%2Fwww.google.com%2F%22%5D; __utmt=1; as="http://www.douban.com/"; dbcl2="105659582:Xb5v0vuj9OM"; ck="GKUf"; _pk_id.100001.8cb4=d9658059b6096e90.1412332268.27.1415180767.1415172287.; _pk_ses.100001.8cb4=*; push_noty_num=0; push_doumail_num=0; __utma=30149280.1167184662.1412271513.1415166517.1415179088.44; __utmb=30149280.29.10.1415179088; __utmc=30149280; __utmz=30149280.1415162297.42.25.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); __utmv=30149280.10565', | ||
gaid: 'UA-26476625-4', | ||
/**! | ||
* haixiu - config.js | ||
* | ||
* Authors: | ||
* rockdai <rockdai@qq.com> | ||
*/ | ||
|
||
'use strict'; | ||
|
||
/** | ||
* Module dependencies. | ||
*/ | ||
let config = { | ||
mongodb_url: process.env.MONGOHQ_URL || 'mongodb://127.0.0.1/haixiu', | ||
port: process.env.PORT || 27017, | ||
apikey: process.env.DB_APIKEY || '', | ||
groupName: 'haixiuzu', | ||
fetchPage: 20, // 抓取最新20页数据 | ||
}; | ||
|
||
exports = module.exports = config; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,80 +1,78 @@ | ||
var superagent = require('superagent'); | ||
var async = require('async'); | ||
var _ = require('lodash'); | ||
var cheerio = require('cheerio'); | ||
var model = require('./model'); | ||
var Post = model.Post; | ||
var eventproxy = require('eventproxy'); | ||
var config = require('./config'); | ||
/**! | ||
* haixiu - crawler.js | ||
* | ||
* Authors: | ||
* rockdai <rockdai@qq.com> | ||
*/ | ||
|
||
var q = async.queue(function (task, callback) { | ||
var postInfo = task; | ||
var ep = new eventproxy(); | ||
ep.fail(callback); | ||
'use strict'; | ||
|
||
// 如果帖子已经抓取过就不再抓取 | ||
Post.findOne({url: postInfo.url}, ep.done(function (post) { | ||
if (post) { | ||
return ep.emit('got_author'); | ||
} | ||
ep.emit('fetch_author'); | ||
})); | ||
/** | ||
* Module dependencies. | ||
*/ | ||
const Douban = require('./lib/douban'); | ||
const config = require('./config'); | ||
const model = require('./model'); | ||
const _ = require('lodash'); | ||
const co = require('co'); | ||
|
||
ep.all('fetch_author', function () { | ||
superagent.get(postInfo.author_url) | ||
.set('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36') | ||
.set('Cookie', config.douban_cookie) | ||
.end(ep.done(function (res) { | ||
if (res.status !== 200) { | ||
console.error(403, postInfo.author_url); | ||
ep.emit('got_author'); | ||
return; | ||
} | ||
var $ = cheerio.load(res.text); | ||
var location = $('.loc').text().replace('常居: \n', '').trim(); | ||
var post = new Post({ | ||
url: postInfo.url, | ||
title: postInfo.title, | ||
imgs: postInfo.imgs, | ||
author: postInfo.author, | ||
author_url: postInfo.author_url, | ||
author_location: location, | ||
}); | ||
post.save(ep.done(function () { | ||
console.log('got %s', postInfo.title); | ||
ep.emit('got_author'); | ||
})); | ||
})); | ||
}); | ||
const DB = new Douban({ | ||
apikey: config.apikey, | ||
}); | ||
|
||
const Post = model.Post; | ||
|
||
/**
 * Rejection handler for the crawl promise chain.
 *
 * Writes the stack trace to stderr when the value has one, otherwise the raw
 * value, and then writes the value itself to stdout.
 *
 * A rejection is not guaranteed to carry an Error (code can `throw` any
 * value), so `err` is checked before `.stack` is read; this handler must never
 * throw itself, or the failure would surface as an unhandled rejection.
 *
 * @param {*} err - the rejection reason.
 */
function onerror(err) {
  const stack = err && err.stack;
  console.error(stack || err);
  console.log(err);
}
|
||
ep.all('got_author', function () { | ||
callback(); | ||
/**
 * Insert or refresh the stored Post for one Douban group topic.
 *
 * Generator meant to be driven by `co`: every `yield` hands over the value
 * returned by a model call (`exec()` / `save()`).
 *
 * When a Post with the same `id` already exists, its title, author fields and
 * `update_at` are refreshed and the image list becomes the union of the newly
 * fetched images and the stored ones. Otherwise a new Post is created.
 *
 * @param {Object} topic - topic object; reads `id`, `title`, `photos`
 *   (each entry's `alt`) and `authorInfo` (`id`, `name`, `alt`, `loc_name`).
 *   NOTE(review): `authorInfo` must be set by the caller; a missing one
 *   still raises a TypeError here, as before.
 * @return {*} whatever the update query or `save()` resolves to.
 */
function* handleTopic(topic) {
  const source = topic || {};
  const topicId = source.id;
  // `_.map` with a property-name iteratee is the drop-in equivalent of
  // `_.pluck` and exists in both lodash 3.x and 4.x (`_.pluck` was removed
  // in 4.0).
  const imgs = _.map(source.photos, 'alt');

  const existing = yield Post.findOne({id: topicId}).exec();
  if (existing) {
    const updates = {
      title: source.title,
      // Keep images seen on earlier crawls even if they are gone upstream.
      imgs: _.union(imgs, existing.imgs),
      author_name: source.authorInfo.name,
      author_url: source.authorInfo.alt,
      author_location: source.authorInfo.loc_name || '',
      update_at: new Date(),
    };
    return yield existing.update(updates).exec();
  }

  const post = new Post({
    id: topicId,
    url: `http://www.douban.com/group/topic/${topicId}/`,
    title: source.title,
    imgs: imgs,
    author_id: source.authorInfo.id,
    author_name: source.authorInfo.name,
    author_url: source.authorInfo.alt,
    author_location: source.authorInfo.loc_name || '',
  });
  return yield post.save();
}
|
||
function fetchHaixiuzu() { | ||
var ep = new eventproxy(); | ||
ep.on('error',function (err) { | ||
console.error(err); | ||
}); | ||
superagent.get('https://database.duoshuo.com/api/threads/listPosts.json?thread_key=haixiuzu&page=1&limit=100') | ||
.set('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36') | ||
.set('Cookie', config.douban_cookie) | ||
.end(ep.done(function (res) { | ||
var json = JSON.parse(res.text); | ||
var parentPosts = json.parentPosts; | ||
for (var postId in parentPosts) { | ||
var postInfo = parentPosts[postId]; | ||
postInfo = JSON.parse(new Buffer(postInfo.message, 'base64')); | ||
q.push(postInfo, ep.done(function () {})); | ||
co(function* () { | ||
for (let page = 1; page <= config.fetchPage; page++) { | ||
let topics = DB.groupTopic(config.groupName, page); | ||
for (let i = 0; i < topics.length; i++) { | ||
let topic = topics[i]; | ||
topic.authorInfo = DB.user((topic.author || {}).id); | ||
yield handleTopic(topic); | ||
} | ||
})); | ||
} | ||
}).catch(onerror); | ||
} | ||
|
||
exports.start = function () { | ||
fetchHaixiuzu(); | ||
|
||
// 每分钟运行一次 | ||
setInterval(fetchHaixiuzu, 60 * 1000); | ||
// 每10分钟运行一次 | ||
setInterval(fetchHaixiuzu, 10 * 60 * 1000); | ||
}; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
/**! | ||
* haixiu - douban.js | ||
* | ||
* Authors: | ||
* rockdai <rockdai@qq.com> | ||
*/ | ||
|
||
'use strict'; | ||
|
||
/** | ||
* Module dependencies. | ||
*/ | ||
const querystring = require('querystring'); | ||
const request = require('urllib-sync').request; | ||
|
||
const API_ROOT = 'https://api.douban.com/v2'; | ||
|
||
/** | ||
* Expose `Client` | ||
*/ | ||
|
||
module.exports = Client; | ||
|
||
/**
 * Small synchronous client for the Douban v2 HTTP API.
 *
 * Can be called with or without `new`.
 *
 * @param {Object} [options]
 * @param {String} [options.apikey] - when set, sent as the `apikey` query
 *   parameter on every URL built by `getUrl`.
 * @param {Number} [options.timeout=30000] - passed as `timeout` to the
 *   underlying request function (presumably milliseconds — confirm against
 *   urllib-sync).
 */
function Client(options) {
  if (!(this instanceof Client)) {
    return new Client(options);
  }

  options = options || {};
  this.apikey = options.apikey;
  this.timeout = options.timeout || 30000;
}

/**
 * Attach the response details to an error so callers can inspect them.
 *
 * @param {Error} err
 * @param {String} name - value for `err.name`.
 * @param {*} status - response status, stored as `err.statusCode`.
 * @param {*} headers - response headers, stored as `err.originHeaders`.
 * @param {String} body - response body text, stored as `err.originBody`.
 * @return {Error} the same error object.
 */
function decorateDoubanError(err, name, status, headers, body) {
  err.name = name;
  err.statusCode = status;
  err.originHeaders = headers;
  err.originBody = body;
  return err;
}

/**
 * Build an absolute API URL.
 *
 * @param {String} path - path below the API root, starting with `/`.
 * @param {Object} [query] - query parameters; never modified.
 * @return {String} the URL, with the client's `apikey` added when configured.
 */
Client.prototype.getUrl = function (path, query) {
  // Copy first: adding `apikey` to the caller's object would leak the key
  // into whatever else the caller does with it.
  const params = Object.assign({}, query);
  if (this.apikey) {
    params.apikey = this.apikey;
  }
  const search = querystring.stringify(params);
  // Only emit `?` when there is something after it.
  return search ? `${API_ROOT}${path}?${search}` : API_ROOT + path;
};

/**
 * Perform a blocking request and return the parsed JSON body.
 *
 * @param {String} url
 * @param {Object} [args] - extra options for the underlying request
 *   function; never modified. `timeout` is always the client's timeout.
 * @return {*} the decoded JSON body.
 * @throws {Error} `RequestDoubanAPIError` when the status is not 2xx, or
 *   `ParseDoubanAPIFailed` when the body is not valid JSON. Both carry
 *   `statusCode`, `originHeaders` and `originBody`.
 */
Client.prototype.request = function (url, args) {
  const requestArgs = Object.assign({}, args, { timeout: this.timeout });
  const result = request(url, requestArgs);

  const status = result.status;
  const headers = result.headers;
  // A response without a body is treated as empty text so it fails as a
  // decorated parse error below rather than as a bare TypeError here.
  const body = result.data ? result.data.toString() : '';

  // Written as a negated range so a missing/non-numeric status is an error.
  if (!(status >= 200 && status < 300)) {
    throw decorateDoubanError(
      new Error('Request Douban API error.'),
      'RequestDoubanAPIError', status, headers, body
    );
  }

  try {
    return JSON.parse(body);
  } catch (ex) {
    throw decorateDoubanError(ex, 'ParseDoubanAPIFailed', status, headers, body);
  }
};

/**
 * Fetch a user resource.
 *
 * @param {String|Number} userId
 * @return {*} the decoded response body of `/user/<userId>`.
 */
Client.prototype.user = function (userId) {
  return this.request(this.getUrl(`/user/${userId}`));
};

/**
 * Fetch one page of topics from a group.
 *
 * @param {String} groupName
 * @param {Number} [page=1] - 1-based page number.
 * @return {Array} the `topics` array of the response, or `[]` when absent.
 */
Client.prototype.groupTopic = function (groupName, page) {
  // Offset step per page; presumably the API's default page size — confirm.
  const TOPICS_PER_PAGE = 20;
  const pageNumber = page || 1;
  const url = this.getUrl(`/group/${groupName}/topics`, {
    start: (pageNumber - 1) * TOPICS_PER_PAGE,
  });
  const body = this.request(url);
  return body.topics || [];
};
Oops, something went wrong.