From c5bbd3f10ae55ec1097b8ef15b0992c690dcd1ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B7=B4=E6=80=9D?= Date: Tue, 10 Nov 2015 14:54:34 +0800 Subject: [PATCH] refine --- app.js | 65 ++++++++-------- config.js | 23 ++++-- crawler.js | 130 ++++++++++++++++---------------- lib/douban.js | 90 ++++++++++++++++++++++ model.js | 25 ++++-- package.json | 12 +-- views/author.handlebars | 2 +- views/partials/_post.handlebars | 2 +- 8 files changed, 232 insertions(+), 117 deletions(-) create mode 100644 lib/douban.js diff --git a/app.js b/app.js index 930b145..69e2aaa 100644 --- a/app.js +++ b/app.js @@ -1,15 +1,25 @@ -var express = require('express'); -var exphbs = require('express-handlebars'); -var mongoose = require('mongoose'); -var crawler = require('./crawler'); -var model = require('./model'); -var Post = model.Post; -var config = require('./config'); +/**! + * haixiu - app.js + * + */ + +'use strict'; + +/** + * Module dependencies. + */ +const express = require('express'); +const exphbs = require('express-handlebars'); +const mongoose = require('mongoose'); +const crawler = require('./crawler'); +const model = require('./model'); +const Post = model.Post; +const config = require('./config'); mongoose.connect(config.mongodb_url); -var app = express(); -var hbs = exphbs.create({ +let app = express(); +let hbs = exphbs.create({ defaultLayout: 'main', helpers: { gaid: function () { @@ -20,7 +30,7 @@ var hbs = exphbs.create({ app.engine('handlebars', hbs.engine); app.set('view engine', 'handlebars'); -var cities = [ +let cities = [ {key: 'hangzhou', name: '浙江杭州'}, {key: 'shanghai', name: '上海'}, {key: 'beijing', name: '北京'}, @@ -41,36 +51,25 @@ app.get('/', function (req, res, next) { res.render('home', {cities: cities}); }); -function getDocsAuthorId(docs) { - docs = docs || []; - var reg = /http:\/\/www.douban.com\/group\/people\/(\w+)(\/)?/; - for (var i = 0; i < docs.length; i++) { - docs[i].authorId = reg.exec(docs[i].author_url)[1]; - } - return docs; -} - // 针对各个地域的 route 配置 app.get('/all', function (req, res, next) { - Post.find().sort({create_at: -1}).limit(100).exec(function (err, docs) { + Post.find().sort({id: -1}).limit(100).exec(function (err, docs) { if (err) { return next(err); } - docs = getDocsAuthorId(docs); res.render('posts', {docs: docs}); }); }); -for (var i = 0; i < cities.length; i++) { +for (let i = 0; i < cities.length; i++) { (function (city) { - var names = city.names || [city.name]; + let names = city.names || [city.name]; app.get('/city/' + city.key, function (req, res, next) { - Post.find({author_location: {$in: names}}).sort({create_at: -1}).limit(100).exec(function (err, docs) { + Post.find({author_location: {$in: names}}).sort({id: -1}).limit(100).exec(function (err, docs) { if (err) { return next(err); } - docs = getDocsAuthorId(docs); res.render('posts', {docs: docs}); }); }); @@ -81,22 +80,20 @@ for (var i = 0; i < cities.length; i++) { // 某个用户的发帖 app.get('/author/:authorId', function (req, res, next) { - var authorId = req.params.authorId; - var authorUrl = 'http://www.douban.com/group/people/' + authorId + '/'; - Post.find({author_url: authorUrl}).sort({create_at: -1}).limit(100).exec(function (err, docs) { + const authorId = req.params.authorId; + Post.find({author_id: authorId}).sort({id: -1}).limit(100).exec(function (err, docs) { if (err) { return next(err); } - var authorName = ''; + let authorName = ''; if (docs && docs.length) { // 取最近一条帖子的昵称 - authorName = docs[0].author; + authorName = docs[0].author_name; } - docs = getDocsAuthorId(docs); res.render('author', { + authorId: authorId, authorName: authorName, - authorUrl: authorUrl, - docs: docs + docs: docs, }); }); }); @@ -104,6 +101,6 @@ app.get('/author/:authorId', function (req, res, next) { // 启动爬虫 crawler.start(); -var server = app.listen(config.port, function () { +let server = app.listen(config.port, function () { console.log('app is listening ' + server.address().port); }); diff --git a/config.js b/config.js index 109f9bc..8e44086 100644 --- a/config.js +++ b/config.js @@ -1,8 +1,21 @@ -var config = { - mongodb_url: process.env.MONGOHQ_URL || 'mongodb://localhost/haixiu', - port: process.env.PORT || 3000, - douban_cookie: 'viewed="3590768_1016272"; ll="108296"; bid="hWcTdvDyOYI"; ct=y; _ga=GA1.2.1167184662.1412271513; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1415179087%2C%22https%3A%2F%2Fwww.google.com%2F%22%5D; __utmt=1; as="http://www.douban.com/"; dbcl2="105659582:Xb5v0vuj9OM"; ck="GKUf"; _pk_id.100001.8cb4=d9658059b6096e90.1412332268.27.1415180767.1415172287.; _pk_ses.100001.8cb4=*; push_noty_num=0; push_doumail_num=0; __utma=30149280.1167184662.1412271513.1415166517.1415179088.44; __utmb=30149280.29.10.1415179088; __utmc=30149280; __utmz=30149280.1415162297.42.25.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); __utmv=30149280.10565', - gaid: 'UA-26476625-4', +/**! + * haixiu - config.js + * + * Authors: + * rockdai + */ + +'use strict'; + +/** + * Module dependencies. + */ +let config = { + mongodb_url: process.env.MONGOHQ_URL || 'mongodb://127.0.0.1/haixiu', + port: process.env.PORT || 27017, + apikey: process.env.DB_APIKEY || '', + groupName: 'haixiuzu', + fetchPage: 20, // 抓取最新20页数据 }; exports = module.exports = config; diff --git a/crawler.js b/crawler.js index f311095..3629aeb 100644 --- a/crawler.js +++ b/crawler.js @@ -1,80 +1,78 @@ -var superagent = require('superagent'); -var async = require('async'); -var _ = require('lodash'); -var cheerio = require('cheerio'); -var model = require('./model'); -var Post = model.Post; -var eventproxy = require('eventproxy'); -var config = require('./config'); +/**! + * haixiu - crawler.js + * + * Authors: + * rockdai + */ -var q = async.queue(function (task, callback) { - var postInfo = task; - var ep = new eventproxy(); - ep.fail(callback); +'use strict'; - // 如果帖子已经抓取过就不再抓取 - Post.findOne({url: postInfo.url}, ep.done(function (post) { - if (post) { - return ep.emit('got_author'); - } - ep.emit('fetch_author'); - })); +/** + * Module dependencies. + */ +const Douban = require('./lib/douban'); +const config = require('./config'); +const model = require('./model'); +const _ = require('lodash'); +const co = require('co'); - ep.all('fetch_author', function () { - superagent.get(postInfo.author_url) - .set('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36') - .set('Cookie', config.douban_cookie) - .end(ep.done(function (res) { - if (res.status !== 200) { - console.error(403, postInfo.author_url); - ep.emit('got_author'); - return; - } - var $ = cheerio.load(res.text); - var location = $('.loc').text().replace('常居: \n', '').trim(); - var post = new Post({ - url: postInfo.url, - title: postInfo.title, - imgs: postInfo.imgs, - author: postInfo.author, - author_url: postInfo.author_url, - author_location: location, - }); - post.save(ep.done(function () { - console.log('got %s', postInfo.title); - ep.emit('got_author'); - })); - })); - }); +const DB = new Douban({ + apikey: config.apikey, +}); + +const Post = model.Post; + +function onerror(err) { + console.error(err.stack); + console.log(err); +} - ep.all('got_author', function () { - callback(); +function* handleTopic(topic) { + topic = topic || {}; + let topicId = topic.id; + let imgs = _.pluck(topic.photos, 'alt'); + + let doc = yield Post.findOne({id: topicId}).exec(); + if (doc) { + let updates = { + title: topic.title, + imgs: _.union(imgs, doc.imgs), + author_name: topic.authorInfo.name, + author_url: topic.authorInfo.alt, + author_location: topic.authorInfo.loc_name || '', + update_at: new Date(), + }; + return yield doc.update(updates).exec(); + } + let post = new Post({ + id: topicId, + url: `http://www.douban.com/group/topic/${topicId}/`, + title: topic.title, + imgs: imgs, + author_id: topic.authorInfo.id, + author_name: topic.authorInfo.name, + author_url: topic.authorInfo.alt, + author_location: topic.authorInfo.loc_name || '', }); -// 并发数 -}, 1); + return yield post.save(); +} function fetchHaixiuzu() { - var ep = new eventproxy(); - ep.on('error',function (err) { - console.error(err); - }); - superagent.get('https://database.duoshuo.com/api/threads/listPosts.json?thread_key=haixiuzu&page=1&limit=100') - .set('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36') - .set('Cookie', config.douban_cookie) - .end(ep.done(function (res) { - var json = JSON.parse(res.text); - var parentPosts = json.parentPosts; - for (var postId in parentPosts) { - var postInfo = parentPosts[postId]; - postInfo = JSON.parse(new Buffer(postInfo.message, 'base64')); - q.push(postInfo, ep.done(function () {})); + co(function* () { + for (let page = 1; page <= config.fetchPage; page++) { + let topics = DB.groupTopic(config.groupName, page); + for (let i = 0; i < topics.length; i++) { + let topic = topics[i]; + topic.authorInfo = DB.user((topic.author || {}).id); + yield handleTopic(topic); } - })); + } + }).catch(onerror); } exports.start = function () { fetchHaixiuzu(); - // 每分钟运行一次 - setInterval(fetchHaixiuzu, 60 * 1000); + // 每10分钟运行一次 + setInterval(fetchHaixiuzu, 10 * 60 * 1000); }; diff --git a/lib/douban.js b/lib/douban.js new file mode 100644 index 0000000..d5fa805 --- /dev/null +++ b/lib/douban.js @@ -0,0 +1,90 @@ +/**! + * haixiu - douban.js + * + * Authors: + * rockdai + */ + +'use strict'; + +/** + * Module dependencies. + */ +const querystring = require('querystring'); +const request = require('urllib-sync').request; + +const API_ROOT = 'https://api.douban.com/v2'; + +/** + * Expose `Client` + */ + +module.exports = Client; + +function Client(options) { + if (!(this instanceof Client)) { + return new Client(options); + } + + options = options || {}; + this.apikey = options.apikey; + this.timeout = options.timeout || 30000; +} + +Client.prototype.getUrl = function(path, query) { + let result = API_ROOT + path; + query = query || {}; + if (this.apikey) { + query.apikey = this.apikey; + } + result = result + '?' + querystring.stringify(query); + return result; +}; + +Client.prototype.request = function (url, args) { + + args = args || {}; + args.timeout = this.timeout; + + let result = request(url, args); + + let body = result.data.toString(); + let status = result.status; + let headers = result.headers; + if (status.toString()[0] !== '2') { + let err = new Error('Request Douban API error.'); + err.name = 'RequestDoubanAPIError'; + err.statusCode = status; + err.originHeaders = headers; + err.originBody = body; + throw err; + } + let jsonBody; + try { + jsonBody = JSON.parse(body); + } catch (ex) { + ex.name = 'ParseDoubanAPIFailed'; + ex.statusCode = status; + ex.originHeaders = headers; + ex.originBody = body; + throw ex; + } + return jsonBody; +}; + +Client.prototype.user = function (userId) { + let url = this.getUrl(`/user/${userId}`); + let body = this.request(url); + return body; +}; + +Client.prototype.groupTopic = function (groupName, page) { + page = page || 1; + let start = (page - 1) * 20; + let url = this.getUrl(`/group/${groupName}/topics`, { + start: start, + }); + let body = this.request(url); + let topics = body.topics || []; + return topics; +}; diff --git a/model.js b/model.js index b92b6ca..383562c 100644 --- a/model.js +++ b/model.js @@ -1,19 +1,34 @@ -var mongoose = require('mongoose'); -var Schema = mongoose.Schema; +/**! + * haixiu - model.js + * + * Authors: + * rockdai + */ -var PostSchema = new Schema({ +'use strict'; + +/** + * Module dependencies. + */ +const mongoose = require('mongoose'); +const Schema = mongoose.Schema; + +let PostSchema = new Schema({ + id: String, url: String, title: String, imgs: [String], - author: String, + author_id: String, + author_name: String, author_url: String, author_location: String, create_at: { type: Date, default: Date.now }, update_at: { type: Date, default: Date.now }, }); +PostSchema.index({id: -1}, { unique: true }); PostSchema.index({create_at: -1}); -var Post = mongoose.model('Post', PostSchema); +let Post = mongoose.model('Post', PostSchema); exports.Post = Post; diff --git a/package.json b/package.json index 62448b0..389cc71 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "haixiu", - "version": "0.0.0", + "version": "0.1.0", "description": "", "main": "app.js", "scripts": { @@ -9,14 +9,16 @@ "author": "", "license": "MIT", "dependencies": { - "async": "^0.9.0", - "cheerio": "^0.17.0", + "co": "^4.6.0", + "urllib-sync": "~1.1.2", "eventproxy": "^0.3.1", "express": "^4.10.1", "express-handlebars": "^1.1.0", "lodash": "^2.4.1", "moment": "^2.8.3", - "mongoose": "^3.8.18", - "superagent": "^0.20.0" + "mongoose": "^4.2.4" + }, + "engines": { + "node": "4.2.1" } } diff --git a/views/author.handlebars b/views/author.handlebars index b0e39dd..9e8fae9 100644 --- a/views/author.handlebars +++ b/views/author.handlebars @@ -1,4 +1,4 @@ -

{{authorName}}

+

{{authorName}}

{{#each docs}} {{> _post this}} {{/each}} diff --git a/views/partials/_post.handlebars b/views/partials/_post.handlebars index d69b1f0..a069185 100644 --- a/views/partials/_post.handlebars +++ b/views/partials/_post.handlebars @@ -1,6 +1,6 @@

标题:《{{title}}》 {{url}}

创建时间:{{ create_at }}

-

作者:{{author}}

+

作者:{{author_name}}

地址:{{author_location}}

{{#each imgs}}