Skip to content

Commit

Permalink
refine
Browse files Browse the repository at this point in the history
  • Loading branch information
巴思 committed Nov 10, 2015
1 parent c744633 commit c5bbd3f
Show file tree
Hide file tree
Showing 8 changed files with 232 additions and 117 deletions.
65 changes: 31 additions & 34 deletions app.js
Original file line number Diff line number Diff line change
@@ -1,15 +1,25 @@
var express = require('express');
var exphbs = require('express-handlebars');
var mongoose = require('mongoose');
var crawler = require('./crawler');
var model = require('./model');
var Post = model.Post;
var config = require('./config');
/**!
* haixiu - app.js
*
*/

'use strict';

/**
* Module dependencies.
*/
const express = require('express');
const exphbs = require('express-handlebars');
const mongoose = require('mongoose');
const crawler = require('./crawler');
const model = require('./model');
const Post = model.Post;
const config = require('./config');

mongoose.connect(config.mongodb_url);

var app = express();
var hbs = exphbs.create({
let app = express();
let hbs = exphbs.create({
defaultLayout: 'main',
helpers: {
gaid: function () {
Expand All @@ -20,7 +30,7 @@ var hbs = exphbs.create({
app.engine('handlebars', hbs.engine);
app.set('view engine', 'handlebars');

var cities = [
let cities = [
{key: 'hangzhou', name: '浙江杭州'},
{key: 'shanghai', name: '上海'},
{key: 'beijing', name: '北京'},
Expand All @@ -41,36 +51,25 @@ app.get('/', function (req, res, next) {
res.render('home', {cities: cities});
});

function getDocsAuthorId(docs) {
docs = docs || [];
var reg = /http:\/\/www.douban.com\/group\/people\/(\w+)(\/)?/;
for (var i = 0; i < docs.length; i++) {
docs[i].authorId = reg.exec(docs[i].author_url)[1];
}
return docs;
}

// 针对各个地域的 route 配置

app.get('/all', function (req, res, next) {
Post.find().sort({create_at: -1}).limit(100).exec(function (err, docs) {
Post.find().sort({id: -1}).limit(100).exec(function (err, docs) {
if (err) {
return next(err);
}
docs = getDocsAuthorId(docs);
res.render('posts', {docs: docs});
});
});

for (var i = 0; i < cities.length; i++) {
for (let i = 0; i < cities.length; i++) {
(function (city) {
var names = city.names || [city.name];
let names = city.names || [city.name];
app.get('/city/' + city.key, function (req, res, next) {
Post.find({author_location: {$in: names}}).sort({create_at: -1}).limit(100).exec(function (err, docs) {
Post.find({author_location: {$in: names}}).sort({id: -1}).limit(100).exec(function (err, docs) {
if (err) {
return next(err);
}
docs = getDocsAuthorId(docs);
res.render('posts', {docs: docs});
});
});
Expand All @@ -81,29 +80,27 @@ for (var i = 0; i < cities.length; i++) {

// 某个用户的发帖
app.get('/author/:authorId', function (req, res, next) {
var authorId = req.params.authorId;
var authorUrl = 'http://www.douban.com/group/people/' + authorId + '/';
Post.find({author_url: authorUrl}).sort({create_at: -1}).limit(100).exec(function (err, docs) {
const authorId = req.params.authorId;
Post.find({author_id: authorId}).sort({id: -1}).limit(100).exec(function (err, docs) {
if (err) {
return next(err);
}
var authorName = '';
let authorName = '';
if (docs && docs.length) {
// 取最近一条帖子的昵称
authorName = docs[0].author;
authorName = docs[0].author_name;
}
docs = getDocsAuthorId(docs);
res.render('author', {
authorId: authorId,
authorName: authorName,
authorUrl: authorUrl,
docs: docs
docs: docs,
});
});
});

// 启动爬虫
crawler.start();

var server = app.listen(config.port, function () {
let server = app.listen(config.port, function () {
console.log('app is listening ' + server.address().port);
});
23 changes: 18 additions & 5 deletions config.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,21 @@
var config = {
mongodb_url: process.env.MONGOHQ_URL || 'mongodb://localhost/haixiu',
port: process.env.PORT || 3000,
douban_cookie: 'viewed="3590768_1016272"; ll="108296"; bid="hWcTdvDyOYI"; ct=y; _ga=GA1.2.1167184662.1412271513; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1415179087%2C%22https%3A%2F%2Fwww.google.com%2F%22%5D; __utmt=1; as="http://www.douban.com/"; dbcl2="105659582:Xb5v0vuj9OM"; ck="GKUf"; _pk_id.100001.8cb4=d9658059b6096e90.1412332268.27.1415180767.1415172287.; _pk_ses.100001.8cb4=*; push_noty_num=0; push_doumail_num=0; __utma=30149280.1167184662.1412271513.1415166517.1415179088.44; __utmb=30149280.29.10.1415179088; __utmc=30149280; __utmz=30149280.1415162297.42.25.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); __utmv=30149280.10565',
gaid: 'UA-26476625-4',
/**!
* haixiu - config.js
*
* Authors:
* rockdai <rockdai@qq.com>
*/

'use strict';

/**
* Module dependencies.
*/
let config = {
mongodb_url: process.env.MONGOHQ_URL || 'mongodb://127.0.0.1/haixiu',
port: process.env.PORT || 27017,
apikey: process.env.DB_APIKEY || '',
groupName: 'haixiuzu',
fetchPage: 20, // 抓取最新20页数据
};

exports = module.exports = config;
130 changes: 64 additions & 66 deletions crawler.js
Original file line number Diff line number Diff line change
@@ -1,80 +1,78 @@
var superagent = require('superagent');
var async = require('async');
var _ = require('lodash');
var cheerio = require('cheerio');
var model = require('./model');
var Post = model.Post;
var eventproxy = require('eventproxy');
var config = require('./config');
/**!
* haixiu - crawler.js
*
* Authors:
* rockdai <rockdai@qq.com>
*/

var q = async.queue(function (task, callback) {
var postInfo = task;
var ep = new eventproxy();
ep.fail(callback);
'use strict';

// 如果帖子已经抓取过就不再抓取
Post.findOne({url: postInfo.url}, ep.done(function (post) {
if (post) {
return ep.emit('got_author');
}
ep.emit('fetch_author');
}));
/**
* Module dependencies.
*/
const Douban = require('./lib/douban');
const config = require('./config');
const model = require('./model');
const _ = require('lodash');
const co = require('co');

ep.all('fetch_author', function () {
superagent.get(postInfo.author_url)
.set('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36')
.set('Cookie', config.douban_cookie)
.end(ep.done(function (res) {
if (res.status !== 200) {
console.error(403, postInfo.author_url);
ep.emit('got_author');
return;
}
var $ = cheerio.load(res.text);
var location = $('.loc').text().replace('常居: \n', '').trim();
var post = new Post({
url: postInfo.url,
title: postInfo.title,
imgs: postInfo.imgs,
author: postInfo.author,
author_url: postInfo.author_url,
author_location: location,
});
post.save(ep.done(function () {
console.log('got %s', postInfo.title);
ep.emit('got_author');
}));
}));
});
const DB = new Douban({
apikey: config.apikey,
});

const Post = model.Post;

/**
 * Log a failure from the crawl loop.
 *
 * Writes the stack trace (or the raw value when there is no stack) to stderr,
 * then the value itself to stdout.
 *
 * @param {*} err - the thrown or rejected value; may be a non-Error or nullish
 */
function onerror(err) {
  // A rejection value is not guaranteed to be an Error: reading `.stack` on
  // null/undefined would throw inside the catch handler and hide the failure.
  const hasStack = err !== null && err !== undefined && err.stack;
  console.error(hasStack ? err.stack : err);
  console.log(err);
}

ep.all('got_author', function () {
callback();
function* handleTopic(topic) {
topic = topic || {};
let topicId = topic.id;
let imgs = _.pluck(topic.photos, 'alt');

let doc = yield Post.findOne({id: topicId}).exec();
if (doc) {
let updates = {
title: topic.title,
imgs: _.union(imgs, doc.imgs),
author_name: topic.authorInfo.name,
author_url: topic.authorInfo.alt,
author_location: topic.authorInfo.loc_name || '',
update_at: new Date(),
};
return yield doc.update(updates).exec();
}
let post = new Post({
id: topicId,
url: `http://www.douban.com/group/topic/${topicId}/`,
title: topic.title,
imgs: imgs,
author_id: topic.authorInfo.id,
author_name: topic.authorInfo.name,
author_url: topic.authorInfo.alt,
author_location: topic.authorInfo.loc_name || '',
});
// 并发数
}, 1);
return yield post.save();
}

function fetchHaixiuzu() {
var ep = new eventproxy();
ep.on('error',function (err) {
console.error(err);
});
superagent.get('https://database.duoshuo.com/api/threads/listPosts.json?thread_key=haixiuzu&page=1&limit=100')
.set('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36')
.set('Cookie', config.douban_cookie)
.end(ep.done(function (res) {
var json = JSON.parse(res.text);
var parentPosts = json.parentPosts;
for (var postId in parentPosts) {
var postInfo = parentPosts[postId];
postInfo = JSON.parse(new Buffer(postInfo.message, 'base64'));
q.push(postInfo, ep.done(function () {}));
co(function* () {
for (let page = 1; page <= config.fetchPage; page++) {
let topics = DB.groupTopic(config.groupName, page);
for (let i = 0; i < topics.length; i++) {
let topic = topics[i];
topic.authorInfo = DB.user((topic.author || {}).id);
yield handleTopic(topic);
}
}));
}
}).catch(onerror);
}

exports.start = function () {
fetchHaixiuzu();

// 每分钟运行一次
setInterval(fetchHaixiuzu, 60 * 1000);
// 每10分钟运行一次
setInterval(fetchHaixiuzu, 10 * 60 * 1000);
};
90 changes: 90 additions & 0 deletions lib/douban.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
/**!
* haixiu - douban.js
*
* Authors:
* rockdai <rockdai@qq.com>
*/

'use strict';

/**
* Module dependencies.
*/
const querystring = require('querystring');
const request = require('urllib-sync').request;

const API_ROOT = 'https://api.douban.com/v2';

/**
* Expose `Client`
*/

module.exports = Client;

/**
 * Douban API client constructor.
 *
 * May be invoked with or without `new`; a plain call is forwarded to `new`.
 *
 * @param {Object} [options]
 * @param {string} [options.apikey] - stored as `this.apikey`
 * @param {number} [options.timeout] - stored as `this.timeout`; any falsy
 *   value falls back to 30000
 */
function Client(options) {
  const calledAsConstructor = this instanceof Client;
  if (!calledAsConstructor) {
    return new Client(options);
  }

  const settings = options || {};
  const timeout = settings.timeout || 30000;

  this.apikey = settings.apikey;
  this.timeout = timeout;
}

/**
 * Build an absolute API URL from a path and optional query parameters.
 *
 * When the client has an apikey it is added to the query string. The `?`
 * separator is always appended, even when there are no parameters.
 *
 * @param {string} path - path appended to API_ROOT (expected to start with `/`)
 * @param {Object} [query] - query parameters; not modified by this call
 * @returns {string} the full URL
 */
Client.prototype.getUrl = function (path, query) {
  // Work on a copy: adding the apikey must not leak into the caller's object.
  const params = Object.assign({}, query || {});
  if (this.apikey) {
    params.apikey = this.apikey;
  }
  return API_ROOT + path + '?' + querystring.stringify(params);
};

/**
 * Perform a request through the imported `request` function and return the
 * parsed JSON body.
 *
 * The client's own `timeout` is always applied to the request options,
 * overriding any `timeout` supplied in `args`.
 *
 * @param {string} url - URL to request
 * @param {Object} [args] - extra request options; not modified by this call
 * @returns {*} the response body parsed as JSON
 * @throws {Error} named `RequestDoubanAPIError` when the status does not start
 *   with `2`; carries `statusCode`, `originHeaders` and `originBody`
 * @throws {Error} the JSON parse error, renamed `ParseDoubanAPIFailed` and
 *   carrying the same three properties, when the body is not valid JSON
 */
Client.prototype.request = function (url, args) {
  // Copy instead of writing into `args` so the caller's object is not mutated.
  const requestArgs = Object.assign({}, args || {}, {timeout: this.timeout});

  const response = request(url, requestArgs);

  const body = response.data.toString();
  const status = response.status;
  const headers = response.headers;

  if (status.toString()[0] !== '2') {
    const err = new Error('Request Douban API error.');
    err.name = 'RequestDoubanAPIError';
    err.statusCode = status;
    err.originHeaders = headers;
    err.originBody = body;
    throw err;
  }

  try {
    return JSON.parse(body);
  } catch (ex) {
    // Keep the original parse error but attach the raw response for debugging.
    ex.name = 'ParseDoubanAPIFailed';
    ex.statusCode = status;
    ex.originHeaders = headers;
    ex.originBody = body;
    throw ex;
  }
};

/**
 * Fetch a user record from the `/user/<userId>` endpoint.
 *
 * @param {string|number} userId - interpolated into the request path as-is
 * @returns {*} the parsed response body returned by `this.request`
 */
Client.prototype.user = function (userId) {
  const endpoint = `/user/${userId}`;
  return this.request(this.getUrl(endpoint));
};

/**
 * Fetch one page of topics from the `/group/<groupName>/topics` endpoint.
 *
 * Pages are 1-based; the request's `start` offset is `(page - 1) * 20`.
 *
 * @param {string} groupName - interpolated into the request path as-is
 * @param {number} [page] - page number; any falsy value is treated as 1
 * @returns {Array} the `topics` property of the response body, or an empty
 *   array when that property is falsy
 */
Client.prototype.groupTopic = function (groupName, page) {
  const pageNumber = page || 1;
  const query = {start: (pageNumber - 1) * 20};
  const response = this.request(this.getUrl(`/group/${groupName}/topics`, query));
  return response.topics || [];
};
Loading

0 comments on commit c5bbd3f

Please sign in to comment.