Skip to content

Commit

Permalink
Merge pull request #5 from rockdai/v2
Browse files Browse the repository at this point in the history
v2
  • Loading branch information
alsotang committed Nov 10, 2015
2 parents c744633 + 7960c2b commit 004da91
Show file tree
Hide file tree
Showing 10 changed files with 243 additions and 119 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
node_modules
.DS_Store
87 changes: 53 additions & 34 deletions app.js
Original file line number Diff line number Diff line change
@@ -1,15 +1,25 @@
var express = require('express');
var exphbs = require('express-handlebars');
var mongoose = require('mongoose');
var crawler = require('./crawler');
var model = require('./model');
var Post = model.Post;
var config = require('./config');
/**!
* haixiu - app.js
*
*/

'use strict';

/**
* Module dependencies.
*/
const express = require('express');
const exphbs = require('express-handlebars');
const mongoose = require('mongoose');
const crawler = require('./crawler');
const model = require('./model');
const Post = model.Post;
const config = require('./config');

mongoose.connect(config.mongodb_url);

var app = express();
var hbs = exphbs.create({
let app = express();
let hbs = exphbs.create({
defaultLayout: 'main',
helpers: {
gaid: function () {
Expand All @@ -20,7 +30,7 @@ var hbs = exphbs.create({
app.engine('handlebars', hbs.engine);
app.set('view engine', 'handlebars');

var cities = [
let cities = [
{key: 'hangzhou', name: '浙江杭州'},
{key: 'shanghai', name: '上海'},
{key: 'beijing', name: '北京'},
Expand All @@ -37,40 +47,50 @@ var cities = [
{key: 'shenzhen', name: '广东深圳'},
];

app.get('/', function (req, res, next) {
res.render('home', {cities: cities});
});
/**
 * Normalize a list of image URLs before rendering.
 *
 * For every non-empty entry, a leading `https://` is downgraded to `http://`
 * and the first `.doubanio.com` occurrence is rewritten to `.douban.com`.
 *
 * @param {Array} [imgs] - Image URLs; a missing list is treated as empty.
 * @returns {Array} New array of the same length. Falsy entries (null,
 *   undefined, '') are passed through unchanged instead of throwing.
 */
function fixImages(imgs) {
  return (imgs || []).map(function (img) {
    // The list may contain holes; never call string methods on a falsy entry.
    if (!img) {
      return img;
    }
    if (img.startsWith('https://')) {
      img = img.replace('https://', 'http://');
    }
    return img.replace('.doubanio.com', '.douban.com');
  });
}

function getDocsAuthorId(docs) {
function fixDocs(docs) {
docs = docs || [];
var reg = /http:\/\/www.douban.com\/group\/people\/(\w+)(\/)?/;
for (var i = 0; i < docs.length; i++) {
docs[i].authorId = reg.exec(docs[i].author_url)[1];
}
return docs;
return docs.map(function (doc) {
doc.imgs = fixImages(doc.imgs);
return doc;
});
}

app.get('/', function (req, res, next) {
res.render('home', {cities: cities});
});

// 针对各个地域的 route 配置

app.get('/all', function (req, res, next) {
Post.find().sort({create_at: -1}).limit(100).exec(function (err, docs) {
Post.find().sort({id: -1}).limit(100).exec(function (err, docs) {
if (err) {
return next(err);
}
docs = getDocsAuthorId(docs);
docs = fixDocs(docs);
res.render('posts', {docs: docs});
});
});

for (var i = 0; i < cities.length; i++) {
for (let i = 0; i < cities.length; i++) {
(function (city) {
var names = city.names || [city.name];
let names = city.names || [city.name];
app.get('/city/' + city.key, function (req, res, next) {
Post.find({author_location: {$in: names}}).sort({create_at: -1}).limit(100).exec(function (err, docs) {
Post.find({author_location: {$in: names}}).sort({id: -1}).limit(100).exec(function (err, docs) {
if (err) {
return next(err);
}
docs = getDocsAuthorId(docs);
docs = fixDocs(docs);
res.render('posts', {docs: docs});
});
});
Expand All @@ -81,29 +101,28 @@ for (var i = 0; i < cities.length; i++) {

// 某个用户的发帖
app.get('/author/:authorId', function (req, res, next) {
var authorId = req.params.authorId;
var authorUrl = 'http://www.douban.com/group/people/' + authorId + '/';
Post.find({author_url: authorUrl}).sort({create_at: -1}).limit(100).exec(function (err, docs) {
const authorId = req.params.authorId;
Post.find({author_id: authorId}).sort({id: -1}).limit(100).exec(function (err, docs) {
if (err) {
return next(err);
}
var authorName = '';
let authorName = '';
if (docs && docs.length) {
// 取最近一条帖子的昵称
authorName = docs[0].author;
authorName = docs[0].author_name;
}
docs = getDocsAuthorId(docs);
docs = fixDocs(docs);
res.render('author', {
authorId: authorId,
authorName: authorName,
authorUrl: authorUrl,
docs: docs
docs: docs,
});
});
});

// 启动爬虫
crawler.start();

var server = app.listen(config.port, function () {
let server = app.listen(config.port, function () {
console.log('app is listening ' + server.address().port);
});
21 changes: 16 additions & 5 deletions config.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,19 @@
var config = {
mongodb_url: process.env.MONGOHQ_URL || 'mongodb://localhost/haixiu',
port: process.env.PORT || 3000,
douban_cookie: 'viewed="3590768_1016272"; ll="108296"; bid="hWcTdvDyOYI"; ct=y; _ga=GA1.2.1167184662.1412271513; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1415179087%2C%22https%3A%2F%2Fwww.google.com%2F%22%5D; __utmt=1; as="http://www.douban.com/"; dbcl2="105659582:Xb5v0vuj9OM"; ck="GKUf"; _pk_id.100001.8cb4=d9658059b6096e90.1412332268.27.1415180767.1415172287.; _pk_ses.100001.8cb4=*; push_noty_num=0; push_doumail_num=0; __utma=30149280.1167184662.1412271513.1415166517.1415179088.44; __utmb=30149280.29.10.1415179088; __utmc=30149280; __utmz=30149280.1415162297.42.25.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); __utmv=30149280.10565',
gaid: 'UA-26476625-4',
/**!
* haixiu - config.js
*
*/

'use strict';

/**
* Configuration values; environment variables override some of the defaults.
*/
// Application settings. Values read from process.env fall back to
// local-development defaults when the variable is unset or empty.
let config = {
  // MongoDB connection string (no explicit port, i.e. the driver default).
  mongodb_url: process.env.MONGOHQ_URL || 'mongodb://127.0.0.1/haixiu',
  // HTTP port the Express app listens on. It must not default to 27017:
  // that is MongoDB's own default port, so a local mongod would already
  // hold it and app.listen() would fail with EADDRINUSE.
  port: process.env.PORT || 3000,
  // API key handed to the Douban client; empty string when not configured.
  apikey: process.env.DB_APIKEY || '',
  // Name of the group the crawler reads topics from.
  groupName: 'haixiuzu',
  fetchPage: 20, // crawl the 20 most recent pages
};

exports = module.exports = config;
123 changes: 56 additions & 67 deletions crawler.js
Original file line number Diff line number Diff line change
@@ -1,80 +1,69 @@
var superagent = require('superagent');
var async = require('async');
var _ = require('lodash');
var cheerio = require('cheerio');
var model = require('./model');
var Post = model.Post;
var eventproxy = require('eventproxy');
var config = require('./config');
/**!
* haixiu - crawler.js
*
*/

var q = async.queue(function (task, callback) {
var postInfo = task;
var ep = new eventproxy();
ep.fail(callback);
'use strict';

// 如果帖子已经抓取过就不再抓取
Post.findOne({url: postInfo.url}, ep.done(function (post) {
if (post) {
return ep.emit('got_author');
}
ep.emit('fetch_author');
}));
/**
* Module dependencies.
*/
const Douban = require('./lib/douban');
const config = require('./config');
const model = require('./model');
const _ = require('lodash');
const co = require('co');

const DB = new Douban({
apikey: config.apikey,
});

ep.all('fetch_author', function () {
superagent.get(postInfo.author_url)
.set('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36')
.set('Cookie', config.douban_cookie)
.end(ep.done(function (res) {
if (res.status !== 200) {
console.error(403, postInfo.author_url);
ep.emit('got_author');
return;
}
var $ = cheerio.load(res.text);
var location = $('.loc').text().replace('常居: \n', '').trim();
var post = new Post({
url: postInfo.url,
title: postInfo.title,
imgs: postInfo.imgs,
author: postInfo.author,
author_url: postInfo.author_url,
author_location: location,
});
post.save(ep.done(function () {
console.log('got %s', postInfo.title);
ep.emit('got_author');
}));
}));
});
const Post = model.Post;

/**
 * Report a failure to the console: the stack trace (when there is one) on
 * stderr, followed by the raw value on stdout.
 *
 * @param {*} err - Usually an Error, but may be any value, including
 *   null/undefined. Those must not make the handler itself throw, or the
 *   original failure would be replaced by a TypeError.
 */
function onerror(err) {
  console.error((err && err.stack) || err);
  console.log(err);
}

ep.all('got_author', function () {
callback();
});
// 并发数
}, 1);
/**
 * Insert or refresh the stored Post for a single group topic.
 *
 * Generator intended to be driven by a coroutine runner: it first yields the
 * `findOne` lookup for the topic id, then yields the `update` upsert, and
 * finally returns whatever that second yield evaluates to. Image URLs already
 * stored for the topic are merged with the newly supplied ones.
 *
 * @param {Object} [topic] - Topic record; reads `id`, `title`, `photos`
 *   (the `alt` of each entry) and `authorInfo` (`id`, `name`, `alt`,
 *   `loc_name`).
 * @returns {*} Result of the upsert as delivered by the runner.
 */
function* handleTopic(topic) {
  const source = topic || {};
  const id = source.id;
  const incoming = _.pluck(source.photos, 'alt');

  // Keep images saved by an earlier crawl of the same topic.
  const stored = yield Post.findOne({id: id}).exec();
  const mergedImgs = stored ? _.union(incoming, stored.imgs) : incoming;

  const author = source.authorInfo;
  const record = {
    id: id,
    url: `http://www.douban.com/group/topic/${id}/`,
    title: source.title,
    imgs: mergedImgs,
    author_id: author.id,
    author_name: author.name,
    author_url: author.alt,
    author_location: author.loc_name || '',
    update_at: new Date(),
  };

  const saved = yield Post.update({id: id}, record, {upsert: true}).exec();
  return saved;
}

/**
 * Crawl the most recent pages of the configured group and store each topic.
 *
 * Runs as a `co` coroutine. Pages 1..config.fetchPage are processed one after
 * another; for every topic the author's profile is looked up and attached as
 * `topic.authorInfo` before the topic is passed to handleTopic(). A failure
 * anywhere aborts the current run and is reported through onerror().
 */
function fetchHaixiuzu() {
  co(function* () {
    for (let page = 1; page <= config.fetchPage; page++) {
      // The Douban client calls must be yielded so the runner resolves them.
      // NOTE(review): lib/douban is not visible here; presumably these return
      // promises/generators. Used un-yielded, `topics.length` would be
      // undefined and the loop would silently process nothing.
      const topics = yield DB.groupTopic(config.groupName, page);
      for (let i = 0; i < topics.length; i++) {
        const topic = topics[i];
        topic.authorInfo = yield DB.user((topic.author || {}).id);
        yield handleTopic(topic);
      }
    }
  }).catch(onerror);
}

exports.start = function () {
fetchHaixiuzu();

// 每分钟运行一次
setInterval(fetchHaixiuzu, 60 * 1000);
// 每10分钟运行一次
setInterval(fetchHaixiuzu, 10 * 60 * 1000);
};
Loading

0 comments on commit 004da91

Please sign in to comment.