-
Notifications
You must be signed in to change notification settings - Fork 1
/
web scraping.py
170 lines (153 loc) · 6.29 KB
/
web scraping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
#This code is used to scrape data(news title, date, contents, # of reviews, # of participants and first 10 reviews)
#from www.sina.com.cn but can be easily adjusted for other websites.
# -*- coding:utf-8 -*-
import scrapy
#import time
import csv
from sina.items import SinaItem
#from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.contrib.spiders import CrawlSpider
from scrapy.contrib.spiders import Rule
from scrapy.spider import Spider
from scrapy.http import Request
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
class SinaNewsSpider(CrawlSpider): #Scrape the news title, date and contents
name = 'news'
allowed_domains = ['sina.com.cn']
start_urls = [
'http://news.sina.com.cn/'
'http://finance.sina.com.cn/',
'http://mil.news.sina.com.cn/',
'http://news.sina.com.cn/society/',
'http://tech.sina.com.cn/',
'http://mobile.sina.com.cn/',
'http://sports.sina.com.cn/',
'http://ent.sina.com.cn/',
'http://auto.sina.com.cn/',
'http://book.sina.com.cn/',
'http://edu.sina.com.cn/',
'http://health.sina.com.cn/',
'http://eladies.sina.com.cn/',
'http://collection.sina.com.cn/',
'http://fashion.sina.com.cn/',
'http://sky.news.sina.com.cn/',
'http://edu.sina.com.cn/a/',
'http://games.sina.com.cn/',
'http://golf.sina.com.cn/',
'http://fo.sina.com.cn/']
rules = [
Rule(LinkExtractor(allow=('news\.sina\.com\.cn/.*/?2015[-\d/]{1}',
'finance\.sina\.com\.cn/.*/?2015[-\d/]{1}',
'tech\.sina\.com\.cn/.*/?2015[-\d/]{1}',
'sports\.sina\.com\.cn/.*/?2015[-\d/]{1}',
'ent\.sina\.com\.cn/.*/?2015[-\d/]{1}',
'//auto\.sina\.com\.cn/.*/?2015[-\d/]{1}',
'book\.sina\.com\.cn/.*/?2015[-\d/]{1}',
'edu\.sina\.com\.cn/.*/?2015[-\d/]{1}',
'health\.sina\.com\.cn/.*/?2015[-\d/]{1}',
'eladies\.sina\.com\.cn/.*/?2015[-\d/]{1}',
'collection\.sina\.com\.cn/.*/?2015[-\d/]{1}',
'fashion\.sina\.com\.cn/.*/?2015[-\d/]{1}',
'games\.sina\.com\.cn/.*/?2015[-\d/]{1}',
'fo\.sina\.com\.cn/.*/?2015[-\d/]{1}'),deny=('html.+','/l/')),callback = 'parse_item',follow = True)]
def parse_item(self,response):
url = str(response.url)
#Title Extraction
title = [n.encode('utf-8') for n in response.xpath('//h1[contains(@id,"artibodyTitle")]/text()').extract()]
#title = response.xpath('//head/title/text()').extract()
#Date Exctration
Date = response.xpath('//span[contains(@id,"pub_date")]/text()').extract() #html
Date += response.xpath('//span[contains(@class,"time-source")]/text()').extract() #shtml
date = [n.encode('utf-8') for n in Date]
#Contents Extraction
contents = ''
for body in response.xpath('//div[contains(@id,"artibody")]//p/text()'):
for n in body.extract():
contents += n.encode('utf-8')
#Dynamic Contents Extraction:(1)Extract News ID and review channel
newsID = ''
channel = ''
raw = str(response.xpath('//meta[contains(@content,"comment_channel")]/@content').extract())
real=''
i = 3
while i <= len(raw)-3:
real += raw[i]
i = i + 1
final=[]
for s in real.split(':'):
ss = s.split(';')
for eachone in ss:
final.append(eachone)
i = 0
while i < len(final):
if 'comment_id' in final[i]:
i = i + 1
newsID = final[i]
elif 'comment_channel' in final[i]:
i = i + 1
channel = final[i]
i = i + 1
#Dynamic Contents Extraction:(2)Generate the url of reviews
comment_url = 'http://comment5.news.sina.com.cn/comment/skin/default.html?channel='+channel+'&newsid='+newsID
if title and date and contents: #and newsID and channel:
item = SinaItem()
item['url'] = url.encode('utf-8')
item['comment_url'] = comment_url.encode('utf-8')
item['title'] = title
item['date'] = date
item['body'] = contents
yield item
class SinaCommentSpider(Spider): #Scrape the news reviews
name = 'comment'
allowed_domains = ['sina.com.cn']
start_urls = ['http://comment5.news.sina.com.cn/comment/skin/default.html?channel=kj&newsid=2-1-9977035']
def parse(self,response):
url = str(response.url)
if 'comment5' in url:
item = SinaItem()
#Dynamic Contents Extraction:(3)Open selenium explorer
driver = webdriver.Firefox()
driver.get(url)
#extracing news title(adjustment for abnormal urls)
try:
title = driver.find_element_by_xpath('//h1[@id = "J_NewsTitle"]/a').text.encode('utf-8') #when channel is not 'kj'
title = driver.find_element_by_xpath('//h1[@bbs-node-type = "title"]/a').text.encode('utf-8') #when channel is 'kj'
except:
title = ''
item['title'] = title
#Dynamic Contents Extraction:(4)Extracting # of reviews
contents = driver.find_elements_by_xpath('//span[contains(@class,"f_red")]') #when channel is not 'kj'
# contents = driver.find_elements_by_xpath('//span[contains(@class,"count")]//em') #when channel is 'kj'
try:
item['num_comment'] = contents[0].text.encode('utf-8')
except:
item['num_comment'] = 0
#Dynamic Contents Extraction:(5)Extractng # of participants
try:
item['num_part'] = contents[1].text.encode('utf-8')
except:
item['num_part'] = 0
#Dynamic Contents Extraction:(6)Extracting reviews
comments = ''
#when channel is not 'kj'
for comment in driver.find_elements_by_xpath('//div[@id="J_Comment_List_Hot"]//div[@class="orig_content"]'):
comments += comment.text.encode('utf-8')+'\n'+'\n'
for comment in driver.find_elements_by_xpath('//div[@id="J_Comment_List_Hot"]//div[@class="t_txt"]'):
comments += comment.text.encode('utf-8')+'\n'+'\n'
for comment in driver.find_elements_by_xpath('//div[@class="comment_item_page_first"]//div[@class="orig_content"]'):
comments += comment.text.encode('utf-8')+'\n'+'\n'
for comment in driver.find_elements_by_xpath('//div[@class="comment_item_page_first"]//div[@class="t_txt"]'):
comments += comment.text.encode('utf-8')+'\n'+'\n'
#when channel is 'kj'
# for comment in driver.find_elements_by_xpath('//div[@class="sina-comment-page sina-comment-page-show"]//div[@comment-type="itemTxt"]'):
# comments += comment.text.encode('utf-8')+'\n'+'\n'
item['comment'] = comments
driver.close()
yield item
#Obtain the url of next piece of news
reader = csv.reader(open('/media/sunzeyeah/Personal/SENIOR/Thesis/Data/Chinese/Sina/news_0317.csv','r'))
for line in reader:
if line[0] != 'comment_url':
yield Request(line[0], callback=self.parse)