-
Notifications
You must be signed in to change notification settings - Fork 0
/
run.py
36 lines (28 loc) · 1.05 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import argparse
import scrapy
from scrapy.crawler import CrawlerProcess
from news_scraper.spiders.foxnews_spider import FoxnewsSpider
from news_scraper.spiders.cnn_spider import CNNSpider
from scrapy.utils.project import get_project_settings
def main():
    """Parse command-line arguments and run the spider for the chosen source.

    Command-line arguments (all required):
        --source: news source to scrape; ``cnn`` or ``foxnews``.
        --start: scraping start date, parsed as an int.
        --end: scraping end date, parsed as an int.

    The ``(start, end)`` pair is stored in the project settings under
    ``WAYBACK_MACHINE_TIME_RANGE`` before the crawler process is created.

    Raises:
        SystemExit: raised through argparse (exit status 2, message on
            stderr) when an argument is missing or malformed, or when
            ``--source`` names an unsupported source.
    """
    parser = argparse.ArgumentParser(
        description='News parse configuration arguments')
    parser.add_argument('--source', type=str,
                        required=True, help='news source to scrape')
    parser.add_argument('--start', type=int,
                        required=True, help='scraping start date')
    parser.add_argument('--end', type=int,
                        required=True, help='scraping end date')
    args = parser.parse_args()

    # Resolve the spider before loading settings or building a crawler
    # process, so a bad --source fails fast with a non-zero exit status
    # instead of printing to stdout and returning success.
    if args.source == 'cnn':
        spider_cls = CNNSpider
    elif args.source == 'foxnews':
        spider_cls = FoxnewsSpider
    else:
        parser.error("Unsupported source " + args.source)

    settings = get_project_settings()
    settings.set('WAYBACK_MACHINE_TIME_RANGE', (args.start, args.end))

    process = CrawlerProcess(settings)
    process.crawl(spider_cls)
    process.start()  # the script will block here until the crawling is finished
# Run the crawl only when this file is executed as a script, not on import.
if "__main__" == __name__:
    main()