This repository has been archived by the owner on Nov 11, 2023. It is now read-only.
forked from openeventdata/stanford_pipeline
-
Notifications
You must be signed in to change notification settings - Fork 0
/
process.py
138 lines (107 loc) · 3.81 KB
/
process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# -*- encoding=utf-8 -*-
import os
import glob
import parser
import logging
import datetime
from pymongo import MongoClient
from ConfigParser import ConfigParser
def make_conn(db_auth, db_user, db_pass, db_host=None):
    """
    Open a MongoDB connection and return the collection of scraped stories.

    Parameters
    ----------

    db_auth: String.
                MongoDB database that should be used for user authentication.
                When empty/falsy, no authentication is attempted.

    db_user: String.
                Username for MongoDB authentication.

    db_pass: String.
                Password for MongoDB authentication.

    db_host: String, optional.
                Host passed to ``MongoClient``. When empty/falsy,
                ``MongoClient`` is created with no arguments.

    Returns
    -------

    collection: pymongo.collection.Collection.
                The ``stories`` collection of the ``event_scrape`` database.
    """
    # Only hand a host to the driver when one was actually supplied.
    mongo = MongoClient(db_host) if db_host else MongoClient()

    # Authentication is optional and is done against the named auth database.
    if db_auth:
        mongo[db_auth].authenticate(db_user, db_pass)

    return mongo.event_scrape['stories']
def query_today(collection, date):
    """
    Fetch the stories added during the day leading up to ``date`` that have
    not been Stanford parsed yet (``"stanford": 0``).

    Parameters
    ----------

    collection: pymongo.collection.Collection.
                Collection within MongoDB that holds the scraped news stories.

    date: datetime.datetime.
                Upper bound (inclusive) of the query window. One day is
                subtracted from it to get the exclusive lower bound.

    Returns
    -------

    posts: pymongo.cursor.Cursor.
                Results from the MongoDB query.
    """
    log = logging.getLogger('stanford')

    # Window is (date - 1 day, date]: exclusive start, inclusive end.
    window_start = date - datetime.timedelta(days=1)
    criteria = {
        "$and": [
            {"date_added": {"$lte": date}},
            {"date_added": {"$gt": window_start}},
            {"stanford": 0},
        ]
    }

    posts = collection.find(criteria)
    story_total = posts.count()
    log.info('Returning {} total stories.'.format(story_total))

    return posts
def _parse_config(cparser):
    """
    Pull the pipeline settings out of an already-loaded config parser.

    Parameters
    ----------

    cparser: ConfigParser.
                Parser that has already read the config file.

    Returns
    -------

    settings: Tuple.
                ``(stanford_dir, log_dir, auth_db, auth_user, auth_pass,
                db_host)``. ``log_dir`` is ``''`` when there is no
                ``Logging`` section. When there is no ``Auth`` section the
                three auth values are ``''`` and ``db_host`` comes from the
                ``MONGO_HOST`` environment variable (``None`` if unset).

    Raises
    ------

    Exception
                Any error raised while reading the options is printed and
                then re-raised unchanged.
    """
    try:
        stanford_dir = cparser.get('StanfordNLP', 'stanford_dir')

        if 'Logging' in cparser.sections():
            log_dir = cparser.get('Logging', 'log_file')
        else:
            log_dir = ''

        if 'Auth' in cparser.sections():
            auth_db = cparser.get('Auth', 'auth_db')
            auth_user = cparser.get('Auth', 'auth_user')
            auth_pass = cparser.get('Auth', 'auth_pass')
            db_host = cparser.get('Auth', 'db_host')
        else:
            # No credentials configured: fall back to the environment for
            # the host only.
            auth_db = ''
            auth_user = ''
            auth_pass = ''
            db_host = os.getenv('MONGO_HOST')

        return stanford_dir, log_dir, auth_db, auth_user, auth_pass, db_host
    # ``except ... as`` and a single-argument ``print(...)`` behave the same
    # on Python 2.6+ and Python 3; the old comma/statement forms are a
    # SyntaxError on Python 3.
    except Exception as e:
        print('There was an error parsing the config file. {}'.format(e))
        raise
def parse_config():
    """
    Locate and parse the config file.

    A ``config.ini`` matched relative to the current working directory is
    preferred; otherwise ``default_config.ini`` located next to this module
    is read.

    Returns
    -------

    settings: Tuple.
                Whatever ``_parse_config`` returns for the loaded parser.
    """
    local_configs = glob.glob('config.ini')
    if local_configs:
        # ``read`` accepts the list of matches returned by ``glob``.
        source = local_configs
    else:
        module_dir = os.path.abspath(os.path.dirname(__file__))
        source = os.path.join(module_dir, 'default_config.ini')

    cparser = ConfigParser()
    cparser.read(source)
    return _parse_config(cparser)
def run():
    """
    Run the pipeline once.

    Reads the configuration, attaches an appending file handler to the
    ``stanford`` logger, queries the stories added in the day before the
    current UTC time, and passes them to ``parser.stanford_parse``.
    """
    stanford_dir, log_dir, db_auth, db_user, db_pass, db_host = parse_config()

    # Setup the logging; fall back to a file in the working directory when
    # the config does not name a log file.
    logger = logging.getLogger('stanford')
    logger.setLevel(logging.INFO)

    log_path = log_dir if log_dir else 'stanford.log'
    file_handler = logging.FileHandler(log_path, 'a')
    file_handler.setFormatter(
        logging.Formatter('%(levelname)s %(asctime)s: %(message)s'))
    logger.addHandler(file_handler)

    logger.info('Running.')

    current_time = datetime.datetime.utcnow()
    stories_coll = make_conn(db_auth, db_user, db_pass, db_host)
    pending = query_today(stories_coll, current_time)
    parser.stanford_parse(stories_coll, pending, stanford_dir)
# Script entry point: run the pipeline only when executed directly, not when
# this module is imported.
if __name__ == "__main__":
    run()