-
Notifications
You must be signed in to change notification settings - Fork 61
/
StoryParser.java
296 lines (252 loc) · 9.49 KB
/
StoryParser.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
package com.airlocksoftware.hackernews.parser;
import android.content.Context;
import android.util.Log;
import com.airlocksoftware.hackernews.data.ConnectionManager;
import com.airlocksoftware.hackernews.data.UserPrefs;
import com.airlocksoftware.hackernews.model.*;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Connection;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.ListIterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class StoryParser {
private static final String TAG = StoryParser.class.getSimpleName();
private static final int NO_POSITION = -1;
// Matches the first run of digits in a piece of text (used for comment / point counts, e.g. "20 comments")
private static final Pattern NUM_COMMENTS_PATTERN = Pattern.compile("\\d+");
/**
 * Parses one page of stories from the Front Page, Ask, Best, or New listings.
 *
 * @param context  handed through to parseStories()
 * @param page     which listing to load
 * @param request  the kind of request; Request.MORE marks a continuation load
 * @param moreFnid link of the continuation page, or null when there is none
 * @return the parsed response; a successful continuation load is reported as Result.MORE
 */
public static StoryResponse parseStoryList(Context context, Page page, Request request, String moreFnid) {
  final boolean loadingMore = request == Request.MORE && moreFnid != null;
  StoryResponse parsed = parseStories(context, page, generateUrlExtension(request, page, moreFnid));

  // parseStories() has no notion of MORE, so a successful continuation load is relabelled here
  if (loadingMore && parsed.result == Result.SUCCESS) {
    parsed.result = Result.MORE;
  }
  return parsed;
}
/**
 * Generates the extension that we're trying to load (goes on the end of ConnectionManager.BASE_URL).
 *
 * For a Request.MORE request with a non-null moreFnid the extension is just "/" + moreFnid: the fnid
 * is the href of the page's "More" link with its leading slash stripped, so it already identifies the
 * listing. Appending the page suffix after it as well would corrupt the link.
 *
 * @param request  the kind of request being made
 * @param page     the listing being loaded; only consulted when this is not a continuation load
 * @param moreFnid link of the continuation page, or null
 * @return the path to append to the base URL, always starting with "/"
 */
private static String generateUrlExtension(Request request, Page page, String moreFnid) {
  if (moreFnid != null && request == Request.MORE) {
    // the continuation link is complete on its own
    return "/" + moreFnid;
  }
  switch (page) {
    case ASK:
      return "/ask";
    case BEST:
      return "/best";
    case NEW:
      return "/newest";
    case ACTIVE:
      return "/active";
    default:
      // every other page is loaded from the root
      return "/";
  }
}
/**
 * Parses stories from a user's submissions page.
 *
 * @param context  handed through to parseStories()
 * @param username the user whose submissions are loaded; must not be blank
 * @param moreFnid link of the continuation page; when blank the first page is loaded
 * @return the parsed response; a successful continuation load is reported as Result.MORE
 * @throws RuntimeException if username is blank
 */
public static StoryResponse parseUserSubmissions(Context context, String username, String moreFnid) {
  if (StringUtils.isBlank(username)) {
    throw new RuntimeException("StoryParser.parseUserSubmissions received a blank username");
  }

  final boolean loadingMore = StringUtils.isNotBlank(moreFnid);
  String urlExtension;
  if (loadingMore) {
    urlExtension = "/" + moreFnid;
  } else {
    urlExtension = "/submitted?id=" + username;
  }

  StoryResponse parsed = parseStories(context, Page.USER, urlExtension);
  if (parsed.result == Result.SUCCESS && loadingMore) {
    // a successful continuation load is reported as MORE rather than SUCCESS
    parsed.result = Result.MORE;
  }
  return parsed;
}
/**
 * Downloads the page at urlExtension and turns every (rank, subtext) pair on it into a Story.
 *
 * The result is FNID_EXPIRED when the page body is exactly "Unknown or expired link.", FAILURE when
 * loading or parsing throws or when no story was parsed, and SUCCESS otherwise.
 */
private static StoryResponse parseStories(Context context, Page page, String urlExtension) {
  StoryResponse parsed = new StoryResponse();
  parsed.stories = new ArrayList<Story>();
  parsed.result = Result.SUCCESS; // stays SUCCESS unless one of the checks below says otherwise

  try {
    String cookie = new UserPrefs(context).getUserCookie();
    boolean loggedIn = cookie != null;
    Document html = getDocument(urlExtension, cookie);

    // an expired fnid yields a page whose whole body text is this one sentence
    if (html.body().text().equals("Unknown or expired link.")) {
      parsed.result = Result.FNID_EXPIRED;
      return parsed;
    }

    Elements ranks = html.select("span.rank"); // story rank numbers carry this class
    Elements subtexts = html.select("td.subtext");

    // walk both lists in lock-step, stopping at the end of the shorter one
    int pairs = Math.min(ranks.size(), subtexts.size());
    for (int i = 0; i < pairs; i++) {
      Element titleRow = ranks.get(i).parent().parent();
      Story story = parseStory(titleRow, subtexts.get(i), loggedIn);
      story.page = page;
      parsed.stories.add(story);
    }

    parsed.timestamp = getNewTimestamp(html);
  } catch (IOException e) {
    parsed.result = Result.FAILURE;
  } catch (NumberFormatException e) {
    parsed.result = Result.FAILURE;
  } catch (NullPointerException e) {
    parsed.result = Result.FAILURE;
  }

  // a response without any stories counts as a failure
  boolean noStories = parsed.stories == null || parsed.stories.isEmpty();
  if (noStories) {
    parsed.result = Result.FAILURE;
  }
  return parsed;
}
/**
 * Fetches the document at urlExtension, using an authenticated connection when a user cookie is
 * available and an anonymous one otherwise.
 *
 * @throws IOException if the request fails
 */
private static Document getDocument(String urlExtension, String userCookie) throws IOException {
  Connection connection = (userCookie == null)
      ? ConnectionManager.anonConnect(urlExtension)
      : ConnectionManager.authConnect(urlExtension, userCookie);
  return connection.get();
}
/**
 * Builds a timestamp from the page's "More" link.
 *
 * @return a StoryTimestamp holding the link's href (without a leading slash) and the current time,
 *         or null when the page has no "More" link
 */
private static StoryTimestamp getNewTimestamp(Document doc) {
  Element moreLink = doc.select("td.title a:matchesOwn(^More$)").first();
  if (moreLink == null) {
    return null;
  }

  String href = moreLink.attr("href");
  StoryTimestamp stamp = new StoryTimestamp();
  // the leading slash is dropped because the url-extension code adds its own
  stamp.fnid = href.startsWith("/") ? href.substring(1) : href;
  stamp.time = System.currentTimeMillis();
  return stamp;
}
/** Outcome of a story-list load: a result code, the parsed stories, and the continuation timestamp. */
public static class StoryResponse {
  // NULL_RESPONSE :: A response with all fields set to `null`
  public static final StoryResponse NULL_RESPONSE = new StoryResponse();

  public Result result = null;
  public List<Story> stories = null;
  public StoryTimestamp timestamp = null;

  /** Returns true when this response equals NULL_RESPONSE, i.e. every field is still null. */
  public boolean isNull() {
    return (this.equals(NULL_RESPONSE));
  }

  /**
   * Two responses are equal when all three fields refer to the very same objects. The comparison is
   * by reference, not by content.
   */
  @Override
  public boolean equals(Object other) {
    if (other == null) return false;
    if (other == this) return true;
    if (!(other instanceof StoryResponse)) return false;
    StoryResponse o = (StoryResponse) other;
    return (result == o.result && stories == o.stories && timestamp == o.timestamp);
  }

  /**
   * Hash code consistent with equals(): because equals() compares the fields by reference, the hash
   * is built from identity hash codes (0 for null), so equal responses always hash alike.
   */
  @Override
  public int hashCode() {
    int hash = 17;
    hash = 31 * hash + System.identityHashCode(result);
    hash = 31 * hash + System.identityHashCode(stories);
    hash = 31 * hash + System.identityHashCode(timestamp);
    return hash;
  }
}
/**
* Parses a story from the two tags we can reach with "td.title:containsOwn(.)" and
* "td.subtext"
*
* Parsing relies on exceptions for control flow: any exception thrown while reading the title or
* subtext is taken to mean the row is a YCombinator jobs post, in which case the subtext-derived
* fields are reset and whatever was read before the failure is kept.
*
* @param title    element containing the story's "td.title > a" link
* @param subtext  the matching "td.subtext" element
* @param loggedIn whether to also read the vote link (isUpvoted, whence, auth)
* @return the populated Story; never null
*
* TODO figure out a better way of parsing than try / catching exceptions
*/
public static Story parseStory(Element title, Element subtext, boolean loggedIn) {
Story story = new Story();
story.position = parsePosition(title);
// remembered outside the try so the catch block below can still use it
String potentialJobsUrl = null;
try {
Element titleLink = title.select("td.title > a")
.first();
story.title = titleLink.text();
// try to get url & domain, if it fails you're on a self post
try {
story.url = titleLink.attr("href");
// if url starts with item?id, it's a self post & may potentially be a url for a jobs post
if (story.url.startsWith("item?id=")) potentialJobsUrl = ConnectionManager.BASE_URL + "/" + story.url;
story.domain = parseDomain(title);
} catch (NullPointerException e) {
// presumably thrown by parseDomain() when the row has no span.comhead; both fields are cleared
story.url = null;
story.domain = null;
}
story.ago = parseAgo(subtext);
story.storyId = parseStoryId(subtext);
// if the user is logged in, get isUpvoted, go_to, and auth
if (loggedIn) {
// defaults for when no vote link is found: the story counts as already upvoted
story.isUpvoted = true;
story.whence = null;
story.auth = null;
Element voteAnchor = title.select("a[href^=vote]")
.first();
if (voteAnchor != null) {
// break the vote href into alternating keys and values
String[] voteHref = voteAnchor.attr("href")
.split("[=&]");
story.isUpvoted = false;
// whence is the last segment of the href
story.whence = voteHref[voteHref.length - 1];
// NOTE(review): fixed index 7 assumes a particular parameter order in the vote href; a shorter
// href throws here and lands in the catch below - confirm against the current page markup
story.auth = voteHref[7];
}
}
story.numPoints = parseNumPoints(subtext);
story.username = (subtext.select("a[href^=user]").text());
story.numComments = parseNumComments(subtext);
} catch (Exception e) {
// this means it's a YCombinator jobs post
// fields not reset here (e.g. title, ago, isUpvoted) keep whatever value they had at the failure
story.storyId = 0;
story.whence = null;
story.numPoints = 0;
story.username = null;
story.numComments = 0;
if (potentialJobsUrl != null) story.url = potentialJobsUrl;
}
return story;
}
/**
 * Reads the number of comments from the last child of the subtext element, e.g.
 * {@code <a href="item?id=9029159">20 comments</a>}.
 *
 * Best-effort: an element without children, link text without digits, or any runtime failure while
 * reading it yields 0. JVM Errors are deliberately not caught.
 *
 * @param subtext the story's "td.subtext" element
 * @return the first number found in the last child's text, or 0 when none can be read
 */
private static int parseNumComments(Element subtext) {
  // kept outside the try so the log message can show what failed to parse
  String text = null;
  try {
    Elements children = subtext.children();
    if (children.isEmpty()) {
      return 0; // nothing to read a count from
    }
    text = children.get(children.size() - 1).text();
    Matcher matcher = NUM_COMMENTS_PATTERN.matcher(text);
    if (matcher.find()) {
      return Integer.parseInt(matcher.group());
    }
  } catch (NumberFormatException e) {
    // e.g. a digit run too long for an int
    Log.i(TAG, "Error parsing number of comments from: " + text, e);
  } catch (RuntimeException e) {
    Log.i(TAG, "Other error", e);
  }
  return 0;
}
/**
 * Reads the point count from the first "span.score" element: the first whitespace-separated token
 * of its text, parsed as an int. Throws when the element is missing or the token is not a number.
 */
private static int parseNumPoints(Element subtext) {
  Element score = subtext.select("span.score").first();
  String[] words = score.text().split("\\s");
  return Integer.parseInt(words[0]);
}
// private static boolean parseHasUpvoteButton(Element voteAnchor) {
// Elements voteButtons = voteAnchor.select("img[src=http://ycombinator.com/images/grayarrow.gif]");
// return voteButtons.size() == 1;
// }
/**
 * Reads the story id from the subtext's item link: the part of the href after the first "=",
 * parsed as a long. Throws when no such link exists or the id is not a number.
 */
private static long parseStoryId(Element subtext) {
  String href = subtext.select("a[href^=item]").attr("href");
  String[] parts = href.split("=");
  return Long.parseLong(parts[1]);
}
/** Returns the text of the second link in the subtext, with any "|" removed and whitespace trimmed. */
private static String parseAgo(Element subtext) {
  Elements links = subtext.select("a");
  String text = links.get(1).text();
  return text.replace("|", "").trim();
}
/**
 * Returns the text of the first "span.comhead" element without its first and last characters
 * (the surrounding parens). Throws when the title has no such element.
 */
private static String parseDomain(Element title) {
  Element comhead = title.select("span.comhead").first();
  String wrapped = comhead.text().trim();
  // drop the first and last character, i.e. the parens around the domain
  return wrapped.substring(1, wrapped.length() - 1);
}
/**
 * Gets the story's position (i.e. 1st, 2nd, 3rd, etc) on the page from the text of the title's
 * first child, ignoring any "." in it.
 *
 * @return the position, or NO_POSITION when it cannot be read
 */
private static int parsePosition(Element title) {
  int position;
  try {
    String rank = title.child(0).text();
    position = Integer.parseInt(rank.replace(".", ""));
  } catch (Exception e) { // TODO fix exception catch'em all!
    // this means we're on the comments page
    position = NO_POSITION;
  }
  return position;
}
}