This repository has been archived by the owner on Dec 12, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.coffee
122 lines (97 loc) · 3.48 KB
/
index.coffee
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
fs = require 'fs'
path = require 'path'
request = require 'request'
jsdom = require 'jsdom'
# utility function to call a callback on the next tick
callback = (cb) ->
args = [].slice.call(arguments, 1)
process.nextTick () ->
cb.apply undefined, args
# for a team, return a url to their "My FIRST" page
grab_team_url = (team, cb) ->
request { uri: "http://frclinks.frclinks.com/t/#{escape team}" }, (err, resp, body) ->
if err?
callback cb, err
return
if body.indexOf('No information') isnt -1
callback cb, '404'
return
body = body.split 'url='
if body.length < 2
callback cb, '404'
return
callback cb, undefined, body[1].split('"')[0]
# for a team, return the contents of their "My FIRST" page
grab_team_page = (team, cb) ->
grab_team_url team, (err, url) ->
if err?
callback cb, err
return
request { uri: url }, (err, resp, body) ->
if err?
callback cb, err
return
callback cb, undefined, body
# for a team, return a jQuerified jsDOM of their "My FIRST" page
grab_team_dom = (team, cb) ->
grab_team_page team, (err, body) ->
if err?
callback cb, err
return
jsdom.env { html: body, scripts: [ path.join process.cwd(), 'jquery-1.7.2.min.js' ] }, cb
# for a team, return a csv dump of their awards over the years
scrape_team_awards = (team, cb) ->
grab_team_dom team, (err, window) ->
if err?
callback cb, err, team
return
$ = window.jQuery
rows = $('th:contains(Season)')
.last()
.parent()
.parent() # at this point, we have the "Team History" table
.find('tr:not(:first)') # now we grab every row but the header
.map -> # return an array containing, for each row, ...
$(this)
.find('td')
.map -> # an array, containing...
$(this)
.text()
.trim()
.replace(/[^a-zA-Z0-9\&\' \n]/g, '')
.replace(/( )+/, ' ') # the cleaned, trimmed text of each td in the row
# format it as csv, filtering out rows where no awards were won
csv = do ->
for row in rows when row[2] isnt ''
# remove the year from the event name
row[1] = row[1].substring row[0].length + 1
# split up the per-year awards field into each individual award
awards = row[2].split('\n').filter (award) ->
award isnt ''
# filter out "2000 and prior award listings may be incomplete"
awards = awards.filter (award) ->
award.indexOf('incomplete') is -1
"#{team},#{row[0]},#{row[1]},#{award}" for award in awards
# flatten out the resulting array into a string
ret = ''
for item in csv
if Array.isArray item
ret += '\n' + item.join '\n'
else
ret += '\n' + item
ret = ret.trim()
ret = ret.replace /(\n)+/, '\n'
callback cb, undefined, team, ret
process.argv.splice 0, 2
min = parseInt process.argv[0]
max = parseInt process.argv[1] or min + 1
if not min? or not max? or isNaN(min) or isNaN(max) or max < min
console.warn 'Please specify a team range to scrape, in the format: [min] [max]'
process.exit 1
else
for i in [min..max]
scrape_team_awards i, (err, team, csv) ->
if err?
console.warn 'Error on team %s: ', team, err if err isnt '404'
else
console.log csv