-
-
Notifications
You must be signed in to change notification settings - Fork 201
/
scrape.jl
executable file
·75 lines (59 loc) · 2.01 KB
/
scrape.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
module ScrapeJuliajl
"""
    scrape_md(filename)

Extract package records from the markdown file `filename`.

The category is the file's base name up to its first `.`. Files whose category is
`LICENSE` or `README` are not processed and yield an empty vector. Every other file is
read line by line, each line stripped of surrounding whitespace:

- a line starting with `#` sets the current subcategory to the whitespace-separated
  words following the first token, concatenated without a separator;
- a line starting with `+` of the form `+ [name](url) :: description` yields a record,
  provided the URL is longer than four characters and starts with `http`.

A one-line summary is printed to standard output for each processed file.

# Returns
A `Vector{NTuple{5,String}}` of `(category, subcategory, name, url, description)`.
"""
function scrape_md(filename)
    records = NTuple{5,String}[]

    # get the category, don't process some files
    category = first(split(basename(filename), "."))
    category in ("LICENSE", "README") && return records

    subcategory = ""

    # process the lines; try/finally closes the file even if a line fails to parse
    io = open(filename)
    try
        for raw in eachline(io)
            line = strip(raw)
            isempty(line) && continue
            marker = first(line)

            if marker == '#'
                # handle subcategories: drop the leading "#..." token, glue the rest
                subcategory = join(split(line)[2:end])
            elseif marker == '+'
                # collect repo info
                parts = split(line, " :: ")
                link = split(parts[1], "](")
                length(link) == 2 || continue

                # chop works on characters, so non-ASCII names/URLs cannot hit an
                # invalid byte index the way `s[4:end]` / `s[1:end-1]` can
                reponame = chop(link[1]; head=3, tail=0)  # drop the leading "+ ["
                repourl = chop(link[2])                   # drop the trailing ")"
                (length(repourl) > 4 && startswith(repourl, "http")) || continue

                # re-join with the original delimiter so a description that itself
                # contains " :: " is restored verbatim; no parts gives ""
                desc = join(parts[2:end], " :: ")
                push!(records, (category, subcategory, reponame, repourl, desc))
            end
        end
    finally
        close(io)
    end

    println("Processed $(length(records)) records in category $category.")
    return records
end
"""
    write_csv_line(io::IO, record::NTuple{N,T}) where {N, T<:AbstractString}

Write the fields of `record` to `io` as one comma-separated line ending in a line feed.

A field is wrapped in double quotes when it contains a comma, a double quote, a line
feed or a carriage return; inside a quoted field every double quote is doubled. All
other fields are written unchanged.

# Returns
`nothing`.
"""
function write_csv_line(io::IO, record::NTuple{N,T}) where {N, T<:AbstractString}
    for (i, field) in enumerate(record)
        # line breaks must be quoted too, otherwise one record spans several rows
        if any(c -> c in (',', '"', '\n', '\r'), field)
            write(io, '"', replace(field, '"' => "\"\""), '"')
        else
            write(io, field)
        end
        # the last field ends the line, every other field is followed by a comma
        write(io, i == N ? '\n' : ',')
    end
    return nothing
end
# -----------------------------------------------------------------
import Glob: glob
# Root directory: the parent of the directory returned by Base.source_dir(),
# expressed relative to the current working directory.
const _dir = joinpath(relpath(Base.source_dir()), "../")

# loop over all markdown files in the root directory, appending the records to the list
records = NTuple{5,String}[]
for filename in glob(joinpath(_dir, "*.md"))
    append!(records, scrape_md(filename))
end

# save a csv file; the do-block closes it even if writing a record throws
println("Writing out $(length(records)) records.")
open(joinpath(_dir, "db.csv"), "w") do io
    for record in records
        write_csv_line(io, record)
    end
end
end # module