-
Notifications
You must be signed in to change notification settings - Fork 0
/
SerbianScrapper.java
153 lines (136 loc) · 5.1 KB
/
SerbianScrapper.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
package scraping;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.google.gson.JsonArray;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import scraping.Errors.BadDataException;
import scraping.Errors.NoDateException;
import scraping.Errors.NoJsonException;
import scraping.Errors.NoNameException;
import scraping.Errors.NoUrlException;
/**
 * Scrapper for act pages served from {@code pravno-informacioni-sistem.rs}.
 *
 * <p>For a given act URL it produces a {@link Result} holding the act's title and a
 * date string. Both are derived from a JSON object embedded in one of the page's
 * {@code <script>} elements (the one mentioning {@code actData}), from an ELI
 * {@code <meta>} tag, or from a linked document containing a {@code .potpis} element.
 * The stateless parsing helpers live in {@link Util}.
 */
public class SerbianScrapper extends Scrapper {

    // .reference-list.expanded
    public static final String LAST_REFERENCE_SELECTOR = "#collapseReferences > ul > div > ul > li > a";

    /**
     * Creates the scrapper and immediately invokes {@code start()}.
     *
     * @param links   queue handed to the {@code Scrapper} super constructor
     * @param results queue handed to the {@code Scrapper} super constructor
     */
    public SerbianScrapper(LinkQueue links, ResultQueue results) {
        super(links, results);
        // NOTE(review): start() is not defined in this class; presumably it is inherited
        // from Scrapper and begins processing - confirm it is safe to call before the
        // constructor has returned.
        start();
    }

    /** Stateless helpers that extract individual pieces of data from a parsed page. */
    public static class Util {

        /** Prefix prepended to the relative link found in the {@code ng-click} attribute. */
        private static final String SITE_ROOT = "http://www.pravno-informacioni-sistem.rs";

        /** Captures the single-quoted argument of a call expression such as {@code f('...')}. */
        private static final Pattern NG_CLICK_ARGUMENT = Pattern.compile("\\('(.*)'\\)");

        /**
         * Captures day, free text (expected to start with the month name) and a
         * four-digit year. The day needs at least one digit and the middle group is
         * reluctant so that the first four-digit run after the day is used as the year.
         */
        private static final Pattern SIGNATURE_DATE = Pattern.compile("([0-9]+)\\.(.*?)([0-9]{4})");

        /**
         * Finds the {@code <script>} element whose markup mentions {@code actData} and
         * parses the text between its first <code>{</code> and its last {@code ;} as JSON.
         *
         * <p>Scripts that mention {@code actData} but do not contain both delimiters in
         * that order are skipped instead of failing with an index error.
         *
         * @param doc parsed page
         * @return the embedded JSON object
         * @throws NoJsonException if no script yields a usable JSON span
         */
        public static JsonObject getJsonObject(Document doc) throws NoJsonException {
            for (Element script : doc.select("script")) {
                String markup = script.toString();
                if (!markup.contains("actData")) {
                    continue;
                }
                int start = markup.indexOf("{");
                int end = markup.lastIndexOf(";");
                if (start < 0 || end <= start) {
                    // No '{' ... ';' span to cut out; try the next script.
                    continue;
                }
                return new JsonParser().parse(markup.substring(start, end)).getAsJsonObject();
            }
            throw new NoJsonException();
        }

        /**
         * Builds the absolute URL of the document linked under the {@code <h5>} heading
         * whose text contains "Основни текст".
         *
         * <p>The link is read from the {@code ng-click} attribute of the first grandchild
         * of the heading's next sibling element. Headings whose surrounding markup does
         * not have that shape are skipped.
         *
         * @param doc parsed page
         * @return absolute URL, e.g.
         *         http://www.pravno-informacioni-sistem.rs/SlGlasnikPortal/viewdoc?actid=920142&doctype=og&abc=cba&injectEliUri=true
         * @throws NoUrlException if no matching heading provides such a link
         */
        public static String getUrl(Document doc) throws NoUrlException {
            for (Element heading : doc.select("h5")) {
                if (!heading.text().contains("Основни текст")) {
                    continue;
                }
                Element anchor = firstChildOrNull(firstChildOrNull(heading.nextElementSibling()));
                if (anchor == null) {
                    continue;
                }
                Matcher matcher = NG_CLICK_ARGUMENT.matcher(anchor.attr("ng-click"));
                if (matcher.find()) {
                    return SITE_ROOT + matcher.group(1);
                }
            }
            throw new NoUrlException();
        }

        /** Returns the first child element of {@code parent}, or {@code null} if there is none. */
        private static Element firstChildOrNull(Element parent) {
            if (parent == null) {
                return null;
            }
            Elements children = parent.children();
            return children.size() == 0 ? null : children.get(0);
        }

        /**
         * Extracts a date from the first {@code .potpis} element whose text contains
         * {@code <day>. <month name> <four-digit year>}.
         *
         * <p>The month is recognised by the first three characters of its name.
         * Elements whose month cannot be recognised are skipped rather than reported
         * with a placeholder month.
         *
         * @param doc parsed document that contains the {@code .potpis} element(s)
         * @return the date as {@code <day>.<month number>.<year>.}; the day is copied as
         *         written and the month is 1-12 without zero padding
         * @throws NoDateException if no element yields a recognisable date
         */
        public static String extractLastUpdateDate(Document doc) throws NoDateException {
            for (Element signature : doc.select(".potpis")) {
                Matcher matcher = SIGNATURE_DATE.matcher(signature.text());
                if (!matcher.find()) {
                    continue;
                }
                String month = monthNumber(matcher.group(2).trim());
                if (month == null) {
                    continue;
                }
                return matcher.group(1) + "." + month + "." + matcher.group(3) + ".";
            }
            throw new NoDateException();
        }

        /**
         * Maps text starting with a month name to its number.
         *
         * @param monthText trimmed text expected to begin with a month name
         * @return "1".."12", or {@code null} if the text is shorter than three characters
         *         or its three-character prefix is not a known month
         */
        private static String monthNumber(String monthText) {
            if (monthText.length() < 3) {
                return null;
            }
            switch (monthText.substring(0, 3)) {
                case "јан": return "1";
                case "феб": return "2";
                case "мар": return "3";
                case "апр": return "4";
                case "мај": return "5";
                case "јун": return "6";
                case "јул": return "7";
                case "авг": return "8";
                case "сеп": return "9";
                case "окт": return "10";
                case "нов": return "11";
                case "дец": return "12";
                default: return null;
            }
        }

        /**
         * Determines the date for the act shown in {@code doc}.
         *
         * <p>When the embedded JSON's {@code htmlLinks} array has at most one entry the
         * date comes from {@link #getMetaDate(Document)}; otherwise the document behind
         * {@link #getUrl(Document)} is downloaded and passed to
         * {@link #extractLastUpdateDate(Document)}.
         *
         * @param doc parsed act page
         * @return the date string produced by whichever source was used
         * @throws NoJsonException if the embedded JSON is missing or has no {@code htmlLinks} array
         * @throws NoUrlException  if the linked document's URL cannot be found
         * @throws IOException     if downloading the linked document fails
         * @throws NoDateException if the chosen source contains no usable date
         */
        public static String getLastUpdateDate(Document doc) throws NoJsonException, NoUrlException, IOException, NoDateException {
            return lastUpdateDate(doc, getJsonObject(doc));
        }

        /** Same as {@link #getLastUpdateDate(Document)} but reuses an already parsed JSON object. */
        private static String lastUpdateDate(Document doc, JsonObject json) throws NoJsonException, NoUrlException, IOException, NoDateException {
            if (!json.has("htmlLinks") || !json.get("htmlLinks").isJsonArray()) {
                // JSON was found but lacks the expected structure.
                throw new NoJsonException();
            }
            JsonArray htmlLinks = json.get("htmlLinks").getAsJsonArray();
            if (htmlLinks.size() <= 1) {
                return getMetaDate(doc);
            }
            Document dateDoc = Jsoup.connect(getUrl(doc)).get();
            return extractLastUpdateDate(dateDoc);
        }

        /**
         * Reads the {@code eli:date_document} meta tag and reorders its three
         * dash-separated parts (presumably {@code yyyy-MM-dd}) into
         * {@code <third>.<second>.<first>.}.
         *
         * @param doc parsed page
         * @return the reordered date string
         * @throws NoDateException if the tag is absent or its content does not split
         *                         into exactly three parts
         */
        public static String getMetaDate(Document doc) throws NoDateException {
            Elements metas = doc.select("meta[property=\"eli:date_document\"]");
            if (metas.size() == 0) {
                throw new NoDateException();
            }
            String[] parts = metas.get(0).attr("content").split("-");
            if (parts.length != 3) {
                throw new NoDateException();
            }
            return parts[2] + "." + parts[1] + "." + parts[0] + ".";
        }

        /**
         * Returns the {@code baseTitle} value of the JSON embedded in the page.
         *
         * @param doc parsed page
         * @return the act's title
         * @throws NoNameException if the JSON is missing or has no {@code baseTitle} value
         */
        public static String getLawName(Document doc) throws NoNameException {
            try {
                return lawName(getJsonObject(doc));
            } catch (NoJsonException e) {
                throw new NoNameException();
            }
        }

        /** Reads {@code baseTitle} from an already parsed JSON object. */
        private static String lawName(JsonObject json) throws NoNameException {
            if (!json.has("baseTitle") || !json.get("baseTitle").isJsonPrimitive()) {
                throw new NoNameException();
            }
            return json.get("baseTitle").getAsString();
        }
    }

    /**
     * Downloads {@code url} and extracts the act's title and date.
     *
     * <p>The embedded JSON is parsed once and shared between the title and date lookups.
     *
     * @param url address of the act page
     * @return result holding the title and the date string
     * @throws IOException      if the page or the linked date document cannot be downloaded
     * @throws NoNameException  if the JSON has no {@code baseTitle} value
     * @throws NoDateException  if no date can be extracted
     * @throws BadDataException if the embedded JSON or the linked document's URL is missing
     */
    @Override
    public Result scrapeData(String url) throws IOException, NoNameException, NoDateException, BadDataException {
        Document doc = Jsoup.connect(url).get();
        try {
            JsonObject json = Util.getJsonObject(doc);
            String name = Util.lawName(json);
            String date = Util.lastUpdateDate(doc, json);
            return new Result(name, date);
        } catch (NoJsonException | NoUrlException e) {
            throw new BadDataException();
        }
    }
}