-
-
Notifications
You must be signed in to change notification settings - Fork 17
/
BibTeXConverter.java
401 lines (373 loc) · 16.5 KB
/
BibTeXConverter.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
package de.undercouch.citeproc.bibtex;
import de.undercouch.citeproc.csl.CSLDate;
import de.undercouch.citeproc.csl.CSLItemData;
import de.undercouch.citeproc.csl.CSLItemDataBuilder;
import de.undercouch.citeproc.csl.CSLType;
import org.jbibtex.BibTeXDatabase;
import org.jbibtex.BibTeXEntry;
import org.jbibtex.BibTeXParser;
import org.jbibtex.BibTeXString;
import org.jbibtex.Key;
import org.jbibtex.LaTeXObject;
import org.jbibtex.LaTeXParser;
import org.jbibtex.LaTeXPrinter;
import org.jbibtex.ParseException;
import org.jbibtex.TokenMgrException;
import org.jbibtex.Value;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
/**
* <p>Converts BibTeX items to CSL citation items</p>
* <p>The class maps BibTeX attributes to CSL attributes. The mapping is
* based on the one used in <a href="http://www.docear.org">Docear</a> as
* <a href="http://www.docear.org/2012/08/08/docear4word-mapping-bibtex-fields-and-types-with-the-citation-style-language/">presented
* by Joeran Beel</a>.</p>
* <p>Docear is released under the GPLv2 but its code may also be reused in
* projects licensed under Apache License 2.0 (see
* <a href="http://www.docear.org/software/licence/">http://www.docear.org/software/licence/</a>,
* last visited 2013-09-06). The mapping here is released under the
* Apache License 2.0 by permission of Joeran Beel, Docear.</p>
* @author Joeran Beel
* @author Michel Kraemer
*/
public class BibTeXConverter {
// Names of the BibTeX fields read by toItemData(BibTeXEntry). They are
// spelled in lower case because field names are lower-cased before they
// are looked up.
private static final String FIELD_ABSTRACT = "abstract";
private static final String FIELD_ACCESSED = "accessed";
private static final String FIELD_ADDRESS = "address";
private static final String FIELD_ANNOTE = "annote";
private static final String FIELD_AUTHOR = "author";
private static final String FIELD_BOOKTITLE = "booktitle";
private static final String FIELD_CHAPTER = "chapter";
private static final String FIELD_DATE = "date";
private static final String FIELD_DOI = "doi";
private static final String FIELD_EDITION = "edition";
private static final String FIELD_EDITOR = "editor";
private static final String FIELD_INSTITUTION = "institution";
private static final String FIELD_ISBN = "isbn";
private static final String FIELD_ISSN = "issn";
private static final String FIELD_ISSUE = "issue";
private static final String FIELD_JOURNAL = "journal";
private static final String FIELD_JOURNALTITLE = "journaltitle";
private static final String FIELD_KEYWORDS = "keywords";
private static final String FIELD_LANGUAGE = "language";
private static final String FIELD_LOCATION = "location";
private static final String FIELD_MONTH = "month";
private static final String FIELD_NOTE = "note";
private static final String FIELD_NUMBER = "number";
private static final String FIELD_ORGANIZATION = "organization";
private static final String FIELD_PAGES = "pages";
private static final String FIELD_PUBLISHER = "publisher";
private static final String FIELD_REVISION = "revision";
private static final String FIELD_SCHOOL = "school";
private static final String FIELD_SERIES = "series";
private static final String FIELD_STATUS = "status";
private static final String FIELD_TITLE = "title";
private static final String FIELD_TYPE = "type";
private static final String FIELD_URL = "url";
private static final String FIELD_URLDATE = "urldate";
private static final String FIELD_VOLUME = "volume";
private static final String FIELD_YEAR = "year";
// Names of the BibTeX entry types recognised by toType(Key). They are
// compared case-insensitively there.
private static final String TYPE_ARTICLE = "article";
private static final String TYPE_BOOK = "book";
private static final String TYPE_BOOKLET = "booklet";
private static final String TYPE_CONFERENCE = "conference";
private static final String TYPE_ELECTRONIC = "electronic";
private static final String TYPE_INBOOK = "inbook";
private static final String TYPE_INCOLLECTION = "incollection";
private static final String TYPE_INPROCEEDINGS = "inproceedings";
private static final String TYPE_MANUAL = "manual";
private static final String TYPE_MASTERSTHESIS = "mastersthesis";
private static final String TYPE_ONLINE = "online";
private static final String TYPE_PATENT = "patent";
private static final String TYPE_PERIODICAL = "periodical";
private static final String TYPE_PHDTHESIS = "phdthesis";
private static final String TYPE_PROCEEDINGS = "proceedings";
private static final String TYPE_STANDARD = "standard";
private static final String TYPE_TECHREPORT = "techreport";
private static final String TYPE_UNPUBLISHED = "unpublished";
private static final String TYPE_WWW = "www";
// Parser and printer used together to convert LaTeX markup in field
// values to normal text
private final LaTeXParser latexParser;
private final LaTeXPrinter latexPrinter;
/**
 * Creates a converter together with the LaTeX parser and printer it uses
 * to turn LaTeX markup in field values into normal text
 */
public BibTeXConverter() {
    LaTeXParser parser;
    try {
        parser = new LaTeXParser();
    } catch (ParseException cause) {
        // not expected in practice: the default constructor of
        // LaTeXParser declares the exception but doesn't throw it
        throw new RuntimeException(cause);
    }
    this.latexParser = parser;
    this.latexPrinter = new LaTeXPrinter();
}
/**
 * <p>Loads a BibTeX database from a stream. The stream is decoded
 * as UTF-8.</p>
 * <p>This method does not close the given stream. The caller is
 * responsible for closing it.</p>
 * @param is the input stream to read from
 * @return the BibTeX database
 * @throws ParseException if the database is invalid. Failures of the
 * tokenizer are reported as a {@link ParseException} as well; in that
 * case the original {@link TokenMgrException} is attached as the cause.
 */
public BibTeXDatabase loadDatabase(InputStream is) throws ParseException {
    Reader reader = new InputStreamReader(is, StandardCharsets.UTF_8);
    BibTeXParser parser = new BibTeXParser() {
        @Override
        public void checkStringResolution(Key key, BibTeXString string) {
            // deliberately empty so that this check never rejects input
            // NOTE(review): presumably tolerates references to undefined
            // @string macros - confirm against the JBibTeX documentation
        }
    };
    try {
        return parser.parse(reader);
    } catch (TokenMgrException err) {
        // ParseException offers no (message, cause) constructor, so the
        // cause is attached afterwards to keep the original stack trace
        ParseException wrapped = new ParseException(
                "Could not parse BibTeX library: " + err.getMessage());
        wrapped.initCause(err);
        throw wrapped;
    }
}
/**
 * Converts every entry of the given database to a CSL citation item
 * @param db the database
 * @return a map from citation key to citation item, filled in the
 * iteration order of the database's entries
 */
public Map<String, CSLItemData> toItemData(BibTeXDatabase db) {
    Map<Key, BibTeXEntry> source = db.getEntries();
    Map<String, CSLItemData> items = new LinkedHashMap<>();
    for (Map.Entry<Key, BibTeXEntry> pair : source.entrySet()) {
        String citationKey = pair.getKey().getValue();
        CSLItemData item = toItemData(pair.getValue());
        items.put(citationKey, item);
    }
    return items;
}
/**
 * <p>Converts a BibTeX entry to a citation item.</p>
 * <p>Field names are matched case-insensitively and independently of the
 * default locale. Field values are converted from LaTeX to normal text
 * where possible. Where several BibTeX fields can supply the same CSL
 * attribute, the first one present wins:</p>
 * <ul>
 * <li>place: location, address</li>
 * <li>container title: journal, journaltitle, booktitle, series</li>
 * <li>collection title: series, otherwise the container title</li>
 * <li>issue: issue, number</li>
 * <li>title: title, chapter</li>
 * <li>publisher: depends on the CSL type, see
 * {@link #selectPublisher(CSLType, Map)}</li>
 * </ul>
 * @param e the BibTeX entry to convert
 * @return the citation item
 */
public CSLItemData toItemData(BibTeXEntry e) {
    // get all fields from the BibTeX entry
    Map<String, String> entries = readFields(e);

    // map type
    CSLType type = toType(e.getType());
    CSLItemDataBuilder builder = new CSLItemDataBuilder()
            .id(e.getKey().getValue()).type(type);

    // map address
    String place = firstPresent(entries, FIELD_LOCATION, FIELD_ADDRESS);
    builder.eventPlace(place);
    builder.publisherPlace(place);

    // map author
    String author = entries.get(FIELD_AUTHOR);
    if (author != null) {
        builder.author(NameParser.parse(author));
    }

    // map editor
    String editor = entries.get(FIELD_EDITOR);
    if (editor != null) {
        builder.editor(NameParser.parse(editor));
        builder.collectionEditor(NameParser.parse(editor));
    }

    // map date
    if (type == CSLType.WEBPAGE && entries.containsKey(FIELD_URLDATE)) {
        // for web pages, 'urldate' becomes the issue date and no event
        // date is set
        CSLDate date = DateParser.toDate(entries.get(FIELD_URLDATE));
        builder.issued(date);
    } else if (entries.containsKey(FIELD_DATE)) {
        CSLDate date = DateParser.toDate(entries.get(FIELD_DATE));
        builder.issued(date);
        builder.eventDate(date);
    } else {
        CSLDate date = DateParser.toDate(entries.get(FIELD_YEAR),
                entries.get(FIELD_MONTH));
        builder.issued(date);
        builder.eventDate(date);
    }

    // map journal/journaltitle, booktitle, series
    String containerTitle = firstPresent(entries, FIELD_JOURNAL,
            FIELD_JOURNALTITLE, FIELD_BOOKTITLE, FIELD_SERIES);
    String collectionTitle = firstPresent(entries, FIELD_SERIES);
    if (collectionTitle == null) {
        // without a series the collection title mirrors the container title
        collectionTitle = containerTitle;
    }
    builder.containerTitle(containerTitle);
    builder.collectionTitle(collectionTitle);

    // map number and issue
    builder.number(entries.get(FIELD_NUMBER));
    builder.issue(firstPresent(entries, FIELD_ISSUE, FIELD_NUMBER));

    // map publisher, institution, school, organisation
    builder.publisher(selectPublisher(type, entries));

    // map title or chapter
    builder.title(firstPresent(entries, FIELD_TITLE, FIELD_CHAPTER));

    // map pages
    String pages = entries.get(FIELD_PAGES);
    if (pages != null) {
        PageRange pr = PageParser.parse(pages);
        builder.page(pr.getLiteral());
        builder.pageFirst(pr.getPageFirst());
        if (pr.getNumberOfPages() != null) {
            builder.numberOfPages(String.valueOf(pr.getNumberOfPages()));
        }
    }

    // map last accessed date
    if (entries.containsKey(FIELD_ACCESSED)) {
        builder.accessed(DateParser.toDate(entries.get(FIELD_ACCESSED)));
    }

    // map genre as per https://aurimasv.github.io/z2csl/typeMap.xml#map-thesis
    switch (type) {
        case BOOK:
        case MANUSCRIPT:
        case MAP:
        case MOTION_PICTURE:
        case PERSONAL_COMMUNICATION:
        case POST:
        case POST_WEBLOG:
        case REPORT:
        case SPEECH:
        case THESIS:
        case WEBPAGE:
            if (entries.containsKey(FIELD_TYPE)) {
                builder.genre(entries.get(FIELD_TYPE));
            }
            break;
        default:
            // ignore genre
            break;
    }

    // map language
    if (entries.containsKey(FIELD_LANGUAGE)) {
        builder.language(entries.get(FIELD_LANGUAGE));
    }

    // map other attributes
    builder.volume(entries.get(FIELD_VOLUME));
    builder.keyword(entries.get(FIELD_KEYWORDS));
    builder.URL(entries.get(FIELD_URL));
    builder.status(entries.get(FIELD_STATUS));
    builder.ISSN(entries.get(FIELD_ISSN));
    builder.ISBN(entries.get(FIELD_ISBN));
    builder.version(entries.get(FIELD_REVISION));
    builder.annote(entries.get(FIELD_ANNOTE));
    builder.edition(entries.get(FIELD_EDITION));
    builder.abstrct(entries.get(FIELD_ABSTRACT));
    builder.DOI(entries.get(FIELD_DOI));
    builder.note(entries.get(FIELD_NOTE));

    // create citation item
    return builder.build();
}

/**
 * Reads all fields of a BibTeX entry into a map keyed by the lower-cased
 * field name. Values are converted from LaTeX to normal text on a
 * single line; if a value cannot be parsed as LaTeX, it is kept as is
 * (with carriage returns removed).
 * @param e the BibTeX entry
 * @return the field values by lower-cased field name
 */
private Map<String, String> readFields(BibTeXEntry e) {
    Map<String, String> entries = new HashMap<>();
    for (Map.Entry<Key, Value> field : e.getFields().entrySet()) {
        String us = field.getValue().toUserString().replace("\r", "");

        // convert LaTeX string to normal text
        try {
            List<LaTeXObject> objs = latexParser.parse(new StringReader(us));
            us = latexPrinter.print(objs)
                    .replace('\n', ' ').replace("\r", "").trim();
        } catch (ParseException | TokenMgrException ignored) {
            // best effort: a value that is not valid LaTeX is used verbatim
        }

        // Locale.ROOT keeps the key independent of the default locale.
        // With a Turkish locale, for example, "TITLE" would otherwise
        // become "t\u0131tle" and never match FIELD_TITLE.
        String name = field.getKey().getValue().toLowerCase(Locale.ROOT);
        entries.put(name, us);
    }
    return entries;
}

/**
 * Looks up the given keys in order and returns the first value found
 * @param entries the field values by field name
 * @param keys the field names to try, most preferred first
 * @return the first non-null value or {@code null} if there is none
 */
private static String firstPresent(Map<String, String> entries,
        String... keys) {
    for (String key : keys) {
        String value = entries.get(key);
        if (value != null) {
            return value;
        }
    }
    return null;
}

/**
 * Selects the field that supplies the CSL publisher. The 'publisher'
 * field always comes first; the order of the fallbacks depends on the
 * CSL type.
 * @param type the CSL type of the item
 * @param entries the field values by field name
 * @return the publisher or {@code null} if no candidate field is set
 */
private static String selectPublisher(CSLType type,
        Map<String, String> entries) {
    if (type == CSLType.REPORT) {
        return firstPresent(entries, FIELD_PUBLISHER, FIELD_INSTITUTION,
                FIELD_SCHOOL, FIELD_ORGANIZATION);
    }
    if (type == CSLType.THESIS) {
        return firstPresent(entries, FIELD_PUBLISHER, FIELD_SCHOOL,
                FIELD_INSTITUTION, FIELD_ORGANIZATION);
    }
    return firstPresent(entries, FIELD_PUBLISHER, FIELD_ORGANIZATION,
            FIELD_INSTITUTION, FIELD_SCHOOL);
}
/**
 * Maps a BibTeX entry type to the corresponding CSL type. The type name
 * is compared case-insensitively.
 * @param type the type to convert
 * @return the converted type (never null, falls back to {@link CSLType#ARTICLE})
 */
public CSLType toType(Key type) {
    String name = type.getValue();

    if (name.equalsIgnoreCase(TYPE_ARTICLE)) {
        return CSLType.ARTICLE_JOURNAL;
    }
    if (name.equalsIgnoreCase(TYPE_PROCEEDINGS)
            || name.equalsIgnoreCase(TYPE_MANUAL)
            || name.equalsIgnoreCase(TYPE_BOOK)
            || name.equalsIgnoreCase(TYPE_PERIODICAL)) {
        return CSLType.BOOK;
    }
    if (name.equalsIgnoreCase(TYPE_BOOKLET)) {
        return CSLType.PAMPHLET;
    }
    if (name.equalsIgnoreCase(TYPE_INBOOK)
            || name.equalsIgnoreCase(TYPE_INCOLLECTION)) {
        return CSLType.CHAPTER;
    }
    if (name.equalsIgnoreCase(TYPE_INPROCEEDINGS)
            || name.equalsIgnoreCase(TYPE_CONFERENCE)) {
        return CSLType.PAPER_CONFERENCE;
    }
    if (name.equalsIgnoreCase(TYPE_MASTERSTHESIS)
            || name.equalsIgnoreCase(TYPE_PHDTHESIS)) {
        return CSLType.THESIS;
    }
    if (name.equalsIgnoreCase(TYPE_TECHREPORT)) {
        return CSLType.REPORT;
    }
    if (name.equalsIgnoreCase(TYPE_PATENT)) {
        return CSLType.PATENT;
    }
    if (name.equalsIgnoreCase(TYPE_ELECTRONIC)
            || name.equalsIgnoreCase(TYPE_ONLINE)
            || name.equalsIgnoreCase(TYPE_WWW)) {
        return CSLType.WEBPAGE;
    }
    if (name.equalsIgnoreCase(TYPE_STANDARD)) {
        return CSLType.LEGISLATION;
    }
    if (name.equalsIgnoreCase(TYPE_UNPUBLISHED)) {
        return CSLType.MANUSCRIPT;
    }

    // unknown types are treated as generic articles
    return CSLType.ARTICLE;
}
}