-
Notifications
You must be signed in to change notification settings - Fork 0
/
BibParser.hs
142 lines (121 loc) · 5.48 KB
/
BibParser.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
{-# LANGUAGE Haskell2010 #-}
-- The parser that converts a BibTeX file into a simple intermediate representation using Parsec.
-- With the subset of BibTeX that we parse, it is never neccessary to look ahead further than a
-- single character. This means no backtracking is neccessary (Parsec's 'try' is not used) and the
-- parser should be very efficient. We also take advantage of the fact that BibTeX files must be
-- encoded in ASCII, meaning that we can use a quick byte-based aproach with lazy ByteString
-- buffers.
module BibParser (parseBibFile, Identifier, BibEntry (..)) where
import Feedback
import ParseUtils
import Control.Monad
import Data.List
import Data.Maybe
import System.IO
import Debug.Trace
import Data.Map (Map)
import qualified Data.Map as M
import Data.ByteString.Lazy (ByteString)
import qualified Data.ByteString.Lazy as B
-- parsec3
import Text.Parsec hiding (runParser)
import Text.Parsec.ByteString.Lazy
-------------------------------------------------------
-- Represents a BibTeX identifier.
type Identifier = String
-- A single BibTeX entry.
data BibEntry = BibEntry {
category :: Identifier,
citeKey :: Identifier,
keyValues :: [(Identifier, String)]
} deriving (Eq, Show)
-- Since BibTeX files are required to be ASCII-encoded, a Parsec parser accepting Lazy Bytestrings
-- that treats input as being encoded as 'char8' (a superset of ASCII) can be used. This is more
-- efficient than a parser consuming a standard String or UTF-16 Text.
type BParser a = Text.Parsec.ByteString.Lazy.Parser a
-- Full parser that can be applied to a complete BibTeX file.
pBib :: BParser [BibEntry]
pBib = do
-- Skip everything until next @ or EOF.
-- Standard comments are not allowed to contain a @, so this is okay.
skipMany $ notChar '@'
atChar <- optionMaybe (char '@')
if isNothing atChar
then return [] -- At end of input.
else do
-- Parse entry category.
cat <- many1 letter
-- Parse entry, unless category was "Comment".
if cat == "Comment"
then pBib -- Note: fails when comment contains a '@' within a string literal.
-- Let's just pretend that never happens.
else do
ws $ char '{'
citekey <- (pNumber <|> pName) <?> "Invalid cite key."
ws $ char ','
keyVals <- commaList $ ws pField
char '}'
rest <- pBib
return $ BibEntry cat citekey keyVals : rest
-- Decimal integer. May be negative.
pNumber :: BParser Identifier
pNumber = (`label` "number") $ liftM2 (:) (digit <|> char '-') $ many digit
-- BibTeX identifier.
pName :: BParser Identifier
pName = (`label` "name") $ liftM2 (:) nameChar (many $ nameChar <|> digit)
where nameChar = letter <|> oneOf "_!$&+./:;<>?^`|"
-- Parses a key-value pair within a BibTeX entry.
pField :: BParser (Identifier, String)
pField = do key <- ws pName
char '='
val <- ws pValue
return (key,val)
-- Parses a value. String literals (and any escape sequences therin) are parsed and converted to
-- the string they represent.
pValue :: BParser String
pValue = (`label` "value") $ pNumber <|> pName <|> pStringLiteral
-- No common prefixes, so no backtracking neccessary.
-- Consumes a string literal, unquoting and unescaping it.
-- BibTeX supports tons of LateX stuff within string, many of which would which influence style
-- (e.g. the ^-sign can be used for superscript). Obviously we are not going to cover this and
-- instead we simply replace a few special characters and account for nested braces.
pStringLiteral :: BParser String
pStringLiteral = (`label` "string literal") $ eatQuotes $ many litUnit
where eatQuotes p = (char '"' >> p << char '"') <|> brackets p
brackets p = char '{' >> p << char '}'
litUnit = brackets (char '"' <|> escapeSeq <|> litUnit) <|> noneOf "{}\""
-- Support non-ASCII characters common in Dutch.
escapeSeq = do char '\\'
diacritic <- anyChar
letter <- anyChar
case M.lookup [diacritic, letter] specialChars of
Nothing -> fail $ "Unsupported escape sequence: '\\"
++ [diacritic, letter] ++ "'."
Just c -> return c
-- Mapping of a diacritic-letter combination to corresponding Unicode character. Contains pairs
-- common in Dutch.
specialChars :: Map String Char
specialChars = M.fromList [
("\"a", 'ä'),
("\"e", 'ë'),
("\"o", 'ö'),
("\"u", 'ü'),
("\"i", 'ï'),
("\'a", 'á'),
("\'e", 'é'),
("\'o", 'ó'),
("\'u", 'ú'),
("\'i", 'í'),
("`a", 'à'),
("`e", 'è'),
("`o", 'ò'),
("`u", 'ù'),
("`i", 'ì')
]
-- Tests BParser with file test.bib.
runTest :: IO ()
runTest = do input <- B.readFile "test.bib"
parseTest pBib input
-- Gets a BibTeX file as a lazy Bytestring and tries parsing it.
parseBibFile :: ByteString -> Feedback [BibEntry]
parseBibFile = runParser pBib "Unable to parse BibTeX input."