-
Notifications
You must be signed in to change notification settings - Fork 20
/
csvstream.hpp
308 lines (258 loc) · 8.74 KB
/
csvstream.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
/* -*- mode: c++ -*- */
#ifndef CSVSTREAM_HPP
#define CSVSTREAM_HPP
/* csvstream.hpp
*
* Andrew DeOrio <awdeorio@umich.edu>
*
* An easy-to-use CSV file parser for C++
* https://github.com/awdeorio/csvstream
*/
#include <iostream>
#include <fstream>
#include <sstream>
#include <cassert>
#include <string>
#include <vector>
#include <map>
#include <regex>
#include <exception>
// Exception type used by csvstream to report failures (file cannot be opened,
// header cannot be read, row length does not match the header). It derives
// from std::exception, so it can be caught as either type.
class csvstream_exception : public std::exception {
 public:
  // Human-readable description of the failure. Immutable after construction.
  const std::string msg;

  // Create an exception that stores its own copy of the given message.
  csvstream_exception(const std::string &msg) : msg(msg) {}

  // Return the stored message as a C string. The pointer refers to storage
  // owned by this exception object.
  const char *what() const noexcept override { return msg.c_str(); }
};
// csvstream interface
class csvstream {
public:
// Constructor from filename. Throws csvstream_exception if open fails.
csvstream(const std::string &filename, char delimiter=',', bool strict=true)
: filename(filename),
is(fin),
delimiter(delimiter),
strict(strict),
line_no(0) {
// Open file
fin.open(filename.c_str());
if (!fin.is_open()) {
throw csvstream_exception("Error opening file: " + filename);
}
// Process header
read_header();
}
// Constructor from stream
csvstream(std::istream &is, char delimiter=',', bool strict=true)
: filename("[no filename]"),
is(is),
delimiter(delimiter),
strict(strict),
line_no(0) {
read_header();
}
// Destructor
~csvstream() {
if (fin.is_open()) fin.close();
}
// Return false if an error flag on underlying stream is set
explicit operator bool() const {
return static_cast<bool>(is);
}
// Return header processed by constructor
std::vector<std::string> getheader() const {
return header;
}
// Stream extraction operator reads one row. Throws csvstream_exception if
// the number of items in a row does not match the header.
csvstream & operator>> (std::map<std::string, std::string>& row) {
return extract_row(row);
}
// Stream extraction operator reads one row, keeping column order. Throws
// csvstream_exception if the number of items in a row does not match the
// header.
csvstream & operator>> (std::vector<std::pair<std::string, std::string> >& row) {
return extract_row(row);
}
private:
// Filename. Used for error messages.
std::string filename;
// File stream in CSV format, used when library is called with filename ctor
std::ifstream fin;
// Stream in CSV format
std::istream &is;
// Delimiter between columns
char delimiter;
// Strictly enforce the number of values in each row. Raise an exception if
// a row contains too many values or too few compared to the header. When
// strict=false, ignore extra values and set missing values to empty string.
bool strict;
// Line no in file. Used for error messages
size_t line_no;
// Store header column names
std::vector<std::string> header;
// Disable copying because copying streams is bad!
csvstream(const csvstream &);
csvstream & operator= (const csvstream &);
/////////////////////////////////////////////////////////////////////////////
// Implementation
// Read and tokenize one line from a stream
static bool read_csv_line(std::istream &is,
std::vector<std::string> &data,
char delimiter
) {
// Add entry for first token, start with empty string
data.clear();
data.push_back(std::string());
// Process one character at a time
char c = '\0';
enum State {BEGIN, QUOTED, QUOTED_ESCAPED, UNQUOTED, UNQUOTED_ESCAPED, END};
State state = BEGIN;
while(is.get(c)) {
switch (state) {
case BEGIN:
// We need this state transition to properly handle cases where nothing
// is extracted.
state = UNQUOTED;
// Intended switch fallthrough. Beginning with GCC7, this triggers an
// error by default. Disable the error for this specific line.
#if __GNUG__ && __GNUC__ >= 7
[[fallthrough]];
#endif
case UNQUOTED:
if (c == '"') {
// Change states when we see a double quote
state = QUOTED;
} else if (c == '\\') { //note this checks for a single backslash char
state = UNQUOTED_ESCAPED;
data.back() += c;
} else if (c == delimiter) {
// If you see a delimiter, then start a new field with an empty string
data.push_back("");
} else if (c == '\n' || c == '\r') {
// If you see a line ending *and it's not within a quoted token*, stop
// parsing the line. Works for UNIX (\n) and OSX (\r) line endings.
// Consumes the line ending character.
state = END;
} else {
// Append character to current token
data.back() += c;
}
break;
case UNQUOTED_ESCAPED:
// If a character is escaped, add it no matter what.
data.back() += c;
state = UNQUOTED;
break;
case QUOTED:
if (c == '"') {
// Change states when we see a double quote
state = UNQUOTED;
} else if (c == '\\') {
state = QUOTED_ESCAPED;
data.back() += c;
} else {
// Append character to current token
data.back() += c;
}
break;
case QUOTED_ESCAPED:
// If a character is escaped, add it no matter what.
data.back() += c;
state = QUOTED;
break;
case END:
if (c == '\n') {
// Handle second character of a Windows line ending (\r\n). Do
// nothing, only consume the character.
} else {
// If this wasn't a Windows line ending, then put character back for
// the next call to read_csv_line()
is.unget();
}
// We're done with this line, so break out of both the switch and loop.
goto multilevel_break; //This is a rare example where goto is OK
break;
default:
assert(0);
throw state;
}//switch
}//while
multilevel_break:
// Clear the failbit if we extracted anything. This is to mimic the
// behavior of getline(), which will set the eofbit, but *not* the failbit
// if a partial line is read.
if (state != BEGIN) is.clear();
// Return status is the underlying stream's status
return static_cast<bool>(is);
}
// Process header, the first line of the file
void read_header() {
// read first line, which is the header
if (!read_csv_line(is, header, delimiter)) {
throw csvstream_exception("error reading header");
}
}
// Extract a row into a map
csvstream & extract_row(std::map<std::string, std::string>& row) {
// Clear input row
row.clear();
// Read one line from stream, bail out if we're at the end
std::vector<std::string> data;
if (!read_csv_line(is, data, delimiter)) return *this;
line_no += 1;
// When strict mode is disabled, coerce the length of the data. If data is
// larger than header, discard extra values. If data is smaller than header,
// pad data with empty strings.
if (!strict) {
data.resize(header.size());
}
// Check length of data
if (data.size() != header.size()) {
auto msg = "Number of items in row does not match header. " +
filename + ":L" + std::to_string(line_no) + " " +
"header.size() = " + std::to_string(header.size()) + " " +
"row.size() = " + std::to_string(data.size()) + " "
;
throw csvstream_exception(msg);
}
// combine data and header into a row object
for (size_t i=0; i<data.size(); ++i) {
row[header[i]] = data[i];
}
return *this;
}
// Extract a row into a vector of pairs
csvstream & extract_row(std::vector<std::pair<std::string, std::string> >& row) {
// Clear input row
row.clear();
row.resize(header.size());
// Read one line from stream, bail out if we're at the end
std::vector<std::string> data;
if (!read_csv_line(is, data, delimiter)) return *this;
line_no += 1;
// When strict mode is disabled, coerce the length of the data. If data is
// larger than header, discard extra values. If data is smaller than header,
// pad data with empty strings.
if (!strict) {
data.resize(header.size());
}
// Check length of data
if (row.size() != header.size()) {
auto msg = "Number of items in row does not match header. " +
filename + ":L" + std::to_string(line_no) + " " +
"header.size() = " + std::to_string(header.size()) + " " +
"row.size() = " + std::to_string(row.size()) + " "
;
throw csvstream_exception(msg);
}
// combine data and header into a row object
for (size_t i=0; i<data.size(); ++i) {
row[i] = make_pair(header[i], data[i]);
}
return *this;
}
};
#endif