-
Notifications
You must be signed in to change notification settings - Fork 3
/
treebankparser.h
89 lines (67 loc) · 1.93 KB
/
treebankparser.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
/**
* \file treebankparser.h
*
* \brief Main treebank-parser class header file.
*
* \author Damir Cavar <dcavar@iu.edu>
*
* \version 0.1
*
* \date 2016/09/10 16:20:00
*
* \date Created on: Mon September 10 16:20:00 2016
*
* \copyright Copyright 2016 by Damir Cavar
*
* \license{Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.}
*
* \note This needs some more coding.
*
* \bug None
*/
#ifndef PCFG_TREEBANKPARSER_H
#define PCFG_TREEBANKPARSER_H
#include <iostream>
#include <fstream>
#include <string>
#include <sstream>
#include <vector>
#include <map>
#include <utility>
#include <numeric>
#include <set>
#include <regex>
#include <boost/algorithm/string.hpp>
#include <boost/filesystem.hpp>
#include <boost/lexical_cast.hpp>
using namespace std;
/**
 * \brief Declaration of the treebank parser and its grammar-rule storage.
 *
 * Only the interface is declared here; every member function is defined in
 * another translation unit. Standard-library names are spelled with an
 * explicit std:: prefix so the declaration reads the same whether or not a
 * using-directive is in effect.
 *
 * NOTE(review): the per-method descriptions below are inferred from the
 * member names and signatures alone -- confirm against the implementation.
 */
class TreebankParser {
public:
    TreebankParser();

    ~TreebankParser();

    /**
     * \brief Presumably processes each of the given treebank files.
     * \param treebankfiles List of strings (assumed to be file paths -- TODO confirm).
     */
    void processFiles(std::vector<std::string> treebankfiles);

    /**
     * \brief Presumably parses a bracketed tree representation.
     * \param content Text to parse.
     * \return A boolean status; its exact meaning is not visible from this header.
     */
    bool parseBrackets(std::string content);

    /**
     * \brief Presumably writes the collected data to a file.
     * \param fname Name of the target file.
     */
    void saveToFile(std::string fname);

    /**
     * \brief Presumably writes the collected data to the given output stream.
     * \param buf Destination stream, taken by non-const reference.
     */
    void printToStream(std::ostream &buf);

    /** \brief Presumably writes the collected data to standard output. */
    void printToStdout();

    /**
     * \brief Presumably loads a grammar from a file.
     * \param fname Name of the grammar file.
     */
    void loadGrammar(std::string fname);

    /** \brief Presumably marks terminal rules; see the implementation for details. */
    void tagTerminalRules();

    // --- Publicly accessible state --------------------------------------

    /** Grammar file name (presumably the one used by loadGrammar -- verify). */
    std::string grammarfile;

    /** Root symbol of the grammar (usage not visible in this header). */
    std::string rootsymbol;

    /** Off by default; presumably controls whether terminal rules are skipped. */
    bool skipterminals{false};

    /** Relative frequencies; off by default. */
    bool relcounts{false};

    /** Count the frequency of the LHS symbol rather than the rule; off by default. */
    bool symbolcounts{false};

    /** Set of terminal rules, each rule stored as a sequence of strings. */
    std::set<std::vector<std::string>> terminalRules;

    /** Maps a rule (a sequence of strings) to an unsigned long value, presumably its count. */
    std::map<std::vector<std::string>, unsigned long> rules;
};
#endif //PCFG_TREEBANKPARSER_H