#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# =============================================================================
# Version: 1.00 (December 15, 2015)
# Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa
#
# =============================================================================
# Copyright (c) 2015. Giuseppe Attardi (attardi@di.unipi.it).
# =============================================================================
# This file is part of Tanl.
#
# Tanl is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License, version 3,
# as published by the Free Software Foundation.
#
# Tanl is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# =============================================================================
"""Wikipedia Cirrus Extractor:
Extracts and cleans text from a Wikipedia Cirrus dump and stores output in a
number of files of similar size in a given directory.
Each file will contain several documents in the format:
<doc id="" url="" title="">
...
</doc>
"""
import sys, os.path, time
import re
import json
import argparse
import bz2
import gzip
import logging
# Program version
version = '1.00'
# Base URL used to build per-article links (hardcoded to it.wikipedia.org)
urlbase = 'http://it.wikipedia.org/'
# ----------------------------------------------------------------------
class NextFile(object):
    """
    Synchronous generation of next available file name.
    """

    filesPerDir = 100

    def __init__(self, path_name):
        self.path_name = path_name
        self.dir_index = -1
        self.file_index = -1

    def next(self):
        self.file_index = (self.file_index + 1) % NextFile.filesPerDir
        if self.file_index == 0:
            self.dir_index += 1
        dirname = self._dirname()
        if not os.path.isdir(dirname):
            os.makedirs(dirname)
        return self._filepath()

    def _dirname(self):
        char1 = self.dir_index % 26
        char2 = self.dir_index // 26 % 26  # floor division; plain '/' breaks on Python 3
        return os.path.join(self.path_name, '%c%c' % (ord('A') + char2, ord('A') + char1))

    def _filepath(self):
        return '%s/wiki_%02d' % (self._dirname(), self.file_index)
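
# Naming scheme produced by NextFile (derived from _dirname above):
# dir_index 0 maps to subdirectory 'AA', 25 to 'AZ', 26 to 'BA', and so on;
# within each subdirectory, files are named wiki_00 .. wiki_99.
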
class OutputSplitter(object):
    """
    File-like object that splits output to multiple files of a given max size.
    """

    def __init__(self, nextFile, max_file_size=0, compress=True):
        """
        :param nextFile: a NextFile object from which to obtain filenames
            to use.
        :param max_file_size: the maximum size of each file.
        :param compress: whether to write data with bzip compression.
        """
        self.nextFile = nextFile
        self.compress = compress
        self.max_file_size = max_file_size
        self.file = self.open(self.nextFile.next())

    def reserve(self, size):
        # Roll over to the next file if this write would exceed the size limit.
        if self.file.tell() + size > self.max_file_size:
            self.close()
            self.file = self.open(self.nextFile.next())

    def write(self, data):
        self.reserve(len(data))
        self.file.write(data)

    def close(self):
        self.file.close()

    def open(self, filename):
        if self.compress:
            return bz2.BZ2File(filename + '.bz2', 'w')
        else:
            return open(filename, 'wb')  # binary mode: callers pass UTF-8 encoded bytes
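
# Illustrative use of the two classes above (the output path is hypothetical):
#
#   splitter = OutputSplitter(NextFile('extracted'), max_file_size=1024 * 1024)
#   splitter.write('<doc ...>...</doc>\n'.encode('utf-8'))
#   splitter.close()
#
# With compression enabled (the default) this writes extracted/AA/wiki_00.bz2
# and rolls over to wiki_01.bz2 once the size limit is reached.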
# ----------------------------------------------------------------------
class Extractor(object):
    # NOTE: this class is not invoked by process_dump() below, and get_url(),
    # clean() and compact() are not defined in this file; they are presumably
    # provided by the companion WikiExtractor module of the original project.

    def extract(self, out):
        """
        :param out: output file.
        """
        logging.debug("%s\t%s", self.id, self.title)
        text = ''.join(self.page)
        url = get_url(self.id)
        header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, url, self.title)
        # Separate header from text with a newline.
        header += self.title + '\n\n'
        header = header.encode('utf-8')
        footer = "\n</doc>\n"
        out.write(header)
        text = clean(self, text)
        for line in compact(text):
            out.write(line.encode('utf-8'))
            out.write('\n'.encode('utf-8'))
        out.write(footer.encode('utf-8'))
def process_dump(input_file, out_file, file_size, file_compress):
    """
    :param input_file: name of the wikipedia dump file; '-' to read from stdin
    :param out_file: directory where to store extracted data, or '-' for stdout
    :param file_size: max size of each extracted file, or None for no max (one file)
    :param file_compress: whether to compress files with bzip.
    """
    if input_file == '-':
        input = sys.stdin
    else:
        input = gzip.open(input_file)

    if out_file == '-':
        # Bytes are written below, so prefer the binary buffer (Python 3).
        output = getattr(sys.stdout, 'buffer', sys.stdout)
        if file_compress:
            logging.warning("writing to stdout, so no output compression (use an external tool)")
    else:
        nextFile = NextFile(out_file)
        output = OutputSplitter(nextFile, file_size, file_compress)

    # Process the dump. Records come in pairs of lines in this format:
    # {"index":{"_type":"page","_id":"3825914"}}
    # {"namespace":0,"title":TITLE,"timestamp":"2014-06-29T15:51:09Z","text":TEXT,...}
    while True:
        line = input.readline()
        if not line:
            break
        index = json.loads(line)
        content = json.loads(input.readline())
        type = index['index']['_type']
        id = index['index']['_id']
        if type == 'page' and content['namespace'] == 0:
            title = content['title']
            text = content['text']
            # drop references:
            # ^ The Penguin Dictionary
            text = re.sub(r' \^ .*', '', text)
            url = urlbase + 'wiki?curid=' + id
            header = '<doc id="%s" url="%s" title="%s">\n' % (id, url, title)
            page = header + title + '\n\n' + text + '\n</doc>\n'
            output.write(page.encode('utf-8'))
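
# process_dump() can also be driven directly, bypassing the command line
# (the dump filename here is hypothetical):
#
#   process_dump('enwiki-cirrus.json.gz', 'extracted', 1024 * 1024, True)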
# ----------------------------------------------------------------------
# Minimum size of output files
minFileSize = 200 * 1024
def main():
    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     description=__doc__)
    parser.add_argument("input",
                        help="Cirrus JSON wiki dump file")
    groupO = parser.add_argument_group('Output')
    groupO.add_argument("-o", "--output", default="text",
                        help="directory for extracted files (or '-' for dumping to stdout)")
    groupO.add_argument("-b", "--bytes", default="1M",
                        help="maximum bytes per output file (default %(default)s)",
                        metavar="n[KMG]")
    groupO.add_argument("-c", "--compress", action="store_true",
                        help="compress output files using bzip")

    groupP = parser.add_argument_group('Processing')
    groupP.add_argument("-ns", "--namespaces", default="", metavar="ns1,ns2",
                        help="accepted namespaces")

    groupS = parser.add_argument_group('Special')
    groupS.add_argument("-q", "--quiet", action="store_true",
                        help="suppress reporting progress info")
    groupS.add_argument("-v", "--version", action="version",
                        version='%(prog)s ' + version,
                        help="print program version")

    args = parser.parse_args()

    try:
        # e.g. "1M": power = 'kmg'.find('m') + 1 = 2, so 1 * 1024**2 bytes.
        power = 'kmg'.find(args.bytes[-1].lower()) + 1
        file_size = int(args.bytes[:-1]) * 1024 ** power
        if file_size < minFileSize:
            raise ValueError()
    except ValueError:
        logging.error('Insufficient or invalid size: %s', args.bytes)
        return

    FORMAT = '%(levelname)s: %(message)s'
    logging.basicConfig(format=FORMAT)

    logger = logging.getLogger()
    if not args.quiet:
        logger.setLevel(logging.INFO)

    input_file = args.input
    output_path = args.output
    if output_path != '-' and not os.path.isdir(output_path):
        try:
            os.makedirs(output_path)
        except OSError:
            logging.error('Could not create: %s', output_path)
            return

    process_dump(input_file, output_path, file_size, args.compress)


if __name__ == '__main__':
    main()
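
# Example of one extracted document as written to the output
# (id, title and text are illustrative):
#
#   <doc id="12" url="http://it.wikipedia.org/wiki?curid=12" title="Anarchism">
#   Anarchism
#
#   Anarchism is a political philosophy ...
#   </doc>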