worker.py
import os
import zstandard as zstd
import json
import csv
import re
from multiprocessing import Pool
from pybloom_live import BloomFilter


def extract_zst_file(file_path, output_directory, keywords):
    output_file_path = os.path.join(output_directory, os.path.basename(file_path).replace('.zst', '.csv'))
    print(f'Processing {file_path}...')

    # Build one large search pattern by joining all keywords with "|" (OR)
    # and wrapping each keyword in word boundaries.
    pattern = r'\b(?:' + '|'.join(map(re.escape, keywords)) + r')\b'
    regex = re.compile(pattern, re.IGNORECASE)

    # Probabilistic set for deduplicating titles. Note the 10% error rate:
    # some unique titles may be falsely flagged as duplicates and skipped.
    written_titles = BloomFilter(capacity=1000000, error_rate=0.1)

    with open(output_file_path, 'w', newline='') as output_file:
        csv_writer = csv.writer(output_file)
        csv_writer.writerow(["created_utc", "title", "selftext", "score", "num_comments", "author", "subreddit"])  # add header row

        with open(file_path, 'rb') as compressed_file:
            # Reddit dumps use a long zstd window; allow up to 2 GiB.
            dctx = zstd.ZstdDecompressor(max_window_size=2147483648)
            reader = dctx.stream_reader(compressed_file)
            chunk_size = 500 * 1024 * 1024  # decompress in 500 MiB chunks
            current_line = [b'']  # carries a partial line across chunk boundaries
            counters = {keyword: 0 for keyword in keywords}

            def line_generator():
                for chunk in iter(lambda: reader.read(chunk_size), b''):
                    lines = (current_line[0] + chunk).split(b'\n')
                    for line in lines[:-1]:
                        yield line
                    current_line[0] = lines[-1]
                if current_line[0]:
                    yield current_line[0]

            for line in line_generator():
                if not line.strip():
                    continue  # skip blank lines rather than crash in json.loads
                post = json.loads(line.decode('utf-8'))
                title = post.get("title", "").lower()
                selftext = post.get("selftext", "")
                score = post.get("score", "")
                num_comments = post.get("num_comments", "")
                created_utc = post.get("created_utc", "")
                author = post.get("author", "")
                subreddit = post.get("subreddit", "")

                match = regex.search(title)
                if match is not None and title not in written_titles:
                    csv_writer.writerow([created_utc, title, selftext, score, num_comments, author, subreddit])
                    written_titles.add(title)
                    # Titles are lowercased before matching, so only keywords
                    # passed in lowercase are counted here.
                    if match.group() in counters:
                        counters[match.group()] += 1

    print(f"Keyword match counts for file {file_path}:", counters)