From f6dee88dd283985f13121069bd1d9af060e03239 Mon Sep 17 00:00:00 2001
From: Brian Lester
Date: Thu, 1 Aug 2024 01:30:26 -0400
Subject: [PATCH] manually remove html

---
 licensed_pile/scripts/remove_html.py | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 68 insertions(+), 3 deletions(-)

diff --git a/licensed_pile/scripts/remove_html.py b/licensed_pile/scripts/remove_html.py
index b79e745..07b6644 100644
--- a/licensed_pile/scripts/remove_html.py
+++ b/licensed_pile/scripts/remove_html.py
@@ -2,6 +2,7 @@
 
 import argparse
 import multiprocessing as mp
+import re
 from tempfile import TemporaryDirectory
 
 import bs4
@@ -43,10 +44,74 @@
     help="Number of processors for multicore.",
 )
 
-logs.configure_logging("dolma.RemoveHTMLParallel", level="INFO")
+logs.configure_logging("dolma.RegexRemoveHTMLParallel", level="DEBUG")
 
 
-class RemoveHTMLParallel(ShardParallelProcessor):
+class CaptureMatches:
+    """Callable ``re.sub`` replacement that records what it removes.
+
+    Every match is replaced with the empty string. The text of capture
+    group 1 (or of the whole match when the pattern has no groups) is
+    stored so callers can iterate over, and truth-test, what was removed.
+    """
+
+    def __init__(self):
+        self.matches = []
+
+    def __call__(self, m):
+        try:
+            self.matches.append(m.group(1))
+        except IndexError:
+            # No capture group in the pattern: keep the matched text rather
+            # than the Match object so every stored entry is a string.
+            self.matches.append(m.group(0))
+        return ""
+
+    def __iter__(self):
+        yield from self.matches
+
+    def __bool__(self):
+        return bool(self.matches)
+
+
+class RegexRemoveHTMLParallel(ShardParallelProcessor):
+    """Remove ``<div...>`` and ``<font...>`` spans from example text via regex."""
+
+    @classmethod
+    def process_example(cls, example, **kwargs):
+        """Strip matching spans from ``example["text"]`` and log each removal.
+
+        Mutates and returns ``example``; ``source_file`` and ``line_number``
+        are read from ``kwargs`` (when present) for the log record only.
+        """
+        logger = cls.get_logger()
+        cm = CaptureMatches()
+        # Capture the smallest amount of text between `<div`/`<font` and `>`.
+        # This would not be ok if we cared about malicious input.
+        # NOTE(review): `.` does not match newlines and closing tags are not
+        # covered by this pattern; confirm both are intended.
+        cleaned_text = re.sub(r"(<(?:div|font).*?>)", cm, example["text"])
+
+        for m in cm:
+            logger.debug(
+                "Removed %s based on regex",
+                m,
+                extra={
+                    "file": kwargs.get("source_file"),
+                    "line": kwargs.get("line_number"),
+                    "source": example["source"],
+                    "example_id": example["id"],
+                    "match": m,
+                },
+            )
+
+        example["text"] = cleaned_text
+        return example
+
+
+class BS4RemoveHTMLParallel(ShardParallelProcessor):
+    """There are issues with using bs4 to remove partial html."""
+
     @classmethod
     def process_example(cls, example, **kwargs):
         try:
@@ -88,7 +153,7 @@ def process_example(cls, example, **kwargs):
 
 def main(args):
     with TemporaryDirectory() as tempdir:
-        processor = RemoveHTMLParallel(
+        processor = RegexRemoveHTMLParallel(
             source_prefix=utils.dolma_input(args.input, args.filename),
             destination_prefix=utils.dolma_output(args.output),
             metadata_prefix=tempdir,