Skip to content

Commit

Permalink
manually remove html
Browse files Browse the repository at this point in the history
  • Loading branch information
blester125 committed Aug 1, 2024
1 parent e7d562c commit f6dee88
Showing 1 changed file with 51 additions and 3 deletions.
54 changes: 51 additions & 3 deletions licensed_pile/scripts/remove_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import argparse
import multiprocessing as mp
import re
from tempfile import TemporaryDirectory

import bs4
Expand Down Expand Up @@ -43,10 +44,57 @@
help="Number of processors for multicore.",
)

# Set the log level for each named logger. The regex logger is set to DEBUG;
# the regex-based processor reports every removed tag via ``logger.debug``.
# NOTE(review): assumes ``cls.get_logger()`` resolves to "dolma.<ClassName>"
# so these names line up with the processors -- confirm against the base class.
logs.configure_logging("dolma.RemoveHTMLParallel", level="INFO")
logs.configure_logging("dolma.RegexRemoveHTMLParallel", level="DEBUG")


class RemoveHTMLParallel(ShardParallelProcessor):
class CaptureMatches:
    """Replacement callable for ``re.sub`` that records what it deletes.

    Pass an instance as the ``repl`` argument of ``re.sub``: every match is
    replaced with the empty string and the removed text is remembered so the
    caller can inspect or log it afterwards.

    Iterating the instance yields the removed text in match order, and the
    instance is truthy once at least one match has been recorded.
    """

    def __init__(self):
        # Removed text, in the order the matches were reported.
        self.matches = []

    def __call__(self, m):
        """Record the text matched by ``m`` and delete it from the output.

        Args:
            m: The match object handed over by ``re.sub``.

        Returns:
            The empty string, so the matched span is removed.
        """
        try:
            captured = m.group(1)
        except IndexError:
            # The pattern defines no capture group.
            captured = None
        if captured is None:
            # No group 1, or it did not take part in the match: record the
            # whole matched text so every stored entry is text, never a
            # match object or ``None``.
            captured = m.group(0)
        self.matches.append(captured)
        return ""

    def __iter__(self):
        """Yield the recorded text in the order it was removed."""
        yield from self.matches

    def __bool__(self):
        """Return ``True`` when at least one match has been recorded."""
        return bool(self.matches)


class RegexRemoveHTMLParallel(ShardParallelProcessor):
    """Processor that strips ``<div ...>`` and ``<font ...>`` tags by regex."""

    @classmethod
    def process_example(cls, example, **kwargs):
        """Remove matching tags from ``example["text"]`` and log each one.

        Args:
            example: Mapping with at least a ``"text"`` entry; ``"source"``
                and ``"id"`` are also read when something was removed.
            **kwargs: ``source_file`` and ``line_number`` are looked up (if
                present) and attached to the debug log records.

        Returns:
            The same ``example`` object with ``"text"`` replaced by the
            cleaned text.
        """
        log = cls.get_logger()
        removed = CaptureMatches()
        # Lazily match from "<div" or "<font" up to the first ">" that follows.
        # A non-greedy wildcard like this would not be acceptable if we had to
        # defend against malicious input.
        # NOTE(review): "." does not cross newlines and closing tags such as
        # "</div>" are not matched by this pattern -- confirm that is intended.
        stripped = re.sub(r"(<(?:div|font).*?>)", removed, example["text"])

        # Looping over an empty capture is a no-op, so no emptiness guard.
        for tag in removed:
            details = {
                "file": kwargs.get("source_file"),
                "line": kwargs.get("line_number"),
                "source": example["source"],
                "example_id": example["id"],
                "match": tag,
            }
            log.debug("Removed %s based on regex", tag, extra=details)

        example["text"] = stripped
        return example


class BS4RemoveHTMLParallel(ShardParallelProcessor):
"""There are issues with using bs4 to remove partial html."""

@classmethod
def process_example(cls, example, **kwargs):
try:
Expand Down Expand Up @@ -88,7 +136,7 @@ def process_example(cls, example, **kwargs):

def main(args):
with TemporaryDirectory() as tempdir:
processor = RemoveHTMLParallel(
processor = RegexRemoveHTMLParallel(
source_prefix=utils.dolma_input(args.input, args.filename),
destination_prefix=utils.dolma_output(args.output),
metadata_prefix=tempdir,
Expand Down

0 comments on commit f6dee88

Please sign in to comment.