Skip to content

Commit

Permalink
core cleaner: update typing
Browse files Browse the repository at this point in the history
  • Loading branch information
ACA committed Mar 10, 2024
1 parent cf8b612 commit f6b3564
Showing 1 changed file with 6 additions and 7 deletions.
13 changes: 6 additions & 7 deletions lncrawl/core/cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import unicodedata
from typing import AnyStr, Dict, List, Set, Union

-from bs4 import Comment, Tag, PageElement
+from bs4 import Comment, Tag

LINE_SEP = "<br>"

Expand All @@ -16,8 +16,6 @@
NONPRINTABLE = itertools.chain(range(0x00, 0x20), range(0x7F, 0xA0), INVISIBLE_CHARS)
NONPRINTABLE_MAPPING = {character: None for character in NONPRINTABLE}

-TAG_LIKE = Union[Comment, PageElement, Tag]


class TextCleaner:
def __init__(self) -> None:
Expand Down Expand Up @@ -176,7 +174,8 @@ def clean_contents(self, div):
for tag in div.find_all(True):
if isinstance(tag, Comment):
tag.extract() # Remove comments
-elif not isinstance(tag, Tag):
+continue
+if not isinstance(tag, Tag):
continue # Skip elements that are not a Tag
if tag.name in self.bad_tags:
tag.extract() # Remove bad tags
Expand All @@ -203,14 +202,14 @@ def clean_text(self, text) -> str:
)
return text

-def extract_on_duplicate_sibling(self, tag: TAG_LIKE):
+def extract_on_duplicate_sibling(self, tag: Tag):
next_tag = tag.next_sibling
if not isinstance(next_tag, Tag):
return
if next_tag.name == tag.name:
tag.extract()

-def clean_attributes(self, tag: TAG_LIKE):
+def clean_attributes(self, tag: Tag):
attrs = {}
for name, value in tag.attrs.items():
if name not in self.whitelist_attributes:
Expand All @@ -234,7 +233,7 @@ def tag_contains_bad_text(self, tag: Tag) -> bool:
self.bad_tag_text_pairs[tag.name] = pattern
return bool(pattern.search(tag.text))

-def clean_image(self, tag: TAG_LIKE):
+def clean_image(self, tag: Tag):
src = None
for name in self.image_src_attributes:
src = tag.get(name)
Expand Down

0 comments on commit f6b3564

Please sign in to comment.