Skip to content

Commit

Permalink
fix truncated_html
Browse files (browse the repository at this point in the history)
  • Loading branch information
etiennecallies committed Oct 12, 2024
1 parent 0c5f104 commit f16bdfc
Showing 1 changed file with 3 additions and 1 deletion.
4 changes: 3 additions & 1 deletion scraping/services/parse_pruning_service.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -7,6 +7,7 @@
from home.utils.hash_utils import hash_string_to_hex
from scraping.parse.parse_with_llm import parse_with_llm, get_llm_model, get_prompt_template
from scraping.parse.schedules import SchedulesList, ScheduleItem
from scraping.refine.refine_content import remove_link_from_html

# Maximum number of characters kept from a skipped line when it is rendered
# as a "[...]" placeholder in the truncated HTML.
TRUNCATION_LENGTH = 10

Expand All @@ -18,7 +19,8 @@ def get_truncated_html(pruning: Pruning) -> str:
last_index = -1
for index in pruning.pruned_indices:
if index != last_index + 1:
truncated_lines.append(f'[{lines[last_index + 1][:TRUNCATION_LENGTH]}...]')
truncated_line = remove_link_from_html(lines[last_index + 1])[:TRUNCATION_LENGTH]
truncated_lines.append(f'[{truncated_line}...]')
truncated_lines.append(lines[index])
last_index = index

Expand Down

0 comments on commit f16bdfc

Please sign in to comment.