Skip to content
This repository has been archived by the owner on Dec 1, 2024. It is now read-only.

Commit

Permalink
Refactored date parsing logic for improved readability and maintainability
Browse files Browse the repository at this point in the history
  • Loading branch information
dikayx committed Aug 12, 2024
1 parent 4ecfd92 commit 133237f
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 24 deletions.
77 changes: 54 additions & 23 deletions mapy/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,34 +7,65 @@
from email import message_from_string
from email.message import Message

from bs4 import BeautifulSoup

import dateutil.parser
import pygal
import requests

from bs4 import BeautifulSoup
from pygal.style import Style
from typing import Optional, Any
from typing import Optional


def try_parse_date(date_str: str) -> Optional[datetime]:
    """
    Attempts to parse a date string using dateutil parser with fuzzy parsing.

    :param date_str: A date string to parse
    :return: A datetime object or None if parsing fails
    """
    try:
        return dateutil.parser.parse(date_str, fuzzy=True)
    except (ValueError, OverflowError):
        # dateutil signals unparsable text with ValueError (ParserError is a
        # subclass) and out-of-range numeric fields with OverflowError; both
        # simply mean "no usable date here", so report them the same way.
        return None


def extract_and_parse_date(line: str, regex: str) -> Optional[datetime]:
    """
    Extracts date string from a line using a regex pattern and attempts to parse it.

    Only the first match of the pattern is used. If the pattern defines
    capture groups, the first group is taken as the date string; otherwise
    the whole match is used. Matching is case-insensitive.

    :param line: A line of text from the email header
    :param regex: A regex pattern to extract the date string
    :return: A datetime object or None if nothing matches or parsing fails
    """
    # search() stops at the first match instead of scanning the whole line.
    match = re.search(regex, line, re.I)
    if match is None:
        return None

    if match.re.groups:
        # A group that did not participate yields None; treat it as an empty
        # string so the parser reports a plain "no date" result.
        date_str = match.group(1) or ''
    else:
        date_str = match.group(0)
    return try_parse_date(date_str)


def parse_date(line: str) -> Optional[datetime]:
    """
    Parses the date from a line of text from the email header.

    Three strategies are tried in order and the first one that yields a
    result wins:

    1. Fuzzy parsing of the whole line.
    2. Parsing only the text before the first "(" or "utc"
       (case-insensitive), which works around timezone suffixes the parser
       rejects.
    3. Extracting an embedded timestamp of the form
       "YYYY-MM-DD HH:MM:SS[.fff] +ZZZZ" or
       "Www, D Mon YYYY HH:MM:SS[.fff] +ZZZZ" (offset sign may be + or -).

    :param line: A line of text from the email header
    :return: A datetime object or None if no strategy succeeds
    """
    result = try_parse_date(line)

    # Try manually if fuzzy parsing fails
    if result is None:
        result = extract_and_parse_date(line, r'^(.*?)\s*(?:\(|utc)')

    # Handle the most exotic cases of date formats
    if result is None:
        # The outer "date" group is group 1, so it is what gets parsed.
        # Offsets may be negative and the day of month may be one digit.
        exotic_pattern = (
            r'(?P<date>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}(?:\.\d+)? [+-]\d{4}'
            r'|(?P<weekday>[a-zA-Z]{3}), \d{1,2} [a-zA-Z]{3} \d{4} '
            r'\d{2}:\d{2}:\d{2}(?:\.\d+)? [+-]\d{4})'
        )
        result = extract_and_parse_date(line, exotic_pattern)

    return result


def get_header_value(h: str, data: str, rex: str = r'\s*(.*?)(?:\n\S+:|$)') -> str | None:
Expand All @@ -48,6 +79,8 @@ def get_header_value(h: str, data: str, rex: str = r'\s*(.*?)(?:\n\S+:|$)') -> s
:param h: The header name
:param data: The email header data
:param rex: The regular expression pattern for matching the header value
:return: The value of the header or None if not found
"""
# Use regular expressions to find header values
r = re.findall('%s:%s' % (h, rex), data, re.X | re.DOTALL | re.I)
Expand Down Expand Up @@ -407,14 +440,12 @@ def process_attachment(part: Message) -> Optional[dict]:

attachment_data = part.get_payload(decode=True)

if len(attachment_data) > 0:
encoded_data = base64.b64encode(attachment_data).decode('utf-8')
return {
'filename': filename,
'data': encoded_data,
'length': len(attachment_data)
}
return None
encoded_data = base64.b64encode(attachment_data).decode('utf-8')
return {
'filename': filename,
'data': encoded_data,
'length': len(attachment_data)
}


def process_message_part(part: Message, email_date: str) -> Optional[dict]:
Expand Down
43 changes: 42 additions & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,40 @@
message_part.set_payload('This is a test message.', charset='utf-8')


def test_try_parse_date():
    """try_parse_date handles well-formed, fuzzy and unparsable input."""
    # Test with a valid date string (23 Jul 2024 was a Tuesday)
    date_str = "Tue, 23 Jul 2024 10:21:35 +0000"
    expected_date = datetime(2024, 7, 23, 10, 21, 35, tzinfo=timezone.utc)
    assert try_parse_date(date_str) == expected_date

    # Test with a fuzzy date string: surrounding text is ignored and, with no
    # offset present, the expected result is a naive datetime
    fuzzy_date_str = "Received: from example.com (Tue, 23 Jul 2024 10:21:35)"
    fuzzy_expected_date = datetime(2024, 7, 23, 10, 21, 35)
    assert try_parse_date(fuzzy_date_str) == fuzzy_expected_date

    # Test with an invalid date string
    invalid_date_str = "This is not a date"
    assert try_parse_date(invalid_date_str) is None


def test_extract_and_parse_date():
    """extract_and_parse_date covers single-group, multi-group and no-match patterns."""
    # Test extracting and parsing a date from a valid string (single capture
    # group; 23 Jul 2024 was a Tuesday)
    line = "Received: from example.com by example.org; Tue, 23 Jul 2024 10:21:35 +0000"
    regex = r'(?<=;\s)(.*?)(?=\s*\(|$)'
    expected_date = datetime(2024, 7, 23, 10, 21, 35, tzinfo=timezone.utc)
    assert extract_and_parse_date(line, regex) == expected_date

    # Test extracting and parsing a date from a string with multiple capture
    # groups: the outer group holds the full date, the inner one only the
    # weekday, so the first group must be the one that gets parsed
    line = "Thu, 4 Jul 2024 10:42:48 +0200 (CEST)"
    regex = r'(([a-zA-Z]{3}), \s*\d{1,2} [a-zA-Z]{3} \d{4} \d{2}:\d{2}:\d{2} \+\d{4})'
    expected_date = datetime(2024, 7, 4, 10, 42, 48, tzinfo=timezone(timedelta(hours=2)))
    assert extract_and_parse_date(line, regex) == expected_date

    # Test extracting and parsing a date from an invalid string
    invalid_line = "This line has no date"
    assert extract_and_parse_date(invalid_line, regex) is None


def test_parse_date():
# Test valid date parsing
date_str = "Thu, 4 Jul 2024 10:42:48 +0200 (CEST)"
Expand Down Expand Up @@ -224,7 +258,14 @@ def test_process_attachment():
empty_attachment = Message()
empty_attachment.add_header('Content-Disposition', 'attachment', filename='empty.txt')
empty_attachment.set_payload(b'')
assert process_attachment(empty_attachment) is None
empty_attachment_info = process_attachment(empty_attachment)
assert empty_attachment_info['filename'] == 'empty.txt'
assert empty_attachment_info['length'] == 0

# Test with no attachment data
no_attachment = Message()
no_attachment_info = process_attachment(no_attachment)
assert no_attachment_info is None


def test_process_message_part():
Expand Down

0 comments on commit 133237f

Please sign in to comment.