-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
827c002
commit 3c5e53e
Showing
3 changed files
with
63 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
import docx, PyPDF2 | ||
from io import BytesIO | ||
from tractor_beam.utils.globals import _f | ||
|
||
def _parse_docx(self):
    """
    Extract the plain text of the PDF or DOCX file located at `self.path`.

    The file is read fully into memory and then parsed with PyPDF2 (`.pdf`) or
    python-docx (`.docx`). The extension check is case-insensitive.

    :return: The extracted text. PDF pages are joined with newlines; every DOCX
    paragraph is followed by a newline. When parsing fails, or the extension is
    neither `.pdf` nor `.docx`, the error is reported through `_f('fatal', ...)`
    and no value is returned explicitly (NOTE(review): `_f` is defined elsewhere;
    confirm whether it returns or aborts).
    """
    with open(self.path, 'rb') as f:
        raw = f.read()
    try:
        # Compare against a lower-cased copy so '.PDF' / '.DOCX' are accepted too.
        lowered = self.path.lower()
        if lowered.endswith('.pdf'):
            reader = PyPDF2.PdfReader(BytesIO(raw))
            # Guard against a page yielding no text so the join cannot fail.
            return "\n".join(page.extract_text() or "" for page in reader.pages)
        if lowered.endswith('.docx'):
            document = docx.Document(BytesIO(raw))
            # The text is returned as-is; no sanitizing pass is applied here.
            return "".join(f"{paragraph.text}\n" for paragraph in document.paragraphs)
        # Raise inside the try so unsupported files are reported through the
        # same handler, with a message that names the actual problem.
        raise ValueError('unsupported document type (expected .pdf or .docx)')
    except Exception as e:
        _f('fatal', f'document: {e} | {self.path}')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
from rich.markdown import Markdown | ||
from rich.console import Console | ||
|
||
def m_print(text):
    """
    Print `text` as Markdown on a newly created rich `Console`.
    :param text: The value to display; it is converted with `str()` before rendering.
    """
    console = Console()
    rendered = Markdown(str(text))
    console.print(rendered)
def _convert_to_markdown(text):
    """
    Wrap `text` in a rich `Markdown` object without printing it.
    :param text: The value to wrap; it is converted with `str()` first.
    :return: The `Markdown` instance built from the string form of `text`.
    """
    source = str(text)
    return Markdown(source)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
import xml.etree.ElementTree as ET | ||
from bs4 import BeautifulSoup as bs | ||
import re, html | ||
from tractor_beam.utils.globals import _f | ||
|
||
|
||
def _parse_xml(self, xml: bool = False):
    """
    Extract the text content of the markup held in `self.ml`.

    :param xml: When `True`, `self.ml` is parsed as XML with `ElementTree` and one
    `tag: text` line is produced for every element that has text, with any
    `{namespace}` prefix removed from the tag name. When `False` (the default),
    `self.ml` is treated as HTML.
    :return: For XML, the newline-terminated `tag: text` lines concatenated into one
    string. For HTML, the document text with entities unescaped, tag-like
    fragments removed, and newlines and tabs replaced by spaces.
    """
    if xml:
        root = ET.fromstring(self.ml)
        lines = []
        for element in root.iter():
            tag = element.tag
            if not (element.text and tag):
                continue
            name = tag
            if isinstance(tag, str):
                # Namespaced tags are spelled "{uri}local"; keep only the part
                # after the brace. Tags without a namespace are used unchanged.
                pieces = tag.split('}')
                if len(pieces) > 1:
                    name = pieces[1]
            lines.append(f"{name}: {element.text}\n")
        return "".join(lines)

    soup = bs(self.ml, 'html.parser')
    text = soup.get_text(separator=' ')
    text = html.unescape(text)
    # Drop anything that still looks like a tag once entities are unescaped.
    text = re.sub(r'<[^>]+>', '', text)
    return text.replace('\n', ' ').replace('\t', ' ')