Skip to content

Commit

Permalink
🚧 fix(wip): converters
Browse files Browse the repository at this point in the history
  • Loading branch information
mxchinegod committed Mar 14, 2024
1 parent 827c002 commit 3c5e53e
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 2 deletions.
25 changes: 25 additions & 0 deletions hygiene/utils/common/helpers/_docx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import docx, PyPDF2
from io import BytesIO
from tractor_beam.utils.globals import _f

def _parse_docx(self):
    """
    Extract the plain text of the PDF or DOCX file located at `self.path`.

    The file type is chosen from the extension of `self.path`, compared
    case-insensitively. The text of PDF pages is joined with newlines; each
    DOCX paragraph is followed by a newline.

    :return: The extracted text. When the extension is unsupported or parsing
        fails, the problem is reported through `_f('fatal', ...)`; if `_f`
        returns, this function then ends without an explicit return value.
    """
    # Reading happens outside the try block, so I/O errors from open()/read()
    # propagate to the caller instead of being reported through `_f`.
    with open(self.path, 'rb') as f:
        _r = f.read()

    # Match on a lower-cased copy so '.PDF' / '.DOCX' are recognised as well.
    _ext = self.path.lower()
    try:
        if _ext.endswith('.pdf'):
            p = PyPDF2.PdfReader(BytesIO(_r))
            return "\n".join(_p.extract_text() for _p in p.pages)
        if _ext.endswith('.docx'):
            _d = docx.Document(BytesIO(_r))
            # TODO: decide whether the text needs sanitising before returning.
            return "".join(paragraph.text + '\n' for paragraph in _d.paragraphs)
        # Raise a descriptive error rather than falling through to an unbound
        # result variable; it is reported by the handler below like any other
        # parsing failure.
        raise ValueError('unsupported file type (expected .pdf or .docx)')
    except Exception as e:
        _f('fatal', f'document: {e} | {self.path}')
4 changes: 2 additions & 2 deletions hygiene/utils/common/helpers/_markdown.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from rich.markdown import Markdown
from rich.console import Console

def m_print(text):
    """Print the string form of `text`, rendered as Markdown, on a new rich console."""
    console = Console()
    rendered = Markdown(str(text))
    console.print(rendered)
def _convert_to_markdown(text):
    """Wrap the string form of `text` in a rich `Markdown` object and return it."""
    source = str(text)
    return Markdown(source)
36 changes: 36 additions & 0 deletions hygiene/utils/common/helpers/_xml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup as bs
import re, html
from tractor_beam.utils.globals import _f


def _parse_xml(self, xml: bool = False):
    """
    Reduce the markup held in `self.ml` to text.

    :param xml: When True, parse `self.ml` as XML with `ElementTree` and emit
        one `tag: text` line for every element that has text, with any
        `{namespace}` prefix removed from the tag. When False (the default),
        treat `self.ml` as HTML and return its text content with newlines and
        tabs replaced by spaces.
    :return: The extracted text as a string.
    """
    if xml:
        # NOTE(review): ElementTree is not hardened against maliciously
        # constructed XML; confirm `self.ml` comes from a trusted source.
        r = ET.fromstring(self.ml)
        _lines = []
        for e in r.iter():
            if e.text and e.tag:
                _tag = e.tag
                if isinstance(_tag, str):
                    # Namespaced tags look like '{uri}local'; keep 'local'.
                    # Tags without a namespace pass through unchanged.
                    _tag = _tag.rpartition('}')[2]
                _lines.append(f"{_tag}: {e.text}\n")
        return "".join(_lines)

    _s = bs(self.ml, 'html.parser')
    _r = _s.get_text(separator=' ')
    # Decode entities left in the extracted text, then remove anything
    # tag-shaped that the decoding produced.
    _c = html.unescape(_r)
    _c = re.sub(r'<[^>]+>', '', _c)
    return _c.replace('\n', ' ').replace('\t', ' ')

0 comments on commit 3c5e53e

Please sign in to comment.