🚧 fix(wip): converters

Prismadic · Mar 14, 2024 · 3c5e53e · 3c5e53e
1 parent 827c002
commit 3c5e53e
Show file tree

Hide file tree

Showing 3 changed files with 63 additions and 2 deletions.
diff --git a/hygiene/utils/common/helpers/_docx.py b/hygiene/utils/common/helpers/_docx.py
@@ -0,0 +1,25 @@
+import docx, PyPDF2
+from io import BytesIO
+from tractor_beam.utils.globals import _f
+
+def _parse_docx(self):
+    """
+    Extract the plain text of the PDF or DOCX file at `self.path`.
+
+    The extension check is case-insensitive; an unsupported extension is reported through `_f`.
+    :return: PDF page texts joined by newlines, or DOCX paragraphs each followed by a newline.
+    """
+    with open(self.path, 'rb') as f:
+        _r = f.read()
+    try:
+        _e = self.path.lower()
+        if _e.endswith('.pdf'):
+            # `or ""` keeps the join from failing if extract_text() returns None for a page
+            return "\n".join(_p.extract_text() or "" for _p in PyPDF2.PdfReader(BytesIO(_r)).pages)
+        if _e.endswith('.docx'):
+            return "".join(_p.text + '\n' for _p in docx.Document(BytesIO(_r)).paragraphs)
+        # neither extension matched: raise so the handler below reports it like any other failure
+        raise ValueError('unsupported file type, expected .pdf or .docx')
+    except Exception as e:
+        # every failure is handed to `_f`; if `_f` returns, this function returns None
+        _f('fatal', f'document: {e} | {self.path}')
diff --git a/hygiene/utils/common/helpers/_markdown.py b/hygiene/utils/common/helpers/_markdown.py
@@ -1,5 +1,5 @@
 from rich.markdown import Markdown
 from rich.console import Console
 
-def m_print(text):
-    Console().print(Markdown(str(text)))
+def _convert_to_markdown(text):
+    return Markdown(str(text))  # coerce to str and wrap in rich's Markdown; nothing is printed here
diff --git a/hygiene/utils/common/helpers/_xml.py b/hygiene/utils/common/helpers/_xml.py
@@ -0,0 +1,36 @@
+import xml.etree.ElementTree as ET
+from bs4 import BeautifulSoup as bs
+import re, html
+from tractor_beam.utils.globals import _f
+
+
+def _parse_xml(self, xml: bool = False):
+    """
+    Reduce the markup held in `self.ml` to text.
+
+    :param xml: When `True`, `self.ml` is parsed as XML with `ElementTree` and every element
+    that has text becomes one `tag: text` line, with any `{namespace}` prefix dropped from the
+    tag. When `False` (the default), `self.ml` is parsed as HTML with BeautifulSoup and only
+    its text content is kept.
+    :return: For XML, the newline-terminated `tag: text` lines as one string. For HTML, a
+    single-line string: entities unescaped, tag-shaped spans removed, and newlines and tabs
+    replaced by spaces.
+    """
+    if xml:
+        # XML: one `tag: text` line per element, in the order `iter()` yields them
+        _l = []
+        for e in ET.fromstring(self.ml).iter():
+            if e.text and e.tag:
+                # keep only what follows the last `}`; a tag without a namespace has no `}`
+                # and passes through unchanged, so no exception-based fallback is needed
+                _n = e.tag.rsplit('}', 1)[-1]
+                _l.append(f"{_n}: {e.text}\n")
+        return "".join(_l)
+    # HTML: let BeautifulSoup collect the text, joining the pieces with single spaces
+    _s = bs(self.ml, 'html.parser')
+    _r = _s.get_text(separator=' ')
+    # unescape any remaining entities, then strip anything still shaped like a tag
+    _c = html.unescape(_r)
+    _c = re.sub(r'<[^>]+>', '', _c)
+    # flatten to a single line
+    return _c.replace('\n', ' ').replace('\t', ' ')