From 2532022483ea615b9d2b4715b427888f684a5e46 Mon Sep 17 00:00:00 2001 From: Peter Chang Date: Tue, 19 Sep 2023 04:48:28 -0400 Subject: [PATCH] initial code refactor --- bookstack_file_exporter/archiver/archiver.py | 49 ++++--- bookstack_file_exporter/archiver/util.py | 3 - bookstack_file_exporter/exporter/exporter.py | 95 ++++++++++++++ bookstack_file_exporter/exporter/node.py | 9 +- bookstack_file_exporter/exporter/util.py | 48 +------ bookstack_file_exporter/run.py | 131 +++++++------------ 6 files changed, 180 insertions(+), 155 deletions(-) create mode 100644 bookstack_file_exporter/exporter/exporter.py diff --git a/bookstack_file_exporter/archiver/archiver.py b/bookstack_file_exporter/archiver/archiver.py index ffd0a3d..6abb186 100644 --- a/bookstack_file_exporter/archiver/archiver.py +++ b/bookstack_file_exporter/archiver/archiver.py @@ -1,6 +1,6 @@ from typing import List, Dict, Union -from pathlib import Path -import json +from time import sleep +from datetime import datetime from bookstack_file_exporter.exporter.node import Node from bookstack_file_exporter.archiver import util @@ -19,6 +19,8 @@ "tar": _TAR_GZ_SUFFIX } +_DATE_STR_FORMAT = "%Y-%m-%d_%H-%M-%S" + class Archiver: """ Archiver pulls all the necessary files from upstream and then pushes them to the specified backup location(s) @@ -32,41 +34,47 @@ class Archiver: Returns: Archiver instance with attributes that are accessible for use for file level archival and backup. """ - def __init__(self, root_dir: str, add_meta: bool, base_page_url: str, headers: Dict[str, str]): - self.root_dir = root_dir + def __init__(self, base_dir: str, add_meta: bool, base_page_url: str, headers: Dict[str, str]): + self.base_dir = base_dir self.add_meta = add_meta self.base_page_url = base_page_url self.headers = headers # remote_system to function mapping self._remote_exports = {'minio': self._archive_minio, 's3': self._archive_s3} - # self._tar_file = "" + self._root_dir = self.generate_root_folder(self.base_dir) self._minio_token = "" self._minio_id = "" # create local tarball first + def archive(self, page_nodes: Dict[int, Node], export_formats: List[str]): + for _, page in page_nodes.items(): + for export_format in export_formats: + # instead of sleep, implement backoff retry in utils + sleep(0.5) + self._gather(page, export_format) + self._tar_dir() + # convert to bytes to be agnostic to end destination (future use case?)
- def gather(self, page_node: Node, export_format: str): + def _gather(self, page_node: Node, export_format: str): raw_data = self._get_data_format(page_node.id, export_format) self._gather_local(page_node.file_path, raw_data, export_format, page_node.meta) - - def archive(self): - self._tar_dir() - - # send to remote systems - def archive_remote(self, remote_dest: str): - self._remote_exports[remote_dest]() def _gather_local(self, page_path: str, data: bytes, export_format: str, meta_data: Union[bytes, None]): - file_path = self._get_combined_path(page_path) + file_path = f"{self._root_dir}/{page_path}" file_full_name = f"{file_path}{_FILE_EXTENSION_MAP[export_format]}" util.write_bytes(file_path=file_full_name, data=data) if self.add_meta: meta_file_name = f"{file_path}{_FILE_EXTENSION_MAP['meta']}" util.dump_json(file_name=meta_file_name, data=meta_data) + # send to remote systems + def archive_remote(self, remote_targets: List[str]): + if remote_targets: + for target in remote_targets: + self._remote_exports[target]() def _tar_dir(self): - # tar_path = f"{self.root_dir}{_FILE_EXTENSION_MAP['tar']}" - util.create_tar(self.root_dir, _FILE_EXTENSION_MAP['tar']) + util.create_tar(self._root_dir, _FILE_EXTENSION_MAP['tar']) def _archive_minio(self): pass @@ -78,9 +86,10 @@ def _archive_s3(self): def _get_data_format(self, page_node_id: int, export_format: str) -> bytes: url = self._get_export_url(node_id=page_node_id, export_format=export_format) return util.get_byte_response(url=url, headers=self.headers) - - def _get_combined_path(self, dir_name: str) -> str: - return f"{self.root_dir}/{dir_name}" def _get_export_url(self, node_id: int, export_format: str) -> str: - return f"{self.base_page_url}/{node_id}/{_EXPORT_API_PATH}/{export_format}" \ No newline at end of file + return f"{self.base_page_url}/{node_id}/{_EXPORT_API_PATH}/{export_format}" + + @staticmethod + def generate_root_folder(base_folder_name: str) -> str: + return base_folder_name + "_" + datetime.now().strftime(_DATE_STR_FORMAT) \ No newline at end of file diff --git a/bookstack_file_exporter/archiver/util.py b/bookstack_file_exporter/archiver/util.py index 6869e0b..031f579 100644 --- a/bookstack_file_exporter/archiver/util.py +++ b/bookstack_file_exporter/archiver/util.py @@ -10,9 +10,6 @@ log = logging.getLogger(__name__) -def generate_root_folder(base_folder_name: str) -> str: - return base_folder_name + "_" + datetime.now().strftime('%Y-%m-%d_%H-%M-%S') - def get_byte_response(url: str, headers: Dict[str, str]) -> bytes: try: response = requests.get(url=url, headers=headers) diff --git a/bookstack_file_exporter/exporter/exporter.py b/bookstack_file_exporter/exporter/exporter.py new file mode 100644 index 0000000..77a5c0c --- /dev/null +++ b/bookstack_file_exporter/exporter/exporter.py @@ -0,0 +1,95 @@ +from typing import Dict, List, Union + +import bookstack_file_exporter.exporter.util as util +from bookstack_file_exporter.exporter.node import Node + + +# _API_SUFFIX_PATHS = { +# "shelves": "api/shelves", +# "books": "api/books", +# "chapters": "api/chapters", +# "pages": "api/pages" +# } + +class NodeExporter(): + """ + NodeExporter class provides an interface for building the Node representations of Bookstack resources (pages, books, etc.) and their parent/child relationships. + + Raises: + + ValueError if data returned from the Bookstack API is empty or not in the desired format.
+ """ + def __init__(self, api_urls: Dict[str, str], headers: Dict[str,str]): + self.api_urls = api_urls + self.headers = headers + + def get_shelf_nodes(self) -> Dict[int, Node]: + """ + Function to get all shelf Node instances + :returns: Dict[int, Node] for all shelf nodes + """ + base_url = self.api_urls["shelves"] + all_parents: List[int] = util.get_all_ids(base_url, self.headers) + if not all_parents: + raise ValueError(f"No resources returned from Bookstack api url: {base_url}") + return self._get_parents(base_url, all_parents) + + def _get_parents(self, base_url: str, parent_ids: List[int], path_prefix: Union[str, None] = None) -> Dict[int, Node]: + parent_nodes = {} + for parent_id in parent_ids: + parent_url = f"{base_url}/{parent_id}" + parent_data = util.get_json_response(url=parent_url, headers=self.headers) + parent_nodes[parent_id] = Node(parent_data, path_prefix=path_prefix) + return parent_nodes + + def get_chapter_nodes(self, book_nodes: Dict[int, Node]): + # Chapters are treated a little differently + # They are under books like pages but have their own children + # i.e. not a terminal node + base_url = self.api_urls["chapters"] + all_chapters: List[int] = util.get_all_ids(base_url, self.headers) + if not all_chapters: + raise ValueError(f"No resources returned from Bookstack api url: {base_url}") + return self._get_chapters(base_url, all_chapters, book_nodes) + + def _get_chapters(self, base_url: str, all_chapters: List[int], book_nodes: Dict[int, Node]): + chapter_nodes = {} + for chapter_id in all_chapters: + chapter_url = f"{base_url}/{chapter_id}" + chapter_data = util.get_json_response(url=chapter_url, headers=self.headers) + book_id = chapter_data['book_id'] + chapter_nodes[chapter_id] = Node(chapter_data, book_nodes[book_id]) + return chapter_nodes + + def get_child_nodes(self, resource_type: str, parent_nodes: Dict[int, Node], filter_empty: bool = True): + base_url = self.api_urls[resource_type] + return self._get_children(base_url, parent_nodes, filter_empty) + + def _get_children(self, base_url: str, parent_nodes: Dict[int, Node], filter_empty: bool): + child_nodes = {} + for _, parent in parent_nodes.items(): + if parent.children: + for child in parent.children: + child_id = child['id'] + child_url = f"{base_url}/{child_id}" + child_data = util.get_json_response(url=child_url, headers=self.headers) + child_node = Node(child_data, parent) + if filter_empty: + if not child_node.empty: + child_nodes[child_id] = child_node + else: + child_nodes[child_id] = child_node + return child_nodes + + def get_unassigned_books(self, existing_resources: Dict[int, Node], path_prefix: str) -> Dict[int, Node]: + base_url = self.api_urls["books"] + all_resources: List[int] = util.get_all_ids(url=base_url, headers=self.headers) + unassigned = [] + # get all existing ones and compare against current known resources + for resource_id in all_resources: + if resource_id not in existing_resources: + unassigned.append(resource_id) + if not unassigned: + raise ValueError(f"No unassigned resources found for type: {base_url}") + # books with no shelf treated like a parent resource + return self._get_parents(base_url, unassigned, path_prefix) \ No newline at end of file diff --git a/bookstack_file_exporter/exporter/node.py b/bookstack_file_exporter/exporter/node.py index 64c453f..d4aa849 100644 --- a/bookstack_file_exporter/exporter/node.py +++ b/bookstack_file_exporter/exporter/node.py @@ -9,7 +9,7 @@ class Node(): """ - Node class provides an interface to create and reference bookstack 
child/parent relationships for resources like pages, books, chapters, and shelves. + Node class provides an interface to create bookstack child/parent relationships for resources like pages, books, chapters, and shelves. Args: metadata: Dict[str, Union[str, int]] (required) = The metadata of the resource from bookstack api @@ -23,7 +23,7 @@ class Node(): """ def __init__(self, meta: Dict[str, Union[str, int]], parent: Union['Node', None] = None, path_prefix: Union[str, None] = None): self.meta = meta - self.__parent = parent + self._parent = parent self._path_prefix = path_prefix self.name: str = "" self.id: int = 0 @@ -39,9 +39,8 @@ def _initialize(self): self.id = self.meta['id'] self._display_name = self.meta['name'] # get base file path from parent if it exists - if self.__parent: - self._file_path = f"{self.__parent.file_path}/{self.name}" - # self._file_path = self.__parent.file_path + '/' + self.name + if self._parent: + self._file_path = f"{self._parent.file_path}/{self.name}" # normalize path prefix if it does not exist if not self._path_prefix: self._path_prefix = "" diff --git a/bookstack_file_exporter/exporter/util.py b/bookstack_file_exporter/exporter/util.py index 7c3b8a7..ff0afdf 100644 --- a/bookstack_file_exporter/exporter/util.py +++ b/bookstack_file_exporter/exporter/util.py @@ -17,49 +17,9 @@ def get_json_response(url: str, headers: Dict[str, str], verify: bool = True, ti def get_all_ids(url: str, headers: Dict[str, str]) -> List[int]: ids_api_meta = get_json_response(url=url, headers=headers) - all_ids = [item['id'] for item in ids_api_meta['data']] - return all_ids - -def get_parent_meta(url: str, headers: Dict[str, str], parent_ids: List[int], - path_prefix: Union[str, None] = None) -> Dict[int, Node]: - parent_nodes = {} - for parent_id in parent_ids: - parent_url = f"{url}/{parent_id}" - # parent_url = url + "/" + str(parent_id) - parent_data = get_json_response(url=parent_url, headers=headers) - parent_nodes[parent_id] = Node(parent_data, path_prefix=path_prefix) - return parent_nodes - -def get_chapter_meta(url: str, headers: Dict[str, str], chapters: List[int], - books:Dict[int, Node], path_prefix: Union[str, None] = None) -> Dict[int, Node]: - chapter_nodes = {} - for chapter_id in chapters: - chapter_url = f"{url}/{chapter_id}" - # chapter_url = url + "/" + str(chapter_id) - chapter_data = get_json_response(url=chapter_url, headers=headers) - book_id = chapter_data['book_id'] - chapter_nodes[chapter_id] = Node(chapter_data, books[book_id], path_prefix=path_prefix) - return chapter_nodes - -def get_child_meta(url: str, headers: Dict[str, str], parent_nodes: Dict[int, Node], - filter_empty: bool = False, path_prefix: Union[str, None] = None) -> Dict[int, Node]: - child_nodes = {} - for _, parent in parent_nodes.items(): - if parent.children: - for child in parent.children: - child_id = child['id'] - child_url = f"{url}/{child_id}" - # child_url = url + "/" + str(child_id) - child_data = get_json_response(url=child_url, headers=headers) - child_node = Node(child_data, parent, path_prefix=path_prefix) - if filter_empty: - if not child_node.empty: - child_nodes[child_id] = child_node - else: - child_nodes[child_id] = child_node - return child_nodes - -def get_page_export(url: str, headers: Dict[str, str]): - pass + if ids_api_meta: + return [item['id'] for item in ids_api_meta['data']] + else: + return [] diff --git a/bookstack_file_exporter/run.py b/bookstack_file_exporter/run.py index 5cd1138..784b900 100644 --- a/bookstack_file_exporter/run.py +++ 
b/bookstack_file_exporter/run.py @@ -5,114 +5,79 @@ from typing import Dict, Union, List from bookstack_file_exporter.config_helper.config_helper import ConfigNode -from bookstack_file_exporter.exporter import util from bookstack_file_exporter.exporter.node import Node +from bookstack_file_exporter.exporter.exporter import NodeExporter from bookstack_file_exporter.archiver import util as archiver_util from bookstack_file_exporter.archiver.archiver import Archiver - log = logging.getLogger(__name__) -# def get_shelve_nodes(shelve_url: str, headers: Dict[str, str]) -> Dict[int, Node]: -# all_shelves: List[int] = util.get_all_ids(url=shelve_url, headers=headers) -# return util.get_parent_meta(url=shelve_url, headers=headers, parent_ids=all_shelves) - - -# def get_chapters(chapter_url: str, headers: Dict[str, str]): -# pass - def test(args: argparse.Namespace, token_id_env: str, token_secret_env: str): + ## get configuration from helper config = ConfigNode(args) config.token_id= os.environ.get(token_id_env, "") config.token_secret = os.environ.get(token_secret_env, "") ## convenience vars bookstack_headers = config.headers + api_urls = config.urls export_formats = config.user_inputs.formats remote_targets = config.user_inputs.remote_targets - shelve_base_url = config.urls['shelves'] - book_base_url = config.urls['books'] - chapter_base_url = config.urls['chapters'] + unassigned_dir = config.unassigned_book_dir page_base_url = config.urls['pages'] + base_export_dir = config.base_dir_name + + #### Export Data ##### + # need to implement pagination for apis + ## Use exporter class to get all the resources (pages, books, etc.) and their relationships + exportHelper = NodeExporter(api_urls, bookstack_headers) ## shelves - # shelve_nodes: Dict[int, Node] = get_shelve_nodes(shelve_base_url, bookstack_headers) - # need to implement pagination - all_shelves: List[int] = util.get_all_ids(url=shelve_base_url, headers=bookstack_headers) - shelve_nodes: Dict[int, Node] = util.get_parent_meta(url=shelve_base_url, headers=bookstack_headers, - parent_ids=all_shelves) - + shelve_nodes: Dict[int, Node] = exportHelper.get_shelf_nodes() ## books - book_nodes: Dict[int, Node] = util.get_child_meta(url=book_base_url, headers=bookstack_headers, - parent_nodes=shelve_nodes) - - ## pages - page_nodes = util.get_child_meta(url=page_base_url, headers=bookstack_headers, parent_nodes=book_nodes, filter_empty=True) - - - ## chapters - # get_chapters(chapter_base_url, bookstack_headers) + book_nodes: Dict[int, Node] = exportHelper.get_child_nodes("books", shelve_nodes) + # books with no shelf assignment + # default will be put in "unassigned" directory relative to backup dir + # catch ValueError for missing response/empty data if no unassigned books exist + try: + books_no_shelf: Dict[int, Node] = exportHelper.get_unassigned_books(book_nodes, unassigned_dir) + except ValueError: + log.info("No unassigned books found") + books_no_shelf = {} + + # add new book nodes to map + # these should not already be present in map + # since we started with shelves first and then moved our way down.
+ if books_no_shelf: + for key, value in books_no_shelf.items(): + book_nodes[key] = value + + ## chapters (if they exist) + # chapter nodes are treated a little differently + # they are children under books + try: + chapter_nodes: Dict[int, Node] = exportHelper.get_chapter_nodes(book_nodes) + except ValueError: + log.info("No chapter data was found") + chapter_nodes = {} - all_chapters: List[int] = util.get_all_ids(url=chapter_base_url, headers=bookstack_headers) - # check for chapters since they are optional - if all_chapters: - chapter_nodes: Dict[int, Node] = util.get_chapter_meta(url=chapter_base_url, headers=bookstack_headers, - chapters=all_chapters, books=book_nodes) - # add all pages in a chapter first - page_chapter_nodes = util.get_child_meta(url=page_base_url, headers=bookstack_headers, parent_nodes=chapter_nodes, filter_empty=True) + ## pages + page_nodes: Dict[int, Node] = exportHelper.get_child_nodes("pages", book_nodes) + # add chapter node pages + # replace existing page node if found with proper chapter parent + if chapter_nodes: + page_chapter_nodes: Dict[int, Node] = exportHelper.get_child_nodes("pages", chapter_nodes) + ## since we filter empty, check if there is any content + ## add all chapter pages to existing page nodes if page_chapter_nodes: for key, value in page_chapter_nodes.items(): page_nodes[key] = value - - # print(chapter_nodes) - # for _, value in chapter_nodes.items(): - # print(value.children) - - ## get books with no shelf - all_books: List[int] = util.get_all_ids(url=book_base_url, headers=bookstack_headers) - # filter out already seen books - books_no_shelf = [] - for book_id in all_books: - if book_id not in book_nodes.keys(): - books_no_shelf.append(book_id) - - - if books_no_shelf: - no_shelf_book_nodes = util.get_parent_meta(url=book_base_url, headers=bookstack_headers, - parent_ids=books_no_shelf, path_prefix=config.unassigned_book_dir) - no_shelf_page_nodes = util.get_child_meta(url=page_base_url, headers=bookstack_headers, - parent_nodes=no_shelf_book_nodes, filter_empty=True) - for key, value in no_shelf_page_nodes.items(): - page_nodes[key] = value - - - # for key, page in page_nodes.items(): - # print(page.file_path) - - # for format in config.user_inputs.export_formats: - # for key, page in page_nodes.items(): - # if config.user_inputs.export_meta: - # pass - - base_dir_name = archiver_util.generate_root_folder(config.base_dir_name) - log.info(base_dir_name) - - archive: Archiver = Archiver(base_dir_name, config.user_inputs.export_meta, page_base_url, bookstack_headers) - - # for _, page in page_nodes.items(): - # archive.archive(page, 'markdown') - # First create local archive and tar ball - for _, page in page_nodes.items(): - for format in export_formats: - # instead of sleep, implement back off retry in utils - sleep(2) - archive.gather(page, format) + ## start archive ## + archive: Archiver = Archiver(base_export_dir, config.user_inputs.export_meta, page_base_url, bookstack_headers) # create tar - archive.archive(page_nodes, export_formats) # archive to remote targets - if remote_targets: - for target in remote_targets: - archive.archive_remote(target) \ No newline at end of file + archive.archive_remote(remote_targets) \ No newline at end of file
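
Note on the TODO left in Archiver.archive ("instead of sleep, implement backoff retry in utils"): a minimal sketch of what that helper could look like in archiver/util.py, wrapping the existing requests-based get_byte_response. The max_retries and backoff_factor parameters are illustrative names and defaults, not part of this patch.

import logging
from time import sleep
from typing import Dict

import requests

log = logging.getLogger(__name__)

def get_byte_response(url: str, headers: Dict[str, str], max_retries: int = 3, backoff_factor: float = 0.5) -> bytes:
    # GET the export url, retrying transient failures with exponential backoff
    for attempt in range(max_retries + 1):
        try:
            response = requests.get(url=url, headers=headers, timeout=30)
            response.raise_for_status()
            return response.content
        except requests.exceptions.RequestException as req_err:
            if attempt == max_retries:
                raise
            delay = backoff_factor * (2 ** attempt)  # 0.5s, 1s, 2s, ...
            log.warning("Request to %s failed (%s), retrying in %.1fs", url, req_err, delay)
            sleep(delay)
    return b""  # unreachable, but keeps the return type explicit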
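
Similarly, for the "need to implement pagination for apis" comment carried over into run.py: get_all_ids in exporter/util.py could walk the paginated list endpoints instead of reading a single response. A sketch assuming the Bookstack list APIs accept count/offset query parameters and report a total alongside data; the page size of 100 is an illustrative default.

from typing import Dict, List

import requests

def get_all_ids(url: str, headers: Dict[str, str], page_size: int = 100) -> List[int]:
    # collect ids across every page of a Bookstack list endpoint
    all_ids: List[int] = []
    offset = 0
    while True:
        response = requests.get(url=url, headers=headers, params={"count": page_size, "offset": offset}, timeout=30)
        response.raise_for_status()
        body = response.json()
        if not body or not body.get("data"):
            break
        all_ids.extend(item["id"] for item in body["data"])
        offset += page_size
        if offset >= body.get("total", 0):
            break
    return all_ids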