From 2532022483ea615b9d2b4715b427888f684a5e46 Mon Sep 17 00:00:00 2001 From: Peter Chang Date: Tue, 19 Sep 2023 04:48:28 -0400 Subject: [PATCH] initial code refactor --- bookstack_file_exporter/archiver/archiver.py | 49 ++++--- bookstack_file_exporter/archiver/util.py | 3 - bookstack_file_exporter/exporter/exporter.py | 95 ++++++++++++++ bookstack_file_exporter/exporter/node.py | 9 +- bookstack_file_exporter/exporter/util.py | 48 +------ bookstack_file_exporter/run.py | 131 +++++++------------ 6 files changed, 180 insertions(+), 155 deletions(-) create mode 100644 bookstack_file_exporter/exporter/exporter.py diff --git a/bookstack_file_exporter/archiver/archiver.py b/bookstack_file_exporter/archiver/archiver.py index ffd0a3d..6abb186 100644 --- a/bookstack_file_exporter/archiver/archiver.py +++ b/bookstack_file_exporter/archiver/archiver.py @@ -1,6 +1,6 @@ from typing import List, Dict, Union -from pathlib import Path -import json +from time import sleep +from datetime import datetime from bookstack_file_exporter.exporter.node import Node from bookstack_file_exporter.archiver import util @@ -19,6 +19,8 @@ "tar": _TAR_GZ_SUFFIX } +_DATE_STR_FORMAT = "%Y-%m-%d_%H-%M-%S" + class Archiver: """ Archiver pulls all the necessary files from upstream and then pushes them to the specified backup location(s) @@ -32,41 +34,47 @@ class Archiver: Returns: Archiver instance with attributes that are accessible for use for file level archival and backup. """ - def __init__(self, root_dir: str, add_meta: bool, base_page_url: str, headers: Dict[str, str]): - self.root_dir = root_dir + def __init__(self, base_dir: str, add_meta: bool, base_page_url: str, headers: Dict[str, str]): + self.base_dir = base_dir self.add_meta = add_meta self.base_page_url = base_page_url self.headers = headers # remote_system to function mapping self._remote_exports = {'minio': self._archive_minio, 's3': self._archive_s3} - # self._tar_file = "" + self._root_dir = self.generate_root_folder(self.base_dir) self._minio_token = "" self._minio_id = "" # create local tarball first + def archive(self, page_nodes: Dict[int, Node], export_formats: List[str]): + for _, page in page_nodes.items(): + for export_format in export_formats: + # instead of sleep, implement backoff retry in utils + sleep(0.5) + self._gather(page, export_format) + self._tar_dir() + # convert to bytes to be agnostic to end destination (future use case?)
- def gather(self, page_node: Node, export_format: str): + def _gather(self, page_node: Node, export_format: str): raw_data = self._get_data_format(page_node.id, export_format) self._gather_local(page_node.file_path, raw_data, export_format, page_node.meta) - - def archive(self): - self._tar_dir() - - # send to remote systems - def archive_remote(self, remote_dest: str): - self._remote_exports[remote_dest]() def _gather_local(self, page_path: str, data: bytes, export_format: str, meta_data: Union[bytes, None]): - file_path = self._get_combined_path(page_path) + file_path = f"{self._root_dir}/{page_path}" file_full_name = f"{file_path}{_FILE_EXTENSION_MAP[export_format]}" util.write_bytes(file_path=file_full_name, data=data) if self.add_meta: meta_file_name = f"{file_path}{_FILE_EXTENSION_MAP['meta']}" util.dump_json(file_name=meta_file_name, data=meta_data) + # send to remote systems + def archive_remote(self, remote_targets: List[str]): + if remote_targets: + for target in remote_targets: + self._remote_exports[target]() def _tar_dir(self): - # tar_path = f"{self.root_dir}{_FILE_EXTENSION_MAP['tar']}" - util.create_tar(self.root_dir, _FILE_EXTENSION_MAP['tar']) + util.create_tar(self._root_dir, _FILE_EXTENSION_MAP['tar']) def _archive_minio(self): pass @@ -78,9 +86,10 @@ def _archive_s3(self): def _get_data_format(self, page_node_id: int, export_format: str) -> bytes: url = self._get_export_url(node_id=page_node_id, export_format=export_format) return util.get_byte_response(url=url, headers=self.headers) - - def _get_combined_path(self, dir_name: str) -> str: - return f"{self.root_dir}/{dir_name}" def _get_export_url(self, node_id: int, export_format: str) -> str: - return f"{self.base_page_url}/{node_id}/{_EXPORT_API_PATH}/{export_format}" \ No newline at end of file + return f"{self.base_page_url}/{node_id}/{_EXPORT_API_PATH}/{export_format}" + + @staticmethod + def generate_root_folder(base_folder_name: str) -> str: + return base_folder_name + "_" + datetime.now().strftime(_DATE_STR_FORMAT) \ No newline at end of file diff --git a/bookstack_file_exporter/archiver/util.py b/bookstack_file_exporter/archiver/util.py index 6869e0b..031f579 100644 --- a/bookstack_file_exporter/archiver/util.py +++ b/bookstack_file_exporter/archiver/util.py @@ -10,9 +10,6 @@ log = logging.getLogger(__name__) -def generate_root_folder(base_folder_name: str) -> str: - return base_folder_name + "_" + datetime.now().strftime('%Y-%m-%d_%H-%M-%S') - def get_byte_response(url: str, headers: Dict[str, str]) -> bytes: try: response = requests.get(url=url, headers=headers) diff --git a/bookstack_file_exporter/exporter/exporter.py b/bookstack_file_exporter/exporter/exporter.py new file mode 100644 index 0000000..77a5c0c --- /dev/null +++ b/bookstack_file_exporter/exporter/exporter.py @@ -0,0 +1,95 @@ +from typing import Dict, List, Union + +import bookstack_file_exporter.exporter.util as util +from bookstack_file_exporter.exporter.node import Node + + +# _API_SUFFIX_PATHS = { +# "shelves": "api/shelves", +# "books": "api/books", +# "chapters": "api/chapters", +# "pages": "api/pages" +# } + +class NodeExporter(): + """ + NodeExporter class provides an interface for building the Node representations of Bookstack resources (pages, books, etc.) and their parent/child relationships. + + Raises: + + ValueError if data returned from the Bookstack API is empty or not in the desired format.
+ """ + def __init__(self, api_urls: Dict[str, str], headers: Dict[str,str]): + self.api_urls = api_urls + self.headers = headers + + def get_shelf_nodes(self) -> Dict[int, Node]: + """ + Function to get all shelf Node instances + :returns: Dict[int, Node] for all shelf nodes + """ + base_url = self.api_urls["shelves"] + all_parents: List[int] = util.get_all_ids(base_url, self.headers) + if not all_parents: + raise ValueError(f"No resources returned from Bookstack api url: {base_url}") + return self._get_parents(base_url, all_parents) + + def _get_parents(self, base_url: str, parent_ids: List[int], path_prefix: Union[str, None] = None) -> Dict[int, Node]: + parent_nodes = {} + for parent_id in parent_ids: + parent_url = f"{base_url}/{parent_id}" + parent_data = util.get_json_response(url=parent_url, headers=self.headers) + parent_nodes[parent_id] = Node(parent_data, path_prefix=path_prefix) + return parent_nodes + + def get_chapter_nodes(self, book_nodes: Dict[int, Node]): + # Chapters are treated a little differently + # They are under books like pages but have their own children + # i.e. not a terminal node + base_url = self.api_urls["chapters"] + all_chapters: List[int] = util.get_all_ids(base_url, self.headers) + if not all_chapters: + raise ValueError(f"No resources returned from Bookstack api url: {base_url}") + return self._get_chapters(base_url, all_chapters, book_nodes) + + def _get_chapters(self, base_url: str, all_chapters: List[int], book_nodes: Dict[int, Node]): + chapter_nodes = {} + for chapter_id in all_chapters: + chapter_url = f"{base_url}/{chapter_id}" + chapter_data = util.get_json_response(url=chapter_url, headers=self.headers) + book_id = chapter_data['book_id'] + chapter_nodes[chapter_id] = Node(chapter_data, book_nodes[book_id]) + return chapter_nodes + + def get_child_nodes(self, resource_type: str, parent_nodes: Dict[int, Node], filter_empty: bool = True): + base_url = self.api_urls[resource_type] + return self._get_children(base_url, parent_nodes, filter_empty) + + def _get_children(self, base_url: str, parent_nodes: Dict[int, Node], filter_empty: bool): + child_nodes = {} + for _, parent in parent_nodes.items(): + if parent.children: + for child in parent.children: + child_id = child['id'] + child_url = f"{base_url}/{child_id}" + child_data = util.get_json_response(url=child_url, headers=self.headers) + child_node = Node(child_data, parent) + if filter_empty: + if not child_node.empty: + child_nodes[child_id] = child_node + else: + child_nodes[child_id] = child_node + return child_nodes + + def get_unassigned_books(self, existing_resources: Dict[int, Node], path_prefix: str) -> Dict[int, Node]: + base_url = self.api_urls["books"] + all_resources: List[int] = util.get_all_ids(url=base_url, headers=self.headers) + unassigned = [] + # get all existing ones and compare against current known resources + for resource_id in all_resources: + if resource_id not in existing_resources: + unassigned.append(resource_id) + if not unassigned: + raise ValueError(f"No unassigned resources found for type: {base_url}") + # books with no shelf treated like a parent resource + return self._get_parents(base_url, unassigned, path_prefix) \ No newline at end of file diff --git a/bookstack_file_exporter/exporter/node.py b/bookstack_file_exporter/exporter/node.py index 64c453f..d4aa849 100644 --- a/bookstack_file_exporter/exporter/node.py +++ b/bookstack_file_exporter/exporter/node.py @@ -9,7 +9,7 @@ class Node(): """ - Node class provides an interface to create and reference bookstack 
child/parent relationships for resources like pages, books, chapters, and shelves. + Node class provides an interface to create bookstack child/parent relationships for resources like pages, books, chapters, and shelves. Args: metadata: Dict[str, Union[str, int]] (required) = The metadata of the resource from bookstack api @@ -23,7 +23,7 @@ class Node(): """ def __init__(self, meta: Dict[str, Union[str, int]], parent: Union['Node', None] = None, path_prefix: Union[str, None] = None): self.meta = meta - self.__parent = parent + self._parent = parent self._path_prefix = path_prefix self.name: str = "" self.id: int = 0 @@ -39,9 +39,8 @@ def _initialize(self): self.id = self.meta['id'] self._display_name = self.meta['name'] # get base file path from parent if it exists - if self.__parent: - self._file_path = f"{self.__parent.file_path}/{self.name}" - # self._file_path = self.__parent.file_path + '/' + self.name + if self._parent: + self._file_path = f"{self._parent.file_path}/{self.name}" # normalize path prefix if it does not exist if not self._path_prefix: self._path_prefix = "" diff --git a/bookstack_file_exporter/exporter/util.py b/bookstack_file_exporter/exporter/util.py index 7c3b8a7..ff0afdf 100644 --- a/bookstack_file_exporter/exporter/util.py +++ b/bookstack_file_exporter/exporter/util.py @@ -17,49 +17,9 @@ def get_json_response(url: str, headers: Dict[str, str], verify: bool = True, ti def get_all_ids(url: str, headers: Dict[str, str]) -> List[int]: ids_api_meta = get_json_response(url=url, headers=headers) - all_ids = [item['id'] for item in ids_api_meta['data']] - return all_ids - -def get_parent_meta(url: str, headers: Dict[str, str], parent_ids: List[int], - path_prefix: Union[str, None] = None) -> Dict[int, Node]: - parent_nodes = {} - for parent_id in parent_ids: - parent_url = f"{url}/{parent_id}" - # parent_url = url + "/" + str(parent_id) - parent_data = get_json_response(url=parent_url, headers=headers) - parent_nodes[parent_id] = Node(parent_data, path_prefix=path_prefix) - return parent_nodes - -def get_chapter_meta(url: str, headers: Dict[str, str], chapters: List[int], - books:Dict[int, Node], path_prefix: Union[str, None] = None) -> Dict[int, Node]: - chapter_nodes = {} - for chapter_id in chapters: - chapter_url = f"{url}/{chapter_id}" - # chapter_url = url + "/" + str(chapter_id) - chapter_data = get_json_response(url=chapter_url, headers=headers) - book_id = chapter_data['book_id'] - chapter_nodes[chapter_id] = Node(chapter_data, books[book_id], path_prefix=path_prefix) - return chapter_nodes - -def get_child_meta(url: str, headers: Dict[str, str], parent_nodes: Dict[int, Node], - filter_empty: bool = False, path_prefix: Union[str, None] = None) -> Dict[int, Node]: - child_nodes = {} - for _, parent in parent_nodes.items(): - if parent.children: - for child in parent.children: - child_id = child['id'] - child_url = f"{url}/{child_id}" - # child_url = url + "/" + str(child_id) - child_data = get_json_response(url=child_url, headers=headers) - child_node = Node(child_data, parent, path_prefix=path_prefix) - if filter_empty: - if not child_node.empty: - child_nodes[child_id] = child_node - else: - child_nodes[child_id] = child_node - return child_nodes - -def get_page_export(url: str, headers: Dict[str, str]): - pass + if ids_api_meta: + return [item['id'] for item in ids_api_meta['data']] + else: + return [] diff --git a/bookstack_file_exporter/run.py b/bookstack_file_exporter/run.py index 5cd1138..784b900 100644 --- a/bookstack_file_exporter/run.py +++ 
b/bookstack_file_exporter/run.py @@ -5,114 +5,79 @@ from typing import Dict, Union, List from bookstack_file_exporter.config_helper.config_helper import ConfigNode -from bookstack_file_exporter.exporter import util from bookstack_file_exporter.exporter.node import Node +from bookstack_file_exporter.exporter.exporter import NodeExporter from bookstack_file_exporter.archiver import util as archiver_util from bookstack_file_exporter.archiver.archiver import Archiver - log = logging.getLogger(__name__) -# def get_shelve_nodes(shelve_url: str, headers: Dict[str, str]) -> Dict[int, Node]: -# all_shelves: List[int] = util.get_all_ids(url=shelve_url, headers=headers) -# return util.get_parent_meta(url=shelve_url, headers=headers, parent_ids=all_shelves) - - -# def get_chapters(chapter_url: str, headers: Dict[str, str]): -# pass - def test(args: argparse.Namespace, token_id_env: str, token_secret_env: str): + ## get configuration from helper config = ConfigNode(args) config.token_id= os.environ.get(token_id_env, "") config.token_secret = os.environ.get(token_secret_env, "") ## convenience vars bookstack_headers = config.headers + api_urls = config.urls export_formats = config.user_inputs.formats remote_targets = config.user_inputs.remote_targets - shelve_base_url = config.urls['shelves'] - book_base_url = config.urls['books'] - chapter_base_url = config.urls['chapters'] + unassigned_dir = config.unassigned_book_dir page_base_url = config.urls['pages'] + base_export_dir = config.base_dir_name + + #### Export Data ##### + # need to implement pagination for apis + ## Use exporter class to get all the resources (pages, books, etc.) and their relationships + exportHelper = NodeExporter(api_urls, bookstack_headers) ## shelves - # shelve_nodes: Dict[int, Node] = get_shelve_nodes(shelve_base_url, bookstack_headers) - # need to implement pagination - all_shelves: List[int] = util.get_all_ids(url=shelve_base_url, headers=bookstack_headers) - shelve_nodes: Dict[int, Node] = util.get_parent_meta(url=shelve_base_url, headers=bookstack_headers, - parent_ids=all_shelves) - + shelve_nodes: Dict[int, Node] = exportHelper.get_shelf_nodes() ## books - book_nodes: Dict[int, Node] = util.get_child_meta(url=book_base_url, headers=bookstack_headers, - parent_nodes=shelve_nodes) - - ## pages - page_nodes = util.get_child_meta(url=page_base_url, headers=bookstack_headers, parent_nodes=book_nodes, filter_empty=True) - - - ## chapters - # get_chapters(chapter_base_url, bookstack_headers) + book_nodes: Dict[int, Node] = exportHelper.get_child_nodes("books", shelve_nodes) + # books with no shelf assignment + # default will be put in "unassigned" directory relative to backup dir + # catch ValueError for missing response/empty data if no unassigned books exist + try: + books_no_shelf: Dict[int, Node] = exportHelper.get_unassigned_books(book_nodes, unassigned_dir) + except ValueError: + log.info("No unassigned books found") + books_no_shelf = {} + + # add new book nodes to map + # these should not already be present in map + # since we started with shelves first and then moved our way down.
+ if books_no_shelf: + for key, value in books_no_shelf.items(): + book_nodes[key] = value + + ## chapters (if they exist) + # chapter nodes are treated a little differently + # they are children under books + try: + chapter_nodes: Dict[int, Node] = exportHelper.get_chapter_nodes(book_nodes) + except ValueError: + log.info("No chapter data was found") + chapter_nodes = {} - all_chapters: List[int] = util.get_all_ids(url=chapter_base_url, headers=bookstack_headers) - # check for chapters since they are optional - if all_chapters: - chapter_nodes: Dict[int, Node] = util.get_chapter_meta(url=chapter_base_url, headers=bookstack_headers, - chapters=all_chapters, books=book_nodes) - # add all pages in a chapter first - page_chapter_nodes = util.get_child_meta(url=page_base_url, headers=bookstack_headers, parent_nodes=chapter_nodes, filter_empty=True) + ## pages + page_nodes: Dict[int, Node] = exportHelper.get_child_nodes("pages", book_nodes) + # add chapter node pages + # replace existing page node if found with proper chapter parent + if chapter_nodes: + page_chapter_nodes: Dict[int, Node] = exportHelper.get_child_nodes("pages", chapter_nodes) + ## since we filter empty, check if there is any content + ## add all chapter pages to existing page nodes if page_chapter_nodes: for key, value in page_chapter_nodes.items(): page_nodes[key] = value - - # print(chapter_nodes) - # for _, value in chapter_nodes.items(): - # print(value.children) - - ## get books with no shelf - all_books: List[int] = util.get_all_ids(url=book_base_url, headers=bookstack_headers) - # filter out already seen books - books_no_shelf = [] - for book_id in all_books: - if book_id not in book_nodes.keys(): - books_no_shelf.append(book_id) - - - if books_no_shelf: - no_shelf_book_nodes = util.get_parent_meta(url=book_base_url, headers=bookstack_headers, - parent_ids=books_no_shelf, path_prefix=config.unassigned_book_dir) - no_shelf_page_nodes = util.get_child_meta(url=page_base_url, headers=bookstack_headers, - parent_nodes=no_shelf_book_nodes, filter_empty=True) - for key, value in no_shelf_page_nodes.items(): - page_nodes[key] = value - - - # for key, page in page_nodes.items(): - # print(page.file_path) - - # for format in config.user_inputs.export_formats: - # for key, page in page_nodes.items(): - # if config.user_inputs.export_meta: - # pass - - base_dir_name = archiver_util.generate_root_folder(config.base_dir_name) - log.info(base_dir_name) - - archive: Archiver = Archiver(base_dir_name, config.user_inputs.export_meta, page_base_url, bookstack_headers) - - # for _, page in page_nodes.items(): - # archive.archive(page, 'markdown') - # First create local archive and tar ball - for _, page in page_nodes.items(): - for format in export_formats: - # instead of sleep, implement back off retry in utils - sleep(2) - archive.gather(page, format) + ## start archive ## + archive: Archiver = Archiver(base_export_dir, config.user_inputs.export_meta, page_base_url, bookstack_headers) # create tar - archive.archive(page_nodes, export_formats) # archive to remote targets - if remote_targets: - for target in remote_targets: - archive.archive_remote(target) \ No newline at end of file + archive.archive_remote(remote_targets) \ No newline at end of file
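
Note on the TODO left in Archiver.archive ("instead of sleep, implement backoff retry in utils"): a minimal sketch of what that helper could look like in archiver/util.py, wrapping the existing requests-based get_byte_response. The max_retries and backoff_factor parameters are illustrative names and defaults, not part of this patch.

import logging
from time import sleep
from typing import Dict

import requests

log = logging.getLogger(__name__)

def get_byte_response(url: str, headers: Dict[str, str], max_retries: int = 3, backoff_factor: float = 0.5) -> bytes:
    # GET the export url, retrying transient failures with exponential backoff
    for attempt in range(max_retries + 1):
        try:
            response = requests.get(url=url, headers=headers, timeout=30)
            response.raise_for_status()
            return response.content
        except requests.exceptions.RequestException as req_err:
            if attempt == max_retries:
                raise
            delay = backoff_factor * (2 ** attempt)  # 0.5s, 1s, 2s, ...
            log.warning("Request to %s failed (%s), retrying in %.1fs", url, req_err, delay)
            sleep(delay)
    return b""  # unreachable, but keeps the return type explicit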
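
Similarly, for the "need to implement pagination for apis" comment carried over into run.py: get_all_ids in exporter/util.py could walk the paginated list endpoints instead of reading a single response. A sketch assuming the Bookstack list APIs accept count/offset query parameters and report a total alongside data; the page size of 100 is an illustrative default.

from typing import Dict, List

import requests

def get_all_ids(url: str, headers: Dict[str, str], page_size: int = 100) -> List[int]:
    # collect ids across every page of a Bookstack list endpoint
    all_ids: List[int] = []
    offset = 0
    while True:
        response = requests.get(url=url, headers=headers, params={"count": page_size, "offset": offset}, timeout=30)
        response.raise_for_status()
        body = response.json()
        if not body or not body.get("data"):
            break
        all_ids.extend(item["id"] for item in body["data"])
        offset += page_size
        if offset >= body.get("total", 0):
            break
    return all_ids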