From 02f44f4b2b3a9a10e8a6a103c0bc62b95dd39260 Mon Sep 17 00:00:00 2001 From: ACA Date: Sun, 3 Mar 2024 13:35:22 +0100 Subject: [PATCH] core: improve typing, fix some type assignments, docstrings use pathlib in some places --- lncrawl/core/app.py | 5 +++-- lncrawl/core/crawler.py | 6 ++++-- lncrawl/core/downloader.py | 4 ++-- lncrawl/core/logconfig.py | 4 ++-- lncrawl/core/novel_info.py | 3 ++- lncrawl/core/proxy.py | 4 ++-- lncrawl/core/taskman.py | 3 ++- 7 files changed, 17 insertions(+), 12 deletions(-) diff --git a/lncrawl/core/app.py b/lncrawl/core/app.py index 83f92f8e0..809004725 100644 --- a/lncrawl/core/app.py +++ b/lncrawl/core/app.py @@ -2,6 +2,7 @@ import logging import os import shutil +from pathlib import Path from threading import Thread from typing import Dict, List, Optional, Tuple from urllib.parse import urlparse @@ -167,7 +168,7 @@ def start_download(self): fetch_chapter_images(self) save_metadata(self, True) - if not self.output_formats.get("json", False): + if not self.output_formats.get(OutputFormat.json.value, False): shutil.rmtree(os.path.join(self.output_path, "json"), ignore_errors=True) if self.can_do("logout"): @@ -233,7 +234,7 @@ def compress_books(self, archive_singles=False): logger.info("Not archiving single file inside %s" % root_dir) archived_file = os.path.join(root_dir, file_list[0]) else: - base_path = os.path.join(self.output_path, output_name) + base_path = Path(self.output_path) / output_name logger.info("Compressing %s to %s" % (root_dir, base_path)) archived_file = shutil.make_archive( base_path, diff --git a/lncrawl/core/crawler.py b/lncrawl/core/crawler.py index bafde7435..20c89b00b 100644 --- a/lncrawl/core/crawler.py +++ b/lncrawl/core/crawler.py @@ -155,6 +155,7 @@ def download_chapters( unit="item", fail_fast=fail_fast, ) + chapter = None for (index, future) in futures.items(): try: chapter = chapters[index] @@ -162,8 +163,9 @@ def download_chapters( self.extract_chapter_images(chapter) chapter.success = True except Exception as e: - chapter.body = "" - chapter.success = False + if isinstance(chapter, Chapter): + chapter.body = "" + chapter.success = False if isinstance(e, KeyboardInterrupt): break finally: diff --git a/lncrawl/core/downloader.py b/lncrawl/core/downloader.py index 57d836a58..39bcf18a6 100644 --- a/lncrawl/core/downloader.py +++ b/lncrawl/core/downloader.py @@ -80,9 +80,9 @@ def fetch_chapter_body(app): old_chapter = json.load(file) chapter.update(**old_chapter) except FileNotFoundError: - logger.info("Missing File: %s Retrieved!" % (file_name)) + logger.info("Missing File: %s Retrieved!" % file_name) except json.JSONDecodeError: - logger.info("Unable to decode JSON from the file: %s" % (file_name)) + logger.info("Unable to decode JSON from the file: %s" % file_name) except Exception as e: logger.exception("An error occurred while reading the file:", e) diff --git a/lncrawl/core/logconfig.py b/lncrawl/core/logconfig.py index 660b6cd50..898676c3f 100644 --- a/lncrawl/core/logconfig.py +++ b/lncrawl/core/logconfig.py @@ -65,8 +65,8 @@ def configure_logging(): } if not log_file: del config["handlers"]["file"] - config["root"]["level"] = level + config["root"]["level"] = logging.getLevelName(level) config["root"]["handlers"] = ["console"] - config["handlers"]["console"]["level"] = level + config["handlers"]["console"]["level"] = logging.getLevelName(level) logging.config.dictConfig(config) diff --git a/lncrawl/core/novel_info.py b/lncrawl/core/novel_info.py index 77d487ef0..ef2c542cb 100644 --- a/lncrawl/core/novel_info.py +++ b/lncrawl/core/novel_info.py @@ -1,6 +1,7 @@ import math import os import re +from pathlib import Path from typing import Dict from .. import constants as C @@ -109,5 +110,5 @@ def save_metadata(app, completed=False): ) os.makedirs(app.output_path, exist_ok=True) - file_name = os.path.join(app.output_path, C.META_FILE_NAME) + file_name = Path(app.output_path) / C.META_FILE_NAME novel.to_json(file_name, encoding="utf-8", indent=2) diff --git a/lncrawl/core/proxy.py b/lncrawl/core/proxy.py index ed7dc2822..e2247ba40 100644 --- a/lncrawl/core/proxy.py +++ b/lncrawl/core/proxy.py @@ -156,7 +156,7 @@ def __find_proxies(): __proxy_list.setdefault(scheme, []) if __proxy_visited_at.get(url, 0) + __proxy_ttl < time.time(): __validate_and_add(scheme, ip, url) - __proxy_visited_at[url] = time.time() + __proxy_visited_at[url] = int(time.time()) wait_times = 3 * 60 while wait_times and not __has_exit: @@ -178,6 +178,6 @@ def start_proxy_fetcher(): Thread(target=__find_proxies, daemon=False).start() -def stop_proxy_fetcher(): +def stop_proxy_fetcher(*args, **kwargs): global __has_exit __has_exit = True diff --git a/lncrawl/core/taskman.py b/lncrawl/core/taskman.py index e9b186d1b..7fccb7f38 100644 --- a/lncrawl/core/taskman.py +++ b/lncrawl/core/taskman.py @@ -136,7 +136,7 @@ def domain_gate(self, hostname: str = ""): """Limit number of entry per hostname. Args: - url: A fully qualified url. + hostname: A fully qualified url. Returns: A semaphore object to wait. @@ -179,6 +179,7 @@ def resolve_futures( disable_bar: Hides the progress bar if True. desc: The progress bar description unit: The progress unit name + fail_fast: Fail on first error """ if not futures: return