From ba2fe5b34cac4f429efd428489565412a2c109b5 Mon Sep 17 00:00:00 2001 From: sky-cake <> Date: Fri, 8 Mar 2024 01:24:38 -0500 Subject: [PATCH] feat: filter threads with white/black lists --- .gitignore | 4 ++++ readme.md | 40 +++++++++++++++++++++------------------- requirements.txt | 1 + scraper.ex.json | 10 ++++++---- scraper/Indexer.py | 40 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 72 insertions(+), 23 deletions(-) diff --git a/.gitignore b/.gitignore index b277cc2..fd3a15c 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,5 @@ scraper.json +venv +*.pyc +launch.json +media/ \ No newline at end of file diff --git a/readme.md b/readme.md index c866209..8d16351 100644 --- a/readme.md +++ b/readme.md @@ -153,6 +153,8 @@ The object key defines the board's name in the database. You may use `sourceBoar * `requestTimeoutFile` - Request timeout for image requests. Suggested values are between `40` and `120`. * `requestThrottleBoard` - Minimum time between requests for each board, in seconds. Suggested values are between `0.3` and `1.0`. Don't go too high on fast boards. * `requestThrottleGlobal` - Minimum time between requests for all boards, in seconds. Be careful with this. If you have more than a few boards, just set it to zero. +* `blacklistPostFilter` - A board level config that's a regex pattern. If specified, will **never** download threads with matching subjects or comments. +* `whitelistPostFilter` - A board level config that's a regex pattern. If specified, will **only** download threads with matching subjects or comments. If `blacklistPostFilter` is specified and has a match, a thread will not be downloaded despite a `whitelistPostFilter` match. 
## Database Setup @@ -308,19 +310,19 @@ CREATE TABLE `%%BOARD%%_threads` ( SQL Code for Triggers ```sql -DELIMITER ;; +DELIMITER \\ -DROP PROCEDURE IF EXISTS `update_thread_%%BOARD%%`;; -DROP PROCEDURE IF EXISTS `create_thread_%%BOARD%%`;; -DROP PROCEDURE IF EXISTS `delete_thread_%%BOARD%%`;; -DROP PROCEDURE IF EXISTS `insert_image_%%BOARD%%`;; -DROP PROCEDURE IF EXISTS `delete_image_%%BOARD%%`;; -DROP PROCEDURE IF EXISTS `insert_post_%%BOARD%%`;; -DROP PROCEDURE IF EXISTS `delete_post_%%BOARD%%`;; +DROP PROCEDURE IF EXISTS `update_thread_%%BOARD%%`\\ +DROP PROCEDURE IF EXISTS `create_thread_%%BOARD%%`\\ +DROP PROCEDURE IF EXISTS `delete_thread_%%BOARD%%`\\ +DROP PROCEDURE IF EXISTS `insert_image_%%BOARD%%`\\ +DROP PROCEDURE IF EXISTS `delete_image_%%BOARD%%`\\ +DROP PROCEDURE IF EXISTS `insert_post_%%BOARD%%`\\ +DROP PROCEDURE IF EXISTS `delete_post_%%BOARD%%`\\ -DROP TRIGGER IF EXISTS `before_ins_%%BOARD%%`;; -DROP TRIGGER IF EXISTS `after_ins_%%BOARD%%`;; -DROP TRIGGER IF EXISTS `after_del_%%BOARD%%`;; +DROP TRIGGER IF EXISTS `before_ins_%%BOARD%%`\\ +DROP TRIGGER IF EXISTS `after_ins_%%BOARD%%`\\ +DROP TRIGGER IF EXISTS `after_del_%%BOARD%%`\\ CREATE PROCEDURE `update_thread_%%BOARD%%` (ins INT, tnum INT, subnum INT, timestamp INT, media INT, email VARCHAR(100)) BEGIN @@ -335,17 +337,17 @@ BEGIN op.nreplies = IF(ins, (op.nreplies + 1), (op.nreplies - 1)), op.nimages = IF(media, IF(ins, (op.nimages + 1), (op.nimages - 1)), op.nimages) WHERE op.thread_num = tnum; -END;; +END\\ CREATE PROCEDURE `create_thread_%%BOARD%%` (num INT, timestamp INT) BEGIN INSERT IGNORE INTO `%%BOARD%%_threads` VALUES (num, timestamp, timestamp, timestamp, NULL, NULL, timestamp, 0, 0, 0, 0); -END;; +END\\ CREATE PROCEDURE `delete_thread_%%BOARD%%` (tnum INT) BEGIN DELETE FROM `%%BOARD%%_threads` WHERE thread_num = tnum; -END;; +END\\ CREATE PROCEDURE `insert_image_%%BOARD%%` (n_media_hash VARCHAR(25), n_media VARCHAR(20), n_preview VARCHAR(20), n_op INT) BEGIN @@ -366,12 +368,12 @@ 
BEGIN preview_reply = COALESCE(preview_reply, VALUES(preview_reply)), media = COALESCE(media, VALUES(media)); END IF; -END;; +END\\ CREATE PROCEDURE `delete_image_%%BOARD%%` (n_media_id INT) BEGIN UPDATE `%%BOARD%%_images` SET total = (total - 1) WHERE media_id = n_media_id; -END;; +END\\ CREATE TRIGGER `before_ins_%%BOARD%%` BEFORE INSERT ON `%%BOARD%%` FOR EACH ROW @@ -380,7 +382,7 @@ BEGIN CALL insert_image_%%BOARD%%(NEW.media_hash, NEW.media_orig, NEW.preview_orig, NEW.op); SET NEW.media_id = LAST_INSERT_ID(); END IF; -END;; +END\\ CREATE TRIGGER `after_ins_%%BOARD%%` AFTER INSERT ON `%%BOARD%%` FOR EACH ROW @@ -389,7 +391,7 @@ BEGIN CALL create_thread_%%BOARD%%(NEW.num, NEW.timestamp); END IF; CALL update_thread_%%BOARD%%(1, NEW.thread_num, NEW.subnum, NEW.timestamp, NEW.media_id, NEW.email); -END;; +END\\ CREATE TRIGGER `after_del_%%BOARD%%` AFTER DELETE ON `%%BOARD%%` FOR EACH ROW @@ -401,7 +403,7 @@ BEGIN IF OLD.media_hash IS NOT NULL THEN CALL delete_image_%%BOARD%%(OLD.media_id); END IF; -END;; +END\\ DELIMITER ; ``` diff --git a/requirements.txt b/requirements.txt index 1abe921..2203fed 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,4 @@ psutil~=5.8 pymysql~=1.0 pytz~=2021.2 requests~=2.26 +redis \ No newline at end of file diff --git a/scraper.ex.json b/scraper.ex.json index e8b44af..f629695 100644 --- a/scraper.ex.json +++ b/scraper.ex.json @@ -38,9 +38,11 @@ }, "boards": { - "example": { - "sourceBoard": "a" - } - } + "g": { + "timeBetweenIndexUpdates": 30, + "blacklistPostFilter": ".*(gpt|local models|stable diff|voice synth|macos|windows).*", + "whitelistPostFilter": ".*(dra/g/on maid|maids|big numbers).*" + } + } } } diff --git a/scraper/Indexer.py b/scraper/Indexer.py index b2e42b7..2627def 100644 --- a/scraper/Indexer.py +++ b/scraper/Indexer.py @@ -7,6 +7,43 @@ from .ItemTopic import * from .Utils import * + +def should_archive_topic(topic_d, conf): + """ + - If a post is blacklisted and whitelisted, it will not be archived - 
blacklisted filters beat whitelisted filters. + - If only a blacklist is specified, only skip blacklisted posts and archive everything else. + - If only a whitelist is specified, only archive whitelisted posts and skip everything else. + - If no lists are specified, archive everything. + """ + + subject = topic_d.get('sub', False) + comment = topic_d.get('com', False) + + blacklist_post_filter = conf.get('blacklistPostFilter', False) + if blacklist_post_filter: + if subject: + if re.fullmatch(blacklist_post_filter, subject, re.IGNORECASE) is not None: + return False + + if comment: + if re.fullmatch(blacklist_post_filter, comment, re.IGNORECASE) is not None: + return False + + whitelist_post_filter = conf.get('whitelistPostFilter', False) + if whitelist_post_filter: + if subject: + if re.fullmatch(whitelist_post_filter, subject, re.IGNORECASE) is not None: + return True + + if comment: + if re.fullmatch(whitelist_post_filter, comment, re.IGNORECASE) is not None: + return True + + return False + + return True + + class Indexer(Thread): def __init__(self, board, **args): super().__init__(board, **args) @@ -96,6 +133,9 @@ def run(self): ): raise Exception() + if not should_archive_topic(topic_d, self.board.conf): + continue + topic = None # find the corresponding topic object