Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: filter threads with white/black lists #4

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,5 @@
scraper.json
venv
*.pyc
launch.json
media/
40 changes: 21 additions & 19 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,8 @@ The object key defines the board's name in the database. You may use `sourceBoar
* `requestTimeoutFile` - Request timeout for image requests. Suggested values are between `40` and `120`.
* `requestThrottleBoard` - Minimum time between requests for each board, in seconds. Suggested values are between `0.3` and `1.0`. Don't go too high on fast boards.
* `requestThrottleGlobal` - Minimum time between requests for all boards, in seconds. Be careful with this. If you have more than a few boards, just set it to zero.
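* `blacklistPostFilter` - A board-level regex pattern. If specified, threads whose subject or comment matches it are **never** downloaded. Matching is case-insensitive and the pattern must match the entire subject or comment, so wrap it in `.*` (e.g. `.*(foo|bar).*`) to match a substring.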
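* `whitelistPostFilter` - A board-level regex pattern. If specified, **only** threads whose subject or comment matches it are downloaded. Matching is case-insensitive and the pattern must match the entire subject or comment, so wrap it in `.*` to match a substring. The blacklist takes precedence: if `blacklistPostFilter` is also specified and matches, the thread is not downloaded even when `whitelistPostFilter` matches.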

## Database Setup

Expand Down Expand Up @@ -308,19 +310,19 @@ CREATE TABLE `%%BOARD%%_threads` (
<summary>SQL Code for Triggers</summary>

```sql
DELIMITER ;;
DELIMITER \\

DROP PROCEDURE IF EXISTS `update_thread_%%BOARD%%`;;
DROP PROCEDURE IF EXISTS `create_thread_%%BOARD%%`;;
DROP PROCEDURE IF EXISTS `delete_thread_%%BOARD%%`;;
DROP PROCEDURE IF EXISTS `insert_image_%%BOARD%%`;;
DROP PROCEDURE IF EXISTS `delete_image_%%BOARD%%`;;
DROP PROCEDURE IF EXISTS `insert_post_%%BOARD%%`;;
DROP PROCEDURE IF EXISTS `delete_post_%%BOARD%%`;;
DROP PROCEDURE IF EXISTS `update_thread_%%BOARD%%`\\
DROP PROCEDURE IF EXISTS `create_thread_%%BOARD%%`\\
DROP PROCEDURE IF EXISTS `delete_thread_%%BOARD%%`\\
DROP PROCEDURE IF EXISTS `insert_image_%%BOARD%%`\\
DROP PROCEDURE IF EXISTS `delete_image_%%BOARD%%`\\
DROP PROCEDURE IF EXISTS `insert_post_%%BOARD%%`\\
DROP PROCEDURE IF EXISTS `delete_post_%%BOARD%%`\\

DROP TRIGGER IF EXISTS `before_ins_%%BOARD%%`;;
DROP TRIGGER IF EXISTS `after_ins_%%BOARD%%`;;
DROP TRIGGER IF EXISTS `after_del_%%BOARD%%`;;
DROP TRIGGER IF EXISTS `before_ins_%%BOARD%%`\\
DROP TRIGGER IF EXISTS `after_ins_%%BOARD%%`\\
DROP TRIGGER IF EXISTS `after_del_%%BOARD%%`\\

CREATE PROCEDURE `update_thread_%%BOARD%%` (ins INT, tnum INT, subnum INT, timestamp INT, media INT, email VARCHAR(100))
BEGIN
Expand All @@ -335,17 +337,17 @@ BEGIN
op.nreplies = IF(ins, (op.nreplies + 1), (op.nreplies - 1)),
op.nimages = IF(media, IF(ins, (op.nimages + 1), (op.nimages - 1)), op.nimages)
WHERE op.thread_num = tnum;
END;;
END\\

CREATE PROCEDURE `create_thread_%%BOARD%%` (num INT, timestamp INT)
BEGIN
INSERT IGNORE INTO `%%BOARD%%_threads` VALUES (num, timestamp, timestamp, timestamp, NULL, NULL, timestamp, 0, 0, 0, 0);
END;;
END\\

CREATE PROCEDURE `delete_thread_%%BOARD%%` (tnum INT)
BEGIN
DELETE FROM `%%BOARD%%_threads` WHERE thread_num = tnum;
END;;
END\\

CREATE PROCEDURE `insert_image_%%BOARD%%` (n_media_hash VARCHAR(25), n_media VARCHAR(20), n_preview VARCHAR(20), n_op INT)
BEGIN
Expand All @@ -366,12 +368,12 @@ BEGIN
preview_reply = COALESCE(preview_reply, VALUES(preview_reply)),
media = COALESCE(media, VALUES(media));
END IF;
END;;
END\\

CREATE PROCEDURE `delete_image_%%BOARD%%` (n_media_id INT)
BEGIN
UPDATE `%%BOARD%%_images` SET total = (total - 1) WHERE media_id = n_media_id;
END;;
END\\

CREATE TRIGGER `before_ins_%%BOARD%%` BEFORE INSERT ON `%%BOARD%%`
FOR EACH ROW
Expand All @@ -380,7 +382,7 @@ BEGIN
CALL insert_image_%%BOARD%%(NEW.media_hash, NEW.media_orig, NEW.preview_orig, NEW.op);
SET NEW.media_id = LAST_INSERT_ID();
END IF;
END;;
END\\

CREATE TRIGGER `after_ins_%%BOARD%%` AFTER INSERT ON `%%BOARD%%`
FOR EACH ROW
Expand All @@ -389,7 +391,7 @@ BEGIN
CALL create_thread_%%BOARD%%(NEW.num, NEW.timestamp);
END IF;
CALL update_thread_%%BOARD%%(1, NEW.thread_num, NEW.subnum, NEW.timestamp, NEW.media_id, NEW.email);
END;;
END\\

CREATE TRIGGER `after_del_%%BOARD%%` AFTER DELETE ON `%%BOARD%%`
FOR EACH ROW
Expand All @@ -401,7 +403,7 @@ BEGIN
IF OLD.media_hash IS NOT NULL THEN
CALL delete_image_%%BOARD%%(OLD.media_id);
END IF;
END;;
END\\

DELIMITER ;
```
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ psutil~=5.8
pymysql~=1.0
pytz~=2021.2
requests~=2.26
redis
10 changes: 6 additions & 4 deletions scraper.ex.json
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,11 @@
},

"boards": {
"example": {
"sourceBoard": "a"
}
}
"g": {
"timeBetweenIndexUpdates": 30,
"blacklistPostFilter": ".*(gpt|local models|stable diff|voice synth|macos|windows).*",
"whitelistPostFilter": ".*(dra/g/on maid|maids|big numbers).*"
}
}
}
}
40 changes: 40 additions & 0 deletions scraper/Indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,43 @@
from .ItemTopic import *
from .Utils import *


def _any_text_matches(pattern, *texts):
    """Return True if any non-empty text in *texts* fully matches *pattern*.

    Matching is case-insensitive and uses ``re.fullmatch``, so the pattern
    has to cover the whole text (e.g. ``.*foo.*`` to match a substring).
    Missing or empty texts are skipped.
    """
    return any(
        text and re.fullmatch(pattern, text, re.IGNORECASE) is not None
        for text in texts
    )


def should_archive_topic(topic_d, conf):
    """Decide whether a topic passes the board's post filters.

    The topic's ``sub`` and ``com`` entries are tested against the optional
    ``blacklistPostFilter`` and ``whitelistPostFilter`` regex patterns read
    from *conf*:

    - If the blacklist matches, the topic is not archived, even when the
      whitelist also matches - the blacklist beats the whitelist.
    - If only a blacklist is specified, skip blacklisted topics and archive
      everything else.
    - If only a whitelist is specified, archive whitelisted topics and skip
      everything else.
    - If no lists are specified, archive everything.

    Args:
        topic_d: Mapping describing the topic; only the ``sub`` and ``com``
            keys are read, and either may be absent.
        conf: Board configuration mapping; only the two filter keys are read.

    Returns:
        True if the topic should be archived, False if it should be skipped.
    """
    subject = topic_d.get('sub')
    comment = topic_d.get('com')

    # The blacklist is checked first so it always wins over the whitelist.
    blacklist_post_filter = conf.get('blacklistPostFilter')
    if blacklist_post_filter and _any_text_matches(blacklist_post_filter, subject, comment):
        return False

    # With a whitelist configured, only explicit matches are archived.
    whitelist_post_filter = conf.get('whitelistPostFilter')
    if whitelist_post_filter:
        return _any_text_matches(whitelist_post_filter, subject, comment)

    return True


class Indexer(Thread):
def __init__(self, board, **args):
super().__init__(board, **args)
Expand Down Expand Up @@ -96,6 +133,9 @@ def run(self):
):
raise Exception()

if not should_archive_topic(topic_d, self.board.conf):
continue

topic = None

# find the corresponding topic object
Expand Down