get_transcript_urls.py
import json

import requests
from bs4 import BeautifulSoup
from pytube import Channel
from tqdm import tqdm

def get_wiki_urls():
    """
    Collect the URL of every post on game-scripts-wiki.blogspot.com by walking
    the monthly archive pages and save them to data/url_list.txt.
    :return: None
    """
    url_collection = []
    page = "https://game-scripts-wiki.blogspot.com"
    first_post = "https://game-scripts-wiki.blogspot.com/2018/06/"
    last_post = "https://game-scripts-wiki.blogspot.com/2023/05/"
    # set start and stop from the year/month parts of the archive URLs
    start = first_post.split("/")
    current_month = int(start[4])
    current_year = int(start[3])
    end = last_post.split("/")
    max_month = int(end[4])
    max_year = int(end[3])
    # visit every monthly archive page between start and stop
    while current_month <= max_month or current_year < max_year:
        # build the URL of the current archive page, e.g. .../2018/06/
        month = f"{current_month:02d}"
        year = str(current_year)
        combined = "/".join([year, month])
        url = "/".join([page, combined, ""])
        content = requests.get(url)
        # parse the HTML and pick up the JSON-LD block of every post
        soup = BeautifulSoup(content.text, "html.parser")
        posts = soup.select('script[type="application/ld+json"]')
        # extract the canonical post URL from each JSON-LD block
        if posts:
            for post in posts:
                data = json.loads(post.string)
                target_url = data["mainEntityOfPage"]["@id"]
                url_collection.append(target_url)
        # advance to the next month, rolling over into the next year
        if current_month < 12:
            current_month += 1
        else:
            current_month = 1
            current_year += 1
    # save the gathered URLs (the data/ directory is expected to exist)
    with open("data/url_list.txt", "w") as f:
        for url in url_collection:
            f.write(f"{url}\n")
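
# For reference, each JSON-LD block parsed in get_wiki_urls() is assumed to
# look roughly like the hand-written, abbreviated example below; the values
# here are hypothetical, and only the "mainEntityOfPage" -> "@id" field is
# actually read:
#
#     {
#         "@context": "http://schema.org",
#         "@type": "BlogPosting",
#         "mainEntityOfPage": {
#             "@type": "WebPage",
#             "@id": "https://game-scripts-wiki.blogspot.com/2018/06/example-post.html"
#         }
#     }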

def get_youtube_urls(channels: list, verbose=False):
    """
    This function saves all matching video URLs from the input channel list to a file.
    :param channels: a list of YouTube channel URLs
    :param verbose: currently unused
    :return: None
    """
    urls = []
    for chan in channels:
        c = Channel(chan)
        name = chan.split("/")[-1]
        num_videos = len(c.video_urls)
        # keep only videos whose title contains every filter keyword
        for vid in tqdm(c.videos, total=num_videos, desc=name):
            if all(keyword in vid.title.lower() for keyword in ["no commentary", "full game", "gameplay"]):
                urls.append(vid.watch_url)
    # save the matching URLs (the temp/ directory is expected to exist)
    with open("temp/urls.txt", "w") as f:
        for url in urls:
            f.write(f"{url}\n")
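
# Illustration of the title filter in get_youtube_urls() on a hypothetical
# title: all three keywords must appear for a video to be kept.
#
#     title = "The Last of Us Full Game Walkthrough Gameplay (No Commentary)"
#     all(k in title.lower() for k in ["no commentary", "full game", "gameplay"])  # -> True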

if __name__ == "__main__":
    channel_list = [
        "https://www.youtube.com/@glp",
        "https://www.youtube.com/@MKIceAndFire",
        "https://www.youtube.com/@FullPlaythroughs",
        "https://www.youtube.com/@Shirrako",
    ]
    get_wiki_urls()
    get_youtube_urls(channel_list)