Skip to content

Commit

Permalink
requirements test
Browse files Browse the repository at this point in the history
  • Loading branch information
neonwatty committed Jul 17, 2024
1 parent 0705bed commit d1602e0
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 34 deletions.
23 changes: 14 additions & 9 deletions transcript_downloader_walkthrough.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
"outputs": [],
"source": [
"import os\n",
"\n",
"# if running in Colab, pull the repo and install requirements\n",
"if os.getenv(\"COLAB_RELEASE_TAG\"):\n",
" !git clone https://github.com/jermwatt/youtube_transcript_downloader.git\n",
Expand Down Expand Up @@ -97,15 +98,16 @@
"source": [
"import pandas as pd\n",
"\n",
"\n",
"def parse_input_file(input_file_path: str) -> list:\n",
" youtube_urls = []\n",
" with open(input_file_path, 'r') as file:\n",
" with open(input_file_path, \"r\") as file:\n",
" for line in file:\n",
" youtube_urls.append(line.strip())\n",
" return youtube_urls\n",
"\n",
"def save_output(data: list,\n",
" output_file_path: str) -> None:\n",
"\n",
"def save_output(data: list, output_file_path: str) -> None:\n",
" df = pd.DataFrame(data)\n",
" df.to_csv(output_file_path, index=False)"
]
Expand Down Expand Up @@ -134,10 +136,11 @@
"source": [
"import re\n",
"\n",
"\n",
"def is_valid_youtube_shorts_url(url: str) -> bool:\n",
" if not isinstance(url, str):\n",
" return False \n",
" pattern = r'^https://www\\.youtube\\.com/shorts/[A-Za-z0-9_-]{11}$' # youtube vido ids are always 11 chars long\n",
" return False\n",
" pattern = r\"^https://www\\.youtube\\.com/shorts/[A-Za-z0-9_-]{11}$\" # youtube video ids are always 11 chars long\n",
" return re.match(pattern, url) is not None"
]
},
Expand Down Expand Up @@ -176,7 +179,8 @@
"source": [
"from typing import List, Dict\n",
"from youtube_transcript_api import YouTubeTranscriptApi\n",
" \n",
"\n",
"\n",
"def get_single_transcript(youtube_url: str) -> dict:\n",
" try:\n",
" if is_valid_youtube_shorts_url(youtube_url):\n",
Expand All @@ -194,6 +198,7 @@
" print(f\"FAILURE: transcript pull for youtube_url - {youtube_url} - failed with exception {e}\")\n",
" return {}\n",
"\n",
"\n",
"def get_batch_transcripts(youtube_urls: List[str]) -> List[Dict]:\n",
" valid_urls = []\n",
" valid_vids = []\n",
Expand Down Expand Up @@ -298,10 +303,10 @@
}
],
"source": [
"# print out first few lines of input \n",
"# print out first few lines of input\n",
"with open(\"data/input/test_input.txt\") as myfile:\n",
" first_few_lines=myfile.readlines(1024)[0:3] \n",
"print(first_few_lines)\n"
" first_few_lines = myfile.readlines(1024)[0:3]\n",
"print(first_few_lines)"
]
},
{
Expand Down
31 changes: 17 additions & 14 deletions youtube_shorts_transcript_downloader/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,33 +4,36 @@

st.set_page_config(page_title="Youtube Shorts Transcript Downloader", layout="wide")
st.title("YT Shorts Transcript Downloader")
st.markdown(
"instructions: enter in urls separated by commas or upload a text file with one url per line"
)
st.markdown("instructions: enter in urls separated by commas or upload a text file with one url per line")


base = st.container(border=True)
with base:
col1, sep_col, col2 = st.columns([5, 2, 5])

with col1:
text_urls = st.text_area("youtube shorts urls", value="", placeholder="enter urls separated by commas - for example: https://www.youtube.com/shorts/o7a9hx-Pqyo, https://www.youtube.com/shorts/xkAYLnIsfX4")

text_urls = st.text_area(
"youtube shorts urls",
value="",
placeholder="enter urls separated by commas - for example: https://www.youtube.com/shorts/o7a9hx-Pqyo, https://www.youtube.com/shorts/xkAYLnIsfX4",
)

with col2:
uploaded_file = st.file_uploader("Choose a File", type=["txt"])

col3, col4, col5 = st.columns([3, 2, 3])
with col3:
trans_button_val = st.button(label="fetch transcripts", type="primary")
with col4:
empty_container = st.container()
with col5:
placeholder = st.empty()

download_area = st.container()

# https://www.youtube.com/shorts/o7a9hx-Pqyo, https://www.youtube.com/shorts/xkAYLnIsfX4


@st.cache_data
def convert_df(df: pd.DataFrame) -> "csv":
# IMPORTANT: Cache the conversion to prevent computation on every rerun
Expand Down Expand Up @@ -62,10 +65,10 @@ def button_logic(youtube_short_urls: list) -> None:
if len(text_urls.strip()) > 0:
st.warning("you can enter urls manually or from file but not both", icon="⚠️")
st.stop()

if uploaded_file.type == "text/plain":
from io import StringIO

stringio = StringIO(uploaded_file.read().decode("utf-8"))
for line in stringio:
youtube_short_urls.append(line.strip())
Expand All @@ -78,14 +81,14 @@ def button_logic(youtube_short_urls: list) -> None:
if uploaded_file is not None:
st.warning("you can enter urls manually or from file but not both", icon="⚠️")
st.stop()

try:
text_urls_split = text_urls.split(",")
text_urls_split = [v.strip() for v in text_urls_split]
youtube_short_urls = text_urls_split
except:
st.warning("please check your manually entered urls", icon="⚠️")
except: # noqa E722
st.warning("please check your manually entered urls", icon="⚠️")
st.stop()

with st.spinner(text="transcript pull in progress..."):
button_logic(youtube_short_urls)
17 changes: 6 additions & 11 deletions youtube_shorts_transcript_downloader/transcripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@

def is_valid_youtube_shorts_url(url: str) -> bool:
    """Return True when *url* is exactly a YouTube Shorts URL.

    The accepted form is ``https://www.youtube.com/shorts/<id>`` where
    ``<id>`` is 11 characters drawn from ``A-Z``, ``a-z``, ``0-9``, ``_``
    and ``-``. Anything else, including non-string input, yields False.

    Args:
        url: Candidate URL to check.

    Returns:
        True if the whole string matches the Shorts URL form, else False.
    """
    if not isinstance(url, str):
        return False
    # YouTube video ids are always 11 chars long.
    pattern = r"https://www\.youtube\.com/shorts/[A-Za-z0-9_-]{11}"
    # fullmatch instead of match + "$": "$" also matches just before a
    # trailing newline, which would let "…<id>\n" through as valid.
    return re.fullmatch(pattern, url) is not None


Expand All @@ -24,9 +24,7 @@ def get_single_transcript(youtube_url: str) -> dict:
print(f"FAILURE: youtube_url is not valid - {youtube_url}")
return {}
except Exception as e:
print(
f"FAILURE: transcript pull for youtube_url - {youtube_url} - failed with exception {e}"
)
print(f"FAILURE: transcript pull for youtube_url - {youtube_url} - failed with exception {e}")
return {}


Expand All @@ -39,12 +37,9 @@ def get_batch_transcripts(youtube_urls: List[str]) -> List[Dict]:
valid_urls.append(url)
valid_vids.append(vid)
try:
video_transcripts = YouTubeTranscriptApi.get_transcripts(
valid_vids, languages=["en"]
)[0]
print(YouTubeTranscriptApi.get_transcripts(
valid_vids, languages=["en"]))

video_transcripts = YouTubeTranscriptApi.get_transcripts(valid_vids, languages=["en"])[0]
print(YouTubeTranscriptApi.get_transcripts(valid_vids, languages=["en"]))

entries = []
for i in range(len(valid_urls)):
entry = {}
Expand Down

0 comments on commit d1602e0

Please sign in to comment.