From 978b1f0c713dce88f7c7cca5717841536052e2e5 Mon Sep 17 00:00:00 2001
From: ReenigneArcher <42013603+ReenigneArcher@users.noreply.github.com>
Date: Fri, 26 Jul 2024 10:21:51 -0400
Subject: [PATCH] fix(yt-dl): validate audio url and add retry logic

---
 Contents/Code/default_prefs.py       |   3 +-
 Contents/Code/plex_api_helper.py     |   2 +-
 Contents/Code/youtube_dl_helper.py   | 128 +++++++++++++-----------
 Contents/DefaultPrefs.json           |   7 ++
 Contents/Strings/en.json             |   3 +-
 docs/source/about/usage.rst          |  18 +++-
 tests/conftest.py                    |   2 +-
 tests/unit/test_migration_helper.py  |   4 +-
 tests/unit/test_youtube_dl_helper.py | 141 ++++++++++++++++++++++++---
 9 files changed, 228 insertions(+), 80 deletions(-)

diff --git a/Contents/Code/default_prefs.py b/Contents/Code/default_prefs.py
index abc561ac..02e1dd30 100644
--- a/Contents/Code/default_prefs.py
+++ b/Contents/Code/default_prefs.py
@@ -17,8 +17,9 @@
     int_update_themes_interval='60',
     int_update_database_cache_interval='60',
     int_plexapi_plexapi_timeout='180',
-    int_plexapi_upload_retries_max='3',
+    int_plexapi_upload_retries_max='6',
     int_plexapi_upload_threads='3',
+    int_youtube_retries_max='8',
     str_youtube_cookies='',
     enum_webapp_locale='en',
     str_webapp_http_host='0.0.0.0',
diff --git a/Contents/Code/plex_api_helper.py b/Contents/Code/plex_api_helper.py
index 640a6399..106cfbb5 100644
--- a/Contents/Code/plex_api_helper.py
+++ b/Contents/Code/plex_api_helper.py
@@ -406,7 +406,7 @@ def upload_media(item, method, filepath=None, url=None):
     ...
     """
     count = 0
-    while count <= int(Prefs['int_plexapi_upload_retries_max']):
+    while count <= max(0, int(Prefs['int_plexapi_upload_retries_max'])):
         try:
             if filepath:
                 if method == item.uploadTheme:
diff --git a/Contents/Code/youtube_dl_helper.py b/Contents/Code/youtube_dl_helper.py
index 34cc9cf0..ffca65d0 100644
--- a/Contents/Code/youtube_dl_helper.py
+++ b/Contents/Code/youtube_dl_helper.py
@@ -5,6 +5,7 @@
 import json
 import os
 import tempfile
+import time
 
 # plex debugging
 try:
@@ -16,6 +17,7 @@
     from plexhints.prefs_kit import Prefs  # prefs kit
 
 # imports from Libraries\Shared
+import requests
 from typing import Optional
 import youtube_dl
 
@@ -26,7 +28,7 @@
 plugin_logger = logging.getLogger(plugin_identifier)
 
 
-def nsbool(value):
+def ns_bool(value):
     # type: (bool) -> str
     """
     Format a boolean value for a Netscape cookie jar file.
@@ -69,7 +71,7 @@ def process_youtube(url):
     cookie_jar_file.write('# Netscape HTTP Cookie File\n')
 
     youtube_dl_params = dict(
-        cookiefile=cookie_jar_file.name,
+        cookiefile=cookie_jar_file.name if Prefs['str_youtube_cookies'] else None,
         logger=plugin_logger,
         socket_timeout=10,
         youtube_include_dash_manifest=False,
@@ -83,9 +85,9 @@ def process_youtube(url):
                 expiry = int(cookie.get('expiry', 0))
                 values = [
                     cookie['domain'],
-                    nsbool(include_subdomain),
+                    ns_bool(include_subdomain),
                     cookie['path'],
-                    nsbool(cookie['secure']),
+                    ns_bool(cookie['secure']),
                     str(expiry),
                     cookie['name'],
                     cookie['value']
@@ -100,65 +102,73 @@ def process_youtube(url):
     try:
         ydl = youtube_dl.YoutubeDL(params=youtube_dl_params)
 
-        with ydl:
-            try:
-                result = ydl.extract_info(
-                    url=url,
-                    download=False  # We just want to extract the info
-                )
-            except Exception as exc:
-                if isinstance(exc, youtube_dl.utils.ExtractorError) and exc.expected:
-                    Log.Info('YDL returned YT error while downloading {}: {}'.format(url, exc))
-                else:
-                    Log.Exception('YDL returned an unexpected error while downloading {}: {}'.format(url, exc))
-                return None
-
-            if 'entries' in result:
-                # Can be a playlist or a list of videos
-                video_data = result['entries'][0]
-            else:
-                # Just a video
-                video_data = result
-
-        selected = {
-            'opus': {
-                'size': 0,
-                'audio_url': None
-            },
-            'mp4a': {
-                'size': 0,
-                'audio_url': None
-            },
-        }
-        if video_data:
-            for fmt in video_data['formats']:  # loop through formats, select largest audio size for better quality
-                if 'audio only' in fmt['format']:
-                    if 'opus' == fmt['acodec']:
-                        temp_codec = 'opus'
-                    elif 'mp4a' == fmt['acodec'].split('.')[0]:
-                        temp_codec = 'mp4a'
-                    else:
-                        Log.Debug('Unknown codec: %s' % fmt['acodec'])
-                        continue  # unknown codec
-                    filesize = int(fmt['filesize'])
-                    if filesize > selected[temp_codec]['size']:
-                        selected[temp_codec]['size'] = filesize
-                        selected[temp_codec]['audio_url'] = fmt['url']
-
         audio_url = None
 
-        if 0 < selected['opus']['size'] > selected['mp4a']['size']:
-            audio_url = selected['opus']['audio_url']
-        elif 0 < selected['mp4a']['size'] > selected['opus']['size']:
-            audio_url = selected['mp4a']['audio_url']
-
-        if audio_url and Prefs['bool_prefer_mp4a_codec']:  # mp4a codec is preferred
-            if selected['mp4a']['audio_url']:  # mp4a codec is available
-                audio_url = selected['mp4a']['audio_url']
-            elif selected['opus']['audio_url']:  # fallback to opus :(
+        count = 0
+        while count <= max(0, int(Prefs['int_youtube_retries_max'])):
+            sleep_time = 2 ** count
+            time.sleep(sleep_time)
+            with ydl:
+                try:
+                    result = ydl.extract_info(
+                        url=url,
+                        download=False  # We just want to extract the info
+                    )
+                except Exception as exc:
+                    if isinstance(exc, youtube_dl.utils.ExtractorError) and exc.expected:
+                        Log.Info('YDL returned YT error while downloading {}: {}'.format(url, exc))
+                    else:
+                        Log.Exception('YDL returned an unexpected error while downloading {}: {}'.format(url, exc))
+                    count += 1
+                    continue
+
+                # If a playlist was provided, select the first video
+                video_data = result['entries'][0] if 'entries' in result else result
+
+            selected = {
+                'opus': {
+                    'size': 0,
+                    'audio_url': None
+                },
+                'mp4a': {
+                    'size': 0,
+                    'audio_url': None
+                },
+            }
+            if video_data:
+                for fmt in video_data['formats']:  # loop through formats, select largest audio size for better quality
+                    if 'audio only' in fmt['format']:
+                        if 'opus' == fmt['acodec']:
+                            temp_codec = 'opus'
+                        elif 'mp4a' == fmt['acodec'].split('.')[0]:
+                            temp_codec = 'mp4a'
+                        else:
+                            Log.Debug('Unknown codec: %s' % fmt['acodec'])
+                            continue  # unknown codec
+                        filesize = int(fmt['filesize'])
+                        if filesize > selected[temp_codec]['size']:
+                            selected[temp_codec]['size'] = filesize
+                            selected[temp_codec]['audio_url'] = fmt['url']
+
+            if 0 < selected['opus']['size'] > selected['mp4a']['size']:
                 audio_url = selected['opus']['audio_url']
+            elif 0 < selected['mp4a']['size'] > selected['opus']['size']:
+                audio_url = selected['mp4a']['audio_url']
+
+            if audio_url and Prefs['bool_prefer_mp4a_codec']:  # mp4a codec is preferred
+                if selected['mp4a']['audio_url']:  # mp4a codec is available
+                    audio_url = selected['mp4a']['audio_url']
+                elif selected['opus']['audio_url']:  # fallback to opus :(
+                    audio_url = selected['opus']['audio_url']
+
+            if audio_url:
+                validate = requests.get(url=audio_url, stream=True)
+                if validate.status_code != 200:
+                    Log.Warn('Failed to validate audio URL for video {}'.format(url))
+                    count += 1
+                    continue
 
-        return audio_url  # return None or url found
+            return audio_url  # return None or url found
     finally:
         try:
             os.remove(cookie_jar_file.name)
diff --git a/Contents/DefaultPrefs.json b/Contents/DefaultPrefs.json
index 91203691..743c188b 100644
--- a/Contents/DefaultPrefs.json
+++ b/Contents/DefaultPrefs.json
@@ -125,6 +125,13 @@
 		"default": "3",
 		"secure": "false"
 	},
+	{
+		"id": "int_youtube_retries_max",
+		"type": "text",
+		"label": "int_youtube_retries_max",
+		"default": "8",
+		"secure": "false"
+	},
 	{
 		"id": "str_youtube_cookies",
 		"type": "text",
diff --git a/Contents/Strings/en.json b/Contents/Strings/en.json
index 748a0d62..f68f58d2 100644
--- a/Contents/Strings/en.json
+++ b/Contents/Strings/en.json
@@ -15,8 +15,9 @@
   "int_update_themes_interval": "Interval for automatic update task, in minutes (min: 15)",
   "int_update_database_cache_interval": "Interval for database cache update task, in minutes (min: 15)",
   "int_plexapi_plexapi_timeout": "PlexAPI Timeout, in seconds (min: 1)",
-  "int_plexapi_upload_retries_max": "Max Retries, integer (min: 0)",
+  "int_plexapi_upload_retries_max": "Max Retries (PlexAPI uploads), integer (min: 0)",
   "int_plexapi_upload_threads": "Multiprocessing Threads, integer (min: 1)",
+  "int_youtube_retries_max": "Max Retries (YouTube), integer (min: 0)",
   "str_youtube_cookies": "YouTube Cookies (JSON format)",
   "enum_webapp_locale": "Web UI Locale",
   "str_webapp_http_host": "Web UI Host Address (requires Plex Media Server restart)",
diff --git a/docs/source/about/usage.rst b/docs/source/about/usage.rst
index 54e756ce..848ee949 100644
--- a/docs/source/about/usage.rst
+++ b/docs/source/about/usage.rst
@@ -228,8 +228,8 @@ Default
 Minimum
    ``1``
 
-Max Retries
-^^^^^^^^^^^
+Max Retries (PlexAPI uploads)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Description
    The number of times to retry uploading theme audio to the Plex server. The time between retries will increase
@@ -242,6 +242,20 @@ Default
 Minimum
    ``0``
 
+Max Retries (YouTube)
+^^^^^^^^^^^^^^^^^^^^^
+
+Description
+   The number of times to retry getting an audio url from YouTube. The time between retries will increase
+   exponentially. The time between is calculated as ``2 ^ retry_number``. For example, the first retry will occur
+   after 2 seconds, the second retry will occur after 4 seconds, and the third retry will occur after 8 seconds.
+
+Default
+   ``8``
+
+Minimum
+   ``0``
+
 Multiprocessing Threads
 ^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/tests/conftest.py b/tests/conftest.py
index 6bd28a15..d5c256ec 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -59,7 +59,7 @@ def wait_for_themes(section):
     timer = 0
     with_themes = 0
     total = len(section.all())
-    while timer < 180 and with_themes < total:
+    while timer < 600 and with_themes < total:
         with_themes = 0
         try:
             for item in section.all():
diff --git a/tests/unit/test_migration_helper.py b/tests/unit/test_migration_helper.py
index 70cbaef1..b3e2fef4 100644
--- a/tests/unit/test_migration_helper.py
+++ b/tests/unit/test_migration_helper.py
@@ -54,7 +54,7 @@ def test_validate_migration_key(migration_helper_fixture, key, raise_exception,
 @pytest.mark.parametrize('key, expected', [
     (migration_helper_object.LOCKED_THEMES, None),
     (migration_helper_object.LOCKED_COLLECTION_FIELDS, None),
-    pytest.param('invalid', None, marks=pytest.mark.xfail(raises=AttributeError)),
+    pytest.param('invalid', None, marks=pytest.mark.xfail(raises=AttributeError, reason="Cannot migrate in CI")),
 ])
 def test_get_migration_status(migration_helper_fixture, migration_status_file, key, expected):
     migration_status = migration_helper_fixture.get_migration_status(key=key)
@@ -64,7 +64,7 @@ def test_get_migration_status(migration_helper_fixture, migration_status_file, k
 @pytest.mark.parametrize('key', [
     migration_helper_object.LOCKED_THEMES,
     migration_helper_object.LOCKED_COLLECTION_FIELDS,
-    pytest.param('invalid', marks=pytest.mark.xfail(raises=AttributeError)),
+    pytest.param('invalid', marks=pytest.mark.xfail(raises=AttributeError, reason="Cannot migrate in CI")),
 ])
 def test_set_migration_status(migration_helper_fixture, migration_status_file, key):
     # perform the test twice, to load an existing migration file
diff --git a/tests/unit/test_youtube_dl_helper.py b/tests/unit/test_youtube_dl_helper.py
index 4a522d92..d9d71686 100644
--- a/tests/unit/test_youtube_dl_helper.py
+++ b/tests/unit/test_youtube_dl_helper.py
@@ -1,28 +1,143 @@
 # -*- coding: utf-8 -*-
 
+# standard imports
+import binascii
+import os
+import tempfile
+from urlparse import urlparse
+
 # lib imports
 import pytest
+import requests
 
 # local imports
 from Code import youtube_dl_helper
 
+gh_url_prefix = 'https://raw.githubusercontent.com/LizardByte/ThemerrDB/gh-pages'
+
+xfail_rate_limit = "YouTube is probably rate limiting"
+
 
-@pytest.mark.parametrize('url', [
-    'https://www.youtube.com/watch?v=dQw4w9WgXcQ',
-    'https://www.youtube.com/watch?v=Wb8j8Ojd4YQ&list=PLMYr5_xSeuXAbhxYHz86hA1eCDugoxXY0&pp=iAQB',  # playlist test
+# standard items setup by plexhints action
+@pytest.fixture(scope='module', params=[
+    pytest.param({
+        'url': 'https://www.youtube.com/watch?v=dQw4w9WgXcQ',
+        'expected': True,
+    }),
+    pytest.param({  # playlist
+        'url': 'https://www.youtube.com/watch?v=Wb8j8Ojd4YQ&list=PLMYr5_xSeuXAbhxYHz86hA1eCDugoxXY0&pp=iAQB',
+        'expected': True,
+    }),
+    pytest.param({
+        'url': '{}/movies/themoviedb/9761.json'.format(gh_url_prefix),  # Elephants Dream
+        'expected': True,
+    }, marks=pytest.mark.xfail(reason=xfail_rate_limit)),
+    pytest.param({
+        'url': '{}/movies/themoviedb/20529.json'.format(gh_url_prefix),  # Sita Sings the Blues
+        'expected': True,
+    }, marks=pytest.mark.xfail(reason=xfail_rate_limit)),
+    pytest.param({
+        'url': '{}/movies/themoviedb/10378.json'.format(gh_url_prefix),  # Big Buck Bunny
+        'expected': True,
+    }, marks=pytest.mark.xfail(reason=xfail_rate_limit)),
+    pytest.param({
+        'url': '{}/movies/themoviedb/45745.json'.format(gh_url_prefix),  # Sintel
+        'expected': True,
+    }, marks=pytest.mark.xfail(reason=xfail_rate_limit)),
+    pytest.param({
+        'url': '{}/tv_shows/themoviedb/1399.json'.format(gh_url_prefix),  # Game of Thrones
+        'expected': True,
+    }, marks=pytest.mark.xfail(reason=xfail_rate_limit)),
+    pytest.param({
+        'url': '{}/tv_shows/themoviedb/48866.json'.format(gh_url_prefix),  # The 100
+        'expected': True,
+    }, marks=pytest.mark.xfail(reason=xfail_rate_limit)),
+    pytest.param({
+        'url': 'https://www.youtube.com/watch?v=notavideoid',  # invalid A
+        'expected': False,
+    }),
+    pytest.param({
+        'url': 'https://blahblahblah',  # invalid B
+        'expected': False,
+    }),
 ])
-def test_process_youtube(url):
+def youtube_url(request):
+    url = request.param['url']
+    host = urlparse(url=url).hostname
+    if host and not host.endswith(".githubusercontent.com"):
+        return request.param
+
+    # get the youtube_theme_url key from the JSON data
+    response = requests.get(url=url)
+    assert response.status_code == 200
+    data = response.json()
+    request.param['url'] = data['youtube_theme_url']
+    return request.param
+
+
+@pytest.fixture
+def temp_file():
+    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
+        yield temp_file.name
+    os.remove(temp_file.name)
+
+
+def test_process_youtube(youtube_url, temp_file):
     # test valid urls
-    audio_url = youtube_dl_helper.process_youtube(url=url)
+    audio_url = youtube_dl_helper.process_youtube(url=youtube_url['url'])
+    if not youtube_url['expected']:
+        assert audio_url is None
+        return
+
     assert audio_url is not None
     assert audio_url.startswith('https://')
 
+    # valid signatures dictionary with offsets
+    valid_signatures = {
+        "1866747970": 3,  # Mp4 container
+        "6674797069736F6D": 4,  # Mp4 container
+        "494433": 0,  # ID3
+        "FFFB": 0,  # MPEG-1 Layer 3
+        "FFF3": 0,  # MPEG-1 Layer 3
+        "FFF2": 0,  # MPEG-1 Layer 3
+    }
 
-@pytest.mark.parametrize('url', [
-    'https://www.youtube.com/watch?v=notavideoid',
-    'https://blahblahblah',
-])
-def test_process_youtube_invalid(url):
-    # test invalid urls
-    audio_url = youtube_dl_helper.process_youtube(url=url)
-    assert audio_url is None
+    chunk_size = 0
+    for sig, offset in valid_signatures.items():
+        chunk_size = max(chunk_size, len(sig) + offset)  # get the max chunk size
+
+    # validate the format of the audio file at the URL
+    # download the file
+    with requests.get(audio_url, stream=True) as r:
+        r.raise_for_status()
+        with open(temp_file, 'wb') as f:
+            for chunk in r.iter_content(chunk_size=chunk_size):
+                f.write(chunk)
+                break  # only download the first chunk for unit testing
+
+    # read the file bytes
+    with open(temp_file, 'rb') as f:
+        file_bytes = f.read()
+
+    # convert bytes to hex
+    file_bytes_hex = binascii.hexlify(file_bytes)
+    print(file_bytes_hex[:30])
+
+    # check if the file is not WebM
+    is_webm = file_bytes_hex.startswith("1A45DFA3")
+    assert not is_webm, "File {} is WebM".format(temp_file)
+
+    # check if the file is valid
+    is_valid = False
+
+    # loop through valid_signatures
+    for signature, offset in valid_signatures.items():
+        # remove the offset bytes
+        file_bytes_hex_without_offset = file_bytes_hex[offset * 2:]
+
+        # check if the beginning of the file_bytes_hex_without_offset matches the signature
+        if file_bytes_hex_without_offset.startswith(signature):
+            is_valid = True
+            break
+
+    assert is_valid, "File {} is not a valid format".format(temp_file)