Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[shoutfactorytv] Add new extractor #11861

Closed
wants to merge 12 commits into from
88 changes: 25 additions & 63 deletions youtube_dl/extractor/bandcamp.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
from __future__ import unicode_literals

import json
import random
import re
import time

from .common import InfoExtractor
from ..compat import (
Expand All @@ -14,9 +12,6 @@
ExtractorError,
float_or_none,
int_or_none,
parse_filesize,
unescapeHTML,
update_url_query,
)


Expand Down Expand Up @@ -86,68 +81,35 @@ def _real_extract(self, url):
r'(?ms)var TralbumData = .*?[{,]\s*id: (?P<id>\d+),?$',
webpage, 'video id')

download_webpage = self._download_webpage(
download_link, video_id, 'Downloading free downloads page')

blob = self._parse_json(
self._search_regex(
r'data-blob=(["\'])(?P<blob>{.+?})\1', download_webpage,
'blob', group='blob'),
video_id, transform_source=unescapeHTML)

info = blob['digital_items'][0]

downloads = info['downloads']
track = info['title']

artist = info.get('artist')
title = '%s - %s' % (artist, track) if artist else track

download_formats = {}
for f in blob['download_formats']:
name, ext = f.get('name'), f.get('file_extension')
if all(isinstance(x, compat_str) for x in (name, ext)):
download_formats[name] = ext.strip('.')

formats = []
for format_id, f in downloads.items():
format_url = f.get('url')
if not format_url:
continue
# Stat URL generation algorithm is reverse engineered from
# download_*_bundle_*.js
stat_url = update_url_query(
format_url.replace('/download/', '/statdownload/'), {
'.rand': int(time.time() * 1000 * random.random()),
})
format_id = f.get('encoding_name') or format_id
stat = self._download_json(
stat_url, video_id, 'Downloading %s JSON' % format_id,
transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1],
fatal=False)
if not stat:
continue
retry_url = stat.get('retry_url')
if not isinstance(retry_url, compat_str):
continue
formats.append({
'url': self._proto_relative_url(retry_url, 'http:'),
'ext': download_formats.get(format_id),
'format_id': format_id,
'format_note': f.get('description'),
'filesize': parse_filesize(f.get('size_mb')),
'vcodec': 'none',
})
self._sort_formats(formats)
download_webpage = self._download_webpage(download_link, video_id, 'Downloading free downloads page')
# We get the dictionary of the track from some javascript code
all_info = self._parse_json(self._search_regex(
r'(?sm)items: (.*?),$', download_webpage, 'items'), video_id)
info = all_info[0]
# We pick mp3-320 for now, until format selection can be easily implemented.
mp3_info = info['downloads']['mp3-320']
# If we try to use this url it says the link has expired
initial_url = mp3_info['url']
m_url = re.match(
r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$',
initial_url)
# We build the url we will use to get the final track url
# This URL is built by Bandcamp in the script download_bundle_*.js
request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), video_id, m_url.group('ts'))
final_url_webpage = self._download_webpage(request_url, video_id, 'Requesting download url')
# If we could correctly generate the .rand field the url would be
# in the "download_url" key
final_url = self._proto_relative_url(self._search_regex(
r'"retry_url":"(.+?)"', final_url_webpage, 'final video URL'), 'http:')

return {
'id': video_id,
'title': title,
'title': info['title'],
'ext': 'mp3',
'vcodec': 'none',
'url': final_url,
'thumbnail': info.get('thumb_url'),
'uploader': info.get('artist'),
'artist': artist,
'track': track,
'formats': formats,
}


Expand Down Expand Up @@ -234,4 +196,4 @@ def _real_extract(self, url):
'id': playlist_id,
'title': title,
'entries': entries,
}
}
1 change: 1 addition & 0 deletions youtube_dl/extractor/extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -834,6 +834,7 @@
SharedIE,
VivoIE,
)
from .shoutfactorytv import ShoutFactoryTVIE
from .showroomlive import ShowRoomLiveIE
from .sina import SinaIE
from .sixplay import SixPlayIE
Expand Down
54 changes: 54 additions & 0 deletions youtube_dl/extractor/shoutfactorytv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
ExtractorError,
parse_m3u8_attributes,
)


class ShoutFactoryTVIE(InfoExtractor):
    """Extractor for single videos on shoutfactorytv.com.

    The watch page embeds a player script served from player.zype.com; that
    script assigns the HLS (m3u8) manifest URL to ``source0.src``.  The
    formats are built from that manifest.
    """

    # The id character class must include "e" and "f": the id in _TEST
    # contains "e", and a class of [0-9a-d] would cut the id off at that
    # character.  The trailing lookahead makes sure a path segment that merely
    # starts with a hex letter (e.g. "/a-film/...") is not taken as the id.
    _VALID_URL = r'https?://(?:www\.)?shoutfactorytv\.com/.*?/(?P<id>[0-9a-f]+)(?=[/?#&]|$)'
    _TEST = {
        'url': 'http://www.shoutfactorytv.com/mst3k-shorts/mst3k-short-x-marks-the-spot/57473979e0a6b40d7300809a',
        'md5': 'a04c5394947cead82be3808ec6285f71',
        'info_dict': {
            'id': '57473979e0a6b40d7300809a',
            'ext': 'mp4',
            'title': 'MST3K Short: X Marks The Spot',
            'series': 'MST3K Shorts',
            'description': 'Poor Joe gets grilled in a heavenly court in this WWII era film promoting road safety in New Jersey.',
        }
    }

    def _real_extract(self, url):
        """Return the info dict (id, title, series, description, formats).

        Raises ExtractorError when the player script or the manifest URL
        cannot be located.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # The heading has the form "<h2><span>TITLE</span> SERIES</h2>".
        title = self._html_search_regex(
            r'<h2><span>(.+)</span>.+</h2>', webpage, 'title')
        series = self._html_search_regex(
            r'<h2><span>.+</span> (.+)</h2>', webpage, 'series', default=None)

        # _TEST expects a description, so it has to be part of the result.
        # NOTE(review): assumes the page exposes it through og:description or
        # the plain description meta tag -- confirm against the live page.
        # Both lookups are optional so a missing tag never aborts extraction.
        description = (
            self._og_search_description(webpage, default=None) or
            self._html_search_meta('description', webpage, default=None))

        # Dots are escaped so only the real player host matches, and the
        # negative lookahead stops the URL at the first closing quote instead
        # of letting a greedy \S+ run on to a later quote on the same line.
        player_embed = re.search(
            r'<script src=(["\'])(?P<javascript>https://player\.zype\.com(?:(?!\1)\S)+)\1',
            webpage)
        if not player_embed:
            raise ExtractorError('Could not extract player\'s JavaScript.')
        player_js = self._download_webpage(
            player_embed.group('javascript'), video_id,
            'Downloading JavaScript page')

        # The manifest URL is a JavaScript string literal, not HTML, so it is
        # extracted with _search_regex (no HTML tag/entity clean-up applied).
        m3u8_url = self._search_regex(
            r'source0\.src\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1',
            player_js, 'm3u8', group='url')
        formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4')
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'formats': formats,
            'series': series,
        }