Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Likee] Add new extractor #24551

Closed
wants to merge 12 commits into from
137 changes: 137 additions & 0 deletions youtube_dl/extractor/axios.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
try_get,
mimetype2ext,
clean_html
)


class AxiosIE(InfoExtractor):
    """Extractor for videos embedded in www.axios.com articles.

    The article page embeds an ``<amp-jwplayer>`` element.  Its media id is
    looked up through the JW Platform v2 media API (``api_jwplayer``), whose
    first playlist item supplies the thumbnails and the sources that are
    turned into formats.
    """
    IE_NAME = 'axios'
    IE_DESC = 'www.axios.com'
    # The id is the UUID that terminates the article slug.  It is matched as
    # a full hexadecimal UUID so that ordinary slug words (e.g. an 8-letter
    # word followed by two 4-letter words) cannot be mistaken for it.
    _VALID_URL = r'''(?x)
        https?://
        (?P<site>(?:www\.)?axios\.com)/
        (?P<slug>[^/?#]+?)-
        (?P<id>[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})
        \.html
    '''
    # Single leading underscore on purpose: a double-underscore class
    # attribute is name-mangled to ``_AxiosIE__TESTS`` and would not be
    # visible under the ``_TESTS`` name.
    # No ``thumbnails`` expectation is listed: the extractor returns whatever
    # images the API reports, so an empty list cannot be asserted.
    _TESTS = [
        {
            "url": r"https://www.axios.com/trump-coronavirus-restrictions-c3da2d28-b761-4b62-b6d6-734c059c6dba.html",
            "info_dict": {
                "id": "c3da2d28-b761-4b62-b6d6-734c059c6dba",
                "title": '''Trump says he wants to "open" the country by Easter''',
                "ext": "mp4",
                "description": str,
            }
        },
        {
            "url": r"https://www.axios.com/coronavirus-texas-official-grandparents-die-172ca951-891c-44e7-a9ec-77c486e0c5c3.html",
            "info_dict": {
                "id": "172ca951-891c-44e7-a9ec-77c486e0c5c3",
                "title": '''Texas Lt. Gov.: Grandparents would be willing to die to save the economy''',
                "ext": "mp4",
                "description": str,
            }
        },
        {
            "url": r"https://www.axios.com/cuomo-trump-mandatory-quarantine-panic-35ae54a1-0aa9-4a38-910d-647293002fc2.html",
            "info_dict": {
                "id": "35ae54a1-0aa9-4a38-910d-647293002fc2",
                "title": '''Cuomo: Trump's mandatory quarantine comments "really panicked people"''',
                "ext": "mp4",
                "description": str,
            }
        },
        {
            "url": r"https://www.axios.com/coronavirus-louisiana-bel-edwards-ventilators-7810fc76-1825-41b2-8b22-f1cfc14e2ffe.html",
            "info_dict": {
                "id": "7810fc76-1825-41b2-8b22-f1cfc14e2ffe",
                "title": '''Louisiana on track to exceed ventilator capacity this week, governor says''',
                "ext": "mp4",
                "description": str,
            }
        },
    ]
    # JW Platform media API endpoint; ``%s`` is the JW Player media id.
    api_jwplayer = r'https://content.jwplatform.com/v2/media/%s'

    def _real_extract(self, url):
        """Return the info dict for the video embedded in an Axios article.

        Raises an extractor error when the page has no JW Player media id,
        when no title can be found, or when the API yields no usable format.
        The description is optional and may be ``None``.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # _search_regex fails with a readable extractor error when the
        # player element is missing, instead of an AttributeError on None.
        media_id = self._search_regex(
            r'<amp-jwplayer[^>]+\bdata-media-id="(?P<media_id>[^"]+)"',
            webpage, 'media id', group='media_id')

        # The class names below are build-generated, so both lookups fall
        # back to the page's Open Graph metadata when they stop matching.
        title = self._search_regex(
            r'''<h1\s+class="sc-31t5q3-0 sc-1fjk95c-2 guveJc"\>(?P<title>.*?)\<\/h1\>''',
            webpage, 'Title', group='title',
            default=None) or self._og_search_title(webpage)
        description = clean_html(self._search_regex(
            r'''<div\s+class=\"b0w77w-0 jctzOA gtm-story-text\"\>(?P<description>.*?)\<\/div\>''',
            webpage, 'Description', group='description',
            default=None)) or self._og_search_description(webpage, default=None)

        json_jwplayer = self._download_json(
            self.api_jwplayer % media_id, media_id)
        # An absent or malformed playlist degrades to "no formats" below
        # rather than leaving thumbnails/formats unbound.
        playlist = try_get(
            json_jwplayer, lambda x: x['playlist'][0], dict) or {}

        thumbnails = [
            {
                'url': image['src'],
                'width': image.get('width'),
            }
            for image in playlist.get('images') or []
            if isinstance(image, dict) and image.get('src')
        ]

        formats = []
        for source in playlist.get('sources') or []:
            if not isinstance(source, dict):
                continue
            file_url = source.get('file')
            if not file_url:
                continue
            ext = mimetype2ext(source.get('type'))
            if ext == 'm3u8':
                # Non-fatal: a broken HLS manifest must not hide the
                # progressive sources listed next to it.
                formats.extend(self._extract_m3u8_formats(
                    file_url, media_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id='hls', fatal=False))
                continue
            formats.append({
                'url': file_url,
                'ext': ext,
                # None for sources without dimensions (e.g. audio only).
                'height': source.get('height'),
                'width': source.get('width'),
                'protocol': 'http',
                'label': source.get('label'),
            })
        # Orders formats worst-to-best and reports an empty list as an error.
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title.strip(),
            'thumbnails': thumbnails,
            'formats': formats,
            'description': description,
        }
16 changes: 16 additions & 0 deletions youtube_dl/extractor/extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -1503,3 +1503,19 @@
from .zdf import ZDFIE, ZDFChannelIE
from .zingmp3 import ZingMp3IE
from .zype import ZypeIE

from .axios import (
AxiosIE
)

from .likee import (
LikeeIE,
LikeeUserIE
)

from .zingmp3_vn import (
Zingmp3_vnIE,
Zingmp3_vnPlaylistIE,
Zingmp3_vnChartIE,
Zingmp3_vnUserIE,
)
203 changes: 203 additions & 0 deletions youtube_dl/extractor/likee.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
# coding: utf-8
# Code by hatienl0i261299 - fb.com/100011734236090 - hatienloi261299@gmail.com
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..compat import (
    compat_urllib_parse_urlencode
)
from ..utils import (
    ExtractorError,
    int_or_none,
    js_to_json,
    try_get,
)


class LikeeIE(InfoExtractor):
    """Extractor for a single likee.com video page (``/@user/video/<id>``).

    All metadata comes from the ``window.data`` JavaScript object embedded
    in the video page.
    """
    # No end anchor, so a trailing slash or query string is tolerated; the
    # user name is limited to a single path segment.
    _VALID_URL = r'''(?x)
        https?://
        (?:www\.|m\.)?
        (?P<site>likee\.com)/
        (?P<user>@[^/?#]+)/
        video/
        (?P<id>[0-9]+)
    '''
    IE_NAME = 'likee'
    IE_DESC = 'likee.com'
    _TESTS = [
        {
            "url": "https://likee.com/@Inayat95/video/6808497581927578387",
            "info_dict": {
                "id": "6808497581927578387",
                "ext": "mp4",
                "title": "@Inayat95_6808497581927578387",
                "description": str,
                "thumbnail": r"re:^https?:.+?.jpg",
                "uploader": str,
                "uploader_id": int,
                "like_count": int,
                "comment_count": int,
                "share_count": int,
                "view_count": int,
                "download_count": int
            }
        },
        {
            "url": "https://likee.com/@Inayat95/video/6792552721999608595",
            "info_dict": {
                "id": "6792552721999608595",
                "ext": "mp4",
                "title": "@Inayat95_6792552721999608595",
                "description": str,
                "thumbnail": r"re:^https?:.+?.jpg",
                "uploader": str,
                "uploader_id": int,
                "like_count": int,
                "comment_count": int,
                "share_count": int,
                "view_count": int,
                "download_count": int
            }
        },
        {
            "url": "https://likee.com/@435421183/video/6802046076516688592",
            "info_dict": {
                "id": "6802046076516688592",
                "ext": "mp4",
                "title": "@435421183_6802046076516688592",
                "description": str,
                "thumbnail": r"re:^https?:.+?.jpg",
                "uploader": str,
                "uploader_id": int,
                "like_count": int,
                "comment_count": int,
                "share_count": int,
                "view_count": int,
                "download_count": int
            }
        },
    ]

    def _real_extract(self, url):
        """Return the info dict for one Likee video.

        Raises ExtractorError when the embedded page data does not belong to
        the requested video or carries no video URL, instead of silently
        returning nothing.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        user = mobj.group('user')
        webpage = self._download_webpage(url, video_id)

        info_video = self._regex_data(webpage, video_id)

        # Compare as text so a numeric post_id in the page data still
        # matches the id taken from the URL.
        if '%s' % info_video.get('post_id') != video_id:
            raise ExtractorError(
                'Page data does not describe video %s' % video_id)

        video_url = (
            info_video.get('video_url')
            or info_video.get('video_water_url'))
        if not video_url:
            raise ExtractorError(
                'Unable to find a video URL for %s' % video_id)

        def get_count(name):
            # Missing or non-numeric counters are reported as 0.
            return int_or_none(info_video.get(name), default=0)

        return {
            'id': video_id,
            'title': '%s_%s' % (user, video_id),
            'description': info_video.get('msg_text') or '',
            'thumbnail': (
                info_video.get('image1')
                or info_video.get('image2')
                or info_video.get('image3')),
            'like_count': get_count('like_count'),
            'view_count': get_count('play_count'),
            'share_count': get_count('share_count'),
            'download_count': get_count('download_count'),
            'comment_count': get_count('comment_count'),
            'uploader': info_video.get('nick_name'),
            'uploader_id': int_or_none(info_video.get('poster_uid')),
            'formats': [{
                'url': video_url,
                'ext': 'mp4',
                'height': int_or_none(info_video.get('video_height')),
                'width': int_or_none(info_video.get('video_width')),
                'protocol': 'http',
            }],
        }

    def _regex_data(self, webpage, video_id):
        """Parse the ``window.data = {...};`` object embedded in *webpage*.

        *video_id* is only used to label parse errors.  The object literal
        is converted with ``js_to_json`` before being decoded.
        """
        return self._parse_json(
            self._search_regex(
                r'<script>window\.data\s+=\s+(\{.+?\})\;',
                webpage, 'info video'),
            video_id, transform_source=js_to_json)


class LikeeUserIE(LikeeIE):
    """Playlist extractor for all videos of a likee.com user page.

    The numeric uid is read from the ``window.data`` object of the user page
    and the video list is then paged through the ``getUserVideo`` endpoint.
    """
    # The user name is one whole path segment, so names containing
    # characters such as '.' or '-' are not cut short.
    _VALID_URL = r'''(?x)
        https?://
        (?:www\.|m\.)?
        (?P<site>likee\.com)/
        user/
        (?P<user>@[^/?#]+)
    '''
    IE_NAME = 'likee:user'
    _TESTS = [
        {
            "url": "https://likee.com/user/@Inayat95",
            "info_dict": {
                "id": "1357265683",
                "title": "@Inayat95",
            },
            "playlist_mincount": 10
        },
        {
            "url": "https://likee.com/user/@435421183/",
            "info_dict": {
                "id": "681435856",
                "title": "@435421183",
            },
            "playlist_mincount": 5
        },
        {
            "url": "https://likee.com/user/@52710468/",
            "info_dict": {
                "id": "1300330468",
                "title": "@52710468",
            },
            "playlist_mincount": 10
        }
    ]

    def _real_extract(self, url):
        """Return a playlist result covering every video of the user.

        Raises ExtractorError when the page data carries no uid, since the
        video list cannot be requested without it.
        """
        user = re.match(self._VALID_URL, url).group('user')
        webpage = self._download_webpage(url, user)
        info_playlist = self._regex_data(webpage, user)
        uid = try_get(info_playlist, lambda x: x['userinfo']['uid'])
        if not uid:
            raise ExtractorError('Unable to extract uid of user %s' % user)

        return self.playlist_result(
            self._entries(uid, user), playlist_id=uid, playlist_title=user)

    def _entries(self, uid, user):
        """Yield one url_result per video, following the API's paging.

        Each request asks for ``page_size`` videos after ``last_post_id``.
        Iteration stops on a non-success answer, on a short page, or when a
        page provides no post id to continue from.
        """
        page_size = 50
        last_post_id = ''
        while True:
            page = self._download_json(
                'https://likee.com/official_website/VideoApi/getUserVideo',
                last_post_id or uid,
                data=compat_urllib_parse_urlencode({
                    'uid': uid,
                    'count': page_size,
                    'lastPostId': last_post_id,
                }).encode('utf-8'))
            if page.get('msg') != 'success':
                break

            # A missing or malformed list is treated as an empty last page.
            videos = try_get(
                page, lambda x: x['data']['videoList'], list) or []
            page_last_id = None
            for video in videos:
                post_id = video.get('postId') if isinstance(video, dict) else None
                if not post_id:
                    # Without an id no valid video URL can be built.
                    continue
                page_last_id = post_id
                yield self.url_result(
                    url='https://likee.com/%s/video/%s' % (user, post_id),
                    ie=LikeeIE.ie_key(),
                    video_id=post_id)

            # Without a new cursor the next request would repeat this page,
            # so stop rather than loop forever.
            if len(videos) != page_size or not page_last_id:
                break
            last_post_id = page_last_id
1 change: 0 additions & 1 deletion youtube_dl/extractor/lynda.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,6 @@ def _real_extract(self, url):
self._sort_formats(formats)

subtitles = self.extract_subtitles(video_id)

return {
'id': video_id,
'title': title,
Expand Down
Loading