# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    determine_ext,
    int_or_none,
    js_to_json,
    orderedSet,
    unified_strdate,
    US_RATINGS,
)

# Registration (youtube_dl/extractor/extractors.py, next to the PBSIE import):
#     from .pbskids import PBSKIDSIE


class PBSKIDSIE(InfoExtractor):
    """Extractor for video pages on pbskids.org."""

    IE_NAME = 'pbskids'
    IE_DESC = 'Public Broadcasting Service (PBS) for Kids'

    # The named group must be called ``episode_id``: _extract_webpage() reads
    # it with mobj.group('episode_id').
    _VALID_URL = r'''(?x)https?://
        (?:
            # Article with embedded player
            pbskids\.org/video/[^/]+/(?P<episode_id>[^/?#]+)
        )
    '''

    _GEO_COUNTRIES = ['US']

    _TESTS = [
        {
            'url': 'https://pbskids.org/video/super-why/2206965769',
            'md5': '173dc391afd361fa72eab5d3d918968d',
            'info_dict': {
                'id': '2206965769',
                'ext': 'mp4',
                'title': 'Jasper\'s Cowboy Wish',
                'duration': 1510,
            },
            'params': {
                'skip_download': True,  # requires ffmpeg
            },
        },
    ]
    # Messages shown for the ``http_code`` values reported by the redirect
    # service when ``status`` is ``error``.
    _ERRORS = {
        101: 'We\'re sorry, but this video is not yet available.',
        403: 'We\'re sorry, but this video is not available in your region due to right restrictions.',
        404: 'We are experiencing technical difficulties that are preventing us from playing the video at this time. Please check back again soon.',
        410: 'This video has expired and is no longer available for online streaming.',
    }

    def _extract_webpage(self, url):
        """Download the page at ``url`` and locate the video(s) it embeds.

        Always returns a 5-tuple
        ``(video_id, display_id, upload_date, description, info)``:

        * multi-part page: ``video_id`` is a list of redirect ids,
          ``display_id`` is the episode id taken from the URL,
          ``upload_date`` is scraped from the page and ``info`` is None;
        * single video: ``video_id`` is the redirect id taken from the
          deeplink ``video_obj``, ``display_id`` is the deeplink
          ``video_id``, ``upload_date`` is None (the caller falls back to
          ``info['air_date']``) and ``info`` is the ``video_obj`` dict.

        Raises ExtractorError when the page carries no usable video object.
        """
        episode_id = re.match(self._VALID_URL, url).group('episode_id')

        webpage = self._download_webpage(url, episode_id)
        description = self._html_search_meta(
            'description', webpage, default=None)
        upload_date = unified_strdate(self._search_regex(
            r'air_date"\:"([^"]+)"',
            webpage, 'upload date', default=None))

        # m3u8 url
        MULTI_PART_REGEXES = (
            r'URI"\:"https?\:.?/.?/urs\.pbs\.org.?/redirect.?/([\d\w]+)',
        )
        for pattern in MULTI_PART_REGEXES:
            tabbed_videos = orderedSet(re.findall(pattern, webpage))
            if tabbed_videos:
                # Same arity as the single-video return below, so that
                # _real_extract() can unpack either result.
                return tabbed_videos, episode_id, upload_date, description, None

        # The page is already in memory; parse the deeplink object from it
        # instead of downloading the same URL a second time.
        data = self._extract_video_data(webpage, 'video data', episode_id)
        info = data.get('video_obj')
        if not isinstance(info, dict):
            raise ExtractorError('Unable to extract video data')
        redirect_uri = info.get('URI')
        if not redirect_uri:
            raise ExtractorError('Unable to extract video redirect URI')
        video_id = redirect_uri.replace(
            'https://urs.pbs.org/redirect/', '').replace('/', '')
        display_id = data.get('video_id')

        return video_id, display_id, None, description, info

    def _extract_video_data(self, string, name, video_id, fatal=True):
        """Parse the ``window._PBS_KIDS_DEEPLINK`` object found in ``string``.

        The JS object literal is converted with js_to_json before parsing.
        When the assignment is absent, the regex default ``'{}'`` is parsed,
        giving an empty dict. ``fatal`` is forwarded to _parse_json.
        """
        return self._parse_json(
            self._search_regex(
                [r'window\._PBS_KIDS_DEEPLINK\s*=\s*({.+?});'],
                string, name, default='{}'),
            video_id, transform_source=js_to_json, fatal=fatal)

    def _extract_redirect_formats(self, video_id, display_id):
        """Resolve the urs.pbs.org redirect for ``video_id`` into formats.

        Raises a geo-restriction error for ``http_code`` 403 and an expected
        ExtractorError for any other error status reported by the service.
        Returns the sorted list of formats.
        """
        redirect_url = 'https://urs.pbs.org/redirect/%s/' % video_id
        redirect_info = self._download_json(
            '%s?format=json' % redirect_url, display_id,
            'Downloading %s video url info' % (display_id or 0),
            headers=self.geo_verification_headers())

        if redirect_info['status'] == 'error':
            message = self._ERRORS.get(
                redirect_info['http_code'], redirect_info['message'])
            if redirect_info['http_code'] == 403:
                self.raise_geo_restricted(
                    msg=message, countries=self._GEO_COUNTRIES)
            raise ExtractorError(
                '%s said: %s' % (self.IE_NAME, message), expected=True)

        # Only one redirect is queried, so its URL is either an HLS manifest
        # or a single direct URL, never both.
        formats = []
        format_url = redirect_info.get('url')
        if format_url:
            if determine_ext(format_url) == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    format_url, display_id, 'mp4', m3u8_id='hls', fatal=False))
            else:
                formats.append({
                    'url': format_url,
                    'format_id': display_id,
                })
        self._remove_duplicate_formats(formats)
        self._sort_formats(formats)
        return formats

    @staticmethod
    def _caption_tracks(info):
        """Build the ``subtitles`` dict from ``info['closed_captions']``.

        Uses the ``URI`` of the first caption entry, with backslashes
        removed. Returns an empty dict when the entry or its URI is missing.
        """
        captions = info.get('closed_captions')
        if not isinstance(captions, list) or not captions:
            return {}
        first_caption = captions[0]
        if not isinstance(first_caption, dict):
            return {}
        caption_url = (first_caption.get('URI') or '').replace('\\', '')
        if not caption_url:
            return {}

        tracks = [{
            'ext': 'ttml',
            'url': caption_url,
        }]
        mobj = re.search(r'/(\d+)_Encoded\.dfxp', caption_url)
        if mobj:
            ttml_caption_suffix = mobj.group(0)
            ttml_caption_id = int(mobj.group(1))
            # The srt and vtt URLs are derived from the dfxp URL by bumping
            # the numeric id by one and two respectively.
            tracks.extend([{
                'url': caption_url.replace(
                    ttml_caption_suffix, '/%d_Encoded.srt' % (ttml_caption_id + 1)),
                'ext': 'srt',
            }, {
                'url': caption_url.replace(
                    ttml_caption_suffix, '/%d_Encoded.vtt' % (ttml_caption_id + 2)),
                'ext': 'vtt',
            }])
        return {'en': tracks}

    def _real_extract(self, url):
        """Return a playlist for multi-part pages, else a single video dict."""
        video_id, display_id, upload_date, description, info = self._extract_webpage(url)

        if isinstance(video_id, list):
            # NOTE(review): these urs.pbs.org redirect URLs do not match
            # _VALID_URL, so handing them back to this extractor ('PBSKIDS')
            # needs follow-up before multi-part pages can fully work.
            entries = [self.url_result(
                'https://urs.pbs.org/redirect/%s/?format=json' % vid_id, 'PBSKIDS', vid_id)
                for vid_id in video_id]
            return self.playlist_result(entries, display_id)

        if upload_date is None:
            upload_date = unified_strdate(info.get('air_date'))

        formats = self._extract_redirect_formats(video_id, display_id)

        rating_str = info.get('rating')
        if rating_str is not None:
            rating_str = rating_str.rpartition('-')[2]
        age_limit = US_RATINGS.get(rating_str)

        subtitles = self._caption_tracks(info)

        program = info.get('program') or {}

        # info['title'] is often incomplete (e.g. 'Full Episode', 'Episode 5', etc)
        # Try turning it to 'program - title' naming scheme if possible
        title = info['title']
        alt_title = program.get('title')
        if alt_title:
            # Escape the program title: it is page data, not a regex.
            title = alt_title + ' - ' + re.sub(
                r'^' + re.escape(alt_title) + r'[\s\-:]+', '', title)

        description = info.get('description') or program.get(
            'description') or description

        return {
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'description': description,
            'thumbnail': info.get('mezzanine'),
            'duration': int_or_none(info.get('duration')),
            'age_limit': age_limit,
            'upload_date': upload_date,
            'formats': formats,
            'subtitles': subtitles,
        }