From 32766d525a49fe020154e22c953da158fcd3c12e Mon Sep 17 00:00:00 2001 From: FliegendeWurst <2012gdwu@web.de> Date: Sun, 12 Apr 2020 01:40:13 +0200 Subject: [PATCH 1/8] [tele5] Prefer m3u8 download in extraction (closes #24674) --- youtube_dl/extractor/tele5.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/youtube_dl/extractor/tele5.py b/youtube_dl/extractor/tele5.py index 364556a1f..9f4e29234 100644 --- a/youtube_dl/extractor/tele5.py +++ b/youtube_dl/extractor/tele5.py @@ -91,6 +91,19 @@ class Tele5IE(InfoExtractor): media = self._download_json( 'https://cdn.jwplayer.com/v2/media/' + jwplatform_id, display_id) + + m3u8_url = try_get( + media, lambda x: x['playlist'][0]['sources'][0]['file'], compat_str) + + if m3u8_url: + formats = self._extract_m3u8_formats(m3u8_url, jwplatform_id, 'mp4', fatal=False) + return { + 'id': '%s' % jwplatform_id, + 'title': try_get(media, lambda x: x['title'], compat_str), + # TODO: description, thumbnail, duration + 'formats': formats + } + nexx_id = try_get( media, lambda x: x['playlist'][0]['nexx_id'], compat_str) From dae3dc6ab313791aa7fd43c019bbdafc5db1f94b Mon Sep 17 00:00:00 2001 From: FliegendeWurst <2012gdwu@web.de> Date: Sun, 12 Apr 2020 11:25:12 +0200 Subject: [PATCH 2/8] [tele5] [WIP] Clean up tests and simplify extractor --- youtube_dl/extractor/tele5.py | 72 +++++++++++++++++------------------ 1 file changed, 35 insertions(+), 37 deletions(-) diff --git a/youtube_dl/extractor/tele5.py b/youtube_dl/extractor/tele5.py index 9f4e29234..3d6367d85 100644 --- a/youtube_dl/extractor/tele5.py +++ b/youtube_dl/extractor/tele5.py @@ -19,35 +19,44 @@ from ..utils import ( class Tele5IE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?tele5\.de/(?:[^/]+/)*(?P[^/?#&]+)' _TESTS = [{ - 'url': 'https://www.tele5.de/mediathek/filme-online/videos?vid=1549416', + 'url': 'https://www.tele5.de/filme/schlefaz-der-polyp-die-bestie-mit-den-todesarmen-ab-13042018/', 'info_dict': { - 'id': '1549416', + 'id': 'XSWj0xbO', 'ext': 'mp4', - 'upload_date': '20180814', - 'timestamp': 1534290623, - 'title': 'Pandorum', + # fun fact: upload_date is not visible on the web page for this video + 'upload_date': '20200326', # this is a re-upload + 'timestamp': 1585190811, + 'duration': 8701.0, + 'title': 'SchleFaZ: Der Polyp - Die Bestie mit den Todesarmen (ab 13.04.2018)', + 'description': 'SchleFaZ: Der Polyp - Die Bestie mit den Todesarmen (ab 13.04.2018)' }, 'params': { - 'skip_download': True, - }, + 'skip_download': True + } }, { - 'url': 'https://www.tele5.de/kalkofes-mattscheibe/video-clips/politik-und-gesellschaft?ve_id=1551191', - 'only_matching': True, + 'url': 'https://www.tele5.de/filme/schlefaz-dragon-crusaders/', + 'info_dict': { + 'id': '1F8PHGxn', + 'ext': 'mp4', + 'upload_date': '20190509', + 'timestamp': 1557441600, + 'duration': 8181.0, + 'title': 'SchleFaZ: Dragon Crusaders', + 'description': 'Drachenzähmen schlecht gemacht! Oliver Kalkofe und Peter Rütten knöpfen sich mit "SchleFaZ: Dragon Crusaders" eine wahrhaft verhext-verflixte Drachen-Sause vor. Statt großer Kampf, großer Krampf. Nicht nur in den Füßen, die einem bei dem müden Fantasy-Abenteuer garantiert einschlafen!' + }, + 'params': { + 'skip_download': True + } }, { + # TODO: 400 Bad Request error on webpage, remove this test? (they might fix it eventually) 'url': 'https://www.tele5.de/video-clip/?ve_id=1609440', 'only_matching': True, - }, { - 'url': 'https://www.tele5.de/filme/schlefaz-dragon-crusaders/', - 'only_matching': True, }, { 'url': 'https://www.tele5.de/filme/making-of/avengers-endgame/', 'only_matching': True, }, { 'url': 'https://www.tele5.de/star-trek/raumschiff-voyager/ganze-folge/das-vinculum/', 'only_matching': True, - }, { - 'url': 'https://www.tele5.de/anders-ist-sevda/', - 'only_matching': True, }] def _real_extract(self, url): @@ -57,10 +66,11 @@ class Tele5IE(InfoExtractor): NEXX_ID_RE = r'\d{6,}' JWPLATFORM_ID_RE = r'[a-zA-Z0-9]{8}' + def nexx_url(nexx_id): + return 'https://api.nexx.cloud/v3/759/videos/byid/%s' % nexx_id + def nexx_result(nexx_id): - return self.url_result( - 'https://api.nexx.cloud/v3/759/videos/byid/%s' % nexx_id, - ie=NexxIE.ie_key(), video_id=nexx_id) + return self.url_result(nexx_url(nexx_id), ie=NexxIE.ie_key(), video_id=nexx_id) nexx_id = jwplatform_id = None @@ -77,38 +87,26 @@ class Tele5IE(InfoExtractor): def extract_id(pattern, name, default=NO_DEFAULT): return self._html_search_regex( (r'id\s*=\s*["\']video-player["\'][^>]+data-id\s*=\s*["\'](%s)' % pattern, - r'\s+id\s*=\s*["\']player_(%s)' % pattern, - r'\bdata-id\s*=\s*["\'](%s)' % pattern), webpage, name, + r'\s+id\s*=\s*["\']player_(%s)' % pattern, + r'\bdata-id\s*=\s*["\'](%s)' % pattern), webpage, name, default=default) + if not jwplatform_id: + jwplatform_id = extract_id(JWPLATFORM_ID_RE, 'jwplatform id') nexx_id = extract_id(NEXX_ID_RE, 'nexx id', default=None) if nexx_id: return nexx_result(nexx_id) - if not jwplatform_id: - jwplatform_id = extract_id(JWPLATFORM_ID_RE, 'jwplatform id') - media = self._download_json( 'https://cdn.jwplayer.com/v2/media/' + jwplatform_id, display_id) - m3u8_url = try_get( - media, lambda x: x['playlist'][0]['sources'][0]['file'], compat_str) - - if m3u8_url: - formats = self._extract_m3u8_formats(m3u8_url, jwplatform_id, 'mp4', fatal=False) - return { - 'id': '%s' % jwplatform_id, - 'title': try_get(media, lambda x: x['title'], compat_str), - # TODO: description, thumbnail, duration - 'formats': formats - } - nexx_id = try_get( media, lambda x: x['playlist'][0]['nexx_id'], compat_str) - if nexx_id: - return nexx_result(nexx_id) + # TODO: nexx offers more formats, but fails (404) on some videos + #if nexx_id: + #return nexx_result(nexx_id) return self.url_result( 'jwplatform:%s' % jwplatform_id, ie=JWPlatformIE.ie_key(), From d0dd97a0c17d76b115cf0f25cb8ef014699eda3d Mon Sep 17 00:00:00 2001 From: FliegendeWurst <2012gdwu@web.de> Date: Sun, 12 Apr 2020 11:28:26 +0200 Subject: [PATCH 3/8] [tele5] flake8 fixes --- youtube_dl/extractor/tele5.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/tele5.py b/youtube_dl/extractor/tele5.py index 3d6367d85..166db2c5c 100644 --- a/youtube_dl/extractor/tele5.py +++ b/youtube_dl/extractor/tele5.py @@ -24,15 +24,15 @@ class Tele5IE(InfoExtractor): 'id': 'XSWj0xbO', 'ext': 'mp4', # fun fact: upload_date is not visible on the web page for this video - 'upload_date': '20200326', # this is a re-upload + 'upload_date': '20200326', # this is a re-upload 'timestamp': 1585190811, 'duration': 8701.0, 'title': 'SchleFaZ: Der Polyp - Die Bestie mit den Todesarmen (ab 13.04.2018)', 'description': 'SchleFaZ: Der Polyp - Die Bestie mit den Todesarmen (ab 13.04.2018)' }, 'params': { - 'skip_download': True - } + 'skip_download': True, + }, }, { 'url': 'https://www.tele5.de/filme/schlefaz-dragon-crusaders/', 'info_dict': { @@ -45,8 +45,8 @@ class Tele5IE(InfoExtractor): 'description': 'Drachenzähmen schlecht gemacht! Oliver Kalkofe und Peter Rütten knöpfen sich mit "SchleFaZ: Dragon Crusaders" eine wahrhaft verhext-verflixte Drachen-Sause vor. Statt großer Kampf, großer Krampf. Nicht nur in den Füßen, die einem bei dem müden Fantasy-Abenteuer garantiert einschlafen!' }, 'params': { - 'skip_download': True - } + 'skip_download': True, + }, }, { # TODO: 400 Bad Request error on webpage, remove this test? (they might fix it eventually) 'url': 'https://www.tele5.de/video-clip/?ve_id=1609440', @@ -87,8 +87,8 @@ class Tele5IE(InfoExtractor): def extract_id(pattern, name, default=NO_DEFAULT): return self._html_search_regex( (r'id\s*=\s*["\']video-player["\'][^>]+data-id\s*=\s*["\'](%s)' % pattern, - r'\s+id\s*=\s*["\']player_(%s)' % pattern, - r'\bdata-id\s*=\s*["\'](%s)' % pattern), webpage, name, + r'\s+id\s*=\s*["\']player_(%s)' % pattern, + r'\bdata-id\s*=\s*["\'](%s)' % pattern), webpage, name, default=default) if not jwplatform_id: @@ -105,8 +105,8 @@ class Tele5IE(InfoExtractor): media, lambda x: x['playlist'][0]['nexx_id'], compat_str) # TODO: nexx offers more formats, but fails (404) on some videos - #if nexx_id: - #return nexx_result(nexx_id) + # if nexx_id: + # return nexx_result(nexx_id) return self.url_result( 'jwplatform:%s' % jwplatform_id, ie=JWPlatformIE.ie_key(), From 444a763e50c16692cbfff424c893881a0b033008 Mon Sep 17 00:00:00 2001 From: FliegendeWurst <2012gdwu@web.de> Date: Sun, 12 Apr 2020 11:33:55 +0200 Subject: [PATCH 4/8] [tele5] Undo some reorderings --- youtube_dl/extractor/tele5.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/tele5.py b/youtube_dl/extractor/tele5.py index 166db2c5c..67a7a286e 100644 --- a/youtube_dl/extractor/tele5.py +++ b/youtube_dl/extractor/tele5.py @@ -91,16 +91,16 @@ class Tele5IE(InfoExtractor): r'\bdata-id\s*=\s*["\'](%s)' % pattern), webpage, name, default=default) - if not jwplatform_id: - jwplatform_id = extract_id(JWPLATFORM_ID_RE, 'jwplatform id') nexx_id = extract_id(NEXX_ID_RE, 'nexx id', default=None) if nexx_id: return nexx_result(nexx_id) + if not jwplatform_id: + jwplatform_id = extract_id(JWPLATFORM_ID_RE, 'jwplatform id') + media = self._download_json( 'https://cdn.jwplayer.com/v2/media/' + jwplatform_id, display_id) - nexx_id = try_get( media, lambda x: x['playlist'][0]['nexx_id'], compat_str) From cba6cb6986e8b373c2f30ba0b9860cabb3105fea Mon Sep 17 00:00:00 2001 From: FliegendeWurst <2012gdwu@web.de> Date: Sun, 12 Apr 2020 13:22:20 +0200 Subject: [PATCH 5/8] [tele5] Playlist downloading --- youtube_dl/extractor/tele5.py | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/tele5.py b/youtube_dl/extractor/tele5.py index 67a7a286e..ae44b588a 100644 --- a/youtube_dl/extractor/tele5.py +++ b/youtube_dl/extractor/tele5.py @@ -12,7 +12,9 @@ from ..compat import ( ) from ..utils import ( NO_DEFAULT, + smuggle_url, try_get, + unsmuggle_url, ) @@ -48,9 +50,11 @@ class Tele5IE(InfoExtractor): 'skip_download': True, }, }, { - # TODO: 400 Bad Request error on webpage, remove this test? (they might fix it eventually) - 'url': 'https://www.tele5.de/video-clip/?ve_id=1609440', - 'only_matching': True, + 'url': 'https://www.tele5.de/timeless/', + 'info_dict': { + 'title': 'Timeless', + }, + 'playlist_count': 6, }, { 'url': 'https://www.tele5.de/filme/making-of/avengers-endgame/', 'only_matching': True, @@ -60,6 +64,9 @@ class Tele5IE(InfoExtractor): }] def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + + # TODO: do we really need this? qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) video_id = (qs.get('vid') or qs.get('ve_id') or [None])[0] @@ -84,6 +91,23 @@ class Tele5IE(InfoExtractor): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) + if not smuggled_data.get('force_singlevideo', False): + # TODO: user now has to specify --no-playlist every time (bad) + if not self._downloader.params.get('noplaylist'): + # TODO: use something other than a regex? + urls = re.findall('href="([^"]+)"\\s+class="special-video__link(?: video-teaser__link)?"', webpage, re.MULTILINE) + entries = [] + for url in urls: + entries.append({ + '_type': 'url_transparent', + 'ie_key': 'Tele5', + 'url': smuggle_url( + 'https://tele5.de%s' % url, + {'force_singlevideo': True}), + }) + title = self._html_search_regex("

([^<]+)

", webpage, 'playlist title') + return self.playlist_result(entries, playlist_title=title) + def extract_id(pattern, name, default=NO_DEFAULT): return self._html_search_regex( (r'id\s*=\s*["\']video-player["\'][^>]+data-id\s*=\s*["\'](%s)' % pattern, From 21631596ea7e8dadca11b04418fb071b5428cf66 Mon Sep 17 00:00:00 2001 From: FliegendeWurst <2012gdwu@web.de> Date: Sun, 12 Apr 2020 13:25:12 +0200 Subject: [PATCH 6/8] [tele5] comments --- youtube_dl/extractor/tele5.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tele5.py b/youtube_dl/extractor/tele5.py index ae44b588a..e6655d070 100644 --- a/youtube_dl/extractor/tele5.py +++ b/youtube_dl/extractor/tele5.py @@ -92,7 +92,7 @@ class Tele5IE(InfoExtractor): webpage = self._download_webpage(url, display_id) if not smuggled_data.get('force_singlevideo', False): - # TODO: user now has to specify --no-playlist every time (bad) + # TODO: user now has to specify --no-playlist every time (annoying and not expected) if not self._downloader.params.get('noplaylist'): # TODO: use something other than a regex? urls = re.findall('href="([^"]+)"\\s+class="special-video__link(?: video-teaser__link)?"', webpage, re.MULTILINE) @@ -105,6 +105,7 @@ class Tele5IE(InfoExtractor): 'https://tele5.de%s' % url, {'force_singlevideo': True}), }) + # TODO: use something other than a regex? title = self._html_search_regex("

([^<]+)

", webpage, 'playlist title') return self.playlist_result(entries, playlist_title=title) From 4599b560a0631d8096c6f40bc040a8b5329cf7d3 Mon Sep 17 00:00:00 2001 From: FliegendeWurst <2012gdwu@web.de> Date: Sun, 12 Apr 2020 13:29:34 +0200 Subject: [PATCH 7/8] [tele5] Fix extraction of videos without playlist --- youtube_dl/extractor/tele5.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/tele5.py b/youtube_dl/extractor/tele5.py index e6655d070..632065d51 100644 --- a/youtube_dl/extractor/tele5.py +++ b/youtube_dl/extractor/tele5.py @@ -106,8 +106,9 @@ class Tele5IE(InfoExtractor): {'force_singlevideo': True}), }) # TODO: use something other than a regex? - title = self._html_search_regex("

([^<]+)

", webpage, 'playlist title') - return self.playlist_result(entries, playlist_title=title) + title = re.search("

([^<]+)

", webpage, 0) + if title: + return self.playlist_result(entries, playlist_title=title.group(1)) def extract_id(pattern, name, default=NO_DEFAULT): return self._html_search_regex( From ad40ac7238a2a390593c38b4039d8f9935496e58 Mon Sep 17 00:00:00 2001 From: FliegendeWurst <2012gdwu@web.de> Date: Sun, 12 Apr 2020 16:14:32 +0200 Subject: [PATCH 8/8] [tele5] Add test that requires ?ve_id= support --- youtube_dl/extractor/tele5.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tele5.py b/youtube_dl/extractor/tele5.py index 632065d51..1b250a856 100644 --- a/youtube_dl/extractor/tele5.py +++ b/youtube_dl/extractor/tele5.py @@ -55,6 +55,21 @@ class Tele5IE(InfoExtractor): 'title': 'Timeless', }, 'playlist_count': 6, + }, { + 'url': 'https://www.tele5.de/kalkofes-welt/best-of-clips/worst-of-internet/?ve_id=dm2hJgJp', + 'info_dict': { + 'id': 'dm2hJgJp', + 'ext': 'mp4', + 'title': 'Freshtorge - Sandra trifft Frau Merkel', + 'upload_date': '20200326', + 'description': 'Freshtorge - Sandra trifft Frau Merkel', + 'timestamp': 1585185161, + 'duration': 170.0, + }, + 'params': { + 'noplaylist': True, + 'skip_download': True, + }, }, { 'url': 'https://www.tele5.de/filme/making-of/avengers-endgame/', 'only_matching': True, @@ -66,7 +81,6 @@ class Tele5IE(InfoExtractor): def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) - # TODO: do we really need this? qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) video_id = (qs.get('vid') or qs.get('ve_id') or [None])[0]