From 9dce8410d29c9bb228e16ee77540a72cd7d8b3fa Mon Sep 17 00:00:00 2001 From: Crypto90 Date: Wed, 1 Jul 2020 03:01:32 +0200 Subject: [PATCH 01/16] Update youtube.py (#25848 #25720 #16627 #25652) --- youtube_dl/extractor/youtube.py | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index ef08bf8cb..9696941c1 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -326,35 +326,48 @@ class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor): class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): def _process_page(self, content): - for video_id, video_title in self.extract_videos_from_page(content): - yield self.url_result(video_id, 'Youtube', video_id, video_title) - - def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page): + for video_id, video_title, video_duration in self.extract_videos_from_page(content): + if len(video_id) == 11: + # Youtube video id found + yield self.url_result(video_id, 'Youtube', video_id, video_title) + elif len(video_id) > 11: + # Youtube playlist id found + yield self.url_result('https://www.youtube.com/playlist?list=%s' % video_id, 'YoutubePlaylist', video_id, video_title) + + def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page, durations_in_page): for mobj in re.finditer(video_re, page): # The link with index 0 is not the first video of the playlist (not sure if still actual) if 'index' in mobj.groupdict() and mobj.group('id') == '0': continue video_id = mobj.group('id') - video_title = unescapeHTML( - mobj.group('title')) if 'title' in mobj.groupdict() else None + playlist_id = mobj.group('plid') if 'plid' in mobj.groupdict() else None + if playlist_id is not None: + video_id = playlist_id + video_title = unescapeHTML(mobj.group('title')) if 'title' in mobj.groupdict() else None if video_title: 
video_title = video_title.strip() if video_title == '► Play all': video_title = None + video_duration = mobj.group('duration') if 'duration' in mobj.groupdict() else None + if video_duration: + video_duration = video_duration.strip() try: idx = ids_in_page.index(video_id) if video_title and not titles_in_page[idx]: titles_in_page[idx] = video_title + if video_duration and not durations_in_page[idx]: + durations_in_page[idx] = video_duration except ValueError: ids_in_page.append(video_id) titles_in_page.append(video_title) + durations_in_page.append(video_duration) def extract_videos_from_page(self, page): ids_in_page = [] titles_in_page = [] - self.extract_videos_from_page_impl( - self._VIDEO_RE, page, ids_in_page, titles_in_page) - return zip(ids_in_page, titles_in_page) + durations_in_page = [] + self.extract_videos_from_page_impl(self._VIDEO_RE, page, ids_in_page, titles_in_page, durations_in_page) + return zip(ids_in_page, titles_in_page, durations_in_page) class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): @@ -3149,7 +3162,7 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor): - _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?' + _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(&list=(?P<plid>[0-9A-Za-z_-]+))?(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?(.*Duration:\s*(?P<duration>([0-1]?[0-9]|2[0-3]):[0-5][0-9]))?' 
class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor): From c35602602583f63d2fef6251e0143d001ae447fd Mon Sep 17 00:00:00 2001 From: Crypto90 <support@orangesiri.com> Date: Wed, 1 Jul 2020 12:23:31 +0200 Subject: [PATCH 02/16] Update common.py Added video duration variable to url_result() --- youtube_dl/extractor/common.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index a61753b17..96300036b 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -941,7 +941,7 @@ class InfoExtractor(object): # Methods for following #608 @staticmethod - def url_result(url, ie=None, video_id=None, video_title=None): + def url_result(url, ie=None, video_id=None, video_title=None, video_duration=None): """Returns a URL that points to a page that should be processed""" # TODO: ie should be the class used for getting the info video_info = {'_type': 'url', @@ -951,6 +951,8 @@ class InfoExtractor(object): video_info['id'] = video_id if video_title is not None: video_info['title'] = video_title + if video_duration is not None: + video_info['duration'] = video_duration return video_info def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None): From 2a56ab7fd585105970350871c8570bf8093f632f Mon Sep 17 00:00:00 2001 From: Crypto90 <support@orangesiri.com> Date: Wed, 1 Jul 2020 12:24:27 +0200 Subject: [PATCH 03/16] Update youtube.py --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 9696941c1..2629645c5 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -329,7 +329,7 @@ class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): for video_id, video_title, video_duration in self.extract_videos_from_page(content): if len(video_id) == 11: # Youtube 
video id found - yield self.url_result(video_id, 'Youtube', video_id, video_title) + yield self.url_result(video_id, 'Youtube', video_id, video_title, video_duration) elif len(video_id) > 11: # Youtube playlist id found yield self.url_result('https://www.youtube.com/playlist?list=%s' % video_id, 'YoutubePlaylist', video_id, video_title) From 4fd0424fbf8efb2a241abd2567470e93926ce594 Mon Sep 17 00:00:00 2001 From: Crypto90 <support@orangesiri.com> Date: Wed, 1 Jul 2020 13:46:43 +0200 Subject: [PATCH 04/16] Update youtube.py Fixed other calls with now wrong extract_videos_from_page_impl() argument count. --- youtube_dl/extractor/youtube.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 2629645c5..70157a0c4 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2758,6 +2758,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): def extract_videos_from_page(self, page): ids_in_page = [] titles_in_page = [] + durations_in_page = [] for item in re.findall( r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)', page): @@ -2768,20 +2769,21 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): video_title = video_title.strip() ids_in_page.append(video_id) titles_in_page.append(video_title) + #TODO ADD VIDEO DURATION HERE TOO? 
# Fallback with old _VIDEO_RE self.extract_videos_from_page_impl( - self._VIDEO_RE, page, ids_in_page, titles_in_page) + self._VIDEO_RE, page, ids_in_page, titles_in_page, durations_in_page) # Relaxed fallbacks self.extract_videos_from_page_impl( r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})', page, - ids_in_page, titles_in_page) + ids_in_page, titles_in_page, durations_in_page) self.extract_videos_from_page_impl( r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})', page, - ids_in_page, titles_in_page) + ids_in_page, titles_in_page, durations_in_page) - return zip(ids_in_page, titles_in_page) + return zip(ids_in_page, titles_in_page, durations_in_page) def _extract_mix(self, playlist_id): # The mixes are generated from a single video From d91421e5e06ba29306f77758d0be4dd42dc43f60 Mon Sep 17 00:00:00 2001 From: Crypto90 <support@orangesiri.com> Date: Wed, 1 Jul 2020 15:34:59 +0200 Subject: [PATCH 05/16] Flake8 comment format fix.. --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 70157a0c4..eabb3f736 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2769,7 +2769,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): video_title = video_title.strip() ids_in_page.append(video_id) titles_in_page.append(video_title) - #TODO ADD VIDEO DURATION HERE TOO? + # TODO: ADD VIDEO DURATION HERE TOO? # Fallback with old _VIDEO_RE self.extract_videos_from_page_impl( From 438ec47af3825ef172494ec78f0f53d32e4477ba Mon Sep 17 00:00:00 2001 From: Crypto90 <support@orangesiri.com> Date: Wed, 1 Jul 2020 19:42:11 +0200 Subject: [PATCH 06/16] Update youtube.py Added current duration placeholder for playlist content videos parsing. Next step is to edit regex there too, to get the duration of each video in a playlist. 
--- youtube_dl/extractor/youtube.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index eabb3f736..43a233ddb 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2769,7 +2769,8 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): video_title = video_title.strip() ids_in_page.append(video_id) titles_in_page.append(video_title) - # TODO: ADD VIDEO DURATION HERE TOO? + # TODO: ADD VIDEO DURATION HERE TOO! + durations_in_page.append(None) # Fallback with old _VIDEO_RE self.extract_videos_from_page_impl( From 5f6f4fda9bbb1d3e52135c756dabfcd555547da6 Mon Sep 17 00:00:00 2001 From: Crypto90 <support@orangesiri.com> Date: Wed, 1 Jul 2020 19:54:23 +0200 Subject: [PATCH 07/16] Update youtube.py Changed playlist search result urls to provide video id and playlist id. --- youtube_dl/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 43a233ddb..7e024acde 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -332,7 +332,7 @@ class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): yield self.url_result(video_id, 'Youtube', video_id, video_title, video_duration) elif len(video_id) > 11: # Youtube playlist id found - yield self.url_result('https://www.youtube.com/playlist?list=%s' % video_id, 'YoutubePlaylist', video_id, video_title) + yield self.url_result('https://www.youtube.com/watch?v=%s&list=%s' % (video_id.split(';')[0], video_id.split(';')[1]), 'YoutubePlaylist', video_id, video_title) def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page, durations_in_page): for mobj in re.finditer(video_re, page): @@ -342,7 +342,7 @@ class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): video_id = mobj.group('id') playlist_id = mobj.group('plid') if 
'plid' in mobj.groupdict() else None if playlist_id is not None: - video_id = playlist_id + video_id = video_id + ';' + playlist_id video_title = unescapeHTML(mobj.group('title')) if 'title' in mobj.groupdict() else None if video_title: video_title = video_title.strip() From 2cfce2b63e6147e38547a0577774491eca185647 Mon Sep 17 00:00:00 2001 From: Crypto90 <support@orangesiri.com> Date: Wed, 1 Jul 2020 19:58:57 +0200 Subject: [PATCH 08/16] Update youtube.py Only return playlist id for "id" result. --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 7e024acde..591fcc0e6 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -332,7 +332,7 @@ class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): yield self.url_result(video_id, 'Youtube', video_id, video_title, video_duration) elif len(video_id) > 11: # Youtube playlist id found - yield self.url_result('https://www.youtube.com/watch?v=%s&list=%s' % (video_id.split(';')[0], video_id.split(';')[1]), 'YoutubePlaylist', video_id, video_title) + yield self.url_result('https://www.youtube.com/watch?v=%s&list=%s' % (video_id.split(';')[0], video_id.split(';')[1]), 'YoutubePlaylist', video_id.split(';')[1], video_title) def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page, durations_in_page): for mobj in re.finditer(video_re, page): From 0289ffc4f4917aea794a3c595613a8f502b931aa Mon Sep 17 00:00:00 2001 From: Crypto90 <support@orangesiri.com> Date: Wed, 1 Jul 2020 20:17:58 +0200 Subject: [PATCH 09/16] Reverted last changes. Reverted last changes. Seems like the result urls have to be in the format "https://www.youtube.com/playlist?list=%s", can't temporarily add "video_id;playlist_id" because it gets messed up with the regex parsing and storing to array logic. 
--- youtube_dl/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 591fcc0e6..43a233ddb 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -332,7 +332,7 @@ class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): yield self.url_result(video_id, 'Youtube', video_id, video_title, video_duration) elif len(video_id) > 11: # Youtube playlist id found - yield self.url_result('https://www.youtube.com/watch?v=%s&list=%s' % (video_id.split(';')[0], video_id.split(';')[1]), 'YoutubePlaylist', video_id.split(';')[1], video_title) + yield self.url_result('https://www.youtube.com/playlist?list=%s' % video_id, 'YoutubePlaylist', video_id, video_title) def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page, durations_in_page): for mobj in re.finditer(video_re, page): @@ -342,7 +342,7 @@ class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): video_id = mobj.group('id') playlist_id = mobj.group('plid') if 'plid' in mobj.groupdict() else None if playlist_id is not None: - video_id = video_id + ';' + playlist_id + video_id = playlist_id video_title = unescapeHTML(mobj.group('title')) if 'title' in mobj.groupdict() else None if video_title: video_title = video_title.strip() From 796a691e95ce86bf4db6edc13f8aac2cc6290536 Mon Sep 17 00:00:00 2001 From: Crypto90 <support@orangesiri.com> Date: Thu, 2 Jul 2020 21:59:11 +0200 Subject: [PATCH 10/16] Now also extracting youtube playlist videos count and resulting it as duration for playlist searches. Updated _VIDEO_RE regex for YoutubeSearchBaseInfoExtractor class to extract videos count for youtube playlist results and passing the result to "duration" output field. So "duration" shows the duration in "HH:MM:SS" for video results or the amount of videos in a youtube playlist for a playlist result eg. "1,234" or "24" . 
--- youtube_dl/extractor/youtube.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 43a233ddb..9271aea1c 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -332,7 +332,7 @@ class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): yield self.url_result(video_id, 'Youtube', video_id, video_title, video_duration) elif len(video_id) > 11: # Youtube playlist id found - yield self.url_result('https://www.youtube.com/playlist?list=%s' % video_id, 'YoutubePlaylist', video_id, video_title) + yield self.url_result('https://www.youtube.com/playlist?list=%s' % video_id, 'YoutubePlaylist', video_id, video_title, video_duration) def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page, durations_in_page): for mobj in re.finditer(video_re, page): @@ -349,6 +349,9 @@ class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): if video_title == '► Play all': video_title = None video_duration = mobj.group('duration') if 'duration' in mobj.groupdict() else None + playlist_count = mobj.group('plcounter') if 'plcounter' in mobj.groupdict() else None + if playlist_id is not None and playlist_count is not None: + video_duration = playlist_count if video_duration: video_duration = video_duration.strip() try: @@ -3165,7 +3168,7 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor): - _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(&list=(?P<plid>[0-9A-Za-z_-]+))?(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?(.*Duration:\s*(?P<duration>([0-1]?[0-9]|2[0-3]):[0-5][0-9]))?' 
+ _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(&list=(?P<plid>[0-9A-Za-z_-]+))?(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?(?:((?!formatted-video-count-label)[\s\S])*[^\d]+(?P<plcounter>[0-9,.]+))?(.*Duration:\s*(?P<duration>([0-1]?[0-9]|2[0-3]):[0-5][0-9]))?' class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor): From 585017371794f5382091e284805ae5556763cb4d Mon Sep 17 00:00:00 2001 From: Crypto90 <support@orangesiri.com> Date: Thu, 2 Jul 2020 23:41:05 +0200 Subject: [PATCH 11/16] Fixing an issue with regex of last commit. --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 9271aea1c..2fc78bf1f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3168,7 +3168,7 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor): - _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(&list=(?P<plid>[0-9A-Za-z_-]+))?(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?(?:((?!formatted-video-count-label)[\s\S])*[^\d]+(?P<plcounter>[0-9,.]+))?(.*Duration:\s*(?P<duration>([0-1]?[0-9]|2[0-3]):[0-5][0-9]))?' + _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(&list=(?P<plid>[0-9A-Za-z_-]+))?(((?!formatted-video-count-label|tile-link)[\s\S])*[^\d]+(?P<plcounter>[0-9,.]+))?((?:[^\"]*"[^>]+\btitle="(?P<title>[^\"]+)))?(?:.*Duration:\s*(?P<duration>([0-1]?[0-9]|2[0-3]):[0-5][0-9]))?' class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor): From f49bfa7e60de538bd208a9e641d246dca0c863df Mon Sep 17 00:00:00 2001 From: Crypto90 <support@orangesiri.com> Date: Thu, 2 Jul 2020 23:59:35 +0200 Subject: [PATCH 12/16] Optimized _VIDEO_RE regex. 
--- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 2fc78bf1f..136b5fa8e 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3168,7 +3168,7 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor): - _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(&list=(?P<plid>[0-9A-Za-z_-]+))?(((?!formatted-video-count-label|tile-link)[\s\S])*[^\d]+(?P<plcounter>[0-9,.]+))?((?:[^\"]*"[^>]+\btitle="(?P<title>[^\"]+)))?(?:.*Duration:\s*(?P<duration>([0-1]?[0-9]|2[0-3]):[0-5][0-9]))?' + _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(&list=(?P<plid>[0-9A-Za-z_-]+))?(((?!formatted-video-count-label|tile-link)[\s\S])*"[^\d]+(?P<plcounter>[0-9,.]+)</b>\svideos)?((?:[^\"]*"[^>]+\btitle="(?P<title>[^\"]+)))?(?:.*Duration:\s*(?P<duration>([0-1]?[0-9]|2[0-3]):[0-5][0-9]))?' class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor): From 73670d03df999e578d3f1275d57dc3c13d9dc765 Mon Sep 17 00:00:00 2001 From: Crypto90 <support@orangesiri.com> Date: Fri, 3 Jul 2020 00:32:24 +0200 Subject: [PATCH 13/16] Updated regex to fix an issue case where the playlist video counter of a wrong (next) playlist got matched. 
--- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 136b5fa8e..cb3588f40 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3168,7 +3168,7 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor): - _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(&list=(?P<plid>[0-9A-Za-z_-]+))?(((?!formatted-video-count-label|tile-link)[\s\S])*"[^\d]+(?P<plcounter>[0-9,.]+)</b>\svideos)?((?:[^\"]*"[^>]+\btitle="(?P<title>[^\"]+)))?(?:.*Duration:\s*(?P<duration>([0-1]?[0-9]|2[0-3]):[0-5][0-9]))?' + _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(&list=(?P<plid>[0-9A-Za-z_-]+))?(((?!formatted-video-count-label|tile-link|href="\s*/watch)[\s\S])*"[^\d]+(?P<plcounter>[0-9,.]+)</b>\svideos)?((?:[^\"]*"[^>]+\btitle="(?P<title>[^\"]+)))?(?:.*Duration:\s*(?P<duration>([0-1]?[0-9]|2[0-3]):[0-5][0-9]))?' class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor): From 45419a78631a74a629df9c8dbbaddb18c49a5674 Mon Sep 17 00:00:00 2001 From: Crypto90 <support@orangesiri.com> Date: Tue, 21 Jul 2020 15:47:19 +0200 Subject: [PATCH 14/16] Changed playlist result urls to provide video id. Changed playlist result urls to provide video id which is needed to keep the information which cover got used to show for the playlist. With the video id, cover image urls for the playlist result can get created. 
--- youtube_dl/extractor/youtube.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index cb3588f40..edcdc7b88 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -326,20 +326,21 @@ class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor): class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): def _process_page(self, content): - for video_id, video_title, video_duration in self.extract_videos_from_page(content): + for video_id, video_title, video_duration, playlist_video_id in self.extract_videos_from_page(content): if len(video_id) == 11: # Youtube video id found yield self.url_result(video_id, 'Youtube', video_id, video_title, video_duration) elif len(video_id) > 11: # Youtube playlist id found - yield self.url_result('https://www.youtube.com/playlist?list=%s' % video_id, 'YoutubePlaylist', video_id, video_title, video_duration) + yield self.url_result('https://www.youtube.com/watch?v=%s&list=%s' % (playlist_video_id, video_id), 'YoutubePlaylist', video_id, video_title, video_duration) - def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page, durations_in_page): + def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page, durations_in_page, playlist_video_id_in_page): for mobj in re.finditer(video_re, page): # The link with index 0 is not the first video of the playlist (not sure if still actual) if 'index' in mobj.groupdict() and mobj.group('id') == '0': continue - video_id = mobj.group('id') + video_id_original = mobj.group('id') + video_id = video_id_original playlist_id = mobj.group('plid') if 'plid' in mobj.groupdict() else None if playlist_id is not None: video_id = playlist_id @@ -360,17 +361,21 @@ class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): titles_in_page[idx] = video_title if video_duration and not 
durations_in_page[idx]: durations_in_page[idx] = video_duration + if playlist_id is not None and not playlist_video_id_in_page[idx]: + playlist_video_id_in_page[idx] = video_id_original except ValueError: ids_in_page.append(video_id) titles_in_page.append(video_title) durations_in_page.append(video_duration) + playlist_video_id_in_page.append(video_id_original) def extract_videos_from_page(self, page): ids_in_page = [] + playlist_video_id_in_page = [] titles_in_page = [] durations_in_page = [] - self.extract_videos_from_page_impl(self._VIDEO_RE, page, ids_in_page, titles_in_page, durations_in_page) - return zip(ids_in_page, titles_in_page, durations_in_page) + self.extract_videos_from_page_impl(self._VIDEO_RE, page, ids_in_page, titles_in_page, durations_in_page, playlist_video_id_in_page) + return zip(ids_in_page, titles_in_page, durations_in_page, playlist_video_id_in_page) class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): @@ -2772,7 +2777,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): video_title = video_title.strip() ids_in_page.append(video_id) titles_in_page.append(video_title) - # TODO: ADD VIDEO DURATION HERE TOO! + # TODO: ADD VIDEO DURATION HERE TOO? durations_in_page.append(None) # Fallback with old _VIDEO_RE From c303cb7efd2e69239573105dd5ec5a5b3abcd0bb Mon Sep 17 00:00:00 2001 From: Crypto90 <support@orangesiri.com> Date: Tue, 21 Jul 2020 16:12:46 +0200 Subject: [PATCH 15/16] Fixed argument count error. 
--- youtube_dl/extractor/youtube.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index edcdc7b88..24d07d2ea 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2767,6 +2767,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): ids_in_page = [] titles_in_page = [] durations_in_page = [] + playlist_video_id_in_page = [] for item in re.findall( r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)', page): @@ -2779,20 +2780,21 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): titles_in_page.append(video_title) # TODO: ADD VIDEO DURATION HERE TOO? durations_in_page.append(None) + playlist_video_id_in_page.append(None) # Fallback with old _VIDEO_RE self.extract_videos_from_page_impl( - self._VIDEO_RE, page, ids_in_page, titles_in_page, durations_in_page) + self._VIDEO_RE, page, ids_in_page, titles_in_page, durations_in_page, playlist_video_id_in_page) # Relaxed fallbacks self.extract_videos_from_page_impl( r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})', page, - ids_in_page, titles_in_page, durations_in_page) + ids_in_page, titles_in_page, durations_in_page, playlist_video_id_in_page) self.extract_videos_from_page_impl( r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})', page, - ids_in_page, titles_in_page, durations_in_page) + ids_in_page, titles_in_page, durations_in_page, playlist_video_id_in_page) - return zip(ids_in_page, titles_in_page, durations_in_page) + return zip(ids_in_page, titles_in_page, durations_in_page, playlist_video_id_in_page) def _extract_mix(self, playlist_id): # The mixes are generated from a single video From 677de2d47861fcc0664d160adfb21440e4d607b4 Mon Sep 17 00:00:00 2001 From: Crypto90 <support@orangesiri.com> Date: Thu, 30 Jul 2020 09:26:58 +0200 Subject: [PATCH 16/16] Fixed broken next page parsing Added changes from this pull request for broken next page parsing: 
https://github.com/ytdl-org/youtube-dl/pull/26163 --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 24d07d2ea..b014a600f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3217,7 +3217,7 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor): if not new_videos or len(videos) > limit: break next_link = self._html_search_regex( - r'href="(/results\?[^"]*\bsp=[^"]+)"[^>]*>\s*<span[^>]+class="[^"]*\byt-uix-button-content\b[^"]*"[^>]*>Next', + r'href="(/results\?[^"]*\b(?:sp=[^"]+)?)"[^>]*>\s*<span[^>]+class="[^"]*\byt-uix-button-content\b[^"]*"[^>]*>Next', html_content, 'next link', default=None) if next_link is None: break