From c3f75fe42319f90f9a8b60e7aa708f653ef889cd Mon Sep 17 00:00:00 2001 From: ajj8 <35781586+ajj8@users.noreply.github.com> Date: Tue, 14 Jan 2020 21:00:56 +0000 Subject: [PATCH 01/10] [facebook] Add user video playlist extraction --- youtube_dl/extractor/facebook.py | 103 +++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index ce64e2683..6ce6bea55 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -3,6 +3,8 @@ from __future__ import unicode_literals import re import socket +import json +import itertools from .common import InfoExtractor from ..compat import ( @@ -509,3 +511,104 @@ class FacebookPluginsVideoIE(InfoExtractor): return self.url_result( compat_urllib_parse_unquote(self._match_id(url)), FacebookIE.ie_key()) + +class FacebookUserIE(InfoExtractor): + _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?facebook\.com/(?:pg/)?(?P[^/?#&]+))/videos(?!/\d)' + + _TESTS = [{ + # page + 'url': 'https://www.facebook.com/uniladmag/videos' + }, { + # page + 'url': 'https://www.facebook.com/Coca-Cola/videos/?ref=page_internal' + }, { + # profile + 'url': 'https://www.facebook.com/zuck/videos' + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + user_id = mobj.group('id') + + page = self._download_webpage( + url, user_id, 'Downloading user webpage') + fb_url = self._html_search_meta( + 'al:android:url', page, default=None) + fb_url_re = re.match(r'fb://(?Ppage|profile)/(?P\d+)', fb_url) + if not fb_url_re: + raise ExtractorError('Could not extract page ID', expected=False) + page_id = fb_url_re.group('id') + fb_dtsg_ag_re = re.search(r'"async_get_token":"([\w\-:]+)"', page) + pagelet_token_re = re.search(r'pagelet_token:"([\w\-]+)"', page) + collection_token_re = re.search(r'pagelet_timeline_app_collection_([\d:]+)', page) + cursor = None + entries = [] + + if fb_url_re.group('type') == 'page': + endpoint = 'PagesVideoHubVideoContainerPagelet' + a_class = '_5asm' + data = { + 'page': page_id + } + elif fb_url_re.group('type') == 'profile': + if not (fb_dtsg_ag_re and pagelet_token_re and collection_token_re): + raise ExtractorError('You must be logged in to extract profile videos', expected=True) + endpoint = 'VideosByUserAppCollectionPagelet' + a_class = '_400z' + data = { + 'collection_token': collection_token_re[1], + 'disablepager': False, + 'overview': False, + 'profile_id': page_id, + 'pagelet_token': pagelet_token_re[1], + 'order': None, + 'sk': 'videos' + } + + for page_num in itertools.count(1): + js_data_page = self._download_webpage( + 'https://www.facebook.com/ajax/pagelet/generic.php/%s' % endpoint, + user_id, 'Downloading page %d' % page_num, + query={ + 'fb_dtsg_ag': fb_dtsg_ag_re[1] if fb_dtsg_ag_re else None, + 'data': json.dumps( + {**data, 'cursor': cursor}, + separators=(',', ':')), + '__a': 1 + }) + + js_data = self._parse_json(self._search_regex( + r'({.+})', js_data_page, + 'js data', default='{}'), user_id, fatal=True) + + for video in re.findall( + r'href="(?P[^"]+)"[^>]+%s' % a_class, + js_data['payload']): + entries.append( + self.url_result('https://www.facebook.com%s' % video, FacebookIE.ie_key()) + ) + + cursor = None + if fb_url_re.group('type') == 'page': + if not 'instances' in js_data['jsmods']: + break + for parent in js_data['jsmods']['instances']: + if type(parent) is list: + for child in parent: + if type(child) is list: + for subchild in child: + if type(subchild) is dict and 'cursor' in subchild: + cursor = subchild['cursor'] + break + elif fb_url_re.group('type') == 'profile': + for parent in js_data['jsmods']['require']: + if type(parent) is list: + for child in parent: + if type(child) is list: + if len(child) == 3: + cursor = child[2] + break + if not cursor: + break + + return self.playlist_result(entries, user_id) From 5e83f922acd95b1933ee5e79548e65accb13c041 Mon Sep 17 00:00:00 2001 From: ajj8 <35781586+ajj8@users.noreply.github.com> Date: Tue, 14 Jan 2020 21:03:23 +0000 Subject: [PATCH 02/10] Update extractors.py --- youtube_dl/extractor/extractors.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 1cab440f4..a9349f3cd 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -334,6 +334,7 @@ from .eyedotv import EyedoTVIE from .facebook import ( FacebookIE, FacebookPluginsVideoIE, + FacebookUserIE, ) from .faz import FazIE from .fc2 import ( From a944eb8884c7af5e143b45deedcda82c64989db1 Mon Sep 17 00:00:00 2001 From: ajj8 <35781586+ajj8@users.noreply.github.com> Date: Tue, 14 Jan 2020 21:20:36 +0000 Subject: [PATCH 03/10] Update facebook.py --- youtube_dl/extractor/facebook.py | 51 +++++++++++++++++++------------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 6ce6bea55..36f52e1ef 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -534,62 +534,71 @@ class FacebookUserIE(InfoExtractor): url, user_id, 'Downloading user webpage') fb_url = self._html_search_meta( 'al:android:url', page, default=None) - fb_url_re = re.match(r'fb://(?Ppage|profile)/(?P\d+)', fb_url) - if not fb_url_re: + fb_url_mobj = re.match(r'fb://(?Ppage|profile)/(?P\d+)', fb_url) + if not fb_url_mobj: raise ExtractorError('Could not extract page ID', expected=False) - page_id = fb_url_re.group('id') - fb_dtsg_ag_re = re.search(r'"async_get_token":"([\w\-:]+)"', page) - pagelet_token_re = re.search(r'pagelet_token:"([\w\-]+)"', page) - collection_token_re = re.search(r'pagelet_timeline_app_collection_([\d:]+)', page) + page_id = fb_url_mobj.group('id') + fb_dtsg_ag = self._search_regex( + r'"async_get_token":"([\w\-:]+)"', + page, 'fb_dtsg_ag', default=None) + pagelet_token = self._search_regex( + r'pagelet_token:"([\w\-]+)"', + page, 'pagelet_token', default=None) + collection_token = self._search_regex( + r'pagelet_timeline_app_collection_([\d:]+)', + page, 'collection_token', default=None) cursor = None entries = [] - if fb_url_re.group('type') == 'page': + if fb_url_mobj.group('type') == 'page': endpoint = 'PagesVideoHubVideoContainerPagelet' a_class = '_5asm' data = { 'page': page_id } - elif fb_url_re.group('type') == 'profile': - if not (fb_dtsg_ag_re and pagelet_token_re and collection_token_re): + elif fb_url_mobj.group('type') == 'profile': + if not (fb_dtsg_ag and pagelet_token and collection_token): raise ExtractorError('You must be logged in to extract profile videos', expected=True) endpoint = 'VideosByUserAppCollectionPagelet' a_class = '_400z' data = { - 'collection_token': collection_token_re[1], + 'collection_token': collection_token, 'disablepager': False, 'overview': False, 'profile_id': page_id, - 'pagelet_token': pagelet_token_re[1], + 'pagelet_token': pagelet_token, 'order': None, 'sk': 'videos' - } + } for page_num in itertools.count(1): js_data_page = self._download_webpage( 'https://www.facebook.com/ajax/pagelet/generic.php/%s' % endpoint, user_id, 'Downloading page %d' % page_num, query={ - 'fb_dtsg_ag': fb_dtsg_ag_re[1] if fb_dtsg_ag_re else None, + 'fb_dtsg_ag': fb_dtsg_ag if fb_dtsg_ag else None, 'data': json.dumps( {**data, 'cursor': cursor}, separators=(',', ':')), '__a': 1 - }) + }) - js_data = self._parse_json(self._search_regex( - r'({.+})', js_data_page, - 'js data', default='{}'), user_id, fatal=True) + js_data = self._parse_json( + self._search_regex( + r'({.+})', js_data_page, + 'js data', default='{}'), + user_id, fatal=True) for video in re.findall( r'href="(?P[^"]+)"[^>]+%s' % a_class, js_data['payload']): entries.append( - self.url_result('https://www.facebook.com%s' % video, FacebookIE.ie_key()) - ) + self.url_result( + 'https://www.facebook.com%s' % video, + FacebookIE.ie_key())) cursor = None - if fb_url_re.group('type') == 'page': + if fb_url_mobj.group('type') == 'page': if not 'instances' in js_data['jsmods']: break for parent in js_data['jsmods']['instances']: @@ -600,7 +609,7 @@ class FacebookUserIE(InfoExtractor): if type(subchild) is dict and 'cursor' in subchild: cursor = subchild['cursor'] break - elif fb_url_re.group('type') == 'profile': + elif fb_url_mobj.group('type') == 'profile': for parent in js_data['jsmods']['require']: if type(parent) is list: for child in parent: From 81aa8614be3b47b4d6b2ee1f1bfae53bae9c5863 Mon Sep 17 00:00:00 2001 From: ajj8 <35781586+ajj8@users.noreply.github.com> Date: Tue, 14 Jan 2020 21:31:26 +0000 Subject: [PATCH 04/10] Update facebook.py --- youtube_dl/extractor/facebook.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 36f52e1ef..e6f156a56 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -512,6 +512,7 @@ class FacebookPluginsVideoIE(InfoExtractor): compat_urllib_parse_unquote(self._match_id(url)), FacebookIE.ie_key()) + class FacebookUserIE(InfoExtractor): _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?facebook\.com/(?:pg/)?(?P[^/?#&]+))/videos(?!/\d)' @@ -549,7 +550,7 @@ class FacebookUserIE(InfoExtractor): page, 'collection_token', default=None) cursor = None entries = [] - + if fb_url_mobj.group('type') == 'page': endpoint = 'PagesVideoHubVideoContainerPagelet' a_class = '_5asm' @@ -590,7 +591,7 @@ class FacebookUserIE(InfoExtractor): user_id, fatal=True) for video in re.findall( - r'href="(?P[^"]+)"[^>]+%s' % a_class, + r'href="(?P[^"]+)"[^>]+%s' % a_class, js_data['payload']): entries.append( self.url_result( @@ -599,7 +600,7 @@ class FacebookUserIE(InfoExtractor): cursor = None if fb_url_mobj.group('type') == 'page': - if not 'instances' in js_data['jsmods']: + if 'instances' not in js_data['jsmods']: break for parent in js_data['jsmods']['instances']: if type(parent) is list: From f99698f45dcf6137e7ca7274e7a0c6dc76e52b30 Mon Sep 17 00:00:00 2001 From: ajj8 <35781586+ajj8@users.noreply.github.com> Date: Wed, 15 Jan 2020 13:31:50 +0000 Subject: [PATCH 05/10] Update facebook.py --- youtube_dl/extractor/facebook.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index e6f156a56..56d8144c4 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -514,7 +514,7 @@ class FacebookPluginsVideoIE(InfoExtractor): class FacebookUserIE(InfoExtractor): - _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?facebook\.com/(?:pg/)?(?P[^/?#&]+))/videos(?!/\d)' + _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?facebook\.com/(?:pg/)?(?P[^/?#&]+))/videos(?!/[\w\.])' _TESTS = [{ # page From 0c4a0205c7f71b2dc223d1c10baf78e640947d84 Mon Sep 17 00:00:00 2001 From: ajj8 <35781586+ajj8@users.noreply.github.com> Date: Wed, 15 Jan 2020 15:59:23 +0000 Subject: [PATCH 06/10] Update facebook.py --- youtube_dl/extractor/facebook.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 56d8144c4..4f5bf1cd9 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -518,13 +518,15 @@ class FacebookUserIE(InfoExtractor): _TESTS = [{ # page - 'url': 'https://www.facebook.com/uniladmag/videos' - }, { - # page - 'url': 'https://www.facebook.com/Coca-Cola/videos/?ref=page_internal' + 'url': 'https://www.facebook.com/Coca-Cola/videos/?ref=page_internal', + 'info_dict': { + 'id': 'Coca-Cola' + }, + 'playlist_mincount': 90, }, { - # profile - 'url': 'https://www.facebook.com/zuck/videos' + # profile (requires login) + 'url': 'https://www.facebook.com/zuck/videos', + 'only_matching': True, }] def _real_extract(self, url): @@ -556,7 +558,7 @@ class FacebookUserIE(InfoExtractor): a_class = '_5asm' data = { 'page': page_id - } + } elif fb_url_mobj.group('type') == 'profile': if not (fb_dtsg_ag and pagelet_token and collection_token): raise ExtractorError('You must be logged in to extract profile videos', expected=True) @@ -570,7 +572,7 @@ class FacebookUserIE(InfoExtractor): 'pagelet_token': pagelet_token, 'order': None, 'sk': 'videos' - } + } for page_num in itertools.count(1): js_data_page = self._download_webpage( @@ -582,7 +584,7 @@ class FacebookUserIE(InfoExtractor): {**data, 'cursor': cursor}, separators=(',', ':')), '__a': 1 - }) + }) js_data = self._parse_json( self._search_regex( @@ -591,8 +593,8 @@ class FacebookUserIE(InfoExtractor): user_id, fatal=True) for video in re.findall( - r'href="(?P[^"]+)"[^>]+%s' % a_class, - js_data['payload']): + r'href="(?P[^"]+)"[^>]+%s' % a_class, + js_data['payload']): entries.append( self.url_result( 'https://www.facebook.com%s' % video, From 388df8892015fd6c11a4c74094dace50076d7b41 Mon Sep 17 00:00:00 2001 From: ajj8 <35781586+ajj8@users.noreply.github.com> Date: Wed, 15 Jan 2020 17:58:23 +0000 Subject: [PATCH 07/10] Update facebook.py --- youtube_dl/extractor/facebook.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 4f5bf1cd9..932ec41b7 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -26,6 +26,7 @@ from ..utils import ( sanitized_Request, try_get, urlencode_postdata, + merge_dicts, ) @@ -581,7 +582,7 @@ class FacebookUserIE(InfoExtractor): query={ 'fb_dtsg_ag': fb_dtsg_ag if fb_dtsg_ag else None, 'data': json.dumps( - {**data, 'cursor': cursor}, + merge_dicts(data, {'cursor': cursor}), separators=(',', ':')), '__a': 1 }) From 772f93a75dec588b5abd7495dde960df8758df2d Mon Sep 17 00:00:00 2001 From: ajj8 <35781586+ajj8@users.noreply.github.com> Date: Thu, 19 Mar 2020 12:00:23 +0000 Subject: [PATCH 08/10] Update facebook.py --- youtube_dl/extractor/facebook.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 932ec41b7..f47cf97db 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -516,6 +516,7 @@ class FacebookPluginsVideoIE(InfoExtractor): class FacebookUserIE(InfoExtractor): _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?facebook\.com/(?:pg/)?(?P[^/?#&]+))/videos(?!/[\w\.])' + IE_NAME = "facebook:user" _TESTS = [{ # page @@ -556,15 +557,14 @@ class FacebookUserIE(InfoExtractor): if fb_url_mobj.group('type') == 'page': endpoint = 'PagesVideoHubVideoContainerPagelet' - a_class = '_5asm' data = { 'page': page_id } + page_re = r']+>.+?' elif fb_url_mobj.group('type') == 'profile': if not (fb_dtsg_ag and pagelet_token and collection_token): raise ExtractorError('You must be logged in to extract profile videos', expected=True) endpoint = 'VideosByUserAppCollectionPagelet' - a_class = '_400z' data = { 'collection_token': collection_token, 'disablepager': False, @@ -574,6 +574,7 @@ class FacebookUserIE(InfoExtractor): 'order': None, 'sk': 'videos' } + page_re = r']+>' for page_num in itertools.count(1): js_data_page = self._download_webpage( @@ -594,8 +595,7 @@ class FacebookUserIE(InfoExtractor): user_id, fatal=True) for video in re.findall( - r'href="(?P[^"]+)"[^>]+%s' % a_class, - js_data['payload']): + page_re, js_data['payload']): entries.append( self.url_result( 'https://www.facebook.com%s' % video, From 09eed1064443f6568412658ea17bd1be05e34f26 Mon Sep 17 00:00:00 2001 From: ajj8 <35781586+ajj8@users.noreply.github.com> Date: Sun, 29 Mar 2020 03:06:06 +0100 Subject: [PATCH 09/10] Update facebook.py --- youtube_dl/extractor/facebook.py | 43 +++++++++++++++++++------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index f47cf97db..54d00c7ec 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -22,11 +22,12 @@ from ..utils import ( int_or_none, js_to_json, limit_length, + merge_dicts, parse_count, sanitized_Request, try_get, urlencode_postdata, - merge_dicts, + urljoin, ) @@ -515,8 +516,14 @@ class FacebookPluginsVideoIE(InfoExtractor): class FacebookUserIE(InfoExtractor): - _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?facebook\.com/(?:pg/)?(?P[^/?#&]+))/videos(?!/[\w\.])' - IE_NAME = "facebook:user" + _VALID_URL = r'https?://(?:[^/]+\.)?facebook\.com/(?:pg/)?(?P[^/?#&]+)/videos' + IE_NAME = 'facebook:user' + + @classmethod + def suitable(cls, url): + return (False + if FacebookIE.suitable(url) + else super(FacebookUserIE, cls).suitable(url)) _TESTS = [{ # page @@ -532,17 +539,17 @@ class FacebookUserIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - user_id = mobj.group('id') + user_id = self._match_id(url) page = self._download_webpage( url, user_id, 'Downloading user webpage') fb_url = self._html_search_meta( 'al:android:url', page, default=None) - fb_url_mobj = re.match(r'fb://(?Ppage|profile)/(?P\d+)', fb_url) - if not fb_url_mobj: - raise ExtractorError('Could not extract page ID', expected=False) - page_id = fb_url_mobj.group('id') + fb_url_re = r'fb://(?Ppage|profile)/(?P\d+)' + page_type = self._search_regex( + fb_url_re, fb_url, 'page type', group='type') + page_id = self._search_regex( + fb_url_re, fb_url, 'page id', group='id') fb_dtsg_ag = self._search_regex( r'"async_get_token":"([\w\-:]+)"', page, 'fb_dtsg_ag', default=None) @@ -555,15 +562,15 @@ class FacebookUserIE(InfoExtractor): cursor = None entries = [] - if fb_url_mobj.group('type') == 'page': + if page_type == 'page': endpoint = 'PagesVideoHubVideoContainerPagelet' data = { 'page': page_id } page_re = r']+>.+?' - elif fb_url_mobj.group('type') == 'profile': + elif page_type == 'profile': if not (fb_dtsg_ag and pagelet_token and collection_token): - raise ExtractorError('You must be logged in to extract profile videos', expected=True) + raise ExtractorError('You must use cookies to extract profile videos', expected=True) endpoint = 'VideosByUserAppCollectionPagelet' data = { 'collection_token': collection_token, @@ -592,17 +599,19 @@ class FacebookUserIE(InfoExtractor): self._search_regex( r'({.+})', js_data_page, 'js data', default='{}'), - user_id, fatal=True) + user_id) for video in re.findall( page_re, js_data['payload']): + video_url = urljoin('https://www.facebook.com', video) + if not FacebookIE.suitable(video_url): + continue entries.append( self.url_result( - 'https://www.facebook.com%s' % video, - FacebookIE.ie_key())) + video_url, FacebookIE.ie_key())) cursor = None - if fb_url_mobj.group('type') == 'page': + if page_type == 'page': if 'instances' not in js_data['jsmods']: break for parent in js_data['jsmods']['instances']: @@ -613,7 +622,7 @@ class FacebookUserIE(InfoExtractor): if type(subchild) is dict and 'cursor' in subchild: cursor = subchild['cursor'] break - elif fb_url_mobj.group('type') == 'profile': + elif page_type == 'profile': for parent in js_data['jsmods']['require']: if type(parent) is list: for child in parent: From 76b1f2a974fda99c79ec17f727896339fa1d6c3a Mon Sep 17 00:00:00 2001 From: ajj8 <35781586+ajj8@users.noreply.github.com> Date: Sun, 29 Mar 2020 03:22:02 +0100 Subject: [PATCH 10/10] flake8 --- youtube_dl/extractor/facebook.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 54d00c7ec..efa2c0f69 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -518,7 +518,7 @@ class FacebookPluginsVideoIE(InfoExtractor): class FacebookUserIE(InfoExtractor): _VALID_URL = r'https?://(?:[^/]+\.)?facebook\.com/(?:pg/)?(?P[^/?#&]+)/videos' IE_NAME = 'facebook:user' - + @classmethod def suitable(cls, url): return (False