From 0c75abbb7bb9135d145805e86c87a5a43b69ac15 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Wed, 24 Aug 2016 23:58:22 +0800 Subject: [PATCH] [mtvservices:embedded] Use another endpoint to get feed URL Closes #10363 In the original mtvservices:embedded test case, config.xml is still used to get the feed URL. Some other examples, including test_Generic_40 (http://www.vulture.com/2016/06/new-key-peele-sketches-released.html), and the video mentioned in #10363, use another endpoint to get the feed URL. The 'index.html' approach works for the original test case, too. So I didn't keep the old approach. --- ChangeLog | 6 ++++++ youtube_dl/extractor/bet.py | 5 ++--- youtube_dl/extractor/mtv.py | 27 +++++++++++++-------------- youtube_dl/extractor/nick.py | 5 ++--- 4 files changed, 23 insertions(+), 20 deletions(-) diff --git a/ChangeLog b/ChangeLog index 4f3f1265f..c3cc8f38f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +version + +Extractors +* [mtvservices:embedded] Fix extraction for the new 'edge' player (#10363) + + version 2016.08.24.1 Extractors diff --git a/youtube_dl/extractor/bet.py b/youtube_dl/extractor/bet.py index bd3ee2e2e..1f8ef0303 100644 --- a/youtube_dl/extractor/bet.py +++ b/youtube_dl/extractor/bet.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals from .mtv import MTVServicesInfoExtractor from ..utils import unified_strdate -from ..compat import compat_urllib_parse_urlencode class BetIE(MTVServicesInfoExtractor): @@ -53,9 +52,9 @@ class BetIE(MTVServicesInfoExtractor): _FEED_URL = "http://feeds.mtvnservices.com/od/feed/bet-mrss-player" def _get_feed_query(self, uri): - return compat_urllib_parse_urlencode({ + return { 'uuid': uri, - }) + } def _extract_mgid(self, webpage): return self._search_regex(r'data-uri="([^"]+)', webpage, 'mgid') diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 200f340de..bdda68819 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -4,7 +4,6 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urllib_parse_urlencode, compat_str, compat_xpath, ) @@ -14,12 +13,13 @@ from ..utils import ( fix_xml_ampersands, float_or_none, HEADRequest, + RegexNotFoundError, sanitized_Request, strip_or_none, timeconvert, unescapeHTML, + update_url_query, url_basename, - RegexNotFoundError, xpath_text, ) @@ -36,6 +36,11 @@ class MTVServicesInfoExtractor(InfoExtractor): def _id_from_uri(uri): return uri.split(':')[-1] + @staticmethod + def _remove_template_parameter(url): + # Remove the templates, like &device={device} + return re.sub(r'&[^=]*?={.*?}(?=(&|$))', '', url) + # This was originally implemented for ComedyCentral, but it also works here @classmethod def _transform_rtmp_url(cls, rtmp_video_url): @@ -117,9 +122,7 @@ class MTVServicesInfoExtractor(InfoExtractor): video_id = self._id_from_uri(uri) self.report_extraction(video_id) content_el = itemdoc.find('%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content'))) - mediagen_url = content_el.attrib['url'] - # Remove the templates, like &device={device} - mediagen_url = re.sub(r'&[^=]*?={.*?}(?=(&|$))', '', mediagen_url) + mediagen_url = self._remove_template_parameter(content_el.attrib['url']) if 'acceptMethods' not in mediagen_url: mediagen_url += '&' if '?' in mediagen_url else '?' mediagen_url += 'acceptMethods=fms' @@ -178,12 +181,12 @@ class MTVServicesInfoExtractor(InfoExtractor): data = {'uri': uri} if self._LANG: data['lang'] = self._LANG - return compat_urllib_parse_urlencode(data) + return data def _get_videos_info(self, uri): video_id = self._id_from_uri(uri) feed_url = self._get_feed_url(uri) - info_url = feed_url + '?' + self._get_feed_query(uri) + info_url = update_url_query(feed_url, self._get_feed_query(uri)) return self._get_videos_info_from_url(info_url, video_id) def _get_videos_info_from_url(self, url, video_id): @@ -256,13 +259,9 @@ class MTVServicesEmbeddedIE(MTVServicesInfoExtractor): def _get_feed_url(self, uri): video_id = self._id_from_uri(uri) - site_id = uri.replace(video_id, '') - config_url = ('http://media.mtvnservices.com/pmt-arc/e1/players/{0}/' - 'context52/config.xml'.format(site_id)) - config_doc = self._download_xml(config_url, video_id) - feed_node = config_doc.find('.//feed') - feed_url = feed_node.text.strip().split('?')[0] - return feed_url + config = self._download_json( + 'http://media.mtvnservices.com/pmt/e1/access/index.html?uri=%s&configtype=edge' % uri, video_id) + return self._remove_template_parameter(config['feedWithQueryParams']) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py index 9c54846e1..64730a624 100644 --- a/youtube_dl/extractor/nick.py +++ b/youtube_dl/extractor/nick.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals from .mtv import MTVServicesInfoExtractor -from ..compat import compat_urllib_parse_urlencode from ..utils import update_url_query @@ -59,10 +58,10 @@ class NickIE(MTVServicesInfoExtractor): }] def _get_feed_query(self, uri): - return compat_urllib_parse_urlencode({ + return { 'feed': 'nick_arc_player_prime', 'mgid': uri, - }) + } def _extract_mgid(self, webpage): return self._search_regex(r'data-contenturi="([^"]+)', webpage, 'mgid')