From d7ae0639b4e43e9ac9eebb5d09cfcb77bd0671ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 20 Jul 2013 19:33:40 +0200 Subject: [PATCH 001/119] [youtube] Add an extractor for Youtube recommended videos (":ytrec" keyword) (closes #476) The new extractor and YoutubeSubscriptionsIE are subclasses of YoutubeFeedsInfoExtractor, which allows to fetch videos from http://www.youtube.com/feed_ajax --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/youtube.py | 39 ++++++++++++++++++++++++-------- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index b4a1c20e9..6de48c341 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -92,6 +92,7 @@ from .youtube import ( YoutubeChannelIE, YoutubeShowIE, YoutubeSubscriptionsIE, + YoutubeRecommendedIE, ) from .zdf import ZDFIE diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 12e8fc25d..0d49c0721 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -898,12 +898,12 @@ class YoutubeShowIE(InfoExtractor): return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist') for season in m_seasons] -class YoutubeSubscriptionsIE(YoutubeIE): - """It's a subclass of YoutubeIE because we need to login""" - IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)' - _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?' - IE_NAME = u'youtube:subscriptions' - _FEED_TEMPLATE = 'http://www.youtube.com/feed_ajax?action_load_system_feed=1&feed_name=subscriptions&paging=%s' +class YoutubeFeedsInfoExtractor(YoutubeIE): + """ + Base class for extractors that fetch info from + http://www.youtube.com/feed_ajax + Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. + """ _PAGING_STEP = 30 # Overwrite YoutubeIE properties we don't want @@ -912,18 +912,27 @@ class YoutubeSubscriptionsIE(YoutubeIE): def suitable(cls, url): return re.match(cls._VALID_URL, url) is not None + @property + def _FEED_TEMPLATE(self): + return 'http://www.youtube.com/feed_ajax?action_load_system_feed=1&feed_name=%s&paging=%%s' % self._FEED_NAME + + @property + def IE_NAME(self): + return u'youtube:%s' % self._FEED_NAME + def _real_initialize(self): (username, password) = self._get_login_info() if username is None: raise ExtractorError(u'No login info available, needed for downloading the Youtube subscriptions.', expected=True) - super(YoutubeSubscriptionsIE, self)._real_initialize() + super(YoutubeFeedsInfoExtractor, self)._real_initialize() def _real_extract(self, url): feed_entries = [] # The step argument is available only in 2.7 or higher for i in itertools.count(0): paging = i*self._PAGING_STEP - info = self._download_webpage(self._FEED_TEMPLATE % paging, 'feed', + info = self._download_webpage(self._FEED_TEMPLATE % paging, + u'%s feed' % self._FEED_NAME, u'Downloading page %s' % i) info = json.loads(info) feed_html = info['feed_html'] @@ -932,4 +941,16 @@ class YoutubeSubscriptionsIE(YoutubeIE): feed_entries.extend(self.url_result(id, 'Youtube') for id in ids) if info['paging'] is None: break - return self.playlist_result(feed_entries, playlist_title='Youtube Subscriptions') + return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE) + +class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): + IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)' + _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?' + _FEED_NAME = 'subscriptions' + _PLAYLIST_TITLE = u'Youtube Subscriptions' + +class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): + IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)' + _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?' + _FEED_NAME = 'recommended' + _PLAYLIST_TITLE = u'Youtube Recommended videos' From ddbfd0f0c51d797c2223e536d8bd0cc5f7582bb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 21 Jul 2013 11:04:56 +0200 Subject: [PATCH 002/119] ComedyCentralIE: support the extended interviews urls (fixes #1079) --- youtube_dl/extractor/comedycentral.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 93d9e3d5e..bf8d711ee 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -24,7 +24,9 @@ class ComedyCentralIE(InfoExtractor): (full-episodes/(?P.*)| (?P (the-colbert-report-(videos|collections)/(?P[0-9]+)/[^/]*/(?P.*?)) - |(watch/(?P[^/]*)/(?P.*))))) + |(watch/(?P[^/]*)/(?P.*)))| + (?P + extended-interviews/(?P[0-9]+)/playlist_tds_extended_(?P.*?)/.*?))) $""" _TEST = { u'url': u'http://www.thedailyshow.com/watch/thu-december-13-2012/kristen-stewart', @@ -87,6 +89,9 @@ class ComedyCentralIE(InfoExtractor): else: epTitle = mobj.group('cntitle') dlNewest = False + elif mobj.group('interview'): + epTitle = mobj.group('interview_title') + dlNewest = False else: dlNewest = not mobj.group('episode') if dlNewest: From de48addae2ae61eded2992cae61d405647847f8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 23 Jul 2013 11:14:11 +0200 Subject: [PATCH 003/119] Fix CollegHumorIE Now it downloads the video over http in one file, it doesn't downloads in fragments Added a test and use the methods in InfoExtractor for downloading webpages --- youtube_dl/extractor/collegehumor.py | 40 +++++++++++++--------------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/collegehumor.py b/youtube_dl/extractor/collegehumor.py index 7ae0972e5..5ad170d4f 100644 --- a/youtube_dl/extractor/collegehumor.py +++ b/youtube_dl/extractor/collegehumor.py @@ -1,26 +1,26 @@ import re -import socket import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( - compat_http_client, - compat_str, - compat_urllib_error, compat_urllib_parse_urlparse, - compat_urllib_request, ExtractorError, ) class CollegeHumorIE(InfoExtractor): - _WORKING = False _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P[0-9]+)/(?P.*)$' - def report_manifest(self, video_id): - """Report information extraction.""" - self.to_screen(u'%s: Downloading XML manifest' % video_id) + _TEST = { + u'url': u'http://www.collegehumor.com/video/6902724/comic-con-cosplay-catastrophe', + u'file': u'6902724.mp4', + u'md5': u'1264c12ad95dca142a9f0bf7968105a0', + u'info_dict': { + u'title': u'Comic-Con Cosplay Catastrophe', + u'description': u'Fans get creative this year at San Diego. Too creative. And yes, that\'s really Joss Whedon.', + }, + } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -36,10 +36,9 @@ class CollegeHumorIE(InfoExtractor): self.report_extraction(video_id) xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id - try: - metaXml = compat_urllib_request.urlopen(xmlUrl).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err)) + metaXml = self._download_webpage(xmlUrl, video_id, + u'Downloading info XML', + u'Unable to download video info XML') mdoc = xml.etree.ElementTree.fromstring(metaXml) try: @@ -52,11 +51,9 @@ class CollegeHumorIE(InfoExtractor): raise ExtractorError(u'Invalid metadata XML file') manifest_url += '?hdcore=2.10.3' - self.report_manifest(video_id) - try: - manifestXml = compat_urllib_request.urlopen(manifest_url).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err)) + manifestXml = self._download_webpage(manifest_url, video_id, + u'Downloading XML manifest', + u'Unable to download video info XML') adoc = xml.etree.ElementTree.fromstring(manifestXml) try: @@ -66,9 +63,8 @@ class CollegeHumorIE(InfoExtractor): except IndexError as err: raise ExtractorError(u'Invalid manifest file') - url_pr = compat_urllib_parse_urlparse(manifest_url) - url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1' + url_pr = compat_urllib_parse_urlparse(info['thumbnail']) - info['url'] = url - info['ext'] = 'f4f' + info['url'] = url_pr.scheme + '://' + url_pr.netloc + video_id[:-2].replace('.csmil','').replace(',','') + info['ext'] = 'mp4' return [info] From 67ae7b4760be259d82013fcabe3e0e4cd8f0a9df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 23 Jul 2013 11:41:05 +0200 Subject: [PATCH 004/119] Fix BreakIE Also detect videos that come from Youtube --- youtube_dl/extractor/breakcom.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py index 34f555e89..53a898de3 100644 --- a/youtube_dl/extractor/breakcom.py +++ b/youtube_dl/extractor/breakcom.py @@ -1,6 +1,8 @@ import re +import json from .common import InfoExtractor +from ..utils import determine_ext class BreakIE(InfoExtractor): @@ -17,17 +19,20 @@ class BreakIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group(1).split("-")[-1] - webpage = self._download_webpage(url, video_id) - video_url = re.search(r"videoPath: '(.+?)',",webpage).group(1) - key = re.search(r"icon: '(.+?)',",webpage).group(1) - final_url = str(video_url)+"?"+str(key) - thumbnail_url = re.search(r"thumbnailURL: '(.+?)'",webpage).group(1) - title = re.search(r"sVidTitle: '(.+)',",webpage).group(1) - ext = video_url.split('.')[-1] + embed_url = 'http://www.break.com/embed/%s' % video_id + webpage = self._download_webpage(embed_url, video_id) + info_json = self._search_regex(r'var embedVars = ({.*?});', webpage, + u'info json', flags=re.DOTALL) + info = json.loads(info_json) + video_url = info['videoUri'] + m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', video_url) + if m_youtube is not None: + return self.url_result(m_youtube.group(1), 'Youtube') + final_url = video_url + '?' + info['AuthToken'] return [{ 'id': video_id, 'url': final_url, - 'ext': ext, - 'title': title, - 'thumbnail': thumbnail_url, + 'ext': determine_ext(final_url), + 'title': info['contentName'], + 'thumbnail': info['thumbUri'], }] From a7af0ebaf5fd643d42787d55157f2c767bd73404 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 23 Jul 2013 14:20:52 +0200 Subject: [PATCH 005/119] release 2013.07.23 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 32eb27dad..1f1f3ce8b 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.07.19' +__version__ = '2013.07.23' From 70fa830e4d3ecebd99447319e861cd6b8af9ff72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 23 Jul 2013 14:27:13 +0200 Subject: [PATCH 006/119] CollegeHumorIE: support Youtube videos and embed urls (fixes #1094) --- youtube_dl/extractor/collegehumor.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/collegehumor.py b/youtube_dl/extractor/collegehumor.py index 5ad170d4f..4a0b0af5a 100644 --- a/youtube_dl/extractor/collegehumor.py +++ b/youtube_dl/extractor/collegehumor.py @@ -10,7 +10,7 @@ from ..utils import ( class CollegeHumorIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P[0-9]+)/(?P.*)$' + _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/(video|embed)/(?P[0-9]+)/(?P.*)$' _TEST = { u'url': u'http://www.collegehumor.com/video/6902724/comic-con-cosplay-catastrophe', @@ -43,6 +43,9 @@ class CollegeHumorIE(InfoExtractor): mdoc = xml.etree.ElementTree.fromstring(metaXml) try: videoNode = mdoc.findall('./video')[0] + youtubeIdNode = videoNode.find('./youtubeID') + if youtubeIdNode is not None: + return self.url_result(youtubeIdNode.text, 'Youtube') info['description'] = videoNode.findall('./description')[0].text info['title'] = videoNode.findall('./caption')[0].text info['thumbnail'] = videoNode.findall('./thumbnail')[0].text From acc47c1a3f9c44b01042f689066fbe9088e0b38d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 23 Jul 2013 14:28:48 +0200 Subject: [PATCH 007/119] Mark WatIE and TF1IE as broken (related #1103) --- youtube_dl/extractor/tf1.py | 1 + youtube_dl/extractor/wat.py | 1 + 2 files changed, 2 insertions(+) diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index e0ffeced5..a8af89f83 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -10,6 +10,7 @@ class TF1IE(InfoExtractor): TF1 uses the wat.tv player, currently it can only download videos with the html5 player enabled, it cannot download HD videos. """ + _WORKING = False _VALID_URL = r'http://videos.tf1.fr/.*-(.*?).html' _TEST = { u'url': u'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html', diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index 0d1302cd2..0407a2d26 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -12,6 +12,7 @@ from ..utils import ( class WatIE(InfoExtractor): + _WORKING = False _VALID_URL=r'http://www.wat.tv/.*-(?P.*?)_.*?.html' IE_NAME = 'wat.tv' _TEST = { From 252580c5610cbeae4a3fb6fbd23ed532d6a273a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 23 Jul 2013 14:58:01 +0200 Subject: [PATCH 008/119] YoutubeChannelE: switch ajax query from channel_ajax to c4_browse_ajax It wasn't detecting when there aren't more videos --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 0d49c0721..39894ae88 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -731,7 +731,7 @@ class YoutubeChannelIE(InfoExtractor): _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)" _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en' _MORE_PAGES_INDICATOR = 'yt-uix-load-more' - _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s' + _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s' IE_NAME = u'youtube:channel' def extract_videos_from_page(self, page): From b7cc9f50268f41656746009b6a6e8f5d5bf1cb7a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 23 Jul 2013 18:35:52 +0200 Subject: [PATCH 009/119] [soundcloud] Support URLs with a slash at the end (Fixes #1104) --- youtube_dl/extractor/soundcloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index d47c49c03..8b8b5166d 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -19,7 +19,7 @@ class SoundcloudIE(InfoExtractor): of the stream token and uid """ - _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)(?:[?].*)?$' + _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)/?(?:[?].*)?$' IE_NAME = u'soundcloud' _TEST = { u'url': u'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy', From fc492de31dfd8cc0a88988a0f2ea985a35d60975 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 23 Jul 2013 18:37:52 +0200 Subject: [PATCH 010/119] release 2013.07.23.1 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 1f1f3ce8b..56b870b34 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.07.23' +__version__ = '2013.07.23.1' From b9a1252c9601db3e208eb76ab4c3d60f0fc1f7f2 Mon Sep 17 00:00:00 2001 From: Alex Van't Hof Date: Wed, 24 Jul 2013 00:48:11 -0400 Subject: [PATCH 011/119] [traileraddict] Allow all types of trailer URLs Valid url regex for traileraddict.com is too strict. Need to allow, e.g. theatrical-trailer, teaser-trailer, feature-read-band-trailer, etc. --- youtube_dl/extractor/traileraddict.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/traileraddict.py b/youtube_dl/extractor/traileraddict.py index 324bb6231..6f2fc2394 100644 --- a/youtube_dl/extractor/traileraddict.py +++ b/youtube_dl/extractor/traileraddict.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class TrailerAddictIE(InfoExtractor): - _VALID_URL = r'(?:http://)?(?:www\.)?traileraddict\.com/trailer/([^/]+)/(?:trailer|feature-trailer)' + _VALID_URL = r'(?:http://)?(?:www\.)?traileraddict\.com/trailer/([^/]+)/(?:[^/]*trailer[^/]*)' _TEST = { u'url': u'http://www.traileraddict.com/trailer/prince-avalanche/trailer', u'file': u'76184.mp4', From b1ca5e3ffab2cd2f717039029f8b9023762d6214 Mon Sep 17 00:00:00 2001 From: Alex Van't Hof Date: Wed, 24 Jul 2013 02:33:48 -0400 Subject: [PATCH 012/119] [traileraddict] Obtain hd quality stream if available No clear method for determining if hd is available so opted to just check for presence of hd toggle function. --- youtube_dl/extractor/traileraddict.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/traileraddict.py b/youtube_dl/extractor/traileraddict.py index 324bb6231..ff8e173bc 100644 --- a/youtube_dl/extractor/traileraddict.py +++ b/youtube_dl/extractor/traileraddict.py @@ -19,22 +19,28 @@ class TrailerAddictIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group(1) webpage = self._download_webpage(url, video_id) - + title = self._search_regex(r'(.+?)', webpage, 'video title').replace(' - Trailer Addict','') view_count = self._search_regex(r'Views: (.+?)
', webpage, 'Views Count') video_id = self._og_search_property('video', webpage, 'Video id').split('=')[1] - info_url = "http://www.traileraddict.com/fvar.php?tid=%s" %(str(video_id)) + # Presence of (no)watchplus function indicates HD quality is available + if re.search(r'function (no)?watchplus()', webpage): + fvar = "fvarhd" + else: + fvar = "fvar" + + info_url = "http://www.traileraddict.com/%s.php?tid=%s" % (fvar, str(video_id)) info_webpage = self._download_webpage(info_url, video_id , "Downloading the info webpage") - + final_url = self._search_regex(r'&fileurl=(.+)', info_webpage, 'Download url').replace('%3F','?') thumbnail_url = self._search_regex(r'&image=(.+?)&', info_webpage, 'thumbnail url') ext = final_url.split('.')[-1].split('?')[0] - + return [{ 'id' : video_id, 'url' : final_url, From 239e3e0cca362a3fecb702fe0ee11701ee3991b6 Mon Sep 17 00:00:00 2001 From: patrickslin Date: Tue, 23 Jul 2013 17:32:25 -0700 Subject: [PATCH 013/119] YoutubeIE: new algo for length 87 (fixes #1105) Squashed commit from the pull requests #1107, #1109 and #1110. --- devscripts/youtube_genalgo.py | 2 +- test/test_youtube_sig.py | 2 +- youtube_dl/extractor/youtube.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/devscripts/youtube_genalgo.py b/devscripts/youtube_genalgo.py index 2b3879f0a..4aeca3c50 100644 --- a/devscripts/youtube_genalgo.py +++ b/devscripts/youtube_genalgo.py @@ -16,7 +16,7 @@ tests = [ "J:|}][{=+-_)(*&;%$#@>MNBVCXZASDFGH^KLPOIUYTREWQ0987654321mnbvcxzasdfghrklpoiuytej"), # 87 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$^&*()_-+={[]}|:;?/>.<", - "!?;:|}][{=+-_)(*&^$#@/MNBVCXZASqFGHJKLPOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytr"), + "tyuioplkjhgfdsazxcv"), # 86 - vfl_ymO4Z 2013/06/27 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[|};?/>.<", "ertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!/#$%^&*()_-+={[|};?@"), diff --git a/test/test_youtube_sig.py b/test/test_youtube_sig.py index 51b300532..6da7c5502 100644 --- a/test/test_youtube_sig.py +++ b/test/test_youtube_sig.py @@ -30,7 +30,7 @@ class TestYoutubeSig(unittest.TestCase): def test_87(self): wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$^&*()_-+={[]}|:;?/>.<" - right = "!?;:|}][{=+-_)(*&^$#@/MNBVCXZASqFGHJKLPOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytr" + right = "tyuioplkjhgfdsazxcv" self.assertEqual(sig(wrong), right) def test_86(self): diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 39894ae88..9d3f61be6 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -186,7 +186,7 @@ class YoutubeIE(InfoExtractor): elif len(s) == 88: return s[48] + s[81:67:-1] + s[82] + s[66:62:-1] + s[85] + s[61:48:-1] + s[67] + s[47:12:-1] + s[3] + s[11:3:-1] + s[2] + s[12] elif len(s) == 87: - return s[62] + s[82:62:-1] + s[83] + s[61:52:-1] + s[0] + s[51:2:-1] + return s[4:23] + s[86] + s[24:85] elif len(s) == 86: return s[2:63] + s[82] + s[64:82] + s[63] elif len(s) == 85: From 870a7e6156d62675079b7971a90d7688fff7bd7a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 24 Jul 2013 10:29:34 +0200 Subject: [PATCH 014/119] release 2013.07.24 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 56b870b34..2a529c76b 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.07.23.1' +__version__ = '2013.07.24' From 16484d49231c9b89bb81362794b58a8dcf6883ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 24 Jul 2013 10:43:44 +0200 Subject: [PATCH 015/119] [traileraddict]: Support clips urls and more trailer urls --- youtube_dl/extractor/traileraddict.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/traileraddict.py b/youtube_dl/extractor/traileraddict.py index 6f2fc2394..3f597737b 100644 --- a/youtube_dl/extractor/traileraddict.py +++ b/youtube_dl/extractor/traileraddict.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class TrailerAddictIE(InfoExtractor): - _VALID_URL = r'(?:http://)?(?:www\.)?traileraddict\.com/trailer/([^/]+)/(?:[^/]*trailer[^/]*)' + _VALID_URL = r'(?:http://)?(?:www\.)?traileraddict\.com/(?:trailer|clip)/(?P.+?)/(?P.+)' _TEST = { u'url': u'http://www.traileraddict.com/trailer/prince-avalanche/trailer', u'file': u'76184.mp4', @@ -17,8 +17,8 @@ class TrailerAddictIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group(1) - webpage = self._download_webpage(url, video_id) + name = mobj.group('movie') + '/' + mobj.group('trailer_name') + webpage = self._download_webpage(url, name) title = self._search_regex(r'(.+?)', webpage, 'video title').replace(' - Trailer Addict','') From 3e1ad508eb50a760000c33f84fc2cb5275d0a469 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 24 Jul 2013 12:48:25 +0200 Subject: [PATCH 016/119] Add Youtube player info for length 87 --- devscripts/youtube_genalgo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devscripts/youtube_genalgo.py b/devscripts/youtube_genalgo.py index 4aeca3c50..93ec32a4b 100644 --- a/devscripts/youtube_genalgo.py +++ b/devscripts/youtube_genalgo.py @@ -14,7 +14,7 @@ tests = [ # 88 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<", "J:|}][{=+-_)(*&;%$#@>MNBVCXZASDFGH^KLPOIUYTREWQ0987654321mnbvcxzasdfghrklpoiuytej"), - # 87 + # 87 - vflART1Nf 2013/07/24 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$^&*()_-+={[]}|:;?/>.<", "tyuioplkjhgfdsazxcv"), # 86 - vfl_ymO4Z 2013/06/27 From c216c1894de46ef5cfb7e391d9adfd5d3f84acbb Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 24 Jul 2013 13:52:55 +0200 Subject: [PATCH 017/119] release 2013.07.24.1 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 2a529c76b..d37b6681d 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.07.24' +__version__ = '2013.07.24.1' From 7d2392691c18973c86820b796cbfe61cfad9ff11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 24 Jul 2013 14:05:14 +0200 Subject: [PATCH 018/119] [soundcloud]: Some improvements Extract thumbnails. Make SoundcloudSetIE a subclass of SoundcloudIE to reuse some code. Directly extract the file url without downloading an extra page. --- youtube_dl/extractor/soundcloud.py | 90 +++++++++++------------------- 1 file changed, 34 insertions(+), 56 deletions(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 8b8b5166d..54ff8db12 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -33,10 +33,35 @@ class SoundcloudIE(InfoExtractor): } } + _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28' + def report_resolve(self, video_id): """Report information extraction.""" self.to_screen(u'%s: Resolving id' % video_id) + @classmethod + def _resolv_url(cls, url): + return 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID + + def _extract_info_dict(self, info, full_title=None): + video_id = info['id'] + name = full_title or video_id + self.report_extraction(name) + + thumbnail = info['artwork_url'] + if thumbnail is not None: + thumbnail = thumbnail.replace('-large', '-t500x500') + return { + 'id': info['id'], + 'url': info['stream_url'] + '?client_id=' + self._CLIENT_ID, + 'uploader': info['user']['username'], + 'upload_date': unified_strdate(info['created_at']), + 'title': info['title'], + 'ext': u'mp3', + 'description': info['description'], + 'thumbnail': thumbnail, + } + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: @@ -51,41 +76,13 @@ class SoundcloudIE(InfoExtractor): self.report_resolve(full_title) url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title) - resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28' + resolv_url = self._resolv_url(url) info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON') info = json.loads(info_json) - video_id = info['id'] - self.report_extraction(full_title) - - streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28' - stream_json = self._download_webpage(streams_url, full_title, - u'Downloading stream definitions', - u'unable to download stream definitions') - - streams = json.loads(stream_json) - mediaURL = streams['http_mp3_128_url'] - upload_date = unified_strdate(info['created_at']) - - return [{ - 'id': info['id'], - 'url': mediaURL, - 'uploader': info['user']['username'], - 'upload_date': upload_date, - 'title': info['title'], - 'ext': u'mp3', - 'description': info['description'], - }] - -class SoundcloudSetIE(InfoExtractor): - """Information extractor for soundcloud.com sets - To access the media, the uid of the song and a stream token - must be extracted from the page source and the script must make - a request to media.soundcloud.com/crossdomain.xml. Then - the media can be grabbed by requesting from an url composed - of the stream token and uid - """ + return self._extract_info_dict(info, full_title) +class SoundcloudSetIE(SoundcloudIE): _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)(?:[?].*)?$' IE_NAME = u'soundcloud:set' _TEST = { @@ -153,10 +150,6 @@ class SoundcloudSetIE(InfoExtractor): ] } - def report_resolve(self, video_id): - """Report information extraction.""" - self.to_screen(u'%s: Resolving id' % video_id) - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: @@ -171,7 +164,7 @@ class SoundcloudSetIE(InfoExtractor): self.report_resolve(full_title) url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title) - resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28' + resolv_url = self._resolv_url(url) info_json = self._download_webpage(resolv_url, full_title) videos = [] @@ -182,23 +175,8 @@ class SoundcloudSetIE(InfoExtractor): return self.report_extraction(full_title) - for track in info['tracks']: - video_id = track['id'] - - streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28' - stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON') - - self.report_extraction(video_id) - streams = json.loads(stream_json) - mediaURL = streams['http_mp3_128_url'] - - videos.append({ - 'id': video_id, - 'url': mediaURL, - 'uploader': track['user']['username'], - 'upload_date': unified_strdate(track['created_at']), - 'title': track['title'], - 'ext': u'mp3', - 'description': track['description'], - }) - return videos + return {'_type': 'playlist', + 'entries': [self._extract_info_dict(track) for track in info['tracks']], + 'id': info['id'], + 'title': info['title'], + } From eb6a41ba0f5dbb836d5b48b9e38f406c3c46c0ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 24 Jul 2013 14:39:21 +0200 Subject: [PATCH 019/119] ExfmIE: extract Soundcloud songs using SoundcloudIE Now SouncloudIE accepts api urls. --- youtube_dl/extractor/exfm.py | 44 +++++++++++++++++++----------- youtube_dl/extractor/soundcloud.py | 39 +++++++++++++++++--------- 2 files changed, 54 insertions(+), 29 deletions(-) diff --git a/youtube_dl/extractor/exfm.py b/youtube_dl/extractor/exfm.py index fe1582d1a..3443f19c5 100644 --- a/youtube_dl/extractor/exfm.py +++ b/youtube_dl/extractor/exfm.py @@ -8,17 +8,30 @@ class ExfmIE(InfoExtractor): IE_NAME = u'exfm' IE_DESC = u'ex.fm' _VALID_URL = r'(?:http://)?(?:www\.)?ex\.fm/song/([^/]+)' - _SOUNDCLOUD_URL_ = r'(?:http://)?(?:www\.)?api\.soundcloud.com/tracks/([^/]+)/stream' - _TEST = { - u'url': u'http://ex.fm/song/1bgtzg', - u'file': u'1bgtzg.mp3', - u'md5': u'8a7967a3fef10e59a1d6f86240fd41cf', - u'info_dict': { - u"title": u"We Can't Stop", - u"uploader": u"Miley Cyrus", - u'thumbnail': u'http://i1.sndcdn.com/artworks-000049666230-w9i7ef-t500x500.jpg?9d68d37' - } - } + _SOUNDCLOUD_URL = r'(?:http://)?(?:www\.)?api\.soundcloud.com/tracks/([^/]+)/stream' + _TESTS = [ + { + u'url': u'http://ex.fm/song/1bgtzg', + u'file': u'95223130.mp3', + u'md5': u'8a7967a3fef10e59a1d6f86240fd41cf', + u'info_dict': { + u"title": u"We Can't Stop - Miley Cyrus", + u"uploader": u"Miley Cyrus", + u'upload_date': u'20130603', + u'description': u'Download "We Can\'t Stop" \r\niTunes: http://smarturl.it/WeCantStop?IQid=SC\r\nAmazon: http://smarturl.it/WeCantStopAMZ?IQid=SC', + }, + u'note': u'Soundcloud song', + }, + { + u'url': u'http://ex.fm/song/wddt8', + u'file': u'wddt8.mp3', + u'md5': u'966bd70741ac5b8570d8e45bfaed3643', + u'info_dict': { + u'title': u'Safe and Sound', + u'uploader': u'Capital Cities', + }, + }, + ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -26,11 +39,10 @@ class ExfmIE(InfoExtractor): info_url = "http://ex.fm/api/v3/song/%s" %(song_id) webpage = self._download_webpage(info_url, song_id) info = json.loads(webpage) - song_url = re.match(self._SOUNDCLOUD_URL_,info['song']['url']) - if song_url is not None: - song_url = song_url.group() + "?client_id=b45b1aa10f1ac2941910a7f0d10f8e28" - else: - song_url = info['song']['url'] + song_url = info['song']['url'] + if re.match(self._SOUNDCLOUD_URL, song_url) is not None: + self.to_screen('Soundcloud song detected') + return self.url_result(song_url.replace('/stream',''), 'Soundcloud') return [{ 'id': song_id, 'url': song_url, diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 54ff8db12..7c9f1c6b6 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -19,7 +19,11 @@ class SoundcloudIE(InfoExtractor): of the stream token and uid """ - _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)/?(?:[?].*)?$' + _VALID_URL = r'''^(?:https?://)? + (?:(?:(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)/?(?:[?].*)?$) + |(?:api\.soundcloud\.com/tracks/(?P\d+)) + ) + ''' IE_NAME = u'soundcloud' _TEST = { u'url': u'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy', @@ -35,6 +39,10 @@ class SoundcloudIE(InfoExtractor): _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28' + @classmethod + def suitable(cls, url): + return re.match(cls._VALID_URL, url, flags=re.VERBOSE) is not None + def report_resolve(self, video_id): """Report information extraction.""" self.to_screen(u'%s: Resolving id' % video_id) @@ -63,21 +71,26 @@ class SoundcloudIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) + mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) - # extract uploader (which is in the url) - uploader = mobj.group(1) - # extract simple title (uploader + slug of song title) - slug_title = mobj.group(2) - full_title = '%s/%s' % (uploader, slug_title) - - self.report_resolve(full_title) - - url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title) - resolv_url = self._resolv_url(url) - info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON') + track_id = mobj.group('track_id') + if track_id is not None: + info_json_url = 'http://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID + full_title = track_id + else: + # extract uploader (which is in the url) + uploader = mobj.group(1) + # extract simple title (uploader + slug of song title) + slug_title = mobj.group(2) + full_title = '%s/%s' % (uploader, slug_title) + + self.report_resolve(full_title) + + url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title) + info_json_url = self._resolv_url(url) + info_json = self._download_webpage(info_json_url, full_title, u'Downloading info JSON') info = json.loads(info_json) return self._extract_info_dict(info, full_title) From 771822ebb85641359d4983137720446761d80bc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 24 Jul 2013 20:14:55 +0200 Subject: [PATCH 020/119] YoutubePlaylistIE: break only if there's no entry field in the response Otherwise the Favorite videos playlist cannot be downloaded complete. Also break if it reach the maximum value of the start-index. --- youtube_dl/extractor/youtube.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 9d3f61be6..a16836c69 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -696,7 +696,11 @@ class YoutubePlaylistIE(InfoExtractor): videos = [] while True: - url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1) + start_index = self._MAX_RESULTS * (page_num - 1) + 1 + if start_index >= 1000: + self._downloader.report_warning(u'Max number of results reached') + break + url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index) page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num) try: @@ -715,9 +719,6 @@ class YoutubePlaylistIE(InfoExtractor): index = entry['yt$position']['$t'] if 'media$group' in entry and 'media$player' in entry['media$group']: videos.append((index, entry['media$group']['media$player']['url'])) - - if len(response['feed']['entry']) < self._MAX_RESULTS: - break page_num += 1 videos = [v[1] for v in sorted(videos)] From b2e8bc1b20192e2ac261b4d3d88b8ae85c69eb9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 24 Jul 2013 20:40:12 +0200 Subject: [PATCH 021/119] YoutubeIE: Move the code from _real_initialize to a base class This allows to reuse the code in other IEs without having to overwrite some parts. --- youtube_dl/extractor/youtube.py | 215 ++++++++++++++++---------------- 1 file changed, 110 insertions(+), 105 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index a16836c69..04d8df630 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -23,8 +23,114 @@ from ..utils import ( orderedSet, ) +class YoutubeBaseInfoExtractor(InfoExtractor): + """Provide base functions for Youtube extractors""" + _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' + _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1' + _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en' + _NETRC_MACHINE = 'youtube' + # If True it will raise an error if no login info is provided + _LOGIN_REQUIRED = False + + def report_lang(self): + """Report attempt to set language.""" + self.to_screen(u'Setting language') + + def _set_language(self): + request = compat_urllib_request.Request(self._LANG_URL) + try: + self.report_lang() + compat_urllib_request.urlopen(request).read() + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_warning(u'unable to set language: %s' % compat_str(err)) + return False + return True + + def _login(self): + (username, password) = self._get_login_info() + # No authentication to be performed + if username is None: + if self._LOGIN_REQUIRED: + raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True) + return False + + request = compat_urllib_request.Request(self._LOGIN_URL) + try: + login_page = compat_urllib_request.urlopen(request).read().decode('utf-8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err)) + return False + + galx = None + dsh = None + match = re.search(re.compile(r']* id="gaia_loginform"', login_results) is not None: + self._downloader.report_warning(u'unable to log in: bad username or password') + return False + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_warning(u'unable to log in: %s' % compat_str(err)) + return False + return True + + def _confirm_age(self): + age_form = { + 'next_url': '/', + 'action_confirm': 'Confirm', + } + request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form)) + try: + self.report_age_confirmation() + compat_urllib_request.urlopen(request).read().decode('utf-8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err)) + return True + + def _real_initialize(self): + if self._downloader is None: + return + if not self._set_language(): + return + if not self._login(): + return + self._confirm_age() + +class YoutubeIE(YoutubeBaseInfoExtractor): IE_DESC = u'YouTube.com' _VALID_URL = r"""^ ( @@ -45,11 +151,7 @@ class YoutubeIE(InfoExtractor): ([0-9A-Za-z_-]+) # here is it! the YouTube video ID (?(1).+)? # if we found the ID, everything can follow $""" - _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1' - _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' - _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en' _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' - _NETRC_MACHINE = 'youtube' # Listed in order of quality _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13'] _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13'] @@ -139,10 +241,6 @@ class YoutubeIE(InfoExtractor): if YoutubePlaylistIE.suitable(url) or YoutubeSubscriptionsIE.suitable(url): return False return re.match(cls._VALID_URL, url, re.VERBOSE) is not None - def report_lang(self): - """Report attempt to set language.""" - self.to_screen(u'Setting language') - def report_video_webpage_download(self, video_id): """Report attempt to download video webpage.""" self.to_screen(u'%s: Downloading video webpage' % video_id) @@ -306,91 +404,6 @@ class YoutubeIE(InfoExtractor): for x in formats: print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))) - def _real_initialize(self): - if self._downloader is None: - return - - # Set language - request = compat_urllib_request.Request(self._LANG_URL) - try: - self.report_lang() - compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.report_warning(u'unable to set language: %s' % compat_str(err)) - return - - (username, password) = self._get_login_info() - - # No authentication to be performed - if username is None: - return - - request = compat_urllib_request.Request(self._LOGIN_URL) - try: - login_page = compat_urllib_request.urlopen(request).read().decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err)) - return - - galx = None - dsh = None - match = re.search(re.compile(r']* id="gaia_loginform"', login_results) is not None: - self._downloader.report_warning(u'unable to log in: bad username or password') - return - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.report_warning(u'unable to log in: %s' % compat_str(err)) - return - - # Confirm age - age_form = { - 'next_url': '/', - 'action_confirm': 'Confirm', - } - request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form)) - try: - self.report_age_confirmation() - compat_urllib_request.urlopen(request).read().decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err)) - def _extract_id(self, url): mobj = re.match(self._VALID_URL, url, re.VERBOSE) if mobj is None: @@ -899,20 +912,15 @@ class YoutubeShowIE(InfoExtractor): return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist') for season in m_seasons] -class YoutubeFeedsInfoExtractor(YoutubeIE): +class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): """ Base class for extractors that fetch info from http://www.youtube.com/feed_ajax Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. """ + _LOGIN_REQUIRED = True _PAGING_STEP = 30 - # Overwrite YoutubeIE properties we don't want - _TESTS = [] - @classmethod - def suitable(cls, url): - return re.match(cls._VALID_URL, url) is not None - @property def _FEED_TEMPLATE(self): return 'http://www.youtube.com/feed_ajax?action_load_system_feed=1&feed_name=%s&paging=%%s' % self._FEED_NAME @@ -922,10 +930,7 @@ class YoutubeFeedsInfoExtractor(YoutubeIE): return u'youtube:%s' % self._FEED_NAME def _real_initialize(self): - (username, password) = self._get_login_info() - if username is None: - raise ExtractorError(u'No login info available, needed for downloading the Youtube subscriptions.', expected=True) - super(YoutubeFeedsInfoExtractor, self)._real_initialize() + self._login() def _real_extract(self, url): feed_entries = [] From c626a3d9fac5d53df46c9dc02e12b12d343e59e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 24 Jul 2013 20:45:19 +0200 Subject: [PATCH 022/119] Add an extractor for downloading the Youtube favorite videos(closes #127) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/youtube.py | 16 ++++++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 6de48c341..40dc1c1e4 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -93,6 +93,7 @@ from .youtube import ( YoutubeShowIE, YoutubeSubscriptionsIE, YoutubeRecommendedIE, + YoutubeFavouritesIE, ) from .zdf import ZDFIE diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 04d8df630..02f215605 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -683,10 +683,10 @@ class YoutubePlaylistIE(InfoExtractor): \? (?:.*?&)*? (?:p|a|list)= | p/ ) - ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,}) + ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,}) .* | - ((?:PL|EC|UU)[0-9A-Za-z-_]{10,}) + ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,}) )""" _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none' _MAX_RESULTS = 50 @@ -960,3 +960,15 @@ class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?' _FEED_NAME = 'recommended' _PLAYLIST_TITLE = u'Youtube Recommended videos' + + +class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): + IE_NAME = u'youtube:favorites' + IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)' + _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:o?rites)?' + _LOGIN_REQUIRED = True + + def _real_extract(self, url): + webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos') + playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id') + return self.url_result(playlist_id, 'YoutubePlaylist') From 156d5ad6daa38699186e68e2a42a6dad441ef037 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 24 Jul 2013 21:18:41 +0200 Subject: [PATCH 023/119] release 2013.07.24.2 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index d37b6681d..1568b2f4c 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.07.24.1' +__version__ = '2013.07.24.2' From 43ba5456b1a072c964a2161f08bae9d4032dc6f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 24 Jul 2013 22:13:39 +0200 Subject: [PATCH 024/119] [youtube] add an extractor for the "Watch Later" list --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/youtube.py | 16 ++++++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 40dc1c1e4..ad144286c 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -93,6 +93,7 @@ from .youtube import ( YoutubeShowIE, YoutubeSubscriptionsIE, YoutubeRecommendedIE, + YoutubeWatchLaterIE, YoutubeFavouritesIE, ) from .zdf import ZDFIE diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 02f215605..f11402efa 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -920,10 +920,15 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): """ _LOGIN_REQUIRED = True _PAGING_STEP = 30 + # use action_load_personal_feed instead of action_load_system_feed + _PERSONAL_FEED = False @property def _FEED_TEMPLATE(self): - return 'http://www.youtube.com/feed_ajax?action_load_system_feed=1&feed_name=%s&paging=%%s' % self._FEED_NAME + action = 'action_load_system_feed' + if self._PERSONAL_FEED: + action = 'action_load_personal_feed' + return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME) @property def IE_NAME(self): @@ -942,7 +947,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): u'Downloading page %s' % i) info = json.loads(info) feed_html = info['feed_html'] - m_ids = re.finditer(r'"/watch\?v=(.*?)"', feed_html) + m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html) ids = orderedSet(m.group(1) for m in m_ids) feed_entries.extend(self.url_result(id, 'Youtube') for id in ids) if info['paging'] is None: @@ -961,6 +966,13 @@ class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): _FEED_NAME = 'recommended' _PLAYLIST_TITLE = u'Youtube Recommended videos' +class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor): + IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)' + _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater' + _FEED_NAME = 'watch_later' + _PLAYLIST_TITLE = u'Youtube Watch Later' + _PAGING_STEP = 100 + _PERSONAL_FEED = True class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): IE_NAME = u'youtube:favorites' From 755eb0320ed127fe2f24af13343ae587a63daf22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 24 Jul 2013 22:27:33 +0200 Subject: [PATCH 025/119] [youtube] use itertools.count instead of a "while True" loop and a manual counter --- youtube_dl/extractor/youtube.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index f11402efa..491018d6c 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -705,10 +705,9 @@ class YoutubePlaylistIE(InfoExtractor): # Download playlist videos from API playlist_id = mobj.group(1) or mobj.group(2) - page_num = 1 videos = [] - while True: + for page_num in itertools.count(1): start_index = self._MAX_RESULTS * (page_num - 1) + 1 if start_index >= 1000: self._downloader.report_warning(u'Max number of results reached') @@ -732,7 +731,6 @@ class YoutubePlaylistIE(InfoExtractor): index = entry['yt$position']['$t'] if 'media$group' in entry and 'media$player' in entry['media$group']: videos.append((index, entry['media$group']['media$player']['url'])) - page_num += 1 videos = [v[1] for v in sorted(videos)] @@ -776,9 +774,7 @@ class YoutubeChannelIE(InfoExtractor): # Download any subsequent channel pages using the json-based channel_ajax query if self._MORE_PAGES_INDICATOR in page: - while True: - pagenum = pagenum + 1 - + for pagenum in itertools.count(1): url = self._MORE_PAGES_URL % (pagenum, channel_id) page = self._download_webpage(url, channel_id, u'Downloading page #%s' % pagenum) @@ -821,9 +817,8 @@ class YoutubeUserIE(InfoExtractor): # all of them. video_ids = [] - pagenum = 0 - while True: + for pagenum in itertools.count(0): start_index = pagenum * self._GDATA_PAGE_SIZE + 1 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index) @@ -848,8 +843,6 @@ class YoutubeUserIE(InfoExtractor): if len(ids_in_page) < self._GDATA_PAGE_SIZE: break - pagenum += 1 - urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids] url_results = [self.url_result(rurl, 'Youtube') for rurl in urls] return [self.playlist_result(url_results, playlist_title = username)] From e93aa81aa60fc9b44e10feba565586e82e56ba18 Mon Sep 17 00:00:00 2001 From: pishposhmcgee Date: Wed, 24 Jul 2013 16:55:28 -0500 Subject: [PATCH 026/119] Added an option 'e' to go with 'video' or 'embed' Based on links that I've seen, /e/ also occurs in the wild, and making this substitution yields effective results. --- youtube_dl/extractor/collegehumor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/collegehumor.py b/youtube_dl/extractor/collegehumor.py index 4a0b0af5a..5badde03a 100644 --- a/youtube_dl/extractor/collegehumor.py +++ b/youtube_dl/extractor/collegehumor.py @@ -10,7 +10,7 @@ from ..utils import ( class CollegeHumorIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/(video|embed)/(?P[0-9]+)/(?P.*)$' + _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/(video|embed|e)/(?P[0-9]+)/(?P.*)$' _TEST = { u'url': u'http://www.collegehumor.com/video/6902724/comic-con-cosplay-catastrophe', From 12ef6aefa8ef5b8959a1b51788dd42cc33a4fe5c Mon Sep 17 00:00:00 2001 From: pishposhmcgee Date: Wed, 24 Jul 2013 21:51:08 -0500 Subject: [PATCH 027/119] changed video_url regex Some older videos contain an extra properties such as 'embed' before 'type'. --- youtube_dl/extractor/teamcoco.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index ec92e589a..c910110ca 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -33,7 +33,7 @@ class TeamcocoIE(InfoExtractor): data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id data = self._download_webpage(data_url, video_id, 'Downloading data webpage') - video_url = self._html_search_regex(r'(.*?)', + video_url = self._html_search_regex(r']*type="high".*?>(.*?)', data, u'video URL') return [{ From 0a99956f711d492c6ca08cd086d3d4b5024fd10f Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 25 Jul 2013 09:34:12 +0200 Subject: [PATCH 028/119] [ina] Fix URL detection (Fixes #1121) --- youtube_dl/extractor/ina.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ina.py b/youtube_dl/extractor/ina.py index 962c59214..ce1ff32a1 100644 --- a/youtube_dl/extractor/ina.py +++ b/youtube_dl/extractor/ina.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class InaIE(InfoExtractor): """Information Extractor for Ina.fr""" - _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?PI[0-9]+)/.*' + _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P[A-F0-9]+)/.*' _TEST = { u'url': u'www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html', u'file': u'I12055569.mp4', From d0866f0bb41c4118189d58d27973afc473d496a9 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 25 Jul 2013 09:35:25 +0200 Subject: [PATCH 029/119] release 2013.07.25 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 1568b2f4c..cad365da0 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.07.24.2' +__version__ = '2013.07.25' From 6625f82940b2701e2ccc9a6cf5808056b075e16b Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 25 Jul 2013 09:40:19 +0200 Subject: [PATCH 030/119] [keek] Allow httpS URLs (Fixes #1123) --- youtube_dl/extractor/keek.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/keek.py b/youtube_dl/extractor/keek.py index dda78743d..738247e02 100644 --- a/youtube_dl/extractor/keek.py +++ b/youtube_dl/extractor/keek.py @@ -4,10 +4,10 @@ from .common import InfoExtractor class KeekIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P\w+)' + _VALID_URL = r'https?://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P\w+)' IE_NAME = u'keek' _TEST = { - u'url': u'http://www.keek.com/ytdl/keeks/NODfbab', + u'url': u'https?://www.keek.com/ytdl/keeks/NODfbab', u'file': u'NODfbab.mp4', u'md5': u'9b0636f8c0f7614afa4ea5e4c6e57e83', u'info_dict': { From 27669bd11d70d2b169a27e5e8b40b0205d14f1df Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 25 Jul 2013 09:52:53 +0200 Subject: [PATCH 031/119] [ina] Allow I at start of video IDs --- youtube_dl/extractor/ina.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ina.py b/youtube_dl/extractor/ina.py index ce1ff32a1..652f19b7b 100644 --- a/youtube_dl/extractor/ina.py +++ b/youtube_dl/extractor/ina.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class InaIE(InfoExtractor): """Information Extractor for Ina.fr""" - _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P[A-F0-9]+)/.*' + _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?PI?[A-F0-9]+)/.*' _TEST = { u'url': u'www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html', u'file': u'I12055569.mp4', From 0ffcb7c6fca96038f46c685af8745641046e5158 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 25 Jul 2013 09:53:15 +0200 Subject: [PATCH 032/119] release 2013.07.25.1 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index cad365da0..017d28c60 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.07.25' +__version__ = '2013.07.25.1' From 42f2805e48b918a74319aeaca9f8b9068e6e0298 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 25 Jul 2013 10:10:37 +0200 Subject: [PATCH 033/119] [keek] Fix testcase (Broken by accident in 6625f82940b2701e2ccc9a6cf5808056b075e16b) --- youtube_dl/extractor/keek.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/keek.py b/youtube_dl/extractor/keek.py index 738247e02..a7b88d2d9 100644 --- a/youtube_dl/extractor/keek.py +++ b/youtube_dl/extractor/keek.py @@ -7,7 +7,7 @@ class KeekIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P\w+)' IE_NAME = u'keek' _TEST = { - u'url': u'https?://www.keek.com/ytdl/keeks/NODfbab', + u'url': u'https://www.keek.com/ytdl/keeks/NODfbab', u'file': u'NODfbab.mp4', u'md5': u'9b0636f8c0f7614afa4ea5e4c6e57e83', u'info_dict': { From aedd6bb97de7ed82da746892e2a0b32ee9c6d6b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 25 Jul 2013 22:06:53 +0200 Subject: [PATCH 034/119] YoutubeIE: new algo for length 81 (fixes #1127) --- devscripts/youtube_genalgo.py | 4 ++-- test/test_youtube_sig.py | 2 +- youtube_dl/extractor/youtube.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/devscripts/youtube_genalgo.py b/devscripts/youtube_genalgo.py index 93ec32a4b..3218113e7 100644 --- a/devscripts/youtube_genalgo.py +++ b/devscripts/youtube_genalgo.py @@ -32,9 +32,9 @@ tests = [ # 82 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.<", "Q>/?;}[{=+-(*<^%$#@!MNBVCXZASDFGHKLPOIUY8REWT0q&7654321mnbvcxzasdfghjklpoiuytrew9"), - # 81 + # 81 - vflLC8JvQ 2013/07/25 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.", - "urty8ioplkjhgfdsazxcvbqm1234567e90QWERTYUIOPLKHGFDSnZXCVBNM!@#$%^&*(-+={[};?/>."), + "C>/?;}[{=+-(*&^%$#@!MNBVYXZASDFGHKLPOIU.TREWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp"), ] def find_matching(wrong, right): diff --git a/test/test_youtube_sig.py b/test/test_youtube_sig.py index 6da7c5502..4e50d6f90 100644 --- a/test/test_youtube_sig.py +++ b/test/test_youtube_sig.py @@ -60,7 +60,7 @@ class TestYoutubeSig(unittest.TestCase): def test_81(self): wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>." - right = "urty8ioplkjhgfdsazxcvbqm1234567e90QWERTYUIOPLKHGFDSnZXCVBNM!@#$%^&*(-+={[};?/>." + right = "C>/?;}[{=+-(*&^%$#@!MNBVYXZASDFGHKLPOIU.TREWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp" self.assertEqual(sig(wrong), right) if __name__ == '__main__': diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 491018d6c..f86929e06 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -296,7 +296,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): elif len(s) == 82: return s[36] + s[79:67:-1] + s[81] + s[66:40:-1] + s[33] + s[39:36:-1] + s[40] + s[35] + s[0] + s[67] + s[32:0:-1] + s[34] elif len(s) == 81: - return s[6] + s[3:6] + s[33] + s[7:24] + s[0] + s[25:33] + s[2] + s[34:53] + s[24] + s[54:81] + return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9] else: raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s))) From 5c468ca8a852a00a55b4559d31f5e9861e76d748 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 25 Jul 2013 22:50:24 +0200 Subject: [PATCH 035/119] YoutubeIE: add algo for length 79 (fixes #1126) --- devscripts/youtube_genalgo.py | 3 +++ test/test_youtube_sig.py | 5 +++++ youtube_dl/extractor/youtube.py | 2 ++ 3 files changed, 10 insertions(+) diff --git a/devscripts/youtube_genalgo.py b/devscripts/youtube_genalgo.py index 3218113e7..22977ccd9 100644 --- a/devscripts/youtube_genalgo.py +++ b/devscripts/youtube_genalgo.py @@ -35,6 +35,9 @@ tests = [ # 81 - vflLC8JvQ 2013/07/25 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.", "C>/?;}[{=+-(*&^%$#@!MNBVYXZASDFGHKLPOIU.TREWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp"), + # 79 - vflLC8JvQ 2013/07/25 (sporadic) + ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/", + "Z?;}[{=+-(*&^%$#@!MNBVCXRASDFGHKLPOIUYT/EWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp"), ] def find_matching(wrong, right): diff --git a/test/test_youtube_sig.py b/test/test_youtube_sig.py index 4e50d6f90..4d45a0e08 100644 --- a/test/test_youtube_sig.py +++ b/test/test_youtube_sig.py @@ -63,5 +63,10 @@ class TestYoutubeSig(unittest.TestCase): right = "C>/?;}[{=+-(*&^%$#@!MNBVYXZASDFGHKLPOIU.TREWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp" self.assertEqual(sig(wrong), right) + def test_79(self): + wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/" + right = "Z?;}[{=+-(*&^%$#@!MNBVCXRASDFGHKLPOIUYT/EWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp" + self.assertEqual(sig(wrong), right) + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index f86929e06..f10f2e3dd 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -297,6 +297,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return s[36] + s[79:67:-1] + s[81] + s[66:40:-1] + s[33] + s[39:36:-1] + s[40] + s[35] + s[0] + s[67] + s[32:0:-1] + s[34] elif len(s) == 81: return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9] + elif len(s) == 79: + return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9] else: raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s))) From da70877a1bead1a53405bcda629fd0c5684a090a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 25 Jul 2013 22:58:40 +0200 Subject: [PATCH 036/119] release 2013.07.25.2 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 017d28c60..8b436720d 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.07.25.1' +__version__ = '2013.07.25.2' From 58261235f04439332642f4bc401743b65b951519 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 26 Jul 2013 13:00:59 +0200 Subject: [PATCH 037/119] Add an extractor for roxwell.com (closes #1044) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/roxwel.py | 49 ++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 youtube_dl/extractor/roxwel.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index ad144286c..7cfb292d9 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -52,6 +52,7 @@ from .pornotube import PornotubeIE from .rbmaradio import RBMARadioIE from .redtube import RedTubeIE from .ringtv import RingTVIE +from .roxwel import RoxwelIE from .sina import SinaIE from .soundcloud import SoundcloudIE, SoundcloudSetIE from .spiegel import SpiegelIE diff --git a/youtube_dl/extractor/roxwel.py b/youtube_dl/extractor/roxwel.py new file mode 100644 index 000000000..d339e6cb5 --- /dev/null +++ b/youtube_dl/extractor/roxwel.py @@ -0,0 +1,49 @@ +import re +import json + +from .common import InfoExtractor +from ..utils import unified_strdate, determine_ext + + +class RoxwelIE(InfoExtractor): + _VALID_URL = r'https?://www\.roxwel\.com/player/(?P.+?)(\.|\?|$)' + + _TEST = { + u'url': u'http://www.roxwel.com/player/passionpittakeawalklive.html', + u'file': u'passionpittakeawalklive.flv', + u'md5': u'd9dea8360a1e7d485d2206db7fe13035', + u'info_dict': { + u'title': u'Take A Walk (live)', + u'uploader': u'Passion Pit', + u'description': u'Passion Pit performs "Take A Walk\" live at The Backyard in Austin, Texas. ', + }, + u'skip': u'Requires rtmpdump', + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + filename = mobj.group('filename') + info_url = 'http://www.roxwel.com/api/videos/%s' % filename + info_page = self._download_webpage(info_url, filename, + u'Downloading video info') + + self.report_extraction(filename) + info = json.loads(info_page) + rtmp_rates = sorted([int(r.replace('flv_', '')) for r in info['media_rates'] if r.startswith('flv_')]) + best_rate = rtmp_rates[-1] + url_page_url = 'http://roxwel.com/pl_one_time.php?filename=%s&quality=%s' % (filename, best_rate) + rtmp_url = self._download_webpage(url_page_url, filename, u'Downloading video url') + ext = determine_ext(rtmp_url) + if ext == 'f4v': + rtmp_url = rtmp_url.replace(filename, 'mp4:%s' % filename) + + return {'id': filename, + 'title': info['title'], + 'url': rtmp_url, + 'ext': 'flv', + 'description': info['description'], + 'thumbnail': info.get('player_image_url') or info.get('image_url_large'), + 'uploader': info['artist'], + 'uploader_id': info['artistname'], + 'upload_date': unified_strdate(info['dbdate']), + } From a3c736def2058547e2c9e9fa4013f7c547c72c55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 29 Jul 2013 12:07:38 +0200 Subject: [PATCH 038/119] [dailymotion] Add an extractor for Dailymotion playlists --- test/test_playlists.py | 30 ++++++++++++++++++++++++++++ youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/dailymotion.py | 31 +++++++++++++++++++++++++++++ 3 files changed, 62 insertions(+), 1 deletion(-) create mode 100644 test/test_playlists.py diff --git a/test/test_playlists.py b/test/test_playlists.py new file mode 100644 index 000000000..f8cc75afb --- /dev/null +++ b/test/test_playlists.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python + +import sys +import unittest +import json + +# Allow direct execution +import os +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from youtube_dl.extractor import DailymotionPlaylistIE +from youtube_dl.utils import * + +from helper import FakeYDL + +class TestPlaylists(unittest.TestCase): + def assertIsPlaylist(self, info): + """Make sure the info has '_type' set to 'playlist'""" + self.assertEqual(info['_type'], 'playlist') + + def test_dailymotion_playlist(self): + dl = FakeYDL() + ie = DailymotionPlaylistIE(dl) + result = ie.extract('http://www.dailymotion.com/playlist/xv4bw_nqtv_sport/1#video=xl8v3q') + self.assertIsPlaylist(result) + self.assertEqual(result['title'], u'SPORT') + self.assertTrue(len(result['entries']) > 20) + +if __name__ == '__main__': + unittest.main() diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 7cfb292d9..063a3d065 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -12,7 +12,7 @@ from .comedycentral import ComedyCentralIE from .condenast import CondeNastIE from .criterion import CriterionIE from .cspan import CSpanIE -from .dailymotion import DailymotionIE +from .dailymotion import DailymotionIE, DailymotionPlaylistIE from .depositfiles import DepositFilesIE from .dotsub import DotsubIE from .dreisat import DreiSatIE diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 9bf7a28ca..fa8c630d0 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -1,9 +1,12 @@ import re import json +import itertools from .common import InfoExtractor from ..utils import ( compat_urllib_request, + get_element_by_attribute, + get_element_by_id, ExtractorError, ) @@ -77,3 +80,31 @@ class DailymotionIE(InfoExtractor): 'ext': video_extension, 'thumbnail': info['thumbnail_url'] }] + + +class DailymotionPlaylistIE(InfoExtractor): + _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P.+?)/' + _MORE_PAGES_INDICATOR = r'' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + playlist_id = mobj.group('id') + video_ids = [] + + for pagenum in itertools.count(1): + webpage = self._download_webpage('https://www.dailymotion.com/playlist/%s/%s' % (playlist_id, pagenum), + playlist_id, u'Downloading page %s' % pagenum) + + playlist_el = get_element_by_attribute(u'class', u'video_list', webpage) + video_ids.extend(re.findall(r'data-id="(.+?)" data-ext-id', playlist_el)) + + if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None: + break + + entries = [self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion') + for video_id in video_ids] + return {'_type': 'playlist', + 'id': playlist_id, + 'title': get_element_by_id(u'playlist_name', webpage), + 'entries': entries, + } From caeefc29ebce8dbfb0c25a79887719055276c9eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 29 Jul 2013 13:12:09 +0200 Subject: [PATCH 039/119] [vimeo] add an extractor for channels --- test/test_playlists.py | 10 +++++++++- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/vimeo.py | 29 +++++++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 2 deletions(-) diff --git a/test/test_playlists.py b/test/test_playlists.py index f8cc75afb..65de3a55c 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -8,7 +8,7 @@ import json import os sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from youtube_dl.extractor import DailymotionPlaylistIE +from youtube_dl.extractor import DailymotionPlaylistIE, VimeoChannelIE from youtube_dl.utils import * from helper import FakeYDL @@ -26,5 +26,13 @@ class TestPlaylists(unittest.TestCase): self.assertEqual(result['title'], u'SPORT') self.assertTrue(len(result['entries']) > 20) + def test_vimeo_channel(self): + dl = FakeYDL() + ie = VimeoChannelIE(dl) + result = ie.extract('http://vimeo.com/channels/tributes') + self.assertIsPlaylist(result) + self.assertEqual(result['title'], u'Vimeo Tributes') + self.assertTrue(len(result['entries']) > 24) + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 063a3d065..9f4dd3c24 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -71,7 +71,7 @@ from .ustream import UstreamIE from .vbox7 import Vbox7IE from .veoh import VeohIE from .vevo import VevoIE -from .vimeo import VimeoIE +from .vimeo import VimeoIE, VimeoChannelIE from .vine import VineIE from .c56 import C56IE from .wat import WatIE diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index ac32043c1..cc9c8d018 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -1,5 +1,6 @@ import json import re +import itertools from .common import InfoExtractor from ..utils import ( @@ -171,3 +172,31 @@ class VimeoIE(InfoExtractor): 'thumbnail': video_thumbnail, 'description': video_description, }] + + +class VimeoChannelIE(InfoExtractor): + IE_NAME = u'vimeo:channel' + _VALID_URL = r'(?:https?://)?vimeo.\com/channels/(?P[^/]+)' + _MORE_PAGES_INDICATOR = r'(.*?)' % channel_id, + webpage, u'channel title') + return {'_type': 'playlist', + 'id': channel_id, + 'title': channel_title, + 'entries': entries, + } From 63f05de10bac9250f58b2f15539f109ec75b3af7 Mon Sep 17 00:00:00 2001 From: Johny Mo Swag Date: Mon, 29 Jul 2013 12:11:57 -0700 Subject: [PATCH 040/119] detect vevo embed --- youtube_dl/extractor/worldstarhiphop.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/youtube_dl/extractor/worldstarhiphop.py b/youtube_dl/extractor/worldstarhiphop.py index 5b9779c05..8715848ee 100644 --- a/youtube_dl/extractor/worldstarhiphop.py +++ b/youtube_dl/extractor/worldstarhiphop.py @@ -21,6 +21,14 @@ class WorldStarHipHopIE(InfoExtractor): webpage_src = self._download_webpage(url, video_id) + video_url = self._search_regex('videoId=(.*?)&?', + webpage_src, u'video URL', fatal=False) + + if video_url: + self.to_screen(u'Vevo video detected:') + vevo_id = 'vevo:%s' video_url + self.url_result(vevo_id) + video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)', webpage_src, u'video URL') From 579e2691feca6173c5a84c1fb4fe7a213386c223 Mon Sep 17 00:00:00 2001 From: Johny Mo Swag Date: Mon, 29 Jul 2013 12:24:26 -0700 Subject: [PATCH 041/119] detect vevo embed fix --- youtube_dl/extractor/worldstarhiphop.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/worldstarhiphop.py b/youtube_dl/extractor/worldstarhiphop.py index 8715848ee..a93928f3c 100644 --- a/youtube_dl/extractor/worldstarhiphop.py +++ b/youtube_dl/extractor/worldstarhiphop.py @@ -21,17 +21,23 @@ class WorldStarHipHopIE(InfoExtractor): webpage_src = self._download_webpage(url, video_id) - video_url = self._search_regex('videoId=(.*?)&?', + video_url = self._search_regex(r'videoId=(.*?)&?', webpage_src, u'video URL', fatal=False) - + if video_url: self.to_screen(u'Vevo video detected:') - vevo_id = 'vevo:%s' video_url - self.url_result(vevo_id) + return self.url_result('vevo:%s' % video_url, ie='Vevo') video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)', webpage_src, u'video URL') + if video_url == None: + video_url = self._search_regex(r'videoId=(.*?)&?', + webpage_src, u'video URL') + self.to_screen(u'Vevo video detected:') + vevo_id = 'vevo:%s' % video_url + return self.url_result(vevo_id, ie='Vevo') + if 'youtube' in video_url: self.to_screen(u'Youtube video detected:') return self.url_result(video_url, ie='Youtube') From a3124ba49fa7e089e761216ea4eb739aae6b07e5 Mon Sep 17 00:00:00 2001 From: pishposhmcgee Date: Mon, 29 Jul 2013 15:45:20 -0500 Subject: [PATCH 042/119] Modified m_urls regex and video_url Some videos have a leading slash, some do not --- youtube_dl/extractor/vevo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 3b16dcfbc..67537eae5 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -35,12 +35,12 @@ class VevoIE(InfoExtractor): self.report_extraction(video_id) video_info = json.loads(info_json) - m_urls = list(re.finditer(r'