From b3f0e5304807862ce72c136da90b860df805ee5c Mon Sep 17 00:00:00 2001 From: Jai Grimshaw Date: Sat, 31 Aug 2013 01:53:01 +1000 Subject: [PATCH 01/16] Fixed issue #1277 KeyError when no description. Allows a continue with a warning when an extractor cannot retrieve a description. --- youtube_dl/YoutubeDL.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index b289bd9e2..afce28040 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -492,6 +492,8 @@ class YoutubeDL(object): self.report_writedescription(descfn) with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile: descfile.write(info_dict['description']) + except (KeyError, TypeError): + self.report_warning(u'Cannot extract description.') except (OSError, IOError): self.report_error(u'Cannot write description file ' + descfn) return From f1d20fa39f264508aa7219ccc4bc64f59b970f04 Mon Sep 17 00:00:00 2001 From: Johny Mo Swag Date: Wed, 11 Sep 2013 14:50:38 -0700 Subject: [PATCH 02/16] added kickstarter IE --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/kickstarter.py | 43 +++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100644 youtube_dl/extractor/kickstarter.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index fbe0b8cb7..5711b6bba 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -48,6 +48,7 @@ from .jeuxvideo import JeuxVideoIE from .jukebox import JukeboxIE from .justintv import JustinTVIE from .kankan import KankanIE +from .kickstarter import KickStarterIE from .keek import KeekIE from .liveleak import LiveLeakIE from .livestream import LivestreamIE diff --git a/youtube_dl/extractor/kickstarter.py b/youtube_dl/extractor/kickstarter.py new file mode 100644 index 000000000..7f6f2b064 --- /dev/null +++ b/youtube_dl/extractor/kickstarter.py @@ -0,0 +1,43 @@ +import re + +from .common import InfoExtractor + + +class KickStarterIE(InfoExtractor): + _VALID_URL = r'https?://www\.kickstarter\.com/projects/(?P.*)/.*\?' + _TEST = { + "url": "https://www.kickstarter.com/projects/1404461844/intersection-the-story-of-josh-grant?ref=home_location", + "file": "1404461844.mp4", + "md5": "c81addca81327ffa66c642b5d8b08cab", + "info_dict": { + "title": u"Intersection: The Story of Josh Grant by Kyle Cowling \u2014 Kickstarter" + } + } + + + def _real_extract(self, url): + m = re.match(self._VALID_URL, url) + video_id = m.group('id') + + webpage_src = self._download_webpage(url, video_id) + + video_url = self._search_regex(r'data-video="(.*?)">', + webpage_src, u'video URL') + + if 'mp4' in video_url: + ext = 'mp4' + else: + ext = 'flv' + + video_title = self._html_search_regex(r"(.*)?", + webpage_src, u'title') + + + results = [{ + 'id': video_id, + 'url' : video_url, + 'title' : video_title, + 'ext' : ext, + }] + + return results \ No newline at end of file From c247d87ef3a7f03dfcc28e3fc23dee9ec34835d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 12 Sep 2013 11:31:27 +0200 Subject: [PATCH 03/16] [funnyordie] fix video url extraction --- youtube_dl/extractor/funnyordie.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index 4508f0dfa..f3d86a711 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -21,7 +21,7 @@ class FunnyOrDieIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - video_url = self._search_regex(r'type: "video/mp4", src: "(.*?)"', + video_url = self._search_regex(r'type="video/mp4" src="(.*?)"', webpage, u'video URL', flags=re.DOTALL) info = { From bfd5c93af9f9eee938c628f19c997f999f21c74e Mon Sep 17 00:00:00 2001 From: tewe Date: Thu, 12 Sep 2013 12:30:14 +0200 Subject: [PATCH 04/16] Add Ustream channel support --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/ustream.py | 70 ++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 26cf24935..a7cddef73 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -96,7 +96,7 @@ from .tudou import TudouIE from .tumblr import TumblrIE from .tutv import TutvIE from .unistra import UnistraIE -from .ustream import UstreamIE +from .ustream import UstreamIE, UstreamChannelIE from .vbox7 import Vbox7IE from .veehd import VeeHDIE from .veoh import VeohIE diff --git a/youtube_dl/extractor/ustream.py b/youtube_dl/extractor/ustream.py index 5f423870a..16cdcc765 100644 --- a/youtube_dl/extractor/ustream.py +++ b/youtube_dl/extractor/ustream.py @@ -1,4 +1,7 @@ +from HTMLParser import HTMLParser +import json import re +from urlparse import urljoin from .common import InfoExtractor @@ -43,3 +46,70 @@ class UstreamIE(InfoExtractor): 'thumbnail': thumbnail, } return info + +# More robust than regular expressions + +class ChannelParser(HTMLParser): + """ + + """ + channel_id = None + + def handle_starttag(self, tag, attrs): + if tag != 'meta': + return + values = dict(attrs) + if values.get('name') != 'ustream:channel_id': + return + value = values.get('content', '') + if value.isdigit(): + self.channel_id = value + +class SocialstreamParser(HTMLParser): + """ +
  • + """ + def __init__(self): + HTMLParser.__init__(self) + self.content_ids = [] + + def handle_starttag(self, tag, attrs): + if tag != 'li': + return + for (attr, value) in attrs: + if attr == 'data-content-id' and value.isdigit(): + self.content_ids.append(value) + +class UstreamChannelIE(InfoExtractor): + _VALID_URL = r'https?://www\.ustream\.tv/channel/(?P.+)' + IE_NAME = u'ustream:channel' + + def _real_extract(self, url): + m = re.match(self._VALID_URL, url) + slug = m.group('slug') + # Slugs can be non-ascii, but youtube-dl can't handle non-ascii command lines, + # so if we got this far it's probably percent encoded and we needn't worry. + + p = ChannelParser() + p.feed(self._download_webpage(url, slug)) + p.close() + channel_id = p.channel_id + + p = SocialstreamParser() + BASE = 'http://www.ustream.tv' + next_url = '/ajax/socialstream/videos/%s/1.json' % channel_id + while next_url: + reply = json.loads(self._download_webpage(urljoin(BASE, next_url), channel_id)) + p.feed(reply['data']) + next_url = reply['nextUrl'] + p.close() + video_ids = p.content_ids + + # From YoutubeChannelIE + + self._downloader.to_screen(u'[ustream] Channel %s: Found %i videos' % (channel_id, len(video_ids))) + + urls = ['http://www.ustream.tv/recorded/' + vid for vid in video_ids] + url_entries = [self.url_result(eurl, 'Ustream') for eurl in urls] + return [self.playlist_result(url_entries, channel_id)] From ad94a6fe446543788730f5096c7b74229702805e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 12 Sep 2013 21:56:36 +0200 Subject: [PATCH 05/16] [canalplust] accept urls that don't include the video id (fixes #1415), extract more info and update test --- youtube_dl/extractor/canalplus.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index 1f02519a0..1db9b24cf 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -1,3 +1,4 @@ +# encoding: utf-8 import re import xml.etree.ElementTree @@ -5,24 +6,29 @@ from .common import InfoExtractor from ..utils import unified_strdate class CanalplusIE(InfoExtractor): - _VALID_URL = r'https?://(www\.canalplus\.fr/.*?\?vid=|player\.canalplus\.fr/#/)(?P\d+)' + _VALID_URL = r'https?://(www\.canalplus\.fr/.*?/(?P.*)|player\.canalplus\.fr/#/(?P\d+))' _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/cplus/%s' IE_NAME = u'canalplus.fr' _TEST = { - u'url': u'http://www.canalplus.fr/c-divertissement/pid3351-c-le-petit-journal.html?vid=889861', - u'file': u'889861.flv', - u'md5': u'590a888158b5f0d6832f84001fbf3e99', + u'url': u'http://www.canalplus.fr/c-infos-documentaires/pid1830-c-zapping.html?vid=922470', + u'file': u'922470.flv', u'info_dict': { - u'title': u'Le Petit Journal 20/06/13 - La guerre des drone', - u'upload_date': u'20130620', + u'title': u'Zapping - 26/08/13', + u'description': u'Le meilleur de toutes les chaînes, tous les jours.\nEmission du 26 août 2013', + u'upload_date': u'20130826', + }, + u'params': { + u'skip_download': True, }, - u'skip': u'Requires rtmpdump' } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') + if video_id is None: + webpage = self._download_webpage(url, mobj.group('path')) + video_id = self._search_regex(r'videoId = "(\d+)";', webpage, u'video id') info_url = self._VIDEO_INFO_TEMPLATE % video_id info_page = self._download_webpage(info_url,video_id, u'Downloading video info') @@ -43,4 +49,6 @@ class CanalplusIE(InfoExtractor): 'ext': 'flv', 'upload_date': unified_strdate(infos.find('PUBLICATION/DATE').text), 'thumbnail': media.find('IMAGES/GRAND').text, + 'description': infos.find('DESCRIPTION').text, + 'view_count': int(infos.find('NB_VUES').text), } From ce85f022d2c78b3f58982b486c8c628d22158573 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 12 Sep 2013 22:04:09 +0200 Subject: [PATCH 06/16] [youtube] update algo for length 82 (fixes #1416) --- devscripts/youtube_genalgo.py | 4 ++-- youtube_dl/extractor/youtube.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/devscripts/youtube_genalgo.py b/devscripts/youtube_genalgo.py index 6e3595366..b390c7e2e 100644 --- a/devscripts/youtube_genalgo.py +++ b/devscripts/youtube_genalgo.py @@ -32,9 +32,9 @@ tests = [ # 83 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<", ".>/?;}[{=+_)(*&^%<#!MNBVCXZASPFGHJKLwOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytreq"), - # 82 - vflZK4ZYR 2013/08/23 + # 82 - vflGNjMhJ 2013/09/12 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.<", - "wertyuioplkjhgfdsaqxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&z(-+={[};?/>.<"), + ".>/?;}[<=+-(*&^%$#@!MNBVCXeASDFGHKLPOqUYTREWQ0987654321mnbvcxzasdfghjklpoiuytrIwZ"), # 81 - vflLC8JvQ 2013/07/25 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.", "C>/?;}[{=+-(*&^%$#@!MNBVYXZASDFGHKLPOIU.TREWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp"), diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 2e0d70eaf..f49665925 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -434,7 +434,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): elif len(s) == 83: return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0] elif len(s) == 82: - return s[1:19] + s[0] + s[20:68] + s[19] + s[69:82] + return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54] elif len(s) == 81: return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9] elif len(s) == 80: From dd01d6558a142deb93fe7d6122ae698ecdea4f63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 12 Sep 2013 22:18:39 +0200 Subject: [PATCH 07/16] [gamespot] Update test video title --- youtube_dl/extractor/gamespot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index 7585b7061..cd3bbe65f 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -14,7 +14,7 @@ class GameSpotIE(InfoExtractor): u"file": u"6410818.mp4", u"md5": u"b2a30deaa8654fcccd43713a6b6a4825", u"info_dict": { - u"title": u"Arma III - Community Guide: SITREP I", + u"title": u"Arma 3 - Community Guide: SITREP I", u"upload_date": u"20130627", } } From a921f40799d2ecb4be53b3241d2dbfc80f804d73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 13 Sep 2013 22:05:29 +0200 Subject: [PATCH 08/16] [ustream] Simplify channel extraction the ChannelParser has been moved to a new function in utils get_meta_content Instead of the SocialStreamParser now it uses a regex --- test/test_utils.py | 28 +++++++++++++----- youtube_dl/extractor/ustream.py | 49 ++++--------------------------- youtube_dl/utils.py | 52 ++++++++++++++++++++++++++++----- 3 files changed, 70 insertions(+), 59 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index be1069105..ff2e9885b 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -11,13 +11,16 @@ import os sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) #from youtube_dl.utils import htmlentity_transform -from youtube_dl.utils import timeconvert -from youtube_dl.utils import sanitize_filename -from youtube_dl.utils import unescapeHTML -from youtube_dl.utils import orderedSet -from youtube_dl.utils import DateRange -from youtube_dl.utils import unified_strdate -from youtube_dl.utils import find_xpath_attr +from youtube_dl.utils import ( + timeconvert, + sanitize_filename, + unescapeHTML, + orderedSet, + DateRange, + unified_strdate, + find_xpath_attr, + get_meta_content, +) if sys.version_info < (3, 0): _compat_str = lambda b: b.decode('unicode-escape') @@ -127,5 +130,16 @@ class TestUtil(unittest.TestCase): self.assertEqual(find_xpath_attr(doc, './/node', 'x', 'a'), doc[1]) self.assertEqual(find_xpath_attr(doc, './/node', 'y', 'c'), doc[2]) + def test_meta_parser(self): + testhtml = u''' + + + + + ''' + get_meta = lambda name: get_meta_content(name, testhtml) + self.assertEqual(get_meta('description'), u'foo & bar') + self.assertEqual(get_meta('author'), 'Plato') + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/ustream.py b/youtube_dl/extractor/ustream.py index f69b27d44..74c82587f 100644 --- a/youtube_dl/extractor/ustream.py +++ b/youtube_dl/extractor/ustream.py @@ -4,7 +4,7 @@ import re from .common import InfoExtractor from ..utils import ( compat_urlparse, - compat_html_parser, + get_meta_content, ) @@ -49,40 +49,6 @@ class UstreamIE(InfoExtractor): } return info -# More robust than regular expressions - -class ChannelParser(compat_html_parser.HTMLParser): - """ - - """ - channel_id = None - - def handle_starttag(self, tag, attrs): - if tag != 'meta': - return - values = dict(attrs) - if values.get('name') != 'ustream:channel_id': - return - value = values.get('content', '') - if value.isdigit(): - self.channel_id = value - -class SocialstreamParser(compat_html_parser.HTMLParser): - """ -
  • - """ - def __init__(self): - compat_html_parser.HTMLParser.__init__(self) - self.content_ids = [] - - def handle_starttag(self, tag, attrs): - if tag != 'li': - return - for (attr, value) in attrs: - if attr == 'data-content-id' and value.isdigit(): - self.content_ids.append(value) - class UstreamChannelIE(InfoExtractor): _VALID_URL = r'https?://www\.ustream\.tv/channel/(?P.+)' IE_NAME = u'ustream:channel' @@ -90,21 +56,16 @@ class UstreamChannelIE(InfoExtractor): def _real_extract(self, url): m = re.match(self._VALID_URL, url) slug = m.group('slug') + webpage = self._download_webpage(url, slug) + channel_id = get_meta_content('ustream:channel_id', webpage) - p = ChannelParser() - p.feed(self._download_webpage(url, slug)) - p.close() - channel_id = p.channel_id - - p = SocialstreamParser() BASE = 'http://www.ustream.tv' next_url = '/ajax/socialstream/videos/%s/1.json' % channel_id + video_ids = [] while next_url: reply = json.loads(self._download_webpage(compat_urlparse.urljoin(BASE, next_url), channel_id)) - p.feed(reply['data']) + video_ids.extend(re.findall(r'data-content-id="(\d.*)"', reply['data'])) next_url = reply['nextUrl'] - p.close() - video_ids = p.content_ids urls = ['http://www.ustream.tv/recorded/' + vid for vid in video_ids] url_entries = [self.url_result(eurl, 'Ustream') for eurl in urls] diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 201802cee..768c6207d 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -249,7 +249,17 @@ def htmlentity_transform(matchobj): return (u'&%s;' % entity) compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix -class AttrParser(compat_html_parser.HTMLParser): +class BaseHTMLParser(compat_html_parser.HTMLParser): + def __init(self): + compat_html_parser.HTMLParser.__init__(self) + self.html = None + + def loads(self, html): + self.html = html + self.feed(html) + self.close() + +class AttrParser(BaseHTMLParser): """Modified HTMLParser that isolates a tag with the specified attribute""" def __init__(self, attribute, value): self.attribute = attribute @@ -257,10 +267,9 @@ class AttrParser(compat_html_parser.HTMLParser): self.result = None self.started = False self.depth = {} - self.html = None self.watch_startpos = False self.error_count = 0 - compat_html_parser.HTMLParser.__init__(self) + BaseHTMLParser.__init__(self) def error(self, message): if self.error_count > 10 or self.started: @@ -269,11 +278,6 @@ class AttrParser(compat_html_parser.HTMLParser): self.error_count += 1 self.goahead(1) - def loads(self, html): - self.html = html - self.feed(html) - self.close() - def handle_starttag(self, tag, attrs): attrs = dict(attrs) if self.started: @@ -334,6 +338,38 @@ def get_element_by_attribute(attribute, value, html): pass return parser.get_result() +class MetaParser(BaseHTMLParser): + """ + Modified HTMLParser that isolates a meta tag with the specified name + attribute. + """ + def __init__(self, name): + BaseHTMLParser.__init__(self) + self.name = name + self.content = None + self.result = None + + def handle_starttag(self, tag, attrs): + if tag != 'meta': + return + attrs = dict(attrs) + if attrs.get('name') == self.name: + self.result = attrs.get('content') + + def get_result(self): + return self.result + +def get_meta_content(name, html): + """ + Return the content attribute from the meta tag with the given name attribute. + """ + parser = MetaParser(name) + try: + parser.loads(html) + except compat_html_parser.HTMLParseError: + pass + return parser.get_result() + def clean_html(html): """Clean an HTML snippet into a readable string""" From 9a1c32dc54fdefcd6b5e03fac1a0dd65383b6f99 Mon Sep 17 00:00:00 2001 From: rzhxeo Date: Sat, 14 Sep 2013 05:42:00 +0200 Subject: [PATCH 09/16] XHamsterIE: Add support for new URL format --- youtube_dl/extractor/xhamster.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 88b8b6be0..e50069586 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -11,7 +11,7 @@ from ..utils import ( class XHamsterIE(InfoExtractor): """Information Extractor for xHamster""" - _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P[0-9]+)/.*\.html' + _VALID_URL = r'(?:http://)?(?P(?:www\.)?xhamster\.com/movies/(?P[0-9]+)/.*\.html(?:\?.*)?)' _TEST = { u'url': u'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html', u'file': u'1509445.flv', @@ -27,7 +27,7 @@ class XHamsterIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id + mrss_url = 'http://' + mobj.group('url') webpage = self._download_webpage(mrss_url, video_id) mobj = re.search(r'\'srv\': \'(?P[^\']*)\',\s*\'file\': \'(?P[^\']+)\',', webpage) From fad84d50fe124df1c620c9bc95bdc4c9e5053e6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 14 Sep 2013 11:10:01 +0200 Subject: [PATCH 10/16] [googleplus] Fix upload date extraction --- youtube_dl/extractor/googleplus.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/googleplus.py b/youtube_dl/extractor/googleplus.py index f1cd88983..8895ad289 100644 --- a/youtube_dl/extractor/googleplus.py +++ b/youtube_dl/extractor/googleplus.py @@ -40,7 +40,8 @@ class GooglePlusIE(InfoExtractor): self.report_extraction(video_id) # Extract update date - upload_date = self._html_search_regex('title="Timestamp">(.*?)', + upload_date = self._html_search_regex( + ['title="Timestamp">(.*?)', r'(.+?)'], webpage, u'upload date', fatal=False) if upload_date: # Convert timestring to a format suitable for filename From 0b7f31184d6a2d87cf7f568c561ff8d017f07bd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 14 Sep 2013 11:14:40 +0200 Subject: [PATCH 11/16] Now --all-sub is a modifier to --write-sub and --write-auto-sub (closes #1412) For keeping backwards compatibility --all-sub sets --write-sub if --write-auto-sub is not given --- test/test_dailymotion_subtitles.py | 2 ++ test/test_youtube_subtitles.py | 2 ++ youtube_dl/YoutubeDL.py | 4 ++-- youtube_dl/__init__.py | 5 +++++ youtube_dl/extractor/subtitles.py | 5 ++--- 5 files changed, 13 insertions(+), 5 deletions(-) diff --git a/test/test_dailymotion_subtitles.py b/test/test_dailymotion_subtitles.py index bcd9f79f6..83c65d57e 100644 --- a/test/test_dailymotion_subtitles.py +++ b/test/test_dailymotion_subtitles.py @@ -40,6 +40,7 @@ class TestDailymotionSubtitles(unittest.TestCase): subtitles = self.getSubtitles() self.assertEqual(md5(subtitles['fr']), '594564ec7d588942e384e920e5341792') def test_allsubtitles(self): + self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(len(subtitles.keys()), 5) @@ -54,6 +55,7 @@ class TestDailymotionSubtitles(unittest.TestCase): self.assertTrue(len(subtitles.keys()) == 0) def test_nosubtitles(self): self.url = 'http://www.dailymotion.com/video/x12u166_le-zapping-tele-star-du-08-aout-2013_tv' + self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(len(subtitles), 0) diff --git a/test/test_youtube_subtitles.py b/test/test_youtube_subtitles.py index 5632871ac..168e6c66c 100644 --- a/test/test_youtube_subtitles.py +++ b/test/test_youtube_subtitles.py @@ -41,6 +41,7 @@ class TestYoutubeSubtitles(unittest.TestCase): subtitles = self.getSubtitles() self.assertEqual(md5(subtitles['it']), '164a51f16f260476a05b50fe4c2f161d') def test_youtube_allsubtitles(self): + self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(len(subtitles.keys()), 13) @@ -66,6 +67,7 @@ class TestYoutubeSubtitles(unittest.TestCase): self.assertTrue(subtitles['it'] is not None) def test_youtube_nosubtitles(self): self.url = 'sAjKT8FhjI8' + self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(len(subtitles), 0) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index c2f992b8e..e53a2b8ad 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -74,6 +74,7 @@ class YoutubeDL(object): writesubtitles: Write the video subtitles to a file writeautomaticsub: Write the automatic subtitles to a file allsubtitles: Downloads all the subtitles of the video + (requires writesubtitles or writeautomaticsub) listsubtitles: Lists all available subtitles for the video subtitlesformat: Subtitle format [srt/sbv/vtt] (default=srt) subtitleslangs: List of languages of the subtitles to download @@ -499,8 +500,7 @@ class YoutubeDL(object): return subtitles_are_requested = any([self.params.get('writesubtitles', False), - self.params.get('writeautomaticsub'), - self.params.get('allsubtitles', False)]) + self.params.get('writeautomaticsub')]) if subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']: # subtitles download errors are already managed as troubles in relevant IE diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 696e54f49..0022a4e7a 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -533,6 +533,11 @@ def _real_main(argv=None): else: date = DateRange(opts.dateafter, opts.datebefore) + # --all-sub automatically sets --write-sub if --write-auto-sub is not given + # this was the old behaviour if only --all-sub was given. + if opts.allsubtitles and (opts.writeautomaticsub == False): + opts.writesubtitles = True + if sys.version_info < (3,): # In Python 2, sys.argv is a bytestring (also note http://bugs.python.org/issue2128 for Windows systems) if opts.outtmpl is not None: diff --git a/youtube_dl/extractor/subtitles.py b/youtube_dl/extractor/subtitles.py index 97215f289..90de7de3a 100644 --- a/youtube_dl/extractor/subtitles.py +++ b/youtube_dl/extractor/subtitles.py @@ -10,8 +10,7 @@ class SubtitlesInfoExtractor(InfoExtractor): @property def _have_to_download_any_subtitles(self): return any([self._downloader.params.get('writesubtitles', False), - self._downloader.params.get('writeautomaticsub'), - self._downloader.params.get('allsubtitles', False)]) + self._downloader.params.get('writeautomaticsub')]) def _list_available_subtitles(self, video_id, webpage=None): """ outputs the available subtitles for the video """ @@ -34,7 +33,7 @@ class SubtitlesInfoExtractor(InfoExtractor): available_subs_list = {} if self._downloader.params.get('writeautomaticsub', False): available_subs_list.update(self._get_available_automatic_caption(video_id, video_webpage)) - if self._downloader.params.get('writesubtitles', False) or self._downloader.params.get('allsubtitles', False): + if self._downloader.params.get('writesubtitles', False): available_subs_list.update(self._get_available_subtitles(video_id)) if not available_subs_list: # error, it didn't get the available subtitles From 19e1d35989970831007b7ca5d988fe0454f08a1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 14 Sep 2013 14:26:42 +0200 Subject: [PATCH 12/16] [mixcloud] Rewrite extractor (fixes #278) --- youtube_dl/extractor/mixcloud.py | 122 ++++++++++--------------------- youtube_dl/utils.py | 11 ++- 2 files changed, 48 insertions(+), 85 deletions(-) diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 8245b5583..a200dcd74 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -5,34 +5,27 @@ import socket from .common import InfoExtractor from ..utils import ( compat_http_client, - compat_str, compat_urllib_error, compat_urllib_request, - - ExtractorError, + unified_strdate, ) class MixcloudIE(InfoExtractor): - _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/ _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)' IE_NAME = u'mixcloud' - def report_download_json(self, file_id): - """Report JSON download.""" - self.to_screen(u'Downloading json') - - def get_urls(self, jsonData, fmt, bitrate='best'): - """Get urls from 'audio_formats' section in json""" - try: - bitrate_list = jsonData[fmt] - if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list: - bitrate = max(bitrate_list) # select highest - - url_list = jsonData[fmt][bitrate] - except TypeError: # we have no bitrate info. - url_list = jsonData[fmt] - return url_list + _TEST = { + u'url': u'http://www.mixcloud.com/dholbach/cryptkeeper/', + u'file': u'dholbach-cryptkeeper.mp3', + u'info_dict': { + u'title': u'Cryptkeeper', + u'description': u'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.', + u'uploader': u'Daniel Holbach', + u'uploader_id': u'dholbach', + u'upload_date': u'20111115', + }, + } def check_urls(self, url_list): """Returns 1st active url from list""" @@ -45,71 +38,32 @@ class MixcloudIE(InfoExtractor): return None - def _print_formats(self, formats): - print('Available formats:') - for fmt in formats.keys(): - for b in formats[fmt]: - try: - ext = formats[fmt][b][0] - print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])) - except TypeError: # we have no bitrate info - ext = formats[fmt][0] - print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])) - break - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - # extract uploader & filename from url - uploader = mobj.group(1).decode('utf-8') - file_id = uploader + "-" + mobj.group(2).decode('utf-8') - - # construct API request - file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json' - # retrieve .json file with links to files - request = compat_urllib_request.Request(file_url) - try: - self.report_download_json(file_url) - jsonData = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err)) - - # parse JSON - json_data = json.loads(jsonData) - player_url = json_data['player_swf_url'] - formats = dict(json_data['audio_formats']) - - req_format = self._downloader.params.get('format', None) - - if self._downloader.params.get('listformats', None): - self._print_formats(formats) - return - - if req_format is None or req_format == 'best': - for format_param in formats.keys(): - url_list = self.get_urls(formats, format_param) - # check urls - file_url = self.check_urls(url_list) - if file_url is not None: - break # got it! - else: - if req_format not in formats: - raise ExtractorError(u'Format is not available') - - url_list = self.get_urls(formats, req_format) - file_url = self.check_urls(url_list) - format_param = req_format - return [{ - 'id': file_id.decode('utf-8'), - 'url': file_url.decode('utf-8'), - 'uploader': uploader.decode('utf-8'), - 'upload_date': None, - 'title': json_data['name'], - 'ext': file_url.split('.')[-1].decode('utf-8'), - 'format': (format_param is None and u'NA' or format_param.decode('utf-8')), - 'thumbnail': json_data['thumbnail_url'], - 'description': json_data['description'], - 'player_url': player_url.decode('utf-8'), - }] + uploader = mobj.group(1) + cloudcast_name = mobj.group(2) + track_id = '-'.join((uploader, cloudcast_name)) + api_url = 'http://api.mixcloud.com/%s/%s/' % (uploader, cloudcast_name) + webpage = self._download_webpage(url, track_id) + json_data = self._download_webpage(api_url, track_id, + u'Downloading cloudcast info') + info = json.loads(json_data) + + preview_url = self._search_regex(r'data-preview-url="(.+?)"', webpage, u'preview url') + song_url = preview_url.replace('/previews/', '/cloudcasts/originals/') + template_url = re.sub(r'(stream\d*)', 'stream%d', song_url) + final_song_url = self.check_urls(template_url % i for i in range(30)) + + return { + 'id': track_id, + 'title': info['name'], + 'url': final_song_url, + 'ext': 'mp3', + 'description': info['description'], + 'thumbnail': info['pictures'].get('extra_large'), + 'uploader': info['user']['name'], + 'uploader_id': info['user']['username'], + 'upload_date': unified_strdate(info['created_time']), + 'view_count': info['play_count'], + } diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 768c6207d..5558d4737 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -700,7 +700,16 @@ def unified_strdate(date_str): date_str = date_str.replace(',',' ') # %z (UTC offset) is only supported in python>=3.2 date_str = re.sub(r' (\+|-)[\d]*$', '', date_str) - format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S', '%d.%m.%Y %H:%M'] + format_expressions = [ + '%d %B %Y', + '%B %d %Y', + '%b %d %Y', + '%Y-%m-%d', + '%d/%m/%Y', + '%Y/%m/%d %H:%M:%S', + '%d.%m.%Y %H:%M', + '%Y-%m-%dT%H:%M:%SZ', + ] for expression in format_expressions: try: upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d') From 471a5ee908ee765c1ba1ff6a41051bcf71065064 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 14 Sep 2013 14:45:04 +0200 Subject: [PATCH 13/16] Set the ext field for each format --- youtube_dl/extractor/archiveorg.py | 7 ++++--- youtube_dl/extractor/dreisat.py | 6 +++--- youtube_dl/extractor/trilulilu.py | 4 ++-- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py index 7efd1d823..61ce4469a 100644 --- a/youtube_dl/extractor/archiveorg.py +++ b/youtube_dl/extractor/archiveorg.py @@ -46,6 +46,8 @@ class ArchiveOrgIE(InfoExtractor): for fn,fdata in data['files'].items() if 'Video' in fdata['format']] formats.sort(key=lambda fdata: fdata['file_size']) + for f in formats: + f['ext'] = determine_ext(f['url']) info = { '_type': 'video', @@ -61,7 +63,6 @@ class ArchiveOrgIE(InfoExtractor): info['thumbnail'] = thumbnail # TODO: Remove when #980 has been merged - info['url'] = formats[-1]['url'] - info['ext'] = determine_ext(formats[-1]['url']) + info.update(formats[-1]) - return info \ No newline at end of file + return info diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py index 64b465805..765cb1f37 100644 --- a/youtube_dl/extractor/dreisat.py +++ b/youtube_dl/extractor/dreisat.py @@ -54,6 +54,7 @@ class DreiSatIE(InfoExtractor): 'width': int(fe.find('./width').text), 'height': int(fe.find('./height').text), 'url': fe.find('./url').text, + 'ext': determine_ext(fe.find('./url').text), 'filesize': int(fe.find('./filesize').text), 'video_bitrate': int(fe.find('./videoBitrate').text), '3sat_qualityname': fe.find('./quality').text, @@ -79,7 +80,6 @@ class DreiSatIE(InfoExtractor): } # TODO: Remove when #980 has been merged - info['url'] = formats[-1]['url'] - info['ext'] = determine_ext(formats[-1]['url']) + info.update(formats[-1]) - return info \ No newline at end of file + return info diff --git a/youtube_dl/extractor/trilulilu.py b/youtube_dl/extractor/trilulilu.py index f278951ba..0bf028f61 100644 --- a/youtube_dl/extractor/trilulilu.py +++ b/youtube_dl/extractor/trilulilu.py @@ -52,6 +52,7 @@ class TriluliluIE(InfoExtractor): { 'format': fnode.text, 'url': video_url_template % fnode.text, + 'ext': fnode.text.partition('-')[0] } for fnode in format_doc.findall('./formats/format') @@ -67,7 +68,6 @@ class TriluliluIE(InfoExtractor): } # TODO: Remove when #980 has been merged - info['url'] = formats[-1]['url'] - info['ext'] = formats[-1]['format'].partition('-')[0] + info.update(formats[-1]) return info From 92790f4e542fc3d5f4cc02a647a2695d9175d464 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 14 Sep 2013 21:41:49 +0200 Subject: [PATCH 14/16] [soundcloud] Add an extractor for users (closes #1426) --- test/test_playlists.py | 10 ++++++- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/soundcloud.py | 45 ++++++++++++++++++++++++++++-- 3 files changed, 53 insertions(+), 4 deletions(-) diff --git a/test/test_playlists.py b/test/test_playlists.py index 4a2e00b01..d079a4f23 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -8,7 +8,7 @@ import json import os sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from youtube_dl.extractor import DailymotionPlaylistIE, VimeoChannelIE, UstreamChannelIE +from youtube_dl.extractor import DailymotionPlaylistIE, VimeoChannelIE, UstreamChannelIE, SoundcloudUserIE from youtube_dl.utils import * from helper import FakeYDL @@ -42,5 +42,13 @@ class TestPlaylists(unittest.TestCase): self.assertEqual(result['id'], u'5124905') self.assertTrue(len(result['entries']) >= 11) + def test_soundcloud_user(self): + dl = FakeYDL() + ie = SoundcloudUserIE(dl) + result = ie.extract('https://soundcloud.com/the-concept-band') + self.assertIsPlaylist(result) + self.assertEqual(result['id'], u'9615865') + self.assertTrue(len(result['entries']) >= 12) + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 06f9542d2..19d57c2e9 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -82,7 +82,7 @@ from .sina import SinaIE from .slashdot import SlashdotIE from .slideshare import SlideshareIE from .sohu import SohuIE -from .soundcloud import SoundcloudIE, SoundcloudSetIE +from .soundcloud import SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE from .spiegel import SpiegelIE from .stanfordoc import StanfordOpenClassroomIE from .statigram import StatigramIE diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 5f3a5540d..29cd5617c 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -1,10 +1,12 @@ import json import re +import itertools from .common import InfoExtractor from ..utils import ( compat_str, compat_urlparse, + compat_urllib_parse, ExtractorError, unified_strdate, @@ -53,10 +55,11 @@ class SoundcloudIE(InfoExtractor): def _resolv_url(cls, url): return 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID - def _extract_info_dict(self, info, full_title=None): + def _extract_info_dict(self, info, full_title=None, quiet=False): video_id = info['id'] name = full_title or video_id - self.report_extraction(name) + if quiet == False: + self.report_extraction(name) thumbnail = info['artwork_url'] if thumbnail is not None: @@ -198,3 +201,41 @@ class SoundcloudSetIE(SoundcloudIE): 'id': info['id'], 'title': info['title'], } + + +class SoundcloudUserIE(SoundcloudIE): + _VALID_URL = r'https?://(www\.)?soundcloud.com/(?P[^/]+)(/?(tracks/)?)?(\?.*)?$' + IE_NAME = u'soundcloud:user' + + # it's in tests/test_playlists.py + _TEST = None + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + uploader = mobj.group('user') + + url = 'http://soundcloud.com/%s/' % uploader + resolv_url = self._resolv_url(url) + user_json = self._download_webpage(resolv_url, uploader, + u'Downloading user info') + user = json.loads(user_json) + + tracks = [] + for i in itertools.count(): + data = compat_urllib_parse.urlencode({'offset': i*50, + 'client_id': self._CLIENT_ID, + }) + tracks_url = 'http://api.soundcloud.com/users/%s/tracks.json?' % user['id'] + data + response = self._download_webpage(tracks_url, uploader, + u'Downloading tracks page %s' % (i+1)) + new_tracks = json.loads(response) + tracks.extend(self._extract_info_dict(track, quiet=True) for track in new_tracks) + if len(new_tracks) < 50: + break + + return { + '_type': 'playlist', + 'id': compat_str(user['id']), + 'title': user['username'], + 'entries': tracks, + } From e69ae5b9e74910541e75eea4c8dfc13066f28f65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 15 Sep 2013 12:14:59 +0200 Subject: [PATCH 15/16] [youtube] support youtube.googleapis.com/v/* urls (fixes #1425) --- test/test_all_urls.py | 1 + youtube_dl/extractor/youtube.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 99fc7bd28..ff1c86efe 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -36,6 +36,7 @@ class TestAllURLsMatching(unittest.TestCase): self.assertFalse(YoutubeIE.suitable(u'https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012')) #668 self.assertMatch('http://youtu.be/BaW_jenozKc', ['youtube']) self.assertMatch('http://www.youtube.com/v/BaW_jenozKc', ['youtube']) + self.assertMatch('https://youtube.googleapis.com/v/BaW_jenozKc', ['youtube']) def test_youtube_channel_matching(self): assertChannel = lambda url: self.assertMatch(url, ['youtube:channel']) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index f49665925..e4a2e22bc 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -139,7 +139,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): ( (?:https?://)? # http(s):// (optional) (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/| - tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains + tube\.majestyc\.net/| + youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains (?:.*?\#/)? # handle anchor (#/) redirect urls (?: # the various things that can precede the ID: (?:(?:v|embed|e)/) # v/ or embed/ or e/ From 5a6fecc3dee35f95f3590a31e51670819db5a1fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 15 Sep 2013 23:30:58 +0200 Subject: [PATCH 16/16] Add an extractor for southparkstudios.com (closes #1434) It uses the MTV system --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/southparkstudios.py | 34 ++++++++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 youtube_dl/extractor/southparkstudios.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 19d57c2e9..246f1e8b5 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -83,6 +83,7 @@ from .slashdot import SlashdotIE from .slideshare import SlideshareIE from .sohu import SohuIE from .soundcloud import SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE +from .southparkstudios import SouthParkStudiosIE from .spiegel import SpiegelIE from .stanfordoc import StanfordOpenClassroomIE from .statigram import StatigramIE diff --git a/youtube_dl/extractor/southparkstudios.py b/youtube_dl/extractor/southparkstudios.py new file mode 100644 index 000000000..a5dc754dd --- /dev/null +++ b/youtube_dl/extractor/southparkstudios.py @@ -0,0 +1,34 @@ +import re + +from .mtv import MTVIE, _media_xml_tag + + +class SouthParkStudiosIE(MTVIE): + IE_NAME = u'southparkstudios.com' + _VALID_URL = r'https?://www\.southparkstudios\.com/clips/(?P\d+)' + + _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss' + + _TEST = { + u'url': u'http://www.southparkstudios.com/clips/104437/bat-daded#tab=featured', + u'file': u'a7bff6c2-ed00-11e0-aca6-0026b9414f30.mp4', + u'info_dict': { + u'title': u'Bat Daded', + u'description': u'Randy disqualifies South Park by getting into a fight with Bat Dad.', + }, + } + + # Overwrite MTVIE properties we don't want + _TESTS = [] + + def _get_thumbnail_url(self, uri, itemdoc): + search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail')) + return itemdoc.find(search_path).attrib['url'] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + mgid = self._search_regex(r'swfobject.embedSWF\(".*?(mgid:.*?)"', + webpage, u'mgid') + return self._get_videos_info(mgid)