From 46720279c28afb646b6ac19bcb11e85bb4bea726 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 12 Jul 2013 19:00:19 +0200 Subject: [PATCH 1/2] InfoExtractor: add some helper methods to extract OpenGraph info --- youtube_dl/extractor/common.py | 24 ++++++++++++++++++++++++ youtube_dl/extractor/cspan.py | 4 +--- youtube_dl/extractor/dailymotion.py | 5 +---- youtube_dl/extractor/ehow.py | 11 +++-------- youtube_dl/extractor/escapist.py | 8 ++------ youtube_dl/extractor/flickr.py | 15 +++------------ youtube_dl/extractor/funnyordie.py | 5 +---- youtube_dl/extractor/hotnewhiphop.py | 8 ++------ youtube_dl/extractor/instagram.py | 10 ++-------- youtube_dl/extractor/keek.py | 3 +-- youtube_dl/extractor/liveleak.py | 6 ++---- youtube_dl/extractor/nba.py | 3 +-- youtube_dl/extractor/statigram.py | 10 ++-------- youtube_dl/extractor/teamcoco.py | 15 +++------------ youtube_dl/extractor/traileraddict.py | 9 +++------ youtube_dl/extractor/tutv.py | 4 +--- youtube_dl/extractor/vine.py | 10 ++-------- 17 files changed, 54 insertions(+), 96 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 1bd5538ca..0a0c4047d 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -257,6 +257,30 @@ class InfoExtractor(object): return (username, password) + # Helper functions for extracting OpenGraph info + @staticmethod + def _og_regex(property): + return r'(.*?)', video_info, 'video url') @@ -49,5 +47,5 @@ class CSpanIE(InfoExtractor): 'url': url, 'play_path': path, 'description': description, - 'thumbnail': thumbnail, + 'thumbnail': self._og_search_thumbnail(webpage), } diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 5fd2221a7..9bf7a28ca 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -39,9 +39,6 @@ class DailymotionIE(InfoExtractor): # Extract URL, uploader and title from webpage self.report_extraction(video_id) - video_title = self._html_search_regex(r'', - webpage, 'title') - video_uploader = self._search_regex([r'(?im)[^<]+?]+?>([^<]+?)', # Looking for official user r'<(?:span|a) .*?rel="author".*?>([^<]+?)', - webpage, u'thumbnail URL') uploader = self._search_regex(r'', webpage, u'uploader') - title = self._search_regex(r'', - webpage, u'Video title').replace(' | eHow', '') - description = self._search_regex(r'', - webpage, u'video description') + title = self._og_search_title(webpage).replace(' | eHow', '') ext = determine_ext(final_url) return { @@ -44,8 +39,8 @@ class EHowIE(InfoExtractor): 'url': final_url, 'ext': ext, 'title': title, - 'thumbnail': thumbnail_url, - 'description': description, + 'thumbnail': self._og_search_thumbnail(webpage), + 'description': self._og_search_description(webpage), 'uploader': uploader, } diff --git a/youtube_dl/extractor/escapist.py b/youtube_dl/extractor/escapist.py index 794460e84..3aa2da52c 100644 --- a/youtube_dl/extractor/escapist.py +++ b/youtube_dl/extractor/escapist.py @@ -36,11 +36,7 @@ class EscapistIE(InfoExtractor): videoDesc = self._html_search_regex('(?P.*?)</h1>", r'<title>(?P<title>[^<]+?)'), webpage, 'title', flags=re.DOTALL) - video_description = self._html_search_regex(r'(.*)", webpage_src, u'title') - - # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video. - thumbnail = self._html_search_regex(r'"og:image" content="(.*)"', - webpage_src, u'thumbnail', fatal=False) results = [{ 'id': video_id, 'url' : video_url, 'title' : video_title, - 'thumbnail' : thumbnail, + 'thumbnail' : self._og_search_thumbnail(webpage_src), 'ext' : 'mp3', }] - return results \ No newline at end of file + return results diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 6ae704efd..1ffadf67f 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -18,12 +18,6 @@ class InstagramIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group(1) webpage = self._download_webpage(url, video_id) - video_url = self._html_search_regex( - r'', - webpage, u'thumbnail URL', fatal=False) html_title = self._html_search_regex( r'(.+?)', webpage, u'title', flags=re.DOTALL) @@ -34,9 +28,9 @@ class InstagramIE(InfoExtractor): return [{ 'id': video_id, - 'url': video_url, + 'url': self._og_search_video_url(webpage), 'ext': ext, 'title': title, - 'thumbnail': thumbnail_url, + 'thumbnail': self._og_search_thumbnail(webpage), 'uploader_id' : uploader_id }] diff --git a/youtube_dl/extractor/keek.py b/youtube_dl/extractor/keek.py index 72ad6a3d0..dda78743d 100644 --- a/youtube_dl/extractor/keek.py +++ b/youtube_dl/extractor/keek.py @@ -24,8 +24,7 @@ class KeekIE(InfoExtractor): thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id webpage = self._download_webpage(url, video_id) - video_title = self._html_search_regex(r'[\S\s]+?

(?P.+?)

', webpage, u'uploader', fatal=False) diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index cf8a2c931..dd062a14e 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -33,11 +33,9 @@ class LiveLeakIE(InfoExtractor): video_url = self._search_regex(r'file: "(.*?)",', webpage, u'video URL') - video_title = self._html_search_regex(r'', webpage, u'uploader', fatal=False) diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py index 122b7dd26..0f178905b 100644 --- a/youtube_dl/extractor/nba.py +++ b/youtube_dl/extractor/nba.py @@ -30,8 +30,7 @@ class NBAIE(InfoExtractor): video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4' shortened_video_id = video_id.rpartition('/')[2] - title = self._html_search_regex(r'Date: (.*?)', webpage, 'upload_date', fatal=False) diff --git a/youtube_dl/extractor/statigram.py b/youtube_dl/extractor/statigram.py index ae9a63e8b..b8e6b3bf9 100644 --- a/youtube_dl/extractor/statigram.py +++ b/youtube_dl/extractor/statigram.py @@ -18,12 +18,6 @@ class StatigramIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group(1) webpage = self._download_webpage(url, video_id) - video_url = self._html_search_regex( - r'', - webpage, u'video URL') - thumbnail_url = self._html_search_regex( - r'', - webpage, u'thumbnail URL', fatal=False) html_title = self._html_search_regex( r'(.+?)', webpage, u'title') @@ -34,9 +28,9 @@ class StatigramIE(InfoExtractor): return [{ 'id': video_id, - 'url': video_url, + 'url': self._og_search_video_url(webpage), 'ext': ext, 'title': title, - 'thumbnail': thumbnail_url, + 'thumbnail': self._og_search_thumbnail(webpage), 'uploader_id' : uploader_id }] diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index 1dd5e1b68..ec92e589a 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -30,15 +30,6 @@ class TeamcocoIE(InfoExtractor): self.report_extraction(video_id) - video_title = self._html_search_regex(r'', webpage, 'Views Count') - description = self._search_regex(r'', - webpage, 'video description') - video_id = self._search_regex(r'', - webpage, 'Video id').split('=')[1] - + video_id = self._og_search_property('video', webpage, 'Video id').split('=')[1] + info_url = "http://www.traileraddict.com/fvar.php?tid=%s" %(str(video_id)) info_webpage = self._download_webpage(info_url, video_id , "Downloading the info webpage") @@ -44,6 +41,6 @@ class TrailerAddictIE(InfoExtractor): 'ext' : ext, 'title' : title, 'thumbnail' : thumbnail_url, - 'description' : description, + 'description' : self._og_search_description(webpage), 'view_count' : view_count, }] diff --git a/youtube_dl/extractor/tutv.py b/youtube_dl/extractor/tutv.py index fcaa6ac01..4e404fbf5 100644 --- a/youtube_dl/extractor/tutv.py +++ b/youtube_dl/extractor/tutv.py @@ -22,8 +22,6 @@ class TutvIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - title = self._html_search_regex( - r'', webpage, u'title') internal_id = self._search_regex(r'codVideo=([0-9]+)', webpage, u'internal video ID') data_url = u'http://tu.tv/flvurl.php?codVideo=' + str(internal_id) @@ -36,6 +34,6 @@ class TutvIE(InfoExtractor): 'id': internal_id, 'url': video_url, 'ext': ext, - 'title': title, + 'title': self._og_search_title(webpage), } return [info] diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index bdd3522eb..c4ec1f06f 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -27,12 +27,6 @@ class VineIE(InfoExtractor): video_url = self._html_search_regex(r'.*?

(.+?)

', webpage, u'uploader', fatal=False, flags=re.DOTALL) @@ -40,7 +34,7 @@ class VineIE(InfoExtractor): 'id': video_id, 'url': video_url, 'ext': 'mp4', - 'title': video_title, - 'thumbnail': thumbnail, + 'title': self._og_search_title(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), 'uploader': uploader, }] From 44dbe8903580e1e62ea6f3881b2fa469853c4a83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 13 Jul 2013 11:29:08 +0200 Subject: [PATCH 2/2] Use re.DOTALL by default when searching OpenGraph properties --- youtube_dl/extractor/common.py | 2 +- youtube_dl/extractor/funnyordie.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 0a0c4047d..05b243871 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -265,7 +265,7 @@ class InfoExtractor(object): def _og_search_property(self, property, html, name=None, **kargs): if name is None: name = 'OpenGraph %s' % property - return self._html_search_regex(self._og_regex(property), html, name, **kargs) + return self._html_search_regex(self._og_regex(property), html, name, flags=re.DOTALL, **kargs) def _og_search_thumbnail(self, html, **kargs): return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs) diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index 64363dcd5..67a7e5f76 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -32,6 +32,6 @@ class FunnyOrDieIE(InfoExtractor): 'url': video_url, 'ext': 'mp4', 'title': title, - 'description': self._og_search_description(webpage, flags=re.DOTALL), + 'description': self._og_search_description(webpage), } return [info]