[dispeak] Add new extractor

Both GDCVault and GPUTechConf use the services of DigitalSpeaking.
8 years ago · ec59d657e7
parent 99ef96f84c
commit ec59d657e7
4 changed files with 123 additions and 99 deletions
--- a/youtube_dl/extractor/dispeak.py
+++ b/youtube_dl/extractor/dispeak.py
@ -0,0 +1,111 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_duration,
+    remove_end,
+    xpath_element,
+    xpath_text,
+)
+
+
+class DigitalSpeakingIE(InfoExtractor):
+    """Extractor for DigitalSpeaking XML video descriptions.
+
+    Handles the ``.../xml/<id>.xml`` documents served from evt.dispeak.com.
+    Each document has a ``metadata`` element that describes either HTTP MP4
+    renditions (``MBRVideos``) or RTMP/FLV streams (``akamaiHost`` plus
+    ``audios``/``slideVideo``/``speakerVideo``).
+    """
+
+    # Dots are escaped so they only match literal dots. Both http and https
+    # are accepted. The path group is non-capturing because only the named
+    # ``id`` group is used.
+    _VALID_URL = r'https?://evt\.dispeak\.com/(?:[^/]+/)+xml/(?P<id>[^.]+)\.xml'
+
+    _TEST = {
+        # From http://evt.dispeak.com/ubm/gdc/sf16/xml/840376_BQRC.xml
+        'url': 'http://evt.dispeak.com/ubm/gdc/sf16/xml/840376_BQRC.xml',
+        'md5': 'a8efb6c31ed06ca8739294960b2dbabd',
+        'info_dict': {
+            'id': '840376_BQRC',
+            'ext': 'mp4',
+            'title': 'Tenacious Design and The Interface of \'Destiny\'',
+        },
+    }
+
+    def _parse_mp4(self, metadata):
+        """Build HTTP MP4 formats from the MBRVideo entries of ``metadata``.
+
+        Returns a list of format dicts (``url`` and ``vbr``), or None when
+        the document has no ``MBRVideos/MBRVideo`` element, so the caller can
+        fall back to the FLV streams.
+        """
+        video_formats = []
+        video_root = None
+
+        # Preferred source of the root URL: scheme and host of <mp4video>.
+        mp4_video = xpath_text(metadata, './mp4video', default=None)
+        if mp4_video is not None:
+            mobj = re.match(r'(?P<root>https?://.*?/).*', mp4_video)
+            # <mp4video> may not be an absolute URL; try the next source
+            # instead of failing on a missing match.
+            if mobj:
+                video_root = mobj.group('root')
+        if video_root is None:
+            http_host = xpath_text(metadata, 'httpHost', default=None)
+            if http_host:
+                video_root = 'http://%s/' % http_host
+        if video_root is None:
+            # Hard-coded in http://evt.dispeak.com/ubm/gdc/sf16/custom/player2.js
+            # Works for GPUTechConf, too
+            video_root = 'http://s3-2u.digitallyspeaking.com/'
+
+        formats = metadata.findall('./MBRVideos/MBRVideo')
+        if not formats:
+            return None
+        for a_format in formats:
+            stream_name = xpath_text(a_format, 'streamName', fatal=True)
+            # Strip the "mp4:" prefix when present; a stream name without
+            # the prefix is used unchanged.
+            mobj = re.match(r'mp4\:(?P<path>.*)', stream_name)
+            video_path = mobj.group('path') if mobj else stream_name
+            video_formats.append({
+                'url': video_root + video_path,
+                'vbr': int_or_none(xpath_text(a_format, 'bitrate')),
+            })
+        return video_formats
+
+    def _parse_flv(self, metadata):
+        """Build RTMP/FLV formats from ``metadata``.
+
+        Returns one audio-only format per child of ``audios`` that has a
+        ``url`` attribute, followed by the slide deck video and the speaker
+        video. ``akamaiHost``, ``slideVideo`` and ``speakerVideo`` are looked
+        up with ``fatal=True``.
+        """
+        akamai_url = xpath_text(metadata, './akamaiHost', fatal=True)
+        # Every stream uses the same RTMP URL and differs only in play path.
+        rtmp_url = 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url
+
+        formats = []
+        audios = metadata.find('./audios')
+        if audios is not None:
+            for audio in audios:
+                audio_url = audio.get('url')
+                if not audio_url:
+                    # Without a url attribute there is no play path.
+                    continue
+                formats.append({
+                    'url': rtmp_url,
+                    'play_path': remove_end(audio_url, '.flv'),
+                    'ext': 'flv',
+                    'vcodec': 'none',
+                    'format_id': audio.get('code'),
+                })
+        slide_video_path = xpath_text(metadata, './slideVideo', fatal=True)
+        formats.append({
+            'url': rtmp_url,
+            'play_path': remove_end(slide_video_path, '.flv'),
+            'ext': 'flv',
+            'format_note': 'slide deck video',
+            'quality': -2,
+            'preference': -2,
+            'format_id': 'slides',
+        })
+        speaker_video_path = xpath_text(metadata, './speakerVideo', fatal=True)
+        formats.append({
+            'url': rtmp_url,
+            'play_path': remove_end(speaker_video_path, '.flv'),
+            'ext': 'flv',
+            'format_note': 'speaker video',
+            'quality': -1,
+            'preference': -1,
+            'format_id': 'speaker',
+        })
+        return formats
+
+    def _real_extract(self, url):
+        """Download the XML description at ``url`` and return the info dict."""
+        video_id = self._match_id(url)
+
+        xml_description = self._download_xml(url, video_id)
+        # A document without <metadata> cannot be processed; report it here
+        # rather than passing None to the parsers below.
+        metadata = xpath_element(xml_description, 'metadata', fatal=True)
+
+        # Use the HTTP MP4 renditions when present, else the RTMP streams.
+        video_formats = self._parse_mp4(metadata)
+        if video_formats is None:
+            video_formats = self._parse_flv(metadata)
+        # The GPUTechConf code replaced by this extractor sorted its formats.
+        self._sort_formats(video_formats)
+
+        return {
+            'id': video_id,
+            'formats': video_formats,
+            'title': xpath_text(metadata, 'title', fatal=True),
+            'duration': parse_duration(xpath_text(metadata, 'endTime')),
+            'creator': xpath_text(metadata, 'speaker'),
+        }
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -197,6 +197,7 @@ from .dump import DumpIE
 from .dumpert import DumpertIE
 from .defense import DefenseGouvFrIE
 from .discovery import DiscoveryIE
+from .dispeak import DigitalSpeakingIE
 from .dropbox import DropboxIE
 from .dw import (
    DWIE,
--- a/youtube_dl/extractor/gdcvault.py
+++ b/youtube_dl/extractor/gdcvault.py
@ -4,7 +4,6 @@ import re

 from .common import InfoExtractor
 from ..utils import (
-    remove_end,
    HEADRequest,
    sanitized_Request,
    urlencode_postdata,
@ -64,66 +63,6 @@ class GDCVaultIE(InfoExtractor):
        },
    ]

-    def _parse_mp4(self, xml_description):
-        video_formats = []
-        video_root = None
-
-        mp4_video = xml_description.find('./metadata/mp4video')
-        if mp4_video is not None:
-            mobj = re.match(r'(?P<root>https?://.*?/).*', mp4_video.text)
-            video_root = mobj.group('root')
-        if video_root is None:
-            # Hard-coded in http://evt.dispeak.com/ubm/gdc/sf16/custom/player2.js
-            video_root = 'http://s3-2u.digitallyspeaking.com/'
-
-        formats = xml_description.findall('./metadata/MBRVideos/MBRVideo')
-        if not formats:
-            return None
-        for format in formats:
-            mobj = re.match(r'mp4\:(?P<path>.*)', format.find('streamName').text)
-            url = video_root + mobj.group('path')
-            vbr = format.find('bitrate').text
-            video_formats.append({
-                'url': url,
-                'vbr': int(vbr),
-            })
-        return video_formats
-
-    def _parse_flv(self, xml_description):
-        formats = []
-        akamai_url = xml_description.find('./metadata/akamaiHost').text
-        audios = xml_description.find('./metadata/audios')
-        if audios is not None:
-            for audio in audios:
-                formats.append({
-                    'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
-                    'play_path': remove_end(audio.get('url'), '.flv'),
-                    'ext': 'flv',
-                    'vcodec': 'none',
-                    'format_id': audio.get('code'),
-                })
-        slide_video_path = xml_description.find('./metadata/slideVideo').text
-        formats.append({
-            'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
-            'play_path': remove_end(slide_video_path, '.flv'),
-            'ext': 'flv',
-            'format_note': 'slide deck video',
-            'quality': -2,
-            'preference': -2,
-            'format_id': 'slides',
-        })
-        speaker_video_path = xml_description.find('./metadata/speakerVideo').text
-        formats.append({
-            'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
-            'play_path': remove_end(speaker_video_path, '.flv'),
-            'ext': 'flv',
-            'format_note': 'speaker video',
-            'quality': -1,
-            'preference': -1,
-            'format_id': 'speaker',
-        })
-        return formats
-
    def _login(self, webpage_url, display_id):
        (username, password) = self._get_login_info()
        if username is None or password is None:
@ -199,17 +138,10 @@ class GDCVaultIE(InfoExtractor):
                r'<iframe src=".*?\?xmlURL=xml/(?P<xml_file>.+?\.xml).*?".*?</iframe>',
                start_page, 'xml filename')

-        xml_description = self._download_xml(
-            '%s/xml/%s' % (xml_root, xml_name), display_id)
-
-        video_title = xml_description.find('./metadata/title').text
-        video_formats = self._parse_mp4(xml_description)
-        if video_formats is None:
-            video_formats = self._parse_flv(xml_description)
-
        return {
+            '_type': 'url_transparent',
            'id': video_id,
            'display_id': display_id,
-            'title': video_title,
-            'formats': video_formats,
+            'url': '%s/xml/%s' % (xml_root, xml_name),
+            'ie': 'DigitalSpeaking',
        }
--- a/youtube_dl/extractor/gputechconf.py
+++ b/youtube_dl/extractor/gputechconf.py
@ -2,12 +2,6 @@
 from __future__ import unicode_literals

 from .common import InfoExtractor
-from ..utils import (
-    xpath_element,
-    xpath_text,
-    int_or_none,
-    parse_duration,
-)


 class GPUTechConfIE(InfoExtractor):
@ -27,29 +21,15 @@ class GPUTechConfIE(InfoExtractor):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

-        root_path = self._search_regex(r'var\s+rootPath\s*=\s*"([^"]+)', webpage, 'root path', 'http://evt.dispeak.com/nvidia/events/gtc15/')
-        xml_file_id = self._search_regex(r'var\s+xmlFileId\s*=\s*"([^"]+)', webpage, 'xml file id')
-
-        doc = self._download_xml('%sxml/%s.xml' % (root_path, xml_file_id), video_id)
-
-        metadata = xpath_element(doc, 'metadata')
-        http_host = xpath_text(metadata, 'httpHost', 'http host', True)
-        mbr_videos = xpath_element(metadata, 'MBRVideos')
-
-        formats = []
-        for mbr_video in mbr_videos.findall('MBRVideo'):
-            stream_name = xpath_text(mbr_video, 'streamName')
-            if stream_name:
-                formats.append({
-                    'url': 'http://%s/%s' % (http_host, stream_name.replace('mp4:', '')),
-                    'tbr': int_or_none(xpath_text(mbr_video, 'bitrate')),
-                })
-        self._sort_formats(formats)
+        root_path = self._search_regex(
+            r'var\s+rootPath\s*=\s*"([^"]+)', webpage, 'root path',
+            default='http://evt.dispeak.com/nvidia/events/gtc15/')
+        xml_file_id = self._search_regex(
+            r'var\s+xmlFileId\s*=\s*"([^"]+)', webpage, 'xml file id')

        return {
+            '_type': 'url_transparent',
            'id': video_id,
-            'title': xpath_text(metadata, 'title'),
-            'duration': parse_duration(xpath_text(metadata, 'endTime')),
-            'creator': xpath_text(metadata, 'speaker'),
-            'formats': formats,
+            'url': '%sxml/%s.xml' % (root_path, xml_file_id),
+            'ie': 'DigitalSpeaking',
        }