From af17794c654bd24bbd5f47997596430b201ea08e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 2 Oct 2015 22:29:15 +0600 Subject: [PATCH] [europa] Improve extraction --- youtube_dl/extractor/europa.py | 86 +++++++++++++++++++++++----------- 1 file changed, 58 insertions(+), 28 deletions(-) diff --git a/youtube_dl/extractor/europa.py b/youtube_dl/extractor/europa.py index c437c4886..02ba8d63c 100644 --- a/youtube_dl/extractor/europa.py +++ b/youtube_dl/extractor/europa.py @@ -2,59 +2,89 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( - compat_urlparse, + int_or_none, + orderedSet, + parse_duration, + qualities, + unified_strdate, xpath_text ) class EuropaIE(InfoExtractor): - _VALID_URL = r'https?://ec\.europa\.eu/avservices/video/player\.cfm\?(?:[^&]|&(?!ref))*ref=(?P[A-Za-z0-9]+)' - _TEST = { + _VALID_URL = r'https?://ec\.europa\.eu/avservices/video/player\.cfm\?.*?\bref=(?P[A-Za-z0-9]+)' + _TESTS = [{ 'url': 'http://ec.europa.eu/avservices/video/player.cfm?ref=I107758', - 'md5': '728cca2fd41d5aa7350cec1141fbe620', + 'md5': '574f080699ddd1e19a675b0ddf010371', 'info_dict': { 'id': 'I107758', 'ext': 'mp4', 'title': 'TRADE - Wikileaks on TTIP', 'description': 'NEW LIVE EC Midday press briefing of 11/08/2015', - 'thumbnail': 're:^http://defiris\.ec\.streamcloud\.be/findmedia/18/107758/THUMB_[0-9A-Z]+\.jpg$' + 'thumbnail': 're:^https?://.*\.jpg$', + 'upload_date': '20150811', + 'duration': 34, + 'view_count': int, + 'formats': 'mincount:3', } - } + }, { + 'url': 'http://ec.europa.eu/avservices/video/player.cfm?sitelang=en&ref=I107786', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - lang = query.get('sitelang', ['en'])[0] - playlist = self._download_xml('http://ec.europa.eu/avservices/video/player/playlist.cfm?ID=' + video_id, video_id) - videos = {} - formats = [] + playlist = self._download_xml( + 'http://ec.europa.eu/avservices/video/player/playlist.cfm?ID=%s' % video_id, video_id) - for item in playlist.findall('info/title/item'): - videos[xpath_text(item, 'lg')] = {'title': xpath_text(item, 'label').strip()} + def get_item(type_, preference): + items = {} + for item in playlist.findall('./info/%s/item' % type_): + lang, label = xpath_text(item, 'lg', default=None), xpath_text(item, 'label', default=None) + if lang and label: + items[lang] = label.strip() + for p in preference: + if items.get(p): + return items[p] - for item in playlist.findall('info/description/item'): - videos[xpath_text(item, 'lg')]['description'] = xpath_text(item, 'label').strip() + query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + preferred_lang = query.get('sitelang', ('en', ))[0] - for item in playlist.findall('files/file'): - lg = xpath_text(item, 'lg') - vid = videos[lg] - vid['format_note'] = xpath_text(item, 'lglabel') - vid['url'] = xpath_text(item, 'url') + preferred_langs = orderedSet((preferred_lang, 'en', 'int')) - if lg == lang: - vid['language_preference'] = 10 + title = get_item('title', preferred_langs) or video_id + description = get_item('description', preferred_langs) + thumbnmail = xpath_text(playlist, './info/thumburl', 'thumbnail') + upload_date = unified_strdate(xpath_text(playlist, './info/date', 'upload date')) + duration = parse_duration(xpath_text(playlist, './info/duration', 'duration')) + view_count = int_or_none(xpath_text(playlist,'./info/views', 'views')) - formats.append(vid) + language_preference = qualities(preferred_langs[::-1]) - formats.reverse() - def_video = videos.get(lang, videos['int']) + formats = [] + for file_ in playlist.findall('./files/file'): + video_url = xpath_text(file_, './url') + if not video_url: + continue + lang = xpath_text(file_, './lg') + formats.append({ + 'url': video_url, + 'format_id': lang, + 'format_note': xpath_text(file_, './lglabel'), + 'language_preference': language_preference(lang) + }) + self._sort_formats(formats) return { 'id': video_id, - 'title': def_video['title'], - 'description': def_video['description'], - 'thumbnail': xpath_text(playlist, 'info/thumburl', 'thumburl'), + 'title': title, + 'description': description, + 'thumbnail': thumbnmail, + 'upload_date': upload_date, + 'duration': duration, + 'view_count': view_count, 'formats': formats }