From 8efa309b20ec3d03e98bd3e5fac10be471ff607a Mon Sep 17 00:00:00 2001
From: m8factorial
Date: Sun, 26 Apr 2020 19:07:01 +0200
Subject: [PATCH] [CRTVG] Add new extractor

---
 youtube_dl/extractor/crtvg.py      | 193 +++++++++++++++++++++++++++++
 youtube_dl/extractor/extractors.py |   1 +
 2 files changed, 194 insertions(+)
 create mode 100644 youtube_dl/extractor/crtvg.py

diff --git a/youtube_dl/extractor/crtvg.py b/youtube_dl/extractor/crtvg.py
new file mode 100644
index 000000000..212ab327d
--- /dev/null
+++ b/youtube_dl/extractor/crtvg.py
@@ -0,0 +1,193 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    unescapeHTML,
+    unified_strdate,
+)
+
+
+class CRTVGIE(InfoExtractor):
+    """Extract on-demand audio and video from crtvg.gal / crtvg.es pages.
+
+    The URL category selects the scraping patterns used for the metadata
+    and the media type: ``rg/...`` pages are treated as MP3 audio, every
+    other category as MP4 video delivered through an HLS playlist.
+    """
+
+    # NOTE(review): the copy of this patch that was reviewed had gone through
+    # an HTML stripper.  The group names in _VALID_URL were restored from
+    # their uses in _real_extract() ('category', 'id'); 'display_id' is a
+    # newly chosen name.  In the scraping patterns tagged "stripped" below,
+    # every lost tag survives only as the line break it left behind (written
+    # as \n); restore them from the original submission before merging.
+    _VALID_URL = r'https?://(?:www\.)?crtvg\.(?:gal|es)/(?P<category>rg/a-carta|rg/podcast|rg/destacados|tvg/a-carta|informativos|en-serie)/(?:[^/]+/)*(?P<display_id>[A-Za-z0-9-]*)-(?P<id>[0-9]{4,})?'
+    _TESTS = [{
+        'url': 'http://www.crtvg.es/tvg/a-carta/pepe-o-ingles-28-best-of',
+        'info_dict': {
+            'id': '594037',
+            'ext': 'mp4',
+            # NOTE(review): line break as found in the reviewed copy
+            'title': 'The Best of... \n(O mellor de Pepe o Inglés)',
+            'description': 'Lola conta como andan as cousas en Cruceiro un ano despois, con Pepe e Pilar por Londres e Filipa e o alcalde vivindo en Marbella e presenta o último episodio, un resume co mellor da serie, unha especie de propina como dia ela.',
+            'series': 'Pepe o inglés',
+            'release_date': '20130527',
+        },
+    }, {
+        'url': 'http://www.crtvg.es/rg/podcast/a-bola-extra-a-bola-extra-do-dia-21-04-2020-4383702',
+        'info_dict': {
+            'id': '4383702',
+            'ext': 'mp3',
+            'title': 'A bóla extra do día 21/04/2020',
+            'description': None,
+            'series': 'A bóla extra',
+            'release_date': '20200421',
+        },
+    }]
+
+    def _search_field(self, pattern, webpage, name, flags=0):
+        """Return the cleaned text of group 1 of *pattern*, or None.
+
+        Non-fatal wrapper around _html_search_regex().  A pattern without
+        any capture group is reported as "not found" rather than letting
+        the group lookup raise IndexError.
+        """
+        if re.compile(pattern, flags).groups < 1:
+            return None
+        return unescapeHTML(self._html_search_regex(
+            pattern, webpage, name, group=1, flags=flags, fatal=False))
+
+    def _search_date(self, pattern, webpage):
+        """Return the date captured by *pattern* via unified_strdate()."""
+        return unified_strdate(
+            self._search_field(pattern, webpage, 'release_date'))
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        category = mobj.group('category')
+        display_id = mobj.group('display_id')
+        # The numeric id is optional in the URL.  Reading the group directly
+        # keeps a missing id as None instead of the string 'None'.
+        media_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, media_id or display_id)
+
+        if media_id is None:
+            media_id = self._html_search_regex(
+                r'metadata\[\'ns_st_ci\'\][ ]*=[ ]*(\d{4,})',
+                webpage, 'media_id', group=1, fatal=False)
+
+        # Radio Galega in mp3, otherwise mp4
+        is_radio = 'rg' in category
+        ext = 'mp3' if is_radio else 'mp4'
+
+        # group=0: the whole match is the media URL.
+        media_url = self._html_search_regex(
+            r'https?://[^/\s"\']*flumotion\.com/videos/(?:[^/\s"\']+/)*?([A-Za-z0-9\-_]*)\.' + ext,
+            webpage, 'media_url', group=0)
+
+        if 'a-carta' in category or 'destacados' in category:
+            title = self._search_field(
+                r'\n\n(?:\s*)?([^\n\r]*)(?:\s*)?\n\n',  # stripped
+                webpage, 'title')
+            description = self._search_field(
+                r'\n(?:\s*)?([^\n\r]*)(?:\s*)?',  # stripped
+                webpage, 'description')
+            # stripped; the capture group itself was lost, so this lookup
+            # currently always yields None
+            series = self._search_field(
+                r'\n\n(?:\s*)?',
+                webpage, 'series')
+            release_date = self._search_date(
+                r'\n(?:\s*)?(?:\D*)?(\d{2}/\d{2}/\d{4})',  # stripped
+                webpage)
+
+        elif category == 'en-serie':
+            title = self._search_field(
+                r'\n(?:\s*)?\n\n(.*)?\n\n',  # stripped
+                webpage, 'title')
+            description = self._search_field(
+                r'\n(?:\s*)?\n\n(?:.*)?\n\n(?:\s*)\n\n(.*)?\n\n',  # stripped
+                webpage, 'description')
+            series = self._search_field(
+                r'\n(?:\s*)?\n\n(.*)?\n\n(?:\s*)',  # stripped
+                webpage, 'series', flags=re.S)
+            release_date = self._search_date(
+                r'metadata\[\'ns_st_ddt\'\][ ]*=[ ]*\'?(\d{4}\-\d{2}\-\d{2})',
+                webpage)
+
+        elif category == 'informativos':
+            title = self._search_field(
+                r'\n\n(.*)?\n\n',  # stripped
+                webpage, 'title')
+            description = unescapeHTML(self._html_search_meta(
+                'og:description', webpage, 'description', fatal=False))
+            series = 'Noticias de Galicia'
+            release_date = self._search_date(
+                r'\n(?:\s*)?(?:\D*)?(\d{2}/\d{2}/\d{4})',  # stripped
+                webpage)
+
+        elif category == 'rg/podcast':
+            title = self._search_field(
+                r'(.*)?',  # stripped
+                webpage, 'title')
+            description = None
+            # stripped; the capture group itself was lost, so this lookup
+            # currently always yields None
+            series = self._search_field(
+                r'\n(?:\s*)?',
+                webpage, 'series')
+            release_date = self._search_date(
+                r'(?:\D*)?(\d{2}/\d{2}/\d{4})',  # possibly stripped
+                webpage)
+
+        else:
+            # Generic fallback; not reachable with the categories currently
+            # listed in _VALID_URL.
+            title = None
+            if is_radio:
+                description = None
+            else:
+                description = unescapeHTML(self._html_search_meta(
+                    'og:description', webpage, 'description', fatal=False))
+            series = self._search_field(
+                r'<title>(.*)?',  # closing anchor possibly stripped
+                webpage, 'series')
+            release_date = None
+
+        if not title:
+            # 'title' is a mandatory field: fall back to the page metadata.
+            title = unescapeHTML(self._html_search_meta(
+                ('title', 'og:title'), webpage, 'title', fatal=False))
+
+        # Keep only the text before the first '|'.
+        if title is not None and '|' in title:
+            title = title.split('|')[0].rstrip()
+        if series is not None and '|' in series:
+            series = series.split('|')[0].rstrip()
+
+        if is_radio:
+            formats = [{
+                'format_id': 'audio',
+                'url': media_url,
+                'ext': ext,
+                'vcodec': 'none',
+                'acodec': ext,
+            }]
+        else:
+            formats = self._extract_m3u8_formats(
+                media_url + '/playlist.m3u8',
+                media_id, ext='mp4', fatal=False)
+        self._sort_formats(formats)
+
+        return {
+            'id': media_id,
+            'title': title,
+            'description': description,
+            'series': series,
+            'release_date': release_date,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index e407ab3d9..77e4a4b83 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -235,6 +235,7 @@ from .corus import CorusIE
 from .cracked import CrackedIE
 from .crackle import CrackleIE
 from .crooksandliars import CrooksAndLiarsIE
+from .crtvg import CRTVGIE
 from .crunchyroll import (
     CrunchyrollIE,
     CrunchyrollShowPlaylistIE