From ba17598d061bf316c449052fcabbc55e6010c83f Mon Sep 17 00:00:00 2001 From: luxcem Date: Mon, 15 Jul 2019 20:15:25 +0200 Subject: [PATCH] [arteradio] Add new extractor --- youtube_dl/extractor/arteradio.py | 135 +++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 4 + 2 files changed, 139 insertions(+) create mode 100644 youtube_dl/extractor/arteradio.py diff --git a/youtube_dl/extractor/arteradio.py b/youtube_dl/extractor/arteradio.py new file mode 100644 index 000000000..76fde70bc --- /dev/null +++ b/youtube_dl/extractor/arteradio.py @@ -0,0 +1,135 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + month_by_name, + int_or_none, + str_or_none, +) + + +class ArteRadioIE(InfoExtractor): + """ArteRadio sound extractor.""" + IE_NAME = 'arteradio' + _VALID_URL = r'https?://(?:www\.)?arteradio\.com/son/(?P\d+)/(.*)' + _CDN_URL = 'https://download.www.arte.tv/permanent/arteradio/sites/default/files/sons/' + + _TESTS = [{ + 'url': 'https://www.arteradio.com/son/616458/la_bas_si_j_y_suis_plus', + 'md5': 'ae9219bbcfbb258ab5d5ba877708e2a9', + 'info_dict': { + 'id': '616458', + 'ext': 'mp3', + 'title': 'Là-bas si j\'y suis plus | ARTE Radio', + 'upload_date': '20140911', + 'description': 'md5:863c01af898a02681a2f543d32031566', + 'vcodec': 'none', + 'duration': 917, + }, + }, { + 'url': 'https://www.arteradio.com/son/61661758/bande_annonce_beatmakers_saison_2', + 'md5': 'a4678484b374e35faf47a555860a0a4f', + 'info_dict': { + 'id': '61661758', + 'ext': 'mp3', + 'title': 'Bande-annonce Beatmakers, saison 2 | ARTE Radio', + 'upload_date': '20190627', + 'description': 'md5:d52a0fd2fcc88e2d4b1cd0f2a5092fd1', + 'vcodec': 'none', + 'duration': 56, + 'thumbnail': 'https://www.arteradio.com/sites/default/files/beatmakers_s2_1.jpg' + }, + }] + + def _extract_date(self, webpage): + # Fetching date + upload_date_str = self._html_search_regex( + r']*>Mise en ligne.+<\/h5>\s+

(.+)<\/p>', + webpage, 'upload_date_str', fatal=False, default=None) + if not upload_date_str: + return None + try: + day, month, year = upload_date_str.split(' ') + day = '{:02d}'.format(int_or_none(day)) + month = '{:02d}'.format(month_by_name(month, lang='fr')) + except (ValueError, TypeError): + return None + return ''.join((year, month, day)) + + def _extract_data_from_button(self, button): + meta_data = dict(re.findall(r'data-([-\w]+?)=\"(.+?)\"', button)) + # If no sound-href is found we cannot extract the link + try: + url = self._CDN_URL + meta_data['sound-href'] + except KeyError: + raise ExtractorError('No audio found') + return { + 'id': meta_data.get('sound-id'), + 'duration': int_or_none(meta_data.get('duration-seconds')), + 'url': url, + 'href': meta_data.get('href'), + 'position': meta_data.get('position-serie'), + 'thumbnail': meta_data.get('image-url'), + 'vcodec': 'none', + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + button = self._html_search_regex( + r']+data-sound-id=\"{}\"[^>]*)>'.format(video_id), + webpage, 'button') + audio_data = self._extract_data_from_button(button) + result = { + 'id': audio_data['id'] or video_id, + 'duration': audio_data['duration'], + 'url': audio_data['url'], + 'thumbnail': audio_data['thumbnail'], + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'upload_date': str_or_none(self._extract_date(webpage)), + } + result.update(audio_data) + return result + + +class ArteRadioSerieIE(ArteRadioIE): + """ArteRadio serie extractor.""" + IE_NAME = 'arteradio.com:playlist' + _VALID_URL = r'https?://(?:www\.)?arteradio\.com/serie/(?P.+)' + + _TESTS = [{ + 'url': 'https://www.arteradio.com/serie/crackopolis', + 'md5': '', + 'info_dict': { + 'id': 'crackopolis', + 'title': 'CRACKOPOLIS | ARTE Radio', + 'description': 'md5:1b4665891ef07bef17c98d692435d177', + }, + 'playlist_count': 16 + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + button_entries = re.findall( + r']+data-serie-url=\"/serie/{}\"[^>]*)>'.format(playlist_id), + webpage) + entries = [self._extract_data_from_button(button) for button in button_entries] + # The title for an item is not in the item meta data + # We generate the title from the item url + for index, entry in enumerate(entries): + position = entry.get('position') or index + try: + title = entry.get('href', '').split('/')[3] + except IndexError: + title = playlist_id + entry['title'] = position + '_' + title + + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) + return self.playlist_result(entries, playlist_id, title, description) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index a202a0449..8fdef738a 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -62,6 +62,10 @@ from .arte import ( ArteTVEmbedIE, ArteTVPlaylistIE, ) +from .arteradio import ( + ArteRadioIE, + ArteRadioSerieIE, +) from .asiancrush import ( AsianCrushIE, AsianCrushPlaylistIE,