[arteradio] Add new extractor

archive/recovered-github-prs
luxcem 5 years ago
parent fffc618c51
commit ba17598d06
No known key found for this signature in database
GPG Key ID: DA38C8196CD598A1

@ -0,0 +1,135 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
month_by_name,
int_or_none,
str_or_none,
)
class ArteRadioIE(InfoExtractor):
"""ArteRadio sound extractor."""
IE_NAME = 'arteradio'
_VALID_URL = r'https?://(?:www\.)?arteradio\.com/son/(?P<id>\d+)/(.*)'
_CDN_URL = 'https://download.www.arte.tv/permanent/arteradio/sites/default/files/sons/'
_TESTS = [{
'url': 'https://www.arteradio.com/son/616458/la_bas_si_j_y_suis_plus',
'md5': 'ae9219bbcfbb258ab5d5ba877708e2a9',
'info_dict': {
'id': '616458',
'ext': 'mp3',
'title': 'Là-bas si j\'y suis plus | ARTE Radio',
'upload_date': '20140911',
'description': 'md5:863c01af898a02681a2f543d32031566',
'vcodec': 'none',
'duration': 917,
},
}, {
'url': 'https://www.arteradio.com/son/61661758/bande_annonce_beatmakers_saison_2',
'md5': 'a4678484b374e35faf47a555860a0a4f',
'info_dict': {
'id': '61661758',
'ext': 'mp3',
'title': 'Bande-annonce Beatmakers, saison 2 | ARTE Radio',
'upload_date': '20190627',
'description': 'md5:d52a0fd2fcc88e2d4b1cd0f2a5092fd1',
'vcodec': 'none',
'duration': 56,
'thumbnail': 'https://www.arteradio.com/sites/default/files/beatmakers_s2_1.jpg'
},
}]
def _extract_date(self, webpage):
# Fetching date
upload_date_str = self._html_search_regex(
r'<h5[^>]*>Mise en ligne.+<\/h5>\s+<p>(.+)<\/p>',
webpage, 'upload_date_str', fatal=False, default=None)
if not upload_date_str:
return None
try:
day, month, year = upload_date_str.split(' ')
day = '{:02d}'.format(int_or_none(day))
month = '{:02d}'.format(month_by_name(month, lang='fr'))
except (ValueError, TypeError):
return None
return ''.join((year, month, day))
def _extract_data_from_button(self, button):
meta_data = dict(re.findall(r'data-([-\w]+?)=\"(.+?)\"', button))
# If no sound-href is found we cannot extract the link
try:
url = self._CDN_URL + meta_data['sound-href']
except KeyError:
raise ExtractorError('No audio found')
return {
'id': meta_data.get('sound-id'),
'duration': int_or_none(meta_data.get('duration-seconds')),
'url': url,
'href': meta_data.get('href'),
'position': meta_data.get('position-serie'),
'thumbnail': meta_data.get('image-url'),
'vcodec': 'none',
}
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
button = self._html_search_regex(
r'<button([^>]+data-sound-id=\"{}\"[^>]*)>'.format(video_id),
webpage, 'button')
audio_data = self._extract_data_from_button(button)
result = {
'id': audio_data['id'] or video_id,
'duration': audio_data['duration'],
'url': audio_data['url'],
'thumbnail': audio_data['thumbnail'],
'title': self._og_search_title(webpage),
'description': self._og_search_description(webpage),
'upload_date': str_or_none(self._extract_date(webpage)),
}
result.update(audio_data)
return result
class ArteRadioSerieIE(ArteRadioIE):
"""ArteRadio serie extractor."""
IE_NAME = 'arteradio.com:playlist'
_VALID_URL = r'https?://(?:www\.)?arteradio\.com/serie/(?P<id>.+)'
_TESTS = [{
'url': 'https://www.arteradio.com/serie/crackopolis',
'md5': '',
'info_dict': {
'id': 'crackopolis',
'title': 'CRACKOPOLIS | ARTE Radio',
'description': 'md5:1b4665891ef07bef17c98d692435d177',
},
'playlist_count': 16
}]
def _real_extract(self, url):
playlist_id = self._match_id(url)
webpage = self._download_webpage(url, playlist_id)
button_entries = re.findall(
r'<button([^>]+data-serie-url=\"/serie/{}\"[^>]*)>'.format(playlist_id),
webpage)
entries = [self._extract_data_from_button(button) for button in button_entries]
# The title for an item is not in the item meta data
# We generate the title from the item url
for index, entry in enumerate(entries):
position = entry.get('position') or index
try:
title = entry.get('href', '').split('/')[3]
except IndexError:
title = playlist_id
entry['title'] = position + '_' + title
title = self._og_search_title(webpage)
description = self._og_search_description(webpage)
return self.playlist_result(entries, playlist_id, title, description)

@ -62,6 +62,10 @@ from .arte import (
ArteTVEmbedIE,
ArteTVPlaylistIE,
)
from .arteradio import (
ArteRadioIE,
ArteRadioSerieIE,
)
from .asiancrush import (
AsianCrushIE,
AsianCrushPlaylistIE,

Loading…
Cancel
Save