From db6364df21e02ac7f400a25a7aef8e4dd0824c5d Mon Sep 17 00:00:00 2001 From: Jakub Wieczorek Date: Mon, 20 Jul 2020 11:07:22 +0200 Subject: [PATCH 1/3] [polskieradio] Fix extraction for the updated article page theme --- youtube_dl/extractor/polskieradio.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/polskieradio.py b/youtube_dl/extractor/polskieradio.py index 978d6f813..6728e2f05 100644 --- a/youtube_dl/extractor/polskieradio.py +++ b/youtube_dl/extractor/polskieradio.py @@ -15,6 +15,7 @@ from ..utils import ( int_or_none, strip_or_none, unified_timestamp, + unescapeHTML, ) @@ -39,6 +40,25 @@ class PolskieRadioIE(InfoExtractor): 'thumbnail': r're:^https?://static\.prsa\.pl/images/.*\.jpg$' }, }], + }, { + 'url': 'https://www.polskieradio.pl/8/2382/Artykul/2534482,Zagarysci-Poezja-jak-spoiwo', + 'info_dict': { + 'id': '2534482', + 'title': 'Żagaryści. Poezja jak spoiwo', + 'description': 'md5:f18d95d5dcba747a09b635e21a4c0695', + }, + 'playlist': [{ + 'md5': 'd07559829f61d5a93a75755987ded760', + 'info_dict': { + 'id': '2516679', + 'ext': 'mp3', + 'title': 'md5:c6e1234e0b747ad883cb91b7ad06b98c', + 'timestamp': 1592654400, + 'upload_date': '20200620', + 'duration': 1430, + 'thumbnail': r're:^https?://static\.prsa\.pl/images/.*\.jpg$' + }, + }], }, { 'url': 'http://www.polskieradio.pl/265/5217/Artykul/1635803,Euro-2016-nie-ma-miejsca-na-blad-Polacy-graja-ze-Szwajcaria-o-cwiercfinal', 'info_dict': { @@ -78,8 +98,8 @@ class PolskieRadioIE(InfoExtractor): media_urls = set() - for data_media in re.findall(r'<[^>]+data-media=({[^>]+})', content): - media = self._parse_json(data_media, playlist_id, fatal=False) + for data_media in re.findall(r'<[^>]+data-media="?({[^>]+})"?', content): + media = self._parse_json(data_media, playlist_id, transform_source=unescapeHTML, fatal=False) if not media.get('file') or not media.get('desc'): continue media_url = self._proto_relative_url(media['file'], 'http:') @@ 
-98,6 +118,7 @@ class PolskieRadioIE(InfoExtractor): title = self._og_search_title(webpage).strip() description = strip_or_none(self._og_search_description(webpage)) + description = description.replace('\xa0', ' ') if description is not None else None return self.playlist_result(entries, playlist_id, title, description) From 88d220db23c32ef16aa52ad15210b31455cff944 Mon Sep 17 00:00:00 2001 From: Jakub Wieczorek Date: Mon, 20 Jul 2020 11:16:22 +0200 Subject: [PATCH 2/3] [polskieradio] Change the test case for playlist extraction The previous one has for some reason been removed from the Web site. --- youtube_dl/extractor/polskieradio.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/polskieradio.py b/youtube_dl/extractor/polskieradio.py index 6728e2f05..ec03366e2 100644 --- a/youtube_dl/extractor/polskieradio.py +++ b/youtube_dl/extractor/polskieradio.py @@ -60,13 +60,13 @@ class PolskieRadioIE(InfoExtractor): }, }], }, { - 'url': 'http://www.polskieradio.pl/265/5217/Artykul/1635803,Euro-2016-nie-ma-miejsca-na-blad-Polacy-graja-ze-Szwajcaria-o-cwiercfinal', + 'url': 'https://www.polskieradio.pl/8/4346/Artykul/2487823,Marek-Kondrat-czyta-Mistrza-i-Malgorzate', 'info_dict': { - 'id': '1635803', - 'title': 'Euro 2016: nie ma miejsca na błąd. Polacy grają ze Szwajcarią o ćwierćfinał', - 'description': 'md5:01cb7d0cad58664095d72b51a1ebada2', + 'id': '2487823', + 'title': 'Marek Kondrat czyta "Mistrza i Małgorzatę"', + 'description': 'md5:8422a95cc83834f2aaeff9d82e9c8f39', }, - 'playlist_mincount': 12, + 'playlist_mincount': 50, }, { 'url': 'http://polskieradio.pl/9/305/Artykul/1632955,Bardzo-popularne-slowo-remis', 'only_matching': True, From a75e8add8aecdf8a66bd7d125a11ffabe8a66e8b Mon Sep 17 00:00:00 2001 From: Jakub Wieczorek Date: Mon, 20 Jul 2020 11:27:24 +0200 Subject: [PATCH 3/3] [polskieradio] Add a test for new-style multiple broadcast playlists. 
--- youtube_dl/extractor/polskieradio.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/polskieradio.py b/youtube_dl/extractor/polskieradio.py index ec03366e2..53fe0340a 100644 --- a/youtube_dl/extractor/polskieradio.py +++ b/youtube_dl/extractor/polskieradio.py @@ -21,7 +21,7 @@ from ..utils import ( class PolskieRadioIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+/\d+/Artykul/(?P<id>[0-9]+)' - _TESTS = [{ + _TESTS = [{ # Old-style single broadcast. 'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943,Prof-Andrzej-Nowak-o-historii-nie-da-sie-myslec-beznamietnie', 'info_dict': { 'id': '1587943', @@ -40,7 +40,7 @@ class PolskieRadioIE(InfoExtractor): 'thumbnail': r're:^https?://static\.prsa\.pl/images/.*\.jpg$' }, }], - }, { + }, { # New-style single broadcast. 'url': 'https://www.polskieradio.pl/8/2382/Artykul/2534482,Zagarysci-Poezja-jak-spoiwo', 'info_dict': { 'id': '2534482', @@ -59,7 +59,7 @@ class PolskieRadioIE(InfoExtractor): 'thumbnail': r're:^https?://static\.prsa\.pl/images/.*\.jpg$' }, }], - }, { + }, { # Old-style multiple broadcast playlist. 'url': 'https://www.polskieradio.pl/8/4346/Artykul/2487823,Marek-Kondrat-czyta-Mistrza-i-Malgorzate', 'info_dict': { 'id': '2487823', @@ -67,6 +67,14 @@ class PolskieRadioIE(InfoExtractor): 'description': 'md5:8422a95cc83834f2aaeff9d82e9c8f39', }, 'playlist_mincount': 50, + }, { # New-style multiple broadcast playlist. + 'url': 'https://www.polskieradio.pl/8/4346/Artykul/2541317,Czytamy-Kalendarz-i-klepsydre-Tadeusza-Konwickiego', + 'info_dict': { + 'id': '2541317', + 'title': 'Czytamy "Kalendarz i klepsydrę" Tadeusza Konwickiego', + 'description': 'md5:0baeaa46d877f1351fb2eeed3e871f9f', + }, + 'playlist_mincount': 15, }, { 'url': 'http://polskieradio.pl/9/305/Artykul/1632955,Bardzo-popularne-slowo-remis', 'only_matching': True,