From 1ba2d65ffd2c5b2a82547471bc72a6efdc848a7a Mon Sep 17 00:00:00 2001 From: Gilles Pietri Date: Wed, 23 Sep 2020 23:09:00 +0200 Subject: [PATCH 1/8] [bandcamp] fix regexp for JSON matching on bandcamp --- youtube_dl/extractor/bandcamp.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index f14b407dc..ad1812320 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -91,10 +91,11 @@ class BandcampIE(InfoExtractor): duration = None formats = [] - track_info = self._parse_json( - self._search_regex( - r'trackinfo\s*:\s*\[\s*({.+?})\s*\]\s*,\s*?\n', - webpage, 'track info', default='{}'), title) + trackinfo_block = self._search_regex( + r'trackinfo":\[\s*({.+?})\s*\],"', + webpage, 'track info', default='{}') + quoted_json = trackinfo_block.replace('"', '"') + track_info = self._parse_json(quoted_json, title) if track_info: file_ = track_info.get('file') if isinstance(file_, dict): @@ -117,7 +118,7 @@ class BandcampIE(InfoExtractor): def extract(key): return self._search_regex( - r'\b%s\s*["\']?\s*:\s*(["\'])(?P(?:(?!\1).)+)\1' % key, + r',"%s":(")(?P(?:(?!").)+)"' % key, webpage, key, default=None, group='value') artist = extract('artist') From c0acb120831d18d177ec013a8334370c8c10cc0e Mon Sep 17 00:00:00 2001 From: Gilou Date: Sat, 26 Sep 2020 17:34:35 +0200 Subject: [PATCH 2/8] [bandcamp] use unescapeHTML instead of a simple replace of quotes --- youtube_dl/extractor/bandcamp.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index ad1812320..55d110e28 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -92,10 +92,10 @@ class BandcampIE(InfoExtractor): formats = [] trackinfo_block = self._search_regex( - r'trackinfo":\[\s*({.+?})\s*\],"', + r'trackinfo(?:["\']|"):\[\s*({.+?})\s*\],(?:["\']|")', webpage, 'track 
info', default='{}') - quoted_json = trackinfo_block.replace('"', '"') - track_info = self._parse_json(quoted_json, title) + unescaped_json = unescapeHTML(trackinfo_block) + track_info = self._parse_json(unescaped_json, title) if track_info: file_ = track_info.get('file') if isinstance(file_, dict): @@ -118,7 +118,7 @@ class BandcampIE(InfoExtractor): def extract(key): return self._search_regex( - r',"%s":(")(?P(?:(?!").)+)"' % key, + r',(["\']|")%s\1:\1(?P(?:(?!\1).)+)\1' % key, webpage, key, default=None, group='value') artist = extract('artist') From d5a55b1725ff4182664245e9aa9c57817f7c012b Mon Sep 17 00:00:00 2001 From: Gilou Date: Sun, 27 Sep 2020 14:51:42 +0200 Subject: [PATCH 3/8] [bandcamp] match album titles inside the new JSON data block, and unescape the title properly --- youtube_dl/extractor/bandcamp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 55d110e28..f036a89eb 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -316,10 +316,10 @@ class BandcampAlbumIE(InfoExtractor): if self._html_search_meta('duration', elem_content, default=None)] title = self._html_search_regex( - r'album_title\s*:\s*"((?:\\.|[^"\\])+?)"', + r'album_title\s*(?:"|["\']):\s*(?:"|["\'])((?:\\.|[^"\\])+?)(?:"|["\'])', webpage, 'title', fatal=False) if title: - title = title.replace(r'\"', '"') + title = unescapeHTML(title) return { '_type': 'playlist', 'uploader_id': uploader_id, From 4014677cb337dc1886a74dc20e7d24c18deaf911 Mon Sep 17 00:00:00 2001 From: Gilou Date: Sun, 27 Sep 2020 15:11:08 +0200 Subject: [PATCH 4/8] [bandcamp] fix the freeDownloadPage JSON lookup, and use the id from the URL to match the tracks --- youtube_dl/extractor/bandcamp.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index f036a89eb..eccb867a0 100644 --- 
a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -128,12 +128,12 @@ class BandcampIE(InfoExtractor): release_date = unified_strdate(extract('album_release_date')) download_link = self._search_regex( - r'freeDownloadPage\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, + r'freeDownloadPage(?:["\']|"):\s*(["\']|")(?P(?:(?!\1).)+)\1', webpage, 'download link', default=None, group='url') if download_link: track_id = self._search_regex( - r'(?ms)var TralbumData = .*?[{,]\s*id: (?P\d+),?$', - webpage, 'track id') + r'\?id=(?P\d+)&', + download_link, 'track id') download_webpage = self._download_webpage( download_link, track_id, 'Downloading free downloads page') From 20cab1e2ca4fcff1925c1223705f5e9d8249d49a Mon Sep 17 00:00:00 2001 From: Gilou Date: Sun, 27 Sep 2020 15:52:55 +0200 Subject: [PATCH 5/8] [bandcamp] update youtube-dl test song information to match title as artist - track, and add missing keys from info_dict --- youtube_dl/extractor/bandcamp.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index eccb867a0..3d32b1e0f 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -33,8 +33,11 @@ class BandcampIE(InfoExtractor): 'info_dict': { 'id': '1812978515', 'ext': 'mp3', - 'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", + 'title': "youtube-dl \\ - youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", 'duration': 9.8485, + 'uploader': 'youtube-dl \\', + 'timestamp': 1354224127, + 'upload_date': '20121129', }, '_skip': 'There is a limit of 200 free downloads / month for the test song' }, { From a4e1c3e3712a55a78f48c71dd82ab73fa7fc7375 Mon Sep 17 00:00:00 2001 From: Gilou Date: Mon, 28 Sep 2020 19:42:56 +0200 Subject: [PATCH 6/8] [bandcamp] fix test song uploader name, cleanup remaining " and \ in data, including album titles --- youtube_dl/extractor/bandcamp.py | 26 
+++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 3d32b1e0f..3405b570a 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -33,9 +33,9 @@ class BandcampIE(InfoExtractor): 'info_dict': { 'id': '1812978515', 'ext': 'mp3', - 'title': "youtube-dl \\ - youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", + 'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", 'duration': 9.8485, - 'uploader': 'youtube-dl \\', + 'uploader': "youtube-dl \"'/\\\u00e4\u21ad", 'timestamp': 1354224127, 'upload_date': '20121129', }, @@ -43,7 +43,7 @@ class BandcampIE(InfoExtractor): }, { # free download 'url': 'http://benprunty.bandcamp.com/track/lanius-battle', - 'md5': '853e35bf34aa1d6fe2615ae612564b36', + 'md5': '5d92af55811e47f38962a54c30b07ef0', 'info_dict': { 'id': '2650410135', 'ext': 'aiff', @@ -94,11 +94,12 @@ class BandcampIE(InfoExtractor): duration = None formats = [] - trackinfo_block = self._search_regex( + trackinfo_block = self._html_search_regex( r'trackinfo(?:["\']|"):\[\s*({.+?})\s*\],(?:["\']|")', webpage, 'track info', default='{}') - unescaped_json = unescapeHTML(trackinfo_block) - track_info = self._parse_json(unescaped_json, title) + + track_info = self._parse_json(trackinfo_block, title) + if track_info: file_ = track_info.get('file') if isinstance(file_, dict): @@ -120,9 +121,10 @@ class BandcampIE(InfoExtractor): duration = float_or_none(track_info.get('duration')) def extract(key): - return self._search_regex( - r',(["\']|")%s\1:\1(?P(?:(?!\1).)+)\1' % key, + data = self._html_search_regex( + r',(["\']|")%s\1:\1(?P(?:\\\1|((?!\1).))+)\1' % key, webpage, key, default=None, group='value') + return data.replace(r'\"', '"').replace('\\\\', '\\') if data else data artist = extract('artist') album = extract('album_title') @@ -319,10 +321,12 @@ 
class BandcampAlbumIE(InfoExtractor): if self._html_search_meta('duration', elem_content, default=None)] title = self._html_search_regex( - r'album_title\s*(?:"|["\']):\s*(?:"|["\'])((?:\\.|[^"\\])+?)(?:"|["\'])', - webpage, 'title', fatal=False) + r'album_title\s*(?:"|["\']):\s*("|["\'])(?P(?:\\\1|((?!\1).))+)\1', + webpage, 'title', fatal=False, group='album') + if title: - title = unescapeHTML(title) + title = title.replace(r'\"', '"') + return { '_type': 'playlist', 'uploader_id': uploader_id, From 7117b849c1202321bff4d4995f92a3626cfdd20c Mon Sep 17 00:00:00 2001 From: Gilou Date: Tue, 29 Sep 2020 12:09:55 +0200 Subject: [PATCH 7/8] [bandcamp] Revert test song title, and extract title generally (which may fail, as the other title json values might come up), instead of out of trackinfo, as bandcamp prefixes it with artist - --- youtube_dl/extractor/bandcamp.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 3405b570a..04b8aa80f 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -33,7 +33,7 @@ class BandcampIE(InfoExtractor): 'info_dict': { 'id': '1812978515', 'ext': 'mp3', - 'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", + 'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", 'duration': 9.8485, 'uploader': "youtube-dl \"'/\\\u00e4\u21ad", 'timestamp': 1354224127, @@ -99,7 +99,6 @@ class BandcampIE(InfoExtractor): webpage, 'track info', default='{}') track_info = self._parse_json(trackinfo_block, title) - if track_info: file_ = track_info.get('file') if isinstance(file_, dict): @@ -115,7 +114,7 @@ class BandcampIE(InfoExtractor): 'acodec': ext, 'abr': int_or_none(abr_str), }) - track = track_info.get('title') + track_id = str_or_none(track_info.get('track_id') or track_info.get('id')) track_number = 
int_or_none(track_info.get('track_num')) duration = float_or_none(track_info.get('duration')) @@ -126,6 +125,7 @@ class BandcampIE(InfoExtractor): webpage, key, default=None, group='value') return data.replace(r'\"', '"').replace('\\\\', '\\') if data else data + track = extract('title') artist = extract('artist') album = extract('album_title') timestamp = unified_timestamp( From 955906045fdd24b21a9b6edd8c410d36394f7241 Mon Sep 17 00:00:00 2001 From: Gilou Date: Tue, 29 Sep 2020 14:15:53 +0200 Subject: [PATCH 8/8] [bandcamp] alternative approach using JSON data elements parsing to fetch the data --- youtube_dl/extractor/bandcamp.py | 45 +++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 04b8aa80f..073409bd5 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -93,12 +93,30 @@ class BandcampIE(InfoExtractor): track_number = None duration = None + scriptdatablocks = re.findall(r'