From 8df0c2c7a598fa429df55b0e7d826d1daf502b36 Mon Sep 17 00:00:00 2001 From: TinyToweringTree <54483833+TinyToweringTree@users.noreply.github.com> Date: Fri, 24 Jan 2020 15:03:48 +0100 Subject: [PATCH 1/4] [archiveorg] Fix extraction (closes #21330, closes #23586, closes #23700) --- youtube_dl/extractor/archiveorg.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py index c79c58e82..a65fdd7d0 100644 --- a/youtube_dl/extractor/archiveorg.py +++ b/youtube_dl/extractor/archiveorg.py @@ -40,9 +40,12 @@ class ArchiveOrgIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage( 'http://archive.org/embed/' + video_id, video_id) + input_element_with_playlist = self._search_regex( + r'(<\s*input.*\s*class\s*=\s*[\'"].*\s*js-play8-playlist\s*.*[\'"]\s*.*>)', + webpage, 'jwplayer playlist') jwplayer_playlist = self._parse_json(self._search_regex( - r"(?s)Play\('[^']+'\s*,\s*(\[.+\])\s*,\s*{.*?}\)", - webpage, 'jwplayer playlist'), video_id) + r'.*\s+value\s*=\s*[\'"](.+)[\'"][\s/]', + input_element_with_playlist, 'playlist data'), video_id) info = self._parse_jwplayer_data( {'playlist': jwplayer_playlist}, video_id, base_url=url) @@ -52,7 +55,7 @@ class ArchiveOrgIE(InfoExtractor): metadata = self._download_json( 'http://archive.org/details/' + video_id, video_id, query={ 'output': 'json', - })['metadata'] + }).get('metadata', {}) info.update({ 'title': get_optional(metadata, 'title') or info.get('title'), 'description': clean_html(get_optional(metadata, 'description')), From e910f498d339500f9596d11590ace4cb235d8409 Mon Sep 17 00:00:00 2001 From: TinyToweringTree <54483833+TinyToweringTree@users.noreply.github.com> Date: Wed, 19 Feb 2020 22:04:47 +0100 Subject: [PATCH 2/4] [archiveorg] Use extract_attributes() --- youtube_dl/extractor/archiveorg.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/archiveorg.py 
b/youtube_dl/extractor/archiveorg.py index a65fdd7d0..909dc0aaf 100644 --- a/youtube_dl/extractor/archiveorg.py +++ b/youtube_dl/extractor/archiveorg.py @@ -2,8 +2,9 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - unified_strdate, clean_html, + extract_attributes, + unified_strdate, ) @@ -43,9 +44,8 @@ class ArchiveOrgIE(InfoExtractor): input_element_with_playlist = self._search_regex( r'(<\s*input.*\s*class\s*=\s*[\'"].*\s*js-play8-playlist\s*.*[\'"]\s*.*>)', webpage, 'jwplayer playlist') - jwplayer_playlist = self._parse_json(self._search_regex( - r'.*\s+value\s*=\s*[\'"](.+)[\'"][\s/]', - input_element_with_playlist, 'playlist data'), video_id) + jwplayer_playlist = self._parse_json(extract_attributes( + input_element_with_playlist)['value'], video_id) info = self._parse_jwplayer_data( {'playlist': jwplayer_playlist}, video_id, base_url=url) From b98d1c0d5a53e21659a81b1635877e731a3bfbc0 Mon Sep 17 00:00:00 2001 From: TinyToweringTree <54483833+TinyToweringTree@users.noreply.github.com> Date: Wed, 19 Feb 2020 22:28:24 +0100 Subject: [PATCH 3/4] [archiveorg] Use and fix get_element_by_class() Use get_element_by_class() from utils to get rid of yet another regex. This function used to return only the content of the element, and not the element itself, including its tag and attributes. The whole group of get_element_by_X() functions is a bit of a misnomer, as they all return the *content* of the element and not the element itself. All these functions can now return the whole element when their `include_tag` parameter is set to `True`. By default it is `False` so no other code will be affected by this change. Tests have been added to test/test_utils.py accordingly. This uncovered a bug which prevented elements whose class name starts with a hyphen from being found. This has been fixed by correcting the regex used in get_elements_by_class(). 
--- test/test_utils.py | 41 +++++++++++++++++++ youtube_dl/extractor/archiveorg.py | 6 +-- youtube_dl/utils.py | 64 +++++++++++++++++++++--------- 3 files changed, 90 insertions(+), 21 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 0896f4150..74a7792fc 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1401,8 +1401,49 @@ Line 1 ''' self.assertEqual(get_element_by_class('foo', html), 'nice') + self.assertEqual(get_element_by_class('foo', html, include_tag=True), 'nice') self.assertEqual(get_element_by_class('no-such-class', html), None) + html = ''' + + ''' + + self.assertEqual(get_element_by_class('foo', html), None) + self.assertEqual(get_element_by_class('foo', html, include_tag=True), '') + + html = ''' + + ''' + + self.assertEqual(get_element_by_class('foo', html), '') + self.assertEqual(get_element_by_class('foo', html, include_tag=True), '') + + html = ''' + nice + ''' + + self.assertEqual(get_element_by_class('content-section__wrap', html), 'nice') + self.assertEqual(get_element_by_class('content-section__wrap', html, include_tag=True), 'nice') + + html = ''' + nice + ''' + + self.assertEqual(get_element_by_class('-test-hyphen', html), 'nice') + + html = ''' + nice + ''' + + self.assertEqual(get_element_by_class('_test_underscore', html), 'nice') + + html = ''' + nice + ''' + + self.assertEqual(get_element_by_class('ä-umlaut', html), 'nice') + self.assertEqual(get_element_by_class('↑-unicode', html), 'nice') + def test_get_element_by_attribute(self): html = ''' nice diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py index 909dc0aaf..80ad653b1 100644 --- a/youtube_dl/extractor/archiveorg.py +++ b/youtube_dl/extractor/archiveorg.py @@ -4,6 +4,7 @@ from .common import InfoExtractor from ..utils import ( clean_html, extract_attributes, + get_element_by_class, unified_strdate, ) @@ -41,9 +42,8 @@ class ArchiveOrgIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage( 
'http://archive.org/embed/' + video_id, video_id) - input_element_with_playlist = self._search_regex( - r'(<\s*input.*\s*class\s*=\s*[\'"].*\s*js-play8-playlist\s*.*[\'"]\s*.*>)', - webpage, 'jwplayer playlist') + input_element_with_playlist = get_element_by_class( + 'js-play8-playlist', webpage, include_tag=True) jwplayer_playlist = self._parse_json(extract_attributes( input_element_with_playlist)['value'], video_id) info = self._parse_jwplayer_data( diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index f6204692a..4149f4dc5 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1926,32 +1926,55 @@ def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT): return n.attrib[key] -def get_element_by_id(id, html): - """Return the content of the tag with the specified ID in the passed HTML document""" - return get_element_by_attribute('id', id, html) +def get_element_by_id(id, html, include_tag=False): + """ + Return the content of the tag with the specified ID in the passed HTML document. + + The whole element, including its tag, is returned when `include_tag` is `True`. + """ + return get_element_by_attribute('id', id, html, include_tag) + +def get_element_by_class(class_name, html, include_tag=False): + """ + Return the content of the first tag with the specified class in the passed HTML document. -def get_element_by_class(class_name, html): - """Return the content of the first tag with the specified class in the passed HTML document""" - retval = get_elements_by_class(class_name, html) + The whole element, including its tag, is returned when `include_tag` is `True`. 
+ """ + retval = get_elements_by_class(class_name, html, include_tag) return retval[0] if retval else None -def get_element_by_attribute(attribute, value, html, escape_value=True): - retval = get_elements_by_attribute(attribute, value, html, escape_value) +def get_element_by_attribute(attribute, value, html, escape_value=True, + include_tag=False): + """ + Return the content of the first tag with the specified attribute in the passed HTML document. + + The whole element, including its tag, is returned when `include_tag` is `True`. + """ + retval = get_elements_by_attribute(attribute, value, html, escape_value, + include_tag) return retval[0] if retval else None -def get_elements_by_class(class_name, html): - """Return the content of all tags with the specified class in the passed HTML document as a list""" +def get_elements_by_class(class_name, html, include_tag=False): + """ + Return the content of all tags with the specified class in the passed HTML document as a list. + + The whole elements, including their tags, are returned when `include_tag` is `True`. + """ return get_elements_by_attribute( - 'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name), - html, escape_value=False) + 'class', r'[^\'"]*(? + \s*(?:\/\s*>|> (?P.*?) - + ) ''' % (re.escape(attribute), value), html): - res = m.group('content') + res = m.group(0) if include_tag else m.group('content') + if res is None: + continue if res.startswith('"') or res.startswith("'"): res = res[1:-1] @@ -1981,7 +2006,10 @@ class HTMLAttributeParser(compat_HTMLParser): compat_HTMLParser.__init__(self) def handle_starttag(self, tag, attrs): - self.attrs = dict(attrs) + # Make sure we're looking at the first attributes. Later ones are from + # embedded elements. 
+ if not self.attrs: + self.attrs = dict(attrs) def extract_attributes(html_element): From 1326a5aa38b5331cb899a47e8c18f05b82d31bc7 Mon Sep 17 00:00:00 2001 From: TinyToweringTree <54483833+TinyToweringTree@users.noreply.github.com> Date: Thu, 20 Feb 2020 00:02:32 +0100 Subject: [PATCH 4/4] [archiveorg] Make metadata extraction more robust --- youtube_dl/extractor/archiveorg.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py index 80ad653b1..21dfe850d 100644 --- a/youtube_dl/extractor/archiveorg.py +++ b/youtube_dl/extractor/archiveorg.py @@ -52,10 +52,13 @@ class ArchiveOrgIE(InfoExtractor): def get_optional(metadata, field): return metadata.get(field, [None])[0] - metadata = self._download_json( + json_metadata = self._download_json( 'http://archive.org/details/' + video_id, video_id, query={ 'output': 'json', - }).get('metadata', {}) + }, fatal=False) + metadata = (json_metadata.get('metadata', {}) + if isinstance(json_metadata, dict) + else {}) info.update({ 'title': get_optional(metadata, 'title') or info.get('title'), 'description': clean_html(get_optional(metadata, 'description')),