From 67134eaba1a56cec4117000acb2fc9284c9cdd9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 28 Jun 2015 22:08:29 +0200 Subject: [PATCH 1/8] [YoutubeDL] rework how the format spec is processed The spec string is processed using 'tokenize.tokenize' to split it in words and operators, the filters are still processed using regular expressions. This should make easier to allow grouping operators with parens. --- test/test_YoutubeDL.py | 27 ++-- youtube_dl/YoutubeDL.py | 298 +++++++++++++++++++++++++--------------- youtube_dl/compat.py | 5 + 3 files changed, 209 insertions(+), 121 deletions(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index a13c09ef4..8f7aef512 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -229,21 +229,30 @@ class TestFormatSelection(unittest.TestCase): '141', '172', '140', '171', '139', ] - for f1id, f2id in zip(order, order[1:]): - f1 = YoutubeIE._formats[f1id].copy() - f1['format_id'] = f1id - f1['url'] = 'url:' + f1id - f2 = YoutubeIE._formats[f2id].copy() - f2['format_id'] = f2id - f2['url'] = 'url:' + f2id + def format_info(f_id): + info = YoutubeIE._formats[f_id].copy() + info['format_id'] = f_id + info['url'] = 'url:' + f_id + return info + formats_order = [format_info(f_id) for f_id in order] + + info_dict = _make_result(list(formats_order), extractor='youtube') + ydl = YDL({'format': 'bestvideo+bestaudio'}) + yie = YoutubeIE(ydl) + yie._sort_formats(info_dict['formats']) + ydl.process_ie_result(info_dict) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], '137+141') + self.assertEqual(downloaded['ext'], 'mp4') + for f1, f2 in zip(formats_order, formats_order[1:]): info_dict = _make_result([f1, f2], extractor='youtube') ydl = YDL({'format': 'best/bestvideo'}) yie = YoutubeIE(ydl) yie._sort_formats(info_dict['formats']) ydl.process_ie_result(info_dict) downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded['format_id'], f1id) + self.assertEqual(downloaded['format_id'], f1['format_id']) info_dict = _make_result([f2, f1], extractor='youtube') ydl = YDL({'format': 'best/bestvideo'}) @@ -251,7 +260,7 @@ class TestFormatSelection(unittest.TestCase): yie._sort_formats(info_dict['formats']) ydl.process_ie_result(info_dict) downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded['format_id'], f1id) + self.assertEqual(downloaded['format_id'], f1['format_id']) def test_format_filtering(self): formats = [ diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index ef0f71bad..17a5407b9 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -21,6 +21,7 @@ import subprocess import socket import sys import time +import tokenize import traceback if os.name == 'nt': @@ -34,6 +35,7 @@ from .compat import ( compat_http_client, compat_kwargs, compat_str, + compat_tokenize_tokenize, compat_urllib_error, compat_urllib_request, ) @@ -851,8 +853,8 @@ class YoutubeDL(object): else: raise Exception('Invalid result type: %s' % result_type) - def _apply_format_filter(self, format_spec, available_formats): - " Returns a tuple of the remaining format_spec and filtered formats " + def _build_format_filter(self, filter_spec): + " Returns a function to filter the formats according to the filter_spec " OPERATORS = { '<': operator.lt, @@ -862,13 +864,13 @@ class YoutubeDL(object): '=': operator.eq, '!=': operator.ne, } - operator_rex = re.compile(r'''(?x)\s*\[ + operator_rex = re.compile(r'''(?x)\s* (?Pwidth|height|tbr|abr|vbr|asr|filesize|fps) \s*(?P%s)(?P\s*\?)?\s* (?P[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?) - \]$ + $ ''' % '|'.join(map(re.escape, OPERATORS.keys()))) - m = operator_rex.search(format_spec) + m = operator_rex.search(filter_spec) if m: try: comparison_value = int(m.group('value')) @@ -879,7 +881,7 @@ class YoutubeDL(object): if comparison_value is None: raise ValueError( 'Invalid value %r in format specification %r' % ( - m.group('value'), format_spec)) + m.group('value'), filter_spec)) op = OPERATORS[m.group('op')] if not m: @@ -887,85 +889,201 @@ class YoutubeDL(object): '=': operator.eq, '!=': operator.ne, } - str_operator_rex = re.compile(r'''(?x)\s*\[ + str_operator_rex = re.compile(r'''(?x) \s*(?Pext|acodec|vcodec|container|protocol) \s*(?P%s)(?P\s*\?)? \s*(?P[a-zA-Z0-9_-]+) - \s*\]$ + \s*$ ''' % '|'.join(map(re.escape, STR_OPERATORS.keys()))) - m = str_operator_rex.search(format_spec) + m = str_operator_rex.search(filter_spec) if m: comparison_value = m.group('value') op = STR_OPERATORS[m.group('op')] if not m: - raise ValueError('Invalid format specification %r' % format_spec) + raise ValueError('Invalid filter specification %r' % filter_spec) def _filter(f): actual_value = f.get(m.group('key')) if actual_value is None: return m.group('none_inclusive') return op(actual_value, comparison_value) - new_formats = [f for f in available_formats if _filter(f)] + return _filter + + def build_format_selector(self, format_spec): + def syntax_error(note, start): + message = ( + 'Invalid format specification: ' + '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1])) + return SyntaxError(message) + + PICKFIRST = 'PICKFIRST' + MERGE = 'MERGE' + SINGLE = 'SINGLE' + FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters']) + + def _parse_filter(tokens): + filter_parts = [] + for type, string, start, _, _ in tokens: + if type == tokenize.OP and string == ']': + return ''.join(filter_parts) + else: + filter_parts.append(string) + + def _parse_format_selection(tokens, endwith=[]): + selectors = [] + current_selector = None + for type, string, start, _, _ in tokens: + # ENCODING is only defined in python 3.x + if type == getattr(tokenize, 'ENCODING', None): + continue + elif type in [tokenize.NAME, tokenize.NUMBER]: + current_selector = FormatSelector(SINGLE, string, []) + elif type == tokenize.OP: + if string in endwith: + break + if string == ',': + selectors.append(current_selector) + current_selector = None + elif string == '/': + first_choice = current_selector + second_choice = _parse_format_selection(tokens, [',']) + current_selector = None + selectors.append(FormatSelector(PICKFIRST, (first_choice, second_choice), [])) + elif string == '[': + if not current_selector: + current_selector = FormatSelector(SINGLE, 'best', []) + format_filter = _parse_filter(tokens) + current_selector.filters.append(format_filter) + elif string == '+': + video_selector = current_selector + audio_selector = _parse_format_selection(tokens, [',']) + current_selector = None + selectors.append(FormatSelector(MERGE, (video_selector, audio_selector), [])) + else: + raise syntax_error('Operator not recognized: "{0}"'.format(string), start) + elif type == tokenize.ENDMARKER: + break + if current_selector: + selectors.append(current_selector) + return selectors + + def _build_selector_function(selector): + if isinstance(selector, list): + fs = [_build_selector_function(s) for s in selector] + + def selector_function(formats): + for f in fs: + for format in f(formats): + yield format + return selector_function + elif selector.type == PICKFIRST: + fs = [_build_selector_function(s) for s in selector.selector] + + def selector_function(formats): + for f in fs: + picked_formats = list(f(formats)) + if picked_formats: + return picked_formats + return [] + elif selector.type == SINGLE: + format_spec = selector.selector + + def selector_function(formats): + if format_spec in ['best', 'worst', None]: + format_idx = 0 if format_spec == 'worst' else -1 + audiovideo_formats = [ + f for f in formats + if f.get('vcodec') != 'none' and f.get('acodec') != 'none'] + if audiovideo_formats: + yield audiovideo_formats[format_idx] + # for audio only (soundcloud) or video only (imgur) urls, select the best/worst audio format + elif (all(f.get('acodec') != 'none' for f in formats) or + all(f.get('vcodec') != 'none' for f in formats)): + yield formats[format_idx] + elif format_spec == 'bestaudio': + audio_formats = [ + f for f in formats + if f.get('vcodec') == 'none'] + if audio_formats: + yield audio_formats[-1] + elif format_spec == 'worstaudio': + audio_formats = [ + f for f in formats + if f.get('vcodec') == 'none'] + if audio_formats: + yield audio_formats[0] + elif format_spec == 'bestvideo': + video_formats = [ + f for f in formats + if f.get('acodec') == 'none'] + if video_formats: + yield video_formats[-1] + elif format_spec == 'worstvideo': + video_formats = [ + f for f in formats + if f.get('acodec') == 'none'] + if video_formats: + yield video_formats[0] + else: + extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav'] + if format_spec in extensions: + filter_f = lambda f: f['ext'] == format_spec + else: + filter_f = lambda f: f['format_id'] == format_spec + matches = list(filter(filter_f, formats)) + if matches: + yield matches[-1] + elif selector.type == MERGE: + def _merge(formats_info): + format_1, format_2 = [f['format_id'] for f in formats_info] + # The first format must contain the video and the + # second the audio + if formats_info[0].get('vcodec') == 'none': + self.report_error('The first format must ' + 'contain the video, try using ' + '"-f %s+%s"' % (format_2, format_1)) + return + output_ext = ( + formats_info[0]['ext'] + if self.params.get('merge_output_format') is None + else self.params['merge_output_format']) + return { + 'requested_formats': formats_info, + 'format': '%s+%s' % (formats_info[0].get('format'), + formats_info[1].get('format')), + 'format_id': '%s+%s' % (formats_info[0].get('format_id'), + formats_info[1].get('format_id')), + 'width': formats_info[0].get('width'), + 'height': formats_info[0].get('height'), + 'resolution': formats_info[0].get('resolution'), + 'fps': formats_info[0].get('fps'), + 'vcodec': formats_info[0].get('vcodec'), + 'vbr': formats_info[0].get('vbr'), + 'stretched_ratio': formats_info[0].get('stretched_ratio'), + 'acodec': formats_info[1].get('acodec'), + 'abr': formats_info[1].get('abr'), + 'ext': output_ext, + } + video_selector, audio_selector = map(_build_selector_function, selector.selector) - new_format_spec = format_spec[:-len(m.group(0))] - if not new_format_spec: - new_format_spec = 'best' + def selector_function(formats): + formats = list(formats) + for pair in itertools.product(video_selector(formats), audio_selector(formats)): + yield _merge(pair) - return (new_format_spec, new_formats) + filters = [self._build_format_filter(f) for f in selector.filters] - def select_format(self, format_spec, available_formats): - while format_spec.endswith(']'): - format_spec, available_formats = self._apply_format_filter( - format_spec, available_formats) - if not available_formats: - return None + def final_selector(formats): + for _filter in filters: + formats = list(filter(_filter, formats)) + return selector_function(formats) + return final_selector - if format_spec in ['best', 'worst', None]: - format_idx = 0 if format_spec == 'worst' else -1 - audiovideo_formats = [ - f for f in available_formats - if f.get('vcodec') != 'none' and f.get('acodec') != 'none'] - if audiovideo_formats: - return audiovideo_formats[format_idx] - # for audio only (soundcloud) or video only (imgur) urls, select the best/worst audio format - elif (all(f.get('acodec') != 'none' for f in available_formats) or - all(f.get('vcodec') != 'none' for f in available_formats)): - return available_formats[format_idx] - elif format_spec == 'bestaudio': - audio_formats = [ - f for f in available_formats - if f.get('vcodec') == 'none'] - if audio_formats: - return audio_formats[-1] - elif format_spec == 'worstaudio': - audio_formats = [ - f for f in available_formats - if f.get('vcodec') == 'none'] - if audio_formats: - return audio_formats[0] - elif format_spec == 'bestvideo': - video_formats = [ - f for f in available_formats - if f.get('acodec') == 'none'] - if video_formats: - return video_formats[-1] - elif format_spec == 'worstvideo': - video_formats = [ - f for f in available_formats - if f.get('acodec') == 'none'] - if video_formats: - return video_formats[0] - else: - extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav'] - if format_spec in extensions: - filter_f = lambda f: f['ext'] == format_spec - else: - filter_f = lambda f: f['format_id'] == format_spec - matches = list(filter(filter_f, available_formats)) - if matches: - return matches[-1] - return None + stream = io.BytesIO(format_spec.encode('utf-8')) + tokens = compat_tokenize_tokenize(stream.readline) + parsed_selector = _parse_format_selection(tokens) + return _build_selector_function(parsed_selector) def _calc_headers(self, info_dict): res = std_headers.copy() @@ -1112,52 +1230,8 @@ class YoutubeDL(object): if req_format == 'all': formats_to_download = formats else: - for rfstr in req_format.split(','): - # We can accept formats requested in the format: 34/5/best, we pick - # the first that is available, starting from left - req_formats = rfstr.split('/') - for rf in req_formats: - if re.match(r'.+?\+.+?', rf) is not None: - # Two formats have been requested like '137+139' - format_1, format_2 = rf.split('+') - formats_info = (self.select_format(format_1, formats), - self.select_format(format_2, formats)) - if all(formats_info): - # The first format must contain the video and the - # second the audio - if formats_info[0].get('vcodec') == 'none': - self.report_error('The first format must ' - 'contain the video, try using ' - '"-f %s+%s"' % (format_2, format_1)) - return - output_ext = ( - formats_info[0]['ext'] - if self.params.get('merge_output_format') is None - else self.params['merge_output_format']) - selected_format = { - 'requested_formats': formats_info, - 'format': '%s+%s' % (formats_info[0].get('format'), - formats_info[1].get('format')), - 'format_id': '%s+%s' % (formats_info[0].get('format_id'), - formats_info[1].get('format_id')), - 'width': formats_info[0].get('width'), - 'height': formats_info[0].get('height'), - 'resolution': formats_info[0].get('resolution'), - 'fps': formats_info[0].get('fps'), - 'vcodec': formats_info[0].get('vcodec'), - 'vbr': formats_info[0].get('vbr'), - 'stretched_ratio': formats_info[0].get('stretched_ratio'), - 'acodec': formats_info[1].get('acodec'), - 'abr': formats_info[1].get('abr'), - 'ext': output_ext, - } - else: - selected_format = None - else: - selected_format = self.select_format(rf, formats) - if selected_format is not None: - formats_to_download.append(selected_format) - break + format_selector = self.build_format_selector(req_format) + formats_to_download = list(format_selector(formats)) if not formats_to_download: raise ExtractorError('requested format not available', expected=True) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index f9529210d..bc218dd71 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -388,6 +388,10 @@ else: pass return _terminal_size(columns, lines) +if sys.version_info >= (3, 0): + from tokenize import tokenize as compat_tokenize_tokenize +else: + from tokenize import generate_tokens as compat_tokenize_tokenize __all__ = [ 'compat_HTTPError', @@ -408,6 +412,7 @@ __all__ = [ 'compat_socket_create_connection', 'compat_str', 'compat_subprocess_get_DEVNULL', + 'compat_tokenize_tokenize', 'compat_urllib_error', 'compat_urllib_parse', 'compat_urllib_parse_unquote', From 5acfa126c812c3ab7088af6c7df79697baee7831 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 28 Jun 2015 22:48:02 +0200 Subject: [PATCH 2/8] [YoutubeDL] format spec: treat 'all' like a normal specifier So you can use filters with it, for example 'all[width>=400][width<=600]'. --- test/test_YoutubeDL.py | 5 +++++ youtube_dl/YoutubeDL.py | 13 ++++++------- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 8f7aef512..709e3100f 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -317,6 +317,11 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'G') + ydl = YDL({'format': 'all[width>=400][width<=600]'}) + ydl.process_ie_result(info_dict) + downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts] + self.assertEqual(downloaded_ids, ['B', 'C', 'D']) + class TestYoutubeDL(unittest.TestCase): def test_subtitles(self): diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 17a5407b9..258e612af 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -990,7 +990,10 @@ class YoutubeDL(object): format_spec = selector.selector def selector_function(formats): - if format_spec in ['best', 'worst', None]: + if format_spec == 'all': + for f in formats: + yield f + elif format_spec in ['best', 'worst', None]: format_idx = 0 if format_spec == 'worst' else -1 audiovideo_formats = [ f for f in formats @@ -1226,12 +1229,8 @@ class YoutubeDL(object): req_format_list.append('bestvideo+bestaudio') req_format_list.append('best') req_format = '/'.join(req_format_list) - formats_to_download = [] - if req_format == 'all': - formats_to_download = formats - else: - format_selector = self.build_format_selector(req_format) - formats_to_download = list(format_selector(formats)) + format_selector = self.build_format_selector(req_format) + formats_to_download = list(format_selector(formats)) if not formats_to_download: raise ExtractorError('requested format not available', expected=True) From 0130afb76e5cb6f470f39f127c8d09eea3e82d0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 29 Jun 2015 12:42:02 +0200 Subject: [PATCH 3/8] [YoutubeDL] format spec: allow grouping specifiers with parentheses --- test/test_YoutubeDL.py | 24 ++++++++++++++++++++++++ youtube_dl/YoutubeDL.py | 39 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 61 insertions(+), 2 deletions(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 709e3100f..6f374d7ea 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -245,6 +245,30 @@ class TestFormatSelection(unittest.TestCase): self.assertEqual(downloaded['format_id'], '137+141') self.assertEqual(downloaded['ext'], 'mp4') + info_dict = _make_result(list(formats_order), extractor='youtube') + ydl = YDL({'format': '(bestvideo[ext=mp4],bestvideo[ext=webm])+bestaudio'}) + yie = YoutubeIE(ydl) + yie._sort_formats(info_dict['formats']) + ydl.process_ie_result(info_dict) + downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts] + self.assertEqual(downloaded_ids, ['137+141', '248+141']) + + info_dict = _make_result(list(formats_order), extractor='youtube') + ydl = YDL({'format': '(bestvideo[ext=mp4],bestvideo[ext=webm])[height<=720]+bestaudio'}) + yie = YoutubeIE(ydl) + yie._sort_formats(info_dict['formats']) + ydl.process_ie_result(info_dict) + downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts] + self.assertEqual(downloaded_ids, ['136+141', '247+141']) + + info_dict = _make_result(list(formats_order), extractor='youtube') + ydl = YDL({'format': '(bestvideo[ext=none]/bestvideo[ext=webm])+bestaudio'}) + yie = YoutubeIE(ydl) + yie._sort_formats(info_dict['formats']) + ydl.process_ie_result(info_dict) + downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts] + self.assertEqual(downloaded_ids, ['248+141']) + for f1, f2 in zip(formats_order, formats_order[1:]): info_dict = _make_result([f1, f2], extractor='youtube') ydl = YDL({'format': 'best/bestvideo'}) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 258e612af..e5b46f87e 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -920,6 +920,7 @@ class YoutubeDL(object): PICKFIRST = 'PICKFIRST' MERGE = 'MERGE' SINGLE = 'SINGLE' + GROUP = 'GROUP' FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters']) def _parse_filter(tokens): @@ -942,6 +943,10 @@ class YoutubeDL(object): elif type == tokenize.OP: if string in endwith: break + elif string == ')': + # ')' will be handled by the parentheses group + tokens.restore_last_token() + break if string == ',': selectors.append(current_selector) current_selector = None @@ -955,6 +960,10 @@ class YoutubeDL(object): current_selector = FormatSelector(SINGLE, 'best', []) format_filter = _parse_filter(tokens) current_selector.filters.append(format_filter) + elif string == '(': + if current_selector: + raise syntax_error('Unexpected "("', start) + current_selector = FormatSelector(GROUP, _parse_format_selection(tokens, [')']), []) elif string == '+': video_selector = current_selector audio_selector = _parse_format_selection(tokens, [',']) @@ -977,6 +986,8 @@ class YoutubeDL(object): for format in f(formats): yield format return selector_function + elif selector.type == GROUP: + selector_function = _build_selector_function(selector.selector) elif selector.type == PICKFIRST: fs = [_build_selector_function(s) for s in selector.selector] @@ -1084,8 +1095,32 @@ class YoutubeDL(object): return final_selector stream = io.BytesIO(format_spec.encode('utf-8')) - tokens = compat_tokenize_tokenize(stream.readline) - parsed_selector = _parse_format_selection(tokens) + try: + tokens = list(compat_tokenize_tokenize(stream.readline)) + except tokenize.TokenError: + raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec))) + + class TokenIterator(object): + def __init__(self, tokens): + self.tokens = tokens + self.counter = 0 + + def __iter__(self): + return self + + def __next__(self): + if self.counter >= len(self.tokens): + raise StopIteration() + value = self.tokens[self.counter] + self.counter += 1 + return value + + next = __next__ + + def restore_last_token(self): + self.counter -= 1 + + parsed_selector = _parse_format_selection(iter(TokenIterator(tokens))) return _build_selector_function(parsed_selector) def _calc_headers(self, info_dict): From cf2ac6df6896dac4d23918867bb86fac1e1088d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 30 Jun 2015 19:45:42 +0200 Subject: [PATCH 4/8] [YoutubeDL] format spec: Fix handling of '+' with '/' 'bestvideo+bestaudio/best' was incorrectly interpreted as 'bestvideo+(bestaudio/best)', so it would fail if 'bestaudio' doesn't exist instead of falling back to 'best'. --- test/test_YoutubeDL.py | 8 ++++++++ youtube_dl/YoutubeDL.py | 25 +++++++++++++++---------- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 6f374d7ea..1e4aaa559 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -245,6 +245,14 @@ class TestFormatSelection(unittest.TestCase): self.assertEqual(downloaded['format_id'], '137+141') self.assertEqual(downloaded['ext'], 'mp4') + info_dict = _make_result(list(formats_order), extractor='youtube') + ydl = YDL({'format': 'bestvideo[height>=999999]+bestaudio/best'}) + yie = YoutubeIE(ydl) + yie._sort_formats(info_dict['formats']) + ydl.process_ie_result(info_dict) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], '38') + info_dict = _make_result(list(formats_order), extractor='youtube') ydl = YDL({'format': '(bestvideo[ext=mp4],bestvideo[ext=webm])+bestaudio'}) yie = YoutubeIE(ydl) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index e5b46f87e..5deb4848e 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -931,7 +931,7 @@ class YoutubeDL(object): else: filter_parts.append(string) - def _parse_format_selection(tokens, endwith=[]): + def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False): selectors = [] current_selector = None for type, string, start, _, _ in tokens: @@ -941,18 +941,23 @@ class YoutubeDL(object): elif type in [tokenize.NAME, tokenize.NUMBER]: current_selector = FormatSelector(SINGLE, string, []) elif type == tokenize.OP: - if string in endwith: + if string == ')': + if not inside_group: + # ')' will be handled by the parentheses group + tokens.restore_last_token() break - elif string == ')': - # ')' will be handled by the parentheses group + elif inside_merge and string in ['/', ',']: tokens.restore_last_token() break - if string == ',': + elif inside_choice and string == ',': + tokens.restore_last_token() + break + elif string == ',': selectors.append(current_selector) current_selector = None elif string == '/': first_choice = current_selector - second_choice = _parse_format_selection(tokens, [',']) + second_choice = _parse_format_selection(tokens, inside_choice=True) current_selector = None selectors.append(FormatSelector(PICKFIRST, (first_choice, second_choice), [])) elif string == '[': @@ -963,12 +968,12 @@ class YoutubeDL(object): elif string == '(': if current_selector: raise syntax_error('Unexpected "("', start) - current_selector = FormatSelector(GROUP, _parse_format_selection(tokens, [')']), []) + group = _parse_format_selection(tokens, inside_group=True) + current_selector = FormatSelector(GROUP, group, []) elif string == '+': video_selector = current_selector - audio_selector = _parse_format_selection(tokens, [',']) - current_selector = None - selectors.append(FormatSelector(MERGE, (video_selector, audio_selector), [])) + audio_selector = _parse_format_selection(tokens, inside_merge=True) + current_selector = FormatSelector(MERGE, (video_selector, audio_selector), []) else: raise syntax_error('Operator not recognized: "{0}"'.format(string), start) elif type == tokenize.ENDMARKER: From f5f4a27a964b41646303921104f4d6d6fd2098e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 4 Jul 2015 21:30:26 +0200 Subject: [PATCH 5/8] [YoutubeDL] format spec: fix handling of '/' with ',' When using 'bestvideo/best,bestaudio', 'bestvideo/best' must be set as the current_selector (instead of appending it to the selectors), otherwise when it gets the ',' it would append 'None' to the selectors. --- test/test_YoutubeDL.py | 8 ++++++++ youtube_dl/YoutubeDL.py | 3 +-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 1e4aaa559..f103779d3 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -253,6 +253,14 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], '38') + info_dict = _make_result(list(formats_order), extractor='youtube') + ydl = YDL({'format': 'bestvideo/best,bestaudio'}) + yie = YoutubeIE(ydl) + yie._sort_formats(info_dict['formats']) + ydl.process_ie_result(info_dict) + downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts] + self.assertEqual(downloaded_ids, ['137', '141']) + info_dict = _make_result(list(formats_order), extractor='youtube') ydl = YDL({'format': '(bestvideo[ext=mp4],bestvideo[ext=webm])+bestaudio'}) yie = YoutubeIE(ydl) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 5deb4848e..5a79e5f1d 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -958,8 +958,7 @@ class YoutubeDL(object): elif string == '/': first_choice = current_selector second_choice = _parse_format_selection(tokens, inside_choice=True) - current_selector = None - selectors.append(FormatSelector(PICKFIRST, (first_choice, second_choice), [])) + current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), []) elif string == '[': if not current_selector: current_selector = FormatSelector(SINGLE, 'best', []) From bb8e55366289e0c129ef85abb8c1ac1cbae86a66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 4 Jul 2015 21:41:09 +0200 Subject: [PATCH 6/8] [YoutubeDL] format spec: Do not fail when a filter gives an empty result For example with 'best[height<40]' we ended getting a 'IndexError: list index out of range'. --- test/test_YoutubeDL.py | 9 ++++++++- youtube_dl/YoutubeDL.py | 3 +++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index f103779d3..bf2baae07 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -15,7 +15,7 @@ from youtube_dl import YoutubeDL from youtube_dl.compat import compat_str from youtube_dl.extractor import YoutubeIE from youtube_dl.postprocessor.common import PostProcessor -from youtube_dl.utils import match_filter_func +from youtube_dl.utils import ExtractorError, match_filter_func TEST_URL = 'http://localhost/sample.mp4' @@ -362,6 +362,13 @@ class TestFormatSelection(unittest.TestCase): downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts] self.assertEqual(downloaded_ids, ['B', 'C', 'D']) + ydl = YDL({'format': 'best[height<40]'}) + try: + ydl.process_ie_result(info_dict) + except ExtractorError: + pass + self.assertEqual(ydl.downloaded_info_dicts, []) + class TestYoutubeDL(unittest.TestCase): def test_subtitles(self): diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 5a79e5f1d..6478d05dc 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1005,6 +1005,9 @@ class YoutubeDL(object): format_spec = selector.selector def selector_function(formats): + formats = list(formats) + if not formats: + return if format_spec == 'all': for f in formats: yield f From 0a31a350981931ab9403d58258f5058ed98142a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 10 Jul 2015 22:46:25 +0200 Subject: [PATCH 7/8] [YoutubeDL] format spec: add additional checks for invalid syntax --- test/test_YoutubeDL.py | 10 ++++++++++ youtube_dl/YoutubeDL.py | 4 ++++ 2 files changed, 14 insertions(+) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index bf2baae07..20f45f439 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -302,6 +302,16 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], f1['format_id']) + def test_invalid_format_specs(self): + def assert_syntax_error(format_spec): + ydl = YDL({'format': format_spec}) + info_dict = _make_result([{'format_id': 'foo', 'url': TEST_URL}]) + self.assertRaises(SyntaxError, ydl.process_ie_result, info_dict) + + assert_syntax_error('bestvideo,,best') + assert_syntax_error('+bestaudio') + assert_syntax_error('bestvideo+') + def test_format_filtering(self): formats = [ {'format_id': 'A', 'filesize': 500, 'width': 1000}, diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 6478d05dc..da7c51008 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -953,6 +953,8 @@ class YoutubeDL(object): tokens.restore_last_token() break elif string == ',': + if not current_selector: + raise syntax_error('"," must follow a format selector', start) selectors.append(current_selector) current_selector = None elif string == '/': @@ -972,6 +974,8 @@ class YoutubeDL(object): elif string == '+': video_selector = current_selector audio_selector = _parse_format_selection(tokens, inside_merge=True) + if not video_selector or not audio_selector: + raise syntax_error('"+" must be between two format selectors', start) current_selector = FormatSelector(MERGE, (video_selector, audio_selector), []) else: raise syntax_error('Operator not recognized: "{0}"'.format(string), start) From dc48695ab9c84cda56ee723aee11e5da44c2258a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 10 Jul 2015 22:59:45 +0200 Subject: [PATCH 8/8] Document how to group format selectors --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e3452c9e1..4e6cb3fc7 100644 --- a/README.md +++ b/README.md @@ -268,7 +268,7 @@ youtube-dl_test_video_.mp4 # A simple file name By default youtube-dl tries to download the best quality, but sometimes you may want to download other format. The simplest case is requesting a specific format, for example `-f 22`. You can get the list of available formats using `--list-formats`, you can also use a file extension (currently it supports aac, m4a, mp3, mp4, ogg, wav, webm) or the special names `best`, `bestvideo`, `bestaudio` and `worst`. -If you want to download multiple videos and they don't have the same formats available, you can specify the order of preference using slashes, as in `-f 22/17/18`. You can also filter the video results by putting a condition in brackets, as in `-f "best[height=720]"` (or `-f "[filesize>10M]"`). This works for filesize, height, width, tbr, abr, vbr, asr, and fps and the comparisons <, <=, >, >=, =, != and for ext, acodec, vcodec, container, and protocol and the comparisons =, != . Formats for which the value is not known are excluded unless you put a question mark (?) after the operator. You can combine format filters, so `-f "[height <=? 720][tbr>500]"` selects up to 720p videos (or videos where the height is not known) with a bitrate of at least 500 KBit/s. Use commas to download multiple formats, such as `-f 136/137/mp4/bestvideo,140/m4a/bestaudio`. You can merge the video and audio of two formats into a single file using `-f +` (requires ffmpeg or avconv), for example `-f bestvideo+bestaudio`. +If you want to download multiple videos and they don't have the same formats available, you can specify the order of preference using slashes, as in `-f 22/17/18`. You can also filter the video results by putting a condition in brackets, as in `-f "best[height=720]"` (or `-f "[filesize>10M]"`). This works for filesize, height, width, tbr, abr, vbr, asr, and fps and the comparisons <, <=, >, >=, =, != and for ext, acodec, vcodec, container, and protocol and the comparisons =, != . Formats for which the value is not known are excluded unless you put a question mark (?) after the operator. You can combine format filters, so `-f "[height <=? 720][tbr>500]"` selects up to 720p videos (or videos where the height is not known) with a bitrate of at least 500 KBit/s. Use commas to download multiple formats, such as `-f 136/137/mp4/bestvideo,140/m4a/bestaudio`. You can merge the video and audio of two formats into a single file using `-f +` (requires ffmpeg or avconv), for example `-f bestvideo+bestaudio`. Format selectors can also be grouped using parentheses, for example if you want to download the best mp4 and webm formats with a height lower than 480 you can use `-f '(mp4,webm)[height<480]'`. Since the end of April 2015 and version 2015.04.26 youtube-dl uses `-f bestvideo+bestaudio/best` as default format selection (see #5447, #5456). If ffmpeg or avconv are installed this results in downloading `bestvideo` and `bestaudio` separately and muxing them together into a single file giving the best overall quality available. Otherwise it falls back to `best` and results in downloading best available quality served as a single file. `best` is also needed for videos that don't come from YouTube because they don't provide the audio and video in two different files. If you want to only download some dash formats (for example if you are not interested in getting videos with a resolution higher than 1080p), you can add `-f bestvideo[height<=?1080]+bestaudio/best` to your configuration file. Note that if you use youtube-dl to stream to `stdout` (and most likely to pipe it to your media player then), i.e. you explicitly specify output template as `-o -`, youtube-dl still uses `-f best` format selection in order to start content delivery immediately to your player and not to wait until `bestvideo` and `bestaudio` are downloaded and muxed.