[YoutubeDL] rework how the format spec is processed

The spec string is processed using 'tokenize.tokenize' to split it in words and operators, the filters are still processed using regular expressions.
This should make easier to allow grouping operators with parens.
totalwebcasting
Jaime Marquínez Ferrándiz 9 years ago
parent ac0474f89d
commit 67134eaba1

@ -229,21 +229,30 @@ class TestFormatSelection(unittest.TestCase):
'141', '172', '140', '171', '139', '141', '172', '140', '171', '139',
] ]
for f1id, f2id in zip(order, order[1:]): def format_info(f_id):
f1 = YoutubeIE._formats[f1id].copy() info = YoutubeIE._formats[f_id].copy()
f1['format_id'] = f1id info['format_id'] = f_id
f1['url'] = 'url:' + f1id info['url'] = 'url:' + f_id
f2 = YoutubeIE._formats[f2id].copy() return info
f2['format_id'] = f2id formats_order = [format_info(f_id) for f_id in order]
f2['url'] = 'url:' + f2id
info_dict = _make_result(list(formats_order), extractor='youtube')
ydl = YDL({'format': 'bestvideo+bestaudio'})
yie = YoutubeIE(ydl)
yie._sort_formats(info_dict['formats'])
ydl.process_ie_result(info_dict)
downloaded = ydl.downloaded_info_dicts[0]
self.assertEqual(downloaded['format_id'], '137+141')
self.assertEqual(downloaded['ext'], 'mp4')
for f1, f2 in zip(formats_order, formats_order[1:]):
info_dict = _make_result([f1, f2], extractor='youtube') info_dict = _make_result([f1, f2], extractor='youtube')
ydl = YDL({'format': 'best/bestvideo'}) ydl = YDL({'format': 'best/bestvideo'})
yie = YoutubeIE(ydl) yie = YoutubeIE(ydl)
yie._sort_formats(info_dict['formats']) yie._sort_formats(info_dict['formats'])
ydl.process_ie_result(info_dict) ydl.process_ie_result(info_dict)
downloaded = ydl.downloaded_info_dicts[0] downloaded = ydl.downloaded_info_dicts[0]
self.assertEqual(downloaded['format_id'], f1id) self.assertEqual(downloaded['format_id'], f1['format_id'])
info_dict = _make_result([f2, f1], extractor='youtube') info_dict = _make_result([f2, f1], extractor='youtube')
ydl = YDL({'format': 'best/bestvideo'}) ydl = YDL({'format': 'best/bestvideo'})
@ -251,7 +260,7 @@ class TestFormatSelection(unittest.TestCase):
yie._sort_formats(info_dict['formats']) yie._sort_formats(info_dict['formats'])
ydl.process_ie_result(info_dict) ydl.process_ie_result(info_dict)
downloaded = ydl.downloaded_info_dicts[0] downloaded = ydl.downloaded_info_dicts[0]
self.assertEqual(downloaded['format_id'], f1id) self.assertEqual(downloaded['format_id'], f1['format_id'])
def test_format_filtering(self): def test_format_filtering(self):
formats = [ formats = [

@ -21,6 +21,7 @@ import subprocess
import socket import socket
import sys import sys
import time import time
import tokenize
import traceback import traceback
if os.name == 'nt': if os.name == 'nt':
@ -34,6 +35,7 @@ from .compat import (
compat_http_client, compat_http_client,
compat_kwargs, compat_kwargs,
compat_str, compat_str,
compat_tokenize_tokenize,
compat_urllib_error, compat_urllib_error,
compat_urllib_request, compat_urllib_request,
) )
@ -851,8 +853,8 @@ class YoutubeDL(object):
else: else:
raise Exception('Invalid result type: %s' % result_type) raise Exception('Invalid result type: %s' % result_type)
def _apply_format_filter(self, format_spec, available_formats): def _build_format_filter(self, filter_spec):
" Returns a tuple of the remaining format_spec and filtered formats " " Returns a function to filter the formats according to the filter_spec "
OPERATORS = { OPERATORS = {
'<': operator.lt, '<': operator.lt,
@ -862,13 +864,13 @@ class YoutubeDL(object):
'=': operator.eq, '=': operator.eq,
'!=': operator.ne, '!=': operator.ne,
} }
operator_rex = re.compile(r'''(?x)\s*\[ operator_rex = re.compile(r'''(?x)\s*
(?P<key>width|height|tbr|abr|vbr|asr|filesize|fps) (?P<key>width|height|tbr|abr|vbr|asr|filesize|fps)
\s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s* \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
(?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?) (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
\]$ $
''' % '|'.join(map(re.escape, OPERATORS.keys()))) ''' % '|'.join(map(re.escape, OPERATORS.keys())))
m = operator_rex.search(format_spec) m = operator_rex.search(filter_spec)
if m: if m:
try: try:
comparison_value = int(m.group('value')) comparison_value = int(m.group('value'))
@ -879,7 +881,7 @@ class YoutubeDL(object):
if comparison_value is None: if comparison_value is None:
raise ValueError( raise ValueError(
'Invalid value %r in format specification %r' % ( 'Invalid value %r in format specification %r' % (
m.group('value'), format_spec)) m.group('value'), filter_spec))
op = OPERATORS[m.group('op')] op = OPERATORS[m.group('op')]
if not m: if not m:
@ -887,85 +889,201 @@ class YoutubeDL(object):
'=': operator.eq, '=': operator.eq,
'!=': operator.ne, '!=': operator.ne,
} }
str_operator_rex = re.compile(r'''(?x)\s*\[ str_operator_rex = re.compile(r'''(?x)
\s*(?P<key>ext|acodec|vcodec|container|protocol) \s*(?P<key>ext|acodec|vcodec|container|protocol)
\s*(?P<op>%s)(?P<none_inclusive>\s*\?)? \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?
\s*(?P<value>[a-zA-Z0-9_-]+) \s*(?P<value>[a-zA-Z0-9_-]+)
\s*\]$ \s*$
''' % '|'.join(map(re.escape, STR_OPERATORS.keys()))) ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
m = str_operator_rex.search(format_spec) m = str_operator_rex.search(filter_spec)
if m: if m:
comparison_value = m.group('value') comparison_value = m.group('value')
op = STR_OPERATORS[m.group('op')] op = STR_OPERATORS[m.group('op')]
if not m: if not m:
raise ValueError('Invalid format specification %r' % format_spec) raise ValueError('Invalid filter specification %r' % filter_spec)
def _filter(f): def _filter(f):
actual_value = f.get(m.group('key')) actual_value = f.get(m.group('key'))
if actual_value is None: if actual_value is None:
return m.group('none_inclusive') return m.group('none_inclusive')
return op(actual_value, comparison_value) return op(actual_value, comparison_value)
new_formats = [f for f in available_formats if _filter(f)] return _filter
def build_format_selector(self, format_spec):
def syntax_error(note, start):
message = (
'Invalid format specification: '
'{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1]))
return SyntaxError(message)
PICKFIRST = 'PICKFIRST'
MERGE = 'MERGE'
SINGLE = 'SINGLE'
FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])
def _parse_filter(tokens):
filter_parts = []
for type, string, start, _, _ in tokens:
if type == tokenize.OP and string == ']':
return ''.join(filter_parts)
else:
filter_parts.append(string)
def _parse_format_selection(tokens, endwith=[]):
selectors = []
current_selector = None
for type, string, start, _, _ in tokens:
# ENCODING is only defined in python 3.x
if type == getattr(tokenize, 'ENCODING', None):
continue
elif type in [tokenize.NAME, tokenize.NUMBER]:
current_selector = FormatSelector(SINGLE, string, [])
elif type == tokenize.OP:
if string in endwith:
break
if string == ',':
selectors.append(current_selector)
current_selector = None
elif string == '/':
first_choice = current_selector
second_choice = _parse_format_selection(tokens, [','])
current_selector = None
selectors.append(FormatSelector(PICKFIRST, (first_choice, second_choice), []))
elif string == '[':
if not current_selector:
current_selector = FormatSelector(SINGLE, 'best', [])
format_filter = _parse_filter(tokens)
current_selector.filters.append(format_filter)
elif string == '+':
video_selector = current_selector
audio_selector = _parse_format_selection(tokens, [','])
current_selector = None
selectors.append(FormatSelector(MERGE, (video_selector, audio_selector), []))
else:
raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
elif type == tokenize.ENDMARKER:
break
if current_selector:
selectors.append(current_selector)
return selectors
def _build_selector_function(selector):
if isinstance(selector, list):
fs = [_build_selector_function(s) for s in selector]
def selector_function(formats):
for f in fs:
for format in f(formats):
yield format
return selector_function
elif selector.type == PICKFIRST:
fs = [_build_selector_function(s) for s in selector.selector]
def selector_function(formats):
for f in fs:
picked_formats = list(f(formats))
if picked_formats:
return picked_formats
return []
elif selector.type == SINGLE:
format_spec = selector.selector
def selector_function(formats):
if format_spec in ['best', 'worst', None]:
format_idx = 0 if format_spec == 'worst' else -1
audiovideo_formats = [
f for f in formats
if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
if audiovideo_formats:
yield audiovideo_formats[format_idx]
# for audio only (soundcloud) or video only (imgur) urls, select the best/worst audio format
elif (all(f.get('acodec') != 'none' for f in formats) or
all(f.get('vcodec') != 'none' for f in formats)):
yield formats[format_idx]
elif format_spec == 'bestaudio':
audio_formats = [
f for f in formats
if f.get('vcodec') == 'none']
if audio_formats:
yield audio_formats[-1]
elif format_spec == 'worstaudio':
audio_formats = [
f for f in formats
if f.get('vcodec') == 'none']
if audio_formats:
yield audio_formats[0]
elif format_spec == 'bestvideo':
video_formats = [
f for f in formats
if f.get('acodec') == 'none']
if video_formats:
yield video_formats[-1]
elif format_spec == 'worstvideo':
video_formats = [
f for f in formats
if f.get('acodec') == 'none']
if video_formats:
yield video_formats[0]
else:
extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']
if format_spec in extensions:
filter_f = lambda f: f['ext'] == format_spec
else:
filter_f = lambda f: f['format_id'] == format_spec
matches = list(filter(filter_f, formats))
if matches:
yield matches[-1]
elif selector.type == MERGE:
def _merge(formats_info):
format_1, format_2 = [f['format_id'] for f in formats_info]
# The first format must contain the video and the
# second the audio
if formats_info[0].get('vcodec') == 'none':
self.report_error('The first format must '
'contain the video, try using '
'"-f %s+%s"' % (format_2, format_1))
return
output_ext = (
formats_info[0]['ext']
if self.params.get('merge_output_format') is None
else self.params['merge_output_format'])
return {
'requested_formats': formats_info,
'format': '%s+%s' % (formats_info[0].get('format'),
formats_info[1].get('format')),
'format_id': '%s+%s' % (formats_info[0].get('format_id'),
formats_info[1].get('format_id')),
'width': formats_info[0].get('width'),
'height': formats_info[0].get('height'),
'resolution': formats_info[0].get('resolution'),
'fps': formats_info[0].get('fps'),
'vcodec': formats_info[0].get('vcodec'),
'vbr': formats_info[0].get('vbr'),
'stretched_ratio': formats_info[0].get('stretched_ratio'),
'acodec': formats_info[1].get('acodec'),
'abr': formats_info[1].get('abr'),
'ext': output_ext,
}
video_selector, audio_selector = map(_build_selector_function, selector.selector)
new_format_spec = format_spec[:-len(m.group(0))] def selector_function(formats):
if not new_format_spec: formats = list(formats)
new_format_spec = 'best' for pair in itertools.product(video_selector(formats), audio_selector(formats)):
yield _merge(pair)
return (new_format_spec, new_formats) filters = [self._build_format_filter(f) for f in selector.filters]
def select_format(self, format_spec, available_formats): def final_selector(formats):
while format_spec.endswith(']'): for _filter in filters:
format_spec, available_formats = self._apply_format_filter( formats = list(filter(_filter, formats))
format_spec, available_formats) return selector_function(formats)
if not available_formats: return final_selector
return None
if format_spec in ['best', 'worst', None]: stream = io.BytesIO(format_spec.encode('utf-8'))
format_idx = 0 if format_spec == 'worst' else -1 tokens = compat_tokenize_tokenize(stream.readline)
audiovideo_formats = [ parsed_selector = _parse_format_selection(tokens)
f for f in available_formats return _build_selector_function(parsed_selector)
if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
if audiovideo_formats:
return audiovideo_formats[format_idx]
# for audio only (soundcloud) or video only (imgur) urls, select the best/worst audio format
elif (all(f.get('acodec') != 'none' for f in available_formats) or
all(f.get('vcodec') != 'none' for f in available_formats)):
return available_formats[format_idx]
elif format_spec == 'bestaudio':
audio_formats = [
f for f in available_formats
if f.get('vcodec') == 'none']
if audio_formats:
return audio_formats[-1]
elif format_spec == 'worstaudio':
audio_formats = [
f for f in available_formats
if f.get('vcodec') == 'none']
if audio_formats:
return audio_formats[0]
elif format_spec == 'bestvideo':
video_formats = [
f for f in available_formats
if f.get('acodec') == 'none']
if video_formats:
return video_formats[-1]
elif format_spec == 'worstvideo':
video_formats = [
f for f in available_formats
if f.get('acodec') == 'none']
if video_formats:
return video_formats[0]
else:
extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']
if format_spec in extensions:
filter_f = lambda f: f['ext'] == format_spec
else:
filter_f = lambda f: f['format_id'] == format_spec
matches = list(filter(filter_f, available_formats))
if matches:
return matches[-1]
return None
def _calc_headers(self, info_dict): def _calc_headers(self, info_dict):
res = std_headers.copy() res = std_headers.copy()
@ -1112,52 +1230,8 @@ class YoutubeDL(object):
if req_format == 'all': if req_format == 'all':
formats_to_download = formats formats_to_download = formats
else: else:
for rfstr in req_format.split(','): format_selector = self.build_format_selector(req_format)
# We can accept formats requested in the format: 34/5/best, we pick formats_to_download = list(format_selector(formats))
# the first that is available, starting from left
req_formats = rfstr.split('/')
for rf in req_formats:
if re.match(r'.+?\+.+?', rf) is not None:
# Two formats have been requested like '137+139'
format_1, format_2 = rf.split('+')
formats_info = (self.select_format(format_1, formats),
self.select_format(format_2, formats))
if all(formats_info):
# The first format must contain the video and the
# second the audio
if formats_info[0].get('vcodec') == 'none':
self.report_error('The first format must '
'contain the video, try using '
'"-f %s+%s"' % (format_2, format_1))
return
output_ext = (
formats_info[0]['ext']
if self.params.get('merge_output_format') is None
else self.params['merge_output_format'])
selected_format = {
'requested_formats': formats_info,
'format': '%s+%s' % (formats_info[0].get('format'),
formats_info[1].get('format')),
'format_id': '%s+%s' % (formats_info[0].get('format_id'),
formats_info[1].get('format_id')),
'width': formats_info[0].get('width'),
'height': formats_info[0].get('height'),
'resolution': formats_info[0].get('resolution'),
'fps': formats_info[0].get('fps'),
'vcodec': formats_info[0].get('vcodec'),
'vbr': formats_info[0].get('vbr'),
'stretched_ratio': formats_info[0].get('stretched_ratio'),
'acodec': formats_info[1].get('acodec'),
'abr': formats_info[1].get('abr'),
'ext': output_ext,
}
else:
selected_format = None
else:
selected_format = self.select_format(rf, formats)
if selected_format is not None:
formats_to_download.append(selected_format)
break
if not formats_to_download: if not formats_to_download:
raise ExtractorError('requested format not available', raise ExtractorError('requested format not available',
expected=True) expected=True)

@ -388,6 +388,10 @@ else:
pass pass
return _terminal_size(columns, lines) return _terminal_size(columns, lines)
if sys.version_info >= (3, 0):
from tokenize import tokenize as compat_tokenize_tokenize
else:
from tokenize import generate_tokens as compat_tokenize_tokenize
__all__ = [ __all__ = [
'compat_HTTPError', 'compat_HTTPError',
@ -408,6 +412,7 @@ __all__ = [
'compat_socket_create_connection', 'compat_socket_create_connection',
'compat_str', 'compat_str',
'compat_subprocess_get_DEVNULL', 'compat_subprocess_get_DEVNULL',
'compat_tokenize_tokenize',
'compat_urllib_error', 'compat_urllib_error',
'compat_urllib_parse', 'compat_urllib_parse',
'compat_urllib_parse_unquote', 'compat_urllib_parse_unquote',

Loading…
Cancel
Save