youtube-dl/youtube_dl/extractor/yahoo.py

from __future__ import unicode_literals

import itertools
import json
import re

from .common import InfoExtractor, SearchInfoExtractor
from ..utils import (
    compat_urllib_parse,
    compat_urlparse,
    clean_html,
    int_or_none,
)


class YahooIE(InfoExtractor):
    """Extractor for video pages on screen.yahoo.com and movies.yahoo.com."""

    IE_DESC = 'Yahoo screen and movies'
    # The 'url' group drops anything after ".html" (e.g. "?format=embed"),
    # so the canonical page is the one that gets downloaded.
    _VALID_URL = r'(?P<url>https?://(?:screen|movies)\.yahoo\.com/.*?-(?P<id>[0-9]+)(?:-[a-z]+)?\.html)'
    _TESTS = [
        {
            'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
            'md5': '4962b075c08be8690a922ee026d05e69',
            'info_dict': {
                'id': '2d25e626-2378-391f-ada0-ddaf1417e588',
                'ext': 'mp4',
                'title': 'Julian Smith & Travis Legg Watch Julian Smith',
                'description': 'Julian and Travis watch Julian Smith',
            },
        },
        {
            'url': 'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html',
            'md5': 'd6e6fc6e1313c608f316ddad7b82b306',
            'info_dict': {
                'id': 'd1dedf8c-d58c-38c3-8963-e899929ae0a9',
                'ext': 'mp4',
                'title': 'Codefellas - The Cougar Lies with Spanish Moss',
                'description': 'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?',
            },
        },
        {
            'url': 'https://screen.yahoo.com/community/community-sizzle-reel-203225340.html?format=embed',
            'md5': '60e8ac193d8fb71997caa8fce54c6460',
            'info_dict': {
                'id': '4fe78544-8d48-39d8-97cd-13f205d9fcdb',
                'ext': 'mp4',
                'title': "Yahoo Saves 'Community'",
                'description': 'md5:4d4145af2fd3de00cbb6c1d664105053',
            },
        },
    ]

    def _real_extract(self, url):
        """Find the long content id of the page at *url* and extract the video.

        The id is taken from the embedded ``mediaItems`` JSON when the page
        has one; otherwise the page is scanned with a list of content-id
        patterns. In the latter case the long content id is also used as
        the id of the returned info dict, in the former case the numeric
        id from the URL is used.
        """
        url_match = re.match(self._VALID_URL, url)
        page_id = url_match.group('id')
        page_url = url_match.group('url')
        webpage = self._download_webpage(page_url, page_id)

        media_items_json = self._search_regex(
            r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE,
            default=None)
        if media_items_json is not None:
            media_items = json.loads(media_items_json)
            media_obj = media_items['mediaItems']['query']['results']['mediaObj'][0]
            return self._get_info(media_obj['id'], page_id, webpage)

        # No mediaItems blob on the page: fall back to the known spots
        # where the content id is written.
        content_id_patterns = [
            r'YUI\.namespace\("Media"\)\.CONTENT_ID\s*=\s*"([^"]+)"',
            r'root\.App\.Cache\.context\.videoCache\.curVideo = \{"([^"]+)"',
            r'"first_videoid"\s*:\s*"([^"]+)"',
        ]
        content_id = self._search_regex(
            content_id_patterns, webpage, 'content ID')
        return self._get_info(content_id, content_id, webpage)

    def _get_info(self, long_id, video_id, webpage):
        """Build the info dict for the video identified by *long_id*.

        The 'meta' field is not always in the video webpage, so metadata and
        streams are requested from the YQL endpoint instead.

        long_id  -- content id placed in the YQL query
        video_id -- id reported in the result (also passed to the download
                    helper together with the note)
        webpage  -- HTML of the video page, used only as a thumbnail
                    fallback when the metadata has none
        """
        yql = ('SELECT * FROM yahoo.media.video.streams WHERE id="%s"'
               ' AND plrs="86Gj0vCaSzV_Iuf6hNylf2" AND region="US"'
               ' AND protocol="http"' % long_id)
        query_string = compat_urllib_parse.urlencode({
            'q': yql,
            'env': 'prod',
            'format': 'json',
        })
        response = self._download_json(
            'http://video.query.yahoo.com/v1/public/yql?' + query_string,
            video_id, 'Downloading video info')
        media_obj = response['query']['results']['mediaObj'][0]
        meta = media_obj['meta']

        formats = []
        for stream in media_obj['streams']:
            fmt = {
                'width': int_or_none(stream.get('width')),
                'height': int_or_none(stream.get('height')),
                'tbr': int_or_none(stream.get('bitrate')),
            }

            host = stream['host']
            path = stream['path']
            if not host.startswith('rtmp'):
                fmt['url'] = compat_urlparse.urljoin(host, path)
            else:
                # For rtmp streams the host is the URL and the path is
                # handed over separately as the play path.
                fmt['url'] = host
                fmt['play_path'] = path
                fmt['ext'] = 'flv'
            formats.append(fmt)

        self._sort_formats(formats)

        title = meta['title']
        description = clean_html(meta['description'])
        # Only look at the page's og: thumbnail when the metadata has none.
        thumbnail = meta.get('thumbnail') or self._og_search_thumbnail(webpage)

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'description': description,
            'thumbnail': thumbnail,
        }


class YahooNewsIE(YahooIE):
    """Extractor for news.yahoo.com video pages, built on YahooIE._get_info."""

    IE_NAME = 'yahoo:news'
    _VALID_URL = r'http://news\.yahoo\.com/video/.*?-(?P<id>\d*?)\.html'

    _TESTS = [
        {
            'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html',
            'md5': '67010fdf3a08d290e060a4dd96baa07b',
            'info_dict': {
                'id': '104538833',
                'ext': 'mp4',
                'title': 'China Moses Is Crazy About the Blues',
                'description': 'md5:9900ab8cd5808175c7b3fe55b979bed0',
            },
        },
    ]

    def _real_extract(self, url):
        """Extract the video of the news page at *url*.

        The numeric id from the URL is kept as the id of the result, while
        the ``contentId`` value found in the page is what gets looked up by
        ``_get_info``.
        """
        video_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(url, video_id)
        content_id = self._search_regex(
            r'contentId: \'(.+?)\',', webpage, 'long id')
        return self._get_info(content_id, video_id, webpage)


class YahooSearchIE(SearchInfoExtractor):
    """Search extractor returning Yahoo Screen videos for a text query."""

    IE_DESC = 'Yahoo screen search'
    _MAX_RESULTS = 1000
    IE_NAME = 'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'
    # Step by which the "b" (start offset) query parameter advances from
    # one request to the next.
    _PAGE_SIZE = 30

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query

        Result pages are downloaded one after another until *n* entries
        have been collected, a page comes back empty, or the paging data
        of the response ('m') says the last result has been reached.

        Returns a playlist dict whose id is the query and whose entries
        are url results pointing at the 'Yahoo' extractor.
        """
        entries = []
        for pagenum in itertools.count(0):
            result_url = 'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (
                compat_urllib_parse.quote_plus(query),
                pagenum * self._PAGE_SIZE)
            info = self._download_json(
                result_url, query,
                note='Downloading results page ' + str(pagenum + 1))
            m = info['m']
            results = info['results']

            for r in results:
                if len(entries) >= n:
                    break
                mobj = re.search(
                    r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                if mobj is None:
                    # Result without a screen.yahoo.com link: nothing to
                    # hand to the video extractor, so skip it.
                    continue
                entries.append(
                    self.url_result('http://' + mobj.group('url'), 'Yahoo'))

            # Decide on the number of collected entries rather than on the
            # loop index: the index is unbound when the page is empty and
            # is one short of the count once a full page was consumed
            # (which used to trigger one needless extra request).
            if len(entries) >= n or not results:
                break
            if m['last'] >= (m['total'] - 1):
                break

        return {
            '_type': 'playlist',
            'id': query,
            'entries': entries,
        }
[yahoo] use unicode_literals 10 years ago			`from __future__ import unicode_literals`

Move YahooSearchIE to youtube_dl.extractor.yahoo 11 years ago			`import itertools`
Remove unused imports 10 years ago			`import json`
Move yahoo into its own file 11 years ago			`import re`

Move YahooSearchIE to youtube_dl.extractor.yahoo 11 years ago			`from .common import InfoExtractor, SearchInfoExtractor`
Move yahoo into its own file 11 years ago			`from ..utils import (`
Move YahooSearchIE to youtube_dl.extractor.yahoo 11 years ago			`compat_urllib_parse,`
[yahoo] Fix video extraction (fixes #1521) There's no need to use two different methods. Now we can also download videos over http if possible. Also run the test for rtmp videos, but skip the download. 11 years ago			`compat_urlparse,`
			`clean_html,`
[yahoo] Use centralized sorting, and add tbr field 11 years ago			`int_or_none,`
Move yahoo into its own file 11 years ago			`)`

[yahoo] Fix video extraction (fixes #1521) There's no need to use two different methods. Now we can also download videos over http if possible. Also run the test for rtmp videos, but skip the download. 11 years ago
Move yahoo into its own file 11 years ago			`class YahooIE(InfoExtractor):`
[yahoo] Add support for movies (Fixes #2780) 10 years ago			`IE_DESC = 'Yahoo screen and movies'`
[yahoo] Add support for embedded videos (Closes #3525) 10 years ago			`_VALID_URL = r'(?P<url>https?://(?:screen\|movies)\.yahoo\.com/.*?-(?P<id>[0-9]+)(?:-[a-z]+)?\.html)'`
[yahoo] Fix video extraction (fixes #1521) There's no need to use two different methods. Now we can also download videos over http if possible. Also run the test for rtmp videos, but skip the download. 11 years ago			`_TESTS = [`
			`{`
[yahoo] use unicode_literals 10 years ago			`'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',`
			`'md5': '4962b075c08be8690a922ee026d05e69',`
			`'info_dict': {`
[yahoo] Improve content id extraction 10 years ago			`'id': '2d25e626-2378-391f-ada0-ddaf1417e588',`
[yahoo] Modernize 10 years ago			`'ext': 'mp4',`
[yahoo] use unicode_literals 10 years ago			`'title': 'Julian Smith & Travis Legg Watch Julian Smith',`
			`'description': 'Julian and Travis watch Julian Smith',`
[yahoo] Fix video extraction (fixes #1521) There's no need to use two different methods. Now we can also download videos over http if possible. Also run the test for rtmp videos, but skip the download. 11 years ago			`},`
			`},`
			`{`
[yahoo] use unicode_literals 10 years ago			`'url': 'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html',`
			`'md5': 'd6e6fc6e1313c608f316ddad7b82b306',`
			`'info_dict': {`
[yahoo] Improve content id extraction 10 years ago			`'id': 'd1dedf8c-d58c-38c3-8963-e899929ae0a9',`
[yahoo] Modernize 10 years ago			`'ext': 'mp4',`
[yahoo] use unicode_literals 10 years ago			`'title': 'Codefellas - The Cougar Lies with Spanish Moss',`
			`'description': 'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?',`
[yahoo] Fix video extraction (fixes #1521) There's no need to use two different methods. Now we can also download videos over http if possible. Also run the test for rtmp videos, but skip the download. 11 years ago			`},`
Move tests to the IE definitions 11 years ago			`},`
[yahoo] Add support for embedded videos (Closes #3525) 10 years ago			`{`
			`'url': 'https://screen.yahoo.com/community/community-sizzle-reel-203225340.html?format=embed',`
			`'md5': '60e8ac193d8fb71997caa8fce54c6460',`
			`'info_dict': {`
			`'id': '4fe78544-8d48-39d8-97cd-13f205d9fcdb',`
			`'ext': 'mp4',`
			`'title': "Yahoo Saves 'Community'",`
			`'description': 'md5:4d4145af2fd3de00cbb6c1d664105053',`
			`}`
			`},`
[yahoo] Fix video extraction (fixes #1521) There's no need to use two different methods. Now we can also download videos over http if possible. Also run the test for rtmp videos, but skip the download. 11 years ago			`]`
Move yahoo into its own file 11 years ago
			`def _real_extract(self, url):`
			`mobj = re.match(self._VALID_URL, url)`
			`video_id = mobj.group('id')`
[yahoo] Add support for embedded videos (Closes #3525) 10 years ago			`url = mobj.group('url')`
Move yahoo into its own file 11 years ago			`webpage = self._download_webpage(url, video_id)`

[yahoo] Add support for movies (Fixes #2780) 10 years ago			`items_json = self._search_regex(`
			`r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE,`
			`default=None)`
			`if items_json is None:`
[yahoo] Improve content id extraction 10 years ago			`CONTENT_ID_REGEXES = [`
[yahoo] Add support for movies (Fixes #2780) 10 years ago			`r'YUI\.namespace\("Media"\)\.CONTENT_ID\s=\s"([^"]+)"',`
[yahoo] Add one more pattern for content id 10 years ago			`r'root\.App\.Cache\.context\.videoCache\.curVideo = \{"([^"]+)"',`
			`r'"first_videoid"\s:\s"([^"]+)"',`
[yahoo] Improve content id extraction 10 years ago			`]`
			`long_id = self._search_regex(CONTENT_ID_REGEXES, webpage, 'content ID')`
[yahoo] Add support for movies (Fixes #2780) 10 years ago			`video_id = long_id`
			`else:`
			`items = json.loads(items_json)`
			`info = items['mediaItems']['query']['results']['mediaObj'][0]`
			`# The 'meta' field is not always in the video webpage, we request it`
			`# from another page`
			`long_id = info['id']`
[yahoo] improve thumbnail extraction 10 years ago			`return self._get_info(long_id, video_id, webpage)`
[yahoo] Add an extractor for yahoo news (closes #1849) 11 years ago
[yahoo] improve thumbnail extraction 10 years ago			`def _get_info(self, long_id, video_id, webpage):`
[yahoo] Download the info from another page The 'meta' field is not always in the video webpage 11 years ago			`query = ('SELECT * FROM yahoo.media.video.streams WHERE id="%s"'`
[yahoo] Force use of the http protocol for downloading the videos. 11 years ago			`' AND plrs="86Gj0vCaSzV_Iuf6hNylf2" AND region="US"'`
			`' AND protocol="http"' % long_id)`
[yahoo] Download the info from another page The 'meta' field is not always in the video webpage 11 years ago			`data = compat_urllib_parse.urlencode({`
			`'q': query,`
			`'env': 'prod',`
			`'format': 'json',`
			`})`
[yahoo] Modernize 10 years ago			`query_result = self._download_json(`
[yahoo] Download the info from another page The 'meta' field is not always in the video webpage 11 years ago			`'http://video.query.yahoo.com/v1/public/yql?' + data,`
[yahoo] use unicode_literals 10 years ago			`video_id, 'Downloading video info')`
[yahoo] Download the info from another page The 'meta' field is not always in the video webpage 11 years ago			`info = query_result['query']['results']['mediaObj'][0]`
[yahoo] Fix video extraction (fixes #1521) There's no need to use two different methods. Now we can also download videos over http if possible. Also run the test for rtmp videos, but skip the download. 11 years ago			`meta = info['meta']`

			`formats = []`
			`for s in info['streams']:`
			`format_info = {`
[yahoo] Use centralized sorting, and add tbr field 11 years ago			`'width': int_or_none(s.get('width')),`
			`'height': int_or_none(s.get('height')),`
			`'tbr': int_or_none(s.get('bitrate')),`
[yahoo] Fix video extraction (fixes #1521) There's no need to use two different methods. Now we can also download videos over http if possible. Also run the test for rtmp videos, but skip the download. 11 years ago			`}`

			`host = s['host']`
			`path = s['path']`
			`if host.startswith('rtmp'):`
			`format_info.update({`
			`'url': host,`
			`'play_path': path,`
			`'ext': 'flv',`
			`})`
			`else:`
			`format_url = compat_urlparse.urljoin(host, path)`
			`format_info['url'] = format_url`
			`formats.append(format_info)`
[yahoo] Use centralized sorting, and add tbr field 11 years ago
			`self._sort_formats(formats)`
[yahoo] Fix video extraction (fixes #1521) There's no need to use two different methods. Now we can also download videos over http if possible. Also run the test for rtmp videos, but skip the download. 11 years ago
[yahoo] Fix video extraction and use the new format system exclusively 11 years ago			`return {`
[yahoo] Fix video extraction (fixes #1521) There's no need to use two different methods. Now we can also download videos over http if possible. Also run the test for rtmp videos, but skip the download. 11 years ago			`'id': video_id,`
			`'title': meta['title'],`
			`'formats': formats,`
			`'description': clean_html(meta['description']),`
[yahoo] improve thumbnail extraction 10 years ago			`'thumbnail': meta['thumbnail'] if meta.get('thumbnail') else self._og_search_thumbnail(webpage),`
[yahoo] Fix video extraction (fixes #1521) There's no need to use two different methods. Now we can also download videos over http if possible. Also run the test for rtmp videos, but skip the download. 11 years ago			`}`
Move yahoo into its own file 11 years ago
Move YahooSearchIE to youtube_dl.extractor.yahoo 11 years ago
[yahoo] Add an extractor for yahoo news (closes #1849) 11 years ago			`class YahooNewsIE(YahooIE):`
			`IE_NAME = 'yahoo:news'`
			`_VALID_URL = r'http://news\.yahoo\.com/video/.?-(?P<id>\d?)\.html'`

[test_all_urls] Add support for distributed URL matching test definition 10 years ago			`_TESTS = [{`
[yahoo] use unicode_literals 10 years ago			`'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html',`
			`'md5': '67010fdf3a08d290e060a4dd96baa07b',`
			`'info_dict': {`
			`'id': '104538833',`
			`'ext': 'mp4',`
			`'title': 'China Moses Is Crazy About the Blues',`
			`'description': 'md5:9900ab8cd5808175c7b3fe55b979bed0',`
[yahoo] Add an extractor for yahoo news (closes #1849) 11 years ago			`},`
[test_all_urls] Add support for distributed URL matching test definition 10 years ago			`}]`
[yahoo] Add an extractor for yahoo news (closes #1849) 11 years ago
			`def _real_extract(self, url):`
			`mobj = re.match(self._VALID_URL, url)`
			`video_id = mobj.group('id')`
			`webpage = self._download_webpage(url, video_id)`
[yahoo] use unicode_literals 10 years ago			`long_id = self._search_regex(r'contentId: \'(.+?)\',', webpage, 'long id')`
[yahoo] improve thumbnail extraction 10 years ago			`return self._get_info(long_id, video_id, webpage)`
[yahoo] Add an extractor for yahoo news (closes #1849) 11 years ago

Move YahooSearchIE to youtube_dl.extractor.yahoo 11 years ago			`class YahooSearchIE(SearchInfoExtractor):`
[yahoo] use unicode_literals 10 years ago			`IE_DESC = 'Yahoo screen search'`
Move YahooSearchIE to youtube_dl.extractor.yahoo 11 years ago			`_MAX_RESULTS = 1000`
[yahoo] use unicode_literals 10 years ago			`IE_NAME = 'screen.yahoo:search'`
Move YahooSearchIE to youtube_dl.extractor.yahoo 11 years ago			`_SEARCH_KEY = 'yvsearch'`

			`def _get_n_results(self, query, n):`
			`"""Get a specified number of results for a query"""`
[yahoo] Modernize 10 years ago			`entries = []`
			`for pagenum in itertools.count(0):`
[yahoo] use unicode_literals 10 years ago			`result_url = 'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)`
[yahoo] Modernize 10 years ago			`info = self._download_json(result_url, query,`
			`note='Downloading results page '+str(pagenum+1))`
[yahoo] use unicode_literals 10 years ago			`m = info['m']`
			`results = info['results']`
Move YahooSearchIE to youtube_dl.extractor.yahoo 11 years ago
			`for (i, r) in enumerate(results):`
[yahoo] Modernize 10 years ago			`if (pagenum * 30) + i >= n:`
Move YahooSearchIE to youtube_dl.extractor.yahoo 11 years ago			`break`
			`mobj = re.search(r'(?P<url>screen\.yahoo\.com/.?-\d?\.html)"', r)`
			`e = self.url_result('http://' + mobj.group('url'), 'Yahoo')`
[yahoo] Modernize 10 years ago			`entries.append(e)`
			`if (pagenum * 30 + i >= n) or (m['last'] >= (m['total'] - 1)):`
Move YahooSearchIE to youtube_dl.extractor.yahoo 11 years ago			`break`

[yahoo] Modernize 10 years ago			`return {`
			`'_type': 'playlist',`
			`'id': query,`
			`'entries': entries,`
			`}`