youtube-dl/youtube_dl/extractor/vier.py

# coding: utf-8
from __future__ import unicode_literals

import re
import itertools

from .common import InfoExtractor
from ..utils import (
    urlencode_postdata,
    int_or_none,
    unified_strdate,
)


class VierIE(InfoExtractor):
    """Extractor for single videos on vier.be and vijf.be.

    Accepts program video pages (with or without a trailing numeric id) and
    ``/video/v3/embed/<id>`` pages. When credentials are configured a login
    is attempted first; when a page still shows the login form, extraction
    falls back to the embed page, which yields formats but no metadata.
    """
    IE_NAME = 'vier'
    IE_DESC = 'vier.be and vijf.be'
    _VALID_URL = r'https?://(?:www\.)?(?P<site>vier|vijf)\.be/(?:[^/]+/videos/(?P<display_id>[^/]+)(?:/(?P<id>\d+))?|video/v3/embed/(?P<embed_id>\d+))'
    _NETRC_MACHINE = 'vier'
    _TESTS = [{
        'url': 'http://www.vier.be/planb/videos/het-wordt-warm-de-moestuin/16129',
        'md5': 'e4ae2054a6b040ef1e289e20d111b46e',
        'info_dict': {
            'id': '16129',
            'display_id': 'het-wordt-warm-de-moestuin',
            'ext': 'mp4',
            'title': 'Het wordt warm in De Moestuin',
            'description': 'De vele uren werk eisen hun tol. Wim droomt van assistentie...',
            'upload_date': '20121025',
        },
    }, {
        'url': 'http://www.vijf.be/temptationisland/videos/zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas/2561614',
        'info_dict': {
            'id': '2561614',
            'display_id': 'zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas',
            'ext': 'mp4',
            'title': 'md5:84f45fe48b8c1fa296a7f6d208d080a7',
            'description': 'md5:0356d4981e58b8cbee19355cbd51a8fe',
            'upload_date': '20170228',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839',
        'info_dict': {
            'id': '2674839',
            'display_id': 'jani-gaat-naar-tokio-aflevering-4',
            'ext': 'mp4',
            'title': 'Jani gaat naar Tokio - Aflevering 4',
            'description': 'md5:aa8d611541db6ae9e863125704511f88',
            'upload_date': '20170501',
            'episode_number': 4,
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'Requires account credentials',
    }, {
        # Requires account credentials but bypassed extraction via v3/embed page
        # without metadata
        'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839',
        'info_dict': {
            'id': '2674839',
            'display_id': 'jani-gaat-naar-tokio-aflevering-4',
            'ext': 'mp4',
            'title': 'jani-gaat-naar-tokio-aflevering-4',
        },
        'params': {
            'skip_download': True,
        },
        'expected_warnings': ['Log in to extract metadata'],
    }, {
        # Without video id in URL
        'url': 'http://www.vier.be/planb/videos/dit-najaar-plan-b',
        'only_matching': True,
    }, {
        'url': 'http://www.vier.be/video/v3/embed/16129',
        'only_matching': True,
    }]

    def _real_initialize(self):
        # Login is deferred to _real_extract because the login URL depends on
        # the site (vier/vijf), which is only known once a URL is matched.
        self._logged_in = False

    def _login(self, site):
        """Try to log in on the given site with the configured credentials.

        Does nothing when no username/password is available. On success
        ``self._logged_in`` is set to True; when the response contains an
        error message box, a warning is reported and the flag is left as is.
        """
        username, password = self._get_login_info()
        if username is None or password is None:
            return

        login_page = self._download_webpage(
            'http://www.%s.be/user/login' % site,
            None, note='Logging in', errnote='Unable to log in',
            data=urlencode_postdata({
                'form_id': 'user_login',
                'name': username,
                'pass': password,
            }),
            headers={'Content-Type': 'application/x-www-form-urlencoded'})

        # A failed login is detected by the error message box in the response.
        login_error = self._html_search_regex(
            r'(?s)<div class="messages error">\s*<div>\s*<h2.+?</h2>(.+?)<',
            login_page, 'login error', default=None)
        if login_error:
            self.report_warning('Unable to log in: %s' % login_error)
        else:
            self._logged_in = True

    def _real_extract(self, url):
        """Return the info dict for the video at ``url``.

        The stream location is assembled from the ``application`` and
        ``filename`` values found in the page; title, thumbnail, description,
        episode number and upload date are taken from the page when present.
        """
        mobj = re.match(self._VALID_URL, url)
        embed_id = mobj.group('embed_id')
        display_id = mobj.group('display_id') or embed_id
        # May be None: program video URLs are allowed to omit the numeric id.
        video_id = mobj.group('id') or embed_id
        site = mobj.group('site')

        if not self._logged_in:
            self._login(site)

        webpage = self._download_webpage(url, display_id)

        if 'id="user-login"' in webpage:
            # The page shows the login form instead of the video.
            self.report_warning(
                'Log in to extract metadata', video_id=display_id)
            # The embed page is addressed by numeric id, so the fallback is
            # only possible when the URL carried one; without this guard a
            # bogus ".../embed/None" page would be requested.
            if video_id:
                webpage = self._download_webpage(
                    'http://www.%s.be/video/v3/embed/%s' % (site, video_id),
                    display_id)

        # Each value is looked up first as an HTML data attribute, then as a
        # JSON-style key/value pair.
        video_id = self._search_regex(
            [r'data-nid="(\d+)"', r'"nid"\s*:\s*"(\d+)"'],
            webpage, 'video id', default=video_id or display_id)
        application = self._search_regex(
            [r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'],
            webpage, 'application', default=site + '_vod')
        # No default here: without a filename there is no stream to build.
        filename = self._search_regex(
            [r'data-filename="([^"]+)"', r'"filename"\s*:\s*"([^"]+)"'],
            webpage, 'filename')

        playlist_url = 'http://vod.streamcloud.be/%s/_definst_/mp4:%s.mp4/playlist.m3u8' % (application, filename)
        formats = self._extract_wowza_formats(playlist_url, display_id, skip_protocols=['dash'])
        self._sort_formats(formats)

        # Metadata lookups all carry defaults so that pages without metadata
        # (e.g. the embed fallback) still produce a result.
        title = self._og_search_title(webpage, default=display_id)
        thumbnail = self._og_search_thumbnail(webpage, default=None)
        description = self._html_search_regex(
            r'''(?x)<div\ class="[^"]*field-type-text-with-summary[^"]*">\s*
                      (?:<div\ class="[^"]+">\s*)*
                     <p>\s*(?:<span>)?(.+?)</''',
            webpage, 'description', default=None)
        episode_number = int_or_none(self._search_regex(
            r'(?i)aflevering (\d+)', title, 'episode_number', default=None))
        upload_date = unified_strdate(self._html_search_regex(
            r'''(?x)<div\ class="[^"]*field-name-post-date[^"]*">\s*
                    (?:<div\ class="[^"]+">\s*)*
                      (\d{2}/\d{2}/\d{4})''',
            webpage, 'upload_date', default=None))

        return {
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'description': description,
            'episode_number': episode_number,
            'upload_date': upload_date,
            'thumbnail': thumbnail,
            'formats': formats,
        }


class VierVideosIE(InfoExtractor):
    """Playlist extractor for a program's video listing on vier.be / vijf.be.

    Without a ``page`` query parameter every listing page is walked, starting
    at page 0, until a page no longer contains the ``>Meer<`` marker. With a
    ``page`` parameter only that single page is extracted and the playlist id
    gets a ``-page<N>`` suffix.
    """
    IE_NAME = 'vier:videos'
    _VALID_URL = r'https?://(?:www\.)?(?P<site>vier|vijf)\.be/(?P<program>[^/]+)/videos(?:\?.*\bpage=(?P<page>\d+)|$)'
    _TESTS = [{
        'url': 'http://www.vier.be/demoestuin/videos',
        'info_dict': {
            'id': 'demoestuin',
        },
        'playlist_mincount': 153,
    }, {
        'url': 'http://www.vijf.be/temptationisland/videos',
        'info_dict': {
            'id': 'temptationisland',
        },
        'playlist_mincount': 159,
    }, {
        'url': 'http://www.vier.be/demoestuin/videos?page=6',
        'info_dict': {
            'id': 'demoestuin-page6',
        },
        'playlist_mincount': 20,
    }, {
        'url': 'http://www.vier.be/demoestuin/videos?page=7',
        'info_dict': {
            'id': 'demoestuin-page7',
        },
        'playlist_mincount': 13,
    }]

    def _real_extract(self, url):
        """Return a playlist of url results pointing at the listed videos."""
        mobj = re.match(self._VALID_URL, url)
        program = mobj.group('program')
        site = mobj.group('site')

        requested_page = mobj.group('page')
        # Keep "a page was requested" separate from the page number itself:
        # page 0 is a valid request, and testing the integer for truthiness
        # would treat "?page=0" like "no page given" and walk every page.
        single_page = requested_page is not None
        if single_page:
            start_page = int(requested_page)
            playlist_id = '%s-page%d' % (program, start_page)
        else:
            start_page = 0
            playlist_id = program

        entries = []
        for current_page_id in itertools.count(start_page):
            current_page = self._download_webpage(
                'http://www.%s.be/%s/videos?page=%d' % (site, program, current_page_id),
                program,
                # Page numbers are zero-based in the URL, one-based in the log.
                'Downloading page %d' % (current_page_id + 1))
            entries.extend(
                self.url_result('http://www.' + site + '.be' + video_url, 'Vier')
                for video_url in re.findall(
                    r'<h[23]><a href="(/[^/]+/videos/[^/]+(?:/\d+)?)">', current_page))
            # Stop after the one requested page, or once the listing no
            # longer contains the ">Meer<" marker.
            if single_page or '>Meer<' not in current_page:
                break

        return self.playlist_result(entries, playlist_id)
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 9 years ago			`# coding: utf-8`
			`from __future__ import unicode_literals`
[vier] Add new extractor 9 years ago
			`import re`
[vier:videos] Fix extraction with old approach (Closes #6806) 9 years ago			`import itertools`
[vier] Add new extractor 9 years ago
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 9 years ago			`from .common import InfoExtractor`
[vier] Extract more info Extract the `episode_number` and `upload_date`. Also extract the real `description`. 7 years ago			`from ..utils import (`
			`urlencode_postdata,`
			`int_or_none,`
			`unified_strdate,`
			`)`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 9 years ago

			`class VierIE(InfoExtractor):`
			`IE_NAME = 'vier'`
[vier] Add IE_DESC 7 years ago			`IE_DESC = 'vier.be and vijf.be'`
[vier] Add support for vijf.be vier.be and vijf.be run on the same CMS and are property of the same company, so the same extractor can be used for both of them. 7 years ago			`_VALID_URL = r'https?://(?:www\.)?(?P<site>vier\|vijf)\.be/(?:[^/]+/videos/(?P<display_id>[^/]+)(?:/(?P<id>\d+))?\|video/v3/embed/(?P<embed_id>\d+))'`
[vier] Improve extraction + Add support for authentication * Bypass authentication when no credentials provded * Improve extraction robustness 7 years ago			`_NETRC_MACHINE = 'vier'`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 9 years ago			`_TESTS = [{`
			`'url': 'http://www.vier.be/planb/videos/het-wordt-warm-de-moestuin/16129',`
[vier] PEP 8 and cleanup 7 years ago			`'md5': 'e4ae2054a6b040ef1e289e20d111b46e',`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 9 years ago			`'info_dict': {`
			`'id': '16129',`
			`'display_id': 'het-wordt-warm-de-moestuin',`
			`'ext': 'mp4',`
			`'title': 'Het wordt warm in De Moestuin',`
			`'description': 'De vele uren werk eisen hun tol. Wim droomt van assistentie...',`
[vier] Extract more info Extract the `episode_number` and `upload_date`. Also extract the real `description`. 7 years ago			`'upload_date': '20121025',`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 9 years ago			`},`
[vier] Add support for vijf.be vier.be and vijf.be run on the same CMS and are property of the same company, so the same extractor can be used for both of them. 7 years ago			`}, {`
			`'url': 'http://www.vijf.be/temptationisland/videos/zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas/2561614',`
			`'info_dict': {`
			`'id': '2561614',`
			`'display_id': 'zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas',`
			`'ext': 'mp4',`
[vier] PEP 8 and cleanup 7 years ago			`'title': 'md5:84f45fe48b8c1fa296a7f6d208d080a7',`
			`'description': 'md5:0356d4981e58b8cbee19355cbd51a8fe',`
[vier] Extract more info Extract the `episode_number` and `upload_date`. Also extract the real `description`. 7 years ago			`'upload_date': '20170228',`
[vier] Add support for vijf.be vier.be and vijf.be run on the same CMS and are property of the same company, so the same extractor can be used for both of them. 7 years ago			`},`
			`'params': {`
			`'skip_download': True,`
			`},`
[vier] Improve extraction + Add support for authentication * Bypass authentication when no credentials provded * Improve extraction robustness 7 years ago			`}, {`
			`'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839',`
			`'info_dict': {`
			`'id': '2674839',`
			`'display_id': 'jani-gaat-naar-tokio-aflevering-4',`
			`'ext': 'mp4',`
			`'title': 'Jani gaat naar Tokio - Aflevering 4',`
[vier] Extract more info Extract the `episode_number` and `upload_date`. Also extract the real `description`. 7 years ago			`'description': 'md5:aa8d611541db6ae9e863125704511f88',`
			`'upload_date': '20170501',`
			`'episode_number': 4,`
[vier] Improve extraction + Add support for authentication * Bypass authentication when no credentials provded * Improve extraction robustness 7 years ago			`},`
			`'params': {`
			`'skip_download': True,`
			`},`
			`'skip': 'Requires account credentials',`
			`}, {`
[vier] PEP 8 and cleanup 7 years ago			`# Requires account credentials but bypassed extraction via v3/embed page`
			`# without metadata`
[vier] Improve extraction + Add support for authentication * Bypass authentication when no credentials provded * Improve extraction robustness 7 years ago			`'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839',`
			`'info_dict': {`
			`'id': '2674839',`
			`'display_id': 'jani-gaat-naar-tokio-aflevering-4',`
			`'ext': 'mp4',`
			`'title': 'jani-gaat-naar-tokio-aflevering-4',`
			`},`
			`'params': {`
			`'skip_download': True,`
			`},`
			`'expected_warnings': ['Log in to extract metadata'],`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 9 years ago			`}, {`
[vier] PEP 8 and cleanup 7 years ago			`# Without video id in URL`
			`'url': 'http://www.vier.be/planb/videos/dit-najaar-plan-b',`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 9 years ago			`'only_matching': True,`
			`}, {`
			`'url': 'http://www.vier.be/video/v3/embed/16129',`
			`'only_matching': True,`
			`}]`

[vier] Improve extraction + Add support for authentication * Bypass authentication when no credentials provded * Improve extraction robustness 7 years ago			`def _real_initialize(self):`
			`self._logged_in = False`

			`def _login(self, site):`
			`username, password = self._get_login_info()`
			`if username is None or password is None:`
			`return`

			`login_page = self._download_webpage(`
			`'http://www.%s.be/user/login' % site,`
			`None, note='Logging in', errnote='Unable to log in',`
			`data=urlencode_postdata({`
			`'form_id': 'user_login',`
			`'name': username,`
			`'pass': password,`
			`}),`
			`headers={'Content-Type': 'application/x-www-form-urlencoded'})`

			`login_error = self._html_search_regex(`
			`r'(?s)<div class="messages error">\s<div>\s<h2.+?</h2>(.+?)<',`
			`login_page, 'login error', default=None)`
			`if login_error:`
			`self.report_warning('Unable to log in: %s' % login_error)`
			`else:`
			`self._logged_in = True`

[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 9 years ago			`def _real_extract(self, url):`
			`mobj = re.match(self._VALID_URL, url)`
			`embed_id = mobj.group('embed_id')`
			`display_id = mobj.group('display_id') or embed_id`
[vier] Improve extraction + Add support for authentication * Bypass authentication when no credentials provded * Improve extraction robustness 7 years ago			`video_id = mobj.group('id') or embed_id`
[vier] Add support for vijf.be vier.be and vijf.be run on the same CMS and are property of the same company, so the same extractor can be used for both of them. 7 years ago			`site = mobj.group('site')`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 9 years ago
[vier] Improve extraction + Add support for authentication * Bypass authentication when no credentials provded * Improve extraction robustness 7 years ago			`if not self._logged_in:`
			`self._login(site)`

[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 9 years ago			`webpage = self._download_webpage(url, display_id)`

[vier] Improve extraction + Add support for authentication * Bypass authentication when no credentials provded * Improve extraction robustness 7 years ago			`if r'id="user-login"' in webpage:`
			`self.report_warning(`
			`'Log in to extract metadata', video_id=display_id)`
			`webpage = self._download_webpage(`
			`'http://www.%s.be/video/v3/embed/%s' % (site, video_id),`
			`display_id)`

[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 9 years ago			`video_id = self._search_regex(`
[vier] Fix extraction 9 years ago			`[r'data-nid="(\d+)"', r'"nid"\s:\s"(\d+)"'],`
[vier] PEP 8 and cleanup 7 years ago			`webpage, 'video id', default=video_id or display_id)`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 9 years ago			`application = self._search_regex(`
[vier] Fix extraction 9 years ago			`[r'data-application="([^"]+)"', r'"application"\s:\s"([^"]+)"'],`
[vier] Add support for vijf.be vier.be and vijf.be run on the same CMS and are property of the same company, so the same extractor can be used for both of them. 7 years ago			`webpage, 'application', default=site + '_vod')`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 9 years ago			`filename = self._search_regex(`
[vier] Fix extraction 9 years ago			`[r'data-filename="([^"]+)"', r'"filename"\s:\s"([^"]+)"'],`
			`webpage, 'filename')`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 9 years ago
[common] add helper method for Wowza Streaming Engine format extraction 8 years ago			`playlist_url = 'http://vod.streamcloud.be/%s/_definst_/mp4:%s.mp4/playlist.m3u8' % (application, filename)`
[extractor/common] try to extract non smil wowza mpd manifests 8 years ago			`formats = self._extract_wowza_formats(playlist_url, display_id, skip_protocols=['dash'])`
Remove _sort_formats from _extract_*_formats methods Now _sort_formats should be called explicitly. _sort_formats has been added to all the necessary places in code. Closes #8051 8 years ago			`self._sort_formats(formats)`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 9 years ago
			`title = self._og_search_title(webpage, default=display_id)`
			`thumbnail = self._og_search_thumbnail(webpage, default=None)`
[vier] Extract more info Extract the `episode_number` and `upload_date`. Also extract the real `description`. 7 years ago			`description = self._html_search_regex(`
			`r'''(?x)<div\ class="[^"]field-type-text-with-summary[^"]">\s*`
			`(?:<div\ class="[^"]+">\s)`
			`<p>\s*(?:<span>)?(.+?)</''',`
			`webpage, 'description', default=None)`
			`episode_number = int_or_none(self._search_regex(`
			`r'(?i)aflevering (\d+)', title, 'episode_number', default=None,`
			`fatal=False))`
			`upload_date = unified_strdate(self._html_search_regex(`
			`r'''(?x)<div\ class="[^"]field-name-post-date[^"]">\s*`
			`(?:<div\ class="[^"]+">\s)`
			`(\d{2}/\d{2}/\d{4})''',`
			`webpage, 'upload_date', default=None))`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 9 years ago
			`return {`
			`'id': video_id,`
			`'display_id': display_id,`
			`'title': title,`
			`'description': description,`
[vier] Extract more info Extract the `episode_number` and `upload_date`. Also extract the real `description`. 7 years ago			`'episode_number': episode_number,`
			`'upload_date': upload_date,`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 9 years ago			`'thumbnail': thumbnail,`
			`'formats': formats,`
[vier] Add new extractor 9 years ago			`}`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 9 years ago

			`class VierVideosIE(InfoExtractor):`
			`IE_NAME = 'vier:videos'`
[vier] Add support for vijf.be vier.be and vijf.be run on the same CMS and are property of the same company, so the same extractor can be used for both of them. 7 years ago			`_VALID_URL = r'https?://(?:www\.)?(?P<site>vier\|vijf)\.be/(?P<program>[^/]+)/videos(?:\?.*\bpage=(?P<page>\d+)\|$)'`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 9 years ago			`_TESTS = [{`
			`'url': 'http://www.vier.be/demoestuin/videos',`
			`'info_dict': {`
			`'id': 'demoestuin',`
			`},`
			`'playlist_mincount': 153,`
[vier] Add support for vijf.be vier.be and vijf.be run on the same CMS and are property of the same company, so the same extractor can be used for both of them. 7 years ago			`}, {`
			`'url': 'http://www.vijf.be/temptationisland/videos',`
			`'info_dict': {`
			`'id': 'temptationisland',`
			`},`
			`'playlist_mincount': 159,`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 9 years ago			`}, {`
			`'url': 'http://www.vier.be/demoestuin/videos?page=6',`
			`'info_dict': {`
			`'id': 'demoestuin-page6',`
			`},`
			`'playlist_mincount': 20,`
			`}, {`
			`'url': 'http://www.vier.be/demoestuin/videos?page=7',`
			`'info_dict': {`
			`'id': 'demoestuin-page7',`
			`},`
			`'playlist_mincount': 13,`
			`}]`

			`def _real_extract(self, url):`
			`mobj = re.match(self._VALID_URL, url)`
			`program = mobj.group('program')`
[vier] Add support for vijf.be vier.be and vijf.be run on the same CMS and are property of the same company, so the same extractor can be used for both of them. 7 years ago			`site = mobj.group('site')`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 9 years ago
			`page_id = mobj.group('page')`
			`if page_id:`
			`page_id = int(page_id)`
			`start_page = page_id`
			`playlist_id = '%s-page%d' % (program, page_id)`
			`else:`
			`start_page = 0`
			`playlist_id = program`

			`entries = []`
[vier:videos] Fix extraction with old approach (Closes #6806) 9 years ago			`for current_page_id in itertools.count(start_page):`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 9 years ago			`current_page = self._download_webpage(`
[vier] Add support for vijf.be vier.be and vijf.be run on the same CMS and are property of the same company, so the same extractor can be used for both of them. 7 years ago			`'http://www.%s.be/%s/videos?page=%d' % (site, program, current_page_id),`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 9 years ago			`program,`
[vier:videos] Fix extraction with old approach (Closes #6806) 9 years ago			`'Downloading page %d' % (current_page_id + 1))`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 9 years ago			`page_entries = [`
[vier] Add support for vijf.be vier.be and vijf.be run on the same CMS and are property of the same company, so the same extractor can be used for both of them. 7 years ago			`self.url_result('http://www.' + site + '.be' + video_url, 'Vier')`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 9 years ago			`for video_url in re.findall(`
[vier] Add support for vijf.be vier.be and vijf.be run on the same CMS and are property of the same company, so the same extractor can be used for both of them. 7 years ago			`r'<h[23]><a href="(/[^/]+/videos/[^/]+(?:/\d+)?)">', current_page)]`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 9 years ago			`entries.extend(page_entries)`
[vier:videos] Fix extraction with old approach (Closes #6806) 9 years ago			`if page_id or '>Meer<' not in current_page:`
			`break`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 9 years ago
			`return self.playlist_result(entries, playlist_id)`