import base64 import datetime import itertools import netrc import os import re import socket import time import email.utils import xml.etree.ElementTree import random import math import operator import hashlib import binascii import urllib from .utils import * from .extractor.common import InfoExtractor, SearchInfoExtractor from .extractor.ard import ARDIE from .extractor.arte import ArteTvIE from .extractor.bliptv import BlipTVIE, BlipTVUserIE from .extractor.comedycentral import ComedyCentralIE from .extractor.collegehumor import CollegeHumorIE from .extractor.dailymotion import DailymotionIE from .extractor.depositfiles import DepositFilesIE from .extractor.eighttracks import EightTracksIE from .extractor.escapist import EscapistIE from .extractor.facebook import FacebookIE from .extractor.funnyordie import FunnyOrDieIE from .extractor.gametrailers import GametrailersIE from .extractor.generic import GenericIE from .extractor.googleplus import GooglePlusIE from .extractor.googlesearch import GoogleSearchIE from .extractor.infoq import InfoQIE from .extractor.justintv import JustinTVIE from .extractor.keek import KeekIE from .extractor.liveleak import LiveLeakIE from .extractor.metacafe import MetacafeIE from .extractor.mixcloud import MixcloudIE from .extractor.mtv import MTVIE from .extractor.myspass import MySpassIE from .extractor.myvideo import MyVideoIE from .extractor.nba import NBAIE from .extractor.statigram import StatigramIE from .extractor.photobucket import PhotobucketIE from .extractor.pornotube import PornotubeIE from .extractor.rbmaradio import RBMARadioIE from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE from .extractor.spiegel import SpiegelIE from .extractor.stanfordoc import StanfordOpenClassroomIE from .extractor.steam import SteamIE from .extractor.ted import TEDIE from .extractor.ustream import UstreamIE from .extractor.vimeo import VimeoIE from .extractor.worldstarhiphop import WorldStarHipHopIE from .extractor.xnxx import XNXXIE from .extractor.xvideos import XVideosIE from .extractor.yahoo import YahooIE, YahooSearchIE from .extractor.youjizz import YouJizzIE from .extractor.youku import YoukuIE from .extractor.youporn import YouPornIE from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE from .extractor.zdf import ZDFIE class TumblrIE(InfoExtractor): _VALID_URL = r'http://(?P.*?)\.tumblr\.com/((post)|(video))/(?P\d*)/(.*?)' def _real_extract(self, url): m_url = re.match(self._VALID_URL, url) video_id = m_url.group('id') blog = m_url.group('blog_name') url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id) webpage = self._download_webpage(url, video_id) re_video = r'src=\\x22(?Phttp://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P.*?)\\x22' % (blog, video_id) video = re.search(re_video, webpage) if video is None: raise ExtractorError(u'Unable to extract video') video_url = video.group('video_url') ext = video.group('ext') video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P.*?)\\x22', webpage, u'thumbnail', fatal=False) # We pick the first poster if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '') # The only place where you can get a title, it's not complete, # but searching in other places doesn't work for all videos video_title = self._html_search_regex(r'(?P<title>.*?)', webpage, u'title', flags=re.DOTALL) return [{'id': video_id, 'url': video_url, 'title': video_title, 'thumbnail': video_thumbnail, 'ext': ext }] class BandcampIE(InfoExtractor): _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P.*)' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) title = mobj.group('title') webpage = self._download_webpage(url, title) # We get the link to the free download page m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage) if m_download is None: raise ExtractorError(u'No free songs found') download_link = m_download.group(1) id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$', webpage, re.MULTILINE|re.DOTALL).group('id') download_webpage = self._download_webpage(download_link, id, 'Downloading free downloads page') # We get the dictionary of the track from some javascrip code info = re.search(r'items: (.*?),$', download_webpage, re.MULTILINE).group(1) info = json.loads(info)[0] # We pick mp3-320 for now, until format selection can be easily implemented. mp3_info = info[u'downloads'][u'mp3-320'] # If we try to use this url it says the link has expired initial_url = mp3_info[u'url'] re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$' m_url = re.match(re_url, initial_url) #We build the url we will use to get the final track url # This url is build in Bandcamp in the script download_bunde_*.js request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts')) final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url') # If we could correctly generate the .rand field the url would be #in the "download_url" key final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1) track_info = {'id':id, 'title' : info[u'title'], 'ext' : 'mp3', 'url' : final_url, 'thumbnail' : info[u'thumb_url'], 'uploader' : info[u'artist'] } return [track_info] class RedTubeIE(InfoExtractor): """Information Extractor for redtube""" _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)' def _real_extract(self,url): mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group('id') video_extension = 'mp4' webpage = self._download_webpage(url, video_id) self.report_extraction(video_id) video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">', webpage, u'video URL') video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>', webpage, u'title') return [{ 'id': video_id, 'url': video_url, 'ext': video_extension, 'title': video_title, }] class InaIE(InfoExtractor): """Information Extractor for Ina.fr""" _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*' def _real_extract(self,url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id video_extension = 'mp4' webpage = self._download_webpage(mrss_url, video_id) self.report_extraction(video_id) video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)', webpage, u'video URL') video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]>', webpage, u'title') return [{ 'id': video_id, 'url': video_url, 'ext': video_extension, 'title': video_title, }] class HowcastIE(InfoExtractor): """Information Extractor for Howcast.com""" _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P\d+)' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') webpage_url = 'http://www.howcast.com/videos/' + video_id webpage = self._download_webpage(webpage_url, video_id) self.report_extraction(video_id) video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)', webpage, u'video URL') video_title = self._html_search_regex(r'\w+)' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') webpage_url = 'https://vine.co/v/' + video_id webpage = self._download_webpage(webpage_url, video_id) self.report_extraction(video_id) video_url = self._html_search_regex(r'.*?

(.+?)

', webpage, u'uploader', fatal=False, flags=re.DOTALL) return [{ 'id': video_id, 'url': video_url, 'ext': 'mp4', 'title': video_title, 'thumbnail': thumbnail, 'uploader': uploader, }] class FlickrIE(InfoExtractor): """Information Extractor for Flickr videos""" _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P[\w\-_@]+)/(?P\d+).*' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') video_uploader_id = mobj.group('uploader_id') webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id webpage = self._download_webpage(webpage_url, video_id) secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret') first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self' first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage') node_id = self._html_search_regex(r'(\d+-\d+)', first_xml, u'node_id') second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1' second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage') self.report_extraction(video_id) mobj = re.search(r'.*)' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) url_title = mobj.group('url_title') webpage = self._download_webpage(url, url_title) video_id = self._html_search_regex(r'
(.*?)', data, u'video URL') return [{ 'id': video_id, 'url': video_url, 'ext': 'mp4', 'title': video_title, 'thumbnail': thumbnail, 'description': video_description, }] class XHamsterIE(InfoExtractor): """Information Extractor for xHamster""" _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P[0-9]+)/.*\.html' def _real_extract(self,url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id webpage = self._download_webpage(mrss_url, video_id) mobj = re.search(r'\'srv\': \'(?P[^\']*)\',\s*\'file\': \'(?P[^\']+)\',', webpage) if mobj is None: raise ExtractorError(u'Unable to extract media URL') if len(mobj.group('server')) == 0: video_url = compat_urllib_parse.unquote(mobj.group('file')) else: video_url = mobj.group('server')+'/key='+mobj.group('file') video_extension = video_url.split('.')[-1] video_title = self._html_search_regex(r'(?P<title>.+?) - xHamster\.com', webpage, u'title') # Can't see the description anywhere in the UI # video_description = self._html_search_regex(r'Description: (?P[^<]+)', # webpage, u'description', fatal=False) # if video_description: video_description = unescapeHTML(video_description) mobj = re.search(r'hint=\'(?P[0-9]{4})-(?P[0-9]{2})-(?P[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage) if mobj: video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d') else: video_upload_date = None self._downloader.report_warning(u'Unable to extract upload date') video_uploader_id = self._html_search_regex(r']+>(?P[^<]+)', webpage, u'uploader id', default=u'anonymous') video_thumbnail = self._search_regex(r'\'image\':\'(?P[^\']+)\'', webpage, u'thumbnail', fatal=False) return [{ 'id': video_id, 'url': video_url, 'ext': video_extension, 'title': video_title, # 'description': video_description, 'upload_date': video_upload_date, 'uploader_id': video_uploader_id, 'thumbnail': video_thumbnail }] class HypemIE(InfoExtractor): """Information Extractor for hypem""" _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) track_id = mobj.group(1) data = { 'ax': 1, 'ts': time.time() } data_encoded = compat_urllib_parse.urlencode(data) complete_url = url + "?" + data_encoded request = compat_urllib_request.Request(complete_url) response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url') cookie = urlh.headers.get('Set-Cookie', '') self.report_extraction(track_id) html_tracks = self._html_search_regex(r'', response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip() try: track_list = json.loads(html_tracks) track = track_list[u'tracks'][0] except ValueError: raise ExtractorError(u'Hypemachine contained invalid JSON.') key = track[u"key"] track_id = track[u"id"] artist = track[u"artist"] title = track[u"song"] serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key)) request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'}) request.add_header('cookie', cookie) song_data_json = self._download_webpage(request, track_id, u'Downloading metadata') try: song_data = json.loads(song_data_json) except ValueError: raise ExtractorError(u'Hypemachine contained invalid JSON.') final_url = song_data[u"url"] return [{ 'id': track_id, 'url': final_url, 'ext': "mp3", 'title': title, 'artist': artist, }] class Vbox7IE(InfoExtractor): """Information Extractor for Vbox7""" _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)' def _real_extract(self,url): mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group(1) redirect_page, urlh = self._download_webpage_handle(url, video_id) new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location') redirect_url = urlh.geturl() + new_location webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page') title = self._html_search_regex(r'(.*)', webpage, u'title').split('/')[0].strip() ext = "flv" info_url = "http://vbox7.com/play/magare.do" data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id}) info_request = compat_urllib_request.Request(info_url, data) info_request.add_header('Content-Type', 'application/x-www-form-urlencoded') info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage') if info_response is None: raise ExtractorError(u'Unable to extract the media url') (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&')) return [{ 'id': video_id, 'url': final_url, 'ext': ext, 'title': title, 'thumbnail': thumbnail_url, }] def gen_extractors(): """ Return a list of an instance of every supported extractor. The order does matter; the first extractor matched is the one handling the URL. """ return [ YoutubePlaylistIE(), YoutubeChannelIE(), YoutubeUserIE(), YoutubeSearchIE(), YoutubeIE(), MetacafeIE(), DailymotionIE(), GoogleSearchIE(), PhotobucketIE(), YahooIE(), YahooSearchIE(), DepositFilesIE(), FacebookIE(), BlipTVIE(), BlipTVUserIE(), VimeoIE(), MyVideoIE(), ComedyCentralIE(), EscapistIE(), CollegeHumorIE(), XVideosIE(), SoundcloudSetIE(), SoundcloudIE(), InfoQIE(), MixcloudIE(), StanfordOpenClassroomIE(), MTVIE(), YoukuIE(), XNXXIE(), YouJizzIE(), PornotubeIE(), YouPornIE(), GooglePlusIE(), ArteTvIE(), NBAIE(), WorldStarHipHopIE(), JustinTVIE(), FunnyOrDieIE(), SteamIE(), UstreamIE(), RBMARadioIE(), EightTracksIE(), KeekIE(), TEDIE(), MySpassIE(), SpiegelIE(), LiveLeakIE(), ARDIE(), ZDFIE(), TumblrIE(), BandcampIE(), RedTubeIE(), InaIE(), HowcastIE(), VineIE(), FlickrIE(), TeamcocoIE(), XHamsterIE(), HypemIE(), Vbox7IE(), GametrailersIE(), StatigramIE(), GenericIE() ] def get_info_extractor(ie_name): """Returns the info extractor class with the given ie_name""" return globals()[ie_name+'IE']