Split Thumbzilla out into its own extractor.

This simplifies the pornhub extractor and makes it easier to maintain in the future.
4 years ago · 87f50e3feb
parent d332ec725d
commit 87f50e3feb
3 changed files with 63 additions and 7 deletions
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -1129,6 +1129,7 @@ from .thisamericanlife import ThisAmericanLifeIE
 from .thisav import ThisAVIE
 from .thisoldhouse import ThisOldHouseIE
 from .threeqsdn import ThreeQSDNIE
+from .thumbzilla import ThumbzillaIE
 from .tiktok import (
    TikTokIE,
    TikTokUserIE,
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -51,10 +51,7 @@ class PornHubIE(PornHubBaseIE):
    IE_DESC = 'PornHub and Thumbzilla'
    _VALID_URL = r'''(?x)
                    https?://
-                        (?:
-                            (?:[^/]+\.)?(?P<host>pornhub\.(?:com|net))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
-                            (?:www\.)?thumbzilla\.com/video/
-                        )
+                        (?:[^/]+\.)?(?P<host>pornhub\.(?:com|net))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)
                        (?P<id>[\da-z]+)
                    '''
    _TESTS = [{
@@ -140,9 +137,6 @@ class PornHubIE(PornHubBaseIE):
        # private video
        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph56fd731fce6b7',
        'only_matching': True,
-    }, {
-        'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex',
-        'only_matching': True,
    }, {
        'url': 'http://www.pornhub.com/video/show?viewkey=648719015',
        'only_matching': True,
--- a/youtube_dl/extractor/thumbzilla.py
+++ b/youtube_dl/extractor/thumbzilla.py
@@ -0,0 +1,61 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_request
+from .openload import PhantomJSwrapper
+from .pornhub import PornHubIE
+from ..utils import ExtractorError
+
+
+class ThumbzillaIE(InfoExtractor):
+    """
+    ThumbzillaIE is a frontend for other 'Tube' sites (mostly PornHub). ThumbzillaIE will
+    parse the video and delegate to the appropriate extractor via a url_result.
+    """
+    IE_NAME = 'thumbzilla'
+    IE_DESC = 'Thumbzilla'
+    _VALID_URL = r'https?://(?P<host>(?:www\.)?thumbzilla\.com)/video/(?P<id>[\da-z]+)'
+
+    _TEST = {
+        'url': 'https://www.thumbzilla.com/video/ph5c8e8f15b40ff/hot-skinny-girl-gives-you',
+        'info_dict': {
+            'id': 'ph5c8e8f15b40ff',
+            'ext': 'mp4',
+            'upload_date': '20190317',
+            'age_limit': 18,
+            'uploader': 'lizashultz',
+            'title': 'Hot skinny girl gives you.',
+        }
+    }
+
+    def _download_webpage_handle(self, *args, **kwargs):
+        def dl(*args, **kwargs):
+            return super(ThumbzillaIE, self)._download_webpage_handle(*args, **kwargs)
+
+        webpage, urlh = dl(*args, **kwargs)
+
+        if any(re.search(p, webpage) for p in (
+                r'<body\b[^>]+\bonload=["\']go\(\)',
+                r'document\.cookie\s*=\s*["\']RNKEY=',
+                r'document\.location\.reload\(true\)')):
+            url_or_request = args[0]
+            url = (url_or_request.get_full_url()
+                   if isinstance(url_or_request, compat_urllib_request.Request)
+                   else url_or_request)
+            phantom = PhantomJSwrapper(self, required_version='2.0')
+            phantom.get(url, html=webpage)
+            webpage, urlh = dl(*args, **kwargs)
+
+        return webpage, urlh
+
+    def _real_extract(self, url):
+        host, video_id = re.match(self._VALID_URL, url).groups()
+
+        if video_id.startswith('ph'):
+            return self.url_result('https://pornhub.com/view_video.php?viewkey=%s' % video_id,
+                                   video_id=video_id, ie=PornHubIE.ie_key())
+        else:
+            raise ExtractorError('Unsupported video')