From d6a96153471ae7e93693cb4dee46cbec1492af7b Mon Sep 17 00:00:00 2001 From: Filippo Valsorda - Campagna Date: Tue, 10 Apr 2012 16:31:46 +0200 Subject: [PATCH] standardized the use of unescapeHTML; added clean_html() --- youtube_dl/__init__.py | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 5f874b72f..3fd5cadfd 100755 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -242,6 +242,18 @@ def htmlentity_transform(matchobj): return (u'&%s;' % entity) +def clean_html(html): + """Clean an HTML snippet into a readable string""" + # Newline vs <br />
+ html = html.replace('\n', ' ') + html = re.sub('<\s*br\s*/?\s*>', '\n', html) + # Strip html tags + html = re.sub('<.*?>', '', html) + # Replace html entities + html = re.sub(ur'(?u)&(.+?);', htmlentity_transform, html) + return html + + def sanitize_title(utitle): """Sanitizes a video title so it could be used as part of a filename.""" utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle) @@ -3343,8 +3355,6 @@ class EscapistIE(InfoExtractor): self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName) def _real_extract(self, url): - htmlParser = HTMLParser.HTMLParser() - mobj = re.match(self._VALID_URL, url) if mobj is None: self._downloader.trouble(u'ERROR: invalid URL: %s' % url) @@ -3360,11 +3370,11 @@ class EscapistIE(InfoExtractor): return descMatch = re.search('