From 7f6ed7b4381f0d9b130b2365406908cc87311e65 Mon Sep 17 00:00:00 2001 From: Alexei Baboulevitch Date: Mon, 12 May 2014 14:35:28 +0200 Subject: [PATCH] Fixed an uncaught exception caused by broken URLs. Examples of fixed pages: photo/11689, stackoverflow/315911 --- python/src/stackdump/app.py | 44 ++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/python/src/stackdump/app.py b/python/src/stackdump/app.py index 6043781..77b5b18 100644 --- a/python/src/stackdump/app.py +++ b/python/src/stackdump/app.py @@ -827,27 +827,31 @@ def _rewrite_html(html, app_url_root, sites_by_urls): internal_link = False url = t.get('href', None) if url: - host = urllib2.Request(url).get_host() - site = sites_by_urls.get(host, None) - if site: - # rewrite this URL for stackdump - question_id = SE_QUESTION_ID_RE.search(url) - if question_id: - question_id = question_id.groupdict()['id'] - url = '%s%s/%s' % (app_url_root, site.key, question_id) - t.set('href', url) - t.set('class', t.get('class', '') + ' internal-link') - internal_link = True - - answer_id = SE_ANSWER_ID_RE.search(url) - if answer_id: - answer_id = answer_id.groupdict()['id'] - url = '%s%s/a/%s' % (app_url_root, site.key, answer_id) - t.set('href', url) - t.set('class', t.get('class', '') + ' internal-link') - internal_link = True + try: + host = urllib2.Request(url).get_host() + except ValueError: + # invalid URL or local anchor, leaving as-is + internal_link = True + else: + site = sites_by_urls.get(host, None) + if site: + # rewrite this URL for stackdump + question_id = SE_QUESTION_ID_RE.search(url) + if question_id: + question_id = question_id.groupdict()['id'] + url = '%s%s/%s' % (app_url_root, site.key, question_id) + t.set('href', url) + internal_link = True + answer_id = SE_ANSWER_ID_RE.search(url) + if answer_id: + answer_id = answer_id.groupdict()['id'] + url = '%s%s/a/%s' % (app_url_root, site.key, answer_id) + t.set('href', url) + internal_link = True - if not internal_link: + if internal_link: + t.set('class', t.get('class', '') + ' internal-link') + else: t.set('class', t.get('class', '') + ' external-link') # get a string back