mirror of https://github.com/djohnlewis/stackdump synced 2024-12-04 15:07:36 +00:00

image download

djohnlewis 2021-05-24 20:14:14 +01:00
parent 20693a8764
commit f53efd1422
3 changed files with 74 additions and 29 deletions

.gitignore

@@ -23,7 +23,9 @@ tutorial/*$
^java/solr/server/logs/*
# ignore the downloaded logos
^python/media/images/logos/*
^python/media/images/logos48
^python/media/images/icons
^python/media/images/badges
# PyCharm project files
^.idea/


@@ -2,11 +2,14 @@
# This script downloads the sites RSS file and associated logos from the net.
-import urllib
+import urllib.request
from xml.etree import ElementTree
import os
import sys
+# Fall back to an unverified SSL context so the downloads below still work
+# on hosts without usable certificate verification.
+import os, ssl
+if (not os.environ.get('PYTHONHTTPSVERIFY', '') and
+        getattr(ssl, '_create_unverified_context', None)):
+    ssl._create_default_https_context = ssl._create_unverified_context
+se_dir = os.path.join(os.environ.get('HOME'), 'stackexchange')
+sites_path = os.path.join(se_dir, 'Sites.xml')
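
This hunk turns off certificate verification for the entire process. A narrower alternative (a sketch, not part of this commit) is to pass an explicit SSL context per request instead of patching the global default; the URL is taken from the next hunk:

import ssl
import urllib.request

# Sketch only: scope the unverified context to a single request rather than
# changing the process-wide default.
ctx = ssl._create_unverified_context()
with urllib.request.urlopen('https://archive.org/download/stackexchange/Sites.xml',
                            context=ctx) as resp:
    data = resp.read()
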
@@ -19,38 +22,78 @@ if not os.path.exists(os.path.dirname(sites_file_path)):
    os.mkdir(os.path.dirname(sites_file_path))
# download the sites RSS file
-print('Downloading StackExchange sites XML file...',)
-urllib.urlretrieve('https://archive.org/download/stackexchange/Sites.xml', sites_file_path)
+print('Downloading StackExchange sites XML file...', )
+urllib.request.urlretrieve('https://archive.org/download/stackexchange/Sites.xml', sites_file_path)
print('done.')
print('')
# parse sites RSS file and download logos
-images_dir_path = os.path.join(script_dir, '../../../media/images/logos')
-logos_dir_path = os.path.join(images_dir_path, 'logos')
-icons_dir_path = os.path.join(images_dir_path, 'icons')
-badgos_dir_path = os.path.join(images_dir_path, 'badgos')
+images_dir_path = os.path.join(script_dir, '../../../media/images')
+print(os.listdir(images_dir_path))
+logos_dir_path = os.path.join(images_dir_path, 'logos48')
+if not os.path.exists(logos_dir_path):
+    os.mkdir(logos_dir_path)
+icons_dir_path = os.path.join(images_dir_path, 'icons')
+if not os.path.exists(icons_dir_path):
+    os.mkdir(icons_dir_path)
+badges_dir_path = os.path.join(images_dir_path, 'badges')
+if not os.path.exists(badges_dir_path):
+    os.mkdir(badges_dir_path)
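
The three exists/mkdir guards added above can be collapsed; a sketch (not part of the commit) using os.makedirs with exist_ok, reusing images_dir_path from the hunk:

import os

# Equivalent to the three if-not-exists/mkdir pairs added above.
for name in ('logos48', 'icons', 'badges'):
    os.makedirs(os.path.join(images_dir_path, name), exist_ok=True)
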
with open(sites_path) as f:
    sites_file = ElementTree.parse(f)
-    entries = sites_file.findall('sites/row')
-    print(entries)
-    for entry in entries:
-        entry_title = entry.find('{http://www.w3.org/2005/Atom}title').text.encode('ascii', 'ignore')
+    rows = sites_file.findall('row')
+    # print(rows[0].attrib)
+    for row in rows:
+        entry_title = row.attrib['Name']
        print('Entry: ' + entry_title)
        # extract the key from the url - remove the http:// and .com
-        site_key = entry.find('{http://www.w3.org/2005/Atom}id').text
-        if site_key.startswith('http://'):
-            site_key = site_key[len('http://'):]
-        if site_key.endswith('.com'):
-            site_key = site_key[:-len('.com')]
-        if site_key.endswith('.stackexchange'):
-            site_key = site_key[:-len('.stackexchange')]
-        print('Downloading logo for %s...' % entry_title,
-              urllib.urlretrieve('http://cdn.sstatic.net/Sites/%s/img/icon-48.png' % site_key,
-                                 os.path.join(logos_dir_path, '%s.png' % site_key)))
-        print('done.')
+        site_key = row.attrib['TinyName']
+        print('Site: ' + site_key)
+        logo_url = row.attrib['ImageUrl']
+        icon_url = row.attrib['IconUrl']
+        badge_url = row.attrib['BadgeIconUrl']
+        try:
+            print('Downloading logo for %s...' % entry_title,
+                  urllib.request.urlretrieve(logo_url, os.path.join(logos_dir_path, 'logo-%s.png' % site_key)))
+        except:
+            print('Failed to download logo for %s...' % entry_title)
+        try:
+            print('Downloading icon for %s...' % entry_title,
+                  urllib.request.urlretrieve(icon_url, os.path.join(icons_dir_path, 'icon-%s.png' % site_key)))
+        except:
+            print('Failed to download icon for %s...' % entry_title)
+        try:
+            print('Downloading badge for %s...' % entry_title,
+                  urllib.request.urlretrieve(badge_url, os.path.join(badges_dir_path, 'badge-%s.png' % site_key)))
+        except:
+            print('Failed to download badge for %s...' % entry_title)
+        print('done.')
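
For reference, a self-contained sketch of the download loop added above, with the bare except clauses narrowed to Exception; the attribute names (Name, TinyName, ImageUrl, IconUrl, BadgeIconUrl) come from the diff, while the helper name is hypothetical:

import os
import urllib.request
from xml.etree import ElementTree

def download_site_images(sites_xml_path, images_dir):
    # Hypothetical helper mirroring the loop above: fetch the logo, icon and
    # badge image for every <row> element in Sites.xml.
    tree = ElementTree.parse(sites_xml_path)
    for row in tree.findall('row'):
        site_key = row.attrib['TinyName']
        for kind, attr, sub_dir in (('logo', 'ImageUrl', 'logos48'),
                                    ('icon', 'IconUrl', 'icons'),
                                    ('badge', 'BadgeIconUrl', 'badges')):
            url = row.attrib.get(attr)
            if not url:
                continue
            dest = os.path.join(images_dir, sub_dir, '%s-%s.png' % (kind, site_key))
            try:
                urllib.request.urlretrieve(url, dest)
            except Exception as e:
                print('Failed to download %s for %s: %s' % (kind, row.attrib['Name'], e))
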
# MAIN METHOD
if __name__ == '__main__':
    parser = OptionParser(usage='usage: %prog [options] xml_root_dir')
    parser.add_option('-n', '--site-name', help='Name of the site.')
    parser.add_option('-d', '--site-desc', help='Description of the site (if not in sites).')
    parser.add_option('-k', '--site-key', help='Key of the site (if not in sites).')
    parser.add_option('-c', '--dump-date', help='Dump date of the site.')
    parser.add_option('-u', '--base-url', help='Base URL of the site on the web.')
    parser.add_option('-Y', help='Answer yes to any confirmation questions.', dest='answer_yes', action='store_true', default=False)
    (cmd_options, cmd_args) = parser.parse_args()
    if len(cmd_args) < 1:
        print('The path to the directory containing the extracted XML files is required.')
        sys.exit(1)
    prepare_site(cmd_args[0], cmd_options.site_name, cmd_options.dump_date,
                 cmd_options.site_desc, cmd_options.site_key,
                 cmd_options.base_url, answer_yes=cmd_options.answer_yes)
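
A quick check of how this OptionParser configuration consumes a command line (the argument values are hypothetical); parse_args accepts an explicit list, which makes the behaviour easy to verify:

from optparse import OptionParser

parser = OptionParser(usage='usage: %prog [options] xml_root_dir')
parser.add_option('-n', '--site-name', help='Name of the site.')
parser.add_option('-Y', help='Answer yes to any confirmation questions.',
                  dest='answer_yes', action='store_true', default=False)

# Hypothetical invocation: prog -n askubuntu -Y data/askubuntu
opts, args = parser.parse_args(['-n', 'askubuntu', '-Y', 'data/askubuntu'])
print(opts.site_name, opts.answer_yes, args)  # askubuntu True ['data/askubuntu']
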


@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
# This script takes extracted site files and inserts them into the database.