1
0
mirror of https://github.com/djohnlewis/stackdump synced 2024-12-04 23:17:37 +00:00

Python update

This commit is contained in:
djohnlewis 2021-06-10 09:46:47 +01:00
parent f53efd1422
commit 4616d04c56
9 changed files with 1000 additions and 57 deletions

2
.gitignore vendored
View File

@ -4,7 +4,7 @@
.DS_Store .DS_Store
# ignore any data # ignore any data
^data/*$ data
# ignore working bytecode # ignore working bytecode
\.class$ \.class$

View File

@ -0,0 +1 @@
../default_settings.py

View File

@ -1,33 +1,49 @@
#!/usr/bin/env python #!/usr/bin/env python
# This script downloads the sites RSS file and associated logos from the net. # This script downloads the sites RSS file and associated logos from the net.
import tarfile
import urllib.request import urllib.request
from xml.etree import ElementTree from xml.etree import ElementTree
import sys import sys
import os, ssl def printf(format, *args):
sys.stdout.write(format % args)
from shutil import copy
import os, ssl, fnmatch
from optparse import OptionParser
from xml.etree import ElementTree
import elasticsearch
import settings
from sqlobject import sqlhub, connectionForURI,AND, IN, SQLObject, \
UnicodeCol, DateTimeCol, IntCol, DatabaseIndex, dbconnection
from sqlobject.sqlbuilder import Delete, Insert
from sqlobject.styles import DefaultStyle
from pysolr import Solr, SolrError
if (not os.environ.get('PYTHONHTTPSVERIFY', '') and if (not os.environ.get('PYTHONHTTPSVERIFY', '') and
getattr(ssl, '_create_unverified_context', None)): getattr(ssl, '_create_unverified_context', None)):
ssl._create_default_https_context = ssl._create_unverified_context ssl._create_defat_https_context = ssl._create_unverified_context
se_dir = os.path.join(os.environ.get('HOME'), 'stackexchange') se_dir = os.path.join(os.environ.get('HOME'), 'stackexchange')
sites_path = os.path.join(se_dir, 'Sites.xml') sites_path = os.path.join(se_dir, 'Sites.xml')
script_dir = os.path.dirname(sys.argv[0]) script_dir = os.path.dirname(sys.argv[0])
sites_file_path = os.path.join(script_dir, '../../../../data/sites') sites_file_path = os.path.join(script_dir, ''
'../../../../data/')
# ensure the data directory exists\\\\ # ensure the data directory exists\\\\
# download the sites RSS file
if not os.path.exists(os.path.dirname(sites_file_path)): if not os.path.exists(os.path.dirname(sites_file_path)):
os.mkdir(os.path.dirname(sites_file_path)) os.mkdir(os.path.dirname(sites_file_path))
# download the sites RSS file
print('Downloading StackExchange sites XML file...', ) print('Downloading StackExchange sites XML file...', )
urllib.request.urlretrieve('https://archive.org/download/stackexchange/Sites.xml', sites_file_path) # urllib.request.urlretrieve('https://archive.org/download/stackexchange/Sites.xml', sites_file_path)
print('done.') print('done.')
print('') print('')
# parse sites RSS file and download logosc # parse sites RSS file and download logosc
images_dir_path = os.path.join(script_dir, '../../../media/images') images_dir_path = os.path.join(script_dir, '../../../media/images')
print(os.listdir(images_dir_path)) print(os.listdir(images_dir_path))
@ -43,50 +59,80 @@ if not os.path.exists(badges_dir_path):
with open(sites_path) as f: with open(sites_path) as f:
sites_file = ElementTree.parse(f) sites_file = ElementTree.parse(f)
rows = sites_file.findall('row') sites = sites_file.findall('row')
# print(rows[0].attrib) # print(rows[0].attrib)
for row in rows: for site in sites:
entry_title = row.attrib['Name'] site_title = site.attrib['LongName']
print('Entry: ' + entry_title) site_name = site.attrib['Name']
# extract the key from the url - remove the http:// and .com # extract the key from the url - remove the http:// and .com
site_key = row.attrib['TinyName'] site_key = site.attrib['TinyName']
print('Site: ' + site_key) site_url = site.attrib['Url'][8:]
logo_url = row.attrib['ImageUrl'] logo_url = site.attrib['ImageUrl']
icon_url = row.attrib['IconUrl'] icon_url = site.attrib['IconUrl']
badge_url = row.attrib['BadgeIconUrl'] badge_url = site.attrib['BadgeIconUrl']
site_vars = (site_url, site_key, site_name, site_title)
# print(site_vars)
printf('Site: %s, key=%s, name="%s", longname="%s"\n' % site_vars)
try:
logo_file = os.path.join(logos_dir_path, 'logo-%s.png' % site_key)
if not os.path.exists(logo_file):
print('Downloading logo for %s...' % site_title, urllib.request.urlretrieve(logo_url, logo_file))
except:
print('Failed download logo for %s...' % site_title)
try: try:
print('Downloading logo for %s...' % entry_title, icon_path = os.path.join(icons_dir_path, 'icon-%s.png' % site_key)
urllib.request.urlretrieve(logo_url, os.path.join(logos_dir_path, 'logo-%s.png' % site_key))) if not os.path.exists(icon_path):
print('Downloading icon for %s...' % site_title, urllib.request.urlretrieve(icon_url, icon_path))
except: except:
print('Failed download logo for %s...' % entry_title) print('Failed download ico for %s...' % site_title)
try: try:
print('Downloading icon for %s...' % entry_title, badge_file = os.path.join(badges_dir_path, 'badge-%s.png' % site_key)
urllib.request.urlretrieve(icon_url, os.path.join(icons_dir_path, 'icon-%s.png' % site_key))) if not os.path.exists(icon_path):
print('Downloading badge for %s...' % site_title, urllib.request.urlretrieve(badge_url, badge_file))
except: except:
print('Failed download ico for %s...' % entry_title) printf('Failed download badge for %s...' % site_title)
try: site_files = []
print('Downloading badge for %s...' % entry_title, print('Key: ' + site_url)
urllib.request.urlretrieve(badge_url, os.path.join(badges_dir_path, 'badge-%s.png' % site_key))) for root, dirs, files in os.walk(se_dir):
except: for name in files:
print('Failed download badge for %s...' % entry_title) if fnmatch.fnmatch(name, site_url + '*'):
print('Match: ' + os.path.join(root, name))
site_files.append(os.path.join(root, name))
print('done.') sites_data = sites_file_path
for site_file in site_files:
dst = sites_data + os.sep + site_key[0] + os.sep + site_key + os.sep + '7z'\
+ os.sep + os.path.basename(site_file)
os.makedirs(dst, exist_ok=True)
os.chdir(dst)
os.system('tar xzf '+site_file)
print('Data: ' + site_file)
def prepare_site(xml_root, dump_date, site_key):
print('Using the XML root path: ' + xml_root + '\n')
if not os.path.exists(xml_root):
print('The given XML root path does not exist.')
sys.exit(1)
# connect to the database
print('Connecting to the Stackdump database...')
conn_str = settings.DATABASE_CONN_STR
sqlhub.processConnection = connectionForURI(conn_str)
print('Connected.\n')
# MAIN METHOD # MAIN METHOD
if __name__ == '__main__': if __name__ == '__main__':
parser = OptionParser(usage='usage: %prog [options] xml_root_dir') parser = OptionParser(usage='usage: %pro'
parser.add_option('-n', '--site-name', help='Name of the site.') 'g [options] xml_root_dir')
parser.add_option('-d', '--site-desc', help='Description of the site (if not in sites).')
parser.add_option('-k', '--site-key', help='Key of the site (if not in sites).') parser.add_option('-k', '--site-key', help='Key of the site (if not in sites).')
parser.add_option('-c', '--dump-date', help='Dump date of the site.') parser.add_option('-c', '--dump-date', help='Dump date of the site.')
parser.add_option('-u', '--base-url', help='Base URL of the site on the web.')
parser.add_option('-Y', help='Answer yes to any confirmation questions.', dest='answer_yes', action='store_true', default=False)
(cmd_options, cmd_args) = parser.parse_args() (cmd_options, cmd_args) = parser.parse_args()
@ -94,6 +140,4 @@ if __name__ == '__main__':
print('The path to the directory containing the extracted XML files is required.') print('The path to the directory containing the extracted XML files is required.')
sys.exit(1) sys.exit(1)
prepare_site(cmd_args[0], cmd_options.site_name, cmd_options.dump_date, prepare_site(cmd_args[0], cmd_options.dump_date, cmd_options.site_key)
cmd_options.site_desc, cmd_options.site_key,
cmd_options.base_url, answer_yes=cmd_options.answer_yes)

File diff suppressed because it is too large Load Diff

View File

@ -2,8 +2,6 @@
# This script takes extracted site files and inserts them into the database. # This script takes extracted site files and inserts them into the database.
from __future__ import with_statement
import sys import sys
import os import os
import time import time
@ -25,13 +23,7 @@ from pysolr import Solr, SolrError
from stackdump.models import Site, Badge, User from stackdump.models import Site, Badge, User
from stackdump import settings from stackdump import settings
import json
try:
# For Python < 2.6 or people using a newer version of simplejson
import simplejson as json
except ImportError:
# For Python >= 2.6
import json
script_dir = os.path.dirname(sys.argv[0]) script_dir = os.path.dirname(sys.argv[0])
@ -649,8 +641,7 @@ def get_file_path(dir_path, filename):
return os.path.abspath(os.path.join(dir_path, matches[0])) return os.path.abspath(os.path.join(dir_path, matches[0]))
def import_site(xml_root, site_name, dump_date, site_desc, site_key, def import_site(xml_root, dump_date,site_key)
site_base_url, answer_yes=False):
print('Using the XML root path: ' + xml_root + '\n') print('Using the XML root path: ' + xml_root + '\n')
if not os.path.exists(xml_root): if not os.path.exists(xml_root):
@ -909,19 +900,12 @@ def import_site(xml_root, site_name, dump_date, site_desc, site_key,
# MAIN METHOD # MAIN METHOD
if __name__ == '__main__': if __name__ == '__main__':
parser = OptionParser(usage='usage: %prog [options] xml_root_dir') parser = OptionParser(usage='usage: %prog [options] xml_root_dir')
parser.add_option('-n', '--site-name', help='Name of the site.')
parser.add_option('-d', '--site-desc', help='Description of the site (if not in sites).')
parser.add_option('-k', '--site-key', help='Key of the site (if not in sites).') parser.add_option('-k', '--site-key', help='Key of the site (if not in sites).')
parser.add_option('-c', '--dump-date', help='Dump date of the site.') parser.add_option('-c', '--dump-date', help='Dump date of the site.')
parser.add_option('-u', '--base-url', help='Base URL of the site on the web.')
parser.add_option('-Y', help='Answer yes to any confirmation questions.', dest='answer_yes', action='store_true', default=False)
(cmd_options, cmd_args) = parser.parse_args() (cmd_options, cmd_args) = parser.parse_args()
if len(cmd_args) < 1: if len(cmd_args) < 1:
print('The path to the directory containing the extracted XML files is required.') print('The path to the directory containing the extracted XML files is required.')
sys.exit(1) sys.exit(1)
import_site(cmd_args[0], cmd_options.site_name, cmd_options.dump_date, import_site(cmd_args[0],cmd_options.dump_date, cmd_options.site_key)
cmd_options.site_desc, cmd_options.site_key,
cmd_options.base_url, answer_yes=cmd_options.answer_yes)

View File

@ -0,0 +1 @@
../settings.py

1
questions (1).json Normal file

File diff suppressed because one or more lines are too long

1
questions.json Normal file

File diff suppressed because one or more lines are too long

BIN
schema.xlsx Normal file

Binary file not shown.