mirror of https://github.com/djohnlewis/stackdump synced 2024-12-04 06:57:36 +00:00

Python update

This commit is contained in:
djohnlewis 2021-06-10 09:46:47 +01:00
parent f53efd1422
commit 4616d04c56
9 changed files with 1000 additions and 57 deletions

.gitignore vendored

@@ -4,7 +4,7 @@
.DS_Store
# ignore any data
data
# ignore working bytecode
\.class$
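The removed "^data/*$" entry reads like a regular expression, but .gitignore matches glob patterns, so it could only ever match a literal path containing the ^ and $ characters; the plain "data" entry now matches a data file or directory at any depth. If only the top-level directory were meant, a stricter pattern (a sketch, not what the commit uses) would be:

# anchored to the repository root; the trailing slash restricts it to directories
/data/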


@@ -0,0 +1 @@
../default_settings.py


@@ -1,33 +1,49 @@
#!/usr/bin/env python
# This script downloads the sites RSS file and associated logos from the net.
import tarfile
import urllib.request
import sys
import os, ssl, fnmatch
from shutil import copy
from optparse import OptionParser
from xml.etree import ElementTree

import elasticsearch
import settings
from sqlobject import sqlhub, connectionForURI, AND, IN, SQLObject, \
    UnicodeCol, DateTimeCol, IntCol, DatabaseIndex, dbconnection
from sqlobject.sqlbuilder import Delete, Insert
from sqlobject.styles import DefaultStyle
from pysolr import Solr, SolrError

def printf(format, *args):
    sys.stdout.write(format % args)

# allow unverified HTTPS downloads on interpreters that support opting out
if (not os.environ.get('PYTHONHTTPSVERIFY', '') and
        getattr(ssl, '_create_unverified_context', None)):
    ssl._create_default_https_context = ssl._create_unverified_context

se_dir = os.path.join(os.environ.get('HOME'), 'stackexchange')
sites_path = os.path.join(se_dir, 'Sites.xml')
script_dir = os.path.dirname(sys.argv[0])
sites_file_path = os.path.join(script_dir, '../../../../data/')

# ensure the data directory exists
if not os.path.exists(os.path.dirname(sites_file_path)):
    os.mkdir(os.path.dirname(sites_file_path))

# download the sites RSS file
print('Downloading StackExchange sites XML file...')
# urllib.request.urlretrieve('https://archive.org/download/stackexchange/Sites.xml', sites_file_path)
print('done.')
print('')

# parse the sites RSS file and download the logos
images_dir_path = os.path.join(script_dir, '../../../media/images')
print(os.listdir(images_dir_path))
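The block above replaces ssl's process-wide default context, which disables certificate verification for every HTTPS call the script makes. A more contained sketch, assuming the same archive.org URL, scopes the unverified context to a single request instead:

import ssl
import urllib.request

# build an unverified context once and pass it to this request only
ctx = ssl._create_unverified_context()
url = 'https://archive.org/download/stackexchange/Sites.xml'
with urllib.request.urlopen(url, context=ctx) as resp:
    with open('Sites.xml', 'wb') as f:
        f.write(resp.read())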
@@ -43,50 +59,80 @@ if not os.path.exists(badges_dir_path):
with open(sites_path) as f:
    sites_file = ElementTree.parse(f)
sites = sites_file.findall('row')
# print(sites[0].attrib)

for site in sites:
    site_title = site.attrib['LongName']
    site_name = site.attrib['Name']
    # extract the key from the url - remove the http:// and .com
    site_key = site.attrib['TinyName']
    site_url = site.attrib['Url'][8:]
    logo_url = site.attrib['ImageUrl']
    icon_url = site.attrib['IconUrl']
    badge_url = site.attrib['BadgeIconUrl']
    site_vars = (site_url, site_key, site_name, site_title)
    # print(site_vars)
    printf('Site: %s, key=%s, name="%s", longname="%s"\n' % site_vars)

    try:
        logo_file = os.path.join(logos_dir_path, 'logo-%s.png' % site_key)
        if not os.path.exists(logo_file):
            print('Downloading logo for %s...' % site_title, urllib.request.urlretrieve(logo_url, logo_file))
    except Exception:
        print('Failed to download logo for %s...' % site_title)
    try:
        icon_path = os.path.join(icons_dir_path, 'icon-%s.png' % site_key)
        if not os.path.exists(icon_path):
            print('Downloading icon for %s...' % site_title, urllib.request.urlretrieve(icon_url, icon_path))
    except Exception:
        print('Failed to download icon for %s...' % site_title)
    try:
        badge_file = os.path.join(badges_dir_path, 'badge-%s.png' % site_key)
        if not os.path.exists(badge_file):
            print('Downloading badge for %s...' % site_title, urllib.request.urlretrieve(badge_url, badge_file))
    except Exception:
        print('Failed to download badge for %s...' % site_title)

    # find this site's data dump archives under ~/stackexchange
    site_files = []
    print('Key: ' + site_url)
    for root, dirs, files in os.walk(se_dir):
        for name in files:
            if fnmatch.fnmatch(name, site_url + '*'):
                print('Match: ' + os.path.join(root, name))
                site_files.append(os.path.join(root, name))
    print('done.')

    # extract each archive under data/<first letter>/<site key>/7z/
    sites_data = sites_file_path
    for site_file in site_files:
        dst = sites_data + os.sep + site_key[0] + os.sep + site_key + os.sep + '7z' \
            + os.sep + os.path.basename(site_file)
        os.makedirs(dst, exist_ok=True)
        os.chdir(dst)
        os.system('tar xzf ' + site_file)
        print('Data: ' + site_file)
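# Aside (a sketch, not part of this commit): the script imports tarfile but
# shells out to tar above. The same step with the imported module, assuming
# the matched dumps are gzipped tarballs as 'tar xzf' implies; extract_dump
# is a hypothetical helper name.
def extract_dump(site_file, dst):
    # create the destination tree, then extract without os.chdir or a subshell
    os.makedirs(dst, exist_ok=True)
    with tarfile.open(site_file, 'r:gz') as archive:
        archive.extractall(dst)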
def prepare_site(xml_root, dump_date, site_key):
    print('Using the XML root path: ' + xml_root + '\n')
    if not os.path.exists(xml_root):
        print('The given XML root path does not exist.')
        sys.exit(1)

    # connect to the database
    print('Connecting to the Stackdump database...')
    conn_str = settings.DATABASE_CONN_STR
    sqlhub.processConnection = connectionForURI(conn_str)
    print('Connected.\n')
# MAIN METHOD
if __name__ == '__main__':
    parser = OptionParser(usage='usage: %prog [options] xml_root_dir')
    parser.add_option('-k', '--site-key', help='Key of the site (if not in sites).')
    parser.add_option('-c', '--dump-date', help='Dump date of the site.')
    parser.add_option('-u', '--base-url', help='Base URL of the site on the web.')
    parser.add_option('-Y', help='Answer yes to any confirmation questions.', dest='answer_yes', action='store_true', default=False)

    (cmd_options, cmd_args) = parser.parse_args()
@@ -94,6 +140,4 @@ if __name__ == '__main__':
        print('The path to the directory containing the extracted XML files is required.')
        sys.exit(1)

    prepare_site(cmd_args[0], cmd_options.dump_date, cmd_options.site_key)
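For context: SQLObject routes every query through sqlhub.processConnection, so the connection step in prepare_site above is the only database setup this script needs. A minimal standalone sketch, with an assumed SQLite URI standing in for whatever settings.DATABASE_CONN_STR holds:

from sqlobject import sqlhub, connectionForURI

# the URI scheme selects the driver; this path is illustrative only
sqlhub.processConnection = connectionForURI('sqlite:///tmp/stackdump.db')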

File diff suppressed because it is too large.


@@ -2,8 +2,6 @@
# This script takes extracted site files and inserts them into the database.
import sys
import os
import time
@@ -25,13 +23,7 @@ from pysolr import Solr, SolrError
from stackdump.models import Site, Badge, User
from stackdump import settings
import json

script_dir = os.path.dirname(sys.argv[0])
@@ -649,8 +641,7 @@ def get_file_path(dir_path, filename):
    return os.path.abspath(os.path.join(dir_path, matches[0]))

def import_site(xml_root, dump_date, site_key):
    print('Using the XML root path: ' + xml_root + '\n')
    if not os.path.exists(xml_root):
@@ -909,19 +900,12 @@ def import_site(xml_root, site_name, dump_date, site_desc, site_key,
# MAIN METHOD
if __name__ == '__main__':
    parser = OptionParser(usage='usage: %prog [options] xml_root_dir')
    parser.add_option('-k', '--site-key', help='Key of the site (if not in sites).')
    parser.add_option('-c', '--dump-date', help='Dump date of the site.')
    parser.add_option('-u', '--base-url', help='Base URL of the site on the web.')
    parser.add_option('-Y', help='Answer yes to any confirmation questions.', dest='answer_yes', action='store_true', default=False)

    (cmd_options, cmd_args) = parser.parse_args()

    if len(cmd_args) < 1:
        print('The path to the directory containing the extracted XML files is required.')
        sys.exit(1)

    import_site(cmd_args[0], cmd_options.dump_date, cmd_options.site_key)


@@ -0,0 +1 @@
../settings.py

questions (1).json Normal file

File diff suppressed because one or more lines are too long

questions.json Normal file

File diff suppressed because one or more lines are too long
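The two JSON fixtures added by this commit are collapsed by the viewer, so their structure is not visible here. A minimal sketch for inspecting one locally (the filename comes from the commit's file list; nothing about the schema is assumed):

import json

with open('questions.json') as f:
    questions = json.load(f)

# report the top-level shape without assuming a schema
print(type(questions).__name__)
if isinstance(questions, list) and questions and isinstance(questions[0], dict):
    print(sorted(questions[0].keys()))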

schema.xlsx Normal file

Binary file not shown.