mirror of https://github.com/djohnlewis/stackdump synced 2024-12-04 23:17:37 +00:00

Python update

djohnlewis 2021-06-13 15:19:51 +01:00
parent 4616d04c56
commit a10a6d1e4d
4 changed files with 145 additions and 245 deletions

View File

@@ -30,13 +30,13 @@ sites_path = os.path.join(se_dir, 'Sites.xml')
 script_dir = os.path.dirname(sys.argv[0])
 sites_file_path = os.path.join(script_dir, ''
                                            '../../../../data/')
-# ensure the data directory exists\\\\
+# ensure the data directory exists
 # download the sites RSS file
 if not os.path.exists(os.path.dirname(sites_file_path)):
     os.mkdir(os.path.dirname(sites_file_path))
-print('Downloading StackExchange sites XML file...', )
+print('Downloading StackExchange sites XML file...')
 # urllib.request.urlretrieve('https://archive.org/download/stackexchange/Sites.xml', sites_file_path)
 print('done.')
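For reference, the commented-out retrieval above maps onto plain Python 3 urllib.request. A minimal sketch follows, assuming a hypothetical data directory next to the script; only the archive.org URL comes from the diff itself.

import os
import urllib.request

# hypothetical data directory; the real script derives its path from sys.argv[0]
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
os.makedirs(data_dir, exist_ok=True)

sites_xml = os.path.join(data_dir, 'Sites.xml')
print('Downloading StackExchange sites XML file...')
urllib.request.urlretrieve('https://archive.org/download/stackexchange/Sites.xml', sites_xml)
print('done.')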
@@ -80,15 +80,15 @@ with open(sites_path) as f:
             logo_file = os.path.join(logos_dir_path, 'logo-%s.png' % site_key)
             if not os.path.exists(logo_file):
                 print('Downloading logo for %s...' % site_title, urllib.request.urlretrieve(logo_url, logo_file))
-        except:
-            print('Failed download logo for %s...' % site_title)
+        except Exception as e:
+            print('Failed download logo for %s...' % site_title, str(e))
         try:
             icon_path = os.path.join(icons_dir_path, 'icon-%s.png' % site_key)
             if not os.path.exists(icon_path):
                 print('Downloading icon for %s...' % site_title, urllib.request.urlretrieve(icon_url, icon_path))
         except:
-            print('Failed download ico for %s...' % site_title)
+            print('Failed download ico for %s...' % site_title, icon_url)
         try:
             badge_file = os.path.join(badges_dir_path, 'badge-%s.png' % site_key)
@@ -107,8 +107,7 @@ with open(sites_path) as f:
         sites_data = sites_file_path
         for site_file in site_files:
-            dst = sites_data + os.sep + site_key[0] + os.sep + site_key + os.sep + '7z'\
-                  + os.sep + os.path.basename(site_file)
+            dst = sites_data + os.sep + site_key[0] + os.sep + site_key + os.sep + '7z'
             os.makedirs(dst, exist_ok=True)
             os.chdir(dst)
             os.system('tar xzf '+site_file)
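The loop above shells out to tar for extraction. A minimal sketch of the same step using the standard-library tarfile module instead; the data root, site key and archive path below are invented for illustration.

import os
import tarfile

sites_data = '/tmp/stackdump-data'                      # invented data root
site_key = 'scifi'                                      # invented site key
site_files = ['/tmp/scifi.stackexchange.com.tar.gz']    # invented archive list

for site_file in site_files:
    # mirror the <data>/<first letter>/<key>/7z layout used in the diff
    dst = os.path.join(sites_data, site_key[0], site_key, '7z')
    os.makedirs(dst, exist_ok=True)
    with tarfile.open(site_file, 'r:gz') as archive:
        archive.extractall(dst)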

View File

@@ -71,7 +71,7 @@ class BaseContentHandler(xml.sax.ContentHandler):
             self.conn.query(self.conn.sqlrepr(Insert(self.obj_class.sqlmeta.table, values=props_for_db)))
-        except Exception, e:
+        except Exception as e:
             # could not insert this, so ignore the row
             print('Exception: ' + str(e))
             import traceback
@@ -104,7 +104,7 @@ class BadgeContentHandler(BaseContentHandler):
             d['userId'] = int(attrs.get('UserId', 0))
             d['name'] = attrs.get('Name', '')
             d['date'] = attrs.get('Date')
-        except Exception, e:
+        except Exception as e:
             # could not parse this, so ignore the row completely
             self.cur_props = None
             print('Exception: ' + str(e))
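The only functional change in this file is the exception syntax; Python 3 dropped the old comma form, so the pattern in isolation looks like this:

try:
    int('not a number')
except ValueError as e:      # Python 2 also accepted "except ValueError, e:"; Python 3 does not
    print('Exception: ' + str(e))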

View File

@@ -8,7 +8,7 @@ import time
 import xml.sax
 from datetime import datetime
 import re
-import urllib2
+import urllib
 import socket
 import tempfile
 import traceback
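Note that Python 3 split urllib2 across urllib.request and urllib.error, so a bare "import urllib" does not by itself provide urlretrieve or Request. A minimal sketch of the Python 3 equivalent of the urllib2 call removed later in this file:

import urllib.request

req = urllib.request.Request('https://archive.org/download/stackexchange/Sites.xml')
print(req.host)   # 'archive.org'; replaces the old urllib2.Request(url).get_host()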
@@ -71,7 +71,7 @@ class BaseContentHandler(xml.sax.ContentHandler):
             self.conn.query(self.conn.sqlrepr(Insert(self.obj_class.sqlmeta.table, values=props_for_db)))
-        except Exception, e:
+        except Exception as e:
             # could not insert this, so ignore the row
             print('Exception: ' + str(e))
             import traceback
@@ -104,7 +104,7 @@ class BadgeContentHandler(BaseContentHandler):
             d['userId'] = int(attrs.get('UserId', 0))
             d['name'] = attrs.get('Name', '')
             d['date'] = attrs.get('Date')
-        except Exception, e:
+        except Exception as e:
             # could not parse this, so ignore the row completely
             self.cur_props = None
             print('Exception: ' + str(e))
@@ -138,7 +138,7 @@ class CommentContentHandler(BaseContentHandler):
             d['creationDate'] = attrs.get('CreationDate')
             d['userId'] = int(attrs.get('UserId', 0))
-        except Exception, e:
+        except Exception as e:
             # could not parse this, so ignore the row completely
             self.cur_props = None
             print('Exception: ' + str(e))
@@ -188,7 +188,7 @@ class UserContentHandler(BaseContentHandler):
             d['upVotes'] = int(attrs.get('UpVotes', 0))
             d['downVotes'] = int(attrs.get('DownVotes', 0))
-        except Exception, e:
+        except Exception as e:
             # could not parse this, so ignore the row completely
             self.cur_props = None
             print('Exception: ' + str(e))
@@ -235,7 +235,7 @@ class PostContentHandler(xml.sax.ContentHandler):
         if hasattr(obj, 'isoformat'):
             return obj.isoformat()
         else:
-            raise TypeError, 'Object of type %s with value of %s is not JSON serializable' % (type(obj), repr(obj))
+            raise TypeError('Object of type %s with value of %s is not JSON serializable' % (type(obj), repr(obj)))

     def startElement(self, name, attrs):
         if name != 'row':
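The raise statement above belongs to a datetime-aware fallback for JSON serialisation. A self-contained sketch of how such a fallback plugs into json.dumps; the function name here is illustrative, not taken from the commit.

import json
from datetime import datetime

def to_json_default(obj):
    # serialise datetime-like objects via isoformat(); reject everything else
    if hasattr(obj, 'isoformat'):
        return obj.isoformat()
    raise TypeError('Object of type %s with value of %s is not JSON serializable' % (type(obj), repr(obj)))

print(json.dumps({'creationDate': datetime(2021, 6, 13, 15, 19)}, default=to_json_default))
# {"creationDate": "2021-06-13T15:19:00"}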
@@ -292,7 +292,7 @@ class PostContentHandler(xml.sax.ContentHandler):
                 d['comments'] = [ ]
-            except Exception, e:
+            except Exception as e:
                 # could not parse this, so ignore the row completely
                 self.cur_props = None
                 print('Exception: ' + str(e))
@@ -338,7 +338,7 @@ class PostContentHandler(xml.sax.ContentHandler):
                 # remove orphan answers from the orphan list
                 del self.orphan_answers[d['id']]
-        except Exception, e:
+        except Exception as e:
             # could not insert this, so ignore the row
             print('Exception: ' + str(e))
             import traceback
@@ -368,7 +368,7 @@ class PostContentHandler(xml.sax.ContentHandler):
                     # question is complete, store it.
                     questions_to_commit.append(self.finalise_question(q))
-            except Exception, e:
+            except Exception as e:
                 # could not serialise and insert this question, so ignore it
                 print('Exception: ' + str(e))
                 import traceback
@@ -499,7 +499,7 @@ class PostContentHandler(xml.sax.ContentHandler):
         if q['acceptedAnswerId'] in post_ids:
             question_obj['acceptedAnswerId'] = q['acceptedAnswerId']
         else:
-            print 'Question [ID# %i] had an unknown answer. Possibly been merged or migrated. Ignoring inconsistency.' % (q['id'], )
+            print('Question [ID# %i] had an unknown answer. Possibly been merged or migrated. Ignoring inconsistency.' % (q['id'], ))
         question_obj['creationDate'] = q['creationDate']
         question_obj['score'] = q['score']
         question_obj['viewCount'] = q['viewCount']
@@ -539,7 +539,7 @@ class PostContentHandler(xml.sax.ContentHandler):
         try:
             self.solr.add(questions, commit=commit)
             break
-        except SolrError, e:
+        except SolrError:
             print('A Solr error occurred while committing questions - ')
             traceback.print_exc(file=sys.stdout)
             print('')
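The hunk above retries solr.add() when a SolrError is raised. A rough sketch of that pattern with stock pysolr follows; Stackdump bundles its own pysolr variant, and the core URL, document and retry count below are assumptions.

import time
from pysolr import Solr, SolrError

solr = Solr('http://localhost:8983/solr/stackdump')      # assumed Solr core URL
questions = [{'id': 'q1', 'title': 'Example question'}]  # invented document

for attempt in range(3):
    try:
        solr.add(questions, commit=True)
        break
    except SolrError:
        print('A Solr error occurred while committing questions - retrying...')
        time.sleep(5)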
@@ -572,7 +572,7 @@ class PostContentHandler(xml.sax.ContentHandler):
                     # question is complete, store it.
                     questions_to_commit.append(self.finalise_question(q))
-            except Exception, e:
+            except Exception as e:
                 # could not serialise and insert this question, so ignore it
                 print('Exception: ' + str(e))
                 import traceback
@@ -641,7 +641,7 @@ def get_file_path(dir_path, filename):
     return os.path.abspath(os.path.join(dir_path, matches[0]))


-def import_site(xml_root, dump_date,site_key)
+def import_site(xml_root, dump_date, import_key):
     print('Using the XML root path: ' + xml_root + '\n')

     if not os.path.exists(xml_root):
@@ -654,18 +654,6 @@ def import_site(xml_root, dump_date,site_key)
     sqlhub.processConnection = connectionForURI(conn_str)
     print('Connected.\n')

-    # connect to solr
-    print('Connecting to solr...')
-    solr = Solr(settings.SOLR_URL, assume_clean=True)
-    # pysolr doesn't try to connect until a request is made, so we'll make a ping request
-    try:
-        solr._send_request('GET', 'admin/ping')
-    except socket.error, e:
-        print('Failed to connect to solr - error was: %s' % str(e))
-        print('Aborting.')
-        sys.exit(2)
-    print('Connected.\n')
-
     # ensure required tables exist
     print("Creating tables if they don't exist...")
     Site.createTable(ifNotExists=True)
@@ -674,117 +662,30 @@ def import_site(xml_root, dump_date,site_key)
     print('Created.\n')

     # SITE INFO
-    # only look if they were not specified at the command line; also only if
-    # readme.txt exists (they don't in dumps after Aug 2012)
-    readme_path = get_file_path(xml_root, 'readme.txt')
-    if not (site_name and dump_date) and readme_path:
-        # get the site name from the first line of readme.txt. This could be fragile.
-        with open(readme_path, 'r') as f:
-            site_readme_desc = f.readline().strip()
-        # assume if there's a colon in the name, the name part is before, and the date
-        # part is after.
-        if ':' in site_readme_desc:
-            readme_site_name, readme_dump_date = site_readme_desc.split(':')
-            readme_site_name = readme_site_name.strip()
-            readme_dump_date = readme_dump_date.strip()
-        else:
-            readme_site_name = site_readme_desc
-            readme_dump_date = None
-        # if the phrase ' - Data Dump' is in the readme site name, remove it
-        i = readme_site_name.rfind(' - Data Dump')
-        if i >= 0:
-            readme_site_name = readme_site_name[:i].strip()
-        if not site_name:
-            site_name = readme_site_name
-        if not dump_date:
-            dump_date = readme_dump_date
-    # look for the site in the sites RSS file using the base_url with the id in RSS
-    # scrub the URL scheme off the base_url
-    if site_base_url:
-        # if there is no URL scheme, add one so it can be parsed by urllib2 so it
-        # can strip off other bits in the URL that we don't want
-        if '://' not in site_base_url:
-            site_base_url = 'http://%s' % site_base_url
-        site_base_url = urllib2.Request(site_base_url).get_host()
-    # attempt to get more information from the sites RSS cache
-    if site_base_url and not (site_name and site_desc and site_key):
-        sites_file_path = os.path.join(script_dir, '../../../../data/sites')
-        if os.path.exists(sites_file_path):
-            with open(sites_file_path) as f:
-                sites_file = ElementTree.parse(f)
-                entries = sites_file.findall('{http://www.w3.org/2005/Atom}entry')
-                for entry in entries:
-                    entry_base_url = entry.find('{http://www.w3.org/2005/Atom}id').text
-                    if '://' in entry_base_url:
-                        entry_base_url = urllib2.Request(entry_base_url).get_host()
-                    if site_base_url == entry_base_url:
-                        # this entry matches the detected site id
-                        if not site_key:
-                            # extract the key from the url
-                            rss_site_key = entry.find('{http://www.w3.org/2005/Atom}id').text
-                            # remove the URL scheme
-                            if '://' in rss_site_key:
-                                rss_site_key = rss_site_key[rss_site_key.find('://')+3:]
-                            # remove the TLD
-                            if rss_site_key.rfind('.') >= 0:
-                                rss_site_key = rss_site_key[:rss_site_key.rfind('.')]
-                            # remove the .stackexchange bit
-                            if '.stackexchange' in rss_site_key:
-                                rss_site_key = rss_site_key[:rss_site_key.find('.stackexchange')]
-                            site_key = rss_site_key
-                        if not site_name:
-                            site_name = entry.find('{http://www.w3.org/2005/Atom}title').text.strip()
-                        if not site_desc:
-                            site_desc = entry.find('{http://www.w3.org/2005/Atom}summary').text.strip()
-    print 'Name: %s\nKey: %s\nDescription: %s\nDump Date: %s\nBase URL: %s\n' % (
-        site_name.encode('ascii', 'ignore') if site_name else None,
-        site_key,
-        site_desc.encode('ascii', 'ignore') if site_desc else None,
-        dump_date,
-        site_base_url
-    )
-    # the base URL is optional.
-    if not (site_name and site_key and site_desc and dump_date):
-        print 'Could not get all the details for the site.'
-        print 'Use command-line parameters to specify the missing details (listed as None).'
-        sys.exit(1)
-    # prevent importing sites with keys that clash with method names in the app,
-    # e.g. a site key of 'search' would clash with the Stackdump-wide search page.
-    if site_key in ('search', 'import', 'media', 'licenses'):
-        print 'The site key given, %s, is a reserved word in Stackdump.' % site_key
-        print 'Use the --site-key parameter to specify an alternate site key.'
-        sys.exit(2)
-    # confirm site details with user to make sure we don't accidentally overwrite
-    # another site.
-    if not answer_yes:
-        confirm_prompt = 'Are these details correct (answer "yes" to proceed, anything else to abort)? '
-        confirm_answer = raw_input(confirm_prompt)
-        if confirm_answer != 'yes':
-            print 'Import aborted on user request.'
-            sys.exit(3)
-    # rollback any uncommitted entries in solr. Uncommitted entries may occur if
-    # this import process is aborted. Solr doesn't have the concept of transactions
-    # like databases do, so without a rollback, we'll be committing the previously
-    # uncommitted entries plus the newly imported ones.
-    #
-    # This also means multiple dataproc processes cannot occur concurrently. If you
-    # do the import will be silently incomplete.
-    print('Clearing any uncommitted entries in solr...')
-    solr._update('<rollback />', waitFlush=None, waitSearcher=None)
-    print('Cleared.\n')
+    # only look if they were not specified at the command line;
+    se_dir = os.path.join(os.environ.get('HOME'), 'stackexchange')
+    sites_path = os.path.join(se_dir, 'Sites.xml')
+    with open(sites_path) as f:
+        sites_file = ElementTree.parse(f)
+        sites = sites_file.findall('row')
+        # print(rows[0].attrib)
+        for site in sites:
+            site_title = site.attrib['LongName']
+            site_name = site.attrib['Name']
+            # extract the key from the url - remove the http:// and .com
+            site_key = site.attrib['TinyName']
+            site_url = site.attrib['Url'][8:]
+            logo_url = site.attrib['ImageUrl']
+            icon_url = site.attrib['IconUrl']
+            badge_url = site.attrib['BadgeIconUrl']
+            if (import_key != '') and (import_key != site_key):
+                continue
+            else:
+                print('site_name: '+site_name)

     # check if site is already in database; if so, purge the data.
     site = list(Site.select(Site.q.key==site_key))
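The replacement block reads the site metadata straight from the attributes of each row element in Sites.xml. A self-contained sketch with an invented snippet; the attribute names match the ones used above, the values are made up.

from xml.etree import ElementTree

SAMPLE = ('<sites>'
          '<row Name="Sci-Fi" LongName="Science Fiction and Fantasy" TinyName="scifi" '
          'Url="https://scifi.stackexchange.com" ImageUrl="https://example.invalid/logo.png" '
          'IconUrl="https://example.invalid/icon.png" BadgeIconUrl="https://example.invalid/badge.png"/>'
          '</sites>')

sites = ElementTree.fromstring(SAMPLE)
for site in sites.findall('row'):
    site_key = site.attrib['TinyName']
    site_url = site.attrib['Url'][8:]   # strip the leading 'https://', as in the diff
    print(site_key, site_url, site.attrib['LongName'])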

View File

@@ -44,7 +44,7 @@ then
     echo "Using Python `which "$PYTHON_CMD"`"

     # execution ends here if Python is found
-    PYTHONPATH=$SCRIPT_DIR/python3/packages:$SCRIPT_DIR/python3/src:$PYTHONPATH
+    PYTHONPATH=$SCRIPT_DIR/pyth3/packages:$SCRIPT_DIR/python/src:$PYTHONPATH
     env "PYTHONPATH=$PYTHONPATH" "$PYTHON_CMD" "$@"
     exit $?
 fi