mirror of
https://github.com/djohnlewis/stackdump
synced 2024-12-04 23:17:37 +00:00
Python update
This commit is contained in:
parent
4616d04c56
commit
a10a6d1e4d
@ -30,13 +30,13 @@ sites_path = os.path.join(se_dir, 'Sites.xml')
|
|||||||
script_dir = os.path.dirname(sys.argv[0])
|
script_dir = os.path.dirname(sys.argv[0])
|
||||||
sites_file_path = os.path.join(script_dir, ''
|
sites_file_path = os.path.join(script_dir, ''
|
||||||
'../../../../data/')
|
'../../../../data/')
|
||||||
# ensure the data directory exists\\\\
|
# ensure the data directory exists
|
||||||
# download the sites RSS file
|
# download the sites RSS file
|
||||||
|
|
||||||
if not os.path.exists(os.path.dirname(sites_file_path)):
|
if not os.path.exists(os.path.dirname(sites_file_path)):
|
||||||
os.mkdir(os.path.dirname(sites_file_path))
|
os.mkdir(os.path.dirname(sites_file_path))
|
||||||
|
|
||||||
print('Downloading StackExchange sites XML file...', )
|
print('Downloading StackExchange sites XML file...')
|
||||||
# urllib.request.urlretrieve('https://archive.org/download/stackexchange/Sites.xml', sites_file_path)
|
# urllib.request.urlretrieve('https://archive.org/download/stackexchange/Sites.xml', sites_file_path)
|
||||||
print('done.')
|
print('done.')
|
||||||
|
|
||||||
@ -80,15 +80,15 @@ with open(sites_path) as f:
|
|||||||
logo_file = os.path.join(logos_dir_path, 'logo-%s.png' % site_key)
|
logo_file = os.path.join(logos_dir_path, 'logo-%s.png' % site_key)
|
||||||
if not os.path.exists(logo_file):
|
if not os.path.exists(logo_file):
|
||||||
print('Downloading logo for %s...' % site_title, urllib.request.urlretrieve(logo_url, logo_file))
|
print('Downloading logo for %s...' % site_title, urllib.request.urlretrieve(logo_url, logo_file))
|
||||||
except:
|
except Exception as e:
|
||||||
print('Failed download logo for %s...' % site_title)
|
print('Failed download logo for %s...' % site_title, str(e))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
icon_path = os.path.join(icons_dir_path, 'icon-%s.png' % site_key)
|
icon_path = os.path.join(icons_dir_path, 'icon-%s.png' % site_key)
|
||||||
if not os.path.exists(icon_path):
|
if not os.path.exists(icon_path):
|
||||||
print('Downloading icon for %s...' % site_title, urllib.request.urlretrieve(icon_url, icon_path))
|
print('Downloading icon for %s...' % site_title, urllib.request.urlretrieve(icon_url, icon_path))
|
||||||
except:
|
except:
|
||||||
print('Failed download ico for %s...' % site_title)
|
print('Failed download ico for %s...' % site_title, icon_url)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
badge_file = os.path.join(badges_dir_path, 'badge-%s.png' % site_key)
|
badge_file = os.path.join(badges_dir_path, 'badge-%s.png' % site_key)
|
||||||
@ -107,8 +107,7 @@ with open(sites_path) as f:
|
|||||||
|
|
||||||
sites_data = sites_file_path
|
sites_data = sites_file_path
|
||||||
for site_file in site_files:
|
for site_file in site_files:
|
||||||
dst = sites_data + os.sep + site_key[0] + os.sep + site_key + os.sep + '7z'\
|
dst = sites_data + os.sep + site_key[0] + os.sep + site_key + os.sep + '7z'
|
||||||
+ os.sep + os.path.basename(site_file)
|
|
||||||
os.makedirs(dst, exist_ok=True)
|
os.makedirs(dst, exist_ok=True)
|
||||||
os.chdir(dst)
|
os.chdir(dst)
|
||||||
os.system('tar xzf '+site_file)
|
os.system('tar xzf '+site_file)
|
||||||
|
@ -71,7 +71,7 @@ class BaseContentHandler(xml.sax.ContentHandler):
|
|||||||
|
|
||||||
self.conn.query(self.conn.sqlrepr(Insert(self.obj_class.sqlmeta.table, values=props_for_db)))
|
self.conn.query(self.conn.sqlrepr(Insert(self.obj_class.sqlmeta.table, values=props_for_db)))
|
||||||
|
|
||||||
except Exception, e:
|
except Exception as e:
|
||||||
# could not insert this, so ignore the row
|
# could not insert this, so ignore the row
|
||||||
print('Exception: ' + str(e))
|
print('Exception: ' + str(e))
|
||||||
import traceback
|
import traceback
|
||||||
@ -104,7 +104,7 @@ class BadgeContentHandler(BaseContentHandler):
|
|||||||
d['userId'] = int(attrs.get('UserId', 0))
|
d['userId'] = int(attrs.get('UserId', 0))
|
||||||
d['name'] = attrs.get('Name', '')
|
d['name'] = attrs.get('Name', '')
|
||||||
d['date'] = attrs.get('Date')
|
d['date'] = attrs.get('Date')
|
||||||
except Exception, e:
|
except Exception as e:
|
||||||
# could not parse this, so ignore the row completely
|
# could not parse this, so ignore the row completely
|
||||||
self.cur_props = None
|
self.cur_props = None
|
||||||
print('Exception: ' + str(e))
|
print('Exception: ' + str(e))
|
||||||
|
@ -8,7 +8,7 @@ import time
|
|||||||
import xml.sax
|
import xml.sax
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import re
|
import re
|
||||||
import urllib2
|
import urllib
|
||||||
import socket
|
import socket
|
||||||
import tempfile
|
import tempfile
|
||||||
import traceback
|
import traceback
|
||||||
@ -71,7 +71,7 @@ class BaseContentHandler(xml.sax.ContentHandler):
|
|||||||
|
|
||||||
self.conn.query(self.conn.sqlrepr(Insert(self.obj_class.sqlmeta.table, values=props_for_db)))
|
self.conn.query(self.conn.sqlrepr(Insert(self.obj_class.sqlmeta.table, values=props_for_db)))
|
||||||
|
|
||||||
except Exception, e:
|
except Exception as e:
|
||||||
# could not insert this, so ignore the row
|
# could not insert this, so ignore the row
|
||||||
print('Exception: ' + str(e))
|
print('Exception: ' + str(e))
|
||||||
import traceback
|
import traceback
|
||||||
@ -104,7 +104,7 @@ class BadgeContentHandler(BaseContentHandler):
|
|||||||
d['userId'] = int(attrs.get('UserId', 0))
|
d['userId'] = int(attrs.get('UserId', 0))
|
||||||
d['name'] = attrs.get('Name', '')
|
d['name'] = attrs.get('Name', '')
|
||||||
d['date'] = attrs.get('Date')
|
d['date'] = attrs.get('Date')
|
||||||
except Exception, e:
|
except Exception as e:
|
||||||
# could not parse this, so ignore the row completely
|
# could not parse this, so ignore the row completely
|
||||||
self.cur_props = None
|
self.cur_props = None
|
||||||
print('Exception: ' + str(e))
|
print('Exception: ' + str(e))
|
||||||
@ -138,7 +138,7 @@ class CommentContentHandler(BaseContentHandler):
|
|||||||
d['creationDate'] = attrs.get('CreationDate')
|
d['creationDate'] = attrs.get('CreationDate')
|
||||||
d['userId'] = int(attrs.get('UserId', 0))
|
d['userId'] = int(attrs.get('UserId', 0))
|
||||||
|
|
||||||
except Exception, e:
|
except Exception as e:
|
||||||
# could not parse this, so ignore the row completely
|
# could not parse this, so ignore the row completely
|
||||||
self.cur_props = None
|
self.cur_props = None
|
||||||
print('Exception: ' + str(e))
|
print('Exception: ' + str(e))
|
||||||
@ -188,7 +188,7 @@ class UserContentHandler(BaseContentHandler):
|
|||||||
d['upVotes'] = int(attrs.get('UpVotes', 0))
|
d['upVotes'] = int(attrs.get('UpVotes', 0))
|
||||||
d['downVotes'] = int(attrs.get('DownVotes', 0))
|
d['downVotes'] = int(attrs.get('DownVotes', 0))
|
||||||
|
|
||||||
except Exception, e:
|
except Exception as e:
|
||||||
# could not parse this, so ignore the row completely
|
# could not parse this, so ignore the row completely
|
||||||
self.cur_props = None
|
self.cur_props = None
|
||||||
print('Exception: ' + str(e))
|
print('Exception: ' + str(e))
|
||||||
@ -235,7 +235,7 @@ class PostContentHandler(xml.sax.ContentHandler):
|
|||||||
if hasattr(obj, 'isoformat'):
|
if hasattr(obj, 'isoformat'):
|
||||||
return obj.isoformat()
|
return obj.isoformat()
|
||||||
else:
|
else:
|
||||||
raise TypeError, 'Object of type %s with value of %s is not JSON serializable' % (type(obj), repr(obj))
|
raise TypeError('Object of type %s with value of %s is not JSON serializable' % (type(obj), repr(obj)))
|
||||||
|
|
||||||
def startElement(self, name, attrs):
|
def startElement(self, name, attrs):
|
||||||
if name != 'row':
|
if name != 'row':
|
||||||
@ -292,7 +292,7 @@ class PostContentHandler(xml.sax.ContentHandler):
|
|||||||
d['comments'] = [ ]
|
d['comments'] = [ ]
|
||||||
|
|
||||||
|
|
||||||
except Exception, e:
|
except Exception as e:
|
||||||
# could not parse this, so ignore the row completely
|
# could not parse this, so ignore the row completely
|
||||||
self.cur_props = None
|
self.cur_props = None
|
||||||
print('Exception: ' + str(e))
|
print('Exception: ' + str(e))
|
||||||
@ -338,7 +338,7 @@ class PostContentHandler(xml.sax.ContentHandler):
|
|||||||
# remove orphan answers from the orphan list
|
# remove orphan answers from the orphan list
|
||||||
del self.orphan_answers[d['id']]
|
del self.orphan_answers[d['id']]
|
||||||
|
|
||||||
except Exception, e:
|
except Exception as e:
|
||||||
# could not insert this, so ignore the row
|
# could not insert this, so ignore the row
|
||||||
print('Exception: ' + str(e))
|
print('Exception: ' + str(e))
|
||||||
import traceback
|
import traceback
|
||||||
@ -368,7 +368,7 @@ class PostContentHandler(xml.sax.ContentHandler):
|
|||||||
# question is complete, store it.
|
# question is complete, store it.
|
||||||
questions_to_commit.append(self.finalise_question(q))
|
questions_to_commit.append(self.finalise_question(q))
|
||||||
|
|
||||||
except Exception, e:
|
except Exception as e:
|
||||||
# could not serialise and insert this question, so ignore it
|
# could not serialise and insert this question, so ignore it
|
||||||
print('Exception: ' + str(e))
|
print('Exception: ' + str(e))
|
||||||
import traceback
|
import traceback
|
||||||
@ -499,7 +499,7 @@ class PostContentHandler(xml.sax.ContentHandler):
|
|||||||
if q['acceptedAnswerId'] in post_ids:
|
if q['acceptedAnswerId'] in post_ids:
|
||||||
question_obj['acceptedAnswerId'] = q['acceptedAnswerId']
|
question_obj['acceptedAnswerId'] = q['acceptedAnswerId']
|
||||||
else:
|
else:
|
||||||
print 'Question [ID# %i] had an unknown answer. Possibly been merged or migrated. Ignoring inconsistency.' % (q['id'], )
|
print('Question [ID# %i] had an unknown answer. Possibly been merged or migrated. Ignoring inconsistency.' % (q['id'], ))
|
||||||
question_obj['creationDate'] = q['creationDate']
|
question_obj['creationDate'] = q['creationDate']
|
||||||
question_obj['score'] = q['score']
|
question_obj['score'] = q['score']
|
||||||
question_obj['viewCount'] = q['viewCount']
|
question_obj['viewCount'] = q['viewCount']
|
||||||
@ -539,7 +539,7 @@ class PostContentHandler(xml.sax.ContentHandler):
|
|||||||
try:
|
try:
|
||||||
self.solr.add(questions, commit=commit)
|
self.solr.add(questions, commit=commit)
|
||||||
break
|
break
|
||||||
except SolrError, e:
|
except SolrError:
|
||||||
print('A Solr error occurred while committing questions - ')
|
print('A Solr error occurred while committing questions - ')
|
||||||
traceback.print_exc(file=sys.stdout)
|
traceback.print_exc(file=sys.stdout)
|
||||||
print('')
|
print('')
|
||||||
@ -572,7 +572,7 @@ class PostContentHandler(xml.sax.ContentHandler):
|
|||||||
# question is complete, store it.
|
# question is complete, store it.
|
||||||
questions_to_commit.append(self.finalise_question(q))
|
questions_to_commit.append(self.finalise_question(q))
|
||||||
|
|
||||||
except Exception, e:
|
except Exception as e:
|
||||||
# could not serialise and insert this question, so ignore it
|
# could not serialise and insert this question, so ignore it
|
||||||
print('Exception: ' + str(e))
|
print('Exception: ' + str(e))
|
||||||
import traceback
|
import traceback
|
||||||
@ -641,7 +641,7 @@ def get_file_path(dir_path, filename):
|
|||||||
return os.path.abspath(os.path.join(dir_path, matches[0]))
|
return os.path.abspath(os.path.join(dir_path, matches[0]))
|
||||||
|
|
||||||
|
|
||||||
def import_site(xml_root, dump_date,site_key)
|
def import_site(xml_root, dump_date, import_key):
|
||||||
print('Using the XML root path: ' + xml_root + '\n')
|
print('Using the XML root path: ' + xml_root + '\n')
|
||||||
|
|
||||||
if not os.path.exists(xml_root):
|
if not os.path.exists(xml_root):
|
||||||
@ -654,18 +654,6 @@ def import_site(xml_root, dump_date,site_key)
|
|||||||
sqlhub.processConnection = connectionForURI(conn_str)
|
sqlhub.processConnection = connectionForURI(conn_str)
|
||||||
print('Connected.\n')
|
print('Connected.\n')
|
||||||
|
|
||||||
# connect to solr
|
|
||||||
print('Connecting to solr...')
|
|
||||||
solr = Solr(settings.SOLR_URL, assume_clean=True)
|
|
||||||
# pysolr doesn't try to connect until a request is made, so we'll make a ping request
|
|
||||||
try:
|
|
||||||
solr._send_request('GET', 'admin/ping')
|
|
||||||
except socket.error, e:
|
|
||||||
print('Failed to connect to solr - error was: %s' % str(e))
|
|
||||||
print('Aborting.')
|
|
||||||
sys.exit(2)
|
|
||||||
print('Connected.\n')
|
|
||||||
|
|
||||||
# ensure required tables exist
|
# ensure required tables exist
|
||||||
print("Creating tables if they don't exist...")
|
print("Creating tables if they don't exist...")
|
||||||
Site.createTable(ifNotExists=True)
|
Site.createTable(ifNotExists=True)
|
||||||
@ -674,228 +662,141 @@ def import_site(xml_root, dump_date,site_key)
|
|||||||
print('Created.\n')
|
print('Created.\n')
|
||||||
|
|
||||||
# SITE INFO
|
# SITE INFO
|
||||||
# only look if they were not specified at the command line; also only if
|
# only look if they were not specified at the command line;
|
||||||
# readme.txt exists (they don't in dumps after Aug 2012)
|
|
||||||
readme_path = get_file_path(xml_root, 'readme.txt')
|
|
||||||
if not (site_name and dump_date) and readme_path:
|
|
||||||
# get the site name from the first line of readme.txt. This could be fragile.
|
|
||||||
with open(readme_path, 'r') as f:
|
|
||||||
site_readme_desc = f.readline().strip()
|
|
||||||
|
|
||||||
# assume if there's a colon in the name, the name part is before, and the date
|
se_dir = os.path.join(os.environ.get('HOME'), 'stackexchange')
|
||||||
# part is after.
|
sites_path = os.path.join(se_dir, 'Sites.xml')
|
||||||
if ':' in site_readme_desc:
|
|
||||||
readme_site_name, readme_dump_date = site_readme_desc.split(':')
|
with open(sites_path) as f:
|
||||||
readme_site_name = readme_site_name.strip()
|
sites_file = ElementTree.parse(f)
|
||||||
readme_dump_date = readme_dump_date.strip()
|
sites = sites_file.findall('row')
|
||||||
|
# print(rows[0].attrib)
|
||||||
|
|
||||||
|
for site in sites:
|
||||||
|
site_title = site.attrib['LongName']
|
||||||
|
site_name = site.attrib['Name']
|
||||||
|
# extract the key from the url - remove the http:// and .com
|
||||||
|
site_key = site.attrib['TinyName']
|
||||||
|
site_url = site.attrib['Url'][8:]
|
||||||
|
logo_url = site.attrib['ImageUrl']
|
||||||
|
icon_url = site.attrib['IconUrl']
|
||||||
|
badge_url = site.attrib['BadgeIconUrl']
|
||||||
|
|
||||||
|
if (import_key != '') and (import_key != site_key):
|
||||||
|
continue
|
||||||
else:
|
else:
|
||||||
readme_site_name = site_readme_desc
|
print('site_name: '+site_name)
|
||||||
readme_dump_date = None
|
|
||||||
|
|
||||||
# if the phrase ' - Data Dump' is in the readme site name, remove it
|
# check if site is already in database; if so, purge the data.
|
||||||
i = readme_site_name.rfind(' - Data Dump')
|
site = list(Site.select(Site.q.key==site_key))
|
||||||
if i >= 0:
|
if len(site) > 0:
|
||||||
readme_site_name = readme_site_name[:i].strip()
|
site = site[0]
|
||||||
|
print('Deleting site "%s" from the database... ' % site.name)
|
||||||
|
sys.stdout.flush()
|
||||||
|
# Using SQLObject to delete rows takes too long, so we're going to do it directly
|
||||||
|
#Site.delete(site.id) # the relationship cascades, so other rows will be deleted
|
||||||
|
sqlhub.threadConnection = sqlhub.processConnection.transaction()
|
||||||
|
conn = sqlhub.threadConnection
|
||||||
|
# these deletions are done in this order to avoid FK constraint issues
|
||||||
|
print('\tDeleting badges...')
|
||||||
|
conn.query(conn.sqlrepr(Delete(Badge.sqlmeta.table, where=(Badge.q.site==site))))
|
||||||
|
print('\tDeleting users...')
|
||||||
|
conn.query(conn.sqlrepr(Delete(User.sqlmeta.table, where=(User.q.site==site))))
|
||||||
|
print('\tDeleting site...')
|
||||||
|
conn.query(conn.sqlrepr(Delete(Site.sqlmeta.table, where=(Site.q.id==site.id))))
|
||||||
|
sqlhub.threadConnection.commit(close=True)
|
||||||
|
print('Deleted.\n')
|
||||||
|
|
||||||
if not site_name:
|
print('Deleting site "%s" from the solr... ' % site.name)
|
||||||
site_name = readme_site_name
|
solr.delete(q='siteKey:"%s"' % site.key, commit=False)
|
||||||
if not dump_date:
|
solr.commit(expungeDeletes=True)
|
||||||
dump_date = readme_dump_date
|
print('Deleted.\n')
|
||||||
|
|
||||||
# look for the site in the sites RSS file using the base_url with the id in RSS
|
# create the temporary comments database
|
||||||
# scrub the URL scheme off the base_url
|
print('Connecting to the temporary comments database...')
|
||||||
if site_base_url:
|
temp_db_file, temp_db_path = tempfile.mkstemp('.sqlite', 'temp_comment_db-' + re.sub(r'[^\w]', '_', site_key) + '-', settings.TEMP_COMMENTS_DATABASE_DIR)
|
||||||
# if there is no URL scheme, add one so it can be parsed by urllib2 so it
|
os.close(temp_db_file)
|
||||||
# can strip off other bits in the URL that we don't want
|
conn_str = 'sqlite:///' + temp_db_path
|
||||||
if '://' not in site_base_url:
|
comment_db_sqlhub.processConnection = connectionForURI(conn_str)
|
||||||
site_base_url = 'http://%s' % site_base_url
|
print('Connected.')
|
||||||
site_base_url = urllib2.Request(site_base_url).get_host()
|
Comment.createTable()
|
||||||
|
print('Schema created.')
|
||||||
|
comment_db_sqlhub.processConnection.getConnection().execute('PRAGMA synchronous = OFF')
|
||||||
|
comment_db_sqlhub.processConnection.getConnection().execute('PRAGMA journal_mode = MEMORY')
|
||||||
|
print('Pragma configured.\n')
|
||||||
|
|
||||||
# attempt to get more information from the sites RSS cache
|
timing_start = time.time()
|
||||||
if site_base_url and not (site_name and site_desc and site_key):
|
|
||||||
sites_file_path = os.path.join(script_dir, '../../../../data/sites')
|
|
||||||
if os.path.exists(sites_file_path):
|
|
||||||
with open(sites_file_path) as f:
|
|
||||||
sites_file = ElementTree.parse(f)
|
|
||||||
entries = sites_file.findall('{http://www.w3.org/2005/Atom}entry')
|
|
||||||
|
|
||||||
for entry in entries:
|
# start a new transaction
|
||||||
entry_base_url = entry.find('{http://www.w3.org/2005/Atom}id').text
|
|
||||||
if '://' in entry_base_url:
|
|
||||||
entry_base_url = urllib2.Request(entry_base_url).get_host()
|
|
||||||
if site_base_url == entry_base_url:
|
|
||||||
# this entry matches the detected site id
|
|
||||||
if not site_key:
|
|
||||||
# extract the key from the url
|
|
||||||
rss_site_key = entry.find('{http://www.w3.org/2005/Atom}id').text
|
|
||||||
# remove the URL scheme
|
|
||||||
if '://' in rss_site_key:
|
|
||||||
rss_site_key = rss_site_key[rss_site_key.find('://')+3:]
|
|
||||||
# remove the TLD
|
|
||||||
if rss_site_key.rfind('.') >= 0:
|
|
||||||
rss_site_key = rss_site_key[:rss_site_key.rfind('.')]
|
|
||||||
# remove the .stackexchange bit
|
|
||||||
if '.stackexchange' in rss_site_key:
|
|
||||||
rss_site_key = rss_site_key[:rss_site_key.find('.stackexchange')]
|
|
||||||
|
|
||||||
site_key = rss_site_key
|
|
||||||
|
|
||||||
if not site_name:
|
|
||||||
site_name = entry.find('{http://www.w3.org/2005/Atom}title').text.strip()
|
|
||||||
if not site_desc:
|
|
||||||
site_desc = entry.find('{http://www.w3.org/2005/Atom}summary').text.strip()
|
|
||||||
|
|
||||||
print 'Name: %s\nKey: %s\nDescription: %s\nDump Date: %s\nBase URL: %s\n' % (
|
|
||||||
site_name.encode('ascii', 'ignore') if site_name else None,
|
|
||||||
site_key,
|
|
||||||
site_desc.encode('ascii', 'ignore') if site_desc else None,
|
|
||||||
dump_date,
|
|
||||||
site_base_url
|
|
||||||
)
|
|
||||||
|
|
||||||
# the base URL is optional.
|
|
||||||
if not (site_name and site_key and site_desc and dump_date):
|
|
||||||
print 'Could not get all the details for the site.'
|
|
||||||
print 'Use command-line parameters to specify the missing details (listed as None).'
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
# prevent importing sites with keys that clash with method names in the app,
|
|
||||||
# e.g. a site key of 'search' would clash with the Stackdump-wide search page.
|
|
||||||
if site_key in ('search', 'import', 'media', 'licenses'):
|
|
||||||
print 'The site key given, %s, is a reserved word in Stackdump.' % site_key
|
|
||||||
print 'Use the --site-key parameter to specify an alternate site key.'
|
|
||||||
sys.exit(2)
|
|
||||||
|
|
||||||
# confirm site details with user to make sure we don't accidentally overwrite
|
|
||||||
# another site.
|
|
||||||
if not answer_yes:
|
|
||||||
confirm_prompt = 'Are these details correct (answer "yes" to proceed, anything else to abort)? '
|
|
||||||
confirm_answer = raw_input(confirm_prompt)
|
|
||||||
if confirm_answer != 'yes':
|
|
||||||
print 'Import aborted on user request.'
|
|
||||||
sys.exit(3)
|
|
||||||
|
|
||||||
# rollback any uncommitted entries in solr. Uncommitted entries may occur if
|
|
||||||
# this import process is aborted. Solr doesn't have the concept of transactions
|
|
||||||
# like databases do, so without a rollback, we'll be committing the previously
|
|
||||||
# uncommitted entries plus the newly imported ones.
|
|
||||||
#
|
|
||||||
# This also means multiple dataproc processes cannot occur concurrently. If you
|
|
||||||
# do the import will be silently incomplete.
|
|
||||||
print('Clearing any uncommitted entries in solr...')
|
|
||||||
solr._update('<rollback />', waitFlush=None, waitSearcher=None)
|
|
||||||
print('Cleared.\n')
|
|
||||||
|
|
||||||
# check if site is already in database; if so, purge the data.
|
|
||||||
site = list(Site.select(Site.q.key==site_key))
|
|
||||||
if len(site) > 0:
|
|
||||||
site = site[0]
|
|
||||||
print('Deleting site "%s" from the database... ' % site.name)
|
|
||||||
sys.stdout.flush()
|
|
||||||
# Using SQLObject to delete rows takes too long, so we're going to do it directly
|
|
||||||
#Site.delete(site.id) # the relationship cascades, so other rows will be deleted
|
|
||||||
sqlhub.threadConnection = sqlhub.processConnection.transaction()
|
sqlhub.threadConnection = sqlhub.processConnection.transaction()
|
||||||
conn = sqlhub.threadConnection
|
conn = sqlhub.threadConnection
|
||||||
# these deletions are done in this order to avoid FK constraint issues
|
comment_db_sqlhub.threadConnection = comment_db_sqlhub.processConnection.transaction()
|
||||||
print('\tDeleting badges...')
|
temp_db_conn = comment_db_sqlhub.threadConnection
|
||||||
conn.query(conn.sqlrepr(Delete(Badge.sqlmeta.table, where=(Badge.q.site==site))))
|
|
||||||
print('\tDeleting users...')
|
# create a new Site
|
||||||
conn.query(conn.sqlrepr(Delete(User.sqlmeta.table, where=(User.q.site==site))))
|
site = Site(name=site_name, desc=site_desc, key=site_key, dump_date=dump_date,
|
||||||
print('\tDeleting site...')
|
import_date=datetime.now(), base_url=site_base_url)
|
||||||
conn.query(conn.sqlrepr(Delete(Site.sqlmeta.table, where=(Site.q.id==site.id))))
|
|
||||||
|
# BADGES
|
||||||
|
# Processing of badges has been disabled because they don't offer any useful
|
||||||
|
# information in the offline situation.
|
||||||
|
#print('[badge] PARSING BADGES...')
|
||||||
|
#xml_path = get_file_path(xml_root, 'badges.xml')
|
||||||
|
#print('[badge] start parsing badges.xml...')
|
||||||
|
#handler = BadgeContentHandler(conn, site)
|
||||||
|
#xml.sax.parse(xml_path, handler)
|
||||||
|
#print('[badge]\tProcessed %d rows.' % (handler.row_count))
|
||||||
|
#print('[badge] FINISHED PARSING BADGES.\n')
|
||||||
|
|
||||||
|
# COMMENTS
|
||||||
|
# comments are temporarily stored in the database for retrieval when parsing
|
||||||
|
# posts only.
|
||||||
|
print('[comment] PARSING COMMENTS...')
|
||||||
|
xml_path = get_file_path(xml_root, 'comments.xml')
|
||||||
|
print('[comment] start parsing comments.xml...')
|
||||||
|
handler = CommentContentHandler(temp_db_conn, site)
|
||||||
|
xml.sax.parse(xml_path, handler)
|
||||||
|
print('%-10s Processed %d rows.' % ('[comment]', handler.row_count))
|
||||||
|
print('[comment] FINISHED PARSING COMMENTS.\n')
|
||||||
|
|
||||||
|
# USERS
|
||||||
|
print('[user] PARSING USERS...')
|
||||||
|
xml_path = get_file_path(xml_root, 'users.xml')
|
||||||
|
print('[user] start parsing users.xml...')
|
||||||
|
handler = UserContentHandler(conn, site)
|
||||||
|
xml.sax.parse(xml_path, handler)
|
||||||
|
print('%-10s Processed %d rows.' % ('[user]', handler.row_count))
|
||||||
|
print('[user] FINISHED PARSING USERS.\n')
|
||||||
|
|
||||||
|
# POSTS
|
||||||
|
# posts are added directly to the Solr index; they are not added to the database.
|
||||||
|
print('[post] PARSING POSTS...')
|
||||||
|
xml_path = get_file_path(xml_root, 'posts.xml')
|
||||||
|
print('[post] start parsing posts.xml...')
|
||||||
|
handler = PostContentHandler(solr, site)
|
||||||
|
xml.sax.parse(xml_path, handler)
|
||||||
|
handler.commit_all_questions()
|
||||||
|
print('%-10s Processed %d rows.' % ('[post]', handler.row_count))
|
||||||
|
|
||||||
|
print('[post] FINISHED PARSING POSTS.\n')
|
||||||
|
|
||||||
|
# DELETE COMMENTS
|
||||||
|
print('[comment] DELETING TEMPORARY COMMENTS DATABASE (they are no longer needed)...')
|
||||||
|
temp_db_conn.commit(close=True)
|
||||||
|
comment_db_sqlhub.processConnection.close()
|
||||||
|
os.remove(temp_db_path)
|
||||||
|
print('[comment] FINISHED DELETING COMMENTS.\n')
|
||||||
|
|
||||||
|
# commit transaction
|
||||||
|
print('COMMITTING IMPORTED DATA TO DISK...')
|
||||||
sqlhub.threadConnection.commit(close=True)
|
sqlhub.threadConnection.commit(close=True)
|
||||||
print('Deleted.\n')
|
solr.commit()
|
||||||
|
print('FINISHED COMMITTING IMPORTED DATA TO DISK.\n')
|
||||||
|
|
||||||
print('Deleting site "%s" from the solr... ' % site.name)
|
timing_end = time.time()
|
||||||
solr.delete(q='siteKey:"%s"' % site.key, commit=False)
|
|
||||||
solr.commit(expungeDeletes=True)
|
|
||||||
print('Deleted.\n')
|
|
||||||
|
|
||||||
# create the temporary comments database
|
print('Time taken for site insertion into Stackdump: %f seconds.' % (timing_end - timing_start))
|
||||||
print('Connecting to the temporary comments database...')
|
print('')
|
||||||
temp_db_file, temp_db_path = tempfile.mkstemp('.sqlite', 'temp_comment_db-' + re.sub(r'[^\w]', '_', site_key) + '-', settings.TEMP_COMMENTS_DATABASE_DIR)
|
|
||||||
os.close(temp_db_file)
|
|
||||||
conn_str = 'sqlite:///' + temp_db_path
|
|
||||||
comment_db_sqlhub.processConnection = connectionForURI(conn_str)
|
|
||||||
print('Connected.')
|
|
||||||
Comment.createTable()
|
|
||||||
print('Schema created.')
|
|
||||||
comment_db_sqlhub.processConnection.getConnection().execute('PRAGMA synchronous = OFF')
|
|
||||||
comment_db_sqlhub.processConnection.getConnection().execute('PRAGMA journal_mode = MEMORY')
|
|
||||||
print('Pragma configured.\n')
|
|
||||||
|
|
||||||
timing_start = time.time()
|
|
||||||
|
|
||||||
# start a new transaction
|
|
||||||
sqlhub.threadConnection = sqlhub.processConnection.transaction()
|
|
||||||
conn = sqlhub.threadConnection
|
|
||||||
comment_db_sqlhub.threadConnection = comment_db_sqlhub.processConnection.transaction()
|
|
||||||
temp_db_conn = comment_db_sqlhub.threadConnection
|
|
||||||
|
|
||||||
# create a new Site
|
|
||||||
site = Site(name=site_name, desc=site_desc, key=site_key, dump_date=dump_date,
|
|
||||||
import_date=datetime.now(), base_url=site_base_url)
|
|
||||||
|
|
||||||
# BADGES
|
|
||||||
# Processing of badges has been disabled because they don't offer any useful
|
|
||||||
# information in the offline situation.
|
|
||||||
#print('[badge] PARSING BADGES...')
|
|
||||||
#xml_path = get_file_path(xml_root, 'badges.xml')
|
|
||||||
#print('[badge] start parsing badges.xml...')
|
|
||||||
#handler = BadgeContentHandler(conn, site)
|
|
||||||
#xml.sax.parse(xml_path, handler)
|
|
||||||
#print('[badge]\tProcessed %d rows.' % (handler.row_count))
|
|
||||||
#print('[badge] FINISHED PARSING BADGES.\n')
|
|
||||||
|
|
||||||
# COMMENTS
|
|
||||||
# comments are temporarily stored in the database for retrieval when parsing
|
|
||||||
# posts only.
|
|
||||||
print('[comment] PARSING COMMENTS...')
|
|
||||||
xml_path = get_file_path(xml_root, 'comments.xml')
|
|
||||||
print('[comment] start parsing comments.xml...')
|
|
||||||
handler = CommentContentHandler(temp_db_conn, site)
|
|
||||||
xml.sax.parse(xml_path, handler)
|
|
||||||
print('%-10s Processed %d rows.' % ('[comment]', handler.row_count))
|
|
||||||
print('[comment] FINISHED PARSING COMMENTS.\n')
|
|
||||||
|
|
||||||
# USERS
|
|
||||||
print('[user] PARSING USERS...')
|
|
||||||
xml_path = get_file_path(xml_root, 'users.xml')
|
|
||||||
print('[user] start parsing users.xml...')
|
|
||||||
handler = UserContentHandler(conn, site)
|
|
||||||
xml.sax.parse(xml_path, handler)
|
|
||||||
print('%-10s Processed %d rows.' % ('[user]', handler.row_count))
|
|
||||||
print('[user] FINISHED PARSING USERS.\n')
|
|
||||||
|
|
||||||
# POSTS
|
|
||||||
# posts are added directly to the Solr index; they are not added to the database.
|
|
||||||
print('[post] PARSING POSTS...')
|
|
||||||
xml_path = get_file_path(xml_root, 'posts.xml')
|
|
||||||
print('[post] start parsing posts.xml...')
|
|
||||||
handler = PostContentHandler(solr, site)
|
|
||||||
xml.sax.parse(xml_path, handler)
|
|
||||||
handler.commit_all_questions()
|
|
||||||
print('%-10s Processed %d rows.' % ('[post]', handler.row_count))
|
|
||||||
|
|
||||||
print('[post] FINISHED PARSING POSTS.\n')
|
|
||||||
|
|
||||||
# DELETE COMMENTS
|
|
||||||
print('[comment] DELETING TEMPORARY COMMENTS DATABASE (they are no longer needed)...')
|
|
||||||
temp_db_conn.commit(close=True)
|
|
||||||
comment_db_sqlhub.processConnection.close()
|
|
||||||
os.remove(temp_db_path)
|
|
||||||
print('[comment] FINISHED DELETING COMMENTS.\n')
|
|
||||||
|
|
||||||
# commit transaction
|
|
||||||
print('COMMITTING IMPORTED DATA TO DISK...')
|
|
||||||
sqlhub.threadConnection.commit(close=True)
|
|
||||||
solr.commit()
|
|
||||||
print('FINISHED COMMITTING IMPORTED DATA TO DISK.\n')
|
|
||||||
|
|
||||||
timing_end = time.time()
|
|
||||||
|
|
||||||
print('Time taken for site insertion into Stackdump: %f seconds.' % (timing_end - timing_start))
|
|
||||||
print('')
|
|
||||||
|
|
||||||
# MAIN METHOD
|
# MAIN METHOD
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
@ -44,7 +44,7 @@ then
|
|||||||
echo "Using Python `which "$PYTHON_CMD"`"
|
echo "Using Python `which "$PYTHON_CMD"`"
|
||||||
|
|
||||||
# execution ends here if Python is found
|
# execution ends here if Python is found
|
||||||
PYTHONPATH=$SCRIPT_DIR/python3/packages:$SCRIPT_DIR/python3/src:$PYTHONPATH
|
PYTHONPATH=$SCRIPT_DIR/pyth3/packages:$SCRIPT_DIR/python/src:$PYTHONPATH
|
||||||
env "PYTHONPATH=$PYTHONPATH" "$PYTHON_CMD" "$@"
|
env "PYTHONPATH=$PYTHONPATH" "$PYTHON_CMD" "$@"
|
||||||
exit $?
|
exit $?
|
||||||
fi
|
fi
|
||||||
|
Loading…
Reference in New Issue
Block a user