Mirror of https://github.com/djohnlewis/stackdump
Refactored the import_site command; now caters for filenames of different case (the case changed in 2013 dumps).
parent 70fa72b04e
commit a472517736
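
The core of the change is a case-insensitive filename lookup: the 2013 data dumps changed the case of the XML filenames (e.g. posts.xml vs. Posts.xml), so the importer can no longer just join the dump directory with a hard-coded lowercase name. A minimal standalone sketch of the idea follows; the commit's actual helper is the get_file_path() function added in the diff below, and the function name and example paths here are illustrative only:

    import os

    def find_file_ignoring_case(dir_path, filename):
        # compare names case-insensitively so 'posts.xml' also matches 'Posts.xml'
        matches = [f for f in os.listdir(dir_path)
                   if os.path.isfile(os.path.join(dir_path, f))
                   and f.lower() == filename.lower()]
        if not matches:
            return None
        if len(matches) > 1:
            raise ValueError('More than one file matched: %r' % matches)
        return os.path.abspath(os.path.join(dir_path, matches[0]))

    # e.g. find_file_ignoring_case('/data/dump', 'posts.xml')
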
@@ -39,7 +39,8 @@ class BaseContentHandler(xml.sax.ContentHandler):
 """
 Base content handler.
 """
-def __init__(self, site, obj_class):
+def __init__(self, conn, site, obj_class):
+self.conn = conn
 self.site = site
 self.obj_class = obj_class
 self.cur_props = None
@@ -73,7 +74,7 @@ class BaseContentHandler(xml.sax.ContentHandler):
 # need to convert the attr names to DB column names
 props_for_db[self.db_style.pythonAttrToDBColumn(k)] = v

-conn.query(conn.sqlrepr(Insert(self.obj_class.sqlmeta.table, values=props_for_db)))
+self.conn.query(self.conn.sqlrepr(Insert(self.obj_class.sqlmeta.table, values=props_for_db)))

 except Exception, e:
 # could not insert this, so ignore the row
@@ -95,8 +96,8 @@ class BadgeContentHandler(BaseContentHandler):

 <row Id="15" UserId="6" Name="Supporter" Date="2010-05-19T21:57:31.000" />
 """
-def __init__(self, site):
-BaseContentHandler.__init__(self, site, Badge)
+def __init__(self, conn, site):
+BaseContentHandler.__init__(self, conn, site, Badge)

 def startElement(self, name, attrs):
 if name != 'row':
@@ -126,8 +127,8 @@ class CommentContentHandler(BaseContentHandler):
 key. " CreationDate="2010-05-19T23:48:05.680" UserId="23" />

 """
-def __init__(self, site):
-BaseContentHandler.__init__(self, site, Comment)
+def __init__(self, conn, site):
+BaseContentHandler.__init__(self, conn, site, Comment)

 def startElement(self, name, attrs):
 if name != 'row':
@@ -169,15 +170,15 @@ class UserContentHandler(BaseContentHandler):
 </p>
 " Views="52" UpVotes="11" DownVotes="1" />

 """
-def __init__(self, site):
-BaseContentHandler.__init__(self, site, User)
+def __init__(self, conn, site):
+BaseContentHandler.__init__(self, conn, site, User)

 def startElement(self, name, attrs):
 if name != 'row':
 return

 try:
-d = self.cur_props = { 'site' : site }
+d = self.cur_props = { 'site' : self.site }
 d['sourceId'] = int(attrs['Id'])
 d['reputation'] = int(attrs.get('Reputation', 0))
 d['creationDate'] = datetime.strptime(attrs.get('CreationDate'), ISO_DATE_FORMAT)
@@ -226,7 +227,8 @@ class PostContentHandler(xml.sax.ContentHandler):
 """
 TAGS_RE = re.compile(u'<([^>]+)>')

-def __init__(self, site):
+def __init__(self, solr, site):
+self.solr = solr
 self.site = site
 self.unfinished_questions = { }
 self.orphan_answers = { }
@@ -513,7 +515,7 @@ class PostContentHandler(xml.sax.ContentHandler):
 """
 Commits the given list of questions to solr.
 """
-solr.add(questions, commit=commit)
+self.solr.add(questions, commit=commit)

 def commit_all_questions(self):
 """
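
The handler changes above all follow the same pattern: the database connection and the Solr client are now passed into the SAX content handlers as constructor arguments instead of being read from module-level names, which is what allows the import logic to move into a reusable function further down. A minimal sketch of that pattern, with illustrative class and argument names that are not part of the project:

    import xml.sax

    class RowHandler(xml.sax.ContentHandler):
        def __init__(self, conn, site):
            xml.sax.ContentHandler.__init__(self)
            self.conn = conn    # injected database connection
            self.site = site    # the Site record the rows belong to

        def startElement(self, name, attrs):
            if name != 'row':
                return
            # ... build a row dict from attrs and insert it via self.conn ...
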
@@ -548,61 +550,78 @@ class PostContentHandler(xml.sax.ContentHandler):
 for question_id, answers in self.orphan_answers.items():
 print('There are %d answers for missing question [ID# %d]. Ignoring orphan answers.' % (len(answers), question_id))

-# MAIN METHOD
-parser = OptionParser(usage='usage: %prog [options] xml_root_dir')
-parser.add_option('-n', '--site-name', help='Name of the site.')
-parser.add_option('-d', '--site-desc', help='Description of the site (if not in sites).')
-parser.add_option('-k', '--site-key', help='Key of the site (if not in sites).')
-parser.add_option('-c', '--dump-date', help='Dump date of the site.')
-parser.add_option('-u', '--base-url', help='Base URL of the site on the web.')
-parser.add_option('-Y', help='Answer yes to any confirmation questions.', dest='answer_yes', action='store_true', default=False)
-
-(cmd_options, cmd_args) = parser.parse_args()
-
-if len(cmd_args) < 1:
-print('The path to the directory containing the extracted XML files is required.')
-sys.exit(1)
-
-xml_root = cmd_args[0]
-print('Using the XML root path: ' + xml_root + '\n')
-
-if not os.path.exists(xml_root):
+# METHODS
+def get_file_path(dir_path, filename):
+"""
+Returns the absolute path to the file, matching the file in a
+case-insensitive manner.
+
+If multiple files match the filename (e.g. on case-sensitive filesystems) a
+ValueError will be thrown.
+
+If no matching files are found, None will be returned.
+
+@param dir_path: the directory containing the file
+@param filename: the filename to make a case-insensitive match with
+@return: the absolute path to the matched file
+"""
+files = [ f for f in os.listdir(dir_path) ]
+matches = [ ]
+
+for f in files:
+if not os.path.isfile(os.path.join(dir_path, f)):
+continue
+if f.lower() == filename.lower():
+matches.append(f)
+
+if len(matches) == 0:
+return None
+
+if len(matches) > 1:
+raise ValueError('More than one file matched the given filename - ' + repr(matches))
+
+return os.path.abspath(os.path.join(dir_path, matches[0]))
+
+
+def import_site(xml_root, site_name, dump_date, site_desc, site_key,
+site_base_url, answer_yes=False):
+print('Using the XML root path: ' + xml_root + '\n')
+
+if not os.path.exists(xml_root):
 print('The given XML root path does not exist.')
 sys.exit(1)

 # connect to the database
 print('Connecting to the database...')
 conn_str = settings.DATABASE_CONN_STR
 sqlhub.processConnection = connectionForURI(conn_str)
 print('Connected.\n')

 # connect to solr
 print('Connecting to solr...')
 solr = Solr(settings.SOLR_URL)
 # pysolr doesn't try to connect until a request is made, so we'll make a ping request
 try:
 solr._send_request('GET', '%s/admin/ping' % solr.path)
 except socket.error, e:
 print('Failed to connect to solr - error was: %s' % str(e))
 print('Aborting.')
 sys.exit(2)
 print('Connected.\n')

 # ensure required tables exist
 print("Creating tables if they don't exist...")
 Site.createTable(ifNotExists=True)
 Badge.createTable(ifNotExists=True)
 Comment.createTable(ifNotExists=True)
 User.createTable(ifNotExists=True)
 print('Created.\n')

 # SITE INFO
-site_name = cmd_options.site_name
-dump_date = cmd_options.dump_date
-# only look if they were not specified at the command line; also only if
-# readme.txt exists (they don't in dumps after Aug 2012)
-readme_path = os.path.join(xml_root, 'readme.txt')
-if not (site_name and dump_date) and os.path.exists(readme_path):
+# only look if they were not specified at the command line; also only if
+# readme.txt exists (they don't in dumps after Aug 2012)
+readme_path = get_file_path(xml_root, 'readme.txt')
+if not (site_name and dump_date) and readme_path:
 # get the site name from the first line of readme.txt. This could be fragile.
 with open(readme_path, 'r') as f:
 site_readme_desc = f.readline().strip()
@@ -627,21 +646,17 @@ if not (site_name and dump_date) and os.path.exists(readme_path):
 if not dump_date:
 dump_date = readme_dump_date

 # look for the site in the sites RSS file using the base_url with the id in RSS
-site_desc = cmd_options.site_desc
-site_key = cmd_options.site_key
-site_base_url = cmd_options.base_url
-
-# scrub the URL scheme off the base_url
-if site_base_url:
+# scrub the URL scheme off the base_url
+if site_base_url:
 # if there is no URL scheme, add one so it can be parsed by urllib2 so it
 # can strip off other bits in the URL that we don't want
 if '://' not in site_base_url:
 site_base_url = 'http://%s' % site_base_url
 site_base_url = urllib2.Request(site_base_url).get_host()

 # attempt to get more information from the sites RSS cache
 if site_base_url and not (site_name and site_desc and site_key):
 sites_file_path = os.path.join(script_dir, '../../../../data/sites')
 if os.path.exists(sites_file_path):
 with open(sites_file_path) as f:
@@ -674,44 +689,44 @@ if site_base_url and not (site_name and site_desc and site_key):
 if not site_desc:
 site_desc = entry.find('{http://www.w3.org/2005/Atom}summary').text.strip()

 print 'Name: %s\nKey: %s\nDescription: %s\nDump Date: %s\nBase URL: %s\n' % (site_name, site_key, site_desc, dump_date, site_base_url)

 # the base URL is optional.
 if not (site_name and site_key and site_desc and dump_date):
 print 'Could not get all the details for the site.'
 print 'Use command-line parameters to specify the missing details (listed as None).'
 sys.exit(1)

 # prevent importing sites with keys that clash with method names in the app,
 # e.g. a site key of 'search' would clash with the Stackdump-wide search page.
 if site_key in ('search', 'import', 'media', 'licenses'):
 print 'The site key given, %s, is a reserved word in Stackdump.' % site_key
 print 'Use the --site-key parameter to specify an alternate site key.'
 sys.exit(2)

 # confirm site details with user to make sure we don't accidentally overwrite
 # another site.
-if not cmd_options.answer_yes:
+if not answer_yes:
 confirm_prompt = 'Are these details correct (answer "yes" to proceed, anything else to abort)? '
 confirm_answer = raw_input(confirm_prompt)
 if confirm_answer != 'yes':
 print 'Import aborted on user request.'
 sys.exit(3)

 # rollback any uncommitted entries in solr. Uncommitted entries may occur if
 # this import process is aborted. Solr doesn't have the concept of transactions
 # like databases do, so without a rollback, we'll be committing the previously
 # uncommitted entries plus the newly imported ones.
 #
 # This also means multiple dataproc processes cannot occur concurrently. If you
 # do the import will be silently incomplete.
 print('Clearing any uncommitted entries in solr...')
 solr._update('<rollback />', waitFlush=None, waitSearcher=None)
 print('Cleared.\n')

 # check if site is already in database; if so, purge the data.
 site = list(Site.select(Site.q.key==site_key))
 if len(site) > 0:
 site = site[0]
 print('Deleting site "%s" from the database... ' % site.name)
 sys.stdout.flush()
@@ -736,71 +751,91 @@ if len(site) > 0:
 solr.commit(expungeDeletes=True)
 print('Deleted.\n')

 timing_start = time.time()

 # start a new transaction
 sqlhub.threadConnection = sqlhub.processConnection.transaction()
 conn = sqlhub.threadConnection

 # create a new Site
 site = Site(name=site_name, desc=site_desc, key=site_key, dump_date=dump_date,
 import_date=datetime.now(), base_url=site_base_url)

 # BADGES
 # Processing of badges has been disabled because they don't offer any useful
 # information in the offline situation.
 #print('[badge] PARSING BADGES...')
-#xml_path = os.path.join(xml_root, 'badges.xml')
+#xml_path = get_file_path(xml_root, 'badges.xml')
 #print('[badge] start parsing badges.xml...')
-#handler = BadgeContentHandler(site)
+#handler = BadgeContentHandler(conn, site)
 #xml.sax.parse(xml_path, handler)
 #print('[badge]\tProcessed %d rows.' % (handler.row_count))
 #print('[badge] FINISHED PARSING BADGES.\n')

 # COMMENTS
 # comments are temporarily stored in the database for retrieval when parsing
 # posts only.
 print('[comment] PARSING COMMENTS...')
-xml_path = os.path.join(xml_root, 'comments.xml')
+xml_path = get_file_path(xml_root, 'comments.xml')
 print('[comment] start parsing comments.xml...')
-handler = CommentContentHandler(site)
+handler = CommentContentHandler(conn, site)
 xml.sax.parse(xml_path, handler)
 print('%-10s Processed %d rows.' % ('[comment]', handler.row_count))
 print('[comment] FINISHED PARSING COMMENTS.\n')

 # USERS
 print('[user] PARSING USERS...')
-xml_path = os.path.join(xml_root, 'users.xml')
+xml_path = get_file_path(xml_root, 'users.xml')
 print('[user] start parsing users.xml...')
-handler = UserContentHandler(site)
+handler = UserContentHandler(conn, site)
 xml.sax.parse(xml_path, handler)
 print('%-10s Processed %d rows.' % ('[user]', handler.row_count))
 print('[user] FINISHED PARSING USERS.\n')

 # POSTS
 # posts are added directly to the Solr index; they are not added to the database.
 print('[post] PARSING POSTS...')
-xml_path = os.path.join(xml_root, 'posts.xml')
+xml_path = get_file_path(xml_root, 'posts.xml')
 print('[post] start parsing posts.xml...')
-handler = PostContentHandler(site)
+handler = PostContentHandler(solr, site)
 xml.sax.parse(xml_path, handler)
 handler.commit_all_questions()
 print('%-10s Processed %d rows.' % ('[post]', handler.row_count))

 print('[post] FINISHED PARSING POSTS.\n')

 # DELETE COMMENTS
 print('[comment] DELETING COMMENTS FROM DATABASE (they are no longer needed)...')
 conn.query(conn.sqlrepr(Delete(Comment.sqlmeta.table, where=(Comment.q.site == site))))
 print('[comment] FINISHED DELETING COMMENTS.\n')

 # commit transaction
 print('COMMITTING IMPORTED DATA TO DISK...')
 sqlhub.threadConnection.commit(close=True)
 solr.commit()
 print('FINISHED COMMITTING IMPORTED DATA TO DISK.\n')

 timing_end = time.time()

 print('Time taken for site insertion into Stackdump: %f seconds.' % (timing_end - timing_start))
 print('')

+# MAIN METHOD
+if __name__ == '__main__':
+parser = OptionParser(usage='usage: %prog [options] xml_root_dir')
+parser.add_option('-n', '--site-name', help='Name of the site.')
+parser.add_option('-d', '--site-desc', help='Description of the site (if not in sites).')
+parser.add_option('-k', '--site-key', help='Key of the site (if not in sites).')
+parser.add_option('-c', '--dump-date', help='Dump date of the site.')
+parser.add_option('-u', '--base-url', help='Base URL of the site on the web.')
+parser.add_option('-Y', help='Answer yes to any confirmation questions.', dest='answer_yes', action='store_true', default=False)
+
+(cmd_options, cmd_args) = parser.parse_args()
+
+if len(cmd_args) < 1:
+print('The path to the directory containing the extracted XML files is required.')
+sys.exit(1)
+
+import_site(cmd_args[0], cmd_options.site_name, cmd_options.dump_date,
+cmd_options.site_desc, cmd_options.site_key,
+cmd_options.base_url, answer_yes=cmd_options.answer_yes)
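
With the parsing code wrapped in import_site() and the option parsing moved under the __main__ guard, the importer can now also be driven from other Python code rather than only from the command line. A rough usage sketch under that assumption; the module path and argument values below are illustrative and not part of this commit:

    # hypothetical module path; adjust to wherever import_site.py lives
    from stackdump.commands.import_site import import_site

    import_site('/data/superuser.com',           # xml_root: directory containing Posts.xml etc.
                'Super User',                    # site_name
                'August 2013',                   # dump_date
                'Q&A for computer enthusiasts',  # site_desc
                'superuser',                     # site_key
                'superuser.com',                 # site_base_url
                answer_yes=True)                 # skip the interactive confirmation
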