mirror of https://github.com/djohnlewis/stackdump synced 2025-04-07 10:13:27 +00:00

Refactored the import_site command; now caters for filenames of different case (the case changed in 2013 dumps).

This commit is contained in:
Sam 2013-09-24 18:07:55 +10:00
parent 70fa72b04e
commit a472517736

@@ -39,7 +39,8 @@ class BaseContentHandler(xml.sax.ContentHandler):
     """
     Base content handler.
     """
-    def __init__(self, site, obj_class):
+    def __init__(self, conn, site, obj_class):
+        self.conn = conn
         self.site = site
         self.obj_class = obj_class
         self.cur_props = None
@@ -73,7 +74,7 @@ class BaseContentHandler(xml.sax.ContentHandler):
                 # need to convert the attr names to DB column names
                 props_for_db[self.db_style.pythonAttrToDBColumn(k)] = v
 
-            conn.query(conn.sqlrepr(Insert(self.obj_class.sqlmeta.table, values=props_for_db)))
+            self.conn.query(self.conn.sqlrepr(Insert(self.obj_class.sqlmeta.table, values=props_for_db)))
         except Exception, e:
             # could not insert this, so ignore the row
@@ -95,8 +96,8 @@ class BadgeContentHandler(BaseContentHandler):
     <row Id="15" UserId="6" Name="Supporter" Date="2010-05-19T21:57:31.000" />
     """
-    def __init__(self, site):
-        BaseContentHandler.__init__(self, site, Badge)
+    def __init__(self, conn, site):
+        BaseContentHandler.__init__(self, conn, site, Badge)
 
     def startElement(self, name, attrs):
         if name != 'row':
@@ -126,8 +127,8 @@ class CommentContentHandler(BaseContentHandler):
       key. " CreationDate="2010-05-19T23:48:05.680" UserId="23" />
     """
-    def __init__(self, site):
-        BaseContentHandler.__init__(self, site, Comment)
+    def __init__(self, conn, site):
+        BaseContentHandler.__init__(self, conn, site, Comment)
 
     def startElement(self, name, attrs):
         if name != 'row':
@@ -169,15 +170,15 @@ class UserContentHandler(BaseContentHandler):
     &lt;/p&gt;&#xA;" Views="52" UpVotes="11" DownVotes="1" />
     """
-    def __init__(self, site):
-        BaseContentHandler.__init__(self, site, User)
+    def __init__(self, conn, site):
+        BaseContentHandler.__init__(self, conn, site, User)
 
     def startElement(self, name, attrs):
         if name != 'row':
             return
 
         try:
-            d = self.cur_props = { 'site' : site }
+            d = self.cur_props = { 'site' : self.site }
             d['sourceId'] = int(attrs['Id'])
             d['reputation'] = int(attrs.get('Reputation', 0))
             d['creationDate'] = datetime.strptime(attrs.get('CreationDate'), ISO_DATE_FORMAT)
@@ -226,7 +227,8 @@ class PostContentHandler(xml.sax.ContentHandler):
     """
     TAGS_RE = re.compile(u'<([^>]+)>')
 
-    def __init__(self, site):
+    def __init__(self, solr, site):
+        self.solr = solr
         self.site = site
         self.unfinished_questions = { }
         self.orphan_answers = { }
@@ -513,7 +515,7 @@ class PostContentHandler(xml.sax.ContentHandler):
         """
         Commits the given list of questions to solr.
         """
-        solr.add(questions, commit=commit)
+        self.solr.add(questions, commit=commit)
 
     def commit_all_questions(self):
         """
@@ -548,22 +550,41 @@ class PostContentHandler(xml.sax.ContentHandler):
         for question_id, answers in self.orphan_answers.items():
             print('There are %d answers for missing question [ID# %d]. Ignoring orphan answers.' % (len(answers), question_id))
 
-# MAIN METHOD
-parser = OptionParser(usage='usage: %prog [options] xml_root_dir')
-parser.add_option('-n', '--site-name', help='Name of the site.')
-parser.add_option('-d', '--site-desc', help='Description of the site (if not in sites).')
-parser.add_option('-k', '--site-key', help='Key of the site (if not in sites).')
-parser.add_option('-c', '--dump-date', help='Dump date of the site.')
-parser.add_option('-u', '--base-url', help='Base URL of the site on the web.')
-parser.add_option('-Y', help='Answer yes to any confirmation questions.', dest='answer_yes', action='store_true', default=False)
-
-(cmd_options, cmd_args) = parser.parse_args()
-
-if len(cmd_args) < 1:
-    print('The path to the directory containing the extracted XML files is required.')
-    sys.exit(1)
-
-xml_root = cmd_args[0]
+# METHODS
+def get_file_path(dir_path, filename):
+    """
+    Returns the absolute path to the file, matching the file in a
+    case-insensitive manner.
+
+    If multiple files match the filename (e.g. on case-sensitive filesystems) a
+    ValueError will be thrown.
+
+    If no matching files are found, None will be returned.
+
+    @param dir_path: the directory containing the file
+    @param filename: the filename to make a case-insensitive match with
+    @return: the absolute path to the matched file
+    """
+    files = [ f for f in os.listdir(dir_path) ]
+    matches = [ ]
+    for f in files:
+        if not os.path.isfile(os.path.join(dir_path, f)):
+            continue
+        if f.lower() == filename.lower():
+            matches.append(f)
+
+    if len(matches) == 0:
+        return None
+    if len(matches) > 1:
+        raise ValueError('More than one file matched the given filename - ' + repr(matches))
+
+    return os.path.abspath(os.path.join(dir_path, matches[0]))
+
+def import_site(xml_root, site_name, dump_date, site_desc, site_key,
+                site_base_url, answer_yes=False):
 print('Using the XML root path: ' + xml_root + '\n')
 
 if not os.path.exists(xml_root):
@@ -597,12 +618,10 @@ User.createTable(ifNotExists=True)
 print('Created.\n')
 
 # SITE INFO
-site_name = cmd_options.site_name
-dump_date = cmd_options.dump_date
 # only look if they were not specified at the command line; also only if
 # readme.txt exists (they don't in dumps after Aug 2012)
-readme_path = os.path.join(xml_root, 'readme.txt')
-if not (site_name and dump_date) and os.path.exists(readme_path):
+readme_path = get_file_path(xml_root, 'readme.txt')
+if not (site_name and dump_date) and readme_path:
     # get the site name from the first line of readme.txt. This could be fragile.
     with open(readme_path, 'r') as f:
         site_readme_desc = f.readline().strip()
@@ -628,10 +647,6 @@ if not (site_name and dump_date) and os.path.exists(readme_path):
         dump_date = readme_dump_date
 
 # look for the site in the sites RSS file using the base_url with the id in RSS
-site_desc = cmd_options.site_desc
-site_key = cmd_options.site_key
-site_base_url = cmd_options.base_url
-
 # scrub the URL scheme off the base_url
 if site_base_url:
     # if there is no URL scheme, add one so it can be parsed by urllib2 so it
@@ -691,7 +706,7 @@ if site_key in ('search', 'import', 'media', 'licenses'):
 # confirm site details with user to make sure we don't accidentally overwrite
 # another site.
-if not cmd_options.answer_yes:
+if not answer_yes:
     confirm_prompt = 'Are these details correct (answer "yes" to proceed, anything else to abort)? '
     confirm_answer = raw_input(confirm_prompt)
     if confirm_answer != 'yes':
@@ -750,9 +765,9 @@ site = Site(name=site_name, desc=site_desc, key=site_key, dump_date=dump_date,
 # Processing of badges has been disabled because they don't offer any useful
 # information in the offline situation.
 #print('[badge] PARSING BADGES...')
-#xml_path = os.path.join(xml_root, 'badges.xml')
+#xml_path = get_file_path(xml_root, 'badges.xml')
 #print('[badge] start parsing badges.xml...')
-#handler = BadgeContentHandler(site)
+#handler = BadgeContentHandler(conn, site)
 #xml.sax.parse(xml_path, handler)
 #print('[badge]\tProcessed %d rows.' % (handler.row_count))
 #print('[badge] FINISHED PARSING BADGES.\n')
@@ -761,18 +776,18 @@ site = Site(name=site_name, desc=site_desc, key=site_key, dump_date=dump_date,
 # comments are temporarily stored in the database for retrieval when parsing
 # posts only.
 print('[comment] PARSING COMMENTS...')
-xml_path = os.path.join(xml_root, 'comments.xml')
+xml_path = get_file_path(xml_root, 'comments.xml')
 print('[comment] start parsing comments.xml...')
-handler = CommentContentHandler(site)
+handler = CommentContentHandler(conn, site)
 xml.sax.parse(xml_path, handler)
 print('%-10s Processed %d rows.' % ('[comment]', handler.row_count))
 print('[comment] FINISHED PARSING COMMENTS.\n')
 
 # USERS
 print('[user] PARSING USERS...')
-xml_path = os.path.join(xml_root, 'users.xml')
+xml_path = get_file_path(xml_root, 'users.xml')
 print('[user] start parsing users.xml...')
-handler = UserContentHandler(site)
+handler = UserContentHandler(conn, site)
 xml.sax.parse(xml_path, handler)
 print('%-10s Processed %d rows.' % ('[user]', handler.row_count))
 print('[user] FINISHED PARSING USERS.\n')
@@ -780,9 +795,9 @@ print('[user] FINISHED PARSING USERS.\n')
 # POSTS
 # posts are added directly to the Solr index; they are not added to the database.
 print('[post] PARSING POSTS...')
-xml_path = os.path.join(xml_root, 'posts.xml')
+xml_path = get_file_path(xml_root, 'posts.xml')
 print('[post] start parsing posts.xml...')
-handler = PostContentHandler(site)
+handler = PostContentHandler(solr, site)
 xml.sax.parse(xml_path, handler)
 handler.commit_all_questions()
 print('%-10s Processed %d rows.' % ('[post]', handler.row_count))
@@ -804,3 +819,23 @@ timing_end = time.time()
 print('Time taken for site insertion into Stackdump: %f seconds.' % (timing_end - timing_start))
 print('')
+
+# MAIN METHOD
+if __name__ == '__main__':
+    parser = OptionParser(usage='usage: %prog [options] xml_root_dir')
+    parser.add_option('-n', '--site-name', help='Name of the site.')
+    parser.add_option('-d', '--site-desc', help='Description of the site (if not in sites).')
+    parser.add_option('-k', '--site-key', help='Key of the site (if not in sites).')
+    parser.add_option('-c', '--dump-date', help='Dump date of the site.')
+    parser.add_option('-u', '--base-url', help='Base URL of the site on the web.')
+    parser.add_option('-Y', help='Answer yes to any confirmation questions.', dest='answer_yes', action='store_true', default=False)
+
+    (cmd_options, cmd_args) = parser.parse_args()
+
+    if len(cmd_args) < 1:
+        print('The path to the directory containing the extracted XML files is required.')
+        sys.exit(1)
+
+    import_site(cmd_args[0], cmd_options.site_name, cmd_options.dump_date,
+                cmd_options.site_desc, cmd_options.site_key,
+                cmd_options.base_url, answer_yes=cmd_options.answer_yes)
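
Note (not part of the commit): a minimal sketch of how the case-insensitive lookup introduced here is expected to behave, assuming get_file_path from the patched import_site module is in scope. The temporary directory and file names are made-up examples.

# Illustration only: assumed behaviour of the get_file_path() helper added above.
# The directory and file names below are hypothetical examples.
import os
import tempfile

dump_dir = tempfile.mkdtemp()
# 2013 dumps ship capitalised file names, e.g. 'Posts.xml' instead of 'posts.xml'
open(os.path.join(dump_dir, 'Posts.xml'), 'w').close()

print(get_file_path(dump_dir, 'posts.xml'))     # absolute path to 'Posts.xml'
print(get_file_path(dump_dir, 'comments.xml'))  # None, since no matching file exists

This is why the import code above can keep asking for the lowercase names ('posts.xml', 'comments.xml', 'users.xml') regardless of which naming convention a given dump uses.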