mirror of https://github.com/djohnlewis/stackdump synced 2025-02-23 13:44:46 +00:00

Renamed dataproc management commands to better names.

This commit is contained in:
Samuel Lai 2012-02-12 13:55:18 +11:00
parent f075580a2e
commit adccd41724
3 changed files with 220 additions and 91 deletions

View File

@ -1,47 +0,0 @@
#!/usr/bin/env python
# This script deletes the site specified by the ID in the first parameter.
import os
import sys
from sqlobject import sqlhub, connectionForURI, AND, OR
from pysolr import Solr
from stackdump.models import Site
# Data files are located relative to the script's own path (argv[0]), not
# relative to the current working directory.
script_dir = os.path.dirname(sys.argv[0])

# The site ID is the single mandatory positional argument; without it there
# is nothing to do, so stop before touching the database or Solr.
if len(sys.argv) < 2:
    print('The site ID needs to be specified as the first parameter.')
    sys.exit(1)

try:
    site_id = int(sys.argv[1])
except ValueError:
    print('The site ID needs to be an integer, got "%s".' % sys.argv[1])
    sys.exit(1)

# connect to the data sources
db_path = os.path.abspath(os.path.join(script_dir, '../../../../data/stackdump.sqlite'))

# connect to the database
print('Connecting to the database...')
conn_str = 'sqlite://' + db_path
sqlhub.processConnection = connectionForURI(conn_str)

# connect to solr
print('Connecting to solr...')
solr = Solr("http://localhost:8983/solr/")

# getOne(None) yields None instead of raising when no row matches.
site = Site.select(Site.q.id==site_id).getOne(None)
if not site:
    print('Site ID %d does not exist.' % site_id)
    sys.exit(1)

# Keep the name in a local before the row is deleted; the Solr query below
# still needs it.
site_name = site.name

print('Deleting site "%s" from the database... ' % site_name)
Site.delete(site.id) # the relationship cascades, so other rows will be deleted

print('Deleting site "%s" from solr... ' % site_name)
solr.delete(q='siteName:"%s"' % site_name)

View File

@ -6,13 +6,16 @@ from __future__ import with_statement
import sys
import os
import time
import xml.sax
from datetime import datetime
import re
from optparse import OptionParser
from xml.etree import ElementTree
from sqlobject import sqlhub, connectionForURI, AND, OR
from sqlobject import sqlhub, connectionForURI, AND, OR, IN, SQLObject
from sqlobject.sqlbuilder import Delete, Insert
from sqlobject.styles import DefaultStyle
from pysolr import Solr
from stackdump.models import Site, Badge, Comment, User
@ -38,6 +41,7 @@ class BaseContentHandler(xml.sax.ContentHandler):
self.obj_class = obj_class
self.cur_props = None
self.row_count = 0
self.db_style = DefaultStyle()
def endElement(self, name):
if name != 'row':
@ -53,7 +57,21 @@ class BaseContentHandler(xml.sax.ContentHandler):
# the cur_props is now complete. Save it.
# the object is automatically saved to the database on creation
# adding records using the SQLObject object takes too long
# so we're going to go closer to the metal
props_for_db = { }
for k,v in self.cur_props.items():
# if this is a reference to a FK, massage the values to fit
if isinstance(v, SQLObject):
k += 'Id'
v = v.id
# need to convert the attr names to DB column names
props_for_db[self.db_style.pythonAttrToDBColumn(k)] = v
conn.query(conn.sqlrepr(Insert(self.obj_class.sqlmeta.table, values=props_for_db)))
except Exception, e:
# could not insert this, so ignore the row
print('Exception: ' + str(e))
@ -64,7 +82,9 @@ class BaseContentHandler(xml.sax.ContentHandler):
self.cur_props = None
if self.row_count % 1000 == 0:
print('[badge]\t\tProcessed %d rows.' % (self.row_count))
print('%-10s Processed %d rows.' % ('[%s]' % self.obj_class.sqlmeta.table,
class BadgeContentHandler(BaseContentHandler):
@ -206,6 +226,7 @@ class PostContentHandler(xml.sax.ContentHandler):
def __init__(self, site):
self.site = site
self.unfinished_questions = { }
self.orphan_answers = { }
self.cur_props = None
self.row_count = 0
@ -233,7 +254,7 @@ class PostContentHandler(xml.sax.ContentHandler):
d['answerCount'] = int(attrs.get('AnswerCount', 0))
d['viewCount'] = int(attrs.get('ViewCount', 0))
raise ValueError('Unknown PostTypeId [%s] for row ID [%s]' % (attrs.get('PostTypeId', -1), attrs.get('Id', -1)))
raise ValueError('Unknown PostTypeId [%s] for row ID [%s]. Probably a tag wiki page.' % (attrs.get('PostTypeId', -1), attrs.get('Id', -1)))
if 'AcceptedAnswerId' in attrs:
d['acceptedAnswerId'] = int(attrs.get('AcceptedAnswerId', 0))
@ -281,24 +302,13 @@ class PostContentHandler(xml.sax.ContentHandler):
d = self.cur_props
# find, convert to JSON and attach any comments for this question
comments = Comment.select(AND(Comment.q.site == self.site,
Comment.q.postId == int(d['id'])))
for comment in comments:
c = { }
for f in Comment.json_fields:
c[f] = getattr(comment, f)
if len(d['comments']) != d['commentCount']:
print('Post ID [%s] expected to have %d comments, but got %d instead. Ignoring inconsistency.' % (d['id'], d['commentCount'], len(d['comments'])))
# the cur_props is now complete. Stash it away until question is complete.
if d.has_key('parentId'):
# this is an answer.
if not self.unfinished_questions.has_key(d['parentId']):
print('lookup keys: ' + repr(self.unfinished_questions.keys()))
raise ValueError("This answer's [ID# %s] question [ID# %s] has not been processed yet. Incorrect order in XML? Ignoring answer." % (d['id'], d['parentId']))
if not self.orphan_answers.has_key(d['parentId']):
self.orphan_answers[d['parentId']] = [ ]
@ -309,6 +319,11 @@ class PostContentHandler(xml.sax.ContentHandler):
(d['id'], d['title'], self.unfinished_questions[d['id']]['title']))
self.unfinished_questions[d['id']] = d
# check if any of the orphan answers are for this question
if self.orphan_answers.has_key(d['id']):
# remove orphan answers from the orphan list
del self.orphan_answers[d['id']]
except Exception, e:
# could not insert this, so ignore the row
@ -320,9 +335,11 @@ class PostContentHandler(xml.sax.ContentHandler):
self.cur_props = None
if self.row_count % 1000 == 0:
print('\tProcessed %d rows.' % (self.row_count))
print('%-10s Processed %d rows.' % ('[post]', self.row_count))
# only check for finished questions every 200 rows to speed things up
if self.row_count % 200 == 0:
def commit_finished_questions(self):
# check if any questions are now complete (answerCount=len(answers))
@ -356,6 +373,7 @@ class PostContentHandler(xml.sax.ContentHandler):
the search index in the form that we want.
Things this does -
* fetch comments for question and answers and attach them to the objects
* creates the 'text' field for the search index that contains all the
text of the question (title, question, answers and all comments).
* serialises answers to JSON
@ -365,6 +383,46 @@ class PostContentHandler(xml.sax.ContentHandler):
* add question JSON to document
* commit document to search index.
# find and attach any comments for this question and its answers
# get the set of post ids
post_ids = set()
for a in q['answers']:
# get the comments
comment_objs = Comment.select(AND(Comment.q.site == self.site,
IN(Comment.q.postId, list(post_ids))))
# sort the comments out into a dict keyed on the post id
comments = { }
for c in comment_objs:
# convert comment object to a JSON-serialisable object
comment_json = { }
for f in Comment.json_fields:
comment_json[f] = getattr(c, f)
# we already know that this comment comes from the current site, so
# we only need to filter on post ID
if not comments.has_key(c.postId):
comments[c.postId] = [ ]
# add comments to the question
if comments.has_key(q['id']):
if len(q['comments']) != q['commentCount']:
print('Post ID [%s] expected to have %d comments, but got %d instead. Ignoring inconsistency.' % (q['id'], q['commentCount'], len(q['comments'])))
# add comments to the answers
for a in q['answers']:
if comments.has_key(a['id']):
if len(a['comments']) != a['commentCount']:
print('Post ID [%s] expected to have %d comments, but got %d instead. Ignoring inconsistency.' % (a['id'], a['commentCount'], len(a['comments'])))
doc = { }
# create the text field contents
@ -442,6 +500,8 @@ class PostContentHandler(xml.sax.ContentHandler):
Should be called after all XML has been parsed.
for id,q in self.unfinished_questions.items():
print('Question [ID# %d] was expected to have %d answers, but got %d instead. Ignoring inconsistency.' % (q['id'], q['answerCount'], len(q['answers'])))
@ -458,6 +518,10 @@ class PostContentHandler(xml.sax.ContentHandler):
# we're committing all questions, so nothing is now unfinished
# check if there are any orphan answers
for question_id, answers in self.orphan_answers.items():
print('There are %d answers for missing question [ID# %d]. Ignoring orphan answers.' % (len(answers), question_id))
parser = OptionParser(usage='usage: %prog [options] xml_root_dir')
@ -469,7 +533,7 @@ parser.add_option('-c', '--dump-date', help='Dump date of the site.')
(cmd_options, cmd_args) = parser.parse_args()
if len(cmd_args) < 1:
print('The path to the extracted XML files is required.')
print('The path to the directory containing the extracted XML files is required.')
xml_root = cmd_args[0]
@ -557,65 +621,92 @@ if not (site_name and site_key and site_desc and dump_date):
# check if site is already in database; if so, purge the data.
sites = Site.select(Site.q.name==site_name)
# the site really shouldn't exist more than once, but just in case
for site in sites:
site = list(Site.select(Site.q.key==site_key))
if len(site) > 0:
site = site[0]
print('Deleting site "%s" from the database... ' % site.name)
Site.delete(site.id) # the relationship cascades, so other rows will be deleted
# Using SQLObject to delete rows takes too long, so we're going to do it directly
#Site.delete(site.id) # the relationship cascades, so other rows will be deleted
sqlhub.threadConnection = sqlhub.processConnection.transaction()
conn = sqlhub.threadConnection
# these deletions are done in this order to avoid FK constraint issues
print('\tDeleting comments...')
conn.query(conn.sqlrepr(Delete(Comment.sqlmeta.table, where=(Comment.q.site==site))))
print('\tDeleting badges...')
conn.query(conn.sqlrepr(Delete(Badge.sqlmeta.table, where=(Badge.q.site==site))))
print('\tDeleting users...')
conn.query(conn.sqlrepr(Delete(User.sqlmeta.table, where=(User.q.site==site))))
print('\tDeleting site...')
conn.query(conn.sqlrepr(Delete(Site.sqlmeta.table, where=(Site.q.id==site.id))))
print('Deleting site "%s" from the solr... ' % site.name)
solr.delete(q='siteKey:"%s"' % site.key)
timing_start = time.time()
# start a new transaction
sqlhub.threadConnection = sqlhub.processConnection.transaction()
conn = sqlhub.threadConnection
# create a new Site
site = Site(name=site_name, desc=site_desc, key=site_key, dump_date=dump_date, import_date=datetime.now())
print('[badge] PARSING BADGES...')
sqlhub.threadConnection = sqlhub.processConnection.transaction()
xml_path = os.path.join(xml_root, 'badges.xml')
print('[badge] start parsing badges.xml...')
handler = BadgeContentHandler(site)
xml.sax.parse(xml_path, handler)
print('[badge]\t\tProcessed %d rows.' % (handler.row_count))
print('[badge] FINISHED PARSING BADGES.\n')
# Processing of badges has been disabled because they don't offer any useful
# information in the offline situation.
#print('[badge] PARSING BADGES...')
#xml_path = os.path.join(xml_root, 'badges.xml')
#print('[badge] start parsing badges.xml...')
#handler = BadgeContentHandler(site)
#xml.sax.parse(xml_path, handler)
#print('[badge]\tProcessed %d rows.' % (handler.row_count))
#print('[badge] FINISHED PARSING BADGES.\n')
# comments are temporarily stored in the database for retrieval when parsing
# posts only.
print('[comment] PARSING COMMENTS...')
sqlhub.threadConnection = sqlhub.processConnection.transaction()
xml_path = os.path.join(xml_root, 'comments.xml')
print('[comment] start parsing comments.xml...')
handler = CommentContentHandler(site)
xml.sax.parse(xml_path, handler)
print('[comment]\tProcessed %d rows.' % (handler.row_count))
print('%-10s Processed %d rows.' % ('[comment]', handler.row_count))
print('[comment] FINISHED PARSING COMMENTS.\n')
print('[user] PARSING USERS...')
sqlhub.threadConnection = sqlhub.processConnection.transaction()
xml_path = os.path.join(xml_root, 'users.xml')
print('[user] start parsing users.xml...')
handler = UserContentHandler(site)
xml.sax.parse(xml_path, handler)
print('[user]\t\tProcessed %d rows.' % (handler.row_count))
print('%-10s Processed %d rows.' % ('[user]', handler.row_count))
print('[user] FINISHED PARSING USERS.\n')
# posts are added directly to the Solr index; they are not added to the database.
print('[post] PARSING POSTS...')
sqlhub.threadConnection = sqlhub.processConnection.transaction()
xml_path = os.path.join(xml_root, 'posts.xml')
print('[post] start parsing posts.xml...')
handler = PostContentHandler(site)
xml.sax.parse(xml_path, handler)
print('[post]\tProcessed %d rows.' % (handler.row_count))
print('%-10s Processed %d rows.' % ('[post]', handler.row_count))
print('[post] FINISHED PARSING POSTS.\n')
# TODO: delete comments?
print('[comment] DELETING COMMENTS FROM DATABASE (they are no longer needed)...')
conn.query(conn.sqlrepr(Delete(Comment.sqlmeta.table, where=(Comment.q.site == site))))
print('[comment] FINISHED DELETING COMMENTS.\n')
# commit transaction
timing_end = time.time()
print('Time taken for site insertion into Stackdump: %f seconds.' % (timing_end - timing_start))

View File

@ -0,0 +1,85 @@
#!/usr/bin/env python
# This script provides the ability to manage sites in Stackdump.
import os
import sys
from optparse import OptionParser
from sqlobject import sqlhub, connectionForURI
from pysolr import Solr
from stackdump.models import Site
script_dir = os.path.dirname(sys.argv[0])
def list_sites():
    """Print the key and name of every site found in the Stackdump database."""
    # the SQLite file is located relative to this script's own path
    relative_db = os.path.join(script_dir, '../../../../data/stackdump.sqlite')
    db_path = os.path.abspath(relative_db)

    # connect to the database
    print('Connecting to the database...')
    sqlhub.processConnection = connectionForURI('sqlite://' + db_path)

    # Site.select() is lazy; turn it into a list so it can be both tested for
    # emptiness and iterated.
    sites = list(Site.select())

    # only print the table header when there is at least one row to show
    if sites:
        print('[site key] site name')
        print('-' * 80)

    for entry in sites:
        print('[%s] %s' % (entry.key, entry.name))
def delete_site(site_key):
    """Delete one site from the database and from the Solr index.

    site_key -- value matched against ``Site.key`` to find the site; it is
                also used to build the Solr delete query.

    Exits the process with status 1 if no site has that key.
    """
    # connect to the data sources
    db_path = os.path.abspath(os.path.join(script_dir, '../../../../data/stackdump.sqlite'))

    # connect to the database
    print('Connecting to the database...')
    conn_str = 'sqlite://' + db_path
    sqlhub.processConnection = connectionForURI(conn_str)

    # connect to solr
    print('Connecting to solr...')
    solr = Solr("http://localhost:8983/solr/")

    # getOne(None) yields None instead of raising when no row matches.
    site = Site.select(Site.q.key==site_key).getOne(None)
    if not site:
        print('Site key %s does not exist.' % site_key)
        # Stop here: everything below dereferences `site`.
        sys.exit(1)

    # Read the name before the row is deleted so the later message does not
    # depend on a destroyed object.
    site_name = site.name

    # Run the cascading delete inside a transaction so it is applied
    # atomically.
    sqlhub.threadConnection = sqlhub.processConnection.transaction()

    print('Deleting site "%s" from the database... ' % site_name)
    Site.delete(site.id) # the relationship cascades, so other rows will be deleted

    # Without an explicit commit the transaction's work is never persisted.
    sqlhub.threadConnection.commit(close=True)

    print('Deleting site "%s" from solr... ' % site_name)
    solr.delete(q='siteKey:"%s"' % site_key)
if __name__ == '__main__':
parser = OptionParser()
parser.add_option('-l', '--list-sites', help='List sites imported into Stackdump.', action="store_true")
parser.add_option('-d', '--delete-site', help='Delete a site from Stackdump.', metavar='SITE_KEY')
(cmd_options, cmd_args) = parser.parse_args()
if cmd_options.list_sites:
elif cmd_options.delete_site: