#!/usr/bin/env python
# This script takes extracted site files and inserts them into the database.
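#
# Usage (a sketch; the path below is an example and should point at the
# directory holding the extracted XML for one site):
#   python insert.py /path/to/extracted/site
# The directory is expected to contain badges.xml, comments.xml, users.xml,
# posts.xml and a readme.txt whose first line names the site.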
from __future__ import with_statement
import sys
import os
import xml.sax
from datetime import datetime
import platform
import re
from sqlobject import *
from pysolr import Solr
try:
# For Python < 2.6 or people using a newer version of simplejson
import simplejson as json
except ImportError:
# For Python >= 2.6
import json
is_jython = 'Java' in platform.system()
script_dir = os.path.dirname(sys.argv[0])
# MODELS
# use UnicodeCol instead of StringCol; StringCol defaults to ascii when encoding
# is unspecified, as it is with Jython zxJDBC.
class Site(SQLObject):
name = UnicodeCol()
desc = UnicodeCol()
class Badge(SQLObject):
sourceId = IntCol()
site = ForeignKey('Site', cascade=True)
userId = IntCol()
name = UnicodeCol()
date = DateTimeCol()
class Comment(SQLObject):
sourceId = IntCol()
site = ForeignKey('Site', cascade=True)
postId = IntCol()
score = IntCol()
text = UnicodeCol()
creationDate = DateTimeCol()
userId = IntCol()
json_fields = [ 'id', 'score', 'text', 'creationDate', 'userId' ]
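    # NOTE: the attributes above are copied into the comment JSON attached to
    # each question document (see PostContentHandler.endElement).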
class User(SQLObject):
sourceId = IntCol()
site = ForeignKey('Site', cascade=True)
reputation = IntCol()
creationDate = DateTimeCol()
displayName = UnicodeCol()
emailHash = UnicodeCol()
lastAccessDate = DateTimeCol()
websiteUrl = UnicodeCol()
location = UnicodeCol()
age = IntCol()
aboutMe = UnicodeCol()
views = IntCol()
upVotes = IntCol()
downVotes = IntCol()
# SAX HANDLERS
# Jython can't handle the %f format specifier
if is_jython:
ISO_DATE_FORMAT = '%Y-%m-%dT%H:%M:%S'
else:
ISO_DATE_FORMAT = '%Y-%m-%dT%H:%M:%S.%f'
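# e.g. '2010-05-19T21:57:31.000' matches the %f variant of this format.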
class BaseContentHandler(xml.sax.ContentHandler):
"""
Base content handler.
"""
def __init__(self, site, obj_class):
self.site = site
self.obj_class = obj_class
self.cur_props = None
self.row_count = 0
def endElement(self, name):
if name != 'row':
return
if not self.cur_props:
return
# we want to count failed rows as well as successful ones as this is
# a count of rows processed.
self.row_count += 1
# the cur_props is now complete. Save it.
try:
# the object is automatically saved to the database on creation
self.obj_class(**self.cur_props)
except Exception, e:
# could not insert this, so ignore the row
print('Exception: ' + str(e))
import traceback
traceback.print_exc()
print('Could not insert the row ' + repr(self.cur_props))
self.cur_props = None
        if self.row_count % 1000 == 0:
            print('[%s]\t\tProcessed %d rows.' % (self.obj_class.__name__.lower(), self.row_count))
class BadgeContentHandler(BaseContentHandler):
"""
    Parses rows of the form -
<row Id="15" UserId="6" Name="Supporter" Date="2010-05-19T21:57:31.000" />
"""
def __init__(self, site):
BaseContentHandler.__init__(self, site, Badge)
def startElement(self, name, attrs):
if name != 'row':
return
try:
d = self.cur_props = { 'site' : self.site }
# this hack to get the Id attr is needed due to Jython bug #1768
d['sourceId'] = is_jython and int(attrs._attrs.getValue('Id')) or int(attrs['Id'])
d['userId'] = int(attrs.get('UserId', 0))
d['name'] = attrs.get('Name', '')
d['date'] = datetime.strptime(attrs.get('Date'), ISO_DATE_FORMAT)
except Exception, e:
# could not parse this, so ignore the row completely
self.cur_props = None
print('Exception: ' + str(e))
import traceback
traceback.print_exc()
print('Could not parse the row ' + repr(attrs))
class CommentContentHandler(BaseContentHandler):
"""
    Parses rows of the form -
<row Id="6" PostId="12" Score="1" Text="Just as an addition, if you are \
going to test your library, don't use a key (that is what no key \
is for). However, make sure the final product demands and API \
key. " CreationDate="2010-05-19T23:48:05.680" UserId="23" />
"""
def __init__(self, site):
BaseContentHandler.__init__(self, site, Comment)
def startElement(self, name, attrs):
if name != 'row':
return
try:
d = self.cur_props = { 'site' : self.site }
# this hack to get the Id attr is needed due to Jython bug #1768
d['sourceId'] = is_jython and int(attrs._attrs.getValue('Id')) or int(attrs['Id'])
d['postId'] = int(attrs.get('PostId', 0))
d['score'] = int(attrs.get('Score', 0))
d['text'] = attrs.get('Text', '')
d['creationDate'] = datetime.strptime(attrs.get('CreationDate'), ISO_DATE_FORMAT)
d['userId'] = int(attrs.get('UserId', 0))
except Exception, e:
# could not parse this, so ignore the row completely
self.cur_props = None
print('Exception: ' + str(e))
import traceback
traceback.print_exc()
print('Could not parse the row ' + repr(attrs))
class UserContentHandler(BaseContentHandler):
"""
    Parses rows of the form -
<row Id="1" Reputation="176" CreationDate="2010-05-19T21:30:04.837" \
DisplayName="Geoff Dalgas" \
EmailHash="b437f461b3fd27387c5d8ab47a293d35" \
LastAccessDate="2011-04-17T04:58:28.830" \
WebsiteUrl="http://stackoverflow.com" Location="Corvallis, OR" \
Age="33" AboutMe="&lt;p&gt;Developer on the StackOverflow team. \
Find me on&lt;/p&gt;&#xA;&#xA;&lt;p&gt;&lt;a \
href=&quot;http://www.twitter.com/SuperDalgas&quot; \
rel=&quot;nofollow&quot;&gt;Twitter&lt;/a&gt;&#xA;&lt;br&gt;&lt;br\
&gt;&#xA;&lt;a href=&quot;http://blog.stackoverflow.com/2009/05/\
welcome-stack-overflow-valued-associate-00003/&quot; rel=&quot;\
nofollow&quot;&gt;Stack Overflow Valued Associate #00003&lt;/a&gt;\
&lt;/p&gt;&#xA;" Views="52" UpVotes="11" DownVotes="1" />
"""
def __init__(self, site):
BaseContentHandler.__init__(self, site, User)
def startElement(self, name, attrs):
if name != 'row':
return
try:
            d = self.cur_props = { 'site' : self.site }
# this hack to get the Id attr is needed due to Jython bug #1768
d['sourceId'] = is_jython and int(attrs._attrs.getValue('Id')) or int(attrs['Id'])
d['reputation'] = int(attrs.get('Reputation', 0))
d['creationDate'] = datetime.strptime(attrs.get('CreationDate'), ISO_DATE_FORMAT)
d['displayName'] = attrs.get('DisplayName', '')
d['emailHash'] = attrs.get('EmailHash', '')
d['lastAccessDate'] = datetime.strptime(attrs.get('LastAccessDate'), ISO_DATE_FORMAT)
d['websiteUrl'] = attrs.get('WebsiteUrl', '')
d['location'] = attrs.get('Location', '')
d['age'] = int(attrs.get('Age', 0))
d['aboutMe'] = attrs.get('AboutMe', '')
d['views'] = int(attrs.get('Views', 0))
d['upVotes'] = int(attrs.get('UpVotes', 0))
d['downVotes'] = int(attrs.get('DownVotes', 0))
except Exception, e:
# could not parse this, so ignore the row completely
self.cur_props = None
print('Exception: ' + str(e))
import traceback
traceback.print_exc()
print('Could not parse the row ' + repr(attrs))
class PostContentHandler(xml.sax.ContentHandler):
"""
    Parses rows of the form -
<row Id="1" PostTypeId="1" AcceptedAnswerId="509" \
CreationDate="2009-04-30T06:49:01.807" Score="13" ViewCount="820" \
Body="&lt;p&gt;Our nightly full (and periodic differential) \
backups are becoming quite large, due mostly to the amount of \
indexes on our tables; roughly half the backup size is comprised \
of indexes.&lt;/p&gt;&#xA;&#xA;&lt;p&gt;We're using the \
&lt;strong&gt;Simple&lt;/strong&gt; recovery model for our \
backups.&lt;/p&gt;&#xA;&#xA;&lt;p&gt;Is there any way, through \
using &lt;code&gt;FileGroups&lt;/code&gt; or some other \
file-partitioning method, to &lt;strong&gt;exclude&lt;/strong&gt; \
indexes from the backups?&lt;/p&gt;&#xA;&#xA;&lt;p&gt;It would be \
nice if this could be extended to full-text catalogs, as \
well.&lt;/p&gt;&#xA;" OwnerUserId="3" LastEditorUserId="919" \
LastEditorDisplayName="" LastEditDate="2009-05-04T02:11:16.667" \
LastActivityDate="2009-05-10T15:22:39.707" Title="How to exclude \
indexes from backups in SQL Server 2008" \
Tags="&lt;sql-server&gt;&lt;backup&gt;&lt;sql-server-2008&gt;&lt;indexes&gt;" \
AnswerCount="3" CommentCount="1" FavoriteCount="3" />
"""
    TAGS_RE = re.compile(ur'&lt;([\w\d\-]+)&gt;')
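    # e.g. TAGS_RE.findall(u'&lt;sql-server&gt;&lt;backup&gt;')
    #      -> [u'sql-server', u'backup']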
def __init__(self, site):
self.site = site
self.unfinished_questions = { }
self.cur_props = None
self.row_count = 0
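        # Questions are buffered in self.unfinished_questions (keyed by post
        # id) until all of their answers have been seen; posts.xml is assumed
        # to list a question before its answers (see endElement below).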
def json_default_handler(self, obj):
# for date object handling
if hasattr(obj, 'isoformat'):
return obj.isoformat()
else:
            raise TypeError('Object of type %s with value of %s is not JSON serializable' % (type(obj), repr(obj)))
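        # e.g. json.dumps(datetime(2010, 5, 19), default=self.json_default_handler)
        #      returns '"2010-05-19T00:00:00"'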
def startElement(self, name, attrs):
if name != 'row':
return
try:
d = self.cur_props = { }
# this hack to get the Id attr is needed due to Jython bug #1768
d['id'] = is_jython and int(attrs._attrs.getValue('Id')) or int(attrs['Id'])
if attrs['PostTypeId'] == '2':
# I am an answer.
d['parentId'] = int(attrs['ParentId'])
elif attrs['PostTypeId'] == '1':
# I am a question.
d['answers'] = [ ]
d['answerCount'] = int(attrs.get('AnswerCount', 0))
d['viewCount'] = int(attrs.get('ViewCount', 0))
else:
raise ValueError('Unknown PostTypeId [%s] for row ID [%s]' % (attrs.get('PostTypeId', -1), attrs.get('Id', -1)))
if 'AcceptedAnswerId' in attrs:
d['acceptedAnswerId'] = int(attrs.get('AcceptedAnswerId', 0))
d['creationDate'] = datetime.strptime(attrs.get('CreationDate'), ISO_DATE_FORMAT)
d['score'] = int(attrs.get('Score', 0))
d['body'] = attrs.get('Body', '')
d['ownerUserId'] = int(attrs.get('OwnerUserId', 0))
if 'LastEditorUserId' in attrs:
                d['lastEditorUserId'] = int(attrs['LastEditorUserId'])
if 'LastEditDate' in attrs:
d['lastEditDate'] = datetime.strptime(attrs.get('LastEditDate'), ISO_DATE_FORMAT)
d['lastActivityDate'] = datetime.strptime(attrs.get('LastActivityDate'), ISO_DATE_FORMAT)
if 'CommunityOwnedDate' in attrs:
d['communityOwnedDate'] = datetime.strptime(attrs.get('CommunityOwnedDate'), ISO_DATE_FORMAT)
if 'ClosedDate' in attrs:
d['closedDate'] = datetime.strptime(attrs.get('ClosedDate'), ISO_DATE_FORMAT)
d['title'] = attrs.get('Title', '')
if 'Tags' in attrs:
d['tags'] = attrs.get('Tags', '')
d['commentCount'] = int(attrs.get('CommentCount', 0))
d['favoriteCount'] = int(attrs.get('FavoriteCount', 0))
d['comments'] = [ ]
except Exception, e:
# could not parse this, so ignore the row completely
self.cur_props = None
print('Exception: ' + str(e))
# TODO: enable these in verbose/debug output mode
#import traceback
#traceback.print_exc()
#print('Could not parse the row ' + repr(dict([(k,attrs[k]) for k in attrs.getNames()])))
def endElement(self, name):
if name != 'row':
return
if not self.cur_props:
return
# we want to count failed rows as well as successful ones as this is
# a count of rows processed.
self.row_count += 1
try:
d = self.cur_props
# find, convert to JSON and attach any comments for this question
comments = Comment.select(AND(Comment.q.site == self.site,
Comment.q.postId == int(d['id'])))
for comment in comments:
c = { }
for f in Comment.json_fields:
c[f] = getattr(comment, f)
d['comments'].append(c)
if len(d['comments']) != d['commentCount']:
print('Post ID [%s] expected to have %d comments, but got %d instead. Ignoring inconsistency.' % (d['id'], d['commentCount'], len(d['comments'])))
# the cur_props is now complete. Stash it away until question is complete.
            if 'parentId' in d:
                # this is an answer.
                if d['parentId'] not in self.unfinished_questions:
print('lookup keys: ' + repr(self.unfinished_questions.keys()))
raise ValueError("This answer's [ID# %s] question [ID# %s] has not been processed yet. Incorrect order in XML? Ignoring answer." % (d['id'], d['parentId']))
else:
self.unfinished_questions[d['parentId']]['answers'].append(d)
else:
# this is a question.
                if d['id'] in self.unfinished_questions:
# this should not occur; duplicate question id.
raise ValueError('Question ID [%s] already exists.\nThis title: %s\nDuplicate title:%s\nIgnoring duplicate.' %
(d['id'], d['title'], self.unfinished_questions[d['id']]['title']))
else:
self.unfinished_questions[d['id']] = d
except Exception, e:
# could not insert this, so ignore the row
print('Exception: ' + str(e))
import traceback
traceback.print_exc()
print('Could not process the row ' + repr(self.cur_props))
self.cur_props = None
if self.row_count % 1000 == 0:
print('\tProcessed %d rows.' % (self.row_count))
self.commit_finished_questions()
def commit_finished_questions(self):
# check if any questions are now complete (answerCount=len(answers))
finished_question_ids = [ ]
for id, q in self.unfinished_questions.items():
if len(q['answers']) >= q['answerCount']:
if len(q['answers']) > q['answerCount']:
print('Question ID [%s] expected to have %d answers, but got %d instead. Ignoring inconsistency.' % (q['id'], q['answerCount'], len(q['answers'])))
try:
# question is complete, store it.
self.commit_question(q)
except Exception, e:
# could not serialise and insert this question, so ignore it
print('Exception: ' + str(e))
import traceback
traceback.print_exc()
print('Could not process the completed question ' + repr(q))
finally:
finished_question_ids.append(id)
# remove any finished questions from the unfinished list
for id in finished_question_ids:
self.unfinished_questions.pop(id)
def commit_question(self, q):
"""
Massages and serialises the question object so it can be inserted into
the search index in the form that we want.
Things this does -
* creates the 'text' field for the search index that contains all the
text of the question (title, question, answers and all comments).
* serialises answers to JSON
* creates dict that maps to the search index document schema
* remove unwanted attributes from the q object and serialise question to
JSON
* add question JSON to document
* commit document to search index.
"""
doc = { }
# create the text field contents
search_text = [ ]
# question bits
search_text.append(q['title'])
search_text.append(q['body'])
for c in q['comments']:
search_text.append(c['text'])
# answer bits
for a in q['answers']:
search_text.append(a['body'])
for c in a['comments']:
search_text.append(c['text'])
search_text = ' '.join(search_text)
doc['text'] = search_text
# serialise answers to JSON
doc['answer-json'] = [ json.dumps(a, default=self.json_default_handler) for a in q['answers'] ]
# map other fields to search index doc
doc['id'] = str(q['id'])
doc['siteName'] = self.site.name
doc['creationDate'] = q['creationDate']
doc['score'] = q['score']
doc['viewCount'] = q['viewCount']
doc['title'] = q['title']
doc['ownerUserId'] = q['ownerUserId']
if 'lastEditorUserId' in q:
doc['lastEditorUserId'] = q['lastEditorUserId']
doc['lastActivityDate'] = q['lastActivityDate']
if 'communityOwnedDate' in q:
doc['communityOwnedDate'] = q['communityOwnedDate']
if 'closedDate' in q:
doc['closedDate'] = q['closedDate']
if 'tags' in q:
# parse tags into a list
doc['tags'] = PostContentHandler.TAGS_RE.findall(q['tags'])
# serialise question to JSON (the q object has cruft we don't want)
question_obj = { }
question_obj['id'] = q['id']
if 'acceptedAnswerId' in q:
question_obj['acceptedAnswerId'] = q['acceptedAnswerId']
question_obj['creationDate'] = q['creationDate']
question_obj['score'] = q['score']
question_obj['viewCount'] = q['viewCount']
question_obj['body'] = q['body']
question_obj['ownerUserId'] = q['ownerUserId']
if 'lastEditorUserId' in q:
question_obj['lastEditorUserId'] = q['lastEditorUserId']
        if 'lastEditDate' in q:
question_obj['lastEditDate'] = q['lastEditDate']
question_obj['lastActivityDate'] = q['lastActivityDate']
if 'communityOwnedDate' in q:
question_obj['communityOwnedDate'] = q['communityOwnedDate']
if 'closedDate' in q:
question_obj['closedDate'] = q['closedDate']
question_obj['title'] = q['title']
if 'tags' in q:
question_obj['tags'] = q['tags']
question_obj['favoriteCount'] = q['favoriteCount']
question_obj['comments'] = q['comments']
doc['question-json'] = json.dumps(question_obj, default=self.json_default_handler)
solr.add([ doc ])
def commit_all_questions(self):
"""
Commits all questions, regardless of whether they're completed or not.
Should be called after all XML has been parsed.
"""
for id,q in self.unfinished_questions.items():
print('Question [ID# %d] was expected to have %d answers, but got %d instead. Ignoring inconsistency.' % (q['id'], q['answerCount'], len(q['answers'])))
try:
# question is complete, store it.
self.commit_question(q)
except Exception, e:
# could not serialise and insert this question, so ignore it
print('Exception: ' + str(e))
import traceback
traceback.print_exc()
print('Could not process the question ' + repr(q))
# we're committing all questions, so nothing is now unfinished
self.unfinished_questions.clear()
# MAIN METHOD
if len(sys.argv) != 2:
print('One argument is expected - the path to the extracted XML files.')
sys.exit(1)
xml_root = sys.argv[1]
print('Using the XML root path: ' + xml_root + '\n')
if not os.path.exists(xml_root):
print('The given XML root path does not exist.')
sys.exit(1)
db_path = os.path.abspath(os.path.join(script_dir, '../../data/stackdump.sqlite'))
# connect to the database
print('Connecting to the database...')
if is_jython:
conn_str = 'jython_sqlite://' + db_path
else: # assume cPython
conn_str = 'sqlite://' + db_path
sqlhub.processConnection = connectionForURI(conn_str)
#sqlhub.processConnection = connectionForURI('jython_sqlite://:memory:')
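# sqlhub.processConnection is SQLObject's default connection; all of the model
# classes defined above use it implicitly from this point on.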
print('Connected.\n')
# connect to solr
print('Connecting to solr...')
solr = Solr("http://localhost:8983/solr/")
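# assumes a Solr instance is running locally on the default port, with a schema
# matching the document fields built up in PostContentHandler.commit_question.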
print('Connected.\n')
# ensure required tables exist
print("Creating tables if they don't exist...")
Site.createTable(ifNotExists=True)
Badge.createTable(ifNotExists=True)
Comment.createTable(ifNotExists=True)
User.createTable(ifNotExists=True)
print('Created.\n')
# SITE NAME
# get the site name from the first line of readme.txt. This could be fragile.
with open(os.path.join(xml_root, 'readme.txt')) as f:
site_desc = f.readline().strip()
# assume if there's a colon in the name, the name part is before, and the date
# part is after.
if ':' in site_desc:
    site_name, site_date = site_desc.split(':', 1)
else:
site_name = site_desc
site_date = ''
print('Site name is %s\n' % site_name)
# check if site is already in database; if so, purge the data.
sites = Site.select(Site.q.name==site_name)
# the site really shouldn't exist more than once, but just in case
for site in sites:
print('Deleting site "%s" from the database... ' % site.desc)
sys.stdout.flush()
Site.delete(site.id) # the relationship cascades, so other rows will be deleted
print('Deleted.\n')
    print('Deleting site "%s" from solr... ' % site.desc)
solr.delete(q='siteName:"%s"' % site_name)
print('Deleted.\n')
# create a new Site
site = Site(name=site_name, desc=site_desc)
# BADGES
print('[badge] PARSING BADGES...')
sqlhub.threadConnection = sqlhub.processConnection.transaction()
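# wrap each file's inserts in a single transaction; rows are written as they
# are parsed and committed in one batch once the whole file is done.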
xml_path = os.path.join(xml_root, 'badges.xml')
print('[badge] start parsing badges.xml...')
handler = BadgeContentHandler(site)
xml.sax.parse(xml_path, handler)
sqlhub.threadConnection.commit(close=True)
print('[badge]\t\tProcessed %d rows.' % (handler.row_count))
print('[badge] FINISHED PARSING BADGES.\n')
# COMMENTS
print('[comment] PARSING COMMENTS...')
sqlhub.threadConnection = sqlhub.processConnection.transaction()
xml_path = os.path.join(xml_root, 'comments.xml')
print('[comment] start parsing comments.xml...')
handler = CommentContentHandler(site)
xml.sax.parse(xml_path, handler)
sqlhub.threadConnection.commit(close=True)
print('[comment]\tProcessed %d rows.' % (handler.row_count))
print('[comment] FINISHED PARSING COMMENTS.\n')
# USERS
print('[user] PARSING USERS...')
sqlhub.threadConnection = sqlhub.processConnection.transaction()
xml_path = os.path.join(xml_root, 'users.xml')
print('[user] start parsing users.xml...')
handler = UserContentHandler(site)
xml.sax.parse(xml_path, handler)
sqlhub.threadConnection.commit(close=True)
print('[user]\t\tProcessed %d rows.' % (handler.row_count))
print('[user] FINISHED PARSING USERS.\n')
# POSTS
print('[post] PARSING POSTS...')
sqlhub.threadConnection = sqlhub.processConnection.transaction()
xml_path = os.path.join(xml_root, 'posts.xml')
print('[post] start parsing posts.xml...')
handler = PostContentHandler(site)
xml.sax.parse(xml_path, handler)
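# flush questions still waiting on answers (e.g. where the dump's AnswerCount
# didn't match the number of answers actually present).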
handler.commit_all_questions()
sqlhub.threadConnection.commit(close=True)
print('[post]\tProcessed %d rows.' % (handler.row_count))
print('[post] FINISHED PARSING POSTS.\n')
# TODO: delete comments?