1
0
mirror of https://github.com/djohnlewis/stackdump synced 2024-12-04 23:17:37 +00:00

Changed other models to avoid unnecessary date/time parsing.

Added PRAGMA statements for comments table and changed flow so the siteId_postId index is now created after data has been inserted.
This commit is contained in:
Samuel Lai 2013-11-29 00:18:54 +11:00
parent cdb8d96508
commit bf09e36928
2 changed files with 19 additions and 9 deletions

View File

@ -110,7 +110,7 @@ class BadgeContentHandler(BaseContentHandler):
d['sourceId'] = int(attrs['Id']) d['sourceId'] = int(attrs['Id'])
d['userId'] = int(attrs.get('UserId', 0)) d['userId'] = int(attrs.get('UserId', 0))
d['name'] = attrs.get('Name', '') d['name'] = attrs.get('Name', '')
d['date'] = datetime.strptime(attrs.get('Date'), ISO_DATE_FORMAT) d['date'] = attrs.get('Date')
except Exception, e: except Exception, e:
# could not parse this, so ignore the row completely # could not parse this, so ignore the row completely
self.cur_props = None self.cur_props = None
@ -256,10 +256,10 @@ class UserContentHandler(BaseContentHandler):
d = self.cur_props = { 'site' : self.site } d = self.cur_props = { 'site' : self.site }
d['sourceId'] = int(attrs['Id']) d['sourceId'] = int(attrs['Id'])
d['reputation'] = int(attrs.get('Reputation', 0)) d['reputation'] = int(attrs.get('Reputation', 0))
d['creationDate'] = datetime.strptime(attrs.get('CreationDate'), ISO_DATE_FORMAT) d['creationDate'] = attrs.get('CreationDate')
d['displayName'] = attrs.get('DisplayName', '') d['displayName'] = attrs.get('DisplayName', '')
d['emailHash'] = attrs.get('EmailHash', '') d['emailHash'] = attrs.get('EmailHash', '')
d['lastAccessDate'] = datetime.strptime(attrs.get('LastAccessDate'), ISO_DATE_FORMAT) d['lastAccessDate'] = attrs.get('LastAccessDate')
d['websiteUrl'] = attrs.get('WebsiteUrl', '') d['websiteUrl'] = attrs.get('WebsiteUrl', '')
d['location'] = attrs.get('Location', '') d['location'] = attrs.get('Location', '')
d['age'] = int(attrs.get('Age', 0)) d['age'] = int(attrs.get('Age', 0))
@ -639,8 +639,6 @@ class Comment(SQLObject):
creationDate = DateTimeCol(datetimeFormat=ISO_DATE_FORMAT) creationDate = DateTimeCol(datetimeFormat=ISO_DATE_FORMAT)
userId = IntCol() userId = IntCol()
siteId_postId_index = DatabaseIndex(siteId, postId)
_connection = comment_db_sqlhub _connection = comment_db_sqlhub
json_fields = [ 'id', 'score', 'text', 'creationDate', 'userId' ] json_fields = [ 'id', 'score', 'text', 'creationDate', 'userId' ]
@ -678,6 +676,9 @@ def get_file_path(dir_path, filename):
return os.path.abspath(os.path.join(dir_path, matches[0])) return os.path.abspath(os.path.join(dir_path, matches[0]))
def create_comment_indices(conn):
    """Create the indices used to look up rows in the ``comment`` table.

    The statement uses ``IF NOT EXISTS``, so calling this again on a
    database that already has the index does nothing. The import flow
    calls it once the comment rows have been inserted.

    Args:
        conn: a connection object exposing ``execute(sql)``. The visible
            caller passes
            ``comment_db_sqlhub.processConnection.getConnection()``
            (presumably a raw SQLite connection -- confirm against the
            connection URI used by the importer).
    """
    # (site_id, post_id) index
    conn.execute('CREATE INDEX IF NOT EXISTS comment_siteId_postId_index ON comment (site_id, post_id)')
def import_site(xml_root, site_name, dump_date, site_desc, site_key, def import_site(xml_root, site_name, dump_date, site_desc, site_key,
site_base_url, answer_yes=False): site_base_url, answer_yes=False):
@ -858,7 +859,10 @@ def import_site(xml_root, site_name, dump_date, site_desc, site_key,
comment_db_sqlhub.processConnection = connectionForURI(conn_str) comment_db_sqlhub.processConnection = connectionForURI(conn_str)
print('Connected.') print('Connected.')
Comment.createTable() Comment.createTable()
print('Schema created.\n') print('Schema created.')
comment_db_sqlhub.processConnection.getConnection().execute('PRAGMA synchronous = OFF')
comment_db_sqlhub.processConnection.getConnection().execute('PRAGMA journal_mode = MEMORY')
print('Pragma configured.\n')
timing_start = time.time() timing_start = time.time()
@ -891,6 +895,8 @@ def import_site(xml_root, site_name, dump_date, site_desc, site_key,
xml.sax.parse(xml_path, handler) xml.sax.parse(xml_path, handler)
handler.commit_comments_batch() handler.commit_comments_batch()
print('%-10s Processed %d rows.' % ('[comment]', handler.row_count)) print('%-10s Processed %d rows.' % ('[comment]', handler.row_count))
print('[comment] creating database indices...')
create_comment_indices(comment_db_sqlhub.processConnection.getConnection())
print('[comment] FINISHED PARSING COMMENTS.\n') print('[comment] FINISHED PARSING COMMENTS.\n')
# USERS # USERS

View File

@ -5,6 +5,10 @@
from sqlobject import SQLObject, UnicodeCol, DateTimeCol, IntCol, ForeignKey, \ from sqlobject import SQLObject, UnicodeCol, DateTimeCol, IntCol, ForeignKey, \
DatabaseIndex DatabaseIndex
ISO_DATE_FORMAT = '%Y-%m-%dT%H:%M:%S.%f'
class Site(SQLObject): class Site(SQLObject):
name = UnicodeCol() name = UnicodeCol()
desc = UnicodeCol() desc = UnicodeCol()
@ -21,17 +25,17 @@ class Badge(SQLObject):
site = ForeignKey('Site', cascade=True) site = ForeignKey('Site', cascade=True)
userId = IntCol() userId = IntCol()
name = UnicodeCol() name = UnicodeCol()
date = DateTimeCol() date = DateTimeCol(datetimeFormat=ISO_DATE_FORMAT)
class User(SQLObject): class User(SQLObject):
sourceId = IntCol() sourceId = IntCol()
site = ForeignKey('Site', cascade=True) site = ForeignKey('Site', cascade=True)
reputation = IntCol() reputation = IntCol()
creationDate = DateTimeCol() creationDate = DateTimeCol(datetimeFormat=ISO_DATE_FORMAT)
displayName = UnicodeCol() displayName = UnicodeCol()
emailHash = UnicodeCol() emailHash = UnicodeCol()
lastAccessDate = DateTimeCol() lastAccessDate = DateTimeCol(datetimeFormat=ISO_DATE_FORMAT)
websiteUrl = UnicodeCol() websiteUrl = UnicodeCol()
location = UnicodeCol() location = UnicodeCol()
age = IntCol() age = IntCol()