mirror of
https://github.com/djohnlewis/stackdump
synced 2024-12-04 23:17:37 +00:00
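Changed the other models (Badge and User) to avoid unnecessary date/time parsing: the content handlers now pass the raw date strings through, and the DateTimeCol columns declare datetimeFormat=ISO_DATE_FORMAT.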
Added PRAGMA statements (synchronous = OFF, journal_mode = MEMORY) for the comments database, and changed the flow so that the siteId_postId index is now created after the data has been inserted.
This commit is contained in:
parent
cdb8d96508
commit
bf09e36928
@ -110,7 +110,7 @@ class BadgeContentHandler(BaseContentHandler):
|
|||||||
d['sourceId'] = int(attrs['Id'])
|
d['sourceId'] = int(attrs['Id'])
|
||||||
d['userId'] = int(attrs.get('UserId', 0))
|
d['userId'] = int(attrs.get('UserId', 0))
|
||||||
d['name'] = attrs.get('Name', '')
|
d['name'] = attrs.get('Name', '')
|
||||||
d['date'] = datetime.strptime(attrs.get('Date'), ISO_DATE_FORMAT)
|
d['date'] = attrs.get('Date')
|
||||||
except Exception, e:
|
except Exception, e:
|
||||||
# could not parse this, so ignore the row completely
|
# could not parse this, so ignore the row completely
|
||||||
self.cur_props = None
|
self.cur_props = None
|
||||||
@ -256,10 +256,10 @@ class UserContentHandler(BaseContentHandler):
|
|||||||
d = self.cur_props = { 'site' : self.site }
|
d = self.cur_props = { 'site' : self.site }
|
||||||
d['sourceId'] = int(attrs['Id'])
|
d['sourceId'] = int(attrs['Id'])
|
||||||
d['reputation'] = int(attrs.get('Reputation', 0))
|
d['reputation'] = int(attrs.get('Reputation', 0))
|
||||||
d['creationDate'] = datetime.strptime(attrs.get('CreationDate'), ISO_DATE_FORMAT)
|
d['creationDate'] = attrs.get('CreationDate')
|
||||||
d['displayName'] = attrs.get('DisplayName', '')
|
d['displayName'] = attrs.get('DisplayName', '')
|
||||||
d['emailHash'] = attrs.get('EmailHash', '')
|
d['emailHash'] = attrs.get('EmailHash', '')
|
||||||
d['lastAccessDate'] = datetime.strptime(attrs.get('LastAccessDate'), ISO_DATE_FORMAT)
|
d['lastAccessDate'] = attrs.get('LastAccessDate')
|
||||||
d['websiteUrl'] = attrs.get('WebsiteUrl', '')
|
d['websiteUrl'] = attrs.get('WebsiteUrl', '')
|
||||||
d['location'] = attrs.get('Location', '')
|
d['location'] = attrs.get('Location', '')
|
||||||
d['age'] = int(attrs.get('Age', 0))
|
d['age'] = int(attrs.get('Age', 0))
|
||||||
@ -639,8 +639,6 @@ class Comment(SQLObject):
|
|||||||
creationDate = DateTimeCol(datetimeFormat=ISO_DATE_FORMAT)
|
creationDate = DateTimeCol(datetimeFormat=ISO_DATE_FORMAT)
|
||||||
userId = IntCol()
|
userId = IntCol()
|
||||||
|
|
||||||
siteId_postId_index = DatabaseIndex(siteId, postId)
|
|
||||||
|
|
||||||
_connection = comment_db_sqlhub
|
_connection = comment_db_sqlhub
|
||||||
|
|
||||||
json_fields = [ 'id', 'score', 'text', 'creationDate', 'userId' ]
|
json_fields = [ 'id', 'score', 'text', 'creationDate', 'userId' ]
|
||||||
@ -678,6 +676,9 @@ def get_file_path(dir_path, filename):
|
|||||||
|
|
||||||
return os.path.abspath(os.path.join(dir_path, matches[0]))
|
return os.path.abspath(os.path.join(dir_path, matches[0]))
|
||||||
|
|
||||||
|
def create_comment_indices(conn):
|
||||||
|
# (site_id, post_id) index
|
||||||
|
conn.execute('CREATE INDEX IF NOT EXISTS comment_siteId_postId_index ON comment (site_id, post_id)')
|
||||||
|
|
||||||
def import_site(xml_root, site_name, dump_date, site_desc, site_key,
|
def import_site(xml_root, site_name, dump_date, site_desc, site_key,
|
||||||
site_base_url, answer_yes=False):
|
site_base_url, answer_yes=False):
|
||||||
@ -858,7 +859,10 @@ def import_site(xml_root, site_name, dump_date, site_desc, site_key,
|
|||||||
comment_db_sqlhub.processConnection = connectionForURI(conn_str)
|
comment_db_sqlhub.processConnection = connectionForURI(conn_str)
|
||||||
print('Connected.')
|
print('Connected.')
|
||||||
Comment.createTable()
|
Comment.createTable()
|
||||||
print('Schema created.\n')
|
print('Schema created.')
|
||||||
|
comment_db_sqlhub.processConnection.getConnection().execute('PRAGMA synchronous = OFF')
|
||||||
|
comment_db_sqlhub.processConnection.getConnection().execute('PRAGMA journal_mode = MEMORY')
|
||||||
|
print('Pragma configured.\n')
|
||||||
|
|
||||||
timing_start = time.time()
|
timing_start = time.time()
|
||||||
|
|
||||||
@ -891,6 +895,8 @@ def import_site(xml_root, site_name, dump_date, site_desc, site_key,
|
|||||||
xml.sax.parse(xml_path, handler)
|
xml.sax.parse(xml_path, handler)
|
||||||
handler.commit_comments_batch()
|
handler.commit_comments_batch()
|
||||||
print('%-10s Processed %d rows.' % ('[comment]', handler.row_count))
|
print('%-10s Processed %d rows.' % ('[comment]', handler.row_count))
|
||||||
|
print('[comment] creating database indices...')
|
||||||
|
create_comment_indices(comment_db_sqlhub.processConnection.getConnection())
|
||||||
print('[comment] FINISHED PARSING COMMENTS.\n')
|
print('[comment] FINISHED PARSING COMMENTS.\n')
|
||||||
|
|
||||||
# USERS
|
# USERS
|
||||||
|
@ -5,6 +5,10 @@
|
|||||||
from sqlobject import SQLObject, UnicodeCol, DateTimeCol, IntCol, ForeignKey, \
|
from sqlobject import SQLObject, UnicodeCol, DateTimeCol, IntCol, ForeignKey, \
|
||||||
DatabaseIndex
|
DatabaseIndex
|
||||||
|
|
||||||
|
|
||||||
|
ISO_DATE_FORMAT = '%Y-%m-%dT%H:%M:%S.%f'
|
||||||
|
|
||||||
|
|
||||||
class Site(SQLObject):
|
class Site(SQLObject):
|
||||||
name = UnicodeCol()
|
name = UnicodeCol()
|
||||||
desc = UnicodeCol()
|
desc = UnicodeCol()
|
||||||
@ -21,17 +25,17 @@ class Badge(SQLObject):
|
|||||||
site = ForeignKey('Site', cascade=True)
|
site = ForeignKey('Site', cascade=True)
|
||||||
userId = IntCol()
|
userId = IntCol()
|
||||||
name = UnicodeCol()
|
name = UnicodeCol()
|
||||||
date = DateTimeCol()
|
date = DateTimeCol(datetimeFormat=ISO_DATE_FORMAT)
|
||||||
|
|
||||||
|
|
||||||
class User(SQLObject):
|
class User(SQLObject):
|
||||||
sourceId = IntCol()
|
sourceId = IntCol()
|
||||||
site = ForeignKey('Site', cascade=True)
|
site = ForeignKey('Site', cascade=True)
|
||||||
reputation = IntCol()
|
reputation = IntCol()
|
||||||
creationDate = DateTimeCol()
|
creationDate = DateTimeCol(datetimeFormat=ISO_DATE_FORMAT)
|
||||||
displayName = UnicodeCol()
|
displayName = UnicodeCol()
|
||||||
emailHash = UnicodeCol()
|
emailHash = UnicodeCol()
|
||||||
lastAccessDate = DateTimeCol()
|
lastAccessDate = DateTimeCol(datetimeFormat=ISO_DATE_FORMAT)
|
||||||
websiteUrl = UnicodeCol()
|
websiteUrl = UnicodeCol()
|
||||||
location = UnicodeCol()
|
location = UnicodeCol()
|
||||||
age = IntCol()
|
age = IntCol()
|
||||||
|
Loading…
Reference in New Issue
Block a user