mirror of https://github.com/djohnlewis/stackdump synced 2025-12-17 13:23:25 +00:00

16 Commits

Author SHA1 Message Date
Samuel Lai
4a9c4504b3 Updated bad docs. 2013-11-29 12:57:06 +11:00
Samuel Lai
77dd2def42 Oops, forgot to re-instate the comment index during the backout. 2013-11-29 01:42:17 +11:00
Samuel Lai
75a216f5a4 Backed out the comments-batching change.
It was causing weird perf issues and errors. Didn't really seem like it made things faster; if anything, things became slower.
2013-11-29 01:12:09 +11:00
Samuel Lai
bf09e36928 Changed other models to avoid unnecessary date/time parsing.
Added PRAGMA statements for comments table and changed flow so the siteId_postId index is now created after data has been inserted.
2013-11-29 00:18:54 +11:00
Samuel Lai
cdb8d96508 Comments are now committed in batches and using a 'prepared' statement via executemany.
Also fixed a Windows compatibility bug with the new temp comments db and a bug with the webapp now that the Comment model has moved. Dates are also no longer parsed from their ISO form for comments; instead left as strings and parsed by SQLObject internally as needed.
2013-11-28 23:51:53 +11:00
Samuel Lai
8e3d21f817 Fixed settings for Windows compatibility. 2013-11-28 22:06:33 +11:00
Samuel Lai
6469691e4b Added PowerShell equivalents to launch and manage Stackdump on Windows. 2013-11-28 21:53:45 +11:00
Samuel Lai
65394ac516 More minor fixes. Really should get Stackdump set-up on my dev machine. 2013-11-28 15:07:05 +11:00
Samuel Lai
bcf1d7c71a Again. Forgot to fix site->siteId rename. 2013-11-28 14:39:25 +11:00
Samuel Lai
d36146ae46 More bugs - forgot to rename uses when renaming Comment.site to siteId 2013-11-28 14:38:21 +11:00
Samuel Lai
e1272ce58a Oops, bug with closing temp_db file handle. 2013-11-28 14:35:24 +11:00
Samuel Lai
bff7e13d83 Comment data used during importing is now stored in a separate database to make it easier to delete them afterwards. 2013-11-28 14:23:55 +11:00
Samuel Lai
c0766de8d4 Skips valid XML character scrubbing if configured for faster performance. 2013-11-28 14:01:00 +11:00
Samuel Lai
644269dd5d Added PyCharm project files to the ignore list. 2013-11-28 13:54:47 +11:00
Sam
6bbf0d7b28 Removed a big duplicate file in Solr. 2013-10-22 23:36:46 +11:00
Sam
71c875437e Added tag v1.1 for changeset 3ad1ff15b528 2013-10-22 23:21:20 +11:00
13 changed files with 916 additions and 870 deletions

View File

@@ -22,3 +22,6 @@ tutorial/.*$
 
 # ignore the downloaded logos
 ^python/media/images/logos/.*
+
+# PyCharm project files
+^.idea/

BIN
List-StackdumpCommands.ps1 Normal file

Binary file not shown.

BIN
Run-StackdumpCommand.ps1 Normal file

Binary file not shown.

BIN
Start-Python.ps1 Normal file

Binary file not shown.

BIN
Start-Solr.ps1 Normal file

Binary file not shown.

BIN
Start-StackdumpWeb.ps1 Normal file

Binary file not shown.

Binary file not shown.

View File

@@ -246,6 +246,11 @@ class Solr(object):
     Optionally accepts ``timeout`` for wait seconds until giving up on a
     request. Default is ``60`` seconds.
 
+    Optionally accepts ``assume_clean`` to skip cleaning request of invalid XML
+    characters. This offers a slight performance improvement, but only set this
+    to ``True`` if you know your request is clean (e.g. coming from other XML
+    data). Bad things will happen otherwise. Default is ``False``.
+
     Usage::
 
         solr = pysolr.Solr('http://localhost:8983/solr')
@@ -253,10 +258,11 @@ class Solr(object):
         solr = pysolr.Solr('http://localhost:8983/solr', timeout=10)
     """
-    def __init__(self, url, decoder=None, timeout=60):
+    def __init__(self, url, decoder=None, timeout=60, assume_clean=False):
         self.decoder = decoder or json.JSONDecoder()
         self.url = url
         self.timeout = timeout
+        self.assume_clean = assume_clean
         self.log = self._get_log()
         self.session = requests.Session()
         self.session.stream = False
@@ -506,6 +512,9 @@ class Solr(object):
             value = "{0}".format(value)
 
-        return clean_xml_string(value)
+        if self.assume_clean:
+            return value
+        else:
+            return clean_xml_string(value)
 
     def _to_python(self, value):
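
Note on the new assume_clean flag above: it is opt-in and defaults to False. A minimal usage sketch, assuming the patched pysolr shown here and the Solr core URL from the Stackdump settings further down (the document fields are made up):

    from pysolr import Solr

    # Skip the per-value XML scrubbing only when the values are already known
    # to be XML-safe, e.g. text read straight from a data dump's XML files.
    solr = Solr('http://localhost:8983/solr/stackdump/', assume_clean=True)
    solr.add([{'id': 'example-1', 'title': 'An example document'}], commit=False)
    solr.commit()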

View File

@@ -26,7 +26,7 @@ import html5lib
 from html5lib.filters._base import Filter as HTML5LibFilterBase
 import markdown
 
-from stackdump.models import Site, Badge, Comment, User
+from stackdump.models import Site, Badge, User
 from stackdump import settings
 
 # STATIC VARIABLES

View File

@@ -12,15 +12,17 @@ from datetime import datetime
 import re
 import urllib2
 import socket
+import tempfile
 from optparse import OptionParser
 from xml.etree import ElementTree
 
-from sqlobject import sqlhub, connectionForURI, AND, OR, IN, SQLObject
+from sqlobject import sqlhub, connectionForURI, AND, IN, SQLObject, \
+    UnicodeCol, DateTimeCol, IntCol, DatabaseIndex, dbconnection
 from sqlobject.sqlbuilder import Delete, Insert
 from sqlobject.styles import DefaultStyle
 from pysolr import Solr
 
-from stackdump.models import Site, Badge, Comment, User
+from stackdump.models import Site, Badge, User
 from stackdump import settings
 
 try:
@@ -108,7 +110,7 @@ class BadgeContentHandler(BaseContentHandler):
             d['sourceId'] = int(attrs['Id'])
             d['userId'] = int(attrs.get('UserId', 0))
             d['name'] = attrs.get('Name', '')
-            d['date'] = datetime.strptime(attrs.get('Date'), ISO_DATE_FORMAT)
+            d['date'] = attrs.get('Date')
         except Exception, e:
             # could not parse this, so ignore the row completely
             self.cur_props = None
@@ -135,12 +137,12 @@ class CommentContentHandler(BaseContentHandler):
             return
 
         try:
-            d = self.cur_props = { 'site' : self.site }
+            d = self.cur_props = { 'siteId' : self.site.id }
             d['sourceId'] = int(attrs['Id'])
             d['postId'] = int(attrs.get('PostId', 0))
             d['score'] = int(attrs.get('Score', 0))
             d['text'] = attrs.get('Text', '')
-            d['creationDate'] = datetime.strptime(attrs.get('CreationDate'), ISO_DATE_FORMAT)
+            d['creationDate'] = attrs.get('CreationDate')
             d['userId'] = int(attrs.get('UserId', 0))
         except Exception, e:
@@ -181,10 +183,10 @@ class UserContentHandler(BaseContentHandler):
             d = self.cur_props = { 'site' : self.site }
             d['sourceId'] = int(attrs['Id'])
             d['reputation'] = int(attrs.get('Reputation', 0))
-            d['creationDate'] = datetime.strptime(attrs.get('CreationDate'), ISO_DATE_FORMAT)
+            d['creationDate'] = attrs.get('CreationDate')
             d['displayName'] = attrs.get('DisplayName', '')
             d['emailHash'] = attrs.get('EmailHash', '')
-            d['lastAccessDate'] = datetime.strptime(attrs.get('LastAccessDate'), ISO_DATE_FORMAT)
+            d['lastAccessDate'] = attrs.get('LastAccessDate')
             d['websiteUrl'] = attrs.get('WebsiteUrl', '')
             d['location'] = attrs.get('Location', '')
             d['age'] = int(attrs.get('Age', 0))
@@ -342,8 +344,9 @@ class PostContentHandler(xml.sax.ContentHandler):
             if self.row_count % 1000 == 0:
                 print('%-10s Processed %d rows.' % ('[post]', self.row_count))
 
-            # only check for finished questions every 1000 rows to speed things up
-            if self.row_count % 1000 == 0:
+            # only check for finished questions every 10000 rows to speed things up
+            if self.row_count % 10000 == 0:
+                print('Committing completed questions...')
                 self.commit_finished_questions()
 
     def commit_finished_questions(self):
@@ -400,7 +403,7 @@ class PostContentHandler(xml.sax.ContentHandler):
             post_ids.add(a['id'])
 
         # get the comments
-        comment_objs = Comment.select(AND(Comment.q.site == self.site,
+        comment_objs = Comment.select(AND(Comment.q.siteId == self.site.id,
                                           IN(Comment.q.postId, list(post_ids))))
 
         # sort the comments out into a dict keyed on the post id
@@ -514,7 +517,10 @@ class PostContentHandler(xml.sax.ContentHandler):
     def commit_questions(self, questions, commit=True):
         """
-        Commits the given list of questions to solr.
+        Adds the given list of questions to solr.
+
+        By default, they are committed immediately. Set the ``commit`` argument
+        to False to disable this behaviour.
         """
         self.solr.add(questions, commit=commit)
@@ -551,6 +557,25 @@ class PostContentHandler(xml.sax.ContentHandler):
         for question_id, answers in self.orphan_answers.items():
             print('There are %d answers for missing question [ID# %d]. Ignoring orphan answers.' % (len(answers), question_id))
 
+
+# TEMP COMMENT DATABASE DEFINITION
+comment_db_sqlhub = dbconnection.ConnectionHub()
+
+class Comment(SQLObject):
+    sourceId = IntCol()
+    siteId = IntCol()
+    postId = IntCol()
+    score = IntCol()
+    text = UnicodeCol()
+    creationDate = DateTimeCol(datetimeFormat=ISO_DATE_FORMAT)
+    userId = IntCol()
+
+    siteId_postId_index = DatabaseIndex(siteId, postId)
+
+    _connection = comment_db_sqlhub
+
+    json_fields = [ 'id', 'score', 'text', 'creationDate', 'userId' ]
+
 # METHODS
 def get_file_path(dir_path, filename):
     """
@@ -593,14 +618,14 @@ def import_site(xml_root, site_name, dump_date, site_desc, site_key,
         sys.exit(1)
 
     # connect to the database
-    print('Connecting to the database...')
+    print('Connecting to the Stackdump database...')
     conn_str = settings.DATABASE_CONN_STR
     sqlhub.processConnection = connectionForURI(conn_str)
     print('Connected.\n')
 
     # connect to solr
     print('Connecting to solr...')
-    solr = Solr(settings.SOLR_URL)
+    solr = Solr(settings.SOLR_URL, assume_clean=True)
     # pysolr doesn't try to connect until a request is made, so we'll make a ping request
     try:
         solr._send_request('GET', 'admin/ping')
@@ -614,7 +639,6 @@ def import_site(xml_root, site_name, dump_date, site_desc, site_key,
     print("Creating tables if they don't exist...")
     Site.createTable(ifNotExists=True)
     Badge.createTable(ifNotExists=True)
-    Comment.createTable(ifNotExists=True)
     User.createTable(ifNotExists=True)
     print('Created.\n')
@@ -742,8 +766,6 @@ def import_site(xml_root, site_name, dump_date, site_desc, site_key,
         sqlhub.threadConnection = sqlhub.processConnection.transaction()
         conn = sqlhub.threadConnection
 
         # these deletions are done in this order to avoid FK constraint issues
-        print('\tDeleting comments...')
-        conn.query(conn.sqlrepr(Delete(Comment.sqlmeta.table, where=(Comment.q.site==site))))
         print('\tDeleting badges...')
         conn.query(conn.sqlrepr(Delete(Badge.sqlmeta.table, where=(Badge.q.site==site))))
         print('\tDeleting users...')
@@ -758,11 +780,26 @@ def import_site(xml_root, site_name, dump_date, site_desc, site_key,
         solr.commit(expungeDeletes=True)
         print('Deleted.\n')
 
+    # create the temporary comments database
+    print('Connecting to the temporary comments database...')
+    temp_db_file, temp_db_path = tempfile.mkstemp('.sqlite', 'temp_comment_db-' + re.sub(r'[^\w]', '_', site_key) + '-', settings.TEMP_COMMENTS_DATABASE_DIR)
+    os.close(temp_db_file)
+    conn_str = 'sqlite:///' + temp_db_path
+    comment_db_sqlhub.processConnection = connectionForURI(conn_str)
+    print('Connected.')
+    Comment.createTable()
+    print('Schema created.')
+    comment_db_sqlhub.processConnection.getConnection().execute('PRAGMA synchronous = OFF')
+    comment_db_sqlhub.processConnection.getConnection().execute('PRAGMA journal_mode = MEMORY')
+    print('Pragma configured.\n')
+
     timing_start = time.time()
 
     # start a new transaction
     sqlhub.threadConnection = sqlhub.processConnection.transaction()
     conn = sqlhub.threadConnection
+    comment_db_sqlhub.threadConnection = comment_db_sqlhub.processConnection.transaction()
+    temp_db_conn = comment_db_sqlhub.threadConnection
 
     # create a new Site
     site = Site(name=site_name, desc=site_desc, key=site_key, dump_date=dump_date,
@@ -785,7 +822,7 @@ def import_site(xml_root, site_name, dump_date, site_desc, site_key,
     print('[comment] PARSING COMMENTS...')
     xml_path = get_file_path(xml_root, 'comments.xml')
     print('[comment] start parsing comments.xml...')
-    handler = CommentContentHandler(conn, site)
+    handler = CommentContentHandler(temp_db_conn, site)
     xml.sax.parse(xml_path, handler)
     print('%-10s Processed %d rows.' % ('[comment]', handler.row_count))
     print('[comment] FINISHED PARSING COMMENTS.\n')
@@ -812,8 +849,10 @@ def import_site(xml_root, site_name, dump_date, site_desc, site_key,
     print('[post] FINISHED PARSING POSTS.\n')
 
     # DELETE COMMENTS
-    print('[comment] DELETING COMMENTS FROM DATABASE (they are no longer needed)...')
-    conn.query(conn.sqlrepr(Delete(Comment.sqlmeta.table, where=(Comment.q.site == site))))
+    print('[comment] DELETING TEMPORARY COMMENTS DATABASE (they are no longer needed)...')
+    temp_db_conn.commit(close=True)
+    comment_db_sqlhub.processConnection.close()
+    os.remove(temp_db_path)
     print('[comment] FINISHED DELETING COMMENTS.\n')
 
     # commit transaction
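
Several of the commit messages above describe performance ideas for this temporary comments database: relaxed SQLite PRAGMAs, batched inserts via executemany (later backed out), and creating the siteId/postId index only after the data has been inserted. A rough sketch of those ideas combined, using only the standard-library sqlite3 module (illustrative only, not the SQLObject code in the diff; the row data is made up):

    import os
    import sqlite3
    import tempfile

    # throwaway database file, as the importer creates with tempfile.mkstemp()
    fd, db_path = tempfile.mkstemp('.sqlite', 'temp_comment_db-')
    os.close(fd)

    conn = sqlite3.connect(db_path)
    conn.execute('PRAGMA synchronous = OFF')      # trade durability for speed
    conn.execute('PRAGMA journal_mode = MEMORY')
    conn.execute('CREATE TABLE comment (source_id INTEGER, site_id INTEGER, '
                 'post_id INTEGER, score INTEGER, text TEXT, '
                 'creation_date TEXT, user_id INTEGER)')

    # batched insert through a prepared statement
    rows = [(1, 1, 10, 3, 'First!', '2013-11-28T23:51:53.000', 42)]
    conn.executemany('INSERT INTO comment VALUES (?, ?, ?, ?, ?, ?, ?)', rows)

    # building the (site_id, post_id) index after the bulk load avoids paying
    # index-maintenance cost on every inserted row
    conn.execute('CREATE INDEX comment_site_post ON comment (site_id, post_id)')
    conn.commit()
    conn.close()
    os.remove(db_path)   # the importer deletes the temporary database when done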

View File

@@ -18,7 +18,8 @@ SERVER_PORT = 8080
 SOLR_URL = 'http://localhost:8983/solr/stackdump/'
 
 import os
-DATABASE_CONN_STR = 'sqlite://%s/../../../data/stackdump.sqlite' % os.path.dirname(__file__)
+DATABASE_CONN_STR = 'sqlite:///' + os.path.join(os.path.dirname(__file__), '..', '..', '..', 'data', 'stackdump.sqlite')
+TEMP_COMMENTS_DATABASE_DIR = os.path.join(os.path.dirname(__file__), '..', '..', '..', 'data')
 
 # if the website is hosted under a subpath, specify it here. It must end with a
 # slash.
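
For reference, a minimal sketch of how these two settings are consumed, mirroring the import script changes above (illustrative only; paths resolve relative to the settings module):

    import os
    import tempfile
    from sqlobject import connectionForURI, sqlhub

    _here = os.path.dirname(__file__)
    DATABASE_CONN_STR = 'sqlite:///' + os.path.join(_here, '..', '..', '..', 'data', 'stackdump.sqlite')
    TEMP_COMMENTS_DATABASE_DIR = os.path.join(_here, '..', '..', '..', 'data')

    # main Stackdump database
    sqlhub.processConnection = connectionForURI(DATABASE_CONN_STR)

    # the temporary comments database is created in the same data directory
    fd, temp_db_path = tempfile.mkstemp('.sqlite', 'temp_comment_db-', TEMP_COMMENTS_DATABASE_DIR)
    os.close(fd)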

View File

@@ -5,6 +5,10 @@
 from sqlobject import SQLObject, UnicodeCol, DateTimeCol, IntCol, ForeignKey, \
                       DatabaseIndex
 
+
+ISO_DATE_FORMAT = '%Y-%m-%dT%H:%M:%S.%f'
+
+
 class Site(SQLObject):
     name = UnicodeCol()
     desc = UnicodeCol()
@@ -15,34 +19,23 @@ class Site(SQLObject):
     siteKey_index = DatabaseIndex(key, unique=True)
 
 class Badge(SQLObject):
     sourceId = IntCol()
     site = ForeignKey('Site', cascade=True)
     userId = IntCol()
     name = UnicodeCol()
-    date = DateTimeCol()
+    date = DateTimeCol(datetimeFormat=ISO_DATE_FORMAT)
 
-class Comment(SQLObject):
-    sourceId = IntCol()
-    site = ForeignKey('Site', cascade=True)
-    postId = IntCol()
-    score = IntCol()
-    text = UnicodeCol()
-    creationDate = DateTimeCol()
-    userId = IntCol()
-
-    siteId_postId_index = DatabaseIndex(site, postId)
-
-    json_fields = [ 'id', 'score', 'text', 'creationDate', 'userId' ]
-
 class User(SQLObject):
     sourceId = IntCol()
     site = ForeignKey('Site', cascade=True)
     reputation = IntCol()
-    creationDate = DateTimeCol()
+    creationDate = DateTimeCol(datetimeFormat=ISO_DATE_FORMAT)
     displayName = UnicodeCol()
     emailHash = UnicodeCol()
-    lastAccessDate = DateTimeCol()
+    lastAccessDate = DateTimeCol(datetimeFormat=ISO_DATE_FORMAT)
     websiteUrl = UnicodeCol()
     location = UnicodeCol()
     age = IntCol()
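
The new ISO_DATE_FORMAT constant is the same timestamp layout the import script previously fed to datetime.strptime; passing it as datetimeFormat lets SQLObject parse the raw dump strings itself. A quick illustration (the timestamp value is made up):

    from datetime import datetime

    ISO_DATE_FORMAT = '%Y-%m-%dT%H:%M:%S.%f'
    print(datetime.strptime('2013-11-28T23:51:53.770', ISO_DATE_FORMAT))
    # -> datetime.datetime(2013, 11, 28, 23, 51, 53, 770000)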

View File

@@ -19,9 +19,10 @@ from default_settings import *
 # uncomment if the default host and port for Solr is different.
 #SOLR_URL = 'http://localhost:8983/solr/stackdump/'
 
-# uncomment if the database for Stackdump is not the default SQLite one
-#import os
-#DATABASE_CONN_STR = 'sqlite://%s/../../../data/stackdump.sqlite' % os.path.dirname(__file__)
+# uncomment if the database for Stackdump is not the default SQLite one or you
+# wish to have the database at a different path to the stackdump_root/data
+# directory
+#DATABASE_CONN_STR = 'sqlite:///' + path_to_the_database
 
 # if the website is hosted under a subpath, specify it here. It must end with a
 # slash.